src/freedreno/vulkan/tu_clear_blit.c

   1 /*
   2  * Copyright 2019-2020 Valve Corporation
   3  * SPDX-License-Identifier: MIT
   4  *
   5  * Authors:
   6  *    Jonathan Marek <jonathan@marek.ca>
   7  */
   8
   9 #include "tu_private.h"
  10
  11 #include "tu_cs.h"
  12 #include "vk_format.h"
  13
  14 #include "util/format_r11g11b10f.h"
  15 #include "util/format_rgb9e5.h"
  16 #include "util/format_srgb.h"
  17 #include "util/u_half.h"
  18
  19 static uint32_t
  20 tu_pack_float32_for_unorm(float val, int bits)
  21 {
  22    return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
  23 }
  24
  25 /* r2d_ = BLIT_OP_SCALE operations */
  26
  27 static enum a6xx_2d_ifmt
  28 format_to_ifmt(enum a6xx_format fmt)
  29 {
  30    switch (fmt) {
  31    case FMT6_A8_UNORM:
  32    case FMT6_8_UNORM:
  33    case FMT6_8_SNORM:
  34    case FMT6_8_8_UNORM:
  35    case FMT6_8_8_SNORM:
  36    case FMT6_8_8_8_8_UNORM:
  37    case FMT6_8_8_8_X8_UNORM:
  38    case FMT6_8_8_8_8_SNORM:
  39    case FMT6_4_4_4_4_UNORM:
  40    case FMT6_5_5_5_1_UNORM:
  41    case FMT6_5_6_5_UNORM:
  42    case FMT6_Z24_UNORM_S8_UINT:
  43    case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
  44       return R2D_UNORM8;
  45
  46    case FMT6_32_UINT:
  47    case FMT6_32_SINT:
  48    case FMT6_32_32_UINT:
  49    case FMT6_32_32_SINT:
  50    case FMT6_32_32_32_32_UINT:
  51    case FMT6_32_32_32_32_SINT:
  52       return R2D_INT32;
  53
  54    case FMT6_16_UINT:
  55    case FMT6_16_SINT:
  56    case FMT6_16_16_UINT:
  57    case FMT6_16_16_SINT:
  58    case FMT6_16_16_16_16_UINT:
  59    case FMT6_16_16_16_16_SINT:
  60    case FMT6_10_10_10_2_UINT:
  61       return R2D_INT16;
  62
  63    case FMT6_8_UINT:
  64    case FMT6_8_SINT:
  65    case FMT6_8_8_UINT:
  66    case FMT6_8_8_SINT:
  67    case FMT6_8_8_8_8_UINT:
  68    case FMT6_8_8_8_8_SINT:
  69       return R2D_INT8;
  70
  71    case FMT6_16_UNORM:
  72    case FMT6_16_SNORM:
  73    case FMT6_16_16_UNORM:
  74    case FMT6_16_16_SNORM:
  75    case FMT6_16_16_16_16_UNORM:
  76    case FMT6_16_16_16_16_SNORM:
  77    case FMT6_32_FLOAT:
  78    case FMT6_32_32_FLOAT:
  79    case FMT6_32_32_32_32_FLOAT:
  80       return R2D_FLOAT32;
  81
  82    case FMT6_16_FLOAT:
  83    case FMT6_16_16_FLOAT:
  84    case FMT6_16_16_16_16_FLOAT:
  85    case FMT6_11_11_10_FLOAT:
  86    case FMT6_10_10_10_2_UNORM:
  87    case FMT6_10_10_10_2_UNORM_DEST:
  88       return R2D_FLOAT16;
  89
  90    default:
  91       unreachable("bad format");
  92       return 0;
  93    }
  94 }
  95
  96 static void
  97 r2d_coords(struct tu_cs *cs,
  98            const VkOffset2D *dst,
  99            const VkOffset2D *src,
 100            const VkExtent2D *extent)
 101 {
 102    tu_cs_emit_regs(cs,
 103       A6XX_GRAS_2D_DST_TL(.x = dst->x,                     .y = dst->y),
 104       A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
 105
 106    if (!src)
 107       return;
 108
 109    tu_cs_emit_regs(cs,
 110                    A6XX_GRAS_2D_SRC_TL_X(src->x),
 111                    A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
 112                    A6XX_GRAS_2D_SRC_TL_Y(src->y),
 113                    A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
 114 }
 115
 116 static void
 117 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
 118 {
 119    uint32_t clear_value[4] = {};
 120
 121    switch (format) {
 122    case VK_FORMAT_X8_D24_UNORM_PACK32:
 123    case VK_FORMAT_D24_UNORM_S8_UINT:
 124       /* cleared as r8g8b8a8_unorm using special format */
 125       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
 126       clear_value[1] = clear_value[0] >> 8;
 127       clear_value[2] = clear_value[0] >> 16;
 128       clear_value[3] = val->depthStencil.stencil;
 129       break;
 130    case VK_FORMAT_D16_UNORM:
 131    case VK_FORMAT_D32_SFLOAT:
 132       /* R2D_FLOAT32 */
 133       clear_value[0] = fui(val->depthStencil.depth);
 134       break;
 135    case VK_FORMAT_S8_UINT:
 136       clear_value[0] = val->depthStencil.stencil;
 137       break;
 138    case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
 139       /* cleared as UINT32 */
 140       clear_value[0] = float3_to_rgb9e5(val->color.float32);
 141       break;
 142    default:
 143       assert(!vk_format_is_depth_or_stencil(format));
 144       const struct util_format_description *desc = vk_format_description(format);
 145       enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
 146
 147       assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
 148                       format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
 149
 150       for (unsigned i = 0; i < desc->nr_channels; i++) {
 151          const struct util_format_channel_description *ch = &desc->channel[i];
 152          if (ifmt == R2D_UNORM8) {
 153             float linear = val->color.float32[i];
 154             if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
 155                linear = util_format_linear_to_srgb_float(val->color.float32[i]);
 156
 157             if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
 158                clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
 159             else
 160                clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
 161          } else if (ifmt == R2D_FLOAT16) {
 162             clear_value[i] = util_float_to_half(val->color.float32[i]);
 163          } else {
 164             assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
 165                    ifmt == R2D_INT16 || ifmt == R2D_INT8);
 166             clear_value[i] = val->color.uint32[i];
 167          }
 168       }
 169       break;
 170    }
 171
 172    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
 173    tu_cs_emit_array(cs, clear_value, 4);
 174 }
 175
 176 static void
 177 r2d_src(struct tu_cmd_buffer *cmd,
 178         struct tu_cs *cs,
 179         const struct tu_image_view *iview,
 180         uint32_t layer,
 181         VkFilter filter)
 182 {
 183    uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
 184    if (filter != VK_FILTER_NEAREST)
 185       src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
 186
 187    tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
 188    tu_cs_emit(cs, src_info);
 189    tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
 190    tu_cs_image_ref_2d(cs, iview, layer, true);
 191
 192    tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
 193    tu_cs_image_flag_ref(cs, iview, layer);
 194 }
 195
 196 static void
 197 r2d_src_buffer(struct tu_cmd_buffer *cmd,
 198                struct tu_cs *cs,
 199                VkFormat vk_format,
 200                uint64_t va, uint32_t pitch,
 201                uint32_t width, uint32_t height)
 202 {
 203    struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
 204
 205    tu_cs_emit_regs(cs,
 206                    A6XX_SP_PS_2D_SRC_INFO(
 207                       .color_format = format.fmt,
 208                       .color_swap = format.swap,
 209                       .srgb = vk_format_is_srgb(vk_format),
 210                       .unk20 = 1,
 211                       .unk22 = 1),
 212                    A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
 213                    A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
 214                    A6XX_SP_PS_2D_SRC_HI(va >> 32),
 215                    A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
 216 }
 217
 218 static void
 219 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
 220 {
 221    assert(iview->image->samples == 1);
 222
 223    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
 224    tu_cs_emit(cs, iview->RB_2D_DST_INFO);
 225    tu_cs_image_ref_2d(cs, iview, layer, false);
 226
 227    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
 228    tu_cs_image_flag_ref(cs, iview, layer);
 229 }
 230
 231 static void
 232 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
 233 {
 234    struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
 235
 236    tu_cs_emit_regs(cs,
 237                    A6XX_RB_2D_DST_INFO(
 238                       .color_format = format.fmt,
 239                       .color_swap = format.swap,
 240                       .srgb = vk_format_is_srgb(vk_format)),
 241                    A6XX_RB_2D_DST_LO((uint32_t) va),
 242                    A6XX_RB_2D_DST_HI(va >> 32),
 243                    A6XX_RB_2D_DST_PITCH(pitch));
 244 }
 245
 246 static void
 247 r2d_setup_common(struct tu_cmd_buffer *cmd,
 248                  struct tu_cs *cs,
 249                  VkFormat vk_format,
 250                  VkImageAspectFlags aspect_mask,
 251                  enum a6xx_rotation rotation,
 252                  bool clear,
 253                  bool ubwc,
 254                  bool scissor)
 255 {
 256    enum a6xx_format format = tu6_base_format(vk_format);
 257    enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
 258    uint32_t unknown_8c01 = 0;
 259
 260    if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
 261         vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
 262       format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
 263    }
 264
 265    /* note: the only format with partial clearing is D24S8 */
 266    if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
 267       /* preserve stencil channel */
 268       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
 269          unknown_8c01 = 0x08000041;
 270       /* preserve depth channels */
 271       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
 272          unknown_8c01 = 0x00084001;
 273    }
 274
 275    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
 276    tu_cs_emit(cs, unknown_8c01);
 277
 278    uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
 279          .scissor = scissor,
 280          .rotate = rotation,
 281          .solid_color = clear,
 282          .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
 283          .color_format = format,
 284          .mask = 0xf,
 285          .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
 286       ).value;
 287
 288    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
 289    tu_cs_emit(cs, blit_cntl);
 290
 291    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
 292    tu_cs_emit(cs, blit_cntl);
 293
 294    if (format == FMT6_10_10_10_2_UNORM_DEST)
 295       format = FMT6_16_16_16_16_FLOAT;
 296
 297    tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
 298          .sint = vk_format_is_sint(vk_format),
 299          .uint = vk_format_is_uint(vk_format),
 300          .color_format = format,
 301          .srgb = vk_format_is_srgb(vk_format),
 302          .mask = 0xf));
 303 }
 304
 305 static void
 306 r2d_setup(struct tu_cmd_buffer *cmd,
 307           struct tu_cs *cs,
 308           VkFormat vk_format,
 309           VkImageAspectFlags aspect_mask,
 310           enum a6xx_rotation rotation,
 311           bool clear,
 312           bool ubwc)
 313 {
 314    tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
 315
 316    r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, ubwc, false);
 317 }
 318
 319 static void
 320 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 321 {
 322    tu_cs_emit_pkt7(cs, CP_BLIT, 1);
 323    tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
 324 }
 325
 326 /* r3d_ = shader path operations */
 327
 328 void
 329 tu_init_clear_blit_shaders(struct tu6_global *global)
 330 {
 331 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
 332 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
 333 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
 334
 335    static const instr_t vs_code[] = {
 336       /* r0.xyz = r0.w ? c1.xyz : c0.xyz
 337        * r1.xy = r0.w ? c1.zw : c0.zw
 338        * r0.w = 1.0f
 339        */
 340       CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
 341          .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
 342          .src2 = 3,
 343          .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
 344       CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
 345          .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
 346          .src2 = 3,
 347          .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
 348       MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
 349       { .cat0 = { .opc = OPC_END } },
 350    };
 351
 352    static const instr_t fs_blit[] = {
 353       /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
 354        * blit path (its not clear what allows it to not have it)
 355        */
 356       CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
 357       { .cat0 = { .opc = OPC_END } },
 358    };
 359
 360    memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
 361    memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));
 362
 363    for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
 364       instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
 365       for (uint32_t i = 0; i < num_rts; i++) {
 366          /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
 367          *code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
 368       }
 369       *code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
 370    }
 371 }
 372
 373 static void
 374 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
 375            bool layered_clear)
 376 {
 377    struct ir3_const_state dummy_const_state = {};
 378    struct ir3_shader dummy_shader = {};
 379
 380    struct ir3_shader_variant vs = {
 381       .type = MESA_SHADER_VERTEX,
 382       .instrlen = 1,
 383       .constlen = 4,
 384       .info.max_reg = 1,
 385       .inputs_count = 1,
 386       .inputs[0] = {
 387          .slot = SYSTEM_VALUE_VERTEX_ID,
 388          .regid = regid(0, 3),
 389          .sysval = true,
 390       },
 391       .outputs_count = blit ? 2 : 1,
 392       .outputs[0] = {
 393          .slot = VARYING_SLOT_POS,
 394          .regid = regid(0, 0),
 395       },
 396       .outputs[1] = {
 397          .slot = VARYING_SLOT_VAR0,
 398          .regid = regid(1, 0),
 399       },
 400       .shader = &dummy_shader,
 401       .const_state = &dummy_const_state,
 402    };
 403    if (layered_clear) {
 404       vs.outputs[1].slot = VARYING_SLOT_LAYER;
 405       vs.outputs[1].regid = regid(1, 1);
 406       vs.outputs_count = 2;
 407    }
 408
 409    struct ir3_shader_variant fs = {
 410       .type = MESA_SHADER_FRAGMENT,
 411       .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
 412       .constlen = align(num_rts, 4),
 413       .info.max_reg = MAX2(num_rts, 1) - 1,
 414       .total_in = blit ? 2 : 0,
 415       .num_samp = blit ? 1 : 0,
 416       .inputs_count = blit ? 2 : 0,
 417       .inputs[0] = {
 418          .slot = VARYING_SLOT_VAR0,
 419          .inloc = 0,
 420          .compmask = 3,
 421          .bary = true,
 422       },
 423       .inputs[1] = {
 424          .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
 425          .regid = regid(0, 0),
 426          .sysval = 1,
 427       },
 428       .num_sampler_prefetch = blit ? 1 : 0,
 429       .sampler_prefetch[0] = {
 430          .src = 0,
 431          .wrmask = 0xf,
 432          .cmd = 4,
 433       },
 434       .shader = &dummy_shader,
 435       .const_state = &dummy_const_state,
 436    };
 437
 438    tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
 439          .vs_state = true,
 440          .hs_state = true,
 441          .ds_state = true,
 442          .gs_state = true,
 443          .fs_state = true,
 444          .cs_state = true,
 445          .gfx_ibo = true,
 446          .cs_ibo = true,
 447          .gfx_shared_const = true,
 448          .gfx_bindless = 0x1f,
 449          .cs_bindless = 0x1f));
 450
 451    tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, global_iova(cmd, shaders[GLOBAL_SH_VS]));
 452    tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
 453    tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
 454    tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0);
 455    tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs,
 456          global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)]));
 457
 458    tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
 459    tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
 460
 461    tu6_emit_vpc(cs, &vs, NULL, NULL, NULL, &fs, 0, false);
 462
 463    /* REPL_MODE for varying with RECTLIST (2 vertices only) */
 464    tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
 465    tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
 466
 467    tu6_emit_fs_inputs(cs, &fs);
 468
 469    tu_cs_emit_regs(cs,
 470                    A6XX_GRAS_CL_CNTL(
 471                       .persp_division_disable = 1,
 472                       .vp_xform_disable = 1,
 473                       .vp_clip_code_ignore = 1,
 474                       .clip_disable = 1));
 475    tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
 476
 477    tu_cs_emit_regs(cs,
 478                    A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
 479                    A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
 480    tu_cs_emit_regs(cs,
 481                    A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
 482                    A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
 483
 484    tu_cs_emit_regs(cs,
 485                    A6XX_VFD_INDEX_OFFSET(),
 486                    A6XX_VFD_INSTANCE_START_OFFSET());
 487 }
 488
 489 static void
 490 r3d_coords_raw(struct tu_cs *cs, const float *coords)
 491 {
 492    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
 493    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
 494                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
 495                   CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
 496                   CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
 497                   CP_LOAD_STATE6_0_NUM_UNIT(2));
 498    tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
 499    tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
 500    tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
 501 }
 502
 503 static void
 504 r3d_coords(struct tu_cs *cs,
 505            const VkOffset2D *dst,
 506            const VkOffset2D *src,
 507            const VkExtent2D *extent)
 508 {
 509    int32_t src_x1 = src ? src->x : 0;
 510    int32_t src_y1 = src ? src->y : 0;
 511    r3d_coords_raw(cs, (float[]) {
 512       dst->x,                 dst->y,
 513       src_x1,                 src_y1,
 514       dst->x + extent->width, dst->y + extent->height,
 515       src_x1 + extent->width, src_y1 + extent->height,
 516    });
 517 }
 518
 519 static void
 520 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
 521 {
 522    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
 523    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
 524                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
 525                   CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
 526                   CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
 527                   CP_LOAD_STATE6_0_NUM_UNIT(1));
 528    tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
 529    tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
 530    switch (format) {
 531    case VK_FORMAT_X8_D24_UNORM_PACK32:
 532    case VK_FORMAT_D24_UNORM_S8_UINT: {
 533       /* cleared as r8g8b8a8_unorm using special format */
 534       uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
 535       tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
 536       tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
 537       tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
 538       tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
 539    } break;
 540    case VK_FORMAT_D16_UNORM:
 541    case VK_FORMAT_D32_SFLOAT:
 542       tu_cs_emit(cs, fui(val->depthStencil.depth));
 543       tu_cs_emit(cs, 0);
 544       tu_cs_emit(cs, 0);
 545       tu_cs_emit(cs, 0);
 546       break;
 547    case VK_FORMAT_S8_UINT:
 548       tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
 549       tu_cs_emit(cs, 0);
 550       tu_cs_emit(cs, 0);
 551       tu_cs_emit(cs, 0);
 552       break;
 553    default:
 554       /* as color formats use clear value as-is */
 555       assert(!vk_format_is_depth_or_stencil(format));
 556       tu_cs_emit_array(cs, val->color.uint32, 4);
 557       break;
 558    }
 559 }
 560
 561 static void
 562 r3d_src_common(struct tu_cmd_buffer *cmd,
 563                struct tu_cs *cs,
 564                const uint32_t *tex_const,
 565                uint32_t offset_base,
 566                uint32_t offset_ubwc,
 567                VkFilter filter)
 568 {
 569    struct tu_cs_memory texture = { };
 570    VkResult result = tu_cs_alloc(&cmd->sub_cs,
 571                                  2, /* allocate space for a sampler too */
 572                                  A6XX_TEX_CONST_DWORDS, &texture);
 573    assert(result == VK_SUCCESS);
 574
 575    memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
 576
 577    /* patch addresses for layer offset */
 578    *(uint64_t*) (texture.map + 4) += offset_base;
 579    uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
 580    texture.map[7] = ubwc_addr;
 581    texture.map[8] = ubwc_addr >> 32;
 582
 583    texture.map[A6XX_TEX_CONST_DWORDS + 0] =
 584       A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
 585       A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
 586       A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
 587       A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
 588       A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
 589       0x60000; /* XXX used by blob, doesn't seem necessary */
 590    texture.map[A6XX_TEX_CONST_DWORDS + 1] =
 591       0x1 | /* XXX used by blob, doesn't seem necessary */
 592       A6XX_TEX_SAMP_1_UNNORM_COORDS |
 593       A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
 594    texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
 595    texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
 596
 597    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
 598    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
 599                CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
 600                CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
 601                CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
 602                CP_LOAD_STATE6_0_NUM_UNIT(1));
 603    tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
 604
 605    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
 606    tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
 607
 608    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
 609    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
 610       CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
 611       CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
 612       CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
 613       CP_LOAD_STATE6_0_NUM_UNIT(1));
 614    tu_cs_emit_qw(cs, texture.iova);
 615
 616    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
 617    tu_cs_emit_qw(cs, texture.iova);
 618
 619    tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
 620 }
 621
 622 static void
 623 r3d_src(struct tu_cmd_buffer *cmd,
 624         struct tu_cs *cs,
 625         const struct tu_image_view *iview,
 626         uint32_t layer,
 627         VkFilter filter)
 628 {
 629    r3d_src_common(cmd, cs, iview->descriptor,
 630                   iview->layer_size * layer,
 631                   iview->ubwc_layer_size * layer,
 632                   filter);
 633 }
 634
 635 static void
 636 r3d_src_buffer(struct tu_cmd_buffer *cmd,
 637                struct tu_cs *cs,
 638                VkFormat vk_format,
 639                uint64_t va, uint32_t pitch,
 640                uint32_t width, uint32_t height)
 641 {
 642    uint32_t desc[A6XX_TEX_CONST_DWORDS];
 643
 644    struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
 645
 646    desc[0] =
 647       COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
 648       A6XX_TEX_CONST_0_FMT(format.fmt) |
 649       A6XX_TEX_CONST_0_SWAP(format.swap) |
 650       A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
 651       // XXX to swizzle into .w for stencil buffer_to_image
 652       A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
 653       A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
 654       A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
 655    desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
 656    desc[2] =
 657       A6XX_TEX_CONST_2_PITCH(pitch) |
 658       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
 659    desc[3] = 0;
 660    desc[4] = va;
 661    desc[5] = va >> 32;
 662    for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
 663       desc[i] = 0;
 664
 665    r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
 666 }
 667
 668 static void
 669 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
 670 {
 671    tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
 672
 673    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
 674    tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
 675    tu_cs_image_ref(cs, iview, layer);
 676    tu_cs_emit(cs, 0);
 677
 678    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
 679    tu_cs_image_flag_ref(cs, iview, layer);
 680
 681    tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
 682 }
 683
 684 static void
 685 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
 686 {
 687    struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
 688
 689    tu6_emit_msaa(cs, 1); /* TODO: move to setup */
 690
 691    tu_cs_emit_regs(cs,
 692                    A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
 693                    A6XX_RB_MRT_PITCH(0, pitch),
 694                    A6XX_RB_MRT_ARRAY_PITCH(0, 0),
 695                    A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
 696                    A6XX_RB_MRT_BASE_HI(0, va >> 32),
 697                    A6XX_RB_MRT_BASE_GMEM(0, 0));
 698
 699    tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
 700 }
 701
 702 static uint8_t
 703 aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
 704 {
 705    uint8_t mask = 0xf;
 706    assert(aspect_mask);
 707    /* note: the only format with partial writing is D24S8,
 708     * clear/blit uses the _AS_R8G8B8A8 format to access it
 709     */
 710    if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
 711       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
 712          mask = 0x7;
 713       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
 714          mask = 0x8;
 715    }
 716    return mask;
 717 }
 718
 719 static void
 720 r3d_setup(struct tu_cmd_buffer *cmd,
 721           struct tu_cs *cs,
 722           VkFormat vk_format,
 723           VkImageAspectFlags aspect_mask,
 724           enum a6xx_rotation rotation,
 725           bool clear,
 726           bool ubwc)
 727 {
 728    enum a6xx_format format = tu6_base_format(vk_format);
 729
 730    if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
 731         vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
 732       format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
 733    }
 734
 735    if (!cmd->state.pass) {
 736       tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
 737       tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
 738    }
 739
 740    tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
 741    tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
 742
 743    r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
 744
 745    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
 746    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
 747                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
 748                   0xfc000000);
 749    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
 750
 751    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
 752    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
 753
 754    tu_cs_emit_regs(cs,
 755                    A6XX_RB_FS_OUTPUT_CNTL0(),
 756                    A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
 757
 758    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
 759    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
 760    tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
 761
 762    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
 763    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
 764    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
 765    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
 766    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
 767    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
 768    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
 769
 770    tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
 771    tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
 772
 773    tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
 774                         .color_format = format,
 775                         .color_sint = vk_format_is_sint(vk_format),
 776                         .color_uint = vk_format_is_uint(vk_format)));
 777
 778    tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
 779       .component_enable = aspect_write_mask(vk_format, aspect_mask)));
 780    tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
 781    tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
 782 }
 783
 784 static void
 785 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 786 {
 787    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
 788    tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
 789                   CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
 790                   CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
 791    tu_cs_emit(cs, 1); /* instance count */
 792    tu_cs_emit(cs, 2); /* vertex count */
 793 }
 794
 795 /* blit ops - common interface for 2d/shader paths */
 796
 797 struct blit_ops {
 798    void (*coords)(struct tu_cs *cs,
 799                   const VkOffset2D *dst,
 800                   const VkOffset2D *src,
 801                   const VkExtent2D *extent);
 802    void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
 803    void (*src)(
 804         struct tu_cmd_buffer *cmd,
 805         struct tu_cs *cs,
 806         const struct tu_image_view *iview,
 807         uint32_t layer,
 808         VkFilter filter);
 809    void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 810                       VkFormat vk_format,
 811                       uint64_t va, uint32_t pitch,
 812                       uint32_t width, uint32_t height);
 813    void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
 814    void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
 815    void (*setup)(struct tu_cmd_buffer *cmd,
 816                  struct tu_cs *cs,
 817                  VkFormat vk_format,
 818                  VkImageAspectFlags aspect_mask,
 819                  enum a6xx_rotation rotation,
 820                  bool clear,
 821                  bool ubwc);
 822    void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
 823 };
 824
 825 static const struct blit_ops r2d_ops = {
 826    .coords = r2d_coords,
 827    .clear_value = r2d_clear_value,
 828    .src = r2d_src,
 829    .src_buffer = r2d_src_buffer,
 830    .dst = r2d_dst,
 831    .dst_buffer = r2d_dst_buffer,
 832    .setup = r2d_setup,
 833    .run = r2d_run,
 834 };
 835
 836 static const struct blit_ops r3d_ops = {
 837    .coords = r3d_coords,
 838    .clear_value = r3d_clear_value,
 839    .src = r3d_src,
 840    .src_buffer = r3d_src_buffer,
 841    .dst = r3d_dst,
 842    .dst_buffer = r3d_dst_buffer,
 843    .setup = r3d_setup,
 844    .run = r3d_run,
 845 };
 846
 847 /* passthrough set coords from 3D extents */
 848 static void
 849 coords(const struct blit_ops *ops,
 850        struct tu_cs *cs,
 851        const VkOffset3D *dst,
 852        const VkOffset3D *src,
 853        const VkExtent3D *extent)
 854 {
 855    ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
 856 }
 857
 858 static VkFormat
 859 copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
 860 {
 861    if (vk_format_is_compressed(format)) {
 862       switch (vk_format_get_blocksize(format)) {
 863       case 1: return VK_FORMAT_R8_UINT;
 864       case 2: return VK_FORMAT_R16_UINT;
 865       case 4: return VK_FORMAT_R32_UINT;
 866       case 8: return VK_FORMAT_R32G32_UINT;
 867       case 16:return VK_FORMAT_R32G32B32A32_UINT;
 868       default:
 869          unreachable("unhandled format size");
 870       }
 871    }
 872
 873    switch (format) {
 874    case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
 875       if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
 876          return VK_FORMAT_R8G8_UNORM;
 877       /* fallthrough */
 878    case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
 879       return VK_FORMAT_R8_UNORM;
 880    case VK_FORMAT_D24_UNORM_S8_UINT:
 881       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
 882          return VK_FORMAT_R8_UNORM;
 883       /* fallthrough */
 884    default:
 885       return format;
 886    case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
 887       return VK_FORMAT_R32_UINT;
 888    }
 889 }
 890
 891 static void
 892 tu_image_view_copy_blit(struct tu_image_view *iview,
 893                         struct tu_image *image,
 894                         VkFormat format,
 895                         const VkImageSubresourceLayers *subres,
 896                         uint32_t layer,
 897                         bool stencil_read)
 898 {
 899    VkImageAspectFlags aspect_mask = subres->aspectMask;
 900
 901    /* always use the AS_R8G8B8A8 format for these */
 902    if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
 903        format == VK_FORMAT_X8_D24_UNORM_PACK32) {
 904       aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
 905    }
 906
 907    tu_image_view_init(iview, &(VkImageViewCreateInfo) {
 908       .image = tu_image_to_handle(image),
 909       .viewType = VK_IMAGE_VIEW_TYPE_2D,
 910       .format = format,
 911       /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
 912       .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
 913       .subresourceRange = {
 914          .aspectMask = aspect_mask,
 915          .baseMipLevel = subres->mipLevel,
 916          .levelCount = 1,
 917          .baseArrayLayer = subres->baseArrayLayer + layer,
 918          .layerCount = 1,
 919       },
 920    }, false);
 921 }
 922
 923 static void
 924 tu_image_view_copy(struct tu_image_view *iview,
 925                    struct tu_image *image,
 926                    VkFormat format,
 927                    const VkImageSubresourceLayers *subres,
 928                    uint32_t layer,
 929                    bool stencil_read)
 930 {
 931    format = copy_format(format, subres->aspectMask, false);
 932    tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read);
 933 }
 934
 935 static void
 936 tu_image_view_blit(struct tu_image_view *iview,
 937                    struct tu_image *image,
 938                    const VkImageSubresourceLayers *subres,
 939                    uint32_t layer)
 940 {
 941    tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false);
 942 }
 943
 944 static void
 945 tu6_blit_image(struct tu_cmd_buffer *cmd,
 946                struct tu_image *src_image,
 947                struct tu_image *dst_image,
 948                const VkImageBlit *info,
 949                VkFilter filter)
 950 {
 951    const struct blit_ops *ops = &r2d_ops;
 952    struct tu_cs *cs = &cmd->cs;
 953    uint32_t layers;
 954
 955    /* 2D blit can't do rotation mirroring from just coordinates */
 956    static const enum a6xx_rotation rotate[2][2] = {
 957       {ROTATE_0, ROTATE_HFLIP},
 958       {ROTATE_VFLIP, ROTATE_180},
 959    };
 960
 961    bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
 962                    (info->dstOffsets[1].x < info->dstOffsets[0].x);
 963    bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
 964                    (info->dstOffsets[1].y < info->dstOffsets[0].y);
 965    bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
 966                    (info->dstOffsets[1].z < info->dstOffsets[0].z);
 967
 968    if (mirror_z) {
 969       tu_finishme("blit z mirror\n");
 970       return;
 971    }
 972
 973    if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
 974        info->dstOffsets[1].z - info->dstOffsets[0].z) {
 975       tu_finishme("blit z filter\n");
 976       return;
 977    }
 978
 979    layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
 980    if (info->dstSubresource.layerCount > 1) {
 981       assert(layers <= 1);
 982       layers = info->dstSubresource.layerCount;
 983    }
 984
 985    /* BC1_RGB_* formats need to have their last components overriden with 1
 986     * when sampling, which is normally handled with the texture descriptor
 987     * swizzle. The 2d path can't handle that, so use the 3d path.
 988     *
 989     * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
 990     * the 2d path.
 991     */
 992
 993    if (dst_image->samples > 1 ||
 994        src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
 995        src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
 996        filter == VK_FILTER_CUBIC_EXT)
 997       ops = &r3d_ops;
 998
 999    /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
1000     * figure out why (should be able to pass all tests with only shader path)
1001     */
1002
1003    ops->setup(cmd, cs, dst_image->vk_format, info->dstSubresource.aspectMask,
1004               rotate[mirror_y][mirror_x], false, dst_image->layout[0].ubwc);
1005
1006    if (ops == &r3d_ops) {
1007       r3d_coords_raw(cs, (float[]) {
1008          info->dstOffsets[0].x, info->dstOffsets[0].y,
1009          info->srcOffsets[0].x, info->srcOffsets[0].y,
1010          info->dstOffsets[1].x, info->dstOffsets[1].y,
1011          info->srcOffsets[1].x, info->srcOffsets[1].y
1012       });
1013    } else {
1014       tu_cs_emit_regs(cs,
1015          A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1016                              .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1017          A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1018                              .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1019       tu_cs_emit_regs(cs,
1020          A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1021          A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1022          A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1023          A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1024    }
1025
1026    struct tu_image_view dst, src;
1027    tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1028    tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1029
1030    for (uint32_t i = 0; i < layers; i++) {
1031       ops->dst(cs, &dst, i);
1032       ops->src(cmd, cs, &src, i, filter);
1033       ops->run(cmd, cs);
1034    }
1035 }
1036
1037 void
1038 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1039                 VkImage srcImage,
1040                 VkImageLayout srcImageLayout,
1041                 VkImage dstImage,
1042                 VkImageLayout dstImageLayout,
1043                 uint32_t regionCount,
1044                 const VkImageBlit *pRegions,
1045                 VkFilter filter)
1046
1047 {
1048    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1049    TU_FROM_HANDLE(tu_image, src_image, srcImage);
1050    TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1051
1052    tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1053    tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1054
1055    for (uint32_t i = 0; i < regionCount; ++i)
1056       tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1057 }
1058
1059 static void
1060 copy_compressed(VkFormat format,
1061                 VkOffset3D *offset,
1062                 VkExtent3D *extent,
1063                 uint32_t *width,
1064                 uint32_t *height)
1065 {
1066    if (!vk_format_is_compressed(format))
1067       return;
1068
1069    uint32_t block_width = vk_format_get_blockwidth(format);
1070    uint32_t block_height = vk_format_get_blockheight(format);
1071
1072    offset->x /= block_width;
1073    offset->y /= block_height;
1074
1075    if (extent) {
1076       extent->width = DIV_ROUND_UP(extent->width, block_width);
1077       extent->height = DIV_ROUND_UP(extent->height, block_height);
1078    }
1079    if (width)
1080       *width = DIV_ROUND_UP(*width, block_width);
1081    if (height)
1082       *height = DIV_ROUND_UP(*height, block_height);
1083 }
1084
1085 static void
1086 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1087                         struct tu_buffer *src_buffer,
1088                         struct tu_image *dst_image,
1089                         const VkBufferImageCopy *info)
1090 {
1091    struct tu_cs *cs = &cmd->cs;
1092    uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1093    VkFormat src_format =
1094       copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
1095    const struct blit_ops *ops = &r2d_ops;
1096
1097    /* special case for buffer to stencil */
1098    if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1099        info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1100       ops = &r3d_ops;
1101    }
1102
1103    /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
1104     * which matters for UBWC. buffer_to_image/etc can fail because of this
1105     */
1106
1107    VkOffset3D offset = info->imageOffset;
1108    VkExtent3D extent = info->imageExtent;
1109    uint32_t src_width = info->bufferRowLength ?: extent.width;
1110    uint32_t src_height = info->bufferImageHeight ?: extent.height;
1111
1112    copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);
1113
1114    uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1115    uint32_t layer_size = src_height * pitch;
1116
1117    ops->setup(cmd, cs,
1118               copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
1119               info->imageSubresource.aspectMask, ROTATE_0, false, dst_image->layout[0].ubwc);
1120
1121    struct tu_image_view dst;
1122    tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);
1123
1124    for (uint32_t i = 0; i < layers; i++) {
1125       ops->dst(cs, &dst, i);
1126
1127       uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1128       if ((src_va & 63) || (pitch & 63)) {
1129          for (uint32_t y = 0; y < extent.height; y++) {
1130             uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1131             ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1132                             x + extent.width, 1);
1133             ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y},  &(VkOffset2D){x},
1134                         &(VkExtent2D) {extent.width, 1});
1135             ops->run(cmd, cs);
1136             src_va += pitch;
1137          }
1138       } else {
1139          ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1140          coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1141          ops->run(cmd, cs);
1142       }
1143    }
1144 }
1145
1146 void
1147 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1148                         VkBuffer srcBuffer,
1149                         VkImage dstImage,
1150                         VkImageLayout dstImageLayout,
1151                         uint32_t regionCount,
1152                         const VkBufferImageCopy *pRegions)
1153 {
1154    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1155    TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1156    TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1157
1158    tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1159    tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1160
1161    for (unsigned i = 0; i < regionCount; ++i)
1162       tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1163 }
1164
1165 static void
1166 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1167                         struct tu_image *src_image,
1168                         struct tu_buffer *dst_buffer,
1169                         const VkBufferImageCopy *info)
1170 {
1171    struct tu_cs *cs = &cmd->cs;
1172    uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1173    VkFormat dst_format =
1174       copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
1175    bool stencil_read = false;
1176
1177    if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1178        info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1179       stencil_read = true;
1180    }
1181
1182    const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1183    VkOffset3D offset = info->imageOffset;
1184    VkExtent3D extent = info->imageExtent;
1185    uint32_t dst_width = info->bufferRowLength ?: extent.width;
1186    uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1187
1188    copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);
1189
1190    uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1191    uint32_t layer_size = pitch * dst_height;
1192
1193    ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1194
1195    struct tu_image_view src;
1196    tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);
1197
1198    for (uint32_t i = 0; i < layers; i++) {
1199       ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1200
1201       uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1202       if ((dst_va & 63) || (pitch & 63)) {
1203          for (uint32_t y = 0; y < extent.height; y++) {
1204             uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1205             ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1206             ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1207                         &(VkExtent2D) {extent.width, 1});
1208             ops->run(cmd, cs);
1209             dst_va += pitch;
1210          }
1211       } else {
1212          ops->dst_buffer(cs, dst_format, dst_va, pitch);
1213          coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1214          ops->run(cmd, cs);
1215       }
1216    }
1217 }
1218
1219 void
1220 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1221                         VkImage srcImage,
1222                         VkImageLayout srcImageLayout,
1223                         VkBuffer dstBuffer,
1224                         uint32_t regionCount,
1225                         const VkBufferImageCopy *pRegions)
1226 {
1227    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1228    TU_FROM_HANDLE(tu_image, src_image, srcImage);
1229    TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1230
1231    tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1232    tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1233
1234    for (unsigned i = 0; i < regionCount; ++i)
1235       tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1236 }
1237
1238 /* Tiled formats don't support swapping, which means that we can't support
1239  * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1240  * formats like B5G5R5A1 have a separate linear-only format when sampling.
1241  * Currently we fake support for tiled swapped formats and use the unswapped
1242  * format instead, but this means that reinterpreting copies to and from
1243  * swapped formats can't be performed correctly unless we can swizzle the
1244  * components by reinterpreting the other image as the "correct" swapped
1245  * format, i.e. only when the other image is linear.
1246  */
1247
1248 static bool
1249 is_swapped_format(VkFormat format)
1250 {
1251    struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1252    struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1253    return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1254 }
1255
1256 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1257  * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1258  * versa). This should mirror the logic in fdl6_layout.
1259  */
1260 static bool
1261 image_is_r8g8(struct tu_image *image)
1262 {
1263    return image->layout[0].cpp == 2 &&
1264       vk_format_get_nr_components(image->vk_format) == 2;
1265 }
1266
1267 static void
1268 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1269                        struct tu_image *src_image,
1270                        struct tu_image *dst_image,
1271                        const VkImageCopy *info)
1272 {
1273    const struct blit_ops *ops = &r2d_ops;
1274    struct tu_cs *cs = &cmd->cs;
1275
1276    if (dst_image->samples > 1)
1277       ops = &r3d_ops;
1278
1279    VkFormat format = VK_FORMAT_UNDEFINED;
1280    VkOffset3D src_offset = info->srcOffset;
1281    VkOffset3D dst_offset = info->dstOffset;
1282    VkExtent3D extent = info->extent;
1283
1284    /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1285     * Images":
1286     *
1287     *    When copying between compressed and uncompressed formats the extent
1288     *    members represent the texel dimensions of the source image and not
1289     *    the destination. When copying from a compressed image to an
1290     *    uncompressed image the image texel dimensions written to the
1291     *    uncompressed image will be source extent divided by the compressed
1292     *    texel block dimensions. When copying from an uncompressed image to a
1293     *    compressed image the image texel dimensions written to the compressed
1294     *    image will be the source extent multiplied by the compressed texel
1295     *    block dimensions.
1296     *
1297     * This means we only have to adjust the extent if the source image is
1298     * compressed.
1299     */
1300    copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1301    copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1302
1303    VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
1304    VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
1305
1306    bool use_staging_blit = false;
1307
1308    if (src_format == dst_format) {
1309       /* Images that share a format can always be copied directly because it's
1310        * the same as a blit.
1311        */
1312       format = src_format;
1313    } else if (!src_image->layout[0].tile_mode) {
1314       /* If an image is linear, we can always safely reinterpret it with the
1315        * other image's format and then do a regular blit.
1316        */
1317       format = dst_format;
1318    } else if (!dst_image->layout[0].tile_mode) {
1319       format = src_format;
1320    } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1321       /* We can't currently copy r8g8 images to/from other cpp=2 images,
1322        * due to the different tile layout.
1323        */
1324       use_staging_blit = true;
1325    } else if (is_swapped_format(src_format) ||
1326               is_swapped_format(dst_format)) {
1327       /* If either format has a non-identity swap, then we can't copy
1328        * to/from it.
1329        */
1330       use_staging_blit = true;
1331    } else if (!src_image->layout[0].ubwc) {
1332       format = dst_format;
1333    } else if (!dst_image->layout[0].ubwc) {
1334       format = src_format;
1335    } else {
1336       /* Both formats use UBWC and so neither can be reinterpreted.
1337        * TODO: We could do an in-place decompression of the dst instead.
1338        */
1339       use_staging_blit = true;
1340    }
1341
1342    struct tu_image_view dst, src;
1343
1344    if (use_staging_blit) {
1345       tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1346       tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1347
1348       struct tu_image staging_image = {
1349          .vk_format = src_format,
1350          .type = src_image->type,
1351          .tiling = VK_IMAGE_TILING_LINEAR,
1352          .extent = extent,
1353          .level_count = 1,
1354          .layer_count = info->srcSubresource.layerCount,
1355          .samples = src_image->samples,
1356          .bo_offset = 0,
1357       };
1358
1359       VkImageSubresourceLayers staging_subresource = {
1360          .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1361          .mipLevel = 0,
1362          .baseArrayLayer = 0,
1363          .layerCount = info->srcSubresource.layerCount,
1364       };
1365
1366       VkOffset3D staging_offset = { 0 };
1367
1368       staging_image.layout[0].tile_mode = TILE6_LINEAR;
1369       staging_image.layout[0].ubwc = false;
1370
1371       fdl6_layout(&staging_image.layout[0],
1372                   vk_format_to_pipe_format(staging_image.vk_format),
1373                   staging_image.samples,
1374                   staging_image.extent.width,
1375                   staging_image.extent.height,
1376                   staging_image.extent.depth,
1377                   staging_image.level_count,
1378                   staging_image.layer_count,
1379                   staging_image.type == VK_IMAGE_TYPE_3D,
1380                   NULL);
1381
1382       VkResult result = tu_get_scratch_bo(cmd->device,
1383                                           staging_image.layout[0].size,
1384                                           &staging_image.bo);
1385       if (result != VK_SUCCESS) {
1386          cmd->record_result = result;
1387          return;
1388       }
1389
1390       tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1391                      MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1392
1393       struct tu_image_view staging;
1394       tu_image_view_copy(&staging, &staging_image, src_format,
1395                          &staging_subresource, 0, false);
1396
1397       ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1398       coords(ops, cs, &staging_offset, &src_offset, &extent);
1399
1400       for (uint32_t i = 0; i < info->extent.depth; i++) {
1401          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1402          ops->dst(cs, &staging, i);
1403          ops->run(cmd, cs);
1404       }
1405
1406       /* When executed by the user there has to be a pipeline barrier here,
1407        * but since we're doing it manually we'll have to flush ourselves.
1408        */
1409       tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1410       tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1411
1412       tu_image_view_copy(&staging, &staging_image, dst_format,
1413                          &staging_subresource, 0, false);
1414
1415       ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask,
1416                  ROTATE_0, false, dst_image->layout[0].ubwc);
1417       coords(ops, cs, &dst_offset, &staging_offset, &extent);
1418
1419       for (uint32_t i = 0; i < info->extent.depth; i++) {
1420          ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1421          ops->dst(cs, &dst, i);
1422          ops->run(cmd, cs);
1423       }
1424    } else {
1425       tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1426       tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1427
1428       ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
1429                  ROTATE_0, false, dst_image->layout[0].ubwc);
1430       coords(ops, cs, &dst_offset, &src_offset, &extent);
1431
1432       for (uint32_t i = 0; i < info->extent.depth; i++) {
1433          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1434          ops->dst(cs, &dst, i);
1435          ops->run(cmd, cs);
1436       }
1437    }
1438 }
1439
1440 void
1441 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1442                 VkImage srcImage,
1443                 VkImageLayout srcImageLayout,
1444                 VkImage destImage,
1445                 VkImageLayout destImageLayout,
1446                 uint32_t regionCount,
1447                 const VkImageCopy *pRegions)
1448 {
1449    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1450    TU_FROM_HANDLE(tu_image, src_image, srcImage);
1451    TU_FROM_HANDLE(tu_image, dst_image, destImage);
1452
1453    tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1454    tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1455
1456    for (uint32_t i = 0; i < regionCount; ++i)
1457       tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1458 }
1459
1460 static void
1461 copy_buffer(struct tu_cmd_buffer *cmd,
1462             uint64_t dst_va,
1463             uint64_t src_va,
1464             uint64_t size,
1465             uint32_t block_size)
1466 {
1467    const struct blit_ops *ops = &r2d_ops;
1468    struct tu_cs *cs = &cmd->cs;
1469    VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1470    uint64_t blocks = size / block_size;
1471
1472    ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1473
1474    while (blocks) {
1475       uint32_t src_x = (src_va & 63) / block_size;
1476       uint32_t dst_x = (dst_va & 63) / block_size;
1477       uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1478
1479       ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1480       ops->dst_buffer(     cs, format, dst_va & ~63, 0);
1481       ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1482       ops->run(cmd, cs);
1483
1484       src_va += width * block_size;
1485       dst_va += width * block_size;
1486       blocks -= width;
1487    }
1488 }
1489
1490 void
1491 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1492                  VkBuffer srcBuffer,
1493                  VkBuffer dstBuffer,
1494                  uint32_t regionCount,
1495                  const VkBufferCopy *pRegions)
1496 {
1497    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1498    TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1499    TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1500
1501    tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1502    tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1503
1504    for (unsigned i = 0; i < regionCount; ++i) {
1505       copy_buffer(cmd,
1506                   tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1507                   tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1508                   pRegions[i].size, 1);
1509    }
1510 }
1511
1512 void
1513 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1514                    VkBuffer dstBuffer,
1515                    VkDeviceSize dstOffset,
1516                    VkDeviceSize dataSize,
1517                    const void *pData)
1518 {
1519    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1520    TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1521
1522    tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1523
1524    struct tu_cs_memory tmp;
1525    VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1526    if (result != VK_SUCCESS) {
1527       cmd->record_result = result;
1528       return;
1529    }
1530
1531    memcpy(tmp.map, pData, dataSize);
1532    copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1533 }
1534
1535 void
1536 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1537                  VkBuffer dstBuffer,
1538                  VkDeviceSize dstOffset,
1539                  VkDeviceSize fillSize,
1540                  uint32_t data)
1541 {
1542    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1543    TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1544    const struct blit_ops *ops = &r2d_ops;
1545    struct tu_cs *cs = &cmd->cs;
1546
1547    tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1548
1549    if (fillSize == VK_WHOLE_SIZE)
1550       fillSize = buffer->size - dstOffset;
1551
1552    uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1553    uint32_t blocks = fillSize / 4;
1554
1555    ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true, false);
1556    ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1557
1558    while (blocks) {
1559       uint32_t dst_x = (dst_va & 63) / 4;
1560       uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1561
1562       ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1563       ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1564       ops->run(cmd, cs);
1565
1566       dst_va += width * 4;
1567       blocks -= width;
1568    }
1569 }
1570
1571 void
1572 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1573                    VkImage srcImage,
1574                    VkImageLayout srcImageLayout,
1575                    VkImage dstImage,
1576                    VkImageLayout dstImageLayout,
1577                    uint32_t regionCount,
1578                    const VkImageResolve *pRegions)
1579 {
1580    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1581    TU_FROM_HANDLE(tu_image, src_image, srcImage);
1582    TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1583    const struct blit_ops *ops = &r2d_ops;
1584    struct tu_cs *cs = &cmd->cs;
1585
1586    tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1587    tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1588
1589    ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
1590               ROTATE_0, false, dst_image->layout[0].ubwc);
1591
1592    for (uint32_t i = 0; i < regionCount; ++i) {
1593       const VkImageResolve *info = &pRegions[i];
1594       uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1595
1596       assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1597       /* TODO: aspect masks possible ? */
1598
1599       coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1600
1601       struct tu_image_view dst, src;
1602       tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1603       tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1604
1605       for (uint32_t i = 0; i < layers; i++) {
1606          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1607          ops->dst(cs, &dst, i);
1608          ops->run(cmd, cs);
1609       }
1610    }
1611 }
1612
1613 void
1614 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1615                   struct tu_cs *cs,
1616                   struct tu_image_view *src,
1617                   struct tu_image_view *dst,
1618                   uint32_t layers,
1619                   const VkRect2D *rect)
1620 {
1621    const struct blit_ops *ops = &r2d_ops;
1622
1623    tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1624    tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1625
1626    assert(src->image->vk_format == dst->image->vk_format);
1627
1628    ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
1629               ROTATE_0, false, dst->ubwc_enabled);
1630    ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1631
1632    for (uint32_t i = 0; i < layers; i++) {
1633       ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1634       ops->dst(cs, dst, i);
1635       ops->run(cmd, cs);
1636    }
1637 }
1638
1639 static void
1640 clear_image(struct tu_cmd_buffer *cmd,
1641             struct tu_image *image,
1642             const VkClearValue *clear_value,
1643             const VkImageSubresourceRange *range)
1644 {
1645    uint32_t level_count = tu_get_levelCount(image, range);
1646    uint32_t layer_count = tu_get_layerCount(image, range);
1647    struct tu_cs *cs = &cmd->cs;
1648    VkFormat format = image->vk_format;
1649    if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1650       format = VK_FORMAT_R32_UINT;
1651
1652    if (image->type == VK_IMAGE_TYPE_3D) {
1653       assert(layer_count == 1);
1654       assert(range->baseArrayLayer == 0);
1655    }
1656
1657    const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1658
1659    ops->setup(cmd, cs, format, range->aspectMask, ROTATE_0, true, image->layout[0].ubwc);
1660    ops->clear_value(cs, image->vk_format, clear_value);
1661
1662    for (unsigned j = 0; j < level_count; j++) {
1663       if (image->type == VK_IMAGE_TYPE_3D)
1664          layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1665
1666       ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1667                      u_minify(image->extent.width, range->baseMipLevel + j),
1668                      u_minify(image->extent.height, range->baseMipLevel + j)
1669                   });
1670
1671       struct tu_image_view dst;
1672       tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
1673          .aspectMask = range->aspectMask,
1674          .mipLevel = range->baseMipLevel + j,
1675          .baseArrayLayer = range->baseArrayLayer,
1676          .layerCount = 1,
1677       }, 0, false);
1678
1679       for (uint32_t i = 0; i < layer_count; i++) {
1680          ops->dst(cs, &dst, i);
1681          ops->run(cmd, cs);
1682       }
1683    }
1684 }
1685
1686 void
1687 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1688                       VkImage image_h,
1689                       VkImageLayout imageLayout,
1690                       const VkClearColorValue *pColor,
1691                       uint32_t rangeCount,
1692                       const VkImageSubresourceRange *pRanges)
1693 {
1694    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1695    TU_FROM_HANDLE(tu_image, image, image_h);
1696
1697    tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1698
1699    for (unsigned i = 0; i < rangeCount; i++)
1700       clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1701 }
1702
1703 void
1704 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1705                              VkImage image_h,
1706                              VkImageLayout imageLayout,
1707                              const VkClearDepthStencilValue *pDepthStencil,
1708                              uint32_t rangeCount,
1709                              const VkImageSubresourceRange *pRanges)
1710 {
1711    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1712    TU_FROM_HANDLE(tu_image, image, image_h);
1713
1714    tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1715
1716    for (unsigned i = 0; i < rangeCount; i++)
1717       clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1718 }
1719
1720 static void
1721 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1722                                uint32_t attachment_count,
1723                                const VkClearAttachment *attachments,
1724                                uint32_t rect_count,
1725                                const VkClearRect *rects)
1726 {
1727    const struct tu_subpass *subpass = cmd->state.subpass;
1728    /* note: cannot use shader path here.. there is a special shader path
1729     * in tu_clear_sysmem_attachments()
1730     */
1731    const struct blit_ops *ops = &r2d_ops;
1732    struct tu_cs *cs = &cmd->draw_cs;
1733
1734    for (uint32_t j = 0; j < attachment_count; j++) {
1735          /* The vulkan spec, section 17.2 "Clearing Images Inside a Render
1736           * Pass Instance" says that:
1737           *
1738           *     Unlike other clear commands, vkCmdClearAttachments executes as
1739           *     a drawing command, rather than a transfer command, with writes
1740           *     performed by it executing in rasterization order. Clears to
1741           *     color attachments are executed as color attachment writes, by
1742           *     the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1743           *     Clears to depth/stencil attachments are executed as depth
1744           *     writes and writes by the
1745           *     VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1746           *     VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1747           *
1748           * However, the 2d path here is executed the same way as a
1749           * transfer command, using the CCU color cache exclusively with
1750           * a special depth-as-color format for depth clears. This means that
1751           * we can't rely on the normal pipeline barrier mechanism here, and
1752           * have to manually flush whenever using a different cache domain
1753           * from what the 3d path would've used. This happens when we clear
1754           * depth/stencil, since normally depth attachments use CCU depth, but
1755           * we clear it using a special depth-as-color format. Since the clear
1756           * potentially uses a different attachment state we also need to
1757           * invalidate color beforehand and flush it afterwards.
1758           */
1759
1760          uint32_t a;
1761          if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1762             a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1763             tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1764          } else {
1765             a = subpass->depth_stencil_attachment.attachment;
1766             tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1767             tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1768             tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1769          }
1770
1771          if (a == VK_ATTACHMENT_UNUSED)
1772                continue;
1773
1774          const struct tu_image_view *iview =
1775             cmd->state.framebuffer->attachments[a].attachment;
1776
1777          ops->setup(cmd, cs, iview->image->vk_format, attachments[j].aspectMask,
1778                     ROTATE_0, true, iview->ubwc_enabled);
1779          ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1780
1781          /* Wait for the flushes we triggered manually to complete */
1782          tu_cs_emit_wfi(cs);
1783
1784          for (uint32_t i = 0; i < rect_count; i++) {
1785             ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1786             for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1787                ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1788                ops->run(cmd, cs);
1789             }
1790          }
1791
1792          if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1793             tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1794             tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1795          } else {
1796             /* sync color into depth */
1797             tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1798             tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
1799          }
1800    }
1801 }
1802
1803 static void
1804 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1805                             uint32_t attachment_count,
1806                             const VkClearAttachment *attachments,
1807                             uint32_t rect_count,
1808                             const VkClearRect *rects)
1809 {
1810    /* the shader path here is special, it avoids changing MRT/etc state */
1811    const struct tu_render_pass *pass = cmd->state.pass;
1812    const struct tu_subpass *subpass = cmd->state.subpass;
1813    const uint32_t mrt_count = subpass->color_count;
1814    struct tu_cs *cs = &cmd->draw_cs;
1815    uint32_t clear_value[MAX_RTS][4];
1816    float z_clear_val = 0.0f;
1817    uint8_t s_clear_val = 0;
1818    uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1819    bool z_clear = false;
1820    bool s_clear = false;
1821    bool layered_clear = false;
1822    uint32_t max_samples = 1;
1823
1824    for (uint32_t i = 0; i < attachment_count; i++) {
1825       uint32_t a;
1826       if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1827          uint32_t c = attachments[i].colorAttachment;
1828          a = subpass->color_attachments[c].attachment;
1829          if (a == VK_ATTACHMENT_UNUSED)
1830             continue;
1831
1832          clear_rts |= 1 << c;
1833          clear_components |= 0xf << (c * 4);
1834          memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1835       } else {
1836          a = subpass->depth_stencil_attachment.attachment;
1837          if (a == VK_ATTACHMENT_UNUSED)
1838             continue;
1839
1840          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1841             z_clear = true;
1842             z_clear_val = attachments[i].clearValue.depthStencil.depth;
1843          }
1844
1845          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1846             s_clear = true;
1847             s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1848          }
1849       }
1850
1851       max_samples = MAX2(max_samples, pass->attachments[a].samples);
1852    }
1853
1854    /* prefer to use 2D path for clears
1855     * 2D can't clear separate depth/stencil and msaa, needs known framebuffer
1856     */
1857    if (max_samples == 1 && cmd->state.framebuffer) {
1858       tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
1859       return;
1860    }
1861
1862    /* This clear path behaves like a draw, needs the same flush as tu_draw */
1863    tu_emit_cache_flush_renderpass(cmd, cs);
1864
1865    /* disable all draw states so they don't interfere
1866     * TODO: use and re-use draw states for this path
1867     * we have to disable draw states individually to preserve
1868     * input attachment states, because a secondary command buffer
1869     * won't be able to restore them
1870     */
1871    tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
1872    for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
1873       if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
1874           i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
1875          continue;
1876       tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
1877                      CP_SET_DRAW_STATE__0_DISABLE);
1878       tu_cs_emit_qw(cs, 0);
1879    }
1880    cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1881
1882    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1883    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1884                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1885                   0xfc000000);
1886    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1887
1888    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1889    for (uint32_t i = 0; i < mrt_count; i++) {
1890       if (clear_rts & (1 << i))
1891          tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1892       else
1893          tu_cs_emit(cs, 0);
1894    }
1895
1896    for (uint32_t i = 0; i < rect_count; i++) {
1897       if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
1898          layered_clear = true;
1899    }
1900
1901    r3d_common(cmd, cs, false, num_rts, layered_clear);
1902
1903    tu_cs_emit_regs(cs,
1904                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
1905    tu_cs_emit_regs(cs,
1906                    A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
1907
1908    tu_cs_emit_regs(cs,
1909                    A6XX_RB_FS_OUTPUT_CNTL0(),
1910                    A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1911
1912    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1913    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1914    tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1915    for (uint32_t i = 0; i < mrt_count; i++) {
1916       tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1917             .component_enable = COND(clear_rts & (1 << i), 0xf)));
1918    }
1919
1920    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1921    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1922          .z_enable = z_clear,
1923          .z_write_enable = z_clear,
1924          .zfunc = FUNC_ALWAYS));
1925    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1926    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1927          .stencil_enable = s_clear,
1928          .func = FUNC_ALWAYS,
1929          .zpass = STENCIL_REPLACE));
1930    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1931    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1932    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1933
1934    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1935    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1936                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1937                   CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1938                   CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1939                   CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1940    tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1941    tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1942    for_each_bit(b, clear_rts)
1943       tu_cs_emit_array(cs, clear_value[b], 4);
1944
1945    for (uint32_t i = 0; i < rect_count; i++) {
1946       for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1947          r3d_coords_raw(cs, (float[]) {
1948             rects[i].rect.offset.x, rects[i].rect.offset.y,
1949             z_clear_val, uif(rects[i].baseArrayLayer + layer),
1950             rects[i].rect.offset.x + rects[i].rect.extent.width,
1951             rects[i].rect.offset.y + rects[i].rect.extent.height,
1952             z_clear_val, 1.0f,
1953          });
1954          r3d_run(cmd, cs);
1955       }
1956    }
1957 }
1958
1959 static void
1960 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
1961 {
1962    enum pipe_format pformat = vk_format_to_pipe_format(format);
1963
1964    switch (format) {
1965    case VK_FORMAT_X8_D24_UNORM_PACK32:
1966    case VK_FORMAT_D24_UNORM_S8_UINT:
1967       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1968                        val->depthStencil.stencil << 24;
1969       return;
1970    case VK_FORMAT_D16_UNORM:
1971       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1972       return;
1973    case VK_FORMAT_D32_SFLOAT:
1974       clear_value[0] = fui(val->depthStencil.depth);
1975       return;
1976    case VK_FORMAT_S8_UINT:
1977       clear_value[0] = val->depthStencil.stencil;
1978       return;
1979    /* these formats use a different base format when tiled
1980     * the same format can be used for both because GMEM is always in WZYX order
1981     */
1982    case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1983    case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1984       pformat = PIPE_FORMAT_B5G5R5A1_UNORM;
1985    default:
1986       break;
1987    }
1988
1989    VkClearColorValue color;
1990
1991    /**
1992     * GMEM is tiled and wants the components in WZYX order,
1993     * apply swizzle to the color before packing, to counteract
1994     * deswizzling applied by packing functions
1995     */
1996    pipe_swizzle_4f(color.float32, val->color.float32,
1997                    util_format_description(pformat)->swizzle);
1998
1999    util_format_pack_rgba(pformat, clear_value, color.uint32, 1);
2000 }
2001
2002 static void
2003 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2004                               struct tu_cs *cs,
2005                               uint32_t attachment,
2006                               VkImageAspectFlags mask,
2007                               const VkClearValue *value)
2008 {
2009    VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2010
2011
2012    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2013    tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2014
2015    tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1,
2016       .clear_mask = aspect_write_mask(vk_format, mask)));
2017
2018    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2019    tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2020
2021    tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2022    tu_cs_emit(cs, 0);
2023
2024    uint32_t clear_vals[4] = {};
2025    pack_gmem_clear_value(value, vk_format, clear_vals);
2026
2027    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2028    tu_cs_emit_array(cs, clear_vals, 4);
2029
2030    tu6_emit_event_write(cmd, cs, BLIT);
2031 }
2032
2033 static void
2034 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2035                           uint32_t attachment_count,
2036                           const VkClearAttachment *attachments,
2037                           uint32_t rect_count,
2038                           const VkClearRect *rects)
2039 {
2040    const struct tu_subpass *subpass = cmd->state.subpass;
2041    struct tu_cs *cs = &cmd->draw_cs;
2042
2043    /* TODO: swap the loops for smaller cmdstream */
2044    for (unsigned i = 0; i < rect_count; i++) {
2045       unsigned x1 = rects[i].rect.offset.x;
2046       unsigned y1 = rects[i].rect.offset.y;
2047       unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2048       unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2049
2050       tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2051       tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2052       tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2053
2054       for (unsigned j = 0; j < attachment_count; j++) {
2055          uint32_t a;
2056          if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2057             a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2058          else
2059             a = subpass->depth_stencil_attachment.attachment;
2060
2061          if (a == VK_ATTACHMENT_UNUSED)
2062                continue;
2063
2064          tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2065                                        &attachments[j].clearValue);
2066       }
2067    }
2068 }
2069
2070 void
2071 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2072                        uint32_t attachmentCount,
2073                        const VkClearAttachment *pAttachments,
2074                        uint32_t rectCount,
2075                        const VkClearRect *pRects)
2076 {
2077    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2078    struct tu_cs *cs = &cmd->draw_cs;
2079
2080    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2081    tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2082    tu_cond_exec_end(cs);
2083
2084    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2085    tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2086    tu_cond_exec_end(cs);
2087 }
2088
2089 void
2090 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2091                            struct tu_cs *cs,
2092                            uint32_t a,
2093                            const VkRenderPassBeginInfo *info)
2094 {
2095    const struct tu_framebuffer *fb = cmd->state.framebuffer;
2096    const struct tu_image_view *iview = fb->attachments[a].attachment;
2097    const struct tu_render_pass_attachment *attachment =
2098       &cmd->state.pass->attachments[a];
2099
2100    if (!attachment->clear_mask)
2101       return;
2102
2103    const struct blit_ops *ops = &r2d_ops;
2104    if (attachment->samples > 1)
2105       ops = &r3d_ops;
2106
2107    ops->setup(cmd, cs, attachment->format, attachment->clear_mask, ROTATE_0,
2108               true, iview->ubwc_enabled);
2109    ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2110    ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2111
2112    /* Wait for any flushes at the beginning of the renderpass to complete */
2113    tu_cs_emit_wfi(cs);
2114
2115    for (uint32_t i = 0; i < fb->layers; i++) {
2116       ops->dst(cs, iview, i);
2117       ops->run(cmd, cs);
2118    }
2119
2120    /* The spec doesn't explicitly say, but presumably the initial renderpass
2121     * clear is considered part of the renderpass, and therefore barriers
2122     * aren't required inside the subpass/renderpass.  Therefore we need to
2123     * flush CCU color into CCU depth here, just like with
2124     * vkCmdClearAttachments(). Note that because this only happens at the
2125     * beginning of a renderpass, and renderpass writes are considered
2126     * "incoherent", we shouldn't have to worry about syncing depth into color
2127     * beforehand as depth should already be flushed.
2128     */
2129    if (vk_format_is_depth_or_stencil(attachment->format)) {
2130       tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2131       tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2132    } else {
2133       tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2134       tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2135    }
2136 }
2137
2138 void
2139 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2140                          struct tu_cs *cs,
2141                          uint32_t a,
2142                          const VkRenderPassBeginInfo *info)
2143 {
2144    const struct tu_render_pass_attachment *attachment =
2145       &cmd->state.pass->attachments[a];
2146
2147    if (!attachment->clear_mask)
2148       return;
2149
2150    tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2151
2152    tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
2153                                  &info->pClearValues[a]);
2154 }
2155
2156 static void
2157 tu_emit_blit(struct tu_cmd_buffer *cmd,
2158              struct tu_cs *cs,
2159              const struct tu_image_view *iview,
2160              const struct tu_render_pass_attachment *attachment,
2161              bool resolve)
2162 {
2163    tu_cs_emit_regs(cs,
2164                    A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2165
2166    tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2167       .unk0 = !resolve,
2168       .gmem = !resolve,
2169       /* "integer" bit disables msaa resolve averaging */
2170       .integer = vk_format_is_int(attachment->format)));
2171
2172    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2173    tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2174    tu_cs_image_ref_2d(cs, iview, 0, false);
2175
2176    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2177    tu_cs_image_flag_ref(cs, iview, 0);
2178
2179    tu_cs_emit_regs(cs,
2180                    A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2181
2182    tu6_emit_event_write(cmd, cs, BLIT);
2183 }
2184
2185 static bool
2186 blit_can_resolve(VkFormat format)
2187 {
2188    const struct util_format_description *desc = vk_format_description(format);
2189
2190    /* blit event can only do resolve for simple cases:
2191     * averaging samples as unsigned integers or choosing only one sample
2192     */
2193    if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2194       return false;
2195
2196    /* can't do formats with larger channel sizes
2197     * note: this includes all float formats
2198     * note2: single channel integer formats seem OK
2199     */
2200    if (desc->channel[0].size > 10)
2201       return false;
2202
2203    switch (format) {
2204    /* for unknown reasons blit event can't msaa resolve these formats when tiled
2205     * likely related to these formats having different layout from other cpp=2 formats
2206     */
2207    case VK_FORMAT_R8G8_UNORM:
2208    case VK_FORMAT_R8G8_UINT:
2209    case VK_FORMAT_R8G8_SINT:
2210    /* TODO: this one should be able to work? */
2211    case VK_FORMAT_D24_UNORM_S8_UINT:
2212       return false;
2213    default:
2214       break;
2215    }
2216
2217    return true;
2218 }
2219
2220 void
2221 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2222                         struct tu_cs *cs,
2223                         uint32_t a,
2224                         bool force_load)
2225 {
2226    const struct tu_image_view *iview =
2227       cmd->state.framebuffer->attachments[a].attachment;
2228    const struct tu_render_pass_attachment *attachment =
2229       &cmd->state.pass->attachments[a];
2230
2231    if (attachment->load || force_load)
2232       tu_emit_blit(cmd, cs, iview, attachment, false);
2233 }
2234
2235 void
2236 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2237                          struct tu_cs *cs,
2238                          uint32_t a,
2239                          uint32_t gmem_a)
2240 {
2241    const struct tu_framebuffer *fb = cmd->state.framebuffer;
2242    const VkRect2D *render_area = &cmd->state.render_area;
2243    struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2244    struct tu_image_view *iview = fb->attachments[a].attachment;
2245    struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2246
2247    if (!dst->store)
2248       return;
2249
2250    uint32_t x1 = render_area->offset.x;
2251    uint32_t y1 = render_area->offset.y;
2252    uint32_t x2 = x1 + render_area->extent.width;
2253    uint32_t y2 = y1 + render_area->extent.height;
2254    /* x2/y2 can be unaligned if equal to the size of the image,
2255     * since it will write into padding space
2256     * the one exception is linear levels which don't have the
2257     * required y padding in the layout (except for the last level)
2258     */
2259    bool need_y2_align =
2260       y2 != iview->extent.height || iview->need_y2_align;
2261
2262    bool unaligned =
2263       x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2264       y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2265
2266    /* use fast path when render area is aligned, except for unsupported resolve cases */
2267    if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2268       tu_emit_blit(cmd, cs, iview, src, true);
2269       return;
2270    }
2271
2272    if (dst->samples > 1) {
2273       /* I guess we need to use shader path in this case?
2274        * need a testcase which fails because of this
2275        */
2276       tu_finishme("unaligned store of msaa attachment\n");
2277       return;
2278    }
2279
2280    r2d_setup_common(cmd, cs, dst->format, VK_IMAGE_ASPECT_COLOR_BIT,
2281                     ROTATE_0, false, iview->ubwc_enabled, true);
2282    r2d_dst(cs, iview, 0);
2283    r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2284
2285    tu_cs_emit_regs(cs,
2286                    A6XX_SP_PS_2D_SRC_INFO(
2287                       .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2288                       .tile_mode = TILE6_2,
2289                       .srgb = vk_format_is_srgb(src->format),
2290                       .samples = tu_msaa_samples(src->samples),
2291                       .samples_average = !vk_format_is_int(src->format),
2292                       .unk20 = 1,
2293                       .unk22 = 1),
2294                    /* note: src size does not matter when not scaling */
2295                    A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2296                    A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2297                    A6XX_SP_PS_2D_SRC_HI(),
2298                    A6XX_SP_PS_2D_SRC_PITCH(.pitch = fb->tile0.width * src->cpp));
2299
2300    /* sync GMEM writes with CACHE. */
2301    tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2302
2303    /* Wait for CACHE_INVALIDATE to land */
2304    tu_cs_emit_wfi(cs);
2305
2306    tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2307    tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2308
2309    /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2310     * sysmem, and we generally assume that GMEM renderpasses leave their
2311     * results in sysmem, so we need to flush manually here.
2312     */
2313    tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2314 }