1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 /* helper functions previously in tu_formats.c */
20
21 static uint32_t
22 tu_pack_mask(int bits)
23 {
24 assert(bits <= 32);
25 return (1ull << bits) - 1;
26 }
27
28 static uint32_t
29 tu_pack_float32_for_unorm(float val, int bits)
30 {
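/* Worked example (derivable from the code below): tu_pack_float32_for_unorm(0.5f, 8)
 * gives max = 255, 0.5 * 255 = 127.5, which rounds-to-even to 128 (0x80);
 * out-of-range values clamp to [0, max].
 */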
31 const uint32_t max = tu_pack_mask(bits);
32 if (val < 0.0f)
33 return 0;
34 else if (val > 1.0f)
35 return max;
36 else
37 return _mesa_lroundevenf(val * (float) max);
38 }
39
40 static uint32_t
41 tu_pack_float32_for_snorm(float val, int bits)
42 {
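/* Worked example: tu_pack_float32_for_snorm(-1.0f, 8) gives max = 127, so the
 * result is -127 masked to 8 bits, i.e. 0x81 (the symmetric SNORM encoding of -1.0).
 */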
43 const int32_t max = tu_pack_mask(bits - 1);
44 int32_t tmp;
45 if (val < -1.0f)
46 tmp = -max;
47 else if (val > 1.0f)
48 tmp = max;
49 else
50 tmp = _mesa_lroundevenf(val * (float) max);
51
52 return tmp & tu_pack_mask(bits);
53 }
54
55 static uint32_t
56 tu_pack_float32_for_uscaled(float val, int bits)
57 {
58 const uint32_t max = tu_pack_mask(bits);
59 if (val < 0.0f)
60 return 0;
61 else if (val > (float) max)
62 return max;
63 else
64 return (uint32_t) val;
65 }
66
67 static uint32_t
68 tu_pack_float32_for_sscaled(float val, int bits)
69 {
70 const int32_t max = tu_pack_mask(bits - 1);
71 const int32_t min = -max - 1;
72 int32_t tmp;
73 if (val < (float) min)
74 tmp = min;
75 else if (val > (float) max)
76 tmp = max;
77 else
78 tmp = (int32_t) val;
79
80 return tmp & tu_pack_mask(bits);
81 }
82
83 static uint32_t
84 tu_pack_uint32_for_uint(uint32_t val, int bits)
85 {
86 return val & tu_pack_mask(bits);
87 }
88
89 static uint32_t
90 tu_pack_int32_for_sint(int32_t val, int bits)
91 {
92 return val & tu_pack_mask(bits);
93 }
94
95 static uint32_t
96 tu_pack_float32_for_sfloat(float val, int bits)
97 {
98 assert(bits == 16 || bits == 32);
99 return bits == 16 ? util_float_to_half(val) : fui(val);
100 }
101
102 union tu_clear_component_value {
103 float float32;
104 int32_t int32;
105 uint32_t uint32;
106 };
107
108 static uint32_t
109 tu_pack_clear_component_value(union tu_clear_component_value val,
110 const struct util_format_channel_description *ch)
111 {
112 uint32_t packed;
113
114 switch (ch->type) {
115 case UTIL_FORMAT_TYPE_UNSIGNED:
116 /* normalized, scaled, or pure integer */
117 if (ch->normalized)
118 packed = tu_pack_float32_for_unorm(val.float32, ch->size);
119 else if (ch->pure_integer)
120 packed = tu_pack_uint32_for_uint(val.uint32, ch->size);
121 else
122 packed = tu_pack_float32_for_uscaled(val.float32, ch->size);
123 break;
124 case UTIL_FORMAT_TYPE_SIGNED:
125 /* normalized, scaled, or pure integer */
126 if (ch->normalized)
127 packed = tu_pack_float32_for_snorm(val.float32, ch->size);
128 else if (ch->pure_integer)
129 packed = tu_pack_int32_for_sint(val.int32, ch->size);
130 else
131 packed = tu_pack_float32_for_sscaled(val.float32, ch->size);
132 break;
133 case UTIL_FORMAT_TYPE_FLOAT:
134 packed = tu_pack_float32_for_sfloat(val.float32, ch->size);
135 break;
136 default:
137 unreachable("unexpected channel type");
138 packed = 0;
139 break;
140 }
141
142 assert((packed & tu_pack_mask(ch->size)) == packed);
143 return packed;
144 }
145
146 static const struct util_format_channel_description *
147 tu_get_format_channel_description(const struct util_format_description *desc,
148 int comp)
149 {
150 switch (desc->swizzle[comp]) {
151 case PIPE_SWIZZLE_X:
152 return &desc->channel[0];
153 case PIPE_SWIZZLE_Y:
154 return &desc->channel[1];
155 case PIPE_SWIZZLE_Z:
156 return &desc->channel[2];
157 case PIPE_SWIZZLE_W:
158 return &desc->channel[3];
159 default:
160 return NULL;
161 }
162 }
163
164 static union tu_clear_component_value
165 tu_get_clear_component_value(const VkClearValue *val, int comp,
166 enum util_format_colorspace colorspace)
167 {
168 assert(comp < 4);
169
170 union tu_clear_component_value tmp;
171 switch (colorspace) {
172 case UTIL_FORMAT_COLORSPACE_ZS:
173 assert(comp < 2);
174 if (comp == 0)
175 tmp.float32 = val->depthStencil.depth;
176 else
177 tmp.uint32 = val->depthStencil.stencil;
178 break;
179 case UTIL_FORMAT_COLORSPACE_SRGB:
180 if (comp < 3) {
181 tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]);
182 break;
183 }
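/* alpha is not sRGB-encoded: fall through to the default handling below */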
184 default:
185 assert(comp < 4);
186 tmp.uint32 = val->color.uint32[comp];
187 break;
188 }
189
190 return tmp;
191 }
192
193 /* r2d_ = BLIT_OP_SCALE operations */
194
195 static enum a6xx_2d_ifmt
196 format_to_ifmt(enum a6xx_format fmt)
197 {
198 switch (fmt) {
199 case FMT6_A8_UNORM:
200 case FMT6_8_UNORM:
201 case FMT6_8_SNORM:
202 case FMT6_8_8_UNORM:
203 case FMT6_8_8_SNORM:
204 case FMT6_8_8_8_8_UNORM:
205 case FMT6_8_8_8_X8_UNORM:
206 case FMT6_8_8_8_8_SNORM:
207 case FMT6_4_4_4_4_UNORM:
208 case FMT6_5_5_5_1_UNORM:
209 case FMT6_5_6_5_UNORM:
210 case FMT6_Z24_UNORM_S8_UINT:
211 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
212 return R2D_UNORM8;
213
214 case FMT6_32_UINT:
215 case FMT6_32_SINT:
216 case FMT6_32_32_UINT:
217 case FMT6_32_32_SINT:
218 case FMT6_32_32_32_32_UINT:
219 case FMT6_32_32_32_32_SINT:
220 return R2D_INT32;
221
222 case FMT6_16_UINT:
223 case FMT6_16_SINT:
224 case FMT6_16_16_UINT:
225 case FMT6_16_16_SINT:
226 case FMT6_16_16_16_16_UINT:
227 case FMT6_16_16_16_16_SINT:
228 case FMT6_10_10_10_2_UINT:
229 return R2D_INT16;
230
231 case FMT6_8_UINT:
232 case FMT6_8_SINT:
233 case FMT6_8_8_UINT:
234 case FMT6_8_8_SINT:
235 case FMT6_8_8_8_8_UINT:
236 case FMT6_8_8_8_8_SINT:
237 return R2D_INT8;
238
239 case FMT6_16_UNORM:
240 case FMT6_16_SNORM:
241 case FMT6_16_16_UNORM:
242 case FMT6_16_16_SNORM:
243 case FMT6_16_16_16_16_UNORM:
244 case FMT6_16_16_16_16_SNORM:
245 case FMT6_32_FLOAT:
246 case FMT6_32_32_FLOAT:
247 case FMT6_32_32_32_32_FLOAT:
248 return R2D_FLOAT32;
249
250 case FMT6_16_FLOAT:
251 case FMT6_16_16_FLOAT:
252 case FMT6_16_16_16_16_FLOAT:
253 case FMT6_11_11_10_FLOAT:
254 case FMT6_10_10_10_2_UNORM:
255 case FMT6_10_10_10_2_UNORM_DEST:
256 return R2D_FLOAT16;
257
258 default:
259 unreachable("bad format");
260 return 0;
261 }
262 }
263
264 static void
265 r2d_coords(struct tu_cs *cs,
266 const VkOffset2D *dst,
267 const VkOffset2D *src,
268 const VkExtent2D *extent)
269 {
270 tu_cs_emit_regs(cs,
271 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
272 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
273
274 if (!src)
275 return;
276
277 tu_cs_emit_regs(cs,
278 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
279 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
280 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
281 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
282 }
283
284 static void
285 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
286 {
287 uint32_t clear_value[4] = {};
288
289 switch (format) {
290 case VK_FORMAT_X8_D24_UNORM_PACK32:
291 case VK_FORMAT_D24_UNORM_S8_UINT:
292 /* cleared as r8g8b8a8_unorm using special format */
293 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
294 clear_value[1] = clear_value[0] >> 8;
295 clear_value[2] = clear_value[0] >> 16;
296 clear_value[3] = val->depthStencil.stencil;
297 break;
298 case VK_FORMAT_D16_UNORM:
299 case VK_FORMAT_D32_SFLOAT:
300 /* R2D_FLOAT32 */
301 clear_value[0] = fui(val->depthStencil.depth);
302 break;
303 case VK_FORMAT_S8_UINT:
304 clear_value[0] = val->depthStencil.stencil;
305 break;
306 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
307 /* cleared as UINT32 */
308 clear_value[0] = float3_to_rgb9e5(val->color.float32);
309 break;
310 default:
311 assert(!vk_format_is_depth_or_stencil(format));
312 const struct util_format_description *desc = vk_format_description(format);
313 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
314
315 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
316 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
317
318 for (unsigned i = 0; i < desc->nr_channels; i++) {
319 const struct util_format_channel_description *ch = &desc->channel[i];
320 if (ifmt == R2D_UNORM8) {
321 float linear = val->color.float32[i];
322 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
323 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
324
325 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
326 clear_value[i] = tu_pack_float32_for_snorm(linear, 8);
327 else
328 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
329 } else if (ifmt == R2D_FLOAT16) {
330 clear_value[i] = util_float_to_half(val->color.float32[i]);
331 } else {
332 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
333 ifmt == R2D_INT16 || ifmt == R2D_INT8);
334 clear_value[i] = val->color.uint32[i];
335 }
336 }
337 break;
338 }
339
340 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
341 tu_cs_emit_array(cs, clear_value, 4);
342 }
343
344 static void
345 r2d_src(struct tu_cmd_buffer *cmd,
346 struct tu_cs *cs,
347 const struct tu_image_view *iview,
348 uint32_t layer,
349 bool linear_filter)
350 {
351 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
352 tu_cs_emit(cs, iview->SP_PS_2D_SRC_INFO |
353 COND(linear_filter, A6XX_SP_PS_2D_SRC_INFO_FILTER));
354 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
355 tu_cs_image_ref_2d(cs, iview, layer, true);
356
357 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
358 tu_cs_image_flag_ref(cs, iview, layer);
359 }
360
361 static void
362 r2d_src_buffer(struct tu_cmd_buffer *cmd,
363 struct tu_cs *cs,
364 VkFormat vk_format,
365 uint64_t va, uint32_t pitch,
366 uint32_t width, uint32_t height)
367 {
368 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
369
370 tu_cs_emit_regs(cs,
371 A6XX_SP_PS_2D_SRC_INFO(
372 .color_format = format.fmt,
373 .color_swap = format.swap,
374 .srgb = vk_format_is_srgb(vk_format),
375 .unk20 = 1,
376 .unk22 = 1),
377 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
378 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
379 A6XX_SP_PS_2D_SRC_HI(va >> 32),
380 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
381 }
382
383 static void
384 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
385 {
386 assert(iview->image->samples == 1);
387
388 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
389 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
390 tu_cs_image_ref_2d(cs, iview, layer, false);
391
392 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
393 tu_cs_image_flag_ref(cs, iview, layer);
394 }
395
396 static void
397 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
398 {
399 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
400
401 tu_cs_emit_regs(cs,
402 A6XX_RB_2D_DST_INFO(
403 .color_format = format.fmt,
404 .color_swap = format.swap,
405 .srgb = vk_format_is_srgb(vk_format)),
406 A6XX_RB_2D_DST_LO((uint32_t) va),
407 A6XX_RB_2D_DST_HI(va >> 32),
408 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
409 }
410
411 static void
412 r2d_setup_common(struct tu_cmd_buffer *cmd,
413 struct tu_cs *cs,
414 VkFormat vk_format,
415 enum a6xx_rotation rotation,
416 bool clear,
417 uint8_t mask,
418 bool scissor)
419 {
420 enum a6xx_format format = tu6_base_format(vk_format);
421 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
422 uint32_t unknown_8c01 = 0;
423
424 if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) {
425 /* preserve depth channels */
426 if (mask == 0x8)
427 unknown_8c01 = 0x00084001;
428 /* preserve stencil channel */
429 if (mask == 0x7)
430 unknown_8c01 = 0x08000041;
431 }
432
433 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
434 tu_cs_emit(cs, unknown_8c01);
435
436 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
437 .scissor = scissor,
438 .rotate = rotation,
439 .solid_color = clear,
440 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
441 .color_format = format,
442 .mask = 0xf,
443 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
444 ).value;
445
446 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
447 tu_cs_emit(cs, blit_cntl);
448
449 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
450 tu_cs_emit(cs, blit_cntl);
451
452 if (format == FMT6_10_10_10_2_UNORM_DEST)
453 format = FMT6_16_16_16_16_FLOAT;
454
455 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
456 .sint = vk_format_is_sint(vk_format),
457 .uint = vk_format_is_uint(vk_format),
458 .color_format = format,
459 .srgb = vk_format_is_srgb(vk_format),
460 .mask = 0xf));
461 }
462
463 static void
464 r2d_setup(struct tu_cmd_buffer *cmd,
465 struct tu_cs *cs,
466 VkFormat vk_format,
467 enum a6xx_rotation rotation,
468 bool clear,
469 uint8_t mask)
470 {
471 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
472
473 r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false);
474 }
475
476 static void
477 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
478 {
479 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
480 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
481 }
482
483 /* r3d_ = shader path operations */
484
485 static void
486 r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
487 bool layered_clear)
488 {
489 struct ir3_shader dummy_shader = {};
490
491 struct ir3_shader_variant vs = {
492 .type = MESA_SHADER_VERTEX,
493 .instrlen = 1,
494 .constlen = 2,
495 .info.max_reg = 1,
496 .inputs_count = 1,
497 .inputs[0] = {
498 .slot = SYSTEM_VALUE_VERTEX_ID,
499 .regid = regid(0, 3),
500 .sysval = true,
501 },
502 .outputs_count = blit ? 2 : 1,
503 .outputs[0] = {
504 .slot = VARYING_SLOT_POS,
505 .regid = regid(0, 0),
506 },
507 .outputs[1] = {
508 .slot = VARYING_SLOT_VAR0,
509 .regid = regid(1, 0),
510 },
511 .shader = &dummy_shader,
512 };
513 if (layered_clear) {
514 vs = (struct ir3_shader_variant) {
515 .type = MESA_SHADER_VERTEX,
516 .instrlen = 1,
517 .info.max_reg = 0,
518 .shader = &dummy_shader,
519 };
520 }
521
522 struct ir3_shader_variant fs = {
523 .type = MESA_SHADER_FRAGMENT,
524 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
525 .constlen = num_rts,
526 .info.max_reg = MAX2(num_rts, 1) - 1,
527 .total_in = blit ? 2 : 0,
528 .num_samp = blit ? 1 : 0,
529 .inputs_count = blit ? 2 : 0,
530 .inputs[0] = {
531 .slot = VARYING_SLOT_VAR0,
532 .inloc = 0,
533 .compmask = 3,
534 .bary = true,
535 },
536 .inputs[1] = {
537 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
538 .regid = regid(0, 0),
539 .sysval = 1,
540 },
541 .num_sampler_prefetch = blit ? 1 : 0,
542 .sampler_prefetch[0] = {
543 .src = 0,
544 .wrmask = 0xf,
545 .cmd = 4,
546 },
547 .shader = &dummy_shader,
548 };
549
550 struct ir3_shader_variant gs_shader = {
551 .type = MESA_SHADER_GEOMETRY,
552 .instrlen = 1,
553 .constlen = 2,
554 .info.max_reg = 1,
555 .inputs_count = 1,
556 .inputs[0] = {
557 .slot = SYSTEM_VALUE_GS_HEADER_IR3,
558 .regid = regid(0, 0),
559 .sysval = true,
560 },
561 .outputs_count = 3,
562 .outputs[0] = {
563 .slot = VARYING_SLOT_POS,
564 .regid = regid(0, 0),
565 },
566 .outputs[1] = {
567 .slot = VARYING_SLOT_LAYER,
568 .regid = regid(1, 1),
569 },
570 .outputs[2] = {
571 .slot = VARYING_SLOT_GS_VERTEX_FLAGS_IR3,
572 .regid = regid(1, 0),
573 },
574 .shader = &dummy_shader,
575 }, *gs = layered_clear ? &gs_shader : NULL;
576
577
578 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, args } }
579 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
580 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
581
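/* Hand-assembled vertex shader: r3d_coords_raw() loads two vec4 constants,
 * c0 = (dst.x, dst.y, src.x, src.y) for the first corner and c1 for the
 * second; the vertex id in r0.w selects between them, so the two RECTLIST
 * vertices get dst coords in the position and src coords in VAR0.
 */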
582 static const instr_t vs_code[] = {
583 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
584 * r1.xy = r0.w ? c1.zw : c0.zw
585 * r0.w = 1.0f
586 */
587 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
588 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
589 .src2 = 3,
590 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
591 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
592 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
593 .src2 = 3,
594 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
595 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
596 { .cat0 = { .opc = OPC_END } },
597 };
598
599 static const instr_t vs_layered[] = {
600 { .cat0 = { .opc = OPC_CHMASK } },
601 { .cat0 = { .opc = OPC_CHSH } },
602 };
603
604 static const instr_t gs_code[16] = {
605 /* (sy)(ss)(nop3)shr.b r0.w, r0.x, 16 (extract local_id) */
606 CAT2(OPC_SHR_B, .dst = 3, .src1 = 0, .src2_im = 1, .src2 = 16,
607 .src1_r = 1, .src2_r = 1, .ss = 1, .sync = 1),
608 /* x = (local_id & 1) ? c1.x : c0.x */
609 CAT2(OPC_AND_B, .dst = 0, .src1 = 3, .src2_im = 1, .src2 = 1),
610 /* y = (local_id & 2) ? c1.y : c0.y */
611 CAT2(OPC_AND_B, .dst = 1, .src1 = 3, .src2_im = 1, .src2 = 2),
612 /* pred = (local_id >= 4), used by OPC_KILL */
613 CAT2(OPC_CMPS_S, .dst = REG_P0 * 4, .cond = IR3_COND_GE, .src1 = 3, .src2_im = 1, .src2 = 4),
614 /* vertex_flags_out = (local_id == 0) ? 4 : 0 - first vertex flag */
615 CAT2(OPC_CMPS_S, .dst = 4, .cond = IR3_COND_EQ, .src1 = 3, .src2_im = 1, .src2 = 0),
616
617 MOV(.dst = 2, .src_c = 1, .src = 2), /* depth clear value from c0.z */
618 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f),
619 MOV(.dst = 5, .src_c = 1, .src = 3), /* layer id from c0.w */
620
621 /* (rpt1)sel.b32 r0.x, (r)c1.x, (r)r0.x, (r)c0.x */
622 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 0,
623 .c1 = {.src1_c = 1, .src1 = 4, .dummy = 4}, .src1_r = 1,
624 .src2 = 0,
625 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
626
627 CAT2(OPC_SHL_B, .dst = 4, .src1 = 4, .src2_im = 1, .src2 = 2),
628
629 { .cat0 = { .opc = OPC_KILL } },
630 { .cat0 = { .opc = OPC_END, .ss = 1, .sync = 1 } },
631 };
632 #define FS_OFFSET (16 * sizeof(instr_t))
633 #define GS_OFFSET (32 * sizeof(instr_t))
634
635 /* shaders */
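/* The VS is written at offset 0 of the sub_cs allocation below, the FS at
 * FS_OFFSET and (for layered clears) the GS at GS_OFFSET; each program is
 * assumed to fit in 16 instructions.
 */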
636 struct ts_cs_memory shaders = { };
637 VkResult result = tu_cs_alloc(&cmd->sub_cs, 2 + layered_clear,
638 16 * sizeof(instr_t), &shaders);
639 assert(result == VK_SUCCESS);
640
641 if (layered_clear) {
642 memcpy(shaders.map, vs_layered, sizeof(vs_layered));
643 memcpy((uint8_t*) shaders.map + GS_OFFSET, gs_code, sizeof(gs_code));
644 } else {
645 memcpy(shaders.map, vs_code, sizeof(vs_code));
646 }
647
648 instr_t *fs_code = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
649 for (uint32_t i = 0; i < num_rts; i++) {
650 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
651 *fs_code++ = (instr_t) { .cat1 = {
652 .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
653 .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4
654 } };
655 }
656
657 /* "bary.f (ei)r63.x, 0, r0.x" - note the blob doesn't have this in its
658 * blit path (it's not clear what allows it to not have it)
659 */
660 if (blit) {
661 *fs_code++ = (instr_t) { .cat2 = {
662 .opc_cat = 2, .opc = OPC_BARY_F & 63, .ei = 1, .full = 1,
663 .dst = regid(63, 0), .src1_im = 1
664 } };
665 }
666 *fs_code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
667 /* note: assumed <= 16 instructions (MAX_RTS is 8) */
668
669 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
670
671 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, shaders.iova);
672 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
673 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
674 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, gs, shaders.iova + GS_OFFSET);
675 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, shaders.iova + FS_OFFSET);
676
677 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
678 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
679
680 tu6_emit_vpc(cs, &vs, gs, &fs, NULL);
681
682 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
683 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
684 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
685
686 tu6_emit_fs_inputs(cs, &fs);
687
688 tu_cs_emit_regs(cs,
689 A6XX_GRAS_CL_CNTL(
690 .persp_division_disable = 1,
691 .vp_xform_disable = 1,
692 .vp_clip_code_ignore = 1,
693 .clip_disable = 1),
694 A6XX_GRAS_UNKNOWN_8001(0));
695 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
696
697 tu_cs_emit_regs(cs,
698 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
699 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
700 tu_cs_emit_regs(cs,
701 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
702 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
703 }
704
705 static void
706 r3d_coords_raw(struct tu_cs *cs, bool gs, const float *coords)
707 {
708 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
709 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
710 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
711 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
712 CP_LOAD_STATE6_0_STATE_BLOCK(gs ? SB6_GS_SHADER : SB6_VS_SHADER) |
713 CP_LOAD_STATE6_0_NUM_UNIT(2));
714 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
715 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
716 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
717 }
718
719 static void
720 r3d_coords(struct tu_cs *cs,
721 const VkOffset2D *dst,
722 const VkOffset2D *src,
723 const VkExtent2D *extent)
724 {
725 int32_t src_x1 = src ? src->x : 0;
726 int32_t src_y1 = src ? src->y : 0;
727 r3d_coords_raw(cs, false, (float[]) {
728 dst->x, dst->y,
729 src_x1, src_y1,
730 dst->x + extent->width, dst->y + extent->height,
731 src_x1 + extent->width, src_y1 + extent->height,
732 });
733 }
734
735 static void
736 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
737 {
738 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
739 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
740 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
741 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
742 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
743 CP_LOAD_STATE6_0_NUM_UNIT(1));
744 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
745 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
746 switch (format) {
747 case VK_FORMAT_X8_D24_UNORM_PACK32:
748 case VK_FORMAT_D24_UNORM_S8_UINT: {
749 /* cleared as r8g8b8a8_unorm using special format */
750 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
751 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
752 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
753 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
754 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
755 } break;
756 case VK_FORMAT_D16_UNORM:
757 case VK_FORMAT_D32_SFLOAT:
758 tu_cs_emit(cs, fui(val->depthStencil.depth));
759 tu_cs_emit(cs, 0);
760 tu_cs_emit(cs, 0);
761 tu_cs_emit(cs, 0);
762 break;
763 case VK_FORMAT_S8_UINT:
764 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
765 tu_cs_emit(cs, 0);
766 tu_cs_emit(cs, 0);
767 tu_cs_emit(cs, 0);
768 break;
769 default:
770 /* as color formats use clear value as-is */
771 assert(!vk_format_is_depth_or_stencil(format));
772 tu_cs_emit_array(cs, val->color.uint32, 4);
773 break;
774 }
775 }
776
777 static void
778 r3d_src_common(struct tu_cmd_buffer *cmd,
779 struct tu_cs *cs,
780 const uint32_t *tex_const,
781 uint32_t offset_base,
782 uint32_t offset_ubwc,
783 bool linear_filter)
784 {
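/* Two A6XX_TEX_CONST_DWORDS-sized chunks are allocated: the texture
 * descriptor (copied from tex_const and patched with the layer offsets) at
 * offset 0, followed by the sampler state.
 */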
785 struct ts_cs_memory texture = { };
786 VkResult result = tu_cs_alloc(&cmd->sub_cs,
787 2, /* allocate space for a sampler too */
788 A6XX_TEX_CONST_DWORDS, &texture);
789 assert(result == VK_SUCCESS);
790
791 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
792
793 /* patch addresses for layer offset */
794 *(uint64_t*) (texture.map + 4) += offset_base;
795 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
796 texture.map[7] = ubwc_addr;
797 texture.map[8] = ubwc_addr >> 32;
798
799 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
800 A6XX_TEX_SAMP_0_XY_MAG(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
801 A6XX_TEX_SAMP_0_XY_MIN(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
802 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
803 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
804 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
805 0x60000; /* XXX used by blob, doesn't seem necessary */
806 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
807 0x1 | /* XXX used by blob, doesn't seem necessary */
808 A6XX_TEX_SAMP_1_UNNORM_COORDS |
809 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
810 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
811 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
812
813 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
814 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
815 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
816 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
817 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
818 CP_LOAD_STATE6_0_NUM_UNIT(1));
819 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
820
821 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
822 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
823
824 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
825 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
826 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
827 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
828 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
829 CP_LOAD_STATE6_0_NUM_UNIT(1));
830 tu_cs_emit_qw(cs, texture.iova);
831
832 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
833 tu_cs_emit_qw(cs, texture.iova);
834
835 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
836 }
837
838 static void
839 r3d_src(struct tu_cmd_buffer *cmd,
840 struct tu_cs *cs,
841 const struct tu_image_view *iview,
842 uint32_t layer,
843 bool linear_filter)
844 {
845 r3d_src_common(cmd, cs, iview->descriptor,
846 iview->layer_size * layer,
847 iview->ubwc_layer_size * layer,
848 linear_filter);
849 }
850
851 static void
852 r3d_src_buffer(struct tu_cmd_buffer *cmd,
853 struct tu_cs *cs,
854 VkFormat vk_format,
855 uint64_t va, uint32_t pitch,
856 uint32_t width, uint32_t height)
857 {
858 uint32_t desc[A6XX_TEX_CONST_DWORDS];
859
860 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
861
862 desc[0] =
863 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
864 A6XX_TEX_CONST_0_FMT(format.fmt) |
865 A6XX_TEX_CONST_0_SWAP(format.swap) |
866 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
867 // XXX to swizzle into .w for stencil buffer_to_image
868 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
869 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
870 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
871 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
872 desc[2] =
873 A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) |
874 A6XX_TEX_CONST_2_PITCH(pitch) |
875 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
876 desc[3] = 0;
877 desc[4] = va;
878 desc[5] = va >> 32;
879 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
880 desc[i] = 0;
881
882 r3d_src_common(cmd, cs, desc, 0, 0, false);
883 }
884
885 static void
886 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
887 {
888 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
889
890 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
891 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
892 tu_cs_image_ref(cs, iview, layer);
893 tu_cs_emit(cs, 0);
894
895 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
896 tu_cs_image_flag_ref(cs, iview, layer);
897
898 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
899 }
900
901 static void
902 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
903 {
904 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
905
906 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
907
908 tu_cs_emit_regs(cs,
909 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
910 A6XX_RB_MRT_PITCH(0, pitch),
911 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
912 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
913 A6XX_RB_MRT_BASE_HI(0, va >> 32),
914 A6XX_RB_MRT_BASE_GMEM(0, 0));
915
916 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
917 }
918
919 static void
920 r3d_setup(struct tu_cmd_buffer *cmd,
921 struct tu_cs *cs,
922 VkFormat vk_format,
923 enum a6xx_rotation rotation,
924 bool clear,
925 uint8_t mask)
926 {
927 if (!cmd->state.pass) {
928 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
929 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
930 }
931
932 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
933 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
934
935 r3d_pipeline(cmd, cs, !clear, clear ? 1 : 0, false);
936
937 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
938 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
939 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
940 0xfc000000);
941 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
942
943 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
944 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
945
946 tu_cs_emit_regs(cs,
947 A6XX_RB_FS_OUTPUT_CNTL0(),
948 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
949
950 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
951 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
952 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
953
954 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
955 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
956 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
957 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
958 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
959 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
960 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
961
962 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
963 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
964
965 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
966 .color_format = tu6_base_format(vk_format),
967 .color_sint = vk_format_is_sint(vk_format),
968 .color_uint = vk_format_is_uint(vk_format)));
969
970 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask));
971 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
972 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
973 }
974
975 static void
976 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
977 {
978 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
979 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
980 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
981 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
982 tu_cs_emit(cs, 1); /* instance count */
983 tu_cs_emit(cs, 2); /* vertex count */
984 }
985
986 /* blit ops - common interface for 2d/shader paths */
987
988 struct blit_ops {
989 void (*coords)(struct tu_cs *cs,
990 const VkOffset2D *dst,
991 const VkOffset2D *src,
992 const VkExtent2D *extent);
993 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
994 void (*src)(
995 struct tu_cmd_buffer *cmd,
996 struct tu_cs *cs,
997 const struct tu_image_view *iview,
998 uint32_t layer,
999 bool linear_filter);
1000 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1001 VkFormat vk_format,
1002 uint64_t va, uint32_t pitch,
1003 uint32_t width, uint32_t height);
1004 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1005 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
1006 void (*setup)(struct tu_cmd_buffer *cmd,
1007 struct tu_cs *cs,
1008 VkFormat vk_format,
1009 enum a6xx_rotation rotation,
1010 bool clear,
1011 uint8_t mask);
1012 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1013 };
1014
1015 static const struct blit_ops r2d_ops = {
1016 .coords = r2d_coords,
1017 .clear_value = r2d_clear_value,
1018 .src = r2d_src,
1019 .src_buffer = r2d_src_buffer,
1020 .dst = r2d_dst,
1021 .dst_buffer = r2d_dst_buffer,
1022 .setup = r2d_setup,
1023 .run = r2d_run,
1024 };
1025
1026 static const struct blit_ops r3d_ops = {
1027 .coords = r3d_coords,
1028 .clear_value = r3d_clear_value,
1029 .src = r3d_src,
1030 .src_buffer = r3d_src_buffer,
1031 .dst = r3d_dst,
1032 .dst_buffer = r3d_dst_buffer,
1033 .setup = r3d_setup,
1034 .run = r3d_run,
1035 };
1036
1037 /* passthrough set coords from 3D extents */
1038 static void
1039 coords(const struct blit_ops *ops,
1040 struct tu_cs *cs,
1041 const VkOffset3D *dst,
1042 const VkOffset3D *src,
1043 const VkExtent3D *extent)
1044 {
1045 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
1046 }
1047
1048 static void
1049 tu_image_view_blit2(struct tu_image_view *iview,
1050 struct tu_image *image,
1051 VkFormat format,
1052 const VkImageSubresourceLayers *subres,
1053 uint32_t layer,
1054 bool stencil_read)
1055 {
1056 VkImageAspectFlags aspect_mask = subres->aspectMask;
1057
1058 /* always use the AS_R8G8B8A8 format for these */
1059 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
1060 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
1061 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
1062 }
1063
1064 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
1065 .image = tu_image_to_handle(image),
1066 .viewType = VK_IMAGE_VIEW_TYPE_2D,
1067 .format = format,
1068 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
1069 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
1070 .subresourceRange = {
1071 .aspectMask = aspect_mask,
1072 .baseMipLevel = subres->mipLevel,
1073 .levelCount = 1,
1074 .baseArrayLayer = subres->baseArrayLayer + layer,
1075 .layerCount = 1,
1076 },
1077 });
1078 }
1079
1080 static void
1081 tu_image_view_blit(struct tu_image_view *iview,
1082 struct tu_image *image,
1083 const VkImageSubresourceLayers *subres,
1084 uint32_t layer)
1085 {
1086 tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false);
1087 }
1088
1089 static void
1090 tu6_blit_image(struct tu_cmd_buffer *cmd,
1091 struct tu_image *src_image,
1092 struct tu_image *dst_image,
1093 const VkImageBlit *info,
1094 VkFilter filter)
1095 {
1096 const struct blit_ops *ops = &r2d_ops;
1097 struct tu_cs *cs = &cmd->cs;
1098 uint32_t layers;
1099
1100 /* 2D blit can't do rotation mirroring from just coordinates */
1101 static const enum a6xx_rotation rotate[2][2] = {
1102 {ROTATE_0, ROTATE_HFLIP},
1103 {ROTATE_VFLIP, ROTATE_180},
1104 };
1105
1106 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1107 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1108 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1109 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1110 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
1111 (info->dstOffsets[1].z < info->dstOffsets[0].z);
1112
1113 if (mirror_z) {
1114 tu_finishme("blit z mirror\n");
1115 return;
1116 }
1117
1118 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1119 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1120 tu_finishme("blit z filter\n");
1121 return;
1122 }
1123
1124 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1125 if (info->dstSubresource.layerCount > 1) {
1126 assert(layers <= 1);
1127 layers = info->dstSubresource.layerCount;
1128 }
1129
1130 uint8_t mask = 0xf;
1131 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1132 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1133 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1134 mask = 0x7;
1135 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1136 mask = 0x8;
1137 }
1138
1139 /* BC1_RGB_* formats need to have their last components overridden with 1
1140 * when sampling, which is normally handled with the texture descriptor
1141 * swizzle. The 2d path can't handle that, so use the 3d path.
1142 *
1143 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1144 * the 2d path.
1145 */
1146
1147 if (dst_image->samples > 1 ||
1148 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1149 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK)
1150 ops = &r3d_ops;
1151
1152 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
1153 * figure out why (should be able to pass all tests with only shader path)
1154 */
1155
1156 ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask);
1157
1158 if (ops == &r3d_ops) {
1159 r3d_coords_raw(cs, false, (float[]) {
1160 info->dstOffsets[0].x, info->dstOffsets[0].y,
1161 info->srcOffsets[0].x, info->srcOffsets[0].y,
1162 info->dstOffsets[1].x, info->dstOffsets[1].y,
1163 info->srcOffsets[1].x, info->srcOffsets[1].y
1164 });
1165 } else {
1166 tu_cs_emit_regs(cs,
1167 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1168 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1169 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1170 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1171 tu_cs_emit_regs(cs,
1172 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1173 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1174 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1175 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1176 }
1177
1178 struct tu_image_view dst, src;
1179 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1180 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1181
1182 for (uint32_t i = 0; i < layers; i++) {
1183 ops->dst(cs, &dst, i);
1184 ops->src(cmd, cs, &src, i, filter == VK_FILTER_LINEAR);
1185 ops->run(cmd, cs);
1186 }
1187 }
1188
1189 void
1190 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1191 VkImage srcImage,
1192 VkImageLayout srcImageLayout,
1193 VkImage dstImage,
1194 VkImageLayout dstImageLayout,
1195 uint32_t regionCount,
1196 const VkImageBlit *pRegions,
1197 VkFilter filter)
1198
1199 {
1200 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1201 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1202 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1203
1204 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1205 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1206
1207 for (uint32_t i = 0; i < regionCount; ++i)
1208 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1209 }
1210
1211 static VkFormat
1212 copy_format(VkFormat format)
1213 {
1214 switch (vk_format_get_blocksizebits(format)) {
1215 case 8: return VK_FORMAT_R8_UINT;
1216 case 16: return VK_FORMAT_R16_UINT;
1217 case 32: return VK_FORMAT_R32_UINT;
1218 case 64: return VK_FORMAT_R32G32_UINT;
1219 case 96: return VK_FORMAT_R32G32B32_UINT;
1220 case 128:return VK_FORMAT_R32G32B32A32_UINT;
1221 default:
1222 unreachable("unhandled format size");
1223 }
1224 }
1225
1226 static void
1227 copy_compressed(VkFormat format,
1228 VkOffset3D *offset,
1229 VkExtent3D *extent,
1230 uint32_t *width,
1231 uint32_t *height)
1232 {
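/* e.g. for a BC1 source (4x4 blocks): an offset of (8, 4) becomes block
 * (2, 1), and a 10x10 texel extent rounds up to 3x3 blocks.
 */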
1233 if (!vk_format_is_compressed(format))
1234 return;
1235
1236 uint32_t block_width = vk_format_get_blockwidth(format);
1237 uint32_t block_height = vk_format_get_blockheight(format);
1238
1239 offset->x /= block_width;
1240 offset->y /= block_height;
1241
1242 if (extent) {
1243 extent->width = DIV_ROUND_UP(extent->width, block_width);
1244 extent->height = DIV_ROUND_UP(extent->height, block_height);
1245 }
1246 if (width)
1247 *width = DIV_ROUND_UP(*width, block_width);
1248 if (height)
1249 *height = DIV_ROUND_UP(*height, block_height);
1250 }
1251
1252 static void
1253 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1254 struct tu_buffer *src_buffer,
1255 struct tu_image *dst_image,
1256 const VkBufferImageCopy *info)
1257 {
1258 struct tu_cs *cs = &cmd->cs;
1259 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1260 VkFormat dst_format = dst_image->vk_format;
1261 VkFormat src_format = dst_image->vk_format;
1262 const struct blit_ops *ops = &r2d_ops;
1263
1264 uint8_t mask = 0xf;
1265
1266 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1267 switch (info->imageSubresource.aspectMask) {
1268 case VK_IMAGE_ASPECT_STENCIL_BIT:
1269 src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */
1270 mask = 0x8;
1271 ops = &r3d_ops;
1272 break;
1273 case VK_IMAGE_ASPECT_DEPTH_BIT:
1274 mask = 0x7;
1275 break;
1276 }
1277 }
1278
1279 VkOffset3D offset = info->imageOffset;
1280 VkExtent3D extent = info->imageExtent;
1281 uint32_t src_width = info->bufferRowLength ?: extent.width;
1282 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1283
1284 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
1285 assert(src_format == dst_format);
1286 copy_compressed(dst_format, &offset, &extent, &src_width, &src_height);
1287 src_format = dst_format = copy_format(dst_format);
1288 }
1289
1290 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1291 uint32_t layer_size = src_height * pitch;
1292
1293 /* note: the src_va/pitch alignment handling of 64 is for the 2D engine;
1294 * it is also valid for the 1cpp format used with the shader path (stencil aspect path)
1295 */
1296
1297 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1298
1299 struct tu_image_view dst;
1300 tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false);
1301
1302 for (uint32_t i = 0; i < layers; i++) {
1303 ops->dst(cs, &dst, i);
1304
1305 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1306 if ((src_va & 63) || (pitch & 63)) {
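/* Unaligned case: emit one 1-texel-high blit per row, aligning the source
 * address down to 64 bytes and folding the remainder into the source x
 * offset (in texels).
 */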
1307 for (uint32_t y = 0; y < extent.height; y++) {
1308 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1309 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1310 x + extent.width, 1);
1311 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1312 &(VkExtent2D) {extent.width, 1});
1313 ops->run(cmd, cs);
1314 src_va += pitch;
1315 }
1316 } else {
1317 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1318 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1319 ops->run(cmd, cs);
1320 }
1321 }
1322 }
1323
1324 void
1325 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1326 VkBuffer srcBuffer,
1327 VkImage dstImage,
1328 VkImageLayout dstImageLayout,
1329 uint32_t regionCount,
1330 const VkBufferImageCopy *pRegions)
1331 {
1332 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1333 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1334 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1335
1336 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1337 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1338
1339 for (unsigned i = 0; i < regionCount; ++i)
1340 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1341 }
1342
1343 static void
1344 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1345 struct tu_image *src_image,
1346 struct tu_buffer *dst_buffer,
1347 const VkBufferImageCopy *info)
1348 {
1349 struct tu_cs *cs = &cmd->cs;
1350 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1351 VkFormat src_format = src_image->vk_format;
1352 VkFormat dst_format = src_image->vk_format;
1353 bool stencil_read = false;
1354
1355 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1356 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1357 dst_format = VK_FORMAT_R8_UNORM;
1358 stencil_read = true;
1359 }
1360
1361 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1362 VkOffset3D offset = info->imageOffset;
1363 VkExtent3D extent = info->imageExtent;
1364 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1365 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1366
1367 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) {
1368 assert(src_format == dst_format);
1369 copy_compressed(dst_format, &offset, &extent, &dst_width, &dst_height);
1370 src_format = dst_format = copy_format(dst_format);
1371 }
1372
1373 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1374 uint32_t layer_size = pitch * dst_height;
1375
1376 /* note: the dst_va/pitch alignment handling of 64 is for the 2D engine;
1377 * it is also valid for the 1cpp format used with the shader path (stencil aspect)
1378 */
1379
1380 ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
1381
1382 struct tu_image_view src;
1383 tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read);
1384
1385 for (uint32_t i = 0; i < layers; i++) {
1386 ops->src(cmd, cs, &src, i, false);
1387
1388 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1389 if ((dst_va & 63) || (pitch & 63)) {
1390 for (uint32_t y = 0; y < extent.height; y++) {
1391 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1392 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1393 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1394 &(VkExtent2D) {extent.width, 1});
1395 ops->run(cmd, cs);
1396 dst_va += pitch;
1397 }
1398 } else {
1399 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1400 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1401 ops->run(cmd, cs);
1402 }
1403 }
1404 }
1405
1406 void
1407 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1408 VkImage srcImage,
1409 VkImageLayout srcImageLayout,
1410 VkBuffer dstBuffer,
1411 uint32_t regionCount,
1412 const VkBufferImageCopy *pRegions)
1413 {
1414 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1415 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1416 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1417
1418 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1419 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1420
1421 for (unsigned i = 0; i < regionCount; ++i)
1422 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1423 }
1424
1425 /* Tiled formats don't support swapping, which means that we can't support
1426 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1427 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1428 * Currently we fake support for tiled swapped formats and use the unswapped
1429 * format instead, but this means that reinterpreting copies to and from
1430 * swapped formats can't be performed correctly unless we can swizzle the
1431 * components by reinterpreting the other image as the "correct" swapped
1432 * format, i.e. only when the other image is linear.
1433 */
1434
1435 static bool
1436 is_swapped_format(VkFormat format)
1437 {
1438 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1439 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1440 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1441 }
1442
1443 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1444 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1445 * versa). This should mirror the logic in fdl6_layout.
1446 */
1447 static bool
1448 image_is_r8g8(struct tu_image *image)
1449 {
1450 return image->layout.cpp == 2 &&
1451 vk_format_get_nr_components(image->vk_format) == 2;
1452 }
1453
1454 static void
1455 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1456 struct tu_image *src_image,
1457 struct tu_image *dst_image,
1458 const VkImageCopy *info)
1459 {
1460 const struct blit_ops *ops = &r2d_ops;
1461 struct tu_cs *cs = &cmd->cs;
1462
1463 uint8_t mask = 0xf;
1464 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1465 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1466 mask = 0x7;
1467 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1468 mask = 0x8;
1469 }
1470
1471 if (dst_image->samples > 1)
1472 ops = &r3d_ops;
1473
1474 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1475
1476 VkFormat format = VK_FORMAT_UNDEFINED;
1477 VkOffset3D src_offset = info->srcOffset;
1478 VkOffset3D dst_offset = info->dstOffset;
1479 VkExtent3D extent = info->extent;
1480
1481 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1482 * Images":
1483 *
1484 * When copying between compressed and uncompressed formats the extent
1485 * members represent the texel dimensions of the source image and not
1486 * the destination. When copying from a compressed image to an
1487 * uncompressed image the image texel dimensions written to the
1488 * uncompressed image will be source extent divided by the compressed
1489 * texel block dimensions. When copying from an uncompressed image to a
1490 * compressed image the image texel dimensions written to the compressed
1491 * image will be the source extent multiplied by the compressed texel
1492 * block dimensions.
1493 *
1494 * This means we only have to adjust the extent if the source image is
1495 * compressed.
1496 */
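/* e.g. an 8x8 texel region copied from a BC1 image (4x4 blocks) to an
 * uncompressed image writes a 2x2 texel region in the destination.
 */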
1497 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1498 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1499
1500 VkFormat dst_format = vk_format_is_compressed(dst_image->vk_format) ?
1501 copy_format(dst_image->vk_format) : dst_image->vk_format;
1502 VkFormat src_format = vk_format_is_compressed(src_image->vk_format) ?
1503 copy_format(src_image->vk_format) : src_image->vk_format;
1504
1505 bool use_staging_blit = false;
1506
1507 if (src_format == dst_format) {
1508 /* Images that share a format can always be copied directly because it's
1509 * the same as a blit.
1510 */
1511 format = src_format;
1512 } else if (!src_image->layout.tile_mode) {
1513 /* If an image is linear, we can always safely reinterpret it with the
1514 * other image's format and then do a regular blit.
1515 */
1516 format = dst_format;
1517 } else if (!dst_image->layout.tile_mode) {
1518 format = src_format;
1519 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1520 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1521 * due to the different tile layout.
1522 */
1523 use_staging_blit = true;
1524 } else if (is_swapped_format(src_format) ||
1525 is_swapped_format(dst_format)) {
1526 /* If either format has a non-identity swap, then we can't copy
1527 * to/from it.
1528 */
1529 use_staging_blit = true;
1530 } else if (!src_image->layout.ubwc) {
1531 format = dst_format;
1532 } else if (!dst_image->layout.ubwc) {
1533 format = src_format;
1534 } else {
1535 /* Both formats use UBWC and so neither can be reinterpreted.
1536 * TODO: We could do an in-place decompression of the dst instead.
1537 */
1538 use_staging_blit = true;
1539 }
1540
1541 struct tu_image_view dst, src;
1542
1543 if (use_staging_blit) {
1544 tu_image_view_blit2(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1545 tu_image_view_blit2(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1546
1547 struct tu_image staging_image = {
1548 .vk_format = src_format,
1549 .type = src_image->type,
1550 .tiling = VK_IMAGE_TILING_LINEAR,
1551 .extent = extent,
1552 .level_count = 1,
1553 .layer_count = info->srcSubresource.layerCount,
1554 .samples = src_image->samples,
1555 .bo_offset = 0,
1556 };
1557
1558 VkImageSubresourceLayers staging_subresource = {
1559 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1560 .mipLevel = 0,
1561 .baseArrayLayer = 0,
1562 .layerCount = info->srcSubresource.layerCount,
1563 };
1564
1565 VkOffset3D staging_offset = { 0 };
1566
1567 staging_image.layout.tile_mode = TILE6_LINEAR;
1568 staging_image.layout.ubwc = false;
1569
1570 fdl6_layout(&staging_image.layout,
1571 vk_format_to_pipe_format(staging_image.vk_format),
1572 staging_image.samples,
1573 staging_image.extent.width,
1574 staging_image.extent.height,
1575 staging_image.extent.depth,
1576 staging_image.level_count,
1577 staging_image.layer_count,
1578 staging_image.type == VK_IMAGE_TYPE_3D,
1579 NULL);
1580
1581 VkResult result = tu_get_scratch_bo(cmd->device,
1582 staging_image.layout.size,
1583 &staging_image.bo);
1584 if (result != VK_SUCCESS) {
1585 cmd->record_result = result;
1586 return;
1587 }
1588
1589 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1590 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1591
1592 struct tu_image_view staging;
1593 tu_image_view_blit2(&staging, &staging_image, src_format,
1594 &staging_subresource, 0, false);
1595
1596 ops->setup(cmd, cs, src_format, ROTATE_0, false, mask);
1597 coords(ops, cs, &staging_offset, &src_offset, &extent);
1598
1599 for (uint32_t i = 0; i < info->extent.depth; i++) {
1600 ops->src(cmd, cs, &src, i, false);
1601 ops->dst(cs, &staging, i);
1602 ops->run(cmd, cs);
1603 }
1604
1605 /* When executed by the user there has to be a pipeline barrier here,
1606 * but since we're doing it manually we'll have to flush ourselves.
1607 */
1608 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1609 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1610
1611 tu_image_view_blit2(&staging, &staging_image, dst_format,
1612 &staging_subresource, 0, false);
1613
1614 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1615 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1616
1617 for (uint32_t i = 0; i < info->extent.depth; i++) {
1618 ops->src(cmd, cs, &staging, i, false);
1619 ops->dst(cs, &dst, i);
1620 ops->run(cmd, cs);
1621 }
1622 } else {
1623 tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1624 tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1625
1626 ops->setup(cmd, cs, format, ROTATE_0, false, mask);
1627 coords(ops, cs, &dst_offset, &src_offset, &extent);
1628
1629 for (uint32_t i = 0; i < info->extent.depth; i++) {
1630 ops->src(cmd, cs, &src, i, false);
1631 ops->dst(cs, &dst, i);
1632 ops->run(cmd, cs);
1633 }
1634 }
1635 }
1636
1637 void
1638 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1639 VkImage srcImage,
1640 VkImageLayout srcImageLayout,
1641 VkImage destImage,
1642 VkImageLayout destImageLayout,
1643 uint32_t regionCount,
1644 const VkImageCopy *pRegions)
1645 {
1646 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1647 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1648 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1649
1650 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1651 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1652
1653 for (uint32_t i = 0; i < regionCount; ++i)
1654 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1655 }
1656
1657 static void
1658 copy_buffer(struct tu_cmd_buffer *cmd,
1659 uint64_t dst_va,
1660 uint64_t src_va,
1661 uint64_t size,
1662 uint32_t block_size)
1663 {
1664 const struct blit_ops *ops = &r2d_ops;
1665 struct tu_cs *cs = &cmd->cs;
1666 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1667 uint64_t blocks = size / block_size;
1668
1669 ops->setup(cmd, cs, format, ROTATE_0, false, 0xf);
1670
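/* The copy is emitted as a series of 1-texel-high blits: addresses are
 * aligned down to 64 bytes (the remainder becomes the x offset in blocks)
 * and each pass is capped at 0x4000 blocks, assumed here to be the maximum
 * 2D image width.
 */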
1671 while (blocks) {
1672 uint32_t src_x = (src_va & 63) / block_size;
1673 uint32_t dst_x = (dst_va & 63) / block_size;
1674 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1675
1676 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1677 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1678 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1679 ops->run(cmd, cs);
1680
1681 src_va += width * block_size;
1682 dst_va += width * block_size;
1683 blocks -= width;
1684 }
1685 }
1686
1687 void
1688 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1689 VkBuffer srcBuffer,
1690 VkBuffer dstBuffer,
1691 uint32_t regionCount,
1692 const VkBufferCopy *pRegions)
1693 {
1694 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1695 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1696 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1697
1698 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1699 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1700
1701 for (unsigned i = 0; i < regionCount; ++i) {
1702 copy_buffer(cmd,
1703 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1704 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1705 pRegions[i].size, 1);
1706 }
1707 }
1708
1709 void
1710 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1711 VkBuffer dstBuffer,
1712 VkDeviceSize dstOffset,
1713 VkDeviceSize dataSize,
1714 const void *pData)
1715 {
1716 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1717 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1718
1719 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1720
1721 struct ts_cs_memory tmp;
1722 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1723 if (result != VK_SUCCESS) {
1724 cmd->record_result = result;
1725 return;
1726 }
1727
1728 memcpy(tmp.map, pData, dataSize);
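/* vkCmdUpdateBuffer requires dstOffset and dataSize to be multiples of 4,
 * so copying in 4-byte blocks is safe here.
 */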
1729 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1730 }
1731
1732 void
1733 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1734 VkBuffer dstBuffer,
1735 VkDeviceSize dstOffset,
1736 VkDeviceSize fillSize,
1737 uint32_t data)
1738 {
1739 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1740 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1741 const struct blit_ops *ops = &r2d_ops;
1742 struct tu_cs *cs = &cmd->cs;
1743
1744 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1745
1746 if (fillSize == VK_WHOLE_SIZE)
1747 fillSize = buffer->size - dstOffset;
1748
1749 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1750 uint32_t blocks = fillSize / 4;
1751
1752 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf);
1753 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1754
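/* Fill in chunks: the destination base is aligned down to 64 bytes and each
 * blit is capped at 0x4000 R32_UINT texels, so large fills take several
 * iterations. */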
1755 while (blocks) {
1756 uint32_t dst_x = (dst_va & 63) / 4;
1757 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1758
1759 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1760 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1761 ops->run(cmd, cs);
1762
1763 dst_va += width * 4;
1764 blocks -= width;
1765 }
1766 }
1767
1768 void
1769 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1770 VkImage srcImage,
1771 VkImageLayout srcImageLayout,
1772 VkImage dstImage,
1773 VkImageLayout dstImageLayout,
1774 uint32_t regionCount,
1775 const VkImageResolve *pRegions)
1776 {
1777 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1778 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1779 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1780 const struct blit_ops *ops = &r2d_ops;
1781 struct tu_cs *cs = &cmd->cs;
1782
1783 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1784 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1785
1786 ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf);
1787
1788 for (uint32_t i = 0; i < regionCount; ++i) {
1789 const VkImageResolve *info = &pRegions[i];
1790 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1791
1792 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1793 /* TODO: aspect masks possible? */
1794
1795 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1796
1797 struct tu_image_view dst, src;
1798 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1799 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1800
1801 for (uint32_t i = 0; i < layers; i++) {
1802 ops->src(cmd, cs, &src, i, false);
1803 ops->dst(cs, &dst, i);
1804 ops->run(cmd, cs);
1805 }
1806 }
1807 }
1808
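/* Resolve src into dst over the given rect, one layer at a time, with the
 * 2D blitter; both views must share the same format (asserted below).
 */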
1809 void
1810 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1811 struct tu_cs *cs,
1812 struct tu_image_view *src,
1813 struct tu_image_view *dst,
1814 uint32_t layers,
1815 const VkRect2D *rect)
1816 {
1817 const struct blit_ops *ops = &r2d_ops;
1818
1819 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1820 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1821
1822 assert(src->image->vk_format == dst->image->vk_format);
1823
1824 ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf);
1825 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1826
1827 for (uint32_t i = 0; i < layers; i++) {
1828 ops->src(cmd, cs, src, i, false);
1829 ops->dst(cs, dst, i);
1830 ops->run(cmd, cs);
1831 }
1832 }
1833
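/* Clear every level/layer in the subresource range with the blit ops
 * (3D path for MSAA images). E5B9G9R9 is cleared as R32_UINT, and for
 * D24S8 the component mask selects depth (0x7) and/or stencil (0x8).
 * For 3D images each level clears all of its (minified) depth slices.
 */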
1834 static void
1835 clear_image(struct tu_cmd_buffer *cmd,
1836 struct tu_image *image,
1837 const VkClearValue *clear_value,
1838 const VkImageSubresourceRange *range)
1839 {
1840 uint32_t level_count = tu_get_levelCount(image, range);
1841 uint32_t layer_count = tu_get_layerCount(image, range);
1842 struct tu_cs *cs = &cmd->cs;
1843 VkFormat format = image->vk_format;
1844 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1845 format = VK_FORMAT_R32_UINT;
1846
1847 if (image->type == VK_IMAGE_TYPE_3D) {
1848 assert(layer_count == 1);
1849 assert(range->baseArrayLayer == 0);
1850 }
1851
1852 uint8_t mask = 0xf;
1853 if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1854 mask = 0;
1855 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
1856 mask |= 0x7;
1857 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
1858 mask |= 0x8;
1859 }
1860
1861 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1862
1863 ops->setup(cmd, cs, format, ROTATE_0, true, mask);
1864 ops->clear_value(cs, image->vk_format, clear_value);
1865
1866 for (unsigned j = 0; j < level_count; j++) {
1867 if (image->type == VK_IMAGE_TYPE_3D)
1868 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1869
1870 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1871 u_minify(image->extent.width, range->baseMipLevel + j),
1872 u_minify(image->extent.height, range->baseMipLevel + j)
1873 });
1874
1875 struct tu_image_view dst;
1876 tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) {
1877 .aspectMask = range->aspectMask,
1878 .mipLevel = range->baseMipLevel + j,
1879 .baseArrayLayer = range->baseArrayLayer,
1880 .layerCount = 1,
1881 }, 0, false);
1882
1883 for (uint32_t i = 0; i < layer_count; i++) {
1884 ops->dst(cs, &dst, i);
1885 ops->run(cmd, cs);
1886 }
1887 }
1888 }
1889
1890 void
1891 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1892 VkImage image_h,
1893 VkImageLayout imageLayout,
1894 const VkClearColorValue *pColor,
1895 uint32_t rangeCount,
1896 const VkImageSubresourceRange *pRanges)
1897 {
1898 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1899 TU_FROM_HANDLE(tu_image, image, image_h);
1900
1901 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1902
1903 for (unsigned i = 0; i < rangeCount; i++)
1904 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1905 }
1906
1907 void
1908 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1909 VkImage image_h,
1910 VkImageLayout imageLayout,
1911 const VkClearDepthStencilValue *pDepthStencil,
1912 uint32_t rangeCount,
1913 const VkImageSubresourceRange *pRanges)
1914 {
1915 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1916 TU_FROM_HANDLE(tu_image, image, image_h);
1917
1918 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1919
1920 for (unsigned i = 0; i < rangeCount; i++)
1921 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1922 }
1923
1924 static void
1925 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1926 uint32_t attachment_count,
1927 const VkClearAttachment *attachments,
1928 uint32_t rect_count,
1929 const VkClearRect *rects)
1930 {
1931 const struct tu_subpass *subpass = cmd->state.subpass;
1932 /* note: cannot use the shader path here; there is a special shader path
1933 * in tu_clear_sysmem_attachments()
1934 */
1935 const struct blit_ops *ops = &r2d_ops;
1936 struct tu_cs *cs = &cmd->draw_cs;
1937
1938 for (uint32_t j = 0; j < attachment_count; j++) {
1939 /* The vulkan spec, section 17.2 "Clearing Images Inside a Render
1940 * Pass Instance" says that:
1941 *
1942 * Unlike other clear commands, vkCmdClearAttachments executes as
1943 * a drawing command, rather than a transfer command, with writes
1944 * performed by it executing in rasterization order. Clears to
1945 * color attachments are executed as color attachment writes, by
1946 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1947 * Clears to depth/stencil attachments are executed as depth
1948 * writes and writes by the
1949 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1950 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1951 *
1952 * However, the 2d path here is executed the same way as a
1953 * transfer command, using the CCU color cache exclusively with
1954 * a special depth-as-color format for depth clears. This means that
1955 * we can't rely on the normal pipeline barrier mechanism here, and
1956 * have to manually flush whenever using a different cache domain
1957 * from what the 3d path would've used. This happens when we clear
1958 * depth/stencil, since normally depth attachments use CCU depth, but
1959 * we clear it using a special depth-as-color format. Since the clear
1960 * potentially uses a different attachment state we also need to
1961 * invalidate color beforehand and flush it afterwards.
1962 */
1963
1964 uint32_t a;
1965 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1966 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1967 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1968 } else {
1969 a = subpass->depth_stencil_attachment.attachment;
1970 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1971 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1972 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1973 }
1974
1975 if (a == VK_ATTACHMENT_UNUSED)
1976 continue;
1977
1978 uint8_t mask = 0xf;
1979 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
1980 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
1981 mask &= ~0x7;
1982 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
1983 mask &= ~0x8;
1984 }
1985
1986 const struct tu_image_view *iview =
1987 cmd->state.framebuffer->attachments[a].attachment;
1988
1989 ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask);
1990 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1991
1992 /* Wait for the flushes we triggered manually to complete */
1993 tu_cs_emit_wfi(cs);
1994
1995 for (uint32_t i = 0; i < rect_count; i++) {
1996 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1997 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1998 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1999 ops->run(cmd, cs);
2000 }
2001 }
2002
2003 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2004 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2005 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2006 } else {
2007 /* sync color into depth */
2008 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2009 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2010 }
2011 }
2012 }
2013
2014 static void
2015 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
2016 uint32_t attachment_count,
2017 const VkClearAttachment *attachments,
2018 uint32_t rect_count,
2019 const VkClearRect *rects)
2020 {
2021 /* the shader path here is special, it avoids changing MRT/etc state */
2022 const struct tu_render_pass *pass = cmd->state.pass;
2023 const struct tu_subpass *subpass = cmd->state.subpass;
2024 const uint32_t mrt_count = subpass->color_count;
2025 struct tu_cs *cs = &cmd->draw_cs;
2026 uint32_t clear_value[MAX_RTS][4];
2027 float z_clear_val = 0.0f;
2028 uint8_t s_clear_val = 0;
2029 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
2030 bool z_clear = false;
2031 bool s_clear = false;
2032 bool layered_clear = false;
2033 uint32_t max_samples = 1;
2034
2035 for (uint32_t i = 0; i < attachment_count; i++) {
2036 uint32_t a;
2037 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2038 uint32_t c = attachments[i].colorAttachment;
2039 a = subpass->color_attachments[c].attachment;
2040 if (a == VK_ATTACHMENT_UNUSED)
2041 continue;
2042
2043 clear_rts |= 1 << c;
2044 clear_components |= 0xf << (c * 4);
2045 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
2046 } else {
2047 a = subpass->depth_stencil_attachment.attachment;
2048 if (a == VK_ATTACHMENT_UNUSED)
2049 continue;
2050
2051 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2052 z_clear = true;
2053 z_clear_val = attachments[i].clearValue.depthStencil.depth;
2054 }
2055
2056 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2057 s_clear = true;
2058 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
2059 }
2060 }
2061
2062 max_samples = MAX2(max_samples, pass->attachments[a].samples);
2063 }
2064
2065 /* prefer the 2D path for clears;
2066 * the 2D path can't clear separate depth/stencil or MSAA, and it needs a known framebuffer
2067 */
2068 if (max_samples == 1 && cmd->state.framebuffer) {
2069 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
2070 return;
2071 }
2072
2073 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
2074 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2075 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2076 0xfc000000);
2077 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2078
2079 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
2080 for (uint32_t i = 0; i < mrt_count; i++) {
2081 if (clear_rts & (1 << i))
2082 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
2083 else
2084 tu_cs_emit(cs, 0);
2085 }
2086
2087 for (uint32_t i = 0; i < rect_count; i++) {
2088 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
2089 layered_clear = true;
2090 }
2091
2092 r3d_pipeline(cmd, cs, false, num_rts, layered_clear);
2093
2094 tu_cs_emit_regs(cs,
2095 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2096 tu_cs_emit_regs(cs,
2097 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2098
2099 tu_cs_emit_regs(cs,
2100 A6XX_RB_FS_OUTPUT_CNTL0(),
2101 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2102
2103 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2104 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2105 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
2106 for (uint32_t i = 0; i < mrt_count; i++) {
2107 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2108 .component_enable = COND(clear_rts & (1 << i), 0xf)));
2109 }
2110
2111 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2112 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2113 .z_enable = z_clear,
2114 .z_write_enable = z_clear,
2115 .zfunc = FUNC_ALWAYS));
2116 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2117 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2118 .stencil_enable = s_clear,
2119 .func = FUNC_ALWAYS,
2120 .zpass = STENCIL_REPLACE));
2121 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2122 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2123 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2124
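/* Upload the clear colors as fragment shader constants, one vec4 per
 * cleared render target. */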
2125 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
2126 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
2127 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2128 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2129 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
2130 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
2131 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2132 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2133 for_each_bit(b, clear_rts)
2134 tu_cs_emit_array(cs, clear_value[b], 4);
2135
2136 for (uint32_t i = 0; i < rect_count; i++) {
2137 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
2138 r3d_coords_raw(cs, layered_clear, (float[]) {
2139 rects[i].rect.offset.x, rects[i].rect.offset.y,
2140 z_clear_val, uif(rects[i].baseArrayLayer + layer),
2141 rects[i].rect.offset.x + rects[i].rect.extent.width,
2142 rects[i].rect.offset.y + rects[i].rect.extent.height,
2143 z_clear_val, 1.0f,
2144 });
2145
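/* Layered clears go through a one-vertex point-list draw with the GS
 * enabled (see the r3d_pipeline() call above); non-layered clears use the
 * regular blit draw via r3d_run(). */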
2146 if (layered_clear) {
2147 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
2148 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_POINTLIST) |
2149 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
2150 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) |
2151 CP_DRAW_INDX_OFFSET_0_GS_ENABLE);
2152 tu_cs_emit(cs, 1); /* instance count */
2153 tu_cs_emit(cs, 1); /* vertex count */
2154 } else {
2155 r3d_run(cmd, cs);
2156 }
2157 }
2158 }
2159
2160 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE |
2161 TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
2162 TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
2163 TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
2164 TU_CMD_DIRTY_DYNAMIC_VIEWPORT |
2165 TU_CMD_DIRTY_DYNAMIC_SCISSOR;
2166 }
2167
2168 /**
2169 * Pack a VkClearValue into a 128-bit buffer. The format is respected except
2170 * for the component order: the components are always packed in WZYX order,
2171 * because GMEM is tiled and tiled formats always use the WZYX swap.
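 * For example, clearing VK_FORMAT_R8G8B8A8_UNORM or VK_FORMAT_B8G8R8A8_UNORM
 * to (1, 0, 0, 1) packs buf[0] as 0xff0000ff in both cases (red in the low byte).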
2172 */
2173 static void
2174 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4])
2175 {
2176 const struct util_format_description *desc = vk_format_description(format);
2177
2178 switch (format) {
2179 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2180 buf[0] = float3_to_r11g11b10f(val->color.float32);
2181 return;
2182 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
2183 buf[0] = float3_to_rgb9e5(val->color.float32);
2184 return;
2185 default:
2186 break;
2187 }
2188
2189 assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
2190
2191 /* S8_UINT is special and has no depth */
2192 const int max_components =
2193 format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels;
2194
2195 int buf_offset = 0;
2196 int bit_shift = 0;
2197 for (int comp = 0; comp < max_components; comp++) {
2198 const struct util_format_channel_description *ch =
2199 tu_get_format_channel_description(desc, comp);
2200 if (!ch) {
2201 assert((format == VK_FORMAT_S8_UINT && comp == 0) ||
2202 (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1));
2203 continue;
2204 }
2205
2206 union tu_clear_component_value v = tu_get_clear_component_value(
2207 val, comp, desc->colorspace);
2208
2209 /* move to the next uint32_t when there is not enough space */
2210 assert(ch->size <= 32);
2211 if (bit_shift + ch->size > 32) {
2212 buf_offset++;
2213 bit_shift = 0;
2214 }
2215
2216 if (bit_shift == 0)
2217 buf[buf_offset] = 0;
2218
2219 buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift;
2220 bit_shift += ch->size;
2221 }
2222 }
2223
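/* Clear an attachment's GMEM region with a BLIT event, using the packed
 * clear value; component_mask selects which components get written.
 */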
2224 static void
2225 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2226 struct tu_cs *cs,
2227 uint32_t attachment,
2228 uint8_t component_mask,
2229 const VkClearValue *value)
2230 {
2231 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2232 /* note: component_mask is 0x7 for depth and 0x8 for stencil
2233 * because D24S8 is cleared with AS_R8G8B8A8 format
2234 */
2235
2236 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2237 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2238
2239 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
2240 tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask));
2241
2242 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2243 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2244
2245 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2246 tu_cs_emit(cs, 0);
2247
2248 uint32_t clear_vals[4] = {};
2249 pack_gmem_clear_value(value, vk_format, clear_vals);
2250
2251 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2252 tu_cs_emit_array(cs, clear_vals, 4);
2253
2254 tu6_emit_event_write(cmd, cs, BLIT);
2255 }
2256
2257 static void
2258 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2259 uint32_t attachment_count,
2260 const VkClearAttachment *attachments,
2261 uint32_t rect_count,
2262 const VkClearRect *rects)
2263 {
2264 const struct tu_subpass *subpass = cmd->state.subpass;
2265 struct tu_cs *cs = &cmd->draw_cs;
2266
2267 /* TODO: swap the loops for smaller cmdstream */
2268 for (unsigned i = 0; i < rect_count; i++) {
2269 unsigned x1 = rects[i].rect.offset.x;
2270 unsigned y1 = rects[i].rect.offset.y;
2271 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2272 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2273
2274 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2275 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2276 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2277
2278 for (unsigned j = 0; j < attachment_count; j++) {
2279 uint32_t a;
2280 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2281 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2282 else
2283 a = subpass->depth_stencil_attachment.attachment;
2284
2285 if (a == VK_ATTACHMENT_UNUSED)
2286 continue;
2287
2288 unsigned clear_mask = 0xf;
2289 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
2290 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
2291 clear_mask &= ~0x7;
2292 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
2293 clear_mask &= ~0x8;
2294 }
2295
2296 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2297 &attachments[j].clearValue);
2298 }
2299 }
2300 }
2301
2302 void
2303 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2304 uint32_t attachmentCount,
2305 const VkClearAttachment *pAttachments,
2306 uint32_t rectCount,
2307 const VkClearRect *pRects)
2308 {
2309 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2310 struct tu_cs *cs = &cmd->draw_cs;
2311
2312 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2313 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2314 tu_cond_exec_end(cs);
2315
2316 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2317 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2318 tu_cond_exec_end(cs);
2319 }
2320
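/* Clear attachment 'a' at the start of a sysmem renderpass, covering the
 * whole render area and every framebuffer layer. Uses the 2D path, or the
 * 3D path for multisampled attachments.
 */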
2321 void
2322 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2323 struct tu_cs *cs,
2324 uint32_t a,
2325 const VkRenderPassBeginInfo *info)
2326 {
2327 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2328 const struct tu_image_view *iview = fb->attachments[a].attachment;
2329 const struct tu_render_pass_attachment *attachment =
2330 &cmd->state.pass->attachments[a];
2331 uint8_t mask = 0;
2332
2333 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2334 mask = 0xf;
2335 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2336 mask |= 0x7;
2337 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2338 mask |= 0x8;
2339
2340 if (!mask)
2341 return;
2342
2343 const struct blit_ops *ops = &r2d_ops;
2344 if (attachment->samples > 1)
2345 ops = &r3d_ops;
2346
2347 ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
2348 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2349 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2350
2351 /* Wait for any flushes at the beginning of the renderpass to complete */
2352 tu_cs_emit_wfi(cs);
2353
2354 for (uint32_t i = 0; i < fb->layers; i++) {
2355 ops->dst(cs, iview, i);
2356 ops->run(cmd, cs);
2357 }
2358
2359 /* The spec doesn't explicitly say, but presumably the initial renderpass
2360 * clear is considered part of the renderpass, and therefore barriers
2361 * aren't required inside the subpass/renderpass. Therefore we need to
2362 * flush CCU color into CCU depth here, just like with
2363 * vkCmdClearAttachments(). Note that because this only happens at the
2364 * beginning of a renderpass, and renderpass writes are considered
2365 * "incoherent", we shouldn't have to worry about syncing depth into color
2366 * beforehand as depth should already be flushed.
2367 */
2368 if (vk_format_is_depth_or_stencil(attachment->format)) {
2369 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2370 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2371 } else {
2372 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2373 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2374 }
2375 }
2376
2377 void
2378 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2379 struct tu_cs *cs,
2380 uint32_t a,
2381 const VkRenderPassBeginInfo *info)
2382 {
2383 const struct tu_render_pass_attachment *attachment =
2384 &cmd->state.pass->attachments[a];
2385 unsigned clear_mask = 0;
2386
2387 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2388 clear_mask = 0xf;
2389 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2390 clear_mask |= 0x7;
2391 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2392 clear_mask |= 0x8;
2393
2394 if (!clear_mask)
2395 return;
2396
2397 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2398
2399 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2400 &info->pClearValues[a]);
2401 }
2402
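/* Emit the RB_BLIT state and BLIT event used to move a single attachment
 * between sysmem and GMEM: tu_load_gmem_attachment() calls this with
 * resolve=false, tu_store_gmem_attachment() with resolve=true.
 */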
2403 static void
2404 tu_emit_blit(struct tu_cmd_buffer *cmd,
2405 struct tu_cs *cs,
2406 const struct tu_image_view *iview,
2407 const struct tu_render_pass_attachment *attachment,
2408 bool resolve)
2409 {
2410 tu_cs_emit_regs(cs,
2411 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2412
2413 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2414 .unk0 = !resolve,
2415 .gmem = !resolve,
2416 /* "integer" bit disables msaa resolve averaging */
2417 .integer = vk_format_is_int(attachment->format)));
2418
2419 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2420 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2421 tu_cs_image_ref_2d(cs, iview, 0, false);
2422
2423 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2424 tu_cs_image_flag_ref(cs, iview, 0);
2425
2426 tu_cs_emit_regs(cs,
2427 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2428
2429 tu6_emit_event_write(cmd, cs, BLIT);
2430 }
2431
2432 static bool
2433 blit_can_resolve(VkFormat format)
2434 {
2435 const struct util_format_description *desc = vk_format_description(format);
2436
2437 /* blit event can only do resolve for simple cases:
2438 * averaging samples as unsigned integers or choosing only one sample
2439 */
2440 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2441 return false;
2442
2443 /* can't do formats with channel sizes larger than 10 bits
2444 * note: this includes all float formats
2445 * note2: single-channel integer formats seem OK
2446 */
2447 if (desc->channel[0].size > 10)
2448 return false;
2449
2450 switch (format) {
2451 /* for unknown reasons the blit event can't MSAA-resolve these formats when tiled,
2452 * likely because these formats have a different layout from other cpp=2 formats
2453 */
2454 case VK_FORMAT_R8G8_UNORM:
2455 case VK_FORMAT_R8G8_UINT:
2456 case VK_FORMAT_R8G8_SINT:
2457 /* TODO: this one should be able to work? */
2458 case VK_FORMAT_D24_UNORM_S8_UINT:
2459 return false;
2460 default:
2461 break;
2462 }
2463
2464 return true;
2465 }
2466
2467 void
2468 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2469 struct tu_cs *cs,
2470 uint32_t a,
2471 bool force_load)
2472 {
2473 const struct tu_image_view *iview =
2474 cmd->state.framebuffer->attachments[a].attachment;
2475 const struct tu_render_pass_attachment *attachment =
2476 &cmd->state.pass->attachments[a];
2477
2478 if (attachment->load || force_load)
2479 tu_emit_blit(cmd, cs, iview, attachment, false);
2480 }
2481
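/* Store the GMEM contents of attachment gmem_a out to the sysmem image of
 * attachment a (an MSAA resolve when the two differ). The BLIT event fast
 * path is used when the render area is suitably aligned; otherwise fall
 * back to a CP_BLIT that reads directly from GMEM via the 2D engine.
 */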
2482 void
2483 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2484 struct tu_cs *cs,
2485 uint32_t a,
2486 uint32_t gmem_a)
2487 {
2488 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2489 const VkRect2D *render_area = &tiling->render_area;
2490 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2491 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2492 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2493
2494 if (!dst->store)
2495 return;
2496
2497 uint32_t x1 = render_area->offset.x;
2498 uint32_t y1 = render_area->offset.y;
2499 uint32_t x2 = x1 + render_area->extent.width;
2500 uint32_t y2 = y1 + render_area->extent.height;
2501 /* x2/y2 can be unaligned if equal to the size of the image,
2502 * since the store will then write into padding space.
2503 * The one exception is linear levels, which don't have the
2504 * required y padding in the layout (except for the last level).
2505 */
2506 bool need_y2_align =
2507 y2 != iview->extent.height || iview->need_y2_align;
2508
2509 bool unaligned =
2510 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2511 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2512
2513 /* use fast path when render area is aligned, except for unsupported resolve cases */
2514 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2515 tu_emit_blit(cmd, cs, iview, src, true);
2516 return;
2517 }
2518
2519 if (dst->samples > 1) {
2520 /* We probably need to use the shader path in this case;
2521 * need a testcase which fails because of this.
2522 */
2523 tu_finishme("unaligned store of msaa attachment\n");
2524 return;
2525 }
2526
2527 r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
2528 r2d_dst(cs, iview, 0);
2529 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2530
2531 tu_cs_emit_regs(cs,
2532 A6XX_SP_PS_2D_SRC_INFO(
2533 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2534 .tile_mode = TILE6_2,
2535 .srgb = vk_format_is_srgb(src->format),
2536 .samples = tu_msaa_samples(src->samples),
2537 .samples_average = !vk_format_is_int(src->format),
2538 .unk20 = 1,
2539 .unk22 = 1),
2540 /* note: src size does not matter when not scaling */
2541 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2542 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2543 A6XX_SP_PS_2D_SRC_HI(),
2544 A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
2545
2546 /* sync GMEM writes with CACHE. */
2547 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2548
2549 /* Wait for CACHE_INVALIDATE to land */
2550 tu_cs_emit_wfi(cs);
2551
2552 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2553 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2554
2555 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2556 * sysmem, and we generally assume that GMEM renderpasses leave their
2557 * results in sysmem, so we need to flush manually here.
2558 */
2559 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2560 }