turnip: set VFD_INDEX_OFFSET in 3D clear/blit path
mesa.git: src/freedreno/vulkan/tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 /* helper functions previously in tu_formats.c */
20
21 static uint32_t
22 tu_pack_mask(int bits)
23 {
24 assert(bits <= 32);
25 return (1ull << bits) - 1;
26 }
27
28 static uint32_t
29 tu_pack_float32_for_unorm(float val, int bits)
30 {
31 const uint32_t max = tu_pack_mask(bits);
32 if (val < 0.0f)
33 return 0;
34 else if (val > 1.0f)
35 return max;
36 else
37 return _mesa_lroundevenf(val * (float) max);
38 }
39
40 static uint32_t
41 tu_pack_float32_for_snorm(float val, int bits)
42 {
43 const int32_t max = tu_pack_mask(bits - 1);
44 int32_t tmp;
45 if (val < -1.0f)
46 tmp = -max;
47 else if (val > 1.0f)
48 tmp = max;
49 else
50 tmp = _mesa_lroundevenf(val * (float) max);
51
52 return tmp & tu_pack_mask(bits);
53 }
54
55 static uint32_t
56 tu_pack_float32_for_uscaled(float val, int bits)
57 {
58 const uint32_t max = tu_pack_mask(bits);
59 if (val < 0.0f)
60 return 0;
61 else if (val > (float) max)
62 return max;
63 else
64 return (uint32_t) val;
65 }
66
67 static uint32_t
68 tu_pack_float32_for_sscaled(float val, int bits)
69 {
70 const int32_t max = tu_pack_mask(bits - 1);
71 const int32_t min = -max - 1;
72 int32_t tmp;
73 if (val < (float) min)
74 tmp = min;
75 else if (val > (float) max)
76 tmp = max;
77 else
78 tmp = (int32_t) val;
79
80 return tmp & tu_pack_mask(bits);
81 }
82
83 static uint32_t
84 tu_pack_uint32_for_uint(uint32_t val, int bits)
85 {
86 return val & tu_pack_mask(bits);
87 }
88
89 static uint32_t
90 tu_pack_int32_for_sint(int32_t val, int bits)
91 {
92 return val & tu_pack_mask(bits);
93 }
94
95 static uint32_t
96 tu_pack_float32_for_sfloat(float val, int bits)
97 {
98 assert(bits == 16 || bits == 32);
99 return bits == 16 ? util_float_to_half(val) : fui(val);
100 }
101
102 union tu_clear_component_value {
103 float float32;
104 int32_t int32;
105 uint32_t uint32;
106 };
107
108 static uint32_t
109 tu_pack_clear_component_value(union tu_clear_component_value val,
110 const struct util_format_channel_description *ch)
111 {
112 uint32_t packed;
113
114 switch (ch->type) {
115 case UTIL_FORMAT_TYPE_UNSIGNED:
116 /* normalized, scaled, or pure integer */
117 if (ch->normalized)
118 packed = tu_pack_float32_for_unorm(val.float32, ch->size);
119 else if (ch->pure_integer)
120 packed = tu_pack_uint32_for_uint(val.uint32, ch->size);
121 else
122 packed = tu_pack_float32_for_uscaled(val.float32, ch->size);
123 break;
124 case UTIL_FORMAT_TYPE_SIGNED:
125 /* normalized, scaled, or pure integer */
126 if (ch->normalized)
127 packed = tu_pack_float32_for_snorm(val.float32, ch->size);
128 else if (ch->pure_integer)
129 packed = tu_pack_int32_for_sint(val.int32, ch->size);
130 else
131 packed = tu_pack_float32_for_sscaled(val.float32, ch->size);
132 break;
133 case UTIL_FORMAT_TYPE_FLOAT:
134 packed = tu_pack_float32_for_sfloat(val.float32, ch->size);
135 break;
136 default:
137 unreachable("unexpected channel type");
138 packed = 0;
139 break;
140 }
141
142 assert((packed & tu_pack_mask(ch->size)) == packed);
143 return packed;
144 }
145
146 static const struct util_format_channel_description *
147 tu_get_format_channel_description(const struct util_format_description *desc,
148 int comp)
149 {
150 switch (desc->swizzle[comp]) {
151 case PIPE_SWIZZLE_X:
152 return &desc->channel[0];
153 case PIPE_SWIZZLE_Y:
154 return &desc->channel[1];
155 case PIPE_SWIZZLE_Z:
156 return &desc->channel[2];
157 case PIPE_SWIZZLE_W:
158 return &desc->channel[3];
159 default:
160 return NULL;
161 }
162 }
163
164 static union tu_clear_component_value
165 tu_get_clear_component_value(const VkClearValue *val, int comp,
166 enum util_format_colorspace colorspace)
167 {
168 assert(comp < 4);
169
170 union tu_clear_component_value tmp;
171 switch (colorspace) {
172 case UTIL_FORMAT_COLORSPACE_ZS:
173 assert(comp < 2);
174 if (comp == 0)
175 tmp.float32 = val->depthStencil.depth;
176 else
177 tmp.uint32 = val->depthStencil.stencil;
178 break;
179 case UTIL_FORMAT_COLORSPACE_SRGB:
180 if (comp < 3) {
181 tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]);
182 break;
183 }
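/* alpha (comp == 3) is not sRGB-encoded: fall through to the default case
 * and take the raw component bits */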
184 default:
185 assert(comp < 4);
186 tmp.uint32 = val->color.uint32[comp];
187 break;
188 }
189
190 return tmp;
191 }
192
193 /* r2d_ = BLIT_OP_SCALE operations */
194
195 static enum a6xx_2d_ifmt
196 format_to_ifmt(enum a6xx_format fmt)
197 {
198 switch (fmt) {
199 case FMT6_A8_UNORM:
200 case FMT6_8_UNORM:
201 case FMT6_8_SNORM:
202 case FMT6_8_8_UNORM:
203 case FMT6_8_8_SNORM:
204 case FMT6_8_8_8_8_UNORM:
205 case FMT6_8_8_8_X8_UNORM:
206 case FMT6_8_8_8_8_SNORM:
207 case FMT6_4_4_4_4_UNORM:
208 case FMT6_5_5_5_1_UNORM:
209 case FMT6_5_6_5_UNORM:
210 case FMT6_Z24_UNORM_S8_UINT:
211 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
212 return R2D_UNORM8;
213
214 case FMT6_32_UINT:
215 case FMT6_32_SINT:
216 case FMT6_32_32_UINT:
217 case FMT6_32_32_SINT:
218 case FMT6_32_32_32_32_UINT:
219 case FMT6_32_32_32_32_SINT:
220 return R2D_INT32;
221
222 case FMT6_16_UINT:
223 case FMT6_16_SINT:
224 case FMT6_16_16_UINT:
225 case FMT6_16_16_SINT:
226 case FMT6_16_16_16_16_UINT:
227 case FMT6_16_16_16_16_SINT:
228 case FMT6_10_10_10_2_UINT:
229 return R2D_INT16;
230
231 case FMT6_8_UINT:
232 case FMT6_8_SINT:
233 case FMT6_8_8_UINT:
234 case FMT6_8_8_SINT:
235 case FMT6_8_8_8_8_UINT:
236 case FMT6_8_8_8_8_SINT:
237 return R2D_INT8;
238
239 case FMT6_16_UNORM:
240 case FMT6_16_SNORM:
241 case FMT6_16_16_UNORM:
242 case FMT6_16_16_SNORM:
243 case FMT6_16_16_16_16_UNORM:
244 case FMT6_16_16_16_16_SNORM:
245 case FMT6_32_FLOAT:
246 case FMT6_32_32_FLOAT:
247 case FMT6_32_32_32_32_FLOAT:
248 return R2D_FLOAT32;
249
250 case FMT6_16_FLOAT:
251 case FMT6_16_16_FLOAT:
252 case FMT6_16_16_16_16_FLOAT:
253 case FMT6_11_11_10_FLOAT:
254 case FMT6_10_10_10_2_UNORM:
255 case FMT6_10_10_10_2_UNORM_DEST:
256 return R2D_FLOAT16;
257
258 default:
259 unreachable("bad format");
260 return 0;
261 }
262 }
263
264 static void
265 r2d_coords(struct tu_cs *cs,
266 const VkOffset2D *dst,
267 const VkOffset2D *src,
268 const VkExtent2D *extent)
269 {
270 tu_cs_emit_regs(cs,
271 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
272 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
273
274 if (!src)
275 return;
276
277 tu_cs_emit_regs(cs,
278 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
279 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
280 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
281 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
282 }
283
284 static void
285 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
286 {
287 uint32_t clear_value[4] = {};
288
289 switch (format) {
290 case VK_FORMAT_X8_D24_UNORM_PACK32:
291 case VK_FORMAT_D24_UNORM_S8_UINT:
292 /* cleared as r8g8b8a8_unorm using special format */
293 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
294 clear_value[1] = clear_value[0] >> 8;
295 clear_value[2] = clear_value[0] >> 16;
296 clear_value[3] = val->depthStencil.stencil;
297 break;
298 case VK_FORMAT_D16_UNORM:
299 case VK_FORMAT_D32_SFLOAT:
300 /* R2D_FLOAT32 */
301 clear_value[0] = fui(val->depthStencil.depth);
302 break;
303 case VK_FORMAT_S8_UINT:
304 clear_value[0] = val->depthStencil.stencil;
305 break;
306 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
307 /* cleared as UINT32 */
308 clear_value[0] = float3_to_rgb9e5(val->color.float32);
309 break;
310 default:
311 assert(!vk_format_is_depth_or_stencil(format));
312 const struct util_format_description *desc = vk_format_description(format);
313 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
314
315 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
316 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
317
318 for (unsigned i = 0; i < desc->nr_channels; i++) {
319 const struct util_format_channel_description *ch = &desc->channel[i];
320 if (ifmt == R2D_UNORM8) {
321 float linear = val->color.float32[i];
322 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
323 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
324
325 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
326 clear_value[i] = tu_pack_float32_for_snorm(linear, 8);
327 else
328 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
329 } else if (ifmt == R2D_FLOAT16) {
330 clear_value[i] = util_float_to_half(val->color.float32[i]);
331 } else {
332 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
333 ifmt == R2D_INT16 || ifmt == R2D_INT8);
334 clear_value[i] = val->color.uint32[i];
335 }
336 }
337 break;
338 }
339
340 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
341 tu_cs_emit_array(cs, clear_value, 4);
342 }
343
344 static void
345 r2d_src(struct tu_cmd_buffer *cmd,
346 struct tu_cs *cs,
347 const struct tu_image_view *iview,
348 uint32_t layer,
349 bool linear_filter)
350 {
351 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
352 tu_cs_emit(cs, iview->SP_PS_2D_SRC_INFO |
353 COND(linear_filter, A6XX_SP_PS_2D_SRC_INFO_FILTER));
354 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
355 tu_cs_image_ref_2d(cs, iview, layer, true);
356
357 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
358 tu_cs_image_flag_ref(cs, iview, layer);
359 }
360
361 static void
362 r2d_src_buffer(struct tu_cmd_buffer *cmd,
363 struct tu_cs *cs,
364 VkFormat vk_format,
365 uint64_t va, uint32_t pitch,
366 uint32_t width, uint32_t height)
367 {
368 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
369
370 tu_cs_emit_regs(cs,
371 A6XX_SP_PS_2D_SRC_INFO(
372 .color_format = format.fmt,
373 .color_swap = format.swap,
374 .srgb = vk_format_is_srgb(vk_format),
375 .unk20 = 1,
376 .unk22 = 1),
377 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
378 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
379 A6XX_SP_PS_2D_SRC_HI(va >> 32),
380 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
381 }
382
383 static void
384 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
385 {
386 assert(iview->image->samples == 1);
387
388 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
389 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
390 tu_cs_image_ref_2d(cs, iview, layer, false);
391
392 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
393 tu_cs_image_flag_ref(cs, iview, layer);
394 }
395
396 static void
397 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
398 {
399 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
400
401 tu_cs_emit_regs(cs,
402 A6XX_RB_2D_DST_INFO(
403 .color_format = format.fmt,
404 .color_swap = format.swap,
405 .srgb = vk_format_is_srgb(vk_format)),
406 A6XX_RB_2D_DST_LO((uint32_t) va),
407 A6XX_RB_2D_DST_HI(va >> 32),
408 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
409 }
410
411 static void
412 r2d_setup_common(struct tu_cmd_buffer *cmd,
413 struct tu_cs *cs,
414 VkFormat vk_format,
415 enum a6xx_rotation rotation,
416 bool clear,
417 uint8_t mask,
418 bool scissor)
419 {
420 enum a6xx_format format = tu6_base_format(vk_format);
421 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
422 uint32_t unknown_8c01 = 0;
423
424 if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) {
425 /* preserve depth channels */
426 if (mask == 0x8)
427 unknown_8c01 = 0x00084001;
428 /* preserve stencil channel */
429 if (mask == 0x7)
430 unknown_8c01 = 0x08000041;
431 }
432
433 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
434 tu_cs_emit(cs, unknown_8c01);
435
436 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
437 .scissor = scissor,
438 .rotate = rotation,
439 .solid_color = clear,
440 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
441 .color_format = format,
442 .mask = 0xf,
443 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
444 ).value;
445
446 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
447 tu_cs_emit(cs, blit_cntl);
448
449 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
450 tu_cs_emit(cs, blit_cntl);
451
452 if (format == FMT6_10_10_10_2_UNORM_DEST)
453 format = FMT6_16_16_16_16_FLOAT;
454
455 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
456 .sint = vk_format_is_sint(vk_format),
457 .uint = vk_format_is_uint(vk_format),
458 .color_format = format,
459 .srgb = vk_format_is_srgb(vk_format),
460 .mask = 0xf));
461 }
462
463 static void
464 r2d_setup(struct tu_cmd_buffer *cmd,
465 struct tu_cs *cs,
466 VkFormat vk_format,
467 enum a6xx_rotation rotation,
468 bool clear,
469 uint8_t mask)
470 {
471 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
472
473 r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false);
474 }
475
476 static void
477 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
478 {
479 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
480 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
481 }
482
483 /* r3d_ = shader path operations */
484
485 static void
486 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
487 bool layered_clear)
488 {
489 struct ir3_shader dummy_shader = {};
490
491 struct ir3_shader_variant vs = {
492 .type = MESA_SHADER_VERTEX,
493 .instrlen = 1,
494 .constlen = 2,
495 .info.max_reg = 1,
496 .inputs_count = 1,
497 .inputs[0] = {
498 .slot = SYSTEM_VALUE_VERTEX_ID,
499 .regid = regid(0, 3),
500 .sysval = true,
501 },
502 .outputs_count = blit ? 2 : 1,
503 .outputs[0] = {
504 .slot = VARYING_SLOT_POS,
505 .regid = regid(0, 0),
506 },
507 .outputs[1] = {
508 .slot = VARYING_SLOT_VAR0,
509 .regid = regid(1, 0),
510 },
511 .shader = &dummy_shader,
512 };
513 if (layered_clear) {
514 vs = (struct ir3_shader_variant) {
515 .type = MESA_SHADER_VERTEX,
516 .instrlen = 1,
517 .info.max_reg = 0,
518 .shader = &dummy_shader,
519 };
520 }
521
522 struct ir3_shader_variant fs = {
523 .type = MESA_SHADER_FRAGMENT,
524 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
525 .constlen = num_rts,
526 .info.max_reg = MAX2(num_rts, 1) - 1,
527 .total_in = blit ? 2 : 0,
528 .num_samp = blit ? 1 : 0,
529 .inputs_count = blit ? 2 : 0,
530 .inputs[0] = {
531 .slot = VARYING_SLOT_VAR0,
532 .inloc = 0,
533 .compmask = 3,
534 .bary = true,
535 },
536 .inputs[1] = {
537 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
538 .regid = regid(0, 0),
539 .sysval = 1,
540 },
541 .num_sampler_prefetch = blit ? 1 : 0,
542 .sampler_prefetch[0] = {
543 .src = 0,
544 .wrmask = 0xf,
545 .cmd = 4,
546 },
547 .shader = &dummy_shader,
548 };
549
550 struct ir3_shader_variant gs_shader = {
551 .type = MESA_SHADER_GEOMETRY,
552 .instrlen = 1,
553 .constlen = 2,
554 .info.max_reg = 1,
555 .inputs_count = 1,
556 .inputs[0] = {
557 .slot = SYSTEM_VALUE_GS_HEADER_IR3,
558 .regid = regid(0, 0),
559 .sysval = true,
560 },
561 .outputs_count = 3,
562 .outputs[0] = {
563 .slot = VARYING_SLOT_POS,
564 .regid = regid(0, 0),
565 },
566 .outputs[1] = {
567 .slot = VARYING_SLOT_LAYER,
568 .regid = regid(1, 1),
569 },
570 .outputs[2] = {
571 .slot = VARYING_SLOT_GS_VERTEX_FLAGS_IR3,
572 .regid = regid(1, 0),
573 },
574 .shader = &dummy_shader,
575 }, *gs = layered_clear ? &gs_shader : NULL;
576
577
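/* Hand-assembled ir3 instructions for the built-in clear/blit shaders: the
 * MOV/CAT2/CAT3 macros below construct raw cat1, cat2 and cat3 instruction
 * encodings directly.
 */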
578 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, args } }
579 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
580 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
581
582 static const instr_t vs_code[] = {
583 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
584 * r1.xy = r0.w ? c1.zw : c0.zw
585 * r0.w = 1.0f
586 */
587 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
588 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
589 .src2 = 3,
590 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
591 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
592 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
593 .src2 = 3,
594 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
595 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
596 { .cat0 = { .opc = OPC_END } },
597 };
598
599 static const instr_t vs_layered[] = {
600 { .cat0 = { .opc = OPC_CHMASK } },
601 { .cat0 = { .opc = OPC_CHSH } },
602 };
603
604 static const instr_t gs_code[16] = {
605 /* (sy)(ss)(nop3)shr.b r0.w, r0.x, 16 (extract local_id) */
606 CAT2(OPC_SHR_B, .dst = 3, .src1 = 0, .src2_im = 1, .src2 = 16,
607 .src1_r = 1, .src2_r = 1, .ss = 1, .sync = 1),
608 /* x = (local_id & 1) ? c1.x : c0.x */
609 CAT2(OPC_AND_B, .dst = 0, .src1 = 3, .src2_im = 1, .src2 = 1),
610 /* y = (local_id & 2) ? c1.y : c0.y */
611 CAT2(OPC_AND_B, .dst = 1, .src1 = 3, .src2_im = 1, .src2 = 2),
612 /* pred = (local_id >= 4), used by OPC_KILL */
613 CAT2(OPC_CMPS_S, .dst = REG_P0 * 4, .cond = IR3_COND_GE, .src1 = 3, .src2_im = 1, .src2 = 4),
614 /* vertex_flags_out = (local_id == 0) ? 4 : 0 - first vertex flag */
615 CAT2(OPC_CMPS_S, .dst = 4, .cond = IR3_COND_EQ, .src1 = 3, .src2_im = 1, .src2 = 0),
616
617 MOV(.dst = 2, .src_c = 1, .src = 2), /* depth clear value from c0.z */
618 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f),
619 MOV(.dst = 5, .src_c = 1, .src = 3), /* layer id from c0.w */
620
621 /* (rpt1)sel.b32 r0.x, (r)c1.x, (r)r0.x, (r)c0.x */
622 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 0,
623 .c1 = {.src1_c = 1, .src1 = 4, .dummy = 4}, .src1_r = 1,
624 .src2 = 0,
625 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
626
627 CAT2(OPC_SHL_B, .dst = 4, .src1 = 4, .src2_im = 1, .src2 = 2),
628
629 { .cat0 = { .opc = OPC_KILL } },
630 { .cat0 = { .opc = OPC_END, .ss = 1, .sync = 1 } },
631 };
632 #define FS_OFFSET (16 * sizeof(instr_t))
633 #define GS_OFFSET (32 * sizeof(instr_t))
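/* The sub_cs allocation below is made in 16-instruction slots: the VS at
 * offset 0, the FS at FS_OFFSET and, for layered clears, the GS at GS_OFFSET.
 */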
634
635 /* shaders */
636 struct ts_cs_memory shaders = { };
637 VkResult result = tu_cs_alloc(&cmd->sub_cs, 2 + layered_clear,
638 16 * sizeof(instr_t), &shaders);
639 assert(result == VK_SUCCESS);
640
641 if (layered_clear) {
642 memcpy(shaders.map, vs_layered, sizeof(vs_layered));
643 memcpy((uint8_t*) shaders.map + GS_OFFSET, gs_code, sizeof(gs_code));
644 } else {
645 memcpy(shaders.map, vs_code, sizeof(vs_code));
646 }
647
648 instr_t *fs_code = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
649 for (uint32_t i = 0; i < num_rts; i++) {
650 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
651 *fs_code++ = (instr_t) { .cat1 = {
652 .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
653 .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4
654 } };
655 }
656
657 /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
658 * blit path (its not clear what allows it to not have it)
659 */
660 if (blit) {
661 *fs_code++ = (instr_t) { .cat2 = {
662 .opc_cat = 2, .opc = OPC_BARY_F & 63, .ei = 1, .full = 1,
663 .dst = regid(63, 0), .src1_im = 1
664 } };
665 }
666 *fs_code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
667 /* note: assumed <= 16 instructions (MAX_RTS is 8) */
668
669 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
670
671 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, shaders.iova);
672 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
673 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
674 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, gs, shaders.iova + GS_OFFSET);
675 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, shaders.iova + FS_OFFSET);
676
677 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
678 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
679
680 tu6_emit_vpc(cs, &vs, gs, &fs, NULL);
681
682 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
683 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
684 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
685
686 tu6_emit_fs_inputs(cs, &fs);
687
688 tu_cs_emit_regs(cs,
689 A6XX_GRAS_CL_CNTL(
690 .persp_division_disable = 1,
691 .vp_xform_disable = 1,
692 .vp_clip_code_ignore = 1,
693 .clip_disable = 1),
694 A6XX_GRAS_UNKNOWN_8001(0));
695 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
696
697 tu_cs_emit_regs(cs,
698 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
699 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
700 tu_cs_emit_regs(cs,
701 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
702 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
703
704 tu_cs_emit_regs(cs,
705 A6XX_VFD_INDEX_OFFSET(),
706 A6XX_VFD_INSTANCE_START_OFFSET());
707 }
708
709 static void
710 r3d_coords_raw(struct tu_cs *cs, bool gs, const float *coords)
711 {
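/* coords holds two vec4 constants: c0 = (dst0.x, dst0.y, src0.x, src0.y) and
 * c1 = (dst1.x, dst1.y, src1.x, src1.y); the clear/blit VS (or the
 * layered-clear GS) picks between them per vertex.
 */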
712 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
713 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
714 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
715 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
716 CP_LOAD_STATE6_0_STATE_BLOCK(gs ? SB6_GS_SHADER : SB6_VS_SHADER) |
717 CP_LOAD_STATE6_0_NUM_UNIT(2));
718 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
719 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
720 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
721 }
722
723 static void
724 r3d_coords(struct tu_cs *cs,
725 const VkOffset2D *dst,
726 const VkOffset2D *src,
727 const VkExtent2D *extent)
728 {
729 int32_t src_x1 = src ? src->x : 0;
730 int32_t src_y1 = src ? src->y : 0;
731 r3d_coords_raw(cs, false, (float[]) {
732 dst->x, dst->y,
733 src_x1, src_y1,
734 dst->x + extent->width, dst->y + extent->height,
735 src_x1 + extent->width, src_y1 + extent->height,
736 });
737 }
738
739 static void
740 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
741 {
742 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
743 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
744 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
745 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
746 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
747 CP_LOAD_STATE6_0_NUM_UNIT(1));
748 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
749 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
750 switch (format) {
751 case VK_FORMAT_X8_D24_UNORM_PACK32:
752 case VK_FORMAT_D24_UNORM_S8_UINT: {
753 /* cleared as r8g8b8a8_unorm using special format */
754 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
755 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
756 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
757 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
758 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
759 } break;
760 case VK_FORMAT_D16_UNORM:
761 case VK_FORMAT_D32_SFLOAT:
762 tu_cs_emit(cs, fui(val->depthStencil.depth));
763 tu_cs_emit(cs, 0);
764 tu_cs_emit(cs, 0);
765 tu_cs_emit(cs, 0);
766 break;
767 case VK_FORMAT_S8_UINT:
768 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
769 tu_cs_emit(cs, 0);
770 tu_cs_emit(cs, 0);
771 tu_cs_emit(cs, 0);
772 break;
773 default:
774 /* for color formats the clear value is used as-is */
775 assert(!vk_format_is_depth_or_stencil(format));
776 tu_cs_emit_array(cs, val->color.uint32, 4);
777 break;
778 }
779 }
780
781 static void
782 r3d_src_common(struct tu_cmd_buffer *cmd,
783 struct tu_cs *cs,
784 const uint32_t *tex_const,
785 uint32_t offset_base,
786 uint32_t offset_ubwc,
787 bool linear_filter)
788 {
789 struct ts_cs_memory texture = { };
790 VkResult result = tu_cs_alloc(&cmd->sub_cs,
791 2, /* allocate space for a sampler too */
792 A6XX_TEX_CONST_DWORDS, &texture);
793 assert(result == VK_SUCCESS);
794
795 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
796
797 /* patch addresses for layer offset */
798 *(uint64_t*) (texture.map + 4) += offset_base;
799 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
800 texture.map[7] = ubwc_addr;
801 texture.map[8] = ubwc_addr >> 32;
802
803 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
804 A6XX_TEX_SAMP_0_XY_MAG(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
805 A6XX_TEX_SAMP_0_XY_MIN(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
806 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
807 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
808 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
809 0x60000; /* XXX used by blob, doesn't seem necessary */
810 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
811 0x1 | /* XXX used by blob, doesn't seem necessary */
812 A6XX_TEX_SAMP_1_UNNORM_COORDS |
813 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
814 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
815 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
816
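/* load the sampler state, which lives right after the texture descriptor in
 * the allocation above */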
817 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
818 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
819 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
820 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
821 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
822 CP_LOAD_STATE6_0_NUM_UNIT(1));
823 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
824
825 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
826 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
827
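/* load the texture descriptor itself */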
828 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
829 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
830 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
831 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
832 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
833 CP_LOAD_STATE6_0_NUM_UNIT(1));
834 tu_cs_emit_qw(cs, texture.iova);
835
836 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
837 tu_cs_emit_qw(cs, texture.iova);
838
839 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
840 }
841
842 static void
843 r3d_src(struct tu_cmd_buffer *cmd,
844 struct tu_cs *cs,
845 const struct tu_image_view *iview,
846 uint32_t layer,
847 bool linear_filter)
848 {
849 r3d_src_common(cmd, cs, iview->descriptor,
850 iview->layer_size * layer,
851 iview->ubwc_layer_size * layer,
852 linear_filter);
853 }
854
855 static void
856 r3d_src_buffer(struct tu_cmd_buffer *cmd,
857 struct tu_cs *cs,
858 VkFormat vk_format,
859 uint64_t va, uint32_t pitch,
860 uint32_t width, uint32_t height)
861 {
862 uint32_t desc[A6XX_TEX_CONST_DWORDS];
863
864 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
865
866 desc[0] =
867 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
868 A6XX_TEX_CONST_0_FMT(format.fmt) |
869 A6XX_TEX_CONST_0_SWAP(format.swap) |
870 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
871 // XXX to swizzle into .w for stencil buffer_to_image
872 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
873 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
874 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
875 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
876 desc[2] =
877 A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) |
878 A6XX_TEX_CONST_2_PITCH(pitch) |
879 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
880 desc[3] = 0;
881 desc[4] = va;
882 desc[5] = va >> 32;
883 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
884 desc[i] = 0;
885
886 r3d_src_common(cmd, cs, desc, 0, 0, false);
887 }
888
889 static void
890 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
891 {
892 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
893
894 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
895 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
896 tu_cs_image_ref(cs, iview, layer);
897 tu_cs_emit(cs, 0);
898
899 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
900 tu_cs_image_flag_ref(cs, iview, layer);
901
902 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
903 }
904
905 static void
906 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
907 {
908 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
909
910 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
911
912 tu_cs_emit_regs(cs,
913 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
914 A6XX_RB_MRT_PITCH(0, pitch),
915 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
916 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
917 A6XX_RB_MRT_BASE_HI(0, va >> 32),
918 A6XX_RB_MRT_BASE_GMEM(0, 0));
919
920 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
921 }
922
923 static void
924 r3d_setup(struct tu_cmd_buffer *cmd,
925 struct tu_cs *cs,
926 VkFormat vk_format,
927 enum a6xx_rotation rotation,
928 bool clear,
929 uint8_t mask)
930 {
931 if (!cmd->state.pass) {
932 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
933 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
934 }
935
936 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
937 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
938
939 r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
940
941 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
942 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
943 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
944 0xfc000000);
945 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
946
947 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
948 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
949
950 tu_cs_emit_regs(cs,
951 A6XX_RB_FS_OUTPUT_CNTL0(),
952 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
953
954 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
955 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
956 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
957
958 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
959 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
960 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
961 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
962 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
963 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
964 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
965
966 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
967 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
968
969 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
970 .color_format = tu6_base_format(vk_format),
971 .color_sint = vk_format_is_sint(vk_format),
972 .color_uint = vk_format_is_uint(vk_format)));
973
974 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask));
975 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
976 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
977 }
978
979 static void
980 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
981 {
982 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
983 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
984 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
985 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
986 tu_cs_emit(cs, 1); /* instance count */
987 tu_cs_emit(cs, 2); /* vertex count */
988 }
989
990 /* blit ops - common interface for 2d/shader paths */
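/* r2d_* drives the dedicated BLIT_OP_SCALE 2D engine, while r3d_* draws a
 * RECTLIST with the hand-rolled shaders above.  The 3D path is used where the
 * 2D engine falls short (e.g. multisampled destinations or the stencil-aspect
 * copies below).
 */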
991
992 struct blit_ops {
993 void (*coords)(struct tu_cs *cs,
994 const VkOffset2D *dst,
995 const VkOffset2D *src,
996 const VkExtent2D *extent);
997 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
998 void (*src)(
999 struct tu_cmd_buffer *cmd,
1000 struct tu_cs *cs,
1001 const struct tu_image_view *iview,
1002 uint32_t layer,
1003 bool linear_filter);
1004 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1005 VkFormat vk_format,
1006 uint64_t va, uint32_t pitch,
1007 uint32_t width, uint32_t height);
1008 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1009 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
1010 void (*setup)(struct tu_cmd_buffer *cmd,
1011 struct tu_cs *cs,
1012 VkFormat vk_format,
1013 enum a6xx_rotation rotation,
1014 bool clear,
1015 uint8_t mask);
1016 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1017 };
1018
1019 static const struct blit_ops r2d_ops = {
1020 .coords = r2d_coords,
1021 .clear_value = r2d_clear_value,
1022 .src = r2d_src,
1023 .src_buffer = r2d_src_buffer,
1024 .dst = r2d_dst,
1025 .dst_buffer = r2d_dst_buffer,
1026 .setup = r2d_setup,
1027 .run = r2d_run,
1028 };
1029
1030 static const struct blit_ops r3d_ops = {
1031 .coords = r3d_coords,
1032 .clear_value = r3d_clear_value,
1033 .src = r3d_src,
1034 .src_buffer = r3d_src_buffer,
1035 .dst = r3d_dst,
1036 .dst_buffer = r3d_dst_buffer,
1037 .setup = r3d_setup,
1038 .run = r3d_run,
1039 };
1040
1041 /* passthrough set coords from 3D extents */
1042 static void
1043 coords(const struct blit_ops *ops,
1044 struct tu_cs *cs,
1045 const VkOffset3D *dst,
1046 const VkOffset3D *src,
1047 const VkExtent3D *extent)
1048 {
1049 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
1050 }
1051
1052 static void
1053 tu_image_view_blit2(struct tu_image_view *iview,
1054 struct tu_image *image,
1055 VkFormat format,
1056 const VkImageSubresourceLayers *subres,
1057 uint32_t layer,
1058 bool stencil_read)
1059 {
1060 VkImageAspectFlags aspect_mask = subres->aspectMask;
1061
1062 /* always use the AS_R8G8B8A8 format for these */
1063 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
1064 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
1065 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
1066 }
1067
1068 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
1069 .image = tu_image_to_handle(image),
1070 .viewType = VK_IMAGE_VIEW_TYPE_2D,
1071 .format = format,
1072 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
1073 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
1074 .subresourceRange = {
1075 .aspectMask = aspect_mask,
1076 .baseMipLevel = subres->mipLevel,
1077 .levelCount = 1,
1078 .baseArrayLayer = subres->baseArrayLayer + layer,
1079 .layerCount = 1,
1080 },
1081 });
1082 }
1083
1084 static void
1085 tu_image_view_blit(struct tu_image_view *iview,
1086 struct tu_image *image,
1087 const VkImageSubresourceLayers *subres,
1088 uint32_t layer)
1089 {
1090 tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false);
1091 }
1092
1093 static void
1094 tu6_blit_image(struct tu_cmd_buffer *cmd,
1095 struct tu_image *src_image,
1096 struct tu_image *dst_image,
1097 const VkImageBlit *info,
1098 VkFilter filter)
1099 {
1100 const struct blit_ops *ops = &r2d_ops;
1101 struct tu_cs *cs = &cmd->cs;
1102 uint32_t layers;
1103
1104 /* the 2D blit engine can't mirror from coordinates alone, so mirroring is expressed through these rotate modes */
1105 static const enum a6xx_rotation rotate[2][2] = {
1106 {ROTATE_0, ROTATE_HFLIP},
1107 {ROTATE_VFLIP, ROTATE_180},
1108 };
1109
1110 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1111 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1112 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1113 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1114 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
1115 (info->dstOffsets[1].z < info->dstOffsets[0].z);
1116
1117 if (mirror_z) {
1118 tu_finishme("blit z mirror\n");
1119 return;
1120 }
1121
1122 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1123 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1124 tu_finishme("blit z filter\n");
1125 return;
1126 }
1127
1128 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1129 if (info->dstSubresource.layerCount > 1) {
1130 assert(layers <= 1);
1131 layers = info->dstSubresource.layerCount;
1132 }
1133
1134 uint8_t mask = 0xf;
1135 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1136 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1137 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1138 mask = 0x7;
1139 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1140 mask = 0x8;
1141 }
1142
1143 /* BC1_RGB_* formats need to have their last component overridden with 1
1144 * when sampling, which is normally handled with the texture descriptor
1145 * swizzle. The 2d path can't handle that, so use the 3d path.
1146 *
1147 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1148 * the 2d path.
1149 */
1150
1151 if (dst_image->samples > 1 ||
1152 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1153 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK)
1154 ops = &r3d_ops;
1155
1156 /* TODO: the shader path fails some of the blit_image.all_formats.generate_mipmaps.*
1157 * tests; figure out why (it should be able to pass all tests on its own)
1158 */
1159
1160 ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask);
1161
1162 if (ops == &r3d_ops) {
1163 r3d_coords_raw(cs, false, (float[]) {
1164 info->dstOffsets[0].x, info->dstOffsets[0].y,
1165 info->srcOffsets[0].x, info->srcOffsets[0].y,
1166 info->dstOffsets[1].x, info->dstOffsets[1].y,
1167 info->srcOffsets[1].x, info->srcOffsets[1].y
1168 });
1169 } else {
1170 tu_cs_emit_regs(cs,
1171 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1172 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1173 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1174 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1175 tu_cs_emit_regs(cs,
1176 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1177 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1178 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1179 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1180 }
1181
1182 struct tu_image_view dst, src;
1183 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1184 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1185
1186 for (uint32_t i = 0; i < layers; i++) {
1187 ops->dst(cs, &dst, i);
1188 ops->src(cmd, cs, &src, i, filter == VK_FILTER_LINEAR);
1189 ops->run(cmd, cs);
1190 }
1191 }
1192
1193 void
1194 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1195 VkImage srcImage,
1196 VkImageLayout srcImageLayout,
1197 VkImage dstImage,
1198 VkImageLayout dstImageLayout,
1199 uint32_t regionCount,
1200 const VkImageBlit *pRegions,
1201 VkFilter filter)
1202
1203 {
1204 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1205 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1206 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1207
1208 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1209 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1210
1211 for (uint32_t i = 0; i < regionCount; ++i)
1212 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1213 }
1214
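/* Pick an uncompressed R*_UINT format with the same texel block size; copies
 * involving compressed (or E5B9G9R9) images are performed as raw blocks in
 * that format.
 */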
1215 static VkFormat
1216 copy_format(VkFormat format)
1217 {
1218 switch (vk_format_get_blocksize(format)) {
1219 case 1: return VK_FORMAT_R8_UINT;
1220 case 2: return VK_FORMAT_R16_UINT;
1221 case 4: return VK_FORMAT_R32_UINT;
1222 case 8: return VK_FORMAT_R32G32_UINT;
1223 case 12:return VK_FORMAT_R32G32B32_UINT;
1224 case 16:return VK_FORMAT_R32G32B32A32_UINT;
1225 default:
1226 unreachable("unhandled format size");
1227 }
1228 }
1229
1230 static void
1231 copy_compressed(VkFormat format,
1232 VkOffset3D *offset,
1233 VkExtent3D *extent,
1234 uint32_t *width,
1235 uint32_t *height)
1236 {
1237 if (!vk_format_is_compressed(format))
1238 return;
1239
1240 uint32_t block_width = vk_format_get_blockwidth(format);
1241 uint32_t block_height = vk_format_get_blockheight(format);
1242
1243 offset->x /= block_width;
1244 offset->y /= block_height;
1245
1246 if (extent) {
1247 extent->width = DIV_ROUND_UP(extent->width, block_width);
1248 extent->height = DIV_ROUND_UP(extent->height, block_height);
1249 }
1250 if (width)
1251 *width = DIV_ROUND_UP(*width, block_width);
1252 if (height)
1253 *height = DIV_ROUND_UP(*height, block_height);
1254 }
1255
1256 static void
1257 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1258 struct tu_buffer *src_buffer,
1259 struct tu_image *dst_image,
1260 const VkBufferImageCopy *info)
1261 {
1262 struct tu_cs *cs = &cmd->cs;
1263 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1264 VkFormat dst_format = dst_image->vk_format;
1265 VkFormat src_format = dst_image->vk_format;
1266 const struct blit_ops *ops = &r2d_ops;
1267
1268 uint8_t mask = 0xf;
1269
1270 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1271 switch (info->imageSubresource.aspectMask) {
1272 case VK_IMAGE_ASPECT_STENCIL_BIT:
1273 src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */
1274 mask = 0x8;
1275 ops = &r3d_ops;
1276 break;
1277 case VK_IMAGE_ASPECT_DEPTH_BIT:
1278 mask = 0x7;
1279 break;
1280 }
1281 }
1282
1283 VkOffset3D offset = info->imageOffset;
1284 VkExtent3D extent = info->imageExtent;
1285 uint32_t src_width = info->bufferRowLength ?: extent.width;
1286 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1287
1288 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
1289 assert(src_format == dst_format);
1290 copy_compressed(dst_format, &offset, &extent, &src_width, &src_height);
1291 src_format = dst_format = copy_format(dst_format);
1292 }
1293
1294 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1295 uint32_t layer_size = src_height * pitch;
1296
1297 /* note: the 64-byte src_va/pitch alignment is required by the 2D engine;
1298 * the same handling also works for the 1cpp-format shader path (stencil aspect path)
1299 */
1300
1301 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1302
1303 struct tu_image_view dst;
1304 tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false);
1305
1306 for (uint32_t i = 0; i < layers; i++) {
1307 ops->dst(cs, &dst, i);
1308
1309 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1310 if ((src_va & 63) || (pitch & 63)) {
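/* unaligned case: blit one row at a time, folding the address misalignment
 * into the source x offset so the base address stays 64-byte aligned */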
1311 for (uint32_t y = 0; y < extent.height; y++) {
1312 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1313 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1314 x + extent.width, 1);
1315 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1316 &(VkExtent2D) {extent.width, 1});
1317 ops->run(cmd, cs);
1318 src_va += pitch;
1319 }
1320 } else {
1321 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1322 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1323 ops->run(cmd, cs);
1324 }
1325 }
1326 }
1327
1328 void
1329 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1330 VkBuffer srcBuffer,
1331 VkImage dstImage,
1332 VkImageLayout dstImageLayout,
1333 uint32_t regionCount,
1334 const VkBufferImageCopy *pRegions)
1335 {
1336 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1337 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1338 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1339
1340 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1341 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1342
1343 for (unsigned i = 0; i < regionCount; ++i)
1344 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1345 }
1346
1347 static void
1348 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1349 struct tu_image *src_image,
1350 struct tu_buffer *dst_buffer,
1351 const VkBufferImageCopy *info)
1352 {
1353 struct tu_cs *cs = &cmd->cs;
1354 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1355 VkFormat src_format = src_image->vk_format;
1356 VkFormat dst_format = src_image->vk_format;
1357 bool stencil_read = false;
1358
1359 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1360 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1361 dst_format = VK_FORMAT_R8_UNORM;
1362 stencil_read = true;
1363 }
1364
1365 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1366 VkOffset3D offset = info->imageOffset;
1367 VkExtent3D extent = info->imageExtent;
1368 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1369 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1370
1371 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) {
1372 assert(src_format == dst_format);
1373 copy_compressed(dst_format, &offset, &extent, &dst_width, &dst_height);
1374 src_format = dst_format = copy_format(dst_format);
1375 }
1376
1377 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1378 uint32_t layer_size = pitch * dst_height;
1379
1380 /* note: the 64-byte dst_va/pitch alignment is required by the 2D engine;
1381 * the same handling also works for the 1cpp-format shader path (stencil aspect)
1382 */
1383
1384 ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
1385
1386 struct tu_image_view src;
1387 tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read);
1388
1389 for (uint32_t i = 0; i < layers; i++) {
1390 ops->src(cmd, cs, &src, i, false);
1391
1392 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1393 if ((dst_va & 63) || (pitch & 63)) {
1394 for (uint32_t y = 0; y < extent.height; y++) {
1395 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1396 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1397 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1398 &(VkExtent2D) {extent.width, 1});
1399 ops->run(cmd, cs);
1400 dst_va += pitch;
1401 }
1402 } else {
1403 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1404 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1405 ops->run(cmd, cs);
1406 }
1407 }
1408 }
1409
1410 void
1411 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1412 VkImage srcImage,
1413 VkImageLayout srcImageLayout,
1414 VkBuffer dstBuffer,
1415 uint32_t regionCount,
1416 const VkBufferImageCopy *pRegions)
1417 {
1418 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1419 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1420 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1421
1422 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1423 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1424
1425 for (unsigned i = 0; i < regionCount; ++i)
1426 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1427 }
1428
1429 /* Tiled formats don't support swapping, which means that we can't support
1430 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1431 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1432 * Currently we fake support for tiled swapped formats and use the unswapped
1433 * format instead, but this means that reinterpreting copies to and from
1434 * swapped formats can't be performed correctly unless we can swizzle the
1435 * components by reinterpreting the other image as the "correct" swapped
1436 * format, i.e. only when the other image is linear.
1437 */
1438
1439 static bool
1440 is_swapped_format(VkFormat format)
1441 {
1442 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1443 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1444 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1445 }
1446
1447 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1448 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1449 * versa). This should mirror the logic in fdl6_layout.
1450 */
1451 static bool
1452 image_is_r8g8(struct tu_image *image)
1453 {
1454 return image->layout.cpp == 2 &&
1455 vk_format_get_nr_components(image->vk_format) == 2;
1456 }
1457
1458 static void
1459 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1460 struct tu_image *src_image,
1461 struct tu_image *dst_image,
1462 const VkImageCopy *info)
1463 {
1464 const struct blit_ops *ops = &r2d_ops;
1465 struct tu_cs *cs = &cmd->cs;
1466
1467 uint8_t mask = 0xf;
1468 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1469 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1470 mask = 0x7;
1471 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1472 mask = 0x8;
1473 }
1474
1475 if (dst_image->samples > 1)
1476 ops = &r3d_ops;
1477
1478 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1479
1480 VkFormat format = VK_FORMAT_UNDEFINED;
1481 VkOffset3D src_offset = info->srcOffset;
1482 VkOffset3D dst_offset = info->dstOffset;
1483 VkExtent3D extent = info->extent;
1484
1485 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1486 * Images":
1487 *
1488 * When copying between compressed and uncompressed formats the extent
1489 * members represent the texel dimensions of the source image and not
1490 * the destination. When copying from a compressed image to an
1491 * uncompressed image the image texel dimensions written to the
1492 * uncompressed image will be source extent divided by the compressed
1493 * texel block dimensions. When copying from an uncompressed image to a
1494 * compressed image the image texel dimensions written to the compressed
1495 * image will be the source extent multiplied by the compressed texel
1496 * block dimensions.
1497 *
1498 * This means we only have to adjust the extent if the source image is
1499 * compressed.
1500 */
1501 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1502 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1503
1504 VkFormat dst_format = vk_format_is_compressed(dst_image->vk_format) ?
1505 copy_format(dst_image->vk_format) : dst_image->vk_format;
1506 VkFormat src_format = vk_format_is_compressed(src_image->vk_format) ?
1507 copy_format(src_image->vk_format) : src_image->vk_format;
1508
1509 bool use_staging_blit = false;
1510
1511 if (src_format == dst_format) {
1512 /* Images that share a format can always be copied directly because it's
1513 * the same as a blit.
1514 */
1515 format = src_format;
1516 } else if (!src_image->layout.tile_mode) {
1517 /* If an image is linear, we can always safely reinterpret it with the
1518 * other image's format and then do a regular blit.
1519 */
1520 format = dst_format;
1521 } else if (!dst_image->layout.tile_mode) {
1522 format = src_format;
1523 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1524 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1525 * due to the different tile layout.
1526 */
1527 use_staging_blit = true;
1528 } else if (is_swapped_format(src_format) ||
1529 is_swapped_format(dst_format)) {
1530 /* If either format has a non-identity swap, then we can't copy
1531 * to/from it.
1532 */
1533 use_staging_blit = true;
1534 } else if (!src_image->layout.ubwc) {
1535 format = dst_format;
1536 } else if (!dst_image->layout.ubwc) {
1537 format = src_format;
1538 } else {
1539 /* Both formats use UBWC and so neither can be reinterpreted.
1540 * TODO: We could do an in-place decompression of the dst instead.
1541 */
1542 use_staging_blit = true;
1543 }
1544
1545 struct tu_image_view dst, src;
1546
1547 if (use_staging_blit) {
1548 tu_image_view_blit2(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1549 tu_image_view_blit2(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1550
1551 struct tu_image staging_image = {
1552 .vk_format = src_format,
1553 .type = src_image->type,
1554 .tiling = VK_IMAGE_TILING_LINEAR,
1555 .extent = extent,
1556 .level_count = 1,
1557 .layer_count = info->srcSubresource.layerCount,
1558 .samples = src_image->samples,
1559 .bo_offset = 0,
1560 };
1561
1562 VkImageSubresourceLayers staging_subresource = {
1563 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1564 .mipLevel = 0,
1565 .baseArrayLayer = 0,
1566 .layerCount = info->srcSubresource.layerCount,
1567 };
1568
1569 VkOffset3D staging_offset = { 0 };
1570
1571 staging_image.layout.tile_mode = TILE6_LINEAR;
1572 staging_image.layout.ubwc = false;
1573
1574 fdl6_layout(&staging_image.layout,
1575 vk_format_to_pipe_format(staging_image.vk_format),
1576 staging_image.samples,
1577 staging_image.extent.width,
1578 staging_image.extent.height,
1579 staging_image.extent.depth,
1580 staging_image.level_count,
1581 staging_image.layer_count,
1582 staging_image.type == VK_IMAGE_TYPE_3D,
1583 NULL);
1584
1585 VkResult result = tu_get_scratch_bo(cmd->device,
1586 staging_image.layout.size,
1587 &staging_image.bo);
1588 if (result != VK_SUCCESS) {
1589 cmd->record_result = result;
1590 return;
1591 }
1592
1593 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1594 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1595
1596 struct tu_image_view staging;
1597 tu_image_view_blit2(&staging, &staging_image, src_format,
1598 &staging_subresource, 0, false);
1599
1600 ops->setup(cmd, cs, src_format, ROTATE_0, false, mask);
1601 coords(ops, cs, &staging_offset, &src_offset, &extent);
1602
1603 for (uint32_t i = 0; i < info->extent.depth; i++) {
1604 ops->src(cmd, cs, &src, i, false);
1605 ops->dst(cs, &staging, i);
1606 ops->run(cmd, cs);
1607 }
1608
1609 /* When executed by the user there has to be a pipeline barrier here,
1610 * but since we're doing it manually we'll have to flush ourselves.
1611 */
1612 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1613 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1614
1615 tu_image_view_blit2(&staging, &staging_image, dst_format,
1616 &staging_subresource, 0, false);
1617
1618 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1619 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1620
1621 for (uint32_t i = 0; i < info->extent.depth; i++) {
1622 ops->src(cmd, cs, &staging, i, false);
1623 ops->dst(cs, &dst, i);
1624 ops->run(cmd, cs);
1625 }
1626 } else {
1627 tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1628 tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1629
1630 ops->setup(cmd, cs, format, ROTATE_0, false, mask);
1631 coords(ops, cs, &dst_offset, &src_offset, &extent);
1632
1633 for (uint32_t i = 0; i < info->extent.depth; i++) {
1634 ops->src(cmd, cs, &src, i, false);
1635 ops->dst(cs, &dst, i);
1636 ops->run(cmd, cs);
1637 }
1638 }
1639 }
1640
1641 void
1642 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1643 VkImage srcImage,
1644 VkImageLayout srcImageLayout,
1645 VkImage destImage,
1646 VkImageLayout destImageLayout,
1647 uint32_t regionCount,
1648 const VkImageCopy *pRegions)
1649 {
1650 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1651 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1652 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1653
1654 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1655 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1656
1657 for (uint32_t i = 0; i < regionCount; ++i)
1658 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1659 }
1660
1661 static void
1662 copy_buffer(struct tu_cmd_buffer *cmd,
1663 uint64_t dst_va,
1664 uint64_t src_va,
1665 uint64_t size,
1666 uint32_t block_size)
1667 {
1668 const struct blit_ops *ops = &r2d_ops;
1669 struct tu_cs *cs = &cmd->cs;
1670 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1671 uint64_t blocks = size / block_size;
1672
1673 ops->setup(cmd, cs, format, ROTATE_0, false, 0xf);
1674
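/* The blit source/destination base addresses must be 64-byte aligned, so the
 * misalignment is folded into the x coordinate and the copy proceeds in
 * chunks of at most 0x4000 texels (presumably the 2D engine's maximum width),
 * each emitted as a one-row blit.
 */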
1675 while (blocks) {
1676 uint32_t src_x = (src_va & 63) / block_size;
1677 uint32_t dst_x = (dst_va & 63) / block_size;
1678 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1679
1680 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1681 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1682 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1683 ops->run(cmd, cs);
1684
1685 src_va += width * block_size;
1686 dst_va += width * block_size;
1687 blocks -= width;
1688 }
1689 }
1690
1691 void
1692 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1693 VkBuffer srcBuffer,
1694 VkBuffer dstBuffer,
1695 uint32_t regionCount,
1696 const VkBufferCopy *pRegions)
1697 {
1698 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1699 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1700 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1701
1702 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1703 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1704
1705 for (unsigned i = 0; i < regionCount; ++i) {
1706 copy_buffer(cmd,
1707 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1708 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1709 pRegions[i].size, 1);
1710 }
1711 }
1712
1713 void
1714 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1715 VkBuffer dstBuffer,
1716 VkDeviceSize dstOffset,
1717 VkDeviceSize dataSize,
1718 const void *pData)
1719 {
1720 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1721 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1722
1723 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1724
1725 struct ts_cs_memory tmp;
1726 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1727 if (result != VK_SUCCESS) {
1728 cmd->record_result = result;
1729 return;
1730 }
1731
1732 memcpy(tmp.map, pData, dataSize);
1733 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1734 }
1735
1736 void
1737 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1738 VkBuffer dstBuffer,
1739 VkDeviceSize dstOffset,
1740 VkDeviceSize fillSize,
1741 uint32_t data)
1742 {
1743 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1744 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1745 const struct blit_ops *ops = &r2d_ops;
1746 struct tu_cs *cs = &cmd->cs;
1747
1748 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1749
1750 if (fillSize == VK_WHOLE_SIZE)
1751 fillSize = buffer->size - dstOffset;
1752
1753 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1754 uint32_t blocks = fillSize / 4;
1755
1756 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf);
1757 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1758
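/* Same chunking as copy_buffer(): each blit writes at most 0x4000 R32_UINT
 * texels starting from a 64-byte-aligned base.
 */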
1759 while (blocks) {
1760 uint32_t dst_x = (dst_va & 63) / 4;
1761 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1762
1763 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1764 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1765 ops->run(cmd, cs);
1766
1767 dst_va += width * 4;
1768 blocks -= width;
1769 }
1770 }
1771
1772 void
1773 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1774 VkImage srcImage,
1775 VkImageLayout srcImageLayout,
1776 VkImage dstImage,
1777 VkImageLayout dstImageLayout,
1778 uint32_t regionCount,
1779 const VkImageResolve *pRegions)
1780 {
1781 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1782 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1783 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1784 const struct blit_ops *ops = &r2d_ops;
1785 struct tu_cs *cs = &cmd->cs;
1786
1787 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1788 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1789
1790 ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf);
1791
1792 for (uint32_t i = 0; i < regionCount; ++i) {
1793 const VkImageResolve *info = &pRegions[i];
1794 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1795
1796 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1797 /* TODO: aspect masks possible? */
1798
1799 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1800
1801 struct tu_image_view dst, src;
1802 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1803 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1804
1805 for (uint32_t layer = 0; layer < layers; layer++) {
1806 ops->src(cmd, cs, &src, layer, false);
1807 ops->dst(cs, &dst, layer);
1808 ops->run(cmd, cs);
1809 }
1810 }
1811 }
1812
1813 void
1814 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1815 struct tu_cs *cs,
1816 struct tu_image_view *src,
1817 struct tu_image_view *dst,
1818 uint32_t layers,
1819 const VkRect2D *rect)
1820 {
1821 const struct blit_ops *ops = &r2d_ops;
1822
1823 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1824 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1825
1826 assert(src->image->vk_format == dst->image->vk_format);
1827
1828 ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf);
1829 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1830
1831 for (uint32_t i = 0; i < layers; i++) {
1832 ops->src(cmd, cs, src, i, false);
1833 ops->dst(cs, dst, i);
1834 ops->run(cmd, cs);
1835 }
1836 }
1837
1838 static void
1839 clear_image(struct tu_cmd_buffer *cmd,
1840 struct tu_image *image,
1841 const VkClearValue *clear_value,
1842 const VkImageSubresourceRange *range)
1843 {
1844 uint32_t level_count = tu_get_levelCount(image, range);
1845 uint32_t layer_count = tu_get_layerCount(image, range);
1846 struct tu_cs *cs = &cmd->cs;
1847 VkFormat format = image->vk_format;
1848 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1849 format = VK_FORMAT_R32_UINT;
1850
1851 if (image->type == VK_IMAGE_TYPE_3D) {
1852 assert(layer_count == 1);
1853 assert(range->baseArrayLayer == 0);
1854 }
1855
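/* D24S8 is cleared as an RGBA8 image: RGB holds the depth bits and A the
 * stencil, so translate the requested aspects into a component mask.
 */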
1856 uint8_t mask = 0xf;
1857 if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1858 mask = 0;
1859 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
1860 mask |= 0x7;
1861 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
1862 mask |= 0x8;
1863 }
1864
1865 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1866
1867 ops->setup(cmd, cs, format, ROTATE_0, true, mask);
1868 ops->clear_value(cs, image->vk_format, clear_value);
1869
1870 for (unsigned j = 0; j < level_count; j++) {
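/* for 3D images, the number of slices to clear is the minified depth of this mip level */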
1871 if (image->type == VK_IMAGE_TYPE_3D)
1872 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1873
1874 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1875 u_minify(image->extent.width, range->baseMipLevel + j),
1876 u_minify(image->extent.height, range->baseMipLevel + j)
1877 });
1878
1879 struct tu_image_view dst;
1880 tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) {
1881 .aspectMask = range->aspectMask,
1882 .mipLevel = range->baseMipLevel + j,
1883 .baseArrayLayer = range->baseArrayLayer,
1884 .layerCount = 1,
1885 }, 0, false);
1886
1887 for (uint32_t i = 0; i < layer_count; i++) {
1888 ops->dst(cs, &dst, i);
1889 ops->run(cmd, cs);
1890 }
1891 }
1892 }
1893
1894 void
1895 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1896 VkImage image_h,
1897 VkImageLayout imageLayout,
1898 const VkClearColorValue *pColor,
1899 uint32_t rangeCount,
1900 const VkImageSubresourceRange *pRanges)
1901 {
1902 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1903 TU_FROM_HANDLE(tu_image, image, image_h);
1904
1905 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1906
1907 for (unsigned i = 0; i < rangeCount; i++)
1908 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1909 }
1910
1911 void
1912 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1913 VkImage image_h,
1914 VkImageLayout imageLayout,
1915 const VkClearDepthStencilValue *pDepthStencil,
1916 uint32_t rangeCount,
1917 const VkImageSubresourceRange *pRanges)
1918 {
1919 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1920 TU_FROM_HANDLE(tu_image, image, image_h);
1921
1922 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1923
1924 for (unsigned i = 0; i < rangeCount; i++)
1925 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1926 }
1927
1928 static void
1929 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1930 uint32_t attachment_count,
1931 const VkClearAttachment *attachments,
1932 uint32_t rect_count,
1933 const VkClearRect *rects)
1934 {
1935 const struct tu_subpass *subpass = cmd->state.subpass;
1936 /* note: this function handles only the 2D path; the special shader-based
1937 * path lives in tu_clear_sysmem_attachments()
1938 */
1939 const struct blit_ops *ops = &r2d_ops;
1940 struct tu_cs *cs = &cmd->draw_cs;
1941
1942 for (uint32_t j = 0; j < attachment_count; j++) {
1943 /* The vulkan spec, section 17.2 "Clearing Images Inside a Render
1944 * Pass Instance" says that:
1945 *
1946 * Unlike other clear commands, vkCmdClearAttachments executes as
1947 * a drawing command, rather than a transfer command, with writes
1948 * performed by it executing in rasterization order. Clears to
1949 * color attachments are executed as color attachment writes, by
1950 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1951 * Clears to depth/stencil attachments are executed as depth
1952 * writes and writes by the
1953 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1954 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1955 *
1956 * However, the 2d path here is executed the same way as a
1957 * transfer command, using the CCU color cache exclusively with
1958 * a special depth-as-color format for depth clears. This means that
1959 * we can't rely on the normal pipeline barrier mechanism here, and
1960 * have to manually flush whenever using a different cache domain
1961 * from what the 3d path would've used. This happens when we clear
1962 * depth/stencil, since normally depth attachments use CCU depth, but
1963 * we clear it using a special depth-as-color format. Since the clear
1964 * potentially uses a different attachment state we also need to
1965 * invalidate color beforehand and flush it afterwards.
1966 */
1967
1968 uint32_t a;
1969 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1970 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1971 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1972 } else {
1973 a = subpass->depth_stencil_attachment.attachment;
1974 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1975 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1976 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1977 }
1978
1979 if (a == VK_ATTACHMENT_UNUSED)
1980 continue;
1981
1982 uint8_t mask = 0xf;
1983 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
1984 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
1985 mask &= ~0x7;
1986 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
1987 mask &= ~0x8;
1988 }
1989
1990 const struct tu_image_view *iview =
1991 cmd->state.framebuffer->attachments[a].attachment;
1992
1993 ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask);
1994 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1995
1996 /* Wait for the flushes we triggered manually to complete */
1997 tu_cs_emit_wfi(cs);
1998
1999 for (uint32_t i = 0; i < rect_count; i++) {
2000 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
2001 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
2002 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
2003 ops->run(cmd, cs);
2004 }
2005 }
2006
2007 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2008 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2009 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2010 } else {
2011 /* sync color into depth */
2012 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2013 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2014 }
2015 }
2016 }
2017
2018 static void
2019 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
2020 uint32_t attachment_count,
2021 const VkClearAttachment *attachments,
2022 uint32_t rect_count,
2023 const VkClearRect *rects)
2024 {
2025 /* the shader path here is special, it avoids changing MRT/etc state */
2026 const struct tu_render_pass *pass = cmd->state.pass;
2027 const struct tu_subpass *subpass = cmd->state.subpass;
2028 const uint32_t mrt_count = subpass->color_count;
2029 struct tu_cs *cs = &cmd->draw_cs;
2030 uint32_t clear_value[MAX_RTS][4];
2031 float z_clear_val = 0.0f;
2032 uint8_t s_clear_val = 0;
2033 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
2034 bool z_clear = false;
2035 bool s_clear = false;
2036 bool layered_clear = false;
2037 uint32_t max_samples = 1;
2038
2039 for (uint32_t i = 0; i < attachment_count; i++) {
2040 uint32_t a;
2041 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2042 uint32_t c = attachments[i].colorAttachment;
2043 a = subpass->color_attachments[c].attachment;
2044 if (a == VK_ATTACHMENT_UNUSED)
2045 continue;
2046
2047 clear_rts |= 1 << c;
2048 clear_components |= 0xf << (c * 4);
2049 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
2050 } else {
2051 a = subpass->depth_stencil_attachment.attachment;
2052 if (a == VK_ATTACHMENT_UNUSED)
2053 continue;
2054
2055 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2056 z_clear = true;
2057 z_clear_val = attachments[i].clearValue.depthStencil.depth;
2058 }
2059
2060 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2061 s_clear = true;
2062 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
2063 }
2064 }
2065
2066 max_samples = MAX2(max_samples, pass->attachments[a].samples);
2067 }
2068
2069 /* prefer the 2D path for clears; it can't clear separate depth/stencil
2070 * aspects or MSAA attachments, and it needs a known framebuffer
2071 */
2072 if (max_samples == 1 && cmd->state.framebuffer) {
2073 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
2074 return;
2075 }
2076
2077 /* This clear path behaves like a draw and needs the same flush as tu_draw */
2078 tu_emit_cache_flush_renderpass(cmd, cs);
2079
2080 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
2081 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2082 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2083 0xfc000000);
2084 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2085
2086 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
2087 for (uint32_t i = 0; i < mrt_count; i++) {
2088 if (clear_rts & (1 << i))
2089 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
2090 else
2091 tu_cs_emit(cs, 0);
2092 }
2093
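/* if any rect targets a nonzero layer or more than one layer, use the layered clear variant */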
2094 for (uint32_t i = 0; i < rect_count; i++) {
2095 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
2096 layered_clear = true;
2097 }
2098
2099 r3d_common(cmd, cs, false, num_rts, layered_clear);
2100
2101 tu_cs_emit_regs(cs,
2102 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2103 tu_cs_emit_regs(cs,
2104 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2105
2106 tu_cs_emit_regs(cs,
2107 A6XX_RB_FS_OUTPUT_CNTL0(),
2108 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2109
2110 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2111 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2112 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
2113 for (uint32_t i = 0; i < mrt_count; i++) {
2114 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2115 .component_enable = COND(clear_rts & (1 << i), 0xf)));
2116 }
2117
2118 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2119 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2120 .z_enable = z_clear,
2121 .z_write_enable = z_clear,
2122 .zfunc = FUNC_ALWAYS));
2123 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2124 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2125 .stencil_enable = s_clear,
2126 .func = FUNC_ALWAYS,
2127 .zpass = STENCIL_REPLACE));
2128 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2129 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2130 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2131
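/* upload the clear colors as fragment shader constants, one vec4 per render target being cleared */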
2132 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
2133 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
2134 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2135 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2136 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
2137 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
2138 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2139 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2140 for_each_bit(b, clear_rts)
2141 tu_cs_emit_array(cs, clear_value[b], 4);
2142
2143 for (uint32_t i = 0; i < rect_count; i++) {
2144 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
2145 r3d_coords_raw(cs, layered_clear, (float[]) {
2146 rects[i].rect.offset.x, rects[i].rect.offset.y,
2147 z_clear_val, uif(rects[i].baseArrayLayer + layer),
2148 rects[i].rect.offset.x + rects[i].rect.extent.width,
2149 rects[i].rect.offset.y + rects[i].rect.extent.height,
2150 z_clear_val, 1.0f,
2151 });
2152
2153 if (layered_clear) {
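/* layered clear: emit a one-vertex point-list draw with the GS path enabled;
 * the target layer was passed through the coords above
 */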
2154 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
2155 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_POINTLIST) |
2156 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
2157 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) |
2158 CP_DRAW_INDX_OFFSET_0_GS_ENABLE);
2159 tu_cs_emit(cs, 1); /* instance count */
2160 tu_cs_emit(cs, 1); /* vertex count */
2161 } else {
2162 r3d_run(cmd, cs);
2163 }
2164 }
2165 }
2166
2167 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE |
2168 TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
2169 TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
2170 TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
2171 TU_CMD_DIRTY_DYNAMIC_VIEWPORT |
2172 TU_CMD_DIRTY_DYNAMIC_SCISSOR;
2173 }
2174
2175 /**
2176 * Pack a VkClearValue into a 128-bit buffer. The format is respected except
2177 * for the component order: components are always packed in WZYX order,
2178 * because gmem is tiled and tiled formats always use the WZYX swap.
2179 */
2180 static void
2181 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4])
2182 {
2183 const struct util_format_description *desc = vk_format_description(format);
2184
2185 switch (format) {
2186 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2187 buf[0] = float3_to_r11g11b10f(val->color.float32);
2188 return;
2189 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
2190 buf[0] = float3_to_rgb9e5(val->color.float32);
2191 return;
2192 default:
2193 break;
2194 }
2195
2196 assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
2197
2198 /* S8_UINT is special and has no depth */
2199 const int max_components =
2200 format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels;
2201
2202 int buf_offset = 0;
2203 int bit_shift = 0;
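/* Pack the channels LSB-first; a channel never straddles a 32-bit word, so
 * e.g. for a 16-bit-per-channel format the first two channels land in buf[0]
 * and the next two in buf[1].
 */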
2204 for (int comp = 0; comp < max_components; comp++) {
2205 const struct util_format_channel_description *ch =
2206 tu_get_format_channel_description(desc, comp);
2207 if (!ch) {
2208 assert((format == VK_FORMAT_S8_UINT && comp == 0) ||
2209 (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1));
2210 continue;
2211 }
2212
2213 union tu_clear_component_value v = tu_get_clear_component_value(
2214 val, comp, desc->colorspace);
2215
2216 /* move to the next uint32_t when there is not enough space */
2217 assert(ch->size <= 32);
2218 if (bit_shift + ch->size > 32) {
2219 buf_offset++;
2220 bit_shift = 0;
2221 }
2222
2223 if (bit_shift == 0)
2224 buf[buf_offset] = 0;
2225
2226 buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift;
2227 bit_shift += ch->size;
2228 }
2229 }
2230
2231 static void
2232 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2233 struct tu_cs *cs,
2234 uint32_t attachment,
2235 uint8_t component_mask,
2236 const VkClearValue *value)
2237 {
2238 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2239 /* note: component_mask is 0x7 for depth and 0x8 for stencil
2240 * because D24S8 is cleared with AS_R8G8B8A8 format
2241 */
2242
2243 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2244 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2245
2246 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
2247 tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask));
2248
2249 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2250 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2251
2252 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2253 tu_cs_emit(cs, 0);
2254
2255 uint32_t clear_vals[4] = {};
2256 pack_gmem_clear_value(value, vk_format, clear_vals);
2257
2258 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2259 tu_cs_emit_array(cs, clear_vals, 4);
2260
2261 tu6_emit_event_write(cmd, cs, BLIT);
2262 }
2263
2264 static void
2265 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2266 uint32_t attachment_count,
2267 const VkClearAttachment *attachments,
2268 uint32_t rect_count,
2269 const VkClearRect *rects)
2270 {
2271 const struct tu_subpass *subpass = cmd->state.subpass;
2272 struct tu_cs *cs = &cmd->draw_cs;
2273
2274 /* TODO: swap the loops for smaller cmdstream */
2275 for (unsigned i = 0; i < rect_count; i++) {
2276 unsigned x1 = rects[i].rect.offset.x;
2277 unsigned y1 = rects[i].rect.offset.y;
2278 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2279 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2280
2281 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2282 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2283 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2284
2285 for (unsigned j = 0; j < attachment_count; j++) {
2286 uint32_t a;
2287 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2288 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2289 else
2290 a = subpass->depth_stencil_attachment.attachment;
2291
2292 if (a == VK_ATTACHMENT_UNUSED)
2293 continue;
2294
2295 unsigned clear_mask = 0xf;
2296 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
2297 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
2298 clear_mask &= ~0x7;
2299 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
2300 clear_mask &= ~0x8;
2301 }
2302
2303 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2304 &attachments[j].clearValue);
2305 }
2306 }
2307 }
2308
2309 void
2310 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2311 uint32_t attachmentCount,
2312 const VkClearAttachment *pAttachments,
2313 uint32_t rectCount,
2314 const VkClearRect *pRects)
2315 {
2316 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2317 struct tu_cs *cs = &cmd->draw_cs;
2318
2319 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2320 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2321 tu_cond_exec_end(cs);
2322
2323 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2324 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2325 tu_cond_exec_end(cs);
2326 }
2327
2328 void
2329 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2330 struct tu_cs *cs,
2331 uint32_t a,
2332 const VkRenderPassBeginInfo *info)
2333 {
2334 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2335 const struct tu_image_view *iview = fb->attachments[a].attachment;
2336 const struct tu_render_pass_attachment *attachment =
2337 &cmd->state.pass->attachments[a];
2338 uint8_t mask = 0;
2339
2340 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2341 mask = 0xf;
2342 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2343 mask |= 0x7;
2344 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2345 mask |= 0x8;
2346
2347 if (!mask)
2348 return;
2349
2350 const struct blit_ops *ops = &r2d_ops;
2351 if (attachment->samples > 1)
2352 ops = &r3d_ops;
2353
2354 ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
2355 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2356 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2357
2358 /* Wait for any flushes at the beginning of the renderpass to complete */
2359 tu_cs_emit_wfi(cs);
2360
2361 for (uint32_t i = 0; i < fb->layers; i++) {
2362 ops->dst(cs, iview, i);
2363 ops->run(cmd, cs);
2364 }
2365
2366 /* The spec doesn't explicitly say, but presumably the initial renderpass
2367 * clear is considered part of the renderpass, and therefore barriers
2368 * aren't required inside the subpass/renderpass. Therefore we need to
2369 * flush CCU color into CCU depth here, just like with
2370 * vkCmdClearAttachments(). Note that because this only happens at the
2371 * beginning of a renderpass, and renderpass writes are considered
2372 * "incoherent", we shouldn't have to worry about syncing depth into color
2373 * beforehand as depth should already be flushed.
2374 */
2375 if (vk_format_is_depth_or_stencil(attachment->format)) {
2376 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2377 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2378 } else {
2379 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2380 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2381 }
2382 }
2383
2384 void
2385 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2386 struct tu_cs *cs,
2387 uint32_t a,
2388 const VkRenderPassBeginInfo *info)
2389 {
2390 const struct tu_render_pass_attachment *attachment =
2391 &cmd->state.pass->attachments[a];
2392 unsigned clear_mask = 0;
2393
2394 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2395 clear_mask = 0xf;
2396 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2397 clear_mask |= 0x7;
2398 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2399 clear_mask |= 0x8;
2400
2401 if (!clear_mask)
2402 return;
2403
2404 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2405
2406 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2407 &info->pClearValues[a]);
2408 }
2409
2410 static void
2411 tu_emit_blit(struct tu_cmd_buffer *cmd,
2412 struct tu_cs *cs,
2413 const struct tu_image_view *iview,
2414 const struct tu_render_pass_attachment *attachment,
2415 bool resolve)
2416 {
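/* emit a blit event: loads the attachment into GMEM when resolve is false,
 * stores/resolves it back out to the image when resolve is true
 */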
2417 tu_cs_emit_regs(cs,
2418 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2419
2420 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2421 .unk0 = !resolve,
2422 .gmem = !resolve,
2423 /* "integer" bit disables msaa resolve averaging */
2424 .integer = vk_format_is_int(attachment->format)));
2425
2426 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2427 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2428 tu_cs_image_ref_2d(cs, iview, 0, false);
2429
2430 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2431 tu_cs_image_flag_ref(cs, iview, 0);
2432
2433 tu_cs_emit_regs(cs,
2434 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2435
2436 tu6_emit_event_write(cmd, cs, BLIT);
2437 }
2438
2439 static bool
2440 blit_can_resolve(VkFormat format)
2441 {
2442 const struct util_format_description *desc = vk_format_description(format);
2443
2444 /* blit event can only do resolve for simple cases:
2445 * averaging samples as unsigned integers or choosing only one sample
2446 */
2447 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2448 return false;
2449
2450 /* can't do formats with channel sizes larger than 10 bits
2451 * note: this includes all float formats
2452 * note2: single-channel integer formats seem OK
2453 */
2454 if (desc->channel[0].size > 10)
2455 return false;
2456
2457 switch (format) {
2458 /* for unknown reasons the blit event can't MSAA-resolve these formats when tiled,
2459 * likely because these formats have a different layout from other cpp=2 formats
2460 */
2461 case VK_FORMAT_R8G8_UNORM:
2462 case VK_FORMAT_R8G8_UINT:
2463 case VK_FORMAT_R8G8_SINT:
2464 /* TODO: this one should be able to work? */
2465 case VK_FORMAT_D24_UNORM_S8_UINT:
2466 return false;
2467 default:
2468 break;
2469 }
2470
2471 return true;
2472 }
2473
2474 void
2475 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2476 struct tu_cs *cs,
2477 uint32_t a,
2478 bool force_load)
2479 {
2480 const struct tu_image_view *iview =
2481 cmd->state.framebuffer->attachments[a].attachment;
2482 const struct tu_render_pass_attachment *attachment =
2483 &cmd->state.pass->attachments[a];
2484
2485 if (attachment->load || force_load)
2486 tu_emit_blit(cmd, cs, iview, attachment, false);
2487 }
2488
2489 void
2490 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2491 struct tu_cs *cs,
2492 uint32_t a,
2493 uint32_t gmem_a)
2494 {
2495 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2496 const VkRect2D *render_area = &tiling->render_area;
2497 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2498 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2499 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2500
2501 if (!dst->store)
2502 return;
2503
2504 uint32_t x1 = render_area->offset.x;
2505 uint32_t y1 = render_area->offset.y;
2506 uint32_t x2 = x1 + render_area->extent.width;
2507 uint32_t y2 = y1 + render_area->extent.height;
2508 /* x2/y2 may be unaligned when they equal the image size, since the blit
2509 * then writes into padding space. The one exception is linear levels,
2510 * which don't have the required y padding in the layout (except for the
2511 * last level).
2512 */
2513 bool need_y2_align =
2514 y2 != iview->extent.height || iview->need_y2_align;
2515
2516 bool unaligned =
2517 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2518 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2519
2520 /* use fast path when render area is aligned, except for unsupported resolve cases */
2521 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2522 tu_emit_blit(cmd, cs, iview, src, true);
2523 return;
2524 }
2525
2526 if (dst->samples > 1) {
2527 /* presumably we need to use the shader path in this case?
2528 * we need a testcase that fails because of this
2529 */
2530 tu_finishme("unaligned store of msaa attachment\n");
2531 return;
2532 }
2533
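/* fall back to the 2D engine, sampling the attachment contents directly from GMEM */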
2534 r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
2535 r2d_dst(cs, iview, 0);
2536 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2537
2538 tu_cs_emit_regs(cs,
2539 A6XX_SP_PS_2D_SRC_INFO(
2540 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2541 .tile_mode = TILE6_2,
2542 .srgb = vk_format_is_srgb(src->format),
2543 .samples = tu_msaa_samples(src->samples),
2544 .samples_average = !vk_format_is_int(src->format),
2545 .unk20 = 1,
2546 .unk22 = 1),
2547 /* note: src size does not matter when not scaling */
2548 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2549 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2550 A6XX_SP_PS_2D_SRC_HI(),
2551 A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
2552
2553 /* sync GMEM writes with CACHE. */
2554 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2555
2556 /* Wait for CACHE_INVALIDATE to land */
2557 tu_cs_emit_wfi(cs);
2558
2559 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2560 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2561
2562 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2563 * sysmem, and we generally assume that GMEM renderpasses leave their
2564 * results in sysmem, so we need to flush manually here.
2565 */
2566 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2567 }