turnip: refactor draw states and dynamic states
src/freedreno/vulkan/tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 /* helper functions previously in tu_formats.c */
20
21 static uint32_t
22 tu_pack_mask(int bits)
23 {
24 assert(bits <= 32);
25 return (1ull << bits) - 1;
26 }
27
28 static uint32_t
29 tu_pack_float32_for_unorm(float val, int bits)
30 {
31 const uint32_t max = tu_pack_mask(bits);
32 if (val < 0.0f)
33 return 0;
34 else if (val > 1.0f)
35 return max;
36 else
37 return _mesa_lroundevenf(val * (float) max);
38 }
39
40 static uint32_t
41 tu_pack_float32_for_snorm(float val, int bits)
42 {
43 const int32_t max = tu_pack_mask(bits - 1);
44 int32_t tmp;
45 if (val < -1.0f)
46 tmp = -max;
47 else if (val > 1.0f)
48 tmp = max;
49 else
50 tmp = _mesa_lroundevenf(val * (float) max);
51
52 return tmp & tu_pack_mask(bits);
53 }
54
55 static uint32_t
56 tu_pack_float32_for_uscaled(float val, int bits)
57 {
58 const uint32_t max = tu_pack_mask(bits);
59 if (val < 0.0f)
60 return 0;
61 else if (val > (float) max)
62 return max;
63 else
64 return (uint32_t) val;
65 }
66
67 static uint32_t
68 tu_pack_float32_for_sscaled(float val, int bits)
69 {
70 const int32_t max = tu_pack_mask(bits - 1);
71 const int32_t min = -max - 1;
72 int32_t tmp;
73 if (val < (float) min)
74 tmp = min;
75 else if (val > (float) max)
76 tmp = max;
77 else
78 tmp = (int32_t) val;
79
80 return tmp & tu_pack_mask(bits);
81 }
82
83 static uint32_t
84 tu_pack_uint32_for_uint(uint32_t val, int bits)
85 {
86 return val & tu_pack_mask(bits);
87 }
88
89 static uint32_t
90 tu_pack_int32_for_sint(int32_t val, int bits)
91 {
92 return val & tu_pack_mask(bits);
93 }
94
95 static uint32_t
96 tu_pack_float32_for_sfloat(float val, int bits)
97 {
98 assert(bits == 16 || bits == 32);
99 return bits == 16 ? util_float_to_half(val) : fui(val);
100 }
101
102 union tu_clear_component_value {
103 float float32;
104 int32_t int32;
105 uint32_t uint32;
106 };
107
108 static uint32_t
109 tu_pack_clear_component_value(union tu_clear_component_value val,
110 const struct util_format_channel_description *ch)
111 {
112 uint32_t packed;
113
114 switch (ch->type) {
115 case UTIL_FORMAT_TYPE_UNSIGNED:
116 /* normalized, scaled, or pure integer */
117 if (ch->normalized)
118 packed = tu_pack_float32_for_unorm(val.float32, ch->size);
119 else if (ch->pure_integer)
120 packed = tu_pack_uint32_for_uint(val.uint32, ch->size);
121 else
122 packed = tu_pack_float32_for_uscaled(val.float32, ch->size);
123 break;
124 case UTIL_FORMAT_TYPE_SIGNED:
125 /* normalized, scaled, or pure integer */
126 if (ch->normalized)
127 packed = tu_pack_float32_for_snorm(val.float32, ch->size);
128 else if (ch->pure_integer)
129 packed = tu_pack_int32_for_sint(val.int32, ch->size);
130 else
131 packed = tu_pack_float32_for_sscaled(val.float32, ch->size);
132 break;
133 case UTIL_FORMAT_TYPE_FLOAT:
134 packed = tu_pack_float32_for_sfloat(val.float32, ch->size);
135 break;
136 default:
137 unreachable("unexpected channel type");
138 packed = 0;
139 break;
140 }
141
142 assert((packed & tu_pack_mask(ch->size)) == packed);
143 return packed;
144 }
145
146 static const struct util_format_channel_description *
147 tu_get_format_channel_description(const struct util_format_description *desc,
148 int comp)
149 {
150 switch (desc->swizzle[comp]) {
151 case PIPE_SWIZZLE_X:
152 return &desc->channel[0];
153 case PIPE_SWIZZLE_Y:
154 return &desc->channel[1];
155 case PIPE_SWIZZLE_Z:
156 return &desc->channel[2];
157 case PIPE_SWIZZLE_W:
158 return &desc->channel[3];
159 default:
160 return NULL;
161 }
162 }
163
164 static union tu_clear_component_value
165 tu_get_clear_component_value(const VkClearValue *val, int comp,
166 enum util_format_colorspace colorspace)
167 {
168 assert(comp < 4);
169
170 union tu_clear_component_value tmp;
171 switch (colorspace) {
172 case UTIL_FORMAT_COLORSPACE_ZS:
173 assert(comp < 2);
174 if (comp == 0)
175 tmp.float32 = val->depthStencil.depth;
176 else
177 tmp.uint32 = val->depthStencil.stencil;
178 break;
179 case UTIL_FORMAT_COLORSPACE_SRGB:
180 if (comp < 3) {
181 tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]);
182 break;
183 }
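/* fall through: the alpha channel of sRGB formats is not sRGB-encoded and is
 * packed like any other channel below */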
184 default:
185 assert(comp < 4);
186 tmp.uint32 = val->color.uint32[comp];
187 break;
188 }
189
190 return tmp;
191 }
192
193 /* r2d_ = BLIT_OP_SCALE operations */
194
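/* Map a hardware color format to the 2D engine's internal format class, which
 * determines how clear values are packed (see r2d_clear_value()).
 */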
195 static enum a6xx_2d_ifmt
196 format_to_ifmt(enum a6xx_format fmt)
197 {
198 switch (fmt) {
199 case FMT6_A8_UNORM:
200 case FMT6_8_UNORM:
201 case FMT6_8_SNORM:
202 case FMT6_8_8_UNORM:
203 case FMT6_8_8_SNORM:
204 case FMT6_8_8_8_8_UNORM:
205 case FMT6_8_8_8_X8_UNORM:
206 case FMT6_8_8_8_8_SNORM:
207 case FMT6_4_4_4_4_UNORM:
208 case FMT6_5_5_5_1_UNORM:
209 case FMT6_5_6_5_UNORM:
210 case FMT6_Z24_UNORM_S8_UINT:
211 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
212 return R2D_UNORM8;
213
214 case FMT6_32_UINT:
215 case FMT6_32_SINT:
216 case FMT6_32_32_UINT:
217 case FMT6_32_32_SINT:
218 case FMT6_32_32_32_32_UINT:
219 case FMT6_32_32_32_32_SINT:
220 return R2D_INT32;
221
222 case FMT6_16_UINT:
223 case FMT6_16_SINT:
224 case FMT6_16_16_UINT:
225 case FMT6_16_16_SINT:
226 case FMT6_16_16_16_16_UINT:
227 case FMT6_16_16_16_16_SINT:
228 case FMT6_10_10_10_2_UINT:
229 return R2D_INT16;
230
231 case FMT6_8_UINT:
232 case FMT6_8_SINT:
233 case FMT6_8_8_UINT:
234 case FMT6_8_8_SINT:
235 case FMT6_8_8_8_8_UINT:
236 case FMT6_8_8_8_8_SINT:
237 return R2D_INT8;
238
239 case FMT6_16_UNORM:
240 case FMT6_16_SNORM:
241 case FMT6_16_16_UNORM:
242 case FMT6_16_16_SNORM:
243 case FMT6_16_16_16_16_UNORM:
244 case FMT6_16_16_16_16_SNORM:
245 case FMT6_32_FLOAT:
246 case FMT6_32_32_FLOAT:
247 case FMT6_32_32_32_32_FLOAT:
248 return R2D_FLOAT32;
249
250 case FMT6_16_FLOAT:
251 case FMT6_16_16_FLOAT:
252 case FMT6_16_16_16_16_FLOAT:
253 case FMT6_11_11_10_FLOAT:
254 case FMT6_10_10_10_2_UNORM:
255 case FMT6_10_10_10_2_UNORM_DEST:
256 return R2D_FLOAT16;
257
258 default:
259 unreachable("bad format");
260 return 0;
261 }
262 }
263
264 static void
265 r2d_coords(struct tu_cs *cs,
266 const VkOffset2D *dst,
267 const VkOffset2D *src,
268 const VkExtent2D *extent)
269 {
270 tu_cs_emit_regs(cs,
271 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
272 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
273
274 if (!src)
275 return;
276
277 tu_cs_emit_regs(cs,
278 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
279 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
280 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
281 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
282 }
283
284 static void
285 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
286 {
287 uint32_t clear_value[4] = {};
288
289 switch (format) {
290 case VK_FORMAT_X8_D24_UNORM_PACK32:
291 case VK_FORMAT_D24_UNORM_S8_UINT:
292 /* cleared as r8g8b8a8_unorm using special format */
293 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
294 clear_value[1] = clear_value[0] >> 8;
295 clear_value[2] = clear_value[0] >> 16;
296 clear_value[3] = val->depthStencil.stencil;
297 break;
298 case VK_FORMAT_D16_UNORM:
299 case VK_FORMAT_D32_SFLOAT:
300 /* R2D_FLOAT32 */
301 clear_value[0] = fui(val->depthStencil.depth);
302 break;
303 case VK_FORMAT_S8_UINT:
304 clear_value[0] = val->depthStencil.stencil;
305 break;
306 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
307 /* cleared as UINT32 */
308 clear_value[0] = float3_to_rgb9e5(val->color.float32);
309 break;
310 default:
311 assert(!vk_format_is_depth_or_stencil(format));
312 const struct util_format_description *desc = vk_format_description(format);
313 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
314
315 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
316 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
317
318 for (unsigned i = 0; i < desc->nr_channels; i++) {
319 const struct util_format_channel_description *ch = &desc->channel[i];
320 if (ifmt == R2D_UNORM8) {
321 float linear = val->color.float32[i];
322 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
323 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
324
325 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
326 clear_value[i] = tu_pack_float32_for_snorm(linear, 8);
327 else
328 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
329 } else if (ifmt == R2D_FLOAT16) {
330 clear_value[i] = util_float_to_half(val->color.float32[i]);
331 } else {
332 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
333 ifmt == R2D_INT16 || ifmt == R2D_INT8);
334 clear_value[i] = val->color.uint32[i];
335 }
336 }
337 break;
338 }
339
340 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
341 tu_cs_emit_array(cs, clear_value, 4);
342 }
343
344 static void
345 r2d_src(struct tu_cmd_buffer *cmd,
346 struct tu_cs *cs,
347 const struct tu_image_view *iview,
348 uint32_t layer,
349 VkFilter filter)
350 {
351 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
352 if (filter != VK_FILTER_NEAREST)
353 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
354
355 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
356 tu_cs_emit(cs, src_info);
357 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
358 tu_cs_image_ref_2d(cs, iview, layer, true);
359
360 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
361 tu_cs_image_flag_ref(cs, iview, layer);
362 }
363
364 static void
365 r2d_src_buffer(struct tu_cmd_buffer *cmd,
366 struct tu_cs *cs,
367 VkFormat vk_format,
368 uint64_t va, uint32_t pitch,
369 uint32_t width, uint32_t height)
370 {
371 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
372
373 tu_cs_emit_regs(cs,
374 A6XX_SP_PS_2D_SRC_INFO(
375 .color_format = format.fmt,
376 .color_swap = format.swap,
377 .srgb = vk_format_is_srgb(vk_format),
378 .unk20 = 1,
379 .unk22 = 1),
380 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
381 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
382 A6XX_SP_PS_2D_SRC_HI(va >> 32),
383 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
384 }
385
386 static void
387 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
388 {
389 assert(iview->image->samples == 1);
390
391 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
392 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
393 tu_cs_image_ref_2d(cs, iview, layer, false);
394
395 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
396 tu_cs_image_flag_ref(cs, iview, layer);
397 }
398
399 static void
400 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
401 {
402 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
403
404 tu_cs_emit_regs(cs,
405 A6XX_RB_2D_DST_INFO(
406 .color_format = format.fmt,
407 .color_swap = format.swap,
408 .srgb = vk_format_is_srgb(vk_format)),
409 A6XX_RB_2D_DST_LO((uint32_t) va),
410 A6XX_RB_2D_DST_HI(va >> 32),
411 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
412 }
413
414 static void
415 r2d_setup_common(struct tu_cmd_buffer *cmd,
416 struct tu_cs *cs,
417 VkFormat vk_format,
418 enum a6xx_rotation rotation,
419 bool clear,
420 uint8_t mask,
421 bool scissor)
422 {
423 enum a6xx_format format = tu6_base_format(vk_format);
424 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
425 uint32_t unknown_8c01 = 0;
426
427 if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) {
428 /* preserve depth channels */
429 if (mask == 0x8)
430 unknown_8c01 = 0x00084001;
431 /* preserve stencil channel */
432 if (mask == 0x7)
433 unknown_8c01 = 0x08000041;
434 }
435
436 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
437 tu_cs_emit(cs, unknown_8c01);
438
439 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
440 .scissor = scissor,
441 .rotate = rotation,
442 .solid_color = clear,
443 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
444 .color_format = format,
445 .mask = 0xf,
446 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
447 ).value;
448
449 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
450 tu_cs_emit(cs, blit_cntl);
451
452 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
453 tu_cs_emit(cs, blit_cntl);
454
455 if (format == FMT6_10_10_10_2_UNORM_DEST)
456 format = FMT6_16_16_16_16_FLOAT;
457
458 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
459 .sint = vk_format_is_sint(vk_format),
460 .uint = vk_format_is_uint(vk_format),
461 .color_format = format,
462 .srgb = vk_format_is_srgb(vk_format),
463 .mask = 0xf));
464 }
465
466 static void
467 r2d_setup(struct tu_cmd_buffer *cmd,
468 struct tu_cs *cs,
469 VkFormat vk_format,
470 enum a6xx_rotation rotation,
471 bool clear,
472 uint8_t mask)
473 {
474 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
475
476 r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false);
477 }
478
479 static void
480 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
481 {
482 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
483 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
484 }
485
486 /* r3d_ = shader path operations */
487
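/* Emit the shader-related state for the 3D (draw-based) blit path: a few
 * hand-assembled ir3 shader variants (VS, FS, plus a GS for layered clears)
 * and the fixed VPC/GRAS state needed to draw a two-vertex RECTLIST.
 */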
488 static void
489 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
490 bool layered_clear)
491 {
492 struct ir3_shader dummy_shader = {};
493
494 struct ir3_shader_variant vs = {
495 .type = MESA_SHADER_VERTEX,
496 .instrlen = 1,
497 .constlen = 2,
498 .info.max_reg = 1,
499 .inputs_count = 1,
500 .inputs[0] = {
501 .slot = SYSTEM_VALUE_VERTEX_ID,
502 .regid = regid(0, 3),
503 .sysval = true,
504 },
505 .outputs_count = blit ? 2 : 1,
506 .outputs[0] = {
507 .slot = VARYING_SLOT_POS,
508 .regid = regid(0, 0),
509 },
510 .outputs[1] = {
511 .slot = VARYING_SLOT_VAR0,
512 .regid = regid(1, 0),
513 },
514 .shader = &dummy_shader,
515 };
516 if (layered_clear) {
517 vs = (struct ir3_shader_variant) {
518 .type = MESA_SHADER_VERTEX,
519 .instrlen = 1,
520 .info.max_reg = 0,
521 .shader = &dummy_shader,
522 };
523 }
524
525 struct ir3_shader_variant fs = {
526 .type = MESA_SHADER_FRAGMENT,
527 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
528 .constlen = num_rts,
529 .info.max_reg = MAX2(num_rts, 1) - 1,
530 .total_in = blit ? 2 : 0,
531 .num_samp = blit ? 1 : 0,
532 .inputs_count = blit ? 2 : 0,
533 .inputs[0] = {
534 .slot = VARYING_SLOT_VAR0,
535 .inloc = 0,
536 .compmask = 3,
537 .bary = true,
538 },
539 .inputs[1] = {
540 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
541 .regid = regid(0, 0),
542 .sysval = 1,
543 },
544 .num_sampler_prefetch = blit ? 1 : 0,
545 .sampler_prefetch[0] = {
546 .src = 0,
547 .wrmask = 0xf,
548 .cmd = 4,
549 },
550 .shader = &dummy_shader,
551 };
552
553 struct ir3_shader_variant gs_shader = {
554 .type = MESA_SHADER_GEOMETRY,
555 .instrlen = 1,
556 .constlen = 2,
557 .info.max_reg = 1,
558 .inputs_count = 1,
559 .inputs[0] = {
560 .slot = SYSTEM_VALUE_GS_HEADER_IR3,
561 .regid = regid(0, 0),
562 .sysval = true,
563 },
564 .outputs_count = 3,
565 .outputs[0] = {
566 .slot = VARYING_SLOT_POS,
567 .regid = regid(0, 0),
568 },
569 .outputs[1] = {
570 .slot = VARYING_SLOT_LAYER,
571 .regid = regid(1, 1),
572 },
573 .outputs[2] = {
574 .slot = VARYING_SLOT_GS_VERTEX_FLAGS_IR3,
575 .regid = regid(1, 0),
576 },
577 .shader = &dummy_shader,
578 }, *gs = layered_clear ? &gs_shader : NULL;
579
580
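/* Helpers for hand-encoding raw ir3 instruction words for the small built-in
 * shaders below (cat1 mov, cat2 two-source ALU, cat3 three-source ALU).
 */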
581 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, args } }
582 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
583 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
584
585 static const instr_t vs_code[] = {
586 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
587 * r1.xy = r0.w ? c1.zw : c0.zw
588 * r0.w = 1.0f
589 */
590 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
591 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
592 .src2 = 3,
593 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
594 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
595 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
596 .src2 = 3,
597 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
598 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
599 { .cat0 = { .opc = OPC_END } },
600 };
601
602 static const instr_t vs_layered[] = {
603 { .cat0 = { .opc = OPC_CHMASK } },
604 { .cat0 = { .opc = OPC_CHSH } },
605 };
606
607 static const instr_t gs_code[16] = {
608 /* (sy)(ss)(nop3)shr.b r0.w, r0.x, 16 (extract local_id) */
609 CAT2(OPC_SHR_B, .dst = 3, .src1 = 0, .src2_im = 1, .src2 = 16,
610 .src1_r = 1, .src2_r = 1, .ss = 1, .sync = 1),
611 /* x = (local_id & 1) ? c1.x : c0.x */
612 CAT2(OPC_AND_B, .dst = 0, .src1 = 3, .src2_im = 1, .src2 = 1),
613 /* y = (local_id & 2) ? c1.y : c0.y */
614 CAT2(OPC_AND_B, .dst = 1, .src1 = 3, .src2_im = 1, .src2 = 2),
615 /* pred = (local_id >= 4), used by OPC_KILL */
616 CAT2(OPC_CMPS_S, .dst = REG_P0 * 4, .cond = IR3_COND_GE, .src1 = 3, .src2_im = 1, .src2 = 4),
617 /* vertex_flags_out = (local_id == 0) ? 4 : 0 - first vertex flag */
618 CAT2(OPC_CMPS_S, .dst = 4, .cond = IR3_COND_EQ, .src1 = 3, .src2_im = 1, .src2 = 0),
619
620 MOV(.dst = 2, .src_c = 1, .src = 2), /* depth clear value from c0.z */
621 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f),
622 MOV(.dst = 5, .src_c = 1, .src = 3), /* layer id from c0.w */
623
624 /* (rpt1)sel.b32 r0.x, (r)c1.x, (r)r0.x, (r)c0.x */
625 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 0,
626 .c1 = {.src1_c = 1, .src1 = 4, .dummy = 4}, .src1_r = 1,
627 .src2 = 0,
628 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
629
630 CAT2(OPC_SHL_B, .dst = 4, .src1 = 4, .src2_im = 1, .src2 = 2),
631
632 { .cat0 = { .opc = OPC_KILL } },
633 { .cat0 = { .opc = OPC_END, .ss = 1, .sync = 1 } },
634 };
635 #define FS_OFFSET (16 * sizeof(instr_t))
636 #define GS_OFFSET (32 * sizeof(instr_t))
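/* The shaders are packed into one allocation in 16-instruction slots: VS at
 * offset 0, FS at FS_OFFSET and, for layered clears, GS at GS_OFFSET; hence
 * the "2 + layered_clear" units passed to tu_cs_alloc() below.
 */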
637
638 /* shaders */
639 struct ts_cs_memory shaders = { };
640 VkResult result = tu_cs_alloc(&cmd->sub_cs, 2 + layered_clear,
641 16 * sizeof(instr_t), &shaders);
642 assert(result == VK_SUCCESS);
643
644 if (layered_clear) {
645 memcpy(shaders.map, vs_layered, sizeof(vs_layered));
646 memcpy((uint8_t*) shaders.map + GS_OFFSET, gs_code, sizeof(gs_code));
647 } else {
648 memcpy(shaders.map, vs_code, sizeof(vs_code));
649 }
650
651 instr_t *fs_code = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
652 for (uint32_t i = 0; i < num_rts; i++) {
653 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
654 *fs_code++ = (instr_t) { .cat1 = {
655 .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
656 .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4
657 } };
658 }
659
660 /* "bary.f (ei)r63.x, 0, r0.x" -- note the blob doesn't have this in its
661 * blit path (it's not clear what allows it to not have it)
662 */
663 if (blit) {
664 *fs_code++ = (instr_t) { .cat2 = {
665 .opc_cat = 2, .opc = OPC_BARY_F & 63, .ei = 1, .full = 1,
666 .dst = regid(63, 0), .src1_im = 1
667 } };
668 }
669 *fs_code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
670 /* note: assumed <= 16 instructions (MAX_RTS is 8) */
671
672 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
673
674 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, shaders.iova);
675 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
676 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
677 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, gs, shaders.iova + GS_OFFSET);
678 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, shaders.iova + FS_OFFSET);
679
680 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
681 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
682
683 tu6_emit_vpc(cs, &vs, gs, &fs, NULL);
684
685 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
686 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
687 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
688
689 tu6_emit_fs_inputs(cs, &fs);
690
691 tu_cs_emit_regs(cs,
692 A6XX_GRAS_CL_CNTL(
693 .persp_division_disable = 1,
694 .vp_xform_disable = 1,
695 .vp_clip_code_ignore = 1,
696 .clip_disable = 1),
697 A6XX_GRAS_UNKNOWN_8001(0));
698 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
699
700 tu_cs_emit_regs(cs,
701 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
702 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
703 tu_cs_emit_regs(cs,
704 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
705 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
706
707 tu_cs_emit_regs(cs,
708 A6XX_VFD_INDEX_OFFSET(),
709 A6XX_VFD_INSTANCE_START_OFFSET());
710 }
711
712 static void
713 r3d_coords_raw(struct tu_cs *cs, bool gs, const float *coords)
714 {
715 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
716 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
717 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
718 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
719 CP_LOAD_STATE6_0_STATE_BLOCK(gs ? SB6_GS_SHADER : SB6_VS_SHADER) |
720 CP_LOAD_STATE6_0_NUM_UNIT(2));
721 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
722 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
723 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
724 }
725
726 static void
727 r3d_coords(struct tu_cs *cs,
728 const VkOffset2D *dst,
729 const VkOffset2D *src,
730 const VkExtent2D *extent)
731 {
732 int32_t src_x1 = src ? src->x : 0;
733 int32_t src_y1 = src ? src->y : 0;
734 r3d_coords_raw(cs, false, (float[]) {
735 dst->x, dst->y,
736 src_x1, src_y1,
737 dst->x + extent->width, dst->y + extent->height,
738 src_x1 + extent->width, src_y1 + extent->height,
739 });
740 }
741
742 static void
743 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
744 {
745 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
746 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
747 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
748 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
749 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
750 CP_LOAD_STATE6_0_NUM_UNIT(1));
751 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
752 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
753 switch (format) {
754 case VK_FORMAT_X8_D24_UNORM_PACK32:
755 case VK_FORMAT_D24_UNORM_S8_UINT: {
756 /* cleared as r8g8b8a8_unorm using special format */
757 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
758 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
759 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
760 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
761 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
762 } break;
763 case VK_FORMAT_D16_UNORM:
764 case VK_FORMAT_D32_SFLOAT:
765 tu_cs_emit(cs, fui(val->depthStencil.depth));
766 tu_cs_emit(cs, 0);
767 tu_cs_emit(cs, 0);
768 tu_cs_emit(cs, 0);
769 break;
770 case VK_FORMAT_S8_UINT:
771 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
772 tu_cs_emit(cs, 0);
773 tu_cs_emit(cs, 0);
774 tu_cs_emit(cs, 0);
775 break;
776 default:
777 /* color formats use the clear value as-is */
778 assert(!vk_format_is_depth_or_stencil(format));
779 tu_cs_emit_array(cs, val->color.uint32, 4);
780 break;
781 }
782 }
783
784 static void
785 r3d_src_common(struct tu_cmd_buffer *cmd,
786 struct tu_cs *cs,
787 const uint32_t *tex_const,
788 uint32_t offset_base,
789 uint32_t offset_ubwc,
790 VkFilter filter)
791 {
792 struct ts_cs_memory texture = { };
793 VkResult result = tu_cs_alloc(&cmd->sub_cs,
794 2, /* allocate space for a sampler too */
795 A6XX_TEX_CONST_DWORDS, &texture);
796 assert(result == VK_SUCCESS);
797
798 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
799
800 /* patch addresses for layer offset */
801 *(uint64_t*) (texture.map + 4) += offset_base;
802 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
803 texture.map[7] = ubwc_addr;
804 texture.map[8] = ubwc_addr >> 32;
805
806 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
807 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
808 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
809 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
810 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
811 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
812 0x60000; /* XXX used by blob, doesn't seem necessary */
813 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
814 0x1 | /* XXX used by blob, doesn't seem necessary */
815 A6XX_TEX_SAMP_1_UNNORM_COORDS |
816 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
817 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
818 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
819
820 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
821 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
822 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
823 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
824 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
825 CP_LOAD_STATE6_0_NUM_UNIT(1));
826 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
827
828 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
829 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
830
831 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
832 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
833 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
834 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
835 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
836 CP_LOAD_STATE6_0_NUM_UNIT(1));
837 tu_cs_emit_qw(cs, texture.iova);
838
839 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
840 tu_cs_emit_qw(cs, texture.iova);
841
842 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
843 }
844
845 static void
846 r3d_src(struct tu_cmd_buffer *cmd,
847 struct tu_cs *cs,
848 const struct tu_image_view *iview,
849 uint32_t layer,
850 VkFilter filter)
851 {
852 r3d_src_common(cmd, cs, iview->descriptor,
853 iview->layer_size * layer,
854 iview->ubwc_layer_size * layer,
855 filter);
856 }
857
858 static void
859 r3d_src_buffer(struct tu_cmd_buffer *cmd,
860 struct tu_cs *cs,
861 VkFormat vk_format,
862 uint64_t va, uint32_t pitch,
863 uint32_t width, uint32_t height)
864 {
865 uint32_t desc[A6XX_TEX_CONST_DWORDS];
866
867 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
868
869 desc[0] =
870 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
871 A6XX_TEX_CONST_0_FMT(format.fmt) |
872 A6XX_TEX_CONST_0_SWAP(format.swap) |
873 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
874 // XXX to swizzle into .w for stencil buffer_to_image
875 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
876 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
877 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
878 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
879 desc[2] =
880 A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) |
881 A6XX_TEX_CONST_2_PITCH(pitch) |
882 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
883 desc[3] = 0;
884 desc[4] = va;
885 desc[5] = va >> 32;
886 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
887 desc[i] = 0;
888
889 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
890 }
891
892 static void
893 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
894 {
895 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
896
897 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
898 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
899 tu_cs_image_ref(cs, iview, layer);
900 tu_cs_emit(cs, 0);
901
902 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
903 tu_cs_image_flag_ref(cs, iview, layer);
904
905 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
906 }
907
908 static void
909 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
910 {
911 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
912
913 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
914
915 tu_cs_emit_regs(cs,
916 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
917 A6XX_RB_MRT_PITCH(0, pitch),
918 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
919 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
920 A6XX_RB_MRT_BASE_HI(0, va >> 32),
921 A6XX_RB_MRT_BASE_GMEM(0, 0));
922
923 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
924 }
925
926 static void
927 r3d_setup(struct tu_cmd_buffer *cmd,
928 struct tu_cs *cs,
929 VkFormat vk_format,
930 enum a6xx_rotation rotation,
931 bool clear,
932 uint8_t mask)
933 {
934 if (!cmd->state.pass) {
935 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
936 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
937 }
938
939 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
940 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
941
942 r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
943
944 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
945 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
946 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
947 0xfc000000);
948 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
949
950 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
951 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
952
953 tu_cs_emit_regs(cs,
954 A6XX_RB_FS_OUTPUT_CNTL0(),
955 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
956
957 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
958 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
959 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
960
961 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
962 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
963 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
964 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
965 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
966 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
967 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
968
969 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
970 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
971
972 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
973 .color_format = tu6_base_format(vk_format),
974 .color_sint = vk_format_is_sint(vk_format),
975 .color_uint = vk_format_is_uint(vk_format)));
976
977 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask));
978 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
979 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
980 }
981
982 static void
983 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
984 {
985 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
986 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
987 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
988 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
989 tu_cs_emit(cs, 1); /* instance count */
990 tu_cs_emit(cs, 2); /* vertex count */
991 }
992
993 /* blit ops - common interface for 2d/shader paths */
994
995 struct blit_ops {
996 void (*coords)(struct tu_cs *cs,
997 const VkOffset2D *dst,
998 const VkOffset2D *src,
999 const VkExtent2D *extent);
1000 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
1001 void (*src)(
1002 struct tu_cmd_buffer *cmd,
1003 struct tu_cs *cs,
1004 const struct tu_image_view *iview,
1005 uint32_t layer,
1006 VkFilter filter);
1007 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1008 VkFormat vk_format,
1009 uint64_t va, uint32_t pitch,
1010 uint32_t width, uint32_t height);
1011 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1012 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
1013 void (*setup)(struct tu_cmd_buffer *cmd,
1014 struct tu_cs *cs,
1015 VkFormat vk_format,
1016 enum a6xx_rotation rotation,
1017 bool clear,
1018 uint8_t mask);
1019 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1020 };
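/* r2d_ops drives the dedicated 2D blit engine, while r3d_ops draws a RECTLIST
 * with the hand-built shaders above; the 3D path is used where the 2D engine
 * can't be (e.g. multisampled destinations, stencil-aspect copies, cubic
 * filtering).
 */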
1021
1022 static const struct blit_ops r2d_ops = {
1023 .coords = r2d_coords,
1024 .clear_value = r2d_clear_value,
1025 .src = r2d_src,
1026 .src_buffer = r2d_src_buffer,
1027 .dst = r2d_dst,
1028 .dst_buffer = r2d_dst_buffer,
1029 .setup = r2d_setup,
1030 .run = r2d_run,
1031 };
1032
1033 static const struct blit_ops r3d_ops = {
1034 .coords = r3d_coords,
1035 .clear_value = r3d_clear_value,
1036 .src = r3d_src,
1037 .src_buffer = r3d_src_buffer,
1038 .dst = r3d_dst,
1039 .dst_buffer = r3d_dst_buffer,
1040 .setup = r3d_setup,
1041 .run = r3d_run,
1042 };
1043
1044 /* passthrough: set coords from 3D offsets/extents (the 2D structs alias the leading members of their 3D counterparts) */
1045 static void
1046 coords(const struct blit_ops *ops,
1047 struct tu_cs *cs,
1048 const VkOffset3D *dst,
1049 const VkOffset3D *src,
1050 const VkExtent3D *extent)
1051 {
1052 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
1053 }
1054
1055 static void
1056 tu_image_view_blit2(struct tu_image_view *iview,
1057 struct tu_image *image,
1058 VkFormat format,
1059 const VkImageSubresourceLayers *subres,
1060 uint32_t layer,
1061 bool stencil_read)
1062 {
1063 VkImageAspectFlags aspect_mask = subres->aspectMask;
1064
1065 /* always use the AS_R8G8B8A8 format for these */
1066 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
1067 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
1068 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
1069 }
1070
1071 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
1072 .image = tu_image_to_handle(image),
1073 .viewType = VK_IMAGE_VIEW_TYPE_2D,
1074 .format = format,
1075 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
1076 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
1077 .subresourceRange = {
1078 .aspectMask = aspect_mask,
1079 .baseMipLevel = subres->mipLevel,
1080 .levelCount = 1,
1081 .baseArrayLayer = subres->baseArrayLayer + layer,
1082 .layerCount = 1,
1083 },
1084 });
1085 }
1086
1087 static void
1088 tu_image_view_blit(struct tu_image_view *iview,
1089 struct tu_image *image,
1090 const VkImageSubresourceLayers *subres,
1091 uint32_t layer)
1092 {
1093 tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false);
1094 }
1095
1096 static void
1097 tu6_blit_image(struct tu_cmd_buffer *cmd,
1098 struct tu_image *src_image,
1099 struct tu_image *dst_image,
1100 const VkImageBlit *info,
1101 VkFilter filter)
1102 {
1103 const struct blit_ops *ops = &r2d_ops;
1104 struct tu_cs *cs = &cmd->cs;
1105 uint32_t layers;
1106
1107 /* the 2D blitter can't mirror from coordinates alone, so mirroring is expressed via the rotate/flip modes below */
1108 static const enum a6xx_rotation rotate[2][2] = {
1109 {ROTATE_0, ROTATE_HFLIP},
1110 {ROTATE_VFLIP, ROTATE_180},
1111 };
1112
1113 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1114 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1115 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1116 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1117 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
1118 (info->dstOffsets[1].z < info->dstOffsets[0].z);
1119
1120 if (mirror_z) {
1121 tu_finishme("blit z mirror\n");
1122 return;
1123 }
1124
1125 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1126 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1127 tu_finishme("blit z filter\n");
1128 return;
1129 }
1130
1131 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1132 if (info->dstSubresource.layerCount > 1) {
1133 assert(layers <= 1);
1134 layers = info->dstSubresource.layerCount;
1135 }
1136
1137 uint8_t mask = 0xf;
1138 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1139 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1140 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1141 mask = 0x7;
1142 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1143 mask = 0x8;
1144 }
1145
1146 /* BC1_RGB_* formats need to have their last component overridden with 1
1147 * when sampling, which is normally handled with the texture descriptor
1148 * swizzle. The 2d path can't handle that, so use the 3d path.
1149 *
1150 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1151 * the 2d path.
1152 */
1153
1154 if (dst_image->samples > 1 ||
1155 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1156 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
1157 filter == VK_FILTER_CUBIC_EXT)
1158 ops = &r3d_ops;
1159
1160 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
1161 * figure out why (should be able to pass all tests with only shader path)
1162 */
1163
1164 ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask);
1165
1166 if (ops == &r3d_ops) {
1167 r3d_coords_raw(cs, false, (float[]) {
1168 info->dstOffsets[0].x, info->dstOffsets[0].y,
1169 info->srcOffsets[0].x, info->srcOffsets[0].y,
1170 info->dstOffsets[1].x, info->dstOffsets[1].y,
1171 info->srcOffsets[1].x, info->srcOffsets[1].y
1172 });
1173 } else {
1174 tu_cs_emit_regs(cs,
1175 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1176 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1177 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1178 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1179 tu_cs_emit_regs(cs,
1180 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1181 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1182 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1183 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1184 }
1185
1186 struct tu_image_view dst, src;
1187 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1188 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1189
1190 for (uint32_t i = 0; i < layers; i++) {
1191 ops->dst(cs, &dst, i);
1192 ops->src(cmd, cs, &src, i, filter);
1193 ops->run(cmd, cs);
1194 }
1195 }
1196
1197 void
1198 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1199 VkImage srcImage,
1200 VkImageLayout srcImageLayout,
1201 VkImage dstImage,
1202 VkImageLayout dstImageLayout,
1203 uint32_t regionCount,
1204 const VkImageBlit *pRegions,
1205 VkFilter filter)
1206
1207 {
1208 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1209 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1210 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1211
1212 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1213 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1214
1215 for (uint32_t i = 0; i < regionCount; ++i)
1216 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1217 }
1218
1219 static VkFormat
1220 copy_format(VkFormat format)
1221 {
1222 switch (vk_format_get_blocksize(format)) {
1223 case 1: return VK_FORMAT_R8_UINT;
1224 case 2: return VK_FORMAT_R16_UINT;
1225 case 4: return VK_FORMAT_R32_UINT;
1226 case 8: return VK_FORMAT_R32G32_UINT;
1227 case 12:return VK_FORMAT_R32G32B32_UINT;
1228 case 16:return VK_FORMAT_R32G32B32A32_UINT;
1229 default:
1230 unreachable("unhandled format size");
1231 }
1232 }
1233
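/* For compressed formats, convert offsets and sizes from texels to block
 * units; such copies are done by reinterpreting the data with an uncompressed
 * format of the same block size (see copy_format()).
 */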
1234 static void
1235 copy_compressed(VkFormat format,
1236 VkOffset3D *offset,
1237 VkExtent3D *extent,
1238 uint32_t *width,
1239 uint32_t *height)
1240 {
1241 if (!vk_format_is_compressed(format))
1242 return;
1243
1244 uint32_t block_width = vk_format_get_blockwidth(format);
1245 uint32_t block_height = vk_format_get_blockheight(format);
1246
1247 offset->x /= block_width;
1248 offset->y /= block_height;
1249
1250 if (extent) {
1251 extent->width = DIV_ROUND_UP(extent->width, block_width);
1252 extent->height = DIV_ROUND_UP(extent->height, block_height);
1253 }
1254 if (width)
1255 *width = DIV_ROUND_UP(*width, block_width);
1256 if (height)
1257 *height = DIV_ROUND_UP(*height, block_height);
1258 }
1259
1260 static void
1261 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1262 struct tu_buffer *src_buffer,
1263 struct tu_image *dst_image,
1264 const VkBufferImageCopy *info)
1265 {
1266 struct tu_cs *cs = &cmd->cs;
1267 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1268 VkFormat dst_format = dst_image->vk_format;
1269 VkFormat src_format = dst_image->vk_format;
1270 const struct blit_ops *ops = &r2d_ops;
1271
1272 uint8_t mask = 0xf;
1273
1274 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1275 switch (info->imageSubresource.aspectMask) {
1276 case VK_IMAGE_ASPECT_STENCIL_BIT:
1277 src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */
1278 mask = 0x8;
1279 ops = &r3d_ops;
1280 break;
1281 case VK_IMAGE_ASPECT_DEPTH_BIT:
1282 mask = 0x7;
1283 break;
1284 }
1285 }
1286
1287 VkOffset3D offset = info->imageOffset;
1288 VkExtent3D extent = info->imageExtent;
1289 uint32_t src_width = info->bufferRowLength ?: extent.width;
1290 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1291
1292 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
1293 assert(src_format == dst_format);
1294 copy_compressed(dst_format, &offset, &extent, &src_width, &src_height);
1295 src_format = dst_format = copy_format(dst_format);
1296 }
1297
1298 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1299 uint32_t layer_size = src_height * pitch;
1300
1301 /* note: the src_va/pitch alignment of 64 is for 2D engine,
1302 * it is also valid for 1cpp format with shader path (stencil aspect path)
1303 */
1304
1305 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1306
1307 struct tu_image_view dst;
1308 tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false);
1309
1310 for (uint32_t i = 0; i < layers; i++) {
1311 ops->dst(cs, &dst, i);
1312
1313 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1314 if ((src_va & 63) || (pitch & 63)) {
1315 for (uint32_t y = 0; y < extent.height; y++) {
1316 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1317 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1318 x + extent.width, 1);
1319 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1320 &(VkExtent2D) {extent.width, 1});
1321 ops->run(cmd, cs);
1322 src_va += pitch;
1323 }
1324 } else {
1325 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1326 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1327 ops->run(cmd, cs);
1328 }
1329 }
1330 }
1331
1332 void
1333 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1334 VkBuffer srcBuffer,
1335 VkImage dstImage,
1336 VkImageLayout dstImageLayout,
1337 uint32_t regionCount,
1338 const VkBufferImageCopy *pRegions)
1339 {
1340 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1341 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1342 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1343
1344 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1345 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1346
1347 for (unsigned i = 0; i < regionCount; ++i)
1348 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1349 }
1350
1351 static void
1352 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1353 struct tu_image *src_image,
1354 struct tu_buffer *dst_buffer,
1355 const VkBufferImageCopy *info)
1356 {
1357 struct tu_cs *cs = &cmd->cs;
1358 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1359 VkFormat src_format = src_image->vk_format;
1360 VkFormat dst_format = src_image->vk_format;
1361 bool stencil_read = false;
1362
1363 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1364 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1365 dst_format = VK_FORMAT_R8_UNORM;
1366 stencil_read = true;
1367 }
1368
1369 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1370 VkOffset3D offset = info->imageOffset;
1371 VkExtent3D extent = info->imageExtent;
1372 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1373 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1374
1375 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) {
1376 assert(src_format == dst_format);
1377 copy_compressed(dst_format, &offset, &extent, &dst_width, &dst_height);
1378 src_format = dst_format = copy_format(dst_format);
1379 }
1380
1381 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1382 uint32_t layer_size = pitch * dst_height;
1383
1384 /* note: the dst_va/pitch alignment of 64 is for 2D engine,
1385 * it is also valid for 1cpp format with shader path (stencil aspect)
1386 */
1387
1388 ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
1389
1390 struct tu_image_view src;
1391 tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read);
1392
1393 for (uint32_t i = 0; i < layers; i++) {
1394 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1395
1396 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1397 if ((dst_va & 63) || (pitch & 63)) {
1398 for (uint32_t y = 0; y < extent.height; y++) {
1399 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1400 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1401 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1402 &(VkExtent2D) {extent.width, 1});
1403 ops->run(cmd, cs);
1404 dst_va += pitch;
1405 }
1406 } else {
1407 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1408 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1409 ops->run(cmd, cs);
1410 }
1411 }
1412 }
1413
1414 void
1415 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1416 VkImage srcImage,
1417 VkImageLayout srcImageLayout,
1418 VkBuffer dstBuffer,
1419 uint32_t regionCount,
1420 const VkBufferImageCopy *pRegions)
1421 {
1422 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1423 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1424 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1425
1426 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1427 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1428
1429 for (unsigned i = 0; i < regionCount; ++i)
1430 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1431 }
1432
1433 /* Tiled formats don't support swapping, which means that we can't support
1434 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1435 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1436 * Currently we fake support for tiled swapped formats and use the unswapped
1437 * format instead, but this means that reinterpreting copies to and from
1438 * swapped formats can't be performed correctly unless we can swizzle the
1439 * components by reinterpreting the other image as the "correct" swapped
1440 * format, i.e. only when the other image is linear.
1441 */
1442
1443 static bool
1444 is_swapped_format(VkFormat format)
1445 {
1446 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1447 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1448 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1449 }
1450
1451 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1452 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1453 * versa). This should mirror the logic in fdl6_layout.
1454 */
1455 static bool
1456 image_is_r8g8(struct tu_image *image)
1457 {
1458 return image->layout.cpp == 2 &&
1459 vk_format_get_nr_components(image->vk_format) == 2;
1460 }
1461
1462 static void
1463 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1464 struct tu_image *src_image,
1465 struct tu_image *dst_image,
1466 const VkImageCopy *info)
1467 {
1468 const struct blit_ops *ops = &r2d_ops;
1469 struct tu_cs *cs = &cmd->cs;
1470
1471 uint8_t mask = 0xf;
1472 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1473 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1474 mask = 0x7;
1475 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1476 mask = 0x8;
1477 }
1478
1479 if (dst_image->samples > 1)
1480 ops = &r3d_ops;
1481
1482 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1483
1484 VkFormat format = VK_FORMAT_UNDEFINED;
1485 VkOffset3D src_offset = info->srcOffset;
1486 VkOffset3D dst_offset = info->dstOffset;
1487 VkExtent3D extent = info->extent;
1488
1489 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1490 * Images":
1491 *
1492 * When copying between compressed and uncompressed formats the extent
1493 * members represent the texel dimensions of the source image and not
1494 * the destination. When copying from a compressed image to an
1495 * uncompressed image the image texel dimensions written to the
1496 * uncompressed image will be source extent divided by the compressed
1497 * texel block dimensions. When copying from an uncompressed image to a
1498 * compressed image the image texel dimensions written to the compressed
1499 * image will be the source extent multiplied by the compressed texel
1500 * block dimensions.
1501 *
1502 * This means we only have to adjust the extent if the source image is
1503 * compressed.
1504 */
1505 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1506 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1507
1508 VkFormat dst_format = vk_format_is_compressed(dst_image->vk_format) ?
1509 copy_format(dst_image->vk_format) : dst_image->vk_format;
1510 VkFormat src_format = vk_format_is_compressed(src_image->vk_format) ?
1511 copy_format(src_image->vk_format) : src_image->vk_format;
1512
1513 bool use_staging_blit = false;
1514
1515 if (src_format == dst_format) {
1516 /* Images that share a format can always be copied directly because it's
1517 * the same as a blit.
1518 */
1519 format = src_format;
1520 } else if (!src_image->layout.tile_mode) {
1521 /* If an image is linear, we can always safely reinterpret it with the
1522 * other image's format and then do a regular blit.
1523 */
1524 format = dst_format;
1525 } else if (!dst_image->layout.tile_mode) {
1526 format = src_format;
1527 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1528 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1529 * due to the different tile layout.
1530 */
1531 use_staging_blit = true;
1532 } else if (is_swapped_format(src_format) ||
1533 is_swapped_format(dst_format)) {
1534 /* If either format has a non-identity swap, then we can't copy
1535 * to/from it.
1536 */
1537 use_staging_blit = true;
1538 } else if (!src_image->layout.ubwc) {
1539 format = dst_format;
1540 } else if (!dst_image->layout.ubwc) {
1541 format = src_format;
1542 } else {
1543 /* Both formats use UBWC and so neither can be reinterpreted.
1544 * TODO: We could do an in-place decompression of the dst instead.
1545 */
1546 use_staging_blit = true;
1547 }
1548
1549 struct tu_image_view dst, src;
1550
1551 if (use_staging_blit) {
1552 tu_image_view_blit2(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1553 tu_image_view_blit2(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1554
1555 struct tu_image staging_image = {
1556 .vk_format = src_format,
1557 .type = src_image->type,
1558 .tiling = VK_IMAGE_TILING_LINEAR,
1559 .extent = extent,
1560 .level_count = 1,
1561 .layer_count = info->srcSubresource.layerCount,
1562 .samples = src_image->samples,
1563 .bo_offset = 0,
1564 };
1565
1566 VkImageSubresourceLayers staging_subresource = {
1567 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1568 .mipLevel = 0,
1569 .baseArrayLayer = 0,
1570 .layerCount = info->srcSubresource.layerCount,
1571 };
1572
1573 VkOffset3D staging_offset = { 0 };
1574
1575 staging_image.layout.tile_mode = TILE6_LINEAR;
1576 staging_image.layout.ubwc = false;
1577
1578 fdl6_layout(&staging_image.layout,
1579 vk_format_to_pipe_format(staging_image.vk_format),
1580 staging_image.samples,
1581 staging_image.extent.width,
1582 staging_image.extent.height,
1583 staging_image.extent.depth,
1584 staging_image.level_count,
1585 staging_image.layer_count,
1586 staging_image.type == VK_IMAGE_TYPE_3D,
1587 NULL);
1588
1589 VkResult result = tu_get_scratch_bo(cmd->device,
1590 staging_image.layout.size,
1591 &staging_image.bo);
1592 if (result != VK_SUCCESS) {
1593 cmd->record_result = result;
1594 return;
1595 }
1596
1597 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1598 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1599
1600 struct tu_image_view staging;
1601 tu_image_view_blit2(&staging, &staging_image, src_format,
1602 &staging_subresource, 0, false);
1603
1604 ops->setup(cmd, cs, src_format, ROTATE_0, false, mask);
1605 coords(ops, cs, &staging_offset, &src_offset, &extent);
1606
1607 for (uint32_t i = 0; i < info->extent.depth; i++) {
1608 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1609 ops->dst(cs, &staging, i);
1610 ops->run(cmd, cs);
1611 }
1612
1613 /* When executed by the user there has to be a pipeline barrier here,
1614 * but since we're doing it manually we'll have to flush ourselves.
1615 */
1616 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1617 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1618
1619 tu_image_view_blit2(&staging, &staging_image, dst_format,
1620 &staging_subresource, 0, false);
1621
1622 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1623 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1624
1625 for (uint32_t i = 0; i < info->extent.depth; i++) {
1626 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1627 ops->dst(cs, &dst, i);
1628 ops->run(cmd, cs);
1629 }
1630 } else {
1631 tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1632 tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1633
1634 ops->setup(cmd, cs, format, ROTATE_0, false, mask);
1635 coords(ops, cs, &dst_offset, &src_offset, &extent);
1636
1637 for (uint32_t i = 0; i < info->extent.depth; i++) {
1638 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1639 ops->dst(cs, &dst, i);
1640 ops->run(cmd, cs);
1641 }
1642 }
1643 }
1644
1645 void
1646 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1647 VkImage srcImage,
1648 VkImageLayout srcImageLayout,
1649 VkImage destImage,
1650 VkImageLayout destImageLayout,
1651 uint32_t regionCount,
1652 const VkImageCopy *pRegions)
1653 {
1654 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1655 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1656 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1657
1658 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1659 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1660
1661 for (uint32_t i = 0; i < regionCount; ++i)
1662 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1663 }
1664
1665 static void
1666 copy_buffer(struct tu_cmd_buffer *cmd,
1667 uint64_t dst_va,
1668 uint64_t src_va,
1669 uint64_t size,
1670 uint32_t block_size)
1671 {
1672 const struct blit_ops *ops = &r2d_ops;
1673 struct tu_cs *cs = &cmd->cs;
1674 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1675 uint64_t blocks = size / block_size;
1676
1677 ops->setup(cmd, cs, format, ROTATE_0, false, 0xf);
1678
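/* Copy in chunks: base addresses given to the 2D engine are aligned down to
 * 64 bytes with the remainder folded into the x offset, and each pass is
 * capped at 0x4000 texels (presumably the engine's per-pass width limit).
 */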
1679 while (blocks) {
1680 uint32_t src_x = (src_va & 63) / block_size;
1681 uint32_t dst_x = (dst_va & 63) / block_size;
1682 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1683
1684 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1685 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1686 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1687 ops->run(cmd, cs);
1688
1689 src_va += width * block_size;
1690 dst_va += width * block_size;
1691 blocks -= width;
1692 }
1693 }
1694
1695 void
1696 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1697 VkBuffer srcBuffer,
1698 VkBuffer dstBuffer,
1699 uint32_t regionCount,
1700 const VkBufferCopy *pRegions)
1701 {
1702 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1703 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1704 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1705
1706 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1707 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1708
1709 for (unsigned i = 0; i < regionCount; ++i) {
1710 copy_buffer(cmd,
1711 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1712 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1713 pRegions[i].size, 1);
1714 }
1715 }
1716
1717 void
1718 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1719 VkBuffer dstBuffer,
1720 VkDeviceSize dstOffset,
1721 VkDeviceSize dataSize,
1722 const void *pData)
1723 {
1724 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1725 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1726
1727 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1728
1729 struct ts_cs_memory tmp;
1730 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1731 if (result != VK_SUCCESS) {
1732 cmd->record_result = result;
1733 return;
1734 }
1735
1736 memcpy(tmp.map, pData, dataSize);
1737 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1738 }
1739
1740 void
1741 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1742 VkBuffer dstBuffer,
1743 VkDeviceSize dstOffset,
1744 VkDeviceSize fillSize,
1745 uint32_t data)
1746 {
1747 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1748 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1749 const struct blit_ops *ops = &r2d_ops;
1750 struct tu_cs *cs = &cmd->cs;
1751
1752 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1753
1754 if (fillSize == VK_WHOLE_SIZE)
1755 fillSize = buffer->size - dstOffset;
1756
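   /* dstOffset and fillSize are multiples of 4 (any VK_WHOLE_SIZE remainder
    * is dropped), so the fill is done as a clear of R32_UINT blocks, split
    * into blits of at most 0x4000 blocks each.
    */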
1757 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1758 uint32_t blocks = fillSize / 4;
1759
1760 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf);
1761 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1762
1763 while (blocks) {
1764 uint32_t dst_x = (dst_va & 63) / 4;
1765 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1766
1767 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1768 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1769 ops->run(cmd, cs);
1770
1771 dst_va += width * 4;
1772 blocks -= width;
1773 }
1774 }
1775
1776 void
1777 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1778 VkImage srcImage,
1779 VkImageLayout srcImageLayout,
1780 VkImage dstImage,
1781 VkImageLayout dstImageLayout,
1782 uint32_t regionCount,
1783 const VkImageResolve *pRegions)
1784 {
1785 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1786 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1787 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1788 const struct blit_ops *ops = &r2d_ops;
1789 struct tu_cs *cs = &cmd->cs;
1790
1791 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1792 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1793
1794 ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf);
1795
1796 for (uint32_t i = 0; i < regionCount; ++i) {
1797 const VkImageResolve *info = &pRegions[i];
1798 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1799
1800 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1801       /* TODO: aspect masks possible? */
1802
1803 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1804
1805 struct tu_image_view dst, src;
1806 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1807 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1808
1809 for (uint32_t i = 0; i < layers; i++) {
1810 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1811 ops->dst(cs, &dst, i);
1812 ops->run(cmd, cs);
1813 }
1814 }
1815 }
1816
1817 void
1818 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1819 struct tu_cs *cs,
1820 struct tu_image_view *src,
1821 struct tu_image_view *dst,
1822 uint32_t layers,
1823 const VkRect2D *rect)
1824 {
1825 const struct blit_ops *ops = &r2d_ops;
1826
1827 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1828 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1829
1830 assert(src->image->vk_format == dst->image->vk_format);
1831
1832 ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf);
1833 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1834
1835 for (uint32_t i = 0; i < layers; i++) {
1836 ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1837 ops->dst(cs, dst, i);
1838 ops->run(cmd, cs);
1839 }
1840 }
1841
1842 static void
1843 clear_image(struct tu_cmd_buffer *cmd,
1844 struct tu_image *image,
1845 const VkClearValue *clear_value,
1846 const VkImageSubresourceRange *range)
1847 {
1848 uint32_t level_count = tu_get_levelCount(image, range);
1849 uint32_t layer_count = tu_get_layerCount(image, range);
1850 struct tu_cs *cs = &cmd->cs;
1851 VkFormat format = image->vk_format;
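   /* E5B9G9R9 isn't supported as a render/blit destination, so clear it as
    * raw R32_UINT data; ops->clear_value() below is still given the original
    * format so the rgb9e5 clear value gets packed correctly.
    */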
1852 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1853 format = VK_FORMAT_R32_UINT;
1854
1855 if (image->type == VK_IMAGE_TYPE_3D) {
1856 assert(layer_count == 1);
1857 assert(range->baseArrayLayer == 0);
1858 }
1859
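   /* For D24S8 clear only the requested aspects: mask 0x7 covers the packed
    * depth bits and 0x8 the stencil byte.
    */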
1860 uint8_t mask = 0xf;
1861 if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1862 mask = 0;
1863 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
1864 mask |= 0x7;
1865 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
1866 mask |= 0x8;
1867 }
1868
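   /* The 2D engine can't clear multisampled images, so use the 3D path for
    * those.
    */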
1869 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1870
1871 ops->setup(cmd, cs, format, ROTATE_0, true, mask);
1872 ops->clear_value(cs, image->vk_format, clear_value);
1873
1874 for (unsigned j = 0; j < level_count; j++) {
1875 if (image->type == VK_IMAGE_TYPE_3D)
1876 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1877
1878 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1879 u_minify(image->extent.width, range->baseMipLevel + j),
1880 u_minify(image->extent.height, range->baseMipLevel + j)
1881 });
1882
1883 struct tu_image_view dst;
1884 tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) {
1885 .aspectMask = range->aspectMask,
1886 .mipLevel = range->baseMipLevel + j,
1887 .baseArrayLayer = range->baseArrayLayer,
1888 .layerCount = 1,
1889 }, 0, false);
1890
1891 for (uint32_t i = 0; i < layer_count; i++) {
1892 ops->dst(cs, &dst, i);
1893 ops->run(cmd, cs);
1894 }
1895 }
1896 }
1897
1898 void
1899 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1900 VkImage image_h,
1901 VkImageLayout imageLayout,
1902 const VkClearColorValue *pColor,
1903 uint32_t rangeCount,
1904 const VkImageSubresourceRange *pRanges)
1905 {
1906 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1907 TU_FROM_HANDLE(tu_image, image, image_h);
1908
1909 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1910
1911 for (unsigned i = 0; i < rangeCount; i++)
1912 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1913 }
1914
1915 void
1916 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1917 VkImage image_h,
1918 VkImageLayout imageLayout,
1919 const VkClearDepthStencilValue *pDepthStencil,
1920 uint32_t rangeCount,
1921 const VkImageSubresourceRange *pRanges)
1922 {
1923 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1924 TU_FROM_HANDLE(tu_image, image, image_h);
1925
1926 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1927
1928 for (unsigned i = 0; i < rangeCount; i++)
1929 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1930 }
1931
1932 static void
1933 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1934 uint32_t attachment_count,
1935 const VkClearAttachment *attachments,
1936 uint32_t rect_count,
1937 const VkClearRect *rects)
1938 {
1939 const struct tu_subpass *subpass = cmd->state.subpass;
1940    /* note: we cannot fall back to the shader path here; the special shader
1941     * path for sysmem clears lives in tu_clear_sysmem_attachments()
1942     */
1943 const struct blit_ops *ops = &r2d_ops;
1944 struct tu_cs *cs = &cmd->draw_cs;
1945
1946 for (uint32_t j = 0; j < attachment_count; j++) {
1947       /* The Vulkan spec, section 17.2 "Clearing Images Inside a Render
1948 * Pass Instance" says that:
1949 *
1950 * Unlike other clear commands, vkCmdClearAttachments executes as
1951 * a drawing command, rather than a transfer command, with writes
1952 * performed by it executing in rasterization order. Clears to
1953 * color attachments are executed as color attachment writes, by
1954 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1955 * Clears to depth/stencil attachments are executed as depth
1956 * writes and writes by the
1957 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1958 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1959 *
1960 * However, the 2d path here is executed the same way as a
1961 * transfer command, using the CCU color cache exclusively with
1962 * a special depth-as-color format for depth clears. This means that
1963 * we can't rely on the normal pipeline barrier mechanism here, and
1964 * have to manually flush whenever using a different cache domain
1965 * from what the 3d path would've used. This happens when we clear
1966 * depth/stencil, since normally depth attachments use CCU depth, but
1967 * we clear it using a special depth-as-color format. Since the clear
1968 * potentially uses a different attachment state we also need to
1969 * invalidate color beforehand and flush it afterwards.
1970 */
1971
1972 uint32_t a;
1973 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1974 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1975 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1976 } else {
1977 a = subpass->depth_stencil_attachment.attachment;
1978 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1979 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1980 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1981 }
1982
1983 if (a == VK_ATTACHMENT_UNUSED)
1984 continue;
1985
1986 uint8_t mask = 0xf;
1987 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
1988 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
1989 mask &= ~0x7;
1990 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
1991 mask &= ~0x8;
1992 }
1993
1994 const struct tu_image_view *iview =
1995 cmd->state.framebuffer->attachments[a].attachment;
1996
1997 ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask);
1998 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1999
2000 /* Wait for the flushes we triggered manually to complete */
2001 tu_cs_emit_wfi(cs);
2002
2003 for (uint32_t i = 0; i < rect_count; i++) {
2004 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
2005 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
2006 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
2007 ops->run(cmd, cs);
2008 }
2009 }
2010
2011 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2012 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2013 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2014 } else {
2015 /* sync color into depth */
2016 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2017 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2018 }
2019 }
2020 }
2021
2022 static void
2023 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
2024 uint32_t attachment_count,
2025 const VkClearAttachment *attachments,
2026 uint32_t rect_count,
2027 const VkClearRect *rects)
2028 {
2029 /* the shader path here is special, it avoids changing MRT/etc state */
2030 const struct tu_render_pass *pass = cmd->state.pass;
2031 const struct tu_subpass *subpass = cmd->state.subpass;
2032 const uint32_t mrt_count = subpass->color_count;
2033 struct tu_cs *cs = &cmd->draw_cs;
2034 uint32_t clear_value[MAX_RTS][4];
2035 float z_clear_val = 0.0f;
2036 uint8_t s_clear_val = 0;
2037 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
2038 bool z_clear = false;
2039 bool s_clear = false;
2040 bool layered_clear = false;
2041 uint32_t max_samples = 1;
2042
2043 for (uint32_t i = 0; i < attachment_count; i++) {
2044 uint32_t a;
2045 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2046 uint32_t c = attachments[i].colorAttachment;
2047 a = subpass->color_attachments[c].attachment;
2048 if (a == VK_ATTACHMENT_UNUSED)
2049 continue;
2050
2051 clear_rts |= 1 << c;
2052 clear_components |= 0xf << (c * 4);
2053 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
2054 } else {
2055 a = subpass->depth_stencil_attachment.attachment;
2056 if (a == VK_ATTACHMENT_UNUSED)
2057 continue;
2058
2059 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2060 z_clear = true;
2061 z_clear_val = attachments[i].clearValue.depthStencil.depth;
2062 }
2063
2064 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2065 s_clear = true;
2066 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
2067 }
2068 }
2069
2070 max_samples = MAX2(max_samples, pass->attachments[a].samples);
2071 }
2072
2073    /* Prefer the 2D path for clears. The 2D path can't clear separate
2074     * depth/stencil or MSAA attachments, and it needs a known framebuffer.
2075     */
2076 if (max_samples == 1 && cmd->state.framebuffer) {
2077 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
2078 return;
2079 }
2080
2081 /* This clear path behaves like a draw, needs the same flush as tu_draw */
2082 tu_emit_cache_flush_renderpass(cmd, cs);
2083
2084 /* disable all draw states so they don't interfere
2085 * TODO: use and re-use draw states for this path
2086 */
2087 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
2088 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
2089 CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
2090 CP_SET_DRAW_STATE__0_GROUP_ID(0));
2091 tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
2092 tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
2093 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
2094
2095 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
2096 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2097 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2098 0xfc000000);
2099 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2100
2101 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
2102 for (uint32_t i = 0; i < mrt_count; i++) {
2103 if (clear_rts & (1 << i))
2104 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
2105 else
2106 tu_cs_emit(cs, 0);
2107 }
2108
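   /* Check whether any rect needs a layered clear (nonzero base layer or
    * more than one layer); that selects the layered variant of the 3D path.
    */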
2109 for (uint32_t i = 0; i < rect_count; i++) {
2110 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
2111 layered_clear = true;
2112 }
2113
2114 r3d_common(cmd, cs, false, num_rts, layered_clear);
2115
2116 tu_cs_emit_regs(cs,
2117 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2118 tu_cs_emit_regs(cs,
2119 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2120
2121 tu_cs_emit_regs(cs,
2122 A6XX_RB_FS_OUTPUT_CNTL0(),
2123 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2124
2125 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2126 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2127 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
2128 for (uint32_t i = 0; i < mrt_count; i++) {
2129 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2130 .component_enable = COND(clear_rts & (1 << i), 0xf)));
2131 }
2132
2133 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2134 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2135 .z_enable = z_clear,
2136 .z_write_enable = z_clear,
2137 .zfunc = FUNC_ALWAYS));
2138 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2139 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2140 .stencil_enable = s_clear,
2141 .func = FUNC_ALWAYS,
2142 .zpass = STENCIL_REPLACE));
2143 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2144 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2145 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2146
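   /* Upload the clear colors as FS constants: one vec4 per render target
    * being cleared, in the same order the output registers were assigned
    * above.
    */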
2147 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
2148 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
2149 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2150 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2151 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
2152 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
2153 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2154 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2155 for_each_bit(b, clear_rts)
2156 tu_cs_emit_array(cs, clear_value[b], 4);
2157
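   /* One draw per rect and layer. Layered clears draw a single point per
    * layer instead of the usual blit rectangle.
    */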
2158 for (uint32_t i = 0; i < rect_count; i++) {
2159 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
2160 r3d_coords_raw(cs, layered_clear, (float[]) {
2161 rects[i].rect.offset.x, rects[i].rect.offset.y,
2162 z_clear_val, uif(rects[i].baseArrayLayer + layer),
2163 rects[i].rect.offset.x + rects[i].rect.extent.width,
2164 rects[i].rect.offset.y + rects[i].rect.extent.height,
2165 z_clear_val, 1.0f,
2166 });
2167
2168 if (layered_clear) {
2169 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
2170 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_POINTLIST) |
2171 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
2172 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) |
2173 CP_DRAW_INDX_OFFSET_0_GS_ENABLE);
2174 tu_cs_emit(cs, 1); /* instance count */
2175 tu_cs_emit(cs, 1); /* vertex count */
2176 } else {
2177 r3d_run(cmd, cs);
2178 }
2179 }
2180 }
2181 }
2182
2183 /**
2184  * Pack a VkClearValue into a 128-bit buffer. The format is respected except
2185  * for the component order: components are always packed in WZYX order,
2186  * because GMEM is tiled and tiled formats always use the WZYX swap.
2187  */
2188 static void
2189 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4])
2190 {
2191 const struct util_format_description *desc = vk_format_description(format);
2192
2193 switch (format) {
2194 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2195 buf[0] = float3_to_r11g11b10f(val->color.float32);
2196 return;
2197 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
2198 buf[0] = float3_to_rgb9e5(val->color.float32);
2199 return;
2200 default:
2201 break;
2202 }
2203
2204 assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
2205
2206 /* S8_UINT is special and has no depth */
2207 const int max_components =
2208 format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels;
2209
2210 int buf_offset = 0;
2211 int bit_shift = 0;
2212 for (int comp = 0; comp < max_components; comp++) {
2213 const struct util_format_channel_description *ch =
2214 tu_get_format_channel_description(desc, comp);
2215 if (!ch) {
2216 assert((format == VK_FORMAT_S8_UINT && comp == 0) ||
2217 (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1));
2218 continue;
2219 }
2220
2221 union tu_clear_component_value v = tu_get_clear_component_value(
2222 val, comp, desc->colorspace);
2223
2224 /* move to the next uint32_t when there is not enough space */
2225 assert(ch->size <= 32);
2226 if (bit_shift + ch->size > 32) {
2227 buf_offset++;
2228 bit_shift = 0;
2229 }
2230
2231 if (bit_shift == 0)
2232 buf[buf_offset] = 0;
2233
2234 buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift;
2235 bit_shift += ch->size;
2236 }
2237 }
2238
2239 static void
2240 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2241 struct tu_cs *cs,
2242 uint32_t attachment,
2243 uint8_t component_mask,
2244 const VkClearValue *value)
2245 {
2246 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2247 /* note: component_mask is 0x7 for depth and 0x8 for stencil
2248 * because D24S8 is cleared with AS_R8G8B8A8 format
2249 */
2250
2251 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2252 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2253
2254 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
2255 tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask));
2256
2257 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2258 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2259
2260 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2261 tu_cs_emit(cs, 0);
2262
2263 uint32_t clear_vals[4] = {};
2264 pack_gmem_clear_value(value, vk_format, clear_vals);
2265
2266 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2267 tu_cs_emit_array(cs, clear_vals, 4);
2268
2269 tu6_emit_event_write(cmd, cs, BLIT);
2270 }
2271
2272 static void
2273 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2274 uint32_t attachment_count,
2275 const VkClearAttachment *attachments,
2276 uint32_t rect_count,
2277 const VkClearRect *rects)
2278 {
2279 const struct tu_subpass *subpass = cmd->state.subpass;
2280 struct tu_cs *cs = &cmd->draw_cs;
2281
2282 /* TODO: swap the loops for smaller cmdstream */
2283 for (unsigned i = 0; i < rect_count; i++) {
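      /* note: the RB_BLIT_SCISSOR BR coordinates are inclusive, hence the -1 */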
2284 unsigned x1 = rects[i].rect.offset.x;
2285 unsigned y1 = rects[i].rect.offset.y;
2286 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2287 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2288
2289 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2290 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2291 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2292
2293 for (unsigned j = 0; j < attachment_count; j++) {
2294 uint32_t a;
2295 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2296 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2297 else
2298 a = subpass->depth_stencil_attachment.attachment;
2299
2300 if (a == VK_ATTACHMENT_UNUSED)
2301 continue;
2302
2303 unsigned clear_mask = 0xf;
2304 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
2305 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
2306 clear_mask &= ~0x7;
2307 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
2308 clear_mask &= ~0x8;
2309 }
2310
2311 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2312 &attachments[j].clearValue);
2313 }
2314 }
2315 }
2316
2317 void
2318 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2319 uint32_t attachmentCount,
2320 const VkClearAttachment *pAttachments,
2321 uint32_t rectCount,
2322 const VkClearRect *pRects)
2323 {
2324 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2325 struct tu_cs *cs = &cmd->draw_cs;
2326
2327 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2328 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2329 tu_cond_exec_end(cs);
2330
2331 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2332 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2333 tu_cond_exec_end(cs);
2334 }
2335
2336 void
2337 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2338 struct tu_cs *cs,
2339 uint32_t a,
2340 const VkRenderPassBeginInfo *info)
2341 {
2342 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2343 const struct tu_image_view *iview = fb->attachments[a].attachment;
2344 const struct tu_render_pass_attachment *attachment =
2345 &cmd->state.pass->attachments[a];
2346 uint8_t mask = 0;
2347
2348 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2349 mask = 0xf;
2350 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2351 mask |= 0x7;
2352 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2353 mask |= 0x8;
2354
2355 if (!mask)
2356 return;
2357
2358 const struct blit_ops *ops = &r2d_ops;
2359 if (attachment->samples > 1)
2360 ops = &r3d_ops;
2361
2362 ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
2363 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2364 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2365
2366 /* Wait for any flushes at the beginning of the renderpass to complete */
2367 tu_cs_emit_wfi(cs);
2368
2369 for (uint32_t i = 0; i < fb->layers; i++) {
2370 ops->dst(cs, iview, i);
2371 ops->run(cmd, cs);
2372 }
2373
2374 /* The spec doesn't explicitly say, but presumably the initial renderpass
2375 * clear is considered part of the renderpass, and therefore barriers
2376 * aren't required inside the subpass/renderpass. Therefore we need to
2377 * flush CCU color into CCU depth here, just like with
2378 * vkCmdClearAttachments(). Note that because this only happens at the
2379 * beginning of a renderpass, and renderpass writes are considered
2380 * "incoherent", we shouldn't have to worry about syncing depth into color
2381 * beforehand as depth should already be flushed.
2382 */
2383 if (vk_format_is_depth_or_stencil(attachment->format)) {
2384 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2385 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2386 } else {
2387 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2388 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2389 }
2390 }
2391
2392 void
2393 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2394 struct tu_cs *cs,
2395 uint32_t a,
2396 const VkRenderPassBeginInfo *info)
2397 {
2398 const struct tu_render_pass_attachment *attachment =
2399 &cmd->state.pass->attachments[a];
2400 unsigned clear_mask = 0;
2401
2402 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2403 clear_mask = 0xf;
2404 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2405 clear_mask |= 0x7;
2406 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2407 clear_mask |= 0x8;
2408
2409 if (!clear_mask)
2410 return;
2411
2412 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2413
2414 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2415 &info->pClearValues[a]);
2416 }
2417
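/* Emit a CP_EVENT_WRITE::BLIT for a GMEM attachment: loads the attachment
 * from sysmem into GMEM when resolve is false, and stores/resolves it from
 * GMEM back to sysmem when resolve is true.
 */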
2418 static void
2419 tu_emit_blit(struct tu_cmd_buffer *cmd,
2420 struct tu_cs *cs,
2421 const struct tu_image_view *iview,
2422 const struct tu_render_pass_attachment *attachment,
2423 bool resolve)
2424 {
2425 tu_cs_emit_regs(cs,
2426 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2427
2428 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2429 .unk0 = !resolve,
2430 .gmem = !resolve,
2431 /* "integer" bit disables msaa resolve averaging */
2432 .integer = vk_format_is_int(attachment->format)));
2433
2434 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2435 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2436 tu_cs_image_ref_2d(cs, iview, 0, false);
2437
2438 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2439 tu_cs_image_flag_ref(cs, iview, 0);
2440
2441 tu_cs_emit_regs(cs,
2442 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2443
2444 tu6_emit_event_write(cmd, cs, BLIT);
2445 }
2446
2447 static bool
2448 blit_can_resolve(VkFormat format)
2449 {
2450 const struct util_format_description *desc = vk_format_description(format);
2451
2452 /* blit event can only do resolve for simple cases:
2453 * averaging samples as unsigned integers or choosing only one sample
2454 */
2455 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2456 return false;
2457
2458 /* can't do formats with larger channel sizes
2459 * note: this includes all float formats
2460 * note2: single channel integer formats seem OK
2461 */
2462 if (desc->channel[0].size > 10)
2463 return false;
2464
2465 switch (format) {
2466    /* for unknown reasons the blit event can't MSAA-resolve these formats when
2467     * tiled, likely because they have a different layout from other cpp=2 formats
2468     */
2469 case VK_FORMAT_R8G8_UNORM:
2470 case VK_FORMAT_R8G8_UINT:
2471 case VK_FORMAT_R8G8_SINT:
2472 /* TODO: this one should be able to work? */
2473 case VK_FORMAT_D24_UNORM_S8_UINT:
2474 return false;
2475 default:
2476 break;
2477 }
2478
2479 return true;
2480 }
2481
2482 void
2483 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2484 struct tu_cs *cs,
2485 uint32_t a,
2486 bool force_load)
2487 {
2488 const struct tu_image_view *iview =
2489 cmd->state.framebuffer->attachments[a].attachment;
2490 const struct tu_render_pass_attachment *attachment =
2491 &cmd->state.pass->attachments[a];
2492
2493 if (attachment->load || force_load)
2494 tu_emit_blit(cmd, cs, iview, attachment, false);
2495 }
2496
2497 void
2498 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2499 struct tu_cs *cs,
2500 uint32_t a,
2501 uint32_t gmem_a)
2502 {
2503 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2504 const VkRect2D *render_area = &tiling->render_area;
2505 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2506 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2507 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2508
2509 if (!dst->store)
2510 return;
2511
2512 uint32_t x1 = render_area->offset.x;
2513 uint32_t y1 = render_area->offset.y;
2514 uint32_t x2 = x1 + render_area->extent.width;
2515 uint32_t y2 = y1 + render_area->extent.height;
2516    /* x2/y2 can be unaligned if equal to the size of the image, since the
2517     * blit will then only write into padding space. The one exception is
2518     * linear levels, which don't have the required y padding in the layout
2519     * (except for the last level).
2520     */
2521 bool need_y2_align =
2522 y2 != iview->extent.height || iview->need_y2_align;
2523
2524 bool unaligned =
2525 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2526 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2527
2528 /* use fast path when render area is aligned, except for unsupported resolve cases */
2529 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2530 tu_emit_blit(cmd, cs, iview, src, true);
2531 return;
2532 }
2533
2534 if (dst->samples > 1) {
2535       /* We likely need to use the shader path in this case;
2536        * need a testcase which fails because of this.
2537        */
2538 tu_finishme("unaligned store of msaa attachment\n");
2539 return;
2540 }
2541
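   /* Fall back to the 2D engine, reading the attachment data directly from
    * GMEM (SP_PS_2D_SRC below points at the attachment's GMEM offset) and
    * writing to the destination image.
    */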
2542 r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
2543 r2d_dst(cs, iview, 0);
2544 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2545
2546 tu_cs_emit_regs(cs,
2547 A6XX_SP_PS_2D_SRC_INFO(
2548 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2549 .tile_mode = TILE6_2,
2550 .srgb = vk_format_is_srgb(src->format),
2551 .samples = tu_msaa_samples(src->samples),
2552 .samples_average = !vk_format_is_int(src->format),
2553 .unk20 = 1,
2554 .unk22 = 1),
2555 /* note: src size does not matter when not scaling */
2556 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2557 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2558 A6XX_SP_PS_2D_SRC_HI(),
2559 A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
2560
2561 /* sync GMEM writes with CACHE. */
2562 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2563
2564 /* Wait for CACHE_INVALIDATE to land */
2565 tu_cs_emit_wfi(cs);
2566
2567 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2568 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2569
2570 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2571 * sysmem, and we generally assume that GMEM renderpasses leave their
2572 * results in sysmem, so we need to flush manually here.
2573 */
2574 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2575 }