turnip: fix RENDER_COMPONENTS value
[mesa.git] src/freedreno/vulkan/tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 /* helper functions previously in tu_formats.c */
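/* These pack a single clear-value component into the low 'bits' bits of a
 * uint32_t, clamping to the representable range (round-to-nearest-even for
 * normalized formats, truncation for scaled ones).
 */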
20
21 static uint32_t
22 tu_pack_mask(int bits)
23 {
24 assert(bits <= 32);
25 return (1ull << bits) - 1;
26 }
27
28 static uint32_t
29 tu_pack_float32_for_unorm(float val, int bits)
30 {
31 const uint32_t max = tu_pack_mask(bits);
32 if (val < 0.0f)
33 return 0;
34 else if (val > 1.0f)
35 return max;
36 else
37 return _mesa_lroundevenf(val * (float) max);
38 }
39
40 static uint32_t
41 tu_pack_float32_for_snorm(float val, int bits)
42 {
43 const int32_t max = tu_pack_mask(bits - 1);
44 int32_t tmp;
45 if (val < -1.0f)
46 tmp = -max;
47 else if (val > 1.0f)
48 tmp = max;
49 else
50 tmp = _mesa_lroundevenf(val * (float) max);
51
52 return tmp & tu_pack_mask(bits);
53 }
54
55 static uint32_t
56 tu_pack_float32_for_uscaled(float val, int bits)
57 {
58 const uint32_t max = tu_pack_mask(bits);
59 if (val < 0.0f)
60 return 0;
61 else if (val > (float) max)
62 return max;
63 else
64 return (uint32_t) val;
65 }
66
67 static uint32_t
68 tu_pack_float32_for_sscaled(float val, int bits)
69 {
70 const int32_t max = tu_pack_mask(bits - 1);
71 const int32_t min = -max - 1;
72 int32_t tmp;
73 if (val < (float) min)
74 tmp = min;
75 else if (val > (float) max)
76 tmp = max;
77 else
78 tmp = (int32_t) val;
79
80 return tmp & tu_pack_mask(bits);
81 }
82
83 static uint32_t
84 tu_pack_uint32_for_uint(uint32_t val, int bits)
85 {
86 return val & tu_pack_mask(bits);
87 }
88
89 static uint32_t
90 tu_pack_int32_for_sint(int32_t val, int bits)
91 {
92 return val & tu_pack_mask(bits);
93 }
94
95 static uint32_t
96 tu_pack_float32_for_sfloat(float val, int bits)
97 {
98 assert(bits == 16 || bits == 32);
99 return bits == 16 ? util_float_to_half(val) : fui(val);
100 }
101
102 union tu_clear_component_value {
103 float float32;
104 int32_t int32;
105 uint32_t uint32;
106 };
107
108 static uint32_t
109 tu_pack_clear_component_value(union tu_clear_component_value val,
110 const struct util_format_channel_description *ch)
111 {
112 uint32_t packed;
113
114 switch (ch->type) {
115 case UTIL_FORMAT_TYPE_UNSIGNED:
116 /* normalized, scaled, or pure integer */
117 if (ch->normalized)
118 packed = tu_pack_float32_for_unorm(val.float32, ch->size);
119 else if (ch->pure_integer)
120 packed = tu_pack_uint32_for_uint(val.uint32, ch->size);
121 else
122 packed = tu_pack_float32_for_uscaled(val.float32, ch->size);
123 break;
124 case UTIL_FORMAT_TYPE_SIGNED:
125 /* normalized, scaled, or pure integer */
126 if (ch->normalized)
127 packed = tu_pack_float32_for_snorm(val.float32, ch->size);
128 else if (ch->pure_integer)
129 packed = tu_pack_int32_for_sint(val.int32, ch->size);
130 else
131 packed = tu_pack_float32_for_sscaled(val.float32, ch->size);
132 break;
133 case UTIL_FORMAT_TYPE_FLOAT:
134 packed = tu_pack_float32_for_sfloat(val.float32, ch->size);
135 break;
136 default:
137 unreachable("unexpected channel type");
138 packed = 0;
139 break;
140 }
141
142 assert((packed & tu_pack_mask(ch->size)) == packed);
143 return packed;
144 }
145
146 static const struct util_format_channel_description *
147 tu_get_format_channel_description(const struct util_format_description *desc,
148 int comp)
149 {
150 switch (desc->swizzle[comp]) {
151 case PIPE_SWIZZLE_X:
152 return &desc->channel[0];
153 case PIPE_SWIZZLE_Y:
154 return &desc->channel[1];
155 case PIPE_SWIZZLE_Z:
156 return &desc->channel[2];
157 case PIPE_SWIZZLE_W:
158 return &desc->channel[3];
159 default:
160 return NULL;
161 }
162 }
163
164 static union tu_clear_component_value
165 tu_get_clear_component_value(const VkClearValue *val, int comp,
166 enum util_format_colorspace colorspace)
167 {
168 assert(comp < 4);
169
170 union tu_clear_component_value tmp;
171 switch (colorspace) {
172 case UTIL_FORMAT_COLORSPACE_ZS:
173 assert(comp < 2);
174 if (comp == 0)
175 tmp.float32 = val->depthStencil.depth;
176 else
177 tmp.uint32 = val->depthStencil.stencil;
178 break;
179 case UTIL_FORMAT_COLORSPACE_SRGB:
180 if (comp < 3) {
181 tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]);
182 break;
183 }
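      /* fall through: alpha is not sRGB-encoded, so the default case below reuses the raw bits */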
184 default:
185 assert(comp < 4);
186 tmp.uint32 = val->color.uint32[comp];
187 break;
188 }
189
190 return tmp;
191 }
192
193 /* r2d_ = BLIT_OP_SCALE operations */
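/* The r2d_* functions below drive the dedicated 2D blitter (CP_BLIT with
 * BLIT_OP_SCALE); the r3d_* functions further down implement the same
 * operations by drawing a rectlist with a small hand-assembled shader.
 * Both paths are exposed through struct blit_ops so each blit/copy/clear
 * can pick whichever path handles its case.
 */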
194
195 static enum a6xx_2d_ifmt
196 format_to_ifmt(enum a6xx_format fmt)
197 {
198 switch (fmt) {
199 case FMT6_A8_UNORM:
200 case FMT6_8_UNORM:
201 case FMT6_8_SNORM:
202 case FMT6_8_8_UNORM:
203 case FMT6_8_8_SNORM:
204 case FMT6_8_8_8_8_UNORM:
205 case FMT6_8_8_8_X8_UNORM:
206 case FMT6_8_8_8_8_SNORM:
207 case FMT6_4_4_4_4_UNORM:
208 case FMT6_5_5_5_1_UNORM:
209 case FMT6_5_6_5_UNORM:
210 case FMT6_Z24_UNORM_S8_UINT:
211 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
212 return R2D_UNORM8;
213
214 case FMT6_32_UINT:
215 case FMT6_32_SINT:
216 case FMT6_32_32_UINT:
217 case FMT6_32_32_SINT:
218 case FMT6_32_32_32_32_UINT:
219 case FMT6_32_32_32_32_SINT:
220 return R2D_INT32;
221
222 case FMT6_16_UINT:
223 case FMT6_16_SINT:
224 case FMT6_16_16_UINT:
225 case FMT6_16_16_SINT:
226 case FMT6_16_16_16_16_UINT:
227 case FMT6_16_16_16_16_SINT:
228 case FMT6_10_10_10_2_UINT:
229 return R2D_INT16;
230
231 case FMT6_8_UINT:
232 case FMT6_8_SINT:
233 case FMT6_8_8_UINT:
234 case FMT6_8_8_SINT:
235 case FMT6_8_8_8_8_UINT:
236 case FMT6_8_8_8_8_SINT:
237 return R2D_INT8;
238
239 case FMT6_16_UNORM:
240 case FMT6_16_SNORM:
241 case FMT6_16_16_UNORM:
242 case FMT6_16_16_SNORM:
243 case FMT6_16_16_16_16_UNORM:
244 case FMT6_16_16_16_16_SNORM:
245 case FMT6_32_FLOAT:
246 case FMT6_32_32_FLOAT:
247 case FMT6_32_32_32_32_FLOAT:
248 return R2D_FLOAT32;
249
250 case FMT6_16_FLOAT:
251 case FMT6_16_16_FLOAT:
252 case FMT6_16_16_16_16_FLOAT:
253 case FMT6_11_11_10_FLOAT:
254 case FMT6_10_10_10_2_UNORM:
255 case FMT6_10_10_10_2_UNORM_DEST:
256 return R2D_FLOAT16;
257
258 default:
259 unreachable("bad format");
260 return 0;
261 }
262 }
263
264 static void
265 r2d_coords(struct tu_cs *cs,
266 const VkOffset2D *dst,
267 const VkOffset2D *src,
268 const VkExtent2D *extent)
269 {
270 tu_cs_emit_regs(cs,
271 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
272 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
273
274 if (!src)
275 return;
276
277 tu_cs_emit_regs(cs,
278 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
279 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
280 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
281 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
282 }
283
284 static void
285 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
286 {
287 uint32_t clear_value[4] = {};
288
289 switch (format) {
290 case VK_FORMAT_X8_D24_UNORM_PACK32:
291 case VK_FORMAT_D24_UNORM_S8_UINT:
292 /* cleared as r8g8b8a8_unorm using special format */
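/* successive bytes of the 24-bit depth value go to the r/g/b channels and
 the stencil value goes to alpha (presumably only the low 8 bits of each
 entry are consumed as R2D_UNORM8 values) */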
293 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
294 clear_value[1] = clear_value[0] >> 8;
295 clear_value[2] = clear_value[0] >> 16;
296 clear_value[3] = val->depthStencil.stencil;
297 break;
298 case VK_FORMAT_D16_UNORM:
299 case VK_FORMAT_D32_SFLOAT:
300 /* R2D_FLOAT32 */
301 clear_value[0] = fui(val->depthStencil.depth);
302 break;
303 case VK_FORMAT_S8_UINT:
304 clear_value[0] = val->depthStencil.stencil;
305 break;
306 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
307 /* cleared as UINT32 */
308 clear_value[0] = float3_to_rgb9e5(val->color.float32);
309 break;
310 default:
311 assert(!vk_format_is_depth_or_stencil(format));
312 const struct util_format_description *desc = vk_format_description(format);
313 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
314
315 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
316 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
317
318 for (unsigned i = 0; i < desc->nr_channels; i++) {
319 const struct util_format_channel_description *ch = &desc->channel[i];
320 if (ifmt == R2D_UNORM8) {
321 float linear = val->color.float32[i];
322 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
323 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
324
325 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
326 clear_value[i] = tu_pack_float32_for_snorm(linear, 8);
327 else
328 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
329 } else if (ifmt == R2D_FLOAT16) {
330 clear_value[i] = util_float_to_half(val->color.float32[i]);
331 } else {
332 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
333 ifmt == R2D_INT16 || ifmt == R2D_INT8);
334 clear_value[i] = val->color.uint32[i];
335 }
336 }
337 break;
338 }
339
340 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
341 tu_cs_emit_array(cs, clear_value, 4);
342 }
343
344 static void
345 r2d_src(struct tu_cmd_buffer *cmd,
346 struct tu_cs *cs,
347 const struct tu_image_view *iview,
348 uint32_t layer,
349 bool linear_filter)
350 {
351 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
352 tu_cs_emit(cs, iview->SP_PS_2D_SRC_INFO |
353 COND(linear_filter, A6XX_SP_PS_2D_SRC_INFO_FILTER));
354 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
355 tu_cs_image_ref_2d(cs, iview, layer, true);
356
357 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
358 tu_cs_image_flag_ref(cs, iview, layer);
359 }
360
361 static void
362 r2d_src_buffer(struct tu_cmd_buffer *cmd,
363 struct tu_cs *cs,
364 VkFormat vk_format,
365 uint64_t va, uint32_t pitch,
366 uint32_t width, uint32_t height)
367 {
368 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
369
370 tu_cs_emit_regs(cs,
371 A6XX_SP_PS_2D_SRC_INFO(
372 .color_format = format.fmt,
373 .color_swap = format.swap,
374 .srgb = vk_format_is_srgb(vk_format),
375 .unk20 = 1,
376 .unk22 = 1),
377 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
378 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
379 A6XX_SP_PS_2D_SRC_HI(va >> 32),
380 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
381 }
382
383 static void
384 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
385 {
386 assert(iview->image->samples == 1);
387
388 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
389 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
390 tu_cs_image_ref_2d(cs, iview, layer, false);
391
392 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
393 tu_cs_image_flag_ref(cs, iview, layer);
394 }
395
396 static void
397 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
398 {
399 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
400
401 tu_cs_emit_regs(cs,
402 A6XX_RB_2D_DST_INFO(
403 .color_format = format.fmt,
404 .color_swap = format.swap,
405 .srgb = vk_format_is_srgb(vk_format)),
406 A6XX_RB_2D_DST_LO((uint32_t) va),
407 A6XX_RB_2D_DST_HI(va >> 32),
408 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
409 }
410
411 static void
412 r2d_setup_common(struct tu_cmd_buffer *cmd,
413 struct tu_cs *cs,
414 VkFormat vk_format,
415 enum a6xx_rotation rotation,
416 bool clear,
417 uint8_t mask,
418 bool scissor)
419 {
420 enum a6xx_format format = tu6_base_format(vk_format);
421 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
422 uint32_t unknown_8c01 = 0;
423
424 if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) {
425 /* preserve depth channels */
426 if (mask == 0x8)
427 unknown_8c01 = 0x00084001;
428 /* preserve stencil channel */
429 if (mask == 0x7)
430 unknown_8c01 = 0x08000041;
431 }
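/* note: the 0x00084001/0x08000041 values above are presumably mask-like
 controls matching what the blob driver programs for stencil-only and
 depth-only writes; their exact meaning is unknown (hence the UNKNOWN_8C01
 register name) */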
432
433 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
434 tu_cs_emit(cs, unknown_8c01);
435
436 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
437 .scissor = scissor,
438 .rotate = rotation,
439 .solid_color = clear,
440 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
441 .color_format = format,
442 .mask = 0xf,
443 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
444 ).value;
445
446 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
447 tu_cs_emit(cs, blit_cntl);
448
449 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
450 tu_cs_emit(cs, blit_cntl);
451
452 if (format == FMT6_10_10_10_2_UNORM_DEST)
453 format = FMT6_16_16_16_16_FLOAT;
454
455 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
456 .sint = vk_format_is_sint(vk_format),
457 .uint = vk_format_is_uint(vk_format),
458 .color_format = format,
459 .srgb = vk_format_is_srgb(vk_format),
460 .mask = 0xf));
461 }
462
463 static void
464 r2d_setup(struct tu_cmd_buffer *cmd,
465 struct tu_cs *cs,
466 VkFormat vk_format,
467 enum a6xx_rotation rotation,
468 bool clear,
469 uint8_t mask)
470 {
471 const struct tu_physical_device *phys_dev = cmd->device->physical_device;
472
473 /* TODO: flushing with barriers instead of blindly always flushing */
474 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
475 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
476 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
477 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
478 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
479
480 tu_cs_emit_wfi(cs);
481 tu_cs_emit_regs(cs,
482 A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass));
483
484 r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false);
485 }
486
487 static void
488 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
489 {
490 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
491 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
492
493 /* TODO: flushing with barriers instead of blindly always flushing */
494 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
495 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
496 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
497 }
498
499 /* r3d_ = shader path operations */
500
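/* Emits a complete pipeline for the shader path: a hand-assembled VS that
 * selects between the two corner coordinates passed in as constants
 * (indexed by the vertex id), and an FS that either copies num_rts constant
 * colors to the render targets (clear) or samples the source texture via
 * the prefetch mechanism (blit).
 */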
501 static void
502 r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts)
503 {
504 static const instr_t vs_code[] = {
505 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
506 * r1.xy = r0.w ? c1.zw : c0.zw
507 * r0.w = 1.0f
508 */
509 { .cat3 = {
510 .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 2, .dst = 0,
511 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
512 .src2 = 3,
513 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0},
514 } },
515 { .cat3 = {
516 .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 1, .dst = 4,
517 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
518 .src2 = 3,
519 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2},
520 } },
521 { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, .dst = 3,
522 .src_im = 1, .fim_val = 1.0f } },
523 { .cat0 = { .opc = OPC_END } },
524 };
525 #define FS_OFFSET (16 * sizeof(instr_t))
526 STATIC_ASSERT(sizeof(vs_code) <= FS_OFFSET);
527
528 /* vs inputs: only vtx id in r0.w */
529 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_0, 7);
530 tu_cs_emit(cs, 0x00000000);
531 tu_cs_emit(cs, 0xfcfcfc00 | A6XX_VFD_CONTROL_1_REGID4VTX(3));
532 tu_cs_emit(cs, 0x0000fcfc);
533 tu_cs_emit(cs, 0xfcfcfcfc);
534 tu_cs_emit(cs, 0x000000fc);
535 tu_cs_emit(cs, 0x0000fcfc);
536 tu_cs_emit(cs, 0x00000000);
537
538 /* vs outputs: position in r0.xyzw, blit coords in r1.xy */
539 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
540 tu_cs_emit(cs, blit ? 0xffffffcf : 0xffffffff);
541 tu_cs_emit(cs, 0xffffffff);
542 tu_cs_emit(cs, 0xffffffff);
543 tu_cs_emit(cs, 0xffffffff);
544
545 tu_cs_emit_regs(cs, A6XX_SP_VS_OUT_REG(0,
546 .a_regid = 0, .a_compmask = 0xf,
547 .b_regid = 4, .b_compmask = 0x3));
548 tu_cs_emit_regs(cs, A6XX_SP_VS_VPC_DST_REG(0, .outloc0 = 0, .outloc1 = 4));
549
550 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
551 tu_cs_emit(cs, 0xff00ff00 |
552 COND(blit, A6XX_VPC_CNTL_0_VARYING) |
553 A6XX_VPC_CNTL_0_NUMNONPOSVAR(blit ? 8 : 0));
554
555 tu_cs_emit_regs(cs, A6XX_VPC_PACK(
556 .positionloc = 0,
557 .psizeloc = 0xff,
558 .stride_in_vpc = blit ? 6 : 4));
559 tu_cs_emit_regs(cs, A6XX_SP_PRIMITIVE_CNTL(.vsout = blit ? 2 : 1));
560 tu_cs_emit_regs(cs,
561 A6XX_PC_PRIMITIVE_CNTL_0(),
562 A6XX_PC_PRIMITIVE_CNTL_1(.stride_in_vpc = blit ? 6 : 4));
563
564
565 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
566 tu_cs_emit(cs, blit ? 0xe000 : 0); // I think this can just be 0
567 for (uint32_t i = 1; i < 8; i++)
568 tu_cs_emit(cs, 0);
569
570 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
571 for (uint32_t i = 0; i < 8; i++)
572 tu_cs_emit(cs, 0x99999999);
573
574 /* fs inputs: none, prefetch in blit case */
575 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + blit);
576 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(blit) |
577 A6XX_SP_FS_PREFETCH_CNTL_UNK4(0xfc) |
578 0x7000);
579 if (blit) {
580 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(4) |
581 A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(0) |
582 A6XX_SP_FS_PREFETCH_CMD_TEX_ID(0) |
583 A6XX_SP_FS_PREFETCH_CMD_DST(0) |
584 A6XX_SP_FS_PREFETCH_CMD_WRMASK(0xf) |
585 A6XX_SP_FS_PREFETCH_CMD_CMD(0x4));
586 }
587
588 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
589 tu_cs_emit(cs, 0x3); // XXX blob uses 3 in blit path
590 tu_cs_emit(cs, 0xfcfcfcfc);
591 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_PIXEL(blit ? 0 : 0xfc) |
592 A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_CENTROID(0xfc) |
593 0xfc00fc00);
594 tu_cs_emit(cs, 0xfcfcfcfc);
595 tu_cs_emit(cs, 0xfcfc);
596
597 tu_cs_emit_regs(cs, A6XX_HLSQ_UNKNOWN_B980(blit ? 3 : 1));
598 tu_cs_emit_regs(cs, A6XX_GRAS_CNTL(.varying = blit));
599 tu_cs_emit_regs(cs,
600 A6XX_RB_RENDER_CONTROL0(.varying = blit, .unk10 = blit),
601 A6XX_RB_RENDER_CONTROL1());
602
603 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_CNTL());
604 tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8101());
605 tu_cs_emit_regs(cs, A6XX_GRAS_SAMPLE_CNTL());
606
607 /* shaders */
608 struct ts_cs_memory shaders = { };
609 VkResult result = tu_cs_alloc(&cmd->sub_cs, 2, 16 * sizeof(instr_t), &shaders);
610 assert(result == VK_SUCCESS);
611
612 memcpy(shaders.map, vs_code, sizeof(vs_code));
613
614 instr_t *fs = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
615 for (uint32_t i = 0; i < num_rts; i++) {
616 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
617 fs[i] = (instr_t) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
618 .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4 } };
619 }
620 fs[num_rts] = (instr_t) { .cat0 = { .opc = OPC_END } };
621 /* note: assumed <= 16 instructions (MAX_RTS is 8) */
622
623 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
624 tu_cs_emit_regs(cs,
625 A6XX_HLSQ_VS_CNTL(.constlen = 8, .enabled = true),
626 A6XX_HLSQ_HS_CNTL(),
627 A6XX_HLSQ_DS_CNTL(),
628 A6XX_HLSQ_GS_CNTL());
629 tu_cs_emit_regs(cs, A6XX_HLSQ_FS_CNTL(.constlen = 4 * num_rts, .enabled = true));
630
631 tu_cs_emit_regs(cs,
632 A6XX_SP_VS_CONFIG(.enabled = true),
633 A6XX_SP_VS_INSTRLEN(1));
634 tu_cs_emit_regs(cs, A6XX_SP_HS_CONFIG());
635 tu_cs_emit_regs(cs, A6XX_SP_DS_CONFIG());
636 tu_cs_emit_regs(cs, A6XX_SP_GS_CONFIG());
637 tu_cs_emit_regs(cs,
638 A6XX_SP_FS_CONFIG(.enabled = true, .ntex = blit, .nsamp = blit),
639 A6XX_SP_FS_INSTRLEN(1));
640
641 tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
642 .threadsize = FOUR_QUADS,
643 .fullregfootprint = 2,
644 .mergedregs = true));
645 tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
646 .varying = blit,
647 .threadsize = FOUR_QUADS,
648 /* could this be 0 in !blit && !num_rts case ? */
649 .fullregfootprint = MAX2(1, num_rts),
650 .mergedregs = true)); /* note: tu_pipeline also sets 0x1000000 bit */
651
652 tu_cs_emit_regs(cs, A6XX_SP_IBO_COUNT(0));
653
654 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3);
655 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
656 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
657 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
658 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
659 CP_LOAD_STATE6_0_NUM_UNIT(1));
660 tu_cs_emit_qw(cs, shaders.iova);
661
662 tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_OBJ_START_LO, 2);
663 tu_cs_emit_qw(cs, shaders.iova);
664
665 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
666 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
667 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
668 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
669 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
670 CP_LOAD_STATE6_0_NUM_UNIT(1));
671 tu_cs_emit_qw(cs, shaders.iova + FS_OFFSET);
672
673 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OBJ_START_LO, 2);
674 tu_cs_emit_qw(cs, shaders.iova + FS_OFFSET);
675
676 tu_cs_emit_regs(cs,
677 A6XX_GRAS_CL_CNTL(
678 .persp_division_disable = 1,
679 .vp_xform_disable = 1,
680 .vp_clip_code_ignore = 1,
681 .clip_disable = 1),
682 A6XX_GRAS_UNKNOWN_8001(0));
683 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
684
685 tu_cs_emit_regs(cs,
686 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
687 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
688 tu_cs_emit_regs(cs,
689 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
690 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
691 }
692
693 static void
694 r3d_coords_raw(struct tu_cs *cs, const float *coords)
695 {
696 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
697 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
698 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
699 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
700 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
701 CP_LOAD_STATE6_0_NUM_UNIT(2));
702 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
703 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
704 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
705 }
706
707 static void
708 r3d_coords(struct tu_cs *cs,
709 const VkOffset2D *dst,
710 const VkOffset2D *src,
711 const VkExtent2D *extent)
712 {
713 int32_t src_x1 = src ? src->x : 0;
714 int32_t src_y1 = src ? src->y : 0;
715 r3d_coords_raw(cs, (float[]) {
716 dst->x, dst->y,
717 src_x1, src_y1,
718 dst->x + extent->width, dst->y + extent->height,
719 src_x1 + extent->width, src_y1 + extent->height,
720 });
721 }
722
723 static void
724 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
725 {
726 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
727 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
728 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
729 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
730 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
731 CP_LOAD_STATE6_0_NUM_UNIT(1));
732 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
733 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
734 switch (format) {
735 case VK_FORMAT_X8_D24_UNORM_PACK32:
736 case VK_FORMAT_D24_UNORM_S8_UINT: {
737 /* cleared as r8g8b8a8_unorm using special format */
738 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
739 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
740 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
741 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
742 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
743 } break;
744 case VK_FORMAT_D16_UNORM:
745 case VK_FORMAT_D32_SFLOAT:
746 tu_cs_emit(cs, fui(val->depthStencil.depth));
747 tu_cs_emit(cs, 0);
748 tu_cs_emit(cs, 0);
749 tu_cs_emit(cs, 0);
750 break;
751 case VK_FORMAT_S8_UINT:
752 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
753 tu_cs_emit(cs, 0);
754 tu_cs_emit(cs, 0);
755 tu_cs_emit(cs, 0);
756 break;
757 default:
758 /* for color formats, use the clear value as-is */
759 assert(!vk_format_is_depth_or_stencil(format));
760 tu_cs_emit_array(cs, val->color.uint32, 4);
761 break;
762 }
763 }
764
765 static void
766 r3d_src_common(struct tu_cmd_buffer *cmd,
767 struct tu_cs *cs,
768 const uint32_t *tex_const,
769 uint32_t offset_base,
770 uint32_t offset_ubwc,
771 bool linear_filter)
772 {
773 struct ts_cs_memory texture = { };
774 VkResult result = tu_cs_alloc(&cmd->sub_cs,
775 2, /* allocate space for a sampler too */
776 A6XX_TEX_CONST_DWORDS, &texture);
777 assert(result == VK_SUCCESS);
778
779 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
780
781 /* patch addresses for layer offset */
782 *(uint64_t*) (texture.map + 4) += offset_base;
783 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
784 texture.map[7] = ubwc_addr;
785 texture.map[8] = ubwc_addr >> 32;
786
787 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
788 A6XX_TEX_SAMP_0_XY_MAG(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
789 A6XX_TEX_SAMP_0_XY_MIN(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
790 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
791 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
792 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
793 0x60000; /* XXX used by blob, doesn't seem necessary */
794 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
795 0x1 | /* XXX used by blob, doesn't seem necessary */
796 A6XX_TEX_SAMP_1_UNNORM_COORDS |
797 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
798 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
799 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
800
801 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
802 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
803 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
804 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
805 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
806 CP_LOAD_STATE6_0_NUM_UNIT(1));
807 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
808
809 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
810 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
811
812 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
813 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
814 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
815 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
816 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
817 CP_LOAD_STATE6_0_NUM_UNIT(1));
818 tu_cs_emit_qw(cs, texture.iova);
819
820 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
821 tu_cs_emit_qw(cs, texture.iova);
822
823 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
824 }
825
826 static void
827 r3d_src(struct tu_cmd_buffer *cmd,
828 struct tu_cs *cs,
829 const struct tu_image_view *iview,
830 uint32_t layer,
831 bool linear_filter)
832 {
833 r3d_src_common(cmd, cs, iview->descriptor,
834 iview->layer_size * layer,
835 iview->ubwc_layer_size * layer,
836 linear_filter);
837 }
838
839 static void
840 r3d_src_buffer(struct tu_cmd_buffer *cmd,
841 struct tu_cs *cs,
842 VkFormat vk_format,
843 uint64_t va, uint32_t pitch,
844 uint32_t width, uint32_t height)
845 {
846 uint32_t desc[A6XX_TEX_CONST_DWORDS];
847
848 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
849
850 desc[0] =
851 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
852 A6XX_TEX_CONST_0_FMT(format.fmt) |
853 A6XX_TEX_CONST_0_SWAP(format.swap) |
854 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
855 // XXX to swizzle into .w for stencil buffer_to_image
856 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
857 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
858 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
859 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
860 desc[2] =
861 A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) |
862 A6XX_TEX_CONST_2_PITCH(pitch) |
863 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
864 desc[3] = 0;
865 desc[4] = va;
866 desc[5] = va >> 32;
867 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
868 desc[i] = 0;
869
870 r3d_src_common(cmd, cs, desc, 0, 0, false);
871 }
872
873 static void
874 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
875 {
876 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
877
878 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
879 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
880 tu_cs_image_ref(cs, iview, layer);
881 tu_cs_emit(cs, 0);
882
883 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
884 tu_cs_image_flag_ref(cs, iview, layer);
885
886 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
887 }
888
889 static void
890 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
891 {
892 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
893
894 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
895
896 tu_cs_emit_regs(cs,
897 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
898 A6XX_RB_MRT_PITCH(0, pitch),
899 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
900 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
901 A6XX_RB_MRT_BASE_HI(0, va >> 32),
902 A6XX_RB_MRT_BASE_GMEM(0, 0));
903
904 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
905 }
906
907 static void
908 r3d_setup(struct tu_cmd_buffer *cmd,
909 struct tu_cs *cs,
910 VkFormat vk_format,
911 enum a6xx_rotation rotation,
912 bool clear,
913 uint8_t mask)
914 {
915 const struct tu_physical_device *phys_dev = cmd->device->physical_device;
916
917 if (!cmd->state.pass) {
918 /* TODO: flushing with barriers instead of blindly always flushing */
919 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
920 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
921 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
922 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
923 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
924
925 tu_cs_emit_regs(cs,
926 A6XX_RB_CCU_CNTL(.offset = phys_dev->ccu_offset_bypass));
927
928 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
929 }
930 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
931 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
932
933 r3d_pipeline(cmd, cs, !clear, clear ? 1 : 0);
934
935 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
936 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
937 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
938 0xfc000000);
939 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
940
941 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
942 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
943
944 tu_cs_emit_regs(cs,
945 A6XX_RB_FS_OUTPUT_CNTL0(),
946 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
947
948 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
949 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
950 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
951
952 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
953 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
954 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
955 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
956 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
957 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
958 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
959
960 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
961 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
962
963 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
964 .color_format = tu6_base_format(vk_format),
965 .color_sint = vk_format_is_sint(vk_format),
966 .color_uint = vk_format_is_uint(vk_format)));
967
968 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask));
969 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
970 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
971 }
972
973 static void
974 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
975 {
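/* a single 2-vertex RECTLIST draw covers the rectangle set up by r3d_coords */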
976 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
977 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
978 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
979 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
980 tu_cs_emit(cs, 1); /* instance count */
981 tu_cs_emit(cs, 2); /* vertex count */
982
983 if (!cmd->state.pass) {
984 /* TODO: flushing with barriers instead of blindly always flushing */
985 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
986 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
987 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
988 }
989 }
990
991 /* blit ops - common interface for 2d/shader paths */
992
993 struct blit_ops {
994 void (*coords)(struct tu_cs *cs,
995 const VkOffset2D *dst,
996 const VkOffset2D *src,
997 const VkExtent2D *extent);
998 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
999 void (*src)(
1000 struct tu_cmd_buffer *cmd,
1001 struct tu_cs *cs,
1002 const struct tu_image_view *iview,
1003 uint32_t layer,
1004 bool linear_filter);
1005 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1006 VkFormat vk_format,
1007 uint64_t va, uint32_t pitch,
1008 uint32_t width, uint32_t height);
1009 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1010 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
1011 void (*setup)(struct tu_cmd_buffer *cmd,
1012 struct tu_cs *cs,
1013 VkFormat vk_format,
1014 enum a6xx_rotation rotation,
1015 bool clear,
1016 uint8_t mask);
1017 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1018 };
1019
1020 static const struct blit_ops r2d_ops = {
1021 .coords = r2d_coords,
1022 .clear_value = r2d_clear_value,
1023 .src = r2d_src,
1024 .src_buffer = r2d_src_buffer,
1025 .dst = r2d_dst,
1026 .dst_buffer = r2d_dst_buffer,
1027 .setup = r2d_setup,
1028 .run = r2d_run,
1029 };
1030
1031 static const struct blit_ops r3d_ops = {
1032 .coords = r3d_coords,
1033 .clear_value = r3d_clear_value,
1034 .src = r3d_src,
1035 .src_buffer = r3d_src_buffer,
1036 .dst = r3d_dst,
1037 .dst_buffer = r3d_dst_buffer,
1038 .setup = r3d_setup,
1039 .run = r3d_run,
1040 };
1041
1042 /* pass-through helper: set 2D coords from 3D offsets/extent */
1043 static void
1044 coords(const struct blit_ops *ops,
1045 struct tu_cs *cs,
1046 const VkOffset3D *dst,
1047 const VkOffset3D *src,
1048 const VkExtent3D *extent)
1049 {
1050 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
1051 }
1052
1053 static void
1054 tu_image_view_blit2(struct tu_image_view *iview,
1055 struct tu_image *image,
1056 VkFormat format,
1057 const VkImageSubresourceLayers *subres,
1058 uint32_t layer,
1059 bool stencil_read)
1060 {
1061 VkImageAspectFlags aspect_mask = subres->aspectMask;
1062
1063 /* always use the AS_R8G8B8A8 format for these */
1064 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
1065 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
1066 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
1067 }
1068
1069 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
1070 .image = tu_image_to_handle(image),
1071 .viewType = VK_IMAGE_VIEW_TYPE_2D,
1072 .format = format,
1073 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
1074 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
1075 .subresourceRange = {
1076 .aspectMask = aspect_mask,
1077 .baseMipLevel = subres->mipLevel,
1078 .levelCount = 1,
1079 .baseArrayLayer = subres->baseArrayLayer + layer,
1080 .layerCount = 1,
1081 },
1082 });
1083 }
1084
1085 static void
1086 tu_image_view_blit(struct tu_image_view *iview,
1087 struct tu_image *image,
1088 const VkImageSubresourceLayers *subres,
1089 uint32_t layer)
1090 {
1091 tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false);
1092 }
1093
1094 static void
1095 tu6_blit_image(struct tu_cmd_buffer *cmd,
1096 struct tu_image *src_image,
1097 struct tu_image *dst_image,
1098 const VkImageBlit *info,
1099 VkFilter filter)
1100 {
1101 const struct blit_ops *ops = &r2d_ops;
1102 struct tu_cs *cs = &cmd->cs;
1103 uint32_t layers;
1104
1105 /* the 2D blit path can't mirror from coordinates alone, so mirroring is expressed with the rotate/flip modes */
1106 static const enum a6xx_rotation rotate[2][2] = {
1107 {ROTATE_0, ROTATE_HFLIP},
1108 {ROTATE_VFLIP, ROTATE_180},
1109 };
1110
1111 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1112 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1113 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1114 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1115 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
1116 (info->dstOffsets[1].z < info->dstOffsets[0].z);
1117
1118 if (mirror_z) {
1119 tu_finishme("blit z mirror\n");
1120 return;
1121 }
1122
1123 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1124 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1125 tu_finishme("blit z filter\n");
1126 return;
1127 }
1128
1129 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1130 if (info->dstSubresource.layerCount > 1) {
1131 assert(layers <= 1);
1132 layers = info->dstSubresource.layerCount;
1133 }
1134
1135 uint8_t mask = 0xf;
1136 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1137 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1138 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1139 mask = 0x7;
1140 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1141 mask = 0x8;
1142 }
1143
1144 /* BC1_RGB_* formats need to have their last component overridden with 1
1145 * when sampling, which is normally handled with the texture descriptor
1146 * swizzle. The 2d path can't handle that, so use the 3d path.
1147 *
1148 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1149 * the 2d path.
1150 */
1151
1152 if (dst_image->samples > 1 ||
1153 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1154 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK)
1155 ops = &r3d_ops;
1156
1157 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
1158 * figure out why (should be able to pass all tests with only shader path)
1159 */
1160
1161 ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask);
1162
1163 if (ops == &r3d_ops) {
1164 r3d_coords_raw(cs, (float[]) {
1165 info->dstOffsets[0].x, info->dstOffsets[0].y,
1166 info->srcOffsets[0].x, info->srcOffsets[0].y,
1167 info->dstOffsets[1].x, info->dstOffsets[1].y,
1168 info->srcOffsets[1].x, info->srcOffsets[1].y
1169 });
1170 } else {
1171 tu_cs_emit_regs(cs,
1172 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1173 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1174 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1175 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1176 tu_cs_emit_regs(cs,
1177 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1178 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1179 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1180 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1181 }
1182
1183 struct tu_image_view dst, src;
1184 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1185 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1186
1187 for (uint32_t i = 0; i < layers; i++) {
1188 ops->dst(cs, &dst, i);
1189 ops->src(cmd, cs, &src, i, filter == VK_FILTER_LINEAR);
1190 ops->run(cmd, cs);
1191 }
1192 }
1193
1194 void
1195 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1196 VkImage srcImage,
1197 VkImageLayout srcImageLayout,
1198 VkImage dstImage,
1199 VkImageLayout dstImageLayout,
1200 uint32_t regionCount,
1201 const VkImageBlit *pRegions,
1202 VkFilter filter)
1203
1204 {
1205 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1206 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1207 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1208
1209 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1210 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1211
1212 for (uint32_t i = 0; i < regionCount; ++i)
1213 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1214 }
1215
1216 static VkFormat
1217 copy_format(VkFormat format)
1218 {
1219 switch (vk_format_get_blocksizebits(format)) {
1220 case 8: return VK_FORMAT_R8_UINT;
1221 case 16: return VK_FORMAT_R16_UINT;
1222 case 32: return VK_FORMAT_R32_UINT;
1223 case 64: return VK_FORMAT_R32G32_UINT;
1224 case 96: return VK_FORMAT_R32G32B32_UINT;
1225 case 128:return VK_FORMAT_R32G32B32A32_UINT;
1226 default:
1227 unreachable("unhandled format size");
1228 }
1229 }
1230
1231 static void
1232 copy_compressed(VkFormat format,
1233 VkOffset3D *offset,
1234 VkExtent3D *extent,
1235 uint32_t *width,
1236 uint32_t *height)
1237 {
1238 if (!vk_format_is_compressed(format))
1239 return;
1240
1241 uint32_t block_width = vk_format_get_blockwidth(format);
1242 uint32_t block_height = vk_format_get_blockheight(format);
1243
1244 offset->x /= block_width;
1245 offset->y /= block_height;
1246
1247 if (extent) {
1248 extent->width = DIV_ROUND_UP(extent->width, block_width);
1249 extent->height = DIV_ROUND_UP(extent->height, block_height);
1250 }
1251 if (width)
1252 *width = DIV_ROUND_UP(*width, block_width);
1253 if (height)
1254 *height = DIV_ROUND_UP(*height, block_height);
1255 }
1256
1257 static void
1258 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1259 struct tu_buffer *src_buffer,
1260 struct tu_image *dst_image,
1261 const VkBufferImageCopy *info)
1262 {
1263 struct tu_cs *cs = &cmd->cs;
1264 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1265 VkFormat dst_format = dst_image->vk_format;
1266 VkFormat src_format = dst_image->vk_format;
1267 const struct blit_ops *ops = &r2d_ops;
1268
1269 uint8_t mask = 0xf;
1270
1271 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1272 switch (info->imageSubresource.aspectMask) {
1273 case VK_IMAGE_ASPECT_STENCIL_BIT:
1274 src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */
1275 mask = 0x8;
1276 ops = &r3d_ops;
1277 break;
1278 case VK_IMAGE_ASPECT_DEPTH_BIT:
1279 mask = 0x7;
1280 break;
1281 }
1282 }
1283
1284 VkOffset3D offset = info->imageOffset;
1285 VkExtent3D extent = info->imageExtent;
1286 uint32_t src_width = info->bufferRowLength ?: extent.width;
1287 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1288
1289 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
1290 assert(src_format == dst_format);
1291 copy_compressed(dst_format, &offset, &extent, &src_width, &src_height);
1292 src_format = dst_format = copy_format(dst_format);
1293 }
1294
1295 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1296 uint32_t layer_size = src_height * pitch;
1297
1298 /* note: the 64-byte src_va/pitch alignment is a 2D engine requirement,
1299 * but it also holds for 1cpp formats with the shader path (stencil aspect path)
1300 */
1301
1302 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1303
1304 struct tu_image_view dst;
1305 tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false);
1306
1307 for (uint32_t i = 0; i < layers; i++) {
1308 ops->dst(cs, &dst, i);
1309
1310 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1311 if ((src_va & 63) || (pitch & 63)) {
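/* unaligned case: copy one row at a time, aligning src_va down to 64 bytes
 and folding the remainder into the source x offset */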
1312 for (uint32_t y = 0; y < extent.height; y++) {
1313 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1314 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1315 x + extent.width, 1);
1316 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1317 &(VkExtent2D) {extent.width, 1});
1318 ops->run(cmd, cs);
1319 src_va += pitch;
1320 }
1321 } else {
1322 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1323 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1324 ops->run(cmd, cs);
1325 }
1326 }
1327 }
1328
1329 void
1330 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1331 VkBuffer srcBuffer,
1332 VkImage dstImage,
1333 VkImageLayout dstImageLayout,
1334 uint32_t regionCount,
1335 const VkBufferImageCopy *pRegions)
1336 {
1337 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1338 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1339 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1340
1341 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1342 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1343
1344 for (unsigned i = 0; i < regionCount; ++i)
1345 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1346 }
1347
1348 static void
1349 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1350 struct tu_image *src_image,
1351 struct tu_buffer *dst_buffer,
1352 const VkBufferImageCopy *info)
1353 {
1354 struct tu_cs *cs = &cmd->cs;
1355 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1356 VkFormat src_format = src_image->vk_format;
1357 VkFormat dst_format = src_image->vk_format;
1358 bool stencil_read = false;
1359
1360 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1361 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1362 dst_format = VK_FORMAT_R8_UNORM;
1363 stencil_read = true;
1364 }
1365
1366 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1367 VkOffset3D offset = info->imageOffset;
1368 VkExtent3D extent = info->imageExtent;
1369 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1370 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1371
1372 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) {
1373 assert(src_format == dst_format);
1374 copy_compressed(dst_format, &offset, &extent, &dst_width, &dst_height);
1375 src_format = dst_format = copy_format(dst_format);
1376 }
1377
1378 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1379 uint32_t layer_size = pitch * dst_height;
1380
1381 /* note: the 64-byte dst_va/pitch alignment is a 2D engine requirement,
1382 * but it also holds for 1cpp formats with the shader path (stencil aspect)
1383 */
1384
1385 ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
1386
1387 struct tu_image_view src;
1388 tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read);
1389
1390 for (uint32_t i = 0; i < layers; i++) {
1391 ops->src(cmd, cs, &src, i, false);
1392
1393 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1394 if ((dst_va & 63) || (pitch & 63)) {
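/* same row-by-row fallback as the buffer-to-image path, applied to the
 destination address instead */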
1395 for (uint32_t y = 0; y < extent.height; y++) {
1396 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1397 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1398 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1399 &(VkExtent2D) {extent.width, 1});
1400 ops->run(cmd, cs);
1401 dst_va += pitch;
1402 }
1403 } else {
1404 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1405 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1406 ops->run(cmd, cs);
1407 }
1408 }
1409 }
1410
1411 void
1412 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1413 VkImage srcImage,
1414 VkImageLayout srcImageLayout,
1415 VkBuffer dstBuffer,
1416 uint32_t regionCount,
1417 const VkBufferImageCopy *pRegions)
1418 {
1419 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1420 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1421 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1422
1423 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1424 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1425
1426 for (unsigned i = 0; i < regionCount; ++i)
1427 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1428 }
1429
1430 /* Tiled formats don't support swapping, which means that we can't support
1431 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1432 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1433 * Currently we fake support for tiled swapped formats and use the unswapped
1434 * format instead, but this means that reinterpreting copies to and from
1435 * swapped formats can't be performed correctly unless we can swizzle the
1436 * components by reinterpreting the other image as the "correct" swapped
1437 * format, i.e. only when the other image is linear.
1438 */
1439
1440 static bool
1441 is_swapped_format(VkFormat format)
1442 {
1443 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1444 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1445 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1446 }
1447
1448 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1449 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1450 * versa). This should mirror the logic in fdl6_layout.
1451 */
1452 static bool
1453 image_is_r8g8(struct tu_image *image)
1454 {
1455 return image->layout.cpp == 2 &&
1456 vk_format_get_nr_components(image->vk_format) == 2;
1457 }
1458
1459 static void
1460 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1461 struct tu_image *src_image,
1462 struct tu_image *dst_image,
1463 const VkImageCopy *info)
1464 {
1465 const struct blit_ops *ops = &r2d_ops;
1466 struct tu_cs *cs = &cmd->cs;
1467
1468 uint8_t mask = 0xf;
1469 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1470 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1471 mask = 0x7;
1472 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1473 mask = 0x8;
1474 }
1475
1476 if (dst_image->samples > 1)
1477 ops = &r3d_ops;
1478
1479 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1480
1481 VkFormat format = VK_FORMAT_UNDEFINED;
1482 VkOffset3D src_offset = info->srcOffset;
1483 VkOffset3D dst_offset = info->dstOffset;
1484 VkExtent3D extent = info->extent;
1485
1486 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1487 * Images":
1488 *
1489 * When copying between compressed and uncompressed formats the extent
1490 * members represent the texel dimensions of the source image and not
1491 * the destination. When copying from a compressed image to an
1492 * uncompressed image the image texel dimensions written to the
1493 * uncompressed image will be source extent divided by the compressed
1494 * texel block dimensions. When copying from an uncompressed image to a
1495 * compressed image the image texel dimensions written to the compressed
1496 * image will be the source extent multiplied by the compressed texel
1497 * block dimensions.
1498 *
1499 * This means we only have to adjust the extent if the source image is
1500 * compressed.
1501 */
1502 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1503 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1504
1505 VkFormat dst_format = vk_format_is_compressed(dst_image->vk_format) ?
1506 copy_format(dst_image->vk_format) : dst_image->vk_format;
1507 VkFormat src_format = vk_format_is_compressed(src_image->vk_format) ?
1508 copy_format(src_image->vk_format) : src_image->vk_format;
1509
1510 bool use_staging_blit = false;
1511
1512 if (src_format == dst_format) {
1513 /* Images that share a format can always be copied directly because it's
1514 * the same as a blit.
1515 */
1516 format = src_format;
1517 } else if (!src_image->layout.tile_mode) {
1518 /* If an image is linear, we can always safely reinterpret it with the
1519 * other image's format and then do a regular blit.
1520 */
1521 format = dst_format;
1522 } else if (!dst_image->layout.tile_mode) {
1523 format = src_format;
1524 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1525 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1526 * due to the different tile layout.
1527 */
1528 use_staging_blit = true;
1529 } else if (is_swapped_format(src_format) ||
1530 is_swapped_format(dst_format)) {
1531 /* If either format has a non-identity swap, then we can't copy
1532 * to/from it.
1533 */
1534 use_staging_blit = true;
1535 } else if (!src_image->layout.ubwc) {
1536 format = dst_format;
1537 } else if (!dst_image->layout.ubwc) {
1538 format = src_format;
1539 } else {
1540 /* Both formats use UBWC and so neither can be reinterpreted.
1541 * TODO: We could do an in-place decompression of the dst instead.
1542 */
1543 use_staging_blit = true;
1544 }
1545
1546 struct tu_image_view dst, src;
1547
1548 if (use_staging_blit) {
1549 tu_image_view_blit2(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1550 tu_image_view_blit2(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1551
1552 struct tu_image staging_image = {
1553 .vk_format = src_format,
1554 .type = src_image->type,
1555 .tiling = VK_IMAGE_TILING_LINEAR,
1556 .extent = extent,
1557 .level_count = 1,
1558 .layer_count = info->srcSubresource.layerCount,
1559 .samples = src_image->samples,
1560 .bo_offset = 0,
1561 };
1562
1563 VkImageSubresourceLayers staging_subresource = {
1564 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1565 .mipLevel = 0,
1566 .baseArrayLayer = 0,
1567 .layerCount = info->srcSubresource.layerCount,
1568 };
1569
1570 VkOffset3D staging_offset = { 0 };
1571
1572 staging_image.layout.tile_mode = TILE6_LINEAR;
1573 staging_image.layout.ubwc = false;
1574
1575 fdl6_layout(&staging_image.layout,
1576 vk_format_to_pipe_format(staging_image.vk_format),
1577 staging_image.samples,
1578 staging_image.extent.width,
1579 staging_image.extent.height,
1580 staging_image.extent.depth,
1581 staging_image.level_count,
1582 staging_image.layer_count,
1583 staging_image.type == VK_IMAGE_TYPE_3D,
1584 NULL);
1585
1586 VkResult result = tu_get_scratch_bo(cmd->device,
1587 staging_image.layout.size,
1588 &staging_image.bo);
1589 if (result != VK_SUCCESS) {
1590 cmd->record_result = result;
1591 return;
1592 }
1593
1594 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1595 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1596
1597 struct tu_image_view staging;
1598 tu_image_view_blit2(&staging, &staging_image, src_format,
1599 &staging_subresource, 0, false);
1600
1601 ops->setup(cmd, cs, src_format, ROTATE_0, false, mask);
1602 coords(ops, cs, &staging_offset, &src_offset, &extent);
1603
1604 for (uint32_t i = 0; i < info->extent.depth; i++) {
1605 ops->src(cmd, cs, &src, i, false);
1606 ops->dst(cs, &staging, i);
1607 ops->run(cmd, cs);
1608 }
1609
1610 /* When executed by the user there has to be a pipeline barrier here,
1611 * but since we're doing it manually we'll have to flush ourselves.
1612 */
1613 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1614 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
1615
1616 tu_image_view_blit2(&staging, &staging_image, dst_format,
1617 &staging_subresource, 0, false);
1618
1619 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1620 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1621
1622 for (uint32_t i = 0; i < info->extent.depth; i++) {
1623 ops->src(cmd, cs, &staging, i, false);
1624 ops->dst(cs, &dst, i);
1625 ops->run(cmd, cs);
1626 }
1627 } else {
1628 tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1629 tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1630
1631 ops->setup(cmd, cs, format, ROTATE_0, false, mask);
1632 coords(ops, cs, &dst_offset, &src_offset, &extent);
1633
1634 for (uint32_t i = 0; i < info->extent.depth; i++) {
1635 ops->src(cmd, cs, &src, i, false);
1636 ops->dst(cs, &dst, i);
1637 ops->run(cmd, cs);
1638 }
1639 }
1640 }
1641
1642 void
1643 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1644 VkImage srcImage,
1645 VkImageLayout srcImageLayout,
1646 VkImage destImage,
1647 VkImageLayout destImageLayout,
1648 uint32_t regionCount,
1649 const VkImageCopy *pRegions)
1650 {
1651 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1652 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1653 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1654
1655 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1656 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1657
1658 for (uint32_t i = 0; i < regionCount; ++i)
1659 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1660 }
1661
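/* Copies a linear range of memory with the 2D engine by treating it as a
 * one-row image of R32_UINT (block_size 4) or R8_UNORM (block_size 1)
 * texels. Each iteration copies at most 0x4000 texels (minus the x offsets
 * that absorb the 64-byte alignment requirement), presumably to stay within
 * the 2D engine's coordinate range.
 */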
1662 static void
1663 copy_buffer(struct tu_cmd_buffer *cmd,
1664 uint64_t dst_va,
1665 uint64_t src_va,
1666 uint64_t size,
1667 uint32_t block_size)
1668 {
1669 const struct blit_ops *ops = &r2d_ops;
1670 struct tu_cs *cs = &cmd->cs;
1671 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1672 uint64_t blocks = size / block_size;
1673
1674 ops->setup(cmd, cs, format, ROTATE_0, false, 0xf);
1675
1676 while (blocks) {
1677 uint32_t src_x = (src_va & 63) / block_size;
1678 uint32_t dst_x = (dst_va & 63) / block_size;
1679 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1680
1681 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1682 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1683 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1684 ops->run(cmd, cs);
1685
1686 src_va += width * block_size;
1687 dst_va += width * block_size;
1688 blocks -= width;
1689 }
1690 }
1691
1692 void
1693 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1694 VkBuffer srcBuffer,
1695 VkBuffer dstBuffer,
1696 uint32_t regionCount,
1697 const VkBufferCopy *pRegions)
1698 {
1699 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1700 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1701 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1702
1703 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1704 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1705
1706 for (unsigned i = 0; i < regionCount; ++i) {
1707 copy_buffer(cmd,
1708 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1709 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1710 pRegions[i].size, 1);
1711 }
1712 }
1713
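/* vkCmdUpdateBuffer: stage the user data in the command stream's sub-buffer,
 * then reuse copy_buffer() to blit it into the destination. A 4-byte block
 * size is safe here because the spec requires dstOffset and dataSize to be
 * multiples of 4.
 */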
1714 void
1715 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1716 VkBuffer dstBuffer,
1717 VkDeviceSize dstOffset,
1718 VkDeviceSize dataSize,
1719 const void *pData)
1720 {
1721 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1722 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1723
1724 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1725
1726 struct ts_cs_memory tmp;
1727 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1728 if (result != VK_SUCCESS) {
1729 cmd->record_result = result;
1730 return;
1731 }
1732
1733 memcpy(tmp.map, pData, dataSize);
1734 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1735 }
1736
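/* vkCmdFillBuffer: implemented as a solid-color clear of R32_UINT texels with
 * the 2D blitter, using the same 64-byte alignment trick as copy_buffer().
 * fillSize / 4 is exact for explicit sizes (the spec requires a multiple of 4);
 * for VK_WHOLE_SIZE any trailing bytes that don't form a full dword are
 * skipped, matching the spec's round-down-to-a-multiple-of-4 rule.
 */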
1737 void
1738 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1739 VkBuffer dstBuffer,
1740 VkDeviceSize dstOffset,
1741 VkDeviceSize fillSize,
1742 uint32_t data)
1743 {
1744 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1745 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1746 const struct blit_ops *ops = &r2d_ops;
1747 struct tu_cs *cs = &cmd->cs;
1748
1749 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1750
1751 if (fillSize == VK_WHOLE_SIZE)
1752 fillSize = buffer->size - dstOffset;
1753
1754 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1755 uint32_t blocks = fillSize / 4;
1756
1757 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf);
1758 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1759
1760 while (blocks) {
1761 uint32_t dst_x = (dst_va & 63) / 4;
1762 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1763
1764 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1765 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1766 ops->run(cmd, cs);
1767
1768 dst_va += width * 4;
1769 blocks -= width;
1770 }
1771 }
1772
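/* vkCmdResolveImage: one 2D blit per layer of each region. The sample
 * averaging itself is presumably handled by the source state programmed in
 * ops->src(), so nothing MSAA-specific is needed here beyond setting up the
 * blit with the destination format.
 */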
1773 void
1774 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1775 VkImage srcImage,
1776 VkImageLayout srcImageLayout,
1777 VkImage dstImage,
1778 VkImageLayout dstImageLayout,
1779 uint32_t regionCount,
1780 const VkImageResolve *pRegions)
1781 {
1782 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1783 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1784 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1785 const struct blit_ops *ops = &r2d_ops;
1786 struct tu_cs *cs = &cmd->cs;
1787
1788 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1789 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1790
1791 ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf);
1792
1793 for (uint32_t i = 0; i < regionCount; ++i) {
1794 const VkImageResolve *info = &pRegions[i];
1795 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1796
1797 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1798       /* TODO: aspect masks possible? */
1799
1800 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1801
1802 struct tu_image_view dst, src;
1803 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1804 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1805
1806       for (uint32_t layer = 0; layer < layers; layer++) {
1807          ops->src(cmd, cs, &src, layer, false);
1808          ops->dst(cs, &dst, layer);
1809 ops->run(cmd, cs);
1810 }
1811 }
1812 }
1813
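/* Resolve an attachment into another image view while rendering in sysmem
 * mode: a per-layer 2D blit over the given rect. Source and destination are
 * expected to share the same Vulkan format (asserted below).
 */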
1814 void
1815 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1816 struct tu_cs *cs,
1817 struct tu_image_view *src,
1818 struct tu_image_view *dst,
1819 uint32_t layers,
1820 const VkRect2D *rect)
1821 {
1822 const struct blit_ops *ops = &r2d_ops;
1823
1824 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1825 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1826
1827 assert(src->image->vk_format == dst->image->vk_format);
1828
1829 ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf);
1830 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1831
1832 for (uint32_t i = 0; i < layers; i++) {
1833 ops->src(cmd, cs, src, i, false);
1834 ops->dst(cs, dst, i);
1835 ops->run(cmd, cs);
1836 }
1837 }
1838
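/* Clear an image outside of a render pass (vkCmdClearColorImage /
 * vkCmdClearDepthStencilImage). Notes on the format/mask handling below:
 * - E5B9G9R9 is cleared as R32_UINT (presumably because it isn't renderable);
 *   ops->clear_value() still receives the real format so it can pack the value.
 * - For D24S8 the depth aspect maps to a 0x7 component mask and stencil to
 *   0x8, since the image is written as R8G8B8A8.
 * - For 3D images each mip level clears u_minify(depth) slices instead of
 *   array layers.
 */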
1839 static void
1840 clear_image(struct tu_cmd_buffer *cmd,
1841 struct tu_image *image,
1842 const VkClearValue *clear_value,
1843 const VkImageSubresourceRange *range)
1844 {
1845 uint32_t level_count = tu_get_levelCount(image, range);
1846 uint32_t layer_count = tu_get_layerCount(image, range);
1847 struct tu_cs *cs = &cmd->cs;
1848 VkFormat format = image->vk_format;
1849 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1850 format = VK_FORMAT_R32_UINT;
1851
1852 if (image->type == VK_IMAGE_TYPE_3D) {
1853 assert(layer_count == 1);
1854 assert(range->baseArrayLayer == 0);
1855 }
1856
1857 uint8_t mask = 0xf;
1858 if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1859 mask = 0;
1860 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
1861 mask |= 0x7;
1862 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
1863 mask |= 0x8;
1864 }
1865
1866 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1867
1868 ops->setup(cmd, cs, format, ROTATE_0, true, mask);
1869 ops->clear_value(cs, image->vk_format, clear_value);
1870
1871 for (unsigned j = 0; j < level_count; j++) {
1872 if (image->type == VK_IMAGE_TYPE_3D)
1873 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1874
1875 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1876 u_minify(image->extent.width, range->baseMipLevel + j),
1877 u_minify(image->extent.height, range->baseMipLevel + j)
1878 });
1879
1880 struct tu_image_view dst;
1881 tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) {
1882 .aspectMask = range->aspectMask,
1883 .mipLevel = range->baseMipLevel + j,
1884 .baseArrayLayer = range->baseArrayLayer,
1885 .layerCount = 1,
1886 }, 0, false);
1887
1888 for (uint32_t i = 0; i < layer_count; i++) {
1889 ops->dst(cs, &dst, i);
1890 ops->run(cmd, cs);
1891 }
1892 }
1893 }
1894
1895 void
1896 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1897 VkImage image_h,
1898 VkImageLayout imageLayout,
1899 const VkClearColorValue *pColor,
1900 uint32_t rangeCount,
1901 const VkImageSubresourceRange *pRanges)
1902 {
1903 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1904 TU_FROM_HANDLE(tu_image, image, image_h);
1905
1906 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1907
1908 for (unsigned i = 0; i < rangeCount; i++)
1909 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1910 }
1911
1912 void
1913 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1914 VkImage image_h,
1915 VkImageLayout imageLayout,
1916 const VkClearDepthStencilValue *pDepthStencil,
1917 uint32_t rangeCount,
1918 const VkImageSubresourceRange *pRanges)
1919 {
1920 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1921 TU_FROM_HANDLE(tu_image, image, image_h);
1922
1923 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1924
1925 for (unsigned i = 0; i < rangeCount; i++)
1926 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1927 }
1928
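/* Clear sysmem attachments with per-attachment 2D blits. The 2D engine appears
 * to go through the color CCU even for depth data, hence the flush/invalidate
 * dance around depth/stencil clears below ("sync depth into color" before the
 * blit, "sync color into depth" after it).
 */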
1929 static void
1930 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1931 uint32_t attachment_count,
1932 const VkClearAttachment *attachments,
1933 uint32_t rect_count,
1934 const VkClearRect *rects)
1935 {
1936 const struct tu_subpass *subpass = cmd->state.subpass;
1937    /* note: cannot use the shader path here; the special shader path lives
1938     * in tu_clear_sysmem_attachments()
1939     */
1940 const struct blit_ops *ops = &r2d_ops;
1941 struct tu_cs *cs = &cmd->draw_cs;
1942
1943 for (uint32_t j = 0; j < attachment_count; j++) {
1944 uint32_t a;
1945 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1946 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1947 } else {
1948 a = subpass->depth_stencil_attachment.attachment;
1949
1950 /* sync depth into color */
1951 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
1952 /* also flush color to avoid losing contents from invalidate */
1953 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1954 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR, false);
1955 }
1956
1957 if (a == VK_ATTACHMENT_UNUSED)
1958 continue;
1959
1960 uint8_t mask = 0xf;
1961 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
1962 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
1963 mask &= ~0x7;
1964 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
1965 mask &= ~0x8;
1966 }
1967
1968 const struct tu_image_view *iview =
1969 cmd->state.framebuffer->attachments[a].attachment;
1970
1971 ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask);
1972 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1973
1974 for (uint32_t i = 0; i < rect_count; i++) {
1975 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1976 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1977 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1978 ops->run(cmd, cs);
1979 }
1980 }
1981
1982 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1983          /* does not use the CCU - flush
1984           * note: a cache invalidate might be needed too, it's just not covered by test cases
1985           */
1986 if (attachments[j].colorAttachment > 0)
1987 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1988 } else {
1989 /* sync color into depth */
1990 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
1991 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH, false);
1992 }
1993 }
1994 }
1995
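/* Shader-based sysmem clear path, used when the 2D path can't be (MSAA, or no
 * known framebuffer). It bypasses the normal pipeline state: FS outputs, blend,
 * depth/stencil and RENDER_COMPONENTS are programmed directly, the clear colors
 * are uploaded as FS constants (one vec4 per cleared RT), and one r3d draw is
 * emitted per clear rect. The touched state is marked dirty at the end so the
 * application's pipeline state gets re-emitted for the next draw.
 */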
1996 static void
1997 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1998 uint32_t attachment_count,
1999 const VkClearAttachment *attachments,
2000 uint32_t rect_count,
2001 const VkClearRect *rects)
2002 {
2003 /* the shader path here is special, it avoids changing MRT/etc state */
2004 const struct tu_render_pass *pass = cmd->state.pass;
2005 const struct tu_subpass *subpass = cmd->state.subpass;
2006 const uint32_t mrt_count = subpass->color_count;
2007 struct tu_cs *cs = &cmd->draw_cs;
2008 uint32_t clear_value[MAX_RTS][4];
2009 float z_clear_val = 0.0f;
2010 uint8_t s_clear_val = 0;
2011 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
2012 bool z_clear = false;
2013 bool s_clear = false;
2014 uint32_t max_samples = 1;
2015
2016 for (uint32_t i = 0; i < attachment_count; i++) {
2017 uint32_t a;
2018 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2019 uint32_t c = attachments[i].colorAttachment;
2020 a = subpass->color_attachments[c].attachment;
2021 if (a == VK_ATTACHMENT_UNUSED)
2022 continue;
2023
2024 clear_rts |= 1 << c;
2025 clear_components |= 0xf << (c * 4);
2026 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
2027 } else {
2028 a = subpass->depth_stencil_attachment.attachment;
2029 if (a == VK_ATTACHMENT_UNUSED)
2030 continue;
2031
2032 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2033 z_clear = true;
2034 z_clear_val = attachments[i].clearValue.depthStencil.depth;
2035 }
2036
2037 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2038 s_clear = true;
2039 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
2040 }
2041 }
2042
2043 max_samples = MAX2(max_samples, pass->attachments[a].samples);
2044 }
2045
2046    /* prefer the 2D path for clears:
2047     * 2D can't clear separate depth/stencil aspects or MSAA, and needs a known framebuffer
2048     */
2049 if (max_samples == 1 && cmd->state.framebuffer) {
2050 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
2051 return;
2052 }
2053
2054 /* TODO: this path doesn't take into account multilayer rendering */
2055
2056 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
2057 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2058 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2059 0xfc000000);
2060 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2061
2062 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
2063 for (uint32_t i = 0; i < mrt_count; i++) {
2064 if (clear_rts & (1 << i))
2065 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
2066 else
2067 tu_cs_emit(cs, 0);
2068 }
2069
2070 r3d_pipeline(cmd, cs, false, num_rts);
2071
2072 tu_cs_emit_regs(cs,
2073 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2074 tu_cs_emit_regs(cs,
2075 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2076
2077 tu_cs_emit_regs(cs,
2078 A6XX_RB_FS_OUTPUT_CNTL0(),
2079 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2080
2081 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2082 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2083 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
2084 for (uint32_t i = 0; i < mrt_count; i++) {
2085 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2086 .component_enable = COND(clear_rts & (1 << i), 0xf)));
2087 }
2088
2089 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2090 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2091 .z_enable = z_clear,
2092 .z_write_enable = z_clear,
2093 .zfunc = FUNC_ALWAYS));
2094 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2095 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2096 .stencil_enable = s_clear,
2097 .func = FUNC_ALWAYS,
2098 .zpass = STENCIL_REPLACE));
2099 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2100 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2101 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2102
2103 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
2104 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
2105 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2106 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2107 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
2108 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
2109 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2110 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2111 for_each_bit(b, clear_rts)
2112 tu_cs_emit_array(cs, clear_value[b], 4);
2113
2114 for (uint32_t i = 0; i < rect_count; i++) {
2115 r3d_coords_raw(cs, (float[]) {
2116 rects[i].rect.offset.x, rects[i].rect.offset.y,
2117 z_clear_val, 1.0f,
2118 rects[i].rect.offset.x + rects[i].rect.extent.width,
2119 rects[i].rect.offset.y + rects[i].rect.extent.height,
2120 z_clear_val, 1.0f
2121 });
2122 r3d_run(cmd, cs);
2123 }
2124
2125 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE |
2126 TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
2127 TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
2128 TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
2129 TU_CMD_DIRTY_DYNAMIC_VIEWPORT |
2130 TU_CMD_DIRTY_DYNAMIC_SCISSOR;
2131 }
2132
2133 /**
2134  * Pack a VkClearValue into a 128-bit buffer. The format is respected except
2135  * for the component order: components are always packed in WZYX order,
2136  * because gmem is tiled and tiled formats always have the WZYX swap.
2137  */
2138 static void
2139 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4])
2140 {
2141 const struct util_format_description *desc = vk_format_description(format);
2142
2143 switch (format) {
2144 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2145 buf[0] = float3_to_r11g11b10f(val->color.float32);
2146 return;
2147 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
2148 buf[0] = float3_to_rgb9e5(val->color.float32);
2149 return;
2150 default:
2151 break;
2152 }
2153
2154 assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
2155
2156 /* S8_UINT is special and has no depth */
2157 const int max_components =
2158 format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels;
2159
2160 int buf_offset = 0;
2161 int bit_shift = 0;
2162 for (int comp = 0; comp < max_components; comp++) {
2163 const struct util_format_channel_description *ch =
2164 tu_get_format_channel_description(desc, comp);
2165 if (!ch) {
2166 assert((format == VK_FORMAT_S8_UINT && comp == 0) ||
2167 (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1));
2168 continue;
2169 }
2170
2171 union tu_clear_component_value v = tu_get_clear_component_value(
2172 val, comp, desc->colorspace);
2173
2174 /* move to the next uint32_t when there is not enough space */
2175 assert(ch->size <= 32);
2176 if (bit_shift + ch->size > 32) {
2177 buf_offset++;
2178 bit_shift = 0;
2179 }
2180
2181 if (bit_shift == 0)
2182 buf[buf_offset] = 0;
2183
2184 buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift;
2185 bit_shift += ch->size;
2186 }
2187 }
2188
2189 static void
2190 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2191 struct tu_cs *cs,
2192 uint32_t attachment,
2193 uint8_t component_mask,
2194 const VkClearValue *value)
2195 {
2196 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2197 /* note: component_mask is 0x7 for depth and 0x8 for stencil
2198 * because D24S8 is cleared with AS_R8G8B8A8 format
2199 */
2200
2201 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2202 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2203
2204 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
2205 tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask));
2206
2207 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2208 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2209
2210 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2211 tu_cs_emit(cs, 0);
2212
2213 uint32_t clear_vals[4] = {};
2214 pack_gmem_clear_value(value, vk_format, clear_vals);
2215
2216 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2217 tu_cs_emit_array(cs, clear_vals, 4);
2218
2219 tu6_emit_event_write(cmd, cs, BLIT, false);
2220 }
2221
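/* GMEM variant of vkCmdClearAttachments: for every clear rect, program the
 * blit scissor and emit one BLIT-event clear per affected attachment directly
 * into its GMEM allocation.
 */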
2222 static void
2223 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2224 uint32_t attachment_count,
2225 const VkClearAttachment *attachments,
2226 uint32_t rect_count,
2227 const VkClearRect *rects)
2228 {
2229 const struct tu_subpass *subpass = cmd->state.subpass;
2230 struct tu_cs *cs = &cmd->draw_cs;
2231
2232 /* TODO: swap the loops for smaller cmdstream */
2233 for (unsigned i = 0; i < rect_count; i++) {
2234 unsigned x1 = rects[i].rect.offset.x;
2235 unsigned y1 = rects[i].rect.offset.y;
2236 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2237 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2238
2239 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2240 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2241 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2242
2243 for (unsigned j = 0; j < attachment_count; j++) {
2244 uint32_t a;
2245 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2246 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2247 else
2248 a = subpass->depth_stencil_attachment.attachment;
2249
2250 if (a == VK_ATTACHMENT_UNUSED)
2251 continue;
2252
2253 unsigned clear_mask = 0xf;
2254 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
2255 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
2256 clear_mask &= ~0x7;
2257 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
2258 clear_mask &= ~0x8;
2259 }
2260
2261 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2262 &attachments[j].clearValue);
2263 }
2264 }
2265 }
2266
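/* Whether the render pass ends up using GMEM or sysmem rendering isn't known
 * while recording, so both clear variants are emitted under CP conditional
 * execution and the GPU runs only the one matching the actual render mode.
 */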
2267 void
2268 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2269 uint32_t attachmentCount,
2270 const VkClearAttachment *pAttachments,
2271 uint32_t rectCount,
2272 const VkClearRect *pRects)
2273 {
2274 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2275 struct tu_cs *cs = &cmd->draw_cs;
2276
2277 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2278 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2279 tu_cond_exec_end(cs);
2280
2281 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2282 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2283 tu_cond_exec_end(cs);
2284 }
2285
2286 void
2287 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2288 struct tu_cs *cs,
2289 uint32_t a,
2290 const VkRenderPassBeginInfo *info)
2291 {
2292 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2293 const struct tu_image_view *iview = fb->attachments[a].attachment;
2294 const struct tu_render_pass_attachment *attachment =
2295 &cmd->state.pass->attachments[a];
2296 uint8_t mask = 0;
2297
2298 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2299 mask = 0xf;
2300 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2301 mask |= 0x7;
2302 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2303 mask |= 0x8;
2304
2305 if (!mask)
2306 return;
2307
2308 const struct blit_ops *ops = &r2d_ops;
2309 if (attachment->samples > 1)
2310 ops = &r3d_ops;
2311
2312 ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
2313 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2314 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2315
2316 for (uint32_t i = 0; i < fb->layers; i++) {
2317 ops->dst(cs, iview, i);
2318 ops->run(cmd, cs);
2319 }
2320 }
2321
2322 void
2323 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2324 struct tu_cs *cs,
2325 uint32_t a,
2326 const VkRenderPassBeginInfo *info)
2327 {
2328 const struct tu_render_pass_attachment *attachment =
2329 &cmd->state.pass->attachments[a];
2330 unsigned clear_mask = 0;
2331
2332 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2333 clear_mask = 0xf;
2334 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2335 clear_mask |= 0x7;
2336 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2337 clear_mask |= 0x8;
2338
2339 if (!clear_mask)
2340 return;
2341
2342 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2343
2344 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2345 &info->pClearValues[a]);
2346 }
2347
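/* Emit the fixed-function GMEM<->sysmem transfer ("BLIT" event) for an
 * attachment: with resolve=false this is the load direction (sysmem into GMEM,
 * RB_BLIT_INFO.GMEM set), with resolve=true it is the store/resolve direction
 * back into the image.
 */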
2348 static void
2349 tu_emit_blit(struct tu_cmd_buffer *cmd,
2350 struct tu_cs *cs,
2351 const struct tu_image_view *iview,
2352 const struct tu_render_pass_attachment *attachment,
2353 bool resolve)
2354 {
2355 tu_cs_emit_regs(cs,
2356 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2357
2358 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2359 .unk0 = !resolve,
2360 .gmem = !resolve,
2361 /* "integer" bit disables msaa resolve averaging */
2362 .integer = vk_format_is_int(attachment->format)));
2363
2364 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2365 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2366 tu_cs_image_ref_2d(cs, iview, 0, false);
2367
2368 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2369 tu_cs_image_flag_ref(cs, iview, 0);
2370
2371 tu_cs_emit_regs(cs,
2372 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2373
2374 tu6_emit_event_write(cmd, cs, BLIT, false);
2375 }
2376
2377 static bool
2378 blit_can_resolve(VkFormat format)
2379 {
2380 const struct util_format_description *desc = vk_format_description(format);
2381
2382 /* blit event can only do resolve for simple cases:
2383 * averaging samples as unsigned integers or choosing only one sample
2384 */
2385 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2386 return false;
2387
2388 /* can't do formats with larger channel sizes
2389 * note: this includes all float formats
2390 * note2: single channel integer formats seem OK
2391 */
2392 if (desc->channel[0].size > 10)
2393 return false;
2394
2395 switch (format) {
2396    /* for unknown reasons the blit event can't msaa resolve these formats when tiled;
2397     * likely related to these formats having a different layout from other cpp=2 formats
2398     */
2399 case VK_FORMAT_R8G8_UNORM:
2400 case VK_FORMAT_R8G8_UINT:
2401 case VK_FORMAT_R8G8_SINT:
2402 /* TODO: this one should be able to work? */
2403 case VK_FORMAT_D24_UNORM_S8_UINT:
2404 return false;
2405 default:
2406 break;
2407 }
2408
2409 return true;
2410 }
2411
2412 void
2413 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2414 struct tu_cs *cs,
2415 uint32_t a,
2416 bool force_load)
2417 {
2418 const struct tu_image_view *iview =
2419 cmd->state.framebuffer->attachments[a].attachment;
2420 const struct tu_render_pass_attachment *attachment =
2421 &cmd->state.pass->attachments[a];
2422
2423 if (attachment->load || force_load)
2424 tu_emit_blit(cmd, cs, iview, attachment, false);
2425 }
2426
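/* Store (and possibly resolve) a GMEM attachment back to its image. The fast
 * path is the BLIT event above, usable when the render area is GMEM-aligned
 * and blit_can_resolve() allows the format; otherwise fall back to a raw
 * CP_BLIT that reads GMEM directly (gmem_base + gmem_offset) as a TILE6_2
 * surface whose pitch is the tile width times the attachment's cpp.
 */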
2427 void
2428 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2429 struct tu_cs *cs,
2430 uint32_t a,
2431 uint32_t gmem_a)
2432 {
2433 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2434 const VkRect2D *render_area = &tiling->render_area;
2435 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2436 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2437 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2438
2439 if (!dst->store)
2440 return;
2441
2442 uint32_t x1 = render_area->offset.x;
2443 uint32_t y1 = render_area->offset.y;
2444 uint32_t x2 = x1 + render_area->extent.width;
2445 uint32_t y2 = y1 + render_area->extent.height;
2446    /* x2/y2 can be unaligned if equal to the size of the image, since the
2447     * store will then write into padding space.
2448     * The one exception is linear levels, which don't have the required
2449     * y padding in the layout (except for the last level).
2450     */
2451 bool need_y2_align =
2452 y2 != iview->extent.height || iview->need_y2_align;
2453
2454 bool unaligned =
2455 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2456 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2457
2458 /* use fast path when render area is aligned, except for unsupported resolve cases */
2459 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2460 tu_emit_blit(cmd, cs, iview, src, true);
2461 return;
2462 }
2463
2464 if (dst->samples > 1) {
2465       /* We probably need to use the shader path in this case;
2466        * we need a testcase which fails because of this.
2467        */
2468 tu_finishme("unaligned store of msaa attachment\n");
2469 return;
2470 }
2471
2472 r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
2473 r2d_dst(cs, iview, 0);
2474 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2475
2476 tu_cs_emit_regs(cs,
2477 A6XX_SP_PS_2D_SRC_INFO(
2478 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2479 .tile_mode = TILE6_2,
2480 .srgb = vk_format_is_srgb(src->format),
2481 .samples = tu_msaa_samples(src->samples),
2482 .samples_average = !vk_format_is_int(src->format),
2483 .unk20 = 1,
2484 .unk22 = 1),
2485 /* note: src size does not matter when not scaling */
2486 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2487 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2488 A6XX_SP_PS_2D_SRC_HI(),
2489 A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
2490
2491 /* sync GMEM writes with CACHE */
2492 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
2493
2494 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2495 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2496
2497 /* TODO: flushing with barriers instead of blindly always flushing */
2498 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS, true);
2499 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS, true);
2500 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE, false);
2501 }