turnip: share code between 3D blit/clear path and tu_pipeline
[mesa.git] / src / freedreno / vulkan / tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 /* helper functions previously in tu_formats.c */
20
21 static uint32_t
22 tu_pack_mask(int bits)
23 {
24 assert(bits <= 32);
25 return (1ull << bits) - 1;
26 }
27
28 static uint32_t
29 tu_pack_float32_for_unorm(float val, int bits)
30 {
31 const uint32_t max = tu_pack_mask(bits);
32 if (val < 0.0f)
33 return 0;
34 else if (val > 1.0f)
35 return max;
36 else
37 return _mesa_lroundevenf(val * (float) max);
38 }
39
40 static uint32_t
41 tu_pack_float32_for_snorm(float val, int bits)
42 {
43 const int32_t max = tu_pack_mask(bits - 1);
44 int32_t tmp;
45 if (val < -1.0f)
46 tmp = -max;
47 else if (val > 1.0f)
48 tmp = max;
49 else
50 tmp = _mesa_lroundevenf(val * (float) max);
51
52 return tmp & tu_pack_mask(bits);
53 }
54
55 static uint32_t
56 tu_pack_float32_for_uscaled(float val, int bits)
57 {
58 const uint32_t max = tu_pack_mask(bits);
59 if (val < 0.0f)
60 return 0;
61 else if (val > (float) max)
62 return max;
63 else
64 return (uint32_t) val;
65 }
66
67 static uint32_t
68 tu_pack_float32_for_sscaled(float val, int bits)
69 {
70 const int32_t max = tu_pack_mask(bits - 1);
71 const int32_t min = -max - 1;
72 int32_t tmp;
73 if (val < (float) min)
74 tmp = min;
75 else if (val > (float) max)
76 tmp = max;
77 else
78 tmp = (int32_t) val;
79
80 return tmp & tu_pack_mask(bits);
81 }
82
83 static uint32_t
84 tu_pack_uint32_for_uint(uint32_t val, int bits)
85 {
86 return val & tu_pack_mask(bits);
87 }
88
89 static uint32_t
90 tu_pack_int32_for_sint(int32_t val, int bits)
91 {
92 return val & tu_pack_mask(bits);
93 }
94
95 static uint32_t
96 tu_pack_float32_for_sfloat(float val, int bits)
97 {
98 assert(bits == 16 || bits == 32);
99 return bits == 16 ? util_float_to_half(val) : fui(val);
100 }
101
102 union tu_clear_component_value {
103 float float32;
104 int32_t int32;
105 uint32_t uint32;
106 };
107
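/* Pack a single clear-value component into the low ch->size bits, using the
 * channel description to select unorm/snorm, uscaled/sscaled, pure integer
 * or float packing.
 */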
108 static uint32_t
109 tu_pack_clear_component_value(union tu_clear_component_value val,
110 const struct util_format_channel_description *ch)
111 {
112 uint32_t packed;
113
114 switch (ch->type) {
115 case UTIL_FORMAT_TYPE_UNSIGNED:
116 /* normalized, scaled, or pure integer */
117 if (ch->normalized)
118 packed = tu_pack_float32_for_unorm(val.float32, ch->size);
119 else if (ch->pure_integer)
120 packed = tu_pack_uint32_for_uint(val.uint32, ch->size);
121 else
122 packed = tu_pack_float32_for_uscaled(val.float32, ch->size);
123 break;
124 case UTIL_FORMAT_TYPE_SIGNED:
125 /* normalized, scaled, or pure integer */
126 if (ch->normalized)
127 packed = tu_pack_float32_for_snorm(val.float32, ch->size);
128 else if (ch->pure_integer)
129 packed = tu_pack_int32_for_sint(val.int32, ch->size);
130 else
131 packed = tu_pack_float32_for_sscaled(val.float32, ch->size);
132 break;
133 case UTIL_FORMAT_TYPE_FLOAT:
134 packed = tu_pack_float32_for_sfloat(val.float32, ch->size);
135 break;
136 default:
137 unreachable("unexpected channel type");
138 packed = 0;
139 break;
140 }
141
142 assert((packed & tu_pack_mask(ch->size)) == packed);
143 return packed;
144 }
145
146 static const struct util_format_channel_description *
147 tu_get_format_channel_description(const struct util_format_description *desc,
148 int comp)
149 {
150 switch (desc->swizzle[comp]) {
151 case PIPE_SWIZZLE_X:
152 return &desc->channel[0];
153 case PIPE_SWIZZLE_Y:
154 return &desc->channel[1];
155 case PIPE_SWIZZLE_Z:
156 return &desc->channel[2];
157 case PIPE_SWIZZLE_W:
158 return &desc->channel[3];
159 default:
160 return NULL;
161 }
162 }
163
164 static union tu_clear_component_value
165 tu_get_clear_component_value(const VkClearValue *val, int comp,
166 enum util_format_colorspace colorspace)
167 {
168 assert(comp < 4);
169
170 union tu_clear_component_value tmp;
171 switch (colorspace) {
172 case UTIL_FORMAT_COLORSPACE_ZS:
173 assert(comp < 2);
174 if (comp == 0)
175 tmp.float32 = val->depthStencil.depth;
176 else
177 tmp.uint32 = val->depthStencil.stencil;
178 break;
179 case UTIL_FORMAT_COLORSPACE_SRGB:
180 if (comp < 3) {
181 tmp.float32 = util_format_linear_to_srgb_float(val->color.float32[comp]);
182 break;
183 }
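/* fall through - alpha is not sRGB-encoded, use the raw value */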
184 default:
185 assert(comp < 4);
186 tmp.uint32 = val->color.uint32[comp];
187 break;
188 }
189
190 return tmp;
191 }
192
193 /* r2d_ = BLIT_OP_SCALE operations */
194
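/* Map a hardware color format to the 2D engine's internal format, which
 * determines how the blitter interprets source texels and solid clear values.
 */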
195 static enum a6xx_2d_ifmt
196 format_to_ifmt(enum a6xx_format fmt)
197 {
198 switch (fmt) {
199 case FMT6_A8_UNORM:
200 case FMT6_8_UNORM:
201 case FMT6_8_SNORM:
202 case FMT6_8_8_UNORM:
203 case FMT6_8_8_SNORM:
204 case FMT6_8_8_8_8_UNORM:
205 case FMT6_8_8_8_X8_UNORM:
206 case FMT6_8_8_8_8_SNORM:
207 case FMT6_4_4_4_4_UNORM:
208 case FMT6_5_5_5_1_UNORM:
209 case FMT6_5_6_5_UNORM:
210 case FMT6_Z24_UNORM_S8_UINT:
211 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
212 return R2D_UNORM8;
213
214 case FMT6_32_UINT:
215 case FMT6_32_SINT:
216 case FMT6_32_32_UINT:
217 case FMT6_32_32_SINT:
218 case FMT6_32_32_32_32_UINT:
219 case FMT6_32_32_32_32_SINT:
220 return R2D_INT32;
221
222 case FMT6_16_UINT:
223 case FMT6_16_SINT:
224 case FMT6_16_16_UINT:
225 case FMT6_16_16_SINT:
226 case FMT6_16_16_16_16_UINT:
227 case FMT6_16_16_16_16_SINT:
228 case FMT6_10_10_10_2_UINT:
229 return R2D_INT16;
230
231 case FMT6_8_UINT:
232 case FMT6_8_SINT:
233 case FMT6_8_8_UINT:
234 case FMT6_8_8_SINT:
235 case FMT6_8_8_8_8_UINT:
236 case FMT6_8_8_8_8_SINT:
237 return R2D_INT8;
238
239 case FMT6_16_UNORM:
240 case FMT6_16_SNORM:
241 case FMT6_16_16_UNORM:
242 case FMT6_16_16_SNORM:
243 case FMT6_16_16_16_16_UNORM:
244 case FMT6_16_16_16_16_SNORM:
245 case FMT6_32_FLOAT:
246 case FMT6_32_32_FLOAT:
247 case FMT6_32_32_32_32_FLOAT:
248 return R2D_FLOAT32;
249
250 case FMT6_16_FLOAT:
251 case FMT6_16_16_FLOAT:
252 case FMT6_16_16_16_16_FLOAT:
253 case FMT6_11_11_10_FLOAT:
254 case FMT6_10_10_10_2_UNORM:
255 case FMT6_10_10_10_2_UNORM_DEST:
256 return R2D_FLOAT16;
257
258 default:
259 unreachable("bad format");
260 return 0;
261 }
262 }
263
264 static void
265 r2d_coords(struct tu_cs *cs,
266 const VkOffset2D *dst,
267 const VkOffset2D *src,
268 const VkExtent2D *extent)
269 {
270 tu_cs_emit_regs(cs,
271 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
272 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
273
274 if (!src)
275 return;
276
277 tu_cs_emit_regs(cs,
278 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
279 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
280 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
281 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
282 }
283
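/* Pack a clear value into RB_2D_SRC_SOLID_C0..C3. Depth/stencil and
 * E5B9G9R9 formats are special-cased; other formats are packed per channel
 * according to the 2D engine internal format (unorm8 bytes, half floats or
 * raw 32-bit values).
 */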
284 static void
285 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
286 {
287 uint32_t clear_value[4] = {};
288
289 switch (format) {
290 case VK_FORMAT_X8_D24_UNORM_PACK32:
291 case VK_FORMAT_D24_UNORM_S8_UINT:
292 /* cleared as r8g8b8a8_unorm using special format */
293 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
294 clear_value[1] = clear_value[0] >> 8;
295 clear_value[2] = clear_value[0] >> 16;
296 clear_value[3] = val->depthStencil.stencil;
297 break;
298 case VK_FORMAT_D16_UNORM:
299 case VK_FORMAT_D32_SFLOAT:
300 /* R2D_FLOAT32 */
301 clear_value[0] = fui(val->depthStencil.depth);
302 break;
303 case VK_FORMAT_S8_UINT:
304 clear_value[0] = val->depthStencil.stencil;
305 break;
306 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
307 /* cleared as UINT32 */
308 clear_value[0] = float3_to_rgb9e5(val->color.float32);
309 break;
310 default:
311 assert(!vk_format_is_depth_or_stencil(format));
312 const struct util_format_description *desc = vk_format_description(format);
313 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
314
315 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
316 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
317
318 for (unsigned i = 0; i < desc->nr_channels; i++) {
319 const struct util_format_channel_description *ch = &desc->channel[i];
320 if (ifmt == R2D_UNORM8) {
321 float linear = val->color.float32[i];
322 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
323 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
324
325 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
326 clear_value[i] = tu_pack_float32_for_snorm(linear, 8);
327 else
328 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
329 } else if (ifmt == R2D_FLOAT16) {
330 clear_value[i] = util_float_to_half(val->color.float32[i]);
331 } else {
332 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
333 ifmt == R2D_INT16 || ifmt == R2D_INT8);
334 clear_value[i] = val->color.uint32[i];
335 }
336 }
337 break;
338 }
339
340 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
341 tu_cs_emit_array(cs, clear_value, 4);
342 }
343
344 static void
345 r2d_src(struct tu_cmd_buffer *cmd,
346 struct tu_cs *cs,
347 const struct tu_image_view *iview,
348 uint32_t layer,
349 bool linear_filter)
350 {
351 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
352 tu_cs_emit(cs, iview->SP_PS_2D_SRC_INFO |
353 COND(linear_filter, A6XX_SP_PS_2D_SRC_INFO_FILTER));
354 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
355 tu_cs_image_ref_2d(cs, iview, layer, true);
356
357 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
358 tu_cs_image_flag_ref(cs, iview, layer);
359 }
360
361 static void
362 r2d_src_buffer(struct tu_cmd_buffer *cmd,
363 struct tu_cs *cs,
364 VkFormat vk_format,
365 uint64_t va, uint32_t pitch,
366 uint32_t width, uint32_t height)
367 {
368 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
369
370 tu_cs_emit_regs(cs,
371 A6XX_SP_PS_2D_SRC_INFO(
372 .color_format = format.fmt,
373 .color_swap = format.swap,
374 .srgb = vk_format_is_srgb(vk_format),
375 .unk20 = 1,
376 .unk22 = 1),
377 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
378 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
379 A6XX_SP_PS_2D_SRC_HI(va >> 32),
380 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
381 }
382
383 static void
384 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
385 {
386 assert(iview->image->samples == 1);
387
388 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
389 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
390 tu_cs_image_ref_2d(cs, iview, layer, false);
391
392 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
393 tu_cs_image_flag_ref(cs, iview, layer);
394 }
395
396 static void
397 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
398 {
399 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
400
401 tu_cs_emit_regs(cs,
402 A6XX_RB_2D_DST_INFO(
403 .color_format = format.fmt,
404 .color_swap = format.swap,
405 .srgb = vk_format_is_srgb(vk_format)),
406 A6XX_RB_2D_DST_LO((uint32_t) va),
407 A6XX_RB_2D_DST_HI(va >> 32),
408 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
409 }
410
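/* Common 2D blit state setup: RB/GRAS_2D_BLIT_CNTL, SP_2D_SRC_FORMAT and the
 * RB_UNKNOWN_8C01 value that preserves the unwritten aspect for partial
 * clears of Z24_UNORM_S8_UINT.
 */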
411 static void
412 r2d_setup_common(struct tu_cmd_buffer *cmd,
413 struct tu_cs *cs,
414 VkFormat vk_format,
415 enum a6xx_rotation rotation,
416 bool clear,
417 uint8_t mask,
418 bool scissor)
419 {
420 enum a6xx_format format = tu6_base_format(vk_format);
421 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
422 uint32_t unknown_8c01 = 0;
423
424 if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) {
425 /* preserve depth channels */
426 if (mask == 0x8)
427 unknown_8c01 = 0x00084001;
428 /* preserve stencil channel */
429 if (mask == 0x7)
430 unknown_8c01 = 0x08000041;
431 }
432
433 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
434 tu_cs_emit(cs, unknown_8c01);
435
436 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
437 .scissor = scissor,
438 .rotate = rotation,
439 .solid_color = clear,
440 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
441 .color_format = format,
442 .mask = 0xf,
443 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
444 ).value;
445
446 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
447 tu_cs_emit(cs, blit_cntl);
448
449 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
450 tu_cs_emit(cs, blit_cntl);
451
452 if (format == FMT6_10_10_10_2_UNORM_DEST)
453 format = FMT6_16_16_16_16_FLOAT;
454
455 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
456 .sint = vk_format_is_sint(vk_format),
457 .uint = vk_format_is_uint(vk_format),
458 .color_format = format,
459 .srgb = vk_format_is_srgb(vk_format),
460 .mask = 0xf));
461 }
462
463 static void
464 r2d_setup(struct tu_cmd_buffer *cmd,
465 struct tu_cs *cs,
466 VkFormat vk_format,
467 enum a6xx_rotation rotation,
468 bool clear,
469 uint8_t mask)
470 {
471 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
472
473 r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false);
474 }
475
476 static void
477 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
478 {
479 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
480 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
481 }
482
483 /* r3d_ = shader path operations */
484
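/* Emit a minimal hand-assembled pipeline for the 3D path. The VS uses the
 * vertex id (r0.w) to select between two corner coordinates passed in as
 * constants c0/c1 and writes the RECTLIST position plus, for blits, the
 * source texcoord. The FS either copies per-RT clear colors from constants
 * (clear) or takes its color from the prefetched texture sample (blit).
 */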
485 static void
486 r3d_pipeline(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts)
487 {
488 struct ir3_shader dummy_shader = {};
489
490 struct ir3_shader_variant vs = {
491 .type = MESA_SHADER_VERTEX,
492 .instrlen = 1,
493 .constlen = 2,
494 .info.max_reg = 1,
495 .inputs_count = 1,
496 .inputs[0] = {
497 .slot = SYSTEM_VALUE_VERTEX_ID,
498 .regid = regid(0, 3),
499 .sysval = true,
500 },
501 .outputs_count = blit ? 2 : 1,
502 .outputs[0] = {
503 .slot = VARYING_SLOT_POS,
504 .regid = regid(0, 0),
505 },
506 .outputs[1] = {
507 .slot = VARYING_SLOT_VAR0,
508 .regid = regid(1, 0),
509 },
510 .shader = &dummy_shader,
511 };
512
513 struct ir3_shader_variant fs = {
514 .type = MESA_SHADER_FRAGMENT,
515 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
516 .constlen = num_rts,
517 .info.max_reg = MAX2(num_rts, 1) - 1,
518 .total_in = blit ? 2 : 0,
519 .num_samp = blit ? 1 : 0,
520 .inputs_count = blit ? 2 : 0,
521 .inputs[0] = {
522 .slot = VARYING_SLOT_VAR0,
523 .inloc = 0,
524 .compmask = 3,
525 .bary = true,
526 },
527 .inputs[1] = {
528 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
529 .regid = regid(0, 0),
530 .sysval = 1,
531 },
532 .num_sampler_prefetch = blit ? 1 : 0,
533 .sampler_prefetch[0] = {
534 .src = 0,
535 .wrmask = 0xf,
536 .cmd = 4,
537 },
538 .shader = &dummy_shader,
539 };
540
541 static const instr_t vs_code[] = {
542 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
543 * r1.xy = r0.w ? c1.zw : c0.zw
544 * r0.w = 1.0f
545 */
546 { .cat3 = {
547 .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 2, .dst = 0,
548 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
549 .src2 = 3,
550 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0},
551 } },
552 { .cat3 = {
553 .opc_cat = 3, .opc = OPC_SEL_B32 & 63, .repeat = 1, .dst = 4,
554 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
555 .src2 = 3,
556 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2},
557 } },
558 { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, .dst = 3,
559 .src_im = 1, .fim_val = 1.0f } },
560 { .cat0 = { .opc = OPC_END } },
561 };
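/* The VS and FS binaries share a single sub_cs allocation: the VS is written
 * at offset 0 and the FS at FS_OFFSET (16 instruction slots in).
 */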
562 #define FS_OFFSET (16 * sizeof(instr_t))
563 STATIC_ASSERT(sizeof(vs_code) <= FS_OFFSET);
564
565 /* shaders */
566 struct ts_cs_memory shaders = { };
567 VkResult result = tu_cs_alloc(&cmd->sub_cs, 2, 16 * sizeof(instr_t), &shaders);
568 assert(result == VK_SUCCESS);
569
570 memcpy(shaders.map, vs_code, sizeof(vs_code));
571
572 instr_t *fs_code = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
573 for (uint32_t i = 0; i < num_rts; i++) {
574 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
575 *fs_code++ = (instr_t) { .cat1 = {
576 .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
577 .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4
578 } };
579 }
580
581 /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
582 * blit path (it's not clear what allows the blob to omit it)
583 */
584 if (blit) {
585 *fs_code++ = (instr_t) { .cat2 = {
586 .opc_cat = 2, .opc = OPC_BARY_F & 63, .ei = 1, .full = 1,
587 .dst = regid(63, 0), .src1_im = 1
588 } };
589 }
590 *fs_code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
591 /* note: assumed <= 16 instructions (MAX_RTS is 8) */
592
593 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
594
595 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, shaders.iova);
596 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
597 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
598 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0);
599 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, shaders.iova + FS_OFFSET);
600
601 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
602 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
603
604 tu6_emit_vpc(cs, &vs, NULL, &fs, NULL);
605
606 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
607 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
608 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
609
610 tu6_emit_fs_inputs(cs, &fs);
611
612 tu_cs_emit_regs(cs,
613 A6XX_GRAS_CL_CNTL(
614 .persp_division_disable = 1,
615 .vp_xform_disable = 1,
616 .vp_clip_code_ignore = 1,
617 .clip_disable = 1),
618 A6XX_GRAS_UNKNOWN_8001(0));
619 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
620
621 tu_cs_emit_regs(cs,
622 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
623 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
624 tu_cs_emit_regs(cs,
625 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
626 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
627 }
628
629 static void
630 r3d_coords_raw(struct tu_cs *cs, const float *coords)
631 {
632 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
633 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
634 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
635 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
636 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
637 CP_LOAD_STATE6_0_NUM_UNIT(2));
638 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
639 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
640 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
641 }
642
643 static void
644 r3d_coords(struct tu_cs *cs,
645 const VkOffset2D *dst,
646 const VkOffset2D *src,
647 const VkExtent2D *extent)
648 {
649 int32_t src_x1 = src ? src->x : 0;
650 int32_t src_y1 = src ? src->y : 0;
651 r3d_coords_raw(cs, (float[]) {
652 dst->x, dst->y,
653 src_x1, src_y1,
654 dst->x + extent->width, dst->y + extent->height,
655 src_x1 + extent->width, src_y1 + extent->height,
656 });
657 }
658
659 static void
660 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
661 {
662 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
663 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
664 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
665 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
666 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
667 CP_LOAD_STATE6_0_NUM_UNIT(1));
668 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
669 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
670 switch (format) {
671 case VK_FORMAT_X8_D24_UNORM_PACK32:
672 case VK_FORMAT_D24_UNORM_S8_UINT: {
673 /* cleared as r8g8b8a8_unorm using special format */
674 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
675 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
676 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
677 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
678 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
679 } break;
680 case VK_FORMAT_D16_UNORM:
681 case VK_FORMAT_D32_SFLOAT:
682 tu_cs_emit(cs, fui(val->depthStencil.depth));
683 tu_cs_emit(cs, 0);
684 tu_cs_emit(cs, 0);
685 tu_cs_emit(cs, 0);
686 break;
687 case VK_FORMAT_S8_UINT:
688 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
689 tu_cs_emit(cs, 0);
690 tu_cs_emit(cs, 0);
691 tu_cs_emit(cs, 0);
692 break;
693 default:
694 /* as color formats use clear value as-is */
695 assert(!vk_format_is_depth_or_stencil(format));
696 tu_cs_emit_array(cs, val->color.uint32, 4);
697 break;
698 }
699 }
700
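/* Upload a texture descriptor and sampler for the source into sub_cs memory,
 * patching the base and UBWC addresses for the selected layer, and bind them
 * as FS texture 0.
 */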
701 static void
702 r3d_src_common(struct tu_cmd_buffer *cmd,
703 struct tu_cs *cs,
704 const uint32_t *tex_const,
705 uint32_t offset_base,
706 uint32_t offset_ubwc,
707 bool linear_filter)
708 {
709 struct ts_cs_memory texture = { };
710 VkResult result = tu_cs_alloc(&cmd->sub_cs,
711 2, /* allocate space for a sampler too */
712 A6XX_TEX_CONST_DWORDS, &texture);
713 assert(result == VK_SUCCESS);
714
715 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
716
717 /* patch addresses for layer offset */
718 *(uint64_t*) (texture.map + 4) += offset_base;
719 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
720 texture.map[7] = ubwc_addr;
721 texture.map[8] = ubwc_addr >> 32;
722
723 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
724 A6XX_TEX_SAMP_0_XY_MAG(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
725 A6XX_TEX_SAMP_0_XY_MIN(linear_filter ? A6XX_TEX_LINEAR : A6XX_TEX_NEAREST) |
726 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
727 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
728 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
729 0x60000; /* XXX used by blob, doesn't seem necessary */
730 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
731 0x1 | /* XXX used by blob, doesn't seem necessary */
732 A6XX_TEX_SAMP_1_UNNORM_COORDS |
733 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
734 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
735 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
736
737 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
738 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
739 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
740 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
741 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
742 CP_LOAD_STATE6_0_NUM_UNIT(1));
743 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
744
745 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
746 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
747
748 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
749 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
750 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
751 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
752 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
753 CP_LOAD_STATE6_0_NUM_UNIT(1));
754 tu_cs_emit_qw(cs, texture.iova);
755
756 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
757 tu_cs_emit_qw(cs, texture.iova);
758
759 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
760 }
761
762 static void
763 r3d_src(struct tu_cmd_buffer *cmd,
764 struct tu_cs *cs,
765 const struct tu_image_view *iview,
766 uint32_t layer,
767 bool linear_filter)
768 {
769 r3d_src_common(cmd, cs, iview->descriptor,
770 iview->layer_size * layer,
771 iview->ubwc_layer_size * layer,
772 linear_filter);
773 }
774
775 static void
776 r3d_src_buffer(struct tu_cmd_buffer *cmd,
777 struct tu_cs *cs,
778 VkFormat vk_format,
779 uint64_t va, uint32_t pitch,
780 uint32_t width, uint32_t height)
781 {
782 uint32_t desc[A6XX_TEX_CONST_DWORDS];
783
784 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
785
786 desc[0] =
787 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
788 A6XX_TEX_CONST_0_FMT(format.fmt) |
789 A6XX_TEX_CONST_0_SWAP(format.swap) |
790 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
791 // XXX replicate the single R8 channel into .yzw; .w is needed for the stencil buffer_to_image path
792 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
793 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
794 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
795 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
796 desc[2] =
797 A6XX_TEX_CONST_2_FETCHSIZE(tu6_fetchsize(vk_format)) |
798 A6XX_TEX_CONST_2_PITCH(pitch) |
799 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
800 desc[3] = 0;
801 desc[4] = va;
802 desc[5] = va >> 32;
803 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
804 desc[i] = 0;
805
806 r3d_src_common(cmd, cs, desc, 0, 0, false);
807 }
808
809 static void
810 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
811 {
812 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
813
814 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
815 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
816 tu_cs_image_ref(cs, iview, layer);
817 tu_cs_emit(cs, 0);
818
819 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
820 tu_cs_image_flag_ref(cs, iview, layer);
821
822 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
823 }
824
825 static void
826 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
827 {
828 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
829
830 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
831
832 tu_cs_emit_regs(cs,
833 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
834 A6XX_RB_MRT_PITCH(0, pitch),
835 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
836 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
837 A6XX_RB_MRT_BASE_HI(0, va >> 32),
838 A6XX_RB_MRT_BASE_GMEM(0, 0));
839
840 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
841 }
842
843 static void
844 r3d_setup(struct tu_cmd_buffer *cmd,
845 struct tu_cs *cs,
846 VkFormat vk_format,
847 enum a6xx_rotation rotation,
848 bool clear,
849 uint8_t mask)
850 {
851 if (!cmd->state.pass) {
852 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
853 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
854 }
855
856 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
857 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
858
859 r3d_pipeline(cmd, cs, !clear, clear ? 1 : 0);
860
861 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
862 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
863 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
864 0xfc000000);
865 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
866
867 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
868 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
869
870 tu_cs_emit_regs(cs,
871 A6XX_RB_FS_OUTPUT_CNTL0(),
872 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
873
874 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
875 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
876 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
877
878 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
879 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
880 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
881 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
882 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
883 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
884 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
885
886 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
887 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
888
889 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
890 .color_format = tu6_base_format(vk_format),
891 .color_sint = vk_format_is_sint(vk_format),
892 .color_uint = vk_format_is_uint(vk_format)));
893
894 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask));
895 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
896 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
897 }
898
899 static void
900 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
901 {
902 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
903 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
904 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
905 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
906 tu_cs_emit(cs, 1); /* instance count */
907 tu_cs_emit(cs, 2); /* vertex count */
908 }
909
910 /* blit ops - common interface for 2d/shader paths */
911
912 struct blit_ops {
913 void (*coords)(struct tu_cs *cs,
914 const VkOffset2D *dst,
915 const VkOffset2D *src,
916 const VkExtent2D *extent);
917 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
918 void (*src)(
919 struct tu_cmd_buffer *cmd,
920 struct tu_cs *cs,
921 const struct tu_image_view *iview,
922 uint32_t layer,
923 bool linear_filter);
924 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
925 VkFormat vk_format,
926 uint64_t va, uint32_t pitch,
927 uint32_t width, uint32_t height);
928 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
929 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
930 void (*setup)(struct tu_cmd_buffer *cmd,
931 struct tu_cs *cs,
932 VkFormat vk_format,
933 enum a6xx_rotation rotation,
934 bool clear,
935 uint8_t mask);
936 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
937 };
938
939 static const struct blit_ops r2d_ops = {
940 .coords = r2d_coords,
941 .clear_value = r2d_clear_value,
942 .src = r2d_src,
943 .src_buffer = r2d_src_buffer,
944 .dst = r2d_dst,
945 .dst_buffer = r2d_dst_buffer,
946 .setup = r2d_setup,
947 .run = r2d_run,
948 };
949
950 static const struct blit_ops r3d_ops = {
951 .coords = r3d_coords,
952 .clear_value = r3d_clear_value,
953 .src = r3d_src,
954 .src_buffer = r3d_src_buffer,
955 .dst = r3d_dst,
956 .dst_buffer = r3d_dst_buffer,
957 .setup = r3d_setup,
958 .run = r3d_run,
959 };
960
961 /* passthrough: set 2D coords from 3D offsets/extent (z components are ignored) */
962 static void
963 coords(const struct blit_ops *ops,
964 struct tu_cs *cs,
965 const VkOffset3D *dst,
966 const VkOffset3D *src,
967 const VkExtent3D *extent)
968 {
969 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
970 }
971
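/* Build a single-level, single-layer view of the given subresource,
 * optionally overriding the format (used for reinterpreting copies).
 * D24S8/X8D24 always use the AS_R8G8B8A8 reinterpretation, and stencil_read
 * maps the stencil value (stored in .a) to .r for image-to-buffer copies.
 */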
972 static void
973 tu_image_view_blit2(struct tu_image_view *iview,
974 struct tu_image *image,
975 VkFormat format,
976 const VkImageSubresourceLayers *subres,
977 uint32_t layer,
978 bool stencil_read)
979 {
980 VkImageAspectFlags aspect_mask = subres->aspectMask;
981
982 /* always use the AS_R8G8B8A8 format for these */
983 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
984 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
985 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
986 }
987
988 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
989 .image = tu_image_to_handle(image),
990 .viewType = VK_IMAGE_VIEW_TYPE_2D,
991 .format = format,
992 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
993 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
994 .subresourceRange = {
995 .aspectMask = aspect_mask,
996 .baseMipLevel = subres->mipLevel,
997 .levelCount = 1,
998 .baseArrayLayer = subres->baseArrayLayer + layer,
999 .layerCount = 1,
1000 },
1001 });
1002 }
1003
1004 static void
1005 tu_image_view_blit(struct tu_image_view *iview,
1006 struct tu_image *image,
1007 const VkImageSubresourceLayers *subres,
1008 uint32_t layer)
1009 {
1010 tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false);
1011 }
1012
1013 static void
1014 tu6_blit_image(struct tu_cmd_buffer *cmd,
1015 struct tu_image *src_image,
1016 struct tu_image *dst_image,
1017 const VkImageBlit *info,
1018 VkFilter filter)
1019 {
1020 const struct blit_ops *ops = &r2d_ops;
1021 struct tu_cs *cs = &cmd->cs;
1022 uint32_t layers;
1023
1024 /* the 2D blit engine can't mirror from coordinates alone, so mirroring is expressed via the rotation field */
1025 static const enum a6xx_rotation rotate[2][2] = {
1026 {ROTATE_0, ROTATE_HFLIP},
1027 {ROTATE_VFLIP, ROTATE_180},
1028 };
1029
1030 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1031 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1032 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1033 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1034 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
1035 (info->dstOffsets[1].z < info->dstOffsets[0].z);
1036
1037 if (mirror_z) {
1038 tu_finishme("blit z mirror\n");
1039 return;
1040 }
1041
1042 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1043 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1044 tu_finishme("blit z filter\n");
1045 return;
1046 }
1047
1048 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1049 if (info->dstSubresource.layerCount > 1) {
1050 assert(layers <= 1);
1051 layers = info->dstSubresource.layerCount;
1052 }
1053
1054 uint8_t mask = 0xf;
1055 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1056 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1057 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1058 mask = 0x7;
1059 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1060 mask = 0x8;
1061 }
1062
1063 /* BC1_RGB_* formats need to have their last components overridden with 1
1064 * when sampling, which is normally handled with the texture descriptor
1065 * swizzle. The 2d path can't handle that, so use the 3d path.
1066 *
1067 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1068 * the 2d path.
1069 */
1070
1071 if (dst_image->samples > 1 ||
1072 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1073 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK)
1074 ops = &r3d_ops;
1075
1076 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
1077 * figure out why (should be able to pass all tests with only shader path)
1078 */
1079
1080 ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask);
1081
1082 if (ops == &r3d_ops) {
1083 r3d_coords_raw(cs, (float[]) {
1084 info->dstOffsets[0].x, info->dstOffsets[0].y,
1085 info->srcOffsets[0].x, info->srcOffsets[0].y,
1086 info->dstOffsets[1].x, info->dstOffsets[1].y,
1087 info->srcOffsets[1].x, info->srcOffsets[1].y
1088 });
1089 } else {
1090 tu_cs_emit_regs(cs,
1091 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1092 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1093 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1094 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1095 tu_cs_emit_regs(cs,
1096 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1097 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1098 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1099 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1100 }
1101
1102 struct tu_image_view dst, src;
1103 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1104 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1105
1106 for (uint32_t i = 0; i < layers; i++) {
1107 ops->dst(cs, &dst, i);
1108 ops->src(cmd, cs, &src, i, filter == VK_FILTER_LINEAR);
1109 ops->run(cmd, cs);
1110 }
1111 }
1112
1113 void
1114 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1115 VkImage srcImage,
1116 VkImageLayout srcImageLayout,
1117 VkImage dstImage,
1118 VkImageLayout dstImageLayout,
1119 uint32_t regionCount,
1120 const VkImageBlit *pRegions,
1121 VkFilter filter)
1122
1123 {
1124 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1125 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1126 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1127
1128 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1129 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1130
1131 for (uint32_t i = 0; i < regionCount; ++i)
1132 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1133 }
1134
1135 static VkFormat
1136 copy_format(VkFormat format)
1137 {
1138 switch (vk_format_get_blocksizebits(format)) {
1139 case 8: return VK_FORMAT_R8_UINT;
1140 case 16: return VK_FORMAT_R16_UINT;
1141 case 32: return VK_FORMAT_R32_UINT;
1142 case 64: return VK_FORMAT_R32G32_UINT;
1143 case 96: return VK_FORMAT_R32G32B32_UINT;
1144 case 128: return VK_FORMAT_R32G32B32A32_UINT;
1145 default:
1146 unreachable("unhandled format size");
1147 }
1148 }
1149
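/* For compressed formats, convert the copy offset/extent and the buffer
 * row length/height from texels to blocks, since the copy itself is done
 * using the block-sized uint format from copy_format().
 */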
1150 static void
1151 copy_compressed(VkFormat format,
1152 VkOffset3D *offset,
1153 VkExtent3D *extent,
1154 uint32_t *width,
1155 uint32_t *height)
1156 {
1157 if (!vk_format_is_compressed(format))
1158 return;
1159
1160 uint32_t block_width = vk_format_get_blockwidth(format);
1161 uint32_t block_height = vk_format_get_blockheight(format);
1162
1163 offset->x /= block_width;
1164 offset->y /= block_height;
1165
1166 if (extent) {
1167 extent->width = DIV_ROUND_UP(extent->width, block_width);
1168 extent->height = DIV_ROUND_UP(extent->height, block_height);
1169 }
1170 if (width)
1171 *width = DIV_ROUND_UP(*width, block_width);
1172 if (height)
1173 *height = DIV_ROUND_UP(*height, block_height);
1174 }
1175
1176 static void
1177 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1178 struct tu_buffer *src_buffer,
1179 struct tu_image *dst_image,
1180 const VkBufferImageCopy *info)
1181 {
1182 struct tu_cs *cs = &cmd->cs;
1183 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1184 VkFormat dst_format = dst_image->vk_format;
1185 VkFormat src_format = dst_image->vk_format;
1186 const struct blit_ops *ops = &r2d_ops;
1187
1188 uint8_t mask = 0xf;
1189
1190 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1191 switch (info->imageSubresource.aspectMask) {
1192 case VK_IMAGE_ASPECT_STENCIL_BIT:
1193 src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */
1194 mask = 0x8;
1195 ops = &r3d_ops;
1196 break;
1197 case VK_IMAGE_ASPECT_DEPTH_BIT:
1198 mask = 0x7;
1199 break;
1200 }
1201 }
1202
1203 VkOffset3D offset = info->imageOffset;
1204 VkExtent3D extent = info->imageExtent;
1205 uint32_t src_width = info->bufferRowLength ?: extent.width;
1206 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1207
1208 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
1209 assert(src_format == dst_format);
1210 copy_compressed(dst_format, &offset, &extent, &src_width, &src_height);
1211 src_format = dst_format = copy_format(dst_format);
1212 }
1213
1214 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1215 uint32_t layer_size = src_height * pitch;
1216
1217 /* note: the 64-byte src_va/pitch alignment requirement below is for the 2D engine;
1218 * the same handling is also valid for the shader path, which is only used with a 1cpp format (stencil aspect path)
1219 */
1220
1221 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1222
1223 struct tu_image_view dst;
1224 tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false);
1225
1226 for (uint32_t i = 0; i < layers; i++) {
1227 ops->dst(cs, &dst, i);
1228
1229 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1230 if ((src_va & 63) || (pitch & 63)) {
1231 for (uint32_t y = 0; y < extent.height; y++) {
1232 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1233 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1234 x + extent.width, 1);
1235 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1236 &(VkExtent2D) {extent.width, 1});
1237 ops->run(cmd, cs);
1238 src_va += pitch;
1239 }
1240 } else {
1241 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1242 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1243 ops->run(cmd, cs);
1244 }
1245 }
1246 }
1247
1248 void
1249 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1250 VkBuffer srcBuffer,
1251 VkImage dstImage,
1252 VkImageLayout dstImageLayout,
1253 uint32_t regionCount,
1254 const VkBufferImageCopy *pRegions)
1255 {
1256 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1257 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1258 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1259
1260 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1261 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1262
1263 for (unsigned i = 0; i < regionCount; ++i)
1264 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1265 }
1266
1267 static void
1268 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1269 struct tu_image *src_image,
1270 struct tu_buffer *dst_buffer,
1271 const VkBufferImageCopy *info)
1272 {
1273 struct tu_cs *cs = &cmd->cs;
1274 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1275 VkFormat src_format = src_image->vk_format;
1276 VkFormat dst_format = src_image->vk_format;
1277 bool stencil_read = false;
1278
1279 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1280 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1281 dst_format = VK_FORMAT_R8_UNORM;
1282 stencil_read = true;
1283 }
1284
1285 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1286 VkOffset3D offset = info->imageOffset;
1287 VkExtent3D extent = info->imageExtent;
1288 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1289 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1290
1291 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) {
1292 assert(src_format == dst_format);
1293 copy_compressed(dst_format, &offset, &extent, &dst_width, &dst_height);
1294 src_format = dst_format = copy_format(dst_format);
1295 }
1296
1297 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1298 uint32_t layer_size = pitch * dst_height;
1299
1300 /* note: the 64-byte dst_va/pitch alignment requirement below is for the 2D engine;
1301 * the same handling is also valid for the shader path, which is only used with a 1cpp format (stencil aspect)
1302 */
1303
1304 ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
1305
1306 struct tu_image_view src;
1307 tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read);
1308
1309 for (uint32_t i = 0; i < layers; i++) {
1310 ops->src(cmd, cs, &src, i, false);
1311
1312 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1313 if ((dst_va & 63) || (pitch & 63)) {
1314 for (uint32_t y = 0; y < extent.height; y++) {
1315 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1316 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1317 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1318 &(VkExtent2D) {extent.width, 1});
1319 ops->run(cmd, cs);
1320 dst_va += pitch;
1321 }
1322 } else {
1323 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1324 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1325 ops->run(cmd, cs);
1326 }
1327 }
1328 }
1329
1330 void
1331 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1332 VkImage srcImage,
1333 VkImageLayout srcImageLayout,
1334 VkBuffer dstBuffer,
1335 uint32_t regionCount,
1336 const VkBufferImageCopy *pRegions)
1337 {
1338 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1339 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1340 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1341
1342 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1343 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1344
1345 for (unsigned i = 0; i < regionCount; ++i)
1346 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1347 }
1348
1349 /* Tiled formats don't support swapping, which means that we can't support
1350 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1351 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1352 * Currently we fake support for tiled swapped formats and use the unswapped
1353 * format instead, but this means that reinterpreting copies to and from
1354 * swapped formats can't be performed correctly unless we can swizzle the
1355 * components by reinterpreting the other image as the "correct" swapped
1356 * format, i.e. only when the other image is linear.
1357 */
1358
1359 static bool
1360 is_swapped_format(VkFormat format)
1361 {
1362 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1363 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1364 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1365 }
1366
1367 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1368 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1369 * versa). This should mirror the logic in fdl6_layout.
1370 */
1371 static bool
1372 image_is_r8g8(struct tu_image *image)
1373 {
1374 return image->layout.cpp == 2 &&
1375 vk_format_get_nr_components(image->vk_format) == 2;
1376 }
1377
1378 static void
1379 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1380 struct tu_image *src_image,
1381 struct tu_image *dst_image,
1382 const VkImageCopy *info)
1383 {
1384 const struct blit_ops *ops = &r2d_ops;
1385 struct tu_cs *cs = &cmd->cs;
1386
1387 uint8_t mask = 0xf;
1388 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1389 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1390 mask = 0x7;
1391 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1392 mask = 0x8;
1393 }
1394
1395 if (dst_image->samples > 1)
1396 ops = &r3d_ops;
1397
1398 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1399
1400 VkFormat format = VK_FORMAT_UNDEFINED;
1401 VkOffset3D src_offset = info->srcOffset;
1402 VkOffset3D dst_offset = info->dstOffset;
1403 VkExtent3D extent = info->extent;
1404
1405 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1406 * Images":
1407 *
1408 * When copying between compressed and uncompressed formats the extent
1409 * members represent the texel dimensions of the source image and not
1410 * the destination. When copying from a compressed image to an
1411 * uncompressed image the image texel dimensions written to the
1412 * uncompressed image will be source extent divided by the compressed
1413 * texel block dimensions. When copying from an uncompressed image to a
1414 * compressed image the image texel dimensions written to the compressed
1415 * image will be the source extent multiplied by the compressed texel
1416 * block dimensions.
1417 *
1418 * This means we only have to adjust the extent if the source image is
1419 * compressed.
1420 */
1421 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1422 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1423
1424 VkFormat dst_format = vk_format_is_compressed(dst_image->vk_format) ?
1425 copy_format(dst_image->vk_format) : dst_image->vk_format;
1426 VkFormat src_format = vk_format_is_compressed(src_image->vk_format) ?
1427 copy_format(src_image->vk_format) : src_image->vk_format;
1428
1429 bool use_staging_blit = false;
1430
1431 if (src_format == dst_format) {
1432 /* Images that share a format can always be copied directly because it's
1433 * the same as a blit.
1434 */
1435 format = src_format;
1436 } else if (!src_image->layout.tile_mode) {
1437 /* If an image is linear, we can always safely reinterpret it with the
1438 * other image's format and then do a regular blit.
1439 */
1440 format = dst_format;
1441 } else if (!dst_image->layout.tile_mode) {
1442 format = src_format;
1443 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1444 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1445 * due to the different tile layout.
1446 */
1447 use_staging_blit = true;
1448 } else if (is_swapped_format(src_format) ||
1449 is_swapped_format(dst_format)) {
1450 /* If either format has a non-identity swap, then we can't copy
1451 * to/from it.
1452 */
1453 use_staging_blit = true;
1454 } else if (!src_image->layout.ubwc) {
1455 format = dst_format;
1456 } else if (!dst_image->layout.ubwc) {
1457 format = src_format;
1458 } else {
1459 /* Both formats use UBWC and so neither can be reinterpreted.
1460 * TODO: We could do an in-place decompression of the dst instead.
1461 */
1462 use_staging_blit = true;
1463 }
1464
1465 struct tu_image_view dst, src;
1466
1467 if (use_staging_blit) {
1468 tu_image_view_blit2(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1469 tu_image_view_blit2(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1470
1471 struct tu_image staging_image = {
1472 .vk_format = src_format,
1473 .type = src_image->type,
1474 .tiling = VK_IMAGE_TILING_LINEAR,
1475 .extent = extent,
1476 .level_count = 1,
1477 .layer_count = info->srcSubresource.layerCount,
1478 .samples = src_image->samples,
1479 .bo_offset = 0,
1480 };
1481
1482 VkImageSubresourceLayers staging_subresource = {
1483 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1484 .mipLevel = 0,
1485 .baseArrayLayer = 0,
1486 .layerCount = info->srcSubresource.layerCount,
1487 };
1488
1489 VkOffset3D staging_offset = { 0 };
1490
1491 staging_image.layout.tile_mode = TILE6_LINEAR;
1492 staging_image.layout.ubwc = false;
1493
1494 fdl6_layout(&staging_image.layout,
1495 vk_format_to_pipe_format(staging_image.vk_format),
1496 staging_image.samples,
1497 staging_image.extent.width,
1498 staging_image.extent.height,
1499 staging_image.extent.depth,
1500 staging_image.level_count,
1501 staging_image.layer_count,
1502 staging_image.type == VK_IMAGE_TYPE_3D,
1503 NULL);
1504
1505 VkResult result = tu_get_scratch_bo(cmd->device,
1506 staging_image.layout.size,
1507 &staging_image.bo);
1508 if (result != VK_SUCCESS) {
1509 cmd->record_result = result;
1510 return;
1511 }
1512
1513 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1514 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1515
1516 struct tu_image_view staging;
1517 tu_image_view_blit2(&staging, &staging_image, src_format,
1518 &staging_subresource, 0, false);
1519
1520 ops->setup(cmd, cs, src_format, ROTATE_0, false, mask);
1521 coords(ops, cs, &staging_offset, &src_offset, &extent);
1522
1523 for (uint32_t i = 0; i < info->extent.depth; i++) {
1524 ops->src(cmd, cs, &src, i, false);
1525 ops->dst(cs, &staging, i);
1526 ops->run(cmd, cs);
1527 }
1528
1529 /* If the application were doing this two-step copy itself, a pipeline barrier
1530 * would be required between the blits; since we do it internally we have to flush manually.
1531 */
1532 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1533 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1534
1535 tu_image_view_blit2(&staging, &staging_image, dst_format,
1536 &staging_subresource, 0, false);
1537
1538 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1539 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1540
1541 for (uint32_t i = 0; i < info->extent.depth; i++) {
1542 ops->src(cmd, cs, &staging, i, false);
1543 ops->dst(cs, &dst, i);
1544 ops->run(cmd, cs);
1545 }
1546 } else {
1547 tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1548 tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1549
1550 ops->setup(cmd, cs, format, ROTATE_0, false, mask);
1551 coords(ops, cs, &dst_offset, &src_offset, &extent);
1552
1553 for (uint32_t i = 0; i < info->extent.depth; i++) {
1554 ops->src(cmd, cs, &src, i, false);
1555 ops->dst(cs, &dst, i);
1556 ops->run(cmd, cs);
1557 }
1558 }
1559 }
1560
1561 void
1562 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1563 VkImage srcImage,
1564 VkImageLayout srcImageLayout,
1565 VkImage destImage,
1566 VkImageLayout destImageLayout,
1567 uint32_t regionCount,
1568 const VkImageCopy *pRegions)
1569 {
1570 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1571 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1572 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1573
1574 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1575 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1576
1577 for (uint32_t i = 0; i < regionCount; ++i)
1578 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1579 }
1580
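/* Raw buffer copy implemented as 1D blits of R8_UNORM (or R32_UINT when
 * block_size is 4). Base addresses must be 64-byte aligned, so the low
 * address bits become an x offset, and each pass is limited to 0x4000
 * texels of width.
 */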
1581 static void
1582 copy_buffer(struct tu_cmd_buffer *cmd,
1583 uint64_t dst_va,
1584 uint64_t src_va,
1585 uint64_t size,
1586 uint32_t block_size)
1587 {
1588 const struct blit_ops *ops = &r2d_ops;
1589 struct tu_cs *cs = &cmd->cs;
1590 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1591 uint64_t blocks = size / block_size;
1592
1593 ops->setup(cmd, cs, format, ROTATE_0, false, 0xf);
1594
1595 while (blocks) {
1596 uint32_t src_x = (src_va & 63) / block_size;
1597 uint32_t dst_x = (dst_va & 63) / block_size;
1598 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1599
1600 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1601 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1602 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1603 ops->run(cmd, cs);
1604
1605 src_va += width * block_size;
1606 dst_va += width * block_size;
1607 blocks -= width;
1608 }
1609 }
1610
1611 void
1612 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1613 VkBuffer srcBuffer,
1614 VkBuffer dstBuffer,
1615 uint32_t regionCount,
1616 const VkBufferCopy *pRegions)
1617 {
1618 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1619 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1620 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1621
1622 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1623 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1624
1625 for (unsigned i = 0; i < regionCount; ++i) {
1626 copy_buffer(cmd,
1627 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1628 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1629 pRegions[i].size, 1);
1630 }
1631 }
1632
1633 void
1634 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1635 VkBuffer dstBuffer,
1636 VkDeviceSize dstOffset,
1637 VkDeviceSize dataSize,
1638 const void *pData)
1639 {
1640 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1641 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1642
1643 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1644
1645 struct ts_cs_memory tmp;
1646 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1647 if (result != VK_SUCCESS) {
1648 cmd->record_result = result;
1649 return;
1650 }
1651
1652 memcpy(tmp.map, pData, dataSize);
1653 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1654 }
1655
1656 void
1657 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1658 VkBuffer dstBuffer,
1659 VkDeviceSize dstOffset,
1660 VkDeviceSize fillSize,
1661 uint32_t data)
1662 {
1663 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1664 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1665 const struct blit_ops *ops = &r2d_ops;
1666 struct tu_cs *cs = &cmd->cs;
1667
1668 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1669
1670 if (fillSize == VK_WHOLE_SIZE)
1671 fillSize = buffer->size - dstOffset;
1672
1673 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1674 uint32_t blocks = fillSize / 4;
1675
1676 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf);
1677 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1678
1679 while (blocks) {
1680 uint32_t dst_x = (dst_va & 63) / 4;
1681 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1682
1683 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1684 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1685 ops->run(cmd, cs);
1686
1687 dst_va += width * 4;
1688 blocks -= width;
1689 }
1690 }
1691
1692 void
1693 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1694 VkImage srcImage,
1695 VkImageLayout srcImageLayout,
1696 VkImage dstImage,
1697 VkImageLayout dstImageLayout,
1698 uint32_t regionCount,
1699 const VkImageResolve *pRegions)
1700 {
1701 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1702 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1703 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1704 const struct blit_ops *ops = &r2d_ops;
1705 struct tu_cs *cs = &cmd->cs;
1706
1707 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1708 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1709
1710 ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf);
1711
1712 for (uint32_t i = 0; i < regionCount; ++i) {
1713 const VkImageResolve *info = &pRegions[i];
1714 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1715
1716 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1717 /* TODO: aspect masks possible ? */
1718
1719 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1720
1721 struct tu_image_view dst, src;
1722 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1723 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1724
1725 for (uint32_t i = 0; i < layers; i++) {
1726 ops->src(cmd, cs, &src, i, false);
1727 ops->dst(cs, &dst, i);
1728 ops->run(cmd, cs);
1729 }
1730 }
1731 }
1732
1733 void
1734 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1735 struct tu_cs *cs,
1736 struct tu_image_view *src,
1737 struct tu_image_view *dst,
1738 uint32_t layers,
1739 const VkRect2D *rect)
1740 {
1741 const struct blit_ops *ops = &r2d_ops;
1742
1743 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1744 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1745
1746 assert(src->image->vk_format == dst->image->vk_format);
1747
1748 ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf);
1749 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1750
1751 for (uint32_t i = 0; i < layers; i++) {
1752 ops->src(cmd, cs, src, i, false);
1753 ops->dst(cs, dst, i);
1754 ops->run(cmd, cs);
1755 }
1756 }
1757
1758 static void
1759 clear_image(struct tu_cmd_buffer *cmd,
1760 struct tu_image *image,
1761 const VkClearValue *clear_value,
1762 const VkImageSubresourceRange *range)
1763 {
1764 uint32_t level_count = tu_get_levelCount(image, range);
1765 uint32_t layer_count = tu_get_layerCount(image, range);
1766 struct tu_cs *cs = &cmd->cs;
1767 VkFormat format = image->vk_format;
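   /* E5B9G9R9 isn't renderable, so clear it as raw R32_UINT instead;
    * clear_value() below still gets the original format so it can pack the
    * rgb9e5 clear color into that single uint.
    */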
1768 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1769 format = VK_FORMAT_R32_UINT;
1770
1771 if (image->type == VK_IMAGE_TYPE_3D) {
1772 assert(layer_count == 1);
1773 assert(range->baseArrayLayer == 0);
1774 }
1775
1776 uint8_t mask = 0xf;
1777 if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1778 mask = 0;
1779 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
1780 mask |= 0x7;
1781 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
1782 mask |= 0x8;
1783 }
1784
1785 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1786
1787 ops->setup(cmd, cs, format, ROTATE_0, true, mask);
1788 ops->clear_value(cs, image->vk_format, clear_value);
1789
1790 for (unsigned j = 0; j < level_count; j++) {
1791 if (image->type == VK_IMAGE_TYPE_3D)
1792 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1793
1794 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1795 u_minify(image->extent.width, range->baseMipLevel + j),
1796 u_minify(image->extent.height, range->baseMipLevel + j)
1797 });
1798
1799 struct tu_image_view dst;
1800 tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) {
1801 .aspectMask = range->aspectMask,
1802 .mipLevel = range->baseMipLevel + j,
1803 .baseArrayLayer = range->baseArrayLayer,
1804 .layerCount = 1,
1805 }, 0, false);
1806
1807 for (uint32_t i = 0; i < layer_count; i++) {
1808 ops->dst(cs, &dst, i);
1809 ops->run(cmd, cs);
1810 }
1811 }
1812 }
1813
1814 void
1815 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1816 VkImage image_h,
1817 VkImageLayout imageLayout,
1818 const VkClearColorValue *pColor,
1819 uint32_t rangeCount,
1820 const VkImageSubresourceRange *pRanges)
1821 {
1822 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1823 TU_FROM_HANDLE(tu_image, image, image_h);
1824
1825 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1826
1827 for (unsigned i = 0; i < rangeCount; i++)
1828 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1829 }
1830
1831 void
1832 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1833 VkImage image_h,
1834 VkImageLayout imageLayout,
1835 const VkClearDepthStencilValue *pDepthStencil,
1836 uint32_t rangeCount,
1837 const VkImageSubresourceRange *pRanges)
1838 {
1839 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1840 TU_FROM_HANDLE(tu_image, image, image_h);
1841
1842 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1843
1844 for (unsigned i = 0; i < rangeCount; i++)
1845 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1846 }
1847
1848 static void
1849 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1850 uint32_t attachment_count,
1851 const VkClearAttachment *attachments,
1852 uint32_t rect_count,
1853 const VkClearRect *rects)
1854 {
1855 const struct tu_subpass *subpass = cmd->state.subpass;
1856 /* note: cannot use the shader path here; there is a special shader path
1857 * in tu_clear_sysmem_attachments()
1858 */
1859 const struct blit_ops *ops = &r2d_ops;
1860 struct tu_cs *cs = &cmd->draw_cs;
1861
1862 for (uint32_t j = 0; j < attachment_count; j++) {
1863 /* The vulkan spec, section 17.2 "Clearing Images Inside a Render
1864 * Pass Instance" says that:
1865 *
1866 * Unlike other clear commands, vkCmdClearAttachments executes as
1867 * a drawing command, rather than a transfer command, with writes
1868 * performed by it executing in rasterization order. Clears to
1869 * color attachments are executed as color attachment writes, by
1870 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1871 * Clears to depth/stencil attachments are executed as depth
1872 * writes and writes by the
1873 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1874 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1875 *
1876 * However, the 2d path here is executed the same way as a
1877 * transfer command, using the CCU color cache exclusively with
1878 * a special depth-as-color format for depth clears. This means that
1879 * we can't rely on the normal pipeline barrier mechanism here, and
1880 * have to manually flush whenever using a different cache domain
1881 * from what the 3d path would've used. This happens when we clear
1882 * depth/stencil, since normally depth attachments use CCU depth, but
1883 * we clear it using a special depth-as-color format. Since the clear
1884 * potentially uses a different attachment state we also need to
1885 * invalidate color beforehand and flush it afterwards.
1886 */
1887
1888 uint32_t a;
1889 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1890 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1891 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1892 } else {
1893 a = subpass->depth_stencil_attachment.attachment;
1894 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1895 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1896 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1897 }
1898
1899 if (a == VK_ATTACHMENT_UNUSED)
1900 continue;
1901
1902 uint8_t mask = 0xf;
1903 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
1904 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
1905 mask &= ~0x7;
1906 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
1907 mask &= ~0x8;
1908 }
1909
1910 const struct tu_image_view *iview =
1911 cmd->state.framebuffer->attachments[a].attachment;
1912
1913 ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask);
1914 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1915
1916 /* Wait for the flushes we triggered manually to complete */
1917 tu_cs_emit_wfi(cs);
1918
1919 for (uint32_t i = 0; i < rect_count; i++) {
1920 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1921 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1922 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1923 ops->run(cmd, cs);
1924 }
1925 }
1926
1927 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1928 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1929 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1930 } else {
1931 /* sync color into depth */
1932 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1933 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
1934 }
1935 }
1936 }
1937
1938 static void
1939 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1940 uint32_t attachment_count,
1941 const VkClearAttachment *attachments,
1942 uint32_t rect_count,
1943 const VkClearRect *rects)
1944 {
1945 /* the shader path here is special, it avoids changing MRT/etc state */
1946 const struct tu_render_pass *pass = cmd->state.pass;
1947 const struct tu_subpass *subpass = cmd->state.subpass;
1948 const uint32_t mrt_count = subpass->color_count;
1949 struct tu_cs *cs = &cmd->draw_cs;
1950 uint32_t clear_value[MAX_RTS][4];
1951 float z_clear_val = 0.0f;
1952 uint8_t s_clear_val = 0;
1953 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1954 bool z_clear = false;
1955 bool s_clear = false;
1956 uint32_t max_samples = 1;
1957
1958 for (uint32_t i = 0; i < attachment_count; i++) {
1959 uint32_t a;
1960 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1961 uint32_t c = attachments[i].colorAttachment;
1962 a = subpass->color_attachments[c].attachment;
1963 if (a == VK_ATTACHMENT_UNUSED)
1964 continue;
1965
1966 clear_rts |= 1 << c;
1967 clear_components |= 0xf << (c * 4);
1968 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1969 } else {
1970 a = subpass->depth_stencil_attachment.attachment;
1971 if (a == VK_ATTACHMENT_UNUSED)
1972 continue;
1973
1974 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1975 z_clear = true;
1976 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1977 }
1978
1979 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1980 s_clear = true;
1981 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1982 }
1983 }
1984
1985 max_samples = MAX2(max_samples, pass->attachments[a].samples);
1986 }
1987
1988 /* prefer to use the 2D path for clears;
1989 * 2D can't clear separate depth/stencil or MSAA, and it needs a known framebuffer
1990 */
1991 if (max_samples == 1 && cmd->state.framebuffer) {
1992 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
1993 return;
1994 }
1995
1996 /* TODO: this path doesn't take into account multilayer rendering */
1997
1998 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1999 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2000 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2001 0xfc000000);
2002 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2003
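   /* Map the clear shader's consecutive output registers onto only the MRTs
    * being cleared; MRTs that aren't cleared get no output register and are
    * additionally masked off via RB_MRT_CONTROL below.
    */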
2004 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
2005 for (uint32_t i = 0; i < mrt_count; i++) {
2006 if (clear_rts & (1 << i))
2007 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
2008 else
2009 tu_cs_emit(cs, 0);
2010 }
2011
2012 r3d_pipeline(cmd, cs, false, num_rts);
2013
2014 tu_cs_emit_regs(cs,
2015 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2016 tu_cs_emit_regs(cs,
2017 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2018
2019 tu_cs_emit_regs(cs,
2020 A6XX_RB_FS_OUTPUT_CNTL0(),
2021 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2022
2023 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2024 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2025 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
2026 for (uint32_t i = 0; i < mrt_count; i++) {
2027 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2028 .component_enable = COND(clear_rts & (1 << i), 0xf)));
2029 }
2030
2031 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2032 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2033 .z_enable = z_clear,
2034 .z_write_enable = z_clear,
2035 .zfunc = FUNC_ALWAYS));
2036 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2037 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2038 .stencil_enable = s_clear,
2039 .func = FUNC_ALWAYS,
2040 .zpass = STENCIL_REPLACE));
2041 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2042 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2043 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2044
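   /* Upload one vec4 of clear color per cleared RT as fragment shader
    * constants, in the same order the output registers were assigned above.
    */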
2045 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
2046 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
2047 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2048 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2049 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
2050 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
2051 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2052 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2053 for_each_bit(b, clear_rts)
2054 tu_cs_emit_array(cs, clear_value[b], 4);
2055
2056 for (uint32_t i = 0; i < rect_count; i++) {
2057 r3d_coords_raw(cs, (float[]) {
2058 rects[i].rect.offset.x, rects[i].rect.offset.y,
2059 z_clear_val, 1.0f,
2060 rects[i].rect.offset.x + rects[i].rect.extent.width,
2061 rects[i].rect.offset.y + rects[i].rect.extent.height,
2062 z_clear_val, 1.0f
2063 });
2064 r3d_run(cmd, cs);
2065 }
2066
2067 cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE |
2068 TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
2069 TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
2070 TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
2071 TU_CMD_DIRTY_DYNAMIC_VIEWPORT |
2072 TU_CMD_DIRTY_DYNAMIC_SCISSOR;
2073 }
2074
2075 /**
2076 * Pack a VkClearValue into a 128-bit buffer. The format is respected except
2077 * for the component order: the components are always packed in WZYX order,
2078 * because gmem is tiled and tiled formats always have WZYX swap.
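 *
 * For example (assuming round-to-nearest-even for 0.5): clearing
 * VK_FORMAT_R8G8B8A8_UNORM with (1.0, 0.5, 0.0, 1.0) should pack to
 * buf[0] == 0xff0080ff, i.e. A in bits 31:24 down to R in bits 7:0.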
2079 */
2080 static void
2081 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4])
2082 {
2083 const struct util_format_description *desc = vk_format_description(format);
2084
2085 switch (format) {
2086 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2087 buf[0] = float3_to_r11g11b10f(val->color.float32);
2088 return;
2089 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
2090 buf[0] = float3_to_rgb9e5(val->color.float32);
2091 return;
2092 default:
2093 break;
2094 }
2095
2096 assert(desc && desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
2097
2098 /* S8_UINT is special and has no depth */
2099 const int max_components =
2100 format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels;
2101
2102 int buf_offset = 0;
2103 int bit_shift = 0;
2104 for (int comp = 0; comp < max_components; comp++) {
2105 const struct util_format_channel_description *ch =
2106 tu_get_format_channel_description(desc, comp);
2107 if (!ch) {
2108 assert((format == VK_FORMAT_S8_UINT && comp == 0) ||
2109 (format == VK_FORMAT_X8_D24_UNORM_PACK32 && comp == 1));
2110 continue;
2111 }
2112
2113 union tu_clear_component_value v = tu_get_clear_component_value(
2114 val, comp, desc->colorspace);
2115
2116 /* move to the next uint32_t when there is not enough space */
2117 assert(ch->size <= 32);
2118 if (bit_shift + ch->size > 32) {
2119 buf_offset++;
2120 bit_shift = 0;
2121 }
2122
2123 if (bit_shift == 0)
2124 buf[buf_offset] = 0;
2125
2126 buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift;
2127 bit_shift += ch->size;
2128 }
2129 }
2130
2131 static void
2132 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2133 struct tu_cs *cs,
2134 uint32_t attachment,
2135 uint8_t component_mask,
2136 const VkClearValue *value)
2137 {
2138 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2139 /* note: component_mask is 0x7 for depth and 0x8 for stencil
2140 * because D24S8 is cleared with AS_R8G8B8A8 format
2141 */
2142
2143 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2144 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2145
2146 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
2147 tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask));
2148
2149 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2150 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2151
2152 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2153 tu_cs_emit(cs, 0);
2154
2155 uint32_t clear_vals[4] = {};
2156 pack_gmem_clear_value(value, vk_format, clear_vals);
2157
2158 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2159 tu_cs_emit_array(cs, clear_vals, 4);
2160
2161 tu6_emit_event_write(cmd, cs, BLIT);
2162 }
2163
2164 static void
2165 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2166 uint32_t attachment_count,
2167 const VkClearAttachment *attachments,
2168 uint32_t rect_count,
2169 const VkClearRect *rects)
2170 {
2171 const struct tu_subpass *subpass = cmd->state.subpass;
2172 struct tu_cs *cs = &cmd->draw_cs;
2173
2174 /* TODO: swap the loops for smaller cmdstream */
2175 for (unsigned i = 0; i < rect_count; i++) {
2176 unsigned x1 = rects[i].rect.offset.x;
2177 unsigned y1 = rects[i].rect.offset.y;
2178 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2179 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2180
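      /* The blit scissor is in whole-framebuffer coordinates; this command
       * stream is replayed for every tile, so only the part of the rect that
       * falls inside the current tile actually gets cleared by the BLIT event.
       */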
2181 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2182 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2183 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2184
2185 for (unsigned j = 0; j < attachment_count; j++) {
2186 uint32_t a;
2187 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2188 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2189 else
2190 a = subpass->depth_stencil_attachment.attachment;
2191
2192 if (a == VK_ATTACHMENT_UNUSED)
2193 continue;
2194
2195 unsigned clear_mask = 0xf;
2196 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
2197 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
2198 clear_mask &= ~0x7;
2199 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
2200 clear_mask &= ~0x8;
2201 }
2202
2203 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2204 &attachments[j].clearValue);
2205 }
2206 }
2207 }
2208
2209 void
2210 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2211 uint32_t attachmentCount,
2212 const VkClearAttachment *pAttachments,
2213 uint32_t rectCount,
2214 const VkClearRect *pRects)
2215 {
2216 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2217 struct tu_cs *cs = &cmd->draw_cs;
2218
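   /* Whether the render pass will execute in GMEM or sysmem mode isn't known
    * at record time, so emit both clear variants under conditional execution
    * and let the CP skip the one that doesn't apply.
    */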
2219 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2220 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2221 tu_cond_exec_end(cs);
2222
2223 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2224 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2225 tu_cond_exec_end(cs);
2226 }
2227
2228 void
2229 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2230 struct tu_cs *cs,
2231 uint32_t a,
2232 const VkRenderPassBeginInfo *info)
2233 {
2234 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2235 const struct tu_image_view *iview = fb->attachments[a].attachment;
2236 const struct tu_render_pass_attachment *attachment =
2237 &cmd->state.pass->attachments[a];
2238 uint8_t mask = 0;
2239
2240 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2241 mask = 0xf;
2242 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2243 mask |= 0x7;
2244 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2245 mask |= 0x8;
2246
2247 if (!mask)
2248 return;
2249
2250 const struct blit_ops *ops = &r2d_ops;
2251 if (attachment->samples > 1)
2252 ops = &r3d_ops;
2253
2254 ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
2255 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2256 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2257
2258 /* Wait for any flushes at the beginning of the renderpass to complete */
2259 tu_cs_emit_wfi(cs);
2260
2261 for (uint32_t i = 0; i < fb->layers; i++) {
2262 ops->dst(cs, iview, i);
2263 ops->run(cmd, cs);
2264 }
2265
2266 /* The spec doesn't explicitly say, but presumably the initial renderpass
2267 * clear is considered part of the renderpass, and therefore barriers
2268 * aren't required inside the subpass/renderpass. Therefore we need to
2269 * flush CCU color into CCU depth here, just like with
2270 * vkCmdClearAttachments(). Note that because this only happens at the
2271 * beginning of a renderpass, and renderpass writes are considered
2272 * "incoherent", we shouldn't have to worry about syncing depth into color
2273 * beforehand as depth should already be flushed.
2274 */
2275 if (vk_format_is_depth_or_stencil(attachment->format)) {
2276 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2277 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2278 } else {
2279 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2280 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2281 }
2282 }
2283
2284 void
2285 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2286 struct tu_cs *cs,
2287 uint32_t a,
2288 const VkRenderPassBeginInfo *info)
2289 {
2290 const struct tu_render_pass_attachment *attachment =
2291 &cmd->state.pass->attachments[a];
2292 unsigned clear_mask = 0;
2293
2294 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2295 clear_mask = 0xf;
2296 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2297 clear_mask |= 0x7;
2298 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2299 clear_mask |= 0x8;
2300
2301 if (!clear_mask)
2302 return;
2303
2304 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2305
2306 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2307 &info->pClearValues[a]);
2308 }
2309
2310 static void
2311 tu_emit_blit(struct tu_cmd_buffer *cmd,
2312 struct tu_cs *cs,
2313 const struct tu_image_view *iview,
2314 const struct tu_render_pass_attachment *attachment,
2315 bool resolve)
2316 {
2317 tu_cs_emit_regs(cs,
2318 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2319
2320 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2321 .unk0 = !resolve,
2322 .gmem = !resolve,
2323 /* "integer" bit disables msaa resolve averaging */
2324 .integer = vk_format_is_int(attachment->format)));
2325
2326 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2327 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2328 tu_cs_image_ref_2d(cs, iview, 0, false);
2329
2330 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2331 tu_cs_image_flag_ref(cs, iview, 0);
2332
2333 tu_cs_emit_regs(cs,
2334 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2335
2336 tu6_emit_event_write(cmd, cs, BLIT);
2337 }
2338
2339 static bool
2340 blit_can_resolve(VkFormat format)
2341 {
2342 const struct util_format_description *desc = vk_format_description(format);
2343
2344 /* blit event can only do resolve for simple cases:
2345 * averaging samples as unsigned integers or choosing only one sample
2346 */
2347 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2348 return false;
2349
2350 /* can't do formats with larger channel sizes
2351 * note: this includes all float formats
2352 * note2: single channel integer formats seem OK
2353 */
2354 if (desc->channel[0].size > 10)
2355 return false;
2356
2357 switch (format) {
2358 /* for unknown reasons blit event can't msaa resolve these formats when tiled
2359 * likely related to these formats having different layout from other cpp=2 formats
2360 */
2361 case VK_FORMAT_R8G8_UNORM:
2362 case VK_FORMAT_R8G8_UINT:
2363 case VK_FORMAT_R8G8_SINT:
2364 /* TODO: this one should be able to work? */
2365 case VK_FORMAT_D24_UNORM_S8_UINT:
2366 return false;
2367 default:
2368 break;
2369 }
2370
2371 return true;
2372 }
2373
2374 void
2375 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2376 struct tu_cs *cs,
2377 uint32_t a,
2378 bool force_load)
2379 {
2380 const struct tu_image_view *iview =
2381 cmd->state.framebuffer->attachments[a].attachment;
2382 const struct tu_render_pass_attachment *attachment =
2383 &cmd->state.pass->attachments[a];
2384
2385 if (attachment->load || force_load)
2386 tu_emit_blit(cmd, cs, iview, attachment, false);
2387 }
2388
2389 void
2390 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2391 struct tu_cs *cs,
2392 uint32_t a,
2393 uint32_t gmem_a)
2394 {
2395 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2396 const VkRect2D *render_area = &tiling->render_area;
2397 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2398 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2399 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2400
2401 if (!dst->store)
2402 return;
2403
2404 uint32_t x1 = render_area->offset.x;
2405 uint32_t y1 = render_area->offset.y;
2406 uint32_t x2 = x1 + render_area->extent.width;
2407 uint32_t y2 = y1 + render_area->extent.height;
2408 /* x2/y2 can be unaligned if equal to the size of the image, since the
2409 * store will then write into padding space. The one exception is linear
2410 * levels, which don't have the required y padding in the layout (except
2411 * for the last level).
2412 */
2413 bool need_y2_align =
2414 y2 != iview->extent.height || iview->need_y2_align;
2415
2416 bool unaligned =
2417 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2418 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2419
2420 /* use fast path when render area is aligned, except for unsupported resolve cases */
2421 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2422 tu_emit_blit(cmd, cs, iview, src, true);
2423 return;
2424 }
2425
2426 if (dst->samples > 1) {
2427 /* We probably need to use the shader path in this case;
2428 * a testcase which fails because of this is needed to confirm.
2429 */
2430 tu_finishme("unaligned store of msaa attachment\n");
2431 return;
2432 }
2433
2434 r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
2435 r2d_dst(cs, iview, 0);
2436 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2437
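   /* Source the blit directly from GMEM: the attachment's tile contents are
    * described as a TILE6_2-tiled surface at gmem_base + gmem_offset, with a
    * pitch of one GMEM tile row.
    */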
2438 tu_cs_emit_regs(cs,
2439 A6XX_SP_PS_2D_SRC_INFO(
2440 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2441 .tile_mode = TILE6_2,
2442 .srgb = vk_format_is_srgb(src->format),
2443 .samples = tu_msaa_samples(src->samples),
2444 .samples_average = !vk_format_is_int(src->format),
2445 .unk20 = 1,
2446 .unk22 = 1),
2447 /* note: src size does not matter when not scaling */
2448 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2449 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2450 A6XX_SP_PS_2D_SRC_HI(),
2451 A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
2452
2453 /* sync GMEM writes with CACHE, so the CP_BLIT below sees the just-rendered GMEM contents */
2454 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2455
2456 /* Wait for CACHE_INVALIDATE to land */
2457 tu_cs_emit_wfi(cs);
2458
2459 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2460 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2461
2462 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2463 * sysmem, and we generally assume that GMEM renderpasses leave their
2464 * results in sysmem, so we need to flush manually here.
2465 */
2466 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2467 }