freedreno/a6xx: FETCHSIZE is PITCHALIGN
[mesa.git] / src/freedreno/vulkan/tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 static uint32_t
20 tu_pack_float32_for_unorm(float val, int bits)
21 {
22 return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
23 }
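/* A worked example of the rounding above, for illustration: packing 0.5f into
 * 8 bits gives _mesa_lroundevenf(0.5 * 255.0) = _mesa_lroundevenf(127.5) = 128
 * (round half to even), and into 24 bits gives
 * _mesa_lroundevenf(0.5 * 16777215.0) = 8388608 = 0x800000. Out-of-range
 * inputs are clamped first, so -0.25f packs to 0 and 2.0f packs to all ones.
 */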
24
25 /* r2d_ = BLIT_OP_SCALE operations */
26
27 static enum a6xx_2d_ifmt
28 format_to_ifmt(enum a6xx_format fmt)
29 {
30 switch (fmt) {
31 case FMT6_A8_UNORM:
32 case FMT6_8_UNORM:
33 case FMT6_8_SNORM:
34 case FMT6_8_8_UNORM:
35 case FMT6_8_8_SNORM:
36 case FMT6_8_8_8_8_UNORM:
37 case FMT6_8_8_8_X8_UNORM:
38 case FMT6_8_8_8_8_SNORM:
39 case FMT6_4_4_4_4_UNORM:
40 case FMT6_5_5_5_1_UNORM:
41 case FMT6_5_6_5_UNORM:
42 case FMT6_Z24_UNORM_S8_UINT:
43 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
44 return R2D_UNORM8;
45
46 case FMT6_32_UINT:
47 case FMT6_32_SINT:
48 case FMT6_32_32_UINT:
49 case FMT6_32_32_SINT:
50 case FMT6_32_32_32_32_UINT:
51 case FMT6_32_32_32_32_SINT:
52 return R2D_INT32;
53
54 case FMT6_16_UINT:
55 case FMT6_16_SINT:
56 case FMT6_16_16_UINT:
57 case FMT6_16_16_SINT:
58 case FMT6_16_16_16_16_UINT:
59 case FMT6_16_16_16_16_SINT:
60 case FMT6_10_10_10_2_UINT:
61 return R2D_INT16;
62
63 case FMT6_8_UINT:
64 case FMT6_8_SINT:
65 case FMT6_8_8_UINT:
66 case FMT6_8_8_SINT:
67 case FMT6_8_8_8_8_UINT:
68 case FMT6_8_8_8_8_SINT:
69 return R2D_INT8;
70
71 case FMT6_16_UNORM:
72 case FMT6_16_SNORM:
73 case FMT6_16_16_UNORM:
74 case FMT6_16_16_SNORM:
75 case FMT6_16_16_16_16_UNORM:
76 case FMT6_16_16_16_16_SNORM:
77 case FMT6_32_FLOAT:
78 case FMT6_32_32_FLOAT:
79 case FMT6_32_32_32_32_FLOAT:
80 return R2D_FLOAT32;
81
82 case FMT6_16_FLOAT:
83 case FMT6_16_16_FLOAT:
84 case FMT6_16_16_16_16_FLOAT:
85 case FMT6_11_11_10_FLOAT:
86 case FMT6_10_10_10_2_UNORM:
87 case FMT6_10_10_10_2_UNORM_DEST:
88 return R2D_FLOAT16;
89
90 default:
91 unreachable("bad format");
92 return 0;
93 }
94 }
95
96 static void
97 r2d_coords(struct tu_cs *cs,
98 const VkOffset2D *dst,
99 const VkOffset2D *src,
100 const VkExtent2D *extent)
101 {
102 tu_cs_emit_regs(cs,
103 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
104 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
105
106 if (!src)
107 return;
108
109 tu_cs_emit_regs(cs,
110 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
111 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
112 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
113 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
114 }
115
116 static void
117 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
118 {
119 uint32_t clear_value[4] = {};
120
121 switch (format) {
122 case VK_FORMAT_X8_D24_UNORM_PACK32:
123 case VK_FORMAT_D24_UNORM_S8_UINT:
124 /* cleared as r8g8b8a8_unorm using special format */
125 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
126 clear_value[1] = clear_value[0] >> 8;
127 clear_value[2] = clear_value[0] >> 16;
128 clear_value[3] = val->depthStencil.stencil;
129 break;
130 case VK_FORMAT_D16_UNORM:
131 case VK_FORMAT_D32_SFLOAT:
132 /* R2D_FLOAT32 */
133 clear_value[0] = fui(val->depthStencil.depth);
134 break;
135 case VK_FORMAT_S8_UINT:
136 clear_value[0] = val->depthStencil.stencil;
137 break;
138 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
139 /* cleared as UINT32 */
140 clear_value[0] = float3_to_rgb9e5(val->color.float32);
141 break;
142 default:
143 assert(!vk_format_is_depth_or_stencil(format));
144 const struct util_format_description *desc = vk_format_description(format);
145 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
146
147 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
148 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
149
150 for (unsigned i = 0; i < desc->nr_channels; i++) {
151 const struct util_format_channel_description *ch = &desc->channel[i];
152 if (ifmt == R2D_UNORM8) {
153 float linear = val->color.float32[i];
154 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
155 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
156
157 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
158 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
159 else
160 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
161 } else if (ifmt == R2D_FLOAT16) {
162 clear_value[i] = util_float_to_half(val->color.float32[i]);
163 } else {
164 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
165 ifmt == R2D_INT16 || ifmt == R2D_INT8);
166 clear_value[i] = val->color.uint32[i];
167 }
168 }
169 break;
170 }
171
172 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
173 tu_cs_emit_array(cs, clear_value, 4);
174 }
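/* For illustration: a VK_FORMAT_D24_UNORM_S8_UINT clear with depth = 0.5f and
 * stencil = 0xff makes the code above emit
 *
 *    clear_value[0] = 0x800000   (24-bit unorm depth)
 *    clear_value[1] = 0x008000   (depth >> 8)
 *    clear_value[2] = 0x000080   (depth >> 16)
 *    clear_value[3] = 0x0000ff   (stencil)
 *
 * so the low byte of each of the first three words holds depth bits [7:0],
 * [15:8] and [23:16], lining up with the r8g8b8a8 view mentioned in the
 * comment above.
 */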
175
176 static void
177 r2d_src(struct tu_cmd_buffer *cmd,
178 struct tu_cs *cs,
179 const struct tu_image_view *iview,
180 uint32_t layer,
181 VkFilter filter)
182 {
183 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
184 if (filter != VK_FILTER_NEAREST)
185 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
186
187 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
188 tu_cs_emit(cs, src_info);
189 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
190 tu_cs_image_ref_2d(cs, iview, layer, true);
191
192 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
193 tu_cs_image_flag_ref(cs, iview, layer);
194 }
195
196 static void
197 r2d_src_buffer(struct tu_cmd_buffer *cmd,
198 struct tu_cs *cs,
199 VkFormat vk_format,
200 uint64_t va, uint32_t pitch,
201 uint32_t width, uint32_t height)
202 {
203 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
204
205 tu_cs_emit_regs(cs,
206 A6XX_SP_PS_2D_SRC_INFO(
207 .color_format = format.fmt,
208 .color_swap = format.swap,
209 .srgb = vk_format_is_srgb(vk_format),
210 .unk20 = 1,
211 .unk22 = 1),
212 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
213 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
214 A6XX_SP_PS_2D_SRC_HI(va >> 32),
215 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
216 }
217
218 static void
219 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
220 {
221 assert(iview->image->samples == 1);
222
223 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
224 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
225 tu_cs_image_ref_2d(cs, iview, layer, false);
226
227 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
228 tu_cs_image_flag_ref(cs, iview, layer);
229 }
230
231 static void
232 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
233 {
234 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
235
236 tu_cs_emit_regs(cs,
237 A6XX_RB_2D_DST_INFO(
238 .color_format = format.fmt,
239 .color_swap = format.swap,
240 .srgb = vk_format_is_srgb(vk_format)),
241 A6XX_RB_2D_DST_LO((uint32_t) va),
242 A6XX_RB_2D_DST_HI(va >> 32),
243 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
244 }
245
246 static void
247 r2d_setup_common(struct tu_cmd_buffer *cmd,
248 struct tu_cs *cs,
249 VkFormat vk_format,
250 enum a6xx_rotation rotation,
251 bool clear,
252 uint8_t mask,
253 bool scissor)
254 {
255 enum a6xx_format format = tu6_base_format(vk_format);
256 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
257 uint32_t unknown_8c01 = 0;
258
259 if (format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8) {
260 /* preserve depth channels */
261 if (mask == 0x8)
262 unknown_8c01 = 0x00084001;
263 /* preserve stencil channel */
264 if (mask == 0x7)
265 unknown_8c01 = 0x08000041;
266 }
267
268 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
269 tu_cs_emit(cs, unknown_8c01);
270
271 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
272 .scissor = scissor,
273 .rotate = rotation,
274 .solid_color = clear,
275 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
276 .color_format = format,
277 .mask = 0xf,
278 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
279 ).value;
280
281 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
282 tu_cs_emit(cs, blit_cntl);
283
284 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
285 tu_cs_emit(cs, blit_cntl);
286
287 if (format == FMT6_10_10_10_2_UNORM_DEST)
288 format = FMT6_16_16_16_16_FLOAT;
289
290 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
291 .sint = vk_format_is_sint(vk_format),
292 .uint = vk_format_is_uint(vk_format),
293 .color_format = format,
294 .srgb = vk_format_is_srgb(vk_format),
295 .mask = 0xf));
296 }
297
298 static void
299 r2d_setup(struct tu_cmd_buffer *cmd,
300 struct tu_cs *cs,
301 VkFormat vk_format,
302 enum a6xx_rotation rotation,
303 bool clear,
304 uint8_t mask)
305 {
306 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
307
308 r2d_setup_common(cmd, cs, vk_format, rotation, clear, mask, false);
309 }
310
311 static void
312 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
313 {
314 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
315 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
316 }
317
318 /* r3d_ = shader path operations */
319
320 static void
321 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
322 bool layered_clear)
323 {
324 struct ir3_shader dummy_shader = {};
325
326 struct ir3_shader_variant vs = {
327 .type = MESA_SHADER_VERTEX,
328 .instrlen = 1,
329 .constlen = 2,
330 .info.max_reg = 1,
331 .inputs_count = 1,
332 .inputs[0] = {
333 .slot = SYSTEM_VALUE_VERTEX_ID,
334 .regid = regid(0, 3),
335 .sysval = true,
336 },
337 .outputs_count = blit ? 2 : 1,
338 .outputs[0] = {
339 .slot = VARYING_SLOT_POS,
340 .regid = regid(0, 0),
341 },
342 .outputs[1] = {
343 .slot = VARYING_SLOT_VAR0,
344 .regid = regid(1, 0),
345 },
346 .shader = &dummy_shader,
347 };
348 if (layered_clear) {
349 vs = (struct ir3_shader_variant) {
350 .type = MESA_SHADER_VERTEX,
351 .instrlen = 1,
352 .info.max_reg = 0,
353 .shader = &dummy_shader,
354 };
355 }
356
357 struct ir3_shader_variant fs = {
358 .type = MESA_SHADER_FRAGMENT,
359 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
360 .constlen = num_rts,
361 .info.max_reg = MAX2(num_rts, 1) - 1,
362 .total_in = blit ? 2 : 0,
363 .num_samp = blit ? 1 : 0,
364 .inputs_count = blit ? 2 : 0,
365 .inputs[0] = {
366 .slot = VARYING_SLOT_VAR0,
367 .inloc = 0,
368 .compmask = 3,
369 .bary = true,
370 },
371 .inputs[1] = {
372 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
373 .regid = regid(0, 0),
374 .sysval = 1,
375 },
376 .num_sampler_prefetch = blit ? 1 : 0,
377 .sampler_prefetch[0] = {
378 .src = 0,
379 .wrmask = 0xf,
380 .cmd = 4,
381 },
382 .shader = &dummy_shader,
383 };
384
385 struct ir3_shader_variant gs_shader = {
386 .type = MESA_SHADER_GEOMETRY,
387 .instrlen = 1,
388 .constlen = 2,
389 .info.max_reg = 1,
390 .inputs_count = 1,
391 .inputs[0] = {
392 .slot = SYSTEM_VALUE_GS_HEADER_IR3,
393 .regid = regid(0, 0),
394 .sysval = true,
395 },
396 .outputs_count = 3,
397 .outputs[0] = {
398 .slot = VARYING_SLOT_POS,
399 .regid = regid(0, 0),
400 },
401 .outputs[1] = {
402 .slot = VARYING_SLOT_LAYER,
403 .regid = regid(1, 1),
404 },
405 .outputs[2] = {
406 .slot = VARYING_SLOT_GS_VERTEX_FLAGS_IR3,
407 .regid = regid(1, 0),
408 },
409 .shader = &dummy_shader,
410 }, *gs = layered_clear ? &gs_shader : NULL;
411
412
413 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_F32, .dst_type = TYPE_F32, args } }
414 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
415 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
416
417 static const instr_t vs_code[] = {
418 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
419 * r1.xy = r0.w ? c1.zw : c0.zw
420 * r0.w = 1.0f
421 */
422 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
423 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
424 .src2 = 3,
425 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
426 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
427 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
428 .src2 = 3,
429 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
430 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
431 { .cat0 = { .opc = OPC_END } },
432 };
433
434 static const instr_t vs_layered[] = {
435 { .cat0 = { .opc = OPC_CHMASK } },
436 { .cat0 = { .opc = OPC_CHSH } },
437 };
438
439 static const instr_t gs_code[16] = {
440 /* (sy)(ss)(nop3)shr.b r0.w, r0.x, 16 (extract local_id) */
441 CAT2(OPC_SHR_B, .dst = 3, .src1 = 0, .src2_im = 1, .src2 = 16,
442 .src1_r = 1, .src2_r = 1, .ss = 1, .sync = 1),
443 /* x = (local_id & 1) ? c1.x : c0.x */
444 CAT2(OPC_AND_B, .dst = 0, .src1 = 3, .src2_im = 1, .src2 = 1),
445 /* y = (local_id & 2) ? c1.y : c0.y */
446 CAT2(OPC_AND_B, .dst = 1, .src1 = 3, .src2_im = 1, .src2 = 2),
447 /* pred = (local_id >= 4), used by OPC_KILL */
448 CAT2(OPC_CMPS_S, .dst = REG_P0 * 4, .cond = IR3_COND_GE, .src1 = 3, .src2_im = 1, .src2 = 4),
449 /* vertex_flags_out = (local_id == 0) ? 4 : 0 - first vertex flag */
450 CAT2(OPC_CMPS_S, .dst = 4, .cond = IR3_COND_EQ, .src1 = 3, .src2_im = 1, .src2 = 0),
451
452 MOV(.dst = 2, .src_c = 1, .src = 2), /* depth clear value from c0.z */
453 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f),
454 MOV(.dst = 5, .src_c = 1, .src = 3), /* layer id from c0.w */
455
456 /* (rpt1)sel.b32 r0.x, (r)c1.x, (r)r0.x, (r)c0.x */
457 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 0,
458 .c1 = {.src1_c = 1, .src1 = 4, .dummy = 4}, .src1_r = 1,
459 .src2 = 0,
460 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
461
462 CAT2(OPC_SHL_B, .dst = 4, .src1 = 4, .src2_im = 1, .src2 = 2),
463
464 { .cat0 = { .opc = OPC_KILL } },
465 { .cat0 = { .opc = OPC_END, .ss = 1, .sync = 1 } },
466 };
467 #define FS_OFFSET (16 * sizeof(instr_t))
468 #define GS_OFFSET (32 * sizeof(instr_t))
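/* A layout sketch of the allocation below, assuming instr_t is the 64-bit
 * encoded ir3 instruction (8 bytes): the shader memory is used as three
 * 16-instruction (128-byte) regions,
 *
 *    shaders.iova + 0x000   vertex shader (vs_code or vs_layered)
 *    shaders.iova + 0x080   fragment shader, generated below (FS_OFFSET)
 *    shaders.iova + 0x100   geometry shader, layered clears only (GS_OFFSET)
 *
 * which is why the generated fragment shader must stay within 16 instructions.
 */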
469
470 /* shaders */
471 struct ts_cs_memory shaders = { };
472 VkResult result = tu_cs_alloc(&cmd->sub_cs, 2 + layered_clear,
473 16 * sizeof(instr_t), &shaders);
474 assert(result == VK_SUCCESS);
475
476 if (layered_clear) {
477 memcpy(shaders.map, vs_layered, sizeof(vs_layered));
478 memcpy((uint8_t*) shaders.map + GS_OFFSET, gs_code, sizeof(gs_code));
479 } else {
480 memcpy(shaders.map, vs_code, sizeof(vs_code));
481 }
482
483 instr_t *fs_code = (instr_t*) ((uint8_t*) shaders.map + FS_OFFSET);
484 for (uint32_t i = 0; i < num_rts; i++) {
485 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
486 *fs_code++ = (instr_t) { .cat1 = {
487 .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32,
488 .repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4
489 } };
490 }
491
492 /* "bary.f (ei)r63.x, 0, r0.x" - note the blob doesn't have this in its
493 * blit path (it's not clear what allows it to omit it)
494 */
495 if (blit) {
496 *fs_code++ = (instr_t) { .cat2 = {
497 .opc_cat = 2, .opc = OPC_BARY_F & 63, .ei = 1, .full = 1,
498 .dst = regid(63, 0), .src1_im = 1
499 } };
500 }
501 *fs_code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
502 /* note: assumed <= 16 instructions (MAX_RTS is 8) */
503
504 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
505
506 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, shaders.iova);
507 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
508 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
509 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, gs, shaders.iova + GS_OFFSET);
510 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, shaders.iova + FS_OFFSET);
511
512 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
513 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
514
515 tu6_emit_vpc(cs, &vs, gs, &fs, NULL);
516
517 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
518 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
519 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
520
521 tu6_emit_fs_inputs(cs, &fs);
522
523 tu_cs_emit_regs(cs,
524 A6XX_GRAS_CL_CNTL(
525 .persp_division_disable = 1,
526 .vp_xform_disable = 1,
527 .vp_clip_code_ignore = 1,
528 .clip_disable = 1),
529 A6XX_GRAS_UNKNOWN_8001(0));
530 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
531
532 tu_cs_emit_regs(cs,
533 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
534 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
535 tu_cs_emit_regs(cs,
536 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
537 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
538
539 tu_cs_emit_regs(cs,
540 A6XX_VFD_INDEX_OFFSET(),
541 A6XX_VFD_INSTANCE_START_OFFSET());
542 }
543
544 static void
545 r3d_coords_raw(struct tu_cs *cs, bool gs, const float *coords)
546 {
547 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
548 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
549 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
550 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
551 CP_LOAD_STATE6_0_STATE_BLOCK(gs ? SB6_GS_SHADER : SB6_VS_SHADER) |
552 CP_LOAD_STATE6_0_NUM_UNIT(2));
553 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
554 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
555 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
556 }
557
558 static void
559 r3d_coords(struct tu_cs *cs,
560 const VkOffset2D *dst,
561 const VkOffset2D *src,
562 const VkExtent2D *extent)
563 {
564 int32_t src_x1 = src ? src->x : 0;
565 int32_t src_y1 = src ? src->y : 0;
566 r3d_coords_raw(cs, false, (float[]) {
567 dst->x, dst->y,
568 src_x1, src_y1,
569 dst->x + extent->width, dst->y + extent->height,
570 src_x1 + extent->width, src_y1 + extent->height,
571 });
572 }
573
574 static void
575 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
576 {
577 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
578 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
579 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
580 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
581 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
582 CP_LOAD_STATE6_0_NUM_UNIT(1));
583 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
584 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
585 switch (format) {
586 case VK_FORMAT_X8_D24_UNORM_PACK32:
587 case VK_FORMAT_D24_UNORM_S8_UINT: {
588 /* cleared as r8g8b8a8_unorm using special format */
589 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
590 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
591 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
592 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
593 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
594 } break;
595 case VK_FORMAT_D16_UNORM:
596 case VK_FORMAT_D32_SFLOAT:
597 tu_cs_emit(cs, fui(val->depthStencil.depth));
598 tu_cs_emit(cs, 0);
599 tu_cs_emit(cs, 0);
600 tu_cs_emit(cs, 0);
601 break;
602 case VK_FORMAT_S8_UINT:
603 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
604 tu_cs_emit(cs, 0);
605 tu_cs_emit(cs, 0);
606 tu_cs_emit(cs, 0);
607 break;
608 default:
609 /* as color formats use clear value as-is */
610 assert(!vk_format_is_depth_or_stencil(format));
611 tu_cs_emit_array(cs, val->color.uint32, 4);
612 break;
613 }
614 }
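/* For illustration, the same kind of clear through the shader path: with
 * VK_FORMAT_D24_UNORM_S8_UINT, depth = 0.5f and stencil = 0xff, tmp above is
 * 0x800000, so the four FS constants emitted are fui(0 / 255.0f),
 * fui(0 / 255.0f), fui(128 / 255.0f) and fui(255 / 255.0f).
 */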
615
616 static void
617 r3d_src_common(struct tu_cmd_buffer *cmd,
618 struct tu_cs *cs,
619 const uint32_t *tex_const,
620 uint32_t offset_base,
621 uint32_t offset_ubwc,
622 VkFilter filter)
623 {
624 struct ts_cs_memory texture = { };
625 VkResult result = tu_cs_alloc(&cmd->sub_cs,
626 2, /* allocate space for a sampler too */
627 A6XX_TEX_CONST_DWORDS, &texture);
628 assert(result == VK_SUCCESS);
629
630 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
631
632 /* patch addresses for layer offset */
633 *(uint64_t*) (texture.map + 4) += offset_base;
634 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
635 texture.map[7] = ubwc_addr;
636 texture.map[8] = ubwc_addr >> 32;
637
638 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
639 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
640 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
641 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
642 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
643 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
644 0x60000; /* XXX used by blob, doesn't seem necessary */
645 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
646 0x1 | /* XXX used by blob, doesn't seem necessary */
647 A6XX_TEX_SAMP_1_UNNORM_COORDS |
648 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
649 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
650 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
651
652 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
653 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
654 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
655 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
656 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
657 CP_LOAD_STATE6_0_NUM_UNIT(1));
658 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
659
660 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
661 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
662
663 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
664 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
665 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
666 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
667 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
668 CP_LOAD_STATE6_0_NUM_UNIT(1));
669 tu_cs_emit_qw(cs, texture.iova);
670
671 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
672 tu_cs_emit_qw(cs, texture.iova);
673
674 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
675 }
676
677 static void
678 r3d_src(struct tu_cmd_buffer *cmd,
679 struct tu_cs *cs,
680 const struct tu_image_view *iview,
681 uint32_t layer,
682 VkFilter filter)
683 {
684 r3d_src_common(cmd, cs, iview->descriptor,
685 iview->layer_size * layer,
686 iview->ubwc_layer_size * layer,
687 filter);
688 }
689
690 static void
691 r3d_src_buffer(struct tu_cmd_buffer *cmd,
692 struct tu_cs *cs,
693 VkFormat vk_format,
694 uint64_t va, uint32_t pitch,
695 uint32_t width, uint32_t height)
696 {
697 uint32_t desc[A6XX_TEX_CONST_DWORDS];
698
699 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
700
701 desc[0] =
702 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
703 A6XX_TEX_CONST_0_FMT(format.fmt) |
704 A6XX_TEX_CONST_0_SWAP(format.swap) |
705 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
706 // XXX replicate R into .yzw so the value lands in .w for stencil buffer_to_image
707 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
708 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
709 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
710 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
711 desc[2] =
712 A6XX_TEX_CONST_2_PITCH(pitch) |
713 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
714 desc[3] = 0;
715 desc[4] = va;
716 desc[5] = va >> 32;
717 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
718 desc[i] = 0;
719
720 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
721 }
722
723 static void
724 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
725 {
726 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
727
728 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
729 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
730 tu_cs_image_ref(cs, iview, layer);
731 tu_cs_emit(cs, 0);
732
733 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
734 tu_cs_image_flag_ref(cs, iview, layer);
735
736 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
737 }
738
739 static void
740 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
741 {
742 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
743
744 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
745
746 tu_cs_emit_regs(cs,
747 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
748 A6XX_RB_MRT_PITCH(0, pitch),
749 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
750 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
751 A6XX_RB_MRT_BASE_HI(0, va >> 32),
752 A6XX_RB_MRT_BASE_GMEM(0, 0));
753
754 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
755 }
756
757 static void
758 r3d_setup(struct tu_cmd_buffer *cmd,
759 struct tu_cs *cs,
760 VkFormat vk_format,
761 enum a6xx_rotation rotation,
762 bool clear,
763 uint8_t mask)
764 {
765 if (!cmd->state.pass) {
766 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
767 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
768 }
769
770 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
771 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
772
773 r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
774
775 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
776 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
777 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
778 0xfc000000);
779 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
780
781 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
782 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
783
784 tu_cs_emit_regs(cs,
785 A6XX_RB_FS_OUTPUT_CNTL0(),
786 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
787
788 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
789 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
790 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
791
792 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
793 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
794 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
795 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
796 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
797 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
798 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
799
800 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
801 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
802
803 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
804 .color_format = tu6_base_format(vk_format),
805 .color_sint = vk_format_is_sint(vk_format),
806 .color_uint = vk_format_is_uint(vk_format)));
807
808 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = mask));
809 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
810 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
811 }
812
813 static void
814 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
815 {
816 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
817 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
818 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
819 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
820 tu_cs_emit(cs, 1); /* instance count */
821 tu_cs_emit(cs, 2); /* vertex count */
822 }
823
824 /* blit ops - common interface for 2d/shader paths */
825
826 struct blit_ops {
827 void (*coords)(struct tu_cs *cs,
828 const VkOffset2D *dst,
829 const VkOffset2D *src,
830 const VkExtent2D *extent);
831 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
832 void (*src)(
833 struct tu_cmd_buffer *cmd,
834 struct tu_cs *cs,
835 const struct tu_image_view *iview,
836 uint32_t layer,
837 VkFilter filter);
838 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
839 VkFormat vk_format,
840 uint64_t va, uint32_t pitch,
841 uint32_t width, uint32_t height);
842 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
843 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
844 void (*setup)(struct tu_cmd_buffer *cmd,
845 struct tu_cs *cs,
846 VkFormat vk_format,
847 enum a6xx_rotation rotation,
848 bool clear,
849 uint8_t mask);
850 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
851 };
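/* A minimal usage sketch of this interface (the views, offsets and layer count
 * are hypothetical), mirroring how the copy/clear helpers below drive it:
 *
 *    ops->setup(cmd, cs, VK_FORMAT_R8G8B8A8_UNORM, ROTATE_0, false, 0xf);
 *    ops->coords(cs, &dst_offset, &src_offset, &extent);
 *    for (uint32_t i = 0; i < layers; i++) {
 *       ops->src(cmd, cs, &src_view, i, VK_FILTER_NEAREST);
 *       ops->dst(cs, &dst_view, i);
 *       ops->run(cmd, cs);
 *    }
 */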
852
853 static const struct blit_ops r2d_ops = {
854 .coords = r2d_coords,
855 .clear_value = r2d_clear_value,
856 .src = r2d_src,
857 .src_buffer = r2d_src_buffer,
858 .dst = r2d_dst,
859 .dst_buffer = r2d_dst_buffer,
860 .setup = r2d_setup,
861 .run = r2d_run,
862 };
863
864 static const struct blit_ops r3d_ops = {
865 .coords = r3d_coords,
866 .clear_value = r3d_clear_value,
867 .src = r3d_src,
868 .src_buffer = r3d_src_buffer,
869 .dst = r3d_dst,
870 .dst_buffer = r3d_dst_buffer,
871 .setup = r3d_setup,
872 .run = r3d_run,
873 };
874
875 /* passthrough set coords from 3D extents */
876 static void
877 coords(const struct blit_ops *ops,
878 struct tu_cs *cs,
879 const VkOffset3D *dst,
880 const VkOffset3D *src,
881 const VkExtent3D *extent)
882 {
883 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
884 }
885
886 static void
887 tu_image_view_blit2(struct tu_image_view *iview,
888 struct tu_image *image,
889 VkFormat format,
890 const VkImageSubresourceLayers *subres,
891 uint32_t layer,
892 bool stencil_read)
893 {
894 VkImageAspectFlags aspect_mask = subres->aspectMask;
895
896 /* always use the AS_R8G8B8A8 format for these */
897 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
898 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
899 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
900 }
901
902 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
903 .image = tu_image_to_handle(image),
904 .viewType = VK_IMAGE_VIEW_TYPE_2D,
905 .format = format,
906 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
907 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
908 .subresourceRange = {
909 .aspectMask = aspect_mask,
910 .baseMipLevel = subres->mipLevel,
911 .levelCount = 1,
912 .baseArrayLayer = subres->baseArrayLayer + layer,
913 .layerCount = 1,
914 },
915 });
916 }
917
918 static void
919 tu_image_view_blit(struct tu_image_view *iview,
920 struct tu_image *image,
921 const VkImageSubresourceLayers *subres,
922 uint32_t layer)
923 {
924 tu_image_view_blit2(iview, image, image->vk_format, subres, layer, false);
925 }
926
927 static void
928 tu6_blit_image(struct tu_cmd_buffer *cmd,
929 struct tu_image *src_image,
930 struct tu_image *dst_image,
931 const VkImageBlit *info,
932 VkFilter filter)
933 {
934 const struct blit_ops *ops = &r2d_ops;
935 struct tu_cs *cs = &cmd->cs;
936 uint32_t layers;
937
938 /* 2D blit can't do mirroring from just coordinates; it needs the rotate/flip field */
939 static const enum a6xx_rotation rotate[2][2] = {
940 {ROTATE_0, ROTATE_HFLIP},
941 {ROTATE_VFLIP, ROTATE_180},
942 };
943
944 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
945 (info->dstOffsets[1].x < info->dstOffsets[0].x);
946 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
947 (info->dstOffsets[1].y < info->dstOffsets[0].y);
948 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
949 (info->dstOffsets[1].z < info->dstOffsets[0].z);
950
951 if (mirror_z) {
952 tu_finishme("blit z mirror\n");
953 return;
954 }
955
956 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
957 info->dstOffsets[1].z - info->dstOffsets[0].z) {
958 tu_finishme("blit z filter\n");
959 return;
960 }
961
962 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
963 if (info->dstSubresource.layerCount > 1) {
964 assert(layers <= 1);
965 layers = info->dstSubresource.layerCount;
966 }
967
968 uint8_t mask = 0xf;
969 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
970 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
971 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
972 mask = 0x7;
973 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
974 mask = 0x8;
975 }
976
977 /* BC1_RGB_* formats need to have their last component overridden with 1
978 * when sampling, which is normally handled with the texture descriptor
979 * swizzle. The 2d path can't handle that, so use the 3d path.
980 *
981 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
982 * the 2d path.
983 */
984
985 if (dst_image->samples > 1 ||
986 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
987 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
988 filter == VK_FILTER_CUBIC_EXT)
989 ops = &r3d_ops;
990
991 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
992 * figure out why (should be able to pass all tests with only shader path)
993 */
994
995 ops->setup(cmd, cs, dst_image->vk_format, rotate[mirror_y][mirror_x], false, mask);
996
997 if (ops == &r3d_ops) {
998 r3d_coords_raw(cs, false, (float[]) {
999 info->dstOffsets[0].x, info->dstOffsets[0].y,
1000 info->srcOffsets[0].x, info->srcOffsets[0].y,
1001 info->dstOffsets[1].x, info->dstOffsets[1].y,
1002 info->srcOffsets[1].x, info->srcOffsets[1].y
1003 });
1004 } else {
1005 tu_cs_emit_regs(cs,
1006 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1007 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1008 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1009 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1010 tu_cs_emit_regs(cs,
1011 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1012 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1013 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1014 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1015 }
1016
1017 struct tu_image_view dst, src;
1018 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1019 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1020
1021 for (uint32_t i = 0; i < layers; i++) {
1022 ops->dst(cs, &dst, i);
1023 ops->src(cmd, cs, &src, i, filter);
1024 ops->run(cmd, cs);
1025 }
1026 }
1027
1028 void
1029 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1030 VkImage srcImage,
1031 VkImageLayout srcImageLayout,
1032 VkImage dstImage,
1033 VkImageLayout dstImageLayout,
1034 uint32_t regionCount,
1035 const VkImageBlit *pRegions,
1036 VkFilter filter)
1037
1038 {
1039 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1040 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1041 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1042
1043 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1044 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1045
1046 for (uint32_t i = 0; i < regionCount; ++i)
1047 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1048 }
1049
1050 static VkFormat
1051 copy_format(VkFormat format)
1052 {
1053 switch (vk_format_get_blocksize(format)) {
1054 case 1: return VK_FORMAT_R8_UINT;
1055 case 2: return VK_FORMAT_R16_UINT;
1056 case 4: return VK_FORMAT_R32_UINT;
1057 case 8: return VK_FORMAT_R32G32_UINT;
1058 case 12:return VK_FORMAT_R32G32B32_UINT;
1059 case 16:return VK_FORMAT_R32G32B32A32_UINT;
1060 default:
1061 unreachable("unhandled format size");
1062 }
1063 }
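/* An example of the mapping above: BC1 blocks are 8 bytes, so a BC1 image is
 * copied as VK_FORMAT_R32G32_UINT, and 16-byte blocks (BC7, ASTC) are copied
 * as VK_FORMAT_R32G32B32A32_UINT; the copy then moves one block per "texel".
 */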
1064
1065 static void
1066 copy_compressed(VkFormat format,
1067 VkOffset3D *offset,
1068 VkExtent3D *extent,
1069 uint32_t *width,
1070 uint32_t *height)
1071 {
1072 if (!vk_format_is_compressed(format))
1073 return;
1074
1075 uint32_t block_width = vk_format_get_blockwidth(format);
1076 uint32_t block_height = vk_format_get_blockheight(format);
1077
1078 offset->x /= block_width;
1079 offset->y /= block_height;
1080
1081 if (extent) {
1082 extent->width = DIV_ROUND_UP(extent->width, block_width);
1083 extent->height = DIV_ROUND_UP(extent->height, block_height);
1084 }
1085 if (width)
1086 *width = DIV_ROUND_UP(*width, block_width);
1087 if (height)
1088 *height = DIV_ROUND_UP(*height, block_height);
1089 }
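/* Worked example: for a BC1 image (4x4 texel blocks), an imageOffset of (8, 4)
 * becomes block coordinates (2, 1) and a 10x6 texel extent rounds up to 3x2
 * blocks; combined with copy_format() above, the copy becomes a plain
 * R32G32_UINT copy in block units.
 */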
1090
1091 static void
1092 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1093 struct tu_buffer *src_buffer,
1094 struct tu_image *dst_image,
1095 const VkBufferImageCopy *info)
1096 {
1097 struct tu_cs *cs = &cmd->cs;
1098 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1099 VkFormat dst_format = dst_image->vk_format;
1100 VkFormat src_format = dst_image->vk_format;
1101 const struct blit_ops *ops = &r2d_ops;
1102
1103 uint8_t mask = 0xf;
1104
1105 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1106 switch (info->imageSubresource.aspectMask) {
1107 case VK_IMAGE_ASPECT_STENCIL_BIT:
1108 src_format = VK_FORMAT_R8_UNORM; /* changes how src buffer is interpreted */
1109 mask = 0x8;
1110 ops = &r3d_ops;
1111 break;
1112 case VK_IMAGE_ASPECT_DEPTH_BIT:
1113 mask = 0x7;
1114 break;
1115 }
1116 }
1117
1118 VkOffset3D offset = info->imageOffset;
1119 VkExtent3D extent = info->imageExtent;
1120 uint32_t src_width = info->bufferRowLength ?: extent.width;
1121 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1122
1123 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(src_format)) {
1124 assert(src_format == dst_format);
1125 copy_compressed(dst_format, &offset, &extent, &src_width, &src_height);
1126 src_format = dst_format = copy_format(dst_format);
1127 }
1128
1129 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1130 uint32_t layer_size = src_height * pitch;
1131
1132 /* note: the 64-byte src_va/pitch alignment requirement is for the 2D engine;
1133 * it also holds for 1cpp formats with the shader path (stencil aspect path)
1134 */
1135
1136 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1137
1138 struct tu_image_view dst;
1139 tu_image_view_blit2(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false);
1140
1141 for (uint32_t i = 0; i < layers; i++) {
1142 ops->dst(cs, &dst, i);
1143
1144 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1145 if ((src_va & 63) || (pitch & 63)) {
1146 for (uint32_t y = 0; y < extent.height; y++) {
1147 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1148 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1149 x + extent.width, 1);
1150 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1151 &(VkExtent2D) {extent.width, 1});
1152 ops->run(cmd, cs);
1153 src_va += pitch;
1154 }
1155 } else {
1156 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1157 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1158 ops->run(cmd, cs);
1159 }
1160 }
1161 }
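/* A worked example of the unaligned fallback above (values are hypothetical):
 * with a 4-byte format and a source address of base + 0x20, src_va & 63 = 0x20
 * and x = 8, so each destination row is blitted from the 64-byte-aligned
 * address (src_va & ~63) with the source rectangle starting at texel x = 8,
 * one row at a time, advancing src_va by the pitch between rows.
 */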
1162
1163 void
1164 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1165 VkBuffer srcBuffer,
1166 VkImage dstImage,
1167 VkImageLayout dstImageLayout,
1168 uint32_t regionCount,
1169 const VkBufferImageCopy *pRegions)
1170 {
1171 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1172 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1173 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1174
1175 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1176 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1177
1178 for (unsigned i = 0; i < regionCount; ++i)
1179 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1180 }
1181
1182 static void
1183 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1184 struct tu_image *src_image,
1185 struct tu_buffer *dst_buffer,
1186 const VkBufferImageCopy *info)
1187 {
1188 struct tu_cs *cs = &cmd->cs;
1189 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1190 VkFormat src_format = src_image->vk_format;
1191 VkFormat dst_format = src_image->vk_format;
1192 bool stencil_read = false;
1193
1194 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1195 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1196 dst_format = VK_FORMAT_R8_UNORM;
1197 stencil_read = true;
1198 }
1199
1200 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1201 VkOffset3D offset = info->imageOffset;
1202 VkExtent3D extent = info->imageExtent;
1203 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1204 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1205
1206 if (dst_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || vk_format_is_compressed(dst_format)) {
1207 assert(src_format == dst_format);
1208 copy_compressed(dst_format, &offset, &extent, &dst_width, &dst_height);
1209 src_format = dst_format = copy_format(dst_format);
1210 }
1211
1212 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1213 uint32_t layer_size = pitch * dst_height;
1214
1215 /* note: the 64-byte dst_va/pitch alignment requirement is for the 2D engine;
1216 * it also holds for 1cpp formats with the shader path (stencil aspect)
1217 */
1218
1219 ops->setup(cmd, cs, dst_format, ROTATE_0, false, 0xf);
1220
1221 struct tu_image_view src;
1222 tu_image_view_blit2(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read);
1223
1224 for (uint32_t i = 0; i < layers; i++) {
1225 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1226
1227 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1228 if ((dst_va & 63) || (pitch & 63)) {
1229 for (uint32_t y = 0; y < extent.height; y++) {
1230 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1231 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1232 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1233 &(VkExtent2D) {extent.width, 1});
1234 ops->run(cmd, cs);
1235 dst_va += pitch;
1236 }
1237 } else {
1238 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1239 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1240 ops->run(cmd, cs);
1241 }
1242 }
1243 }
1244
1245 void
1246 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1247 VkImage srcImage,
1248 VkImageLayout srcImageLayout,
1249 VkBuffer dstBuffer,
1250 uint32_t regionCount,
1251 const VkBufferImageCopy *pRegions)
1252 {
1253 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1254 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1255 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1256
1257 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1258 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1259
1260 for (unsigned i = 0; i < regionCount; ++i)
1261 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1262 }
1263
1264 /* Tiled formats don't support swapping, which means that we can't support
1265 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1266 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1267 * Currently we fake support for tiled swapped formats and use the unswapped
1268 * format instead, but this means that reinterpreting copies to and from
1269 * swapped formats can't be performed correctly unless we can swizzle the
1270 * components by reinterpreting the other image as the "correct" swapped
1271 * format, i.e. only when the other image is linear.
1272 */
1273
1274 static bool
1275 is_swapped_format(VkFormat format)
1276 {
1277 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1278 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1279 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1280 }
1281
1282 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1283 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1284 * versa). This should mirror the logic in fdl6_layout.
1285 */
1286 static bool
1287 image_is_r8g8(struct tu_image *image)
1288 {
1289 return image->layout.cpp == 2 &&
1290 vk_format_get_nr_components(image->vk_format) == 2;
1291 }
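/* An example of when the staging path below is needed: copying between a tiled
 * VK_FORMAT_R8G8_UNORM image and a tiled VK_FORMAT_R16_UNORM image would
 * reinterpret between cpp=2 layouts with different tiling, so the data is
 * bounced through a linear staging image; the same applies when either format
 * needs a non-WZYX swap or when both images are UBWC-compressed.
 */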
1292
1293 static void
1294 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1295 struct tu_image *src_image,
1296 struct tu_image *dst_image,
1297 const VkImageCopy *info)
1298 {
1299 const struct blit_ops *ops = &r2d_ops;
1300 struct tu_cs *cs = &cmd->cs;
1301
1302 uint8_t mask = 0xf;
1303 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1304 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1305 mask = 0x7;
1306 if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1307 mask = 0x8;
1308 }
1309
1310 if (dst_image->samples > 1)
1311 ops = &r3d_ops;
1312
1313 assert(info->srcSubresource.aspectMask == info->dstSubresource.aspectMask);
1314
1315 VkFormat format = VK_FORMAT_UNDEFINED;
1316 VkOffset3D src_offset = info->srcOffset;
1317 VkOffset3D dst_offset = info->dstOffset;
1318 VkExtent3D extent = info->extent;
1319
1320 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1321 * Images":
1322 *
1323 * When copying between compressed and uncompressed formats the extent
1324 * members represent the texel dimensions of the source image and not
1325 * the destination. When copying from a compressed image to an
1326 * uncompressed image the image texel dimensions written to the
1327 * uncompressed image will be source extent divided by the compressed
1328 * texel block dimensions. When copying from an uncompressed image to a
1329 * compressed image the image texel dimensions written to the compressed
1330 * image will be the source extent multiplied by the compressed texel
1331 * block dimensions.
1332 *
1333 * This means we only have to adjust the extent if the source image is
1334 * compressed.
1335 */
1336 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1337 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1338
1339 VkFormat dst_format = vk_format_is_compressed(dst_image->vk_format) ?
1340 copy_format(dst_image->vk_format) : dst_image->vk_format;
1341 VkFormat src_format = vk_format_is_compressed(src_image->vk_format) ?
1342 copy_format(src_image->vk_format) : src_image->vk_format;
1343
1344 bool use_staging_blit = false;
1345
1346 if (src_format == dst_format) {
1347 /* Images that share a format can always be copied directly because it's
1348 * the same as a blit.
1349 */
1350 format = src_format;
1351 } else if (!src_image->layout.tile_mode) {
1352 /* If an image is linear, we can always safely reinterpret it with the
1353 * other image's format and then do a regular blit.
1354 */
1355 format = dst_format;
1356 } else if (!dst_image->layout.tile_mode) {
1357 format = src_format;
1358 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1359 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1360 * due to the different tile layout.
1361 */
1362 use_staging_blit = true;
1363 } else if (is_swapped_format(src_format) ||
1364 is_swapped_format(dst_format)) {
1365 /* If either format has a non-identity swap, then we can't copy
1366 * to/from it.
1367 */
1368 use_staging_blit = true;
1369 } else if (!src_image->layout.ubwc) {
1370 format = dst_format;
1371 } else if (!dst_image->layout.ubwc) {
1372 format = src_format;
1373 } else {
1374 /* Both formats use UBWC and so neither can be reinterpreted.
1375 * TODO: We could do an in-place decompression of the dst instead.
1376 */
1377 use_staging_blit = true;
1378 }
1379
1380 struct tu_image_view dst, src;
1381
1382 if (use_staging_blit) {
1383 tu_image_view_blit2(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1384 tu_image_view_blit2(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1385
1386 struct tu_image staging_image = {
1387 .vk_format = src_format,
1388 .type = src_image->type,
1389 .tiling = VK_IMAGE_TILING_LINEAR,
1390 .extent = extent,
1391 .level_count = 1,
1392 .layer_count = info->srcSubresource.layerCount,
1393 .samples = src_image->samples,
1394 .bo_offset = 0,
1395 };
1396
1397 VkImageSubresourceLayers staging_subresource = {
1398 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1399 .mipLevel = 0,
1400 .baseArrayLayer = 0,
1401 .layerCount = info->srcSubresource.layerCount,
1402 };
1403
1404 VkOffset3D staging_offset = { 0 };
1405
1406 staging_image.layout.tile_mode = TILE6_LINEAR;
1407 staging_image.layout.ubwc = false;
1408
1409 fdl6_layout(&staging_image.layout,
1410 vk_format_to_pipe_format(staging_image.vk_format),
1411 staging_image.samples,
1412 staging_image.extent.width,
1413 staging_image.extent.height,
1414 staging_image.extent.depth,
1415 staging_image.level_count,
1416 staging_image.layer_count,
1417 staging_image.type == VK_IMAGE_TYPE_3D,
1418 NULL);
1419
1420 VkResult result = tu_get_scratch_bo(cmd->device,
1421 staging_image.layout.size,
1422 &staging_image.bo);
1423 if (result != VK_SUCCESS) {
1424 cmd->record_result = result;
1425 return;
1426 }
1427
1428 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1429 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1430
1431 struct tu_image_view staging;
1432 tu_image_view_blit2(&staging, &staging_image, src_format,
1433 &staging_subresource, 0, false);
1434
1435 ops->setup(cmd, cs, src_format, ROTATE_0, false, mask);
1436 coords(ops, cs, &staging_offset, &src_offset, &extent);
1437
1438 for (uint32_t i = 0; i < info->extent.depth; i++) {
1439 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1440 ops->dst(cs, &staging, i);
1441 ops->run(cmd, cs);
1442 }
1443
1444 /* When executed by the user there has to be a pipeline barrier here,
1445 * but since we're doing it manually we'll have to flush ourselves.
1446 */
1447 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1448 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1449
1450 tu_image_view_blit2(&staging, &staging_image, dst_format,
1451 &staging_subresource, 0, false);
1452
1453 ops->setup(cmd, cs, dst_format, ROTATE_0, false, mask);
1454 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1455
1456 for (uint32_t i = 0; i < info->extent.depth; i++) {
1457 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1458 ops->dst(cs, &dst, i);
1459 ops->run(cmd, cs);
1460 }
1461 } else {
1462 tu_image_view_blit2(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1463 tu_image_view_blit2(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1464
1465 ops->setup(cmd, cs, format, ROTATE_0, false, mask);
1466 coords(ops, cs, &dst_offset, &src_offset, &extent);
1467
1468 for (uint32_t i = 0; i < info->extent.depth; i++) {
1469 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1470 ops->dst(cs, &dst, i);
1471 ops->run(cmd, cs);
1472 }
1473 }
1474 }
1475
1476 void
1477 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1478 VkImage srcImage,
1479 VkImageLayout srcImageLayout,
1480 VkImage destImage,
1481 VkImageLayout destImageLayout,
1482 uint32_t regionCount,
1483 const VkImageCopy *pRegions)
1484 {
1485 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1486 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1487 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1488
1489 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1490 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1491
1492 for (uint32_t i = 0; i < regionCount; ++i)
1493 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1494 }
1495
1496 static void
1497 copy_buffer(struct tu_cmd_buffer *cmd,
1498 uint64_t dst_va,
1499 uint64_t src_va,
1500 uint64_t size,
1501 uint32_t block_size)
1502 {
1503 const struct blit_ops *ops = &r2d_ops;
1504 struct tu_cs *cs = &cmd->cs;
1505 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1506 uint64_t blocks = size / block_size;
1507
1508 ops->setup(cmd, cs, format, ROTATE_0, false, 0xf);
1509
1510 while (blocks) {
1511 uint32_t src_x = (src_va & 63) / block_size;
1512 uint32_t dst_x = (dst_va & 63) / block_size;
1513 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1514
1515 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1516 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1517 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1518 ops->run(cmd, cs);
1519
1520 src_va += width * block_size;
1521 dst_va += width * block_size;
1522 blocks -= width;
1523 }
1524 }
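/* On the loop above: src_x/dst_x re-express the low six address bits as an x
 * offset in blocks so that both addresses handed to the blitter stay 64-byte
 * aligned, and each pass is capped at 0x4000 blocks (assumed here to be the
 * widest single BLIT_OP_SCALE). For example, with block_size = 1 and
 * src_va = base + 0x21, src_x = 33 and the first pass copies at most
 * 0x4000 - 33 blocks before the loop advances.
 */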
1525
1526 void
1527 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1528 VkBuffer srcBuffer,
1529 VkBuffer dstBuffer,
1530 uint32_t regionCount,
1531 const VkBufferCopy *pRegions)
1532 {
1533 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1534 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1535 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1536
1537 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1538 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1539
1540 for (unsigned i = 0; i < regionCount; ++i) {
1541 copy_buffer(cmd,
1542 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1543 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1544 pRegions[i].size, 1);
1545 }
1546 }
1547
1548 void
1549 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1550 VkBuffer dstBuffer,
1551 VkDeviceSize dstOffset,
1552 VkDeviceSize dataSize,
1553 const void *pData)
1554 {
1555 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1556 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1557
1558 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1559
1560 struct ts_cs_memory tmp;
1561 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1562 if (result != VK_SUCCESS) {
1563 cmd->record_result = result;
1564 return;
1565 }
1566
1567 memcpy(tmp.map, pData, dataSize);
1568 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1569 }
1570
1571 void
1572 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1573 VkBuffer dstBuffer,
1574 VkDeviceSize dstOffset,
1575 VkDeviceSize fillSize,
1576 uint32_t data)
1577 {
1578 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1579 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1580 const struct blit_ops *ops = &r2d_ops;
1581 struct tu_cs *cs = &cmd->cs;
1582
1583 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1584
1585 if (fillSize == VK_WHOLE_SIZE)
1586 fillSize = buffer->size - dstOffset;
1587
1588 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1589 uint32_t blocks = fillSize / 4;
1590
1591 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, ROTATE_0, true, 0xf);
1592 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1593
1594 while (blocks) {
1595 uint32_t dst_x = (dst_va & 63) / 4;
1596 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1597
1598 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1599 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1600 ops->run(cmd, cs);
1601
1602 dst_va += width * 4;
1603 blocks -= width;
1604 }
1605 }
1606
1607 void
1608 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1609 VkImage srcImage,
1610 VkImageLayout srcImageLayout,
1611 VkImage dstImage,
1612 VkImageLayout dstImageLayout,
1613 uint32_t regionCount,
1614 const VkImageResolve *pRegions)
1615 {
1616 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1617 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1618 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1619 const struct blit_ops *ops = &r2d_ops;
1620 struct tu_cs *cs = &cmd->cs;
1621
1622 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1623 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1624
1625 ops->setup(cmd, cs, dst_image->vk_format, ROTATE_0, false, 0xf);
1626
1627 for (uint32_t i = 0; i < regionCount; ++i) {
1628 const VkImageResolve *info = &pRegions[i];
1629 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1630
1631 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1632 /* TODO: aspect masks possible? */
1633
1634 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1635
1636 struct tu_image_view dst, src;
1637 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1638 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1639
1640 for (uint32_t i = 0; i < layers; i++) {
1641 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1642 ops->dst(cs, &dst, i);
1643 ops->run(cmd, cs);
1644 }
1645 }
1646 }
1647
1648 void
1649 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1650 struct tu_cs *cs,
1651 struct tu_image_view *src,
1652 struct tu_image_view *dst,
1653 uint32_t layers,
1654 const VkRect2D *rect)
1655 {
1656 const struct blit_ops *ops = &r2d_ops;
1657
1658 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1659 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1660
1661 assert(src->image->vk_format == dst->image->vk_format);
1662
1663 ops->setup(cmd, cs, dst->image->vk_format, ROTATE_0, false, 0xf);
1664 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1665
1666 for (uint32_t i = 0; i < layers; i++) {
1667 ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1668 ops->dst(cs, dst, i);
1669 ops->run(cmd, cs);
1670 }
1671 }
1672
1673 static void
1674 clear_image(struct tu_cmd_buffer *cmd,
1675 struct tu_image *image,
1676 const VkClearValue *clear_value,
1677 const VkImageSubresourceRange *range)
1678 {
1679 uint32_t level_count = tu_get_levelCount(image, range);
1680 uint32_t layer_count = tu_get_layerCount(image, range);
1681 struct tu_cs *cs = &cmd->cs;
1682 VkFormat format = image->vk_format;
1683 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1684 format = VK_FORMAT_R32_UINT;
1685
1686 if (image->type == VK_IMAGE_TYPE_3D) {
1687 assert(layer_count == 1);
1688 assert(range->baseArrayLayer == 0);
1689 }
1690
1691 uint8_t mask = 0xf;
1692 if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1693 mask = 0;
1694 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
1695 mask |= 0x7;
1696 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
1697 mask |= 0x8;
1698 }
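   /* D24S8 is cleared through the depth-as-color (AS_R8G8B8A8) format, where
    * the 24 depth bits occupy the RGB components and stencil occupies A, so
    * 0x7 selects depth and 0x8 selects stencil.
    */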
1699
1700 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1701
1702 ops->setup(cmd, cs, format, ROTATE_0, true, mask);
1703 ops->clear_value(cs, image->vk_format, clear_value);
1704
1705 for (unsigned j = 0; j < level_count; j++) {
1706 if (image->type == VK_IMAGE_TYPE_3D)
1707 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1708
1709 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1710 u_minify(image->extent.width, range->baseMipLevel + j),
1711 u_minify(image->extent.height, range->baseMipLevel + j)
1712 });
1713
1714 struct tu_image_view dst;
1715 tu_image_view_blit2(&dst, image, format, &(VkImageSubresourceLayers) {
1716 .aspectMask = range->aspectMask,
1717 .mipLevel = range->baseMipLevel + j,
1718 .baseArrayLayer = range->baseArrayLayer,
1719 .layerCount = 1,
1720 }, 0, false);
1721
1722 for (uint32_t i = 0; i < layer_count; i++) {
1723 ops->dst(cs, &dst, i);
1724 ops->run(cmd, cs);
1725 }
1726 }
1727 }
1728
1729 void
1730 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1731 VkImage image_h,
1732 VkImageLayout imageLayout,
1733 const VkClearColorValue *pColor,
1734 uint32_t rangeCount,
1735 const VkImageSubresourceRange *pRanges)
1736 {
1737 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1738 TU_FROM_HANDLE(tu_image, image, image_h);
1739
1740 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1741
1742 for (unsigned i = 0; i < rangeCount; i++)
1743 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1744 }
1745
1746 void
1747 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1748 VkImage image_h,
1749 VkImageLayout imageLayout,
1750 const VkClearDepthStencilValue *pDepthStencil,
1751 uint32_t rangeCount,
1752 const VkImageSubresourceRange *pRanges)
1753 {
1754 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1755 TU_FROM_HANDLE(tu_image, image, image_h);
1756
1757 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1758
1759 for (unsigned i = 0; i < rangeCount; i++)
1760 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1761 }
1762
1763 static void
1764 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1765 uint32_t attachment_count,
1766 const VkClearAttachment *attachments,
1767 uint32_t rect_count,
1768 const VkClearRect *rects)
1769 {
1770 const struct tu_subpass *subpass = cmd->state.subpass;
1771    /* note: cannot use the shader path here; the special shader path
1772     * lives in tu_clear_sysmem_attachments()
1773     */
1774 const struct blit_ops *ops = &r2d_ops;
1775 struct tu_cs *cs = &cmd->draw_cs;
1776
1777 for (uint32_t j = 0; j < attachment_count; j++) {
1778 /* The vulkan spec, section 17.2 "Clearing Images Inside a Render
1779 * Pass Instance" says that:
1780 *
1781 * Unlike other clear commands, vkCmdClearAttachments executes as
1782 * a drawing command, rather than a transfer command, with writes
1783 * performed by it executing in rasterization order. Clears to
1784 * color attachments are executed as color attachment writes, by
1785 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1786 * Clears to depth/stencil attachments are executed as depth
1787 * writes and writes by the
1788 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1789 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1790 *
1791 * However, the 2d path here is executed the same way as a
1792 * transfer command, using the CCU color cache exclusively with
1793 * a special depth-as-color format for depth clears. This means that
1794 * we can't rely on the normal pipeline barrier mechanism here, and
1795 * have to manually flush whenever using a different cache domain
1796 * from what the 3d path would've used. This happens when we clear
1797 * depth/stencil, since normally depth attachments use CCU depth, but
1798 * we clear it using a special depth-as-color format. Since the clear
1799 * potentially uses a different attachment state we also need to
1800 * invalidate color beforehand and flush it afterwards.
1801 */
1802
1803 uint32_t a;
1804 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1805 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1806 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1807 } else {
1808 a = subpass->depth_stencil_attachment.attachment;
1809 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1810 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1811 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1812 }
1813
1814 if (a == VK_ATTACHMENT_UNUSED)
1815 continue;
1816
1817 uint8_t mask = 0xf;
1818 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
1819 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
1820 mask &= ~0x7;
1821 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
1822 mask &= ~0x8;
1823 }
1824
1825 const struct tu_image_view *iview =
1826 cmd->state.framebuffer->attachments[a].attachment;
1827
1828 ops->setup(cmd, cs, iview->image->vk_format, ROTATE_0, true, mask);
1829 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1830
1831 /* Wait for the flushes we triggered manually to complete */
1832 tu_cs_emit_wfi(cs);
1833
1834 for (uint32_t i = 0; i < rect_count; i++) {
1835 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1836 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1837 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1838 ops->run(cmd, cs);
1839 }
1840 }
1841
1842 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1843 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1844 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1845 } else {
1846 /* sync color into depth */
1847 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1848 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
1849 }
1850 }
1851 }
1852
1853 static void
1854 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1855 uint32_t attachment_count,
1856 const VkClearAttachment *attachments,
1857 uint32_t rect_count,
1858 const VkClearRect *rects)
1859 {
1860    /* the shader path here is special; it avoids changing MRT/etc state */
1861 const struct tu_render_pass *pass = cmd->state.pass;
1862 const struct tu_subpass *subpass = cmd->state.subpass;
1863 const uint32_t mrt_count = subpass->color_count;
1864 struct tu_cs *cs = &cmd->draw_cs;
1865 uint32_t clear_value[MAX_RTS][4];
1866 float z_clear_val = 0.0f;
1867 uint8_t s_clear_val = 0;
1868 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1869 bool z_clear = false;
1870 bool s_clear = false;
1871 bool layered_clear = false;
1872 uint32_t max_samples = 1;
1873
1874 for (uint32_t i = 0; i < attachment_count; i++) {
1875 uint32_t a;
1876 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1877 uint32_t c = attachments[i].colorAttachment;
1878 a = subpass->color_attachments[c].attachment;
1879 if (a == VK_ATTACHMENT_UNUSED)
1880 continue;
1881
1882 clear_rts |= 1 << c;
1883 clear_components |= 0xf << (c * 4);
1884 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1885 } else {
1886 a = subpass->depth_stencil_attachment.attachment;
1887 if (a == VK_ATTACHMENT_UNUSED)
1888 continue;
1889
1890 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1891 z_clear = true;
1892 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1893 }
1894
1895 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1896 s_clear = true;
1897 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1898 }
1899 }
1900
1901 max_samples = MAX2(max_samples, pass->attachments[a].samples);
1902 }
1903
1904    /* Prefer to use the 2D path for clears.
1905     * The 2D path can't clear separate depth/stencil or MSAA, and it needs a known framebuffer.
1906     */
1907 if (max_samples == 1 && cmd->state.framebuffer) {
1908 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
1909 return;
1910 }
1911
1912 /* This clear path behaves like a draw, needs the same flush as tu_draw */
1913 tu_emit_cache_flush_renderpass(cmd, cs);
1914
1915    /* Disable all draw states so they don't interfere.
1916     * TODO: use and re-use draw states for this path.
1917     * We have to disable the draw states individually to preserve the
1918     * input attachment states, because a secondary command buffer
1919     * won't be able to restore them.
1920     */
1921 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
1922 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
1923 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
1924 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
1925 continue;
1926 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
1927 CP_SET_DRAW_STATE__0_DISABLE);
1928 tu_cs_emit_qw(cs, 0);
1929 }
1930 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1931
1932 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1933 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1934 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1935 0xfc000000);
1936 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1937
1938 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1939 for (uint32_t i = 0; i < mrt_count; i++) {
1940 if (clear_rts & (1 << i))
1941 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1942 else
1943 tu_cs_emit(cs, 0);
1944 }
1945
1946 for (uint32_t i = 0; i < rect_count; i++) {
1947 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
1948 layered_clear = true;
1949 }
1950
1951 r3d_common(cmd, cs, false, num_rts, layered_clear);
1952
1953 tu_cs_emit_regs(cs,
1954 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
1955 tu_cs_emit_regs(cs,
1956 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
1957
1958 tu_cs_emit_regs(cs,
1959 A6XX_RB_FS_OUTPUT_CNTL0(),
1960 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1961
1962 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1963 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1964 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1965 for (uint32_t i = 0; i < mrt_count; i++) {
1966 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1967 .component_enable = COND(clear_rts & (1 << i), 0xf)));
1968 }
1969
1970 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1971 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1972 .z_enable = z_clear,
1973 .z_write_enable = z_clear,
1974 .zfunc = FUNC_ALWAYS));
1975 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1976 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1977 .stencil_enable = s_clear,
1978 .func = FUNC_ALWAYS,
1979 .zpass = STENCIL_REPLACE));
1980 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1981 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1982 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1983
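   /* Upload one vec4 constant per cleared RT, in bit order of clear_rts,
    * matching the compacted output registers assigned via SP_FS_OUTPUT_REG
    * above; the clear fragment shader presumably just copies each constant
    * to its render target output.
    */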
1984 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1985 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1986 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1987 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1988 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1989 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1990 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1991 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1992 for_each_bit(b, clear_rts)
1993 tu_cs_emit_array(cs, clear_value[b], 4);
1994
1995 for (uint32_t i = 0; i < rect_count; i++) {
1996 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1997 r3d_coords_raw(cs, layered_clear, (float[]) {
1998 rects[i].rect.offset.x, rects[i].rect.offset.y,
1999 z_clear_val, uif(rects[i].baseArrayLayer + layer),
2000 rects[i].rect.offset.x + rects[i].rect.extent.width,
2001 rects[i].rect.offset.y + rects[i].rect.extent.height,
2002 z_clear_val, 1.0f,
2003 });
2004
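         /* Layered clears draw a single point with the GS enabled; the GS set
          * up by r3d_common() is expected to expand it into a full-rect
          * primitive on the layer passed through the raw coords above
          * (uif(rects[i].baseArrayLayer + layer)).
          */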
2005 if (layered_clear) {
2006 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
2007 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_POINTLIST) |
2008 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
2009 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) |
2010 CP_DRAW_INDX_OFFSET_0_GS_ENABLE);
2011 tu_cs_emit(cs, 1); /* instance count */
2012 tu_cs_emit(cs, 1); /* vertex count */
2013 } else {
2014 r3d_run(cmd, cs);
2015 }
2016 }
2017 }
2018 }
2019
2020 static void
2021 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
2022 {
2023 enum pipe_format pformat = vk_format_to_pipe_format(format);
2024
2025 switch (format) {
2026 case VK_FORMAT_X8_D24_UNORM_PACK32:
2027 case VK_FORMAT_D24_UNORM_S8_UINT:
2028 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
2029 val->depthStencil.stencil << 24;
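      /* For example, depth = 1.0 and stencil = 0x80 pack to
       * 0xffffff | (0x80 << 24) = 0x80ffffff.
       */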
2030 return;
2031 case VK_FORMAT_D16_UNORM:
2032 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
2033 return;
2034 case VK_FORMAT_D32_SFLOAT:
2035 clear_value[0] = fui(val->depthStencil.depth);
2036 return;
2037 case VK_FORMAT_S8_UINT:
2038 clear_value[0] = val->depthStencil.stencil;
2039 return;
2040    /* These formats use a different base format when tiled;
2041     * the same format can be used for both because GMEM is always in WZYX order.
2042     */
2043 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2044 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2045 pformat = PIPE_FORMAT_B5G5R5A1_UNORM;
2046 default:
2047 break;
2048 }
2049
2050 VkClearColorValue color;
2051
2052    /*
2053     * GMEM is tiled and wants the components in WZYX order; apply the
2054     * swizzle to the color before packing, to counteract the deswizzling
2055     * applied by the packing functions.
2056     */
2057 pipe_swizzle_4f(color.float32, val->color.float32,
2058 util_format_description(pformat)->swizzle);
2059
2060 util_format_pack_rgba(pformat, clear_value, color.uint32, 1);
2061 }
2062
2063 static void
2064 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2065 struct tu_cs *cs,
2066 uint32_t attachment,
2067 uint8_t component_mask,
2068 const VkClearValue *value)
2069 {
2070 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2071 /* note: component_mask is 0x7 for depth and 0x8 for stencil
2072 * because D24S8 is cleared with AS_R8G8B8A8 format
2073 */
2074
2075 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2076 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2077
2078 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
2079 tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(component_mask));
2080
2081 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2082 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2083
2084 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2085 tu_cs_emit(cs, 0);
2086
2087 uint32_t clear_vals[4] = {};
2088 pack_gmem_clear_value(value, vk_format, clear_vals);
2089
2090 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2091 tu_cs_emit_array(cs, clear_vals, 4);
2092
2093 tu6_emit_event_write(cmd, cs, BLIT);
2094 }
2095
2096 static void
2097 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2098 uint32_t attachment_count,
2099 const VkClearAttachment *attachments,
2100 uint32_t rect_count,
2101 const VkClearRect *rects)
2102 {
2103 const struct tu_subpass *subpass = cmd->state.subpass;
2104 struct tu_cs *cs = &cmd->draw_cs;
2105
2106 /* TODO: swap the loops for smaller cmdstream */
2107 for (unsigned i = 0; i < rect_count; i++) {
2108 unsigned x1 = rects[i].rect.offset.x;
2109 unsigned y1 = rects[i].rect.offset.y;
2110 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2111 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2112
2113 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2114 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2115 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2116
2117 for (unsigned j = 0; j < attachment_count; j++) {
2118 uint32_t a;
2119 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2120 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2121 else
2122 a = subpass->depth_stencil_attachment.attachment;
2123
2124 if (a == VK_ATTACHMENT_UNUSED)
2125 continue;
2126
2127 unsigned clear_mask = 0xf;
2128 if (cmd->state.pass->attachments[a].format == VK_FORMAT_D24_UNORM_S8_UINT) {
2129 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT))
2130 clear_mask &= ~0x7;
2131 if (!(attachments[j].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT))
2132 clear_mask &= ~0x8;
2133 }
2134
2135 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2136 &attachments[j].clearValue);
2137 }
2138 }
2139 }
2140
2141 void
2142 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2143 uint32_t attachmentCount,
2144 const VkClearAttachment *pAttachments,
2145 uint32_t rectCount,
2146 const VkClearRect *pRects)
2147 {
2148 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2149 struct tu_cs *cs = &cmd->draw_cs;
2150
2151 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2152 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2153 tu_cond_exec_end(cs);
2154
2155 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2156 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2157 tu_cond_exec_end(cs);
2158 }
2159
2160 void
2161 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2162 struct tu_cs *cs,
2163 uint32_t a,
2164 const VkRenderPassBeginInfo *info)
2165 {
2166 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2167 const struct tu_image_view *iview = fb->attachments[a].attachment;
2168 const struct tu_render_pass_attachment *attachment =
2169 &cmd->state.pass->attachments[a];
2170 uint8_t mask = 0;
2171
2172 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2173 mask = 0xf;
2174 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2175 mask |= 0x7;
2176 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2177 mask |= 0x8;
2178
2179 if (!mask)
2180 return;
2181
2182 const struct blit_ops *ops = &r2d_ops;
2183 if (attachment->samples > 1)
2184 ops = &r3d_ops;
2185
2186 ops->setup(cmd, cs, attachment->format, ROTATE_0, true, mask);
2187 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2188 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2189
2190 /* Wait for any flushes at the beginning of the renderpass to complete */
2191 tu_cs_emit_wfi(cs);
2192
2193 for (uint32_t i = 0; i < fb->layers; i++) {
2194 ops->dst(cs, iview, i);
2195 ops->run(cmd, cs);
2196 }
2197
2198 /* The spec doesn't explicitly say, but presumably the initial renderpass
2199 * clear is considered part of the renderpass, and therefore barriers
2200 * aren't required inside the subpass/renderpass. Therefore we need to
2201 * flush CCU color into CCU depth here, just like with
2202 * vkCmdClearAttachments(). Note that because this only happens at the
2203 * beginning of a renderpass, and renderpass writes are considered
2204 * "incoherent", we shouldn't have to worry about syncing depth into color
2205 * beforehand as depth should already be flushed.
2206 */
2207 if (vk_format_is_depth_or_stencil(attachment->format)) {
2208 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2209 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2210 } else {
2211 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2212 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2213 }
2214 }
2215
2216 void
2217 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2218 struct tu_cs *cs,
2219 uint32_t a,
2220 const VkRenderPassBeginInfo *info)
2221 {
2222 const struct tu_render_pass_attachment *attachment =
2223 &cmd->state.pass->attachments[a];
2224 unsigned clear_mask = 0;
2225
2226 if (attachment->clear_mask == VK_IMAGE_ASPECT_COLOR_BIT)
2227 clear_mask = 0xf;
2228 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2229 clear_mask |= 0x7;
2230 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2231 clear_mask |= 0x8;
2232
2233 if (!clear_mask)
2234 return;
2235
2236 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2237
2238 tu_emit_clear_gmem_attachment(cmd, cs, a, clear_mask,
2239 &info->pClearValues[a]);
2240 }
2241
2242 static void
2243 tu_emit_blit(struct tu_cmd_buffer *cmd,
2244 struct tu_cs *cs,
2245 const struct tu_image_view *iview,
2246 const struct tu_render_pass_attachment *attachment,
2247 bool resolve)
2248 {
2249 tu_cs_emit_regs(cs,
2250 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2251
2252 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2253 .unk0 = !resolve,
2254 .gmem = !resolve,
2255 /* "integer" bit disables msaa resolve averaging */
2256 .integer = vk_format_is_int(attachment->format)));
2257
2258 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2259 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2260 tu_cs_image_ref_2d(cs, iview, 0, false);
2261
2262 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2263 tu_cs_image_flag_ref(cs, iview, 0);
2264
2265 tu_cs_emit_regs(cs,
2266 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2267
2268 tu6_emit_event_write(cmd, cs, BLIT);
2269 }
2270
2271 static bool
2272 blit_can_resolve(VkFormat format)
2273 {
2274 const struct util_format_description *desc = vk_format_description(format);
2275
2276 /* blit event can only do resolve for simple cases:
2277 * averaging samples as unsigned integers or choosing only one sample
2278 */
2279 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2280 return false;
2281
2282    /* can't do formats with channel sizes larger than 10 bits
2283     * note: this includes all float formats
2284     * note2: single-channel integer formats seem OK
2285     */
2286 if (desc->channel[0].size > 10)
2287 return false;
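   /* e.g. 8-bit and 10-bit normalized/integer formats can generally use the
    * blit event, while 16/32-bit and float formats fall back to the 2D path.
    */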
2288
2289 switch (format) {
2290    /* For unknown reasons the blit event can't MSAA-resolve these formats when
2291     * tiled, likely because they have a different layout from other cpp=2 formats.
2292     */
2293 case VK_FORMAT_R8G8_UNORM:
2294 case VK_FORMAT_R8G8_UINT:
2295 case VK_FORMAT_R8G8_SINT:
2296 /* TODO: this one should be able to work? */
2297 case VK_FORMAT_D24_UNORM_S8_UINT:
2298 return false;
2299 default:
2300 break;
2301 }
2302
2303 return true;
2304 }
2305
2306 void
2307 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2308 struct tu_cs *cs,
2309 uint32_t a,
2310 bool force_load)
2311 {
2312 const struct tu_image_view *iview =
2313 cmd->state.framebuffer->attachments[a].attachment;
2314 const struct tu_render_pass_attachment *attachment =
2315 &cmd->state.pass->attachments[a];
2316
2317 if (attachment->load || force_load)
2318 tu_emit_blit(cmd, cs, iview, attachment, false);
2319 }
2320
2321 void
2322 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2323 struct tu_cs *cs,
2324 uint32_t a,
2325 uint32_t gmem_a)
2326 {
2327 const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
2328 const VkRect2D *render_area = &tiling->render_area;
2329 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2330 struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2331 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2332
2333 if (!dst->store)
2334 return;
2335
2336 uint32_t x1 = render_area->offset.x;
2337 uint32_t y1 = render_area->offset.y;
2338 uint32_t x2 = x1 + render_area->extent.width;
2339 uint32_t y2 = y1 + render_area->extent.height;
2340    /* x2/y2 can be unaligned if equal to the size of the image, since the
2341     * blit will write into padding space.
2342     * The one exception is linear levels, which don't have the required
2343     * y padding in the layout (except for the last level).
2344     */
2345 bool need_y2_align =
2346 y2 != iview->extent.height || iview->need_y2_align;
2347
2348 bool unaligned =
2349 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2350 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2351
2352 /* use fast path when render area is aligned, except for unsupported resolve cases */
2353 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2354 tu_emit_blit(cmd, cs, iview, src, true);
2355 return;
2356 }
2357
2358 if (dst->samples > 1) {
2359 /* I guess we need to use shader path in this case?
2360 * need a testcase which fails because of this
2361 */
2362 tu_finishme("unaligned store of msaa attachment\n");
2363 return;
2364 }
2365
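   /* Fallback: store with the 2D engine (CP_BLIT), binding GMEM directly as a
    * tiled (TILE6_2) source through the SP_PS_2D_SRC registers below, with the
    * source pitch set to one GMEM tile row (tile0 width * cpp).
    */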
2366 r2d_setup_common(cmd, cs, dst->format, ROTATE_0, false, 0xf, true);
2367 r2d_dst(cs, iview, 0);
2368 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2369
2370 tu_cs_emit_regs(cs,
2371 A6XX_SP_PS_2D_SRC_INFO(
2372 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2373 .tile_mode = TILE6_2,
2374 .srgb = vk_format_is_srgb(src->format),
2375 .samples = tu_msaa_samples(src->samples),
2376 .samples_average = !vk_format_is_int(src->format),
2377 .unk20 = 1,
2378 .unk22 = 1),
2379 /* note: src size does not matter when not scaling */
2380 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2381 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2382 A6XX_SP_PS_2D_SRC_HI(),
2383 A6XX_SP_PS_2D_SRC_PITCH(.pitch = tiling->tile0.extent.width * src->cpp));
2384
2385 /* sync GMEM writes with CACHE. */
2386 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2387
2388 /* Wait for CACHE_INVALIDATE to land */
2389 tu_cs_emit_wfi(cs);
2390
2391 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2392 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2393
2394 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2395 * sysmem, and we generally assume that GMEM renderpasses leave their
2396 * results in sysmem, so we need to flush manually here.
2397 */
2398 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2399 }