mesa.git: src/freedreno/vulkan/tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 static uint32_t
20 tu_pack_float32_for_unorm(float val, int bits)
21 {
22 return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
23 }
24
25 /* r2d_ = BLIT_OP_SCALE operations */
26
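/* Map a hardware color format to the 2D engine's internal sample format
 * (ifmt), which decides how clear values are packed below and which ifmt is
 * programmed into RB_2D_BLIT_CNTL.
 */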
27 static enum a6xx_2d_ifmt
28 format_to_ifmt(enum a6xx_format fmt)
29 {
30 switch (fmt) {
31 case FMT6_A8_UNORM:
32 case FMT6_8_UNORM:
33 case FMT6_8_SNORM:
34 case FMT6_8_8_UNORM:
35 case FMT6_8_8_SNORM:
36 case FMT6_8_8_8_8_UNORM:
37 case FMT6_8_8_8_X8_UNORM:
38 case FMT6_8_8_8_8_SNORM:
39 case FMT6_4_4_4_4_UNORM:
40 case FMT6_5_5_5_1_UNORM:
41 case FMT6_5_6_5_UNORM:
42 case FMT6_Z24_UNORM_S8_UINT:
43 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
44 return R2D_UNORM8;
45
46 case FMT6_32_UINT:
47 case FMT6_32_SINT:
48 case FMT6_32_32_UINT:
49 case FMT6_32_32_SINT:
50 case FMT6_32_32_32_32_UINT:
51 case FMT6_32_32_32_32_SINT:
52 return R2D_INT32;
53
54 case FMT6_16_UINT:
55 case FMT6_16_SINT:
56 case FMT6_16_16_UINT:
57 case FMT6_16_16_SINT:
58 case FMT6_16_16_16_16_UINT:
59 case FMT6_16_16_16_16_SINT:
60 case FMT6_10_10_10_2_UINT:
61 return R2D_INT16;
62
63 case FMT6_8_UINT:
64 case FMT6_8_SINT:
65 case FMT6_8_8_UINT:
66 case FMT6_8_8_SINT:
67 case FMT6_8_8_8_8_UINT:
68 case FMT6_8_8_8_8_SINT:
69 return R2D_INT8;
70
71 case FMT6_16_UNORM:
72 case FMT6_16_SNORM:
73 case FMT6_16_16_UNORM:
74 case FMT6_16_16_SNORM:
75 case FMT6_16_16_16_16_UNORM:
76 case FMT6_16_16_16_16_SNORM:
77 case FMT6_32_FLOAT:
78 case FMT6_32_32_FLOAT:
79 case FMT6_32_32_32_32_FLOAT:
80 return R2D_FLOAT32;
81
82 case FMT6_16_FLOAT:
83 case FMT6_16_16_FLOAT:
84 case FMT6_16_16_16_16_FLOAT:
85 case FMT6_11_11_10_FLOAT:
86 case FMT6_10_10_10_2_UNORM:
87 case FMT6_10_10_10_2_UNORM_DEST:
88 return R2D_FLOAT16;
89
90 default:
91 unreachable("bad format");
92 return 0;
93 }
94 }
95
96 static void
97 r2d_coords(struct tu_cs *cs,
98 const VkOffset2D *dst,
99 const VkOffset2D *src,
100 const VkExtent2D *extent)
101 {
102 tu_cs_emit_regs(cs,
103 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
104 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
105
106 if (!src)
107 return;
108
109 tu_cs_emit_regs(cs,
110 A6XX_GRAS_2D_SRC_TL_X(.x = src->x),
111 A6XX_GRAS_2D_SRC_BR_X(.x = src->x + extent->width - 1),
112 A6XX_GRAS_2D_SRC_TL_Y(.y = src->y),
113 A6XX_GRAS_2D_SRC_BR_Y(.y = src->y + extent->height - 1));
114 }
115
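/* Pack the clear value into RB_2D_SRC_SOLID_C0..C3 according to the format's
 * ifmt (unorm8 / float16 / float32 / intN), with special cases for
 * depth/stencil formats and E5B9G9R9.
 */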
116 static void
117 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
118 {
119 uint32_t clear_value[4] = {};
120
121 switch (format) {
122 case VK_FORMAT_X8_D24_UNORM_PACK32:
123 case VK_FORMAT_D24_UNORM_S8_UINT:
124 /* cleared as r8g8b8a8_unorm using special format */
125 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
126 clear_value[1] = clear_value[0] >> 8;
127 clear_value[2] = clear_value[0] >> 16;
128 clear_value[3] = val->depthStencil.stencil;
129 break;
130 case VK_FORMAT_D16_UNORM:
131 case VK_FORMAT_D32_SFLOAT:
132 /* R2D_FLOAT32 */
133 clear_value[0] = fui(val->depthStencil.depth);
134 break;
135 case VK_FORMAT_S8_UINT:
136 clear_value[0] = val->depthStencil.stencil;
137 break;
138 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
139 /* cleared as UINT32 */
140 clear_value[0] = float3_to_rgb9e5(val->color.float32);
141 break;
142 default:
143 assert(!vk_format_is_depth_or_stencil(format));
144 const struct util_format_description *desc = vk_format_description(format);
145 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
146
147 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
148 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
149
150 for (unsigned i = 0; i < desc->nr_channels; i++) {
151 const struct util_format_channel_description *ch = &desc->channel[i];
152 if (ifmt == R2D_UNORM8) {
153 float linear = val->color.float32[i];
154 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
155 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
156
157 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
158 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
159 else
160 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
161 } else if (ifmt == R2D_FLOAT16) {
162 clear_value[i] = util_float_to_half(val->color.float32[i]);
163 } else {
164 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
165 ifmt == R2D_INT16 || ifmt == R2D_INT8);
166 clear_value[i] = val->color.uint32[i];
167 }
168 }
169 break;
170 }
171
172 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
173 tu_cs_emit_array(cs, clear_value, 4);
174 }
175
176 static void
177 r2d_src(struct tu_cmd_buffer *cmd,
178 struct tu_cs *cs,
179 const struct tu_image_view *iview,
180 uint32_t layer,
181 VkFilter filter)
182 {
183 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
184 if (filter != VK_FILTER_NEAREST)
185 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
186
187 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
188 tu_cs_emit(cs, src_info);
189 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
190 tu_cs_image_ref_2d(cs, iview, layer, true);
191
192 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
193 tu_cs_image_flag_ref(cs, iview, layer);
194 }
195
196 static void
197 r2d_src_buffer(struct tu_cmd_buffer *cmd,
198 struct tu_cs *cs,
199 VkFormat vk_format,
200 uint64_t va, uint32_t pitch,
201 uint32_t width, uint32_t height)
202 {
203 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
204
205 tu_cs_emit_regs(cs,
206 A6XX_SP_PS_2D_SRC_INFO(
207 .color_format = format.fmt,
208 .color_swap = format.swap,
209 .srgb = vk_format_is_srgb(vk_format),
210 .unk20 = 1,
211 .unk22 = 1),
212 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
213 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
214 A6XX_SP_PS_2D_SRC_HI(va >> 32),
215 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
216 }
217
218 static void
219 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
220 {
221 assert(iview->image->samples == 1);
222
223 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
224 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
225 tu_cs_image_ref_2d(cs, iview, layer, false);
226
227 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
228 tu_cs_image_flag_ref(cs, iview, layer);
229 }
230
231 static void
232 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
233 {
234 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
235
236 tu_cs_emit_regs(cs,
237 A6XX_RB_2D_DST_INFO(
238 .color_format = format.fmt,
239 .color_swap = format.swap,
240 .srgb = vk_format_is_srgb(vk_format)),
241 A6XX_RB_2D_DST_LO((uint32_t) va),
242 A6XX_RB_2D_DST_HI(va >> 32),
243 A6XX_RB_2D_DST_SIZE(.pitch = pitch));
244 }
245
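/* Common 2D-path setup: pick the D24S8 write mask for depth-only or
 * stencil-only access, and program RB_2D_BLIT_CNTL/GRAS_2D_BLIT_CNTL and
 * SP_2D_SRC_FORMAT for the destination format.
 */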
246 static void
247 r2d_setup_common(struct tu_cmd_buffer *cmd,
248 struct tu_cs *cs,
249 VkFormat vk_format,
250 VkImageAspectFlags aspect_mask,
251 enum a6xx_rotation rotation,
252 bool clear,
253 bool scissor)
254 {
255 enum a6xx_format format = tu6_base_format(vk_format);
256 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
257 uint32_t unknown_8c01 = 0;
258
259 /* note: the only format with partial clearing is D24S8 */
260 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
261 /* preserve stencil channel */
262 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
263 unknown_8c01 = 0x08000041;
264 /* preserve depth channels */
265 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
266 unknown_8c01 = 0x00084001;
267 }
268
269 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_8C01, 1);
270 tu_cs_emit(cs, unknown_8c01);
271
272 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
273 .scissor = scissor,
274 .rotate = rotation,
275 .solid_color = clear,
276 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
277 .color_format = format,
278 .mask = 0xf,
279 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
280 ).value;
281
282 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
283 tu_cs_emit(cs, blit_cntl);
284
285 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
286 tu_cs_emit(cs, blit_cntl);
287
288 if (format == FMT6_10_10_10_2_UNORM_DEST)
289 format = FMT6_16_16_16_16_FLOAT;
290
291 tu_cs_emit_regs(cs, A6XX_SP_2D_SRC_FORMAT(
292 .sint = vk_format_is_sint(vk_format),
293 .uint = vk_format_is_uint(vk_format),
294 .color_format = format,
295 .srgb = vk_format_is_srgb(vk_format),
296 .mask = 0xf));
297 }
298
299 static void
300 r2d_setup(struct tu_cmd_buffer *cmd,
301 struct tu_cs *cs,
302 VkFormat vk_format,
303 VkImageAspectFlags aspect_mask,
304 enum a6xx_rotation rotation,
305 bool clear)
306 {
307 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
308
309 r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, false);
310 }
311
312 static void
313 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
314 {
315 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
316 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
317 }
318
319 /* r3d_ = shader path operations */
320
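/* Hand-assembled ir3 shaders used by the 3D (shader) path: a blit VS, a
 * layered-clear VS+GS pair, a blit FS, and one clear FS per MRT count.
 * They are copied into the global BO so they can later be referenced by iova.
 */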
321 void
322 tu_init_clear_blit_shaders(struct tu6_global *global)
323 {
324 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
325 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
326 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
327
328 static const instr_t vs_code[] = {
329 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
330 * r1.xy = r0.w ? c1.zw : c0.zw
331 * r0.w = 1.0f
332 */
333 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
334 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
335 .src2 = 3,
336 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
337 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
338 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
339 .src2 = 3,
340 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
341 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
342 { .cat0 = { .opc = OPC_END } },
343 };
344
345 static const instr_t vs_layered[] = {
346 { .cat0 = { .opc = OPC_CHMASK } },
347 { .cat0 = { .opc = OPC_CHSH } },
348 };
349
350 static const instr_t gs_code[] = {
351 /* (sy)(ss)(nop3)shr.b r0.w, r0.x, 16 (extract local_id) */
352 CAT2(OPC_SHR_B, .dst = 3, .src1 = 0, .src2_im = 1, .src2 = 16,
353 .src1_r = 1, .src2_r = 1, .ss = 1, .sync = 1),
354 /* x = (local_id & 1) ? c1.x : c0.x */
355 CAT2(OPC_AND_B, .dst = 0, .src1 = 3, .src2_im = 1, .src2 = 1),
356 /* y = (local_id & 2) ? c1.y : c0.y */
357 CAT2(OPC_AND_B, .dst = 1, .src1 = 3, .src2_im = 1, .src2 = 2),
358 /* pred = (local_id >= 4), used by OPC_KILL */
359 CAT2(OPC_CMPS_S, .dst = REG_P0 * 4, .cond = IR3_COND_GE, .src1 = 3, .src2_im = 1, .src2 = 4),
360 /* vertex_flags_out = (local_id == 0) ? 4 : 0 - first vertex flag */
361 CAT2(OPC_CMPS_S, .dst = 4, .cond = IR3_COND_EQ, .src1 = 3, .src2_im = 1, .src2 = 0),
362
363 MOV(.dst = 2, .src_c = 1, .src = 2), /* depth clear value from c0.z */
364 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f),
365 MOV(.dst = 5, .src_c = 1, .src = 3), /* layer id from c0.w */
366
367 /* (rpt1)sel.b32 r0.x, (r)c1.x, (r)r0.x, (r)c0.x */
368 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 0,
369 .c1 = {.src1_c = 1, .src1 = 4, .dummy = 4}, .src1_r = 1,
370 .src2 = 0,
371 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
372
373 CAT2(OPC_SHL_B, .dst = 4, .src1 = 4, .src2_im = 1, .src2 = 2),
374
375 { .cat0 = { .opc = OPC_KILL } },
376 { .cat0 = { .opc = OPC_END, .ss = 1, .sync = 1 } },
377 };
378
379 static const instr_t fs_blit[] = {
380          /* "bary.f (ei)r63.x, 0, r0.x" - note the blob doesn't have this in its
381           * blit path (it's not clear what allows it to omit it)
382 */
383 CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
384 { .cat0 = { .opc = OPC_END } },
385 };
386
387 memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
388 memcpy(&global->shaders[GLOBAL_SH_VS_LAYER], vs_layered, sizeof(vs_layered));
389 memcpy(&global->shaders[GLOBAL_SH_GS_LAYER], gs_code, sizeof(gs_code));
390 memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));
391
392 for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
393 instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
394 for (uint32_t i = 0; i < num_rts; i++) {
395 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
396 *code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
397 }
398 *code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
399 }
400 }
401
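/* Emit the shader and fixed-function state shared by all 3D-path blits and
 * clears: the hand-assembled VS/FS (plus GS for layered clears) from the
 * global BO, with viewport transform and clipping disabled.
 */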
402 static void
403 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
404 bool layered_clear)
405 {
406 struct ir3_const_state dummy_const_state = {};
407 struct ir3_shader dummy_shader = {};
408
409 struct ir3_shader_variant vs = {
410 .type = MESA_SHADER_VERTEX,
411 .instrlen = 1,
412 .constlen = 4,
413 .info.max_reg = 1,
414 .inputs_count = 1,
415 .inputs[0] = {
416 .slot = SYSTEM_VALUE_VERTEX_ID,
417 .regid = regid(0, 3),
418 .sysval = true,
419 },
420 .outputs_count = blit ? 2 : 1,
421 .outputs[0] = {
422 .slot = VARYING_SLOT_POS,
423 .regid = regid(0, 0),
424 },
425 .outputs[1] = {
426 .slot = VARYING_SLOT_VAR0,
427 .regid = regid(1, 0),
428 },
429 .shader = &dummy_shader,
430 .const_state = &dummy_const_state,
431 };
432 if (layered_clear) {
433 vs = (struct ir3_shader_variant) {
434 .type = MESA_SHADER_VERTEX,
435 .instrlen = 1,
436 .info.max_reg = 0,
437 .shader = &dummy_shader,
438 .const_state = &dummy_const_state,
439 };
440 }
441
442 struct ir3_shader_variant fs = {
443 .type = MESA_SHADER_FRAGMENT,
444 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
445 .constlen = align(num_rts, 4),
446 .info.max_reg = MAX2(num_rts, 1) - 1,
447 .total_in = blit ? 2 : 0,
448 .num_samp = blit ? 1 : 0,
449 .inputs_count = blit ? 2 : 0,
450 .inputs[0] = {
451 .slot = VARYING_SLOT_VAR0,
452 .inloc = 0,
453 .compmask = 3,
454 .bary = true,
455 },
456 .inputs[1] = {
457 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
458 .regid = regid(0, 0),
459 .sysval = 1,
460 },
461 .num_sampler_prefetch = blit ? 1 : 0,
462 .sampler_prefetch[0] = {
463 .src = 0,
464 .wrmask = 0xf,
465 .cmd = 4,
466 },
467 .shader = &dummy_shader,
468 .const_state = &dummy_const_state,
469 };
470
471 struct ir3_shader_variant gs_shader = {
472 .type = MESA_SHADER_GEOMETRY,
473 .instrlen = 1,
474 .constlen = 4,
475 .info.max_reg = 1,
476 .inputs_count = 1,
477 .inputs[0] = {
478 .slot = SYSTEM_VALUE_GS_HEADER_IR3,
479 .regid = regid(0, 0),
480 .sysval = true,
481 },
482 .outputs_count = 3,
483 .outputs[0] = {
484 .slot = VARYING_SLOT_POS,
485 .regid = regid(0, 0),
486 },
487 .outputs[1] = {
488 .slot = VARYING_SLOT_LAYER,
489 .regid = regid(1, 1),
490 },
491 .outputs[2] = {
492 .slot = VARYING_SLOT_GS_VERTEX_FLAGS_IR3,
493 .regid = regid(1, 0),
494 },
495 .shader = &dummy_shader,
496 .const_state = &dummy_const_state,
497 }, *gs = layered_clear ? &gs_shader : NULL;
498
499 /* shaders */
500 tu_cs_emit_regs(cs, A6XX_HLSQ_UPDATE_CNTL(0x7ffff));
501
502 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs,
503 global_iova(cmd, shaders[gs ? GLOBAL_SH_VS_LAYER : GLOBAL_SH_VS]));
504 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
505 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
506 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, gs,
507 global_iova(cmd, shaders[GLOBAL_SH_GS_LAYER]));
508 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs,
509 global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)]));
510
511 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
512 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
513
514 tu6_emit_vpc(cs, &vs, NULL, NULL, gs, &fs);
515
516 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
517 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
518 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
519
520 tu6_emit_fs_inputs(cs, &fs);
521
522 tu_cs_emit_regs(cs,
523 A6XX_GRAS_CL_CNTL(
524 .persp_division_disable = 1,
525 .vp_xform_disable = 1,
526 .vp_clip_code_ignore = 1,
527 .clip_disable = 1),
528 A6XX_GRAS_UNKNOWN_8001(0));
529 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
530
531 tu_cs_emit_regs(cs,
532 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0(.x = 0, .y = 0),
533 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
534 tu_cs_emit_regs(cs,
535 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0(.x = 0, .y = 0),
536 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_0(.x = 0x7fff, .y = 0x7fff));
537
538 tu_cs_emit_regs(cs,
539 A6XX_VFD_INDEX_OFFSET(),
540 A6XX_VFD_INSTANCE_START_OFFSET());
541 }
542
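/* Upload 8 floats of rectangle coordinates as two vec4 constants (c0/c1)
 * for the VS (or the GS when doing layered clears).
 */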
543 static void
544 r3d_coords_raw(struct tu_cs *cs, bool gs, const float *coords)
545 {
546 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
547 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
548 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
549 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
550 CP_LOAD_STATE6_0_STATE_BLOCK(gs ? SB6_GS_SHADER : SB6_VS_SHADER) |
551 CP_LOAD_STATE6_0_NUM_UNIT(2));
552 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
553 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
554 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
555 }
556
557 static void
558 r3d_coords(struct tu_cs *cs,
559 const VkOffset2D *dst,
560 const VkOffset2D *src,
561 const VkExtent2D *extent)
562 {
563 int32_t src_x1 = src ? src->x : 0;
564 int32_t src_y1 = src ? src->y : 0;
565 r3d_coords_raw(cs, false, (float[]) {
566 dst->x, dst->y,
567 src_x1, src_y1,
568 dst->x + extent->width, dst->y + extent->height,
569 src_x1 + extent->width, src_y1 + extent->height,
570 });
571 }
572
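/* Upload the clear value as one vec4 FS constant; depth/stencil values are
 * expanded to the unorm components expected by the r8g8b8a8 clear of D24S8.
 */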
573 static void
574 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
575 {
576 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
577 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
578 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
579 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
580 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
581 CP_LOAD_STATE6_0_NUM_UNIT(1));
582 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
583 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
584 switch (format) {
585 case VK_FORMAT_X8_D24_UNORM_PACK32:
586 case VK_FORMAT_D24_UNORM_S8_UINT: {
587 /* cleared as r8g8b8a8_unorm using special format */
588 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
589 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
590 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
591 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
592 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
593 } break;
594 case VK_FORMAT_D16_UNORM:
595 case VK_FORMAT_D32_SFLOAT:
596 tu_cs_emit(cs, fui(val->depthStencil.depth));
597 tu_cs_emit(cs, 0);
598 tu_cs_emit(cs, 0);
599 tu_cs_emit(cs, 0);
600 break;
601 case VK_FORMAT_S8_UINT:
602 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
603 tu_cs_emit(cs, 0);
604 tu_cs_emit(cs, 0);
605 tu_cs_emit(cs, 0);
606 break;
607 default:
608 /* as color formats use clear value as-is */
609 assert(!vk_format_is_depth_or_stencil(format));
610 tu_cs_emit_array(cs, val->color.uint32, 4);
611 break;
612 }
613 }
614
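/* Allocate a texture descriptor plus sampler in sub_cs memory, patch the
 * base and UBWC addresses for the selected layer, and bind it as FS
 * texture/sampler 0.
 */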
615 static void
616 r3d_src_common(struct tu_cmd_buffer *cmd,
617 struct tu_cs *cs,
618 const uint32_t *tex_const,
619 uint32_t offset_base,
620 uint32_t offset_ubwc,
621 VkFilter filter)
622 {
623 struct tu_cs_memory texture = { };
624 VkResult result = tu_cs_alloc(&cmd->sub_cs,
625 2, /* allocate space for a sampler too */
626 A6XX_TEX_CONST_DWORDS, &texture);
627 assert(result == VK_SUCCESS);
628
629 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
630
631 /* patch addresses for layer offset */
632 *(uint64_t*) (texture.map + 4) += offset_base;
633 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
634 texture.map[7] = ubwc_addr;
635 texture.map[8] = ubwc_addr >> 32;
636
637 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
638 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
639 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
640 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
641 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
642 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
643 0x60000; /* XXX used by blob, doesn't seem necessary */
644 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
645 0x1 | /* XXX used by blob, doesn't seem necessary */
646 A6XX_TEX_SAMP_1_UNNORM_COORDS |
647 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
648 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
649 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
650
651 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
652 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
653 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
654 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
655 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
656 CP_LOAD_STATE6_0_NUM_UNIT(1));
657 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
658
659 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
660 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
661
662 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
663 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
664 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
665 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
666 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
667 CP_LOAD_STATE6_0_NUM_UNIT(1));
668 tu_cs_emit_qw(cs, texture.iova);
669
670 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
671 tu_cs_emit_qw(cs, texture.iova);
672
673 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
674 }
675
676 static void
677 r3d_src(struct tu_cmd_buffer *cmd,
678 struct tu_cs *cs,
679 const struct tu_image_view *iview,
680 uint32_t layer,
681 VkFilter filter)
682 {
683 r3d_src_common(cmd, cs, iview->descriptor,
684 iview->layer_size * layer,
685 iview->ubwc_layer_size * layer,
686 filter);
687 }
688
689 static void
690 r3d_src_buffer(struct tu_cmd_buffer *cmd,
691 struct tu_cs *cs,
692 VkFormat vk_format,
693 uint64_t va, uint32_t pitch,
694 uint32_t width, uint32_t height)
695 {
696 uint32_t desc[A6XX_TEX_CONST_DWORDS];
697
698 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
699
700 desc[0] =
701 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
702 A6XX_TEX_CONST_0_FMT(format.fmt) |
703 A6XX_TEX_CONST_0_SWAP(format.swap) |
704 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
705       // XXX broadcast X to all channels so the value lands in .w for stencil buffer_to_image
706 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
707 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
708 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
709 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
710 desc[2] =
711 A6XX_TEX_CONST_2_PITCH(pitch) |
712 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
713 desc[3] = 0;
714 desc[4] = va;
715 desc[5] = va >> 32;
716 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
717 desc[i] = 0;
718
719 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
720 }
721
722 static void
723 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
724 {
725 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
726
727 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
728 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
729 tu_cs_image_ref(cs, iview, layer);
730 tu_cs_emit(cs, 0);
731
732 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
733 tu_cs_image_flag_ref(cs, iview, layer);
734
735 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
736 }
737
738 static void
739 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
740 {
741 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
742
743 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
744
745 tu_cs_emit_regs(cs,
746 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
747 A6XX_RB_MRT_PITCH(0, pitch),
748 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
749 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
750 A6XX_RB_MRT_BASE_HI(0, va >> 32),
751 A6XX_RB_MRT_BASE_GMEM(0, 0));
752
753 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
754 }
755
756 static uint8_t
757 aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
758 {
759 uint8_t mask = 0xf;
760 assert(aspect_mask);
761 /* note: the only format with partial writing is D24S8,
762 * clear/blit uses the _AS_R8G8B8A8 format to access it
763 */
764 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
765 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
766 mask = 0x7;
767 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
768 mask = 0x8;
769 }
770 return mask;
771 }
772
773 static void
774 r3d_setup(struct tu_cmd_buffer *cmd,
775 struct tu_cs *cs,
776 VkFormat vk_format,
777 VkImageAspectFlags aspect_mask,
778 enum a6xx_rotation rotation,
779 bool clear)
780 {
781 if (!cmd->state.pass) {
782 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
783 tu6_emit_window_scissor(cs, 0, 0, 0x7fff, 0x7fff);
784 }
785
786 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
787 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
788
789 r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
790
791 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
792 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
793 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
794 0xfc000000);
795 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
796
797 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
798 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
799
800 tu_cs_emit_regs(cs,
801 A6XX_RB_FS_OUTPUT_CNTL0(),
802 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
803
804 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
805 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
806 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
807
808 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
809 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
810 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
811 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
812 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
813 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
814 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
815
816 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
817 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
818
819 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
820 .color_format = tu6_base_format(vk_format),
821 .color_sint = vk_format_is_sint(vk_format),
822 .color_uint = vk_format_is_uint(vk_format)));
823
824 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
825 .component_enable = aspect_write_mask(vk_format, aspect_mask)));
826 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
827 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
828 }
829
830 static void
831 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
832 {
833 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
834 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
835 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
836 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
837 tu_cs_emit(cs, 1); /* instance count */
838 tu_cs_emit(cs, 2); /* vertex count */
839 }
840
841 /* blit ops - common interface for 2d/shader paths */
842
843 struct blit_ops {
844 void (*coords)(struct tu_cs *cs,
845 const VkOffset2D *dst,
846 const VkOffset2D *src,
847 const VkExtent2D *extent);
848 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
849 void (*src)(
850 struct tu_cmd_buffer *cmd,
851 struct tu_cs *cs,
852 const struct tu_image_view *iview,
853 uint32_t layer,
854 VkFilter filter);
855 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
856 VkFormat vk_format,
857 uint64_t va, uint32_t pitch,
858 uint32_t width, uint32_t height);
859 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
860 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
861 void (*setup)(struct tu_cmd_buffer *cmd,
862 struct tu_cs *cs,
863 VkFormat vk_format,
864 VkImageAspectFlags aspect_mask,
865 enum a6xx_rotation rotation,
866 bool clear);
867 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
868 };
869
870 static const struct blit_ops r2d_ops = {
871 .coords = r2d_coords,
872 .clear_value = r2d_clear_value,
873 .src = r2d_src,
874 .src_buffer = r2d_src_buffer,
875 .dst = r2d_dst,
876 .dst_buffer = r2d_dst_buffer,
877 .setup = r2d_setup,
878 .run = r2d_run,
879 };
880
881 static const struct blit_ops r3d_ops = {
882 .coords = r3d_coords,
883 .clear_value = r3d_clear_value,
884 .src = r3d_src,
885 .src_buffer = r3d_src_buffer,
886 .dst = r3d_dst,
887 .dst_buffer = r3d_dst_buffer,
888 .setup = r3d_setup,
889 .run = r3d_run,
890 };
891
892 /* passthrough helper: forward 3D offsets/extent to the 2D coords callback (x/y only) */
893 static void
894 coords(const struct blit_ops *ops,
895 struct tu_cs *cs,
896 const VkOffset3D *dst,
897 const VkOffset3D *src,
898 const VkExtent3D *extent)
899 {
900 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
901 }
902
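/* Choose the format used for the actual copy: compressed formats are copied
 * as same-size uint blocks, and planar, D24S8 stencil-to-buffer and E5B9G9R9
 * cases are remapped to formats the blit paths can access directly.
 */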
903 static VkFormat
904 copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
905 {
906 if (vk_format_is_compressed(format)) {
907 switch (vk_format_get_blocksize(format)) {
908 case 1: return VK_FORMAT_R8_UINT;
909 case 2: return VK_FORMAT_R16_UINT;
910 case 4: return VK_FORMAT_R32_UINT;
911 case 8: return VK_FORMAT_R32G32_UINT;
912 case 16:return VK_FORMAT_R32G32B32A32_UINT;
913 default:
914 unreachable("unhandled format size");
915 }
916 }
917
918 switch (format) {
919 case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
920 if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
921 return VK_FORMAT_R8G8_UNORM;
922 /* fallthrough */
923 case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
924 return VK_FORMAT_R8_UNORM;
925 case VK_FORMAT_D24_UNORM_S8_UINT:
926 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
927 return VK_FORMAT_R8_UNORM;
928 /* fallthrough */
929 default:
930 return format;
931 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
932 return VK_FORMAT_R32_UINT;
933 }
934 }
935
936 static void
937 tu_image_view_copy_blit(struct tu_image_view *iview,
938 struct tu_image *image,
939 VkFormat format,
940 const VkImageSubresourceLayers *subres,
941 uint32_t layer,
942 bool stencil_read)
943 {
944 VkImageAspectFlags aspect_mask = subres->aspectMask;
945
946 /* always use the AS_R8G8B8A8 format for these */
947 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
948 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
949 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
950 }
951
952 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
953 .image = tu_image_to_handle(image),
954 .viewType = VK_IMAGE_VIEW_TYPE_2D,
955 .format = format,
956 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
957 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
958 .subresourceRange = {
959 .aspectMask = aspect_mask,
960 .baseMipLevel = subres->mipLevel,
961 .levelCount = 1,
962 .baseArrayLayer = subres->baseArrayLayer + layer,
963 .layerCount = 1,
964 },
965 });
966 }
967
968 static void
969 tu_image_view_copy(struct tu_image_view *iview,
970 struct tu_image *image,
971 VkFormat format,
972 const VkImageSubresourceLayers *subres,
973 uint32_t layer,
974 bool stencil_read)
975 {
976 format = copy_format(format, subres->aspectMask, false);
977 tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read);
978 }
979
980 static void
981 tu_image_view_blit(struct tu_image_view *iview,
982 struct tu_image *image,
983 const VkImageSubresourceLayers *subres,
984 uint32_t layer)
985 {
986 tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false);
987 }
988
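/* Blit a single VkImageBlit region: choose the 2D or 3D path, express x/y
 * mirroring through the rotate/flip modes (2D) or raw coordinates (3D), and
 * run one blit per layer.
 */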
989 static void
990 tu6_blit_image(struct tu_cmd_buffer *cmd,
991 struct tu_image *src_image,
992 struct tu_image *dst_image,
993 const VkImageBlit *info,
994 VkFilter filter)
995 {
996 const struct blit_ops *ops = &r2d_ops;
997 struct tu_cs *cs = &cmd->cs;
998 uint32_t layers;
999
1000    /* 2D blit can't mirror from coordinates alone, so use the rotate/flip modes */
1001 static const enum a6xx_rotation rotate[2][2] = {
1002 {ROTATE_0, ROTATE_HFLIP},
1003 {ROTATE_VFLIP, ROTATE_180},
1004 };
1005
1006 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1007 (info->dstOffsets[1].x < info->dstOffsets[0].x);
1008 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1009 (info->dstOffsets[1].y < info->dstOffsets[0].y);
1010 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
1011 (info->dstOffsets[1].z < info->dstOffsets[0].z);
1012
1013 if (mirror_z) {
1014 tu_finishme("blit z mirror\n");
1015 return;
1016 }
1017
1018 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1019 info->dstOffsets[1].z - info->dstOffsets[0].z) {
1020 tu_finishme("blit z filter\n");
1021 return;
1022 }
1023
1024 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1025 if (info->dstSubresource.layerCount > 1) {
1026 assert(layers <= 1);
1027 layers = info->dstSubresource.layerCount;
1028 }
1029
1030    /* BC1_RGB_* formats need to have their last component overridden with 1
1031 * when sampling, which is normally handled with the texture descriptor
1032 * swizzle. The 2d path can't handle that, so use the 3d path.
1033 *
1034 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1035 * the 2d path.
1036 */
1037
1038 if (dst_image->samples > 1 ||
1039 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1040 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
1041 filter == VK_FILTER_CUBIC_EXT)
1042 ops = &r3d_ops;
1043
1044    /* TODO: the shader path fails some of the blit_image.all_formats.generate_mipmaps.*
1045     * tests; figure out why (it should be possible to pass all tests with only the shader path)
1046 */
1047
1048 ops->setup(cmd, cs, dst_image->vk_format, info->dstSubresource.aspectMask,
1049 rotate[mirror_y][mirror_x], false);
1050
1051 if (ops == &r3d_ops) {
1052 r3d_coords_raw(cs, false, (float[]) {
1053 info->dstOffsets[0].x, info->dstOffsets[0].y,
1054 info->srcOffsets[0].x, info->srcOffsets[0].y,
1055 info->dstOffsets[1].x, info->dstOffsets[1].y,
1056 info->srcOffsets[1].x, info->srcOffsets[1].y
1057 });
1058 } else {
1059 tu_cs_emit_regs(cs,
1060 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1061 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1062 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1063 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1064 tu_cs_emit_regs(cs,
1065 A6XX_GRAS_2D_SRC_TL_X(.x = MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1066 A6XX_GRAS_2D_SRC_BR_X(.x = MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1067 A6XX_GRAS_2D_SRC_TL_Y(.y = MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1068 A6XX_GRAS_2D_SRC_BR_Y(.y = MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1069 }
1070
1071 struct tu_image_view dst, src;
1072 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1073 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1074
1075 for (uint32_t i = 0; i < layers; i++) {
1076 ops->dst(cs, &dst, i);
1077 ops->src(cmd, cs, &src, i, filter);
1078 ops->run(cmd, cs);
1079 }
1080 }
1081
1082 void
1083 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1084 VkImage srcImage,
1085 VkImageLayout srcImageLayout,
1086 VkImage dstImage,
1087 VkImageLayout dstImageLayout,
1088 uint32_t regionCount,
1089 const VkImageBlit *pRegions,
1090 VkFilter filter)
1091
1092 {
1093 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1094 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1095 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1096
1097 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1098 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1099
1100 for (uint32_t i = 0; i < regionCount; ++i)
1101 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1102 }
1103
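/* Convert texel offsets and sizes to block units for compressed formats. */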
1104 static void
1105 copy_compressed(VkFormat format,
1106 VkOffset3D *offset,
1107 VkExtent3D *extent,
1108 uint32_t *width,
1109 uint32_t *height)
1110 {
1111 if (!vk_format_is_compressed(format))
1112 return;
1113
1114 uint32_t block_width = vk_format_get_blockwidth(format);
1115 uint32_t block_height = vk_format_get_blockheight(format);
1116
1117 offset->x /= block_width;
1118 offset->y /= block_height;
1119
1120 if (extent) {
1121 extent->width = DIV_ROUND_UP(extent->width, block_width);
1122 extent->height = DIV_ROUND_UP(extent->height, block_height);
1123 }
1124 if (width)
1125 *width = DIV_ROUND_UP(*width, block_width);
1126 if (height)
1127 *height = DIV_ROUND_UP(*height, block_height);
1128 }
1129
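/* Copy one VkBufferImageCopy region from a buffer into an image. When the
 * source address or pitch isn't 64-byte aligned, fall back to copying one
 * row at a time from a 64-byte-aligned base.
 */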
1130 static void
1131 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1132 struct tu_buffer *src_buffer,
1133 struct tu_image *dst_image,
1134 const VkBufferImageCopy *info)
1135 {
1136 struct tu_cs *cs = &cmd->cs;
1137 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1138 VkFormat src_format =
1139 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
1140 const struct blit_ops *ops = &r2d_ops;
1141
1142 /* special case for buffer to stencil */
1143 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1144 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1145 ops = &r3d_ops;
1146 }
1147
1148 /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
1149 * which matters for UBWC. buffer_to_image/etc can fail because of this
1150 */
1151
1152 VkOffset3D offset = info->imageOffset;
1153 VkExtent3D extent = info->imageExtent;
1154 uint32_t src_width = info->bufferRowLength ?: extent.width;
1155 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1156
1157 copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);
1158
1159 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1160 uint32_t layer_size = src_height * pitch;
1161
1162 ops->setup(cmd, cs,
1163 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
1164 info->imageSubresource.aspectMask, ROTATE_0, false);
1165
1166 struct tu_image_view dst;
1167 tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);
1168
1169 for (uint32_t i = 0; i < layers; i++) {
1170 ops->dst(cs, &dst, i);
1171
1172 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1173 if ((src_va & 63) || (pitch & 63)) {
1174 for (uint32_t y = 0; y < extent.height; y++) {
1175 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1176 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1177 x + extent.width, 1);
1178 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1179 &(VkExtent2D) {extent.width, 1});
1180 ops->run(cmd, cs);
1181 src_va += pitch;
1182 }
1183 } else {
1184 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1185 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1186 ops->run(cmd, cs);
1187 }
1188 }
1189 }
1190
1191 void
1192 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1193 VkBuffer srcBuffer,
1194 VkImage dstImage,
1195 VkImageLayout dstImageLayout,
1196 uint32_t regionCount,
1197 const VkBufferImageCopy *pRegions)
1198 {
1199 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1200 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1201 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1202
1203 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1204 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1205
1206 for (unsigned i = 0; i < regionCount; ++i)
1207 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1208 }
1209
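/* Copy one VkBufferImageCopy region from an image to a buffer, falling back
 * to one row at a time when the destination address or pitch isn't 64-byte
 * aligned.
 */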
1210 static void
1211 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1212 struct tu_image *src_image,
1213 struct tu_buffer *dst_buffer,
1214 const VkBufferImageCopy *info)
1215 {
1216 struct tu_cs *cs = &cmd->cs;
1217 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1218 VkFormat dst_format =
1219 copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
1220 bool stencil_read = false;
1221
1222 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1223 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1224 stencil_read = true;
1225 }
1226
1227 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1228 VkOffset3D offset = info->imageOffset;
1229 VkExtent3D extent = info->imageExtent;
1230 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1231 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1232
1233 copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);
1234
1235 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1236 uint32_t layer_size = pitch * dst_height;
1237
1238 ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1239
1240 struct tu_image_view src;
1241 tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);
1242
1243 for (uint32_t i = 0; i < layers; i++) {
1244 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1245
1246 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1247 if ((dst_va & 63) || (pitch & 63)) {
1248 for (uint32_t y = 0; y < extent.height; y++) {
1249 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1250 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1251 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1252 &(VkExtent2D) {extent.width, 1});
1253 ops->run(cmd, cs);
1254 dst_va += pitch;
1255 }
1256 } else {
1257 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1258 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1259 ops->run(cmd, cs);
1260 }
1261 }
1262 }
1263
1264 void
1265 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1266 VkImage srcImage,
1267 VkImageLayout srcImageLayout,
1268 VkBuffer dstBuffer,
1269 uint32_t regionCount,
1270 const VkBufferImageCopy *pRegions)
1271 {
1272 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1273 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1274 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1275
1276 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1277 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1278
1279 for (unsigned i = 0; i < regionCount; ++i)
1280 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1281 }
1282
1283 /* Tiled formats don't support swapping, which means that we can't support
1284 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1285 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1286 * Currently we fake support for tiled swapped formats and use the unswapped
1287 * format instead, but this means that reinterpreting copies to and from
1288 * swapped formats can't be performed correctly unless we can swizzle the
1289 * components by reinterpreting the other image as the "correct" swapped
1290 * format, i.e. only when the other image is linear.
1291 */
1292
1293 static bool
1294 is_swapped_format(VkFormat format)
1295 {
1296 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1297 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1298 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1299 }
1300
1301 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1302 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1303 * versa). This should mirror the logic in fdl6_layout.
1304 */
1305 static bool
1306 image_is_r8g8(struct tu_image *image)
1307 {
1308 return image->layout[0].cpp == 2 &&
1309 vk_format_get_nr_components(image->vk_format) == 2;
1310 }
1311
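/* Copy one VkImageCopy region. If the two images can't safely be
 * reinterpreted in a common format (swapped tiled formats, R8G8 vs. other
 * cpp=2 layouts, or two UBWC images), go through a linear staging image
 * with a flush/invalidate in between.
 */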
1312 static void
1313 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1314 struct tu_image *src_image,
1315 struct tu_image *dst_image,
1316 const VkImageCopy *info)
1317 {
1318 const struct blit_ops *ops = &r2d_ops;
1319 struct tu_cs *cs = &cmd->cs;
1320
1321 if (dst_image->samples > 1)
1322 ops = &r3d_ops;
1323
1324 VkFormat format = VK_FORMAT_UNDEFINED;
1325 VkOffset3D src_offset = info->srcOffset;
1326 VkOffset3D dst_offset = info->dstOffset;
1327 VkExtent3D extent = info->extent;
1328
1329 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1330 * Images":
1331 *
1332 * When copying between compressed and uncompressed formats the extent
1333 * members represent the texel dimensions of the source image and not
1334 * the destination. When copying from a compressed image to an
1335 * uncompressed image the image texel dimensions written to the
1336 * uncompressed image will be source extent divided by the compressed
1337 * texel block dimensions. When copying from an uncompressed image to a
1338 * compressed image the image texel dimensions written to the compressed
1339 * image will be the source extent multiplied by the compressed texel
1340 * block dimensions.
1341 *
1342 * This means we only have to adjust the extent if the source image is
1343 * compressed.
1344 */
1345 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1346 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1347
1348 VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
1349 VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
1350
1351 bool use_staging_blit = false;
1352
1353 if (src_format == dst_format) {
1354 /* Images that share a format can always be copied directly because it's
1355 * the same as a blit.
1356 */
1357 format = src_format;
1358 } else if (!src_image->layout[0].tile_mode) {
1359 /* If an image is linear, we can always safely reinterpret it with the
1360 * other image's format and then do a regular blit.
1361 */
1362 format = dst_format;
1363 } else if (!dst_image->layout[0].tile_mode) {
1364 format = src_format;
1365 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1366 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1367 * due to the different tile layout.
1368 */
1369 use_staging_blit = true;
1370 } else if (is_swapped_format(src_format) ||
1371 is_swapped_format(dst_format)) {
1372 /* If either format has a non-identity swap, then we can't copy
1373 * to/from it.
1374 */
1375 use_staging_blit = true;
1376 } else if (!src_image->layout[0].ubwc) {
1377 format = dst_format;
1378 } else if (!dst_image->layout[0].ubwc) {
1379 format = src_format;
1380 } else {
1381 /* Both formats use UBWC and so neither can be reinterpreted.
1382 * TODO: We could do an in-place decompression of the dst instead.
1383 */
1384 use_staging_blit = true;
1385 }
1386
1387 struct tu_image_view dst, src;
1388
1389 if (use_staging_blit) {
1390 tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1391 tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1392
1393 struct tu_image staging_image = {
1394 .vk_format = src_format,
1395 .type = src_image->type,
1396 .tiling = VK_IMAGE_TILING_LINEAR,
1397 .extent = extent,
1398 .level_count = 1,
1399 .layer_count = info->srcSubresource.layerCount,
1400 .samples = src_image->samples,
1401 .bo_offset = 0,
1402 };
1403
1404 VkImageSubresourceLayers staging_subresource = {
1405 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1406 .mipLevel = 0,
1407 .baseArrayLayer = 0,
1408 .layerCount = info->srcSubresource.layerCount,
1409 };
1410
1411 VkOffset3D staging_offset = { 0 };
1412
1413 staging_image.layout[0].tile_mode = TILE6_LINEAR;
1414 staging_image.layout[0].ubwc = false;
1415
1416 fdl6_layout(&staging_image.layout[0],
1417 vk_format_to_pipe_format(staging_image.vk_format),
1418 staging_image.samples,
1419 staging_image.extent.width,
1420 staging_image.extent.height,
1421 staging_image.extent.depth,
1422 staging_image.level_count,
1423 staging_image.layer_count,
1424 staging_image.type == VK_IMAGE_TYPE_3D,
1425 NULL);
1426
1427 VkResult result = tu_get_scratch_bo(cmd->device,
1428 staging_image.layout[0].size,
1429 &staging_image.bo);
1430 if (result != VK_SUCCESS) {
1431 cmd->record_result = result;
1432 return;
1433 }
1434
1435 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1436 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1437
1438 struct tu_image_view staging;
1439 tu_image_view_copy(&staging, &staging_image, src_format,
1440 &staging_subresource, 0, false);
1441
1442 ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1443 coords(ops, cs, &staging_offset, &src_offset, &extent);
1444
1445 for (uint32_t i = 0; i < info->extent.depth; i++) {
1446 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1447 ops->dst(cs, &staging, i);
1448 ops->run(cmd, cs);
1449 }
1450
1451 /* When executed by the user there has to be a pipeline barrier here,
1452 * but since we're doing it manually we'll have to flush ourselves.
1453 */
1454 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1455 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1456
1457 tu_image_view_copy(&staging, &staging_image, dst_format,
1458 &staging_subresource, 0, false);
1459
1460 ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask, ROTATE_0, false);
1461 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1462
1463 for (uint32_t i = 0; i < info->extent.depth; i++) {
1464 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1465 ops->dst(cs, &dst, i);
1466 ops->run(cmd, cs);
1467 }
1468 } else {
1469 tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1470 tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1471
1472 ops->setup(cmd, cs, format, info->dstSubresource.aspectMask, ROTATE_0, false);
1473 coords(ops, cs, &dst_offset, &src_offset, &extent);
1474
1475 for (uint32_t i = 0; i < info->extent.depth; i++) {
1476 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1477 ops->dst(cs, &dst, i);
1478 ops->run(cmd, cs);
1479 }
1480 }
1481 }
1482
1483 void
1484 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1485 VkImage srcImage,
1486 VkImageLayout srcImageLayout,
1487 VkImage destImage,
1488 VkImageLayout destImageLayout,
1489 uint32_t regionCount,
1490 const VkImageCopy *pRegions)
1491 {
1492 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1493 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1494 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1495
1496 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1497 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1498
1499 for (uint32_t i = 0; i < regionCount; ++i)
1500 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1501 }
1502
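/* Copy between buffer addresses by treating the data as a one-texel-high
 * image. Each pass starts from a 64-byte-aligned base and copies at most
 * 0x4000 blocks, the maximum width handed to the blitter here.
 */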
1503 static void
1504 copy_buffer(struct tu_cmd_buffer *cmd,
1505 uint64_t dst_va,
1506 uint64_t src_va,
1507 uint64_t size,
1508 uint32_t block_size)
1509 {
1510 const struct blit_ops *ops = &r2d_ops;
1511 struct tu_cs *cs = &cmd->cs;
1512 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1513 uint64_t blocks = size / block_size;
1514
1515 ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1516
1517 while (blocks) {
1518 uint32_t src_x = (src_va & 63) / block_size;
1519 uint32_t dst_x = (dst_va & 63) / block_size;
1520 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1521
1522 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1523 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1524 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1525 ops->run(cmd, cs);
1526
1527 src_va += width * block_size;
1528 dst_va += width * block_size;
1529 blocks -= width;
1530 }
1531 }
1532
1533 void
1534 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1535 VkBuffer srcBuffer,
1536 VkBuffer dstBuffer,
1537 uint32_t regionCount,
1538 const VkBufferCopy *pRegions)
1539 {
1540 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1541 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1542 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1543
1544 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1545 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1546
1547 for (unsigned i = 0; i < regionCount; ++i) {
1548 copy_buffer(cmd,
1549 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1550 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1551 pRegions[i].size, 1);
1552 }
1553 }
1554
1555 void
1556 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1557 VkBuffer dstBuffer,
1558 VkDeviceSize dstOffset,
1559 VkDeviceSize dataSize,
1560 const void *pData)
1561 {
1562 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1563 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1564
1565 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1566
1567 struct tu_cs_memory tmp;
1568 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1569 if (result != VK_SUCCESS) {
1570 cmd->record_result = result;
1571 return;
1572 }
1573
1574 memcpy(tmp.map, pData, dataSize);
1575 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1576 }
1577
1578 void
1579 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1580 VkBuffer dstBuffer,
1581 VkDeviceSize dstOffset,
1582 VkDeviceSize fillSize,
1583 uint32_t data)
1584 {
1585 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1586 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1587 const struct blit_ops *ops = &r2d_ops;
1588 struct tu_cs *cs = &cmd->cs;
1589
1590 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1591
1592 if (fillSize == VK_WHOLE_SIZE)
1593 fillSize = buffer->size - dstOffset;
1594
1595 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1596 uint32_t blocks = fillSize / 4;
1597
1598 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true);
1599 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1600
1601 while (blocks) {
1602 uint32_t dst_x = (dst_va & 63) / 4;
1603 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1604
1605 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1606 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1607 ops->run(cmd, cs);
1608
1609 dst_va += width * 4;
1610 blocks -= width;
1611 }
1612 }
1613
1614 void
1615 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1616 VkImage srcImage,
1617 VkImageLayout srcImageLayout,
1618 VkImage dstImage,
1619 VkImageLayout dstImageLayout,
1620 uint32_t regionCount,
1621 const VkImageResolve *pRegions)
1622 {
1623 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1624 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1625 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1626 const struct blit_ops *ops = &r2d_ops;
1627 struct tu_cs *cs = &cmd->cs;
1628
1629 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1630 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1631
1632 ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1633
1634 for (uint32_t i = 0; i < regionCount; ++i) {
1635 const VkImageResolve *info = &pRegions[i];
1636 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1637
1638 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1639 /* TODO: aspect masks possible ? */
1640
1641 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1642
1643 struct tu_image_view dst, src;
1644 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1645 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1646
1647 for (uint32_t i = 0; i < layers; i++) {
1648 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1649 ops->dst(cs, &dst, i);
1650 ops->run(cmd, cs);
1651 }
1652 }
1653 }
1654
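/* Resolve src into dst over the given rect, one layer at a time, using the
 * 2D path; both images must share the same format.
 */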
1655 void
1656 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1657 struct tu_cs *cs,
1658 struct tu_image_view *src,
1659 struct tu_image_view *dst,
1660 uint32_t layers,
1661 const VkRect2D *rect)
1662 {
1663 const struct blit_ops *ops = &r2d_ops;
1664
1665 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1666 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1667
1668 assert(src->image->vk_format == dst->image->vk_format);
1669
1670 ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1671 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1672
1673 for (uint32_t i = 0; i < layers; i++) {
1674 ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1675 ops->dst(cs, dst, i);
1676 ops->run(cmd, cs);
1677 }
1678 }
1679
1680 static void
1681 clear_image(struct tu_cmd_buffer *cmd,
1682 struct tu_image *image,
1683 const VkClearValue *clear_value,
1684 const VkImageSubresourceRange *range)
1685 {
1686 uint32_t level_count = tu_get_levelCount(image, range);
1687 uint32_t layer_count = tu_get_layerCount(image, range);
1688 struct tu_cs *cs = &cmd->cs;
1689 VkFormat format = image->vk_format;
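/* E5B9G9R9 can't be used as the blit destination format, so clear it as
 * raw R32_UINT; the clear value itself is still packed from the original
 * rgb9e5 format below.
 */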
1690 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1691 format = VK_FORMAT_R32_UINT;
1692
1693 if (image->type == VK_IMAGE_TYPE_3D) {
1694 assert(layer_count == 1);
1695 assert(range->baseArrayLayer == 0);
1696 }
1697
1698 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1699
1700 ops->setup(cmd, cs, format, range->aspectMask, ROTATE_0, true);
1701 ops->clear_value(cs, image->vk_format, clear_value);
1702
1703 for (unsigned j = 0; j < level_count; j++) {
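/* For 3D images each mip level has its own (minified) depth, so clear
 * every depth slice of this level as a layer.
 */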
1704 if (image->type == VK_IMAGE_TYPE_3D)
1705 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1706
1707 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1708 u_minify(image->extent.width, range->baseMipLevel + j),
1709 u_minify(image->extent.height, range->baseMipLevel + j)
1710 });
1711
1712 struct tu_image_view dst;
1713 tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
1714 .aspectMask = range->aspectMask,
1715 .mipLevel = range->baseMipLevel + j,
1716 .baseArrayLayer = range->baseArrayLayer,
1717 .layerCount = 1,
1718 }, 0, false);
1719
1720 for (uint32_t i = 0; i < layer_count; i++) {
1721 ops->dst(cs, &dst, i);
1722 ops->run(cmd, cs);
1723 }
1724 }
1725 }
1726
1727 void
1728 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1729 VkImage image_h,
1730 VkImageLayout imageLayout,
1731 const VkClearColorValue *pColor,
1732 uint32_t rangeCount,
1733 const VkImageSubresourceRange *pRanges)
1734 {
1735 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1736 TU_FROM_HANDLE(tu_image, image, image_h);
1737
1738 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1739
1740 for (unsigned i = 0; i < rangeCount; i++)
1741 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1742 }
1743
1744 void
1745 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1746 VkImage image_h,
1747 VkImageLayout imageLayout,
1748 const VkClearDepthStencilValue *pDepthStencil,
1749 uint32_t rangeCount,
1750 const VkImageSubresourceRange *pRanges)
1751 {
1752 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1753 TU_FROM_HANDLE(tu_image, image, image_h);
1754
1755 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1756
1757 for (unsigned i = 0; i < rangeCount; i++)
1758 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1759 }
1760
1761 static void
1762 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1763 uint32_t attachment_count,
1764 const VkClearAttachment *attachments,
1765 uint32_t rect_count,
1766 const VkClearRect *rects)
1767 {
1768 const struct tu_subpass *subpass = cmd->state.subpass;
1769 /* note: the shader path cannot be used here; there is a special shader
1770 * path in tu_clear_sysmem_attachments()
1771 */
1772 const struct blit_ops *ops = &r2d_ops;
1773 struct tu_cs *cs = &cmd->draw_cs;
1774
1775 for (uint32_t j = 0; j < attachment_count; j++) {
1776 /* The vulkan spec, section 17.2 "Clearing Images Inside a Render
1777 * Pass Instance" says that:
1778 *
1779 * Unlike other clear commands, vkCmdClearAttachments executes as
1780 * a drawing command, rather than a transfer command, with writes
1781 * performed by it executing in rasterization order. Clears to
1782 * color attachments are executed as color attachment writes, by
1783 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1784 * Clears to depth/stencil attachments are executed as depth
1785 * writes and writes by the
1786 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1787 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1788 *
1789 * However, the 2d path here is executed the same way as a
1790 * transfer command, using the CCU color cache exclusively with
1791 * a special depth-as-color format for depth clears. This means that
1792 * we can't rely on the normal pipeline barrier mechanism here, and
1793 * have to manually flush whenever using a different cache domain
1794 * from what the 3d path would've used. This happens when we clear
1795 * depth/stencil, since normally depth attachments use CCU depth, but
1796 * we clear it using a special depth-as-color format. Since the clear
1797 * potentially uses a different attachment state we also need to
1798 * invalidate color beforehand and flush it afterwards.
1799 */
1800
1801 uint32_t a;
1802 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1803 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1804 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1805 } else {
1806 a = subpass->depth_stencil_attachment.attachment;
1807 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1808 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1809 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1810 }
1811
1812 if (a == VK_ATTACHMENT_UNUSED)
1813 continue;
1814
1815 const struct tu_image_view *iview =
1816 cmd->state.framebuffer->attachments[a].attachment;
1817
1818 ops->setup(cmd, cs, iview->image->vk_format, attachments[j].aspectMask, ROTATE_0, true);
1819 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1820
1821 /* Wait for the flushes we triggered manually to complete */
1822 tu_cs_emit_wfi(cs);
1823
1824 for (uint32_t i = 0; i < rect_count; i++) {
1825 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1826 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1827 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1828 ops->run(cmd, cs);
1829 }
1830 }
1831
1832 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1833 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1834 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1835 } else {
1836 /* sync color into depth */
1837 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1838 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
1839 }
1840 }
1841 }
1842
1843 static void
1844 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1845 uint32_t attachment_count,
1846 const VkClearAttachment *attachments,
1847 uint32_t rect_count,
1848 const VkClearRect *rects)
1849 {
1850 /* the shader path here is special, it avoids changing MRT/etc state */
1851 const struct tu_render_pass *pass = cmd->state.pass;
1852 const struct tu_subpass *subpass = cmd->state.subpass;
1853 const uint32_t mrt_count = subpass->color_count;
1854 struct tu_cs *cs = &cmd->draw_cs;
1855 uint32_t clear_value[MAX_RTS][4];
1856 float z_clear_val = 0.0f;
1857 uint8_t s_clear_val = 0;
1858 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1859 bool z_clear = false;
1860 bool s_clear = false;
1861 bool layered_clear = false;
1862 uint32_t max_samples = 1;
1863
1864 for (uint32_t i = 0; i < attachment_count; i++) {
1865 uint32_t a;
1866 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1867 uint32_t c = attachments[i].colorAttachment;
1868 a = subpass->color_attachments[c].attachment;
1869 if (a == VK_ATTACHMENT_UNUSED)
1870 continue;
1871
1872 clear_rts |= 1 << c;
1873 clear_components |= 0xf << (c * 4);
1874 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1875 } else {
1876 a = subpass->depth_stencil_attachment.attachment;
1877 if (a == VK_ATTACHMENT_UNUSED)
1878 continue;
1879
1880 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1881 z_clear = true;
1882 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1883 }
1884
1885 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1886 s_clear = true;
1887 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1888 }
1889 }
1890
1891 max_samples = MAX2(max_samples, pass->attachments[a].samples);
1892 }
1893
1894 /* Prefer the 2D path for clears. The 2D path can't clear separate
1895 * depth/stencil aspects or MSAA attachments, and it needs a known framebuffer.
1896 */
1897 if (max_samples == 1 && cmd->state.framebuffer) {
1898 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
1899 return;
1900 }
1901
1902 /* This clear path behaves like a draw and needs the same flushes as tu_draw */
1903 tu_emit_cache_flush_renderpass(cmd, cs);
1904
1905 /* Disable all draw states so they don't interfere.
1906 * TODO: use and re-use draw states for this path.
1907 * The draw states have to be disabled individually to preserve the
1908 * input attachment states, because a secondary command buffer
1909 * won't be able to restore them.
1910 */
1911 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
1912 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
1913 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
1914 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
1915 continue;
1916 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
1917 CP_SET_DRAW_STATE__0_DISABLE);
1918 tu_cs_emit_qw(cs, 0);
1919 }
1920 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1921
1922 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1923 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1924 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1925 0xfc000000);
1926 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1927
1928 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1929 for (uint32_t i = 0; i < mrt_count; i++) {
1930 if (clear_rts & (1 << i))
1931 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1932 else
1933 tu_cs_emit(cs, 0);
1934 }
1935
1936 for (uint32_t i = 0; i < rect_count; i++) {
1937 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
1938 layered_clear = true;
1939 }
1940
1941 r3d_common(cmd, cs, false, num_rts, layered_clear);
1942
1943 tu_cs_emit_regs(cs,
1944 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
1945 tu_cs_emit_regs(cs,
1946 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
1947
1948 tu_cs_emit_regs(cs,
1949 A6XX_RB_FS_OUTPUT_CNTL0(),
1950 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1951
1952 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1953 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1954 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1955 for (uint32_t i = 0; i < mrt_count; i++) {
1956 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1957 .component_enable = COND(clear_rts & (1 << i), 0xf)));
1958 }
1959
1960 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1961 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1962 .z_enable = z_clear,
1963 .z_write_enable = z_clear,
1964 .zfunc = FUNC_ALWAYS));
1965 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1966 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1967 .stencil_enable = s_clear,
1968 .func = FUNC_ALWAYS,
1969 .zpass = STENCIL_REPLACE));
1970 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1971 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1972 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1973
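/* Load the packed clear colors as FS constants, one vec4 per cleared MRT,
 * for the clear shader set up by r3d_common() to write to its outputs.
 */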
1974 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1975 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1976 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1977 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1978 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1979 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1980 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1981 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1982 for_each_bit(b, clear_rts)
1983 tu_cs_emit_array(cs, clear_value[b], 4);
1984
1985 for (uint32_t i = 0; i < rect_count; i++) {
1986 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1987 r3d_coords_raw(cs, layered_clear, (float[]) {
1988 rects[i].rect.offset.x, rects[i].rect.offset.y,
1989 z_clear_val, uif(rects[i].baseArrayLayer + layer),
1990 rects[i].rect.offset.x + rects[i].rect.extent.width,
1991 rects[i].rect.offset.y + rects[i].rect.extent.height,
1992 z_clear_val, 1.0f,
1993 });
1994
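/* Layered clears use a one-vertex point-list draw with the geometry
 * shader enabled (set up by r3d_common()); other clears go through the
 * normal r3d_run() draw.
 */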
1995 if (layered_clear) {
1996 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1997 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_POINTLIST) |
1998 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1999 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) |
2000 CP_DRAW_INDX_OFFSET_0_GS_ENABLE);
2001 tu_cs_emit(cs, 1); /* instance count */
2002 tu_cs_emit(cs, 1); /* vertex count */
2003 } else {
2004 r3d_run(cmd, cs);
2005 }
2006 }
2007 }
2008 }
2009
2010 static void
2011 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
2012 {
2013 enum pipe_format pformat = vk_format_to_pipe_format(format);
2014
2015 switch (format) {
2016 case VK_FORMAT_X8_D24_UNORM_PACK32:
2017 case VK_FORMAT_D24_UNORM_S8_UINT:
2018 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
2019 val->depthStencil.stencil << 24;
2020 return;
2021 case VK_FORMAT_D16_UNORM:
2022 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
2023 return;
2024 case VK_FORMAT_D32_SFLOAT:
2025 clear_value[0] = fui(val->depthStencil.depth);
2026 return;
2027 case VK_FORMAT_S8_UINT:
2028 clear_value[0] = val->depthStencil.stencil;
2029 return;
2030 /* These formats use a different base format when tiled; the same pipe
2031 * format can be used for both because GMEM is always in WZYX order.
2032 */
2033 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2034 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2035 pformat = PIPE_FORMAT_B5G5R5A1_UNORM;
2036 default:
2037 break;
2038 }
2039
2040 VkClearColorValue color;
2041
2042 /*
2043 * GMEM is tiled and wants the components in WZYX order; apply the
2044 * swizzle to the color before packing, to counteract the deswizzling
2045 * applied by the packing functions.
2046 */
2047 pipe_swizzle_4f(color.float32, val->color.float32,
2048 util_format_description(pformat)->swizzle);
2049
2050 util_format_pack_rgba(pformat, clear_value, color.uint32, 1);
2051 }
2052
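/* Clear a single attachment in GMEM using the CP_EVENT_WRITE::BLIT path,
 * with the clear value packed to match the GMEM layout.
 */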
2053 static void
2054 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2055 struct tu_cs *cs,
2056 uint32_t attachment,
2057 VkImageAspectFlags mask,
2058 const VkClearValue *value)
2059 {
2060 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
2061
2062
2063 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2064 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
2065
2066 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1,
2067 .clear_mask = aspect_write_mask(vk_format, mask)));
2068
2069 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2070 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
2071
2072 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2073 tu_cs_emit(cs, 0);
2074
2075 uint32_t clear_vals[4] = {};
2076 pack_gmem_clear_value(value, vk_format, clear_vals);
2077
2078 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2079 tu_cs_emit_array(cs, clear_vals, 4);
2080
2081 tu6_emit_event_write(cmd, cs, BLIT);
2082 }
2083
2084 static void
2085 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2086 uint32_t attachment_count,
2087 const VkClearAttachment *attachments,
2088 uint32_t rect_count,
2089 const VkClearRect *rects)
2090 {
2091 const struct tu_subpass *subpass = cmd->state.subpass;
2092 struct tu_cs *cs = &cmd->draw_cs;
2093
2094 /* TODO: swap the loops for smaller cmdstream */
2095 for (unsigned i = 0; i < rect_count; i++) {
2096 unsigned x1 = rects[i].rect.offset.x;
2097 unsigned y1 = rects[i].rect.offset.y;
2098 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2099 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2100
2101 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2102 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2103 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2104
2105 for (unsigned j = 0; j < attachment_count; j++) {
2106 uint32_t a;
2107 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2108 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2109 else
2110 a = subpass->depth_stencil_attachment.attachment;
2111
2112 if (a == VK_ATTACHMENT_UNUSED)
2113 continue;
2114
2115 tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2116 &attachments[j].clearValue);
2117 }
2118 }
2119 }
2120
2121 void
2122 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2123 uint32_t attachmentCount,
2124 const VkClearAttachment *pAttachments,
2125 uint32_t rectCount,
2126 const VkClearRect *pRects)
2127 {
2128 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2129 struct tu_cs *cs = &cmd->draw_cs;
2130
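/* Whether this render pass ends up using GMEM or sysmem rendering is only
 * decided at submit time, so emit both clear paths under conditional
 * execution.
 */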
2131 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2132 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2133 tu_cond_exec_end(cs);
2134
2135 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2136 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2137 tu_cond_exec_end(cs);
2138 }
2139
2140 void
2141 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2142 struct tu_cs *cs,
2143 uint32_t a,
2144 const VkRenderPassBeginInfo *info)
2145 {
2146 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2147 const struct tu_image_view *iview = fb->attachments[a].attachment;
2148 const struct tu_render_pass_attachment *attachment =
2149 &cmd->state.pass->attachments[a];
2150
2151 if (!attachment->clear_mask)
2152 return;
2153
2154 const struct blit_ops *ops = &r2d_ops;
2155 if (attachment->samples > 1)
2156 ops = &r3d_ops;
2157
2158 ops->setup(cmd, cs, attachment->format, attachment->clear_mask, ROTATE_0, true);
2159 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2160 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2161
2162 /* Wait for any flushes at the beginning of the renderpass to complete */
2163 tu_cs_emit_wfi(cs);
2164
2165 for (uint32_t i = 0; i < fb->layers; i++) {
2166 ops->dst(cs, iview, i);
2167 ops->run(cmd, cs);
2168 }
2169
2170 /* The spec doesn't explicitly say, but presumably the initial renderpass
2171 * clear is considered part of the renderpass, and therefore barriers
2172 * aren't required inside the subpass/renderpass. Therefore we need to
2173 * flush CCU color into CCU depth here, just like with
2174 * vkCmdClearAttachments(). Note that because this only happens at the
2175 * beginning of a renderpass, and renderpass writes are considered
2176 * "incoherent", we shouldn't have to worry about syncing depth into color
2177 * beforehand as depth should already be flushed.
2178 */
2179 if (vk_format_is_depth_or_stencil(attachment->format)) {
2180 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2181 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2182 } else {
2183 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2184 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2185 }
2186 }
2187
2188 void
2189 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2190 struct tu_cs *cs,
2191 uint32_t a,
2192 const VkRenderPassBeginInfo *info)
2193 {
2194 const struct tu_render_pass_attachment *attachment =
2195 &cmd->state.pass->attachments[a];
2196
2197 if (!attachment->clear_mask)
2198 return;
2199
2200 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2201
2202 tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
2203 &info->pClearValues[a]);
2204 }
2205
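/* Emit a GMEM load (resolve == false) or GMEM store/resolve
 * (resolve == true) of a single attachment via the CP_EVENT_WRITE::BLIT
 * path.
 */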
2206 static void
2207 tu_emit_blit(struct tu_cmd_buffer *cmd,
2208 struct tu_cs *cs,
2209 const struct tu_image_view *iview,
2210 const struct tu_render_pass_attachment *attachment,
2211 bool resolve)
2212 {
2213 tu_cs_emit_regs(cs,
2214 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2215
2216 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2217 .unk0 = !resolve,
2218 .gmem = !resolve,
2219 /* "integer" bit disables msaa resolve averaging */
2220 .integer = vk_format_is_int(attachment->format)));
2221
2222 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2223 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2224 tu_cs_image_ref_2d(cs, iview, 0, false);
2225
2226 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2227 tu_cs_image_flag_ref(cs, iview, 0);
2228
2229 tu_cs_emit_regs(cs,
2230 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2231
2232 tu6_emit_event_write(cmd, cs, BLIT);
2233 }
2234
2235 static bool
2236 blit_can_resolve(VkFormat format)
2237 {
2238 const struct util_format_description *desc = vk_format_description(format);
2239
2240 /* blit event can only do resolve for simple cases:
2241 * averaging samples as unsigned integers or choosing only one sample
2242 */
2243 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2244 return false;
2245
2246 /* can't do formats with larger channel sizes
2247 * note: this includes all float formats
2248 * note2: single channel integer formats seem OK
2249 */
2250 if (desc->channel[0].size > 10)
2251 return false;
2252
2253 switch (format) {
2254 /* for unknown reasons blit event can't msaa resolve these formats when tiled
2255 * likely related to these formats having different layout from other cpp=2 formats
2256 */
2257 case VK_FORMAT_R8G8_UNORM:
2258 case VK_FORMAT_R8G8_UINT:
2259 case VK_FORMAT_R8G8_SINT:
2260 /* TODO: this one should be able to work? */
2261 case VK_FORMAT_D24_UNORM_S8_UINT:
2262 return false;
2263 default:
2264 break;
2265 }
2266
2267 return true;
2268 }
2269
2270 void
2271 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2272 struct tu_cs *cs,
2273 uint32_t a,
2274 bool force_load)
2275 {
2276 const struct tu_image_view *iview =
2277 cmd->state.framebuffer->attachments[a].attachment;
2278 const struct tu_render_pass_attachment *attachment =
2279 &cmd->state.pass->attachments[a];
2280
2281 if (attachment->load || force_load)
2282 tu_emit_blit(cmd, cs, iview, attachment, false);
2283 }
2284
2285 void
2286 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2287 struct tu_cs *cs,
2288 uint32_t a,
2289 uint32_t gmem_a)
2290 {
2291 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2292 const VkRect2D *render_area = &cmd->state.render_area;
2293 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2294 struct tu_image_view *iview = fb->attachments[a].attachment;
2295 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2296
2297 if (!dst->store)
2298 return;
2299
2300 uint32_t x1 = render_area->offset.x;
2301 uint32_t y1 = render_area->offset.y;
2302 uint32_t x2 = x1 + render_area->extent.width;
2303 uint32_t y2 = y1 + render_area->extent.height;
2304 /* x2/y2 can be unaligned if they equal the size of the image, since the
2305 * store will then just write into padding space. The one exception is
2306 * linear levels, which don't have the required y padding in the layout
2307 * (except for the last level).
2308 */
2309 bool need_y2_align =
2310 y2 != iview->extent.height || iview->need_y2_align;
2311
2312 bool unaligned =
2313 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2314 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2315
2316 /* use fast path when render area is aligned, except for unsupported resolve cases */
2317 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2318 tu_emit_blit(cmd, cs, iview, src, true);
2319 return;
2320 }
2321
2322 if (dst->samples > 1) {
2323 /* The shader (r3d) path would presumably be needed in this case;
2324 * we still need a testcase which fails because of this.
2325 */
2326 tu_finishme("unaligned store of msaa attachment\n");
2327 return;
2328 }
2329
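/* Slow path: source the attachment data directly from GMEM with the 2D
 * blitter, so unaligned render areas and resolves that blit_can_resolve()
 * rejects can still be stored.
 */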
2330 r2d_setup_common(cmd, cs, dst->format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, true);
2331 r2d_dst(cs, iview, 0);
2332 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2333
2334 tu_cs_emit_regs(cs,
2335 A6XX_SP_PS_2D_SRC_INFO(
2336 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2337 .tile_mode = TILE6_2,
2338 .srgb = vk_format_is_srgb(src->format),
2339 .samples = tu_msaa_samples(src->samples),
2340 .samples_average = !vk_format_is_int(src->format),
2341 .unk20 = 1,
2342 .unk22 = 1),
2343 /* note: src size does not matter when not scaling */
2344 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2345 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2346 A6XX_SP_PS_2D_SRC_HI(),
2347 A6XX_SP_PS_2D_SRC_PITCH(.pitch = fb->tile0.width * src->cpp));
2348
2349 /* sync GMEM writes with CACHE. */
2350 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2351
2352 /* Wait for CACHE_INVALIDATE to land */
2353 tu_cs_emit_wfi(cs);
2354
2355 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2356 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2357
2358 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2359 * sysmem, and we generally assume that GMEM renderpasses leave their
2360 * results in sysmem, so we need to flush manually here.
2361 */
2362 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2363 }