src/freedreno/vulkan/tu_clear_blit.c
1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_private.h"
10
11 #include "tu_cs.h"
12 #include "vk_format.h"
13
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/u_half.h"
18
19 static uint32_t
20 tu_pack_float32_for_unorm(float val, int bits)
21 {
22 return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
23 }
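/* Example: packing 0.5f into 8 bits gives
 * _mesa_lroundevenf(0.5f * 255.0f) = _mesa_lroundevenf(127.5f) = 128,
 * since the tie is rounded to the nearest even integer.
 */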
24
25 /* r2d_ = BLIT_OP_SCALE operations */
26
27 static enum a6xx_2d_ifmt
28 format_to_ifmt(enum a6xx_format fmt)
29 {
30 switch (fmt) {
31 case FMT6_A8_UNORM:
32 case FMT6_8_UNORM:
33 case FMT6_8_SNORM:
34 case FMT6_8_8_UNORM:
35 case FMT6_8_8_SNORM:
36 case FMT6_8_8_8_8_UNORM:
37 case FMT6_8_8_8_X8_UNORM:
38 case FMT6_8_8_8_8_SNORM:
39 case FMT6_4_4_4_4_UNORM:
40 case FMT6_5_5_5_1_UNORM:
41 case FMT6_5_6_5_UNORM:
42 case FMT6_Z24_UNORM_S8_UINT:
43 case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8:
44 return R2D_UNORM8;
45
46 case FMT6_32_UINT:
47 case FMT6_32_SINT:
48 case FMT6_32_32_UINT:
49 case FMT6_32_32_SINT:
50 case FMT6_32_32_32_32_UINT:
51 case FMT6_32_32_32_32_SINT:
52 return R2D_INT32;
53
54 case FMT6_16_UINT:
55 case FMT6_16_SINT:
56 case FMT6_16_16_UINT:
57 case FMT6_16_16_SINT:
58 case FMT6_16_16_16_16_UINT:
59 case FMT6_16_16_16_16_SINT:
60 case FMT6_10_10_10_2_UINT:
61 return R2D_INT16;
62
63 case FMT6_8_UINT:
64 case FMT6_8_SINT:
65 case FMT6_8_8_UINT:
66 case FMT6_8_8_SINT:
67 case FMT6_8_8_8_8_UINT:
68 case FMT6_8_8_8_8_SINT:
69 return R2D_INT8;
70
71 case FMT6_16_UNORM:
72 case FMT6_16_SNORM:
73 case FMT6_16_16_UNORM:
74 case FMT6_16_16_SNORM:
75 case FMT6_16_16_16_16_UNORM:
76 case FMT6_16_16_16_16_SNORM:
77 case FMT6_32_FLOAT:
78 case FMT6_32_32_FLOAT:
79 case FMT6_32_32_32_32_FLOAT:
80 return R2D_FLOAT32;
81
82 case FMT6_16_FLOAT:
83 case FMT6_16_16_FLOAT:
84 case FMT6_16_16_16_16_FLOAT:
85 case FMT6_11_11_10_FLOAT:
86 case FMT6_10_10_10_2_UNORM:
87 case FMT6_10_10_10_2_UNORM_DEST:
88 return R2D_FLOAT16;
89
90 default:
91 unreachable("bad format");
92 return 0;
93 }
94 }
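/* The intermediate format chosen here determines how the solid-fill clear
 * color has to be encoded: r2d_clear_value() below packs UNORM8 components
 * as 8-bit (s)norm integers, FLOAT16 as half-float bits, and passes the
 * FLOAT32/INT32/INT16/INT8 cases through as raw 32-bit values.
 */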
95
96 static void
97 r2d_coords(struct tu_cs *cs,
98 const VkOffset2D *dst,
99 const VkOffset2D *src,
100 const VkExtent2D *extent)
101 {
102 tu_cs_emit_regs(cs,
103 A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
104 A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
105
106 if (!src)
107 return;
108
109 tu_cs_emit_regs(cs,
110 A6XX_GRAS_2D_SRC_TL_X(src->x),
111 A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
112 A6XX_GRAS_2D_SRC_TL_Y(src->y),
113 A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
114 }
115
116 static void
117 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
118 {
119 uint32_t clear_value[4] = {};
120
121 switch (format) {
122 case VK_FORMAT_X8_D24_UNORM_PACK32:
123 case VK_FORMAT_D24_UNORM_S8_UINT:
124 /* cleared as r8g8b8a8_unorm using special format */
125 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
126 clear_value[1] = clear_value[0] >> 8;
127 clear_value[2] = clear_value[0] >> 16;
128 clear_value[3] = val->depthStencil.stencil;
129 break;
130 case VK_FORMAT_D16_UNORM:
131 case VK_FORMAT_D32_SFLOAT:
132 /* R2D_FLOAT32 */
133 clear_value[0] = fui(val->depthStencil.depth);
134 break;
135 case VK_FORMAT_S8_UINT:
136 clear_value[0] = val->depthStencil.stencil;
137 break;
138 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
139 /* cleared as UINT32 */
140 clear_value[0] = float3_to_rgb9e5(val->color.float32);
141 break;
142 default:
143 assert(!vk_format_is_depth_or_stencil(format));
144 const struct util_format_description *desc = vk_format_description(format);
145 enum a6xx_2d_ifmt ifmt = format_to_ifmt(tu6_base_format(format));
146
147 assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
148 format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
149
150 for (unsigned i = 0; i < desc->nr_channels; i++) {
151 const struct util_format_channel_description *ch = &desc->channel[i];
152 if (ifmt == R2D_UNORM8) {
153 float linear = val->color.float32[i];
154 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
155 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
156
157 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
158 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
159 else
160 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
161 } else if (ifmt == R2D_FLOAT16) {
162 clear_value[i] = util_float_to_half(val->color.float32[i]);
163 } else {
164 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
165 ifmt == R2D_INT16 || ifmt == R2D_INT8);
166 clear_value[i] = val->color.uint32[i];
167 }
168 }
169 break;
170 }
171
172 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
173 tu_cs_emit_array(cs, clear_value, 4);
174 }
175
176 static void
177 r2d_src(struct tu_cmd_buffer *cmd,
178 struct tu_cs *cs,
179 const struct tu_image_view *iview,
180 uint32_t layer,
181 VkFilter filter)
182 {
183 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
184 if (filter != VK_FILTER_NEAREST)
185 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
186
187 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
188 tu_cs_emit(cs, src_info);
189 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
190 tu_cs_image_ref_2d(cs, iview, layer, true);
191
192 tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
193 tu_cs_image_flag_ref(cs, iview, layer);
194 }
195
196 static void
197 r2d_src_buffer(struct tu_cmd_buffer *cmd,
198 struct tu_cs *cs,
199 VkFormat vk_format,
200 uint64_t va, uint32_t pitch,
201 uint32_t width, uint32_t height)
202 {
203 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
204
205 tu_cs_emit_regs(cs,
206 A6XX_SP_PS_2D_SRC_INFO(
207 .color_format = format.fmt,
208 .color_swap = format.swap,
209 .srgb = vk_format_is_srgb(vk_format),
210 .unk20 = 1,
211 .unk22 = 1),
212 A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
213 A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
214 A6XX_SP_PS_2D_SRC_HI(va >> 32),
215 A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
216 }
217
218 static void
219 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
220 {
221 assert(iview->image->samples == 1);
222
223 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
224 tu_cs_emit(cs, iview->RB_2D_DST_INFO);
225 tu_cs_image_ref_2d(cs, iview, layer, false);
226
227 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
228 tu_cs_image_flag_ref(cs, iview, layer);
229 }
230
231 static void
232 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
233 {
234 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
235
236 tu_cs_emit_regs(cs,
237 A6XX_RB_2D_DST_INFO(
238 .color_format = format.fmt,
239 .color_swap = format.swap,
240 .srgb = vk_format_is_srgb(vk_format)),
241 A6XX_RB_2D_DST_LO((uint32_t) va),
242 A6XX_RB_2D_DST_HI(va >> 32),
243 A6XX_RB_2D_DST_PITCH(pitch));
244 }
245
246 static void
247 r2d_setup_common(struct tu_cmd_buffer *cmd,
248 struct tu_cs *cs,
249 VkFormat vk_format,
250 VkImageAspectFlags aspect_mask,
251 enum a6xx_rotation rotation,
252 bool clear,
253 bool scissor)
254 {
255 enum a6xx_format format = tu6_base_format(vk_format);
256 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
257 uint32_t unknown_8c01 = 0;
258
259 /* note: the only format with partial clearing is D24S8 */
260 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
261 /* preserve stencil channel */
262 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
263 unknown_8c01 = 0x08000041;
264 /* preserve depth channels */
265 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
266 unknown_8c01 = 0x00084001;
267 }
268
269 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
270 tu_cs_emit(cs, unknown_8c01);
271
272 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
273 .scissor = scissor,
274 .rotate = rotation,
275 .solid_color = clear,
276 .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
277 .color_format = format,
278 .mask = 0xf,
279 .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
280 ).value;
281
282 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
283 tu_cs_emit(cs, blit_cntl);
284
285 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
286 tu_cs_emit(cs, blit_cntl);
287
288 if (format == FMT6_10_10_10_2_UNORM_DEST)
289 format = FMT6_16_16_16_16_FLOAT;
290
291 tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
292 .sint = vk_format_is_sint(vk_format),
293 .uint = vk_format_is_uint(vk_format),
294 .color_format = format,
295 .srgb = vk_format_is_srgb(vk_format),
296 .mask = 0xf));
297 }
298
299 static void
300 r2d_setup(struct tu_cmd_buffer *cmd,
301 struct tu_cs *cs,
302 VkFormat vk_format,
303 VkImageAspectFlags aspect_mask,
304 enum a6xx_rotation rotation,
305 bool clear)
306 {
307 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
308
309 r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, false);
310 }
311
312 static void
313 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
314 {
315 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
316 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
317 }
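/* CP_BLIT with BLIT_OP_SCALE kicks off a single 2D blit using the
 * GRAS_2D_* / RB_2D_* / SP_PS_2D_* state emitted by the r2d_ helpers above.
 */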
318
319 /* r3d_ = shader path operations */
320
321 void
322 tu_init_clear_blit_shaders(struct tu6_global *global)
323 {
324 #define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
325 #define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
326 #define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
327
328 static const instr_t vs_code[] = {
329 /* r0.xyz = r0.w ? c1.xyz : c0.xyz
330 * r1.xy = r0.w ? c1.zw : c0.zw
331 * r0.w = 1.0f
332 */
333 CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
334 .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
335 .src2 = 3,
336 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
337 CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
338 .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
339 .src2 = 3,
340 .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
341 MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
342 { .cat0 = { .opc = OPC_END } },
343 };
344
345 static const instr_t fs_blit[] = {
346 /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
347 * blit path (it's not clear what allows it to omit it)
348 */
349 CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
350 { .cat0 = { .opc = OPC_END } },
351 };
352
353 memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
354 memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));
355
356 for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
357 instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
358 for (uint32_t i = 0; i < num_rts; i++) {
359 /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
360 *code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
361 }
362 *code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
363 }
364 }
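/* Summary of the hand-assembled shaders above: the VS reads the vertex ID in
 * r0.w and uses SEL_B32 to pick between constants c0 and c1, so vertex 0 gets
 * the first coordinate record and vertex 1 the second, which is enough for a
 * two-vertex RECTLIST. The clear FS variants just copy num_rts constant vec4s
 * to the color outputs. The blit FS contains only the bary.f instruction; the
 * texture fetch itself is presumably covered by the sampler prefetch set up
 * in r3d_common() below.
 */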
365
366 static void
367 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
368 bool layered_clear)
369 {
370 struct ir3_const_state dummy_const_state = {};
371 struct ir3_shader dummy_shader = {};
372
373 struct ir3_shader_variant vs = {
374 .type = MESA_SHADER_VERTEX,
375 .instrlen = 1,
376 .constlen = 4,
377 .info.max_reg = 1,
378 .inputs_count = 1,
379 .inputs[0] = {
380 .slot = SYSTEM_VALUE_VERTEX_ID,
381 .regid = regid(0, 3),
382 .sysval = true,
383 },
384 .outputs_count = blit ? 2 : 1,
385 .outputs[0] = {
386 .slot = VARYING_SLOT_POS,
387 .regid = regid(0, 0),
388 },
389 .outputs[1] = {
390 .slot = VARYING_SLOT_VAR0,
391 .regid = regid(1, 0),
392 },
393 .shader = &dummy_shader,
394 .const_state = &dummy_const_state,
395 };
396 if (layered_clear) {
397 vs.outputs[1].slot = VARYING_SLOT_LAYER;
398 vs.outputs[1].regid = regid(1, 1);
399 vs.outputs_count = 2;
400 }
401
402 struct ir3_shader_variant fs = {
403 .type = MESA_SHADER_FRAGMENT,
404 .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
405 .constlen = align(num_rts, 4),
406 .info.max_reg = MAX2(num_rts, 1) - 1,
407 .total_in = blit ? 2 : 0,
408 .num_samp = blit ? 1 : 0,
409 .inputs_count = blit ? 2 : 0,
410 .inputs[0] = {
411 .slot = VARYING_SLOT_VAR0,
412 .inloc = 0,
413 .compmask = 3,
414 .bary = true,
415 },
416 .inputs[1] = {
417 .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
418 .regid = regid(0, 0),
419 .sysval = 1,
420 },
421 .num_sampler_prefetch = blit ? 1 : 0,
422 .sampler_prefetch[0] = {
423 .src = 0,
424 .wrmask = 0xf,
425 .cmd = 4,
426 },
427 .shader = &dummy_shader,
428 .const_state = &dummy_const_state,
429 };
430
431 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
432 .vs_state = true,
433 .hs_state = true,
434 .ds_state = true,
435 .gs_state = true,
436 .fs_state = true,
437 .cs_state = true,
438 .gfx_ibo = true,
439 .cs_ibo = true,
440 .gfx_shared_const = true,
441 .gfx_bindless = 0x1f,
442 .cs_bindless = 0x1f));
443
444 tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, global_iova(cmd, shaders[GLOBAL_SH_VS]));
445 tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
446 tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
447 tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0);
448 tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs,
449 global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)]));
450
451 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
452 tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
453
454 tu6_emit_vpc(cs, &vs, NULL, NULL, NULL, &fs, 0, false);
455
456 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
457 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
458 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
459
460 tu6_emit_fs_inputs(cs, &fs);
461
462 tu_cs_emit_regs(cs,
463 A6XX_GRAS_CL_CNTL(
464 .persp_division_disable = 1,
465 .vp_xform_disable = 1,
466 .vp_clip_code_ignore = 1,
467 .clip_disable = 1));
468 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
469
470 tu_cs_emit_regs(cs,
471 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
472 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
473 tu_cs_emit_regs(cs,
474 A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
475 A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
476
477 tu_cs_emit_regs(cs,
478 A6XX_VFD_INDEX_OFFSET(),
479 A6XX_VFD_INSTANCE_START_OFFSET());
480 }
481
482 static void
483 r3d_coords_raw(struct tu_cs *cs, const float *coords)
484 {
485 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
486 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
487 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
488 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
489 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
490 CP_LOAD_STATE6_0_NUM_UNIT(2));
491 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
492 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
493 tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
494 }
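/* NUM_UNIT(2) uploads two vec4 constants: c0 = (dst.x, dst.y, src.x, src.y)
 * for the first rectlist corner and c1 the same record for the opposite
 * corner, which is exactly what the SEL_B32 instructions in the built-in VS
 * select between.
 */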
495
496 static void
497 r3d_coords(struct tu_cs *cs,
498 const VkOffset2D *dst,
499 const VkOffset2D *src,
500 const VkExtent2D *extent)
501 {
502 int32_t src_x1 = src ? src->x : 0;
503 int32_t src_y1 = src ? src->y : 0;
504 r3d_coords_raw(cs, (float[]) {
505 dst->x, dst->y,
506 src_x1, src_y1,
507 dst->x + extent->width, dst->y + extent->height,
508 src_x1 + extent->width, src_y1 + extent->height,
509 });
510 }
511
512 static void
513 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
514 {
515 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
516 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
517 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
518 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
519 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
520 CP_LOAD_STATE6_0_NUM_UNIT(1));
521 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
522 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
523 switch (format) {
524 case VK_FORMAT_X8_D24_UNORM_PACK32:
525 case VK_FORMAT_D24_UNORM_S8_UINT: {
526 /* cleared as r8g8b8a8_unorm using special format */
527 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
528 tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
529 tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
530 tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
531 tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
532 } break;
533 case VK_FORMAT_D16_UNORM:
534 case VK_FORMAT_D32_SFLOAT:
535 tu_cs_emit(cs, fui(val->depthStencil.depth));
536 tu_cs_emit(cs, 0);
537 tu_cs_emit(cs, 0);
538 tu_cs_emit(cs, 0);
539 break;
540 case VK_FORMAT_S8_UINT:
541 tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
542 tu_cs_emit(cs, 0);
543 tu_cs_emit(cs, 0);
544 tu_cs_emit(cs, 0);
545 break;
546 default:
547 /* as color formats use clear value as-is */
548 assert(!vk_format_is_depth_or_stencil(format));
549 tu_cs_emit_array(cs, val->color.uint32, 4);
550 break;
551 }
552 }
553
554 static void
555 r3d_src_common(struct tu_cmd_buffer *cmd,
556 struct tu_cs *cs,
557 const uint32_t *tex_const,
558 uint32_t offset_base,
559 uint32_t offset_ubwc,
560 VkFilter filter)
561 {
562 struct tu_cs_memory texture = { };
563 VkResult result = tu_cs_alloc(&cmd->sub_cs,
564 2, /* allocate space for a sampler too */
565 A6XX_TEX_CONST_DWORDS, &texture);
566 assert(result == VK_SUCCESS);
567
568 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
569
570 /* patch addresses for layer offset */
571 *(uint64_t*) (texture.map + 4) += offset_base;
572 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
573 texture.map[7] = ubwc_addr;
574 texture.map[8] = ubwc_addr >> 32;
575
576 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
577 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
578 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
579 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
580 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
581 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
582 0x60000; /* XXX used by blob, doesn't seem necessary */
583 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
584 0x1 | /* XXX used by blob, doesn't seem necessary */
585 A6XX_TEX_SAMP_1_UNNORM_COORDS |
586 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
587 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
588 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
589
590 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
591 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
592 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
593 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
594 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
595 CP_LOAD_STATE6_0_NUM_UNIT(1));
596 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
597
598 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
599 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
600
601 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
602 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
603 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
604 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
605 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
606 CP_LOAD_STATE6_0_NUM_UNIT(1));
607 tu_cs_emit_qw(cs, texture.iova);
608
609 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
610 tu_cs_emit_qw(cs, texture.iova);
611
612 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
613 }
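/* tu_cs_alloc() above reserves two A6XX_TEX_CONST_DWORDS-sized slots back to
 * back: the first holds the (patched) texture descriptor and the second the
 * sampler, which is why the sampler pointers use
 * texture.iova + A6XX_TEX_CONST_DWORDS * 4.
 */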
614
615 static void
616 r3d_src(struct tu_cmd_buffer *cmd,
617 struct tu_cs *cs,
618 const struct tu_image_view *iview,
619 uint32_t layer,
620 VkFilter filter)
621 {
622 r3d_src_common(cmd, cs, iview->descriptor,
623 iview->layer_size * layer,
624 iview->ubwc_layer_size * layer,
625 filter);
626 }
627
628 static void
629 r3d_src_buffer(struct tu_cmd_buffer *cmd,
630 struct tu_cs *cs,
631 VkFormat vk_format,
632 uint64_t va, uint32_t pitch,
633 uint32_t width, uint32_t height)
634 {
635 uint32_t desc[A6XX_TEX_CONST_DWORDS];
636
637 struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
638
639 desc[0] =
640 COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
641 A6XX_TEX_CONST_0_FMT(format.fmt) |
642 A6XX_TEX_CONST_0_SWAP(format.swap) |
643 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
644 // XXX to swizzle into .w for stencil buffer_to_image
645 A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
646 A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
647 A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
648 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
649 desc[2] =
650 A6XX_TEX_CONST_2_PITCH(pitch) |
651 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
652 desc[3] = 0;
653 desc[4] = va;
654 desc[5] = va >> 32;
655 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
656 desc[i] = 0;
657
658 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
659 }
660
661 static void
662 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
663 {
664 tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
665
666 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
667 tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
668 tu_cs_image_ref(cs, iview, layer);
669 tu_cs_emit(cs, 0);
670
671 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
672 tu_cs_image_flag_ref(cs, iview, layer);
673
674 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
675 }
676
677 static void
678 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
679 {
680 struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
681
682 tu6_emit_msaa(cs, 1); /* TODO: move to setup */
683
684 tu_cs_emit_regs(cs,
685 A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
686 A6XX_RB_MRT_PITCH(0, pitch),
687 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
688 A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
689 A6XX_RB_MRT_BASE_HI(0, va >> 32),
690 A6XX_RB_MRT_BASE_GMEM(0, 0));
691
692 tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
693 }
694
695 static uint8_t
696 aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
697 {
698 uint8_t mask = 0xf;
699 assert(aspect_mask);
700 /* note: the only format with partial writing is D24S8,
701 * clear/blit uses the _AS_R8G8B8A8 format to access it
702 */
703 if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
704 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
705 mask = 0x7;
706 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
707 mask = 0x8;
708 }
709 return mask;
710 }
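/* With the Z24S8-as-R8G8B8A8 view used by clears/blits, the depth bits land
 * in .rgb and stencil in .a, hence mask 0x7 for depth-only writes and 0x8 for
 * stencil-only writes.
 */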
711
712 static void
713 r3d_setup(struct tu_cmd_buffer *cmd,
714 struct tu_cs *cs,
715 VkFormat vk_format,
716 VkImageAspectFlags aspect_mask,
717 enum a6xx_rotation rotation,
718 bool clear)
719 {
720 if (!cmd->state.pass) {
721 tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
722 tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
723 }
724
725 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
726 tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
727
728 r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
729
730 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
731 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
732 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
733 0xfc000000);
734 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
735
736 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
737 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
738
739 tu_cs_emit_regs(cs,
740 A6XX_RB_FS_OUTPUT_CNTL0(),
741 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
742
743 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
744 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
745 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
746
747 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
748 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
749 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
750 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
751 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
752 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
753 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
754
755 tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
756 tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
757
758 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
759 .color_format = tu6_base_format(vk_format),
760 .color_sint = vk_format_is_sint(vk_format),
761 .color_uint = vk_format_is_uint(vk_format)));
762
763 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
764 .component_enable = aspect_write_mask(vk_format, aspect_mask)));
765 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
766 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
767 }
768
769 static void
770 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
771 {
772 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
773 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
774 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
775 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
776 tu_cs_emit(cs, 1); /* instance count */
777 tu_cs_emit(cs, 2); /* vertex count */
778 }
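/* The 3D path draws a two-vertex RECTLIST with auto-generated indices; the
 * two vertices are the opposite corners produced by the built-in VS.
 */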
779
780 /* blit ops - common interface for 2d/shader paths */
781
782 struct blit_ops {
783 void (*coords)(struct tu_cs *cs,
784 const VkOffset2D *dst,
785 const VkOffset2D *src,
786 const VkExtent2D *extent);
787 void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
788 void (*src)(
789 struct tu_cmd_buffer *cmd,
790 struct tu_cs *cs,
791 const struct tu_image_view *iview,
792 uint32_t layer,
793 VkFilter filter);
794 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
795 VkFormat vk_format,
796 uint64_t va, uint32_t pitch,
797 uint32_t width, uint32_t height);
798 void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
799 void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
800 void (*setup)(struct tu_cmd_buffer *cmd,
801 struct tu_cs *cs,
802 VkFormat vk_format,
803 VkImageAspectFlags aspect_mask,
804 enum a6xx_rotation rotation,
805 bool clear);
806 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
807 };
808
809 static const struct blit_ops r2d_ops = {
810 .coords = r2d_coords,
811 .clear_value = r2d_clear_value,
812 .src = r2d_src,
813 .src_buffer = r2d_src_buffer,
814 .dst = r2d_dst,
815 .dst_buffer = r2d_dst_buffer,
816 .setup = r2d_setup,
817 .run = r2d_run,
818 };
819
820 static const struct blit_ops r3d_ops = {
821 .coords = r3d_coords,
822 .clear_value = r3d_clear_value,
823 .src = r3d_src,
824 .src_buffer = r3d_src_buffer,
825 .dst = r3d_dst,
826 .dst_buffer = r3d_dst_buffer,
827 .setup = r3d_setup,
828 .run = r3d_run,
829 };
830
831 /* passthrough: set coords from 3D offsets and extents */
832 static void
833 coords(const struct blit_ops *ops,
834 struct tu_cs *cs,
835 const VkOffset3D *dst,
836 const VkOffset3D *src,
837 const VkExtent3D *extent)
838 {
839 ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
840 }
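/* The casts are valid because VkOffset3D starts with (x, y) and VkExtent3D
 * with (width, height); the z/depth members are simply ignored.
 */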
841
842 static VkFormat
843 copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
844 {
845 if (vk_format_is_compressed(format)) {
846 switch (vk_format_get_blocksize(format)) {
847 case 1: return VK_FORMAT_R8_UINT;
848 case 2: return VK_FORMAT_R16_UINT;
849 case 4: return VK_FORMAT_R32_UINT;
850 case 8: return VK_FORMAT_R32G32_UINT;
851 case 16:return VK_FORMAT_R32G32B32A32_UINT;
852 default:
853 unreachable("unhandled format size");
854 }
855 }
856
857 switch (format) {
858 case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
859 if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
860 return VK_FORMAT_R8G8_UNORM;
861 /* fallthrough */
862 case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
863 return VK_FORMAT_R8_UNORM;
864 case VK_FORMAT_D24_UNORM_S8_UINT:
865 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
866 return VK_FORMAT_R8_UNORM;
867 /* fallthrough */
868 default:
869 return format;
870 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
871 return VK_FORMAT_R32_UINT;
872 }
873 }
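/* Compressed images are copied through an uncompressed format with the same
 * block size, e.g. BC1 (8-byte blocks) is copied as VK_FORMAT_R32G32_UINT;
 * copy_compressed() below converts offsets and extents to block units to
 * match.
 */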
874
875 static void
876 tu_image_view_copy_blit(struct tu_image_view *iview,
877 struct tu_image *image,
878 VkFormat format,
879 const VkImageSubresourceLayers *subres,
880 uint32_t layer,
881 bool stencil_read)
882 {
883 VkImageAspectFlags aspect_mask = subres->aspectMask;
884
885 /* always use the AS_R8G8B8A8 format for these */
886 if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
887 format == VK_FORMAT_X8_D24_UNORM_PACK32) {
888 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
889 }
890
891 tu_image_view_init(iview, &(VkImageViewCreateInfo) {
892 .image = tu_image_to_handle(image),
893 .viewType = VK_IMAGE_VIEW_TYPE_2D,
894 .format = format,
895 /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
896 .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
897 .subresourceRange = {
898 .aspectMask = aspect_mask,
899 .baseMipLevel = subres->mipLevel,
900 .levelCount = 1,
901 .baseArrayLayer = subres->baseArrayLayer + layer,
902 .layerCount = 1,
903 },
904 });
905 }
906
907 static void
908 tu_image_view_copy(struct tu_image_view *iview,
909 struct tu_image *image,
910 VkFormat format,
911 const VkImageSubresourceLayers *subres,
912 uint32_t layer,
913 bool stencil_read)
914 {
915 format = copy_format(format, subres->aspectMask, false);
916 tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read);
917 }
918
919 static void
920 tu_image_view_blit(struct tu_image_view *iview,
921 struct tu_image *image,
922 const VkImageSubresourceLayers *subres,
923 uint32_t layer)
924 {
925 tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false);
926 }
927
928 static void
929 tu6_blit_image(struct tu_cmd_buffer *cmd,
930 struct tu_image *src_image,
931 struct tu_image *dst_image,
932 const VkImageBlit *info,
933 VkFilter filter)
934 {
935 const struct blit_ops *ops = &r2d_ops;
936 struct tu_cs *cs = &cmd->cs;
937 uint32_t layers;
938
939 /* the 2D blit path can't mirror just by swapping coordinates; mirroring has to go through the rotate field */
940 static const enum a6xx_rotation rotate[2][2] = {
941 {ROTATE_0, ROTATE_HFLIP},
942 {ROTATE_VFLIP, ROTATE_180},
943 };
944
945 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
946 (info->dstOffsets[1].x < info->dstOffsets[0].x);
947 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
948 (info->dstOffsets[1].y < info->dstOffsets[0].y);
949 bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
950 (info->dstOffsets[1].z < info->dstOffsets[0].z);
951
952 if (mirror_z) {
953 tu_finishme("blit z mirror\n");
954 return;
955 }
956
957 if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
958 info->dstOffsets[1].z - info->dstOffsets[0].z) {
959 tu_finishme("blit z filter\n");
960 return;
961 }
962
963 layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
964 if (info->dstSubresource.layerCount > 1) {
965 assert(layers <= 1);
966 layers = info->dstSubresource.layerCount;
967 }
968
969 /* BC1_RGB_* formats need to have their last component overridden with 1
970 * when sampling, which is normally handled with the texture descriptor
971 * swizzle. The 2d path can't handle that, so use the 3d path.
972 *
973 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
974 * the 2d path.
975 */
976
977 if (dst_image->samples > 1 ||
978 src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
979 src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
980 filter == VK_FILTER_CUBIC_EXT)
981 ops = &r3d_ops;
982
983 /* TODO: shader path fails some of blit_image.all_formats.generate_mipmaps.* tests,
984 * figure out why (should be able to pass all tests with only shader path)
985 */
986
987 ops->setup(cmd, cs, dst_image->vk_format, info->dstSubresource.aspectMask,
988 rotate[mirror_y][mirror_x], false);
989
990 if (ops == &r3d_ops) {
991 r3d_coords_raw(cs, (float[]) {
992 info->dstOffsets[0].x, info->dstOffsets[0].y,
993 info->srcOffsets[0].x, info->srcOffsets[0].y,
994 info->dstOffsets[1].x, info->dstOffsets[1].y,
995 info->srcOffsets[1].x, info->srcOffsets[1].y
996 });
997 } else {
998 tu_cs_emit_regs(cs,
999 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1000 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1001 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1002 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1003 tu_cs_emit_regs(cs,
1004 A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1005 A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1006 A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1007 A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1008 }
1009
1010 struct tu_image_view dst, src;
1011 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1012 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1013
1014 for (uint32_t i = 0; i < layers; i++) {
1015 ops->dst(cs, &dst, i);
1016 ops->src(cmd, cs, &src, i, filter);
1017 ops->run(cmd, cs);
1018 }
1019 }
1020
1021 void
1022 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1023 VkImage srcImage,
1024 VkImageLayout srcImageLayout,
1025 VkImage dstImage,
1026 VkImageLayout dstImageLayout,
1027 uint32_t regionCount,
1028 const VkImageBlit *pRegions,
1029 VkFilter filter)
1030
1031 {
1032 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1033 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1034 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1035
1036 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1037 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1038
1039 for (uint32_t i = 0; i < regionCount; ++i)
1040 tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1041 }
1042
1043 static void
1044 copy_compressed(VkFormat format,
1045 VkOffset3D *offset,
1046 VkExtent3D *extent,
1047 uint32_t *width,
1048 uint32_t *height)
1049 {
1050 if (!vk_format_is_compressed(format))
1051 return;
1052
1053 uint32_t block_width = vk_format_get_blockwidth(format);
1054 uint32_t block_height = vk_format_get_blockheight(format);
1055
1056 offset->x /= block_width;
1057 offset->y /= block_height;
1058
1059 if (extent) {
1060 extent->width = DIV_ROUND_UP(extent->width, block_width);
1061 extent->height = DIV_ROUND_UP(extent->height, block_height);
1062 }
1063 if (width)
1064 *width = DIV_ROUND_UP(*width, block_width);
1065 if (height)
1066 *height = DIV_ROUND_UP(*height, block_height);
1067 }
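/* Example: for BC1 (4x4 texel blocks), an imageOffset of (8, 8) with a 16x16
 * extent becomes offset (2, 2) and extent 4x4 in block units.
 */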
1068
1069 static void
1070 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1071 struct tu_buffer *src_buffer,
1072 struct tu_image *dst_image,
1073 const VkBufferImageCopy *info)
1074 {
1075 struct tu_cs *cs = &cmd->cs;
1076 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1077 VkFormat src_format =
1078 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
1079 const struct blit_ops *ops = &r2d_ops;
1080
1081 /* special case for buffer to stencil */
1082 if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1083 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1084 ops = &r3d_ops;
1085 }
1086
1087 /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
1088 * which matters for UBWC. buffer_to_image/etc can fail because of this
1089 */
1090
1091 VkOffset3D offset = info->imageOffset;
1092 VkExtent3D extent = info->imageExtent;
1093 uint32_t src_width = info->bufferRowLength ?: extent.width;
1094 uint32_t src_height = info->bufferImageHeight ?: extent.height;
1095
1096 copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);
1097
1098 uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1099 uint32_t layer_size = src_height * pitch;
1100
1101 ops->setup(cmd, cs,
1102 copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
1103 info->imageSubresource.aspectMask, ROTATE_0, false);
1104
1105 struct tu_image_view dst;
1106 tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);
1107
1108 for (uint32_t i = 0; i < layers; i++) {
1109 ops->dst(cs, &dst, i);
1110
1111 uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1112 if ((src_va & 63) || (pitch & 63)) {
1113 for (uint32_t y = 0; y < extent.height; y++) {
1114 uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1115 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1116 x + extent.width, 1);
1117 ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1118 &(VkExtent2D) {extent.width, 1});
1119 ops->run(cmd, cs);
1120 src_va += pitch;
1121 }
1122 } else {
1123 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1124 coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1125 ops->run(cmd, cs);
1126 }
1127 }
1128 }
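/* The unaligned path above works around what appears to be a 64-byte
 * alignment requirement on the blit source base address and pitch: the base
 * is aligned down to 64 bytes, the x coordinate is offset by the remainder
 * (in texels), and the copy is emitted one row at a time.
 * tu_copy_image_to_buffer() and copy_buffer() below use the same trick on the
 * destination side.
 */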
1129
1130 void
1131 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1132 VkBuffer srcBuffer,
1133 VkImage dstImage,
1134 VkImageLayout dstImageLayout,
1135 uint32_t regionCount,
1136 const VkBufferImageCopy *pRegions)
1137 {
1138 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1139 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1140 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1141
1142 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1143 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1144
1145 for (unsigned i = 0; i < regionCount; ++i)
1146 tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1147 }
1148
1149 static void
1150 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1151 struct tu_image *src_image,
1152 struct tu_buffer *dst_buffer,
1153 const VkBufferImageCopy *info)
1154 {
1155 struct tu_cs *cs = &cmd->cs;
1156 uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1157 VkFormat dst_format =
1158 copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
1159 bool stencil_read = false;
1160
1161 if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1162 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1163 stencil_read = true;
1164 }
1165
1166 const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1167 VkOffset3D offset = info->imageOffset;
1168 VkExtent3D extent = info->imageExtent;
1169 uint32_t dst_width = info->bufferRowLength ?: extent.width;
1170 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1171
1172 copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);
1173
1174 uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1175 uint32_t layer_size = pitch * dst_height;
1176
1177 ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1178
1179 struct tu_image_view src;
1180 tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);
1181
1182 for (uint32_t i = 0; i < layers; i++) {
1183 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1184
1185 uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1186 if ((dst_va & 63) || (pitch & 63)) {
1187 for (uint32_t y = 0; y < extent.height; y++) {
1188 uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1189 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1190 ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1191 &(VkExtent2D) {extent.width, 1});
1192 ops->run(cmd, cs);
1193 dst_va += pitch;
1194 }
1195 } else {
1196 ops->dst_buffer(cs, dst_format, dst_va, pitch);
1197 coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1198 ops->run(cmd, cs);
1199 }
1200 }
1201 }
1202
1203 void
1204 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1205 VkImage srcImage,
1206 VkImageLayout srcImageLayout,
1207 VkBuffer dstBuffer,
1208 uint32_t regionCount,
1209 const VkBufferImageCopy *pRegions)
1210 {
1211 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1212 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1213 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1214
1215 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1216 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1217
1218 for (unsigned i = 0; i < regionCount; ++i)
1219 tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1220 }
1221
1222 /* Tiled formats don't support swapping, which means that we can't support
1223 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1224 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1225 * Currently we fake support for tiled swapped formats and use the unswapped
1226 * format instead, but this means that reinterpreting copies to and from
1227 * swapped formats can't be performed correctly unless we can swizzle the
1228 * components by reinterpreting the other image as the "correct" swapped
1229 * format, i.e. only when the other image is linear.
1230 */
1231
1232 static bool
1233 is_swapped_format(VkFormat format)
1234 {
1235 struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1236 struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1237 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1238 }
1239
1240 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1241 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1242 * versa). This should mirror the logic in fdl6_layout.
1243 */
1244 static bool
1245 image_is_r8g8(struct tu_image *image)
1246 {
1247 return image->layout[0].cpp == 2 &&
1248 vk_format_get_nr_components(image->vk_format) == 2;
1249 }
1250
1251 static void
1252 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1253 struct tu_image *src_image,
1254 struct tu_image *dst_image,
1255 const VkImageCopy *info)
1256 {
1257 const struct blit_ops *ops = &r2d_ops;
1258 struct tu_cs *cs = &cmd->cs;
1259
1260 if (dst_image->samples > 1)
1261 ops = &r3d_ops;
1262
1263 VkFormat format = VK_FORMAT_UNDEFINED;
1264 VkOffset3D src_offset = info->srcOffset;
1265 VkOffset3D dst_offset = info->dstOffset;
1266 VkExtent3D extent = info->extent;
1267
1268 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1269 * Images":
1270 *
1271 * When copying between compressed and uncompressed formats the extent
1272 * members represent the texel dimensions of the source image and not
1273 * the destination. When copying from a compressed image to an
1274 * uncompressed image the image texel dimensions written to the
1275 * uncompressed image will be source extent divided by the compressed
1276 * texel block dimensions. When copying from an uncompressed image to a
1277 * compressed image the image texel dimensions written to the compressed
1278 * image will be the source extent multiplied by the compressed texel
1279 * block dimensions.
1280 *
1281 * This means we only have to adjust the extent if the source image is
1282 * compressed.
1283 */
1284 copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1285 copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1286
1287 VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
1288 VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
1289
1290 bool use_staging_blit = false;
1291
1292 if (src_format == dst_format) {
1293 /* Images that share a format can always be copied directly because it's
1294 * the same as a blit.
1295 */
1296 format = src_format;
1297 } else if (!src_image->layout[0].tile_mode) {
1298 /* If an image is linear, we can always safely reinterpret it with the
1299 * other image's format and then do a regular blit.
1300 */
1301 format = dst_format;
1302 } else if (!dst_image->layout[0].tile_mode) {
1303 format = src_format;
1304 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1305 /* We can't currently copy r8g8 images to/from other cpp=2 images,
1306 * due to the different tile layout.
1307 */
1308 use_staging_blit = true;
1309 } else if (is_swapped_format(src_format) ||
1310 is_swapped_format(dst_format)) {
1311 /* If either format has a non-identity swap, then we can't copy
1312 * to/from it.
1313 */
1314 use_staging_blit = true;
1315 } else if (!src_image->layout[0].ubwc) {
1316 format = dst_format;
1317 } else if (!dst_image->layout[0].ubwc) {
1318 format = src_format;
1319 } else {
1320 /* Both formats use UBWC and so neither can be reinterpreted.
1321 * TODO: We could do an in-place decompression of the dst instead.
1322 */
1323 use_staging_blit = true;
1324 }
1325
1326 struct tu_image_view dst, src;
1327
1328 if (use_staging_blit) {
1329 tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1330 tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1331
1332 struct tu_image staging_image = {
1333 .vk_format = src_format,
1334 .type = src_image->type,
1335 .tiling = VK_IMAGE_TILING_LINEAR,
1336 .extent = extent,
1337 .level_count = 1,
1338 .layer_count = info->srcSubresource.layerCount,
1339 .samples = src_image->samples,
1340 .bo_offset = 0,
1341 };
1342
1343 VkImageSubresourceLayers staging_subresource = {
1344 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1345 .mipLevel = 0,
1346 .baseArrayLayer = 0,
1347 .layerCount = info->srcSubresource.layerCount,
1348 };
1349
1350 VkOffset3D staging_offset = { 0 };
1351
1352 staging_image.layout[0].tile_mode = TILE6_LINEAR;
1353 staging_image.layout[0].ubwc = false;
1354
1355 fdl6_layout(&staging_image.layout[0],
1356 vk_format_to_pipe_format(staging_image.vk_format),
1357 staging_image.samples,
1358 staging_image.extent.width,
1359 staging_image.extent.height,
1360 staging_image.extent.depth,
1361 staging_image.level_count,
1362 staging_image.layer_count,
1363 staging_image.type == VK_IMAGE_TYPE_3D,
1364 NULL);
1365
1366 VkResult result = tu_get_scratch_bo(cmd->device,
1367 staging_image.layout[0].size,
1368 &staging_image.bo);
1369 if (result != VK_SUCCESS) {
1370 cmd->record_result = result;
1371 return;
1372 }
1373
1374 tu_bo_list_add(&cmd->bo_list, staging_image.bo,
1375 MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
1376
1377 struct tu_image_view staging;
1378 tu_image_view_copy(&staging, &staging_image, src_format,
1379 &staging_subresource, 0, false);
1380
1381 ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1382 coords(ops, cs, &staging_offset, &src_offset, &extent);
1383
1384 for (uint32_t i = 0; i < info->extent.depth; i++) {
1385 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1386 ops->dst(cs, &staging, i);
1387 ops->run(cmd, cs);
1388 }
1389
1390 /* When executed by the user there has to be a pipeline barrier here,
1391 * but since we're doing it manually we'll have to flush ourselves.
1392 */
1393 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1394 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1395
1396 tu_image_view_copy(&staging, &staging_image, dst_format,
1397 &staging_subresource, 0, false);
1398
1399 ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask, ROTATE_0, false);
1400 coords(ops, cs, &dst_offset, &staging_offset, &extent);
1401
1402 for (uint32_t i = 0; i < info->extent.depth; i++) {
1403 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1404 ops->dst(cs, &dst, i);
1405 ops->run(cmd, cs);
1406 }
1407 } else {
1408 tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1409 tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1410
1411 ops->setup(cmd, cs, format, info->dstSubresource.aspectMask, ROTATE_0, false);
1412 coords(ops, cs, &dst_offset, &src_offset, &extent);
1413
1414 for (uint32_t i = 0; i < info->extent.depth; i++) {
1415 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1416 ops->dst(cs, &dst, i);
1417 ops->run(cmd, cs);
1418 }
1419 }
1420 }
1421
1422 void
1423 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1424 VkImage srcImage,
1425 VkImageLayout srcImageLayout,
1426 VkImage destImage,
1427 VkImageLayout destImageLayout,
1428 uint32_t regionCount,
1429 const VkImageCopy *pRegions)
1430 {
1431 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1432 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1433 TU_FROM_HANDLE(tu_image, dst_image, destImage);
1434
1435 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1436 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1437
1438 for (uint32_t i = 0; i < regionCount; ++i)
1439 tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1440 }
1441
1442 static void
1443 copy_buffer(struct tu_cmd_buffer *cmd,
1444 uint64_t dst_va,
1445 uint64_t src_va,
1446 uint64_t size,
1447 uint32_t block_size)
1448 {
1449 const struct blit_ops *ops = &r2d_ops;
1450 struct tu_cs *cs = &cmd->cs;
1451 VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1452 uint64_t blocks = size / block_size;
1453
1454 ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1455
1456 while (blocks) {
1457 uint32_t src_x = (src_va & 63) / block_size;
1458 uint32_t dst_x = (dst_va & 63) / block_size;
1459 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1460
1461 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1462 ops->dst_buffer( cs, format, dst_va & ~63, 0);
1463 ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1464 ops->run(cmd, cs);
1465
1466 src_va += width * block_size;
1467 dst_va += width * block_size;
1468 blocks -= width;
1469 }
1470 }
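/* Buffer-to-buffer copies are expressed as 1-texel-high 2D blits: each pass
 * covers at most 0x4000 texels minus the unaligned x offsets, with both
 * addresses aligned down to 64 bytes as in the unaligned image copy paths
 * above.
 */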
1471
1472 void
1473 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1474 VkBuffer srcBuffer,
1475 VkBuffer dstBuffer,
1476 uint32_t regionCount,
1477 const VkBufferCopy *pRegions)
1478 {
1479 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1480 TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1481 TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1482
1483 tu_bo_list_add(&cmd->bo_list, src_buffer->bo, MSM_SUBMIT_BO_READ);
1484 tu_bo_list_add(&cmd->bo_list, dst_buffer->bo, MSM_SUBMIT_BO_WRITE);
1485
1486 for (unsigned i = 0; i < regionCount; ++i) {
1487 copy_buffer(cmd,
1488 tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1489 tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1490 pRegions[i].size, 1);
1491 }
1492 }
1493
1494 void
1495 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1496 VkBuffer dstBuffer,
1497 VkDeviceSize dstOffset,
1498 VkDeviceSize dataSize,
1499 const void *pData)
1500 {
1501 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1502 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1503
1504 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1505
1506 struct tu_cs_memory tmp;
1507 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1508 if (result != VK_SUCCESS) {
1509 cmd->record_result = result;
1510 return;
1511 }
1512
1513 memcpy(tmp.map, pData, dataSize);
1514 copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1515 }
1516
1517 void
1518 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1519 VkBuffer dstBuffer,
1520 VkDeviceSize dstOffset,
1521 VkDeviceSize fillSize,
1522 uint32_t data)
1523 {
1524 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1525 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1526 const struct blit_ops *ops = &r2d_ops;
1527 struct tu_cs *cs = &cmd->cs;
1528
1529 tu_bo_list_add(&cmd->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
1530
1531 if (fillSize == VK_WHOLE_SIZE)
1532 fillSize = buffer->size - dstOffset;
1533
1534 uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1535 uint32_t blocks = fillSize / 4;
1536
1537 ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true);
1538 ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1539
1540 while (blocks) {
1541 uint32_t dst_x = (dst_va & 63) / 4;
1542 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1543
1544 ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1545 ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1546 ops->run(cmd, cs);
1547
1548 dst_va += width * 4;
1549 blocks -= width;
1550 }
1551 }
1552
1553 void
1554 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1555 VkImage srcImage,
1556 VkImageLayout srcImageLayout,
1557 VkImage dstImage,
1558 VkImageLayout dstImageLayout,
1559 uint32_t regionCount,
1560 const VkImageResolve *pRegions)
1561 {
1562 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1563 TU_FROM_HANDLE(tu_image, src_image, srcImage);
1564 TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1565 const struct blit_ops *ops = &r2d_ops;
1566 struct tu_cs *cs = &cmd->cs;
1567
1568 tu_bo_list_add(&cmd->bo_list, src_image->bo, MSM_SUBMIT_BO_READ);
1569 tu_bo_list_add(&cmd->bo_list, dst_image->bo, MSM_SUBMIT_BO_WRITE);
1570
1571 ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1572
1573 for (uint32_t i = 0; i < regionCount; ++i) {
1574 const VkImageResolve *info = &pRegions[i];
1575 uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1576
1577 assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1578 /* TODO: aspect masks possible ? */
1579
1580 coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1581
1582 struct tu_image_view dst, src;
1583 tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1584 tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1585
1586 for (uint32_t i = 0; i < layers; i++) {
1587 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1588 ops->dst(cs, &dst, i);
1589 ops->run(cmd, cs);
1590 }
1591 }
1592 }
1593
1594 void
1595 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1596 struct tu_cs *cs,
1597 struct tu_image_view *src,
1598 struct tu_image_view *dst,
1599 uint32_t layers,
1600 const VkRect2D *rect)
1601 {
1602 const struct blit_ops *ops = &r2d_ops;
1603
1604 tu_bo_list_add(&cmd->bo_list, src->image->bo, MSM_SUBMIT_BO_READ);
1605 tu_bo_list_add(&cmd->bo_list, dst->image->bo, MSM_SUBMIT_BO_WRITE);
1606
1607 assert(src->image->vk_format == dst->image->vk_format);
1608
1609 ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false);
1610 ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1611
1612 for (uint32_t i = 0; i < layers; i++) {
1613 ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1614 ops->dst(cs, dst, i);
1615 ops->run(cmd, cs);
1616 }
1617 }
1618
1619 static void
1620 clear_image(struct tu_cmd_buffer *cmd,
1621 struct tu_image *image,
1622 const VkClearValue *clear_value,
1623 const VkImageSubresourceRange *range)
1624 {
1625 uint32_t level_count = tu_get_levelCount(image, range);
1626 uint32_t layer_count = tu_get_layerCount(image, range);
1627 struct tu_cs *cs = &cmd->cs;
1628 VkFormat format = image->vk_format;
1629 if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1630 format = VK_FORMAT_R32_UINT;
1631
1632 if (image->type == VK_IMAGE_TYPE_3D) {
1633 assert(layer_count == 1);
1634 assert(range->baseArrayLayer == 0);
1635 }
1636
1637 const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
1638
1639 ops->setup(cmd, cs, format, range->aspectMask, ROTATE_0, true);
1640 ops->clear_value(cs, image->vk_format, clear_value);
1641
1642 for (unsigned j = 0; j < level_count; j++) {
1643 if (image->type == VK_IMAGE_TYPE_3D)
1644 layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
1645
1646 ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1647 u_minify(image->extent.width, range->baseMipLevel + j),
1648 u_minify(image->extent.height, range->baseMipLevel + j)
1649 });
1650
1651 struct tu_image_view dst;
1652 tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
1653 .aspectMask = range->aspectMask,
1654 .mipLevel = range->baseMipLevel + j,
1655 .baseArrayLayer = range->baseArrayLayer,
1656 .layerCount = 1,
1657 }, 0, false);
1658
1659 for (uint32_t i = 0; i < layer_count; i++) {
1660 ops->dst(cs, &dst, i);
1661 ops->run(cmd, cs);
1662 }
1663 }
1664 }
1665
1666 void
1667 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1668 VkImage image_h,
1669 VkImageLayout imageLayout,
1670 const VkClearColorValue *pColor,
1671 uint32_t rangeCount,
1672 const VkImageSubresourceRange *pRanges)
1673 {
1674 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1675 TU_FROM_HANDLE(tu_image, image, image_h);
1676
1677 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1678
1679 for (unsigned i = 0; i < rangeCount; i++)
1680 clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i);
1681 }
1682
1683 void
1684 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1685 VkImage image_h,
1686 VkImageLayout imageLayout,
1687 const VkClearDepthStencilValue *pDepthStencil,
1688 uint32_t rangeCount,
1689 const VkImageSubresourceRange *pRanges)
1690 {
1691 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1692 TU_FROM_HANDLE(tu_image, image, image_h);
1693
1694 tu_bo_list_add(&cmd->bo_list, image->bo, MSM_SUBMIT_BO_WRITE);
1695
1696 for (unsigned i = 0; i < rangeCount; i++)
1697 clear_image(cmd, image, (const VkClearValue*) pDepthStencil, pRanges + i);
1698 }
1699
1700 static void
1701 tu_clear_sysmem_attachments_2d(struct tu_cmd_buffer *cmd,
1702 uint32_t attachment_count,
1703 const VkClearAttachment *attachments,
1704 uint32_t rect_count,
1705 const VkClearRect *rects)
1706 {
1707 const struct tu_subpass *subpass = cmd->state.subpass;
1708 /* note: cannot use the shader path here; the special shader path
1709 * lives in tu_clear_sysmem_attachments()
1710 */
1711 const struct blit_ops *ops = &r2d_ops;
1712 struct tu_cs *cs = &cmd->draw_cs;
1713
1714 for (uint32_t j = 0; j < attachment_count; j++) {
1715 /* The vulkan spec, section 17.2 "Clearing Images Inside a Render
1716 * Pass Instance" says that:
1717 *
1718 * Unlike other clear commands, vkCmdClearAttachments executes as
1719 * a drawing command, rather than a transfer command, with writes
1720 * performed by it executing in rasterization order. Clears to
1721 * color attachments are executed as color attachment writes, by
1722 * the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT stage.
1723 * Clears to depth/stencil attachments are executed as depth
1724 * writes and writes by the
1725 * VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT and
1726 * VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT stages.
1727 *
1728 * However, the 2d path here is executed the same way as a
1729 * transfer command, using the CCU color cache exclusively with
1730 * a special depth-as-color format for depth clears. This means that
1731 * we can't rely on the normal pipeline barrier mechanism here, and
1732 * have to manually flush whenever using a different cache domain
1733 * from what the 3d path would've used. This happens when we clear
1734 * depth/stencil, since normally depth attachments use CCU depth, but
1735 * we clear it using a special depth-as-color format. Since the clear
1736 * potentially uses a different attachment state we also need to
1737 * invalidate color beforehand and flush it afterwards.
1738 */
1739
1740 uint32_t a;
1741 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1742 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
1743 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1744 } else {
1745 a = subpass->depth_stencil_attachment.attachment;
1746 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
1747 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1748 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1749 }
1750
1751 if (a == VK_ATTACHMENT_UNUSED)
1752 continue;
1753
1754 const struct tu_image_view *iview =
1755 cmd->state.framebuffer->attachments[a].attachment;
1756
1757 ops->setup(cmd, cs, iview->image->vk_format, attachments[j].aspectMask, ROTATE_0, true);
1758 ops->clear_value(cs, iview->image->vk_format, &attachments[j].clearValue);
1759
1760 /* Wait for the flushes we triggered manually to complete */
1761 tu_cs_emit_wfi(cs);
1762
1763 for (uint32_t i = 0; i < rect_count; i++) {
1764 ops->coords(cs, &rects[i].rect.offset, NULL, &rects[i].rect.extent);
1765 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1766 ops->dst(cs, iview, rects[i].baseArrayLayer + layer);
1767 ops->run(cmd, cs);
1768 }
1769 }
1770
1771 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1772 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1773 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
1774 } else {
1775 /* sync color into depth */
1776 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1777 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
1778 }
1779 }
1780 }
1781
1782 static void
1783 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1784 uint32_t attachment_count,
1785 const VkClearAttachment *attachments,
1786 uint32_t rect_count,
1787 const VkClearRect *rects)
1788 {
1789 /* the shader path here is special, it avoids changing MRT/etc state */
1790 const struct tu_render_pass *pass = cmd->state.pass;
1791 const struct tu_subpass *subpass = cmd->state.subpass;
1792 const uint32_t mrt_count = subpass->color_count;
1793 struct tu_cs *cs = &cmd->draw_cs;
1794 uint32_t clear_value[MAX_RTS][4];
1795 float z_clear_val = 0.0f;
1796 uint8_t s_clear_val = 0;
1797 uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1798 bool z_clear = false;
1799 bool s_clear = false;
1800 bool layered_clear = false;
1801 uint32_t max_samples = 1;
1802
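/* first pass over the attachments: record which MRTs/aspects are cleared and
 * their clear values, and track the maximum sample count to decide between
 * the 2D path and the shader-based path below
 */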
1803 for (uint32_t i = 0; i < attachment_count; i++) {
1804 uint32_t a;
1805 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1806 uint32_t c = attachments[i].colorAttachment;
1807 a = subpass->color_attachments[c].attachment;
1808 if (a == VK_ATTACHMENT_UNUSED)
1809 continue;
1810
1811 clear_rts |= 1 << c;
1812 clear_components |= 0xf << (c * 4);
1813 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1814 } else {
1815 a = subpass->depth_stencil_attachment.attachment;
1816 if (a == VK_ATTACHMENT_UNUSED)
1817 continue;
1818
1819 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1820 z_clear = true;
1821 z_clear_val = attachments[i].clearValue.depthStencil.depth;
1822 }
1823
1824 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1825 s_clear = true;
1826 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1827 }
1828 }
1829
1830 max_samples = MAX2(max_samples, pass->attachments[a].samples);
1831 }
1832
1833 /* prefer the 2D path for clears:
1834 * 2D can't clear separate depth/stencil or MSAA, and needs a known framebuffer
1835 */
1836 if (max_samples == 1 && cmd->state.framebuffer) {
1837 tu_clear_sysmem_attachments_2d(cmd, attachment_count, attachments, rect_count, rects);
1838 return;
1839 }
1840
1841 /* This clear path behaves like a draw, needs the same flush as tu_draw */
1842 tu_emit_cache_flush_renderpass(cmd, cs);
1843
1844 /* disable all draw states so they don't interfere with this path
1845 * TODO: use and re-use draw states for this path
1846 * the draw states have to be disabled individually to preserve the
1847 * input attachment states, because a secondary command buffer
1848 * would not be able to restore them
1849 */
1850 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
1851 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
1852 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
1853 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
1854 continue;
1855 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
1856 CP_SET_DRAW_STATE__0_DISABLE);
1857 tu_cs_emit_qw(cs, 0);
1858 }
1859 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1860
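/* the clear shader writes no depth or sample-mask output (regid 0xfc marks an
 * output register as unused)
 */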
1861 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1862 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1863 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1864 0xfc000000);
1865 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1866
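/* assign consecutive FS output registers (one vec4 each) to the MRTs being
 * cleared; MRTs that aren't cleared keep regid 0 and have their writes masked
 * off via RB_MRT_CONTROL below
 */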
1867 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1868 for (uint32_t i = 0; i < mrt_count; i++) {
1869 if (clear_rts & (1 << i))
1870 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1871 else
1872 tu_cs_emit(cs, 0);
1873 }
1874
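/* if any rect targets a non-zero layer, use the layered-clear variant of the
 * 3D path; the destination layer is passed through the raw coordinates below
 */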
1875 for (uint32_t i = 0; i < rect_count; i++) {
1876 if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
1877 layered_clear = true;
1878 }
1879
1880 r3d_common(cmd, cs, false, num_rts, layered_clear);
1881
1882 tu_cs_emit_regs(cs,
1883 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
1884 tu_cs_emit_regs(cs,
1885 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
1886
1887 tu_cs_emit_regs(cs,
1888 A6XX_RB_FS_OUTPUT_CNTL0(),
1889 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1890
1891 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1892 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1893 tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL());
1894 for (uint32_t i = 0; i < mrt_count; i++) {
1895 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1896 .component_enable = COND(clear_rts & (1 << i), 0xf)));
1897 }
1898
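/* always-pass depth/stencil state: write the depth clear value when z_clear is
 * set and replace stencil with the reference value when s_clear is set
 */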
1899 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1900 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1901 .z_enable = z_clear,
1902 .z_write_enable = z_clear,
1903 .zfunc = FUNC_ALWAYS));
1904 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1905 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1906 .stencil_enable = s_clear,
1907 .func = FUNC_ALWAYS,
1908 .zpass = STENCIL_REPLACE));
1909 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1910 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1911 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1912
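/* upload the clear colors as FS constants, one vec4 per cleared MRT, in the
 * same order as the output registers assigned above
 */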
1913 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1914 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1915 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1916 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1917 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1918 CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1919 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1920 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1921 for_each_bit(b, clear_rts)
1922 tu_cs_emit_array(cs, clear_value[b], 4);
1923
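/* one rectangle draw per clear rect and layer: z carries the depth clear value
 * and the layer index is passed through the raw coordinates
 */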
1924 for (uint32_t i = 0; i < rect_count; i++) {
1925 for (uint32_t layer = 0; layer < rects[i].layerCount; layer++) {
1926 r3d_coords_raw(cs, (float[]) {
1927 rects[i].rect.offset.x, rects[i].rect.offset.y,
1928 z_clear_val, uif(rects[i].baseArrayLayer + layer),
1929 rects[i].rect.offset.x + rects[i].rect.extent.width,
1930 rects[i].rect.offset.y + rects[i].rect.extent.height,
1931 z_clear_val, 1.0f,
1932 });
1933 r3d_run(cmd, cs);
1934 }
1935 }
1936 }
1937
1938 static void
1939 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
1940 {
1941 enum pipe_format pformat = vk_format_to_pipe_format(format);
1942
1943 switch (format) {
1944 case VK_FORMAT_X8_D24_UNORM_PACK32:
1945 case VK_FORMAT_D24_UNORM_S8_UINT:
1946 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1947 val->depthStencil.stencil << 24;
1948 return;
1949 case VK_FORMAT_D16_UNORM:
1950 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1951 return;
1952 case VK_FORMAT_D32_SFLOAT:
1953 clear_value[0] = fui(val->depthStencil.depth);
1954 return;
1955 case VK_FORMAT_S8_UINT:
1956 clear_value[0] = val->depthStencil.stencil;
1957 return;
1958 /* these formats use a different base format when tiled;
1959 * the same format can be used for both layouts because GMEM is always in WZYX order
1960 */
1961 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1962 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1963 pformat = PIPE_FORMAT_B5G5R5A1_UNORM;
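/* fallthrough to the generic packing path below */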
1964 default:
1965 break;
1966 }
1967
1968 VkClearColorValue color;
1969
1970 /*
1971 * GMEM is tiled and wants the components in WZYX order;
1972 * apply the swizzle to the color before packing, to counteract the
1973 * deswizzling applied by the packing functions
1974 */
1975 pipe_swizzle_4f(color.float32, val->color.float32,
1976 util_format_description(pformat)->swizzle);
1977
1978 util_format_pack_rgba(pformat, clear_value, color.uint32, 1);
1979 }
1980
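/* Clear a single attachment in GMEM: point the blit destination at the
 * attachment's GMEM offset, program the packed clear value, and trigger the
 * clear with a BLIT event.
 */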
1981 static void
1982 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
1983 struct tu_cs *cs,
1984 uint32_t attachment,
1985 VkImageAspectFlags mask,
1986 const VkClearValue *value)
1987 {
1988 VkFormat vk_format = cmd->state.pass->attachments[attachment].format;
1989
1991 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
1992 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(vk_format)));
1993
1994 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1,
1995 .clear_mask = aspect_write_mask(vk_format, mask)));
1996
1997 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
1998 tu_cs_emit(cs, cmd->state.pass->attachments[attachment].gmem_offset);
1999
2000 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2001 tu_cs_emit(cs, 0);
2002
2003 uint32_t clear_vals[4] = {};
2004 pack_gmem_clear_value(value, vk_format, clear_vals);
2005
2006 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2007 tu_cs_emit_array(cs, clear_vals, 4);
2008
2009 tu6_emit_event_write(cmd, cs, BLIT);
2010 }
2011
2012 static void
2013 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2014 uint32_t attachment_count,
2015 const VkClearAttachment *attachments,
2016 uint32_t rect_count,
2017 const VkClearRect *rects)
2018 {
2019 const struct tu_subpass *subpass = cmd->state.subpass;
2020 struct tu_cs *cs = &cmd->draw_cs;
2021
2022 /* TODO: swap the loops for smaller cmdstream */
2023 for (unsigned i = 0; i < rect_count; i++) {
2024 unsigned x1 = rects[i].rect.offset.x;
2025 unsigned y1 = rects[i].rect.offset.y;
2026 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2027 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2028
2029 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2030 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2031 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2032
2033 for (unsigned j = 0; j < attachment_count; j++) {
2034 uint32_t a;
2035 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2036 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2037 else
2038 a = subpass->depth_stencil_attachment.attachment;
2039
2040 if (a == VK_ATTACHMENT_UNUSED)
2041 continue;
2042
2043 tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2044 &attachments[j].clearValue);
2045 }
2046 }
2047 }
2048
2049 void
2050 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2051 uint32_t attachmentCount,
2052 const VkClearAttachment *pAttachments,
2053 uint32_t rectCount,
2054 const VkClearRect *pRects)
2055 {
2056 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2057 struct tu_cs *cs = &cmd->draw_cs;
2058
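/* emit both the GMEM and the sysmem variants; CP_COND_EXEC makes the GPU
 * execute only the one matching the rendering mode eventually chosen for this
 * render pass
 */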
2059 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2060 tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2061 tu_cond_exec_end(cs);
2062
2063 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2064 tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2065 tu_cond_exec_end(cs);
2066 }
2067
2068 void
2069 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2070 struct tu_cs *cs,
2071 uint32_t a,
2072 const VkRenderPassBeginInfo *info)
2073 {
2074 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2075 const struct tu_image_view *iview = fb->attachments[a].attachment;
2076 const struct tu_render_pass_attachment *attachment =
2077 &cmd->state.pass->attachments[a];
2078
2079 if (!attachment->clear_mask)
2080 return;
2081
2082 const struct blit_ops *ops = &r2d_ops;
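/* the 2D path can't clear MSAA attachments, so fall back to the 3D path */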
2083 if (attachment->samples > 1)
2084 ops = &r3d_ops;
2085
2086 ops->setup(cmd, cs, attachment->format, attachment->clear_mask, ROTATE_0, true);
2087 ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2088 ops->clear_value(cs, attachment->format, &info->pClearValues[a]);
2089
2090 /* Wait for any flushes at the beginning of the renderpass to complete */
2091 tu_cs_emit_wfi(cs);
2092
2093 for (uint32_t i = 0; i < fb->layers; i++) {
2094 ops->dst(cs, iview, i);
2095 ops->run(cmd, cs);
2096 }
2097
2098 /* The spec doesn't explicitly say, but presumably the initial renderpass
2099 * clear is considered part of the renderpass, so barriers
2100 * aren't required inside the subpass/renderpass. We therefore need to
2101 * flush CCU color into CCU depth here, just like with
2102 * vkCmdClearAttachments(). Note that because this only happens at the
2103 * beginning of a renderpass, and renderpass writes are considered
2104 * "incoherent", we shouldn't have to worry about syncing depth into color
2105 * beforehand, as depth should already be flushed.
2106 */
2107 if (vk_format_is_depth_or_stencil(attachment->format)) {
2108 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2109 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2110 } else {
2111 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2112 tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2113 }
2114 }
2115
2116 void
2117 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2118 struct tu_cs *cs,
2119 uint32_t a,
2120 const VkRenderPassBeginInfo *info)
2121 {
2122 const struct tu_render_pass_attachment *attachment =
2123 &cmd->state.pass->attachments[a];
2124
2125 if (!attachment->clear_mask)
2126 return;
2127
2128 tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2129
2130 tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
2131 &info->pClearValues[a]);
2132 }
2133
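/* Emit a GMEM load or store via the BLIT event: with resolve=false the
 * attachment is loaded from the image into GMEM, with resolve=true GMEM is
 * resolved/stored back into the image.
 */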
2134 static void
2135 tu_emit_blit(struct tu_cmd_buffer *cmd,
2136 struct tu_cs *cs,
2137 const struct tu_image_view *iview,
2138 const struct tu_render_pass_attachment *attachment,
2139 bool resolve)
2140 {
2141 tu_cs_emit_regs(cs,
2142 A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2143
2144 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2145 .unk0 = !resolve,
2146 .gmem = !resolve,
2147 /* "integer" bit disables msaa resolve averaging */
2148 .integer = vk_format_is_int(attachment->format)));
2149
2150 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2151 tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2152 tu_cs_image_ref_2d(cs, iview, 0, false);
2153
2154 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
2155 tu_cs_image_flag_ref(cs, iview, 0);
2156
2157 tu_cs_emit_regs(cs,
2158 A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2159
2160 tu6_emit_event_write(cmd, cs, BLIT);
2161 }
2162
2163 static bool
2164 blit_can_resolve(VkFormat format)
2165 {
2166 const struct util_format_description *desc = vk_format_description(format);
2167
2168 /* the blit event can only resolve simple cases:
2169 * averaging samples as unsigned integers or choosing only one sample
2170 */
2171 if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2172 return false;
2173
2174 /* can't handle formats with channels larger than 10 bits
2175 * note: this includes all float formats
2176 * note2: single-channel integer formats seem OK
2177 */
2178 if (desc->channel[0].size > 10)
2179 return false;
2180
2181 switch (format) {
2182 /* for unknown reasons the blit event can't msaa-resolve these formats when tiled,
2183 * likely because these formats have a different layout from other cpp=2 formats
2184 */
2185 case VK_FORMAT_R8G8_UNORM:
2186 case VK_FORMAT_R8G8_UINT:
2187 case VK_FORMAT_R8G8_SINT:
2188 /* TODO: this one should be able to work? */
2189 case VK_FORMAT_D24_UNORM_S8_UINT:
2190 return false;
2191 default:
2192 break;
2193 }
2194
2195 return true;
2196 }
2197
2198 void
2199 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2200 struct tu_cs *cs,
2201 uint32_t a,
2202 bool force_load)
2203 {
2204 const struct tu_image_view *iview =
2205 cmd->state.framebuffer->attachments[a].attachment;
2206 const struct tu_render_pass_attachment *attachment =
2207 &cmd->state.pass->attachments[a];
2208
2209 if (attachment->load || force_load)
2210 tu_emit_blit(cmd, cs, iview, attachment, false);
2211 }
2212
2213 void
2214 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2215 struct tu_cs *cs,
2216 uint32_t a,
2217 uint32_t gmem_a)
2218 {
2219 const struct tu_framebuffer *fb = cmd->state.framebuffer;
2220 const VkRect2D *render_area = &cmd->state.render_area;
2221 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2222 struct tu_image_view *iview = fb->attachments[a].attachment;
2223 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2224
2225 if (!dst->store)
2226 return;
2227
2228 uint32_t x1 = render_area->offset.x;
2229 uint32_t y1 = render_area->offset.y;
2230 uint32_t x2 = x1 + render_area->extent.width;
2231 uint32_t y2 = y1 + render_area->extent.height;
2232 /* x2/y2 can be unaligned if equal to the size of the image, since the
2233 * blit will then only write into padding space;
2234 * the one exception is linear levels, which don't have the
2235 * required y padding in the layout (except for the last level)
2236 */
2237 bool need_y2_align =
2238 y2 != iview->extent.height || iview->need_y2_align;
2239
2240 bool unaligned =
2241 x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
2242 y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
2243
2244 /* use fast path when render area is aligned, except for unsupported resolve cases */
2245 if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2246 tu_emit_blit(cmd, cs, iview, src, true);
2247 return;
2248 }
2249
2250 if (dst->samples > 1) {
2251 /* presumably the shader path is needed in this case;
2252 * a testcase which fails because of this is still needed
2253 */
2254 tu_finishme("unaligned store of msaa attachment\n");
2255 return;
2256 }
2257
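/* slow path: use the 2D engine to read the attachment contents directly out of
 * GMEM (SP_PS_2D_SRC points at the GMEM base plus the attachment offset) and
 * blit them into the destination image
 */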
2258 r2d_setup_common(cmd, cs, dst->format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, true);
2259 r2d_dst(cs, iview, 0);
2260 r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2261
2262 tu_cs_emit_regs(cs,
2263 A6XX_SP_PS_2D_SRC_INFO(
2264 .color_format = tu6_format_texture(src->format, TILE6_2).fmt,
2265 .tile_mode = TILE6_2,
2266 .srgb = vk_format_is_srgb(src->format),
2267 .samples = tu_msaa_samples(src->samples),
2268 .samples_average = !vk_format_is_int(src->format),
2269 .unk20 = 1,
2270 .unk22 = 1),
2271 /* note: src size does not matter when not scaling */
2272 A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2273 A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + src->gmem_offset),
2274 A6XX_SP_PS_2D_SRC_HI(),
2275 A6XX_SP_PS_2D_SRC_PITCH(.pitch = fb->tile0.width * src->cpp));
2276
2277 /* sync GMEM writes with CACHE. */
2278 tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2279
2280 /* Wait for CACHE_INVALIDATE to land */
2281 tu_cs_emit_wfi(cs);
2282
2283 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2284 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2285
2286 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2287 * sysmem, and we generally assume that GMEM renderpasses leave their
2288 * results in sysmem, so we need to flush manually here.
2289 */
2290 tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2291 }