radeonsi: use pipe_blend_state::max_rt to update fewer blend registers
[mesa.git] src/gallium/drivers/radeonsi/si_compute_blit.c
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "si_pipe.h"
#include "util/format/u_format.h"
#include "util/format_srgb.h"

/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
 * and L2_STREAM for src.
 */
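/* Worked example of the policy below: on GFX7 and newer, a 64 KB
 * SI_COHERENCY_SHADER transfer returns L2_LRU and a 1 MB one returns
 * L2_STREAM, while the remaining combinations return L2_BYPASS.
 */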
static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher,
                                             uint64_t size)
{
   if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META || coher == SI_COHERENCY_CP)) ||
       (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
      return size <= 256 * 1024 ? L2_LRU : L2_STREAM;

   return L2_BYPASS;
}

unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
                            enum si_cache_policy cache_policy)
{
   switch (coher) {
   default:
   case SI_COHERENCY_NONE:
   case SI_COHERENCY_CP:
      return 0;
   case SI_COHERENCY_SHADER:
      return SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
             (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0);
   case SI_COHERENCY_CB_META:
      return SI_CONTEXT_FLUSH_AND_INV_CB;
   }
}

static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info)
{
   /* Set settings for driver-internal compute dispatches. */
   sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
   sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
   sctx->render_cond_force_off = true;
   /* Skip decompression to prevent infinite recursion. */
   if (sctx->blitter)
      sctx->blitter->running = true;

   /* Dispatch compute. */
   sctx->b.launch_grid(&sctx->b, info);

   /* Restore default settings. */
   sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
   sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
   sctx->render_cond_force_off = false;
   if (sctx->blitter)
      sctx->blitter->running = false;
}

static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst,
                                            unsigned dst_offset, unsigned size,
                                            const uint32_t *clear_value, enum si_coherency coher)
{
   struct pipe_context *ctx = &sctx->b;

   assert(dst_offset % 4 == 0);
   assert(size % 4 == 0);
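   /* The clear shader is expected to store one 12-byte element per thread,
    * so size_12 below is the number of threads to launch (hence the 64-wide
    * block and the last_block entry covering the partial group).
    */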
   unsigned size_12 = DIV_ROUND_UP(size, 12);

   unsigned data[4] = {0};
   memcpy(data, clear_value, 12);

   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
                  si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

   struct pipe_shader_buffer saved_sb = {0};
   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);

   unsigned saved_writable_mask = 0;
   if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
       (1u << si_get_shaderbuf_slot(0)))
      saved_writable_mask = 1;

   struct pipe_constant_buffer saved_cb = {};
   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   void *saved_cs = sctx->cs_shader_state.program;

   struct pipe_constant_buffer cb = {};
   cb.buffer_size = sizeof(data);
   cb.user_buffer = data;
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

   struct pipe_shader_buffer sb = {0};
   sb.buffer = dst;
   sb.buffer_offset = dst_offset;
   sb.buffer_size = size;

   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);

   struct pipe_grid_info info = {0};

   if (!sctx->cs_clear_12bytes_buffer)
      sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(ctx);
   ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer);
   info.block[0] = 64;
   info.last_block[0] = size_12 % 64;
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(size_12, 64);
   info.grid[1] = 1;
   info.grid[2] = 1;

   si_launch_grid_internal(sctx, &info);

   ctx->bind_compute_state(ctx, saved_cs);
   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   pipe_resource_reference(&saved_sb.buffer, NULL);
   pipe_resource_reference(&saved_cb.buffer, NULL);
}

static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst,
                                        unsigned dst_offset, struct pipe_resource *src,
                                        unsigned src_offset, unsigned size,
                                        const uint32_t *clear_value, unsigned clear_value_size,
                                        enum si_coherency coher)
{
   struct pipe_context *ctx = &sctx->b;

   assert(src_offset % 4 == 0);
   assert(dst_offset % 4 == 0);
   assert(size % 4 == 0);

   assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
   assert(!src || src_offset + size <= src->width0);

   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
                  si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

   /* Save states. */
   void *saved_cs = sctx->cs_shader_state.program;
   struct pipe_shader_buffer saved_sb[2] = {};
   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);

   unsigned saved_writable_mask = 0;
   for (unsigned i = 0; i < (src ? 2 : 1); i++) {
      if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
          (1u << si_get_shaderbuf_slot(i)))
         saved_writable_mask |= 1 << i;
   }

   /* The memory accesses are coalesced, meaning that the 1st instruction writes
    * the 1st contiguous block of data for the whole wave, the 2nd instruction
    * writes the 2nd contiguous block of data, etc.
    */
   unsigned dwords_per_thread =
      src ? SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD;
   unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
   unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
   unsigned wave_size = sctx->screen->compute_wave_size;
   unsigned dwords_per_wave = dwords_per_thread * wave_size;
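   /* Worked example with hypothetical numbers: with a wave size of 64 and
    * 4 dwords per thread, each thread issues one 4-dword store and a wave
    * covers 256 dwords (1 KiB), so a 1 MiB clear launches grid[0] = 1024
    * workgroups of block[0] = 64 threads.
    */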

   unsigned num_dwords = size / 4;
   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

   struct pipe_grid_info info = {};
   info.block[0] = MIN2(wave_size, num_instructions);
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
   info.grid[1] = 1;
   info.grid[2] = 1;

   struct pipe_shader_buffer sb[2] = {};
   sb[0].buffer = dst;
   sb[0].buffer_offset = dst_offset;
   sb[0].buffer_size = size;

   bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;

   if (src) {
      sb[1].buffer = src;
      sb[1].buffer_offset = src_offset;
      sb[1].buffer_size = size;

      ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1);

      if (!sctx->cs_copy_buffer) {
         sctx->cs_copy_buffer = si_create_dma_compute_shader(
            &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
      }
      ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
   } else {
      assert(clear_value_size >= 4 && clear_value_size <= 16 &&
             util_is_power_of_two_or_zero(clear_value_size));

      for (unsigned i = 0; i < 4; i++)
         sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];

      ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1);

      if (!sctx->cs_clear_buffer) {
         sctx->cs_clear_buffer = si_create_dma_compute_shader(
            &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
      }
      ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
   }

   si_launch_grid_internal(sctx, &info);

   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0);

   if (cache_policy != L2_BYPASS)
      si_resource(dst)->TC_L2_dirty = true;

   /* Restore states. */
   ctx->bind_compute_state(ctx, saved_cs);
   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, saved_writable_mask);
   for (int i = 0; i < 2; i++)
      pipe_resource_reference(&saved_sb[i].buffer, NULL);
}

void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
                     uint64_t size, uint32_t *clear_value, uint32_t clear_value_size,
                     enum si_coherency coher, bool force_cpdma)
{
   if (!size)
      return;

   ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4);

   assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
   assert(offset % clear_alignment == 0);
   assert(size % clear_alignment == 0);
   assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */

   /* Reduce a large clear value size if possible. */
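   /* For example, a 16-byte clear value whose four dwords are identical can
    * be lowered to a plain 4-byte (dword) fill.
    */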
   if (clear_value_size > 4) {
      bool clear_dword_duplicated = true;

      /* See if we can lower large fills to dword fills. */
      for (unsigned i = 1; i < clear_value_size / 4; i++) {
         if (clear_value[0] != clear_value[i]) {
            clear_dword_duplicated = false;
            break;
         }
      }
      if (clear_dword_duplicated)
         clear_value_size = 4;
   }

   /* Expand a small clear value size. */
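   /* For example, an 8-bit value 0xAB is replicated to the dword 0xABABABAB,
    * and a 16-bit value 0xABCD becomes 0xABCDABCD.
    */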
   uint32_t tmp_clear_value;
   if (clear_value_size <= 2) {
      if (clear_value_size == 1) {
         tmp_clear_value = *(uint8_t *)clear_value;
         tmp_clear_value |=
            (tmp_clear_value << 8) | (tmp_clear_value << 16) | (tmp_clear_value << 24);
      } else {
         tmp_clear_value = *(uint16_t *)clear_value;
         tmp_clear_value |= tmp_clear_value << 16;
      }
      clear_value = &tmp_clear_value;
      clear_value_size = 4;
   }

   if (clear_value_size == 12) {
      si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher);
      return;
   }

   uint64_t aligned_size = size & ~3ull;
   if (aligned_size >= 4) {
      /* Before GFX9, CP DMA was very slow when clearing GTT, so never
       * use CP DMA clears on those chips, because we can't be certain
       * about buffer placements.
       */
      if (clear_value_size > 4 || (!force_cpdma && clear_value_size == 4 && offset % 4 == 0 &&
                                   (size > 32 * 1024 || sctx->chip_class <= GFX9))) {
         si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value,
                                     clear_value_size, coher);
      } else {
         assert(clear_value_size == 4);
         si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset, aligned_size, *clear_value, 0,
                                coher, get_cache_policy(sctx, coher, size));
      }

      offset += aligned_size;
      size -= aligned_size;
   }

   /* Handle non-dword alignment. */
   if (size) {
      assert(dst);
      assert(dst->target == PIPE_BUFFER);
      assert(size < 4);

      pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
   }
}

static void si_pipe_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
                                 unsigned offset, unsigned size, const void *clear_value,
                                 int clear_value_size)
{
   si_clear_buffer((struct si_context *)ctx, dst, offset, size, (uint32_t *)clear_value,
                   clear_value_size, SI_COHERENCY_SHADER, false);
}

void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size)
{
   if (!size)
      return;

   enum si_coherency coher = SI_COHERENCY_SHADER;
   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);

   /* Only use compute for VRAM copies on dGPUs. */
   if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
       si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > 32 * 1024 && dst_offset % 4 == 0 &&
       src_offset % 4 == 0 && size % 4 == 0) {
      si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, coher);
   } else {
      si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, 0, coher, cache_policy);
   }
}

void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
                           struct pipe_resource *src, unsigned src_level, unsigned dstx,
                           unsigned dsty, unsigned dstz, const struct pipe_box *src_box)
{
   struct pipe_context *ctx = &sctx->b;
   unsigned width = src_box->width;
   unsigned height = src_box->height;
   unsigned depth = src_box->depth;
   enum pipe_format src_format = util_format_linear(src->format);
   enum pipe_format dst_format = util_format_linear(dst->format);

   assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format));

   if (util_format_is_subsampled_422(src_format)) {
      src_format = dst_format = PIPE_FORMAT_R32_UINT;
      /* Interpreting a 422 subsampled format (16 bpp) as 32 bpp
       * should force us to divide src_box->x, dstx and width by 2.
       * However, since ac_surface allocates this format as 32 bpp
       * and surf_size is then modified to pack the values, we must
       * keep the original values to get the correct results.
       */
   }
   unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};

   if (width == 0 || height == 0)
      return;

   sctx->flags |=
      SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);

   /* The driver doesn't decompress resources automatically here. */
   si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz,
                             dstz + src_box->depth - 1);
   si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z,
                             src_box->z + src_box->depth - 1);

   /* src and dst have the same number of samples. */
   si_make_CB_shader_coherent(sctx, src->nr_samples, true,
                              /* Only src can have DCC. */
                              ((struct si_texture *)src)->surface.u.gfx9.dcc.pipe_aligned);

   struct pipe_constant_buffer saved_cb = {};
   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
   struct pipe_image_view saved_image[2] = {0};
   util_copy_image_view(&saved_image[0], &images->views[0]);
   util_copy_image_view(&saved_image[1], &images->views[1]);

   void *saved_cs = sctx->cs_shader_state.program;

   struct pipe_constant_buffer cb = {};
   cb.buffer_size = sizeof(data);
   cb.user_buffer = data;
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

   struct pipe_image_view image[2] = {0};
   image[0].resource = src;
   image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
   image[0].format = src_format;
   image[0].u.tex.level = src_level;
   image[0].u.tex.first_layer = 0;
   image[0].u.tex.last_layer = src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
                                                              : (unsigned)(src->array_size - 1);
   image[1].resource = dst;
   image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
   image[1].format = dst_format;
   image[1].u.tex.level = dst_level;
   image[1].u.tex.first_layer = 0;
   image[1].u.tex.last_layer = dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
                                                              : (unsigned)(dst->array_size - 1);

   if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
      image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;

   /* SNORM8 blitting has precision issues on some chips. Use the SINT
    * equivalent instead, which doesn't force DCC decompression.
    * Note that some chips avoid this issue by using SDMA.
    */
   if (util_format_is_snorm8(dst->format)) {
      image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format);
   }

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);

   struct pipe_grid_info info = {0};

   if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
      if (!sctx->cs_copy_image_1d_array)
         sctx->cs_copy_image_1d_array = si_create_copy_image_compute_shader_1d_array(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
      info.block[0] = 64;
      info.last_block[0] = width % 64;
      info.block[1] = 1;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 64);
      info.grid[1] = depth;
      info.grid[2] = 1;
   } else {
      if (!sctx->cs_copy_image)
         sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_copy_image);
      info.block[0] = 8;
      info.last_block[0] = width % 8;
      info.block[1] = 8;
      info.last_block[1] = height % 8;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 8);
      info.grid[1] = DIV_ROUND_UP(height, 8);
      info.grid[2] = depth;
   }

   si_launch_grid_internal(sctx, &info);

   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
   ctx->bind_compute_state(ctx, saved_cs);
   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
   for (int i = 0; i < 2; i++)
      pipe_resource_reference(&saved_image[i].resource, NULL);
   pipe_resource_reference(&saved_cb.buffer, NULL);
}

void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
{
   struct pipe_context *ctx = &sctx->b;

   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
                  si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) |
                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU);
   sctx->emit_cache_flush(sctx);

   /* Save states. */
   void *saved_cs = sctx->cs_shader_state.program;
   struct pipe_image_view saved_img[3] = {};

   for (unsigned i = 0; i < 3; i++) {
      util_copy_image_view(&saved_img[i], &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
   }

   /* Set images. */
   bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
   unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
   struct pipe_image_view img[3];

   assert(tex->surface.dcc_retile_map_offset && tex->surface.dcc_retile_map_offset <= UINT_MAX);
   assert(tex->surface.dcc_offset && tex->surface.dcc_offset <= UINT_MAX);
   assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX);

   for (unsigned i = 0; i < 3; i++) {
      img[i].resource = &tex->buffer.b.b;
      img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ;
      img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER;
   }

   img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT : PIPE_FORMAT_R32G32B32A32_UINT;
   img[0].u.buf.offset = tex->surface.dcc_retile_map_offset;
   img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4);

   img[1].format = PIPE_FORMAT_R8_UINT;
   img[1].u.buf.offset = tex->surface.dcc_offset;
   img[1].u.buf.size = tex->surface.dcc_size;

   img[2].format = PIPE_FORMAT_R8_UINT;
   img[2].u.buf.offset = tex->surface.display_dcc_offset;
   img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size;

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img);

   /* Bind the compute shader. */
   if (!sctx->cs_dcc_retile)
      sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx);
   ctx->bind_compute_state(ctx, sctx->cs_dcc_retile);

   /* Dispatch compute. */
   /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
   unsigned num_threads = num_elements / 4;

   struct pipe_grid_info info = {};
   info.block[0] = 64;
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */
   info.grid[1] = 1;
   info.grid[2] = 1;
   info.last_block[0] = num_threads % 64;

   si_launch_grid_internal(sctx, &info);

   /* Don't flush caches or wait. The driver will wait at the end of this IB,
    * and L2 will be flushed by the kernel fence.
    */

   /* Restore states. */
   ctx->bind_compute_state(ctx, saved_cs);
   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);

   for (unsigned i = 0; i < 3; i++) {
      pipe_resource_reference(&saved_img[i].resource, NULL);
   }
}

/* Expand FMASK to make it identity, so that image stores can ignore it. */
void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex)
{
   struct si_context *sctx = (struct si_context *)ctx;
   bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY;
   unsigned log_fragments = util_logbase2(tex->nr_storage_samples);
   unsigned log_samples = util_logbase2(tex->nr_samples);
   assert(tex->nr_samples >= 2);

   /* EQAA FMASK expansion is unimplemented. */
   if (tex->nr_samples != tex->nr_storage_samples)
      return;

   /* Flush caches and sync engines. */
   sctx->flags |=
      SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
   si_make_CB_shader_coherent(sctx, tex->nr_samples, true,
                              true /* DCC is not possible with image stores */);

   /* Save states. */
   void *saved_cs = sctx->cs_shader_state.program;
   struct pipe_image_view saved_image = {0};
   util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]);

   /* Bind the image. */
   struct pipe_image_view image = {0};
   image.resource = tex;
   /* Don't set WRITE so as not to trigger FMASK expansion, causing
    * an infinite loop. */
   image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ;
   image.format = util_format_linear(tex->format);
   if (is_array)
      image.u.tex.last_layer = tex->array_size - 1;

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);

   /* Bind the shader. */
   void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array];
   if (!*shader)
      *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array);
   ctx->bind_compute_state(ctx, *shader);

   /* Dispatch compute. */
   struct pipe_grid_info info = {0};
   info.block[0] = 8;
   info.last_block[0] = tex->width0 % 8;
   info.block[1] = 8;
   info.last_block[1] = tex->height0 % 8;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(tex->width0, 8);
   info.grid[1] = DIV_ROUND_UP(tex->height0, 8);
   info.grid[2] = is_array ? tex->array_size : 1;

   si_launch_grid_internal(sctx, &info);

   /* Flush caches and sync engines. */
   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);

   /* Restore previous states. */
   ctx->bind_compute_state(ctx, saved_cs);
   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
   pipe_resource_reference(&saved_image.resource, NULL);

   /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */
#define INVALID 0 /* never used */
   static const uint64_t fmask_expand_values[][4] = {
      /* samples */
      /* 2 (8 bpp)  4 (8 bpp)   8 (8-32bpp) 16 (16-64bpp)      fragments */
      {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE},        /* 1 */
      {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4},        /* 2 */
      {INVALID, 0xE4E4E4E4, 0x44443210, 0x4444444444443210},   /* 4 */
      {INVALID, INVALID, 0x76543210, 0x8888888876543210},      /* 8 */
   };

   /* Clear FMASK to identity. */
   struct si_texture *stex = (struct si_texture *)tex;
   si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size,
                   (uint32_t *)&fmask_expand_values[log_fragments][log_samples - 1], 4,
                   SI_COHERENCY_SHADER, false);
}

void si_init_compute_blit_functions(struct si_context *sctx)
{
   sctx->b.clear_buffer = si_pipe_clear_buffer;
}

/* Clear a region of a color surface to a constant value. */
void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf,
                                    const union pipe_color_union *color, unsigned dstx,
                                    unsigned dsty, unsigned width, unsigned height,
                                    bool render_condition_enabled)
{
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1;
   unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0};

   if (width == 0 || height == 0)
      return;

   /* The driver doesn't decompress resources automatically here. */
   si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, dstsurf->u.tex.level,
                             dstsurf->u.tex.first_layer, dstsurf->u.tex.last_layer);

   if (util_format_is_srgb(dstsurf->format)) {
      union pipe_color_union color_srgb;
      for (int i = 0; i < 3; i++)
         color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
      color_srgb.f[3] = color->f[3];
      memcpy(data + 4, color_srgb.ui, sizeof(color->ui));
   } else {
      memcpy(data + 4, color->ui, sizeof(color->ui));
   }
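   /* At this point data[] holds dstx, dsty and first_layer in dwords 0-2 and
    * the (possibly sRGB-encoded) clear color in dwords 4-7; this is the
    * constant buffer layout the clear shaders below presumably consume.
    */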

   sctx->render_cond_force_off = !render_condition_enabled;

   sctx->flags |=
      SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
   si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true,
                              true /* DCC is not possible with image stores */);

   struct pipe_constant_buffer saved_cb = {};
   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
   struct pipe_image_view saved_image = {0};
   util_copy_image_view(&saved_image, &images->views[0]);

   void *saved_cs = sctx->cs_shader_state.program;

   struct pipe_constant_buffer cb = {};
   cb.buffer_size = sizeof(data);
   cb.user_buffer = data;
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

   struct pipe_image_view image = {0};
   image.resource = dstsurf->texture;
   image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE;
   image.format = util_format_linear(dstsurf->format);
   image.u.tex.level = dstsurf->u.tex.level;
   image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */
   image.u.tex.last_layer = dstsurf->u.tex.last_layer;

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);

   struct pipe_grid_info info = {0};

   if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) {
      if (!sctx->cs_clear_render_target)
         sctx->cs_clear_render_target = si_clear_render_target_shader(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_clear_render_target);
      info.block[0] = 8;
      info.last_block[0] = width % 8;
      info.block[1] = 8;
      info.last_block[1] = height % 8;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 8);
      info.grid[1] = DIV_ROUND_UP(height, 8);
      info.grid[2] = num_layers;
   } else {
      if (!sctx->cs_clear_render_target_1d_array)
         sctx->cs_clear_render_target_1d_array = si_clear_render_target_shader_1d_array(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array);
      info.block[0] = 64;
      info.last_block[0] = width % 64;
      info.block[1] = 1;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 64);
      info.grid[1] = num_layers;
      info.grid[2] = 1;
   }

   si_launch_grid_internal(sctx, &info);

   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
   ctx->bind_compute_state(ctx, saved_cs);
   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
   pipe_resource_reference(&saved_image.resource, NULL);
   pipe_resource_reference(&saved_cb.buffer, NULL);
}