radeonsi: clean up and deduplicate code around internal compute dispatches
[mesa.git] src/gallium/drivers/radeonsi/si_compute_blit.c
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "si_pipe.h"
#include "util/format/u_format.h"
#include "util/format_srgb.h"

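/* This file implements the driver-internal compute dispatches that radeonsi
 * uses instead of graphics blits: buffer clears and copies, image copies,
 * DCC retiling, FMASK expansion, and render target clears. Each helper saves
 * the relevant user compute state, binds its own shader and resources,
 * dispatches through si_launch_grid_internal(), and restores the saved state.
 */
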
/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
 * and L2_STREAM for src.
 */
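/* For coherency domains that can go through L2 on the current chip, small
 * transfers (<= 256 KB) use L2_LRU to keep the data cache-resident, while
 * larger ones use L2_STREAM, presumably to avoid thrashing L2; everything
 * else bypasses L2.
 */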
static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher,
                                             uint64_t size)
{
   if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META || coher == SI_COHERENCY_CP)) ||
       (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
      return size <= 256 * 1024 ? L2_LRU : L2_STREAM;

   return L2_BYPASS;
}

unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
                            enum si_cache_policy cache_policy)
{
   switch (coher) {
   default:
   case SI_COHERENCY_NONE:
   case SI_COHERENCY_CP:
      return 0;
   case SI_COHERENCY_SHADER:
      return SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
             (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0);
   case SI_COHERENCY_CB_META:
      return SI_CONTEXT_FLUSH_AND_INV_CB;
   }
}

#define SI_CS_IMAGE_OP      (1 << 0)
#define SI_CS_WAIT_FOR_IDLE (1 << 1)

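/* Common wrapper for all internal dispatches: waits for prior graphics and
 * compute work, invalidates the vector caches, disables pipeline statistics,
 * render conditions and blitter-based decompression around the dispatch, and
 * rebinds restore_cs afterwards. SI_CS_WAIT_FOR_IDLE requests a CS partial
 * flush plus cache flushes after the dispatch; SI_CS_IMAGE_OP marks
 * dispatches that perform image stores and need extra cache handling for CB.
 */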
static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info,
                                    void *restore_cs, unsigned flags)
{
   /* Wait for previous shaders to finish. */
   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH;
   /* Invalidate L0-L1 caches. */
   /* sL0 is never invalidated, because src resources don't use it. */
   sctx->flags |= SI_CONTEXT_INV_VCACHE;

   /* Set settings for driver-internal compute dispatches. */
   sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
   sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
   sctx->render_cond_force_off = true;
   /* Skip decompression to prevent infinite recursion. */
   if (sctx->blitter)
      sctx->blitter->running = true;

   /* Dispatch compute. */
   sctx->b.launch_grid(&sctx->b, info);

   /* Restore default settings. */
   sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
   sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
   sctx->render_cond_force_off = false;
   if (sctx->blitter)
      sctx->blitter->running = false;

   /* Restore the original compute shader. */
   sctx->b.bind_compute_state(&sctx->b, restore_cs);

   if (flags & SI_CS_WAIT_FOR_IDLE) {
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

      if (flags & SI_CS_IMAGE_OP) {
         /* Make sure image stores are visible to CB, which doesn't use L2 on GFX6-8. */
         sctx->flags |= sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0;
         /* Make sure image stores are visible to all CUs. */
         sctx->flags |= SI_CONTEXT_INV_VCACHE;
      } else {
         /* Make sure buffer stores are visible to all CUs. */
         sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
      }
   } else {
      assert(!(flags & SI_CS_IMAGE_OP));
   }
}

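/* Clear a buffer with a 12-byte (3-dword) pattern, which the dword-replicating
 * clear path below can't express. The pattern is passed through a user
 * constant buffer, and the grid covers DIV_ROUND_UP(size, 12) elements, so
 * each thread appears to handle one 12-byte element.
 */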
static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst,
                                            unsigned dst_offset, unsigned size,
                                            const uint32_t *clear_value, enum si_coherency coher)
{
   struct pipe_context *ctx = &sctx->b;

   assert(dst_offset % 4 == 0);
   assert(size % 4 == 0);
   unsigned size_12 = DIV_ROUND_UP(size, 12);

   unsigned data[4] = {0};
   memcpy(data, clear_value, 12);

   sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

   struct pipe_shader_buffer saved_sb = {0};
   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);

   unsigned saved_writable_mask = 0;
   if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
       (1u << si_get_shaderbuf_slot(0)))
      saved_writable_mask = 1;

   struct pipe_constant_buffer saved_cb = {};
   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   void *saved_cs = sctx->cs_shader_state.program;

   struct pipe_constant_buffer cb = {};
   cb.buffer_size = sizeof(data);
   cb.user_buffer = data;
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

   struct pipe_shader_buffer sb = {0};
   sb.buffer = dst;
   sb.buffer_offset = dst_offset;
   sb.buffer_size = size;

   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);

   struct pipe_grid_info info = {0};

   if (!sctx->cs_clear_12bytes_buffer)
      sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(ctx);
   ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer);
   info.block[0] = 64;
   info.last_block[0] = size_12 % 64;
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(size_12, 64);
   info.grid[1] = 1;
   info.grid[2] = 1;

   si_launch_grid_internal(sctx, &info, saved_cs, SI_CS_WAIT_FOR_IDLE);

   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   pipe_resource_reference(&saved_sb.buffer, NULL);
   pipe_resource_reference(&saved_cb.buffer, NULL);
}

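/* Shared worker for dword-aligned buffer clears and copies. dst is bound as
 * shader buffer slot 0; for copies, src is bound as slot 1. For clears, the
 * already dword-replicated clear value is passed through cs_user_data. The
 * grid is sized so that each wave handles one contiguous block of
 * dwords_per_thread * wave_size dwords.
 */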
static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst,
                                        unsigned dst_offset, struct pipe_resource *src,
                                        unsigned src_offset, unsigned size,
                                        const uint32_t *clear_value, unsigned clear_value_size,
                                        enum si_coherency coher)
{
   struct pipe_context *ctx = &sctx->b;

   assert(src_offset % 4 == 0);
   assert(dst_offset % 4 == 0);
   assert(size % 4 == 0);

   assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
   assert(!src || src_offset + size <= src->width0);

   sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

   /* Save states. */
   void *saved_cs = sctx->cs_shader_state.program;
   struct pipe_shader_buffer saved_sb[2] = {};
   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);

   unsigned saved_writable_mask = 0;
   for (unsigned i = 0; i < (src ? 2 : 1); i++) {
      if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
          (1u << si_get_shaderbuf_slot(i)))
         saved_writable_mask |= 1 << i;
   }

   /* The memory accesses are coalesced, meaning that the 1st instruction writes
    * the 1st contiguous block of data for the whole wave, the 2nd instruction
    * writes the 2nd contiguous block of data, etc.
    */
   unsigned dwords_per_thread =
      src ? SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD;
   unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
   unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
   unsigned wave_size = sctx->screen->compute_wave_size;
   unsigned dwords_per_wave = dwords_per_thread * wave_size;

   unsigned num_dwords = size / 4;
   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

   struct pipe_grid_info info = {};
   info.block[0] = MIN2(wave_size, num_instructions);
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
   info.grid[1] = 1;
   info.grid[2] = 1;

   struct pipe_shader_buffer sb[2] = {};
   sb[0].buffer = dst;
   sb[0].buffer_offset = dst_offset;
   sb[0].buffer_size = size;

   bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;

   if (src) {
      sb[1].buffer = src;
      sb[1].buffer_offset = src_offset;
      sb[1].buffer_size = size;

      ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1);

      if (!sctx->cs_copy_buffer) {
         sctx->cs_copy_buffer = si_create_dma_compute_shader(
            &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
      }
      ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
   } else {
      assert(clear_value_size >= 4 && clear_value_size <= 16 &&
             util_is_power_of_two_or_zero(clear_value_size));

      for (unsigned i = 0; i < 4; i++)
         sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];

      ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1);

      if (!sctx->cs_clear_buffer) {
         sctx->cs_clear_buffer = si_create_dma_compute_shader(
            &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
      }
      ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
   }

   si_launch_grid_internal(sctx, &info, saved_cs, SI_CS_WAIT_FOR_IDLE);

   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
   sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0;

   if (cache_policy != L2_BYPASS)
      si_resource(dst)->TC_L2_dirty = true;

   /* Restore states. */
   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, saved_writable_mask);
   for (int i = 0; i < 2; i++)
      pipe_resource_reference(&saved_sb[i].buffer, NULL);
}

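/* Top-level buffer clear: reduces replicated clear values to 4 bytes where
 * possible, expands 1- and 2-byte values to a full dword, routes 12-byte
 * values to the dedicated shader above, picks between the compute path and
 * CP DMA for the dword-aligned part, and finishes any unaligned tail with
 * pipe_buffer_write(). A hypothetical call clearing a whole buffer "buf" to
 * zero might look like:
 *
 *    uint32_t zero = 0;
 *    si_clear_buffer(sctx, buf, 0, buf->width0, &zero, 4,
 *                    SI_COHERENCY_SHADER, false);
 */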
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
                     uint64_t size, uint32_t *clear_value, uint32_t clear_value_size,
                     enum si_coherency coher, bool force_cpdma)
{
   if (!size)
      return;

   ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4);

   assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
   assert(offset % clear_alignment == 0);
   assert(size % clear_alignment == 0);
   assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */

   /* Reduce a large clear value size if possible. */
   if (clear_value_size > 4) {
      bool clear_dword_duplicated = true;

      /* See if we can lower large fills to dword fills. */
      for (unsigned i = 1; i < clear_value_size / 4; i++) {
         if (clear_value[0] != clear_value[i]) {
            clear_dword_duplicated = false;
            break;
         }
      }
      if (clear_dword_duplicated)
         clear_value_size = 4;
   }

   /* Expand a small clear value size. */
   uint32_t tmp_clear_value;
   if (clear_value_size <= 2) {
      if (clear_value_size == 1) {
         tmp_clear_value = *(uint8_t *)clear_value;
         tmp_clear_value |=
            (tmp_clear_value << 8) | (tmp_clear_value << 16) | (tmp_clear_value << 24);
      } else {
         tmp_clear_value = *(uint16_t *)clear_value;
         tmp_clear_value |= tmp_clear_value << 16;
      }
      clear_value = &tmp_clear_value;
      clear_value_size = 4;
   }
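   /* For example, an 8-bit clear value 0xAB becomes the dword 0xABABABAB and
    * a 16-bit value 0x1234 becomes 0x12341234, so the dword-replicating paths
    * below can handle sub-dword patterns.
    */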

   if (clear_value_size == 12) {
      si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher);
      return;
   }

   uint64_t aligned_size = size & ~3ull;
   if (aligned_size >= 4) {
      /* Before GFX9, CP DMA was very slow when clearing GTT, so never
       * use CP DMA clears on those chips, because we can't be certain
       * about buffer placements.
       */
      if (clear_value_size > 4 || (!force_cpdma && clear_value_size == 4 && offset % 4 == 0 &&
                                   (size > 32 * 1024 || sctx->chip_class <= GFX9))) {
         si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value,
                                     clear_value_size, coher);
      } else {
         assert(clear_value_size == 4);
         si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset, aligned_size, *clear_value, 0,
                                coher, get_cache_policy(sctx, coher, size));
      }

      offset += aligned_size;
      size -= aligned_size;
   }

   /* Handle non-dword alignment. */
   if (size) {
      assert(dst);
      assert(dst->target == PIPE_BUFFER);
      assert(size < 4);

      pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
   }
}

static void si_pipe_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
                                 unsigned offset, unsigned size, const void *clear_value,
                                 int clear_value_size)
{
   si_clear_buffer((struct si_context *)ctx, dst, offset, size, (uint32_t *)clear_value,
                   clear_value_size, SI_COHERENCY_SHADER, false);
}

void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size)
{
   if (!size)
      return;

   enum si_coherency coher = SI_COHERENCY_SHADER;
   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);

   /* Only use compute for VRAM copies on dGPUs. */
   if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
       si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > 32 * 1024 && dst_offset % 4 == 0 &&
       src_offset % 4 == 0 && size % 4 == 0) {
      si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, coher);
   } else {
      si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, 0, coher, cache_policy);
   }
}

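/* Copy a box between two images with a compute shader. Subsampled 4:2:2 and
 * R9G9B9E5 formats are reinterpreted as R32_UINT, and SNORM8 formats as their
 * SINT8 equivalents (see the comments below). src is bound as image slot 0
 * and dst as slot 1, with the copy offsets passed through a user constant
 * buffer. 1D arrays use a 64x1 block over width x depth; everything else
 * uses 8x8 blocks with one grid layer per depth slice.
 */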
void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
                           struct pipe_resource *src, unsigned src_level, unsigned dstx,
                           unsigned dsty, unsigned dstz, const struct pipe_box *src_box)
{
   struct pipe_context *ctx = &sctx->b;
   unsigned width = src_box->width;
   unsigned height = src_box->height;
   unsigned depth = src_box->depth;
   enum pipe_format src_format = util_format_linear(src->format);
   enum pipe_format dst_format = util_format_linear(dst->format);

   assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format));

   if (util_format_is_subsampled_422(src_format)) {
      src_format = dst_format = PIPE_FORMAT_R32_UINT;
      /* Interpreting 422 subsampled format (16 bpp) as 32 bpp
       * should force us to divide src_box->x, dstx and width by 2.
       * But given that ac_surface allocates this format as 32 bpp
       * and that surf_size is then modified to pack the values
       * we must keep the original values to get the correct results.
       */
   }
   unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};

   if (width == 0 || height == 0)
      return;

   /* The driver doesn't decompress resources automatically here. */
   si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz,
                             dstz + src_box->depth - 1);
   si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z,
                             src_box->z + src_box->depth - 1);

   /* src and dst have the same number of samples. */
   si_make_CB_shader_coherent(sctx, src->nr_samples, true,
                              /* Only src can have DCC. */
                              ((struct si_texture *)src)->surface.u.gfx9.dcc.pipe_aligned);

   struct pipe_constant_buffer saved_cb = {};
   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
   struct pipe_image_view saved_image[2] = {0};
   util_copy_image_view(&saved_image[0], &images->views[0]);
   util_copy_image_view(&saved_image[1], &images->views[1]);

   void *saved_cs = sctx->cs_shader_state.program;

   struct pipe_constant_buffer cb = {};
   cb.buffer_size = sizeof(data);
   cb.user_buffer = data;
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

   struct pipe_image_view image[2] = {0};
   image[0].resource = src;
   image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
   image[0].format = src_format;
   image[0].u.tex.level = src_level;
   image[0].u.tex.first_layer = 0;
   image[0].u.tex.last_layer = src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
                                                              : (unsigned)(src->array_size - 1);
   image[1].resource = dst;
   image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
   image[1].format = dst_format;
   image[1].u.tex.level = dst_level;
   image[1].u.tex.first_layer = 0;
   image[1].u.tex.last_layer = dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
                                                              : (unsigned)(dst->array_size - 1);

   if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
      image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;

   /* SNORM8 blitting has precision issues on some chips. Use the SINT
    * equivalent instead, which doesn't force DCC decompression.
    * Note that some chips avoid this issue by using SDMA.
    */
   if (util_format_is_snorm8(dst->format)) {
      image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format);
   }

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);

   struct pipe_grid_info info = {0};

   if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
      if (!sctx->cs_copy_image_1d_array)
         sctx->cs_copy_image_1d_array = si_create_copy_image_compute_shader_1d_array(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
      info.block[0] = 64;
      info.last_block[0] = width % 64;
      info.block[1] = 1;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 64);
      info.grid[1] = depth;
      info.grid[2] = 1;
   } else {
      if (!sctx->cs_copy_image)
         sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_copy_image);
      info.block[0] = 8;
      info.last_block[0] = width % 8;
      info.block[1] = 8;
      info.last_block[1] = height % 8;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 8);
      info.grid[1] = DIV_ROUND_UP(height, 8);
      info.grid[2] = depth;
   }

   si_launch_grid_internal(sctx, &info, saved_cs,
                           SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP);

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
   for (int i = 0; i < 2; i++)
      pipe_resource_reference(&saved_image[i].resource, NULL);
   pipe_resource_reference(&saved_cb.buffer, NULL);
}

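/* Rewrite the displayable DCC metadata from the main DCC buffer using the
 * precomputed retile map stored in the texture. All three views are bound as
 * buffer-typed images of the texture's backing buffer: slot 0 is the retile
 * map (pairs of DCC offsets), slot 1 is the main DCC buffer (read), and
 * slot 2 is the displayable DCC buffer (written).
 */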
void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
{
   struct pipe_context *ctx = &sctx->b;

   sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU);

   /* Save states. */
   void *saved_cs = sctx->cs_shader_state.program;
   struct pipe_image_view saved_img[3] = {};

   for (unsigned i = 0; i < 3; i++) {
      util_copy_image_view(&saved_img[i], &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
   }

   /* Set images. */
   bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
   unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
   struct pipe_image_view img[3];

   assert(tex->surface.dcc_retile_map_offset && tex->surface.dcc_retile_map_offset <= UINT_MAX);
   assert(tex->surface.dcc_offset && tex->surface.dcc_offset <= UINT_MAX);
   assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX);

   for (unsigned i = 0; i < 3; i++) {
      img[i].resource = &tex->buffer.b.b;
      img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ;
      img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER;
   }

   img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT : PIPE_FORMAT_R32G32B32A32_UINT;
   img[0].u.buf.offset = tex->surface.dcc_retile_map_offset;
   img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4);

   img[1].format = PIPE_FORMAT_R8_UINT;
   img[1].u.buf.offset = tex->surface.dcc_offset;
   img[1].u.buf.size = tex->surface.dcc_size;

   img[2].format = PIPE_FORMAT_R8_UINT;
   img[2].u.buf.offset = tex->surface.display_dcc_offset;
   img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size;

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img);

   /* Bind the compute shader. */
   if (!sctx->cs_dcc_retile)
      sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx);
   ctx->bind_compute_state(ctx, sctx->cs_dcc_retile);

   /* Dispatch compute. */
   /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
   unsigned num_threads = num_elements / 4;

   struct pipe_grid_info info = {};
   info.block[0] = 64;
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */
   info.grid[1] = 1;
   info.grid[2] = 1;
   info.last_block[0] = num_threads % 64;

   si_launch_grid_internal(sctx, &info, saved_cs, 0);

   /* Don't flush caches or wait. The driver will wait at the end of this IB,
    * and L2 will be flushed by the kernel fence.
    */

   /* Restore states. */
   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);

   for (unsigned i = 0; i < 3; i++) {
      pipe_resource_reference(&saved_img[i].resource, NULL);
   }
}

/* Expand FMASK to make it identity, so that image stores can ignore it. */
void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex)
{
   struct si_context *sctx = (struct si_context *)ctx;
   bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY;
   unsigned log_fragments = util_logbase2(tex->nr_storage_samples);
   unsigned log_samples = util_logbase2(tex->nr_samples);
   assert(tex->nr_samples >= 2);

   /* EQAA FMASK expansion is unimplemented. */
   if (tex->nr_samples != tex->nr_storage_samples)
      return;

   si_make_CB_shader_coherent(sctx, tex->nr_samples, true,
                              true /* DCC is not possible with image stores */);

   /* Save states. */
   void *saved_cs = sctx->cs_shader_state.program;
   struct pipe_image_view saved_image = {0};
   util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]);

   /* Bind the image. */
   struct pipe_image_view image = {0};
   image.resource = tex;
   /* Don't set WRITE so as not to trigger FMASK expansion, causing
    * an infinite loop. */
   image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ;
   image.format = util_format_linear(tex->format);
   if (is_array)
      image.u.tex.last_layer = tex->array_size - 1;

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);

   /* Bind the shader. */
   void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array];
   if (!*shader)
      *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array);
   ctx->bind_compute_state(ctx, *shader);

   /* Dispatch compute. */
   struct pipe_grid_info info = {0};
   info.block[0] = 8;
   info.last_block[0] = tex->width0 % 8;
   info.block[1] = 8;
   info.last_block[1] = tex->height0 % 8;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(tex->width0, 8);
   info.grid[1] = DIV_ROUND_UP(tex->height0, 8);
   info.grid[2] = is_array ? tex->array_size : 1;

   si_launch_grid_internal(sctx, &info, saved_cs,
                           SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP);

   /* Restore previous states. */
   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
   pipe_resource_reference(&saved_image.resource, NULL);

   /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */
#define INVALID 0 /* never used */
   static const uint64_t fmask_expand_values[][4] = {
      /* samples */
      /* 2 (8 bpp) 4 (8 bpp)   8 (8-32bpp) 16 (16-64bpp)      fragments */
      {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE},      /* 1 */
      {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4},      /* 2 */
      {INVALID, 0xE4E4E4E4, 0x44443210, 0x4444444444443210}, /* 4 */
      {INVALID, INVALID, 0x76543210, 0x8888888876543210},    /* 8 */
   };
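   /* Each entry above is the FMASK word representing the fully expanded
    * (identity) state for that sample/fragment count; si_clear_buffer() below
    * fills the whole FMASK with the selected 32-bit value.
    */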

   /* Clear FMASK to identity. */
   struct si_texture *stex = (struct si_texture *)tex;
   si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size,
                   (uint32_t *)&fmask_expand_values[log_fragments][log_samples - 1], 4,
                   SI_COHERENCY_SHADER, false);
}

void si_init_compute_blit_functions(struct si_context *sctx)
{
   sctx->b.clear_buffer = si_pipe_clear_buffer;
}

/* Clear a region of a color surface to a constant value. */
void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf,
                                    const union pipe_color_union *color, unsigned dstx,
                                    unsigned dsty, unsigned width, unsigned height,
                                    bool render_condition_enabled)
{
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1;
   unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0};

   if (width == 0 || height == 0)
      return;

   /* The driver doesn't decompress resources automatically here. */
   si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, dstsurf->u.tex.level,
                             dstsurf->u.tex.first_layer, dstsurf->u.tex.last_layer);

   if (util_format_is_srgb(dstsurf->format)) {
      union pipe_color_union color_srgb;
      for (int i = 0; i < 3; i++)
         color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
      color_srgb.f[3] = color->f[3];
      memcpy(data + 4, color_srgb.ui, sizeof(color->ui));
   } else {
      memcpy(data + 4, color->ui, sizeof(color->ui));
   }
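
   /* The image is bound below with util_format_linear(), so sRGB encoding has
    * to be applied to the clear color here on the CPU.
    */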

   sctx->render_cond_force_off = !render_condition_enabled;

   si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true,
                              true /* DCC is not possible with image stores */);

   struct pipe_constant_buffer saved_cb = {};
   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);

   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
   struct pipe_image_view saved_image = {0};
   util_copy_image_view(&saved_image, &images->views[0]);

   void *saved_cs = sctx->cs_shader_state.program;

   struct pipe_constant_buffer cb = {};
   cb.buffer_size = sizeof(data);
   cb.user_buffer = data;
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);

   struct pipe_image_view image = {0};
   image.resource = dstsurf->texture;
   image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE;
   image.format = util_format_linear(dstsurf->format);
   image.u.tex.level = dstsurf->u.tex.level;
   image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */
   image.u.tex.last_layer = dstsurf->u.tex.last_layer;

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);

   struct pipe_grid_info info = {0};

   if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) {
      if (!sctx->cs_clear_render_target)
         sctx->cs_clear_render_target = si_clear_render_target_shader(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_clear_render_target);
      info.block[0] = 8;
      info.last_block[0] = width % 8;
      info.block[1] = 8;
      info.last_block[1] = height % 8;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 8);
      info.grid[1] = DIV_ROUND_UP(height, 8);
      info.grid[2] = num_layers;
   } else {
      if (!sctx->cs_clear_render_target_1d_array)
         sctx->cs_clear_render_target_1d_array = si_clear_render_target_shader_1d_array(ctx);
      ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array);
      info.block[0] = 64;
      info.last_block[0] = width % 64;
      info.block[1] = 1;
      info.block[2] = 1;
      info.grid[0] = DIV_ROUND_UP(width, 64);
      info.grid[1] = num_layers;
      info.grid[2] = 1;
   }

   si_launch_grid_internal(sctx, &info, saved_cs,
                           SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP);

   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
   pipe_resource_reference(&saved_image.resource, NULL);
   pipe_resource_reference(&saved_cb.buffer, NULL);
}