src/gallium/drivers/radeonsi/si_dma_cs.c

   1 /*
   2  * Copyright 2018 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include "si_pipe.h"
  26 #include "sid.h"
  27
  28 static void si_dma_emit_wait_idle(struct si_context *sctx)
  29 {
  30    struct radeon_cmdbuf *cs = sctx->sdma_cs;
  31
  32    /* NOP waits for idle. */
  33    if (sctx->chip_class >= GFX7)
  34       radeon_emit(cs, 0x00000000); /* NOP */
  35    else
  36       radeon_emit(cs, 0xf0000000); /* NOP */
  37 }
  38
  39 void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset)
  40 {
  41    struct radeon_cmdbuf *cs = sctx->sdma_cs;
  42    uint64_t va = dst->gpu_address + offset;
  43
  44    if (sctx->chip_class == GFX6) {
  45       unreachable("SI DMA doesn't support the timestamp packet.");
  46       return;
  47    }
  48
  49    /* Mark the buffer range of destination as valid (initialized),
  50     * so that transfer_map knows it should wait for the GPU when mapping
  51     * that range. */
  52    util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8);
  53
  54    assert(va % 8 == 0);
  55
  56    si_need_dma_space(sctx, 4, dst, NULL);
  57    si_dma_emit_wait_idle(sctx);
  58
  59    radeon_emit(
  60       cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
  61    radeon_emit(cs, va);
  62    radeon_emit(cs, va >> 32);
  63 }
  64
  65 void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
  66                           uint64_t size, unsigned clear_value)
  67 {
  68    struct radeon_cmdbuf *cs = sctx->sdma_cs;
  69    unsigned i, ncopy, csize;
  70    struct si_resource *sdst = si_resource(dst);
  71
  72    assert(offset % 4 == 0);
  73    assert(size);
  74    assert(size % 4 == 0);
  75
  76    if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
  77        sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS) || sctx->ws->ws_is_secure(sctx->ws)) {
  78       sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
  79       return;
  80    }
  81
  82    /* Mark the buffer range of destination as valid (initialized),
  83     * so that transfer_map knows it should wait for the GPU when mapping
  84     * that range. */
  85    util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
  86
  87    offset += sdst->gpu_address;
  88
  89    if (sctx->chip_class == GFX6) {
  90       /* the same maximum size as for copying */
  91       ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
  92       si_need_dma_space(sctx, ncopy * 4, sdst, NULL);
  93
  94       for (i = 0; i < ncopy; i++) {
  95          csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
  96          radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, csize / 4));
  97          radeon_emit(cs, offset);
  98          radeon_emit(cs, clear_value);
  99          radeon_emit(cs, (offset >> 32) << 16);
 100          offset += csize;
 101          size -= csize;
 102       }
 103       return;
 104    }
 105
 106    /* The following code is for CI and later. */
 107    /* the same maximum size as for copying */
 108    unsigned max_size_per_packet = sctx->chip_class >= GFX10_3 ?
 109                                      GFX103_SDMA_COPY_MAX_SIZE :
 110                                      CIK_SDMA_COPY_MAX_SIZE;
 111    ncopy = DIV_ROUND_UP(size, max_size_per_packet);
 112    si_need_dma_space(sctx, ncopy * 5, sdst, NULL);
 113
 114    for (i = 0; i < ncopy; i++) {
 115       csize = MIN2(size, max_size_per_packet);
 116       radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, 0x8000 /* dword copy */));
 117       radeon_emit(cs, offset);
 118       radeon_emit(cs, offset >> 32);
 119       radeon_emit(cs, clear_value);
 120       /* dw count */
 121       radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc);
 122       offset += csize;
 123       size -= csize;
 124    }
 125 }
 126
 127 void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
 128                          struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
 129                          uint64_t size)
 130 {
 131    struct radeon_cmdbuf *cs = sctx->sdma_cs;
 132    unsigned i, ncopy, csize;
 133    struct si_resource *sdst = si_resource(dst);
 134    struct si_resource *ssrc = si_resource(src);
 135
 136    if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
 137        (ssrc->flags & RADEON_FLAG_ENCRYPTED) != (sdst->flags & RADEON_FLAG_ENCRYPTED)) {
 138       si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
 139       return;
 140    }
 141
 142    /* Mark the buffer range of destination as valid (initialized),
 143     * so that transfer_map knows it should wait for the GPU when mapping
 144     * that range. */
 145    util_range_add(dst, &sdst->valid_buffer_range, dst_offset, dst_offset + size);
 146
 147    dst_offset += sdst->gpu_address;
 148    src_offset += ssrc->gpu_address;
 149
 150    if (sctx->chip_class == GFX6) {
 151       unsigned max_size, sub_cmd, shift;
 152
 153       /* see whether we should use the dword-aligned or byte-aligned copy */
 154       if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
 155          sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
 156          shift = 2;
 157          max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
 158       } else {
 159          sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
 160          shift = 0;
 161          max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
 162       }
 163
 164       ncopy = DIV_ROUND_UP(size, max_size);
 165       si_need_dma_space(sctx, ncopy * 5, sdst, ssrc);
 166
 167       for (i = 0; i < ncopy; i++) {
 168          csize = MIN2(size, max_size);
 169          radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize >> shift));
 170          radeon_emit(cs, dst_offset);
 171          radeon_emit(cs, src_offset);
 172          radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
 173          radeon_emit(cs, (src_offset >> 32UL) & 0xff);
 174          dst_offset += csize;
 175          src_offset += csize;
 176          size -= csize;
 177       }
 178       return;
 179    }
 180
 181    /* The following code is for CI and later. */
 182    unsigned max_size_per_packet = sctx->chip_class >= GFX10_3 ?
 183                                      GFX103_SDMA_COPY_MAX_SIZE :
 184                                      CIK_SDMA_COPY_MAX_SIZE;
 185    unsigned align = ~0u;
 186    ncopy = DIV_ROUND_UP(size, max_size_per_packet);
 187
 188    /* Align copy size to dw if src/dst address are dw aligned */
 189    if ((src_offset & 0x3) == 0 && (dst_offset & 0x3) == 0 && size > 4 && (size & 3) != 0) {
 190       align = ~0x3u;
 191       ncopy++;
 192    }
 193
 194    si_need_dma_space(sctx, ncopy * 7, sdst, ssrc);
 195
 196    for (i = 0; i < ncopy; i++) {
 197       csize = size >= 4 ? MIN2(size & align, max_size_per_packet) : size;
 198       radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR,
 199                                       (sctx->ws->cs_is_secure(cs) ? 1u : 0) << 2));
 200       radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
 201       radeon_emit(cs, 0); /* src/dst endian swap */
 202       radeon_emit(cs, src_offset);
 203       radeon_emit(cs, src_offset >> 32);
 204       radeon_emit(cs, dst_offset);
 205       radeon_emit(cs, dst_offset >> 32);
 206       dst_offset += csize;
 207       src_offset += csize;
 208       size -= csize;
 209    }
 210 }
 211
 212 void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst,
 213                        struct si_resource *src)
 214 {
 215    struct radeon_winsys *ws = ctx->ws;
 216    uint64_t vram = ctx->sdma_cs->used_vram;
 217    uint64_t gtt = ctx->sdma_cs->used_gart;
 218
 219    if (dst) {
 220       vram += dst->vram_usage;
 221       gtt += dst->gart_usage;
 222    }
 223    if (src) {
 224       vram += src->vram_usage;
 225       gtt += src->gart_usage;
 226    }
 227
 228    /* Flush the GFX IB if DMA depends on it. */
 229    if (!ctx->sdma_uploads_in_progress && radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
 230        ((dst && ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
 231         (src && ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf, RADEON_USAGE_WRITE))))
 232       si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 233
 234    bool use_secure_cmd = false;
 235    /* if TMZ is supported and enabled */
 236    if (ctx->ws->ws_is_secure(ctx->ws)) {
 237       if (src && src->flags & RADEON_FLAG_ENCRYPTED) {
 238          assert(!dst || (dst->flags & RADEON_FLAG_ENCRYPTED));
 239          use_secure_cmd = true;
 240       } else if (dst && (dst->flags & RADEON_FLAG_ENCRYPTED)) {
 241          use_secure_cmd = true;
 242       }
 243    }
 244
 245    /* Flush if there's not enough space, or if the memory usage per IB
 246     * is too large.
 247     *
 248     * IBs using too little memory are limited by the IB submission overhead.
 249     * IBs using too much memory are limited by the kernel/TTM overhead.
 250     * Too long IBs create CPU-GPU pipeline bubbles and add latency.
 251     *
 252     * This heuristic makes sure that DMA requests are executed
 253     * very soon after the call is made and lowers memory usage.
 254     * It improves texture upload performance by keeping the DMA
 255     * engine busy while uploads are being submitted.
 256     */
 257    num_dw++; /* for emit_wait_idle below */
 258    if (!ctx->sdma_uploads_in_progress &&
 259        (use_secure_cmd != ctx->ws->cs_is_secure(ctx->sdma_cs) ||
 260         !ws->cs_check_space(ctx->sdma_cs, num_dw, false) ||
 261         ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 ||
 262         !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) {
 263       si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
 264       assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw);
 265    }
 266    ctx->ws->cs_set_secure(ctx->sdma_cs, use_secure_cmd);
 267
 268    /* Wait for idle if either buffer has been used in the IB before to
 269     * prevent read-after-write hazards.
 270     */
 271    if ((dst && ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
 272        (src && ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf, RADEON_USAGE_WRITE)))
 273       si_dma_emit_wait_idle(ctx);
 274
 275    unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
 276    if (dst) {
 277       ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, dst->domains, 0);
 278    }
 279    if (src) {
 280       ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, src->domains, 0);
 281    }
 282
 283    /* this function is called before all DMA calls, so increment this. */
 284    ctx->num_dma_calls++;
 285 }
 286
 287 void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
 288 {
 289    struct radeon_cmdbuf *cs = ctx->sdma_cs;
 290    struct radeon_saved_cs saved;
 291    bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;
 292
 293    if (!radeon_emitted(cs, 0)) {
 294       if (fence)
 295          ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
 296       return;
 297    }
 298
 299    if (check_vm)
 300       si_save_cs(ctx->ws, cs, &saved, true);
 301
 302    ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
 303    if (fence)
 304       ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
 305
 306    if (check_vm) {
 307       /* Use conservative timeout 800ms, after which we won't wait any
 308        * longer and assume the GPU is hung.
 309        */
 310       ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800 * 1000 * 1000);
 311
 312       si_check_vm_faults(ctx, &saved, RING_DMA);
 313       si_clear_saved_cs(&saved);
 314    }
 315 }
 316
 317 void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
 318                             uint64_t size, unsigned value)
 319 {
 320    struct si_context *ctx = (struct si_context *)sscreen->aux_context;
 321
 322    simple_mtx_lock(&sscreen->aux_context_lock);
 323    si_sdma_clear_buffer(ctx, dst, offset, size, value);
 324    sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
 325    simple_mtx_unlock(&sscreen->aux_context_lock);
 326 }