/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "sid.h"
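
/* Emit an SDMA NOP that makes the engine wait until all previously submitted
 * packets in the IB have finished. The NOP encoding differs between SI (GFX6)
 * and CIK+ (GFX7 and newer). */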
static void si_dma_emit_wait_idle(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->sdma_cs;

   /* NOP waits for idle. */
   if (sctx->chip_class >= GFX7)
      radeon_emit(cs, 0x00000000); /* NOP */
   else
      radeon_emit(cs, 0xf0000000); /* NOP */
}
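
/* Write a 64-bit global GPU timestamp into dst at the given byte offset.
 * Only available on GFX7 (CIK) and newer; SI DMA has no timestamp packet. */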
void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset)
{
   struct radeon_cmdbuf *cs = sctx->sdma_cs;
   uint64_t va = dst->gpu_address + offset;

   if (sctx->chip_class == GFX6) {
      unreachable("SI DMA doesn't support the timestamp packet.");
      return;
   }

   /* Mark the buffer range of destination as valid (initialized),
    * so that transfer_map knows it should wait for the GPU when mapping
    * that range. */
   util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8);

   si_need_dma_space(sctx, 4, dst, NULL);
   si_dma_emit_wait_idle(sctx);

   radeon_emit(
      cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
   radeon_emit(cs, va);
   radeon_emit(cs, va >> 32);
}
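
/* Fill a dword-aligned buffer range with a 32-bit value on the SDMA engine.
 * Falls back to the gfx clear path when there is no SDMA IB, the destination
 * is sparse, SDMA clears are disabled via a debug flag, or the winsys is in
 * secure (TMZ) mode. */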
void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
                          uint64_t size, unsigned clear_value)
{
   struct radeon_cmdbuf *cs = sctx->sdma_cs;
   unsigned i, ncopy, csize;
   struct si_resource *sdst = si_resource(dst);

   assert(offset % 4 == 0);
   assert(size % 4 == 0);

   if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
       sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS) || sctx->ws->ws_is_secure(sctx->ws)) {
      sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
      return;
   }

   /* Mark the buffer range of destination as valid (initialized),
    * so that transfer_map knows it should wait for the GPU when mapping
    * that range. */
   util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);

   offset += sdst->gpu_address;

   if (sctx->chip_class == GFX6) {
      /* the same maximum size as for copying */
      ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
      si_need_dma_space(sctx, ncopy * 4, sdst, NULL);

      for (i = 0; i < ncopy; i++) {
         csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
         radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, csize / 4));
         radeon_emit(cs, offset);
         radeon_emit(cs, clear_value);
         radeon_emit(cs, (offset >> 32) << 16);
         offset += csize;
         size -= csize;
      }
      return;
   }
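
   /* The GFX6 constant-fill packet above is 4 dwords (header carrying the dword
    * count, address low, fill value, address high), while the CIK+ packet emitted
    * below is 5 dwords with the byte count in a separate dword, which is why
    * ncopy * 4 vs. ncopy * 5 dwords are reserved. */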
   /* The following code is for Sea Islands and later. */
   /* the same maximum size as for copying */
   ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
   si_need_dma_space(sctx, ncopy * 5, sdst, NULL);

   for (i = 0; i < ncopy; i++) {
      csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
      radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, 0x8000 /* dword copy */));
      radeon_emit(cs, offset);
      radeon_emit(cs, offset >> 32);
      radeon_emit(cs, clear_value);
      /* size in bytes; GFX9+ encodes size - 1 */
      radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc);
      offset += csize;
      size -= csize;
   }
}
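
/* Copy a linear buffer range on the SDMA engine. Falls back to si_copy_buffer
 * (the gfx path) when there is no SDMA IB, either resource is sparse, or the
 * source and destination differ in TMZ encryption. */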
void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
                         struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
                         uint64_t size)
{
   struct radeon_cmdbuf *cs = sctx->sdma_cs;
   unsigned i, ncopy, csize;
   struct si_resource *sdst = si_resource(dst);
   struct si_resource *ssrc = si_resource(src);

   if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
       (ssrc->flags & RADEON_FLAG_ENCRYPTED) != (sdst->flags & RADEON_FLAG_ENCRYPTED)) {
      si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
      return;
   }

   /* Mark the buffer range of destination as valid (initialized),
    * so that transfer_map knows it should wait for the GPU when mapping
    * that range. */
   util_range_add(dst, &sdst->valid_buffer_range, dst_offset, dst_offset + size);

   dst_offset += sdst->gpu_address;
   src_offset += ssrc->gpu_address;

   if (sctx->chip_class == GFX6) {
      unsigned max_size, sub_cmd, shift;

      /* see whether we should use the dword-aligned or byte-aligned copy */
      if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
         sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
         shift = 2; /* the packet size field counts dwords */
         max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
      } else {
         sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
         shift = 0; /* the packet size field counts bytes */
         max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
      }

      ncopy = DIV_ROUND_UP(size, max_size);
      si_need_dma_space(sctx, ncopy * 5, sdst, ssrc);

      for (i = 0; i < ncopy; i++) {
         csize = MIN2(size, max_size);
         radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize >> shift));
         radeon_emit(cs, dst_offset);
         radeon_emit(cs, src_offset);
         radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
         radeon_emit(cs, (src_offset >> 32UL) & 0xff);
         dst_offset += csize;
         src_offset += csize;
         size -= csize;
      }
      return;
   }
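
   /* Each CIK+ linear-copy packet below is 7 dwords (header, size, endian-swap
    * word, 64-bit source address, 64-bit destination address), hence ncopy * 7
    * dwords are reserved. On GFX9 and newer the size field encodes size - 1. */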
   /* The following code is for CI and later. */
   unsigned align = ~0u;
   ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);

   /* Align copy size to dw if src/dst address are dw aligned */
   if ((src_offset & 0x3) == 0 && (dst_offset & 0x3) == 0 && size > 4 && (size & 3) != 0) {
      /* Copy the dword-aligned part first; the sub-dword tail needs one extra packet. */
      align = ~0x3u;
      ncopy++;
   }

   si_need_dma_space(sctx, ncopy * 7, sdst, ssrc);

   for (i = 0; i < ncopy; i++) {
      csize = size >= 4 ? MIN2(size & align, CIK_SDMA_COPY_MAX_SIZE) : size;
      radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR,
                                      (sctx->ws->cs_is_secure(cs) ? 1u : 0) << 2));
      radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
      radeon_emit(cs, 0); /* src/dst endian swap */
      radeon_emit(cs, src_offset);
      radeon_emit(cs, src_offset >> 32);
      radeon_emit(cs, dst_offset);
      radeon_emit(cs, dst_offset >> 32);
      dst_offset += csize;
      src_offset += csize;
      size -= csize;
   }
}
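
/* Make sure the SDMA IB has space for num_dw dwords and add dst/src to its
 * buffer list. Called before every group of SDMA packets: it flushes the gfx
 * IB when the DMA depends on it, flushes the SDMA IB when space or memory
 * limits are exceeded, and emits a wait-idle when a buffer was already
 * referenced by the IB (to avoid read-after-write hazards). */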
void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst,
                       struct si_resource *src)
{
   struct radeon_winsys *ws = ctx->ws;
   uint64_t vram = ctx->sdma_cs->used_vram;
   uint64_t gtt = ctx->sdma_cs->used_gart;

   if (dst) {
      vram += dst->vram_usage;
      gtt += dst->gart_usage;
   }
   if (src) {
      vram += src->vram_usage;
      gtt += src->gart_usage;
   }

   /* Flush the GFX IB if DMA depends on it. */
   if (!ctx->sdma_uploads_in_progress && radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
       ((dst && ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
        (src && ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf, RADEON_USAGE_WRITE))))
      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);

   bool use_secure_cmd = false;
   /* if TMZ is supported and enabled */
   if (ctx->ws->ws_is_secure(ctx->ws)) {
      if (src && src->flags & RADEON_FLAG_ENCRYPTED) {
         assert(!dst || (dst->flags & RADEON_FLAG_ENCRYPTED));
         use_secure_cmd = true;
      } else if (dst && (dst->flags & RADEON_FLAG_ENCRYPTED)) {
         use_secure_cmd = true;
      }
   }
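
   /* An SDMA IB is submitted either entirely in secure (TMZ) mode or not, so
    * the check below also flushes when the required mode differs from the
    * current IB's mode, before cs_set_secure() switches it over. */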
   /* Flush if there's not enough space, or if the memory usage per IB
    * is too large.
    *
    * IBs using too little memory are limited by the IB submission overhead.
    * IBs using too much memory are limited by the kernel/TTM overhead.
    * Too long IBs create CPU-GPU pipeline bubbles and add latency.
    *
    * This heuristic makes sure that DMA requests are executed
    * very soon after the call is made and lowers memory usage.
    * It improves texture upload performance by keeping the DMA
    * engine busy while uploads are being submitted.
    */
   num_dw++; /* for emit_wait_idle below */
   if (!ctx->sdma_uploads_in_progress &&
       (use_secure_cmd != ctx->ws->cs_is_secure(ctx->sdma_cs) ||
        !ws->cs_check_space(ctx->sdma_cs, num_dw, false) ||
        ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 ||
        !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) {
      si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
      assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw);
   }
   ctx->ws->cs_set_secure(ctx->sdma_cs, use_secure_cmd);

   /* Wait for idle if either buffer has been used in the IB before to
    * prevent read-after-write hazards.
    */
   if ((dst && ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
       (src && ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf, RADEON_USAGE_WRITE)))
      si_dma_emit_wait_idle(ctx);

   unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
   if (dst) {
      ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, dst->domains, 0);
   }
   if (src) {
      ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, src->domains, 0);
   }

   /* this function is called before all DMA calls, so increment this. */
   ctx->num_dma_calls++;
}
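
/* Submit the current SDMA IB. If fence is non-NULL, it receives the last SDMA
 * fence. With the CHECK_VM debug flag, the IB is saved, waited on, and then
 * checked for VM faults. */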
void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
{
   struct radeon_cmdbuf *cs = ctx->sdma_cs;
   struct radeon_saved_cs saved;
   bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;

   if (!radeon_emitted(cs, 0)) {
      if (fence)
         ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
      return;
   }

   if (check_vm)
      si_save_cs(ctx->ws, cs, &saved, true);

   ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
   if (fence)
      ctx->ws->fence_reference(fence, ctx->last_sdma_fence);

   if (check_vm) {
      /* Use conservative timeout 800ms, after which we won't wait any
       * longer and assume the GPU is hung.
       */
      ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800 * 1000 * 1000);

      si_check_vm_faults(ctx, &saved, RING_DMA);
      si_clear_saved_cs(&saved);
   }
}
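
/* Clear a buffer through the screen's auxiliary context, with the clear and the
 * flush serialized by aux_context_lock. */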
void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
                            uint64_t size, unsigned value)
{
   struct si_context *ctx = (struct si_context *)sscreen->aux_context;

   simple_mtx_lock(&sscreen->aux_context_lock);
   si_sdma_clear_buffer(ctx, dst, offset, size, value);
   sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
   simple_mtx_unlock(&sscreen->aux_context_lock);
}