src/gallium/drivers/radeonsi/si_dma_cs.c

   1 /*
   2  * Copyright 2018 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include "si_pipe.h"
  26 #include "sid.h"
  27
  28 static void si_dma_emit_wait_idle(struct si_context *sctx)
  29 {
  30    struct radeon_cmdbuf *cs = sctx->sdma_cs;
  31
  32    /* NOP waits for idle. */
  33    if (sctx->chip_class >= GFX7)
  34       radeon_emit(cs, 0x00000000); /* NOP */
  35    else
  36       radeon_emit(cs, 0xf0000000); /* NOP */
  37 }
  38
  39 void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset)
  40 {
  41    struct radeon_cmdbuf *cs = sctx->sdma_cs;
  42    uint64_t va = dst->gpu_address + offset;
  43
  44    if (sctx->chip_class == GFX6) {
  45       unreachable("SI DMA doesn't support the timestamp packet.");
  46       return;
  47    }
  48
  49    /* Mark the buffer range of destination as valid (initialized),
  50     * so that transfer_map knows it should wait for the GPU when mapping
  51     * that range. */
  52    util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8);
  53
  54    assert(va % 8 == 0);
  55
  56    si_need_dma_space(sctx, 4, dst, NULL);
  57    si_dma_emit_wait_idle(sctx);
  58
  59    radeon_emit(
  60       cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
  61    radeon_emit(cs, va);
  62    radeon_emit(cs, va >> 32);
  63 }
  64
  65 void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
  66                           uint64_t size, unsigned clear_value)
  67 {
  68    struct radeon_cmdbuf *cs = sctx->sdma_cs;
  69    unsigned i, ncopy, csize;
  70    struct si_resource *sdst = si_resource(dst);
  71
  72    assert(offset % 4 == 0);
  73    assert(size);
  74    assert(size % 4 == 0);
  75
  76    if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
  77        sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS)) {
  78       sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
  79       return;
  80    }
  81
  82    /* Mark the buffer range of destination as valid (initialized),
  83     * so that transfer_map knows it should wait for the GPU when mapping
  84     * that range. */
  85    util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
  86
  87    offset += sdst->gpu_address;
  88
  89    if (sctx->chip_class == GFX6) {
  90       /* the same maximum size as for copying */
  91       ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
  92       si_need_dma_space(sctx, ncopy * 4, sdst, NULL);
  93
  94       for (i = 0; i < ncopy; i++) {
  95          csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
  96          radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, csize / 4));
  97          radeon_emit(cs, offset);
  98          radeon_emit(cs, clear_value);
  99          radeon_emit(cs, (offset >> 32) << 16);
 100          offset += csize;
 101          size -= csize;
 102       }
 103       return;
 104    }
 105
 106    /* The following code is for Sea Islands and later. */
 107    /* the same maximum size as for copying */
 108    ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
 109    si_need_dma_space(sctx, ncopy * 5, sdst, NULL);
 110
 111    for (i = 0; i < ncopy; i++) {
 112       csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
 113       radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, 0x8000 /* dword copy */));
 114       radeon_emit(cs, offset);
 115       radeon_emit(cs, offset >> 32);
 116       radeon_emit(cs, clear_value);
 117       /* dw count */
 118       radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc);
 119       offset += csize;
 120       size -= csize;
 121    }
 122 }
 123
 124 void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
 125                          struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
 126                          uint64_t size)
 127 {
 128    struct radeon_cmdbuf *cs = sctx->sdma_cs;
 129    unsigned i, ncopy, csize;
 130    struct si_resource *sdst = si_resource(dst);
 131    struct si_resource *ssrc = si_resource(src);
 132
 133    if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || src->flags & PIPE_RESOURCE_FLAG_SPARSE) {
 134       si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
 135       return;
 136    }
 137
 138    /* Mark the buffer range of destination as valid (initialized),
 139     * so that transfer_map knows it should wait for the GPU when mapping
 140     * that range. */
 141    util_range_add(dst, &sdst->valid_buffer_range, dst_offset, dst_offset + size);
 142
 143    dst_offset += sdst->gpu_address;
 144    src_offset += ssrc->gpu_address;
 145
 146    if (sctx->chip_class == GFX6) {
 147       unsigned max_size, sub_cmd, shift;
 148
 149       /* see whether we should use the dword-aligned or byte-aligned copy */
 150       if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
 151          sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
 152          shift = 2;
 153          max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
 154       } else {
 155          sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
 156          shift = 0;
 157          max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
 158       }
 159
 160       ncopy = DIV_ROUND_UP(size, max_size);
 161       si_need_dma_space(sctx, ncopy * 5, sdst, ssrc);
 162
 163       for (i = 0; i < ncopy; i++) {
 164          csize = MIN2(size, max_size);
 165          radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize >> shift));
 166          radeon_emit(cs, dst_offset);
 167          radeon_emit(cs, src_offset);
 168          radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
 169          radeon_emit(cs, (src_offset >> 32UL) & 0xff);
 170          dst_offset += csize;
 171          src_offset += csize;
 172          size -= csize;
 173       }
 174       return;
 175    }
 176
 177    /* The following code is for CI and later. */
 178    unsigned align = ~0u;
 179    ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
 180
 181    /* Align copy size to dw if src/dst address are dw aligned */
 182    if ((src_offset & 0x3) == 0 && (dst_offset & 0x3) == 0 && size > 4 && (size & 3) != 0) {
 183       align = ~0x3u;
 184       ncopy++;
 185    }
 186
 187    si_need_dma_space(sctx, ncopy * 7, sdst, ssrc);
 188
 189    for (i = 0; i < ncopy; i++) {
 190       csize = size >= 4 ? MIN2(size & align, CIK_SDMA_COPY_MAX_SIZE) : size;
 191       radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR, 0));
 192       radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
 193       radeon_emit(cs, 0); /* src/dst endian swap */
 194       radeon_emit(cs, src_offset);
 195       radeon_emit(cs, src_offset >> 32);
 196       radeon_emit(cs, dst_offset);
 197       radeon_emit(cs, dst_offset >> 32);
 198       dst_offset += csize;
 199       src_offset += csize;
 200       size -= csize;
 201    }
 202 }
 203
 204 void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst,
 205                        struct si_resource *src)
 206 {
 207    struct radeon_winsys *ws = ctx->ws;
 208    uint64_t vram = ctx->sdma_cs->used_vram;
 209    uint64_t gtt = ctx->sdma_cs->used_gart;
 210
 211    if (dst) {
 212       vram += dst->vram_usage;
 213       gtt += dst->gart_usage;
 214    }
 215    if (src) {
 216       vram += src->vram_usage;
 217       gtt += src->gart_usage;
 218    }
 219
 220    /* Flush the GFX IB if DMA depends on it. */
 221    if (!ctx->sdma_uploads_in_progress && radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
 222        ((dst && ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
 223         (src && ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf, RADEON_USAGE_WRITE))))
 224       si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 225
 226    /* Flush if there's not enough space, or if the memory usage per IB
 227     * is too large.
 228     *
 229     * IBs using too little memory are limited by the IB submission overhead.
 230     * IBs using too much memory are limited by the kernel/TTM overhead.
 231     * Too long IBs create CPU-GPU pipeline bubbles and add latency.
 232     *
 233     * This heuristic makes sure that DMA requests are executed
 234     * very soon after the call is made and lowers memory usage.
 235     * It improves texture upload performance by keeping the DMA
 236     * engine busy while uploads are being submitted.
 237     */
 238    num_dw++; /* for emit_wait_idle below */
 239    if (!ctx->sdma_uploads_in_progress &&
 240        (!ws->cs_check_space(ctx->sdma_cs, num_dw, false) ||
 241         ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 ||
 242         !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) {
 243       si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
 244       assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw);
 245    }
 246
 247    /* Wait for idle if either buffer has been used in the IB before to
 248     * prevent read-after-write hazards.
 249     */
 250    if ((dst && ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
 251        (src && ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf, RADEON_USAGE_WRITE)))
 252       si_dma_emit_wait_idle(ctx);
 253
 254    unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
 255    if (dst) {
 256       ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, dst->domains, 0);
 257    }
 258    if (src) {
 259       ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, src->domains, 0);
 260    }
 261
 262    /* this function is called before all DMA calls, so increment this. */
 263    ctx->num_dma_calls++;
 264 }
 265
 266 void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
 267 {
 268    struct radeon_cmdbuf *cs = ctx->sdma_cs;
 269    struct radeon_saved_cs saved;
 270    bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;
 271
 272    if (!radeon_emitted(cs, 0)) {
 273       if (fence)
 274          ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
 275       return;
 276    }
 277
 278    if (check_vm)
 279       si_save_cs(ctx->ws, cs, &saved, true);
 280
 281    ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
 282    if (fence)
 283       ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
 284
 285    if (check_vm) {
 286       /* Use conservative timeout 800ms, after which we won't wait any
 287        * longer and assume the GPU is hung.
 288        */
 289       ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800 * 1000 * 1000);
 290
 291       si_check_vm_faults(ctx, &saved, RING_DMA);
 292       si_clear_saved_cs(&saved);
 293    }
 294 }
 295
 296 void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
 297                             uint64_t size, unsigned value)
 298 {
 299    struct si_context *ctx = (struct si_context *)sscreen->aux_context;
 300
 301    simple_mtx_lock(&sscreen->aux_context_lock);
 302    si_sdma_clear_buffer(ctx, dst, offset, size, value);
 303    sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
 304    simple_mtx_unlock(&sscreen->aux_context_lock);
 305 }