/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "sid.h"
static void si_dma_emit_wait_idle(struct si_context *sctx)
{
        struct radeon_cmdbuf *cs = sctx->dma_cs;

        /* NOP waits for idle. */
        if (sctx->chip_class >= CIK)
                radeon_emit(cs, 0x00000000); /* NOP */
        else
                radeon_emit(cs, 0xf0000000); /* NOP */
}
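
/* Emit an SDMA timestamp packet that writes the 64-bit global GPU timestamp
 * to "dst" at "offset". Only the CIK+ SDMA engine supports this; the SI DMA
 * engine has no equivalent packet.
 */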
void si_dma_emit_timestamp(struct si_context *sctx, struct r600_resource *dst,
                           uint64_t offset)
{
        struct radeon_cmdbuf *cs = sctx->dma_cs;
        uint64_t va = dst->gpu_address + offset;

        if (sctx->chip_class == SI) {
                unreachable("SI DMA doesn't support the timestamp packet.");
                return;
        }

        /* Mark the buffer range of destination as valid (initialized),
         * so that transfer_map knows it should wait for the GPU when mapping
         * that range. */
        util_range_add(&dst->valid_buffer_range, offset, offset + 8);

        si_need_dma_space(sctx, 4, dst, NULL);
        si_dma_emit_wait_idle(sctx);

        radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP,
                                        SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP,
                                        0));
        radeon_emit(cs, va);       /* address bits 31:0 */
        radeon_emit(cs, va >> 32); /* address bits 63:32 */
}
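
/* Fill a buffer range with a 32-bit value using the DMA engine. Falls back
 * to the context's GFX-based clear_buffer when there is no DMA CS or the
 * destination is a sparse buffer.
 */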
void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
                          uint64_t offset, uint64_t size, unsigned clear_value)
{
        struct radeon_cmdbuf *cs = sctx->dma_cs;
        unsigned i, ncopy, csize;
        struct r600_resource *rdst = r600_resource(dst);

        assert(offset % 4 == 0);
        assert(size % 4 == 0);

        if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE) {
                sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
                return;
        }

        /* Mark the buffer range of destination as valid (initialized),
         * so that transfer_map knows it should wait for the GPU when mapping
         * that range. */
        util_range_add(&rdst->valid_buffer_range, offset, offset + size);

        offset += rdst->gpu_address;

        if (sctx->chip_class == SI) {
                /* the same maximum size as for copying */
                ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
                si_need_dma_space(sctx, ncopy * 4, rdst, NULL);

                /* 4-dword packet: header (with dword count), address low,
                 * clear value, address high. */
                for (i = 0; i < ncopy; i++) {
                        csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
                        radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0,
                                                      csize / 4));
                        radeon_emit(cs, offset);
                        radeon_emit(cs, clear_value);
                        radeon_emit(cs, (offset >> 32) << 16);
                        offset += csize;
                        size -= csize;
                }
                return;
        }

        /* The following code is for CI, VI, Vega/Raven, etc. */
        /* the same maximum size as for copying */
        ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
        si_need_dma_space(sctx, ncopy * 5, rdst, NULL);

        /* 5-dword packet: header, address low, address high, clear value,
         * fill size (GFX9 and later use size - 1). */
        for (i = 0; i < ncopy; i++) {
                csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
                radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0,
                                                0x8000 /* dword copy */));
                radeon_emit(cs, offset);
                radeon_emit(cs, offset >> 32);
                radeon_emit(cs, clear_value);
                radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
                offset += csize;
                size -= csize;
        }
}
void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
                       struct r600_resource *dst, struct r600_resource *src)
{
        uint64_t vram = ctx->dma_cs->used_vram;
        uint64_t gtt = ctx->dma_cs->used_gart;

        if (dst) {
                vram += dst->vram_usage;
                gtt += dst->gart_usage;
        }
        if (src) {
                vram += src->vram_usage;
                gtt += src->gart_usage;
        }

        /* Flush the GFX IB if DMA depends on it. */
        if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
            ((dst &&
              ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
                                               RADEON_USAGE_READWRITE)) ||
             (src &&
              ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
                                               RADEON_USAGE_WRITE))))
                si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);

        /* Flush if there's not enough space, or if the memory usage per IB
         * is too large.
         *
         * IBs using too little memory are limited by the IB submission overhead.
         * IBs using too much memory are limited by the kernel/TTM overhead.
         * Too long IBs create CPU-GPU pipeline bubbles and add latency.
         *
         * This heuristic makes sure that DMA requests are executed
         * very soon after the call is made and lowers memory usage.
         * It improves texture upload performance by keeping the DMA
         * engine busy while uploads are being submitted.
         */
        num_dw++; /* for emit_wait_idle below */
        if (!ctx->ws->cs_check_space(ctx->dma_cs, num_dw) ||
            ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
            !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt)) {
                si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
                assert((num_dw + ctx->dma_cs->current.cdw) <= ctx->dma_cs->current.max_dw);
        }

        /* Wait for idle if either buffer has been used in the IB before to
         * prevent read-after-write hazards.
         */
        if ((dst &&
             ctx->ws->cs_is_buffer_referenced(ctx->dma_cs, dst->buf,
                                              RADEON_USAGE_READWRITE)) ||
            (src &&
             ctx->ws->cs_is_buffer_referenced(ctx->dma_cs, src->buf,
                                              RADEON_USAGE_WRITE)))
                si_dma_emit_wait_idle(ctx);

        if (dst) {
                radeon_add_to_buffer_list(ctx, ctx->dma_cs, dst,
                                          RADEON_USAGE_WRITE, 0);
        }
        if (src) {
                radeon_add_to_buffer_list(ctx, ctx->dma_cs, src,
                                          RADEON_USAGE_READ, 0);
        }

        /* this function is called before all DMA calls, so increment this. */
        ctx->num_dma_calls++;
}
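
/* Submit the DMA IB to the kernel. If nothing has been emitted, only the
 * last fence is returned. With the CHECK_VM debug flag, the IB is saved
 * and checked for VM faults after completion.
 */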
void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
                     struct pipe_fence_handle **fence)
{
        struct radeon_cmdbuf *cs = ctx->dma_cs;
        struct radeon_saved_cs saved;
        bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;

        if (!radeon_emitted(cs, 0)) {
                if (fence)
                        ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
                return;
        }

        if (check_vm)
                si_save_cs(ctx->ws, cs, &saved, true);

        ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
        if (fence)
                ctx->ws->fence_reference(fence, ctx->last_sdma_fence);

        if (check_vm) {
                /* Use conservative timeout 800ms, after which we won't wait any
                 * longer and assume the GPU is hung.
                 */
                ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800*1000*1000);

                si_check_vm_faults(ctx, &saved, RING_DMA);
                si_clear_saved_cs(&saved);
        }
}
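
/* Clear a buffer from screen (non-context) code by using the screen's
 * auxiliary context under a lock, then flush that context so the clear is
 * submitted.
 */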
void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
                            uint64_t offset, uint64_t size, unsigned value)
{
        struct si_context *ctx = (struct si_context*)sscreen->aux_context;

        mtx_lock(&sscreen->aux_context_lock);
        si_sdma_clear_buffer(ctx, dst, offset, size, value);
        sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
        mtx_unlock(&sscreen->aux_context_lock);
}
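
/* Usage sketch (hypothetical caller, not part of this file): screen-level
 * code that wants to zero-initialize a newly created, non-sparse buffer
 * resource "buf" could call:
 *
 *   si_screen_clear_buffer(sscreen, buf, 0, buf->width0, 0);
 *
 * where "buf" and the zero fill value are assumptions for illustration only.
 */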