src/gallium/drivers/radeonsi/si_dma_cs.c

   1 /*
   2  * Copyright 2018 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include "si_pipe.h"
  26 #include "radeon/r600_cs.h"
  27
  28 static void si_dma_emit_wait_idle(struct si_context *sctx)
  29 {
  30         struct radeon_winsys_cs *cs = sctx->b.dma_cs;
  31
  32         /* NOP waits for idle on Evergreen and later. */
  33         if (sctx->b.chip_class >= CIK)
  34                 radeon_emit(cs, 0x00000000); /* NOP */
  35         else
  36                 radeon_emit(cs, 0xf0000000); /* NOP */
  37 }
  38
  39 void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
  40                        struct r600_resource *dst, struct r600_resource *src)
  41 {
  42         uint64_t vram = ctx->b.dma_cs->used_vram;
  43         uint64_t gtt = ctx->b.dma_cs->used_gart;
  44
  45         if (dst) {
  46                 vram += dst->vram_usage;
  47                 gtt += dst->gart_usage;
  48         }
  49         if (src) {
  50                 vram += src->vram_usage;
  51                 gtt += src->gart_usage;
  52         }
  53
  54         /* Flush the GFX IB if DMA depends on it. */
  55         if (radeon_emitted(ctx->b.gfx_cs, ctx->b.initial_gfx_cs_size) &&
  56             ((dst &&
  57               ctx->b.ws->cs_is_buffer_referenced(ctx->b.gfx_cs, dst->buf,
  58                                                  RADEON_USAGE_READWRITE)) ||
  59              (src &&
  60               ctx->b.ws->cs_is_buffer_referenced(ctx->b.gfx_cs, src->buf,
  61                                                  RADEON_USAGE_WRITE))))
  62                 si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
  63
  64         /* Flush if there's not enough space, or if the memory usage per IB
  65          * is too large.
  66          *
  67          * IBs using too little memory are limited by the IB submission overhead.
  68          * IBs using too much memory are limited by the kernel/TTM overhead.
  69          * Too long IBs create CPU-GPU pipeline bubbles and add latency.
  70          *
  71          * This heuristic makes sure that DMA requests are executed
  72          * very soon after the call is made and lowers memory usage.
  73          * It improves texture upload performance by keeping the DMA
  74          * engine busy while uploads are being submitted.
  75          */
  76         num_dw++; /* for emit_wait_idle below */
  77         if (!ctx->b.ws->cs_check_space(ctx->b.dma_cs, num_dw) ||
  78             ctx->b.dma_cs->used_vram + ctx->b.dma_cs->used_gart > 64 * 1024 * 1024 ||
  79             !radeon_cs_memory_below_limit(ctx->screen, ctx->b.dma_cs, vram, gtt)) {
  80                 si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
  81                 assert((num_dw + ctx->b.dma_cs->current.cdw) <= ctx->b.dma_cs->current.max_dw);
  82         }
  83
  84         /* Wait for idle if either buffer has been used in the IB before to
  85          * prevent read-after-write hazards.
  86          */
  87         if ((dst &&
  88              ctx->b.ws->cs_is_buffer_referenced(ctx->b.dma_cs, dst->buf,
  89                                                 RADEON_USAGE_READWRITE)) ||
  90             (src &&
  91              ctx->b.ws->cs_is_buffer_referenced(ctx->b.dma_cs, src->buf,
  92                                                 RADEON_USAGE_WRITE)))
  93                 si_dma_emit_wait_idle(ctx);
  94
  95         if (dst) {
  96                 radeon_add_to_buffer_list(ctx, ctx->b.dma_cs, dst,
  97                                           RADEON_USAGE_WRITE,
  98                                           RADEON_PRIO_SDMA_BUFFER);
  99         }
 100         if (src) {
 101                 radeon_add_to_buffer_list(ctx, ctx->b.dma_cs, src,
 102                                           RADEON_USAGE_READ,
 103                                           RADEON_PRIO_SDMA_BUFFER);
 104         }
 105
 106         /* this function is called before all DMA calls, so increment this. */
 107         ctx->b.num_dma_calls++;
 108 }
 109
 110 void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
 111                      struct pipe_fence_handle **fence)
 112 {
 113         struct radeon_winsys_cs *cs = ctx->b.dma_cs;
 114         struct radeon_saved_cs saved;
 115         bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;
 116
 117         if (!radeon_emitted(cs, 0)) {
 118                 if (fence)
 119                         ctx->b.ws->fence_reference(fence, ctx->b.last_sdma_fence);
 120                 return;
 121         }
 122
 123         if (check_vm)
 124                 si_save_cs(ctx->b.ws, cs, &saved, true);
 125
 126         ctx->b.ws->cs_flush(cs, flags, &ctx->b.last_sdma_fence);
 127         if (fence)
 128                 ctx->b.ws->fence_reference(fence, ctx->b.last_sdma_fence);
 129
 130         if (check_vm) {
 131                 /* Use conservative timeout 800ms, after which we won't wait any
 132                  * longer and assume the GPU is hung.
 133                  */
 134                 ctx->b.ws->fence_wait(ctx->b.ws, ctx->b.last_sdma_fence, 800*1000*1000);
 135
 136                 si_check_vm_faults(ctx, &saved, RING_DMA);
 137                 si_clear_saved_cs(&saved);
 138         }
 139 }
 140
 141 void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
 142                             uint64_t offset, uint64_t size, unsigned value)
 143 {
 144         struct si_context *ctx = (struct si_context*)sscreen->aux_context;
 145
 146         mtx_lock(&sscreen->aux_context_lock);
 147         ctx->b.dma_clear_buffer(ctx, dst, offset, size, value);
 148         sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
 149         mtx_unlock(&sscreen->aux_context_lock);
 150 }