radeonsi: use SDMA for uploading data through const_uploader

author Marek Olšák <marek.olsak@amd.com>

Thu, 31 Jan 2019 01:56:59 +0000 (20:56 -0500)

committer Marek Olšák <marek.olsak@amd.com>

Thu, 21 Feb 2019 02:04:29 +0000 (21:04 -0500)
author Marek Olšák <marek.olsak@amd.com>
Thu, 31 Jan 2019 01:56:59 +0000 (20:56 -0500)
committer Marek Olšák <marek.olsak@amd.com>
Thu, 21 Feb 2019 02:04:29 +0000 (21:04 -0500)
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c

index c01118ce96a2745fa554e12ab9f3f8efe985577f..4936eb5a5b11eb93769e607cc0855a17b0c1fb52 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -440,7 +440,15 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx,
                 }
         }
  
-       if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+       if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT &&
+           buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+               usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED |
+                          PIPE_TRANSFER_PERSISTENT);
+               usage |= PIPE_TRANSFER_DISCARD_RANGE;
+               force_discard_range = true;
+       }
+
+       if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
             ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
                          PIPE_TRANSFER_PERSISTENT))) ||
              (buf->flags & RADEON_FLAG_SPARSE))) {
@@ -453,10 +461,20 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx,
                     si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
                     !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
                         /* Do a wait-free write-only transfer using a temporary buffer. */
-                       unsigned offset;
+                       struct u_upload_mgr *uploader;
                         struct si_resource *staging = NULL;
+                       unsigned offset;
+
+                       /* If we are not called from the driver thread, we have
+                        * to use the uploader from u_threaded_context, which is
+                        * local to the calling thread.
+                        */
+                       if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
+                               uploader = sctx->tc->base.stream_uploader;
+                       else
+                               uploader = sctx->b.stream_uploader;
  
-                       u_upload_alloc(ctx->stream_uploader, 0,
+                       u_upload_alloc(uploader, 0,
                                         box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT),
                                        sctx->screen->info.tcc_cache_line_size,
                                        &offset, (struct pipe_resource**)&staging,
@@ -521,6 +539,7 @@ static void si_buffer_do_flush_region(struct pipe_context *ctx,
                                       struct pipe_transfer *transfer,
                                       const struct pipe_box *box)
  {
+       struct si_context *sctx = (struct si_context*)ctx;
         struct si_transfer *stransfer = (struct si_transfer*)transfer;
         struct si_resource *buf = si_resource(transfer->resource);
  
@@ -529,10 +548,49 @@ static void si_buffer_do_flush_region(struct pipe_context *ctx,
                                       transfer->box.x % SI_MAP_BUFFER_ALIGNMENT +
                                       (box->x - transfer->box.x);
  
+               if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+                       /* This should be true for all uploaders. */
+                       assert(transfer->box.x == 0);
+
+                       /* Find a previous upload and extend its range. The last
+                        * upload is likely to be at the end of the list.
+                        */
+                       for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) {
+                               struct si_sdma_upload *up = &sctx->sdma_uploads[i];
+
+                               if (up->dst != buf)
+                                       continue;
+
+                               assert(up->src == stransfer->staging);
+                               assert(box->x > up->dst_offset);
+                               up->size = box->x + box->width - up->dst_offset;
+                               return;
+                       }
+
+                       /* Enlarge the array if it's full. */
+                       if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) {
+                               unsigned size;
+
+                               sctx->max_sdma_uploads += 4;
+                               size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]);
+                               sctx->sdma_uploads = realloc(sctx->sdma_uploads, size);
+                       }
+
+                       /* Add a new upload. */
+                       struct si_sdma_upload *up =
+                               &sctx->sdma_uploads[sctx->num_sdma_uploads++];
+                       up->dst = up->src = NULL;
+                       si_resource_reference(&up->dst, buf);
+                       si_resource_reference(&up->src, stransfer->staging);
+                       up->dst_offset = box->x;
+                       up->src_offset = src_offset;
+                       up->size = box->width;
+                       return;
+               }
+
                 /* Copy the staging buffer into the original one. */
-               si_copy_buffer((struct si_context*)ctx, transfer->resource,
-                              &stransfer->staging->b.b, box->x, src_offset,
-                              box->width);
+               si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b,
+                              box->x, src_offset, box->width);
         }
  
         util_range_add(&buf->valid_buffer_range, box->x,
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c

index 2aafc1f09a0e0f17251d4bae07cd301bc240cb89..bba1bd9582644601cbec9cace10d5eef239f50bb 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -140,7 +140,8 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
         }
  
         /* Flush the GFX IB if DMA depends on it. */
-       if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
+       if (!ctx->sdma_uploads_in_progress &&
+           radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
             ((dst &&
               ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
                                           RADEON_USAGE_READWRITE)) ||
@@ -162,9 +163,10 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
          * engine busy while uploads are being submitted.
          */
         num_dw++; /* for emit_wait_idle below */
-       if (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
-           ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
-           !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt)) {
+       if (!ctx->sdma_uploads_in_progress &&
+           (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
+            ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
+            !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt))) {
                 si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
                 assert((num_dw + ctx->dma_cs->current.cdw) <= ctx->dma_cs->current.max_dw);
         }
@@ -180,13 +182,14 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
                                          RADEON_USAGE_WRITE)))
                 si_dma_emit_wait_idle(ctx);
  
+       unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
         if (dst) {
-               radeon_add_to_buffer_list(ctx, ctx->dma_cs, dst,
-                                         RADEON_USAGE_WRITE, 0);
+               ws->cs_add_buffer(ctx->dma_cs, dst->buf, RADEON_USAGE_WRITE | sync,
+                                 dst->domains, 0);
         }
         if (src) {
-               radeon_add_to_buffer_list(ctx, ctx->dma_cs, src,
-                                         RADEON_USAGE_READ, 0);
+               ws->cs_add_buffer(ctx->dma_cs, src->buf, RADEON_USAGE_READ | sync,
+                                 src->domains, 0);
         }
  
         /* this function is called before all DMA calls, so increment this. */
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c

index 3d64587fa2b944bf95faa5a17cf802a1848dfdb1..13d5b5a959a0905f3a453477027f343f9f85a210 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -26,6 +26,7 @@
  #include "si_pipe.h"
  
  #include "util/os_time.h"
+#include "util/u_upload_mgr.h"
  
  /* initialize */
  void si_need_gfx_cs_space(struct si_context *ctx)
@@ -64,6 +65,15 @@ void si_need_gfx_cs_space(struct si_context *ctx)
                 si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
  }
  
+void si_unref_sdma_uploads(struct si_context *sctx)
+{
+       for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
+               si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
+               si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
+       }
+       sctx->num_sdma_uploads = 0;
+}
+
  void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
                      struct pipe_fence_handle **fence)
  {
@@ -98,17 +108,37 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
         if (ctx->screen->debug_flags & DBG(CHECK_VM))
                 flags &= ~PIPE_FLUSH_ASYNC;
  
+       ctx->gfx_flush_in_progress = true;
+
         /* If the state tracker is flushing the GFX IB, si_flush_from_st is
          * responsible for flushing the DMA IB and merging the fences from both.
-        * This code is only needed when the driver flushes the GFX IB
-        * internally, and it never asks for a fence handle.
+        * If the driver flushes the GFX IB internally, it should never ask
+        * for a fence handle.
          */
-       if (radeon_emitted(ctx->dma_cs, 0)) {
-               assert(fence == NULL); /* internal flushes only */
-               si_flush_dma_cs(ctx, flags, NULL);
+       assert(!radeon_emitted(ctx->dma_cs, 0) || fence == NULL);
+
+       /* Update the sdma_uploads list by flushing the uploader. */
+       u_upload_unmap(ctx->b.const_uploader);
+
+       /* Execute SDMA uploads. */
+       ctx->sdma_uploads_in_progress = true;
+       for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
+               struct si_sdma_upload *up = &ctx->sdma_uploads[i];
+               struct pipe_box box;
+
+               assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 &&
+                      up->size % 4 == 0);
+
+               u_box_1d(up->src_offset, up->size, &box);
+               ctx->dma_copy(&ctx->b, &up->dst->b.b, 0, up->dst_offset, 0, 0,
+                             &up->src->b.b, 0, &box);
         }
+       ctx->sdma_uploads_in_progress = false;
+       si_unref_sdma_uploads(ctx);
  
-       ctx->gfx_flush_in_progress = true;
+       /* Flush SDMA (preamble IB). */
+       if (radeon_emitted(ctx->dma_cs, 0))
+               si_flush_dma_cs(ctx, flags, NULL);
  
         if (!LIST_IS_EMPTY(&ctx->active_queries))
                 si_suspend_queries(ctx);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c

index c02c81cad3a1500e42dfe82be3c1f6d9531039b2..b965d9d64d491d11d265b140e71a65a62cecb024 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -264,6 +264,7 @@ static void si_destroy_context(struct pipe_context *context)
         util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
         util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
         util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
+       si_unref_sdma_uploads(sctx);
         FREE(sctx);
  }
  
@@ -443,14 +444,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
         if (!sctx->b.stream_uploader)
                 goto fail;
  
-       sctx->b.const_uploader = u_upload_create(&sctx->b, 128 * 1024,
-                                                  0, PIPE_USAGE_DEFAULT,
-                                                  SI_RESOURCE_FLAG_32BIT |
-                                                  (sscreen->cpdma_prefetch_writes_memory ?
-                                                           0 : SI_RESOURCE_FLAG_READ_ONLY));
-       if (!sctx->b.const_uploader)
-               goto fail;
-
         sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024,
                                                        0, PIPE_USAGE_STAGING, 0);
         if (!sctx->cached_gtt_allocator)
@@ -466,6 +459,20 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
                                                    sctx, stop_exec_on_failure);
         }
  
+       bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->dma_cs;
+       sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024,
+                                                0, PIPE_USAGE_DEFAULT,
+                                                SI_RESOURCE_FLAG_32BIT |
+                                                (use_sdma_upload ?
+                                                         SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA :
+                                                         (sscreen->cpdma_prefetch_writes_memory ?
+                                                                  0 : SI_RESOURCE_FLAG_READ_ONLY)));
+       if (!sctx->b.const_uploader)
+               goto fail;
+
+       if (use_sdma_upload)
+               u_upload_enable_flush_explicit(sctx->b.const_uploader);
+
         si_init_buffer_functions(sctx);
         si_init_clear_functions(sctx);
         si_init_blit_functions(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h

index b01d57447526056f2c26df47cfea9ebd3959d6ed..b208bdeb848df72aac92768dc4dd671e963add92 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -110,6 +110,8 @@
  #define SI_RESOURCE_FLAG_READ_ONLY     (PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
  #define SI_RESOURCE_FLAG_32BIT         (PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
  #define SI_RESOURCE_FLAG_CLEAR         (PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
+/* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */
+#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA  (PIPE_RESOURCE_FLAG_DRV_PRIV << 8)
  
  enum si_clear_code
  {
@@ -776,6 +778,14 @@ struct si_saved_cs {
         int64_t                 time_flush;
  };
  
+struct si_sdma_upload {
+       struct si_resource      *dst;
+       struct si_resource      *src;
+       unsigned                src_offset;
+       unsigned                dst_offset;
+       unsigned                size;
+};
+
  struct si_context {
         struct pipe_context             b; /* base class */
  
@@ -1081,6 +1091,12 @@ struct si_context {
         bool                            render_cond_invert;
         bool                            render_cond_force_off; /* for u_blitter */
  
+       /* For uploading data via GTT and copying it to VRAM on context flush via SDMA. */
+       bool                            sdma_uploads_in_progress;
+       struct si_sdma_upload           *sdma_uploads;
+       unsigned                        num_sdma_uploads;
+       unsigned                        max_sdma_uploads;
+
         /* Statistics gathering for the DCC enablement heuristic. It can't be
          * in si_texture because si_texture can be shared by multiple
          * contexts. This is for back buffers only. We shouldn't get too many
@@ -1280,6 +1296,7 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
                      struct pipe_fence_handle **fence);
  void si_begin_new_gfx_cs(struct si_context *ctx);
  void si_need_gfx_cs_space(struct si_context *ctx);
+void si_unref_sdma_uploads(struct si_context *sctx);
  
  /* si_gpu_load.c */
  void si_gpu_load_kill_thread(struct si_screen *sscreen);
author	Marek Olšák <marek.olsak@amd.com>
	Thu, 31 Jan 2019 01:56:59 +0000 (20:56 -0500)
committer	Marek Olšák <marek.olsak@amd.com>
	Thu, 21 Feb 2019 02:04:29 +0000 (21:04 -0500)
src/gallium/drivers/radeonsi/si_buffer.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_dma_cs.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_gfx_cs.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_pipe.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_pipe.h		patch \| blob \| history