winsys/amdgpu: decay max_ib_size over time
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 781960c96000631096a02a7412fbd119637bd6cc..fefa5d6db535cb548f66406576d70d4a66be5bbc 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -35,6 +35,7 @@
 #include <stdio.h>
 #include <amdgpu_drm.h>
 
+#include "../../../drivers/radeonsi/sid.h"
 
 /* FENCES */
 
@@ -226,6 +227,19 @@ static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs)
           cs->request.ip_type != AMDGPU_HW_IP_VCE;
 }
 
+static bool amdgpu_cs_has_chaining(enum ring_type ring_type)
+{
+   return ring_type == RING_GFX;
+}
+
+static unsigned amdgpu_cs_epilog_dws(enum ring_type ring_type)
+{
+   if (ring_type == RING_GFX)
+      return 4; /* for chaining */
+
+   return 0;
+}
+
 int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
 {
    unsigned hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
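Note: the four epilog dwords reserved for the GFX ring hold exactly one INDIRECT_BUFFER chaining packet, as emitted in amdgpu_cs_check_space() further down. A minimal sketch of that layout, using the packet macros from sid.h; `next_chunk_gpu_address` is a hypothetical stand-in for the next chunk's VA:

   /* Sketch: the 4-dword chaining epilog on the GFX ring. */
   uint64_t va = next_chunk_gpu_address;    /* hypothetical next-chunk VA */
   uint32_t epilog[4] = {
      PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0), /* header + 3 payload dwords */
      (uint32_t)va,                         /* chunk address, low 32 bits */
      (uint32_t)(va >> 32),                 /* chunk address, high 32 bits */
      S_3F2_CHAIN(1) | S_3F2_VALID(1)       /* size bits patched in later */
   };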
@@ -342,13 +356,18 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib)
    uint8_t *mapped;
    unsigned buffer_size;
 
-   /* Always create a buffer that is 4 times larger than the maximum seen IB
-    * size, aligned to a power of two. Limit to 512k dwords, which is the
-    * largest power of two that fits into the size field of the INDIRECT_BUFFER
-    * packet.
+   /* Always create a buffer that is at least as large as the maximum seen IB
+    * size, aligned to a power of two (and multiplied by 4 to reduce internal
+    * fragmentation if chaining is not available). Limit to 512k dwords, which
+    * is the largest power of two that fits into the size field of the
+    * INDIRECT_BUFFER packet.
     */
-   buffer_size = 4 * MIN2(util_next_power_of_two(4 * ib->max_ib_size),
-                          512 * 1024);
+   if (amdgpu_cs_has_chaining(amdgpu_cs_from_ib(ib)->ring_type))
+      buffer_size = 4 * util_next_power_of_two(ib->max_ib_size);
+   else
+      buffer_size = 4 * util_next_power_of_two(4 * ib->max_ib_size);
+
+   buffer_size = MIN2(buffer_size, 4 * 512 * 1024);
 
    switch (ib->ib_type) {
    case IB_CONST_PREAMBLE:
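Note: as a worked example of the sizing above, with a hypothetical max_ib_size of 5000 dwords:

   /* Worked example, hypothetical max_ib_size = 5000 dwords:
    *   chaining:    buffer_size = 4 * util_next_power_of_two(5000)     =  32768 bytes
    *   no chaining: buffer_size = 4 * util_next_power_of_two(4 * 5000) = 131072 bytes
    * Either result is then clamped to 4 * 512 * 1024 bytes (512K dwords). */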
@@ -436,12 +455,19 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs,
       unreachable("unhandled IB type");
    }
 
-   ib_size = MAX2(ib_size,
-                  4 * MIN2(util_next_power_of_two(ib->max_ib_size),
-                           amdgpu_ib_max_submit_dwords(ib_type)));
+   if (!amdgpu_cs_has_chaining(cs->ring_type)) {
+      ib_size = MAX2(ib_size,
+                     4 * MIN2(util_next_power_of_two(ib->max_ib_size),
+                              amdgpu_ib_max_submit_dwords(ib_type)));
+   }
+
+   /* Gradually decay max_ib_size so the IB buffer can shrink after a peak. */
+   ib->max_ib_size = ib->max_ib_size - ib->max_ib_size / 32;
 
-   ib->base.cdw = 0;
-   ib->base.buf = NULL;
+   ib->base.prev_dw = 0;
+   ib->base.num_prev = 0;
+   ib->base.current.cdw = 0;
+   ib->base.current.buf = NULL;
 
    /* Allocate a new buffer for IBs if the current buffer is all used. */
    if (!ib->big_ib_buffer ||
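Note: the decay statement shrinks max_ib_size by 1/32 (a factor of 31/32) on every new IB, so a peak value halves after roughly ln(2) / ln(32/31), about 22 submissions; amdgpu_cs_check_space() raises it again whenever a larger IB is actually built. A small illustration of the rate, assuming no large IBs in between:

   /* Hypothetical illustration: a peak of 16384 dwords decays to about
    * half after 22 submissions with no large IBs in between. */
   unsigned max_ib_size = 16384;
   for (unsigned i = 0; i < 22; i++)
      max_ib_size -= max_ib_size / 32;
   /* max_ib_size == 8155 here, roughly half the peak */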
@@ -452,16 +477,26 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs,
 
    info->ib_mc_address = amdgpu_winsys_bo(ib->big_ib_buffer)->va +
                          ib->used_ib_space;
+   info->size = 0;
+   ib->ptr_ib_size = &info->size;
+
    amdgpu_cs_add_buffer(&cs->main.base, ib->big_ib_buffer,
                         RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
 
-   ib->base.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
+   ib->base.current.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
 
    ib_size = ib->big_ib_buffer->size - ib->used_ib_space;
-   ib->base.max_dw = ib_size / 4;
+   ib->base.current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs->ring_type);
    return true;
 }
 
+static void amdgpu_ib_finalize(struct amdgpu_ib *ib)
+{
+   *ib->ptr_ib_size |= ib->base.current.cdw;
+   ib->used_ib_space += ib->base.current.cdw * 4;
+   ib->max_ib_size = MAX2(ib->max_ib_size, ib->base.prev_dw + ib->base.current.cdw);
+}
+
 static boolean amdgpu_init_cs_context(struct amdgpu_cs_context *cs,
                                       enum ring_type ring_type)
 {
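Note: the ptr_ib_size indirection is what makes the deferred size patching work. While a chunk is being recorded its final dword count is unknown, so the size location starts with the size bits clear and is filled in with an OR once the chunk is closed. A sketch of the two cases this patch creates:

   /* Case 1: first chunk. ptr_ib_size points at info->size (== 0), so
    * amdgpu_ib_finalize() ORs in the plain dword count for the kernel.
    * Case 2: chained chunk. ptr_ib_size points at the size/flags dword of
    * the previous INDIRECT_BUFFER packet, which already holds
    * S_3F2_CHAIN(1) | S_3F2_VALID(1); ORing in the count fills the size
    * field without disturbing those flag bits. */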
@@ -653,7 +688,7 @@ amdgpu_cs_add_const_preamble_ib(struct radeon_winsys_cs *rcs)
    return &cs->const_preamble_ib.base;
 }
 
-#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
+#define OUT_CS(cs, value) (cs)->current.buf[(cs)->current.cdw++] = (value)
 
 static int amdgpu_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
                                struct pb_buffer *buf)
@@ -672,16 +707,81 @@ static bool amdgpu_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
 {
    struct amdgpu_ib *ib = amdgpu_ib(rcs);
    struct amdgpu_cs *cs = amdgpu_cs_from_ib(ib);
-   unsigned requested_size = rcs->cdw + dw;
+   unsigned requested_size = rcs->prev_dw + rcs->current.cdw + dw;
+   uint64_t va;
+   uint32_t *new_ptr_ib_size;
 
-   assert(rcs->cdw <= rcs->max_dw);
+   assert(rcs->current.cdw <= rcs->current.max_dw);
 
    if (requested_size > amdgpu_ib_max_submit_dwords(ib->ib_type))
       return false;
 
    ib->max_ib_size = MAX2(ib->max_ib_size, requested_size);
 
-   return rcs->max_dw - rcs->cdw >= dw;
+   if (rcs->current.max_dw - rcs->current.cdw >= dw)
+      return true;
+
+   if (!amdgpu_cs_has_chaining(cs->ring_type))
+      return false;
+
+   /* Allocate a new chunk */
+   if (rcs->num_prev >= rcs->max_prev) {
+      unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
+      struct radeon_winsys_cs_chunk *new_prev;
+
+      new_prev = REALLOC(rcs->prev,
+                         sizeof(*new_prev) * rcs->max_prev,
+                         sizeof(*new_prev) * new_max_prev);
+      if (!new_prev)
+         return false;
+
+      rcs->prev = new_prev;
+      rcs->max_prev = new_max_prev;
+   }
+
+   if (!amdgpu_ib_new_buffer(cs->ctx->ws, ib))
+      return false;
+
+   assert(ib->used_ib_space == 0);
+   va = amdgpu_winsys_bo(ib->big_ib_buffer)->va;
+
+   /* Use the 4 dwords that amdgpu_get_new_ib reserved for this chaining packet. */
+   rcs->current.max_dw += 4;
+   assert(ib->used_ib_space + 4 * rcs->current.max_dw <= ib->big_ib_buffer->size);
+
+   /* Pad with NOPs and add INDIRECT_BUFFER packet */
+   while ((rcs->current.cdw & 7) != 4)
+      OUT_CS(rcs, 0xffff1000); /* type3 nop packet */
+
+   OUT_CS(rcs, PKT3(ib->ib_type == IB_MAIN ? PKT3_INDIRECT_BUFFER_CIK
+                                           : PKT3_INDIRECT_BUFFER_CONST, 2, 0));
+   OUT_CS(rcs, va);
+   OUT_CS(rcs, va >> 32);
+   new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw];
+   OUT_CS(rcs, S_3F2_CHAIN(1) | S_3F2_VALID(1));
+
+   assert((rcs->current.cdw & 7) == 0);
+   assert(rcs->current.cdw <= rcs->current.max_dw);
+
+   *ib->ptr_ib_size |= rcs->current.cdw;
+   ib->ptr_ib_size = new_ptr_ib_size;
+
+   /* Hook up the new chunk */
+   rcs->prev[rcs->num_prev].buf = rcs->current.buf;
+   rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
+   rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
+   rcs->num_prev++;
+
+   ib->base.prev_dw += ib->base.current.cdw;
+   ib->base.current.cdw = 0;
+
+   ib->base.current.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
+   ib->base.current.max_dw = ib->big_ib_buffer->size / 4 - amdgpu_cs_epilog_dws(cs->ring_type);
+
+   amdgpu_cs_add_buffer(&cs->main.base, ib->big_ib_buffer,
+                        RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
+
+   return true;
 }
 
 static boolean amdgpu_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
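Note: the `(rcs->current.cdw & 7) != 4` padding condition ensures the chunk still ends on an 8-dword boundary once the 4-dword INDIRECT_BUFFER packet is appended, preserving the CP fetch alignment that the flush path also enforces. A worked example with a hypothetical count:

   /* Worked example of the pre-chain padding, hypothetical cdw = 13:
    *   13 & 7 == 5, so seven NOPs bring cdw to 20 (20 & 7 == 4);
    *   the 4-dword INDIRECT_BUFFER packet then ends the chunk at 24,
    *   which is 8-dword aligned, as the assert after it requires. */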
@@ -876,61 +976,55 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
    struct amdgpu_cs *cs = amdgpu_cs(rcs);
    struct amdgpu_winsys *ws = cs->ctx->ws;
 
+   rcs->current.max_dw += amdgpu_cs_epilog_dws(cs->ring_type);
+
    switch (cs->ring_type) {
    case RING_DMA:
       /* pad DMA ring to 8 DWs */
-      while (rcs->cdw & 7)
+      while (rcs->current.cdw & 7)
          OUT_CS(rcs, 0x00000000); /* NOP packet */
       break;
    case RING_GFX:
       /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements */
-      while (rcs->cdw & 7)
+      while (rcs->current.cdw & 7)
          OUT_CS(rcs, 0xffff1000); /* type3 nop packet */
 
       /* Also pad the const IB. */
       if (cs->const_ib.ib_mapped)
-         while (!cs->const_ib.base.cdw || (cs->const_ib.base.cdw & 7))
+         while (!cs->const_ib.base.current.cdw || (cs->const_ib.base.current.cdw & 7))
             OUT_CS(&cs->const_ib.base, 0xffff1000); /* type3 nop packet */
 
       if (cs->const_preamble_ib.ib_mapped)
-         while (!cs->const_preamble_ib.base.cdw || (cs->const_preamble_ib.base.cdw & 7))
+         while (!cs->const_preamble_ib.base.current.cdw || (cs->const_preamble_ib.base.current.cdw & 7))
             OUT_CS(&cs->const_preamble_ib.base, 0xffff1000);
       break;
    case RING_UVD:
-      while (rcs->cdw & 15)
+      while (rcs->current.cdw & 15)
          OUT_CS(rcs, 0x80000000); /* type2 nop packet */
       break;
    default:
       break;
    }
 
-   if (rcs->cdw > rcs->max_dw) {
+   if (rcs->current.cdw > rcs->current.max_dw) {
       fprintf(stderr, "amdgpu: command stream overflowed\n");
    }
 
    /* If the CS is not empty or overflowed.... */
-   if (cs->main.base.cdw && cs->main.base.cdw <= cs->main.base.max_dw &&
+   if (radeon_emitted(&cs->main.base, 0) &&
+       cs->main.base.current.cdw <= cs->main.base.current.max_dw &&
        !debug_get_option_noop()) {
       struct amdgpu_cs_context *cur = cs->csc;
       unsigned i, num_buffers = cur->num_buffers;
 
       /* Set IB sizes. */
-      cur->ib[IB_MAIN].size = cs->main.base.cdw;
-      cs->main.used_ib_space += cs->main.base.cdw * 4;
-      cs->main.max_ib_size = MAX2(cs->main.max_ib_size, cs->main.base.cdw);
-
-      if (cs->const_ib.ib_mapped) {
-         cur->ib[IB_CONST].size = cs->const_ib.base.cdw;
-         cs->const_ib.used_ib_space += cs->const_ib.base.cdw * 4;
-         cs->const_ib.max_ib_size = MAX2(cs->const_ib.max_ib_size, cs->const_ib.base.cdw);
-      }
+      amdgpu_ib_finalize(&cs->main);
 
-      if (cs->const_preamble_ib.ib_mapped) {
-         cur->ib[IB_CONST_PREAMBLE].size = cs->const_preamble_ib.base.cdw;
-         cs->const_preamble_ib.used_ib_space += cs->const_preamble_ib.base.cdw * 4;
-         cs->const_preamble_ib.max_ib_size =
-            MAX2(cs->const_preamble_ib.max_ib_size, cs->const_preamble_ib.base.cdw);
-      }
+      if (cs->const_ib.ib_mapped)
+         amdgpu_ib_finalize(&cs->const_ib);
+
+      if (cs->const_preamble_ib.ib_mapped)
+         amdgpu_ib_finalize(&cs->const_preamble_ib);
 
       /* Create a fence. */
       amdgpu_fence_reference(&cur->fence, NULL);
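Note: at flush time the reserved epilog space is handed back before padding, because the final chunk ends in NOPs rather than a chain packet. A minimal illustration with hypothetical numbers:

   /* Hypothetical accounting for a 2048-dword GFX buffer: */
   unsigned max_dw = 2048 - amdgpu_cs_epilog_dws(RING_GFX); /* 2044 while recording */
   max_dw += amdgpu_cs_epilog_dws(RING_GFX);                /* 2048 again at flush */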
@@ -986,8 +1080,11 @@ static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
    pipe_semaphore_destroy(&cs->flush_completed);
    p_atomic_dec(&cs->ctx->ws->num_cs);
    pb_reference(&cs->main.big_ib_buffer, NULL);
+   FREE(cs->main.base.prev);
    pb_reference(&cs->const_ib.big_ib_buffer, NULL);
+   FREE(cs->const_ib.base.prev);
    pb_reference(&cs->const_preamble_ib.big_ib_buffer, NULL);
+   FREE(cs->const_preamble_ib.base.prev);
    amdgpu_destroy_cs_context(&cs->csc1);
    amdgpu_destroy_cs_context(&cs->csc2);
    FREE(cs);