From a1e391e39df2b1d8169e773a30153167ab8e13e8 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Nicolai=20H=C3=A4hnle?=
Date: Mon, 12 Sep 2016 11:46:12 +0200
Subject: [PATCH] winsys/radeon: add fine-grained fences for slab buffers
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Note that the logic for adding fences is somewhat different from amdgpu's,
because radeon has no scheduler and we therefore have no guarantee about
the order in which submissions from multiple threads are processed.

(Ironically, this is only an issue when "multi-threaded submission" is
disabled, because "multi-threaded submission" actually means that all
submissions happen from a single thread that happens to be separate from
the application's threads. If we only supported "multi-threaded
submission", the fence handling could be simplified by adding the fences
in that thread, where everything is serialized.)

Reviewed-by: Marek Olšák
---
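Notes: the ordering problem described above, and the bookkeeping that
radeon_bo_slab_fence and radeon_bo_wait_idle implement below, can be
reduced to a self-contained toy sketch. All names here are invented for
illustration; a polled atomic flag stands in for a real GPU fence,
pthreads for pipe_mutex, and reference counting is omitted (the real code
takes a reference to the fence before dropping the lock, so the fence
cannot be destroyed while it is being waited on).

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdlib.h>
    #include <string.h>

    /* Stand-in for a fence; in the patch this is the radeon_bo of the
     * command submission that last used the slab's backing buffer. */
    struct toy_fence {
        bool ioctl_returned;    /* like num_cs_references dropping to 0 */
        atomic_bool signaled;
    };

    struct toy_slab_bo {
        struct toy_fence **fences;
        unsigned num_fences;
        unsigned max_fences;
    };

    /* One winsys-wide lock, like radeon_drm_winsys::bo_fence_lock. */
    static pthread_mutex_t fence_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Like radeon_bo_slab_fence; called with fence_lock held. Fences of
     * submissions whose ioctl has already returned are pruned: the kernel
     * received them before it will receive the submission producing the
     * new fence, so the new fence subsumes them. Fences of still in-flight
     * ioctls must all be kept, because without a scheduler there is no
     * ordering guarantee between them. */
    static void toy_slab_fence(struct toy_slab_bo *bo, struct toy_fence *fence)
    {
        unsigned dst = 0;

        for (unsigned src = 0; src < bo->num_fences; ++src) {
            if (!bo->fences[src]->ioctl_returned)
                bo->fences[dst++] = bo->fences[src];
        }
        bo->num_fences = dst;

        if (bo->num_fences >= bo->max_fences) {
            unsigned new_max = bo->max_fences + 1;
            struct toy_fence **grown =
                realloc(bo->fences, new_max * sizeof(*grown));
            if (!grown)
                return;   /* drop the fence on OOM, as the patch does */
            bo->fences = grown;
            bo->max_fences = new_max;
        }
        bo->fences[bo->num_fences++] = fence;
    }

    static void wait_for_fence(struct toy_fence *fence)
    {
        while (!atomic_load(&fence->signaled))
            sched_yield();
    }

    /* Like the slab path of radeon_bo_wait_idle: wait on the oldest fence
     * with the lock dropped, then re-check that the list head is still
     * the fence we waited on before popping it, since another thread may
     * have pruned or extended the list in the meantime. */
    static void toy_slab_wait_idle(struct toy_slab_bo *bo)
    {
        pthread_mutex_lock(&fence_lock);
        while (bo->num_fences) {
            struct toy_fence *fence = bo->fences[0];
            pthread_mutex_unlock(&fence_lock);

            wait_for_fence(fence);   /* blocking wait, lock not held */

            pthread_mutex_lock(&fence_lock);
            if (bo->num_fences && bo->fences[0] == fence) {
                memmove(&bo->fences[0], &bo->fences[1],
                        (bo->num_fences - 1) * sizeof(bo->fences[0]));
                bo->num_fences--;
            }
        }
        pthread_mutex_unlock(&fence_lock);
    }

In the patch, radeon_drm_cs_flush plays the producer role: it takes
bo_fence_lock and calls radeon_bo_slab_fence(bo, fence) for every slab
buffer referenced by the CS, and a fence's num_cs_references dropping to
zero corresponds to ioctl_returned above.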
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 55 +++++++++++-
 src/gallium/winsys/radeon/drm/radeon_drm_bo.h |  4 +
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 86 +++++++++++++++++--
 .../winsys/radeon/drm/radeon_drm_winsys.c     |  2 +
 .../winsys/radeon/drm/radeon_drm_winsys.h     |  1 +
 5 files changed, 137 insertions(+), 11 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 3f58b00efdd..f9cf2e0d2cd 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -53,7 +53,7 @@ struct radeon_bo_va_hole {
     uint64_t size;
 };
 
-static bool radeon_bo_is_busy(struct radeon_bo *bo)
+static bool radeon_real_bo_is_busy(struct radeon_bo *bo)
 {
     struct drm_radeon_gem_busy args = {0};
 
@@ -62,7 +62,31 @@ static bool radeon_bo_is_busy(struct radeon_bo *bo)
                                &args, sizeof(args)) != 0;
 }
 
-static void radeon_bo_wait_idle(struct radeon_bo *bo)
+static bool radeon_bo_is_busy(struct radeon_bo *bo)
+{
+    unsigned num_idle;
+    bool busy = false;
+
+    if (bo->handle)
+        return radeon_real_bo_is_busy(bo);
+
+    pipe_mutex_lock(bo->rws->bo_fence_lock);
+    for (num_idle = 0; num_idle < bo->u.slab.num_fences; ++num_idle) {
+        if (radeon_real_bo_is_busy(bo->u.slab.fences[num_idle])) {
+            busy = true;
+            break;
+        }
+        radeon_bo_reference(&bo->u.slab.fences[num_idle], NULL);
+    }
+    memmove(&bo->u.slab.fences[0], &bo->u.slab.fences[num_idle],
+            (bo->u.slab.num_fences - num_idle) * sizeof(bo->u.slab.fences[0]));
+    bo->u.slab.num_fences -= num_idle;
+    pipe_mutex_unlock(bo->rws->bo_fence_lock);
+
+    return busy;
+}
+
+static void radeon_real_bo_wait_idle(struct radeon_bo *bo)
 {
     struct drm_radeon_gem_wait_idle args = {0};
 
@@ -71,6 +95,33 @@ static void radeon_bo_wait_idle(struct radeon_bo *bo)
                            &args, sizeof(args)) == -EBUSY);
 }
 
+static void radeon_bo_wait_idle(struct radeon_bo *bo)
+{
+    if (bo->handle) {
+        radeon_real_bo_wait_idle(bo);
+    } else {
+        pipe_mutex_lock(bo->rws->bo_fence_lock);
+        while (bo->u.slab.num_fences) {
+            struct radeon_bo *fence = NULL;
+            radeon_bo_reference(&fence, bo->u.slab.fences[0]);
+            pipe_mutex_unlock(bo->rws->bo_fence_lock);
+
+            /* Wait without holding the fence lock. */
+            radeon_real_bo_wait_idle(fence);
+
+            pipe_mutex_lock(bo->rws->bo_fence_lock);
+            if (bo->u.slab.num_fences && fence == bo->u.slab.fences[0]) {
+                radeon_bo_reference(&bo->u.slab.fences[0], NULL);
+                memmove(&bo->u.slab.fences[0], &bo->u.slab.fences[1],
+                        (bo->u.slab.num_fences - 1) * sizeof(bo->u.slab.fences[0]));
+                bo->u.slab.num_fences--;
+            }
+            radeon_bo_reference(&fence, NULL);
+        }
+        pipe_mutex_unlock(bo->rws->bo_fence_lock);
+    }
+}
+
 static bool radeon_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
                            enum radeon_bo_usage usage)
 {
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
index 8e35a385ed8..8f767fd2c73 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
@@ -50,6 +50,10 @@ struct radeon_bo {
         struct {
             struct pb_slab_entry entry;
             struct radeon_bo *real;
+
+            unsigned num_fences;
+            unsigned max_fences;
+            struct radeon_bo **fences;
         } slab;
     } u;
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 9fbd3783699..79c09e22048 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -471,6 +471,8 @@ void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
 
     for (i = 0; i < csc->num_relocs; i++)
         p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
+    for (i = 0; i < csc->num_slab_buffers; i++)
+        p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);
 
     radeon_cs_context_cleanup(csc);
 }
@@ -487,11 +489,61 @@ void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
         util_queue_job_wait(&cs->flush_completed);
 }
 
+/* Add the given fence to a slab buffer fence list.
+ *
+ * There is a potential race condition when bo participates in submissions on
+ * two or more threads simultaneously. Since we do not know which of the
+ * submissions will be sent to the GPU first, we have to keep the fences
+ * of all submissions.
+ *
+ * However, fences that belong to submissions that have already returned from
+ * their respective ioctl do not have to be kept, because we know that they
+ * will signal earlier.
+ */
+static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
+{
+    unsigned dst;
+
+    assert(fence->num_cs_references);
+
+    /* Cleanup older fences */
+    dst = 0;
+    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
+        if (bo->u.slab.fences[src]->num_cs_references) {
+            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
+            dst++;
+        } else {
+            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
+        }
+    }
+    bo->u.slab.num_fences = dst;
+
+    /* Check available space for the new fence */
+    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
+        unsigned new_max_fences = bo->u.slab.max_fences + 1;
+        struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
+                                                bo->u.slab.max_fences * sizeof(*new_fences),
+                                                new_max_fences * sizeof(*new_fences));
+        if (!new_fences) {
+            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
+            return;
+        }
+
+        bo->u.slab.fences = new_fences;
+        bo->u.slab.max_fences = new_max_fences;
+    }
+
+    /* Add the new fence */
+    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
+    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
+    bo->u.slab.num_fences++;
+}
+
 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
 
 static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                                unsigned flags,
-                               struct pipe_fence_handle **fence)
+                               struct pipe_fence_handle **pfence)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     struct radeon_cs_context *tmp;
@@ -531,15 +583,31 @@ static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
             fprintf(stderr, "radeon: command stream overflowed\n");
     }
 
-    if (fence) {
-        if (cs->next_fence) {
-            radeon_fence_reference(fence, cs->next_fence);
-        } else {
-            radeon_fence_reference(fence, NULL);
-            *fence = radeon_cs_create_fence(rcs);
-        }
+    if (pfence || cs->csc->num_slab_buffers) {
+        struct pipe_fence_handle *fence;
+
+        if (cs->next_fence) {
+            fence = cs->next_fence;
+            cs->next_fence = NULL;
+        } else {
+            fence = radeon_cs_create_fence(rcs);
+        }
+
+        if (pfence)
+            radeon_fence_reference(pfence, fence);
+
+        pipe_mutex_lock(cs->ws->bo_fence_lock);
+        for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
+            struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
+            p_atomic_inc(&bo->num_active_ioctls);
+            radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
+        }
+        pipe_mutex_unlock(cs->ws->bo_fence_lock);
+
+        radeon_fence_reference(&fence, NULL);
+    } else {
+        radeon_fence_reference(&cs->next_fence, NULL);
     }
-    radeon_fence_reference(&cs->next_fence, NULL);
 
     radeon_drm_cs_sync_flush(rcs);
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index aa4bf5fb8ec..e02f286b0c2 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -556,6 +556,7 @@ static void radeon_winsys_destroy(struct radeon_winsys *rws)
     util_hash_table_destroy(ws->bo_vas);
     pipe_mutex_destroy(ws->bo_handles_mutex);
     pipe_mutex_destroy(ws->bo_va_mutex);
+    pipe_mutex_destroy(ws->bo_fence_lock);
 
     if (ws->fd >= 0)
         close(ws->fd);
@@ -787,6 +788,7 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
     ws->bo_vas = util_hash_table_create(handle_hash, handle_compare);
     pipe_mutex_init(ws->bo_handles_mutex);
     pipe_mutex_init(ws->bo_va_mutex);
+    pipe_mutex_init(ws->bo_fence_lock);
 
     ws->va_offset = ws->va_start;
     list_inithead(&ws->va_holes);
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
index 55149806ae9..b30055cf976 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -91,6 +91,7 @@ struct radeon_drm_winsys {
     struct util_hash_table *bo_vas;
     pipe_mutex bo_handles_mutex;
     pipe_mutex bo_va_mutex;
+    pipe_mutex bo_fence_lock;
 
     uint64_t va_offset;
     struct list_head va_holes;
-- 
2.30.2
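Addendum, not part of the patch: the simplification alluded to in the
commit message's parenthetical, as a hypothetical toy sketch. If every
submission were funneled through the winsys' single submission thread,
ioctl order — and therefore fence order — would be the same for all CSs
touching a buffer, and the per-buffer fence list could collapse to just
the newest fence:

    #include <stdatomic.h>

    struct toy_fence {
        atomic_bool signaled;
    };

    /* Only ever touched from the one submission thread, so no lock and
     * no list: a newer fence always signals after all older ones. */
    struct serialized_slab_bo {
        struct toy_fence *last_fence;
    };

    static void serialized_slab_fence(struct serialized_slab_bo *bo,
                                      struct toy_fence *fence)
    {
        bo->last_fence = fence;   /* older fences are subsumed */
    }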