winsys/radeon: add fine-grained fences for slab buffers
author     Nicolai Hähnle <nicolai.haehnle@amd.com>   Mon, 12 Sep 2016 09:46:12 +0000 (11:46 +0200)
committer  Nicolai Hähnle <nicolai.haehnle@amd.com>   Tue, 27 Sep 2016 14:45:34 +0000 (16:45 +0200)
Note that the logic for adding fences is somewhat different from amdgpu,
because radeon has no scheduler and we therefore have no guarantee about
the order in which submissions from multiple threads are processed.

(Ironically, this is only an issue when "multi-threaded submission" is
disabled, because "multi-threaded submission" actually means that all
submissions happen from a single thread that happens to be separate from
the application's threads. If we only supported "multi-threaded
submission", the fence handling could be simplified by adding the fences
in that one thread, where everything is serialized.)
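
For illustration only, here is a minimal standalone C sketch of the idea behind
the per-buffer fence list added by this patch (it is not part of the patch; the
struct fence, slab_buffer, and slab_buffer_add_fence names are made up for this
example): each submission appends its fence to the buffer, and fences belonging
to submissions that have already returned from their ioctl are pruned first,
since those are known to signal earlier.

    /* Sketch of the per-buffer fence list: keep one fence per submission
     * that might still reach the GPU, prune fences whose submission has
     * already returned from its ioctl. Hypothetical names throughout. */
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct fence { bool still_queued; };  /* stand-in for a fence BO */

    struct slab_buffer {
        unsigned num_fences;
        unsigned max_fences;
        struct fence **fences;
    };

    static void slab_buffer_add_fence(struct slab_buffer *bo, struct fence *fence)
    {
        /* Prune fences whose submission already returned from its ioctl. */
        unsigned dst = 0;
        for (unsigned src = 0; src < bo->num_fences; ++src) {
            if (bo->fences[src]->still_queued)
                bo->fences[dst++] = bo->fences[src];
        }
        bo->num_fences = dst;

        /* Grow the array if needed, then append the new fence. */
        if (bo->num_fences >= bo->max_fences) {
            unsigned new_max = bo->max_fences + 1;
            struct fence **grown = realloc(bo->fences, new_max * sizeof(*grown));
            if (!grown)
                return; /* drop the fence on allocation failure, as the patch does */
            bo->fences = grown;
            bo->max_fences = new_max;
        }
        bo->fences[bo->num_fences++] = fence;
    }

    int main(void)
    {
        struct slab_buffer bo = {0};
        struct fence f1 = { .still_queued = true };
        struct fence f2 = { .still_queued = true };

        slab_buffer_add_fence(&bo, &f1);  /* submission from thread A */
        f1.still_queued = false;          /* A's ioctl has returned */
        slab_buffer_add_fence(&bo, &f2);  /* submission from thread B prunes f1 */

        printf("fences kept: %u\n", bo.num_fences); /* prints 1 */
        free(bo.fences);
        return 0;
    }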

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
src/gallium/winsys/radeon/drm/radeon_drm_bo.c
src/gallium/winsys/radeon/drm/radeon_drm_bo.h
src/gallium/winsys/radeon/drm/radeon_drm_cs.c
src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
src/gallium/winsys/radeon/drm/radeon_drm_winsys.h

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 3f58b00efdd012d5f8cf4e2c445ed14a65d94508..f9cf2e0d2cda6b3509b1af07bb718389045aeaa6 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -53,7 +53,7 @@ struct radeon_bo_va_hole {
     uint64_t         size;
 };
 
-static bool radeon_bo_is_busy(struct radeon_bo *bo)
+static bool radeon_real_bo_is_busy(struct radeon_bo *bo)
 {
     struct drm_radeon_gem_busy args = {0};
 
@@ -62,7 +62,31 @@ static bool radeon_bo_is_busy(struct radeon_bo *bo)
                                &args, sizeof(args)) != 0;
 }
 
-static void radeon_bo_wait_idle(struct radeon_bo *bo)
+static bool radeon_bo_is_busy(struct radeon_bo *bo)
+{
+    unsigned num_idle;
+    bool busy = false;
+
+    if (bo->handle)
+        return radeon_real_bo_is_busy(bo);
+
+    pipe_mutex_lock(bo->rws->bo_fence_lock);
+    for (num_idle = 0; num_idle < bo->u.slab.num_fences; ++num_idle) {
+        if (radeon_real_bo_is_busy(bo->u.slab.fences[num_idle])) {
+            busy = true;
+            break;
+        }
+        radeon_bo_reference(&bo->u.slab.fences[num_idle], NULL);
+    }
+    memmove(&bo->u.slab.fences[0], &bo->u.slab.fences[num_idle],
+            (bo->u.slab.num_fences - num_idle) * sizeof(bo->u.slab.fences[0]));
+    bo->u.slab.num_fences -= num_idle;
+    pipe_mutex_unlock(bo->rws->bo_fence_lock);
+
+    return busy;
+}
+
+static void radeon_real_bo_wait_idle(struct radeon_bo *bo)
 {
     struct drm_radeon_gem_wait_idle args = {0};
 
@@ -71,6 +95,33 @@ static void radeon_bo_wait_idle(struct radeon_bo *bo)
                            &args, sizeof(args)) == -EBUSY);
 }
 
+static void radeon_bo_wait_idle(struct radeon_bo *bo)
+{
+    if (bo->handle) {
+        radeon_real_bo_wait_idle(bo);
+    } else {
+        pipe_mutex_lock(bo->rws->bo_fence_lock);
+        while (bo->u.slab.num_fences) {
+            struct radeon_bo *fence = NULL;
+            radeon_bo_reference(&fence, bo->u.slab.fences[0]);
+            pipe_mutex_unlock(bo->rws->bo_fence_lock);
+
+            /* Wait without holding the fence lock. */
+            radeon_real_bo_wait_idle(fence);
+
+            pipe_mutex_lock(bo->rws->bo_fence_lock);
+            if (bo->u.slab.num_fences && fence == bo->u.slab.fences[0]) {
+                radeon_bo_reference(&bo->u.slab.fences[0], NULL);
+                memmove(&bo->u.slab.fences[0], &bo->u.slab.fences[1],
+                        (bo->u.slab.num_fences - 1) * sizeof(bo->u.slab.fences[0]));
+                bo->u.slab.num_fences--;
+            }
+            radeon_bo_reference(&fence, NULL);
+        }
+        pipe_mutex_unlock(bo->rws->bo_fence_lock);
+    }
+}
+
 static bool radeon_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
                            enum radeon_bo_usage usage)
 {
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
index 8e35a385ed8e9afa4ea0424d848bc0095bacd224..8f767fd2c736f11e02d7621097a398d9b05fa3e3 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
@@ -50,6 +50,10 @@ struct radeon_bo {
         struct {
             struct pb_slab_entry entry;
             struct radeon_bo *real;
+
+            unsigned num_fences;
+            unsigned max_fences;
+            struct radeon_bo **fences;
         } slab;
     } u;
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 9fbd3783699c4618bd649def2a2d100e0e82793d..79c09e22048df59f8be1611e55de03af49621013 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -471,6 +471,8 @@ void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
 
     for (i = 0; i < csc->num_relocs; i++)
         p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
+    for (i = 0; i < csc->num_slab_buffers; i++)
+        p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);
 
     radeon_cs_context_cleanup(csc);
 }
@@ -487,11 +489,61 @@ void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
         util_queue_job_wait(&cs->flush_completed);
 }
 
+/* Add the given fence to a slab buffer fence list.
+ *
+ * There is a potential race condition when bo participates in submissions on
+ * two or more threads simultaneously. Since we do not know which of the
+ * submissions will be sent to the GPU first, we have to keep the fences
+ * of all submissions.
+ *
+ * However, fences that belong to submissions that have already returned from
+ * their respective ioctl do not have to be kept, because we know that they
+ * will signal earlier.
+ */
+static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
+{
+    unsigned dst;
+
+    assert(fence->num_cs_references);
+
+    /* Cleanup older fences */
+    dst = 0;
+    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
+        if (bo->u.slab.fences[src]->num_cs_references) {
+            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
+            dst++;
+        } else {
+            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
+        }
+    }
+    bo->u.slab.num_fences = dst;
+
+    /* Check available space for the new fence */
+    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
+        unsigned new_max_fences = bo->u.slab.max_fences + 1;
+        struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
+                                                bo->u.slab.max_fences * sizeof(*new_fences),
+                                                new_max_fences * sizeof(*new_fences));
+        if (!new_fences) {
+            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
+            return;
+        }
+
+        bo->u.slab.fences = new_fences;
+        bo->u.slab.max_fences = new_max_fences;
+    }
+
+    /* Add the new fence */
+    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
+    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
+    bo->u.slab.num_fences++;
+}
+
 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
 
 static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                                unsigned flags,
-                               struct pipe_fence_handle **fence)
+                               struct pipe_fence_handle **pfence)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     struct radeon_cs_context *tmp;
@@ -531,15 +583,31 @@ static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
        fprintf(stderr, "radeon: command stream overflowed\n");
     }
 
-    if (fence) {
-       if (cs->next_fence) {
-          radeon_fence_reference(fence, cs->next_fence);
-       } else {
-          radeon_fence_reference(fence, NULL);
-          *fence = radeon_cs_create_fence(rcs);
-       }
+    if (pfence || cs->csc->num_slab_buffers) {
+        struct pipe_fence_handle *fence;
+
+        if (cs->next_fence) {
+            fence = cs->next_fence;
+            cs->next_fence = NULL;
+        } else {
+            fence = radeon_cs_create_fence(rcs);
+        }
+
+        if (pfence)
+            radeon_fence_reference(pfence, fence);
+
+        pipe_mutex_lock(cs->ws->bo_fence_lock);
+        for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
+            struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
+            p_atomic_inc(&bo->num_active_ioctls);
+            radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
+        }
+        pipe_mutex_unlock(cs->ws->bo_fence_lock);
+
+        radeon_fence_reference(&fence, NULL);
+    } else {
+        radeon_fence_reference(&cs->next_fence, NULL);
     }
-    radeon_fence_reference(&cs->next_fence, NULL);
 
     radeon_drm_cs_sync_flush(rcs);
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index aa4bf5fb8ec89266d9ea976939bdc1b6cc66a205..e02f286b0c205bd43216916cbcd4d67ecb2c753c 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -556,6 +556,7 @@ static void radeon_winsys_destroy(struct radeon_winsys *rws)
     util_hash_table_destroy(ws->bo_vas);
     pipe_mutex_destroy(ws->bo_handles_mutex);
     pipe_mutex_destroy(ws->bo_va_mutex);
+    pipe_mutex_destroy(ws->bo_fence_lock);
 
     if (ws->fd >= 0)
         close(ws->fd);
@@ -787,6 +788,7 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
     ws->bo_vas = util_hash_table_create(handle_hash, handle_compare);
     pipe_mutex_init(ws->bo_handles_mutex);
     pipe_mutex_init(ws->bo_va_mutex);
+    pipe_mutex_init(ws->bo_fence_lock);
     ws->va_offset = ws->va_start;
     list_inithead(&ws->va_holes);
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
index 55149806ae9d6963d4414b6813eb953421376d23..b30055cf976442e01da53083129060df529af057 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -91,6 +91,7 @@ struct radeon_drm_winsys {
     struct util_hash_table *bo_vas;
     pipe_mutex bo_handles_mutex;
     pipe_mutex bo_va_mutex;
+    pipe_mutex bo_fence_lock;
 
     uint64_t va_offset;
     struct list_head va_holes;