radv/winsys: Add binary syncobj ABI changes for timeline semaphores.
author Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Sun, 21 Jun 2020 23:11:03 +0000 (01:11 +0200)
committer Marge Bot <eric+marge@anholt.net>
Thu, 23 Jul 2020 17:36:46 +0000 (17:36 +0000)
To facilitate cross-process timeline semaphores we have to deal with
the fact that the syncobj signal operation might be submitted a
small finite time after the wait operation.

For that we start using DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT during
the wait operation so we properly wait instead of returning an error.
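
As an illustration only (this snippet is not part of the patch; fd,
handle and abs_timeout_ns are placeholders), this is roughly what the
flag changes on a plain syncobj wait:

    #include <stdint.h>
    #include <xf86drm.h>

    /* Wait on a binary syncobj that may not have a fence attached yet,
     * e.g. because the signaling process has not submitted its CS yet. */
    static int wait_for_cross_process_signal(int fd, uint32_t handle,
                                             int64_t abs_timeout_ns)
    {
            uint32_t first_signaled;

            /* Without DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT the wait fails
             * with -EINVAL when the syncobj has no fence; with the flag the
             * kernel first waits for a fence to be submitted and then waits
             * for that fence to signal. */
            return drmSyncobjWait(fd, &handle, 1, abs_timeout_ns,
                                  DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
                                  &first_signaled);
    }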

Furthermore, to make this effective for syncobjs that get reused, we
actually have to reset them after the wait. Otherwise a wait submitted
before the signal would pick up the previous fence instead of waiting
for the new submission.

The obvious choice, resetting the syncobj after the CS submission,
has two issues though:

1) If the same semaphore is used for wait and signal we can't reset it.
   This is solvable by only resetting the semaphores that are not in the
   signal list.
2) The submitted work might already be complete before we get to resetting the
   syncobj. If there is a cycle of submissions that signals it again and
   finishes before we get to the reset, we are screwed.

Solution:
Copy the fence into a new syncobj and reset the old syncobj before
submission. Yes, I know it is more syscalls :( At least I reduced the
alloc/free overhead by keeping a cache of temporary syncobjs.
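
A condensed sketch of the resulting per-submission sequence
(illustration only; the real version, radv_amdgpu_cs_prepare_syncobjs
in the diff below, adds the temporary-syncobj cache and cleanup on
failure):

    #include <stdint.h>
    #include <drm.h>     /* DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT */
    #include <amdgpu.h>  /* amdgpu_cs_syncobj_transfer/_reset */

    static int prepare_wait_syncobjs_sketch(amdgpu_device_handle dev,
                                            const uint32_t *waits,
                                            uint32_t *temps,
                                            unsigned count,
                                            unsigned reset_count)
    {
            /* Copy every wait fence into a temporary syncobj. With
             * WAIT_FOR_SUBMIT the kernel waits for a fence to be submitted
             * to the source instead of erroring out while it is still
             * empty. */
            for (unsigned i = 0; i < count; ++i) {
                    int r = amdgpu_cs_syncobj_transfer(dev, temps[i], 0,
                                                       waits[i], 0,
                                                       DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT);
                    if (r)
                            return r;
            }

            /* Reset the reusable binary syncobjs before the CS is
             * submitted, so a submission cycle that re-signals them cannot
             * race the reset. The CS then waits on the temporaries. */
            return amdgpu_cs_syncobj_reset(dev, waits, reset_count);
    }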

This also introduces a syncobj_reset_count, as we don't want to reset
syncobjs that are part of an emulated timeline semaphore. (Yes, if
the kernel supports timeline syncobjs we should use those instead,
but those still need to be implemented, and if we depend on them in
this patch the ordering dependencies get hard ...)
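
Concretely (my reading of the radv_alloc_sem_counts() hunk below; the
snippet itself is not part of the patch), the wait array ends up laid
out so that the reset can stop at syncobj_reset_count:

    /* wait.syncobj[] layout produced by radv_alloc_sem_counts():
     *   [0, syncobj_reset_count)             plain binary semaphores -> reset
     *   [syncobj_reset_count, syncobj_count) emulated-timeline points (and
     *                                        the fence syncobj, if any) -> kept
     * so the pre-submit reset only has to touch the leading entries, as in
     * radv_amdgpu_cs_prepare_syncobjs(): */
    amdgpu_cs_syncobj_reset(ws->dev, counts->syncobj,
                            counts->syncobj_reset_count);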

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5600>

src/amd/vulkan/radv_device.c
src/amd/vulkan/radv_radeon_winsys.h
src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h

index dd8b2e4b73a4600d6955fc1c23285fe5f156b3a6..b8ddb89e2d8aeae8396de2c9fe506147aec16219 100644 (file)
@@ -3762,7 +3762,7 @@ static VkResult radv_alloc_sem_counts(struct radv_device *device,
                                      VkFence _fence,
                                      bool is_signal)
 {
-       int syncobj_idx = 0, sem_idx = 0;
+       int syncobj_idx = 0, non_reset_idx = 0, sem_idx = 0;
 
        if (num_sems == 0 && _fence == VK_NULL_HANDLE)
                return VK_SUCCESS;
@@ -3771,6 +3771,7 @@ static VkResult radv_alloc_sem_counts(struct radv_device *device,
                switch(sems[i]->kind) {
                case RADV_SEMAPHORE_SYNCOBJ:
                        counts->syncobj_count++;
+                       counts->syncobj_reset_count++;
                        break;
                case RADV_SEMAPHORE_WINSYS:
                        counts->sem_count++;
@@ -3807,6 +3808,8 @@ static VkResult radv_alloc_sem_counts(struct radv_device *device,
                }
        }
 
+       non_reset_idx = counts->syncobj_reset_count;
+
        for (uint32_t i = 0; i < num_sems; i++) {
                switch(sems[i]->kind) {
                case RADV_SEMAPHORE_NONE:
@@ -3830,7 +3833,7 @@ static VkResult radv_alloc_sem_counts(struct radv_device *device,
                        pthread_mutex_unlock(&sems[i]->timeline.mutex);
 
                        if (point) {
-                               counts->syncobj[syncobj_idx++] = point->syncobj;
+                               counts->syncobj[non_reset_idx++] = point->syncobj;
                        } else {
                                /* Explicitly remove the semaphore so we might not find
                                 * a point later post-submit. */
@@ -3848,11 +3851,11 @@ static VkResult radv_alloc_sem_counts(struct radv_device *device,
                        fence->temporary.kind != RADV_FENCE_NONE ?
                        &fence->temporary : &fence->permanent;
                if (part->kind == RADV_FENCE_SYNCOBJ)
-                       counts->syncobj[syncobj_idx++] = part->syncobj;
+                       counts->syncobj[non_reset_idx++] = part->syncobj;
        }
 
-       assert(syncobj_idx <= counts->syncobj_count);
-       counts->syncobj_count = syncobj_idx;
+       assert(MAX2(syncobj_idx, non_reset_idx) <= counts->syncobj_count);
+       counts->syncobj_count = MAX2(syncobj_idx, non_reset_idx);
 
        return VK_SUCCESS;
 }
index ac5555e957d3924dc7b28e740470ef19ef6d70a2..37576327dfd6c01cc46b04f5fa555fa895e7081d 100644 (file)
@@ -168,6 +168,7 @@ struct radeon_winsys_bo {
 };
 struct radv_winsys_sem_counts {
        uint32_t syncobj_count;
+       uint32_t syncobj_reset_count; /* for wait only, whether to reset the syncobj */
        uint32_t sem_count;
        uint32_t *syncobj;
        struct radeon_winsys_sem **sem;
index b024ef971040edac49033ffee4b72af14f21dea2..2cce43c24605efcdc55ce4330244692309f8d2d5 100644 (file)
@@ -1451,15 +1451,17 @@ static int radv_amdgpu_signal_sems(struct radv_amdgpu_ctx *ctx,
 }
 
 static struct drm_amdgpu_cs_chunk_sem *radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts,
+                                                                         const uint32_t *syncobj_override,
                                                                          struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
 {
+       const uint32_t *src = syncobj_override ? syncobj_override : counts->syncobj;
        struct drm_amdgpu_cs_chunk_sem *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_sem) * counts->syncobj_count);
        if (!syncobj)
                return NULL;
 
        for (unsigned i = 0; i < counts->syncobj_count; i++) {
                struct drm_amdgpu_cs_chunk_sem *sem = &syncobj[i];
-               sem->handle = counts->syncobj[i];
+               sem->handle = src[i];
        }
 
        chunk->chunk_id = chunk_id;
@@ -1468,6 +1470,101 @@ static struct drm_amdgpu_cs_chunk_sem *radv_amdgpu_cs_alloc_syncobj_chunk(struct
        return syncobj;
 }
 
+static int radv_amdgpu_cache_alloc_syncobjs(struct radv_amdgpu_winsys *ws, unsigned count, uint32_t *dst)
+{
+       pthread_mutex_lock(&ws->syncobj_lock);
+       if (count > ws->syncobj_capacity) {
+               if (ws->syncobj_capacity > UINT32_MAX / 2)
+                       goto fail;
+
+               unsigned new_capacity = MAX2(count, ws->syncobj_capacity * 2);
+               uint32_t *n = realloc(ws->syncobj, new_capacity * sizeof(*ws->syncobj));
+               if (!n)
+                       goto fail;
+               ws->syncobj_capacity = new_capacity;
+               ws->syncobj = n;
+       }
+
+       while(ws->syncobj_count < count) {
+               int r = amdgpu_cs_create_syncobj(ws->dev, ws->syncobj + ws->syncobj_count);
+               if (r)
+                       goto fail;
+               ++ws->syncobj_count;
+       }
+
+       for (unsigned i = 0; i < count; ++i)
+               dst[i] = ws->syncobj[--ws->syncobj_count];
+
+       pthread_mutex_unlock(&ws->syncobj_lock);
+       return 0;
+
+fail:
+       pthread_mutex_unlock(&ws->syncobj_lock);
+       return -ENOMEM;
+}
+
+static void radv_amdgpu_cache_free_syncobjs(struct radv_amdgpu_winsys *ws, unsigned count, uint32_t *src)
+{
+       pthread_mutex_lock(&ws->syncobj_lock);
+
+       uint32_t cache_count = MIN2(count, UINT32_MAX - ws->syncobj_count);
+       if (cache_count + ws->syncobj_count > ws->syncobj_capacity) {
+               unsigned new_capacity = MAX2(ws->syncobj_count + cache_count, ws->syncobj_capacity * 2);
+               uint32_t* n = realloc(ws->syncobj, new_capacity * sizeof(*ws->syncobj));
+               if (n) {
+                       ws->syncobj_capacity = new_capacity;
+                       ws->syncobj = n;
+               }
+       }
+
+       for (unsigned i = 0; i < count; ++i) {
+               if (ws->syncobj_count < ws->syncobj_capacity)
+                       ws->syncobj[ws->syncobj_count++] = src[i];
+               else
+                       amdgpu_cs_destroy_syncobj(ws->dev, src[i]);
+       }
+
+       pthread_mutex_unlock(&ws->syncobj_lock);
+
+}
+
+static int radv_amdgpu_cs_prepare_syncobjs(struct radv_amdgpu_winsys *ws,
+                                           struct radv_winsys_sem_counts *counts,
+                                           uint32_t **out_syncobjs)
+{
+       int r = 0;
+
+       if (!ws->info.has_timeline_syncobj || !counts->syncobj_count) {
+               *out_syncobjs = NULL;
+               return 0;
+       }
+
+       *out_syncobjs = malloc(counts->syncobj_count * sizeof(**out_syncobjs));
+       if (!*out_syncobjs)
+               return -ENOMEM;
+
+       r = radv_amdgpu_cache_alloc_syncobjs(ws, counts->syncobj_count, *out_syncobjs);
+       if (r)
+               return r;
+       
+       for (unsigned i = 0; i < counts->syncobj_count; ++i) {
+               r = amdgpu_cs_syncobj_transfer(ws->dev, (*out_syncobjs)[i], 0, counts->syncobj[i], 0, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT);
+               if (r)
+                       goto fail;
+       }
+
+       r = amdgpu_cs_syncobj_reset(ws->dev, counts->syncobj, counts->syncobj_reset_count);
+       if (r)
+               goto fail;
+
+       return 0;
+fail:
+       radv_amdgpu_cache_free_syncobjs(ws, counts->syncobj_count, *out_syncobjs);
+       free(*out_syncobjs);
+       *out_syncobjs = NULL;
+       return r;
+}
+
 static VkResult
 radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
                      struct radv_amdgpu_cs_request *request,
@@ -1483,6 +1580,7 @@ radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
        struct drm_amdgpu_cs_chunk_sem *wait_syncobj = NULL, *signal_syncobj = NULL;
        bool use_bo_list_create = ctx->ws->info.drm_minor < 27;
        struct drm_amdgpu_bo_list_in bo_list_in;
+       uint32_t *in_syncobjs = NULL;
        int i;
        struct amdgpu_cs_fence *sem;
        uint32_t bo_list = 0;
@@ -1533,7 +1631,12 @@ radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
        }
 
        if (sem_info->wait.syncobj_count && sem_info->cs_emit_wait) {
+               r = radv_amdgpu_cs_prepare_syncobjs(ctx->ws, &sem_info->wait, &in_syncobjs);
+               if (r)
+                       goto error_out;
+
                wait_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->wait,
+                                                                 in_syncobjs,
                                                                  &chunks[num_chunks],
                                                                  AMDGPU_CHUNK_ID_SYNCOBJ_IN);
                if (!wait_syncobj) {
@@ -1578,6 +1681,7 @@ radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
 
        if (sem_info->signal.syncobj_count && sem_info->cs_emit_signal) {
                signal_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->signal,
+                                                                   NULL,
                                                                    &chunks[num_chunks],
                                                                    AMDGPU_CHUNK_ID_SYNCOBJ_OUT);
                if (!signal_syncobj) {
@@ -1642,6 +1746,10 @@ radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
        }
 
 error_out:
+       if (in_syncobjs) {
+               radv_amdgpu_cache_free_syncobjs(ctx->ws, sem_info->wait.syncobj_count, in_syncobjs);
+               free(in_syncobjs);
+       }
        free(chunks);
        free(chunk_data);
        free(sem_dependencies);
index 29e7b20b886e2645bd37140733bd0a85b2641f37..c6deedcb32ccd4ff7fd9e6e047ef06ca89ae5528 100644 (file)
@@ -162,6 +162,10 @@ static void radv_amdgpu_winsys_destroy(struct radeon_winsys *rws)
 {
        struct radv_amdgpu_winsys *ws = (struct radv_amdgpu_winsys*)rws;
 
+       for (unsigned i = 0; i < ws->syncobj_count; ++i)
+               amdgpu_cs_destroy_syncobj(ws->dev, ws->syncobj[i]);
+       free(ws->syncobj);
+
        ac_addrlib_destroy(ws->addrlib);
        amdgpu_device_deinitialize(ws->dev);
        FREE(rws);
@@ -197,6 +201,7 @@ radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags)
        ws->use_llvm = debug_flags & RADV_DEBUG_LLVM;
        list_inithead(&ws->global_bo_list);
        pthread_mutex_init(&ws->global_bo_list_lock, NULL);
+       pthread_mutex_init(&ws->syncobj_lock, NULL);
        ws->base.query_info = radv_amdgpu_winsys_query_info;
        ws->base.query_value = radv_amdgpu_winsys_query_value;
        ws->base.read_registers = radv_amdgpu_winsys_read_registers;
index 0601481625c622ebecf1bfaa2aeaf863681e4303..bb8ce8863ba09591b3a373959810c2f80d220a18 100644 (file)
@@ -55,6 +55,11 @@ struct radv_amdgpu_winsys {
        uint64_t allocated_vram;
        uint64_t allocated_vram_vis;
        uint64_t allocated_gtt;
+
+       /* syncobj cache */
+       pthread_mutex_t syncobj_lock;
+       uint32_t *syncobj;
+       uint32_t syncobj_count, syncobj_capacity;
 };
 
 static inline struct radv_amdgpu_winsys *