anv: Move relocation handling from EndCommandBuffer to QueueSubmit
author    Jason Ekstrand <jason.ekstrand@intel.com>
          Sun, 6 Nov 2016 02:47:33 +0000 (19:47 -0700)
committer Jason Ekstrand <jason.ekstrand@intel.com>
          Wed, 9 Nov 2016 19:31:12 +0000 (11:31 -0800)
Ever since the early days of the Vulkan driver, we've been setting up the
lists of relocations at EndCommandBuffer time.  The idea behind this was to
move some of the CPU load out of QueueSubmit, which the client is required
to lock around, and into command buffer building, which can be done in
parallel.  QueueSubmit then basically just becomes a bunch of execbuf2
calls.

Technically, this works.  However, as soon as QueueSubmit does more than
just call execbuf2, problems arise.  In particular, if a block pool is
resized between EndCommandBuffer and QueueSubmit, the list of anv_bo's and
the execbuf2 object list can get out of sync.  This causes problems if,
for instance, you want to do relocations in userspace.
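
To make the hazard concrete, here is a minimal sketch of the bookkeeping
involved (the demo_* names are hypothetical stand-ins; the real code tracks
the same state in anv_bo's gem handle and the bo->index field used by the
execbuf2 object list):

    #include <stdint.h>

    struct demo_bo {
       uint32_t gem_handle;  /* handle the kernel knows this BO by */
       uint32_t index;       /* slot in the execbuf2 object list */
    };

    struct demo_exec_object {
       uint32_t handle;      /* snapshot of gem_handle, taken when the
                              * object list is built */
    };

    /* EndCommandBuffer time: record the BO in the object list. */
    static void
    demo_prepare(struct demo_exec_object *objects, struct demo_bo *bo,
                 uint32_t idx)
    {
       bo->index = idx;
       objects[idx].handle = bo->gem_handle;
    }

    /* Block pool resize: a new gem handle is installed in the *same* BO
     * struct.  If this runs after demo_prepare(), then
     * objects[bo->index].handle != bo->gem_handle and the two lists are
     * out of sync.
     */
    static void
    demo_pool_resize(struct demo_bo *bo, uint32_t new_gem_handle)
    {
       bo->gem_handle = new_gem_handle;
    }

Building the object list at QueueSubmit time, under the device lock, closes
this window.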

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Cc: "13.0" <mesa-stable@lists.freedesktop.org>
src/intel/vulkan/anv_batch_chain.c
src/intel/vulkan/anv_device.c
src/intel/vulkan/anv_private.h
src/intel/vulkan/genX_cmd_buffer.c

diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index 29c79951be7939cf6cfa07cade0aa7963468d031..e03b5859e1ed70e69e1441d5b190b34d937df05a 100644
@@ -645,20 +645,6 @@ anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
    return VK_SUCCESS;
 }
 
-static void
-anv_execbuf_init(struct anv_execbuf *exec)
-{
-   memset(exec, 0, sizeof(*exec));
-}
-
-static void
-anv_execbuf_finish(struct anv_execbuf *exec,
-                   const VkAllocationCallbacks *alloc)
-{
-   vk_free(alloc, exec->objects);
-   vk_free(alloc, exec->bos);
-}
-
 VkResult
 anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
 {
@@ -706,8 +692,6 @@ anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
 
    anv_cmd_buffer_new_binding_table_block(cmd_buffer);
 
-   anv_execbuf_init(&cmd_buffer->execbuf2);
-
    return VK_SUCCESS;
 
  fail_bt_blocks:
@@ -739,8 +723,6 @@ anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
                             &cmd_buffer->batch_bos, link) {
       anv_batch_bo_destroy(bbo, cmd_buffer);
    }
-
-   anv_execbuf_finish(&cmd_buffer->execbuf2, &cmd_buffer->pool->alloc);
 }
 
 void
@@ -938,6 +920,31 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
                          &secondary->surface_relocs, 0);
 }
 
+struct anv_execbuf {
+   struct drm_i915_gem_execbuffer2           execbuf;
+
+   struct drm_i915_gem_exec_object2 *        objects;
+   uint32_t                                  bo_count;
+   struct anv_bo **                          bos;
+
+   /* Allocated length of the 'objects' and 'bos' arrays */
+   uint32_t                                  array_length;
+};
+
+static void
+anv_execbuf_init(struct anv_execbuf *exec)
+{
+   memset(exec, 0, sizeof(*exec));
+}
+
+static void
+anv_execbuf_finish(struct anv_execbuf *exec,
+                   const VkAllocationCallbacks *alloc)
+{
+   vk_free(alloc, exec->objects);
+   vk_free(alloc, exec->bos);
+}
+
 static VkResult
 anv_execbuf_add_bo(struct anv_execbuf *exec,
                    struct anv_bo *bo,
@@ -1108,20 +1115,20 @@ adjust_relocations_to_state_pool(struct anv_block_pool *pool,
    }
 }
 
-void
-anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
+VkResult
+anv_cmd_buffer_execbuf(struct anv_device *device,
+                       struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_batch *batch = &cmd_buffer->batch;
    struct anv_block_pool *ss_pool =
       &cmd_buffer->device->surface_state_block_pool;
 
-   cmd_buffer->execbuf2.bo_count = 0;
+   struct anv_execbuf execbuf;
+   anv_execbuf_init(&execbuf);
 
    adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs,
                                       cmd_buffer->last_ss_pool_center);
-
-   anv_execbuf_add_bo(&cmd_buffer->execbuf2, &ss_pool->bo,
-                      &cmd_buffer->surface_relocs,
+   anv_execbuf_add_bo(&execbuf, &ss_pool->bo, &cmd_buffer->surface_relocs,
                       &cmd_buffer->pool->alloc);
 
    /* First, we walk over all of the bos we've seen and add them and their
@@ -1132,7 +1139,7 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
       adjust_relocations_to_state_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs,
                                        cmd_buffer->last_ss_pool_center);
 
-      anv_execbuf_add_bo(&cmd_buffer->execbuf2, &(*bbo)->bo, &(*bbo)->relocs,
+      anv_execbuf_add_bo(&execbuf, &(*bbo)->bo, &(*bbo)->relocs,
                          &cmd_buffer->pool->alloc);
    }
 
@@ -1150,20 +1157,19 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
     * corresponding to the first batch_bo in the chain with the last
     * element in the list.
     */
-   if (first_batch_bo->bo.index != cmd_buffer->execbuf2.bo_count - 1) {
+   if (first_batch_bo->bo.index != execbuf.bo_count - 1) {
       uint32_t idx = first_batch_bo->bo.index;
-      uint32_t last_idx = cmd_buffer->execbuf2.bo_count - 1;
+      uint32_t last_idx = execbuf.bo_count - 1;
 
-      struct drm_i915_gem_exec_object2 tmp_obj =
-         cmd_buffer->execbuf2.objects[idx];
-      assert(cmd_buffer->execbuf2.bos[idx] == &first_batch_bo->bo);
+      struct drm_i915_gem_exec_object2 tmp_obj = execbuf.objects[idx];
+      assert(execbuf.bos[idx] == &first_batch_bo->bo);
 
-      cmd_buffer->execbuf2.objects[idx] = cmd_buffer->execbuf2.objects[last_idx];
-      cmd_buffer->execbuf2.bos[idx] = cmd_buffer->execbuf2.bos[last_idx];
-      cmd_buffer->execbuf2.bos[idx]->index = idx;
+      execbuf.objects[idx] = execbuf.objects[last_idx];
+      execbuf.bos[idx] = execbuf.bos[last_idx];
+      execbuf.bos[idx]->index = idx;
 
-      cmd_buffer->execbuf2.objects[last_idx] = tmp_obj;
-      cmd_buffer->execbuf2.bos[last_idx] = &first_batch_bo->bo;
+      execbuf.objects[last_idx] = tmp_obj;
+      execbuf.bos[last_idx] = &first_batch_bo->bo;
       first_batch_bo->bo.index = last_idx;
    }
 
@@ -1184,9 +1190,9 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
       }
    }
 
-   cmd_buffer->execbuf2.execbuf = (struct drm_i915_gem_execbuffer2) {
-      .buffers_ptr = (uintptr_t) cmd_buffer->execbuf2.objects,
-      .buffer_count = cmd_buffer->execbuf2.bo_count,
+   execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
+      .buffers_ptr = (uintptr_t) execbuf.objects,
+      .buffer_count = execbuf.bo_count,
       .batch_start_offset = 0,
       .batch_len = batch->next - batch->start,
       .cliprects_ptr = 0,
@@ -1198,12 +1204,7 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
       .rsvd1 = cmd_buffer->device->context_id,
       .rsvd2 = 0,
    };
-}
 
-VkResult
-anv_cmd_buffer_execbuf(struct anv_device *device,
-                       struct anv_cmd_buffer *cmd_buffer)
-{
    /* Since surface states are shared between command buffers and we don't
     * know what order they will be submitted to the kernel, we don't know what
     * address is actually written in the surface state object at any given
@@ -1213,6 +1214,9 @@ anv_cmd_buffer_execbuf(struct anv_device *device,
    for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
       cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
 
-   return anv_device_execbuf(device, &cmd_buffer->execbuf2.execbuf,
-                             cmd_buffer->execbuf2.bos);
+   VkResult result = anv_device_execbuf(device, &execbuf.execbuf, execbuf.bos);
+
+   anv_execbuf_finish(&execbuf, &cmd_buffer->pool->alloc);
+
+   return result;
 }
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index c40598c35a9ceb82c9f013d39481b9a1a0df35b3..c47961efcec7d9bef8ba3a2d899a4b3241f6e5c5 100644
@@ -1097,6 +1097,27 @@ VkResult anv_QueueSubmit(
    struct anv_device *device = queue->device;
    VkResult result = VK_SUCCESS;
 
+   /* We lock around QueueSubmit for two main reasons:
+    *
+    *  1) When a block pool is resized, we create a new gem handle with a
+    *     different size and, in the case of surface states, possibly a
+    *     different center offset but we re-use the same anv_bo struct when
+    *     we do so.  If this happens in the middle of setting up an execbuf,
+    *     we could end up with our list of BOs out of sync with our list of
+    *     gem handles.
+    *
+    *  2) The algorithm we use for building the list of unique buffers isn't
+    *     thread-safe.  While the client is supposed to synchronize around
+    *     QueueSubmit, this would be extremely difficult to debug if it ever
+    *     came up in the wild due to a broken app.  It's better to play it
+    *     safe and just lock around QueueSubmit.
+    *
+    * Since the only other things that ever take the device lock, such as
+    * block pool resizes, happen only rarely, the lock will almost never be
+    * contended, so taking it isn't really an expensive operation here.
+    */
+   pthread_mutex_lock(&device->mutex);
+
    for (uint32_t i = 0; i < submitCount; i++) {
       for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
          ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
@@ -1105,7 +1126,7 @@ VkResult anv_QueueSubmit(
 
          result = anv_cmd_buffer_execbuf(device, cmd_buffer);
          if (result != VK_SUCCESS)
-            return result;
+            goto out;
       }
    }
 
@@ -1113,10 +1134,13 @@ VkResult anv_QueueSubmit(
       struct anv_bo *fence_bo = &fence->bo;
       result = anv_device_execbuf(device, &fence->execbuf, &fence_bo);
       if (result != VK_SUCCESS)
-         return result;
+         goto out;
    }
 
-   return VK_SUCCESS;
+out:
+   pthread_mutex_unlock(&device->mutex);
+
+   return result;
 }
 
 VkResult anv_QueueWaitIdle(
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index a60cd3ce4acf0ffcce113564087b75b138046862..4ee6118e6c646f6b016609397a534b4b3816080c 100644
@@ -1144,17 +1144,6 @@ enum anv_cmd_buffer_exec_mode {
    ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN,
 };
 
-struct anv_execbuf {
-   struct drm_i915_gem_execbuffer2           execbuf;
-
-   struct drm_i915_gem_exec_object2 *        objects;
-   uint32_t                                  bo_count;
-   struct anv_bo **                          bos;
-
-   /* Allocated length of the 'objects' and 'bos' arrays */
-   uint32_t                                  array_length;
-};
-
 struct anv_cmd_buffer {
    VK_LOADER_DATA                               _loader_data;
 
@@ -1190,8 +1179,6 @@ struct anv_cmd_buffer {
    /** Last seen surface state block pool center bo offset */
    uint32_t                                     last_ss_pool_center;
 
-   struct anv_execbuf                           execbuf2;
-
    /* Serial for tracking buffer completion */
    uint32_t                                     serial;
 
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 24e0012c7165b7846220bcd3f0e47ed08ac99c8b..045cb9d05cb824befb4fbb373760d2c6bae6c130 100644
@@ -200,20 +200,9 @@ genX(EndCommandBuffer)(
     VkCommandBuffer                             commandBuffer)
 {
    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-   struct anv_device *device = cmd_buffer->device;
 
    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
 
-   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
-      /* The algorithm used to compute the validate list is not threadsafe as
-       * it uses the bo->index field.  We have to lock the device around it.
-       * Fortunately, the chances for contention here are probably very low.
-       */
-      pthread_mutex_lock(&device->mutex);
-      anv_cmd_buffer_prepare_execbuf(cmd_buffer);
-      pthread_mutex_unlock(&device->mutex);
-   }
-
    return VK_SUCCESS;
 }