X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fauxiliary%2Futil%2Fu_threaded_context.c;h=fc7eb1388353f8918a0de21a5f87de153d2c5344;hb=966f155623e5a626f1d788af7e0f602cdcee6993;hp=34206bfbf40927eb88b4b895e584d3ab204deb68;hpb=3b1ce49bc1e5aff87805b0bab255885c84bf5052;p=mesa.git

diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index 34206bfbf40..fc7eb138835 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -64,10 +64,9 @@ typedef void (*tc_execute)(struct pipe_context *pipe, union tc_payload *payload)
 static const tc_execute execute_func[TC_NUM_CALLS];
 
 static void
-tc_batch_check(struct tc_batch *batch)
+tc_batch_check(MAYBE_UNUSED struct tc_batch *batch)
 {
    tc_assert(batch->sentinel == TC_SENTINEL);
-   tc_assert(batch->sentinel2 == TC_SENTINEL);
    tc_assert(batch->num_total_call_slots <= TC_CALLS_PER_BATCH);
 }
 
@@ -81,7 +80,7 @@ tc_debug_check(struct threaded_context *tc)
 }
 
 static void
-tc_batch_execute(void *job, int thread_index)
+tc_batch_execute(void *job, UNUSED int thread_index)
 {
    struct tc_batch *batch = job;
    struct pipe_context *pipe = batch->pipe;
@@ -89,6 +88,8 @@ tc_batch_execute(void *job, int thread_index)
 
    tc_batch_check(batch);
 
+   assert(!batch->token);
+
    for (struct tc_call *iter = batch->call; iter != last;
         iter += iter->num_call_slots) {
       tc_assert(iter->sentinel == TC_SENTINEL);
@@ -109,6 +110,11 @@ tc_batch_flush(struct threaded_context *tc)
    tc_debug_check(tc);
    p_atomic_add(&tc->num_offloaded_slots, next->num_total_call_slots);
 
+   if (next->token) {
+      next->token->tc = NULL;
+      tc_unflushed_batch_token_reference(&next->token, NULL);
+   }
+
    util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute,
                       NULL);
    tc->last = tc->next;
@@ -163,8 +169,18 @@ tc_add_small_call(struct threaded_context *tc, enum tc_call_id id)
    return tc_add_sized_call(tc, id, 0);
 }
 
+static bool
+tc_is_sync(struct threaded_context *tc)
+{
+   struct tc_batch *last = &tc->batch_slots[tc->last];
+   struct tc_batch *next = &tc->batch_slots[tc->next];
+
+   return util_queue_fence_is_signalled(&last->fence) &&
+          !next->num_total_call_slots;
+}
+
 static void
-_tc_sync(struct threaded_context *tc, const char *info, const char *func)
+_tc_sync(struct threaded_context *tc, MAYBE_UNUSED const char *info, MAYBE_UNUSED const char *func)
 {
    struct tc_batch *last = &tc->batch_slots[tc->last];
    struct tc_batch *next = &tc->batch_slots[tc->next];
@@ -180,6 +196,11 @@ _tc_sync(struct threaded_context *tc, const char *info, const char *func)
 
    tc_debug_check(tc);
 
+   if (next->token) {
+      next->token->tc = NULL;
+      tc_unflushed_batch_token_reference(&next->token, NULL);
+   }
+
    /* .. and execute unflushed calls directly. */
    if (next->num_total_call_slots) {
       p_atomic_add(&tc->num_direct_slots, next->num_total_call_slots);
@@ -190,8 +211,9 @@ _tc_sync(struct threaded_context *tc, const char *info, const char *func)
    if (synced) {
       p_atomic_inc(&tc->num_syncs);
 
-      if (tc_strcmp(func, "tc_destroy") != 0)
+      if (tc_strcmp(func, "tc_destroy") != 0) {
          tc_printf("sync %s %s\n", func, info);
+	  }
    }
 
    tc_debug_check(tc);
@@ -200,6 +222,34 @@ _tc_sync(struct threaded_context *tc, const char *info, const char *func)
 #define tc_sync(tc) _tc_sync(tc, "", __func__)
 #define tc_sync_msg(tc, info) _tc_sync(tc, info, __func__)
 
+/**
+ * Call this from fence_finish for same-context fence waits of deferred fences
+ * that haven't been flushed yet.
+ *
+ * The passed pipe_context must be the one passed to pipe_screen::fence_finish,
+ * i.e., the wrapped one.
+ */
+void
+threaded_context_flush(struct pipe_context *_pipe,
+                       struct tc_unflushed_batch_token *token,
+                       bool prefer_async)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+
+   /* This is called from the state-tracker / application thread. */
+   if (token->tc && token->tc == tc) {
+      struct tc_batch *last = &tc->batch_slots[tc->last];
+
+      /* Prefer to do the flush in the driver thread if it is already
+       * running. That should be better for cache locality.
+       */
+      if (prefer_async || !util_queue_fence_is_signalled(&last->fence))
+         tc_batch_flush(tc);
+      else
+         tc_sync(token->tc);
+   }
+}
+
 static void
 tc_set_resource_reference(struct pipe_resource **dst, struct pipe_resource *src)
 {
@@ -299,6 +349,11 @@ tc_create_batch_query(struct pipe_context *_pipe, unsigned num_queries,
 static void
 tc_call_destroy_query(struct pipe_context *pipe, union tc_payload *payload)
 {
+   struct threaded_query *tq = threaded_query(payload->query);
+
+   if (tq->head_unflushed.next)
+      LIST_DEL(&tq->head_unflushed);
+
    pipe->destroy_query(pipe, payload->query);
 }
 
@@ -306,10 +361,6 @@ static void
 tc_destroy_query(struct pipe_context *_pipe, struct pipe_query *query)
 {
    struct threaded_context *tc = threaded_context(_pipe);
-   struct threaded_query *tq = threaded_query(query);
-
-   if (tq->head_unflushed.next)
-      LIST_DEL(&tq->head_unflushed);
 
    tc_add_small_call(tc, TC_CALL_destroy_query)->query = query;
 }
@@ -330,10 +381,21 @@ tc_begin_query(struct pipe_context *_pipe, struct pipe_query *query)
    return true; /* we don't care about the return value for this call */
 }
 
+struct tc_end_query_payload {
+   struct threaded_context *tc;
+   struct pipe_query *query;
+};
+
 static void
 tc_call_end_query(struct pipe_context *pipe, union tc_payload *payload)
 {
-   pipe->end_query(pipe, payload->query);
+   struct tc_end_query_payload *p = (struct tc_end_query_payload *)payload;
+   struct threaded_query *tq = threaded_query(p->query);
+
+   if (!tq->head_unflushed.next)
+      LIST_ADD(&tq->head_unflushed, &p->tc->unflushed_queries);
+
+   pipe->end_query(pipe, p->query);
 }
 
 static bool
@@ -341,13 +403,13 @@ tc_end_query(struct pipe_context *_pipe, struct pipe_query *query)
 {
    struct threaded_context *tc = threaded_context(_pipe);
    struct threaded_query *tq = threaded_query(query);
-   union tc_payload *payload = tc_add_small_call(tc, TC_CALL_end_query);
+   struct tc_end_query_payload *payload =
+      tc_add_struct_typed_call(tc, TC_CALL_end_query, tc_end_query_payload);
 
+   payload->tc = tc;
    payload->query = query;
 
    tq->flushed = false;
-   if (!tq->head_unflushed.next)
-      LIST_ADD(&tq->head_unflushed, &tc->unflushed_queries);
 
    return true; /* we don't care about the return value for this call */
 }
@@ -368,8 +430,10 @@ tc_get_query_result(struct pipe_context *_pipe,
 
    if (success) {
       tq->flushed = true;
-      if (tq->head_unflushed.next)
+      if (tq->head_unflushed.next) {
+         /* This is safe because it can only happen after we sync'd. */
          LIST_DEL(&tq->head_unflushed);
+      }
    }
    return success;
 }
@@ -429,7 +493,7 @@ tc_call_render_condition(struct pipe_context *pipe, union tc_payload *payload)
 static void
 tc_render_condition(struct pipe_context *_pipe,
                     struct pipe_query *query, boolean condition,
-                    uint mode)
+                    enum pipe_render_cond_flag mode)
 {
    struct threaded_context *tc = threaded_context(_pipe);
    struct tc_render_condition *p =
@@ -598,7 +662,7 @@ tc_call_set_constant_buffer(struct pipe_context *pipe, union tc_payload *payload
 
 static void
 tc_set_constant_buffer(struct pipe_context *_pipe,
-                       uint shader, uint index,
+                       enum pipe_shader_type shader, uint index,
                        const struct pipe_constant_buffer *cb)
 {
    struct threaded_context *tc = threaded_context(_pipe);
@@ -846,7 +910,8 @@ tc_call_set_shader_buffers(struct pipe_context *pipe, union tc_payload *payload)
 }
 
 static void
-tc_set_shader_buffers(struct pipe_context *_pipe, unsigned shader,
+tc_set_shader_buffers(struct pipe_context *_pipe,
+                      enum pipe_shader_type shader,
                       unsigned start, unsigned count,
                       const struct pipe_shader_buffer *buffers)
 {
@@ -1085,6 +1150,128 @@ tc_stream_output_target_destroy(struct pipe_context *_pipe,
 }
 
 
+/********************************************************************
+ * bindless
+ */
+
+static uint64_t
+tc_create_texture_handle(struct pipe_context *_pipe,
+                         struct pipe_sampler_view *view,
+                         const struct pipe_sampler_state *state)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc);
+   return pipe->create_texture_handle(pipe, view, state);
+}
+
+static void
+tc_call_delete_texture_handle(struct pipe_context *pipe,
+                              union tc_payload *payload)
+{
+   pipe->delete_texture_handle(pipe, payload->handle);
+}
+
+static void
+tc_delete_texture_handle(struct pipe_context *_pipe, uint64_t handle)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   union tc_payload *payload =
+      tc_add_small_call(tc, TC_CALL_delete_texture_handle);
+
+   payload->handle = handle;
+}
+
+struct tc_make_texture_handle_resident
+{
+   uint64_t handle;
+   bool resident;
+};
+
+static void
+tc_call_make_texture_handle_resident(struct pipe_context *pipe,
+                                     union tc_payload *payload)
+{
+   struct tc_make_texture_handle_resident *p =
+      (struct tc_make_texture_handle_resident *)payload;
+
+   pipe->make_texture_handle_resident(pipe, p->handle, p->resident);
+}
+
+static void
+tc_make_texture_handle_resident(struct pipe_context *_pipe, uint64_t handle,
+                                bool resident)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_make_texture_handle_resident *p =
+      tc_add_struct_typed_call(tc, TC_CALL_make_texture_handle_resident,
+                               tc_make_texture_handle_resident);
+
+   p->handle = handle;
+   p->resident = resident;
+}
+
+static uint64_t
+tc_create_image_handle(struct pipe_context *_pipe,
+                       const struct pipe_image_view *image)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc);
+   return pipe->create_image_handle(pipe, image);
+}
+
+static void
+tc_call_delete_image_handle(struct pipe_context *pipe,
+                            union tc_payload *payload)
+{
+   pipe->delete_image_handle(pipe, payload->handle);
+}
+
+static void
+tc_delete_image_handle(struct pipe_context *_pipe, uint64_t handle)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   union tc_payload *payload =
+      tc_add_small_call(tc, TC_CALL_delete_image_handle);
+
+   payload->handle = handle;
+}
+
+struct tc_make_image_handle_resident
+{
+   uint64_t handle;
+   unsigned access;
+   bool resident;
+};
+
+static void
+tc_call_make_image_handle_resident(struct pipe_context *pipe,
+                                     union tc_payload *payload)
+{
+   struct tc_make_image_handle_resident *p =
+      (struct tc_make_image_handle_resident *)payload;
+
+   pipe->make_image_handle_resident(pipe, p->handle, p->access, p->resident);
+}
+
+static void
+tc_make_image_handle_resident(struct pipe_context *_pipe, uint64_t handle,
+                              unsigned access, bool resident)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct tc_make_image_handle_resident *p =
+      tc_add_struct_typed_call(tc, TC_CALL_make_image_handle_resident,
+                               tc_make_image_handle_resident);
+
+   p->handle = handle;
+   p->access = access;
+   p->resident = resident;
+}
+
+
 /********************************************************************
  * transfer
  */
@@ -1154,6 +1341,28 @@ tc_improve_map_buffer_flags(struct threaded_context *tc,
                             struct threaded_resource *tres, unsigned usage,
                             unsigned offset, unsigned size)
 {
+   /* Never invalidate inside the driver and never infer "unsynchronized". */
+   unsigned tc_flags = TC_TRANSFER_MAP_NO_INVALIDATE |
+                       TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED;
+
+   /* Prevent a reentry. */
+   if (usage & tc_flags)
+      return usage;
+
+   /* Use the staging upload if it's preferred. */
+   if (usage & (PIPE_TRANSFER_DISCARD_RANGE |
+                PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) &&
+       !(usage & PIPE_TRANSFER_PERSISTENT) &&
+       /* Try not to decrement the counter if it's not positive. Still racy,
+        * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
+       tres->max_forced_staging_uploads > 0 &&
+       p_atomic_dec_return(&tres->max_forced_staging_uploads) >= 0) {
+      usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
+                 PIPE_TRANSFER_UNSYNCHRONIZED);
+
+      return usage | tc_flags | PIPE_TRANSFER_DISCARD_RANGE;
+   }
+
    /* Sparse buffers can't be mapped directly and can't be reallocated
     * (fully invalidated). That may just be a radeonsi limitation, but
     * the threaded context must obey it with radeonsi.
@@ -1173,12 +1382,12 @@ tc_improve_map_buffer_flags(struct threaded_context *tc,
       return usage;
    }
 
+   usage |= tc_flags;
+
    /* Handle CPU reads trivially. */
    if (usage & PIPE_TRANSFER_READ) {
-      /* Driver aren't allowed to do buffer invalidations. */
-      return (usage & ~PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) |
-             TC_TRANSFER_MAP_NO_INVALIDATE |
-             TC_TRANSFER_MAP_IGNORE_VALID_RANGE;
+      /* Drivers aren't allowed to do buffer invalidations. */
+      return usage & ~PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
    }
 
    /* See if the buffer range being mapped has never been initialized,
@@ -1215,13 +1424,12 @@ tc_improve_map_buffer_flags(struct threaded_context *tc,
       usage &= ~PIPE_TRANSFER_DISCARD_RANGE;
 
    /* Unsychronized buffer mappings don't have to synchronize the thread. */
-   if (usage & PIPE_TRANSFER_UNSYNCHRONIZED)
+   if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
+      usage &= ~PIPE_TRANSFER_DISCARD_RANGE;
       usage |= TC_TRANSFER_MAP_THREADED_UNSYNC; /* notify the driver */
+   }
 
-   /* Never invalidate inside the driver and never infer "unsynchronized". */
-   return usage |
-          TC_TRANSFER_MAP_NO_INVALIDATE |
-          TC_TRANSFER_MAP_IGNORE_VALID_RANGE;
+   return usage;
 }
 
 static void *
@@ -1615,15 +1823,33 @@ tc_set_debug_callback(struct pipe_context *_pipe,
    pipe->set_debug_callback(pipe, cb);
 }
 
+static void
+tc_set_log_context(struct pipe_context *_pipe, struct u_log_context *log)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_context *pipe = tc->pipe;
+
+   tc_sync(tc);
+   pipe->set_log_context(pipe, log);
+}
+
 static void
 tc_create_fence_fd(struct pipe_context *_pipe,
-                   struct pipe_fence_handle **fence, int fd)
+                   struct pipe_fence_handle **fence, int fd,
+                   enum pipe_fd_type type)
 {
    struct threaded_context *tc = threaded_context(_pipe);
    struct pipe_context *pipe = tc->pipe;
 
    tc_sync(tc);
-   pipe->create_fence_fd(pipe, fence, fd);
+   pipe->create_fence_fd(pipe, fence, fd, type);
+}
+
+static void
+tc_call_fence_server_sync(struct pipe_context *pipe, union tc_payload *payload)
+{
+   pipe->fence_server_sync(pipe, payload->fence);
+   pipe->screen->fence_reference(pipe->screen, &payload->fence, NULL);
 }
 
 static void
@@ -1631,23 +1857,43 @@ tc_fence_server_sync(struct pipe_context *_pipe,
                      struct pipe_fence_handle *fence)
 {
    struct threaded_context *tc = threaded_context(_pipe);
-   struct pipe_context *pipe = tc->pipe;
+   struct pipe_screen *screen = tc->pipe->screen;
+   union tc_payload *payload = tc_add_small_call(tc, TC_CALL_fence_server_sync);
 
-   tc_sync(tc);
-   pipe->fence_server_sync(pipe, fence);
+   payload->fence = NULL;
+   screen->fence_reference(screen, &payload->fence, fence);
+}
+
+static void
+tc_call_fence_server_signal(struct pipe_context *pipe, union tc_payload *payload)
+{
+   pipe->fence_server_signal(pipe, payload->fence);
+   pipe->screen->fence_reference(pipe->screen, &payload->fence, NULL);
+}
+
+static void
+tc_fence_server_signal(struct pipe_context *_pipe,
+                           struct pipe_fence_handle *fence)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+   struct pipe_screen *screen = tc->pipe->screen;
+   union tc_payload *payload = tc_add_small_call(tc, TC_CALL_fence_server_signal);
+
+   payload->fence = NULL;
+   screen->fence_reference(screen, &payload->fence, fence);
 }
 
 static struct pipe_video_codec *
-tc_create_video_codec(struct pipe_context *_pipe,
-                      const struct pipe_video_codec *templ)
+tc_create_video_codec(UNUSED struct pipe_context *_pipe,
+                      UNUSED const struct pipe_video_codec *templ)
 {
    unreachable("Threaded context should not be enabled for video APIs");
    return NULL;
 }
 
 static struct pipe_video_buffer *
-tc_create_video_buffer(struct pipe_context *_pipe,
-                       const struct pipe_video_buffer *templ)
+tc_create_video_buffer(UNUSED struct pipe_context *_pipe,
+                       UNUSED const struct pipe_video_buffer *templ)
 {
    unreachable("Threaded context should not be enabled for video APIs");
    return NULL;
@@ -1658,22 +1904,96 @@ tc_create_video_buffer(struct pipe_context *_pipe,
  * draw, launch, clear, blit, copy, flush
  */
 
+struct tc_flush_payload {
+   struct threaded_context *tc;
+   struct pipe_fence_handle *fence;
+   unsigned flags;
+};
+
+static void
+tc_flush_queries(struct threaded_context *tc)
+{
+   struct threaded_query *tq, *tmp;
+   LIST_FOR_EACH_ENTRY_SAFE(tq, tmp, &tc->unflushed_queries, head_unflushed) {
+      LIST_DEL(&tq->head_unflushed);
+
+      /* Memory release semantics: due to a possible race with
+       * tc_get_query_result, we must ensure that the linked list changes
+       * are visible before setting tq->flushed.
+       */
+      p_atomic_set(&tq->flushed, true);
+   }
+}
+
+static void
+tc_call_flush(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_flush_payload *p = (struct tc_flush_payload *)payload;
+   struct pipe_screen *screen = pipe->screen;
+
+   pipe->flush(pipe, p->fence ? &p->fence : NULL, p->flags);
+   screen->fence_reference(screen, &p->fence, NULL);
+
+   if (!(p->flags & PIPE_FLUSH_DEFERRED))
+      tc_flush_queries(p->tc);
+}
+
 static void
 tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence,
          unsigned flags)
 {
    struct threaded_context *tc = threaded_context(_pipe);
    struct pipe_context *pipe = tc->pipe;
-   struct threaded_query *tq, *tmp;
+   struct pipe_screen *screen = pipe->screen;
+   bool async = flags & PIPE_FLUSH_DEFERRED;
 
-   LIST_FOR_EACH_ENTRY_SAFE(tq, tmp, &tc->unflushed_queries, head_unflushed) {
-      tq->flushed = true;
-      LIST_DEL(&tq->head_unflushed);
+   if (flags & PIPE_FLUSH_ASYNC) {
+      struct tc_batch *last = &tc->batch_slots[tc->last];
+
+      /* Prefer to do the flush in the driver thread, but avoid the inter-thread
+       * communication overhead if the driver thread is currently idle and the
+       * caller is going to wait for the fence immediately anyway.
+       */
+      if (!(util_queue_fence_is_signalled(&last->fence) &&
+            (flags & PIPE_FLUSH_HINT_FINISH)))
+         async = true;
    }
 
-   /* TODO: deferred flushes? */
+   if (async && tc->create_fence) {
+      if (fence) {
+         struct tc_batch *next = &tc->batch_slots[tc->next];
+
+         if (!next->token) {
+            next->token = malloc(sizeof(*next->token));
+            if (!next->token)
+               goto out_of_memory;
+
+            pipe_reference_init(&next->token->ref, 1);
+            next->token->tc = tc;
+         }
+
+         screen->fence_reference(screen, fence, tc->create_fence(pipe, next->token));
+         if (!*fence)
+            goto out_of_memory;
+      }
+
+      struct tc_flush_payload *p =
+         tc_add_struct_typed_call(tc, TC_CALL_flush, tc_flush_payload);
+      p->tc = tc;
+      p->fence = fence ? *fence : NULL;
+      p->flags = flags | TC_FLUSH_ASYNC;
+
+      if (!(flags & PIPE_FLUSH_DEFERRED))
+         tc_batch_flush(tc);
+      return;
+   }
+
+out_of_memory:
    tc_sync_msg(tc, flags & PIPE_FLUSH_END_OF_FRAME ? "end of frame" :
                    flags & PIPE_FLUSH_DEFERRED ? "deferred fence" : "normal");
+
+   if (!(flags & PIPE_FLUSH_DEFERRED))
+      tc_flush_queries(tc);
    pipe->flush(pipe, fence, flags);
 }
 
@@ -1856,9 +2176,11 @@ static void
 tc_call_generate_mipmap(struct pipe_context *pipe, union tc_payload *payload)
 {
    struct tc_generate_mipmap *p = (struct tc_generate_mipmap *)payload;
-   bool result = pipe->generate_mipmap(pipe, p->res, p->format, p->base_level,
-                                       p->last_level, p->first_layer,
-                                       p->last_layer);
+   MAYBE_UNUSED bool result = pipe->generate_mipmap(pipe, p->res, p->format,
+                                                    p->base_level,
+                                                    p->last_level,
+                                                    p->first_layer,
+                                                    p->last_layer);
    assert(result);
    pipe_resource_reference(&p->res, NULL);
 }
@@ -1883,7 +2205,8 @@ tc_generate_mipmap(struct pipe_context *_pipe,
       bind = PIPE_BIND_RENDER_TARGET;
 
    if (!screen->is_format_supported(screen, format, res->target,
-                                    res->nr_samples, bind))
+                                    res->nr_samples, res->nr_storage_samples,
+                                    bind))
       return false;
 
    struct tc_generate_mipmap *p =
@@ -2097,6 +2420,41 @@ tc_resource_commit(struct pipe_context *_pipe, struct pipe_resource *res,
 }
 
 
+/********************************************************************
+ * callback
+ */
+
+struct tc_callback_payload {
+   void (*fn)(void *data);
+   void *data;
+};
+
+static void
+tc_call_callback(UNUSED struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_callback_payload *p = (struct tc_callback_payload *)payload;
+
+   p->fn(p->data);
+}
+
+static void
+tc_callback(struct pipe_context *_pipe, void (*fn)(void *), void *data,
+            bool asap)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+
+   if (asap && tc_is_sync(tc)) {
+      fn(data);
+      return;
+   }
+
+   struct tc_callback_payload *p =
+      tc_add_struct_typed_call(tc, TC_CALL_callback, tc_callback_payload);
+   p->fn = fn;
+   p->data = data;
+}
+
+
 /********************************************************************
  * create & destroy
  */
@@ -2107,23 +2465,26 @@ tc_destroy(struct pipe_context *_pipe)
    struct threaded_context *tc = threaded_context(_pipe);
    struct pipe_context *pipe = tc->pipe;
 
+   if (tc->base.const_uploader &&
+       tc->base.stream_uploader != tc->base.const_uploader)
+      u_upload_destroy(tc->base.const_uploader);
+
+   if (tc->base.stream_uploader)
+      u_upload_destroy(tc->base.stream_uploader);
+
    tc_sync(tc);
 
    if (util_queue_is_initialized(&tc->queue)) {
       util_queue_destroy(&tc->queue);
 
-      for (unsigned i = 0; i < TC_MAX_BATCHES; i++)
+      for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
          util_queue_fence_destroy(&tc->batch_slots[i].fence);
+         assert(!tc->batch_slots[i].token);
+      }
    }
 
-   if (tc->base.const_uploader &&
-       tc->base.stream_uploader != tc->base.const_uploader)
-      u_upload_destroy(tc->base.const_uploader);
-
-   if (tc->base.stream_uploader)
-      u_upload_destroy(tc->base.stream_uploader);
-
    slab_destroy_child(&tc->pool_transfers);
+   assert(tc->batch_slots[tc->next].num_total_call_slots == 0);
    pipe->destroy(pipe);
    os_free_aligned(tc);
 }
@@ -2150,6 +2511,7 @@ struct pipe_context *
 threaded_context_create(struct pipe_context *pipe,
                         struct slab_parent_pool *parent_transfer_pool,
                         tc_replace_buffer_storage_func replace_buffer,
+                        tc_create_fence_func create_fence,
                         struct threaded_context **out)
 {
    struct threaded_context *tc;
@@ -2173,21 +2535,24 @@ threaded_context_create(struct pipe_context *pipe,
    memset(tc, 0, sizeof(*tc));
 
    assert((uintptr_t)tc % 16 == 0);
-   STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[0]) % 16 == 0);
-   STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[0].call[0]) % 16 == 0);
-   STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[0].call[1]) % 16 == 0);
-   STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[1].call[0]) % 16 == 0);
+   /* These should be static asserts, but they don't work with MSVC */
+   assert(offsetof(struct threaded_context, batch_slots) % 16 == 0);
+   assert(offsetof(struct threaded_context, batch_slots[0].call) % 16 == 0);
+   assert(offsetof(struct threaded_context, batch_slots[0].call[1]) % 16 == 0);
+   assert(offsetof(struct threaded_context, batch_slots[1].call) % 16 == 0);
 
    /* The driver context isn't wrapped, so set its "priv" to NULL. */
    pipe->priv = NULL;
 
    tc->pipe = pipe;
    tc->replace_buffer_storage = replace_buffer;
+   tc->create_fence = create_fence;
    tc->map_buffer_alignment =
       pipe->screen->get_param(pipe->screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT);
    tc->base.priv = pipe; /* priv points to the wrapped driver context */
    tc->base.screen = pipe->screen;
    tc->base.destroy = tc_destroy;
+   tc->base.callback = tc_callback;
 
    tc->base.stream_uploader = u_upload_clone(&tc->base, pipe->stream_uploader);
    if (pipe->stream_uploader == pipe->const_uploader)
@@ -2202,12 +2567,11 @@ threaded_context_create(struct pipe_context *pipe,
     * from the queue before being executed, so keep one tc_batch slot for that
     * execution. Also, keep one unused slot for an unflushed batch.
     */
-   if (!util_queue_init(&tc->queue, "gallium_drv", TC_MAX_BATCHES - 2, 1))
+   if (!util_queue_init(&tc->queue, "gdrv", TC_MAX_BATCHES - 2, 1, 0))
       goto fail;
 
    for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
       tc->batch_slots[i].sentinel = TC_SENTINEL;
-      tc->batch_slots[i].sentinel2 = TC_SENTINEL;
       tc->batch_slots[i].pipe = pipe;
       util_queue_fence_init(&tc->batch_slots[i].fence);
    }
@@ -2313,11 +2677,19 @@ threaded_context_create(struct pipe_context *pipe,
    CTX_INIT(get_device_reset_status);
    CTX_INIT(set_device_reset_callback);
    CTX_INIT(dump_debug_state);
+   CTX_INIT(set_log_context);
    CTX_INIT(emit_string_marker);
    CTX_INIT(set_debug_callback);
    CTX_INIT(create_fence_fd);
    CTX_INIT(fence_server_sync);
+   CTX_INIT(fence_server_signal);
    CTX_INIT(get_timestamp);
+   CTX_INIT(create_texture_handle);
+   CTX_INIT(delete_texture_handle);
+   CTX_INIT(make_texture_handle_resident);
+   CTX_INIT(create_image_handle);
+   CTX_INIT(delete_image_handle);
+   CTX_INIT(make_image_handle_resident);
 #undef CTX_INIT
 
    if (out)