util/u_queue: track job size and limit the size of queue growth
author Timothy Arceri <tarceri@itsqueeze.com>
Tue, 3 Sep 2019 03:05:08 +0000 (13:05 +1000)
committer Timothy Arceri <tarceri@itsqueeze.com>
Thu, 19 Sep 2019 05:03:27 +0000 (15:03 +1000)
When both UTIL_QUEUE_INIT_RESIZE_IF_FULL and
UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY are set, we can get into a
situation where the queue's jobs never execute and the queue grows
to a huge size because all other threads are busy.

This is the case with the shader cache when attempting to compile a
huge number of shaders up front. If all threads are busy compiling
shaders, the cache queue's memory use can climb into the many GBs
very fast.

The use of these two flags with the shader cache is intended to
allow shaders compiled at runtime to be compiled as fast as possible.
To avoid huge memory use while still allowing the queue to perform
optimally in the runtime compilation case, we now add the ability to
track the memory consumed by the jobs in the queue and limit it to a
hardcoded 256MB, which should be more than enough.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
src/gallium/auxiliary/util/u_threaded_context.c
src/gallium/drivers/freedreno/freedreno_batch.c
src/gallium/drivers/radeonsi/si_state_shaders.c
src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
src/gallium/winsys/radeon/drm/radeon_drm_cs.c
src/mesa/main/glthread.c
src/util/disk_cache.c
src/util/u_queue.c
src/util/u_queue.h
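
For illustration only, a minimal sketch of a caller using the new
job_size parameter (struct my_job, my_execute, my_cleanup and
my_submit are hypothetical names, not part of this commit). Every
caller updated in this commit still passes 0, so the 256MB cap only
engages once a caller reports a real size:

   #include <stdlib.h>
   #include "util/u_queue.h"

   /* Hypothetical job type carrying a variable-sized payload. */
   struct my_job {
      struct util_queue_fence fence;
      size_t payload_size;
      char payload[];
   };

   static void my_execute(void *data, int thread_index)
   {
      struct my_job *job = data;
      /* ... consume job->payload ... */
   }

   static void my_cleanup(void *data, int thread_index)
   {
      free(data);
   }

   static void my_submit(struct util_queue *queue, struct my_job *job)
   {
      util_queue_fence_init(&job->fence);
      /* Report the memory the job pins so that, once the tracked
       * total would cross 256MB, the queue stops resizing and blocks
       * for a free slot instead.
       */
      util_queue_add_job(queue, job, &job->fence, my_execute,
                         my_cleanup, sizeof(*job) + job->payload_size);
   }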

index c2bdda5f447888bca46d2e39fe8b7befdb9158b2..31aa18e3b97f0ba06e4717ac0b5c1cf088525a7c 100644
@@ -116,7 +116,7 @@ tc_batch_flush(struct threaded_context *tc)
    }
 
    util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute,
-                      NULL);
+                      NULL, 0);
    tc->last = tc->next;
    tc->next = (tc->next + 1) % TC_MAX_BATCHES;
 }
index 52870cd0aa1a9d7237fb179d454ec9b6164737ae..737a87a8dc61ef68bec6b9641da7dd9579fdc697 100644
@@ -336,7 +336,7 @@ batch_flush(struct fd_batch *batch)
 
                util_queue_add_job(&batch->ctx->flush_queue,
                                batch, &batch->flush_fence,
-                               batch_flush_func, batch_cleanup_func);
+                               batch_flush_func, batch_cleanup_func, 0);
        } else {
                fd_gmem_render_tiles(batch);
                batch_reset_resources(batch);
index d6fa1f1858219af9a8778ad4adf28f2d23f41271..832e59828949c0da0279d2c95a2c5ffd3fba5722 100644
@@ -2358,7 +2358,8 @@ current_not_ready:
                /* Compile it asynchronously. */
                util_queue_add_job(&sscreen->shader_compiler_queue_low_priority,
                                   shader, &shader->ready,
-                                  si_build_shader_variant_low_priority, NULL);
+                                  si_build_shader_variant_low_priority, NULL,
+                                  0);
 
                /* Add only after the ready fence was reset, to guard against a
                 * race with si_bind_XX_shader. */
@@ -2615,7 +2616,7 @@ void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
        }
 
        util_queue_add_job(&sctx->screen->shader_compiler_queue, job,
-                          ready_fence, execute, NULL);
+                          ready_fence, execute, NULL, 0);
 
        if (debug) {
                util_queue_fence_wait(ready_fence);
index 976ec7770f08a3af18a81f43a9ba67b30fc754ff..bad01bd67451e0ec4a7bbbc36dc2cda7e2193ea6 100644
@@ -1756,7 +1756,7 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
 
       /* Submit. */
       util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
-                         amdgpu_cs_submit_ib, NULL);
+                         amdgpu_cs_submit_ib, NULL, 0);
       /* The submission has been queued, unlock the fence now. */
       simple_mtx_unlock(&ws->bo_fence_lock);
 
index ccdaed64e023747b1bf5ec973cff17e1fc321504..6726f6a77ab6366eeb05add8113d8d9634b07807 100644
@@ -697,7 +697,7 @@ static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
 
         if (util_queue_is_initialized(&cs->ws->cs_queue)) {
             util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
-                               radeon_drm_cs_emit_ioctl_oneshot, NULL);
+                               radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
             if (!(flags & PIPE_FLUSH_ASYNC))
                 radeon_drm_cs_sync_flush(rcs);
         } else {
index 145c5199978052b66a764778c26786a50c41c2c9..82baad597f92168f9c1ec09d8092ff82644fd5e8 100644
@@ -99,7 +99,7 @@ _mesa_glthread_init(struct gl_context *ctx)
    struct util_queue_fence fence;
    util_queue_fence_init(&fence);
    util_queue_add_job(&glthread->queue, ctx, &fence,
-                      glthread_thread_initialization, NULL);
+                      glthread_thread_initialization, NULL, 0);
    util_queue_fence_wait(&fence);
    util_queue_fence_destroy(&fence);
 }
@@ -167,7 +167,7 @@ _mesa_glthread_flush_batch(struct gl_context *ctx)
    p_atomic_add(&glthread->stats.num_offloaded_items, next->used);
 
    util_queue_add_job(&glthread->queue, next, &next->fence,
-                      glthread_unmarshal_batch, NULL);
+                      glthread_unmarshal_batch, NULL, 0);
    glthread->last = glthread->next;
    glthread->next = (glthread->next + 1) % MARSHAL_MAX_BATCHES;
 }
index ed2e437a159cb8d2353511005d2e360f10323d96..46124b39c4c28f1efac00ce2a937a7f56d4842b9 100644
@@ -1037,7 +1037,7 @@ disk_cache_put(struct disk_cache *cache, const cache_key key,
    if (dc_job) {
       util_queue_fence_init(&dc_job->fence);
       util_queue_add_job(&cache->cache_queue, dc_job, &dc_job->fence,
-                         cache_put, destroy_put_job);
+                         cache_put, destroy_put_job, 0);
    }
 }
 
index 81225a80faa46129fe691ee69269503605f05ef4..ca72968053a1fba7ff1f502d60dbe24755d70344 100644
@@ -33,6 +33,9 @@
 #include "util/u_thread.h"
 #include "u_process.h"
 
+/* Cap on the total tracked size of queued jobs: 256MB. */
+#define S_256MB (256 * 1024 * 1024)
+
 static void
 util_queue_kill_threads(struct util_queue *queue, unsigned keep_num_threads,
                         bool finish_locked);
@@ -290,6 +293,8 @@ util_queue_thread_func(void *input)
          util_queue_fence_signal(job.fence);
          if (job.cleanup)
             job.cleanup(job.job, thread_index);
+
+         queue->total_jobs_size -= job.job_size;
       }
    }
 
@@ -513,7 +518,8 @@ util_queue_add_job(struct util_queue *queue,
                    void *job,
                    struct util_queue_fence *fence,
                    util_queue_execute_func execute,
-                   util_queue_execute_func cleanup)
+                   util_queue_execute_func cleanup,
+                   const size_t job_size)
 {
    struct util_queue_job *ptr;
 
@@ -531,7 +537,8 @@ util_queue_add_job(struct util_queue *queue,
    assert(queue->num_queued >= 0 && queue->num_queued <= queue->max_jobs);
 
    if (queue->num_queued == queue->max_jobs) {
-      if (queue->flags & UTIL_QUEUE_INIT_RESIZE_IF_FULL) {
+      if (queue->flags & UTIL_QUEUE_INIT_RESIZE_IF_FULL &&
+          queue->total_jobs_size + job_size < S_256MB) {
          /* If the queue is full, make it larger to avoid waiting for a free
           * slot.
           */
@@ -570,7 +577,10 @@ util_queue_add_job(struct util_queue *queue,
    ptr->fence = fence;
    ptr->execute = execute;
    ptr->cleanup = cleanup;
+   ptr->job_size = job_size;
+
    queue->write_idx = (queue->write_idx + 1) % queue->max_jobs;
+   queue->total_jobs_size += ptr->job_size;
 
    queue->num_queued++;
    cnd_signal(&queue->has_queued_cond);
@@ -642,7 +652,8 @@ util_queue_finish(struct util_queue *queue)
 
    for (unsigned i = 0; i < queue->num_threads; ++i) {
       util_queue_fence_init(&fences[i]);
-      util_queue_add_job(queue, &barrier, &fences[i], util_queue_finish_execute, NULL);
+      util_queue_add_job(queue, &barrier, &fences[i],
+                         util_queue_finish_execute, NULL, 0);
    }
 
    for (unsigned i = 0; i < queue->num_threads; ++i) {
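
To make the semantics of the new check explicit: when the ring is
full and the tracked job size would cross the 256MB cap,
util_queue_add_job no longer grows the ring buffer and instead falls
through to the pre-existing path that waits for a worker to free a
slot. A condensed sketch of the resulting control flow (the blocking
branch is elided in the hunk above but already existed before this
change):

   if (queue->num_queued == queue->max_jobs) {
      if (queue->flags & UTIL_QUEUE_INIT_RESIZE_IF_FULL &&
          queue->total_jobs_size + job_size < S_256MB) {
         /* Grow the ring buffer to avoid waiting for a free slot. */
      } else {
         /* Block on the queue's "has space" condition variable until
          * a worker thread dequeues a job, bounding memory use.
          */
      }
   }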
index 2d269099c2001a9d33002fb557bc6b74c92ba4d3..9666fd9c4b5df12b1c1814b529abb2da0e4df5df 100644
@@ -193,6 +193,7 @@ typedef void (*util_queue_execute_func)(void *job, int thread_index);
 
 struct util_queue_job {
    void *job;
+   size_t job_size;
    struct util_queue_fence *fence;
    util_queue_execute_func execute;
    util_queue_execute_func cleanup;
@@ -212,6 +213,7 @@ struct util_queue {
    unsigned num_threads; /* decreasing this number will terminate threads */
    int max_jobs;
    int write_idx, read_idx; /* ring buffer pointers */
+   size_t total_jobs_size;  /* memory use of all jobs in the queue */
    struct util_queue_job *jobs;
 
    /* for cleanup at exit(), protected by exit_mutex */
@@ -230,7 +232,8 @@ void util_queue_add_job(struct util_queue *queue,
                         void *job,
                         struct util_queue_fence *fence,
                         util_queue_execute_func execute,
-                        util_queue_execute_func cleanup);
+                        util_queue_execute_func cleanup,
+                        const size_t job_size);
 void util_queue_drop_job(struct util_queue *queue,
                          struct util_queue_fence *fence);