return;
util_queue_destroy(&sscreen->shader_compiler_queue);
+ util_queue_destroy(&sscreen->shader_compiler_queue_low_priority);
for (i = 0; i < ARRAY_SIZE(sscreen->tm); i++)
if (sscreen->tm[i])
LLVMDisposeTargetMachine(sscreen->tm[i]);
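+ /* Dispose the target machines used by the low-priority compiler threads. */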
+ for (i = 0; i < ARRAY_SIZE(sscreen->tm_low_priority); i++)
+ if (sscreen->tm_low_priority[i])
+ LLVMDisposeTargetMachine(sscreen->tm_low_priority[i]);
+
/* Free shader parts. */
for (i = 0; i < ARRAY_SIZE(parts); i++) {
while (parts[i]) {
struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
{
struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
- unsigned num_cpus, num_compiler_threads, i;
+ unsigned num_threads, num_compiler_threads, num_compiler_threads_lowprio, i;
if (!sscreen) {
return NULL;
/* Only enable as many threads as we have target machines, but at most
* the number of CPUs - 1 if there is more than one.
*/
- num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
- num_cpus = MAX2(1, num_cpus - 1);
- num_compiler_threads = MIN2(num_cpus, ARRAY_SIZE(sscreen->tm));
+ num_threads = sysconf(_SC_NPROCESSORS_ONLN);
+ num_threads = MAX2(1, num_threads - 1);
+ num_compiler_threads = MIN2(num_threads, ARRAY_SIZE(sscreen->tm));
+ num_compiler_threads_lowprio =
+ MIN2(num_threads, ARRAY_SIZE(sscreen->tm_low_priority));
if (!util_queue_init(&sscreen->shader_compiler_queue, "si_shader",
32, num_compiler_threads, 0)) {
return NULL;
}
+ /* The queue must be large enough so that adding optimized shaders
+ * doesn't stall draw calls when the queue is full. Varying packing
+ * in particular generates a very high volume of optimized shader
+ * compilation jobs.
+ */
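+ /* UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY starts the queue's threads at
+ * the lowest scheduling priority, so optimized compilation yields to
+ * the normal compiler queue and to the application's threads.
+ */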
+ if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority,
+ "si_shader_low",
+ 1024, num_compiler_threads_lowprio,
+ UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
+ util_queue_destroy(&sscreen->shader_compiler_queue);
+ si_destroy_shader_cache(sscreen);
+ FREE(sscreen);
+ return NULL;
+ }
+
si_handle_env_var_force_family(sscreen);
if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
for (i = 0; i < num_compiler_threads; i++)
sscreen->tm[i] = si_create_llvm_target_machine(sscreen);
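+ /* Likewise, one target machine per low-priority compiler thread. */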
+ for (i = 0; i < num_compiler_threads_lowprio; i++)
+ sscreen->tm_low_priority[i] = si_create_llvm_target_machine(sscreen);
/* Create the auxiliary context. This must be done last. */
sscreen->b.aux_context = si_create_context(&sscreen->b.b, 0);
int r;
if (thread_index >= 0) {
- assert(thread_index < ARRAY_SIZE(sscreen->tm));
- tm = sscreen->tm[thread_index];
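+ /* Variants are built on the low-priority queue, so pick its per-thread
+ * target machine. */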
+ assert(thread_index < ARRAY_SIZE(sscreen->tm_low_priority));
+ tm = sscreen->tm_low_priority[thread_index];
if (!debug->async)
debug = NULL;
} else {
!is_pure_monolithic &&
thread_index < 0) {
/* Compile it asynchronously. */
- util_queue_add_job(&sscreen->shader_compiler_queue,
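+ /* optimized_ready is signaled when the low-priority job completes. */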
+ util_queue_add_job(&sscreen->shader_compiler_queue_low_priority,
shader, &shader->optimized_ready,
si_build_shader_variant, NULL);
static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
{
if (shader->is_optimized) {
- util_queue_drop_job(&sctx->screen->shader_compiler_queue,
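+ /* The variant was queued on the low-priority queue; drop it there. */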
+ util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority,
&shader->optimized_ready);
util_queue_fence_destroy(&shader->optimized_ready);
}