radeonsi: emit_spi_map packets optimization

[mesa.git] / src / gallium / drivers / radeonsi / si_state_shaders.c
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c

index e7610af2fa7d315d83f47bf1a65c15cff7b2e288..4e0320a226d823dc277ed040c23ea69206e5a006 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -45,7 +45,7 @@
   * Return the IR binary in a buffer. For TGSI the first 4 bytes contain its
   * size as integer.
   */
-static void *si_get_ir_binary(struct si_shader_selector *sel)
+void *si_get_ir_binary(struct si_shader_selector *sel)
  {
         struct blob blob;
         unsigned ir_size;
@@ -202,10 +202,9 @@ static bool si_load_shader_binary(struct si_shader *shader, void *binary)
   *
   * Returns false on failure, in which case the ir_binary should be freed.
   */
-static bool si_shader_cache_insert_shader(struct si_screen *sscreen,
-                                         void *ir_binary,
-                                         struct si_shader *shader,
-                                         bool insert_into_disk_cache)
+bool si_shader_cache_insert_shader(struct si_screen *sscreen, void *ir_binary,
+                                  struct si_shader *shader,
+                                  bool insert_into_disk_cache)
  {
         void *hw_binary;
         struct hash_entry *entry;
@@ -235,9 +234,8 @@ static bool si_shader_cache_insert_shader(struct si_screen *sscreen,
         return true;
  }
  
-static bool si_shader_cache_load_shader(struct si_screen *sscreen,
-                                       void *ir_binary,
-                                       struct si_shader *shader)
+bool si_shader_cache_load_shader(struct si_screen *sscreen, void *ir_binary,
+                                struct si_shader *shader)
  {
         struct hash_entry *entry =
                 _mesa_hash_table_search(sscreen->shader_cache, ir_binary);
@@ -1511,7 +1509,7 @@ static void si_build_shader_variant(struct si_shader *shader,
  {
         struct si_shader_selector *sel = shader->selector;
         struct si_screen *sscreen = sel->screen;
-       struct si_compiler *compiler;
+       struct ac_llvm_compiler *compiler;
         struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug;
         int r;
  
@@ -1582,10 +1580,10 @@ static bool si_check_missing_main_part(struct si_screen *sscreen,
                 main_part->selector = sel;
                 main_part->key.as_es = key->as_es;
                 main_part->key.as_ls = key->as_ls;
+               main_part->is_monolithic = false;
  
                 if (si_compile_tgsi_shader(sscreen, compiler_state->compiler,
-                                          main_part, false,
-                                          &compiler_state->debug) != 0) {
+                                          main_part, &compiler_state->debug) != 0) {
                         FREE(main_part);
                         return false;
                 }
@@ -1858,7 +1856,7 @@ static void si_init_shader_selector_async(void *job, int thread_index)
  {
         struct si_shader_selector *sel = (struct si_shader_selector *)job;
         struct si_screen *sscreen = sel->screen;
-       struct si_compiler *compiler;
+       struct ac_llvm_compiler *compiler;
         struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug;
  
         assert(!debug->debug_message || debug->async);
@@ -1884,6 +1882,7 @@ static void si_init_shader_selector_async(void *job, int thread_index)
                 util_queue_fence_init(&shader->ready);
  
                 shader->selector = sel;
+               shader->is_monolithic = false;
                 si_parse_next_shader_property(&sel->info,
                                               sel->so.num_outputs != 0,
                                               &shader->key);
@@ -1902,7 +1901,7 @@ static void si_init_shader_selector_async(void *job, int thread_index)
                         mtx_unlock(&sscreen->shader_cache_mutex);
  
                         /* Compile the shader if it hasn't been loaded from the cache. */
-                       if (si_compile_tgsi_shader(sscreen, compiler, shader, false,
+                       if (si_compile_tgsi_shader(sscreen, compiler, shader,
                                                    debug) != 0) {
                                 FREE(shader);
                                 FREE(ir_binary);
@@ -1975,6 +1974,34 @@ static void si_init_shader_selector_async(void *job, int thread_index)
         }
  }
  
+void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
+                                struct util_queue_fence *ready_fence,
+                                struct si_compiler_ctx_state *compiler_ctx_state,
+                                void *job, util_queue_execute_func execute)
+{ /* Adds 'job' to the screen's shader_compiler_queue; waits on ready_fence first when 'wait' (below) is true. */
+       util_queue_fence_init(ready_fence); /* fence is (re)initialized on every call, before queuing */
+       /* async_debug is only initialized and used on the 'wait' path. */
+       struct util_async_debug_callback async_debug;
+       bool wait =
+               (sctx->debug.debug_message && !sctx->debug.async) || /* context debug callback set and not flagged async */
+               sctx->is_debug ||
+               si_can_dump_shader(sctx->screen, processor);
+       /* When waiting, point the job's debug state at async_debug.base. */
+       if (wait) {
+               u_async_debug_init(&async_debug);
+               compiler_ctx_state->debug = async_debug.base;
+       }
+       /* ready_fence is handed to the queue together with the job and waited on below. */
+       util_queue_add_job(&sctx->screen->shader_compiler_queue, job,
+                          ready_fence, execute, NULL);
+       /* Synchronous case: wait for the fence, then drain async_debug into sctx->debug. */
+       if (wait) {
+               util_queue_fence_wait(ready_fence);
+               u_async_debug_drain(&async_debug, &sctx->debug); /* presumably forwards what the job reported - confirm in u_async_debug */
+               u_async_debug_cleanup(&async_debug);
+       }
+}
+
  /* Return descriptor slot usage masks from the given shader info. */
  void si_get_active_slot_masks(const struct tgsi_shader_info *info,
                               uint32_t *const_and_shader_buffers,
@@ -2243,29 +2270,10 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
         }
  
         (void) mtx_init(&sel->mutex, mtx_plain);
-       util_queue_fence_init(&sel->ready);
-
-       struct util_async_debug_callback async_debug;
-       bool wait =
-               (sctx->debug.debug_message && !sctx->debug.async) ||
-               sctx->is_debug ||
-               si_can_dump_shader(sscreen, sel->info.processor);
-
-       if (wait) {
-               u_async_debug_init(&async_debug);
-               sel->compiler_ctx_state.debug = async_debug.base;
-       }
-
-       util_queue_add_job(&sscreen->shader_compiler_queue, sel,
-                          &sel->ready, si_init_shader_selector_async,
-                          NULL);
-
-       if (wait) {
-               util_queue_fence_wait(&sel->ready);
-               u_async_debug_drain(&async_debug, &sctx->debug);
-               u_async_debug_cleanup(&async_debug);
-       }
  
+       si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready,
+                                   &sel->compiler_ctx_state, sel,
+                                   si_init_shader_selector_async);
         return sel;
  }
  
@@ -2626,27 +2634,25 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx,
  
  static void si_emit_spi_map(struct si_context *sctx)
  {
-       struct radeon_winsys_cs *cs = sctx->gfx_cs;
         struct si_shader *ps = sctx->ps_shader.current;
         struct si_shader *vs = si_get_vs_state(sctx);
         struct tgsi_shader_info *psinfo = ps ? &ps->selector->info : NULL;
         unsigned i, num_interp, num_written = 0, bcol_interp[2];
+       unsigned spi_ps_input_cntl[32];
  
         if (!ps || !ps->selector->info.num_inputs)
                 return;
  
         num_interp = si_get_ps_num_interp(ps);
         assert(num_interp > 0);
-       radeon_set_context_reg_seq(cs, R_028644_SPI_PS_INPUT_CNTL_0, num_interp);
  
         for (i = 0; i < psinfo->num_inputs; i++) {
                 unsigned name = psinfo->input_semantic_name[i];
                 unsigned index = psinfo->input_semantic_index[i];
                 unsigned interpolate = psinfo->input_interpolate[i];
  
-               radeon_emit(cs, si_get_ps_input_cntl(sctx, vs, name, index,
-                                                    interpolate));
-               num_written++;
+               spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, name,
+                                                           index, interpolate);
  
                 if (name == TGSI_SEMANTIC_COLOR) {
                         assert(index < ARRAY_SIZE(bcol_interp));
@@ -2661,12 +2667,19 @@ static void si_emit_spi_map(struct si_context *sctx)
                         if (!(psinfo->colors_read & (0xf << (i * 4))))
                                 continue;
  
-                       radeon_emit(cs, si_get_ps_input_cntl(sctx, vs, bcol,
-                                                            i, bcol_interp[i]));
-                       num_written++;
+                       spi_ps_input_cntl[num_written++] =
+                         si_get_ps_input_cntl(sctx, vs, bcol, i, bcol_interp[i]);
+
                 }
         }
         assert(num_interp == num_written);
+
+       /* R_028644_SPI_PS_INPUT_CNTL_0 */
+       /* Dota 2: Only ~16% of SPI map updates set different values. */
+       /* Talos: Only ~9% of SPI map updates set different values. */
+       radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0,
+                                   spi_ps_input_cntl,
+                                   sctx->tracked_regs.spi_ps_input_cntl, num_interp);
  }
  
  /**
@@ -3356,7 +3369,7 @@ bool si_update_shaders(struct si_context *sctx)
  
  static void si_emit_scratch_state(struct si_context *sctx)
  {
-       struct radeon_winsys_cs *cs = sctx->gfx_cs;
+       struct radeon_cmdbuf *cs = sctx->gfx_cs;
  
         radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
                                sctx->spi_tmpring_size);