radeonsi: emit_spi_map packets optimization

[mesa.git] / src / gallium / drivers / radeonsi / si_state_shaders.c
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c

index 1b618502e857760cb746b256ce313845529b574d..4e0320a226d823dc277ed040c23ea69206e5a006 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -45,7 +45,7 @@
   * Return the IR binary in a buffer. For TGSI the first 4 bytes contain its
   * size as integer.
   */
-static void *si_get_ir_binary(struct si_shader_selector *sel)
+void *si_get_ir_binary(struct si_shader_selector *sel)
  {
         struct blob blob;
         unsigned ir_size;
@@ -202,10 +202,9 @@ static bool si_load_shader_binary(struct si_shader *shader, void *binary)
   *
   * Returns false on failure, in which case the ir_binary should be freed.
   */
-static bool si_shader_cache_insert_shader(struct si_screen *sscreen,
-                                         void *ir_binary,
-                                         struct si_shader *shader,
-                                         bool insert_into_disk_cache)
+bool si_shader_cache_insert_shader(struct si_screen *sscreen, void *ir_binary,
+                                  struct si_shader *shader,
+                                  bool insert_into_disk_cache)
  {
         void *hw_binary;
         struct hash_entry *entry;
@@ -235,9 +234,8 @@ static bool si_shader_cache_insert_shader(struct si_screen *sscreen,
         return true;
  }
  
-static bool si_shader_cache_load_shader(struct si_screen *sscreen,
-                                       void *ir_binary,
-                                       struct si_shader *shader)
+bool si_shader_cache_load_shader(struct si_screen *sscreen, void *ir_binary,
+                                struct si_shader *shader)
  {
         struct hash_entry *entry =
                 _mesa_hash_table_search(sscreen->shader_cache, ir_binary);
@@ -605,6 +603,30 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
         polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
  }
  
+static unsigned si_conv_prim_to_gs_out(unsigned mode)
+{ /* Translate a PIPE_PRIM_* primitive type into a V_028A6C_OUTPRIM_TYPE_* value. */
+       static const int prim_conv[] = { /* indexed by PIPE_PRIM_*; points/patches -> POINTLIST, line types -> LINESTRIP, all others -> TRISTRIP */
+               [PIPE_PRIM_POINTS]                      = V_028A6C_OUTPRIM_TYPE_POINTLIST,
+               [PIPE_PRIM_LINES]                       = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+               [PIPE_PRIM_LINE_LOOP]                   = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+               [PIPE_PRIM_LINE_STRIP]                  = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+               [PIPE_PRIM_TRIANGLES]                   = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+               [PIPE_PRIM_TRIANGLE_STRIP]              = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+               [PIPE_PRIM_TRIANGLE_FAN]                = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+               [PIPE_PRIM_QUADS]                       = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+               [PIPE_PRIM_QUAD_STRIP]                  = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+               [PIPE_PRIM_POLYGON]                     = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+               [PIPE_PRIM_LINES_ADJACENCY]             = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+               [PIPE_PRIM_LINE_STRIP_ADJACENCY]        = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+               [PIPE_PRIM_TRIANGLES_ADJACENCY]         = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+               [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]    = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+               [PIPE_PRIM_PATCHES]                     = V_028A6C_OUTPRIM_TYPE_POINTLIST,
+       };
+       assert(mode < ARRAY_SIZE(prim_conv));
+       /* The assert above is the only range check on mode before it indexes the table. */
+       return prim_conv[mode];
+}
+
  struct gfx9_gs_info {
         unsigned es_verts_per_subgroup;
         unsigned gs_prims_per_subgroup;
@@ -735,6 +757,8 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
         if (max_stream >= 2)
                 offset += num_components[2] * sel->gs_max_out_vertices;
         si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset);
+       si_pm4_set_reg(pm4, R_028A6C_VGT_GS_OUT_PRIM_TYPE,
+                      si_conv_prim_to_gs_out(sel->gs_output_prim));
         if (max_stream >= 3)
                 offset += num_components[3] * sel->gs_max_out_vertices;
         si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
@@ -1223,12 +1247,13 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx,
         }
  
         /* Find out which VS outputs aren't used by the PS. */
-       uint64_t outputs_written = vs->outputs_written;
+       uint64_t outputs_written = vs->outputs_written_before_ps;
         uint64_t inputs_read = 0;
  
-       /* ignore POSITION, PSIZE */
-       outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0) |
-                            (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0))));
+       /* Ignore outputs that are not passed from VS to PS. */
+       outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0, true)) |
+                            (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0, true)) |
+                            (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_CLIPVERTEX, 0, true)));
  
         if (!ps_disabled) {
                 inputs_read = ps->inputs_read;
@@ -1391,67 +1416,63 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
                         key->part.ps.epilog.color_is_int10 &= sel->info.colors_written;
                 }
  
-               if (rs) {
-                       bool is_poly = (sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES &&
-                                       sctx->current_rast_prim <= PIPE_PRIM_POLYGON) ||
-                                      sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES_ADJACENCY;
-                       bool is_line = !is_poly && sctx->current_rast_prim != PIPE_PRIM_POINTS;
+               bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim);
+               bool is_line = util_prim_is_lines(sctx->current_rast_prim);
  
-                       key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
-                       key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read;
+               key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
+               key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read;
  
-                       if (sctx->queued.named.blend) {
-                               key->part.ps.epilog.alpha_to_one = sctx->queued.named.blend->alpha_to_one &&
-                                                             rs->multisample_enable;
-                       }
+               if (sctx->queued.named.blend) {
+                       key->part.ps.epilog.alpha_to_one = sctx->queued.named.blend->alpha_to_one &&
+                                                          rs->multisample_enable;
+               }
  
-                       key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
-                       key->part.ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
-                                                             (is_line && rs->line_smooth)) &&
-                                                            sctx->framebuffer.nr_samples <= 1;
-                       key->part.ps.epilog.clamp_color = rs->clamp_fragment_color;
+               key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
+               key->part.ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
+                                                          (is_line && rs->line_smooth)) &&
+                                                         sctx->framebuffer.nr_samples <= 1;
+               key->part.ps.epilog.clamp_color = rs->clamp_fragment_color;
  
-                       if (sctx->ps_iter_samples > 1 &&
-                           sel->info.reads_samplemask) {
-                               key->part.ps.prolog.samplemask_log_ps_iter =
-                                       util_logbase2(sctx->ps_iter_samples);
-                       }
+               if (sctx->ps_iter_samples > 1 &&
+                   sel->info.reads_samplemask) {
+                       key->part.ps.prolog.samplemask_log_ps_iter =
+                               util_logbase2(sctx->ps_iter_samples);
+               }
  
-                       if (rs->force_persample_interp &&
-                           rs->multisample_enable &&
-                           sctx->framebuffer.nr_samples > 1 &&
-                           sctx->ps_iter_samples > 1) {
-                               key->part.ps.prolog.force_persp_sample_interp =
-                                       sel->info.uses_persp_center ||
-                                       sel->info.uses_persp_centroid;
-
-                               key->part.ps.prolog.force_linear_sample_interp =
-                                       sel->info.uses_linear_center ||
-                                       sel->info.uses_linear_centroid;
-                       } else if (rs->multisample_enable &&
-                                  sctx->framebuffer.nr_samples > 1) {
-                               key->part.ps.prolog.bc_optimize_for_persp =
-                                       sel->info.uses_persp_center &&
-                                       sel->info.uses_persp_centroid;
-                               key->part.ps.prolog.bc_optimize_for_linear =
-                                       sel->info.uses_linear_center &&
-                                       sel->info.uses_linear_centroid;
-                       } else {
-                               /* Make sure SPI doesn't compute more than 1 pair
-                                * of (i,j), which is the optimization here. */
-                               key->part.ps.prolog.force_persp_center_interp =
-                                       sel->info.uses_persp_center +
-                                       sel->info.uses_persp_centroid +
-                                       sel->info.uses_persp_sample > 1;
-
-                               key->part.ps.prolog.force_linear_center_interp =
-                                       sel->info.uses_linear_center +
-                                       sel->info.uses_linear_centroid +
-                                       sel->info.uses_linear_sample > 1;
-
-                               if (sel->info.opcode_count[TGSI_OPCODE_INTERP_SAMPLE])
-                                       key->mono.u.ps.interpolate_at_sample_force_center = 1;
-                       }
+               if (rs->force_persample_interp &&
+                   rs->multisample_enable &&
+                   sctx->framebuffer.nr_samples > 1 &&
+                   sctx->ps_iter_samples > 1) {
+                       key->part.ps.prolog.force_persp_sample_interp =
+                               sel->info.uses_persp_center ||
+                               sel->info.uses_persp_centroid;
+
+                       key->part.ps.prolog.force_linear_sample_interp =
+                               sel->info.uses_linear_center ||
+                               sel->info.uses_linear_centroid;
+               } else if (rs->multisample_enable &&
+                          sctx->framebuffer.nr_samples > 1) {
+                       key->part.ps.prolog.bc_optimize_for_persp =
+                               sel->info.uses_persp_center &&
+                               sel->info.uses_persp_centroid;
+                       key->part.ps.prolog.bc_optimize_for_linear =
+                               sel->info.uses_linear_center &&
+                               sel->info.uses_linear_centroid;
+               } else {
+                       /* Make sure SPI doesn't compute more than 1 pair
+                        * of (i,j), which is the optimization here. */
+                       key->part.ps.prolog.force_persp_center_interp =
+                               sel->info.uses_persp_center +
+                               sel->info.uses_persp_centroid +
+                               sel->info.uses_persp_sample > 1;
+
+                       key->part.ps.prolog.force_linear_center_interp =
+                               sel->info.uses_linear_center +
+                               sel->info.uses_linear_centroid +
+                               sel->info.uses_linear_sample > 1;
+
+                       if (sel->info.opcode_count[TGSI_OPCODE_INTERP_SAMPLE])
+                               key->mono.u.ps.interpolate_at_sample_force_center = 1;
                 }
  
                 key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
@@ -1488,7 +1509,7 @@ static void si_build_shader_variant(struct si_shader *shader,
  {
         struct si_shader_selector *sel = shader->selector;
         struct si_screen *sscreen = sel->screen;
-       struct si_compiler *compiler;
+       struct ac_llvm_compiler *compiler;
         struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug;
         int r;
  
@@ -1559,10 +1580,10 @@ static bool si_check_missing_main_part(struct si_screen *sscreen,
                 main_part->selector = sel;
                 main_part->key.as_es = key->as_es;
                 main_part->key.as_ls = key->as_ls;
+               main_part->is_monolithic = false;
  
                 if (si_compile_tgsi_shader(sscreen, compiler_state->compiler,
-                                          main_part, false,
-                                          &compiler_state->debug) != 0) {
+                                          main_part, &compiler_state->debug) != 0) {
                         FREE(main_part);
                         return false;
                 }
@@ -1835,7 +1856,7 @@ static void si_init_shader_selector_async(void *job, int thread_index)
  {
         struct si_shader_selector *sel = (struct si_shader_selector *)job;
         struct si_screen *sscreen = sel->screen;
-       struct si_compiler *compiler;
+       struct ac_llvm_compiler *compiler;
         struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug;
  
         assert(!debug->debug_message || debug->async);
@@ -1861,6 +1882,7 @@ static void si_init_shader_selector_async(void *job, int thread_index)
                 util_queue_fence_init(&shader->ready);
  
                 shader->selector = sel;
+               shader->is_monolithic = false;
                 si_parse_next_shader_property(&sel->info,
                                               sel->so.num_outputs != 0,
                                               &shader->key);
@@ -1879,7 +1901,7 @@ static void si_init_shader_selector_async(void *job, int thread_index)
                         mtx_unlock(&sscreen->shader_cache_mutex);
  
                         /* Compile the shader if it hasn't been loaded from the cache. */
-                       if (si_compile_tgsi_shader(sscreen, compiler, shader, false,
+                       if (si_compile_tgsi_shader(sscreen, compiler, shader,
                                                    debug) != 0) {
                                 FREE(shader);
                                 FREE(ir_binary);
@@ -1927,8 +1949,8 @@ static void si_init_shader_selector_async(void *job, int thread_index)
                                                 break;
                                         /* fall through */
                                 default:
-                                       id = si_shader_io_get_unique_index(name, index);
-                                       sel->outputs_written &= ~(1ull << id);
+                                       id = si_shader_io_get_unique_index(name, index, true);
+                                       sel->outputs_written_before_ps &= ~(1ull << id);
                                         break;
                                 case TGSI_SEMANTIC_POSITION: /* ignore these */
                                 case TGSI_SEMANTIC_PSIZE:
@@ -1952,6 +1974,34 @@ static void si_init_shader_selector_async(void *job, int thread_index)
         }
  }
  
+void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
+                                struct util_queue_fence *ready_fence,
+                                struct si_compiler_ctx_state *compiler_ctx_state,
+                                void *job, util_queue_execute_func execute)
+{ /* Queue job/execute on the screen's shader_compiler_queue; block until it finishes when `wait` is true. */
+       util_queue_fence_init(ready_fence);
+       /* wait is true for a non-async debug_message callback, sctx->is_debug, or si_can_dump_shader() != 0. */
+       struct util_async_debug_callback async_debug;
+       bool wait =
+               (sctx->debug.debug_message && !sctx->debug.async) ||
+               sctx->is_debug ||
+               si_can_dump_shader(sctx->screen, processor);
+       /* async_debug is initialized and used only on the wait path; its base replaces compiler_ctx_state->debug. */
+       if (wait) {
+               u_async_debug_init(&async_debug);
+               compiler_ctx_state->debug = async_debug.base;
+       }
+       /* ready_fence is handed to the queue together with the job. */
+       util_queue_add_job(&sctx->screen->shader_compiler_queue, job,
+                          ready_fence, execute, NULL);
+       /* Blocking path: wait on the fence, then drain async_debug with sctx->debug as target (presumably replays collected messages - confirm). */
+       if (wait) {
+               util_queue_fence_wait(ready_fence);
+               u_async_debug_drain(&async_debug, &sctx->debug);
+               u_async_debug_cleanup(&async_debug);
+       }
+}
+
  /* Return descriptor slot usage masks from the given shader info. */
  void si_get_active_slot_masks(const struct tgsi_shader_info *info,
                               uint32_t *const_and_shader_buffers,
@@ -2101,9 +2151,10 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
                                 /* fall through */
                         default:
                                 sel->outputs_written |=
-                                       1ull << si_shader_io_get_unique_index(name, index);
+                                       1ull << si_shader_io_get_unique_index(name, index, false);
+                               sel->outputs_written_before_ps |=
+                                       1ull << si_shader_io_get_unique_index(name, index, true);
                                 break;
-                       case TGSI_SEMANTIC_CLIPVERTEX: /* ignore these */
                         case TGSI_SEMANTIC_EDGEFLAG:
                                 break;
                         }
@@ -2115,6 +2166,8 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
                  */
                 if (sctx->chip_class >= GFX9)
                         sel->esgs_itemsize += 4;
+
+               assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0);
                 break;
  
         case PIPE_SHADER_FRAGMENT:
@@ -2130,7 +2183,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
                                 /* fall through */
                         default:
                                 sel->inputs_read |=
-                                       1ull << si_shader_io_get_unique_index(name, index);
+                                       1ull << si_shader_io_get_unique_index(name, index, true);
                                 break;
                         case TGSI_SEMANTIC_PCOORD: /* ignore this */
                                 break;
@@ -2217,29 +2270,10 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
         }
  
         (void) mtx_init(&sel->mutex, mtx_plain);
-       util_queue_fence_init(&sel->ready);
-
-       struct util_async_debug_callback async_debug;
-       bool wait =
-               (sctx->debug.debug_message && !sctx->debug.async) ||
-               sctx->is_debug ||
-               si_can_dump_shader(sscreen, sel->info.processor);
-
-       if (wait) {
-               u_async_debug_init(&async_debug);
-               sel->compiler_ctx_state.debug = async_debug.base;
-       }
-
-       util_queue_add_job(&sscreen->shader_compiler_queue, sel,
-                          &sel->ready, si_init_shader_selector_async,
-                          NULL);
-
-       if (wait) {
-               util_queue_fence_wait(&sel->ready);
-               u_async_debug_drain(&async_debug, &sctx->debug);
-               u_async_debug_cleanup(&async_debug);
-       }
  
+       si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready,
+                                   &sel->compiler_ctx_state, sel,
+                                   si_init_shader_selector_async);
         return sel;
  }
  
@@ -2600,27 +2634,25 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx,
  
  static void si_emit_spi_map(struct si_context *sctx)
  {
-       struct radeon_winsys_cs *cs = sctx->gfx_cs;
         struct si_shader *ps = sctx->ps_shader.current;
         struct si_shader *vs = si_get_vs_state(sctx);
         struct tgsi_shader_info *psinfo = ps ? &ps->selector->info : NULL;
         unsigned i, num_interp, num_written = 0, bcol_interp[2];
+       unsigned spi_ps_input_cntl[32];
  
         if (!ps || !ps->selector->info.num_inputs)
                 return;
  
         num_interp = si_get_ps_num_interp(ps);
         assert(num_interp > 0);
-       radeon_set_context_reg_seq(cs, R_028644_SPI_PS_INPUT_CNTL_0, num_interp);
  
         for (i = 0; i < psinfo->num_inputs; i++) {
                 unsigned name = psinfo->input_semantic_name[i];
                 unsigned index = psinfo->input_semantic_index[i];
                 unsigned interpolate = psinfo->input_interpolate[i];
  
-               radeon_emit(cs, si_get_ps_input_cntl(sctx, vs, name, index,
-                                                    interpolate));
-               num_written++;
+               spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, name,
+                                                           index, interpolate);
  
                 if (name == TGSI_SEMANTIC_COLOR) {
                         assert(index < ARRAY_SIZE(bcol_interp));
@@ -2635,12 +2667,19 @@ static void si_emit_spi_map(struct si_context *sctx)
                         if (!(psinfo->colors_read & (0xf << (i * 4))))
                                 continue;
  
-                       radeon_emit(cs, si_get_ps_input_cntl(sctx, vs, bcol,
-                                                            i, bcol_interp[i]));
-                       num_written++;
+                       spi_ps_input_cntl[num_written++] =
+                         si_get_ps_input_cntl(sctx, vs, bcol, i, bcol_interp[i]);
+
                 }
         }
         assert(num_interp == num_written);
+
+       /* R_028644_SPI_PS_INPUT_CNTL_0 */
+       /* Dota 2: Only ~16% of SPI map updates set different values. */
+       /* Talos: Only ~9% of SPI map updates set different values. */
+       radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0,
+                                   spi_ps_input_cntl,
+                                   sctx->tracked_regs.spi_ps_input_cntl, num_interp);
  }
  
  /**
@@ -3330,7 +3369,7 @@ bool si_update_shaders(struct si_context *sctx)
  
  static void si_emit_scratch_state(struct si_context *sctx)
  {
-       struct radeon_winsys_cs *cs = sctx->gfx_cs;
+       struct radeon_cmdbuf *cs = sctx->gfx_cs;
  
         radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
                                sctx->spi_tmpring_size);