radeonsi: rewrite late alloc VS limit computation
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index d13c8b7086fa11172b1d7c34ee26737852eecfb8..1d8be49a480752fbf9d6702854ba0d873a3ab1f9 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -30,6 +30,7 @@
 #include "gfx9d.h"
 
 #include "util/u_index_modify.h"
+#include "util/u_log.h"
 #include "util/u_upload_mgr.h"
 #include "util/u_prim.h"
 
@@ -105,7 +106,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
        unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
        bool has_primid_instancing_bug = sctx->b.chip_class == SI &&
                                         sctx->b.screen->info.max_se == 1;
-       unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
+       unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
        unsigned num_tcs_input_cp = info->vertices_per_patch;
        unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
        unsigned num_tcs_patch_outputs;
@@ -194,7 +195,11 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
         */
        *num_patches = MIN2(*num_patches, 40);
 
-       if (sctx->b.chip_class == SI) {
+       if (sctx->b.chip_class == SI ||
+           /* TODO: fix GFX9 where a threadgroup contains more than 1 wave and
+            * LS vertices per patch > HS vertices per patch. Piglit: 16in-1out */
+           (sctx->b.chip_class == GFX9 &&
+            num_tcs_input_cp > num_tcs_output_cp)) {
                /* SI bug workaround, related to power management. Limit LS-HS
                 * threadgroups to only one wave.
                 */
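
Note: the clamp this workaround applies sits in the unchanged context just below this hunk. A minimal sketch of that computation, assuming 64-thread waves (the exact upstream lines may differ slightly):

    /* One wave holds 64 / max(LS, HS vertices-per-patch) patches. */
    unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
    *num_patches = MIN2(*num_patches, one_wave);
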
@@ -567,7 +572,7 @@ static void si_emit_vs_state(struct si_context *sctx,
                struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
                radeon_set_sh_reg(cs,
-                       sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX] +
+                       sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] +
                        SI_SGPR_VS_STATE_BITS * 4,
                        sctx->current_vs_state);
 
@@ -640,7 +645,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 {
        struct pipe_draw_indirect_info *indirect = info->indirect;
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
-       unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
+       unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX];
        bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
        uint32_t index_max_size = 0;
        uint64_t index_va = 0;
@@ -894,7 +899,8 @@ void si_emit_cache_flush(struct si_context *sctx)
                        /* Necessary for DCC */
                        if (rctx->chip_class == VI)
                                r600_gfx_write_event_eop(rctx, V_028A90_FLUSH_AND_INV_CB_DATA_TS,
-                                                        0, 0, NULL, 0, 0, 0);
+                                                        0, EOP_DATA_SEL_DISCARD, NULL,
+                                                        0, 0, R600_NOT_QUERY);
                }
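
The changed arguments follow the helper's parameter order as it appears at the call sites in this patch; a hedged reading of the signature (parameter names inferred here, not taken from the header):

    void r600_gfx_write_event_eop(struct r600_common_context *ctx,
                                  unsigned event,            /* V_028A90_* event id */
                                  unsigned event_flags,      /* EVENT_TC_* cache bits */
                                  unsigned data_sel,         /* EOP_DATA_SEL_DISCARD or
                                                                EOP_DATA_SEL_VALUE_32BIT */
                                  struct r600_resource *buf, /* fence buffer, NULL when discarding */
                                  uint64_t va,               /* fence GPU address */
                                  uint32_t new_fence,        /* value written at va */
                                  unsigned query_type);      /* R600_NOT_QUERY outside queries */
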
                if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB)
                        cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
@@ -954,9 +960,8 @@ void si_emit_cache_flush(struct si_context *sctx)
         * wait for idle on GFX9. We have to use a TS event.
         */
        if (sctx->b.chip_class >= GFX9 && flush_cb_db) {
-               struct r600_resource *rbuf = NULL;
                uint64_t va;
-               unsigned offset = 0, tc_flags, cb_db_event;
+               unsigned tc_flags, cb_db_event;
 
                /* Set the CB/DB flush event. */
                switch (flush_cb_db) {
@@ -971,24 +976,30 @@ void si_emit_cache_flush(struct si_context *sctx)
                        cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
                }
 
-               /* TC    | TC_WB         = invalidate L2 data
-                * TC_MD | TC_WB         = invalidate L2 metadata
-                * TC    | TC_WB | TC_MD = invalidate L2 data & metadata
+               /* These are the only allowed combinations. If you need to
+                * do multiple operations at once, do them separately.
+                * All operations that invalidate L2 also seem to invalidate
+                * metadata. Volatile (VOL) and WC flushes are not listed here.
                 *
-                * The metadata cache must always be invalidated for coherency
-                * between CB/DB and shaders. (metadata = HTILE, CMASK, DCC)
-                *
-                * TC must be invalidated on GFX9 only if the CB/DB surface is
-                * not pipe-aligned. If the surface is RB-aligned, it might not
-                * strictly be pipe-aligned since RB alignment takes precedence.
+                * TC    | TC_WB         = writeback & invalidate L2 & L1
+                * TC    | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
+                *         TC_WB | TC_NC = writeback L2 for MTYPE == NC
+                * TC            | TC_NC = invalidate L2 for MTYPE == NC
+                * TC    | TC_MD         = writeback & invalidate L2 metadata (DCC, etc.)
+                * TCL1                  = invalidate L1
                 */
-               tc_flags = EVENT_TC_WB_ACTION_ENA |
-                          EVENT_TC_MD_ACTION_ENA;
+               tc_flags = 0;
+
+               if (rctx->flags & SI_CONTEXT_INV_L2_METADATA) {
+                       tc_flags = EVENT_TC_ACTION_ENA |
+                                  EVENT_TC_MD_ACTION_ENA;
+               }
 
                /* Ideally flush TC together with CB/DB. */
                if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
-                       tc_flags |= EVENT_TC_ACTION_ENA |
-                                   EVENT_TCL1_ACTION_ENA;
+                       /* Writeback and invalidate everything in L2 & L1. */
+                       tc_flags = EVENT_TC_ACTION_ENA |
+                                  EVENT_TC_WB_ACTION_ENA;
 
                        /* Clear the flags. */
+                       rctx->flags &= ~(SI_CONTEXT_INV_GLOBAL_L2 |
+                                        SI_CONTEXT_WRITEBACK_GLOBAL_L2 |
+                                        SI_CONTEXT_INV_VMEM_L1);
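
Condensed, the two branches above implement this mapping from context flags to EOP cache bits (a sketch using the same names; the full-L2 case overwrites rather than ORs because, as the comment block notes, invalidating L2 already invalidates metadata):

    unsigned tc_flags = 0;

    if (rctx->flags & SI_CONTEXT_INV_L2_METADATA)
            tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA;
    if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2)
            tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA; /* covers metadata too */
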
@@ -997,14 +1008,15 @@ void si_emit_cache_flush(struct si_context *sctx)
                        sctx->b.num_L2_invalidates++;
                }
 
-               /* Allocate memory for the fence. */
-               u_suballocator_alloc(rctx->allocator_zeroed_memory, 4, 4,
-                                    &offset, (struct pipe_resource**)&rbuf);
-               va = rbuf->gpu_address + offset;
+               /* Do the flush (enqueue the event and wait for it). */
+               va = sctx->wait_mem_scratch->gpu_address;
+               sctx->wait_mem_number++;
 
-               r600_gfx_write_event_eop(rctx, cb_db_event, tc_flags, 1,
-                                        rbuf, va, 0, 1);
-               r600_gfx_wait_fence(rctx, va, 1, 0xffffffff);
+               r600_gfx_write_event_eop(rctx, cb_db_event, tc_flags,
+                                        EOP_DATA_SEL_VALUE_32BIT,
+                                        sctx->wait_mem_scratch, va,
+                                        sctx->wait_mem_number, R600_NOT_QUERY);
+               r600_gfx_wait_fence(rctx, va, sctx->wait_mem_number, 0xffffffff);
        }
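
The new code replaces a per-flush suballocation with one persistent 4-byte scratch buffer and a monotonically increasing fence value. A minimal sketch of the release/wait pairing, assuming wait_mem_scratch is allocated once at context creation:

    uint64_t va = sctx->wait_mem_scratch->gpu_address;
    uint32_t fence = ++sctx->wait_mem_number; /* never reused, so a stale value
                                               * cannot satisfy the wait */

    /* Release: the GPU writes "fence" to va after the TS event and the
     * tc_flags cache operations retire. */
    r600_gfx_write_event_eop(rctx, cb_db_event, tc_flags, EOP_DATA_SEL_VALUE_32BIT,
                             sctx->wait_mem_scratch, va, fence, R600_NOT_QUERY);
    /* Wait: stall the CP until mem[va] == fence (compare mask 0xffffffff). */
    r600_gfx_wait_fence(rctx, va, fence, 0xffffffff);
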
 
        /* Make sure ME is idle (it executes most packets) before continuing.
@@ -1148,25 +1160,40 @@ static void si_get_draw_start_count(struct si_context *sctx,
        }
 }
 
-void si_ce_pre_draw_synchronization(struct si_context *sctx)
+static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
+                              unsigned skip_atom_mask)
 {
-       if (sctx->ce_need_synchronization) {
-               radeon_emit(sctx->ce_ib, PKT3(PKT3_INCREMENT_CE_COUNTER, 0, 0));
-               radeon_emit(sctx->ce_ib, 1);
+       /* Emit state atoms. */
+       unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
+       while (mask) {
+               struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
 
-               radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_WAIT_ON_CE_COUNTER, 0, 0));
-               radeon_emit(sctx->b.gfx.cs, 1);
+               atom->emit(&sctx->b, atom);
        }
-}
+       sctx->dirty_atoms &= skip_atom_mask;
 
-void si_ce_post_draw_synchronization(struct si_context *sctx)
-{
-       if (sctx->ce_need_synchronization) {
-               radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0));
-               radeon_emit(sctx->b.gfx.cs, 0);
+       /* Emit states. */
+       mask = sctx->dirty_states;
+       while (mask) {
+               unsigned i = u_bit_scan(&mask);
+               struct si_pm4_state *state = sctx->queued.array[i];
+
+               if (!state || sctx->emitted.array[i] == state)
+                       continue;
 
-               sctx->ce_need_synchronization = false;
+               si_pm4_emit(sctx, state);
+               sctx->emitted.array[i] = state;
        }
+       sctx->dirty_states = 0;
+
+       /* Emit draw states. */
+       unsigned num_patches = 0;
+
+       si_emit_rasterizer_prim_state(sctx);
+       if (sctx->tes_shader.cso)
+               si_emit_derived_tess_state(sctx, info, &num_patches);
+       si_emit_vs_state(sctx, info);
+       si_emit_draw_registers(sctx, info, num_patches);
 }
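
si_emit_all_states walks its dirty masks with u_bit_scan, which clears the lowest set bit and returns its index, so every dirty atom and state is visited exactly once. A toy example of the idiom:

    unsigned mask = 0xA;                    /* bits 1 and 3 set */
    while (mask) {
            unsigned i = u_bit_scan(&mask); /* yields 1, then 3; mask ends up 0 */
            /* emit atom/state i */
    }
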
 
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
@@ -1174,9 +1201,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
        struct si_context *sctx = (struct si_context *)ctx;
        struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
        struct pipe_resource *indexbuf = info->index.resource;
-       unsigned mask, dirty_tex_counter;
+       unsigned dirty_tex_counter;
        enum pipe_prim_type rast_prim;
-       unsigned num_patches = 0;
        unsigned index_size = info->index_size;
        unsigned index_offset = info->indirect ? info->start * index_size : 0;
 
@@ -1214,7 +1240,6 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                sctx->framebuffer.dirty_cbufs |=
                        ((1 << sctx->framebuffer.state.nr_cbufs) - 1);
                sctx->framebuffer.dirty_zsbuf = true;
-               sctx->framebuffer.do_update_surf_dirtiness = true;
                si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
                si_update_all_texture_descriptors(sctx);
        }
@@ -1259,9 +1284,6 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
        if (sctx->do_update_shaders && !si_update_shaders(sctx))
                return;
 
-       if (!si_upload_graphics_shader_descriptors(sctx))
-               return;
-
        if (index_size) {
                /* Translate or upload, if needed. */
                /* 8-bit indices are supported on VI. */
@@ -1320,20 +1342,26 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                /* Add the buffer size for memory checking in need_cs_space. */
                r600_context_add_resource_size(ctx, indirect->buffer);
 
-               if (r600_resource(indirect->buffer)->TC_L2_dirty) {
-                       sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
-                       r600_resource(indirect->buffer)->TC_L2_dirty = false;
-               }
+               /* Indirect buffers use TC L2 on GFX9, but not older hw. */
+               if (sctx->b.chip_class <= VI) {
+                       if (r600_resource(indirect->buffer)->TC_L2_dirty) {
+                               sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+                               r600_resource(indirect->buffer)->TC_L2_dirty = false;
+                       }
 
-               if (indirect->indirect_draw_count &&
-                   r600_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
-                       sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
-                       r600_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
+                       if (indirect->indirect_draw_count &&
+                           r600_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
+                               sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+                               r600_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
+                       }
                }
        }
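
The chip_class check encodes a coherency rule: shaders write through TC (L2), and on SI through VI the CP fetches indirect draw arguments without going through L2, so dirty buffers must be written back first; on GFX9 the fetch itself goes through TC L2. In sketch form, under those assumptions:

    struct r600_resource *rbuf = r600_resource(indirect->buffer);

    if (sctx->b.chip_class <= VI && rbuf->TC_L2_dirty) {
            /* Writeback only; the CP read needs the data in memory,
             * not an invalidated cache. */
            sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
            rbuf->TC_L2_dirty = false;
    }
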
 
        si_need_cs_space(sctx);
 
+       if (unlikely(sctx->b.log))
+               si_log_draw_state(sctx, sctx->b.log);
+
        /* Since we've called r600_context_add_resource_size for vertex buffers,
         * this must be called after si_need_cs_space, because we must let
         * need_cs_space flush before we add buffers to the buffer list.
@@ -1341,50 +1369,70 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
        if (!si_upload_vertex_buffer_descriptors(sctx))
                return;
 
-       /* GFX9 scissor bug workaround. There is also a more efficient but
-        * more involved alternative workaround. */
+       /* GFX9 scissor bug workaround. This must be done before VPORT scissor
+        * registers are changed. There is also a more efficient but more
+        * involved alternative workaround.
+        */
        if (sctx->b.chip_class == GFX9 &&
-           si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
+           si_is_atom_dirty(sctx, &sctx->b.scissors.atom)) {
                sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+               si_emit_cache_flush(sctx);
+       }
+
+       /* Use optimal packet order based on whether we need to sync the pipeline. */
+       if (unlikely(sctx->b.flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
+                                     SI_CONTEXT_FLUSH_AND_INV_DB |
+                                     SI_CONTEXT_PS_PARTIAL_FLUSH |
+                                     SI_CONTEXT_CS_PARTIAL_FLUSH))) {
+               /* If we have to wait for idle, set all states first, so that all
+                * SET packets are processed in parallel with previous draw calls.
+                * Then upload descriptors, set shader pointers, draw, and
+                * prefetch at the end. This keeps the time the CUs are idle
+                * as short as possible (only SET_SH packets sit between the
+                * wait and the draw).
+                */
+               struct r600_atom *shader_pointers = &sctx->shader_pointers.atom;
 
-       /* Flush caches before the first state atom, which does L2 prefetches. */
-       if (sctx->b.flags)
+               /* Emit all states except shader pointers. */
+               si_emit_all_states(sctx, info, 1 << shader_pointers->id);
                si_emit_cache_flush(sctx);
 
-       /* Emit state atoms. */
-       mask = sctx->dirty_atoms;
-       while (mask) {
-               struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
+               /* <-- CUs are idle here. */
+               if (!si_upload_graphics_shader_descriptors(sctx))
+                       return;
 
-               atom->emit(&sctx->b, atom);
-       }
-       sctx->dirty_atoms = 0;
+               /* Set shader pointers after descriptors are uploaded. */
+               if (si_is_atom_dirty(sctx, shader_pointers)) {
+                       shader_pointers->emit(&sctx->b, NULL);
+                       sctx->dirty_atoms = 0;
+               }
 
-       /* Emit states. */
-       mask = sctx->dirty_states;
-       while (mask) {
-               unsigned i = u_bit_scan(&mask);
-               struct si_pm4_state *state = sctx->queued.array[i];
+               si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
+               /* <-- CUs are busy here. */
 
-               if (!state || sctx->emitted.array[i] == state)
-                       continue;
+               /* Start prefetches after the draw has been started. Both will run
+                * in parallel, but starting the draw first is more important.
+                */
+               if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
+                       cik_emit_prefetch_L2(sctx);
+       } else {
+               /* If we don't wait for idle, start prefetches first, then set
+                * states, and draw at the end.
+                */
+               if (sctx->b.flags)
+                       si_emit_cache_flush(sctx);
 
-               si_pm4_emit(sctx, state);
-               sctx->emitted.array[i] = state;
-       }
-       sctx->dirty_states = 0;
+               if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
+                       cik_emit_prefetch_L2(sctx);
 
-       si_emit_rasterizer_prim_state(sctx);
-       if (sctx->tes_shader.cso)
-               si_emit_derived_tess_state(sctx, info, &num_patches);
-       si_emit_vs_state(sctx, info);
-       si_emit_draw_registers(sctx, info, num_patches);
+               if (!si_upload_graphics_shader_descriptors(sctx))
+                       return;
 
-       si_ce_pre_draw_synchronization(sctx);
-       si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
-       si_ce_post_draw_synchronization(sctx);
+               si_emit_all_states(sctx, info, 0);
+               si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
+       }
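
The two packet orders, side by side (an illustrative summary of the code above, not additional code):

    /* sync path (must wait for idle):      no-sync path:
     *   emit all states (SET_* packets)      flush caches (if any)
     *   flush caches + wait for idle         prefetch L2
     *   <-- CUs idle -->                     upload descriptors
     *   upload descriptors                   emit all states
     *   emit shader pointers (SET_SH)        emit draw
     *   emit draw
     *   prefetch L2
     *
     * Only SET_SH packets sit between the wait and the draw, so the CUs
     * stay idle for as short a time as possible. */
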
 
-       if (sctx->trace_buf)
+       if (unlikely(sctx->current_saved_cs))
                si_trace_emit(sctx);
 
        /* Workaround for a VGT hang when streamout is enabled.
@@ -1396,41 +1444,17 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
        }
 
-       if (sctx->framebuffer.do_update_surf_dirtiness) {
-               /* Set the depth buffer as dirty. */
-               if (sctx->framebuffer.state.zsbuf) {
-                       struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
-                       struct r600_texture *rtex = (struct r600_texture *)surf->texture;
-
-                       rtex->dirty_level_mask |= 1 << surf->u.tex.level;
-
-                       if (rtex->surface.flags & RADEON_SURF_SBUFFER)
-                               rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
-               }
-               if (sctx->framebuffer.compressed_cb_mask) {
-                       struct pipe_surface *surf;
-                       struct r600_texture *rtex;
-                       unsigned mask = sctx->framebuffer.compressed_cb_mask;
-
-                       do {
-                               unsigned i = u_bit_scan(&mask);
-                               surf = sctx->framebuffer.state.cbufs[i];
-                               rtex = (struct r600_texture*)surf->texture;
-
-                               if (rtex->fmask.size)
-                                       rtex->dirty_level_mask |= 1 << surf->u.tex.level;
-                               if (rtex->dcc_gather_statistics)
-                                       rtex->separate_dcc_dirty = true;
-                       } while (mask);
-               }
-               sctx->framebuffer.do_update_surf_dirtiness = false;
+       if (unlikely(sctx->decompression_enabled)) {
+               sctx->b.num_decompress_calls++;
+       } else {
+               sctx->b.num_draw_calls++;
+               if (sctx->framebuffer.state.nr_cbufs > 1)
+                       sctx->b.num_mrt_draw_calls++;
+               if (info->primitive_restart)
+                       sctx->b.num_prim_restart_calls++;
+               if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
+                       sctx->b.num_spill_draw_calls++;
        }
-
-       sctx->b.num_draw_calls++;
-       if (info->primitive_restart)
-               sctx->b.num_prim_restart_calls++;
-       if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
-               sctx->b.num_spill_draw_calls++;
        if (index_size && indexbuf != info->index.resource)
                pipe_resource_reference(&indexbuf, NULL);
 }
@@ -1438,17 +1462,19 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 void si_trace_emit(struct si_context *sctx)
 {
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+       uint64_t va = sctx->current_saved_cs->trace_buf->gpu_address;
+       uint32_t trace_id = ++sctx->current_saved_cs->trace_id;
 
-       sctx->trace_id++;
-       radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf,
-                             RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
        radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
        radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
                    S_370_WR_CONFIRM(1) |
                    S_370_ENGINE_SEL(V_370_ME));
-       radeon_emit(cs, sctx->trace_buf->gpu_address);
-       radeon_emit(cs, sctx->trace_buf->gpu_address >> 32);
-       radeon_emit(cs, sctx->trace_id);
+       radeon_emit(cs, va);
+       radeon_emit(cs, va >> 32);
+       radeon_emit(cs, trace_id);
        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-       radeon_emit(cs, AC_ENCODE_TRACE_POINT(sctx->trace_id));
+       radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id));
+
+       if (sctx->b.log)
+               u_log_flush(sctx->b.log);
 }