X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;ds=sidebyside;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_state_draw.c;h=1d8be49a480752fbf9d6702854ba0d873a3ab1f9;hb=9c92e82b324291b56e701d6ad265ec73b21a654f;hp=a5f5b7f98a0add0b98da7ef6f3988ef14917956d;hpb=5d8359ff4d8c379fdf1a78758f405bb4cdf69459;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index a5f5b7f98a0..1d8be49a480 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -30,6 +30,7 @@ #include "gfx9d.h" #include "util/u_index_modify.h" +#include "util/u_log.h" #include "util/u_upload_mgr.h" #include "util/u_prim.h" @@ -105,7 +106,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id; bool has_primid_instancing_bug = sctx->b.chip_class == SI && sctx->b.screen->info.max_se == 1; - unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL]; + unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; unsigned num_tcs_input_cp = info->vertices_per_patch; unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs; unsigned num_tcs_patch_outputs; @@ -194,7 +195,11 @@ static void si_emit_derived_tess_state(struct si_context *sctx, */ *num_patches = MIN2(*num_patches, 40); - if (sctx->b.chip_class == SI) { + if (sctx->b.chip_class == SI || + /* TODO: fix GFX9 where a threadgroup contains more than 1 wave and + * LS vertices per patch > HS vertices per patch. Piglit: 16in-1out */ + (sctx->b.chip_class == GFX9 && + num_tcs_input_cp > num_tcs_output_cp)) { /* SI bug workaround, related to power management. Limit LS-HS * threadgroups to only one wave. */ @@ -567,7 +572,7 @@ static void si_emit_vs_state(struct si_context *sctx, struct radeon_winsys_cs *cs = sctx->b.gfx.cs; radeon_set_sh_reg(cs, - sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX] + + sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_STATE_BITS * 4, sctx->current_vs_state); @@ -640,7 +645,7 @@ static void si_emit_draw_packets(struct si_context *sctx, { struct pipe_draw_indirect_info *indirect = info->indirect; struct radeon_winsys_cs *cs = sctx->b.gfx.cs; - unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX]; + unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX]; bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off; uint32_t index_max_size = 0; uint64_t index_va = 0; @@ -894,7 +899,8 @@ void si_emit_cache_flush(struct si_context *sctx) /* Necessary for DCC */ if (rctx->chip_class == VI) r600_gfx_write_event_eop(rctx, V_028A90_FLUSH_AND_INV_CB_DATA_TS, - 0, 0, NULL, 0, 0, 0); + 0, EOP_DATA_SEL_DISCARD, NULL, + 0, 0, R600_NOT_QUERY); } if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB) cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | @@ -970,17 +976,30 @@ void si_emit_cache_flush(struct si_context *sctx) cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; } - /* TC | TC_WB = invalidate L2 data - * TC_MD | TC_WB = invalidate L2 metadata (DCC, etc.) - * TC | TC_WB | TC_MD = invalidate L2 data & metadata + /* These are the only allowed combinations. If you need to + * do multiple operations at once, do them separately. + * All operations that invalidate L2 also seem to invalidate + * metadata. Volatile (VOL) and WC flushes are not listed here. 
+ * + * TC | TC_WB = writeback & invalidate L2 & L1 + * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC + * TC_WB | TC_NC = writeback L2 for MTYPE == NC + * TC | TC_NC = invalidate L2 for MTYPE == NC + * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.) + * TCL1 = invalidate L1 */ tc_flags = 0; + if (rctx->flags & SI_CONTEXT_INV_L2_METADATA) { + tc_flags = EVENT_TC_ACTION_ENA | + EVENT_TC_MD_ACTION_ENA; + } + /* Ideally flush TC together with CB/DB. */ if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2) { - tc_flags |= EVENT_TC_ACTION_ENA | - EVENT_TC_WB_ACTION_ENA | - EVENT_TCL1_ACTION_ENA; + /* Writeback and invalidate everything in L2 & L1. */ + tc_flags = EVENT_TC_ACTION_ENA | + EVENT_TC_WB_ACTION_ENA; /* Clear the flags. */ rctx->flags &= ~(SI_CONTEXT_INV_GLOBAL_L2 | @@ -993,10 +1012,10 @@ void si_emit_cache_flush(struct si_context *sctx) va = sctx->wait_mem_scratch->gpu_address; sctx->wait_mem_number++; - r600_gfx_write_event_eop(rctx, cb_db_event, tc_flags, 1, + r600_gfx_write_event_eop(rctx, cb_db_event, tc_flags, + EOP_DATA_SEL_VALUE_32BIT, sctx->wait_mem_scratch, va, - sctx->wait_mem_number - 1, - sctx->wait_mem_number); + sctx->wait_mem_number, R600_NOT_QUERY); r600_gfx_wait_fence(rctx, va, sctx->wait_mem_number, 0xffffffff); } @@ -1141,25 +1160,40 @@ static void si_get_draw_start_count(struct si_context *sctx, } } -void si_ce_pre_draw_synchronization(struct si_context *sctx) +static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info, + unsigned skip_atom_mask) { - if (sctx->ce_need_synchronization) { - radeon_emit(sctx->ce_ib, PKT3(PKT3_INCREMENT_CE_COUNTER, 0, 0)); - radeon_emit(sctx->ce_ib, 1); + /* Emit state atoms. */ + unsigned mask = sctx->dirty_atoms & ~skip_atom_mask; + while (mask) { + struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)]; - radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_WAIT_ON_CE_COUNTER, 0, 0)); - radeon_emit(sctx->b.gfx.cs, 1); + atom->emit(&sctx->b, atom); } -} + sctx->dirty_atoms &= skip_atom_mask; -void si_ce_post_draw_synchronization(struct si_context *sctx) -{ - if (sctx->ce_need_synchronization) { - radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0)); - radeon_emit(sctx->b.gfx.cs, 0); + /* Emit states. */ + mask = sctx->dirty_states; + while (mask) { + unsigned i = u_bit_scan(&mask); + struct si_pm4_state *state = sctx->queued.array[i]; + + if (!state || sctx->emitted.array[i] == state) + continue; - sctx->ce_need_synchronization = false; + si_pm4_emit(sctx, state); + sctx->emitted.array[i] = state; } + sctx->dirty_states = 0; + + /* Emit draw states. */ + unsigned num_patches = 0; + + si_emit_rasterizer_prim_state(sctx); + if (sctx->tes_shader.cso) + si_emit_derived_tess_state(sctx, info, &num_patches); + si_emit_vs_state(sctx, info); + si_emit_draw_registers(sctx, info, num_patches); } void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) @@ -1167,9 +1201,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) struct si_context *sctx = (struct si_context *)ctx; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; struct pipe_resource *indexbuf = info->index.resource; - unsigned mask, dirty_tex_counter; + unsigned dirty_tex_counter; enum pipe_prim_type rast_prim; - unsigned num_patches = 0; unsigned index_size = info->index_size; unsigned index_offset = info->indirect ? 
info->start * index_size : 0; @@ -1251,9 +1284,6 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) if (sctx->do_update_shaders && !si_update_shaders(sctx)) return; - if (!si_upload_graphics_shader_descriptors(sctx)) - return; - if (index_size) { /* Translate or upload, if needed. */ /* 8-bit indices are supported on VI. */ @@ -1329,6 +1359,9 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) si_need_cs_space(sctx); + if (unlikely(sctx->b.log)) + si_log_draw_state(sctx, sctx->b.log); + /* Since we've called r600_context_add_resource_size for vertex buffers, * this must be called after si_need_cs_space, because we must let * need_cs_space flush before we add buffers to the buffer list. @@ -1336,50 +1369,70 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) if (!si_upload_vertex_buffer_descriptors(sctx)) return; - /* GFX9 scissor bug workaround. There is also a more efficient but - * more involved alternative workaround. */ + /* GFX9 scissor bug workaround. This must be done before VPORT scissor + * registers are changed. There is also a more efficient but more + * involved alternative workaround. + */ if (sctx->b.chip_class == GFX9 && - si_is_atom_dirty(sctx, &sctx->b.scissors.atom)) + si_is_atom_dirty(sctx, &sctx->b.scissors.atom)) { sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; + si_emit_cache_flush(sctx); + } + + /* Use optimal packet order based on whether we need to sync the pipeline. */ + if (unlikely(sctx->b.flags & (SI_CONTEXT_FLUSH_AND_INV_CB | + SI_CONTEXT_FLUSH_AND_INV_DB | + SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH))) { + /* If we have to wait for idle, set all states first, so that all + * SET packets are processed in parallel with previous draw calls. + * Then upload descriptors, set shader pointers, and draw, and + * prefetch at the end. This ensures that the time the CUs + * are idle is very short. (there are only SET_SH packets between + * the wait and the draw) + */ + struct r600_atom *shader_pointers = &sctx->shader_pointers.atom; - /* Flush caches before the first state atom, which does L2 prefetches. */ - if (sctx->b.flags) + /* Emit all states except shader pointers. */ + si_emit_all_states(sctx, info, 1 << shader_pointers->id); si_emit_cache_flush(sctx); - /* Emit state atoms. */ - mask = sctx->dirty_atoms; - while (mask) { - struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)]; + /* <-- CUs are idle here. */ + if (!si_upload_graphics_shader_descriptors(sctx)) + return; - atom->emit(&sctx->b, atom); - } - sctx->dirty_atoms = 0; + /* Set shader pointers after descriptors are uploaded. */ + if (si_is_atom_dirty(sctx, shader_pointers)) { + shader_pointers->emit(&sctx->b, NULL); + sctx->dirty_atoms = 0; + } - /* Emit states. */ - mask = sctx->dirty_states; - while (mask) { - unsigned i = u_bit_scan(&mask); - struct si_pm4_state *state = sctx->queued.array[i]; + si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); + /* <-- CUs are busy here. */ - if (!state || sctx->emitted.array[i] == state) - continue; + /* Start prefetches after the draw has been started. Both will run + * in parallel, but starting the draw first is more important. + */ + if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx); + } else { + /* If we don't wait for idle, start prefetches first, then set + * states, and draw at the end. 
+ */ + if (sctx->b.flags) + si_emit_cache_flush(sctx); - si_pm4_emit(sctx, state); - sctx->emitted.array[i] = state; - } - sctx->dirty_states = 0; + if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx); - si_emit_rasterizer_prim_state(sctx); - if (sctx->tes_shader.cso) - si_emit_derived_tess_state(sctx, info, &num_patches); - si_emit_vs_state(sctx, info); - si_emit_draw_registers(sctx, info, num_patches); + if (!si_upload_graphics_shader_descriptors(sctx)) + return; - si_ce_pre_draw_synchronization(sctx); - si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); - si_ce_post_draw_synchronization(sctx); + si_emit_all_states(sctx, info, 0); + si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); + } - if (sctx->trace_buf) + if (unlikely(sctx->current_saved_cs)) si_trace_emit(sctx); /* Workaround for a VGT hang when streamout is enabled. @@ -1391,13 +1444,17 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC; } - sctx->b.num_draw_calls++; - if (sctx->framebuffer.state.nr_cbufs > 1) - sctx->b.num_mrt_draw_calls++; - if (info->primitive_restart) - sctx->b.num_prim_restart_calls++; - if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size)) - sctx->b.num_spill_draw_calls++; + if (unlikely(sctx->decompression_enabled)) { + sctx->b.num_decompress_calls++; + } else { + sctx->b.num_draw_calls++; + if (sctx->framebuffer.state.nr_cbufs > 1) + sctx->b.num_mrt_draw_calls++; + if (info->primitive_restart) + sctx->b.num_prim_restart_calls++; + if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size)) + sctx->b.num_spill_draw_calls++; + } if (index_size && indexbuf != info->index.resource) pipe_resource_reference(&indexbuf, NULL); } @@ -1405,17 +1462,19 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) void si_trace_emit(struct si_context *sctx) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; + uint64_t va = sctx->current_saved_cs->trace_buf->gpu_address; + uint32_t trace_id = ++sctx->current_saved_cs->trace_id; - sctx->trace_id++; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf, - RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME)); - radeon_emit(cs, sctx->trace_buf->gpu_address); - radeon_emit(cs, sctx->trace_buf->gpu_address >> 32); - radeon_emit(cs, sctx->trace_id); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, trace_id); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, AC_ENCODE_TRACE_POINT(sctx->trace_id)); + radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id)); + + if (sctx->b.log) + u_log_flush(sctx->b.log); }