#include "gfx9d.h"
#include "util/u_index_modify.h"
+#include "util/u_log.h"
#include "util/u_upload_mgr.h"
#include "util/u_prim.h"
unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
bool has_primid_instancing_bug = sctx->b.chip_class == SI &&
sctx->b.screen->info.max_se == 1;
- unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
+ unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
unsigned num_tcs_input_cp = info->vertices_per_patch;
unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
unsigned num_tcs_patch_outputs;
*/
*num_patches = MIN2(*num_patches, 40);
- if (sctx->b.chip_class == SI) {
+ if (sctx->b.chip_class == SI ||
+ /* TODO: fix GFX9 where a threadgroup contains more than 1 wave and
+ * LS vertices per patch > HS vertices per patch. Piglit: 16in-1out */
+ (sctx->b.chip_class == GFX9 &&
+ num_tcs_input_cp > num_tcs_output_cp)) {
/* SI bug workaround, related to power management. Limit LS-HS
* threadgroups to only one wave.
*/
struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
radeon_set_sh_reg(cs,
- sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX] +
+ sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] +
SI_SGPR_VS_STATE_BITS * 4,
sctx->current_vs_state);
{
struct pipe_draw_indirect_info *indirect = info->indirect;
struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
- unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
+ unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX];
bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
uint32_t index_max_size = 0;
uint64_t index_va = 0;
/* Necessary for DCC */
if (rctx->chip_class == VI)
r600_gfx_write_event_eop(rctx, V_028A90_FLUSH_AND_INV_CB_DATA_TS,
- 0, 0, NULL, 0, 0, 0);
+ 0, EOP_DATA_SEL_DISCARD, NULL,
+ 0, 0, R600_NOT_QUERY);
}
if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB)
cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
}
- /* TC | TC_WB = invalidate L2 data
- * TC_MD | TC_WB = invalidate L2 metadata (DCC, etc.)
- * TC | TC_WB | TC_MD = invalidate L2 data & metadata
+ /* These are the only allowed combinations. If you need to
+ * do multiple operations at once, do them separately.
+ * All operations that invalidate L2 also seem to invalidate
+ * metadata. Volatile (VOL) and WC flushes are not listed here.
+ *
+ * TC | TC_WB = writeback & invalidate L2 & L1
+ * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
+ * TC_WB | TC_NC = writeback L2 for MTYPE == NC
+ * TC | TC_NC = invalidate L2 for MTYPE == NC
+ * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.)
+ * TCL1 = invalidate L1
*/
tc_flags = 0;
+ if (rctx->flags & SI_CONTEXT_INV_L2_METADATA) {
+ tc_flags = EVENT_TC_ACTION_ENA |
+ EVENT_TC_MD_ACTION_ENA;
+ }
+
/* Ideally flush TC together with CB/DB. */
if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
- tc_flags |= EVENT_TC_ACTION_ENA |
- EVENT_TC_WB_ACTION_ENA |
- EVENT_TCL1_ACTION_ENA;
+ /* Writeback and invalidate everything in L2 & L1. */
+ tc_flags = EVENT_TC_ACTION_ENA |
+ EVENT_TC_WB_ACTION_ENA;
/* Clear the flags. */
rctx->flags &= ~(SI_CONTEXT_INV_GLOBAL_L2 |
va = sctx->wait_mem_scratch->gpu_address;
sctx->wait_mem_number++;
- r600_gfx_write_event_eop(rctx, cb_db_event, tc_flags, 1,
+ r600_gfx_write_event_eop(rctx, cb_db_event, tc_flags,
+ EOP_DATA_SEL_VALUE_32BIT,
sctx->wait_mem_scratch, va,
- sctx->wait_mem_number - 1,
- sctx->wait_mem_number);
+ sctx->wait_mem_number, R600_NOT_QUERY);
r600_gfx_wait_fence(rctx, va, sctx->wait_mem_number, 0xffffffff);
}
}
}
-void si_ce_pre_draw_synchronization(struct si_context *sctx)
+static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
+ unsigned skip_atom_mask)
{
- if (sctx->ce_need_synchronization) {
- radeon_emit(sctx->ce_ib, PKT3(PKT3_INCREMENT_CE_COUNTER, 0, 0));
- radeon_emit(sctx->ce_ib, 1);
+ /* Emit state atoms: every atom that is dirty and whose bit is NOT set
+  * in skip_atom_mask gets its emit() callback invoked. Atoms selected
+  * by skip_atom_mask are left untouched here. */
+ unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
+ while (mask) {
+ struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
- radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_WAIT_ON_CE_COUNTER, 0, 0));
- radeon_emit(sctx->b.gfx.cs, 1);
+ atom->emit(&sctx->b, atom);
}
-}
+ sctx->dirty_atoms &= skip_atom_mask; /* only the skipped atoms may remain dirty */
-void si_ce_post_draw_synchronization(struct si_context *sctx)
-{
- if (sctx->ce_need_synchronization) {
- radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0));
- radeon_emit(sctx->b.gfx.cs, 0);
+ /* Emit states: walk the dirty_states bits and emit each queued PM4
+  * state, skipping slots that are NULL or whose queued state is already
+  * the one recorded in emitted.array. All dirty bits are cleared after. */
+ mask = sctx->dirty_states;
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+ struct si_pm4_state *state = sctx->queued.array[i];
+
+ if (!state || sctx->emitted.array[i] == state)
+ continue;
- sctx->ce_need_synchronization = false;
+ si_pm4_emit(sctx, state);
+ sctx->emitted.array[i] = state;
}
+ sctx->dirty_states = 0;
+
+ /* Emit draw states. num_patches is passed by pointer only to
+  * si_emit_derived_tess_state, which is called only when
+  * sctx->tes_shader.cso is set; otherwise it stays 0 when handed to
+  * si_emit_draw_registers. */
+ unsigned num_patches = 0;
+
+ si_emit_rasterizer_prim_state(sctx);
+ if (sctx->tes_shader.cso)
+ si_emit_derived_tess_state(sctx, info, &num_patches);
+ si_emit_vs_state(sctx, info);
+ si_emit_draw_registers(sctx, info, num_patches);
}
void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
struct si_context *sctx = (struct si_context *)ctx;
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
struct pipe_resource *indexbuf = info->index.resource;
- unsigned mask, dirty_tex_counter;
+ unsigned dirty_tex_counter;
enum pipe_prim_type rast_prim;
- unsigned num_patches = 0;
unsigned index_size = info->index_size;
unsigned index_offset = info->indirect ? info->start * index_size : 0;
if (sctx->do_update_shaders && !si_update_shaders(sctx))
return;
- if (!si_upload_graphics_shader_descriptors(sctx))
- return;
-
if (index_size) {
/* Translate or upload, if needed. */
/* 8-bit indices are supported on VI. */
si_need_cs_space(sctx);
+ if (unlikely(sctx->b.log))
+ si_log_draw_state(sctx, sctx->b.log);
+
/* Since we've called r600_context_add_resource_size for vertex buffers,
* this must be called after si_need_cs_space, because we must let
* need_cs_space flush before we add buffers to the buffer list.
if (!si_upload_vertex_buffer_descriptors(sctx))
return;
- /* GFX9 scissor bug workaround. There is also a more efficient but
- * more involved alternative workaround. */
+ /* GFX9 scissor bug workaround. This must be done before VPORT scissor
+ * registers are changed. There is also a more efficient but more
+ * involved alternative workaround.
+ */
if (sctx->b.chip_class == GFX9 &&
- si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
+ si_is_atom_dirty(sctx, &sctx->b.scissors.atom)) {
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+ si_emit_cache_flush(sctx);
+ }
+
+ /* Use optimal packet order based on whether we need to sync the pipeline. */
+ if (unlikely(sctx->b.flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
+ SI_CONTEXT_FLUSH_AND_INV_DB |
+ SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_CS_PARTIAL_FLUSH))) {
+ /* If we have to wait for idle, set all states first, so that all
+ * SET packets are processed in parallel with previous draw calls.
+ * Then upload descriptors, set shader pointers, and draw, and
+ * prefetch at the end. This ensures that the time the CUs
+ * are idle is very short. (there are only SET_SH packets between
+ * the wait and the draw)
+ */
+ struct r600_atom *shader_pointers = &sctx->shader_pointers.atom;
- /* Flush caches before the first state atom, which does L2 prefetches. */
- if (sctx->b.flags)
+ /* Emit all states except shader pointers. */
+ si_emit_all_states(sctx, info, 1 << shader_pointers->id);
si_emit_cache_flush(sctx);
- /* Emit state atoms. */
- mask = sctx->dirty_atoms;
- while (mask) {
- struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
+ /* <-- CUs are idle here. */
+ if (!si_upload_graphics_shader_descriptors(sctx))
+ return;
- atom->emit(&sctx->b, atom);
- }
- sctx->dirty_atoms = 0;
+ /* Set shader pointers after descriptors are uploaded. */
+ if (si_is_atom_dirty(sctx, shader_pointers)) {
+ shader_pointers->emit(&sctx->b, NULL);
+ sctx->dirty_atoms = 0;
+ }
- /* Emit states. */
- mask = sctx->dirty_states;
- while (mask) {
- unsigned i = u_bit_scan(&mask);
- struct si_pm4_state *state = sctx->queued.array[i];
+ si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
+ /* <-- CUs are busy here. */
- if (!state || sctx->emitted.array[i] == state)
- continue;
+ /* Start prefetches after the draw has been started. Both will run
+ * in parallel, but starting the draw first is more important.
+ */
+ if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
+ cik_emit_prefetch_L2(sctx);
+ } else {
+ /* If we don't wait for idle, start prefetches first, then set
+ * states, and draw at the end.
+ */
+ if (sctx->b.flags)
+ si_emit_cache_flush(sctx);
- si_pm4_emit(sctx, state);
- sctx->emitted.array[i] = state;
- }
- sctx->dirty_states = 0;
+ if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
+ cik_emit_prefetch_L2(sctx);
- si_emit_rasterizer_prim_state(sctx);
- if (sctx->tes_shader.cso)
- si_emit_derived_tess_state(sctx, info, &num_patches);
- si_emit_vs_state(sctx, info);
- si_emit_draw_registers(sctx, info, num_patches);
+ if (!si_upload_graphics_shader_descriptors(sctx))
+ return;
- si_ce_pre_draw_synchronization(sctx);
- si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
- si_ce_post_draw_synchronization(sctx);
+ si_emit_all_states(sctx, info, 0);
+ si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
+ }
- if (sctx->trace_buf)
+ if (unlikely(sctx->current_saved_cs))
si_trace_emit(sctx);
/* Workaround for a VGT hang when streamout is enabled.
sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
}
- sctx->b.num_draw_calls++;
- if (sctx->framebuffer.state.nr_cbufs > 1)
- sctx->b.num_mrt_draw_calls++;
- if (info->primitive_restart)
- sctx->b.num_prim_restart_calls++;
- if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
- sctx->b.num_spill_draw_calls++;
+ if (unlikely(sctx->decompression_enabled)) {
+ sctx->b.num_decompress_calls++;
+ } else {
+ sctx->b.num_draw_calls++;
+ if (sctx->framebuffer.state.nr_cbufs > 1)
+ sctx->b.num_mrt_draw_calls++;
+ if (info->primitive_restart)
+ sctx->b.num_prim_restart_calls++;
+ if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
+ sctx->b.num_spill_draw_calls++;
+ }
if (index_size && indexbuf != info->index.resource)
pipe_resource_reference(&indexbuf, NULL);
}
void si_trace_emit(struct si_context *sctx)
{
struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+ uint64_t va = sctx->current_saved_cs->trace_buf->gpu_address; /* current_saved_cs is dereferenced without a NULL check here */
+ uint32_t trace_id = ++sctx->current_saved_cs->trace_id; /* pre-increment: the new id is stored back and used for both packets below */
- sctx->trace_id++;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf,
- RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
S_370_WR_CONFIRM(1) |
S_370_ENGINE_SEL(V_370_ME));
- radeon_emit(cs, sctx->trace_buf->gpu_address);
- radeon_emit(cs, sctx->trace_buf->gpu_address >> 32);
- radeon_emit(cs, sctx->trace_id);
+ radeon_emit(cs, va); /* presumably the low 32 bits of the address - confirm radeon_emit truncates */
+ radeon_emit(cs, va >> 32); /* high 32 bits of the address */
+ radeon_emit(cs, trace_id); /* payload dword of the WRITE_DATA packet */
radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
- radeon_emit(cs, AC_ENCODE_TRACE_POINT(sctx->trace_id));
+ radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id)); /* same id, encoded as the NOP packet's payload */
+
+ if (sctx->b.log) /* a log context is optional; flush it only when one is set */
+ u_log_flush(sctx->b.log);
}