unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
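/* Workaround flag: SI parts with a single shader engine have a hw issue
 * when tessellation uses the primitive ID together with instancing.
 */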
bool has_primid_instancing_bug = sctx->b.chip_class == SI &&
sctx->b.screen->info.max_se == 1;
- unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
+ unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
unsigned num_tcs_input_cp = info->vertices_per_patch;
unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
unsigned num_tcs_patch_outputs;
struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
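/* Write the packed VS state bits into the corresponding VS user-data SGPR. */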
radeon_set_sh_reg(cs,
- sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX] +
+ sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] +
SI_SGPR_VS_STATE_BITS * 4,
sctx->current_vs_state);
{
struct pipe_draw_indirect_info *indirect = info->indirect;
struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
- unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
+ unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX];
bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
uint32_t index_max_size = 0;
uint64_t index_va = 0;
/* Necessary for DCC */
if (rctx->chip_class == VI)
r600_gfx_write_event_eop(rctx, V_028A90_FLUSH_AND_INV_CB_DATA_TS,
- 0, 0, NULL, 0, 0, 0);
+ 0, 0, NULL, 0, 0, 0, 0);
}
if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB)
cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
                 S_0085F0_DB_DEST_BASE_ENA(1);
}
/* TC | TC_WB = invalidate L2 data
- * TC_MD | TC_WB = invalidate L2 metadata
+ * TC_MD | TC_WB = invalidate L2 metadata (DCC, etc.)
* TC | TC_WB | TC_MD = invalidate L2 data & metadata
- *
- * The metadata cache must always be invalidated for coherency
- * between CB/DB and shaders. (metadata = HTILE, CMASK, DCC)
- *
- * TC must be invalidated on GFX9 only if the CB/DB surface is
- * not pipe-aligned. If the surface is RB-aligned, it might not
- * strictly be pipe-aligned since RB alignment takes precendence.
*/
- tc_flags = EVENT_TC_WB_ACTION_ENA |
- EVENT_TC_MD_ACTION_ENA;
+ tc_flags = 0;
/* Ideally flush TC together with CB/DB. */
if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
tc_flags |= EVENT_TC_ACTION_ENA |
+ EVENT_TC_WB_ACTION_ENA |
EVENT_TCL1_ACTION_ENA;
/* Clear the flags. */
r600_gfx_write_event_eop(rctx, cb_db_event, tc_flags, 1,
sctx->wait_mem_scratch, va,
sctx->wait_mem_number - 1,
- sctx->wait_mem_number);
+ sctx->wait_mem_number, 0);
r600_gfx_wait_fence(rctx, va, sctx->wait_mem_number, 0xffffffff);
}
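/* CE/DE synchronization: the constant engine (CE) runs ahead of the draw
 * engine (DE) and uploads descriptors asynchronously, so before a draw that
 * uses freshly uploaded descriptors, the DE must wait until the CE has
 * caught up. The counter packets below implement that handshake.
 */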
{
if (sctx->ce_need_synchronization) {
radeon_emit(sctx->ce_ib, PKT3(PKT3_INCREMENT_CE_COUNTER, 0, 0));
- radeon_emit(sctx->ce_ib, 1);
+ radeon_emit(sctx->ce_ib, 1); /* 1 = increment CE counter */
radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_WAIT_ON_CE_COUNTER, 0, 0));
- radeon_emit(sctx->b.gfx.cs, 1);
+ radeon_emit(sctx->b.gfx.cs, 0); /* 0 = don't flush sL1 conditionally */
}
}
{
if (sctx->ce_need_synchronization) {
radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0));
- radeon_emit(sctx->b.gfx.cs, 0);
+ radeon_emit(sctx->b.gfx.cs, 0); /* unused */
sctx->ce_need_synchronization = false;
}
}
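+/* Emit all dirty state atoms and PM4 states, plus the per-draw derived
+ * state. skip_atom_mask lets the caller defer selected atoms (e.g. the
+ * shader pointers atom) until after descriptors have been uploaded.
+ */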
+static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
+ unsigned skip_atom_mask)
+{
+ /* Emit state atoms. */
+ unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
+ while (mask) {
+ struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
+
+ atom->emit(&sctx->b, atom);
+ }
+ sctx->dirty_atoms &= skip_atom_mask;
+
+ /* Emit states. */
+ mask = sctx->dirty_states;
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+ struct si_pm4_state *state = sctx->queued.array[i];
+
+ if (!state || sctx->emitted.array[i] == state)
+ continue;
+
+ si_pm4_emit(sctx, state);
+ sctx->emitted.array[i] = state;
+ }
+ sctx->dirty_states = 0;
+
+ /* Emit draw states. */
+ unsigned num_patches = 0;
+
+ si_emit_rasterizer_prim_state(sctx);
+ if (sctx->tes_shader.cso)
+ si_emit_derived_tess_state(sctx, info, &num_patches);
+ si_emit_vs_state(sctx, info);
+ si_emit_draw_registers(sctx, info, num_patches);
+}
+
void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
struct pipe_resource *indexbuf = info->index.resource;
- unsigned mask, dirty_tex_counter;
+ unsigned dirty_tex_counter;
enum pipe_prim_type rast_prim;
- unsigned num_patches = 0;
unsigned index_size = info->index_size;
unsigned index_offset = info->indirect ? info->start * index_size : 0;
sctx->framebuffer.dirty_cbufs |=
((1 << sctx->framebuffer.state.nr_cbufs) - 1);
sctx->framebuffer.dirty_zsbuf = true;
- sctx->framebuffer.do_update_surf_dirtiness = true;
si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
si_update_all_texture_descriptors(sctx);
}
if (sctx->do_update_shaders && !si_update_shaders(sctx))
return;
- if (!si_upload_graphics_shader_descriptors(sctx))
- return;
-
if (index_size) {
/* Translate or upload, if needed. */
/* 8-bit indices are supported on VI. */
if (!si_upload_vertex_buffer_descriptors(sctx))
return;
- /* GFX9 scissor bug workaround. There is also a more efficient but
- * more involved alternative workaround. */
+ /* GFX9 scissor bug workaround. This must be done before VPORT scissor
+ * registers are changed. There is also a more efficient but more
+ * involved alternative workaround.
+ */
if (sctx->b.chip_class == GFX9 &&
- si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
+ si_is_atom_dirty(sctx, &sctx->b.scissors.atom)) {
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+ si_emit_cache_flush(sctx);
+ }
- /* Flush caches before the first state atom, which does L2 prefetches. */
- if (sctx->b.flags)
+ /* Use optimal packet order based on whether we need to sync the pipeline. */
+ if (unlikely(sctx->b.flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
+ SI_CONTEXT_FLUSH_AND_INV_DB |
+ SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_CS_PARTIAL_FLUSH))) {
+ /* If we have to wait for idle, set all states first, so that all
+ * SET packets are processed in parallel with previous draw calls.
+ * Then upload descriptors, set shader pointers, and draw, and
+ * prefetch at the end. This ensures that the time the CUs are idle
+ * is very short (there are only SET_SH packets between the wait
+ * and the draw).
+ */
+ struct r600_atom *shader_pointers = &sctx->shader_pointers.atom;
+
+ /* Emit all states except shader pointers. */
+ si_emit_all_states(sctx, info, 1 << shader_pointers->id);
si_emit_cache_flush(sctx);
- /* Emit state atoms. */
- mask = sctx->dirty_atoms;
- while (mask) {
- struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
+ /* <-- CUs are idle here. */
+ if (!si_upload_graphics_shader_descriptors(sctx))
+ return;
- atom->emit(&sctx->b, atom);
- }
- sctx->dirty_atoms = 0;
+ /* Set shader pointers after descriptors are uploaded. */
+ if (si_is_atom_dirty(sctx, shader_pointers)) {
+ shader_pointers->emit(&sctx->b, NULL);
+ sctx->dirty_atoms = 0;
+ }
- /* Emit states. */
- mask = sctx->dirty_states;
- while (mask) {
- unsigned i = u_bit_scan(&mask);
- struct si_pm4_state *state = sctx->queued.array[i];
+ si_ce_pre_draw_synchronization(sctx);
+ si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
+ /* <-- CUs are busy here. */
- if (!state || sctx->emitted.array[i] == state)
- continue;
+ /* Start prefetches after the draw has been started. Both will run
+ * in parallel, but starting the draw first is more important.
+ */
+ if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
+ cik_emit_prefetch_L2(sctx);
+ } else {
+ /* If we don't wait for idle, start prefetches first, then set
+ * states, and draw at the end.
+ */
+ if (sctx->b.flags)
+ si_emit_cache_flush(sctx);
- si_pm4_emit(sctx, state);
- sctx->emitted.array[i] = state;
- }
- sctx->dirty_states = 0;
+ if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
+ cik_emit_prefetch_L2(sctx);
- si_emit_rasterizer_prim_state(sctx);
- if (sctx->tes_shader.cso)
- si_emit_derived_tess_state(sctx, info, &num_patches);
- si_emit_vs_state(sctx, info);
- si_emit_draw_registers(sctx, info, num_patches);
+ if (!si_upload_graphics_shader_descriptors(sctx))
+ return;
+
+ si_emit_all_states(sctx, info, 0);
+ si_ce_pre_draw_synchronization(sctx);
+ si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
+ }
- si_ce_pre_draw_synchronization(sctx);
- si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
si_ce_post_draw_synchronization(sctx);
if (sctx->trace_buf)
sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
}
- if (sctx->framebuffer.do_update_surf_dirtiness) {
- /* Set the depth buffer as dirty. */
- if (sctx->framebuffer.state.zsbuf) {
- struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
- struct r600_texture *rtex = (struct r600_texture *)surf->texture;
-
- rtex->dirty_level_mask |= 1 << surf->u.tex.level;
-
- if (rtex->surface.flags & RADEON_SURF_SBUFFER)
- rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
- }
- if (sctx->framebuffer.compressed_cb_mask) {
- struct pipe_surface *surf;
- struct r600_texture *rtex;
- unsigned mask = sctx->framebuffer.compressed_cb_mask;
-
- do {
- unsigned i = u_bit_scan(&mask);
- surf = sctx->framebuffer.state.cbufs[i];
- rtex = (struct r600_texture*)surf->texture;
-
- if (rtex->fmask.size)
- rtex->dirty_level_mask |= 1 << surf->u.tex.level;
- if (rtex->dcc_gather_statistics)
- rtex->separate_dcc_dirty = true;
- } while (mask);
- }
- sctx->framebuffer.do_update_surf_dirtiness = false;
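+ /* Don't count internal decompression blits as application draw calls. */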
+ if (unlikely(sctx->decompression_enabled)) {
+ sctx->b.num_decompress_calls++;
+ } else {
+ sctx->b.num_draw_calls++;
+ if (sctx->framebuffer.state.nr_cbufs > 1)
+ sctx->b.num_mrt_draw_calls++;
+ if (info->primitive_restart)
+ sctx->b.num_prim_restart_calls++;
+ if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
+ sctx->b.num_spill_draw_calls++;
}
-
- sctx->b.num_draw_calls++;
- if (info->primitive_restart)
- sctx->b.num_prim_restart_calls++;
- if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
- sctx->b.num_spill_draw_calls++;
if (index_size && indexbuf != info->index.resource)
pipe_resource_reference(&indexbuf, NULL);
}
sctx->trace_id++;
radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf,
RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
+
radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
S_370_WR_CONFIRM(1) |
S_370_ENGINE_SEL(V_370_ME));
radeon_emit(cs, sctx->trace_buf->gpu_address);
radeon_emit(cs, sctx->trace_buf->gpu_address >> 32);
radeon_emit(cs, sctx->trace_id);
radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
radeon_emit(cs, AC_ENCODE_TRACE_POINT(sctx->trace_id));
+
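+ /* Also write the trace ID from the CE IB (at offset 4), so constant
+ * engine progress can be tracked separately from the DE. */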
+ if (sctx->ce_ib) {
+ struct radeon_winsys_cs *ce = sctx->ce_ib;
+
+ radeon_emit(ce, PKT3(PKT3_WRITE_DATA, 3, 0));
+ radeon_emit(ce, S_370_DST_SEL(V_370_MEM_ASYNC) |
+ S_370_WR_CONFIRM(1) |
+ S_370_ENGINE_SEL(V_370_CE));
+ radeon_emit(ce, sctx->trace_buf->gpu_address + 4);
+ radeon_emit(ce, (sctx->trace_buf->gpu_address + 4) >> 32);
+ radeon_emit(ce, sctx->trace_id);
+ radeon_emit(ce, PKT3(PKT3_NOP, 0, 0));
+ radeon_emit(ce, AC_ENCODE_TRACE_POINT(sctx->trace_id));
+ }
}