X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;ds=sidebyside;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_state_draw.c;h=1d8be49a480752fbf9d6702854ba0d873a3ab1f9;hb=9c92e82b324291b56e701d6ad265ec73b21a654f;hp=a5f5b7f98a0add0b98da7ef6f3988ef14917956d;hpb=5d8359ff4d8c379fdf1a78758f405bb4cdf69459;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index a5f5b7f98a0..1d8be49a480 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -30,6 +30,7 @@ #include "gfx9d.h" #include "util/u_index_modify.h" +#include "util/u_log.h" #include "util/u_upload_mgr.h" #include "util/u_prim.h" @@ -105,7 +106,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id; bool has_primid_instancing_bug = sctx->b.chip_class == SI && sctx->b.screen->info.max_se == 1; - unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL]; + unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; unsigned num_tcs_input_cp = info->vertices_per_patch; unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs; unsigned num_tcs_patch_outputs; @@ -194,7 +195,11 @@ static void si_emit_derived_tess_state(struct si_context *sctx, */ *num_patches = MIN2(*num_patches, 40); - if (sctx->b.chip_class == SI) { + if (sctx->b.chip_class == SI || + /* TODO: fix GFX9 where a threadgroup contains more than 1 wave and + * LS vertices per patch > HS vertices per patch. Piglit: 16in-1out */ + (sctx->b.chip_class == GFX9 && + num_tcs_input_cp > num_tcs_output_cp)) { /* SI bug workaround, related to power management. Limit LS-HS * threadgroups to only one wave. */ @@ -567,7 +572,7 @@ static void si_emit_vs_state(struct si_context *sctx, struct radeon_winsys_cs *cs = sctx->b.gfx.cs; radeon_set_sh_reg(cs, - sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX] + + sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_STATE_BITS * 4, sctx->current_vs_state); @@ -640,7 +645,7 @@ static void si_emit_draw_packets(struct si_context *sctx, { struct pipe_draw_indirect_info *indirect = info->indirect; struct radeon_winsys_cs *cs = sctx->b.gfx.cs; - unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX]; + unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX]; bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off; uint32_t index_max_size = 0; uint64_t index_va = 0; @@ -894,7 +899,8 @@ void si_emit_cache_flush(struct si_context *sctx) /* Necessary for DCC */ if (rctx->chip_class == VI) r600_gfx_write_event_eop(rctx, V_028A90_FLUSH_AND_INV_CB_DATA_TS, - 0, 0, NULL, 0, 0, 0); + 0, EOP_DATA_SEL_DISCARD, NULL, + 0, 0, R600_NOT_QUERY); } if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB) cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | @@ -970,17 +976,30 @@ void si_emit_cache_flush(struct si_context *sctx) cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; } - /* TC | TC_WB = invalidate L2 data - * TC_MD | TC_WB = invalidate L2 metadata (DCC, etc.) - * TC | TC_WB | TC_MD = invalidate L2 data & metadata + /* These are the only allowed combinations. If you need to + * do multiple operations at once, do them separately. + * All operations that invalidate L2 also seem to invalidate + * metadata. Volatile (VOL) and WC flushes are not listed here. 
+ * + * TC | TC_WB = writeback & invalidate L2 & L1 + * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC + * TC_WB | TC_NC = writeback L2 for MTYPE == NC + * TC | TC_NC = invalidate L2 for MTYPE == NC + * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.) + * TCL1 = invalidate L1 */ tc_flags = 0; + if (rctx->flags & SI_CONTEXT_INV_L2_METADATA) { + tc_flags = EVENT_TC_ACTION_ENA | + EVENT_TC_MD_ACTION_ENA; + } + /* Ideally flush TC together with CB/DB. */ if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2) { - tc_flags |= EVENT_TC_ACTION_ENA | - EVENT_TC_WB_ACTION_ENA | - EVENT_TCL1_ACTION_ENA; + /* Writeback and invalidate everything in L2 & L1. */ + tc_flags = EVENT_TC_ACTION_ENA | + EVENT_TC_WB_ACTION_ENA; /* Clear the flags. */ rctx->flags &= ~(SI_CONTEXT_INV_GLOBAL_L2 | @@ -993,10 +1012,10 @@ void si_emit_cache_flush(struct si_context *sctx) va = sctx->wait_mem_scratch->gpu_address; sctx->wait_mem_number++; - r600_gfx_write_event_eop(rctx, cb_db_event, tc_flags, 1, + r600_gfx_write_event_eop(rctx, cb_db_event, tc_flags, + EOP_DATA_SEL_VALUE_32BIT, sctx->wait_mem_scratch, va, - sctx->wait_mem_number - 1, - sctx->wait_mem_number); + sctx->wait_mem_number, R600_NOT_QUERY); r600_gfx_wait_fence(rctx, va, sctx->wait_mem_number, 0xffffffff); } @@ -1141,25 +1160,40 @@ static void si_get_draw_start_count(struct si_context *sctx, } } -void si_ce_pre_draw_synchronization(struct si_context *sctx) +static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info, + unsigned skip_atom_mask) { - if (sctx->ce_need_synchronization) { - radeon_emit(sctx->ce_ib, PKT3(PKT3_INCREMENT_CE_COUNTER, 0, 0)); - radeon_emit(sctx->ce_ib, 1); + /* Emit state atoms. */ + unsigned mask = sctx->dirty_atoms & ~skip_atom_mask; + while (mask) { + struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)]; - radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_WAIT_ON_CE_COUNTER, 0, 0)); - radeon_emit(sctx->b.gfx.cs, 1); + atom->emit(&sctx->b, atom); } -} + sctx->dirty_atoms &= skip_atom_mask; -void si_ce_post_draw_synchronization(struct si_context *sctx) -{ - if (sctx->ce_need_synchronization) { - radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0)); - radeon_emit(sctx->b.gfx.cs, 0); + /* Emit states. */ + mask = sctx->dirty_states; + while (mask) { + unsigned i = u_bit_scan(&mask); + struct si_pm4_state *state = sctx->queued.array[i]; + + if (!state || sctx->emitted.array[i] == state) + continue; - sctx->ce_need_synchronization = false; + si_pm4_emit(sctx, state); + sctx->emitted.array[i] = state; } + sctx->dirty_states = 0; + + /* Emit draw states. */ + unsigned num_patches = 0; + + si_emit_rasterizer_prim_state(sctx); + if (sctx->tes_shader.cso) + si_emit_derived_tess_state(sctx, info, &num_patches); + si_emit_vs_state(sctx, info); + si_emit_draw_registers(sctx, info, num_patches); } void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) @@ -1167,9 +1201,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) struct si_context *sctx = (struct si_context *)ctx; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; struct pipe_resource *indexbuf = info->index.resource; - unsigned mask, dirty_tex_counter; + unsigned dirty_tex_counter; enum pipe_prim_type rast_prim; - unsigned num_patches = 0; unsigned index_size = info->index_size; unsigned index_offset = info->indirect ? 
info->start * index_size : 0; @@ -1251,9 +1284,6 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) if (sctx->do_update_shaders && !si_update_shaders(sctx)) return; - if (!si_upload_graphics_shader_descriptors(sctx)) - return; - if (index_size) { /* Translate or upload, if needed. */ /* 8-bit indices are supported on VI. */ @@ -1329,6 +1359,9 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) si_need_cs_space(sctx); + if (unlikely(sctx->b.log)) + si_log_draw_state(sctx, sctx->b.log); + /* Since we've called r600_context_add_resource_size for vertex buffers, * this must be called after si_need_cs_space, because we must let * need_cs_space flush before we add buffers to the buffer list. @@ -1336,50 +1369,70 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) if (!si_upload_vertex_buffer_descriptors(sctx)) return; - /* GFX9 scissor bug workaround. There is also a more efficient but - * more involved alternative workaround. */ + /* GFX9 scissor bug workaround. This must be done before VPORT scissor + * registers are changed. There is also a more efficient but more + * involved alternative workaround. + */ if (sctx->b.chip_class == GFX9 && - si_is_atom_dirty(sctx, &sctx->b.scissors.atom)) + si_is_atom_dirty(sctx, &sctx->b.scissors.atom)) { sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; + si_emit_cache_flush(sctx); + } + + /* Use optimal packet order based on whether we need to sync the pipeline. */ + if (unlikely(sctx->b.flags & (SI_CONTEXT_FLUSH_AND_INV_CB | + SI_CONTEXT_FLUSH_AND_INV_DB | + SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH))) { + /* If we have to wait for idle, set all states first, so that all + * SET packets are processed in parallel with previous draw calls. + * Then upload descriptors, set shader pointers, and draw, and + * prefetch at the end. This ensures that the time the CUs + * are idle is very short. (there are only SET_SH packets between + * the wait and the draw) + */ + struct r600_atom *shader_pointers = &sctx->shader_pointers.atom; - /* Flush caches before the first state atom, which does L2 prefetches. */ - if (sctx->b.flags) + /* Emit all states except shader pointers. */ + si_emit_all_states(sctx, info, 1 << shader_pointers->id); si_emit_cache_flush(sctx); - /* Emit state atoms. */ - mask = sctx->dirty_atoms; - while (mask) { - struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)]; + /* <-- CUs are idle here. */ + if (!si_upload_graphics_shader_descriptors(sctx)) + return; - atom->emit(&sctx->b, atom); - } - sctx->dirty_atoms = 0; + /* Set shader pointers after descriptors are uploaded. */ + if (si_is_atom_dirty(sctx, shader_pointers)) { + shader_pointers->emit(&sctx->b, NULL); + sctx->dirty_atoms = 0; + } - /* Emit states. */ - mask = sctx->dirty_states; - while (mask) { - unsigned i = u_bit_scan(&mask); - struct si_pm4_state *state = sctx->queued.array[i]; + si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); + /* <-- CUs are busy here. */ - if (!state || sctx->emitted.array[i] == state) - continue; + /* Start prefetches after the draw has been started. Both will run + * in parallel, but starting the draw first is more important. + */ + if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx); + } else { + /* If we don't wait for idle, start prefetches first, then set + * states, and draw at the end. 
+ */ + if (sctx->b.flags) + si_emit_cache_flush(sctx); - si_pm4_emit(sctx, state); - sctx->emitted.array[i] = state; - } - sctx->dirty_states = 0; + if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx); - si_emit_rasterizer_prim_state(sctx); - if (sctx->tes_shader.cso) - si_emit_derived_tess_state(sctx, info, &num_patches); - si_emit_vs_state(sctx, info); - si_emit_draw_registers(sctx, info, num_patches); + if (!si_upload_graphics_shader_descriptors(sctx)) + return; - si_ce_pre_draw_synchronization(sctx); - si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); - si_ce_post_draw_synchronization(sctx); + si_emit_all_states(sctx, info, 0); + si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); + } - if (sctx->trace_buf) + if (unlikely(sctx->current_saved_cs)) si_trace_emit(sctx); /* Workaround for a VGT hang when streamout is enabled. @@ -1391,13 +1444,17 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC; } - sctx->b.num_draw_calls++; - if (sctx->framebuffer.state.nr_cbufs > 1) - sctx->b.num_mrt_draw_calls++; - if (info->primitive_restart) - sctx->b.num_prim_restart_calls++; - if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size)) - sctx->b.num_spill_draw_calls++; + if (unlikely(sctx->decompression_enabled)) { + sctx->b.num_decompress_calls++; + } else { + sctx->b.num_draw_calls++; + if (sctx->framebuffer.state.nr_cbufs > 1) + sctx->b.num_mrt_draw_calls++; + if (info->primitive_restart) + sctx->b.num_prim_restart_calls++; + if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size)) + sctx->b.num_spill_draw_calls++; + } if (index_size && indexbuf != info->index.resource) pipe_resource_reference(&indexbuf, NULL); } @@ -1405,17 +1462,19 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) void si_trace_emit(struct si_context *sctx) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; + uint64_t va = sctx->current_saved_cs->trace_buf->gpu_address; + uint32_t trace_id = ++sctx->current_saved_cs->trace_id; - sctx->trace_id++; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf, - RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME)); - radeon_emit(cs, sctx->trace_buf->gpu_address); - radeon_emit(cs, sctx->trace_buf->gpu_address >> 32); - radeon_emit(cs, sctx->trace_id); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, trace_id); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, AC_ENCODE_TRACE_POINT(sctx->trace_id)); + radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id)); + + if (sctx->b.log) + u_log_flush(sctx->b.log); }