{
struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
- /* TODO: this should use the GS output primitive type. */
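+ /* If a GS is bound, the rasterizer sees the GS output primitive
+ * type, not the API primitive type. */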
+ if (sctx->gs_shader)
+ mode = sctx->gs_shader->gs_output_prim;
+
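+ /* Skip the register writes if the rasterized primitive type
+ * hasn't changed. */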
+ if (mode == sctx->last_rast_prim)
+ return;
+
r600_write_context_reg(cs, R_028A0C_PA_SC_LINE_STIPPLE,
sctx->pa_sc_line_stipple |
S_028A0C_AUTO_RESET_CNTL(mode == PIPE_PRIM_LINES ? 1 :
mode == PIPE_PRIM_LINE_STRIP ? 2 : 0));
r600_write_context_reg(cs, R_028814_PA_SU_SC_MODE_CNTL,
sctx->pa_su_sc_mode_cntl |
S_028814_PROVOKING_VTX_LAST(mode == PIPE_PRIM_QUADS ||
mode == PIPE_PRIM_QUAD_STRIP ||
mode == PIPE_PRIM_POLYGON));
+
+ sctx->last_rast_prim = mode;
}
static void si_emit_draw_registers(struct si_context *sctx,
unsigned ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info);
/* Draw state. */
- if (sctx->b.chip_class >= CIK) {
- r600_write_context_reg(cs, R_028B74_VGT_DISPATCH_DRAW_INDEX,
- ib->index_size == 4 ? 0xFC000000 : 0xFC00);
-
- radeon_emit(cs, PKT3(PKT3_DRAW_PREAMBLE, 2, 0));
- radeon_emit(cs, prim); /* VGT_PRIMITIVE_TYPE */
- radeon_emit(cs, ia_multi_vgt_param); /* IA_MULTI_VGT_PARAM */
- radeon_emit(cs, 0); /* VGT_LS_HS_CONFIG */
- } else {
- r600_write_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, prim);
- r600_write_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
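+ /* Emit the primitive type and IA_MULTI_VGT_PARAM only when they
+ * change, to avoid redundant register writes on every draw call. */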
+ if (prim != sctx->last_prim ||
+ ia_multi_vgt_param != sctx->last_multi_vgt_param) {
+ if (sctx->b.chip_class >= CIK) {
+ radeon_emit(cs, PKT3(PKT3_DRAW_PREAMBLE, 2, 0));
+ radeon_emit(cs, prim); /* VGT_PRIMITIVE_TYPE */
+ radeon_emit(cs, ia_multi_vgt_param); /* IA_MULTI_VGT_PARAM */
+ radeon_emit(cs, 0); /* VGT_LS_HS_CONFIG */
+ } else {
+ r600_write_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, prim);
+ r600_write_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
+ }
+ sctx->last_prim = prim;
+ sctx->last_multi_vgt_param = ia_multi_vgt_param;
}
- r600_write_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
+ if (gs_out_prim != sctx->last_gs_out_prim) {
+ r600_write_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
+ sctx->last_gs_out_prim = gs_out_prim;
+ }
/* Primitive restart. */
if (info->primitive_restart != sctx->last_primitive_restart_en) {
}
}
+#define BOTH_ICACHE_KCACHE (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_KCACHE)
+
void si_emit_cache_flush(struct r600_common_context *sctx, struct r600_atom *atom)
{
struct radeon_winsys_cs *cs = sctx->rings.gfx.cs;
uint32_t cp_coher_cntl = 0;
+ uint32_t sqc_caches = 0;
uint32_t compute =
- PKT3_SHADER_TYPE_S(!!(sctx->flags & R600_CONTEXT_FLAG_COMPUTE));
-
- /* XXX SI flushes both ICACHE and KCACHE if either flag is set.
- * XXX CIK shouldn't have this issue. Test CIK before separating the flags
- * XXX to ensure there is no regression. Also find out if there is another
- * XXX way to flush either ICACHE or KCACHE but not both for SI. */
- if (sctx->flags & (R600_CONTEXT_INV_SHADER_CACHE |
- R600_CONTEXT_INV_CONST_CACHE)) {
- cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1) |
- S_0085F0_SH_KCACHE_ACTION_ENA(1);
- }
- if (sctx->flags & (R600_CONTEXT_INV_TEX_CACHE |
- R600_CONTEXT_STREAMOUT_FLUSH)) {
- cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1) |
- S_0085F0_TCL1_ACTION_ENA(1);
+ PKT3_SHADER_TYPE_S(!!(sctx->flags & SI_CONTEXT_FLAG_COMPUTE));
+
+ /* SI has a bug: it always flushes both ICACHE and KCACHE if either
+ * bit is set in CP_COHER_CNTL. Writing the SQC_CACHES register instead
+ * invalidates the two caches independently, so use it when exactly one
+ * of the two flags is set. */
+ if (sctx->chip_class == SI &&
+ sctx->flags & BOTH_ICACHE_KCACHE &&
+ (sctx->flags & BOTH_ICACHE_KCACHE) != BOTH_ICACHE_KCACHE) {
+ sqc_caches =
+ S_008C08_INST_INVALIDATE(!!(sctx->flags & SI_CONTEXT_INV_ICACHE)) |
+ S_008C08_DATA_INVALIDATE(!!(sctx->flags & SI_CONTEXT_INV_KCACHE));
+ } else {
+ if (sctx->flags & SI_CONTEXT_INV_ICACHE)
+ cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
+ if (sctx->flags & SI_CONTEXT_INV_KCACHE)
+ cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
}
- if (sctx->flags & R600_CONTEXT_FLUSH_AND_INV_CB) {
+
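+ /* TCL1 is the L1 texture cache; TC is the L2 texture cache. */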
+ if (sctx->flags & SI_CONTEXT_INV_TC_L1)
+ cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
+ if (sctx->flags & SI_CONTEXT_INV_TC_L2)
+ cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
+
+ if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
S_0085F0_CB0_DEST_BASE_ENA(1) |
S_0085F0_CB1_DEST_BASE_ENA(1) |
S_0085F0_CB6_DEST_BASE_ENA(1) |
S_0085F0_CB7_DEST_BASE_ENA(1);
}
- if (sctx->flags & R600_CONTEXT_FLUSH_AND_INV_DB) {
+ if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
S_0085F0_DB_DEST_BASE_ENA(1);
}
- if (cp_coher_cntl) {
- if (sctx->chip_class >= CIK) {
- radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0) | compute);
- radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
- radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cs, 0xff); /* CP_COHER_SIZE_HI */
- radeon_emit(cs, 0); /* CP_COHER_BASE */
- radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
- radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
- } else {
- radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0) | compute);
- radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
- radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cs, 0); /* CP_COHER_BASE */
- radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
- }
- }
-
- if (sctx->flags & R600_CONTEXT_FLUSH_AND_INV_CB_META) {
+ if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB_META) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
}
- if (sctx->flags & R600_CONTEXT_FLUSH_AND_INV_DB_META) {
+ if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB_META) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
}
- if (sctx->flags & R600_CONTEXT_FLUSH_WITH_INV_L2) {
+ if (sctx->flags & SI_CONTEXT_FLUSH_WITH_INV_L2) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH) | EVENT_INDEX(7) |
EVENT_WRITE_INV_L2);
}
- if (sctx->flags & (R600_CONTEXT_WAIT_3D_IDLE |
- R600_CONTEXT_PS_PARTIAL_FLUSH)) {
+ /* FLUSH_AND_INV events must be emitted before PS_PARTIAL_FLUSH.
+ * Otherwise, clearing CMASK (CB meta) with CP DMA isn't reliable.
+ *
+ * The likely reason is that FLUSH_AND_INV is only queued, and it is
+ * PS_PARTIAL_FLUSH that actually waits for it to complete.
+ */
+ if (sctx->flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
- } else if (sctx->flags & R600_CONTEXT_STREAMOUT_FLUSH) {
- /* Needed if streamout buffers are going to be used as a source. */
+ } else if (sctx->flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
}
-
- if (sctx->flags & R600_CONTEXT_CS_PARTIAL_FLUSH) {
+ if (sctx->flags & SI_CONTEXT_CS_PARTIAL_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
}
-
- if (sctx->flags & R600_CONTEXT_VGT_FLUSH) {
+ if (sctx->flags & SI_CONTEXT_VGT_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
}
- if (sctx->flags & R600_CONTEXT_VGT_STREAMOUT_SYNC) {
+ if (sctx->flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute);
radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
}
+ /* SURFACE_SYNC must be emitted last, after the partial flushes:
+ * it appears to flush caches immediately, without waiting for any
+ * engine to go idle, so the waits above must happen first.
+ */
+ if (sqc_caches) {
+ r600_write_config_reg(cs, R_008C08_SQC_CACHES, sqc_caches);
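+ /* r600_write_config_reg emits 3 dwords (PKT3 header, register
+ * offset, value), so cs->cdw-3 points at the header. */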
+ cs->buf[cs->cdw-3] |= compute; /* set the compute bit in the header */
+ }
+ if (cp_coher_cntl) {
+ if (sctx->chip_class >= CIK) {
+ radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0) | compute);
+ radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
+ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+ radeon_emit(cs, 0xff); /* CP_COHER_SIZE_HI */
+ radeon_emit(cs, 0); /* CP_COHER_BASE */
+ radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
+ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+ } else {
+ radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0) | compute);
+ radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
+ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+ radeon_emit(cs, 0); /* CP_COHER_BASE */
+ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+ }
+ }
+
sctx->flags = 0;
}
-const struct r600_atom si_atom_cache_flush = { si_emit_cache_flush, 21 }; /* number of CS dwords */
+const struct r600_atom si_atom_cache_flush = { si_emit_cache_flush, 24 }; /* number of CS dwords */
static void si_get_draw_start_count(struct si_context *sctx,
const struct pipe_draw_info *info,
}
}
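+ /* The index buffer was last written through TC L2 (e.g. by CP DMA);
+ * flush it so the draw reads the up-to-date data. */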
+ if (info->indexed && r600_resource(ib.buffer)->TC_L2_dirty) {
+ sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
+ r600_resource(ib.buffer)->TC_L2_dirty = false;
+ }
+
/* Check flush flags. */
if (sctx->b.flags)
sctx->atoms.s.cache_flush->dirty = true;
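+ /* The scratch buffer (or its required size) changed: re-emit
+ * SPI_TMPRING_SIZE and add a relocation for the new buffer. */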
+ if (sctx->emit_scratch_reloc) {
+ struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ r600_write_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
+ sctx->spi_tmpring_size);
+
+ if (sctx->scratch_buffer) {
+ r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+ sctx->scratch_buffer, RADEON_USAGE_READWRITE,
+ RADEON_PRIO_SHADER_RESOURCE_RW);
+ }
+ sctx->emit_scratch_reloc = false;
+ }
+
si_need_cs_space(sctx, 0, TRUE);
/* Emit states. */
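+ /* Workaround for a VGT hang on Hawaii when streamout is enabled. */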
if (sctx->b.family == CHIP_HAWAII &&
(sctx->b.streamout.streamout_enabled ||
sctx->b.streamout.prims_gen_query_enabled)) {
- sctx->b.flags |= R600_CONTEXT_VGT_STREAMOUT_SYNC;
+ sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
}
/* Set the depth buffer as dirty. */