SMEM and VMEM caches are L0 on gfx10. Ported from RadeonSI.
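This renames the flags to match RadeonSI:

   RADV_CMD_FLAG_INV_SMEM_L1         -> RADV_CMD_FLAG_INV_SCACHE
   RADV_CMD_FLAG_INV_VMEM_L1         -> RADV_CMD_FLAG_INV_VCACHE
   RADV_CMD_FLAG_INV_GLOBAL_L2       -> RADV_CMD_FLAG_INV_L2
   RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2 -> RADV_CMD_FLAG_WB_L2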
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
case VK_ACCESS_SHADER_WRITE_BIT:
case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
- flush_bits |= RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
+ flush_bits |= RADV_CMD_FLAG_WB_L2;
break;
case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
if (flush_CB_meta)
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
break;
case VK_ACCESS_TRANSFER_WRITE_BIT:
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
RADV_CMD_FLAG_FLUSH_AND_INV_DB |
- RADV_CMD_FLAG_INV_GLOBAL_L2;
+ RADV_CMD_FLAG_INV_L2;
if (flush_CB_meta)
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
if (flush_DB_meta)
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
break;
case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
break;
case VK_ACCESS_UNIFORM_READ_BIT:
- flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 | RADV_CMD_FLAG_INV_SMEM_L1;
+ flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
break;
case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
case VK_ACCESS_TRANSFER_READ_BIT:
case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
- flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1 |
- RADV_CMD_FLAG_INV_GLOBAL_L2;
+ flush_bits |= RADV_CMD_FLAG_INV_VCACHE |
+ RADV_CMD_FLAG_INV_L2;
break;
case VK_ACCESS_SHADER_READ_BIT:
- flush_bits |= RADV_CMD_FLAG_INV_VMEM_L1;
+ flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
if (!image_is_coherent)
- flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2;
+ flush_bits |= RADV_CMD_FLAG_INV_L2;
break;
case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
if (flush_CB)
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
if (flush_CB_meta)
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
break;
if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX6)
- cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
+ cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
/* Make sure to sync all pending active queries at the end of
* command buffer.
*/
cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
queue->device->physical_device->rad_info.chip_class >= GFX7,
(queue->queue_family_index == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH)) |
RADV_CMD_FLAG_INV_ICACHE |
- RADV_CMD_FLAG_INV_SMEM_L1 |
- RADV_CMD_FLAG_INV_VMEM_L1 |
- RADV_CMD_FLAG_INV_GLOBAL_L2 |
+ RADV_CMD_FLAG_INV_SCACHE |
+ RADV_CMD_FLAG_INV_VCACHE |
+ RADV_CMD_FLAG_INV_L2 |
RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
} else if (i == 1) {
si_cs_emit_cache_flush(cs,
queue->queue_family_index == RING_COMPUTE &&
queue->device->physical_device->rad_info.chip_class >= GFX7,
RADV_CMD_FLAG_INV_ICACHE |
- RADV_CMD_FLAG_INV_SMEM_L1 |
- RADV_CMD_FLAG_INV_VMEM_L1 |
- RADV_CMD_FLAG_INV_GLOBAL_L2 |
+ RADV_CMD_FLAG_INV_SCACHE |
+ RADV_CMD_FLAG_INV_VCACHE |
+ RADV_CMD_FLAG_INV_L2 |
RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
}
if (size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
fill_buffer_shader(cmd_buffer, bo, offset, size, value);
flush_bits = RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
- RADV_CMD_FLAG_INV_VMEM_L1 |
- RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
+ RADV_CMD_FLAG_INV_VCACHE |
+ RADV_CMD_FLAG_WB_L2;
} else if (size) {
uint64_t va = radv_buffer_get_va(bo);
va += offset;
radv_meta_restore(&saved_state, cmd_buffer);
return RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
- RADV_CMD_FLAG_INV_VMEM_L1 |
- RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
+ RADV_CMD_FLAG_INV_VCACHE |
+ RADV_CMD_FLAG_WB_L2;
}
static uint32_t
radv_meta_restore(&saved_state, cmd_buffer);
state->flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
- RADV_CMD_FLAG_INV_VMEM_L1;
+ RADV_CMD_FLAG_INV_VCACHE;
/* Initialize the DCC metadata as "fully expanded". */
radv_meta_restore(&saved_state, cmd_buffer);
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
- RADV_CMD_FLAG_INV_GLOBAL_L2;
+ RADV_CMD_FLAG_INV_L2;
/* Re-initialize FMASK in fully expanded mode. */
radv_initialize_fmask(cmd_buffer, image, subresourceRange);
}
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
- RADV_CMD_FLAG_INV_VMEM_L1;
+ RADV_CMD_FLAG_INV_VCACHE;
}
void
}
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
- RADV_CMD_FLAG_INV_VMEM_L1;
+ RADV_CMD_FLAG_INV_VCACHE;
if (radv_image_has_htile(dst_image)) {
if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
};
enum radv_cmd_flush_bits {
- RADV_CMD_FLAG_INV_ICACHE = 1 << 0,
- /* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
- RADV_CMD_FLAG_INV_SMEM_L1 = 1 << 1,
- /* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
- RADV_CMD_FLAG_INV_VMEM_L1 = 1 << 2,
- /* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
- RADV_CMD_FLAG_INV_GLOBAL_L2 = 1 << 3,
- /* Same as above, but only writes back and doesn't invalidate */
- RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2 = 1 << 4,
+ /* Instruction cache. */
+ RADV_CMD_FLAG_INV_ICACHE = 1 << 0,
+ /* Scalar L1 cache. */
+ RADV_CMD_FLAG_INV_SCACHE = 1 << 1,
+ /* Vector L1 cache. */
+ RADV_CMD_FLAG_INV_VCACHE = 1 << 2,
+ /* L2 cache + L2 metadata cache writeback & invalidate.
+ * GFX6-8: Used by shaders only. GFX9-10: Used by everything. */
+ RADV_CMD_FLAG_INV_L2 = 1 << 3,
+ /* L2 writeback (write dirty L2 lines to memory for non-L2 clients).
+ * Only used for coherency with non-L2 clients like CB, DB, CP on GFX6-8.
+ * GFX6-7 will do complete invalidation, because the writeback is unsupported. */
+ RADV_CMD_FLAG_WB_L2 = 1 << 4,
/* Framebuffer caches */
- RADV_CMD_FLAG_FLUSH_AND_INV_CB_META = 1 << 5,
- RADV_CMD_FLAG_FLUSH_AND_INV_DB_META = 1 << 6,
- RADV_CMD_FLAG_FLUSH_AND_INV_DB = 1 << 7,
- RADV_CMD_FLAG_FLUSH_AND_INV_CB = 1 << 8,
+ RADV_CMD_FLAG_FLUSH_AND_INV_CB_META = 1 << 5,
+ RADV_CMD_FLAG_FLUSH_AND_INV_DB_META = 1 << 6,
+ RADV_CMD_FLAG_FLUSH_AND_INV_DB = 1 << 7,
+ RADV_CMD_FLAG_FLUSH_AND_INV_CB = 1 << 8,
/* Engine synchronization. */
- RADV_CMD_FLAG_VS_PARTIAL_FLUSH = 1 << 9,
- RADV_CMD_FLAG_PS_PARTIAL_FLUSH = 1 << 10,
- RADV_CMD_FLAG_CS_PARTIAL_FLUSH = 1 << 11,
- RADV_CMD_FLAG_VGT_FLUSH = 1 << 12,
+ RADV_CMD_FLAG_VS_PARTIAL_FLUSH = 1 << 9,
+ RADV_CMD_FLAG_PS_PARTIAL_FLUSH = 1 << 10,
+ RADV_CMD_FLAG_CS_PARTIAL_FLUSH = 1 << 11,
+ RADV_CMD_FLAG_VGT_FLUSH = 1 << 12,
/* Pipeline query controls. */
- RADV_CMD_FLAG_START_PIPELINE_STATS = 1 << 13,
- RADV_CMD_FLAG_STOP_PIPELINE_STATS = 1 << 14,
- RADV_CMD_FLAG_VGT_STREAMOUT_SYNC = 1 << 15,
+ RADV_CMD_FLAG_START_PIPELINE_STATS = 1 << 13,
+ RADV_CMD_FLAG_STOP_PIPELINE_STATS = 1 << 14,
+ RADV_CMD_FLAG_VGT_STREAMOUT_SYNC = 1 << 15,
RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER = (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
RADV_CMD_FLAG_FLUSH_AND_INV_DB |
RADV_CMD_FLAG_FLUSH_AND_INV_DB_META)
};
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
&push_constants);
- cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2 |
- RADV_CMD_FLAG_INV_VMEM_L1;
+ cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 |
+ RADV_CMD_FLAG_INV_VCACHE;
if (flags & VK_QUERY_RESULT_WAIT_BIT)
cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
- RADV_CMD_FLAG_INV_GLOBAL_L2 |
- RADV_CMD_FLAG_INV_VMEM_L1;
+ RADV_CMD_FLAG_INV_L2 |
+ RADV_CMD_FLAG_INV_VCACHE;
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
RADV_CMD_FLAG_FLUSH_AND_INV_DB;
if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
- if (flush_bits & RADV_CMD_FLAG_INV_SMEM_L1)
+ if (flush_bits & RADV_CMD_FLAG_INV_SCACHE)
cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
if (chip_class <= GFX8) {
tc_flags = EVENT_TC_ACTION_ENA |
EVENT_TC_MD_ACTION_ENA;
/* Ideally flush TC together with CB/DB. */
- if (flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) {
+ if (flush_bits & RADV_CMD_FLAG_INV_L2) {
/* Writeback and invalidate everything in L2 & L1. */
tc_flags = EVENT_TC_ACTION_ENA |
EVENT_TC_WB_ACTION_ENA;
/* Clear the flags. */
- flush_bits &= ~(RADV_CMD_FLAG_INV_GLOBAL_L2 |
- RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2 |
- RADV_CMD_FLAG_INV_VMEM_L1);
+ flush_bits &= ~(RADV_CMD_FLAG_INV_L2 |
+ RADV_CMD_FLAG_WB_L2 |
+ RADV_CMD_FLAG_INV_VCACHE);
}
assert(flush_cnt);
(*flush_cnt)++;
/* Make sure ME is idle (it executes most packets) before continuing.
* This prevents read-after-write hazards between PFP and ME.
*/
if ((cp_coher_cntl ||
(flush_bits & (RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
- RADV_CMD_FLAG_INV_VMEM_L1 |
- RADV_CMD_FLAG_INV_GLOBAL_L2 |
- RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) &&
+ RADV_CMD_FLAG_INV_VCACHE |
+ RADV_CMD_FLAG_INV_L2 |
+ RADV_CMD_FLAG_WB_L2))) &&
!is_mec) {
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
}
- if ((flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) ||
- (chip_class <= GFX7 && (flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) {
+ if ((flush_bits & RADV_CMD_FLAG_INV_L2) ||
+ (chip_class <= GFX7 && (flush_bits & RADV_CMD_FLAG_WB_L2))) {
si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
cp_coher_cntl |
S_0085F0_TC_ACTION_ENA(1) |
S_0301F0_TC_WB_ACTION_ENA(chip_class >= GFX8));
cp_coher_cntl = 0;
} else {
- if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) {
+ if(flush_bits & RADV_CMD_FLAG_WB_L2) {
/* WB = write-back
* NC = apply to non-coherent MTYPEs
* (i.e. MTYPE <= 1, which is what we use everywhere)
*
* WB doesn't work without NC.
*/
si_emit_acquire_mem(cs, is_mec,
chip_class >= GFX9,
cp_coher_cntl |
S_0301F0_TC_WB_ACTION_ENA(1) |
S_0301F0_TC_NC_ACTION_ENA(1));
cp_coher_cntl = 0;
}
- if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) {
+ if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) {
si_emit_acquire_mem(cs, is_mec,
chip_class >= GFX9,
cp_coher_cntl |