S_030800_INSTANCE_BROADCAST_WRITES(1));
}
-static void
+void
si_init_compute(struct radv_physical_device *physical_device,
- struct radeon_winsys_cs *cs)
+ struct radv_cmd_buffer *cmd_buffer)
{
+ struct radeon_winsys_cs *cs = cmd_buffer->cs;
radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
radeon_emit(cs, 0);
radeon_emit(cs, 0);
if (physical_device->rad_info.family == CHIP_STONEY)
radeon_set_context_reg(cs, R_028C40_PA_SC_SHADER_CONTROL, 0);
- si_init_compute(physical_device, cs);
+ si_init_compute(physical_device, cmd_buffer);
}
static void
radeon_emit(cs, fui(translate[2]));
}
+ radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 +
+ first_vp * 4 * 2, count * 2);
for (i = 0; i < count; i++) {
float zmin = MIN2(viewports[i].minDepth, viewports[i].maxDepth);
float zmax = MAX2(viewports[i].minDepth, viewports[i].maxDepth);
- radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 +
- first_vp * 4 * 2, count * 2);
radeon_emit(cs, fui(zmin));
radeon_emit(cs, fui(zmax));
}
uint32_t
si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer)
{
- enum chip_class chip_class = cmd_buffer->device->instance->physicalDevice.rad_info.chip_class;
- struct radeon_info *info = &cmd_buffer->device->instance->physicalDevice.rad_info;
+ enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
+ struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
unsigned prim = cmd_buffer->state.pipeline->graphics.prim;
unsigned primgroup_size = 128; /* recommended without a GS */
unsigned max_primgroup_in_wave = 2;
void
si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
{
- enum chip_class chip_class = cmd_buffer->device->instance->physicalDevice.rad_info.chip_class;
+ enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
unsigned cp_coher_cntl = 0;
+ bool is_compute = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE;
+
+ if (is_compute)
+ cmd_buffer->state.flush_bits &= ~(RADV_CMD_FLAG_FLUSH_AND_INV_CB |
+ RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
+ RADV_CMD_FLAG_FLUSH_AND_INV_DB |
+ RADV_CMD_FLAG_FLUSH_AND_INV_DB_META |
+ RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+ RADV_CMD_FLAG_VS_PARTIAL_FLUSH |
+ RADV_CMD_FLAG_VGT_FLUSH);
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128);
S_0085F0_CB7_DEST_BASE_ENA(1);
/* Necessary for DCC */
- if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= VI) {
+ if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) {
radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_DATA_TS) |
EVENT_INDEX(5));
/* Make sure ME is idle (it executes most packets) before continuing.
* This prevents read-after-write hazards between PFP and ME.
*/
- if (cp_coher_cntl || (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
+ if ((cp_coher_cntl || (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) &&
+ !radv_cmd_buffer_uses_mec(cmd_buffer)) {
radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cmd_buffer->cs, 0);
}
* Therefore, it should be last. Done in PFP.
*/
if (cp_coher_cntl) {
- /* ACQUIRE_MEM is only required on a compute ring. */
- radeon_emit(cmd_buffer->cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
- radeon_emit(cmd_buffer->cs, cp_coher_cntl); /* CP_COHER_CNTL */
- radeon_emit(cmd_buffer->cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cmd_buffer->cs, 0); /* CP_COHER_BASE */
- radeon_emit(cmd_buffer->cs, 0x0000000A); /* POLL_INTERVAL */
+ if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
+ radeon_emit(cmd_buffer->cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0) |
+ PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cmd_buffer->cs, cp_coher_cntl); /* CP_COHER_CNTL */
+ radeon_emit(cmd_buffer->cs, 0xffffffff); /* CP_COHER_SIZE */
+ radeon_emit(cmd_buffer->cs, 0xff); /* CP_COHER_SIZE_HI */
+ radeon_emit(cmd_buffer->cs, 0); /* CP_COHER_BASE */
+ radeon_emit(cmd_buffer->cs, 0); /* CP_COHER_BASE_HI */
+ radeon_emit(cmd_buffer->cs, 0x0000000A); /* POLL_INTERVAL */
+ } else {
+ /* ACQUIRE_MEM is only required on a compute ring; other rings use SURFACE_SYNC. */
+ radeon_emit(cmd_buffer->cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
+ radeon_emit(cmd_buffer->cs, cp_coher_cntl); /* CP_COHER_CNTL */
+ radeon_emit(cmd_buffer->cs, 0xffffffff); /* CP_COHER_SIZE */
+ radeon_emit(cmd_buffer->cs, 0); /* CP_COHER_BASE */
+ radeon_emit(cmd_buffer->cs, 0x0000000A); /* POLL_INTERVAL */
+ }
}
+ if (cmd_buffer->state.flush_bits)
+ radv_cmd_buffer_trace_emit(cmd_buffer);
cmd_buffer->state.flush_bits = 0;
}
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
- if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= CIK) {
+ if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
radeon_emit(cs, sync_flag | sel); /* CP_SYNC [31] */
radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */
* indices. If we wanted to execute CP DMA in PFP, this packet
* should precede it.
*/
- if (sync_flag) {
+ if (sync_flag && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
}
+
+ radv_cmd_buffer_trace_emit(cmd_buffer);
}
/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
- if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= CIK) {
+ if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
radeon_emit(cs, sync_flag | dst_sel | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */
radeon_emit(cs, clear_value); /* DATA [31:0] */
}
/* See "copy_buffer" for explanation. */
- if (sync_flag) {
+ if (sync_flag && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
}
+ radv_cmd_buffer_trace_emit(cmd_buffer);
}
static void si_cp_dma_prepare(struct radv_cmd_buffer *cmd_buffer, uint64_t byte_count,
uint64_t skipped_size = 0, realign_size = 0;
- if (cmd_buffer->device->instance->physicalDevice.rad_info.family <= CHIP_CARRIZO ||
- cmd_buffer->device->instance->physicalDevice.rad_info.family == CHIP_STONEY) {
+ if (cmd_buffer->device->physical_device->rad_info.family <= CHIP_CARRIZO ||
+ cmd_buffer->device->physical_device->rad_info.family == CHIP_STONEY) {
/* If the size is not aligned, we must add a dummy copy at the end
* just to align the internal counter. Otherwise, the DMA engine
* would slow down by an order of magnitude for following copies.