radv: Track scratch usage across pipelines & command buffers.
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index a61a950de682c0008dabea01c586e9f5d62b0513..e2025b1dd19b549ef1a8ff010e7c28092c8bde72 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -170,10 +170,11 @@ si_write_harvested_raster_configs(struct radv_physical_device *physical_device,
                                       S_030800_INSTANCE_BROADCAST_WRITES(1));
 }
 
-static void
+void
 si_init_compute(struct radv_physical_device *physical_device,
-                struct radeon_winsys_cs *cs)
+                struct radv_cmd_buffer *cmd_buffer)
 {
+       struct radeon_winsys_cs *cs = cmd_buffer->cs;
        radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
        radeon_emit(cs, 0);
        radeon_emit(cs, 0);
@@ -419,7 +420,7 @@ void si_init_config(struct radv_physical_device *physical_device,
        if (physical_device->rad_info.family == CHIP_STONEY)
                radeon_set_context_reg(cs, R_028C40_PA_SC_SHADER_CONTROL, 0);
 
-       si_init_compute(physical_device, cs);
+       si_init_compute(physical_device, cmd_buffer);
 }
 
 static void
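The two hunks above un-static si_init_compute() and hand it the whole radv_cmd_buffer rather than a bare winsys stream, so the compute-queue setup path can emit the same preamble into its own command buffer; the cs is now fetched from the command buffer on entry. The counterpart declaration is not part of this section; as a sketch, assuming it lands in radv_private.h next to the file's other exports:

    void si_init_compute(struct radv_physical_device *physical_device,
                         struct radv_cmd_buffer *cmd_buffer);
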
@@ -479,11 +480,11 @@ si_write_viewport(struct radeon_winsys_cs *cs, int first_vp,
                radeon_emit(cs, fui(translate[2]));
        }
 
+       radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 +
+                                  first_vp * 4 * 2, count * 2);
        for (i = 0; i < count; i++) {
                float zmin = MIN2(viewports[i].minDepth, viewports[i].maxDepth);
                float zmax = MAX2(viewports[i].minDepth, viewports[i].maxDepth);
-               radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 +
-                                          first_vp * 4 * 2, count * 2);
                radeon_emit(cs, fui(zmin));
                radeon_emit(cs, fui(zmax));
        }
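Hoisting radeon_set_context_reg_seq() out of the loop is a packet-encoding fix, not a cleanup: the helper writes one PKT3_SET_CONTEXT_REG header promising count * 2 value dwords, so for count > 1 the old placement emitted count headers each followed by only two floats, and the CP parsed the leftover values as packet headers. A minimal sketch of the corrected pattern (same calls as the hunk, annotated):

    /* One header fronting the whole burst... */
    radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 +
                               first_vp * 4 * 2, count * 2);
    for (i = 0; i < count; i++) {
            /* ...then exactly count * 2 data dwords, ZMIN/ZMAX per viewport. */
            radeon_emit(cs, fui(MIN2(viewports[i].minDepth, viewports[i].maxDepth)));
            radeon_emit(cs, fui(MAX2(viewports[i].minDepth, viewports[i].maxDepth)));
    }
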
@@ -510,8 +511,8 @@ si_write_scissors(struct radeon_winsys_cs *cs, int first,
 uint32_t
 si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer)
 {
-       enum chip_class chip_class = cmd_buffer->device->instance->physicalDevice.rad_info.chip_class;
-       struct radeon_info *info = &cmd_buffer->device->instance->physicalDevice.rad_info;
+       enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
+       struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
        unsigned prim = cmd_buffer->state.pipeline->graphics.prim;
        unsigned primgroup_size = 128; /* recommended without a GS */
        unsigned max_primgroup_in_wave = 2;
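The device->instance->physicalDevice chain giving way to device->physical_device here repeats through most hunks below: the device now holds a direct pointer to its physical device instead of reaching back through the instance. Every rewritten call site follows the shape of this purely illustrative helper (not something the patch adds):

    static inline struct radeon_info *
    radv_rad_info(struct radv_cmd_buffer *cmd_buffer)
    {
            /* hypothetical wrapper; the patch open-codes this chain */
            return &cmd_buffer->device->physical_device->rad_info;
    }
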
@@ -598,8 +599,18 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer)
 void
 si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
 {
-       enum chip_class chip_class = cmd_buffer->device->instance->physicalDevice.rad_info.chip_class;
+       enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
        unsigned cp_coher_cntl = 0;
+       bool is_compute = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE;
+
+       if (is_compute)
+               cmd_buffer->state.flush_bits &= ~(RADV_CMD_FLAG_FLUSH_AND_INV_CB |
+                                                 RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
+                                                 RADV_CMD_FLAG_FLUSH_AND_INV_DB |
+                                                 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META |
+                                                 RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+                                                 RADV_CMD_FLAG_VS_PARTIAL_FLUSH |
+                                                 RADV_CMD_FLAG_VGT_FLUSH);
 
        radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128);
 
@@ -627,7 +638,7 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
                        S_0085F0_CB7_DEST_BASE_ENA(1);
 
                /* Necessary for DCC */
-               if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= VI) {
+               if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) {
                        radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
                        radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_DATA_TS) |
                                                    EVENT_INDEX(5));
@@ -678,7 +689,8 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
        /* Make sure ME is idle (it executes most packets) before continuing.
         * This prevents read-after-write hazards between PFP and ME.
         */
-       if (cp_coher_cntl || (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
+       if ((cp_coher_cntl || (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) &&
+           !radv_cmd_buffer_uses_mec(cmd_buffer)) {
                radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
                radeon_emit(cmd_buffer->cs, 0);
        }
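radv_cmd_buffer_uses_mec() identifies command buffers executed by the MEC, the dedicated compute micro engine on CIK and newer; that engine has no PFP, so PFP_SYNC_ME would be an invalid packet there. The helper lives outside this file; roughly (a sketch matching how the rest of the patch uses it, not quoted from this diff):

    static inline bool
    radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
    {
            /* SI compute queues still go through the gfx CP;
             * only CIK+ compute rings are driven by the MEC. */
            return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
                   cmd_buffer->device->physical_device->rad_info.chip_class >= CIK;
    }

The CP DMA hunks further down guard on queue_family_index == RADV_QUEUE_GENERAL instead, since no compute ring of any generation has a PFP to synchronize with.
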
@@ -687,14 +699,27 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
         * Therefore, it should be last. Done in PFP.
         */
        if (cp_coher_cntl) {
-               /* ACQUIRE_MEM is only required on a compute ring. */
-               radeon_emit(cmd_buffer->cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
-               radeon_emit(cmd_buffer->cs, cp_coher_cntl);   /* CP_COHER_CNTL */
-               radeon_emit(cmd_buffer->cs, 0xffffffff);      /* CP_COHER_SIZE */
-               radeon_emit(cmd_buffer->cs, 0);               /* CP_COHER_BASE */
-               radeon_emit(cmd_buffer->cs, 0x0000000A);      /* POLL_INTERVAL */
+               if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
+                       radeon_emit(cmd_buffer->cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0) |
+                                                   PKT3_SHADER_TYPE_S(1));
+                       radeon_emit(cmd_buffer->cs, cp_coher_cntl);   /* CP_COHER_CNTL */
+                       radeon_emit(cmd_buffer->cs, 0xffffffff);      /* CP_COHER_SIZE */
+                       radeon_emit(cmd_buffer->cs, 0xff);            /* CP_COHER_SIZE_HI */
+                       radeon_emit(cmd_buffer->cs, 0);               /* CP_COHER_BASE */
+                       radeon_emit(cmd_buffer->cs, 0);               /* CP_COHER_BASE_HI */
+                       radeon_emit(cmd_buffer->cs, 0x0000000A);      /* POLL_INTERVAL */
+               } else {
+                       /* ACQUIRE_MEM is only required on a compute ring; SURFACE_SYNC suffices on gfx. */
+                       radeon_emit(cmd_buffer->cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
+                       radeon_emit(cmd_buffer->cs, cp_coher_cntl);   /* CP_COHER_CNTL */
+                       radeon_emit(cmd_buffer->cs, 0xffffffff);      /* CP_COHER_SIZE */
+                       radeon_emit(cmd_buffer->cs, 0);               /* CP_COHER_BASE */
+                       radeon_emit(cmd_buffer->cs, 0x0000000A);      /* POLL_INTERVAL */
+               }
        }
 
+       if (cmd_buffer->state.flush_bits)
+               radv_cmd_buffer_trace_emit(cmd_buffer);
        cmd_buffer->state.flush_bits = 0;
 }
 
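Two notes on the hunk above. ACQUIRE_MEM carries the same coherency fields as SURFACE_SYNC plus 64-bit size/base high dwords, which is why the MEC encoding is two dwords longer; it is also issued with PKT3_SHADER_TYPE_S(1) because it targets a compute ring. And the new radv_cmd_buffer_trace_emit() call records that a flush was actually emitted: it bumps a per-command-buffer id and writes it to a trace buffer so a hang dump shows the last flush that retired. A condensed sketch of that hook (the real implementation lives in radv_cmd_buffer.c and is a no-op without a trace BO; treat the details as approximate):

    if (device->trace_bo) {
            uint64_t va = device->ws->buffer_get_va(device->trace_bo);

            ++cmd_buffer->state.trace_id;
            radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
            radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
                            S_370_WR_CONFIRM(1) |
                            S_370_ENGINE_SEL(V_370_ME));
            radeon_emit(cs, va);                          /* marker address, lo */
            radeon_emit(cs, va >> 32);                    /* marker address, hi */
            radeon_emit(cs, cmd_buffer->state.trace_id);  /* the marker */
    }
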
@@ -731,7 +756,7 @@ static void si_emit_cp_dma_copy_buffer(struct radv_cmd_buffer *cmd_buffer,
 
        radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
 
-       if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= CIK) {
+       if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
                radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
                radeon_emit(cs, sync_flag | sel);       /* CP_SYNC [31] */
                radeon_emit(cs, src_va);                /* SRC_ADDR_LO [31:0] */
@@ -753,10 +778,12 @@ static void si_emit_cp_dma_copy_buffer(struct radv_cmd_buffer *cmd_buffer,
         * indices. If we wanted to execute CP DMA in PFP, this packet
         * should precede it.
         */
-       if (sync_flag) {
+       if (sync_flag && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
                radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
                radeon_emit(cs, 0);
        }
+
+       radv_cmd_buffer_trace_emit(cmd_buffer);
 }
 
 /* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
@@ -775,7 +802,7 @@ static void si_emit_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer,
 
        radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
 
-       if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= CIK) {
+       if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
                radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
                radeon_emit(cs, sync_flag | dst_sel | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */
                radeon_emit(cs, clear_value);           /* DATA [31:0] */
@@ -793,10 +820,11 @@ static void si_emit_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer,
        }
 
        /* See "copy_buffer" for explanation. */
-       if (sync_flag) {
+       if (sync_flag && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
                radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
                radeon_emit(cs, 0);
        }
+       radv_cmd_buffer_trace_emit(cmd_buffer);
 }
 
 static void si_cp_dma_prepare(struct radv_cmd_buffer *cmd_buffer, uint64_t byte_count,
@@ -847,8 +875,8 @@ void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
        uint64_t skipped_size = 0, realign_size = 0;
 
 
-       if (cmd_buffer->device->instance->physicalDevice.rad_info.family <= CHIP_CARRIZO ||
-           cmd_buffer->device->instance->physicalDevice.rad_info.family == CHIP_STONEY) {
+       if (cmd_buffer->device->physical_device->rad_info.family <= CHIP_CARRIZO ||
+           cmd_buffer->device->physical_device->rad_info.family == CHIP_STONEY) {
                /* If the size is not aligned, we must add a dummy copy at the end
                 * just to align the internal counter. Otherwise, the DMA engine
                 * would slow down by an order of magnitude for following copies.
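The workaround body this comment introduces is unchanged by the patch; for reference, the padding it describes is computed roughly as follows (CP_DMA_ALIGNMENT is this file's CP DMA granularity constant, 32 bytes on these parts; the exact name is an assumption since the line falls outside this section):

    if (size % CP_DMA_ALIGNMENT)
            realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);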