radv: drop wrong initialization of COMPUTE_RESOURCE_LIMITS
[mesa.git] / src / amd / vulkan / si_cmd_buffer.c
index 0692124bf51653ff3a38dea8c350676144673a5f..2cfa7f4c2c386f376ac633b783c18b41467e4cd9 100644 (file)
@@ -88,9 +88,7 @@ si_emit_compute(struct radv_physical_device *physical_device,
        radeon_emit(cs, 0);
        radeon_emit(cs, 0);
 
-       radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
-                             S_00B854_WAVES_PER_SH(0x3));
-       radeon_emit(cs, 0);
+       radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
        /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
        radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
        radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | S_00B85C_SH1_CU_EN(0xffff));
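
The dropped initialization only worked by accident: the third argument of radeon_set_sh_reg_seq() is a dword count, not a register value, and S_00B854_WAVES_PER_SH(0x3) happens to evaluate to 3, so the packet stayed well-formed while COMPUTE_RESOURCE_LIMITS was silently programmed to 0. The register is better left to the per-pipeline emit path. For reference, a sketch of the helper (modelled on radv_cs.h; asserts omitted):

	static inline void
	radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
	{
		/* 'num' consecutive SH registers follow, one dword each. */
		radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0));
		radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
	}
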
@@ -673,24 +671,41 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
 }
 
 void si_cs_emit_write_event_eop(struct radeon_cmdbuf *cs,
-                               bool predicated,
                                enum chip_class chip_class,
                                bool is_mec,
                                unsigned event, unsigned event_flags,
                                unsigned data_sel,
                                uint64_t va,
                                uint32_t old_fence,
-                               uint32_t new_fence)
+                               uint32_t new_fence,
+                               uint64_t gfx9_eop_bug_va)
 {
        unsigned op = EVENT_TYPE(event) |
                EVENT_INDEX(5) |
                event_flags;
        unsigned is_gfx8_mec = is_mec && chip_class < GFX9;
+       unsigned sel = EOP_DATA_SEL(data_sel);
+
+       /* Wait for write confirmation before writing data, but don't send
+        * an interrupt. */
+       if (data_sel != EOP_DATA_SEL_DISCARD)
+               sel |= EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM);
 
        if (chip_class >= GFX9 || is_gfx8_mec) {
-               radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ? 5 : 6, predicated));
+               /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
+                * counters) must immediately precede every timestamp event to
+                * prevent a GPU hang on GFX9.
+                */
+               if (chip_class == GFX9) {
+                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+                       radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
+                       radeon_emit(cs, gfx9_eop_bug_va);
+                       radeon_emit(cs, gfx9_eop_bug_va >> 32);
+               }
+
+               radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ? 5 : 6, false));
                radeon_emit(cs, op);
-               radeon_emit(cs, EOP_DATA_SEL(data_sel));
+               radeon_emit(cs, sel);
                radeon_emit(cs, va);            /* address lo */
                radeon_emit(cs, va >> 32);      /* address hi */
                radeon_emit(cs, new_fence);     /* immediate data lo */
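
With the reworked signature, callers spell out the EOP data selection and pass the GFX9 workaround address explicitly instead of a raw predication flag. An illustrative fence write under these assumptions (fence_va and fence_value are hypothetical):

	si_cs_emit_write_event_eop(cs, chip_class,
				   radv_cmd_buffer_uses_mec(cmd_buffer),
				   V_028A90_BOTTOM_OF_PIPE_TS, 0,
				   EOP_DATA_SEL_VALUE_32BIT,
				   fence_va, 0, fence_value,
				   cmd_buffer->gfx9_eop_bug_va);
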
@@ -704,18 +719,18 @@ void si_cs_emit_write_event_eop(struct radeon_cmdbuf *cs,
                         * (and optional cache flushes executed) before the timestamp
                         * is written.
                         */
-                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, predicated));
+                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, false));
                        radeon_emit(cs, op);
                        radeon_emit(cs, va);
-                       radeon_emit(cs, ((va >> 32) & 0xffff) | EOP_DATA_SEL(data_sel));
+                       radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
                        radeon_emit(cs, old_fence); /* immediate data */
                        radeon_emit(cs, 0); /* unused */
                }
 
-               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, predicated));
+               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, false));
                radeon_emit(cs, op);
                radeon_emit(cs, va);
-               radeon_emit(cs, ((va >> 32) & 0xffff) | EOP_DATA_SEL(data_sel));
+               radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
                radeon_emit(cs, new_fence); /* immediate data */
                radeon_emit(cs, 0); /* unused */
        }
@@ -723,11 +738,10 @@ void si_cs_emit_write_event_eop(struct radeon_cmdbuf *cs,
 
 void
 si_emit_wait_fence(struct radeon_cmdbuf *cs,
-                  bool predicated,
                   uint64_t va, uint32_t ref,
                   uint32_t mask)
 {
-       radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, predicated));
+       radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, false));
        radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
        radeon_emit(cs, va);
        radeon_emit(cs, va >> 32);
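
For readers unfamiliar with the packet: with WAIT_REG_MEM_EQUAL and MEM_SPACE(1) selected above, the CP busy-polls a dword in memory until it matches. Roughly, in illustrative C (not driver code):

	while ((*(volatile uint32_t *)va & mask) != (ref & mask))
		; /* retried by the CP at the poll interval given in the
		   * packet's final dword */
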
@@ -739,13 +753,12 @@ si_emit_wait_fence(struct radeon_cmdbuf *cs,
 static void
 si_emit_acquire_mem(struct radeon_cmdbuf *cs,
                     bool is_mec,
-                   bool predicated,
                    bool is_gfx9,
                     unsigned cp_coher_cntl)
 {
        if (is_mec || is_gfx9) {
                uint32_t hi_val = is_gfx9 ? 0xffffff : 0xff;
-               radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, predicated) |
+               radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, false) |
                                            PKT3_SHADER_TYPE_S(is_mec));
                radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
                radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
@@ -755,7 +768,7 @@ si_emit_acquire_mem(struct radeon_cmdbuf *cs,
                radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
        } else {
                /* ACQUIRE_MEM is only required on a compute ring. */
-               radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, predicated));
+               radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, false));
                radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
                radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
                radeon_emit(cs, 0);               /* CP_COHER_BASE */
@@ -769,7 +782,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                       uint32_t *flush_cnt,
                       uint64_t flush_va,
                        bool is_mec,
-                       enum radv_cmd_flush_bits flush_bits)
+                       enum radv_cmd_flush_bits flush_bits,
+                      uint64_t gfx9_eop_bug_va)
 {
        unsigned cp_coher_cntl = 0;
        uint32_t flush_cb_db = flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
@@ -795,11 +809,13 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                        /* Necessary for DCC */
                        if (chip_class >= VI) {
                                si_cs_emit_write_event_eop(cs,
-                                                          false,
                                                           chip_class,
                                                           is_mec,
                                                           V_028A90_FLUSH_AND_INV_CB_DATA_TS,
-                                                          0, 0, 0, 0, 0);
+                                                          0,
+                                                          EOP_DATA_SEL_DISCARD,
+                                                          0, 0, 0,
+                                                          gfx9_eop_bug_va);
                        }
                }
                if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
@@ -834,26 +850,9 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
        if (chip_class >= GFX9 && flush_cb_db) {
                unsigned cb_db_event, tc_flags;
 
-#if 0
-               /* This breaks a bunch of:
-                  dEQP-VK.renderpass.dedicated_allocation.formats.d32_sfloat_s8_uint.input*.
-                  use the big hammer always.
-               */
                /* Set the CB/DB flush event. */
-               switch (flush_cb_db) {
-               case RADV_CMD_FLAG_FLUSH_AND_INV_CB:
-                       cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
-                       break;
-               case RADV_CMD_FLAG_FLUSH_AND_INV_DB:
-                       cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
-                       break;
-               default:
-                       /* both CB & DB */
-                       cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
-               }
-#else
                cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
-#endif
+
                /* These are the only allowed combinations. If you need to
                 * do multiple operations at once, do them separately.
                 * All operations that invalidate L2 also seem to invalidate
@@ -884,9 +883,11 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                assert(flush_cnt);
                uint32_t old_fence = (*flush_cnt)++;
 
-               si_cs_emit_write_event_eop(cs, false, chip_class, false, cb_db_event, tc_flags, 1,
-                                          flush_va, old_fence, *flush_cnt);
-               si_emit_wait_fence(cs, false, flush_va, *flush_cnt, 0xffffffff);
+               si_cs_emit_write_event_eop(cs, chip_class, false, cb_db_event, tc_flags,
+                                          EOP_DATA_SEL_VALUE_32BIT,
+                                          flush_va, old_fence, *flush_cnt,
+                                          gfx9_eop_bug_va);
+               si_emit_wait_fence(cs, flush_va, *flush_cnt, 0xffffffff);
        }
 
        /* VGT state sync */
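
The flush fence above is a monotonic counter handshake; condensed (names from this function, illustrative only):

	/* CPU:  old_fence = (*flush_cnt)++              value now expected   */
	/* GPU:  RELEASE_MEM writes *flush_cnt to flush_va after cb_db_event  */
	/* GPU:  WAIT_REG_MEM spins until *(uint32_t *)flush_va == *flush_cnt */

On CIK/VI the EOP write is emitted twice, with old_fence first, so that all engines go idle before the timestamp lands (see the earlier hunk).
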
@@ -910,7 +911,7 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
 
        if ((flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) ||
            (chip_class <= CIK && (flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) {
-               si_emit_acquire_mem(cs, is_mec, false, chip_class >= GFX9,
+               si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
                                    cp_coher_cntl |
                                    S_0085F0_TC_ACTION_ENA(1) |
                                    S_0085F0_TCL1_ACTION_ENA(1) |
@@ -924,7 +925,7 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                         *
                         * WB doesn't work without NC.
                         */
-                       si_emit_acquire_mem(cs, is_mec, false,
+                       si_emit_acquire_mem(cs, is_mec,
                                            chip_class >= GFX9,
                                            cp_coher_cntl |
                                            S_0301F0_TC_WB_ACTION_ENA(1) |
@@ -933,7 +934,7 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                }
                if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) {
                        si_emit_acquire_mem(cs, is_mec,
-                                           false, chip_class >= GFX9,
+                                           chip_class >= GFX9,
                                            cp_coher_cntl |
                                            S_0085F0_TCL1_ACTION_ENA(1));
                        cp_coher_cntl = 0;
@@ -944,7 +945,17 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
         * Therefore, it should be last. Done in PFP.
         */
        if (cp_coher_cntl)
-               si_emit_acquire_mem(cs, is_mec, false, chip_class >= GFX9, cp_coher_cntl);
+               si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9, cp_coher_cntl);
+
+       if (flush_bits & RADV_CMD_FLAG_START_PIPELINE_STATS) {
+               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+               radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
+                               EVENT_INDEX(0));
+       } else if (flush_bits & RADV_CMD_FLAG_STOP_PIPELINE_STATS) {
+               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+               radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) |
+                               EVENT_INDEX(0));
+       }
 }
 
 void
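
The PIPELINESTAT_START/STOP events added above let other code toggle pipeline-statistics collection through flush bits. A hypothetical caller (the real bookkeeping lives in the query code; the active_pipeline_queries counter is assumed here):

	if (--cmd_buffer->state.active_pipeline_queries == 0)
		cmd_buffer->state.flush_bits |=
			RADV_CMD_FLAG_STOP_PIPELINE_STATS;
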
@@ -959,7 +970,9 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
                                                  RADV_CMD_FLAG_FLUSH_AND_INV_DB_META |
                                                  RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
                                                  RADV_CMD_FLAG_VS_PARTIAL_FLUSH |
-                                                 RADV_CMD_FLAG_VGT_FLUSH);
+                                                 RADV_CMD_FLAG_VGT_FLUSH |
+                                                 RADV_CMD_FLAG_START_PIPELINE_STATS |
+                                                 RADV_CMD_FLAG_STOP_PIPELINE_STATS);
 
        if (!cmd_buffer->state.flush_bits)
                return;
@@ -977,7 +990,8 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
                               cmd_buffer->device->physical_device->rad_info.chip_class,
                               ptr, va,
                               radv_cmd_buffer_uses_mec(cmd_buffer),
-                              cmd_buffer->state.flush_bits);
+                              cmd_buffer->state.flush_bits,
+                              cmd_buffer->gfx9_eop_bug_va);
 
 
        if (unlikely(cmd_buffer->device->trace_bo))
@@ -988,12 +1002,23 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
 
 /* sets the CP predication state using a boolean stored at va */
 void
-si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
+si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer,
+                             bool inverted, uint64_t va)
 {
        uint32_t op = 0;
 
-       if (va)
-               op = PRED_OP(PREDICATION_OP_BOOL64) | PREDICATION_DRAW_VISIBLE;
+       if (va) {
+               op = PRED_OP(PREDICATION_OP_BOOL64);
+
+               /* By default, our internal rendering commands are discarded
+                * only if the predicate is non-zero (i.e. DRAW_VISIBLE). But
+                * VK_EXT_conditional_rendering also allows discarding commands
+                * when the predicate is zero, which means we have to use a
+                * different flag.
+                */
+               op |= inverted ? PREDICATION_DRAW_VISIBLE :
+                                PREDICATION_DRAW_NOT_VISIBLE;
+       }
        if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
                radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
                radeon_emit(cmd_buffer->cs, op);
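
The inverted flag maps straight onto VK_EXT_conditional_rendering; a hypothetical call sequence (pBegin is illustrative, the real caller lives in radv_cmd_buffer.c):

	bool inverted = pBegin->flags &
			VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
	si_emit_set_predication_state(cmd_buffer, inverted, va);
	/* ... conditionally executed commands ... */
	si_emit_set_predication_state(cmd_buffer, false, 0); /* disable */
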
@@ -1042,7 +1067,6 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
        struct radeon_cmdbuf *cs = cmd_buffer->cs;
        uint32_t header = 0, command = 0;
 
-       assert(size);
        assert(size <= cp_dma_max_byte_count(cmd_buffer));
 
        radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
@@ -1068,9 +1092,9 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
        if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
            !(flags & CP_DMA_CLEAR) &&
            src_va == dst_va)
-               header |= S_411_DSL_SEL(V_411_NOWHERE); /* prefetch only */
+               header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */
        else if (flags & CP_DMA_USE_L2)
-               header |= S_411_DSL_SEL(V_411_DST_ADDR_TC_L2);
+               header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2);
 
        if (flags & CP_DMA_CLEAR)
                header |= S_411_SRC_SEL(V_411_DATA);
@@ -1101,9 +1125,14 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
         * indices. If we wanted to execute CP DMA in PFP, this packet
         * should precede it.
         */
-       if ((flags & CP_DMA_SYNC) && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
-               radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
-               radeon_emit(cs, 0);
+       if (flags & CP_DMA_SYNC) {
+               if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
+                       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
+                       radeon_emit(cs, 0);
+               }
+
+               /* CP will see the sync flag and wait for all DMAs to complete. */
+               cmd_buffer->state.dma_is_busy = false;
        }
 
        if (unlikely(cmd_buffer->device->trace_bo))
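
Clearing dma_is_busy when CP_DMA_SYNC is set pairs with the producers below: the copy and clear entry points mark the state busy, and si_cp_dma_wait_for_idle() (added at the end of this patch) only pays for a sync if something is still outstanding. Illustrative sequence:

	si_cp_dma_buffer_copy(cmd_buffer, src_va, dst_va, size); /* busy = true */
	/* ... more unsynchronized CP DMAs ... */
	si_cp_dma_wait_for_idle(cmd_buffer); /* 0-byte DMA + CP_DMA_SYNC */
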
@@ -1167,6 +1196,8 @@ void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
        uint64_t main_src_va, main_dest_va;
        uint64_t skipped_size = 0, realign_size = 0;
 
+       /* Assume that we are not going to sync after the last DMA operation. */
+       cmd_buffer->state.dma_is_busy = true;
 
        if (cmd_buffer->device->physical_device->rad_info.family <= CHIP_CARRIZO ||
            cmd_buffer->device->physical_device->rad_info.family == CHIP_STONEY) {
@@ -1230,6 +1261,9 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
 
        assert(va % 4 == 0 && size % 4 == 0);
 
+       /* Assume that we are not going to sync after the last DMA operation. */
+       cmd_buffer->state.dma_is_busy = true;
+
        while (size) {
                unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer));
                unsigned dma_flags = CP_DMA_CLEAR;
@@ -1245,6 +1279,25 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
        }
 }
 
+void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer)
+{
+       if (cmd_buffer->device->physical_device->rad_info.chip_class < CIK)
+               return;
+
+       if (!cmd_buffer->state.dma_is_busy)
+               return;
+
+       /* Issue a dummy DMA that copies zero bytes.
+        *
+        * The DMA engine will see that there's no work to do and skip this
+        * DMA request, however, the CP will see the sync flag and still wait
+        * for all DMAs to complete.
+        */
+       si_emit_cp_dma(cmd_buffer, 0, 0, 0, CP_DMA_SYNC);
+
+       cmd_buffer->state.dma_is_busy = false;
+}
+
 /* For MSAA sample positions. */
 #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)  \
        (((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) |                  \