radv: add GFX9 cache flushing support.
authorDave Airlie <airlied@redhat.com>
Mon, 5 Jun 2017 23:01:48 +0000 (09:01 +1000)
committerDave Airlie <airlied@redhat.com>
Mon, 5 Jun 2017 23:43:40 +0000 (09:43 +1000)
GFX9 needs to write event EOP to a fence buffer, allocate some
space for this, and just write an ever increasing number to it,
this isn't exactly what radeonsi does, but it seems to work.

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Signed-off-by: Dave Airlie <airlied@redhat.com>
src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_device.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/si_cmd_buffer.c

index d66f8979e8f47db2ae41ba37dac6945c7ba9637e..d078421182d11f8a455adf3ec7867b1849e8f9ae 100644 (file)
@@ -234,6 +234,14 @@ static void  radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
        cmd_buffer->record_fail = false;
 
        cmd_buffer->ring_offsets_idx = -1;
+
+       if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+               void *fence_ptr;
+               radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 0,
+                                            &cmd_buffer->gfx9_fence_offset,
+                                            &fence_ptr);
+               cmd_buffer->gfx9_fence_bo = cmd_buffer->upload.upload_bo;
+       }
 }
 
 static bool
index ca42ab8e0e10995455ae8929ed51c7760cc29f52..9d510ea59ea0cad938bd41b8a040c6b03ac9ac4b 100644 (file)
@@ -1103,6 +1103,7 @@ VkResult radv_CreateDevice(
                case RADV_QUEUE_COMPUTE:
                        si_cs_emit_cache_flush(device->flush_cs[family],
                                               device->physical_device->rad_info.chip_class,
+                                              NULL, 0,
                                               family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
                                               RADV_CMD_FLAG_INV_ICACHE |
                                               RADV_CMD_FLAG_INV_SMEM_L1 |
@@ -1118,6 +1119,7 @@ VkResult radv_CreateDevice(
                case RADV_QUEUE_COMPUTE:
                        si_cs_emit_cache_flush(device->flush_shader_cs[family],
                                               device->physical_device->rad_info.chip_class,
+                                              NULL, 0,
                                               family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
                                               family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH) |
                                               RADV_CMD_FLAG_INV_ICACHE |
@@ -1763,6 +1765,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
                if (!i) {
                        si_cs_emit_cache_flush(cs,
                                               queue->device->physical_device->rad_info.chip_class,
+                                              NULL, 0,
                                               queue->queue_family_index == RING_COMPUTE &&
                                                 queue->device->physical_device->rad_info.chip_class >= CIK,
                                               RADV_CMD_FLAG_INV_ICACHE |
index e1b9a29cee1a91c3f045de5dd0aa4348d886aa1a..6a6c1e2351aada813fca5570b6820393aaa35c61 100644 (file)
@@ -822,6 +822,9 @@ struct radv_cmd_buffer {
        bool record_fail;
 
        int ring_offsets_idx; /* just used for verification */
+       uint32_t gfx9_fence_offset;
+       struct radeon_winsys_bo *gfx9_fence_bo;
+       uint32_t gfx9_fence_idx;
 };
 
 struct radv_image;
@@ -854,9 +857,10 @@ void si_emit_wait_fence(struct radeon_winsys_cs *cs,
                        uint64_t va, uint32_t ref,
                        uint32_t mask);
 void si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
-                            enum chip_class chip_class,
-                            bool is_mec,
-                            enum radv_cmd_flush_bits flush_bits);
+                           enum chip_class chip_class,
+                           uint32_t *fence_ptr, uint64_t va,
+                           bool is_mec,
+                           enum radv_cmd_flush_bits flush_bits);
 void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
 void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
                           uint64_t src_va, uint64_t dest_va,
index eda24be462ac50cd02bf09693df0277a057afed2..3e0b8ee0200a295265baf03c03fd9dc4e3c30765 100644 (file)
@@ -823,15 +823,18 @@ void si_cs_emit_write_event_eop(struct radeon_winsys_cs *cs,
        unsigned op = EVENT_TYPE(event) |
                EVENT_INDEX(5) |
                event_flags;
+       unsigned is_gfx8_mec = is_mec && chip_class < GFX9;
 
-       if (is_mec) {
-               radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 5, 0));
+       if (chip_class >= GFX9 || is_gfx8_mec) {
+               radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ? 5 : 6, 0));
                radeon_emit(cs, op);
                radeon_emit(cs, EOP_DATA_SEL(data_sel));
                radeon_emit(cs, va);            /* address lo */
                radeon_emit(cs, va >> 32);      /* address hi */
                radeon_emit(cs, new_fence);     /* immediate data lo */
                radeon_emit(cs, 0); /* immediate data hi */
+               if (!is_gfx8_mec)
+                       radeon_emit(cs, 0); /* unused */
        } else {
                if (chip_class == CIK ||
                    chip_class == VI) {
@@ -872,15 +875,16 @@ si_emit_wait_fence(struct radeon_winsys_cs *cs,
 
 static void
 si_emit_acquire_mem(struct radeon_winsys_cs *cs,
-                    bool is_mec,
+                    bool is_mec, bool is_gfx9,
                     unsigned cp_coher_cntl)
 {
-       if (is_mec) {
+       if (is_mec || is_gfx9) {
+               uint32_t hi_val = is_gfx9 ? 0xffffff : 0xff;
                radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0) |
-                                           PKT3_SHADER_TYPE_S(1));
+                                           PKT3_SHADER_TYPE_S(is_mec));
                radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
                radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
-               radeon_emit(cs, 0xff);            /* CP_COHER_SIZE_HI */
+               radeon_emit(cs, hi_val);          /* CP_COHER_SIZE_HI */
                radeon_emit(cs, 0);               /* CP_COHER_BASE */
                radeon_emit(cs, 0);               /* CP_COHER_BASE_HI */
                radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
@@ -897,40 +901,45 @@ si_emit_acquire_mem(struct radeon_winsys_cs *cs,
 void
 si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
                        enum chip_class chip_class,
+                      uint32_t *flush_cnt,
+                      uint64_t flush_va,
                        bool is_mec,
                        enum radv_cmd_flush_bits flush_bits)
 {
        unsigned cp_coher_cntl = 0;
-
+       uint32_t flush_cb_db = flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
+                                            RADV_CMD_FLAG_FLUSH_AND_INV_DB);
+       
        if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
                cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
        if (flush_bits & RADV_CMD_FLAG_INV_SMEM_L1)
                cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
 
-       if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
-               cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
-                       S_0085F0_CB0_DEST_BASE_ENA(1) |
-                       S_0085F0_CB1_DEST_BASE_ENA(1) |
-                       S_0085F0_CB2_DEST_BASE_ENA(1) |
-                       S_0085F0_CB3_DEST_BASE_ENA(1) |
-                       S_0085F0_CB4_DEST_BASE_ENA(1) |
-                       S_0085F0_CB5_DEST_BASE_ENA(1) |
-                       S_0085F0_CB6_DEST_BASE_ENA(1) |
-                       S_0085F0_CB7_DEST_BASE_ENA(1);
-
-               /* Necessary for DCC */
-               if (chip_class >= VI) {
-                       si_cs_emit_write_event_eop(cs,
-                                                  chip_class,
-                                                  is_mec,
-                                                  V_028A90_FLUSH_AND_INV_CB_DATA_TS,
-                                                  0, 0, 0, 0, 0);
+       if (chip_class <= VI) {
+               if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
+                       cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
+                               S_0085F0_CB0_DEST_BASE_ENA(1) |
+                               S_0085F0_CB1_DEST_BASE_ENA(1) |
+                               S_0085F0_CB2_DEST_BASE_ENA(1) |
+                               S_0085F0_CB3_DEST_BASE_ENA(1) |
+                               S_0085F0_CB4_DEST_BASE_ENA(1) |
+                               S_0085F0_CB5_DEST_BASE_ENA(1) |
+                               S_0085F0_CB6_DEST_BASE_ENA(1) |
+                               S_0085F0_CB7_DEST_BASE_ENA(1);
+
+                       /* Necessary for DCC */
+                       if (chip_class >= VI) {
+                               si_cs_emit_write_event_eop(cs,
+                                                          chip_class,
+                                                          is_mec,
+                                                          V_028A90_FLUSH_AND_INV_CB_DATA_TS,
+                                                          0, 0, 0, 0, 0);
+                       }
+               }
+               if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
+                       cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
+                               S_0085F0_DB_DEST_BASE_ENA(1);
                }
-       }
-
-       if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
-               cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
-                       S_0085F0_DB_DEST_BASE_ENA(1);
        }
 
        if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
@@ -943,8 +952,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
                radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
        }
 
-       if (!(flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
-                                             RADV_CMD_FLAG_FLUSH_AND_INV_DB))) {
+       if (!flush_cb_db) {
                if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
                        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                        radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
@@ -959,6 +967,54 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
                radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
        }
 
+       if (chip_class >= GFX9 && flush_cb_db) {
+               unsigned cb_db_event, tc_flags;
+
+               /* Set the CB/DB flush event. */
+               switch (flush_cb_db) {
+               case RADV_CMD_FLAG_FLUSH_AND_INV_CB:
+                       cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+                       break;
+               case RADV_CMD_FLAG_FLUSH_AND_INV_DB:
+                       cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
+                       break;
+               default:
+                       /* both CB & DB */
+                       cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
+               }
+
+               /* TC    | TC_WB         = invalidate L2 data
+                * TC_MD | TC_WB         = invalidate L2 metadata
+                * TC    | TC_WB | TC_MD = invalidate L2 data & metadata
+                *
+                * The metadata cache must always be invalidated for coherency
+                * between CB/DB and shaders. (metadata = HTILE, CMASK, DCC)
+                *
+                * TC must be invalidated on GFX9 only if the CB/DB surface is
+                * not pipe-aligned. If the surface is RB-aligned, it might not
+                * strictly be pipe-aligned since RB alignment takes precendence.
+                */
+               tc_flags = EVENT_TC_WB_ACTION_ENA |
+                          EVENT_TC_MD_ACTION_ENA;
+
+               /* Ideally flush TC together with CB/DB. */
+               if (flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) {
+                       tc_flags |= EVENT_TC_ACTION_ENA |
+                                   EVENT_TCL1_ACTION_ENA;
+
+                       /* Clear the flags. */
+                       flush_bits &= ~(RADV_CMD_FLAG_INV_GLOBAL_L2 |
+                                        RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2 |
+                                        RADV_CMD_FLAG_INV_VMEM_L1);
+               }
+               assert(flush_cnt);
+               uint32_t old_fence = (*flush_cnt)++;
+
+               si_cs_emit_write_event_eop(cs, chip_class, false, cb_db_event, tc_flags, 1,
+                                          flush_va, old_fence, *flush_cnt);
+               si_emit_wait_fence(cs, flush_va, *flush_cnt, 0xffffffff);
+       }
+
        /* VGT state sync */
        if (flush_bits & RADV_CMD_FLAG_VGT_FLUSH) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
@@ -968,7 +1024,11 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
        /* Make sure ME is idle (it executes most packets) before continuing.
         * This prevents read-after-write hazards between PFP and ME.
         */
-       if ((cp_coher_cntl || (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) &&
+       if ((cp_coher_cntl ||
+            (flush_bits & (RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+                           RADV_CMD_FLAG_INV_VMEM_L1 |
+                           RADV_CMD_FLAG_INV_GLOBAL_L2 |
+                           RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) &&
            !is_mec) {
                radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
                radeon_emit(cs, 0);
@@ -976,34 +1036,46 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
 
        if ((flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) ||
            (chip_class <= CIK && (flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) {
-               cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
-               if (chip_class >= VI)
-                       cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1);
-       } else  if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) {
-               cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1) |
-                                S_0301F0_TC_NC_ACTION_ENA(1);
-
-               /* L2 writeback doesn't combine with L1 invalidate */
-               si_emit_acquire_mem(cs, is_mec, cp_coher_cntl);
-
+               si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
+                                   cp_coher_cntl |
+                                   S_0085F0_TC_ACTION_ENA(1) |
+                                   S_0085F0_TCL1_ACTION_ENA(1) |
+                                   S_0301F0_TC_WB_ACTION_ENA(chip_class >= VI));
                cp_coher_cntl = 0;
+       } else {
+               if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) {
+                       /* WB = write-back
+                        * NC = apply to non-coherent MTYPEs
+                        *      (i.e. MTYPE <= 1, which is what we use everywhere)
+                        *
+                        * WB doesn't work without NC.
+                        */
+                       si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
+                                           cp_coher_cntl |
+                                           S_0301F0_TC_WB_ACTION_ENA(1) |
+                                           S_0301F0_TC_NC_ACTION_ENA(1));
+                       cp_coher_cntl = 0;
+               }
+               if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) {
+                       si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
+                                           cp_coher_cntl |
+                                           S_0085F0_TCL1_ACTION_ENA(1));
+                       cp_coher_cntl = 0;
+               }
        }
 
-       if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1)
-               cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
-
        /* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle.
         * Therefore, it should be last. Done in PFP.
         */
        if (cp_coher_cntl)
-               si_emit_acquire_mem(cs, is_mec, cp_coher_cntl);
+               si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9, cp_coher_cntl);
 }
 
 void
 si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
 {
        bool is_compute = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE;
-
+       enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
        if (is_compute)
                cmd_buffer->state.flush_bits &= ~(RADV_CMD_FLAG_FLUSH_AND_INV_CB |
                                                  RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
@@ -1015,8 +1087,15 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
 
        radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128);
 
+       uint32_t *ptr = NULL;
+       uint64_t va = 0;
+       if (chip_class == GFX9) {
+               va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->gfx9_fence_bo) + cmd_buffer->gfx9_fence_offset;
+               ptr = &cmd_buffer->gfx9_fence_idx;
+       }
        si_cs_emit_cache_flush(cmd_buffer->cs,
                               cmd_buffer->device->physical_device->rad_info.chip_class,
+                              ptr, va,
                               radv_cmd_buffer_uses_mec(cmd_buffer),
                               cmd_buffer->state.flush_bits);