r600g: fix CP DMA hazard with index buffer fetches (v3)
author: Marek Olšák <marek.olsak@amd.com>
Thu, 26 May 2016 20:00:03 +0000 (22:00 +0200)
committer: Marek Olšák <marek.olsak@amd.com>
Sat, 4 Jun 2016 13:42:33 +0000 (15:42 +0200)
v3: use PFP_SYNC_ME on EG-CM only when supported by the kernel,
    otherwise use MEM_WRITE + WAIT_REG_MEM to emulate that

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Tested-by: Grazvydas Ignotas <notasas@gmail.com>
Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
src/gallium/drivers/r600/evergreen_hw_context.c
src/gallium/drivers/r600/evergreend.h
src/gallium/drivers/r600/r600_blit.c
src/gallium/drivers/r600/r600_hw_context.c
src/gallium/drivers/r600/r600_pipe.h
src/gallium/drivers/r600/r600d.h
src/gallium/drivers/radeonsi/sid.h

index f456696970c2cd9d83028c25a61d2fcfb11a4e0b..2feb8015082e95d5d8fecd8162f3e992fb381c23 100644 (file)
@@ -85,7 +85,8 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx,
 
 void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
                                   struct pipe_resource *dst, uint64_t offset,
-                                  unsigned size, uint32_t clear_value)
+                                  unsigned size, uint32_t clear_value,
+                                  enum r600_coherency coher)
 {
        struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 
@@ -117,7 +118,9 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
                unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
                unsigned reloc;
 
-               r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE);
+               r600_need_cs_space(rctx,
+                                  10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
+                                  R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
 
                /* Flush the caches for the first copy only. */
                if (rctx->b.flags) {
@@ -148,9 +151,16 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
                offset += byte_count;
        }
 
+       /* CP DMA is executed in ME, but index buffers are read by PFP.
+        * This ensures that ME (CP DMA) is idle before PFP starts fetching
+        * indices. If we wanted to execute CP DMA in PFP, this packet
+        * should precede it.
+        */
+       if (coher == R600_COHERENCY_SHADER)
+               r600_emit_pfp_sync_me(rctx);
+
        /* Invalidate the read caches. */
        rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
                         R600_CONTEXT_INV_VERTEX_CACHE |
                         R600_CONTEXT_INV_TEX_CACHE;
 }
-
index c1c616910defae7b4b21c1e081bf64b2f61cc52d..a81b6c5fc8153af15366cd9eb09cc31d860bf14c 100644 (file)
@@ -88,6 +88,7 @@
 #define                WAIT_REG_MEM_EQUAL              3
 #define PKT3_MEM_WRITE                         0x3D
 #define PKT3_INDIRECT_BUFFER                   0x32
+#define PKT3_PFP_SYNC_ME                      0x42
 #define PKT3_SURFACE_SYNC                      0x43
 #define PKT3_ME_INITIALIZE                     0x44
 #define PKT3_COND_WRITE                        0x45
index 282645f149696248dabcdf0f02a0173bc03e1542..76c3364a818efc2596778960db0609db0cea7279 100644 (file)
@@ -589,7 +589,7 @@ static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *ds
        if (rctx->screen->b.has_cp_dma &&
            rctx->b.chip_class >= EVERGREEN &&
            offset % 4 == 0 && size % 4 == 0) {
-               evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value);
+               evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value, coher);
        } else if (rctx->screen->b.has_streamout && offset % 4 == 0 && size % 4 == 0) {
                union pipe_color_union clear_value;
                clear_value.ui[0] = value;
index 808bd27607f2f9ba668635e89b479cf20b249b06..3ba723d0541e5b0acca82f78c91d3252b0b496f2 100644 (file)
@@ -364,6 +364,66 @@ void r600_begin_new_cs(struct r600_context *ctx)
        ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->current.cdw;
 }
 
+void r600_emit_pfp_sync_me(struct r600_context *rctx)
+{
+       struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+
+       if (rctx->b.chip_class >= EVERGREEN &&
+           rctx->b.screen->info.drm_minor >= 46) {
+               radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+               radeon_emit(cs, 0);
+       } else {
+               /* Emulate PFP_SYNC_ME by writing a value to memory in ME and
+                * waiting for it in PFP.
+                */
+               struct r600_resource *buf = NULL;
+               unsigned offset, reloc;
+               uint64_t va;
+
+               /* 16-byte address alignment is required by WAIT_REG_MEM. */
+               u_suballocator_alloc(rctx->b.allocator_zeroed_memory, 4, 16,
+                                    &offset, (struct pipe_resource**)&buf);
+               if (!buf) {
+                       /* This is too heavyweight, but will work. */
+                       rctx->b.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
+                       return;
+               }
+
+               reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, buf,
+                                                 RADEON_USAGE_READWRITE,
+                                                 RADEON_PRIO_FENCE);
+
+               va = buf->gpu_address + offset;
+               assert(va % 16 == 0);
+
+               /* Write 1 to memory in ME. */
+               radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
+               radeon_emit(cs, va);
+               radeon_emit(cs, ((va >> 32) & 0xff) | MEM_WRITE_32_BITS);
+               radeon_emit(cs, 1);
+               radeon_emit(cs, 0);
+
+               radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+               radeon_emit(cs, reloc);
+
+               /* Wait in PFP (PFP can only do GEQUAL against memory). */
+               radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+               radeon_emit(cs, WAIT_REG_MEM_GEQUAL |
+                               WAIT_REG_MEM_MEMORY |
+                               WAIT_REG_MEM_PFP);
+               radeon_emit(cs, va);
+               radeon_emit(cs, va >> 32);
+               radeon_emit(cs, 1); /* reference value */
+               radeon_emit(cs, 0xffffffff); /* mask */
+               radeon_emit(cs, 4); /* poll interval */
+
+               radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+               radeon_emit(cs, reloc);
+
+               r600_resource_reference(&buf, NULL);
+       }
+}
+
 /* The max number of bytes to copy per packet. */
 #define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
 
@@ -407,7 +467,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 
                r600_need_cs_space(rctx,
                                   10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
-                                  3, FALSE);
+                                  3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
 
                /* Flush the caches for the first copy only. */
                if (rctx->b.flags) {
@@ -447,6 +507,13 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
                radeon_set_config_reg(cs, R_008040_WAIT_UNTIL,
                                      S_008040_WAIT_CP_DMA_IDLE(1));
 
+       /* CP DMA is executed in ME, but index buffers are read by PFP.
+        * This ensures that ME (CP DMA) is idle before PFP starts fetching
+        * indices. If we wanted to execute CP DMA in PFP, this packet
+        * should precede it.
+        */
+       r600_emit_pfp_sync_me(rctx);
+
        /* Invalidate the read caches. */
        rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
                         R600_CONTEXT_INV_VERTEX_CACHE |
index 76178c225099f7787bf4bc2da0e2afcc672ba86a..313bf69c31441354ad1c60ee2d95169fd1147113 100644 (file)
@@ -57,6 +57,7 @@
 /* the number of CS dwords for flushing and drawing */
 #define R600_MAX_FLUSH_CS_DWORDS       18
 #define R600_MAX_DRAW_CS_DWORDS                58
+#define R600_MAX_PFP_SYNC_ME_DWORDS    16
 
 #define R600_MAX_USER_CONST_BUFFERS 13
 #define R600_MAX_DRIVER_CONST_BUFFERS 3
@@ -663,13 +664,15 @@ void r600_context_gfx_flush(void *context, unsigned flags,
 void r600_begin_new_cs(struct r600_context *ctx);
 void r600_flush_emit(struct r600_context *ctx);
 void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in);
+void r600_emit_pfp_sync_me(struct r600_context *rctx);
 void r600_cp_dma_copy_buffer(struct r600_context *rctx,
                             struct pipe_resource *dst, uint64_t dst_offset,
                             struct pipe_resource *src, uint64_t src_offset,
                             unsigned size);
 void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
                                   struct pipe_resource *dst, uint64_t offset,
-                                  unsigned size, uint32_t clear_value);
+                                  unsigned size, uint32_t clear_value,
+                                  enum r600_coherency coher);
 void r600_dma_copy_buffer(struct r600_context *rctx,
                          struct pipe_resource *dst,
                          struct pipe_resource *src,
index 24f599ea6e99eea6f3d1d864dd7fdceeef90359f..75d64c13081e1b2718462051c1b70e17df6ade97 100644 (file)
 #define                COPY_DW_DST_IS_MEM              (1 << 1)
 #define PKT3_WAIT_REG_MEM                      0x3C
 #define                WAIT_REG_MEM_EQUAL              3
+#define                WAIT_REG_MEM_GEQUAL             5
+#define                WAIT_REG_MEM_MEMORY             (1 << 4)
+#define                WAIT_REG_MEM_PFP                (1 << 8)
 #define PKT3_MEM_WRITE                         0x3D
+#define                MEM_WRITE_32_BITS               (1 << 18)
 #define PKT3_INDIRECT_BUFFER                   0x32
+#define PKT3_PFP_SYNC_ME                      0x42 /* EG+ */
 #define PKT3_SURFACE_SYNC                      0x43
 #define PKT3_ME_INITIALIZE                     0x44
 #define PKT3_COND_WRITE                        0x45
index 1b466aae574498185a17b6ffebd28fd84cb135bd..a6d5c05ec11f94a8064356986b98e8e180322469 100644 (file)
 #define                COPY_DATA_DST_SEL(x)            (((unsigned)(x) & 0xf) << 8)
 #define                COPY_DATA_COUNT_SEL             (1 << 16)
 #define                COPY_DATA_WR_CONFIRM            (1 << 20)
-#define PKT3_PFP_SYNC_ME                      0x42 /* r7xx+ */
+#define PKT3_PFP_SYNC_ME                      0x42
 #define PKT3_SURFACE_SYNC                      0x43 /* deprecated on CIK, use ACQUIRE_MEM */
 #define PKT3_ME_INITIALIZE                     0x44 /* not on CIK */
 #define PKT3_COND_WRITE                        0x45