From: Marek Olšák Date: Thu, 26 May 2016 20:00:03 +0000 (+0200) Subject: r600g: fix CP DMA hazard with index buffer fetches (v3) X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=5ea5ed60500a8612166853975b42abd40a459216;p=mesa.git r600g: fix CP DMA hazard with index buffer fetches (v3) v3: use PFP_SYNC_ME on EG-CM only when supported by the kernel, otherwise use MEM_WRITE + WAIT_REG_MEM to emulate that Reviewed-by: Alex Deucher Tested-by: Grazvydas Ignotas Tested-by: Dieter Nützel --- diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c index f456696970c..2feb8015082 100644 --- a/src/gallium/drivers/r600/evergreen_hw_context.c +++ b/src/gallium/drivers/r600/evergreen_hw_context.c @@ -85,7 +85,8 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx, void evergreen_cp_dma_clear_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t offset, - unsigned size, uint32_t clear_value) + unsigned size, uint32_t clear_value, + enum r600_coherency coher) { struct radeon_winsys_cs *cs = rctx->b.gfx.cs; @@ -117,7 +118,9 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx, unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); unsigned reloc; - r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE); + r600_need_cs_space(rctx, + 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) + + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE); /* Flush the caches for the first copy only. */ if (rctx->b.flags) { @@ -148,9 +151,16 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx, offset += byte_count; } + /* CP DMA is executed in ME, but index buffers are read by PFP. + * This ensures that ME (CP DMA) is idle before PFP starts fetching + * indices. If we wanted to execute CP DMA in PFP, this packet + * should precede it. + */ + if (coher == R600_COHERENCY_SHADER) + r600_emit_pfp_sync_me(rctx); + /* Invalidate the read caches. */ rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE; } - diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h index c1c616910de..a81b6c5fc81 100644 --- a/src/gallium/drivers/r600/evergreend.h +++ b/src/gallium/drivers/r600/evergreend.h @@ -88,6 +88,7 @@ #define WAIT_REG_MEM_EQUAL 3 #define PKT3_MEM_WRITE 0x3D #define PKT3_INDIRECT_BUFFER 0x32 +#define PKT3_PFP_SYNC_ME 0x42 #define PKT3_SURFACE_SYNC 0x43 #define PKT3_ME_INITIALIZE 0x44 #define PKT3_COND_WRITE 0x45 diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c index 282645f1496..76c3364a818 100644 --- a/src/gallium/drivers/r600/r600_blit.c +++ b/src/gallium/drivers/r600/r600_blit.c @@ -589,7 +589,7 @@ static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *ds if (rctx->screen->b.has_cp_dma && rctx->b.chip_class >= EVERGREEN && offset % 4 == 0 && size % 4 == 0) { - evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value); + evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value, coher); } else if (rctx->screen->b.has_streamout && offset % 4 == 0 && size % 4 == 0) { union pipe_color_union clear_value; clear_value.ui[0] = value; diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c index 808bd27607f..3ba723d0541 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -364,6 +364,66 @@ void r600_begin_new_cs(struct r600_context *ctx) ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->current.cdw; } +void r600_emit_pfp_sync_me(struct r600_context *rctx) +{ + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; + + if (rctx->b.chip_class >= EVERGREEN && + rctx->b.screen->info.drm_minor >= 46) { + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + } else { + /* Emulate PFP_SYNC_ME by writing a value to memory in ME and + * waiting for it in PFP. + */ + struct r600_resource *buf = NULL; + unsigned offset, reloc; + uint64_t va; + + /* 16-byte address alignment is required by WAIT_REG_MEM. */ + u_suballocator_alloc(rctx->b.allocator_zeroed_memory, 4, 16, + &offset, (struct pipe_resource**)&buf); + if (!buf) { + /* This is too heavyweight, but will work. */ + rctx->b.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL); + return; + } + + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, buf, + RADEON_USAGE_READWRITE, + RADEON_PRIO_FENCE); + + va = buf->gpu_address + offset; + assert(va % 16 == 0); + + /* Write 1 to memory in ME. */ + radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0)); + radeon_emit(cs, va); + radeon_emit(cs, ((va >> 32) & 0xff) | MEM_WRITE_32_BITS); + radeon_emit(cs, 1); + radeon_emit(cs, 0); + + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, reloc); + + /* Wait in PFP (PFP can only do GEQUAL against memory). */ + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(cs, WAIT_REG_MEM_GEQUAL | + WAIT_REG_MEM_MEMORY | + WAIT_REG_MEM_PFP); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, 1); /* reference value */ + radeon_emit(cs, 0xffffffff); /* mask */ + radeon_emit(cs, 4); /* poll interval */ + + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, reloc); + + r600_resource_reference(&buf, NULL); + } +} + /* The max number of bytes to copy per packet. */ #define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8) @@ -407,7 +467,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx, r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) + - 3, FALSE); + 3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE); /* Flush the caches for the first copy only. */ if (rctx->b.flags) { @@ -447,6 +507,13 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx, radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_CP_DMA_IDLE(1)); + /* CP DMA is executed in ME, but index buffers are read by PFP. + * This ensures that ME (CP DMA) is idle before PFP starts fetching + * indices. If we wanted to execute CP DMA in PFP, this packet + * should precede it. + */ + r600_emit_pfp_sync_me(rctx); + /* Invalidate the read caches. */ rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 76178c22509..313bf69c314 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -57,6 +57,7 @@ /* the number of CS dwords for flushing and drawing */ #define R600_MAX_FLUSH_CS_DWORDS 18 #define R600_MAX_DRAW_CS_DWORDS 58 +#define R600_MAX_PFP_SYNC_ME_DWORDS 16 #define R600_MAX_USER_CONST_BUFFERS 13 #define R600_MAX_DRIVER_CONST_BUFFERS 3 @@ -663,13 +664,15 @@ void r600_context_gfx_flush(void *context, unsigned flags, void r600_begin_new_cs(struct r600_context *ctx); void r600_flush_emit(struct r600_context *ctx); void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in); +void r600_emit_pfp_sync_me(struct r600_context *rctx); void r600_cp_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t dst_offset, struct pipe_resource *src, uint64_t src_offset, unsigned size); void evergreen_cp_dma_clear_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t offset, - unsigned size, uint32_t clear_value); + unsigned size, uint32_t clear_value, + enum r600_coherency coher); void r600_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, struct pipe_resource *src, diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h index 24f599ea6e9..75d64c13081 100644 --- a/src/gallium/drivers/r600/r600d.h +++ b/src/gallium/drivers/r600/r600d.h @@ -96,8 +96,13 @@ #define COPY_DW_DST_IS_MEM (1 << 1) #define PKT3_WAIT_REG_MEM 0x3C #define WAIT_REG_MEM_EQUAL 3 +#define WAIT_REG_MEM_GEQUAL 5 +#define WAIT_REG_MEM_MEMORY (1 << 4) +#define WAIT_REG_MEM_PFP (1 << 8) #define PKT3_MEM_WRITE 0x3D +#define MEM_WRITE_32_BITS (1 << 18) #define PKT3_INDIRECT_BUFFER 0x32 +#define PKT3_PFP_SYNC_ME 0x42 /* EG+ */ #define PKT3_SURFACE_SYNC 0x43 #define PKT3_ME_INITIALIZE 0x44 #define PKT3_COND_WRITE 0x45 diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h index 1b466aae574..a6d5c05ec11 100644 --- a/src/gallium/drivers/radeonsi/sid.h +++ b/src/gallium/drivers/radeonsi/sid.h @@ -143,7 +143,7 @@ #define COPY_DATA_DST_SEL(x) (((unsigned)(x) & 0xf) << 8) #define COPY_DATA_COUNT_SEL (1 << 16) #define COPY_DATA_WR_CONFIRM (1 << 20) -#define PKT3_PFP_SYNC_ME 0x42 /* r7xx+ */ +#define PKT3_PFP_SYNC_ME 0x42 #define PKT3_SURFACE_SYNC 0x43 /* deprecated on CIK, use ACQUIRE_MEM */ #define PKT3_ME_INITIALIZE 0x44 /* not on CIK */ #define PKT3_COND_WRITE 0x45