void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
struct pipe_resource *dst, uint64_t offset,
- unsigned size, uint32_t clear_value)
+ unsigned size, uint32_t clear_value,
+ enum r600_coherency coher)
{
struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
unsigned reloc;
- r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE);
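+ /* Reserve space for the cache flushes and, worst case, the PFP_SYNC_ME emulation. */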
+ r600_need_cs_space(rctx,
+ 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
+ R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
/* Flush the caches for the first copy only. */
if (rctx->b.flags) {
r600_flush_emit(rctx);
}
+ /* CP DMA is executed in ME, but index buffers are read by PFP.
+ * This ensures that ME (CP DMA) is idle before PFP starts fetching
+ * indices. If we wanted to execute CP DMA in PFP, this packet
+ * should precede it.
+ */
+ if (coher == R600_COHERENCY_SHADER)
+ r600_emit_pfp_sync_me(rctx);
+
/* Invalidate the read caches. */
rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
R600_CONTEXT_INV_VERTEX_CACHE |
R600_CONTEXT_INV_TEX_CACHE;
}
if (rctx->screen->b.has_cp_dma &&
rctx->b.chip_class >= EVERGREEN &&
offset % 4 == 0 && size % 4 == 0) {
- evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value);
+ evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value, coher);
} else if (rctx->screen->b.has_streamout && offset % 4 == 0 && size % 4 == 0) {
union pipe_color_union clear_value;
clear_value.ui[0] = value;
ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->current.cdw;
}
+
+void r600_emit_pfp_sync_me(struct r600_context *rctx)
+{
+ struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+
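+ /* PFP_SYNC_ME is EG+ and needs a kernel with radeon DRM 2.46+;
+ * everything else takes the emulation path below.
+ */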
+ if (rctx->b.chip_class >= EVERGREEN &&
+ rctx->b.screen->info.drm_minor >= 46) {
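+ /* The packet stalls PFP until ME has drained all previous work. */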
+ radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+ radeon_emit(cs, 0);
+ } else {
+ /* Emulate PFP_SYNC_ME by writing a value to memory in ME and
+ * waiting for it in PFP. The dword is suballocated from zeroed
+ * memory, so the GEQUAL wait below can only pass once the ME
+ * write has landed.
+ */
+ struct r600_resource *buf = NULL;
+ unsigned offset, reloc;
+ uint64_t va;
+
+ /* 16-byte address alignment is required by WAIT_REG_MEM. */
+ u_suballocator_alloc(rctx->b.allocator_zeroed_memory, 4, 16,
+ &offset, (struct pipe_resource**)&buf);
+ if (!buf) {
+ /* This is too heavyweight, but will work. */
+ rctx->b.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
+ return;
+ }
+
+ reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, buf,
+ RADEON_USAGE_READWRITE,
+ RADEON_PRIO_FENCE);
+
+ va = buf->gpu_address + offset;
+ assert(va % 16 == 0);
+
+ /* Write 1 to memory in ME. */
+ radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
+ radeon_emit(cs, va); /* ADDR_LO [31:0] */
+ radeon_emit(cs, ((va >> 32) & 0xff) | MEM_WRITE_32_BITS); /* ADDR_HI [7:0] | 32-bit data */
+ radeon_emit(cs, 1); /* DATA_LO */
+ radeon_emit(cs, 0); /* DATA_HI (unused for a 32-bit write) */
+
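+ /* The NOP carries the relocation index for the kernel CS checker. */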
+ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+ radeon_emit(cs, reloc);
+
+ /* Wait in PFP (PFP can only do GEQUAL against memory). */
+ radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+ radeon_emit(cs, WAIT_REG_MEM_GEQUAL |
+ WAIT_REG_MEM_MEMORY |
+ WAIT_REG_MEM_PFP);
+ radeon_emit(cs, va); /* address lo */
+ radeon_emit(cs, va >> 32); /* address hi */
+ radeon_emit(cs, 1); /* reference value */
+ radeon_emit(cs, 0xffffffff); /* mask */
+ radeon_emit(cs, 4); /* poll interval */
+
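+ /* Same relocation again, for the WAIT_REG_MEM read. */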
+ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+ radeon_emit(cs, reloc);
+
+ r600_resource_reference(&buf, NULL);
+ }
+}
+
/* The max number of bytes to copy per packet. */
#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
r600_need_cs_space(rctx,
10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
- 3, FALSE);
+ 3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
/* CP_DMA_CP_SYNC doesn't wait for the CP DMA to be idle on R6xx, but this WAIT_UNTIL does. */
radeon_set_config_reg(cs, R_008040_WAIT_UNTIL,
S_008040_WAIT_CP_DMA_IDLE(1));
+ /* CP DMA is executed in ME, but index buffers are read by PFP.
+ * This ensures that ME (CP DMA) is idle before PFP starts fetching
+ * indices. If we wanted to execute CP DMA in PFP, this packet
+ * should precede it.
+ */
+ r600_emit_pfp_sync_me(rctx);
+
/* Invalidate the read caches. */
rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
R600_CONTEXT_INV_VERTEX_CACHE |
/* the number of CS dwords for flushing and drawing */
#define R600_MAX_FLUSH_CS_DWORDS 18
#define R600_MAX_DRAW_CS_DWORDS 58
+#define R600_MAX_PFP_SYNC_ME_DWORDS 16 /* emulation: MEM_WRITE (5) + reloc (2) + WAIT_REG_MEM (7) + reloc (2) */
#define R600_MAX_USER_CONST_BUFFERS 13
#define R600_MAX_DRIVER_CONST_BUFFERS 3
void r600_begin_new_cs(struct r600_context *ctx);
void r600_flush_emit(struct r600_context *ctx);
void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in);
+void r600_emit_pfp_sync_me(struct r600_context *rctx);
void r600_cp_dma_copy_buffer(struct r600_context *rctx,
struct pipe_resource *dst, uint64_t dst_offset,
struct pipe_resource *src, uint64_t src_offset,
unsigned size);
void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
struct pipe_resource *dst, uint64_t offset,
- unsigned size, uint32_t clear_value);
+ unsigned size, uint32_t clear_value,
+ enum r600_coherency coher);
void r600_dma_copy_buffer(struct r600_context *rctx,
struct pipe_resource *dst,
struct pipe_resource *src,
#define COPY_DW_DST_IS_MEM (1 << 1)
#define PKT3_WAIT_REG_MEM 0x3C
#define WAIT_REG_MEM_EQUAL 3
+#define WAIT_REG_MEM_GEQUAL 5
+#define WAIT_REG_MEM_MEMORY (1 << 4)
+#define WAIT_REG_MEM_PFP (1 << 8)
#define PKT3_MEM_WRITE 0x3D
+#define MEM_WRITE_32_BITS (1 << 18)
#define PKT3_INDIRECT_BUFFER 0x32
+#define PKT3_PFP_SYNC_ME 0x42 /* EG+ */
#define PKT3_SURFACE_SYNC 0x43
#define PKT3_ME_INITIALIZE 0x44
#define PKT3_COND_WRITE 0x45
#define COPY_DATA_DST_SEL(x) (((unsigned)(x) & 0xf) << 8)
#define COPY_DATA_COUNT_SEL (1 << 16)
#define COPY_DATA_WR_CONFIRM (1 << 20)
-#define PKT3_PFP_SYNC_ME 0x42 /* r7xx+ */
+#define PKT3_PFP_SYNC_ME 0x42
#define PKT3_SURFACE_SYNC 0x43 /* deprecated on CIK, use ACQUIRE_MEM */
#define PKT3_ME_INITIALIZE 0x44 /* not on CIK */
#define PKT3_COND_WRITE 0x45