From: Marek Olšák Date: Sat, 22 Dec 2012 18:33:47 +0000 (+0100) Subject: r600g: implement buffer copying using CP DMA for R7xx, Evergreen, Cayman X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=a70e5e2b94194da3f4102a9f8e3c8ed5ca6dd8b8;p=mesa.git r600g: implement buffer copying using CP DMA for R7xx, Evergreen, Cayman R6xx doesn't work - the issue seems to be with flushing (sometimes the destination buffer contains garbage). There are no hangs, so we're good. R7xx doesn't seem to have any alignment restriction despite our initial thinking. Everything just works. Reviewed-by: Alex Deucher --- diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c index 8d3050beff0..ed5055b950f 100644 --- a/src/gallium/drivers/r600/evergreen_compute.c +++ b/src/gallium/drivers/r600/evergreen_compute.c @@ -329,7 +329,7 @@ static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout, */ r600_emit_command_buffer(ctx->cs, &ctx->start_compute_cs_cmd); - ctx->flags |= R600_CONTEXT_WAIT_IDLE | R600_CONTEXT_FLUSH_AND_INV; + ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; r600_flush_emit(ctx); /* Emit colorbuffers. 
*/ diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 7635f867210..d0402c219fb 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -1570,14 +1570,14 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx, uint32_t i, log_samples; if (rctx->framebuffer.state.nr_cbufs) { - rctx->flags |= R600_CONTEXT_WAIT_IDLE | R600_CONTEXT_FLUSH_AND_INV; + rctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; if (rctx->framebuffer.state.cbufs[0]->texture->nr_samples > 1) { rctx->flags |= R600_CONTEXT_FLUSH_AND_INV_CB_META; } } if (rctx->framebuffer.state.zsbuf) { - rctx->flags |= R600_CONTEXT_WAIT_IDLE | R600_CONTEXT_FLUSH_AND_INV; + rctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; } util_copy_framebuffer_state(&rctx->framebuffer.state, state); diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h index 260536ecb44..93604fbe7b5 100644 --- a/src/gallium/drivers/r600/r600.h +++ b/src/gallium/drivers/r600/r600.h @@ -147,9 +147,10 @@ struct r600_so_target { #define R600_CONTEXT_INVAL_READ_CACHES (1 << 0) #define R600_CONTEXT_STREAMOUT_FLUSH (1 << 1) -#define R600_CONTEXT_WAIT_IDLE (1 << 2) -#define R600_CONTEXT_FLUSH_AND_INV (1 << 3) -#define R600_CONTEXT_FLUSH_AND_INV_CB_META (1 << 4) +#define R600_CONTEXT_WAIT_3D_IDLE (1 << 2) +#define R600_CONTEXT_WAIT_CP_DMA_IDLE (1 << 3) +#define R600_CONTEXT_FLUSH_AND_INV (1 << 4) +#define R600_CONTEXT_FLUSH_AND_INV_CB_META (1 << 5) struct r600_context; struct r600_screen; @@ -170,6 +171,10 @@ void r600_context_streamout_begin(struct r600_context *ctx); void r600_context_streamout_end(struct r600_context *ctx); void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in); void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block, unsigned pkt_flags); +void r600_cp_dma_copy_buffer(struct r600_context 
*rctx, + struct pipe_resource *dst, unsigned dst_offset, + struct pipe_resource *src, unsigned src_offset, + unsigned size); int evergreen_context_init(struct r600_context *ctx); diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c index b348aa728b0..c4ce7f7652b 100644 --- a/src/gallium/drivers/r600/r600_blit.c +++ b/src/gallium/drivers/r600/r600_blit.c @@ -503,15 +503,18 @@ static void r600_clear_depth_stencil(struct pipe_context *ctx, r600_blitter_end(ctx); } -void r600_copy_buffer(struct pipe_context *ctx, struct - pipe_resource *dst, unsigned dstx, +void r600_copy_buffer(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dstx, struct pipe_resource *src, const struct pipe_box *src_box) { struct r600_context *rctx = (struct r600_context*)ctx; - if (rctx->screen->has_streamout && - /* Require dword alignment. */ - dstx % 4 == 0 && src_box->x % 4 == 0 && src_box->width % 4 == 0) { + /* CP DMA doesn't work on R600 (flushing seems to be unreliable). */ + if (rctx->screen->info.drm_minor >= 27 && rctx->chip_class >= R700) { + r600_cp_dma_copy_buffer(rctx, dst, dstx, src, src_box->x, src_box->width); + } + else if (rctx->screen->has_streamout && + /* Require 4-byte alignment. 
*/ + dstx % 4 == 0 && src_box->x % 4 == 0 && src_box->width % 4 == 0) { r600_blitter_begin(ctx, R600_COPY_BUFFER); util_blitter_copy_buffer(rctx->blitter, dst, dstx, src, src_box->x, src_box->width); r600_blitter_end(ctx); diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c index 1506b393ce1..caebf5c7a54 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -434,7 +434,7 @@ void r600_context_dirty_block(struct r600_context *ctx, LIST_ADDTAIL(&block->list,&ctx->dirty); if (block->flags & REG_FLAG_FLUSH_CHANGE) { - ctx->flags |= R600_CONTEXT_WAIT_IDLE; + ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE; } } } @@ -606,6 +606,7 @@ void r600_flush_emit(struct r600_context *rctx) { struct radeon_winsys_cs *cs = rctx->cs; unsigned cp_coher_cntl = 0; + unsigned wait_until = 0; unsigned emit_flush = 0; if (!rctx->flags) { @@ -674,9 +675,15 @@ void r600_flush_emit(struct r600_context *rctx) cs->buf[cs->cdw++] = 0x0000000A; /* POLL_INTERVAL */ } - if (rctx->flags & R600_CONTEXT_WAIT_IDLE) { + if (rctx->flags & R600_CONTEXT_WAIT_3D_IDLE) { + wait_until |= S_008040_WAIT_3D_IDLE(1); + } + if (rctx->flags & R600_CONTEXT_WAIT_CP_DMA_IDLE) { + wait_until |= S_008040_WAIT_CP_DMA_IDLE(1); + } + if (wait_until) { /* wait for things to settle */ - r600_write_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_3D_IDLE(1)); + r600_write_config_reg(cs, R_008040_WAIT_UNTIL, wait_until); } /* everything is properly flushed */ @@ -709,7 +716,8 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags) */ ctx->flags |= R600_CONTEXT_FLUSH_AND_INV | R600_CONTEXT_FLUSH_AND_INV_CB_META | - R600_CONTEXT_WAIT_IDLE; + R600_CONTEXT_WAIT_3D_IDLE | + R600_CONTEXT_WAIT_CP_DMA_IDLE; r600_flush_emit(ctx); @@ -1049,6 +1057,73 @@ void r600_context_streamout_end(struct r600_context *ctx) } r600_set_streamout_enable(ctx, 0); } - ctx->flags |= R600_CONTEXT_WAIT_IDLE | R600_CONTEXT_FLUSH_AND_INV; + ctx->flags 
|= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; ctx->num_cs_dw_streamout_end = 0; } + +/* The max number of bytes to copy per packet. */ +#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8) + +void r600_cp_dma_copy_buffer(struct r600_context *rctx, + struct pipe_resource *dst, unsigned dst_offset, + struct pipe_resource *src, unsigned src_offset, + unsigned size) +{ + struct radeon_winsys_cs *cs = rctx->cs; + + assert(size); + assert(rctx->chip_class != R600); + + /* CP DMA doesn't work on R600 (flushing seems to be unreliable). */ + if (rctx->chip_class == R600) { + return; + } + + /* We flush the caches, because we might read from or write + * to resources which are bound right now. */ + rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES | + R600_CONTEXT_FLUSH_AND_INV | + R600_CONTEXT_FLUSH_AND_INV_CB_META | + R600_CONTEXT_STREAMOUT_FLUSH | + R600_CONTEXT_WAIT_3D_IDLE; + + /* There are differences between R700 and EG in CP DMA, + * but we only use the common bits here. */ + while (size) { + unsigned sync = 0; + unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); + unsigned src_reloc, dst_reloc; + + r600_need_cs_space(rctx, 10 + (rctx->flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE); + + /* Flush the caches for the first copy only. */ + if (rctx->flags) { + r600_flush_emit(rctx); + } + + /* Do the synchronization after the last copy, so that all data is written to memory. */ + if (size == byte_count) { + sync = PKT3_CP_DMA_CP_SYNC; + } + + /* This must be done after r600_need_cs_space. 
*/ + src_reloc = r600_context_bo_reloc(rctx, (struct r600_resource*)src, RADEON_USAGE_READ); + dst_reloc = r600_context_bo_reloc(rctx, (struct r600_resource*)dst, RADEON_USAGE_WRITE); + + r600_write_value(cs, PKT3(PKT3_CP_DMA, 4, 0)); + r600_write_value(cs, src_offset); /* SRC_ADDR_LO [31:0] */ + r600_write_value(cs, sync); /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */ + r600_write_value(cs, dst_offset); /* DST_ADDR_LO [31:0] */ + r600_write_value(cs, 0); /* DST_ADDR_HI [7:0] */ + r600_write_value(cs, byte_count); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + + r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); + r600_write_value(cs, src_reloc); + r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); + r600_write_value(cs, dst_reloc); + + size -= byte_count; + src_offset += byte_count; + dst_offset += byte_count; + } +} diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 934a6f547c0..5d22c93cf7d 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -598,8 +598,7 @@ void evergreen_init_color_surface_rat(struct r600_context *rctx, void evergreen_update_db_shader_control(struct r600_context * rctx); /* r600_blit.c */ -void r600_copy_buffer(struct pipe_context *ctx, struct - pipe_resource *dst, unsigned dstx, +void r600_copy_buffer(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dstx, struct pipe_resource *src, const struct pipe_box *src_box); void r600_init_blit_functions(struct r600_context *rctx); void r600_blit_decompress_depth(struct pipe_context *ctx, diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index 0cfc4e4ee68..e2d0f7544c1 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -1465,7 +1465,7 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx, unsigned i; if (rctx->framebuffer.state.nr_cbufs) { - rctx->flags |= R600_CONTEXT_WAIT_IDLE | R600_CONTEXT_FLUSH_AND_INV; + rctx->flags |= 
R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; if (rctx->chip_class >= R700 && rctx->framebuffer.state.cbufs[0]->texture->nr_samples > 1) { @@ -1473,7 +1473,7 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx, } } if (rctx->framebuffer.state.zsbuf) { - rctx->flags |= R600_CONTEXT_WAIT_IDLE | R600_CONTEXT_FLUSH_AND_INV; + rctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV; } /* Set the new state. */ @@ -2299,7 +2299,7 @@ bool r600_adjust_gprs(struct r600_context *rctx) if (rctx->config_state.sq_gpr_resource_mgmt_1 != tmp) { rctx->config_state.sq_gpr_resource_mgmt_1 = tmp; rctx->config_state.atom.dirty = true; - rctx->flags |= R600_CONTEXT_WAIT_IDLE; + rctx->flags |= R600_CONTEXT_WAIT_3D_IDLE; } return true; } diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index bbcfc4f5062..3b61413f84e 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -88,7 +88,7 @@ static void r600_texture_barrier(struct pipe_context *ctx) { struct r600_context *rctx = (struct r600_context *)ctx; - rctx->flags |= R600_CONTEXT_WAIT_IDLE; + rctx->flags |= R600_CONTEXT_WAIT_3D_IDLE; rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES; rctx->flags |= R600_CONTEXT_FLUSH_AND_INV; } @@ -357,7 +357,7 @@ void r600_sampler_states_dirty(struct r600_context *rctx, { if (state->dirty_mask) { if (state->dirty_mask & state->has_bordercolor_mask) { - rctx->flags |= R600_CONTEXT_WAIT_IDLE; + rctx->flags |= R600_CONTEXT_WAIT_3D_IDLE; } state->atom.num_dw = util_bitcount(state->dirty_mask & state->has_bordercolor_mask) * 11 + @@ -420,7 +420,7 @@ static void r600_bind_sampler_states(struct pipe_context *pipe, seamless_cube_map != -1 && seamless_cube_map != rctx->seamless_cube_map.enabled) { /* change in TA_CNTL_AUX need a pipeline flush */ - rctx->flags |= R600_CONTEXT_WAIT_IDLE; + rctx->flags |= R600_CONTEXT_WAIT_3D_IDLE; rctx->seamless_cube_map.enabled = 
seamless_cube_map; rctx->seamless_cube_map.atom.dirty = true; } diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h index 69bfd7a2f87..dd64aca3d51 100644 --- a/src/gallium/drivers/r600/r600d.h +++ b/src/gallium/drivers/r600/r600d.h @@ -159,6 +159,40 @@ #define PKT3_PRED_S(x) (((x) >> 0) & 0x1) #define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count)) +#define PKT3_CP_DMA 0x41 +/* 1. header + * 2. SRC_ADDR_LO [31:0] + * 3. CP_SYNC [31] | SRC_ADDR_HI [7:0] + * 4. DST_ADDR_LO [31:0] + * 5. DST_ADDR_HI [7:0] + * 6. COMMAND [29:22] | BYTE_COUNT [20:0] + */ +#define PKT3_CP_DMA_CP_SYNC (1u << 31) /* unsigned literal: shifting a signed 1 into bit 31 is undefined behavior */ +/* COMMAND */ +#define PKT3_CP_DMA_CMD_SRC_SWAP(x) ((x) << 23) +/* 0 - none + * 1 - 8 in 16 + * 2 - 8 in 32 + * 3 - 8 in 64 + */ +#define PKT3_CP_DMA_CMD_DST_SWAP(x) ((x) << 24) +/* 0 - none + * 1 - 8 in 16 + * 2 - 8 in 32 + * 3 - 8 in 64 + */ +#define PKT3_CP_DMA_CMD_SAS (1 << 26) +/* 0 - memory + * 1 - register + */ +#define PKT3_CP_DMA_CMD_DAS (1 << 27) +/* 0 - memory + * 1 - register + */ +#define PKT3_CP_DMA_CMD_SAIC (1 << 28) +#define PKT3_CP_DMA_CMD_DAIC (1 << 29) + + /* Registers */ #define R_008490_CP_STRMOUT_CNTL 0x008490 #define S_008490_OFFSET_UPDATE_DONE(x) (((x) & 0x1) << 0)