From 325422c49449acdd8df1eb2ca8ed81f7696c38cc Mon Sep 17 00:00:00 2001 From: Jerome Glisse Date: Mon, 7 Jan 2013 17:45:59 -0500 Subject: [PATCH] r600g: add async for staging buffer upload v2 v2: Add virtual address to dma src/dst offset for cayman Signed-off-by: Jerome Glisse --- .../drivers/r600/evergreen_hw_context.c | 46 ++++ src/gallium/drivers/r600/evergreen_state.c | 201 ++++++++++++++++++ src/gallium/drivers/r600/evergreend.h | 15 ++ src/gallium/drivers/r600/r600.h | 27 +++ src/gallium/drivers/r600/r600_buffer.c | 25 ++- src/gallium/drivers/r600/r600_hw_context.c | 48 ++++- src/gallium/drivers/r600/r600_pipe.c | 6 +- src/gallium/drivers/r600/r600_pipe.h | 9 + src/gallium/drivers/r600/r600_state.c | 190 +++++++++++++++++ src/gallium/drivers/r600/r600_state_common.c | 6 +- src/gallium/drivers/r600/r600_texture.c | 24 ++- src/gallium/drivers/r600/r600d.h | 15 ++ 12 files changed, 595 insertions(+), 17 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c index fa90c9a9b74..ca4f4b3bdf4 100644 --- a/src/gallium/drivers/r600/evergreen_hw_context.c +++ b/src/gallium/drivers/r600/evergreen_hw_context.c @@ -26,6 +26,7 @@ #include "r600_hw_context_priv.h" #include "evergreend.h" #include "util/u_memory.h" +#include "util/u_math.h" static const struct r600_reg cayman_config_reg_list[] = { {R_009100_SPI_CONFIG_CNTL, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE, 0}, @@ -238,3 +239,48 @@ void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_en r600_write_context_reg(cs, R_028B94_VGT_STRMOUT_CONFIG, S_028B94_STREAMOUT_0_EN(0)); } } + +void evergreen_dma_copy(struct r600_context *rctx, + struct pipe_resource *dst, + struct pipe_resource *src, + unsigned long dst_offset, + unsigned long src_offset, + unsigned long size) +{ + struct radeon_winsys_cs *cs = rctx->rings.dma.cs; + unsigned i, ncopy, csize, sub_cmd, shift; + struct r600_resource *rdst = (struct r600_resource*)dst; + struct r600_resource *rsrc = (struct r600_resource*)src; + + /* make sure that the dma ring is only one active */ + rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC); + dst_offset += r600_resource_va(&rctx->screen->screen, dst); + src_offset += r600_resource_va(&rctx->screen->screen, src); + + /* see if we use dword or byte copy */ + if (!(dst_offset & 0x3) && !(src_offset & 0x3) && !(size & 0x3)) { + size >>= 2; + sub_cmd = 0x00; + shift = 2; + } else { + sub_cmd = 0x40; + shift = 0; + } + ncopy = (size / 0x000fffff) + !!(size % 0x000fffff); + + r600_need_dma_space(rctx, ncopy * 5); + for (i = 0; i < ncopy; i++) { + csize = size < 0x000fffff ? size : 0x000fffff; + /* emit reloc before writting cs so that cs is always in consistent state */ + r600_context_bo_reloc(rctx, &rctx->rings.dma, rsrc, RADEON_USAGE_READ); + r600_context_bo_reloc(rctx, &rctx->rings.dma, rdst, RADEON_USAGE_WRITE); + cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize); + cs->buf[cs->cdw++] = dst_offset & 0xffffffff; + cs->buf[cs->cdw++] = src_offset & 0xffffffff; + cs->buf[cs->cdw++] = (dst_offset >> 32UL) & 0xff; + cs->buf[cs->cdw++] = (src_offset >> 32UL) & 0xff; + dst_offset += csize << shift; + src_offset += csize << shift; + size -= csize; + } +} diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index d03c376f0fc..be1c427ce29 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -30,6 +30,20 @@ #include "util/u_framebuffer.h" #include "util/u_dual_blend.h" #include "evergreen_compute.h" +#include "util/u_math.h" + +static INLINE unsigned evergreen_array_mode(unsigned mode) +{ + switch (mode) { + case RADEON_SURF_MODE_LINEAR_ALIGNED: return V_028C70_ARRAY_LINEAR_ALIGNED; + break; + case RADEON_SURF_MODE_1D: return V_028C70_ARRAY_1D_TILED_THIN1; + break; + case RADEON_SURF_MODE_2D: return V_028C70_ARRAY_2D_TILED_THIN1; + default: + case RADEON_SURF_MODE_LINEAR: return V_028C70_ARRAY_LINEAR_GENERAL; + } +} static uint32_t eg_num_banks(uint32_t nbanks) { @@ -3445,3 +3459,190 @@ void evergreen_update_db_shader_control(struct r600_context * rctx) rctx->db_misc_state.atom.dirty = true; } } + +static void evergreen_dma_copy_tile(struct r600_context *rctx, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dst_x, + unsigned dst_y, + unsigned dst_z, + struct pipe_resource *src, + unsigned src_level, + unsigned src_x, + unsigned src_y, + unsigned src_z, + unsigned copy_height, + unsigned pitch, + unsigned bpp) +{ + struct radeon_winsys_cs *cs = rctx->rings.dma.cs; + struct r600_texture *rsrc = (struct r600_texture*)src; + struct r600_texture *rdst = (struct r600_texture*)dst; + unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size; + unsigned ncopy, height, cheight, detile, i, x, y, z, src_mode, dst_mode; + unsigned sub_cmd, bank_h, bank_w, mt_aspect, nbanks, tile_split; + unsigned long base, addr; + + /* make sure that the dma ring is only one active */ + rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC); + + dst_mode = rdst->surface.level[dst_level].mode; + src_mode = rsrc->surface.level[src_level].mode; + /* downcast linear aligned to linear to simplify test */ + src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode; + dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode; + assert(dst_mode != src_mode); + + y = 0; + sub_cmd = 0x8; + lbpp = util_logbase2(bpp); + pitch_tile_max = ((pitch / bpp) >> 3) - 1; + nbanks = eg_num_banks(rctx->screen->tiling_info.num_banks); + + if (dst_mode == RADEON_SURF_MODE_LINEAR) { + /* T2L */ + array_mode = evergreen_array_mode(src_mode); + slice_tile_max = (((pitch * rsrc->surface.level[src_level].npix_y) >> 6) / bpp) - 1; + /* linear height must be the same as the slice tile max height, it's ok even + * if the linear destination/source have smaller heigh as the size of the + * dma packet will be using the copy_height which is always smaller or equal + * to the linear height + */ + height = rsrc->surface.level[src_level].npix_y; + detile = 1; + x = src_x; + y = src_y; + z = src_z; + base = rsrc->surface.level[src_level].offset; + addr = rdst->surface.level[dst_level].offset; + addr += rdst->surface.level[dst_level].slice_size * dst_z; + addr += dst_y * pitch + dst_x * bpp; + bank_h = eg_bank_wh(rsrc->surface.bankh); + bank_w = eg_bank_wh(rsrc->surface.bankw); + mt_aspect = eg_macro_tile_aspect(rsrc->surface.mtilea); + tile_split = eg_tile_split(rsrc->surface.tile_split); + base += r600_resource_va(&rctx->screen->screen, src); + addr += r600_resource_va(&rctx->screen->screen, dst); + } else { + /* L2T */ + array_mode = evergreen_array_mode(dst_mode); + slice_tile_max = (((pitch * rdst->surface.level[dst_level].npix_y) >> 6) / bpp) - 1; + /* linear height must be the same as the slice tile max height, it's ok even + * if the linear destination/source have smaller heigh as the size of the + * dma packet will be using the copy_height which is always smaller or equal + * to the linear height + */ + height = rdst->surface.level[dst_level].npix_y; + detile = 0; + x = dst_x; + y = dst_y; + z = dst_z; + base = rdst->surface.level[dst_level].offset; + addr = rsrc->surface.level[src_level].offset; + addr += rsrc->surface.level[src_level].slice_size * src_z; + addr += src_y * pitch + src_x * bpp; + bank_h = eg_bank_wh(rdst->surface.bankh); + bank_w = eg_bank_wh(rdst->surface.bankw); + mt_aspect = eg_macro_tile_aspect(rdst->surface.mtilea); + tile_split = eg_tile_split(rdst->surface.tile_split); + base += r600_resource_va(&rctx->screen->screen, dst); + addr += r600_resource_va(&rctx->screen->screen, src); + } + + size = (copy_height * pitch) >> 2; + ncopy = (size / 0x000fffff) + !!(size % 0x000fffff); + r600_need_dma_space(rctx, ncopy * 9); + + for (i = 0; i < ncopy; i++) { + cheight = copy_height; + if (((cheight * pitch) >> 2) > 0x000fffff) { + cheight = (0x000fffff << 2) / pitch; + } + size = (cheight * pitch) >> 2; + /* emit reloc before writting cs so that cs is always in consistent state */ + r600_context_bo_reloc(rctx, &rctx->rings.dma, &rsrc->resource, RADEON_USAGE_READ); + r600_context_bo_reloc(rctx, &rctx->rings.dma, &rdst->resource, RADEON_USAGE_WRITE); + cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, size); + cs->buf[cs->cdw++] = base >> 8; + cs->buf[cs->cdw++] = (detile << 31) | (array_mode << 27) | + (lbpp << 24) | (bank_h << 21) | + (bank_w << 18) | (mt_aspect << 16); + cs->buf[cs->cdw++] = (pitch_tile_max << 0) | ((height - 1) << 16); + cs->buf[cs->cdw++] = (slice_tile_max << 0); + cs->buf[cs->cdw++] = (x << 0) | (z << 18); + cs->buf[cs->cdw++] = (y << 0) | (tile_split << 21) | (nbanks << 25); + cs->buf[cs->cdw++] = addr & 0xfffffffc; + cs->buf[cs->cdw++] = (addr >> 32UL) & 0xff; + copy_height -= cheight; + addr += cheight * pitch; + y += cheight; + } +} + +boolean evergreen_dma_blit(struct pipe_context *ctx, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dst_x, unsigned dst_y, unsigned dst_z, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_texture *rsrc = (struct r600_texture*)src; + struct r600_texture *rdst = (struct r600_texture*)dst; + unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode, copy_height; + unsigned src_w, dst_w; + + if (rctx->rings.dma.cs == NULL) { + return FALSE; + } + if (src->format != dst->format) { + return FALSE; + } + + bpp = rdst->surface.bpe; + dst_pitch = rdst->surface.level[dst_level].pitch_bytes; + src_pitch = rsrc->surface.level[src_level].pitch_bytes; + src_w = rsrc->surface.level[src_level].npix_x; + dst_w = rdst->surface.level[dst_level].npix_x; + copy_height = src_box->height / rsrc->surface.blk_h; + + dst_mode = rdst->surface.level[dst_level].mode; + src_mode = rsrc->surface.level[src_level].mode; + /* downcast linear aligned to linear to simplify test */ + src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode; + dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode; + + if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w) { + /* FIXME evergreen can do partial blit */ + return FALSE; + } + /* the x test here are currently useless (because we don't support partial blit) + * but keep them around so we don't forget about those + */ + if ((src_pitch & 0x7) || (src_box->x & 0x7) || (dst_x & 0x7) || (src_box->y & 0x7) || (dst_y & 0x7)) { + return FALSE; + } + + if (src_mode == dst_mode) { + unsigned long dst_offset, src_offset; + /* simple dma blit would do NOTE code here assume : + * src_box.x/y == 0 + * dst_x/y == 0 + * dst_pitch == src_pitch + */ + src_offset= rsrc->surface.level[src_level].offset; + src_offset += rsrc->surface.level[src_level].slice_size * src_box->z; + src_offset += src_box->y * src_pitch + src_box->x * bpp; + dst_offset = rdst->surface.level[dst_level].offset; + dst_offset += rdst->surface.level[dst_level].slice_size * dst_z; + dst_offset += dst_y * dst_pitch + dst_x * bpp; + evergreen_dma_copy(rctx, dst, src, dst_offset, src_offset, + src_box->height * src_pitch); + } else { + evergreen_dma_copy_tile(rctx, dst, dst_level, dst_x, dst_y, dst_z, + src, src_level, src_box->x, src_box->y, src_box->z, + copy_height, dst_pitch, bpp); + } + return TRUE; +} diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h index d9dba959bd5..12c7ed14095 100644 --- a/src/gallium/drivers/r600/evergreend.h +++ b/src/gallium/drivers/r600/evergreend.h @@ -2317,4 +2317,19 @@ #define G_028AA8_SWITCH_ON_EOP(x) (((x) >> 17) & 0x1) #define C_028AA8_SWITCH_ON_EOP 0xFFFDFFFF +/* async DMA packets */ +#define DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xF) << 28) | \ + (((sub_cmd) & 0xFF) << 20) |\ + (((n) & 0xFFFFF) << 0)) +/* async DMA Packet types */ +#define DMA_PACKET_WRITE 0x2 +#define DMA_PACKET_COPY 0x3 +#define DMA_PACKET_INDIRECT_BUFFER 0x4 +#define DMA_PACKET_SEMAPHORE 0x5 +#define DMA_PACKET_FENCE 0x6 +#define DMA_PACKET_TRAP 0x7 +#define DMA_PACKET_SRBM_WRITE 0x9 +#define DMA_PACKET_CONSTANT_FILL 0xd +#define DMA_PACKET_NOP 0xf + #endif diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h index 2734594987e..a383c908f64 100644 --- a/src/gallium/drivers/r600/r600.h +++ b/src/gallium/drivers/r600/r600.h @@ -170,6 +170,33 @@ void r600_flush_emit(struct r600_context *ctx); void r600_context_streamout_begin(struct r600_context *ctx); void r600_context_streamout_end(struct r600_context *ctx); void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in); +void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw); +void r600_dma_copy(struct r600_context *rctx, + struct pipe_resource *dst, + struct pipe_resource *src, + unsigned long dst_offset, + unsigned long src_offset, + unsigned long size); +boolean r600_dma_blit(struct pipe_context *ctx, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dst_x, unsigned dst_y, unsigned dst_z, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box); +void evergreen_dma_copy(struct r600_context *rctx, + struct pipe_resource *dst, + struct pipe_resource *src, + unsigned long dst_offset, + unsigned long src_offset, + unsigned long size); +boolean evergreen_dma_blit(struct pipe_context *ctx, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dst_x, unsigned dst_y, unsigned dst_z, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box); void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block, unsigned pkt_flags); void r600_cp_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t dst_offset, diff --git a/src/gallium/drivers/r600/r600_buffer.c b/src/gallium/drivers/r600/r600_buffer.c index be171f8850a..6df0d91a56c 100644 --- a/src/gallium/drivers/r600/r600_buffer.c +++ b/src/gallium/drivers/r600/r600_buffer.c @@ -27,6 +27,7 @@ #include "r600_pipe.h" #include "util/u_upload_mgr.h" #include "util/u_memory.h" +#include "util/u_surface.h" static void r600_buffer_destroy(struct pipe_screen *screen, struct pipe_resource *buf) @@ -179,13 +180,27 @@ static void r600_buffer_transfer_unmap(struct pipe_context *pipe, struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; if (rtransfer->staging) { - struct pipe_box box; - u_box_1d(rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT, - transfer->box.width, &box); + struct pipe_resource *dst, *src; + unsigned soffset, doffset, size; + dst = transfer->resource; + src = &rtransfer->staging->b.b; + size = transfer->box.width; + doffset = transfer->box.x; + soffset = rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT; /* Copy the staging buffer into the original one. */ - r600_copy_buffer(pipe, transfer->resource, transfer->box.x, - &rtransfer->staging->b.b, &box); + if (rctx->rings.dma.cs && !(size % 4) && !(doffset % 4) && !(soffset)) { + if (rctx->screen->chip_class >= EVERGREEN) { + evergreen_dma_copy(rctx, dst, src, doffset, soffset, size); + } else { + r600_dma_copy(rctx, dst, src, doffset, soffset, size); + } + } else { + struct pipe_box box; + + u_box_1d(soffset, size, &box); + r600_copy_buffer(pipe, dst, doffset, src, &box); + } pipe_resource_reference((struct pipe_resource**)&rtransfer->staging, NULL); } util_slab_free(&rctx->pool_transfers, transfer); diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c index b28827f2b29..61299ef40fb 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -762,8 +762,6 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags) } } #endif - - r600_begin_new_cs(ctx); } void r600_begin_new_cs(struct r600_context *ctx) @@ -1129,3 +1127,49 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx, dst_offset += byte_count; } } + +void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw) +{ + /* The number of dwords we already used in the DMA so far. */ + num_dw += ctx->rings.dma.cs->cdw; + /* Flush if there's not enough space. */ + if (num_dw > RADEON_MAX_CMDBUF_DWORDS) { + ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC); + } +} + +void r600_dma_copy(struct r600_context *rctx, + struct pipe_resource *dst, + struct pipe_resource *src, + unsigned long dst_offset, + unsigned long src_offset, + unsigned long size) +{ + struct radeon_winsys_cs *cs = rctx->rings.dma.cs; + unsigned i, ncopy, csize, shift; + struct r600_resource *rdst = (struct r600_resource*)dst; + struct r600_resource *rsrc = (struct r600_resource*)src; + + /* make sure that the dma ring is only one active */ + rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC); + + size >>= 2; + shift = 2; + ncopy = (size / 0xffff) + !!(size % 0xffff); + + r600_need_dma_space(rctx, ncopy * 5); + for (i = 0; i < ncopy; i++) { + csize = size < 0xffff ? size : 0xffff; + /* emit reloc before writting cs so that cs is always in consistent state */ + r600_context_bo_reloc(rctx, &rctx->rings.dma, rsrc, RADEON_USAGE_READ); + r600_context_bo_reloc(rctx, &rctx->rings.dma, rdst, RADEON_USAGE_WRITE); + cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize); + cs->buf[cs->cdw++] = dst_offset & 0xfffffffc; + cs->buf[cs->cdw++] = src_offset & 0xfffffffc; + cs->buf[cs->cdw++] = (dst_offset >> 32UL) & 0xff; + cs->buf[cs->cdw++] = (src_offset >> 32UL) & 0xff; + dst_offset += csize << shift; + src_offset += csize << shift; + size -= csize; + } +} diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index c72ee8f8571..676741255f5 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -30,6 +30,7 @@ #include "util/u_memory.h" #include "util/u_simple_shaders.h" #include "util/u_upload_mgr.h" +#include "util/u_math.h" #include "vl/vl_decoder.h" #include "vl/vl_video_buffer.h" #include "os/os_time.h" @@ -128,12 +129,13 @@ static void r600_flush(struct pipe_context *ctx, unsigned flags) } r600_context_flush(rctx, flags); + rctx->rings.gfx.flushing = false; + r600_begin_new_cs(rctx); /* Re-enable render condition. */ if (render_cond) { ctx->render_condition(ctx, render_cond, render_cond_mode); } - rctx->rings.gfx.flushing = false; } static void r600_flush_from_st(struct pipe_context *ctx, @@ -1111,8 +1113,10 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws) if (rscreen->chip_class >= EVERGREEN) { rscreen->screen.is_format_supported = evergreen_is_format_supported; + rscreen->dma_blit = &evergreen_dma_blit; } else { rscreen->screen.is_format_supported = r600_is_format_supported; + rscreen->dma_blit = &r600_dma_blit; } rscreen->screen.is_video_format_supported = vl_video_buffer_is_format_supported; rscreen->screen.context_create = r600_create_context; diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 5cb080579f0..31dcd057265 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -220,6 +220,14 @@ enum r600_msaa_texture_mode { MSAA_TEXTURE_COMPRESSED }; +typedef boolean (*r600g_dma_blit_t)(struct pipe_context *ctx, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dst_x, unsigned dst_y, unsigned dst_z, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box); + struct r600_screen { struct pipe_screen screen; struct radeon_winsys *ws; @@ -243,6 +251,7 @@ struct r600_screen { uint32_t *trace_ptr; unsigned cs_count; #endif + r600g_dma_blit_t dma_blit; }; struct r600_pipe_sampler_view { diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index da249c247bc..6b4b2c3819a 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -2945,3 +2945,193 @@ void r600_update_db_shader_control(struct r600_context * rctx) rctx->db_misc_state.atom.dirty = true; } } + +static INLINE unsigned r600_array_mode(unsigned mode) +{ + switch (mode) { + case RADEON_SURF_MODE_LINEAR_ALIGNED: return V_0280A0_ARRAY_LINEAR_ALIGNED; + break; + case RADEON_SURF_MODE_1D: return V_0280A0_ARRAY_1D_TILED_THIN1; + break; + case RADEON_SURF_MODE_2D: return V_0280A0_ARRAY_2D_TILED_THIN1; + default: + case RADEON_SURF_MODE_LINEAR: return V_0280A0_ARRAY_LINEAR_GENERAL; + } +} + +static boolean r600_dma_copy_tile(struct r600_context *rctx, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dst_x, + unsigned dst_y, + unsigned dst_z, + struct pipe_resource *src, + unsigned src_level, + unsigned src_x, + unsigned src_y, + unsigned src_z, + unsigned copy_height, + unsigned pitch, + unsigned bpp) +{ + struct radeon_winsys_cs *cs = rctx->rings.dma.cs; + struct r600_texture *rsrc = (struct r600_texture*)src; + struct r600_texture *rdst = (struct r600_texture*)dst; + unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size; + unsigned ncopy, height, cheight, detile, i, x, y, z, src_mode, dst_mode; + unsigned long base, addr; + + /* make sure that the dma ring is only one active */ + rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC); + + dst_mode = rdst->surface.level[dst_level].mode; + src_mode = rsrc->surface.level[src_level].mode; + /* downcast linear aligned to linear to simplify test */ + src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode; + dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode; + assert(dst_mode != src_mode); + + y = 0; + lbpp = util_logbase2(bpp); + pitch_tile_max = ((pitch / bpp) >> 3) - 1; + + if (dst_mode == RADEON_SURF_MODE_LINEAR) { + /* T2L */ + array_mode = r600_array_mode(src_mode); + slice_tile_max = (((pitch * rsrc->surface.level[src_level].npix_y) >> 6) / bpp) - 1; + /* linear height must be the same as the slice tile max height, it's ok even + * if the linear destination/source have smaller heigh as the size of the + * dma packet will be using the copy_height which is always smaller or equal + * to the linear height + */ + height = rsrc->surface.level[src_level].npix_y; + detile = 1; + x = src_x; + y = src_y; + z = src_z; + base = rsrc->surface.level[src_level].offset; + addr = rdst->surface.level[dst_level].offset; + addr += rdst->surface.level[dst_level].slice_size * dst_z; + addr += dst_y * pitch + dst_x * bpp; + } else { + /* L2T */ + array_mode = r600_array_mode(dst_mode); + slice_tile_max = (((pitch * rdst->surface.level[dst_level].npix_y) >> 6) / bpp) - 1; + /* linear height must be the same as the slice tile max height, it's ok even + * if the linear destination/source have smaller heigh as the size of the + * dma packet will be using the copy_height which is always smaller or equal + * to the linear height + */ + height = rdst->surface.level[dst_level].npix_y; + detile = 0; + x = dst_x; + y = dst_y; + z = dst_z; + base = rdst->surface.level[dst_level].offset; + addr = rsrc->surface.level[src_level].offset; + addr += rsrc->surface.level[src_level].slice_size * src_z; + addr += src_y * pitch + src_x * bpp; + } + /* check that we are in dw/base alignment constraint */ + if ((addr & 0x3) || (base & 0xff)) { + return FALSE; + } + + size = (copy_height * pitch) >> 2; + ncopy = (size / 0x0000ffff) + !!(size % 0x0000ffff); + r600_need_dma_space(rctx, ncopy * 7); + for (i = 0; i < ncopy; i++) { + cheight = copy_height; + if (((cheight * pitch) >> 2) > 0x0000ffff) { + cheight = (0x0000ffff << 2) / pitch; + } + size = (cheight * pitch) >> 2; + /* emit reloc before writting cs so that cs is always in consistent state */ + r600_context_bo_reloc(rctx, &rctx->rings.dma, &rsrc->resource, RADEON_USAGE_READ); + r600_context_bo_reloc(rctx, &rctx->rings.dma, &rdst->resource, RADEON_USAGE_WRITE); + cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 1, 0, size); + cs->buf[cs->cdw++] = base >> 8; + cs->buf[cs->cdw++] = (detile << 31) | (array_mode << 27) | + (lbpp << 24) | ((height - 1) << 10) | + pitch_tile_max; + cs->buf[cs->cdw++] = (slice_tile_max << 12) | (z << 0); + cs->buf[cs->cdw++] = (x << 3) | (y << 17); + cs->buf[cs->cdw++] = addr & 0xfffffffc; + cs->buf[cs->cdw++] = (addr >> 32UL) & 0xff; + copy_height -= cheight; + addr += cheight * pitch; + y += cheight; + } + return TRUE; +} + +boolean r600_dma_blit(struct pipe_context *ctx, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dst_x, unsigned dst_y, unsigned dst_z, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_texture *rsrc = (struct r600_texture*)src; + struct r600_texture *rdst = (struct r600_texture*)dst; + unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode, copy_height; + unsigned src_w, dst_w; + + if (rctx->rings.dma.cs == NULL) { + return FALSE; + } + if (src->format != dst->format) { + return FALSE; + } + + bpp = rdst->surface.bpe; + dst_pitch = rdst->surface.level[dst_level].pitch_bytes; + src_pitch = rsrc->surface.level[src_level].pitch_bytes; + src_w = rsrc->surface.level[src_level].npix_x; + dst_w = rdst->surface.level[dst_level].npix_x; + copy_height = src_box->height / rsrc->surface.blk_h; + + dst_mode = rdst->surface.level[dst_level].mode; + src_mode = rsrc->surface.level[src_level].mode; + /* downcast linear aligned to linear to simplify test */ + src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode; + dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode; + + if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w) { + /* strick requirement on r6xx/r7xx */ + return FALSE; + } + /* lot of constraint on alignment this should capture them all */ + if ((src_pitch & 0x7) || (src_box->y & 0x7) || (dst_y & 0x7)) { + return FALSE; + } + + if (src_mode == dst_mode) { + unsigned long dst_offset, src_offset, size; + + /* simple dma blit would do NOTE code here assume : + * src_box.x/y == 0 + * dst_x/y == 0 + * dst_pitch == src_pitch + */ + src_offset= rsrc->surface.level[src_level].offset; + src_offset += rsrc->surface.level[src_level].slice_size * src_box->z; + src_offset += src_box->y * src_pitch + src_box->x * bpp; + dst_offset = rdst->surface.level[dst_level].offset; + dst_offset += rdst->surface.level[dst_level].slice_size * dst_z; + dst_offset += dst_y * dst_pitch + dst_x * bpp; + size = src_box->height * src_pitch; + /* must be dw aligned */ + if ((dst_offset & 0x3) || (src_offset & 0x3) || (size & 0x3)) { + return FALSE; + } + r600_dma_copy(rctx, dst, src, dst_offset, src_offset, size); + } else { + return r600_dma_copy_tile(rctx, dst, dst_level, dst_x, dst_y, dst_z, + src, src_level, src_box->x, src_box->y, src_box->z, + copy_height, dst_pitch, bpp); + } + return TRUE; +} diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index c7f672f748c..b547d647cba 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -1273,6 +1273,9 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info return; } + /* make sure that the gfx ring is only one active */ + rctx->rings.dma.flush(rctx, RADEON_FLUSH_ASYNC); + if (!r600_update_derived_state(rctx)) { /* useless to render because current rendering command * can't be achieved @@ -1280,9 +1283,6 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info return; } - /* make sure that the gfx ring is only one active */ - rctx->rings.dma.flush(rctx, RADEON_FLUSH_ASYNC); - if (info.indexed) { /* Initialize the index buffer struct. */ pipe_resource_reference(&ib.buffer, rctx->index_buffer.buffer); diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c index b0c55145f1f..96c3729bc08 100644 --- a/src/gallium/drivers/r600/r600_texture.c +++ b/src/gallium/drivers/r600/r600_texture.c @@ -35,13 +35,19 @@ /* Copy from a full GPU texture to a transfer's staging one. */ static void r600_copy_to_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer) { + struct r600_context *rctx = (struct r600_context*)ctx; struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer; struct pipe_resource *dst = &rtransfer->staging->b.b; struct pipe_resource *src = transfer->resource; if (src->nr_samples <= 1) { - ctx->resource_copy_region(ctx, dst, 0, 0, 0, 0, - src, transfer->level, &transfer->box); + if (!rctx->screen->dma_blit(ctx, dst, 0, 0, 0, 0, + src, transfer->level, + &transfer->box)) { + /* async dma could not be use */ + ctx->resource_copy_region(ctx, dst, 0, 0, 0, 0, + src, transfer->level, &transfer->box); + } } else { /* Resolve the resource. */ struct pipe_blit_info blit; @@ -66,16 +72,22 @@ static void r600_copy_to_staging_texture(struct pipe_context *ctx, struct r600_t /* Copy from a transfer's staging texture to a full GPU one. */ static void r600_copy_from_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer) { + struct r600_context *rctx = (struct r600_context*)ctx; struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer; struct pipe_resource *texture = transfer->resource; struct pipe_box sbox; u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox); - ctx->resource_copy_region(ctx, texture, transfer->level, - transfer->box.x, transfer->box.y, transfer->box.z, - &rtransfer->staging->b.b, - 0, &sbox); + if (!rctx->screen->dma_blit(ctx, texture, transfer->level, + transfer->box.x, transfer->box.y, transfer->box.z, + &rtransfer->staging->b.b, 0, &sbox)) { + /* async dma could not be use */ + ctx->resource_copy_region(ctx, texture, transfer->level, + transfer->box.x, transfer->box.y, transfer->box.z, + &rtransfer->staging->b.b, + 0, &sbox); + } } unsigned r600_texture_get_offset(struct r600_texture *rtex, diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h index dd64aca3d51..621e7a10e78 100644 --- a/src/gallium/drivers/r600/r600d.h +++ b/src/gallium/drivers/r600/r600d.h @@ -3681,4 +3681,19 @@ #define SQ_TEX_INST_SAMPLE_C_G_LB 0x1E #define SQ_TEX_INST_SAMPLE_C_G_LZ 0x1F +/* async DMA packets */ +#define DMA_PACKET(cmd, t, s, n) ((((cmd) & 0xF) << 28) | \ + (((t) & 0x1) << 23) | \ + (((s) & 0x1) << 22) | \ + (((n) & 0xFFFF) << 0)) +/* async DMA Packet types */ +#define DMA_PACKET_WRITE 0x2 +#define DMA_PACKET_COPY 0x3 +#define DMA_PACKET_INDIRECT_BUFFER 0x4 +#define DMA_PACKET_SEMAPHORE 0x5 +#define DMA_PACKET_FENCE 0x6 +#define DMA_PACKET_TRAP 0x7 +#define DMA_PACKET_CONSTANT_FILL 0xd /* 7xx only */ +#define DMA_PACKET_NOP 0xf + #endif -- 2.30.2