X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_cp_dma.c;h=f98fad43b3e3389f9a81320b710fe6e9df991834;hb=ce8e6b970b5bb765630bfffce67545b63542b78a;hp=cd88d38ccede76ed2960f38e7514308ccfaf332a;hpb=fa09388704fd413c0d53397b46293af03d73ddb6;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index cd88d38cced..f98fad43b3e 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -1,5 +1,6 @@ /* * Copyright 2013 Advanced Micro Devices, Inc. + * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,7 +24,6 @@ #include "si_pipe.h" #include "sid.h" -#include "radeon/r600_cs.h" /* Recommended maximum sizes for optimal performance. * Fall back to compute or SDMA if the size is greater. @@ -45,7 +45,7 @@ /* The max number of bytes that can be copied per packet. */ static inline unsigned cp_dma_max_byte_count(struct si_context *sctx) { - unsigned max = sctx->b.chip_class >= GFX9 ? + unsigned max = sctx->chip_class >= GFX9 ? S_414_BYTE_COUNT_GFX9(~0u) : S_414_BYTE_COUNT_GFX6(~0u); @@ -60,15 +60,14 @@ static inline unsigned cp_dma_max_byte_count(struct si_context *sctx) */ static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va, uint64_t src_va, unsigned size, unsigned flags, - enum r600_coherency coher) + enum si_coherency coher) { - struct radeon_winsys_cs *cs = sctx->b.gfx.cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; uint32_t header = 0, command = 0; - assert(size); assert(size <= cp_dma_max_byte_count(sctx)); - if (sctx->b.chip_class >= GFX9) + if (sctx->chip_class >= GFX9) command |= S_414_BYTE_COUNT_GFX9(size); else command |= S_414_BYTE_COUNT_GFX6(size); @@ -77,7 +76,7 @@ static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va, if (flags & CP_DMA_SYNC) header |= S_411_CP_SYNC(1); else { - if (sctx->b.chip_class >= GFX9) + if (sctx->chip_class >= GFX9) command |= S_414_DISABLE_WR_CONFIRM_GFX9(1); else command |= S_414_DISABLE_WR_CONFIRM_GFX6(1); @@ -87,18 +86,18 @@ static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va, command |= S_414_RAW_WAIT(1); /* Src and dst flags. */ - if (sctx->b.chip_class >= GFX9 && !(flags & CP_DMA_CLEAR) && + if (sctx->chip_class >= GFX9 && !(flags & CP_DMA_CLEAR) && src_va == dst_va) - header |= S_411_DSL_SEL(V_411_NOWHERE); /* prefetch only */ + header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */ else if (flags & CP_DMA_USE_L2) - header |= S_411_DSL_SEL(V_411_DST_ADDR_TC_L2); + header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2); if (flags & CP_DMA_CLEAR) header |= S_411_SRC_SEL(V_411_DATA); else if (flags & CP_DMA_USE_L2) header |= S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2); - if (sctx->b.chip_class >= CIK) { + if (sctx->chip_class >= CIK) { radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); radeon_emit(cs, header); radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ @@ -122,31 +121,42 @@ static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va, * indices. If we wanted to execute CP DMA in PFP, this packet * should precede it. 
*/ - if (coher == R600_COHERENCY_SHADER && flags & CP_DMA_SYNC) { + if (coher == SI_COHERENCY_SHADER && flags & CP_DMA_SYNC) { radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); radeon_emit(cs, 0); } } -static unsigned get_flush_flags(struct si_context *sctx, enum r600_coherency coher) +void si_cp_dma_wait_for_idle(struct si_context *sctx) +{ + /* Issue a dummy DMA that copies zero bytes. + * + * The DMA engine will see that there's no work to do and skip this + * DMA request, however, the CP will see the sync flag and still wait + * for all DMAs to complete. + */ + si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, SI_COHERENCY_NONE); +} + +static unsigned get_flush_flags(struct si_context *sctx, enum si_coherency coher) { switch (coher) { default: - case R600_COHERENCY_NONE: + case SI_COHERENCY_NONE: return 0; - case R600_COHERENCY_SHADER: + case SI_COHERENCY_SHADER: return SI_CONTEXT_INV_SMEM_L1 | SI_CONTEXT_INV_VMEM_L1 | - (sctx->b.chip_class == SI ? SI_CONTEXT_INV_GLOBAL_L2 : 0); - case R600_COHERENCY_CB_META: + (sctx->chip_class == SI ? SI_CONTEXT_INV_GLOBAL_L2 : 0); + case SI_COHERENCY_CB_META: return SI_CONTEXT_FLUSH_AND_INV_CB; } } -static unsigned get_tc_l2_flag(struct si_context *sctx, enum r600_coherency coher) +static unsigned get_tc_l2_flag(struct si_context *sctx, enum si_coherency coher) { - if ((sctx->b.chip_class >= GFX9 && coher == R600_COHERENCY_CB_META) || - (sctx->b.chip_class >= CIK && coher == R600_COHERENCY_SHADER)) + if ((sctx->chip_class >= GFX9 && coher == SI_COHERENCY_CB_META) || + (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER)) return CP_DMA_USE_L2; return 0; @@ -165,9 +175,9 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) { /* Count memory usage in so that need_cs_space can take it into account. */ - si_context_add_resource_size(&sctx->b.b, dst); + si_context_add_resource_size(sctx, dst); if (src) - si_context_add_resource_size(&sctx->b.b, src); + si_context_add_resource_size(sctx, src); } if (!(user_flags & SI_CPDMA_SKIP_CHECK_CS_SPACE)) @@ -175,19 +185,19 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst /* This must be done after need_cs_space. */ if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - (struct r600_resource*)dst, + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + r600_resource(dst), RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); if (src) - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - (struct r600_resource*)src, + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + r600_resource(src), RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); } /* Flush the caches for the first copy only. * Also wait for the previous CP DMA operations. 
*/ - if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC) && sctx->b.flags) + if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC) && sctx->flags) si_emit_cache_flush(sctx); if (!(user_flags & SI_CPDMA_SKIP_SYNC_BEFORE) && *is_first) @@ -203,12 +213,11 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst *packet_flags |= CP_DMA_SYNC; } -void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, +void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value, - enum r600_coherency coher) + enum si_coherency coher) { - struct si_context *sctx = (struct si_context*)ctx; - struct radeon_winsys *ws = sctx->b.ws; + struct radeon_winsys *ws = sctx->ws; struct r600_resource *rdst = r600_resource(dst); unsigned tc_l2_flag = get_tc_l2_flag(sctx, coher); unsigned flush_flags = get_flush_flags(sctx, coher); @@ -228,7 +237,7 @@ void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, /* dma_clear_buffer can use clear_buffer on failure. Make sure that * doesn't happen. We don't want an infinite recursion: */ - if (sctx->b.dma.cs && + if (sctx->dma_cs && !(dst->flags & PIPE_RESOURCE_FLAG_SPARSE) && (offset % 4 == 0) && /* CP DMA is very slow. Always use SDMA for big clears. This @@ -240,9 +249,9 @@ void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, * si_emit_framebuffer_state (in a draw call) adds them. * For example, DeusEx:MD has 21 buffer clears per frame and all * of them are moved to SDMA thanks to this. */ - !ws->cs_is_buffer_referenced(sctx->b.gfx.cs, rdst->buf, + !ws->cs_is_buffer_referenced(sctx->gfx_cs, rdst->buf, RADEON_USAGE_READWRITE))) { - sctx->b.dma_clear_buffer(ctx, dst, offset, dma_clear_size, value); + sctx->dma_clear_buffer(sctx, dst, offset, dma_clear_size, value); offset += dma_clear_size; size -= dma_clear_size; @@ -253,7 +262,7 @@ void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, size -= dma_clear_size; /* Flush the caches. */ - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags; while (dma_clear_size) { @@ -274,8 +283,8 @@ void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, rdst->TC_L2_dirty = true; /* If it's not a framebuffer fast clear... 
*/ - if (coher == R600_COHERENCY_SHADER) - sctx->b.num_cp_dma_calls++; + if (coher == SI_COHERENCY_SHADER) + sctx->num_cp_dma_calls++; } if (size) { @@ -286,7 +295,7 @@ void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, assert(dst->target == PIPE_BUFFER); assert(size < 4); - pipe_buffer_write(ctx, dst, offset, size, &value); + pipe_buffer_write(&sctx->b, dst, offset, size, &value); } } @@ -321,11 +330,11 @@ static void si_pipe_clear_buffer(struct pipe_context *ctx, union pipe_color_union clear_value; memcpy(&clear_value, clear_value_ptr, clear_value_size); - si_blitter_begin(ctx, SI_DISABLE_RENDER_COND); + si_blitter_begin(sctx, SI_DISABLE_RENDER_COND); util_blitter_clear_buffer(sctx->blitter, dst, offset, size, clear_value_size / 4, &clear_value); - si_blitter_end(ctx); + si_blitter_end(sctx); return; } } @@ -346,8 +355,8 @@ static void si_pipe_clear_buffer(struct pipe_context *ctx, dword_value = *(uint32_t*)clear_value_ptr; } - si_clear_buffer(ctx, dst, offset, size, dword_value, - R600_COHERENCY_SHADER); + si_clear_buffer(sctx, dst, offset, size, dword_value, + SI_COHERENCY_SHADER); } /** @@ -371,15 +380,15 @@ static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, if (!sctx->scratch_buffer || sctx->scratch_buffer->b.b.width0 < scratch_size) { r600_resource_reference(&sctx->scratch_buffer, NULL); - sctx->scratch_buffer = (struct r600_resource*) + sctx->scratch_buffer = si_aligned_buffer_create(&sctx->screen->b, - R600_RESOURCE_FLAG_UNMAPPABLE, + SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, scratch_size, 256); if (!sctx->scratch_buffer) return; - si_mark_atom_dirty(sctx, &sctx->scratch_state); + si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); } si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b, @@ -388,7 +397,7 @@ static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, va = sctx->scratch_buffer->gpu_address; si_emit_cp_dma(sctx, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags, - R600_COHERENCY_SHADER); + SI_COHERENCY_SHADER); } /** @@ -404,8 +413,8 @@ void si_copy_buffer(struct si_context *sctx, uint64_t main_dst_offset, main_src_offset; unsigned skipped_size = 0; unsigned realign_size = 0; - unsigned tc_l2_flag = get_tc_l2_flag(sctx, R600_COHERENCY_SHADER); - unsigned flush_flags = get_flush_flags(sctx, R600_COHERENCY_SHADER); + unsigned tc_l2_flag = get_tc_l2_flag(sctx, SI_COHERENCY_SHADER); + unsigned flush_flags = get_flush_flags(sctx, SI_COHERENCY_SHADER); bool is_first = true; if (!size) @@ -423,8 +432,8 @@ void si_copy_buffer(struct si_context *sctx, src_offset += r600_resource(src)->gpu_address; /* The workarounds aren't needed on Fiji and beyond. */ - if (sctx->b.family <= CHIP_CARRIZO || - sctx->b.family == CHIP_STONEY) { + if (sctx->family <= CHIP_CARRIZO || + sctx->family == CHIP_STONEY) { /* If the size is not aligned, we must add a dummy copy at the end * just to align the internal counter. Otherwise, the DMA engine * would slow down by an order of magnitude for following copies. @@ -446,7 +455,7 @@ void si_copy_buffer(struct si_context *sctx, /* Flush the caches. */ if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags; /* This is the main part doing the copying. Src is always aligned. 
*/ @@ -462,7 +471,7 @@ void si_copy_buffer(struct si_context *sctx, user_flags, &is_first, &dma_flags); si_emit_cp_dma(sctx, main_dst_offset, main_src_offset, - byte_count, dma_flags, R600_COHERENCY_SHADER); + byte_count, dma_flags, SI_COHERENCY_SHADER); size -= byte_count; main_src_offset += byte_count; @@ -478,7 +487,7 @@ void si_copy_buffer(struct si_context *sctx, &is_first, &dma_flags); si_emit_cp_dma(sctx, dst_offset, src_offset, skipped_size, - dma_flags, R600_COHERENCY_SHADER); + dma_flags, SI_COHERENCY_SHADER); } /* Finally, realign the engine if the size wasn't aligned. */ @@ -491,13 +500,13 @@ void si_copy_buffer(struct si_context *sctx, /* If it's not a prefetch... */ if (dst_offset != src_offset) - sctx->b.num_cp_dma_calls++; + sctx->num_cp_dma_calls++; } void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset, unsigned size) { - assert(sctx->b.chip_class >= CIK); + assert(sctx->chip_class >= CIK); si_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL); } @@ -521,67 +530,110 @@ static void cik_prefetch_VBO_descriptors(struct si_context *sctx) sctx->vertex_elements->desc_list_byte_size); } -void cik_emit_prefetch_L2(struct si_context *sctx) +/** + * Prefetch shaders and VBO descriptors. + * + * \param vertex_stage_only Whether only the the API VS and VBO descriptors + * should be prefetched. + */ +void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only) { + unsigned mask = sctx->prefetch_L2_mask; + assert(mask); + /* Prefetch shaders and VBO descriptors to TC L2. */ - if (sctx->b.chip_class >= GFX9) { + if (sctx->chip_class >= GFX9) { /* Choose the right spot for the VBO prefetch. */ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) + if (mask & SI_PREFETCH_HS) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_HS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else if (sctx->gs_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_GS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else { - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } } } else { /* SI-CI-VI */ /* Choose the right spot for the VBO prefetch. 
*/ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_LS) + if (mask & SI_PREFETCH_LS) cik_prefetch_shader_async(sctx, sctx->queued.named.ls); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_LS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_HS) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_ES) + if (mask & SI_PREFETCH_ES) cik_prefetch_shader_async(sctx, sctx->queued.named.es); - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else if (sctx->gs_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_ES) + if (mask & SI_PREFETCH_ES) cik_prefetch_shader_async(sctx, sctx->queued.named.es); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_ES | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else { - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } } } - if (sctx->prefetch_L2_mask & SI_PREFETCH_PS) + if (mask & SI_PREFETCH_PS) cik_prefetch_shader_async(sctx, sctx->queued.named.ps); sctx->prefetch_L2_mask = 0; @@ -589,5 +641,5 @@ void cik_emit_prefetch_L2(struct si_context *sctx) void si_init_cp_dma_functions(struct si_context *sctx) { - sctx->b.b.clear_buffer = si_pipe_clear_buffer; + sctx->b.clear_buffer = si_pipe_clear_buffer; }
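
For illustration, a minimal sketch of how a draw-path caller might use the new two-argument cik_emit_prefetch_L2() added by this patch: prefetch only the API VS and the VBO descriptor list before the first draw, then prefetch the remaining shader stages once the draw is queued. The helper name example_draw_prefetch and the is_first_draw flag are assumptions made for this sketch, not part of the patch, and the code presumes the radeonsi internal headers (si_pipe.h) for struct si_context.

/* Illustrative sketch only: example_draw_prefetch and is_first_draw are
 * hypothetical names; they do not appear in this patch. */
static void example_draw_prefetch(struct si_context *sctx, bool is_first_draw)
{
	/* cik_emit_prefetch_L2() asserts a non-empty mask, so check it first. */
	if (!sctx->prefetch_L2_mask)
		return;

	if (is_first_draw) {
		/* Warm up only what the upcoming draw needs immediately:
		 * the API VS and the VBO descriptor list. */
		cik_emit_prefetch_L2(sctx, true);

		/* ... emit the draw packets here ... */

		/* Prefetch the remaining shader stages afterwards so they
		 * do not delay the first draw; the vertex-stage bits were
		 * already cleared from prefetch_L2_mask above. */
		if (sctx->prefetch_L2_mask)
			cik_emit_prefetch_L2(sctx, false);
	} else {
		cik_emit_prefetch_L2(sctx, false);
	}
}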