From d7008fe46a8f689ce4ee2b14b61dc39baebccaa8 Mon Sep 17 00:00:00 2001 From: Pierre-Eric Pelloux-Prayer Date: Fri, 27 Mar 2020 19:32:38 +0100 Subject: [PATCH] radeonsi: switch to 3-spaces style MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Generated automatically using clang-format and the following config: AlignAfterOpenBracket: true AlignConsecutiveMacros: true AllowAllArgumentsOnNextLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: false AlwaysBreakAfterReturnType: None BasedOnStyle: LLVM BraceWrapping: AfterControlStatement: false AfterEnum: true AfterFunction: true AfterStruct: false BeforeElse: false SplitEmptyFunction: true BinPackArguments: true BinPackParameters: true BreakBeforeBraces: Custom ColumnLimit: 100 ContinuationIndentWidth: 3 Cpp11BracedListStyle: false Cpp11BracedListStyle: true ForEachMacros: - LIST_FOR_EACH_ENTRY - LIST_FOR_EACH_ENTRY_SAFE - util_dynarray_foreach - nir_foreach_variable - nir_foreach_variable_safe - nir_foreach_register - nir_foreach_register_safe - nir_foreach_use - nir_foreach_use_safe - nir_foreach_if_use - nir_foreach_if_use_safe - nir_foreach_def - nir_foreach_def_safe - nir_foreach_phi_src - nir_foreach_phi_src_safe - nir_foreach_parallel_copy_entry - nir_foreach_instr - nir_foreach_instr_reverse - nir_foreach_instr_safe - nir_foreach_instr_reverse_safe - nir_foreach_function - nir_foreach_block - nir_foreach_block_safe - nir_foreach_block_reverse - nir_foreach_block_reverse_safe - nir_foreach_block_in_cf_node IncludeBlocks: Regroup IncludeCategories: - Regex: '<[[:alnum:].]+>' Priority: 2 - Regex: '.*' Priority: 1 IndentWidth: 3 PenaltyBreakBeforeFirstCallParameter: 1 PenaltyExcessCharacter: 100 SpaceAfterCStyleCast: false SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: false SpacesInContainerLiterals: false Reviewed-by: Marek Olšák Part-of: --- src/gallium/drivers/radeonsi/.editorconfig | 3 - src/gallium/drivers/radeonsi/cik_sdma.c | 1122 +- .../drivers/radeonsi/driinfo_radeonsi.h | 16 +- src/gallium/drivers/radeonsi/gfx10_query.c | 783 +- .../drivers/radeonsi/gfx10_shader_ngg.c | 3763 +++---- src/gallium/drivers/radeonsi/si_blit.c | 2174 ++-- src/gallium/drivers/radeonsi/si_buffer.c | 1328 ++- src/gallium/drivers/radeonsi/si_build_pm4.h | 279 +- src/gallium/drivers/radeonsi/si_clear.c | 1341 ++- src/gallium/drivers/radeonsi/si_compute.c | 1662 ++- src/gallium/drivers/radeonsi/si_compute.h | 32 +- .../drivers/radeonsi/si_compute_blit.c | 1335 ++- .../radeonsi/si_compute_prim_discard.c | 2664 +++-- src/gallium/drivers/radeonsi/si_cp_dma.c | 1018 +- src/gallium/drivers/radeonsi/si_debug.c | 1925 ++-- .../drivers/radeonsi/si_debug_options.h | 6 +- src/gallium/drivers/radeonsi/si_descriptors.c | 4714 ++++---- src/gallium/drivers/radeonsi/si_dma_cs.c | 523 +- src/gallium/drivers/radeonsi/si_fence.c | 1020 +- src/gallium/drivers/radeonsi/si_get.c | 1760 ++- src/gallium/drivers/radeonsi/si_gfx_cs.c | 925 +- src/gallium/drivers/radeonsi/si_gpu_load.c | 404 +- src/gallium/drivers/radeonsi/si_perfcounter.c | 2012 ++-- src/gallium/drivers/radeonsi/si_pipe.c | 2319 ++-- src/gallium/drivers/radeonsi/si_pipe.h | 2831 +++-- src/gallium/drivers/radeonsi/si_pm4.c | 221 +- src/gallium/drivers/radeonsi/si_pm4.h | 56 +- src/gallium/drivers/radeonsi/si_query.c | 3285 +++--- src/gallium/drivers/radeonsi/si_query.h | 360 +- src/gallium/drivers/radeonsi/si_shader.c | 4634 ++++---- src/gallium/drivers/radeonsi/si_shader.h | 1234 +- 
.../drivers/radeonsi/si_shader_internal.h | 431 +- src/gallium/drivers/radeonsi/si_shader_llvm.c | 1232 +- .../drivers/radeonsi/si_shader_llvm_gs.c | 1290 +-- .../drivers/radeonsi/si_shader_llvm_ps.c | 1687 ++- .../radeonsi/si_shader_llvm_resources.c | 449 +- .../drivers/radeonsi/si_shader_llvm_tess.c | 1952 ++-- .../drivers/radeonsi/si_shader_llvm_vs.c | 1911 ++-- src/gallium/drivers/radeonsi/si_shader_nir.c | 1764 ++- .../drivers/radeonsi/si_shaderlib_tgsi.c | 1626 ++- src/gallium/drivers/radeonsi/si_state.c | 9980 ++++++++--------- src/gallium/drivers/radeonsi/si_state.h | 824 +- .../drivers/radeonsi/si_state_binning.c | 1063 +- src/gallium/drivers/radeonsi/si_state_draw.c | 4121 ++++--- src/gallium/drivers/radeonsi/si_state_msaa.c | 193 +- .../drivers/radeonsi/si_state_shaders.c | 7329 ++++++------ .../drivers/radeonsi/si_state_streamout.c | 755 +- .../drivers/radeonsi/si_state_viewport.c | 1125 +- src/gallium/drivers/radeonsi/si_test_dma.c | 682 +- .../drivers/radeonsi/si_test_dma_perf.c | 869 +- src/gallium/drivers/radeonsi/si_texture.c | 4599 ++++---- src/gallium/drivers/radeonsi/si_uvd.c | 80 +- 52 files changed, 42975 insertions(+), 46736 deletions(-) delete mode 100644 src/gallium/drivers/radeonsi/.editorconfig diff --git a/src/gallium/drivers/radeonsi/.editorconfig b/src/gallium/drivers/radeonsi/.editorconfig deleted file mode 100644 index 21a3c7d1274..00000000000 --- a/src/gallium/drivers/radeonsi/.editorconfig +++ /dev/null @@ -1,3 +0,0 @@ -[*.{c,h}] -indent_style = tab -indent_size = tab diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c index df8a2fcd577..74c289b0134 100644 --- a/src/gallium/drivers/radeonsi/cik_sdma.c +++ b/src/gallium/drivers/radeonsi/cik_sdma.c @@ -23,643 +23,531 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "sid.h" #include "si_pipe.h" +#include "sid.h" static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w) { - width = u_minify(width, level); - return DIV_ROUND_UP(width, blk_w); + width = u_minify(width, level); + return DIV_ROUND_UP(width, blk_w); } -static unsigned encode_tile_info(struct si_context *sctx, - struct si_texture *tex, unsigned level, - bool set_bpp) +static unsigned encode_tile_info(struct si_context *sctx, struct si_texture *tex, unsigned level, + bool set_bpp) { - struct radeon_info *info = &sctx->screen->info; - unsigned tile_index = tex->surface.u.legacy.tiling_index[level]; - unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index; - unsigned tile_mode = info->si_tile_mode_array[tile_index]; - unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index]; - - return (set_bpp ? util_logbase2(tex->surface.bpe) : 0) | - (G_009910_ARRAY_MODE(tile_mode) << 3) | - (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) | - /* Non-depth modes don't have TILE_SPLIT set. */ - ((util_logbase2(tex->surface.u.legacy.tile_split >> 6)) << 11) | - (G_009990_BANK_WIDTH(macro_tile_mode) << 15) | - (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) | - (G_009990_NUM_BANKS(macro_tile_mode) << 21) | - (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) | - (G_009910_PIPE_CONFIG(tile_mode) << 26); + struct radeon_info *info = &sctx->screen->info; + unsigned tile_index = tex->surface.u.legacy.tiling_index[level]; + unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index; + unsigned tile_mode = info->si_tile_mode_array[tile_index]; + unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index]; + + return (set_bpp ? 
util_logbase2(tex->surface.bpe) : 0) | (G_009910_ARRAY_MODE(tile_mode) << 3) | + (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) | + /* Non-depth modes don't have TILE_SPLIT set. */ + ((util_logbase2(tex->surface.u.legacy.tile_split >> 6)) << 11) | + (G_009990_BANK_WIDTH(macro_tile_mode) << 15) | + (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) | + (G_009990_NUM_BANKS(macro_tile_mode) << 21) | + (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) | + (G_009910_PIPE_CONFIG(tile_mode) << 26); } - -static bool si_sdma_v4_copy_texture(struct si_context *sctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) +static bool si_sdma_v4_copy_texture(struct si_context *sctx, struct pipe_resource *dst, + unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, unsigned src_level, + const struct pipe_box *src_box) { - struct si_texture *ssrc = (struct si_texture*)src; - struct si_texture *sdst = (struct si_texture*)dst; - - unsigned bpp = sdst->surface.bpe; - uint64_t dst_address = sdst->buffer.gpu_address + - sdst->surface.u.gfx9.surf_offset; - uint64_t src_address = ssrc->buffer.gpu_address + - ssrc->surface.u.gfx9.surf_offset; - unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch; - unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch; - uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.gfx9.surf_slice_size) / bpp; - uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.gfx9.surf_slice_size) / bpp; - unsigned srcx = src_box->x / ssrc->surface.blk_w; - unsigned srcy = src_box->y / ssrc->surface.blk_h; - unsigned srcz = src_box->z; - unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w); - unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h); - unsigned copy_depth = src_box->depth; - unsigned xalign = MAX2(1, 4 / bpp); - - assert(src_level <= src->last_level); - assert(dst_level <= dst->last_level); - assert(sdst->surface.u.gfx9.surf_offset + - dst_slice_pitch * bpp * (dstz + src_box->depth) <= - sdst->buffer.buf->size); - assert(ssrc->surface.u.gfx9.surf_offset + - src_slice_pitch * bpp * (srcz + src_box->depth) <= - ssrc->buffer.buf->size); - - if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, - dstz, ssrc, src_level, src_box)) - return false; - - dstx /= sdst->surface.blk_w; - dsty /= sdst->surface.blk_h; - - if (srcx >= (1 << 14) || - srcy >= (1 << 14) || - srcz >= (1 << 11) || - dstx >= (1 << 14) || - dsty >= (1 << 14) || - dstz >= (1 << 11)) - return false; - - /* Linear -> linear sub-window copy. 
*/ - if (ssrc->surface.is_linear && - sdst->surface.is_linear) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - - /* Check if everything fits into the bitfields */ - if (!(src_pitch <= (1 << 19) && - dst_pitch <= (1 << 19) && - src_slice_pitch <= (1 << 28) && - dst_slice_pitch <= (1 << 28) && - copy_width <= (1 << 14) && - copy_height <= (1 << 14) && - copy_depth <= (1 << 11))) - return false; - - si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer); - - src_address += ssrc->surface.u.gfx9.offset[src_level]; - dst_address += sdst->surface.u.gfx9.offset[dst_level]; - - /* Check alignments */ - if ((src_address % 4) != 0 || - (dst_address % 4) != 0 || - (src_pitch % xalign) != 0) - return false; - - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, - CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) | - (util_logbase2(bpp) << 29)); - radeon_emit(cs, src_address); - radeon_emit(cs, src_address >> 32); - radeon_emit(cs, srcx | (srcy << 16)); - radeon_emit(cs, srcz | ((src_pitch - 1) << 13)); - radeon_emit(cs, src_slice_pitch - 1); - radeon_emit(cs, dst_address); - radeon_emit(cs, dst_address >> 32); - radeon_emit(cs, dstx | (dsty << 16)); - radeon_emit(cs, dstz | ((dst_pitch - 1) << 13)); - radeon_emit(cs, dst_slice_pitch - 1); - radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); - radeon_emit(cs, (copy_depth - 1)); - return true; - } - - /* Linear <-> Tiled sub-window copy */ - if (ssrc->surface.is_linear != sdst->surface.is_linear) { - struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc; - struct si_texture *linear = tiled == ssrc ? sdst : ssrc; - unsigned tiled_level = tiled == ssrc ? src_level : dst_level; - unsigned linear_level = linear == ssrc ? src_level : dst_level; - unsigned tiled_x = tiled == ssrc ? srcx : dstx; - unsigned linear_x = linear == ssrc ? srcx : dstx; - unsigned tiled_y = tiled == ssrc ? srcy : dsty; - unsigned linear_y = linear == ssrc ? srcy : dsty; - unsigned tiled_z = tiled == ssrc ? srcz : dstz; - unsigned linear_z = linear == ssrc ? srcz : dstz; - unsigned tiled_width = tiled == ssrc ? - DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w) : - DIV_ROUND_UP(sdst->buffer.b.b.width0, sdst->surface.blk_w); - unsigned tiled_height = tiled == ssrc ? - DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h) : - DIV_ROUND_UP(sdst->buffer.b.b.height0, sdst->surface.blk_h); - unsigned tiled_depth = tiled == ssrc ? - ssrc->buffer.b.b.depth0 : - sdst->buffer.b.b.depth0; - unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch; - unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch; - uint64_t tiled_address = tiled == ssrc ? src_address : dst_address; - uint64_t linear_address = linear == ssrc ? 
src_address : dst_address; - struct radeon_cmdbuf *cs = sctx->sdma_cs; - - linear_address += linear->surface.u.gfx9.offset[linear_level]; - - /* Check if everything fits into the bitfields */ - if (!(tiled_x <= (1 << 14) && - tiled_y <= (1 << 14) && - tiled_z <= (1 << 11) && - tiled_width <= (1 << 14) && - tiled_height <= (1 << 14) && - tiled_depth <= (1 << 11) && - tiled->surface.u.gfx9.surf.epitch <= (1 << 16) && - linear_x <= (1 << 14) && - linear_y <= (1 << 14) && - linear_z <= (1 << 11) && - linear_pitch <= (1 << 14) && - linear_slice_pitch <= (1 << 28) && - copy_width <= (1 << 14) && - copy_height <= (1 << 14) && - copy_depth <= (1 << 11))) - return false; - - /* Check alignments */ - if ((tiled_address % 256 != 0) || - (linear_address % 4 != 0) || - (linear_pitch % xalign != 0) || - (linear_slice_pitch % xalign != 0)) - return false; - - si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer); - - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, - CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) | - tiled->buffer.b.b.last_level << 20 | - tiled_level << 24 | - (linear == sdst ? 1u : 0) << 31); - radeon_emit(cs, (uint32_t) tiled_address); - radeon_emit(cs, (uint32_t) (tiled_address >> 32)); - radeon_emit(cs, tiled_x | (tiled_y << 16)); - radeon_emit(cs, tiled_z | ((tiled_width - 1) << 16)); - radeon_emit(cs, (tiled_height - 1) | (tiled_depth - 1) << 16); - radeon_emit(cs, util_logbase2(bpp) | - tiled->surface.u.gfx9.surf.swizzle_mode << 3 | - tiled->surface.u.gfx9.resource_type << 9 | - tiled->surface.u.gfx9.surf.epitch << 16); - radeon_emit(cs, (uint32_t) linear_address); - radeon_emit(cs, (uint32_t) (linear_address >> 32)); - radeon_emit(cs, linear_x | (linear_y << 16)); - radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16)); - radeon_emit(cs, linear_slice_pitch - 1); - radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); - radeon_emit(cs, (copy_depth - 1)); - return true; - } - - return false; + struct si_texture *ssrc = (struct si_texture *)src; + struct si_texture *sdst = (struct si_texture *)dst; + + unsigned bpp = sdst->surface.bpe; + uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset; + uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.gfx9.surf_offset; + unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch; + unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch; + uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.gfx9.surf_slice_size) / bpp; + uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.gfx9.surf_slice_size) / bpp; + unsigned srcx = src_box->x / ssrc->surface.blk_w; + unsigned srcy = src_box->y / ssrc->surface.blk_h; + unsigned srcz = src_box->z; + unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w); + unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h); + unsigned copy_depth = src_box->depth; + unsigned xalign = MAX2(1, 4 / bpp); + + assert(src_level <= src->last_level); + assert(dst_level <= dst->last_level); + assert(sdst->surface.u.gfx9.surf_offset + dst_slice_pitch * bpp * (dstz + src_box->depth) <= + sdst->buffer.buf->size); + assert(ssrc->surface.u.gfx9.surf_offset + src_slice_pitch * bpp * (srcz + src_box->depth) <= + ssrc->buffer.buf->size); + + if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, dstz, ssrc, src_level, src_box)) + return false; + + dstx /= sdst->surface.blk_w; + dsty /= sdst->surface.blk_h; + + if (srcx >= (1 << 14) || srcy >= (1 << 14) || srcz >= (1 << 11) || dstx >= (1 << 14) || + dsty >= (1 << 14) || dstz >= (1 << 
11)) + return false; + + /* Linear -> linear sub-window copy. */ + if (ssrc->surface.is_linear && sdst->surface.is_linear) { + struct radeon_cmdbuf *cs = sctx->sdma_cs; + + /* Check if everything fits into the bitfields */ + if (!(src_pitch <= (1 << 19) && dst_pitch <= (1 << 19) && src_slice_pitch <= (1 << 28) && + dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) && + copy_depth <= (1 << 11))) + return false; + + si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer); + + src_address += ssrc->surface.u.gfx9.offset[src_level]; + dst_address += sdst->surface.u.gfx9.offset[dst_level]; + + /* Check alignments */ + if ((src_address % 4) != 0 || (dst_address % 4) != 0 || (src_pitch % xalign) != 0) + return false; + + radeon_emit( + cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) | + (util_logbase2(bpp) << 29)); + radeon_emit(cs, src_address); + radeon_emit(cs, src_address >> 32); + radeon_emit(cs, srcx | (srcy << 16)); + radeon_emit(cs, srcz | ((src_pitch - 1) << 13)); + radeon_emit(cs, src_slice_pitch - 1); + radeon_emit(cs, dst_address); + radeon_emit(cs, dst_address >> 32); + radeon_emit(cs, dstx | (dsty << 16)); + radeon_emit(cs, dstz | ((dst_pitch - 1) << 13)); + radeon_emit(cs, dst_slice_pitch - 1); + radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); + radeon_emit(cs, (copy_depth - 1)); + return true; + } + + /* Linear <-> Tiled sub-window copy */ + if (ssrc->surface.is_linear != sdst->surface.is_linear) { + struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc; + struct si_texture *linear = tiled == ssrc ? sdst : ssrc; + unsigned tiled_level = tiled == ssrc ? src_level : dst_level; + unsigned linear_level = linear == ssrc ? src_level : dst_level; + unsigned tiled_x = tiled == ssrc ? srcx : dstx; + unsigned linear_x = linear == ssrc ? srcx : dstx; + unsigned tiled_y = tiled == ssrc ? srcy : dsty; + unsigned linear_y = linear == ssrc ? srcy : dsty; + unsigned tiled_z = tiled == ssrc ? srcz : dstz; + unsigned linear_z = linear == ssrc ? srcz : dstz; + unsigned tiled_width = tiled == ssrc + ? DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w) + : DIV_ROUND_UP(sdst->buffer.b.b.width0, sdst->surface.blk_w); + unsigned tiled_height = tiled == ssrc + ? DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h) + : DIV_ROUND_UP(sdst->buffer.b.b.height0, sdst->surface.blk_h); + unsigned tiled_depth = tiled == ssrc ? ssrc->buffer.b.b.depth0 : sdst->buffer.b.b.depth0; + unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch; + unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch; + uint64_t tiled_address = tiled == ssrc ? src_address : dst_address; + uint64_t linear_address = linear == ssrc ? 
src_address : dst_address; + struct radeon_cmdbuf *cs = sctx->sdma_cs; + + linear_address += linear->surface.u.gfx9.offset[linear_level]; + + /* Check if everything fits into the bitfields */ + if (!(tiled_x <= (1 << 14) && tiled_y <= (1 << 14) && tiled_z <= (1 << 11) && + tiled_width <= (1 << 14) && tiled_height <= (1 << 14) && tiled_depth <= (1 << 11) && + tiled->surface.u.gfx9.surf.epitch <= (1 << 16) && linear_x <= (1 << 14) && + linear_y <= (1 << 14) && linear_z <= (1 << 11) && linear_pitch <= (1 << 14) && + linear_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && + copy_height <= (1 << 14) && copy_depth <= (1 << 11))) + return false; + + /* Check alignments */ + if ((tiled_address % 256 != 0) || (linear_address % 4 != 0) || (linear_pitch % xalign != 0) || + (linear_slice_pitch % xalign != 0)) + return false; + + si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer); + + radeon_emit( + cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) | + tiled->buffer.b.b.last_level << 20 | tiled_level << 24 | + (linear == sdst ? 1u : 0) << 31); + radeon_emit(cs, (uint32_t)tiled_address); + radeon_emit(cs, (uint32_t)(tiled_address >> 32)); + radeon_emit(cs, tiled_x | (tiled_y << 16)); + radeon_emit(cs, tiled_z | ((tiled_width - 1) << 16)); + radeon_emit(cs, (tiled_height - 1) | (tiled_depth - 1) << 16); + radeon_emit(cs, util_logbase2(bpp) | tiled->surface.u.gfx9.surf.swizzle_mode << 3 | + tiled->surface.u.gfx9.resource_type << 9 | + tiled->surface.u.gfx9.surf.epitch << 16); + radeon_emit(cs, (uint32_t)linear_address); + radeon_emit(cs, (uint32_t)(linear_address >> 32)); + radeon_emit(cs, linear_x | (linear_y << 16)); + radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16)); + radeon_emit(cs, linear_slice_pitch - 1); + radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); + radeon_emit(cs, (copy_depth - 1)); + return true; + } + + return false; } -static bool cik_sdma_copy_texture(struct si_context *sctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) +static bool cik_sdma_copy_texture(struct si_context *sctx, struct pipe_resource *dst, + unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, unsigned src_level, + const struct pipe_box *src_box) { - struct radeon_info *info = &sctx->screen->info; - struct si_texture *ssrc = (struct si_texture*)src; - struct si_texture *sdst = (struct si_texture*)dst; - unsigned bpp = sdst->surface.bpe; - uint64_t dst_address = sdst->buffer.gpu_address + - sdst->surface.u.legacy.level[dst_level].offset; - uint64_t src_address = ssrc->buffer.gpu_address + - ssrc->surface.u.legacy.level[src_level].offset; - unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode; - unsigned src_mode = ssrc->surface.u.legacy.level[src_level].mode; - unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[dst_level]; - unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[src_level]; - unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index]; - unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index]; - unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode); - unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode); - unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ? - sdst->surface.tile_swizzle : 0; - unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ? 
- ssrc->surface.tile_swizzle : 0; - unsigned dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x; - unsigned src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x; - uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4) / bpp; - uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4) / bpp; - unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0, - dst_level, sdst->surface.blk_w); - unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0, - src_level, ssrc->surface.blk_w); - unsigned dst_height = minify_as_blocks(sdst->buffer.b.b.height0, - dst_level, sdst->surface.blk_h); - unsigned src_height = minify_as_blocks(ssrc->buffer.b.b.height0, - src_level, ssrc->surface.blk_h); - unsigned srcx = src_box->x / ssrc->surface.blk_w; - unsigned srcy = src_box->y / ssrc->surface.blk_h; - unsigned srcz = src_box->z; - unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w); - unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h); - unsigned copy_depth = src_box->depth; - - assert(src_level <= src->last_level); - assert(dst_level <= dst->last_level); - assert(sdst->surface.u.legacy.level[dst_level].offset + - dst_slice_pitch * bpp * (dstz + src_box->depth) <= - sdst->buffer.buf->size); - assert(ssrc->surface.u.legacy.level[src_level].offset + - src_slice_pitch * bpp * (srcz + src_box->depth) <= - ssrc->buffer.buf->size); - - if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, - dstz, ssrc, src_level, src_box)) - return false; - - dstx /= sdst->surface.blk_w; - dsty /= sdst->surface.blk_h; - - if (srcx >= (1 << 14) || - srcy >= (1 << 14) || - srcz >= (1 << 11) || - dstx >= (1 << 14) || - dsty >= (1 << 14) || - dstz >= (1 << 11)) - return false; - - dst_address |= dst_tile_swizzle << 8; - src_address |= src_tile_swizzle << 8; - - /* Linear -> linear sub-window copy. 
*/ - if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && - src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && - /* check if everything fits into the bitfields */ - src_pitch <= (1 << 14) && - dst_pitch <= (1 << 14) && - src_slice_pitch <= (1 << 28) && - dst_slice_pitch <= (1 << 28) && - copy_width <= (1 << 14) && - copy_height <= (1 << 14) && - copy_depth <= (1 << 11) && - /* HW limitation - GFX7: */ - (sctx->chip_class != GFX7 || - (copy_width < (1 << 14) && - copy_height < (1 << 14) && - copy_depth < (1 << 11))) && - /* HW limitation - some GFX7 parts: */ - ((sctx->family != CHIP_BONAIRE && - sctx->family != CHIP_KAVERI) || - (srcx + copy_width != (1 << 14) && - srcy + copy_height != (1 << 14)))) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - - si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer); - - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, - CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) | - (util_logbase2(bpp) << 29)); - radeon_emit(cs, src_address); - radeon_emit(cs, src_address >> 32); - radeon_emit(cs, srcx | (srcy << 16)); - radeon_emit(cs, srcz | ((src_pitch - 1) << 16)); - radeon_emit(cs, src_slice_pitch - 1); - radeon_emit(cs, dst_address); - radeon_emit(cs, dst_address >> 32); - radeon_emit(cs, dstx | (dsty << 16)); - radeon_emit(cs, dstz | ((dst_pitch - 1) << 16)); - radeon_emit(cs, dst_slice_pitch - 1); - if (sctx->chip_class == GFX7) { - radeon_emit(cs, copy_width | (copy_height << 16)); - radeon_emit(cs, copy_depth); - } else { - radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); - radeon_emit(cs, (copy_depth - 1)); - } - return true; - } - - /* Tiled <-> linear sub-window copy. */ - if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) { - struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst; - struct si_texture *linear = tiled == ssrc ? sdst : ssrc; - unsigned tiled_level = tiled == ssrc ? src_level : dst_level; - unsigned linear_level = linear == ssrc ? src_level : dst_level; - unsigned tiled_x = tiled == ssrc ? srcx : dstx; - unsigned linear_x = linear == ssrc ? srcx : dstx; - unsigned tiled_y = tiled == ssrc ? srcy : dsty; - unsigned linear_y = linear == ssrc ? srcy : dsty; - unsigned tiled_z = tiled == ssrc ? srcz : dstz; - unsigned linear_z = linear == ssrc ? srcz : dstz; - unsigned tiled_width = tiled == ssrc ? src_width : dst_width; - unsigned linear_width = linear == ssrc ? src_width : dst_width; - unsigned tiled_pitch = tiled == ssrc ? src_pitch : dst_pitch; - unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch; - unsigned tiled_slice_pitch = tiled == ssrc ? src_slice_pitch : dst_slice_pitch; - unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch; - uint64_t tiled_address = tiled == ssrc ? src_address : dst_address; - uint64_t linear_address = linear == ssrc ? src_address : dst_address; - unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode; - - assert(tiled_pitch % 8 == 0); - assert(tiled_slice_pitch % 64 == 0); - unsigned pitch_tile_max = tiled_pitch / 8 - 1; - unsigned slice_tile_max = tiled_slice_pitch / 64 - 1; - unsigned xalign = MAX2(1, 4 / bpp); - unsigned copy_width_aligned = copy_width; - - /* If the region ends at the last pixel and is unaligned, we - * can copy the remainder of the line that is not visible to - * make it aligned. 
- */ - if (copy_width % xalign != 0 && - linear_x + copy_width == linear_width && - tiled_x + copy_width == tiled_width && - linear_x + align(copy_width, xalign) <= linear_pitch && - tiled_x + align(copy_width, xalign) <= tiled_pitch) - copy_width_aligned = align(copy_width, xalign); - - /* HW limitations. */ - if ((sctx->family == CHIP_BONAIRE || - sctx->family == CHIP_KAVERI) && - linear_pitch - 1 == 0x3fff && - bpp == 16) - return false; - - if (sctx->chip_class == GFX7 && - (copy_width_aligned == (1 << 14) || - copy_height == (1 << 14) || - copy_depth == (1 << 11))) - return false; - - if ((sctx->family == CHIP_BONAIRE || - sctx->family == CHIP_KAVERI || - sctx->family == CHIP_KABINI) && - (tiled_x + copy_width == (1 << 14) || - tiled_y + copy_height == (1 << 14))) - return false; - - /* The hw can read outside of the given linear buffer bounds, - * or access those pages but not touch the memory in case - * of writes. (it still causes a VM fault) - * - * Out-of-bounds memory access or page directory access must - * be prevented. - */ - int64_t start_linear_address, end_linear_address; - unsigned granularity; - - /* Deduce the size of reads from the linear surface. */ - switch (tiled_micro_mode) { - case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING: - granularity = bpp == 1 ? 64 / (8*bpp) : - 128 / (8*bpp); - break; - case V_009910_ADDR_SURF_THIN_MICRO_TILING: - case V_009910_ADDR_SURF_DEPTH_MICRO_TILING: - if (0 /* TODO: THICK microtiling */) - granularity = bpp == 1 ? 32 / (8*bpp) : - bpp == 2 ? 64 / (8*bpp) : - bpp <= 8 ? 128 / (8*bpp) : - 256 / (8*bpp); - else - granularity = bpp <= 2 ? 64 / (8*bpp) : - bpp <= 8 ? 128 / (8*bpp) : - 256 / (8*bpp); - break; - default: - return false; - } - - /* The linear reads start at tiled_x & ~(granularity - 1). - * If linear_x == 0 && tiled_x % granularity != 0, the hw - * starts reading from an address preceding linear_address!!! - */ - start_linear_address = - linear->surface.u.legacy.level[linear_level].offset + - bpp * (linear_z * linear_slice_pitch + - linear_y * linear_pitch + - linear_x); - start_linear_address -= (int)(bpp * (tiled_x % granularity)); - - end_linear_address = - linear->surface.u.legacy.level[linear_level].offset + - bpp * ((linear_z + copy_depth - 1) * linear_slice_pitch + - (linear_y + copy_height - 1) * linear_pitch + - (linear_x + copy_width)); - - if ((tiled_x + copy_width) % granularity) - end_linear_address += granularity - - (tiled_x + copy_width) % granularity; - - if (start_linear_address < 0 || - end_linear_address > linear->surface.surf_size) - return false; - - /* Check requirements. */ - if (tiled_address % 256 == 0 && - linear_address % 4 == 0 && - linear_pitch % xalign == 0 && - linear_x % xalign == 0 && - tiled_x % xalign == 0 && - copy_width_aligned % xalign == 0 && - tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING && - /* check if everything fits into the bitfields */ - tiled->surface.u.legacy.tile_split <= 4096 && - pitch_tile_max < (1 << 11) && - slice_tile_max < (1 << 22) && - linear_pitch <= (1 << 14) && - linear_slice_pitch <= (1 << 28) && - copy_width_aligned <= (1 << 14) && - copy_height <= (1 << 14) && - copy_depth <= (1 << 11)) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - uint32_t direction = linear == sdst ? 
1u << 31 : 0; - - si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer); - - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, - CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) | - direction); - radeon_emit(cs, tiled_address); - radeon_emit(cs, tiled_address >> 32); - radeon_emit(cs, tiled_x | (tiled_y << 16)); - radeon_emit(cs, tiled_z | (pitch_tile_max << 16)); - radeon_emit(cs, slice_tile_max); - radeon_emit(cs, encode_tile_info(sctx, tiled, tiled_level, true)); - radeon_emit(cs, linear_address); - radeon_emit(cs, linear_address >> 32); - radeon_emit(cs, linear_x | (linear_y << 16)); - radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16)); - radeon_emit(cs, linear_slice_pitch - 1); - if (sctx->chip_class == GFX7) { - radeon_emit(cs, copy_width_aligned | (copy_height << 16)); - radeon_emit(cs, copy_depth); - } else { - radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16)); - radeon_emit(cs, (copy_depth - 1)); - } - return true; - } - } - - /* Tiled -> Tiled sub-window copy. */ - if (dst_mode >= RADEON_SURF_MODE_1D && - src_mode >= RADEON_SURF_MODE_1D && - /* check if these fit into the bitfields */ - src_address % 256 == 0 && - dst_address % 256 == 0 && - ssrc->surface.u.legacy.tile_split <= 4096 && - sdst->surface.u.legacy.tile_split <= 4096 && - dstx % 8 == 0 && - dsty % 8 == 0 && - srcx % 8 == 0 && - srcy % 8 == 0 && - /* this can either be equal, or display->rotated (GFX8+ only) */ - (src_micro_mode == dst_micro_mode || - (sctx->chip_class >= GFX8 && - src_micro_mode == V_009910_ADDR_SURF_DISPLAY_MICRO_TILING && - dst_micro_mode == V_009910_ADDR_SURF_ROTATED_MICRO_TILING))) { - assert(src_pitch % 8 == 0); - assert(dst_pitch % 8 == 0); - assert(src_slice_pitch % 64 == 0); - assert(dst_slice_pitch % 64 == 0); - unsigned src_pitch_tile_max = src_pitch / 8 - 1; - unsigned dst_pitch_tile_max = dst_pitch / 8 - 1; - unsigned src_slice_tile_max = src_slice_pitch / 64 - 1; - unsigned dst_slice_tile_max = dst_slice_pitch / 64 - 1; - unsigned copy_width_aligned = copy_width; - unsigned copy_height_aligned = copy_height; - - /* If the region ends at the last pixel and is unaligned, we - * can copy the remainder of the tile that is not visible to - * make it aligned. 
- */ - if (copy_width % 8 != 0 && - srcx + copy_width == src_width && - dstx + copy_width == dst_width) - copy_width_aligned = align(copy_width, 8); - - if (copy_height % 8 != 0 && - srcy + copy_height == src_height && - dsty + copy_height == dst_height) - copy_height_aligned = align(copy_height, 8); - - /* check if these fit into the bitfields */ - if (src_pitch_tile_max < (1 << 11) && - dst_pitch_tile_max < (1 << 11) && - src_slice_tile_max < (1 << 22) && - dst_slice_tile_max < (1 << 22) && - copy_width_aligned <= (1 << 14) && - copy_height_aligned <= (1 << 14) && - copy_depth <= (1 << 11) && - copy_width_aligned % 8 == 0 && - copy_height_aligned % 8 == 0 && - /* HW limitation - GFX7: */ - (sctx->chip_class != GFX7 || - (copy_width_aligned < (1 << 14) && - copy_height_aligned < (1 << 14) && - copy_depth < (1 << 11))) && - /* HW limitation - some GFX7 parts: */ - ((sctx->family != CHIP_BONAIRE && - sctx->family != CHIP_KAVERI && - sctx->family != CHIP_KABINI) || - (srcx + copy_width_aligned != (1 << 14) && - srcy + copy_height_aligned != (1 << 14) && - dstx + copy_width != (1 << 14)))) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - - si_need_dma_space(sctx, 15, &sdst->buffer, &ssrc->buffer); - - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, - CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0)); - radeon_emit(cs, src_address); - radeon_emit(cs, src_address >> 32); - radeon_emit(cs, srcx | (srcy << 16)); - radeon_emit(cs, srcz | (src_pitch_tile_max << 16)); - radeon_emit(cs, src_slice_tile_max); - radeon_emit(cs, encode_tile_info(sctx, ssrc, src_level, true)); - radeon_emit(cs, dst_address); - radeon_emit(cs, dst_address >> 32); - radeon_emit(cs, dstx | (dsty << 16)); - radeon_emit(cs, dstz | (dst_pitch_tile_max << 16)); - radeon_emit(cs, dst_slice_tile_max); - radeon_emit(cs, encode_tile_info(sctx, sdst, dst_level, false)); - if (sctx->chip_class == GFX7) { - radeon_emit(cs, copy_width_aligned | - (copy_height_aligned << 16)); - radeon_emit(cs, copy_depth); - } else { - radeon_emit(cs, (copy_width_aligned - 8) | - ((copy_height_aligned - 8) << 16)); - radeon_emit(cs, (copy_depth - 1)); - } - return true; - } - } - - return false; + struct radeon_info *info = &sctx->screen->info; + struct si_texture *ssrc = (struct si_texture *)src; + struct si_texture *sdst = (struct si_texture *)dst; + unsigned bpp = sdst->surface.bpe; + uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.legacy.level[dst_level].offset; + uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.legacy.level[src_level].offset; + unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode; + unsigned src_mode = ssrc->surface.u.legacy.level[src_level].mode; + unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[dst_level]; + unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[src_level]; + unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index]; + unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index]; + unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode); + unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode); + unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ? sdst->surface.tile_swizzle : 0; + unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ? 
ssrc->surface.tile_swizzle : 0; + unsigned dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x; + unsigned src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x; + uint64_t dst_slice_pitch = + ((uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4) / bpp; + uint64_t src_slice_pitch = + ((uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4) / bpp; + unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0, dst_level, sdst->surface.blk_w); + unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0, src_level, ssrc->surface.blk_w); + unsigned dst_height = minify_as_blocks(sdst->buffer.b.b.height0, dst_level, sdst->surface.blk_h); + unsigned src_height = minify_as_blocks(ssrc->buffer.b.b.height0, src_level, ssrc->surface.blk_h); + unsigned srcx = src_box->x / ssrc->surface.blk_w; + unsigned srcy = src_box->y / ssrc->surface.blk_h; + unsigned srcz = src_box->z; + unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w); + unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h); + unsigned copy_depth = src_box->depth; + + assert(src_level <= src->last_level); + assert(dst_level <= dst->last_level); + assert(sdst->surface.u.legacy.level[dst_level].offset + + dst_slice_pitch * bpp * (dstz + src_box->depth) <= + sdst->buffer.buf->size); + assert(ssrc->surface.u.legacy.level[src_level].offset + + src_slice_pitch * bpp * (srcz + src_box->depth) <= + ssrc->buffer.buf->size); + + if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, dstz, ssrc, src_level, src_box)) + return false; + + dstx /= sdst->surface.blk_w; + dsty /= sdst->surface.blk_h; + + if (srcx >= (1 << 14) || srcy >= (1 << 14) || srcz >= (1 << 11) || dstx >= (1 << 14) || + dsty >= (1 << 14) || dstz >= (1 << 11)) + return false; + + dst_address |= dst_tile_swizzle << 8; + src_address |= src_tile_swizzle << 8; + + /* Linear -> linear sub-window copy. 
*/ + if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && + /* check if everything fits into the bitfields */ + src_pitch <= (1 << 14) && dst_pitch <= (1 << 14) && src_slice_pitch <= (1 << 28) && + dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) && + copy_depth <= (1 << 11) && + /* HW limitation - GFX7: */ + (sctx->chip_class != GFX7 || + (copy_width < (1 << 14) && copy_height < (1 << 14) && copy_depth < (1 << 11))) && + /* HW limitation - some GFX7 parts: */ + ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI) || + (srcx + copy_width != (1 << 14) && srcy + copy_height != (1 << 14)))) { + struct radeon_cmdbuf *cs = sctx->sdma_cs; + + si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer); + + radeon_emit( + cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) | + (util_logbase2(bpp) << 29)); + radeon_emit(cs, src_address); + radeon_emit(cs, src_address >> 32); + radeon_emit(cs, srcx | (srcy << 16)); + radeon_emit(cs, srcz | ((src_pitch - 1) << 16)); + radeon_emit(cs, src_slice_pitch - 1); + radeon_emit(cs, dst_address); + radeon_emit(cs, dst_address >> 32); + radeon_emit(cs, dstx | (dsty << 16)); + radeon_emit(cs, dstz | ((dst_pitch - 1) << 16)); + radeon_emit(cs, dst_slice_pitch - 1); + if (sctx->chip_class == GFX7) { + radeon_emit(cs, copy_width | (copy_height << 16)); + radeon_emit(cs, copy_depth); + } else { + radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); + radeon_emit(cs, (copy_depth - 1)); + } + return true; + } + + /* Tiled <-> linear sub-window copy. */ + if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) { + struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst; + struct si_texture *linear = tiled == ssrc ? sdst : ssrc; + unsigned tiled_level = tiled == ssrc ? src_level : dst_level; + unsigned linear_level = linear == ssrc ? src_level : dst_level; + unsigned tiled_x = tiled == ssrc ? srcx : dstx; + unsigned linear_x = linear == ssrc ? srcx : dstx; + unsigned tiled_y = tiled == ssrc ? srcy : dsty; + unsigned linear_y = linear == ssrc ? srcy : dsty; + unsigned tiled_z = tiled == ssrc ? srcz : dstz; + unsigned linear_z = linear == ssrc ? srcz : dstz; + unsigned tiled_width = tiled == ssrc ? src_width : dst_width; + unsigned linear_width = linear == ssrc ? src_width : dst_width; + unsigned tiled_pitch = tiled == ssrc ? src_pitch : dst_pitch; + unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch; + unsigned tiled_slice_pitch = tiled == ssrc ? src_slice_pitch : dst_slice_pitch; + unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch; + uint64_t tiled_address = tiled == ssrc ? src_address : dst_address; + uint64_t linear_address = linear == ssrc ? src_address : dst_address; + unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode; + + assert(tiled_pitch % 8 == 0); + assert(tiled_slice_pitch % 64 == 0); + unsigned pitch_tile_max = tiled_pitch / 8 - 1; + unsigned slice_tile_max = tiled_slice_pitch / 64 - 1; + unsigned xalign = MAX2(1, 4 / bpp); + unsigned copy_width_aligned = copy_width; + + /* If the region ends at the last pixel and is unaligned, we + * can copy the remainder of the line that is not visible to + * make it aligned. 
+ */ + if (copy_width % xalign != 0 && linear_x + copy_width == linear_width && + tiled_x + copy_width == tiled_width && + linear_x + align(copy_width, xalign) <= linear_pitch && + tiled_x + align(copy_width, xalign) <= tiled_pitch) + copy_width_aligned = align(copy_width, xalign); + + /* HW limitations. */ + if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI) && + linear_pitch - 1 == 0x3fff && bpp == 16) + return false; + + if (sctx->chip_class == GFX7 && + (copy_width_aligned == (1 << 14) || copy_height == (1 << 14) || copy_depth == (1 << 11))) + return false; + + if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI || + sctx->family == CHIP_KABINI) && + (tiled_x + copy_width == (1 << 14) || tiled_y + copy_height == (1 << 14))) + return false; + + /* The hw can read outside of the given linear buffer bounds, + * or access those pages but not touch the memory in case + * of writes. (it still causes a VM fault) + * + * Out-of-bounds memory access or page directory access must + * be prevented. + */ + int64_t start_linear_address, end_linear_address; + unsigned granularity; + + /* Deduce the size of reads from the linear surface. */ + switch (tiled_micro_mode) { + case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING: + granularity = bpp == 1 ? 64 / (8 * bpp) : 128 / (8 * bpp); + break; + case V_009910_ADDR_SURF_THIN_MICRO_TILING: + case V_009910_ADDR_SURF_DEPTH_MICRO_TILING: + if (0 /* TODO: THICK microtiling */) + granularity = + bpp == 1 ? 32 / (8 * bpp) + : bpp == 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp); + else + granularity = bpp <= 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp); + break; + default: + return false; + } + + /* The linear reads start at tiled_x & ~(granularity - 1). + * If linear_x == 0 && tiled_x % granularity != 0, the hw + * starts reading from an address preceding linear_address!!! + */ + start_linear_address = + linear->surface.u.legacy.level[linear_level].offset + + bpp * (linear_z * linear_slice_pitch + linear_y * linear_pitch + linear_x); + start_linear_address -= (int)(bpp * (tiled_x % granularity)); + + end_linear_address = + linear->surface.u.legacy.level[linear_level].offset + + bpp * ((linear_z + copy_depth - 1) * linear_slice_pitch + + (linear_y + copy_height - 1) * linear_pitch + (linear_x + copy_width)); + + if ((tiled_x + copy_width) % granularity) + end_linear_address += granularity - (tiled_x + copy_width) % granularity; + + if (start_linear_address < 0 || end_linear_address > linear->surface.surf_size) + return false; + + /* Check requirements. */ + if (tiled_address % 256 == 0 && linear_address % 4 == 0 && linear_pitch % xalign == 0 && + linear_x % xalign == 0 && tiled_x % xalign == 0 && copy_width_aligned % xalign == 0 && + tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING && + /* check if everything fits into the bitfields */ + tiled->surface.u.legacy.tile_split <= 4096 && pitch_tile_max < (1 << 11) && + slice_tile_max < (1 << 22) && linear_pitch <= (1 << 14) && + linear_slice_pitch <= (1 << 28) && copy_width_aligned <= (1 << 14) && + copy_height <= (1 << 14) && copy_depth <= (1 << 11)) { + struct radeon_cmdbuf *cs = sctx->sdma_cs; + uint32_t direction = linear == sdst ? 
1u << 31 : 0; + + si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer); + + radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, + CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) | + direction); + radeon_emit(cs, tiled_address); + radeon_emit(cs, tiled_address >> 32); + radeon_emit(cs, tiled_x | (tiled_y << 16)); + radeon_emit(cs, tiled_z | (pitch_tile_max << 16)); + radeon_emit(cs, slice_tile_max); + radeon_emit(cs, encode_tile_info(sctx, tiled, tiled_level, true)); + radeon_emit(cs, linear_address); + radeon_emit(cs, linear_address >> 32); + radeon_emit(cs, linear_x | (linear_y << 16)); + radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16)); + radeon_emit(cs, linear_slice_pitch - 1); + if (sctx->chip_class == GFX7) { + radeon_emit(cs, copy_width_aligned | (copy_height << 16)); + radeon_emit(cs, copy_depth); + } else { + radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16)); + radeon_emit(cs, (copy_depth - 1)); + } + return true; + } + } + + /* Tiled -> Tiled sub-window copy. */ + if (dst_mode >= RADEON_SURF_MODE_1D && src_mode >= RADEON_SURF_MODE_1D && + /* check if these fit into the bitfields */ + src_address % 256 == 0 && dst_address % 256 == 0 && + ssrc->surface.u.legacy.tile_split <= 4096 && sdst->surface.u.legacy.tile_split <= 4096 && + dstx % 8 == 0 && dsty % 8 == 0 && srcx % 8 == 0 && srcy % 8 == 0 && + /* this can either be equal, or display->rotated (GFX8+ only) */ + (src_micro_mode == dst_micro_mode || + (sctx->chip_class >= GFX8 && src_micro_mode == V_009910_ADDR_SURF_DISPLAY_MICRO_TILING && + dst_micro_mode == V_009910_ADDR_SURF_ROTATED_MICRO_TILING))) { + assert(src_pitch % 8 == 0); + assert(dst_pitch % 8 == 0); + assert(src_slice_pitch % 64 == 0); + assert(dst_slice_pitch % 64 == 0); + unsigned src_pitch_tile_max = src_pitch / 8 - 1; + unsigned dst_pitch_tile_max = dst_pitch / 8 - 1; + unsigned src_slice_tile_max = src_slice_pitch / 64 - 1; + unsigned dst_slice_tile_max = dst_slice_pitch / 64 - 1; + unsigned copy_width_aligned = copy_width; + unsigned copy_height_aligned = copy_height; + + /* If the region ends at the last pixel and is unaligned, we + * can copy the remainder of the tile that is not visible to + * make it aligned. 
+ */ + if (copy_width % 8 != 0 && srcx + copy_width == src_width && dstx + copy_width == dst_width) + copy_width_aligned = align(copy_width, 8); + + if (copy_height % 8 != 0 && srcy + copy_height == src_height && + dsty + copy_height == dst_height) + copy_height_aligned = align(copy_height, 8); + + /* check if these fit into the bitfields */ + if (src_pitch_tile_max < (1 << 11) && dst_pitch_tile_max < (1 << 11) && + src_slice_tile_max < (1 << 22) && dst_slice_tile_max < (1 << 22) && + copy_width_aligned <= (1 << 14) && copy_height_aligned <= (1 << 14) && + copy_depth <= (1 << 11) && copy_width_aligned % 8 == 0 && copy_height_aligned % 8 == 0 && + /* HW limitation - GFX7: */ + (sctx->chip_class != GFX7 || + (copy_width_aligned < (1 << 14) && copy_height_aligned < (1 << 14) && + copy_depth < (1 << 11))) && + /* HW limitation - some GFX7 parts: */ + ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI && + sctx->family != CHIP_KABINI) || + (srcx + copy_width_aligned != (1 << 14) && srcy + copy_height_aligned != (1 << 14) && + dstx + copy_width != (1 << 14)))) { + struct radeon_cmdbuf *cs = sctx->sdma_cs; + + si_need_dma_space(sctx, 15, &sdst->buffer, &ssrc->buffer); + + radeon_emit( + cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0)); + radeon_emit(cs, src_address); + radeon_emit(cs, src_address >> 32); + radeon_emit(cs, srcx | (srcy << 16)); + radeon_emit(cs, srcz | (src_pitch_tile_max << 16)); + radeon_emit(cs, src_slice_tile_max); + radeon_emit(cs, encode_tile_info(sctx, ssrc, src_level, true)); + radeon_emit(cs, dst_address); + radeon_emit(cs, dst_address >> 32); + radeon_emit(cs, dstx | (dsty << 16)); + radeon_emit(cs, dstz | (dst_pitch_tile_max << 16)); + radeon_emit(cs, dst_slice_tile_max); + radeon_emit(cs, encode_tile_info(sctx, sdst, dst_level, false)); + if (sctx->chip_class == GFX7) { + radeon_emit(cs, copy_width_aligned | (copy_height_aligned << 16)); + radeon_emit(cs, copy_depth); + } else { + radeon_emit(cs, (copy_width_aligned - 8) | ((copy_height_aligned - 8) << 16)); + radeon_emit(cs, (copy_depth - 1)); + } + return true; + } + } + + return false; } -static void cik_sdma_copy(struct pipe_context *ctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) +static void cik_sdma_copy(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, struct pipe_resource *src, + unsigned src_level, const struct pipe_box *src_box) { - struct si_context *sctx = (struct si_context *)ctx; - - assert(src->target != PIPE_BUFFER); - - if (!sctx->sdma_cs || - src->flags & PIPE_RESOURCE_FLAG_SPARSE || - dst->flags & PIPE_RESOURCE_FLAG_SPARSE) - goto fallback; - - /* SDMA causes corruption. See: - * https://bugs.freedesktop.org/show_bug.cgi?id=110575 - * https://bugs.freedesktop.org/show_bug.cgi?id=110635 - * - * Keep SDMA enabled on APUs. 
- */ - if (sctx->screen->debug_flags & DBG(FORCE_SDMA) || - (!sctx->screen->info.has_dedicated_vram && - !(sctx->screen->debug_flags & DBG(NO_SDMA_COPY_IMAGE)))) { - if ((sctx->chip_class == GFX7 || sctx->chip_class == GFX8) && - cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz, - src, src_level, src_box)) - return; - else if (sctx->chip_class == GFX9 && - si_sdma_v4_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz, - src, src_level, src_box)) - return; - } + struct si_context *sctx = (struct si_context *)ctx; + + assert(src->target != PIPE_BUFFER); + + if (!sctx->sdma_cs || src->flags & PIPE_RESOURCE_FLAG_SPARSE || + dst->flags & PIPE_RESOURCE_FLAG_SPARSE) + goto fallback; + + /* SDMA causes corruption. See: + * https://bugs.freedesktop.org/show_bug.cgi?id=110575 + * https://bugs.freedesktop.org/show_bug.cgi?id=110635 + * + * Keep SDMA enabled on APUs. + */ + if (sctx->screen->debug_flags & DBG(FORCE_SDMA) || + (!sctx->screen->info.has_dedicated_vram && + !(sctx->screen->debug_flags & DBG(NO_SDMA_COPY_IMAGE)))) { + if ((sctx->chip_class == GFX7 || sctx->chip_class == GFX8) && + cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box)) + return; + else if (sctx->chip_class == GFX9 && si_sdma_v4_copy_texture(sctx, dst, dst_level, dstx, dsty, + dstz, src, src_level, src_box)) + return; + } fallback: - si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, - src, src_level, src_box); + si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box); } void cik_init_sdma_functions(struct si_context *sctx) { - sctx->dma_copy = cik_sdma_copy; + sctx->dma_copy = cik_sdma_copy; } diff --git a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h index 59b3d0a6b49..1570f286053 100644 --- a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h +++ b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h @@ -1,18 +1,18 @@ // DriConf options specific to radeonsi DRI_CONF_SECTION_PERFORMANCE - DRI_CONF_ADAPTIVE_SYNC("true") - DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false") - DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false") - DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false") +DRI_CONF_ADAPTIVE_SYNC("true") +DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false") +DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false") +DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false") DRI_CONF_SECTION_END DRI_CONF_SECTION_DEBUG //= BEGIN VERBATIM -#define OPT_BOOL(name, dflt, description) \ - DRI_CONF_OPT_BEGIN_B(radeonsi_##name, #dflt) \ - DRI_CONF_DESC(en, description) \ - DRI_CONF_OPT_END +#define OPT_BOOL(name, dflt, description) \ + DRI_CONF_OPT_BEGIN_B(radeonsi_##name, #dflt) \ + DRI_CONF_DESC(en, description) \ + DRI_CONF_OPT_END #include "radeonsi/si_debug_options.h" //= END VERBATIM diff --git a/src/gallium/drivers/radeonsi/gfx10_query.c b/src/gallium/drivers/radeonsi/gfx10_query.c index c0a0bc8ce57..aedf5090eed 100644 --- a/src/gallium/drivers/radeonsi/gfx10_query.c +++ b/src/gallium/drivers/radeonsi/gfx10_query.c @@ -22,13 +22,13 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include - #include "si_pipe.h" #include "si_query.h" +#include "sid.h" #include "util/u_memory.h" #include "util/u_suballoc.h" -#include "sid.h" + +#include /** * The query buffer is written to by ESGS NGG shaders with statistics about @@ -39,12 +39,12 @@ * without additional GPU cost. 
*/ struct gfx10_sh_query_buffer { - struct list_head list; - struct si_resource *buf; - unsigned refcount; + struct list_head list; + struct si_resource *buf; + unsigned refcount; - /* Offset into the buffer in bytes; points at the first un-emitted entry. */ - unsigned head; + /* Offset into the buffer in bytes; points at the first un-emitted entry. */ + unsigned head; }; /* Memory layout of the query buffer. Must be kept in sync with shaders @@ -55,469 +55,454 @@ struct gfx10_sh_query_buffer { * of all those values unconditionally. */ struct gfx10_sh_query_buffer_mem { - struct { - uint64_t generated_primitives_start_dummy; - uint64_t emitted_primitives_start_dummy; - uint64_t generated_primitives; - uint64_t emitted_primitives; - } stream[4]; - uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */ - uint32_t pad[31]; + struct { + uint64_t generated_primitives_start_dummy; + uint64_t emitted_primitives_start_dummy; + uint64_t generated_primitives; + uint64_t emitted_primitives; + } stream[4]; + uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */ + uint32_t pad[31]; }; /* Shader-based queries. */ struct gfx10_sh_query { - struct si_query b; + struct si_query b; - struct gfx10_sh_query_buffer *first; - struct gfx10_sh_query_buffer *last; - unsigned first_begin; - unsigned last_end; + struct gfx10_sh_query_buffer *first; + struct gfx10_sh_query_buffer *last; + unsigned first_begin; + unsigned last_end; - unsigned stream; + unsigned stream; }; static void emit_shader_query(struct si_context *sctx) { - assert(!list_is_empty(&sctx->shader_query_buffers)); + assert(!list_is_empty(&sctx->shader_query_buffers)); - struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers, - struct gfx10_sh_query_buffer, list); - qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem); + struct gfx10_sh_query_buffer *qbuf = + list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem); } static void gfx10_release_query_buffers(struct si_context *sctx, - struct gfx10_sh_query_buffer *first, - struct gfx10_sh_query_buffer *last) + struct gfx10_sh_query_buffer *first, + struct gfx10_sh_query_buffer *last) { - while (first) { - struct gfx10_sh_query_buffer *qbuf = first; - if (first != last) - first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list); - else - first = NULL; - - qbuf->refcount--; - if (qbuf->refcount) - continue; - - if (qbuf->list.next == &sctx->shader_query_buffers) - continue; /* keep the most recent buffer; it may not be full yet */ - if (qbuf->list.prev == &sctx->shader_query_buffers) - continue; /* keep the oldest buffer for recycling */ - - list_del(&qbuf->list); - si_resource_reference(&qbuf->buf, NULL); - FREE(qbuf); - } + while (first) { + struct gfx10_sh_query_buffer *qbuf = first; + if (first != last) + first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list); + else + first = NULL; + + qbuf->refcount--; + if (qbuf->refcount) + continue; + + if (qbuf->list.next == &sctx->shader_query_buffers) + continue; /* keep the most recent buffer; it may not be full yet */ + if (qbuf->list.prev == &sctx->shader_query_buffers) + continue; /* keep the oldest buffer for recycling */ + + list_del(&qbuf->list); + si_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); + } } static bool gfx10_alloc_query_buffer(struct si_context *sctx) { - if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) - return true; - - struct 
gfx10_sh_query_buffer *qbuf = NULL; - - if (!list_is_empty(&sctx->shader_query_buffers)) { - qbuf = list_last_entry(&sctx->shader_query_buffers, - struct gfx10_sh_query_buffer, list); - if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0) - goto success; - - qbuf = list_first_entry(&sctx->shader_query_buffers, - struct gfx10_sh_query_buffer, list); - if (!qbuf->refcount && - !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) && - sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) { - /* Can immediately re-use the oldest buffer */ - list_del(&qbuf->list); - } else { - qbuf = NULL; - } - } - - if (!qbuf) { - qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer); - if (unlikely(!qbuf)) - return false; - - struct si_screen *screen = sctx->screen; - unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem), - screen->info.min_alloc_size); - qbuf->buf = si_resource( - pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); - if (unlikely(!qbuf->buf)) { - FREE(qbuf); - return false; - } - } - - /* The buffer is currently unused by the GPU. Initialize it. - * - * We need to set the high bit of all the primitive counters for - * compatibility with the SET_PREDICATION packet. - */ - uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL, - PIPE_TRANSFER_WRITE | - PIPE_TRANSFER_UNSYNCHRONIZED); - assert(results); - - for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); - i < e; ++i) { - for (unsigned j = 0; j < 16; ++j) - results[32 * i + j] = (uint64_t)1 << 63; - results[32 * i + 16] = 0; - } - - list_addtail(&qbuf->list, &sctx->shader_query_buffers); - qbuf->head = 0; - qbuf->refcount = sctx->num_active_shader_queries; + if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) + return true; + + struct gfx10_sh_query_buffer *qbuf = NULL; + + if (!list_is_empty(&sctx->shader_query_buffers)) { + qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0) + goto success; + + qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + if (!qbuf->refcount && + !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) && + sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) { + /* Can immediately re-use the oldest buffer */ + list_del(&qbuf->list); + } else { + qbuf = NULL; + } + } + + if (!qbuf) { + qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer); + if (unlikely(!qbuf)) + return false; + + struct si_screen *screen = sctx->screen; + unsigned buf_size = + MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size); + qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); + if (unlikely(!qbuf->buf)) { + FREE(qbuf); + return false; + } + } + + /* The buffer is currently unused by the GPU. Initialize it. + * + * We need to set the high bit of all the primitive counters for + * compatibility with the SET_PREDICATION packet. 
+ */ + uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL, + PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED); + assert(results); + + for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e; + ++i) { + for (unsigned j = 0; j < 16; ++j) + results[32 * i + j] = (uint64_t)1 << 63; + results[32 * i + 16] = 0; + } + + list_addtail(&qbuf->list, &sctx->shader_query_buffers); + qbuf->head = 0; + qbuf->refcount = sctx->num_active_shader_queries; success:; - struct pipe_shader_buffer sbuf; - sbuf.buffer = &qbuf->buf->b.b; - sbuf.buffer_offset = qbuf->head; - sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem); - si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf); - sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1); - - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query); - return true; + struct pipe_shader_buffer sbuf; + sbuf.buffer = &qbuf->buf->b.b; + sbuf.buffer_offset = qbuf->head; + sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem); + si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf); + sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1); + + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query); + return true; } static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - gfx10_release_query_buffers(sctx, query->first, query->last); - FREE(query); + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + gfx10_release_query_buffers(sctx, query->first, query->last); + FREE(query); } static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - gfx10_release_query_buffers(sctx, query->first, query->last); - query->first = query->last = NULL; + gfx10_release_query_buffers(sctx, query->first, query->last); + query->first = query->last = NULL; - if (unlikely(!gfx10_alloc_query_buffer(sctx))) - return false; + if (unlikely(!gfx10_alloc_query_buffer(sctx))) + return false; - query->first = list_last_entry(&sctx->shader_query_buffers, - struct gfx10_sh_query_buffer, list); - query->first_begin = query->first->head; + query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + query->first_begin = query->first->head; - sctx->num_active_shader_queries++; - query->first->refcount++; + sctx->num_active_shader_queries++; + query->first->refcount++; - return true; + return true; } static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - - if (unlikely(!query->first)) - return false; /* earlier out of memory error */ - - query->last = list_last_entry(&sctx->shader_query_buffers, - struct gfx10_sh_query_buffer, list); - query->last_end = query->last->head; - - /* Signal the fence of the previous chunk */ - if (query->last_end != 0) { - uint64_t fence_va = query->last->buf->gpu_address; - fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem); - fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence); - si_cp_release_mem(sctx, sctx->gfx_cs, - V_028A90_BOTTOM_OF_PIPE_TS, 0, - EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, - query->last->buf, fence_va, 0xffffffff, - PIPE_QUERY_GPU_FINISHED); - } - - sctx->num_active_shader_queries--; - - if (sctx->num_active_shader_queries > 
0) { - gfx10_alloc_query_buffer(sctx); - } else { - si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL); - sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED; - - /* If a query_begin is followed by a query_end without a draw - * in-between, we need to clear the atom to ensure that the - * next query_begin will re-initialize the shader buffer. */ - si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false); - } - - return true; + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + + if (unlikely(!query->first)) + return false; /* earlier out of memory error */ + + query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + query->last_end = query->last->head; + + /* Signal the fence of the previous chunk */ + if (query->last_end != 0) { + uint64_t fence_va = query->last->buf->gpu_address; + fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem); + fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence); + si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, + EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va, + 0xffffffff, PIPE_QUERY_GPU_FINISHED); + } + + sctx->num_active_shader_queries--; + + if (sctx->num_active_shader_queries > 0) { + gfx10_alloc_query_buffer(sctx); + } else { + si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL); + sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED; + + /* If a query_begin is followed by a query_end without a draw + * in-between, we need to clear the atom to ensure that the + * next query_begin will re-initialize the shader buffer. */ + si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false); + } + + return true; } static void gfx10_sh_query_add_result(struct gfx10_sh_query *query, - struct gfx10_sh_query_buffer_mem *qmem, - union pipe_query_result *result) + struct gfx10_sh_query_buffer_mem *qmem, + union pipe_query_result *result) { - static const uint64_t mask = ((uint64_t)1 << 63) - 1; - - switch (query->b.type) { - case PIPE_QUERY_PRIMITIVES_EMITTED: - result->u64 += qmem->stream[query->stream].emitted_primitives & mask; - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - result->u64 += qmem->stream[query->stream].generated_primitives & mask; - break; - case PIPE_QUERY_SO_STATISTICS: - result->so_statistics.num_primitives_written += - qmem->stream[query->stream].emitted_primitives & mask; - result->so_statistics.primitives_storage_needed += - qmem->stream[query->stream].generated_primitives & mask; - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - result->b |= qmem->stream[query->stream].emitted_primitives != - qmem->stream[query->stream].generated_primitives; - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { - result->b |= qmem->stream[query->stream].emitted_primitives != - qmem->stream[query->stream].generated_primitives; - } - break; - default: - assert(0); - } + static const uint64_t mask = ((uint64_t)1 << 63) - 1; + + switch (query->b.type) { + case PIPE_QUERY_PRIMITIVES_EMITTED: + result->u64 += qmem->stream[query->stream].emitted_primitives & mask; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + result->u64 += qmem->stream[query->stream].generated_primitives & mask; + break; + case PIPE_QUERY_SO_STATISTICS: + result->so_statistics.num_primitives_written += + qmem->stream[query->stream].emitted_primitives & mask; + result->so_statistics.primitives_storage_needed += + 
qmem->stream[query->stream].generated_primitives & mask; + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + result->b |= qmem->stream[query->stream].emitted_primitives != + qmem->stream[query->stream].generated_primitives; + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { + result->b |= qmem->stream[query->stream].emitted_primitives != + qmem->stream[query->stream].generated_primitives; + } + break; + default: + assert(0); + } } -static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, - bool wait, union pipe_query_result *result) +static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait, + union pipe_query_result *result) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - util_query_clear_result(result, query->b.type); + util_query_clear_result(result, query->b.type); - if (unlikely(!query->first)) - return false; /* earlier out of memory error */ - assert(query->last); + if (unlikely(!query->first)) + return false; /* earlier out of memory error */ + assert(query->last); - for (struct gfx10_sh_query_buffer *qbuf = query->last;; - qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) { - unsigned usage = PIPE_TRANSFER_READ | - (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); - void *map; + for (struct gfx10_sh_query_buffer *qbuf = query->last;; + qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) { + unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); + void *map; - if (rquery->b.flushed) - map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); - else - map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage); + if (rquery->b.flushed) + map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); + else + map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage); - if (!map) - return false; + if (!map) + return false; - unsigned results_begin = 0; - unsigned results_end = qbuf->head; - if (qbuf == query->first) - results_begin = query->first_begin; - if (qbuf == query->last) - results_end = query->last_end; + unsigned results_begin = 0; + unsigned results_end = qbuf->head; + if (qbuf == query->first) + results_begin = query->first_begin; + if (qbuf == query->last) + results_end = query->last_end; - while (results_begin != results_end) { - struct gfx10_sh_query_buffer_mem *qmem = map + results_begin; - results_begin += sizeof(*qmem); + while (results_begin != results_end) { + struct gfx10_sh_query_buffer_mem *qmem = map + results_begin; + results_begin += sizeof(*qmem); - gfx10_sh_query_add_result(query, qmem, result); - } + gfx10_sh_query_add_result(query, qmem, result); + } - if (qbuf == query->first) - break; - } + if (qbuf == query->first) + break; + } - return true; + return true; } -static void gfx10_sh_query_get_result_resource(struct si_context *sctx, - struct si_query *rquery, - bool wait, - enum pipe_query_value_type result_type, - int index, - struct pipe_resource *resource, - unsigned offset) +static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery, + bool wait, enum pipe_query_value_type result_type, + int index, struct pipe_resource *resource, + unsigned offset) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - struct si_qbo_state saved_state = {}; - struct pipe_resource *tmp_buffer = NULL; - unsigned tmp_buffer_offset = 0; - - if 
(!sctx->sh_query_result_shader) { - sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx); - if (!sctx->sh_query_result_shader) - return; - } - - if (query->first != query->last) { - u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, - &tmp_buffer_offset, &tmp_buffer); - if (!tmp_buffer) - return; - } - - si_save_qbo_state(sctx, &saved_state); - - /* Pre-fill the constants configuring the shader behavior. */ - struct { - uint32_t config; - uint32_t offset; - uint32_t chain; - uint32_t result_count; - } consts; - struct pipe_constant_buffer constant_buffer = {}; - - if (index >= 0) { - switch (query->b.type) { - case PIPE_QUERY_PRIMITIVES_GENERATED: - consts.offset = sizeof(uint32_t) * query->stream; - consts.config = 0; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - consts.offset = sizeof(uint32_t) * (4 + query->stream); - consts.config = 0; - break; - case PIPE_QUERY_SO_STATISTICS: - consts.offset = sizeof(uint32_t) * (4 * index + query->stream); - consts.config = 0; - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - consts.offset = sizeof(uint32_t) * query->stream; - consts.config = 2; - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - consts.offset = 0; - consts.config = 3; - break; - default: unreachable("bad query type"); - } - } else { - /* Check result availability. */ - consts.offset = 0; - consts.config = 1; - } - - if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64) - consts.config |= 8; - - constant_buffer.buffer_size = sizeof(consts); - constant_buffer.user_buffer = &consts; - - /* Pre-fill the SSBOs and grid. */ - struct pipe_shader_buffer ssbo[3]; - struct pipe_grid_info grid = {}; - - ssbo[1].buffer = tmp_buffer; - ssbo[1].buffer_offset = tmp_buffer_offset; - ssbo[1].buffer_size = 16; - - ssbo[2] = ssbo[1]; - - sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader); - - grid.block[0] = 1; - grid.block[1] = 1; - grid.block[2] = 1; - grid.grid[0] = 1; - grid.grid[1] = 1; - grid.grid[2] = 1; - - struct gfx10_sh_query_buffer *qbuf = query->first; - for (;;) { - unsigned begin = qbuf == query->first ? query->first_begin : 0; - unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0; - if (!end) - continue; - - ssbo[0].buffer = &qbuf->buf->b.b; - ssbo[0].buffer_offset = begin; - ssbo[0].buffer_size = end - begin; - - consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem); - consts.chain = 0; - if (qbuf != query->first) - consts.chain |= 1; - if (qbuf != query->last) - consts.chain |= 2; - - if (qbuf == query->last) { - ssbo[2].buffer = resource; - ssbo[2].buffer_offset = offset; - ssbo[2].buffer_size = 8; - } - - sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer); - sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6); - - if (wait) { - uint64_t va; - - /* Wait for result availability. Wait only for readiness - * of the last entry, since the fence writes should be - * serialized in the CP. 
- */ - va = qbuf->buf->gpu_address; - va += end - sizeof(struct gfx10_sh_query_buffer_mem); - va += offsetof(struct gfx10_sh_query_buffer_mem, fence); - - si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0); - } - - sctx->b.launch_grid(&sctx->b, &grid); - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; - - if (qbuf == query->last) - break; - qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list); - } - - si_restore_qbo_state(sctx, &saved_state); - pipe_resource_reference(&tmp_buffer, NULL); + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + struct si_qbo_state saved_state = {}; + struct pipe_resource *tmp_buffer = NULL; + unsigned tmp_buffer_offset = 0; + + if (!sctx->sh_query_result_shader) { + sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx); + if (!sctx->sh_query_result_shader) + return; + } + + if (query->first != query->last) { + u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer); + if (!tmp_buffer) + return; + } + + si_save_qbo_state(sctx, &saved_state); + + /* Pre-fill the constants configuring the shader behavior. */ + struct { + uint32_t config; + uint32_t offset; + uint32_t chain; + uint32_t result_count; + } consts; + struct pipe_constant_buffer constant_buffer = {}; + + if (index >= 0) { + switch (query->b.type) { + case PIPE_QUERY_PRIMITIVES_GENERATED: + consts.offset = sizeof(uint32_t) * query->stream; + consts.config = 0; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + consts.offset = sizeof(uint32_t) * (4 + query->stream); + consts.config = 0; + break; + case PIPE_QUERY_SO_STATISTICS: + consts.offset = sizeof(uint32_t) * (4 * index + query->stream); + consts.config = 0; + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + consts.offset = sizeof(uint32_t) * query->stream; + consts.config = 2; + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + consts.offset = 0; + consts.config = 3; + break; + default: + unreachable("bad query type"); + } + } else { + /* Check result availability. */ + consts.offset = 0; + consts.config = 1; + } + + if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64) + consts.config |= 8; + + constant_buffer.buffer_size = sizeof(consts); + constant_buffer.user_buffer = &consts; + + /* Pre-fill the SSBOs and grid. */ + struct pipe_shader_buffer ssbo[3]; + struct pipe_grid_info grid = {}; + + ssbo[1].buffer = tmp_buffer; + ssbo[1].buffer_offset = tmp_buffer_offset; + ssbo[1].buffer_size = 16; + + ssbo[2] = ssbo[1]; + + sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader); + + grid.block[0] = 1; + grid.block[1] = 1; + grid.block[2] = 1; + grid.grid[0] = 1; + grid.grid[1] = 1; + grid.grid[2] = 1; + + struct gfx10_sh_query_buffer *qbuf = query->first; + for (;;) { + unsigned begin = qbuf == query->first ? query->first_begin : 0; + unsigned end = qbuf == query->last ? 
query->last_end : qbuf->buf->b.b.width0; + if (!end) + continue; + + ssbo[0].buffer = &qbuf->buf->b.b; + ssbo[0].buffer_offset = begin; + ssbo[0].buffer_size = end - begin; + + consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem); + consts.chain = 0; + if (qbuf != query->first) + consts.chain |= 1; + if (qbuf != query->last) + consts.chain |= 2; + + if (qbuf == query->last) { + ssbo[2].buffer = resource; + ssbo[2].buffer_offset = offset; + ssbo[2].buffer_size = 8; + } + + sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer); + sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6); + + if (wait) { + uint64_t va; + + /* Wait for result availability. Wait only for readiness + * of the last entry, since the fence writes should be + * serialized in the CP. + */ + va = qbuf->buf->gpu_address; + va += end - sizeof(struct gfx10_sh_query_buffer_mem); + va += offsetof(struct gfx10_sh_query_buffer_mem, fence); + + si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0); + } + + sctx->b.launch_grid(&sctx->b, &grid); + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + + if (qbuf == query->last) + break; + qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list); + } + + si_restore_qbo_state(sctx, &saved_state); + pipe_resource_reference(&tmp_buffer, NULL); } static const struct si_query_ops gfx10_sh_query_ops = { - .destroy = gfx10_sh_query_destroy, - .begin = gfx10_sh_query_begin, - .end = gfx10_sh_query_end, - .get_result = gfx10_sh_query_get_result, - .get_result_resource = gfx10_sh_query_get_result_resource, + .destroy = gfx10_sh_query_destroy, + .begin = gfx10_sh_query_begin, + .end = gfx10_sh_query_end, + .get_result = gfx10_sh_query_get_result, + .get_result_resource = gfx10_sh_query_get_result_resource, }; -struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, - enum pipe_query_type query_type, - unsigned index) +struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type, + unsigned index) { - struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query); - if (unlikely(!query)) - return NULL; + struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query); + if (unlikely(!query)) + return NULL; - query->b.ops = &gfx10_sh_query_ops; - query->b.type = query_type; - query->stream = index; + query->b.ops = &gfx10_sh_query_ops; + query->b.type = query_type; + query->stream = index; - return (struct pipe_query *)query; + return (struct pipe_query *)query; } void gfx10_init_query(struct si_context *sctx) { - list_inithead(&sctx->shader_query_buffers); - sctx->atoms.s.shader_query.emit = emit_shader_query; + list_inithead(&sctx->shader_query_buffers); + sctx->atoms.s.shader_query.emit = emit_shader_query; } void gfx10_destroy_query(struct si_context *sctx) { - while (!list_is_empty(&sctx->shader_query_buffers)) { - struct gfx10_sh_query_buffer *qbuf = - list_first_entry(&sctx->shader_query_buffers, - struct gfx10_sh_query_buffer, list); - list_del(&qbuf->list); - - assert(!qbuf->refcount); - si_resource_reference(&qbuf->buf, NULL); - FREE(qbuf); - } + while (!list_is_empty(&sctx->shader_query_buffers)) { + struct gfx10_sh_query_buffer *qbuf = + list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + list_del(&qbuf->list); + + assert(!qbuf->refcount); + si_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); + } } diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 
63439733507..06eba4a1f61 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -21,250 +21,239 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "ac_llvm_cull.h" #include "si_pipe.h" #include "si_shader_internal.h" - #include "sid.h" - #include "util/u_memory.h" #include "util/u_prim.h" -#include "ac_llvm_cull.h" static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->merged_wave_info, 24, 4); + return si_unpack_param(ctx, ctx->merged_wave_info, 24, 4); } static LLVMValueRef get_tgsize(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->merged_wave_info, 28, 4); + return si_unpack_param(ctx, ctx->merged_wave_info, 28, 4); } static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; - tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""); - return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), ""); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp; + tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""); + return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), ""); } static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->gs_tg_info, 12, 9); + return si_unpack_param(ctx, ctx->gs_tg_info, 12, 9); } static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->gs_tg_info, 22, 9); + return si_unpack_param(ctx, ctx->gs_tg_info, 22, 9); } static LLVMValueRef ngg_get_ordered_id(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->gs_tg_info, 0, 12); + return si_unpack_param(ctx, ctx->gs_tg_info, 0, 12); } static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx) { - LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - return ac_build_load_to_sgpr(&ctx->ac, buf_ptr, - LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false)); + return ac_build_load_to_sgpr(&ctx->ac, buf_ptr, + LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false)); } static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index) { - if (ctx->type == PIPE_SHADER_VERTEX) { - LLVMValueRef tmp; - tmp = LLVMBuildLShr(ctx->ac.builder, - ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id), - LLVMConstInt(ctx->ac.i32, 8 + index, false), ""); - return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, ""); - } - return ctx->ac.i1false; + if (ctx->type == PIPE_SHADER_VERTEX) { + LLVMValueRef tmp; + tmp = LLVMBuildLShr(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id), + LLVMConstInt(ctx->ac.i32, 8 + index, false), ""); + return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, ""); + } + return ctx->ac.i1false; } /** * Return the number of vertices as a constant in \p num_vertices, * and return a more precise value as LLVMValueRef from the function. 
*/ -static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, - unsigned *num_vertices) +static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, unsigned *num_vertices) { - const struct si_shader_info *info = &ctx->shader->selector->info; - - if (ctx->type == PIPE_SHADER_VERTEX) { - if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { - /* Blits always use axis-aligned rectangles with 3 vertices. */ - *num_vertices = 3; - return LLVMConstInt(ctx->ac.i32, 3, 0); - } else { - /* We always build up all three indices for the prim export - * independent of the primitive type. The additional garbage - * data shouldn't hurt. This number doesn't matter with - * NGG passthrough. - */ - *num_vertices = 3; - - /* Extract OUTPRIM field. */ - LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2); - return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, ""); - } - } else { - assert(ctx->type == PIPE_SHADER_TESS_EVAL); - - if (info->properties[TGSI_PROPERTY_TES_POINT_MODE]) - *num_vertices = 1; - else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) - *num_vertices = 2; - else - *num_vertices = 3; - - return LLVMConstInt(ctx->ac.i32, *num_vertices, false); - } + const struct si_shader_info *info = &ctx->shader->selector->info; + + if (ctx->type == PIPE_SHADER_VERTEX) { + if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { + /* Blits always use axis-aligned rectangles with 3 vertices. */ + *num_vertices = 3; + return LLVMConstInt(ctx->ac.i32, 3, 0); + } else { + /* We always build up all three indices for the prim export + * independent of the primitive type. The additional garbage + * data shouldn't hurt. This number doesn't matter with + * NGG passthrough. + */ + *num_vertices = 3; + + /* Extract OUTPRIM field. 
*/ + LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2); + return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, ""); + } + } else { + assert(ctx->type == PIPE_SHADER_TESS_EVAL); + + if (info->properties[TGSI_PROPERTY_TES_POINT_MODE]) + *num_vertices = 1; + else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) + *num_vertices = 2; + else + *num_vertices = 3; + + return LLVMConstInt(ctx->ac.i32, *num_vertices, false); + } } bool gfx10_ngg_export_prim_early(struct si_shader *shader) { - struct si_shader_selector *sel = shader->selector; + struct si_shader_selector *sel = shader->selector; - assert(shader->key.as_ngg && !shader->key.as_es); + assert(shader->key.as_ngg && !shader->key.as_es); - return sel->type != PIPE_SHADER_GEOMETRY && - !sel->info.writes_edgeflag; + return sel->type != PIPE_SHADER_GEOMETRY && !sel->info.writes_edgeflag; } void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx) { - ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), - ngg_get_vtx_cnt(ctx), - ngg_get_prim_cnt(ctx)); + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx), + ngg_get_prim_cnt(ctx)); } -void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, - LLVMValueRef user_edgeflags[3], - LLVMValueRef prim_passthrough) +void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3], + LLVMValueRef prim_passthrough) { - LLVMBuilderRef builder = ctx->ac.builder; - - if (gfx10_is_ngg_passthrough(ctx->shader) || - ctx->shader->key.opt.ngg_culling) { - ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001); - { - struct ac_ngg_prim prim = {}; - - if (prim_passthrough) - prim.passthrough = prim_passthrough; - else - prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset); - - /* This is only used with NGG culling, which returns the NGG - * passthrough prim export encoding. 
- */ - if (ctx->shader->selector->info.writes_edgeflag) { - unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS; - LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0); - - unsigned num_vertices; - ngg_get_vertices_per_prim(ctx, &num_vertices); - - for (unsigned i = 0; i < num_vertices; i++) { - unsigned shift = 9 + i*10; - LLVMValueRef edge; - - edge = LLVMBuildLoad(builder, user_edgeflags[i], ""); - edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, ""); - edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), ""); - edgeflags = LLVMBuildOr(builder, edgeflags, edge, ""); - } - prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, ""); - } - - ac_build_export_prim(&ctx->ac, &prim); - } - ac_build_endif(&ctx->ac, 6001); - return; - } - - ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001); - { - struct ac_ngg_prim prim = {}; - - ngg_get_vertices_per_prim(ctx, &prim.num_vertices); - - prim.isnull = ctx->ac.i1false; - prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); - prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); - prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); - - for (unsigned i = 0; i < prim.num_vertices; ++i) { - prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i); - - if (ctx->shader->selector->info.writes_edgeflag) { - LLVMValueRef edge; - - edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], ""); - edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, ""); - prim.edgeflag[i] = edge; - } - } - - ac_build_export_prim(&ctx->ac, &prim); - } - ac_build_endif(&ctx->ac, 6001); + LLVMBuilderRef builder = ctx->ac.builder; + + if (gfx10_is_ngg_passthrough(ctx->shader) || ctx->shader->key.opt.ngg_culling) { + ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001); + { + struct ac_ngg_prim prim = {}; + + if (prim_passthrough) + prim.passthrough = prim_passthrough; + else + prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset); + + /* This is only used with NGG culling, which returns the NGG + * passthrough prim export encoding. 
+ */ + if (ctx->shader->selector->info.writes_edgeflag) { + unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS; + LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0); + + unsigned num_vertices; + ngg_get_vertices_per_prim(ctx, &num_vertices); + + for (unsigned i = 0; i < num_vertices; i++) { + unsigned shift = 9 + i * 10; + LLVMValueRef edge; + + edge = LLVMBuildLoad(builder, user_edgeflags[i], ""); + edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, ""); + edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), ""); + edgeflags = LLVMBuildOr(builder, edgeflags, edge, ""); + } + prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, ""); + } + + ac_build_export_prim(&ctx->ac, &prim); + } + ac_build_endif(&ctx->ac, 6001); + return; + } + + ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001); + { + struct ac_ngg_prim prim = {}; + + ngg_get_vertices_per_prim(ctx, &prim.num_vertices); + + prim.isnull = ctx->ac.i1false; + prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); + prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); + prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); + + for (unsigned i = 0; i < prim.num_vertices; ++i) { + prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i); + + if (ctx->shader->selector->info.writes_edgeflag) { + LLVMValueRef edge; + + edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], ""); + edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, ""); + prim.edgeflag[i] = edge; + } + } + + ac_build_export_prim(&ctx->ac, &prim); + } + ac_build_endif(&ctx->ac, 6001); } -static void build_streamout_vertex(struct si_shader_context *ctx, - LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw, - unsigned stream, LLVMValueRef offset_vtx, - LLVMValueRef vertexptr) +static void build_streamout_vertex(struct si_shader_context *ctx, LLVMValueRef *so_buffer, + LLVMValueRef *wg_offset_dw, unsigned stream, + LLVMValueRef offset_vtx, LLVMValueRef vertexptr) { - struct si_shader_info *info = &ctx->shader->selector->info; - struct pipe_stream_output_info *so = &ctx->shader->selector->so; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef offset[4] = {}; - LLVMValueRef tmp; - - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (!wg_offset_dw[buffer]) - continue; - - tmp = LLVMBuildMul(builder, offset_vtx, - LLVMConstInt(ctx->ac.i32, so->stride[buffer], false), ""); - tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, ""); - offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), ""); - } - - for (unsigned i = 0; i < so->num_outputs; ++i) { - if (so->output[i].stream != stream) - continue; - - unsigned reg = so->output[i].register_index; - struct si_shader_output_values out; - out.semantic_name = info->output_semantic_name[reg]; - out.semantic_index = info->output_semantic_index[reg]; - - for (unsigned comp = 0; comp < 4; comp++) { - tmp = ac_build_gep0(&ctx->ac, vertexptr, - LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false)); - out.values[comp] = LLVMBuildLoad(builder, tmp, ""); - out.vertex_stream[comp] = - (info->output_streams[reg] >> (2 * comp)) & 3; - } - - si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out); - } + struct si_shader_info *info = &ctx->shader->selector->info; + struct pipe_stream_output_info *so = &ctx->shader->selector->so; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef offset[4] = {}; + LLVMValueRef tmp; + + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if 
(!wg_offset_dw[buffer]) + continue; + + tmp = LLVMBuildMul(builder, offset_vtx, LLVMConstInt(ctx->ac.i32, so->stride[buffer], false), + ""); + tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, ""); + offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), ""); + } + + for (unsigned i = 0; i < so->num_outputs; ++i) { + if (so->output[i].stream != stream) + continue; + + unsigned reg = so->output[i].register_index; + struct si_shader_output_values out; + out.semantic_name = info->output_semantic_name[reg]; + out.semantic_index = info->output_semantic_index[reg]; + + for (unsigned comp = 0; comp < 4; comp++) { + tmp = ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false)); + out.values[comp] = LLVMBuildLoad(builder, tmp, ""); + out.vertex_stream[comp] = (info->output_streams[reg] >> (2 * comp)) & 3; + } + + si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out); + } } struct ngg_streamout { - LLVMValueRef num_vertices; + LLVMValueRef num_vertices; - /* per-thread data */ - LLVMValueRef prim_enable[4]; /* i1 per stream */ - LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */ + /* per-thread data */ + LLVMValueRef prim_enable[4]; /* i1 per stream */ + LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */ - /* Output */ - LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */ + /* Output */ + LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */ }; /** @@ -276,427 +265,405 @@ struct ngg_streamout { * * Clobbers gs_ngg_scratch[8:]. */ -static void build_streamout(struct si_shader_context *ctx, - struct ngg_streamout *nggso) +static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout *nggso) { - struct si_shader_info *info = &ctx->shader->selector->info; - struct pipe_stream_output_info *so = &ctx->shader->selector->so; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - LLVMValueRef tid = get_thread_id_in_tg(ctx); - LLVMValueRef tmp, tmp2; - LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false); - LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false); - LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false); - LLVMValueRef so_buffer[4] = {}; - unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + - (nggso->vertices[2] ? 1 : 0); - LLVMValueRef prim_stride_dw[4] = {}; - LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32); - int stream_for_buffer[4] = { -1, -1, -1, -1 }; - unsigned bufmask_for_stream[4] = {}; - bool isgs = ctx->type == PIPE_SHADER_GEOMETRY; - unsigned scratch_emit_base = isgs ? 4 : 0; - LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0; - unsigned scratch_offset_base = isgs ? 8 : 4; - LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4; - - ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256); - - /* Determine the mapping of streamout buffers to vertex streams. 
*/ - for (unsigned i = 0; i < so->num_outputs; ++i) { - unsigned buf = so->output[i].output_buffer; - unsigned stream = so->output[i].stream; - assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream); - stream_for_buffer[buf] = stream; - bufmask_for_stream[stream] |= 1 << buf; - } - - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] == -1) - continue; - - assert(so->stride[buffer]); - - tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false); - prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, ""); - prim_stride_dw_vgpr = ac_build_writelane( - &ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer], - LLVMConstInt(ctx->ac.i32, buffer, false)); - - so_buffer[buffer] = ac_build_load_to_sgpr( - &ctx->ac, buf_ptr, - LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false)); - } - - tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, tmp, 5200); - { - LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); - LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); - - /* Advance the streamout offsets in GDS. */ - LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - - tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); - ac_build_ifcc(&ctx->ac, tmp, 5210); - { - if (isgs) { - tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid); - tmp = LLVMBuildLoad(builder, tmp, ""); - } else { - tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, - ngg_get_prim_cnt(ctx), ctx->ac.i32_0); - } - LLVMBuildStore(builder, tmp, generated_by_stream_vgpr); - - unsigned swizzle[4]; - int unused_stream = -1; - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) { - unused_stream = stream; - break; - } - } - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] >= 0) { - swizzle[buffer] = stream_for_buffer[buffer]; - } else { - assert(unused_stream >= 0); - swizzle[buffer] = unused_stream; - } - } - - tmp = ac_build_quad_swizzle(&ctx->ac, tmp, - swizzle[0], swizzle[1], swizzle[2], swizzle[3]); - tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); - - LLVMValueRef args[] = { - LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""), - tmp, - ctx->ac.i32_0, // ordering - ctx->ac.i32_0, // scope - ctx->ac.i1false, // isVolatile - LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index - ctx->ac.i1true, // wave release - ctx->ac.i1true, // wave done - }; - tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", - ctx->ac.i32, args, ARRAY_SIZE(args), 0); - - /* Keep offsets in a VGPR for quick retrieval via readlane by - * the first wave for bounds checking, and also store in LDS - * for retrieval by all waves later. */ - LLVMBuildStore(builder, tmp, offsets_vgpr); - - tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), - scratch_offset_basev, ""); - tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2); - LLVMBuildStore(builder, tmp, tmp2); - } - ac_build_endif(&ctx->ac, 5210); - - /* Determine the max emit per buffer. This is done via the SALU, in part - * because LLVM can't generate divide-by-multiply if we try to do this - * via VALU with one lane per buffer. 
- */ - LLVMValueRef max_emit[4] = {}; - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] == -1) - continue; - - LLVMValueRef bufsize_dw = - LLVMBuildLShr(builder, - LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), - i32_2, ""); - - tmp = LLVMBuildLoad(builder, offsets_vgpr, ""); - LLVMValueRef offset_dw = - ac_build_readlane(&ctx->ac, tmp, - LLVMConstInt(ctx->ac.i32, buffer, false)); - - tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, ""); - tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], ""); - - tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, ""); - max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, ""); - } - - /* Determine the number of emitted primitives per stream and fixup the - * GDS counter if necessary. - * - * This is complicated by the fact that a single stream can emit to - * multiple buffers (but luckily not vice versa). - */ - LLVMValueRef emit_vgpr = ctx->ac.i32_0; - - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) - continue; - - tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, ""); - LLVMValueRef generated = - ac_build_readlane(&ctx->ac, tmp, - LLVMConstInt(ctx->ac.i32, stream, false)); - - LLVMValueRef emit = generated; - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] == stream) - emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]); - } - - emit_vgpr = ac_build_writelane(&ctx->ac, emit_vgpr, emit, - LLVMConstInt(ctx->ac.i32, stream, false)); - - /* Fixup the offset using a plain GDS atomic if we overflowed. */ - tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, ""); - ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */ - tmp = LLVMBuildLShr(builder, - LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false), - ac_get_thread_id(&ctx->ac), ""); - tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - ac_build_ifcc(&ctx->ac, tmp, 5222); - { - tmp = LLVMBuildSub(builder, generated, emit, ""); - tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); - tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, ""); - LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp, - LLVMAtomicOrderingMonotonic, false); - } - ac_build_endif(&ctx->ac, 5222); - ac_build_endif(&ctx->ac, 5221); - } - - tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); - ac_build_ifcc(&ctx->ac, tmp, 5225); - { - tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), - scratch_emit_basev, ""); - tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp); - LLVMBuildStore(builder, emit_vgpr, tmp); - } - ac_build_endif(&ctx->ac, 5225); - } - ac_build_endif(&ctx->ac, 5200); - - /* Determine the workgroup-relative per-thread / primitive offset into - * the streamout buffers */ - struct ac_wg_scan primemit_scan[4] = {}; - - if (isgs) { - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) - continue; - - primemit_scan[stream].enable_exclusive = true; - primemit_scan[stream].op = nir_op_iadd; - primemit_scan[stream].src = nggso->prim_enable[stream]; - primemit_scan[stream].scratch = - ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, - LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false)); - primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx); - primemit_scan[stream].numwaves = get_tgsize(ctx); - primemit_scan[stream].maxwaves = 8; - ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]); - } - } - - ac_build_s_barrier(&ctx->ac); - - /* 
Fetch the per-buffer offsets and per-stream emit counts in all waves. */ - LLVMValueRef wgoffset_dw[4] = {}; - - { - LLVMValueRef scratch_vgpr; - - tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac)); - scratch_vgpr = LLVMBuildLoad(builder, tmp, ""); - - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] >= 0) { - wgoffset_dw[buffer] = ac_build_readlane( - &ctx->ac, scratch_vgpr, - LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false)); - } - } - - for (unsigned stream = 0; stream < 4; ++stream) { - if (info->num_stream_output_components[stream]) { - nggso->emit[stream] = ac_build_readlane( - &ctx->ac, scratch_vgpr, - LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false)); - } - } - } - - /* Write out primitive data */ - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) - continue; - - if (isgs) { - ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]); - } else { - primemit_scan[stream].result_exclusive = tid; - } - - tmp = LLVMBuildICmp(builder, LLVMIntULT, - primemit_scan[stream].result_exclusive, - nggso->emit[stream], ""); - tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], ""); - ac_build_ifcc(&ctx->ac, tmp, 5240); - { - LLVMValueRef offset_vtx = - LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, - nggso->num_vertices, ""); - - for (unsigned i = 0; i < max_num_vertices; ++i) { - tmp = LLVMBuildICmp(builder, LLVMIntULT, - LLVMConstInt(ctx->ac.i32, i, false), - nggso->num_vertices, ""); - ac_build_ifcc(&ctx->ac, tmp, 5241); - build_streamout_vertex(ctx, so_buffer, wgoffset_dw, - stream, offset_vtx, nggso->vertices[i]); - ac_build_endif(&ctx->ac, 5241); - offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, ""); - } - } - ac_build_endif(&ctx->ac, 5240); - } + struct si_shader_info *info = &ctx->shader->selector->info; + struct pipe_stream_output_info *so = &ctx->shader->selector->so; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef tid = get_thread_id_in_tg(ctx); + LLVMValueRef tmp, tmp2; + LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false); + LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false); + LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false); + LLVMValueRef so_buffer[4] = {}; + unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0); + LLVMValueRef prim_stride_dw[4] = {}; + LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32); + int stream_for_buffer[4] = {-1, -1, -1, -1}; + unsigned bufmask_for_stream[4] = {}; + bool isgs = ctx->type == PIPE_SHADER_GEOMETRY; + unsigned scratch_emit_base = isgs ? 4 : 0; + LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0; + unsigned scratch_offset_base = isgs ? 8 : 4; + LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4; + + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256); + + /* Determine the mapping of streamout buffers to vertex streams. 
*/ + for (unsigned i = 0; i < so->num_outputs; ++i) { + unsigned buf = so->output[i].output_buffer; + unsigned stream = so->output[i].stream; + assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream); + stream_for_buffer[buf] = stream; + bufmask_for_stream[stream] |= 1 << buf; + } + + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == -1) + continue; + + assert(so->stride[buffer]); + + tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false); + prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, ""); + prim_stride_dw_vgpr = + ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer], + LLVMConstInt(ctx->ac.i32, buffer, false)); + + so_buffer[buffer] = ac_build_load_to_sgpr( + &ctx->ac, buf_ptr, LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false)); + } + + tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5200); + { + LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); + LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); + + /* Advance the streamout offsets in GDS. */ + LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); + ac_build_ifcc(&ctx->ac, tmp, 5210); + { + if (isgs) { + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid); + tmp = LLVMBuildLoad(builder, tmp, ""); + } else { + tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0); + } + LLVMBuildStore(builder, tmp, generated_by_stream_vgpr); + + unsigned swizzle[4]; + int unused_stream = -1; + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) { + unused_stream = stream; + break; + } + } + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] >= 0) { + swizzle[buffer] = stream_for_buffer[buffer]; + } else { + assert(unused_stream >= 0); + swizzle[buffer] = unused_stream; + } + } + + tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]); + tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); + + LLVMValueRef args[] = { + LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""), + tmp, + ctx->ac.i32_0, // ordering + ctx->ac.i32_0, // scope + ctx->ac.i1false, // isVolatile + LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index + ctx->ac.i1true, // wave release + ctx->ac.i1true, // wave done + }; + tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args, + ARRAY_SIZE(args), 0); + + /* Keep offsets in a VGPR for quick retrieval via readlane by + * the first wave for bounds checking, and also store in LDS + * for retrieval by all waves later. */ + LLVMBuildStore(builder, tmp, offsets_vgpr); + + tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, ""); + tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2); + LLVMBuildStore(builder, tmp, tmp2); + } + ac_build_endif(&ctx->ac, 5210); + + /* Determine the max emit per buffer. This is done via the SALU, in part + * because LLVM can't generate divide-by-multiply if we try to do this + * via VALU with one lane per buffer. 
+ */ + LLVMValueRef max_emit[4] = {}; + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == -1) + continue; + + LLVMValueRef bufsize_dw = LLVMBuildLShr( + builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, ""); + + tmp = LLVMBuildLoad(builder, offsets_vgpr, ""); + LLVMValueRef offset_dw = + ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false)); + + tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, ""); + tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], ""); + + tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, ""); + max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, ""); + } + + /* Determine the number of emitted primitives per stream and fixup the + * GDS counter if necessary. + * + * This is complicated by the fact that a single stream can emit to + * multiple buffers (but luckily not vice versa). + */ + LLVMValueRef emit_vgpr = ctx->ac.i32_0; + + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) + continue; + + tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, ""); + LLVMValueRef generated = + ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false)); + + LLVMValueRef emit = generated; + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == stream) + emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]); + } + + emit_vgpr = + ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false)); + + /* Fixup the offset using a plain GDS atomic if we overflowed. */ + tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, ""); + ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */ + tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false), + ac_get_thread_id(&ctx->ac), ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5222); + { + tmp = LLVMBuildSub(builder, generated, emit, ""); + tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); + tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, ""); + LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp, + LLVMAtomicOrderingMonotonic, false); + } + ac_build_endif(&ctx->ac, 5222); + ac_build_endif(&ctx->ac, 5221); + } + + tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); + ac_build_ifcc(&ctx->ac, tmp, 5225); + { + tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, ""); + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp); + LLVMBuildStore(builder, emit_vgpr, tmp); + } + ac_build_endif(&ctx->ac, 5225); + } + ac_build_endif(&ctx->ac, 5200); + + /* Determine the workgroup-relative per-thread / primitive offset into + * the streamout buffers */ + struct ac_wg_scan primemit_scan[4] = {}; + + if (isgs) { + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) + continue; + + primemit_scan[stream].enable_exclusive = true; + primemit_scan[stream].op = nir_op_iadd; + primemit_scan[stream].src = nggso->prim_enable[stream]; + primemit_scan[stream].scratch = ac_build_gep0( + &ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false)); + primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx); + primemit_scan[stream].numwaves = get_tgsize(ctx); + primemit_scan[stream].maxwaves = 8; + ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]); + } + } + + ac_build_s_barrier(&ctx->ac); + + /* Fetch the 
per-buffer offsets and per-stream emit counts in all waves. */ + LLVMValueRef wgoffset_dw[4] = {}; + + { + LLVMValueRef scratch_vgpr; + + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac)); + scratch_vgpr = LLVMBuildLoad(builder, tmp, ""); + + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] >= 0) { + wgoffset_dw[buffer] = + ac_build_readlane(&ctx->ac, scratch_vgpr, + LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false)); + } + } + + for (unsigned stream = 0; stream < 4; ++stream) { + if (info->num_stream_output_components[stream]) { + nggso->emit[stream] = + ac_build_readlane(&ctx->ac, scratch_vgpr, + LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false)); + } + } + } + + /* Write out primitive data */ + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) + continue; + + if (isgs) { + ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]); + } else { + primemit_scan[stream].result_exclusive = tid; + } + + tmp = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive, + nggso->emit[stream], ""); + tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], ""); + ac_build_ifcc(&ctx->ac, tmp, 5240); + { + LLVMValueRef offset_vtx = + LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, ""); + + for (unsigned i = 0; i < max_num_vertices; ++i) { + tmp = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false), + nggso->num_vertices, ""); + ac_build_ifcc(&ctx->ac, tmp, 5241); + build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx, + nggso->vertices[i]); + ac_build_endif(&ctx->ac, 5241); + offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, ""); + } + } + ac_build_endif(&ctx->ac, 5240); + } } /* LDS layout of ES vertex data for NGG culling. */ -enum { - /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old - * ES thread ID. After vertex compaction, compacted ES threads - * store the old thread ID here to copy input VGPRs from uncompacted - * ES threads. - * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value. - * Byte 2: TES rel patch ID - * Byte 3: Unused - */ - lds_byte0_accept_flag = 0, - lds_byte0_old_thread_id = 0, - lds_byte1_new_thread_id, - lds_byte2_tes_rel_patch_id, - lds_byte3_unused, - - lds_packed_data = 0, /* lds_byteN_... */ - - lds_pos_x, - lds_pos_y, - lds_pos_z, - lds_pos_w, - lds_pos_x_div_w, - lds_pos_y_div_w, - /* If VS: */ - lds_vertex_id, - lds_instance_id, /* optional */ - /* If TES: */ - lds_tes_u = lds_vertex_id, - lds_tes_v = lds_instance_id, - lds_tes_patch_id, /* optional */ +enum +{ + /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old + * ES thread ID. After vertex compaction, compacted ES threads + * store the old thread ID here to copy input VGPRs from uncompacted + * ES threads. + * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value. + * Byte 2: TES rel patch ID + * Byte 3: Unused + */ + lds_byte0_accept_flag = 0, + lds_byte0_old_thread_id = 0, + lds_byte1_new_thread_id, + lds_byte2_tes_rel_patch_id, + lds_byte3_unused, + + lds_packed_data = 0, /* lds_byteN_... 
*/ + + lds_pos_x, + lds_pos_y, + lds_pos_z, + lds_pos_w, + lds_pos_x_div_w, + lds_pos_y_div_w, + /* If VS: */ + lds_vertex_id, + lds_instance_id, /* optional */ + /* If TES: */ + lds_tes_u = lds_vertex_id, + lds_tes_v = lds_instance_id, + lds_tes_patch_id, /* optional */ }; -static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx, - LLVMValueRef ptr, unsigned byte_index) +static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx, LLVMValueRef ptr, + unsigned byte_index) { - assert(byte_index < 4); - LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); - LLVMValueRef index = LLVMConstInt(ctx->ac.i32, byte_index, 0); + assert(byte_index < 4); + LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); + LLVMValueRef index = LLVMConstInt(ctx->ac.i32, byte_index, 0); - return LLVMBuildGEP(ctx->ac.builder, - LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""), - &index, 1, ""); + return LLVMBuildGEP(ctx->ac.builder, LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""), &index, + 1, ""); } static unsigned ngg_nogs_vertex_size(struct si_shader *shader) { - unsigned lds_vertex_size = 0; - - /* The edgeflag is always stored in the last element that's also - * used for padding to reduce LDS bank conflicts. */ - if (shader->selector->so.num_outputs) - lds_vertex_size = 4 * shader->selector->info.num_outputs + 1; - if (shader->selector->info.writes_edgeflag) - lds_vertex_size = MAX2(lds_vertex_size, 1); - - /* LDS size for passing data from GS to ES. - * GS stores Primitive IDs into LDS at the address corresponding - * to the ES thread of the provoking vertex. All ES threads - * load and export PrimitiveID for their thread. - */ - if (shader->selector->type == PIPE_SHADER_VERTEX && - shader->key.mono.u.vs_export_prim_id) - lds_vertex_size = MAX2(lds_vertex_size, 1); - - if (shader->key.opt.ngg_culling) { - if (shader->selector->type == PIPE_SHADER_VERTEX) { - STATIC_ASSERT(lds_instance_id + 1 == 9); - lds_vertex_size = MAX2(lds_vertex_size, 9); - } else { - assert(shader->selector->type == PIPE_SHADER_TESS_EVAL); - - if (shader->selector->info.uses_primid || - shader->key.mono.u.vs_export_prim_id) { - STATIC_ASSERT(lds_tes_patch_id + 2 == 11); - lds_vertex_size = MAX2(lds_vertex_size, 11); - } else { - STATIC_ASSERT(lds_tes_v + 1 == 9); - lds_vertex_size = MAX2(lds_vertex_size, 9); - } - } - } - - return lds_vertex_size; + unsigned lds_vertex_size = 0; + + /* The edgeflag is always stored in the last element that's also + * used for padding to reduce LDS bank conflicts. */ + if (shader->selector->so.num_outputs) + lds_vertex_size = 4 * shader->selector->info.num_outputs + 1; + if (shader->selector->info.writes_edgeflag) + lds_vertex_size = MAX2(lds_vertex_size, 1); + + /* LDS size for passing data from GS to ES. + * GS stores Primitive IDs into LDS at the address corresponding + * to the ES thread of the provoking vertex. All ES threads + * load and export PrimitiveID for their thread. 
+ */ + if (shader->selector->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id) + lds_vertex_size = MAX2(lds_vertex_size, 1); + + if (shader->key.opt.ngg_culling) { + if (shader->selector->type == PIPE_SHADER_VERTEX) { + STATIC_ASSERT(lds_instance_id + 1 == 9); + lds_vertex_size = MAX2(lds_vertex_size, 9); + } else { + assert(shader->selector->type == PIPE_SHADER_TESS_EVAL); + + if (shader->selector->info.uses_primid || shader->key.mono.u.vs_export_prim_id) { + STATIC_ASSERT(lds_tes_patch_id + 2 == 11); + lds_vertex_size = MAX2(lds_vertex_size, 11); + } else { + STATIC_ASSERT(lds_tes_v + 1 == 9); + lds_vertex_size = MAX2(lds_vertex_size, 9); + } + } + } + + return lds_vertex_size; } /** * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage * for the vertex outputs. */ -static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx, - LLVMValueRef vtxid) +static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vtxid) { - /* The extra dword is used to avoid LDS bank conflicts. */ - unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader); - LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size); - LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS); - LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, ""); - return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, ""); + /* The extra dword is used to avoid LDS bank conflicts. */ + unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader); + LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size); + LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS); + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, ""); + return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, ""); } -static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx, - LLVMValueRef ret, struct ac_arg param, - unsigned return_index) +static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx, LLVMValueRef ret, + struct ac_arg param, unsigned return_index) { - LLVMValueRef v = ac_get_arg(&ctx->ac, param); - - for (unsigned i = 0; i < 4; i++) { - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - ac_llvm_extract_elem(&ctx->ac, v, i), - return_index + i, ""); - } - return ret; + LLVMValueRef v = ac_get_arg(&ctx->ac, param); + + for (unsigned i = 0; i < 4; i++) { + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, ac_llvm_extract_elem(&ctx->ac, v, i), + return_index + i, ""); + } + return ret; } -static void load_bitmasks_2x64(struct si_shader_context *ctx, - LLVMValueRef lds_ptr, unsigned dw_offset, - LLVMValueRef mask[2], LLVMValueRef *total_bitcount) +static void load_bitmasks_2x64(struct si_shader_context *ctx, LLVMValueRef lds_ptr, + unsigned dw_offset, LLVMValueRef mask[2], + LLVMValueRef *total_bitcount) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef ptr64 = LLVMBuildPointerCast(builder, lds_ptr, - LLVMPointerType(LLVMArrayType(ctx->ac.i64, 2), - AC_ADDR_SPACE_LDS), ""); - for (unsigned i = 0; i < 2; i++) { - LLVMValueRef index = LLVMConstInt(ctx->ac.i32, dw_offset / 2 + i, 0); - mask[i] = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ptr64, index), ""); - } - - /* We get better code if we don't use the 128-bit bitcount. 
*/ - *total_bitcount = LLVMBuildAdd(builder, ac_build_bit_count(&ctx->ac, mask[0]), - ac_build_bit_count(&ctx->ac, mask[1]), ""); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef ptr64 = LLVMBuildPointerCast( + builder, lds_ptr, LLVMPointerType(LLVMArrayType(ctx->ac.i64, 2), AC_ADDR_SPACE_LDS), ""); + for (unsigned i = 0; i < 2; i++) { + LLVMValueRef index = LLVMConstInt(ctx->ac.i32, dw_offset / 2 + i, 0); + mask[i] = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ptr64, index), ""); + } + + /* We get better code if we don't use the 128-bit bitcount. */ + *total_bitcount = LLVMBuildAdd(builder, ac_build_bit_count(&ctx->ac, mask[0]), + ac_build_bit_count(&ctx->ac, mask[1]), ""); } /** @@ -711,38 +678,33 @@ static void load_bitmasks_2x64(struct si_shader_context *ctx, * \param wave_info_num_bits the bit size of thread count field in merged_wave_info * \param wave_info_shift the bit offset of the thread count field in merged_wave_info */ -static void update_thread_counts(struct si_shader_context *ctx, - LLVMValueRef *new_num_threads, - LLVMValueRef *tg_info, - unsigned tg_info_num_bits, - unsigned tg_info_shift, - LLVMValueRef *wave_info, - unsigned wave_info_num_bits, - unsigned wave_info_shift) +static void update_thread_counts(struct si_shader_context *ctx, LLVMValueRef *new_num_threads, + LLVMValueRef *tg_info, unsigned tg_info_num_bits, + unsigned tg_info_shift, LLVMValueRef *wave_info, + unsigned wave_info_num_bits, unsigned wave_info_shift) { - LLVMBuilderRef builder = ctx->ac.builder; - - /* Update the total thread count. */ - unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift); - *tg_info = LLVMBuildAnd(builder, *tg_info, - LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), ""); - *tg_info = LLVMBuildOr(builder, *tg_info, - LLVMBuildShl(builder, *new_num_threads, - LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), ""); - - /* Update the per-wave thread count. */ - LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""); - *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, ""); - *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0); - *new_num_threads = ac_build_imin(&ctx->ac, *new_num_threads, - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0)); - unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift); - *wave_info = LLVMBuildAnd(builder, *wave_info, - LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), ""); - *wave_info = LLVMBuildOr(builder, *wave_info, - LLVMBuildShl(builder, *new_num_threads, - LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""), ""); + LLVMBuilderRef builder = ctx->ac.builder; + + /* Update the total thread count. */ + unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift); + *tg_info = LLVMBuildAnd(builder, *tg_info, LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), ""); + *tg_info = LLVMBuildOr( + builder, *tg_info, + LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), ""); + + /* Update the per-wave thread count. 
*/ + LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""); + *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, ""); + *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0); + *new_num_threads = + ac_build_imin(&ctx->ac, *new_num_threads, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0)); + unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift); + *wave_info = LLVMBuildAnd(builder, *wave_info, LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), ""); + *wave_info = LLVMBuildOr( + builder, *wave_info, + LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""), + ""); } /** @@ -751,759 +713,719 @@ static void update_thread_counts(struct si_shader_context *ctx, * Also return the position, which is passed to the shader as an input, * so that we don't compute it twice. */ -void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) +void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *shader = ctx->shader; - struct si_shader_selector *sel = shader->selector; - struct si_shader_info *info = &sel->info; - LLVMBuilderRef builder = ctx->ac.builder; - - assert(shader->key.opt.ngg_culling); - assert(shader->key.as_ngg); - assert(sel->type == PIPE_SHADER_VERTEX || - (sel->type == PIPE_SHADER_TESS_EVAL && !shader->key.as_es)); - - LLVMValueRef position[4] = {}; - for (unsigned i = 0; i < info->num_outputs; i++) { - switch (info->output_semantic_name[i]) { - case TGSI_SEMANTIC_POSITION: - for (unsigned j = 0; j < 4; j++) { - position[j] = LLVMBuildLoad(ctx->ac.builder, - addrs[4 * i + j], ""); - } - break; - } - } - assert(position[0]); - - /* Store Position.XYZW into LDS. */ - LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); - for (unsigned chan = 0; chan < 4; chan++) { - LLVMBuildStore(builder, ac_to_integer(&ctx->ac, position[chan]), - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0))); - } - /* Store Position.XY / W into LDS. */ - for (unsigned chan = 0; chan < 2; chan++) { - LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]); - LLVMBuildStore(builder, ac_to_integer(&ctx->ac, val), - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0))); - } - - /* Store VertexID and InstanceID. ES threads will have to load them - * from LDS after vertex compaction and use them instead of their own - * system values. 
- */ - bool uses_instance_id = false; - bool uses_tes_prim_id = false; - LLVMValueRef packed_data = ctx->ac.i32_0; - - if (ctx->type == PIPE_SHADER_VERTEX) { - uses_instance_id = sel->info.uses_instanceid || - shader->key.part.vs.prolog.instance_divisor_is_one || - shader->key.part.vs.prolog.instance_divisor_is_fetched; - - LLVMBuildStore(builder, ctx->abi.vertex_id, - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0))); - if (uses_instance_id) { - LLVMBuildStore(builder, ctx->abi.instance_id, - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_instance_id, 0))); - } - } else { - uses_tes_prim_id = sel->info.uses_primid || - shader->key.mono.u.vs_export_prim_id; - - assert(ctx->type == PIPE_SHADER_TESS_EVAL); - LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)), - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_tes_u, 0))); - LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)), - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_tes_v, 0))); - packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id), - LLVMConstInt(ctx->ac.i32, lds_byte2_tes_rel_patch_id * 8, 0), ""); - if (uses_tes_prim_id) { - LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id), - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0))); - } - } - /* Initialize the packed data. */ - LLVMBuildStore(builder, packed_data, - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_packed_data, 0))); - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); - - LLVMValueRef tid = ac_get_thread_id(&ctx->ac); - - /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have less - * than 4 waves, but we always read all 4 values. This is where the thread - * bitmasks of unculled threads will be stored. - * - * gs_ngg_scratch layout: esmask[0..3] - */ - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx), - LLVMConstInt(ctx->ac.i32, 3, 0), ""), 16101); - { - LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, ""); - LLVMBuildStore(builder, ctx->ac.i32_0, - ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index)); - } - ac_build_endif(&ctx->ac, 16101); - ac_build_s_barrier(&ctx->ac); - - /* The hardware requires that there are no holes between unculled vertices, - * which means we have to pack ES threads, i.e. reduce the ES thread count - * and move ES input VGPRs to lower threads. The upside is that varyings - * are only fetched and computed for unculled vertices. - * - * Vertex compaction in GS threads: - * - * Part 1: Compute the surviving vertex mask in GS threads: - * - Compute 4 32-bit surviving vertex masks in LDS. (max 4 waves) - * - In GS, notify ES threads whether the vertex survived. - * - Barrier - * - ES threads will create the mask and store it in LDS. - * - Barrier - * - Each GS thread loads the vertex masks from LDS. - * - * Part 2: Compact ES threads in GS threads: - * - Compute the prefix sum for all 3 vertices from the masks. These are the new - * thread IDs for each vertex within the primitive. - * - Write the value of the old thread ID into the LDS address of the new thread ID. - * The ES thread will load the old thread ID and use it to load the position, VertexID, - * and InstanceID. - * - Update vertex indices and null flag in the GS input VGPRs. 
- * - Barrier - * - * Part 3: Update inputs GPRs - * - For all waves, update per-wave thread counts in input SGPRs. - * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs). - */ - - LLVMValueRef vtxindex[3]; - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) { - /* For the GS fast launch, the VS prologs simply puts the Vertex IDs - * into these VGPRs. - */ - vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset); - vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset); - vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset); - } else { - vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); - vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); - vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); - }; - LLVMValueRef gs_vtxptr[] = { - ngg_nogs_vertex_ptr(ctx, vtxindex[0]), - ngg_nogs_vertex_ptr(ctx, vtxindex[1]), - ngg_nogs_vertex_ptr(ctx, vtxindex[2]), - }; - es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); - - LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); - - /* Do culling in GS threads. */ - ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002); - { - /* Load positions. */ - LLVMValueRef pos[3][4] = {}; - for (unsigned vtx = 0; vtx < 3; vtx++) { - for (unsigned chan = 0; chan < 4; chan++) { - unsigned index; - if (chan == 0 || chan == 1) - index = lds_pos_x_div_w + chan; - else if (chan == 3) - index = lds_pos_w; - else - continue; - - LLVMValueRef addr = ac_build_gep0(&ctx->ac, gs_vtxptr[vtx], - LLVMConstInt(ctx->ac.i32, index, 0)); - pos[vtx][chan] = LLVMBuildLoad(builder, addr, ""); - pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]); - } - } - - /* Load the viewport state for small prim culling. */ - LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, - ac_get_arg(&ctx->ac, ctx->small_prim_cull_info), - ctx->ac.i32_0); - vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); - LLVMValueRef vp_scale[2], vp_translate[2]; - vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); - vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); - vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); - vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); - - /* Get the small prim filter precision. */ - LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4); - small_prim_precision = LLVMBuildOr(builder, small_prim_precision, - LLVMConstInt(ctx->ac.i32, 0x70, 0), ""); - small_prim_precision = LLVMBuildShl(builder, small_prim_precision, - LLVMConstInt(ctx->ac.i32, 23, 0), ""); - small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, ""); - - /* Execute culling code. */ - struct ac_cull_options options = {}; - options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE; - options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE; - options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS; - options.cull_small_prims = options.cull_view_xy; - options.cull_zero_area = options.cull_front || options.cull_back; - options.cull_w = true; - - /* Tell ES threads whether their vertex survived. 
*/ - ac_build_ifcc(&ctx->ac, ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true, - vp_scale, vp_translate, - small_prim_precision, &options), 16003); - { - LLVMBuildStore(builder, ctx->ac.i32_1, gs_accepted); - for (unsigned vtx = 0; vtx < 3; vtx++) { - LLVMBuildStore(builder, ctx->ac.i8_1, - si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag)); - } - } - ac_build_endif(&ctx->ac, 16003); - } - ac_build_endif(&ctx->ac, 16002); - ac_build_s_barrier(&ctx->ac); - - gs_accepted = LLVMBuildLoad(builder, gs_accepted, ""); - - LLVMValueRef es_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, ""); - - /* Convert the per-vertex flag to a thread bitmask in ES threads and store it in LDS. */ - ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007); - { - LLVMValueRef es_accepted_flag = - LLVMBuildLoad(builder, - si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), ""); - - LLVMValueRef es_accepted_bool = LLVMBuildICmp(builder, LLVMIntNE, - es_accepted_flag, ctx->ac.i8_0, ""); - LLVMValueRef es_mask = ac_get_i1_sgpr_mask(&ctx->ac, es_accepted_bool); - - LLVMBuildStore(builder, es_accepted_bool, es_accepted); - - ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, - tid, ctx->ac.i32_0, ""), 16008); - { - LLVMBuildStore(builder, es_mask, - ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, - get_wave_id_in_tg(ctx))); - } - ac_build_endif(&ctx->ac, 16008); - } - ac_build_endif(&ctx->ac, 16007); - ac_build_s_barrier(&ctx->ac); - - /* Load the vertex masks and compute the new ES thread count. */ - LLVMValueRef es_mask[2], new_num_es_threads, kill_wave; - load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads); - new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL); - - /* ES threads compute their prefix sum, which is the new ES thread ID. - * Then they write the value of the old thread ID into the LDS address - * of the new thread ID. It will be used it to load input VGPRs from - * the old thread's LDS location. - */ - ac_build_ifcc(&ctx->ac, LLVMBuildLoad(builder, es_accepted, ""), 16009); - { - LLVMValueRef old_id = get_thread_id_in_tg(ctx); - LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id); - - LLVMBuildStore(builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""), - si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id), - lds_byte0_old_thread_id)); - LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""), - si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id)); - } - ac_build_endif(&ctx->ac, 16009); - - /* Kill waves that have inactive threads. */ - kill_wave = LLVMBuildICmp(builder, LLVMIntULE, - ac_build_imax(&ctx->ac, new_num_es_threads, ngg_get_prim_cnt(ctx)), - LLVMBuildMul(builder, get_wave_id_in_tg(ctx), - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""), ""); - ac_build_ifcc(&ctx->ac, kill_wave, 19202); - { - /* If we are killing wave 0, send that there are no primitives - * in this threadgroup. - */ - ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), - ctx->ac.i32_0, ctx->ac.i32_0); - ac_build_s_endpgm(&ctx->ac); - } - ac_build_endif(&ctx->ac, 19202); - ac_build_s_barrier(&ctx->ac); - - /* Send the final vertex and primitive counts. */ - ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), - new_num_es_threads, ngg_get_prim_cnt(ctx)); - - /* Update thread counts in SGPRs. 
*/ - LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->gs_tg_info); - LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->merged_wave_info); - - /* This also converts the thread count from the total count to the per-wave count. */ - update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12, - &new_merged_wave_info, 8, 0); - - /* Update vertex indices in VGPR0 (same format as NGG passthrough). */ - LLVMValueRef new_vgpr0 = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - - /* Set the null flag at the beginning (culled), and then - * overwrite it for accepted primitives. - */ - LLVMBuildStore(builder, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), new_vgpr0); - - /* Get vertex indices after vertex compaction. */ - ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011); - { - struct ac_ngg_prim prim = {}; - prim.num_vertices = 3; - prim.isnull = ctx->ac.i1false; - - for (unsigned vtx = 0; vtx < 3; vtx++) { - prim.index[vtx] = - LLVMBuildLoad(builder, - si_build_gep_i8(ctx, gs_vtxptr[vtx], - lds_byte1_new_thread_id), ""); - prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, ""); - prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx); - } - - /* Set the new GS input VGPR. */ - LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0); - } - ac_build_endif(&ctx->ac, 16011); - - if (gfx10_ngg_export_prim_early(shader)) - gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, "")); - - /* Set the new ES input VGPRs. */ - LLVMValueRef es_data[4]; - LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - - for (unsigned i = 0; i < 4; i++) - es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - - ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, - new_num_es_threads, ""), 16012); - { - LLVMValueRef old_id, old_es_vtxptr, tmp; - - /* Load ES input VGPRs from the ES thread before compaction. */ - old_id = LLVMBuildLoad(builder, - si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), ""); - old_id = LLVMBuildZExt(builder, old_id, ctx->ac.i32, ""); - - LLVMBuildStore(builder, old_id, old_thread_id); - old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id); - - for (unsigned i = 0; i < 2; i++) { - tmp = LLVMBuildLoad(builder, - ac_build_gep0(&ctx->ac, old_es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)), ""); - LLVMBuildStore(builder, tmp, es_data[i]); - } - - if (ctx->type == PIPE_SHADER_TESS_EVAL) { - tmp = LLVMBuildLoad(builder, - si_build_gep_i8(ctx, old_es_vtxptr, - lds_byte2_tes_rel_patch_id), ""); - tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); - LLVMBuildStore(builder, tmp, es_data[2]); - - if (uses_tes_prim_id) { - tmp = LLVMBuildLoad(builder, - ac_build_gep0(&ctx->ac, old_es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)), ""); - LLVMBuildStore(builder, tmp, es_data[3]); - } - } - } - ac_build_endif(&ctx->ac, 16012); - - /* Return values for the main function. 
*/ - LLVMValueRef ret = ctx->return_value; - LLVMValueRef val; - - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, ""); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, ""); - if (ctx->type == PIPE_SHADER_TESS_EVAL) - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 4); - - ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, - 8 + SI_SGPR_RW_BUFFERS); - ret = si_insert_input_ptr(ctx, ret, - ctx->bindless_samplers_and_images, - 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); - ret = si_insert_input_ptr(ctx, ret, - ctx->const_and_shader_buffers, - 8 + SI_SGPR_CONST_AND_SHADER_BUFFERS); - ret = si_insert_input_ptr(ctx, ret, - ctx->samplers_and_images, - 8 + SI_SGPR_SAMPLERS_AND_IMAGES); - ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, - 8 + SI_SGPR_VS_STATE_BITS); - - if (ctx->type == PIPE_SHADER_VERTEX) { - ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, - 8 + SI_SGPR_BASE_VERTEX); - ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance, - 8 + SI_SGPR_START_INSTANCE); - ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id, - 8 + SI_SGPR_DRAWID); - ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers, - 8 + SI_VS_NUM_USER_SGPR); - - for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) { - ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i], - 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4); - } - } else { - assert(ctx->type == PIPE_SHADER_TESS_EVAL); - ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout, - 8 + SI_SGPR_TES_OFFCHIP_LAYOUT); - ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr, - 8 + SI_SGPR_TES_OFFCHIP_ADDR); - } - - unsigned vgpr; - if (ctx->type == PIPE_SHADER_VERTEX) { - if (shader->selector->num_vbos_in_user_sgprs) { - vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + - shader->selector->num_vbos_in_user_sgprs * 4; - } else { - vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1; - } - } else { - vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; - } - - val = LLVMBuildLoad(builder, new_vgpr0, ""); - ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), - vgpr++, ""); - vgpr++; /* gs_vtx23_offset */ - - ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++); - vgpr++; /* gs_vtx45_offset */ - - if (ctx->type == PIPE_SHADER_VERTEX) { - val = LLVMBuildLoad(builder, es_data[0], ""); - ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), - vgpr++, ""); /* VGPR5 - VertexID */ - vgpr += 2; - if (uses_instance_id) { - val = LLVMBuildLoad(builder, es_data[1], ""); - ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), - vgpr++, ""); /* VGPR8 - InstanceID */ - } else { - vgpr++; - } - } else { - assert(ctx->type == PIPE_SHADER_TESS_EVAL); - unsigned num_vgprs = uses_tes_prim_id ? 4 : 3; - for (unsigned i = 0; i < num_vgprs; i++) { - val = LLVMBuildLoad(builder, es_data[i], ""); - ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), - vgpr++, ""); - } - if (num_vgprs == 3) - vgpr++; - } - /* Return the old thread ID. */ - val = LLVMBuildLoad(builder, old_thread_id, ""); - ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, ""); - - /* These two also use LDS. 
*/ - if (sel->info.writes_edgeflag || - (ctx->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)) - ac_build_s_barrier(&ctx->ac); - - ctx->return_value = ret; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *shader = ctx->shader; + struct si_shader_selector *sel = shader->selector; + struct si_shader_info *info = &sel->info; + LLVMBuilderRef builder = ctx->ac.builder; + + assert(shader->key.opt.ngg_culling); + assert(shader->key.as_ngg); + assert(sel->type == PIPE_SHADER_VERTEX || + (sel->type == PIPE_SHADER_TESS_EVAL && !shader->key.as_es)); + + LLVMValueRef position[4] = {}; + for (unsigned i = 0; i < info->num_outputs; i++) { + switch (info->output_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + for (unsigned j = 0; j < 4; j++) { + position[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], ""); + } + break; + } + } + assert(position[0]); + + /* Store Position.XYZW into LDS. */ + LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); + for (unsigned chan = 0; chan < 4; chan++) { + LLVMBuildStore( + builder, ac_to_integer(&ctx->ac, position[chan]), + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0))); + } + /* Store Position.XY / W into LDS. */ + for (unsigned chan = 0; chan < 2; chan++) { + LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]); + LLVMBuildStore( + builder, ac_to_integer(&ctx->ac, val), + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0))); + } + + /* Store VertexID and InstanceID. ES threads will have to load them + * from LDS after vertex compaction and use them instead of their own + * system values. + */ + bool uses_instance_id = false; + bool uses_tes_prim_id = false; + LLVMValueRef packed_data = ctx->ac.i32_0; + + if (ctx->type == PIPE_SHADER_VERTEX) { + uses_instance_id = sel->info.uses_instanceid || + shader->key.part.vs.prolog.instance_divisor_is_one || + shader->key.part.vs.prolog.instance_divisor_is_fetched; + + LLVMBuildStore( + builder, ctx->abi.vertex_id, + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0))); + if (uses_instance_id) { + LLVMBuildStore( + builder, ctx->abi.instance_id, + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0))); + } + } else { + uses_tes_prim_id = sel->info.uses_primid || shader->key.mono.u.vs_export_prim_id; + + assert(ctx->type == PIPE_SHADER_TESS_EVAL); + LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)), + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_u, 0))); + LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)), + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_v, 0))); + packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id), + LLVMConstInt(ctx->ac.i32, lds_byte2_tes_rel_patch_id * 8, 0), ""); + if (uses_tes_prim_id) { + LLVMBuildStore( + builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id), + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0))); + } + } + /* Initialize the packed data. 
*/ + LLVMBuildStore( + builder, packed_data, + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0))); + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + + LLVMValueRef tid = ac_get_thread_id(&ctx->ac); + + /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have less + * than 4 waves, but we always read all 4 values. This is where the thread + * bitmasks of unculled threads will be stored. + * + * gs_ngg_scratch layout: esmask[0..3] + */ + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, 3, 0), ""), + 16101); + { + LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, ""); + LLVMBuildStore(builder, ctx->ac.i32_0, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index)); + } + ac_build_endif(&ctx->ac, 16101); + ac_build_s_barrier(&ctx->ac); + + /* The hardware requires that there are no holes between unculled vertices, + * which means we have to pack ES threads, i.e. reduce the ES thread count + * and move ES input VGPRs to lower threads. The upside is that varyings + * are only fetched and computed for unculled vertices. + * + * Vertex compaction in GS threads: + * + * Part 1: Compute the surviving vertex mask in GS threads: + * - Compute 4 32-bit surviving vertex masks in LDS. (max 4 waves) + * - In GS, notify ES threads whether the vertex survived. + * - Barrier + * - ES threads will create the mask and store it in LDS. + * - Barrier + * - Each GS thread loads the vertex masks from LDS. + * + * Part 2: Compact ES threads in GS threads: + * - Compute the prefix sum for all 3 vertices from the masks. These are the new + * thread IDs for each vertex within the primitive. + * - Write the value of the old thread ID into the LDS address of the new thread ID. + * The ES thread will load the old thread ID and use it to load the position, VertexID, + * and InstanceID. + * - Update vertex indices and null flag in the GS input VGPRs. + * - Barrier + * + * Part 3: Update inputs GPRs + * - For all waves, update per-wave thread counts in input SGPRs. + * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs). + */ + + LLVMValueRef vtxindex[3]; + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) { + /* For the GS fast launch, the VS prologs simply puts the Vertex IDs + * into these VGPRs. + */ + vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset); + vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset); + vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset); + } else { + vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); + vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); + vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); + }; + LLVMValueRef gs_vtxptr[] = { + ngg_nogs_vertex_ptr(ctx, vtxindex[0]), + ngg_nogs_vertex_ptr(ctx, vtxindex[1]), + ngg_nogs_vertex_ptr(ctx, vtxindex[2]), + }; + es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); + + LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); + + /* Do culling in GS threads. */ + ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002); + { + /* Load positions. 
*/ + LLVMValueRef pos[3][4] = {}; + for (unsigned vtx = 0; vtx < 3; vtx++) { + for (unsigned chan = 0; chan < 4; chan++) { + unsigned index; + if (chan == 0 || chan == 1) + index = lds_pos_x_div_w + chan; + else if (chan == 3) + index = lds_pos_w; + else + continue; + + LLVMValueRef addr = + ac_build_gep0(&ctx->ac, gs_vtxptr[vtx], LLVMConstInt(ctx->ac.i32, index, 0)); + pos[vtx][chan] = LLVMBuildLoad(builder, addr, ""); + pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]); + } + } + + /* Load the viewport state for small prim culling. */ + LLVMValueRef vp = ac_build_load_invariant( + &ctx->ac, ac_get_arg(&ctx->ac, ctx->small_prim_cull_info), ctx->ac.i32_0); + vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); + LLVMValueRef vp_scale[2], vp_translate[2]; + vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); + vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); + vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); + vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); + + /* Get the small prim filter precision. */ + LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4); + small_prim_precision = + LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), ""); + small_prim_precision = + LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), ""); + small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, ""); + + /* Execute culling code. */ + struct ac_cull_options options = {}; + options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE; + options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE; + options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS; + options.cull_small_prims = options.cull_view_xy; + options.cull_zero_area = options.cull_front || options.cull_back; + options.cull_w = true; + + /* Tell ES threads whether their vertex survived. */ + ac_build_ifcc(&ctx->ac, + ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true, vp_scale, vp_translate, + small_prim_precision, &options), + 16003); + { + LLVMBuildStore(builder, ctx->ac.i32_1, gs_accepted); + for (unsigned vtx = 0; vtx < 3; vtx++) { + LLVMBuildStore(builder, ctx->ac.i8_1, + si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag)); + } + } + ac_build_endif(&ctx->ac, 16003); + } + ac_build_endif(&ctx->ac, 16002); + ac_build_s_barrier(&ctx->ac); + + gs_accepted = LLVMBuildLoad(builder, gs_accepted, ""); + + LLVMValueRef es_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, ""); + + /* Convert the per-vertex flag to a thread bitmask in ES threads and store it in LDS. */ + ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007); + { + LLVMValueRef es_accepted_flag = + LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), ""); + + LLVMValueRef es_accepted_bool = + LLVMBuildICmp(builder, LLVMIntNE, es_accepted_flag, ctx->ac.i8_0, ""); + LLVMValueRef es_mask = ac_get_i1_sgpr_mask(&ctx->ac, es_accepted_bool); + + LLVMBuildStore(builder, es_accepted_bool, es_accepted); + + ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, ctx->ac.i32_0, ""), 16008); + { + LLVMBuildStore(builder, es_mask, + ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, get_wave_id_in_tg(ctx))); + } + ac_build_endif(&ctx->ac, 16008); + } + ac_build_endif(&ctx->ac, 16007); + ac_build_s_barrier(&ctx->ac); + + /* Load the vertex masks and compute the new ES thread count. 
*/ + LLVMValueRef es_mask[2], new_num_es_threads, kill_wave; + load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads); + new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL); + + /* ES threads compute their prefix sum, which is the new ES thread ID. + * Then they write the value of the old thread ID into the LDS address + * of the new thread ID. It will be used it to load input VGPRs from + * the old thread's LDS location. + */ + ac_build_ifcc(&ctx->ac, LLVMBuildLoad(builder, es_accepted, ""), 16009); + { + LLVMValueRef old_id = get_thread_id_in_tg(ctx); + LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id); + + LLVMBuildStore( + builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""), + si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id), lds_byte0_old_thread_id)); + LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""), + si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id)); + } + ac_build_endif(&ctx->ac, 16009); + + /* Kill waves that have inactive threads. */ + kill_wave = LLVMBuildICmp(builder, LLVMIntULE, + ac_build_imax(&ctx->ac, new_num_es_threads, ngg_get_prim_cnt(ctx)), + LLVMBuildMul(builder, get_wave_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""), + ""); + ac_build_ifcc(&ctx->ac, kill_wave, 19202); + { + /* If we are killing wave 0, send that there are no primitives + * in this threadgroup. + */ + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ctx->ac.i32_0); + ac_build_s_endpgm(&ctx->ac); + } + ac_build_endif(&ctx->ac, 19202); + ac_build_s_barrier(&ctx->ac); + + /* Send the final vertex and primitive counts. */ + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), new_num_es_threads, + ngg_get_prim_cnt(ctx)); + + /* Update thread counts in SGPRs. */ + LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->gs_tg_info); + LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->merged_wave_info); + + /* This also converts the thread count from the total count to the per-wave count. */ + update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12, &new_merged_wave_info, 8, + 0); + + /* Update vertex indices in VGPR0 (same format as NGG passthrough). */ + LLVMValueRef new_vgpr0 = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + /* Set the null flag at the beginning (culled), and then + * overwrite it for accepted primitives. + */ + LLVMBuildStore(builder, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), new_vgpr0); + + /* Get vertex indices after vertex compaction. */ + ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011); + { + struct ac_ngg_prim prim = {}; + prim.num_vertices = 3; + prim.isnull = ctx->ac.i1false; + + for (unsigned vtx = 0; vtx < 3; vtx++) { + prim.index[vtx] = LLVMBuildLoad( + builder, si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte1_new_thread_id), ""); + prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, ""); + prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx); + } + + /* Set the new GS input VGPR. */ + LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0); + } + ac_build_endif(&ctx->ac, 16011); + + if (gfx10_ngg_export_prim_early(shader)) + gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, "")); + + /* Set the new ES input VGPRs. 
*/ + LLVMValueRef es_data[4]; + LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + for (unsigned i = 0; i < 4; i++) + es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, new_num_es_threads, ""), + 16012); + { + LLVMValueRef old_id, old_es_vtxptr, tmp; + + /* Load ES input VGPRs from the ES thread before compaction. */ + old_id = LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), ""); + old_id = LLVMBuildZExt(builder, old_id, ctx->ac.i32, ""); + + LLVMBuildStore(builder, old_id, old_thread_id); + old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id); + + for (unsigned i = 0; i < 2; i++) { + tmp = LLVMBuildLoad( + builder, + ac_build_gep0(&ctx->ac, old_es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)), + ""); + LLVMBuildStore(builder, tmp, es_data[i]); + } + + if (ctx->type == PIPE_SHADER_TESS_EVAL) { + tmp = LLVMBuildLoad(builder, + si_build_gep_i8(ctx, old_es_vtxptr, lds_byte2_tes_rel_patch_id), ""); + tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); + LLVMBuildStore(builder, tmp, es_data[2]); + + if (uses_tes_prim_id) { + tmp = LLVMBuildLoad(builder, + ac_build_gep0(&ctx->ac, old_es_vtxptr, + LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)), + ""); + LLVMBuildStore(builder, tmp, es_data[3]); + } + } + } + ac_build_endif(&ctx->ac, 16012); + + /* Return values for the main function. */ + LLVMValueRef ret = ctx->return_value; + LLVMValueRef val; + + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, ""); + if (ctx->type == PIPE_SHADER_TESS_EVAL) + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 4); + + ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS); + ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images, + 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); + ret = si_insert_input_ptr(ctx, ret, ctx->const_and_shader_buffers, + 8 + SI_SGPR_CONST_AND_SHADER_BUFFERS); + ret = si_insert_input_ptr(ctx, ret, ctx->samplers_and_images, 8 + SI_SGPR_SAMPLERS_AND_IMAGES); + ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); + + if (ctx->type == PIPE_SHADER_VERTEX) { + ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, 8 + SI_SGPR_BASE_VERTEX); + ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance, 8 + SI_SGPR_START_INSTANCE); + ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id, 8 + SI_SGPR_DRAWID); + ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers, 8 + SI_VS_NUM_USER_SGPR); + + for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) { + ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i], + 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4); + } + } else { + assert(ctx->type == PIPE_SHADER_TESS_EVAL); + ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout, 8 + SI_SGPR_TES_OFFCHIP_LAYOUT); + ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr, 8 + SI_SGPR_TES_OFFCHIP_ADDR); + } + + unsigned vgpr; + if (ctx->type == PIPE_SHADER_VERTEX) { + if (shader->selector->num_vbos_in_user_sgprs) { + vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4; + } else { + vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1; + } + } else { + vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; + } + + val = LLVMBuildLoad(builder, new_vgpr0, ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, 
""); + vgpr++; /* gs_vtx23_offset */ + + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++); + vgpr++; /* gs_vtx45_offset */ + + if (ctx->type == PIPE_SHADER_VERTEX) { + val = LLVMBuildLoad(builder, es_data[0], ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, + ""); /* VGPR5 - VertexID */ + vgpr += 2; + if (uses_instance_id) { + val = LLVMBuildLoad(builder, es_data[1], ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, + ""); /* VGPR8 - InstanceID */ + } else { + vgpr++; + } + } else { + assert(ctx->type == PIPE_SHADER_TESS_EVAL); + unsigned num_vgprs = uses_tes_prim_id ? 4 : 3; + for (unsigned i = 0; i < num_vgprs; i++) { + val = LLVMBuildLoad(builder, es_data[i], ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, ""); + } + if (num_vgprs == 3) + vgpr++; + } + /* Return the old thread ID. */ + val = LLVMBuildLoad(builder, old_thread_id, ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, ""); + + /* These two also use LDS. */ + if (sel->info.writes_edgeflag || + (ctx->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)) + ac_build_s_barrier(&ctx->ac); + + ctx->return_value = ret; } /** * Emit the epilogue of an API VS or TES shader compiled as ESGS shader. */ -void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) +void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_selector *sel = ctx->shader->selector; - struct si_shader_info *info = &sel->info; - struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS]; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp, tmp2; - - assert(!ctx->shader->is_gs_copy_shader); - assert(info->num_outputs <= max_outputs); - - LLVMValueRef vertex_ptr = NULL; - - if (sel->so.num_outputs || sel->info.writes_edgeflag) - vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); - - for (unsigned i = 0; i < info->num_outputs; i++) { - outputs[i].semantic_name = info->output_semantic_name[i]; - outputs[i].semantic_index = info->output_semantic_index[i]; - - for (unsigned j = 0; j < 4; j++) { - outputs[i].vertex_stream[j] = - (info->output_streams[i] >> (2 * j)) & 3; - - /* TODO: we may store more outputs than streamout needs, - * but streamout performance isn't that important. - */ - if (sel->so.num_outputs) { - tmp = ac_build_gep0(&ctx->ac, vertex_ptr, - LLVMConstInt(ctx->ac.i32, 4 * i + j, false)); - tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], ""); - tmp2 = ac_to_integer(&ctx->ac, tmp2); - LLVMBuildStore(builder, tmp2, tmp); - } - } - - /* Store the edgeflag at the end (if streamout is enabled) */ - if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG && - sel->info.writes_edgeflag) { - LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], ""); - /* The output is a float, but the hw expects a 1-bit integer. 
*/ - edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, ""); - edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1); - - tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); - tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp); - LLVMBuildStore(builder, edgeflag, tmp); - } - } - - bool unterminated_es_if_block = - !sel->so.num_outputs && - !sel->info.writes_edgeflag && - !ctx->screen->use_ngg_streamout && /* no query buffer */ - (ctx->type != PIPE_SHADER_VERTEX || - !ctx->shader->key.mono.u.vs_export_prim_id); - - if (!unterminated_es_if_block) - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); - - LLVMValueRef is_gs_thread = si_is_gs_thread(ctx); - LLVMValueRef is_es_thread = si_is_es_thread(ctx); - LLVMValueRef vtxindex[3]; - - if (ctx->shader->key.opt.ngg_culling) { - vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9); - vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9); - vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9); - } else { - vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); - vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); - vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); - } - - /* Determine the number of vertices per primitive. */ - unsigned num_vertices; - LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices); - - /* Streamout */ - LLVMValueRef emitted_prims = NULL; - - if (sel->so.num_outputs) { - assert(!unterminated_es_if_block); - - struct ngg_streamout nggso = {}; - nggso.num_vertices = num_vertices_val; - nggso.prim_enable[0] = is_gs_thread; - - for (unsigned i = 0; i < num_vertices; ++i) - nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); - - build_streamout(ctx, &nggso); - emitted_prims = nggso.emit[0]; - } - - LLVMValueRef user_edgeflags[3] = {}; - - if (sel->info.writes_edgeflag) { - assert(!unterminated_es_if_block); - - /* Streamout already inserted the barrier, so don't insert it again. */ - if (!sel->so.num_outputs) - ac_build_s_barrier(&ctx->ac); - - ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); - /* Load edge flags from ES threads and store them into VGPRs in GS threads. */ - for (unsigned i = 0; i < num_vertices; i++) { - tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); - tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); - tmp = ac_build_gep0(&ctx->ac, tmp, tmp2); - tmp = LLVMBuildLoad(builder, tmp, ""); - tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - - user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, ""); - LLVMBuildStore(builder, tmp, user_edgeflags[i]); - } - ac_build_endif(&ctx->ac, 5400); - } - - /* Copy Primitive IDs from GS threads to the LDS address corresponding - * to the ES thread of the provoking vertex. - */ - if (ctx->type == PIPE_SHADER_VERTEX && - ctx->shader->key.mono.u.vs_export_prim_id) { - assert(!unterminated_es_if_block); - - /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */ - if (sel->so.num_outputs || sel->info.writes_edgeflag) - ac_build_s_barrier(&ctx->ac); - - ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); - /* Extract the PROVOKING_VTX_INDEX field. 
*/ - LLVMValueRef provoking_vtx_in_prim = - si_unpack_param(ctx, ctx->vs_state_bits, 4, 2); - - /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */ - LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3); - LLVMValueRef provoking_vtx_index = - LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, ""); - LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index); - - LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id), - ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0)); - ac_build_endif(&ctx->ac, 5400); - } - - /* Update query buffer */ - if (ctx->screen->use_ngg_streamout && - !info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { - assert(!unterminated_es_if_block); - - tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1); - tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */ - tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, tmp, 5030); - tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac), - sel->so.num_outputs ? ctx->ac.i32_1 : ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, tmp, 5031); - { - LLVMValueRef args[] = { - ngg_get_prim_cnt(ctx), - ngg_get_query_buf(ctx), - LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */ - ctx->ac.i32_0, /* soffset */ - ctx->ac.i32_0, /* cachepolicy */ - }; - - if (sel->so.num_outputs) { - args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1); - args[2] = ac_build_writelane(&ctx->ac, args[2], - LLVMConstInt(ctx->ac.i32, 24, false), ctx->ac.i32_1); - } - - /* TODO: should this be 64-bit atomics? */ - ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", - ctx->ac.i32, args, 5, 0); - } - ac_build_endif(&ctx->ac, 5031); - ac_build_endif(&ctx->ac, 5030); - ac_build_endif(&ctx->ac, 5029); - } - - /* Build the primitive export. */ - if (!gfx10_ngg_export_prim_early(ctx->shader)) { - assert(!unterminated_es_if_block); - gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL); - } - - /* Export per-vertex data (positions and parameters). */ - if (!unterminated_es_if_block) - ac_build_ifcc(&ctx->ac, is_es_thread, 6002); - { - unsigned i; - - /* Unconditionally (re-)load the values for proper SSA form. */ - for (i = 0; i < info->num_outputs; i++) { - /* If the NGG cull shader part computed the position, don't - * use the position from the current shader part. Instead, - * load it from LDS. - */ - if (info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION && - ctx->shader->key.opt.ngg_culling) { - vertex_ptr = ngg_nogs_vertex_ptr(ctx, - ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id)); - - for (unsigned j = 0; j < 4; j++) { - tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0); - tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp); - tmp = LLVMBuildLoad(builder, tmp, ""); - outputs[i].values[j] = ac_to_float(&ctx->ac, tmp); - } - } else { - for (unsigned j = 0; j < 4; j++) { - outputs[i].values[j] = - LLVMBuildLoad(builder, - addrs[4 * i + j], ""); - } - } - } - - if (ctx->shader->key.mono.u.vs_export_prim_id) { - outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; - outputs[i].semantic_index = 0; - - if (ctx->type == PIPE_SHADER_VERTEX) { - /* Wait for GS stores to finish. 
*/ - ac_build_s_barrier(&ctx->ac); - - tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); - tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); - outputs[i].values[0] = LLVMBuildLoad(builder, tmp, ""); - } else { - assert(ctx->type == PIPE_SHADER_TESS_EVAL); - outputs[i].values[0] = si_get_primitive_id(ctx, 0); - } - - outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]); - for (unsigned j = 1; j < 4; j++) - outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32); - - memset(outputs[i].vertex_stream, 0, - sizeof(outputs[i].vertex_stream)); - i++; - } - - si_llvm_build_vs_exports(ctx, outputs, i); - } - ac_build_endif(&ctx->ac, 6002); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_selector *sel = ctx->shader->selector; + struct si_shader_info *info = &sel->info; + struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS]; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp, tmp2; + + assert(!ctx->shader->is_gs_copy_shader); + assert(info->num_outputs <= max_outputs); + + LLVMValueRef vertex_ptr = NULL; + + if (sel->so.num_outputs || sel->info.writes_edgeflag) + vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); + + for (unsigned i = 0; i < info->num_outputs; i++) { + outputs[i].semantic_name = info->output_semantic_name[i]; + outputs[i].semantic_index = info->output_semantic_index[i]; + + for (unsigned j = 0; j < 4; j++) { + outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3; + + /* TODO: we may store more outputs than streamout needs, + * but streamout performance isn't that important. + */ + if (sel->so.num_outputs) { + tmp = ac_build_gep0(&ctx->ac, vertex_ptr, LLVMConstInt(ctx->ac.i32, 4 * i + j, false)); + tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], ""); + tmp2 = ac_to_integer(&ctx->ac, tmp2); + LLVMBuildStore(builder, tmp2, tmp); + } + } + + /* Store the edgeflag at the end (if streamout is enabled) */ + if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG && sel->info.writes_edgeflag) { + LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], ""); + /* The output is a float, but the hw expects a 1-bit integer. */ + edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, ""); + edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1); + + tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); + tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp); + LLVMBuildStore(builder, edgeflag, tmp); + } + } + + bool unterminated_es_if_block = + !sel->so.num_outputs && !sel->info.writes_edgeflag && + !ctx->screen->use_ngg_streamout && /* no query buffer */ + (ctx->type != PIPE_SHADER_VERTEX || !ctx->shader->key.mono.u.vs_export_prim_id); + + if (!unterminated_es_if_block) + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + + LLVMValueRef is_gs_thread = si_is_gs_thread(ctx); + LLVMValueRef is_es_thread = si_is_es_thread(ctx); + LLVMValueRef vtxindex[3]; + + if (ctx->shader->key.opt.ngg_culling) { + vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9); + vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9); + vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9); + } else { + vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); + vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); + vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); + } + + /* Determine the number of vertices per primitive. 
*/ + unsigned num_vertices; + LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices); + + /* Streamout */ + LLVMValueRef emitted_prims = NULL; + + if (sel->so.num_outputs) { + assert(!unterminated_es_if_block); + + struct ngg_streamout nggso = {}; + nggso.num_vertices = num_vertices_val; + nggso.prim_enable[0] = is_gs_thread; + + for (unsigned i = 0; i < num_vertices; ++i) + nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); + + build_streamout(ctx, &nggso); + emitted_prims = nggso.emit[0]; + } + + LLVMValueRef user_edgeflags[3] = {}; + + if (sel->info.writes_edgeflag) { + assert(!unterminated_es_if_block); + + /* Streamout already inserted the barrier, so don't insert it again. */ + if (!sel->so.num_outputs) + ac_build_s_barrier(&ctx->ac); + + ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); + /* Load edge flags from ES threads and store them into VGPRs in GS threads. */ + for (unsigned i = 0; i < num_vertices; i++) { + tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); + tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); + tmp = ac_build_gep0(&ctx->ac, tmp, tmp2); + tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + + user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, ""); + LLVMBuildStore(builder, tmp, user_edgeflags[i]); + } + ac_build_endif(&ctx->ac, 5400); + } + + /* Copy Primitive IDs from GS threads to the LDS address corresponding + * to the ES thread of the provoking vertex. + */ + if (ctx->type == PIPE_SHADER_VERTEX && ctx->shader->key.mono.u.vs_export_prim_id) { + assert(!unterminated_es_if_block); + + /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */ + if (sel->so.num_outputs || sel->info.writes_edgeflag) + ac_build_s_barrier(&ctx->ac); + + ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); + /* Extract the PROVOKING_VTX_INDEX field. */ + LLVMValueRef provoking_vtx_in_prim = si_unpack_param(ctx, ctx->vs_state_bits, 4, 2); + + /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */ + LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3); + LLVMValueRef provoking_vtx_index = + LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, ""); + LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index); + + LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id), + ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0)); + ac_build_endif(&ctx->ac, 5400); + } + + /* Update query buffer */ + if (ctx->screen->use_ngg_streamout && !info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { + assert(!unterminated_es_if_block); + + tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */ + tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5030); + tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac), + sel->so.num_outputs ? 
ctx->ac.i32_1 : ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5031); + { + LLVMValueRef args[] = { + ngg_get_prim_cnt(ctx), + ngg_get_query_buf(ctx), + LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */ + ctx->ac.i32_0, /* soffset */ + ctx->ac.i32_0, /* cachepolicy */ + }; + + if (sel->so.num_outputs) { + args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1); + args[2] = ac_build_writelane(&ctx->ac, args[2], LLVMConstInt(ctx->ac.i32, 24, false), + ctx->ac.i32_1); + } + + /* TODO: should this be 64-bit atomics? */ + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5, + 0); + } + ac_build_endif(&ctx->ac, 5031); + ac_build_endif(&ctx->ac, 5030); + ac_build_endif(&ctx->ac, 5029); + } + + /* Build the primitive export. */ + if (!gfx10_ngg_export_prim_early(ctx->shader)) { + assert(!unterminated_es_if_block); + gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL); + } + + /* Export per-vertex data (positions and parameters). */ + if (!unterminated_es_if_block) + ac_build_ifcc(&ctx->ac, is_es_thread, 6002); + { + unsigned i; + + /* Unconditionally (re-)load the values for proper SSA form. */ + for (i = 0; i < info->num_outputs; i++) { + /* If the NGG cull shader part computed the position, don't + * use the position from the current shader part. Instead, + * load it from LDS. + */ + if (info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION && + ctx->shader->key.opt.ngg_culling) { + vertex_ptr = ngg_nogs_vertex_ptr(ctx, ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id)); + + for (unsigned j = 0; j < 4; j++) { + tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0); + tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp); + tmp = LLVMBuildLoad(builder, tmp, ""); + outputs[i].values[j] = ac_to_float(&ctx->ac, tmp); + } + } else { + for (unsigned j = 0; j < 4; j++) { + outputs[i].values[j] = LLVMBuildLoad(builder, addrs[4 * i + j], ""); + } + } + } + + if (ctx->shader->key.mono.u.vs_export_prim_id) { + outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; + outputs[i].semantic_index = 0; + + if (ctx->type == PIPE_SHADER_VERTEX) { + /* Wait for GS stores to finish. 
*/ + ac_build_s_barrier(&ctx->ac); + + tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); + tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); + outputs[i].values[0] = LLVMBuildLoad(builder, tmp, ""); + } else { + assert(ctx->type == PIPE_SHADER_TESS_EVAL); + outputs[i].values[0] = si_get_primitive_id(ctx, 0); + } + + outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]); + for (unsigned j = 1; j < 4; j++) + outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32); + + memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream)); + i++; + } + + si_llvm_build_vs_exports(ctx, outputs, i); + } + ac_build_endif(&ctx->ac, 6002); } -static LLVMValueRef -ngg_gs_get_vertex_storage(struct si_shader_context *ctx) +static LLVMValueRef ngg_gs_get_vertex_storage(struct si_shader_context *ctx) { - const struct si_shader_selector *sel = ctx->shader->selector; - const struct si_shader_info *info = &sel->info; - - LLVMTypeRef elements[2] = { - LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs), - LLVMArrayType(ctx->ac.i8, 4), - }; - LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false); - type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS); - return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, ""); + const struct si_shader_selector *sel = ctx->shader->selector; + const struct si_shader_info *info = &sel->info; + + LLVMTypeRef elements[2] = { + LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs), + LLVMArrayType(ctx->ac.i8, 4), + }; + LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false); + type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS); + return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, ""); } /** @@ -1536,452 +1458,424 @@ ngg_gs_get_vertex_storage(struct si_shader_context *ctx) * * \return an LDS pointer to type {[N x i32], [4 x i8]} */ -static LLVMValueRef -ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx) +static LLVMValueRef ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx) { - struct si_shader_selector *sel = ctx->shader->selector; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx); - - /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */ - unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1; - if (write_stride_2exp) { - LLVMValueRef row = - LLVMBuildLShr(builder, vertexidx, - LLVMConstInt(ctx->ac.i32, 5, false), ""); - LLVMValueRef swizzle = - LLVMBuildAnd(builder, row, - LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, - false), ""); - vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, ""); - } - - return ac_build_gep0(&ctx->ac, storage, vertexidx); + struct si_shader_selector *sel = ctx->shader->selector; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx); + + /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */ + unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1; + if (write_stride_2exp) { + LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), ""); + LLVMValueRef swizzle = LLVMBuildAnd( + builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), ""); + vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, ""); + } + + return ac_build_gep0(&ctx->ac, storage, vertexidx); } -static LLVMValueRef -ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread, - LLVMValueRef emitidx) +static 
LLVMValueRef ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread, + LLVMValueRef emitidx) { - struct si_shader_selector *sel = ctx->shader->selector; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; - - tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false); - tmp = LLVMBuildMul(builder, tmp, gsthread, ""); - const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, ""); - return ngg_gs_vertex_ptr(ctx, vertexidx); + struct si_shader_selector *sel = ctx->shader->selector; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp; + + tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false); + tmp = LLVMBuildMul(builder, tmp, gsthread, ""); + const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, ""); + return ngg_gs_vertex_ptr(ctx, vertexidx); } -static LLVMValueRef -ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr, - unsigned out_idx) +static LLVMValueRef ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx, + LLVMValueRef vertexptr, unsigned out_idx) { - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_0, /* first struct entry */ - LLVMConstInt(ctx->ac.i32, out_idx, false), - }; - return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_0, /* first struct entry */ + LLVMConstInt(ctx->ac.i32, out_idx, false), + }; + return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); } -static LLVMValueRef -ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr, - unsigned stream) +static LLVMValueRef ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx, + LLVMValueRef vertexptr, unsigned stream) { - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_1, /* second struct entry */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_1, /* second struct entry */ + LLVMConstInt(ctx->ac.i32, stream, false), + }; + return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); } -void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, - unsigned stream, - LLVMValueRef *addrs) +void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs) { - const struct si_shader_selector *sel = ctx->shader->selector; - const struct si_shader_info *info = &sel->info; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; - const LLVMValueRef vertexidx = - LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); - - /* If this thread has already emitted the declared maximum number of - * vertices, skip the write: excessive vertex emissions are not - * supposed to have any effect. 
- */ - const LLVMValueRef can_emit = - LLVMBuildICmp(builder, LLVMIntULT, vertexidx, - LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), ""); - - tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); - tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, ""); - LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); - - ac_build_ifcc(&ctx->ac, can_emit, 9001); - - const LLVMValueRef vertexptr = - ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx); - unsigned out_idx = 0; - for (unsigned i = 0; i < info->num_outputs; i++) { - for (unsigned chan = 0; chan < 4; chan++, out_idx++) { - if (!(info->output_usagemask[i] & (1 << chan)) || - ((info->output_streams[i] >> (2 * chan)) & 3) != stream) - continue; - - LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], ""); - out_val = ac_to_integer(&ctx->ac, out_val); - LLVMBuildStore(builder, out_val, - ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx)); - } - } - assert(out_idx * 4 == sel->gsvs_vertex_size); - - /* Determine and store whether this vertex completed a primitive. */ - const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], ""); - - tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false); - const LLVMValueRef iscompleteprim = - LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, ""); - - /* Since the geometry shader emits triangle strips, we need to - * track which primitive is odd and swap vertex indices to get - * the correct vertex order. - */ - LLVMValueRef is_odd = ctx->ac.i1false; - if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) { - tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, ""); - is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, ""); - } - - tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, ""); - LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]); - - /* The per-vertex primitive flag encoding: - * bit 0: whether this vertex finishes a primitive - * bit 1: whether the primitive is odd (if we are emitting triangle strips) - */ - tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, ""); - tmp = LLVMBuildOr(builder, tmp, - LLVMBuildShl(builder, - LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), - ctx->ac.i8_1, ""), ""); - LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream)); - - tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); - tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), ""); - LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]); - - ac_build_endif(&ctx->ac, 9001); + const struct si_shader_selector *sel = ctx->shader->selector; + const struct si_shader_info *info = &sel->info; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp; + const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); + + /* If this thread has already emitted the declared maximum number of + * vertices, skip the write: excessive vertex emissions are not + * supposed to have any effect. 
+ */ + const LLVMValueRef can_emit = + LLVMBuildICmp(builder, LLVMIntULT, vertexidx, + LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), ""); + + tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); + tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, ""); + LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); + + ac_build_ifcc(&ctx->ac, can_emit, 9001); + + const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx); + unsigned out_idx = 0; + for (unsigned i = 0; i < info->num_outputs; i++) { + for (unsigned chan = 0; chan < 4; chan++, out_idx++) { + if (!(info->output_usagemask[i] & (1 << chan)) || + ((info->output_streams[i] >> (2 * chan)) & 3) != stream) + continue; + + LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], ""); + out_val = ac_to_integer(&ctx->ac, out_val); + LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx)); + } + } + assert(out_idx * 4 == sel->gsvs_vertex_size); + + /* Determine and store whether this vertex completed a primitive. */ + const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], ""); + + tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false); + const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, ""); + + /* Since the geometry shader emits triangle strips, we need to + * track which primitive is odd and swap vertex indices to get + * the correct vertex order. + */ + LLVMValueRef is_odd = ctx->ac.i1false; + if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) { + tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, ""); + is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, ""); + } + + tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, ""); + LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]); + + /* The per-vertex primitive flag encoding: + * bit 0: whether this vertex finishes a primitive + * bit 1: whether the primitive is odd (if we are emitting triangle strips) + */ + tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, ""); + tmp = LLVMBuildOr( + builder, tmp, + LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), ""); + LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream)); + + tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); + tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), ""); + LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]); + + ac_build_endif(&ctx->ac, 9001); } void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx) { - /* Zero out the part of LDS scratch that is used to accumulate the - * per-stream generated primitive count. - */ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef scratchptr = ctx->gs_ngg_scratch; - LLVMValueRef tid = get_thread_id_in_tg(ctx); - LLVMValueRef tmp; - - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), ""); - ac_build_ifcc(&ctx->ac, tmp, 5090); - { - LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid); - LLVMBuildStore(builder, ctx->ac.i32_0, ptr); - } - ac_build_endif(&ctx->ac, 5090); - - ac_build_s_barrier(&ctx->ac); + /* Zero out the part of LDS scratch that is used to accumulate the + * per-stream generated primitive count. 
+ */ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef scratchptr = ctx->gs_ngg_scratch; + LLVMValueRef tid = get_thread_id_in_tg(ctx); + LLVMValueRef tmp; + + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5090); + { + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid); + LLVMBuildStore(builder, ctx->ac.i32_0, ptr); + } + ac_build_endif(&ctx->ac, 5090); + + ac_build_s_barrier(&ctx->ac); } void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) { - const struct si_shader_selector *sel = ctx->shader->selector; - const struct si_shader_info *info = &sel->info; - const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim); - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false); - LLVMValueRef tmp, tmp2; - - /* Zero out remaining (non-emitted) primitive flags. - * - * Note: Alternatively, we could pass the relevant gs_next_vertex to - * the emit threads via LDS. This is likely worse in the expected - * typical case where each GS thread emits the full set of - * vertices. - */ - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) - continue; - - const LLVMValueRef gsthread = get_thread_id_in_tg(ctx); - - ac_build_bgnloop(&ctx->ac, 5100); - - const LLVMValueRef vertexidx = - LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); - tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx, - LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), ""); - ac_build_ifcc(&ctx->ac, tmp, 5101); - ac_build_break(&ctx->ac); - ac_build_endif(&ctx->ac, 5101); - - tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); - LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); - - tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx); - LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream)); - - ac_build_endloop(&ctx->ac, 5100); - } - - /* Accumulate generated primitives counts across the entire threadgroup. 
*/ - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) - continue; - - LLVMValueRef numprims = - LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); - numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size); - - tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, tmp, 5105); - { - LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, - ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, - LLVMConstInt(ctx->ac.i32, stream, false)), - numprims, LLVMAtomicOrderingMonotonic, false); - } - ac_build_endif(&ctx->ac, 5105); - } - - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); - - ac_build_s_barrier(&ctx->ac); - - const LLVMValueRef tid = get_thread_id_in_tg(ctx); - LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx); - - /* Streamout */ - if (sel->so.num_outputs) { - struct ngg_streamout nggso = {}; - - nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false); - - LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid); - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) - continue; - - tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), ""); - tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); - nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, ""); - } - - for (unsigned i = 0; i < verts_per_prim; ++i) { - tmp = LLVMBuildSub(builder, tid, - LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); - tmp = ngg_gs_vertex_ptr(ctx, tmp); - nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); - } - - build_streamout(ctx, &nggso); - } - - /* Write shader query data. */ - if (ctx->screen->use_ngg_streamout) { - tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1); - tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */ - unsigned num_query_comps = sel->so.num_outputs ? 8 : 4; - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, - LLVMConstInt(ctx->ac.i32, num_query_comps, false), ""); - ac_build_ifcc(&ctx->ac, tmp, 5110); - { - LLVMValueRef offset; - tmp = tid; - if (sel->so.num_outputs) - tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), ""); - offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), ""); - if (sel->so.num_outputs) { - tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), ""); - tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), ""); - offset = LLVMBuildAdd(builder, offset, tmp, ""); - } - - tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), ""); - LLVMValueRef args[] = { - tmp, - ngg_get_query_buf(ctx), - offset, - LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */ - ctx->ac.i32_0, /* cachepolicy */ - }; - ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", - ctx->ac.i32, args, 5, 0); - } - ac_build_endif(&ctx->ac, 5110); - ac_build_endif(&ctx->ac, 5109); - } - - /* Determine vertex liveness. 
*/ - LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive"); - - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); - ac_build_ifcc(&ctx->ac, tmp, 5120); - { - for (unsigned i = 0; i < verts_per_prim; ++i) { - const LLVMValueRef primidx = - LLVMBuildAdd(builder, tid, - LLVMConstInt(ctx->ac.i32, i, false), ""); - - if (i > 0) { - tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, ""); - ac_build_ifcc(&ctx->ac, tmp, 5121 + i); - } - - /* Load primitive liveness */ - tmp = ngg_gs_vertex_ptr(ctx, primidx); - tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); - const LLVMValueRef primlive = - LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - - tmp = LLVMBuildLoad(builder, vertliveptr, ""); - tmp = LLVMBuildOr(builder, tmp, primlive, ""), - LLVMBuildStore(builder, tmp, vertliveptr); - - if (i > 0) - ac_build_endif(&ctx->ac, 5121 + i); - } - } - ac_build_endif(&ctx->ac, 5120); - - /* Inclusive scan addition across the current wave. */ - LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, ""); - struct ac_wg_scan vertlive_scan = {}; - vertlive_scan.op = nir_op_iadd; - vertlive_scan.enable_reduce = true; - vertlive_scan.enable_exclusive = true; - vertlive_scan.src = vertlive; - vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0); - vertlive_scan.waveidx = get_wave_id_in_tg(ctx); - vertlive_scan.numwaves = get_tgsize(ctx); - vertlive_scan.maxwaves = 8; - - ac_build_wg_scan(&ctx->ac, &vertlive_scan); - - /* Skip all exports (including index exports) when possible. At least on - * early gfx10 revisions this is also to avoid hangs. - */ - LLVMValueRef have_exports = - LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, ""); - num_emit_threads = - LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, ""); - - /* Allocate export space. Send this message as early as possible, to - * hide the latency of the SQ <-> SPI roundtrip. - * - * Note: We could consider compacting primitives for export as well. - * PA processes 1 non-null prim / clock, but it fetches 4 DW of - * prim data per clock and skips null primitives at no additional - * cost. So compacting primitives can only be beneficial when - * there are 4 or more contiguous null primitives in the export - * (in the common case of single-dword prim exports). - */ - ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), - vertlive_scan.result_reduce, num_emit_threads); - - /* Setup the reverse vertex compaction permutation. We re-use stream 1 - * of the primitive liveness flags, relying on the fact that each - * threadgroup can have at most 256 threads. 
*/ - ac_build_ifcc(&ctx->ac, vertlive, 5130); - { - tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive); - tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, ""); - LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1)); - } - ac_build_endif(&ctx->ac, 5130); - - ac_build_s_barrier(&ctx->ac); - - /* Export primitive data */ - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); - ac_build_ifcc(&ctx->ac, tmp, 5140); - { - LLVMValueRef flags; - struct ac_ngg_prim prim = {}; - prim.num_vertices = verts_per_prim; - - tmp = ngg_gs_vertex_ptr(ctx, tid); - flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); - prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); - - for (unsigned i = 0; i < verts_per_prim; ++i) { - prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, - LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); - prim.edgeflag[i] = ctx->ac.i1false; - } - - /* Geometry shaders output triangle strips, but NGG expects triangles. */ - if (verts_per_prim == 3) { - LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, ""); - is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, ""); - LLVMValueRef flatshade_first = - LLVMBuildICmp(builder, LLVMIntEQ, - si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), - ctx->ac.i32_0, ""); - - ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, - flatshade_first, - prim.index); - } - - ac_build_export_prim(&ctx->ac, &prim); - } - ac_build_endif(&ctx->ac, 5140); - - /* Export position and parameter data */ - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, ""); - ac_build_ifcc(&ctx->ac, tmp, 5145); - { - struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS]; - - tmp = ngg_gs_vertex_ptr(ctx, tid); - tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), ""); - tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); - const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp); - - unsigned out_idx = 0; - for (unsigned i = 0; i < info->num_outputs; i++) { - outputs[i].semantic_name = info->output_semantic_name[i]; - outputs[i].semantic_index = info->output_semantic_index[i]; - - for (unsigned j = 0; j < 4; j++, out_idx++) { - tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx); - tmp = LLVMBuildLoad(builder, tmp, ""); - outputs[i].values[j] = ac_to_float(&ctx->ac, tmp); - outputs[i].vertex_stream[j] = - (info->output_streams[i] >> (2 * j)) & 3; - } - } - - si_llvm_build_vs_exports(ctx, outputs, info->num_outputs); - } - ac_build_endif(&ctx->ac, 5145); + const struct si_shader_selector *sel = ctx->shader->selector; + const struct si_shader_info *info = &sel->info; + const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false); + LLVMValueRef tmp, tmp2; + + /* Zero out remaining (non-emitted) primitive flags. + * + * Note: Alternatively, we could pass the relevant gs_next_vertex to + * the emit threads via LDS. This is likely worse in the expected + * typical case where each GS thread emits the full set of + * vertices. 
+ */ + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) + continue; + + const LLVMValueRef gsthread = get_thread_id_in_tg(ctx); + + ac_build_bgnloop(&ctx->ac, 5100); + + const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); + tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx, + LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5101); + ac_build_break(&ctx->ac); + ac_build_endif(&ctx->ac, 5101); + + tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); + LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); + + tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx); + LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream)); + + ac_build_endloop(&ctx->ac, 5100); + } + + /* Accumulate generated primitives counts across the entire threadgroup. */ + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) + continue; + + LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); + numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size); + + tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5105); + { + LLVMBuildAtomicRMW( + builder, LLVMAtomicRMWBinOpAdd, + ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)), + numprims, LLVMAtomicOrderingMonotonic, false); + } + ac_build_endif(&ctx->ac, 5105); + } + + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + + ac_build_s_barrier(&ctx->ac); + + const LLVMValueRef tid = get_thread_id_in_tg(ctx); + LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx); + + /* Streamout */ + if (sel->so.num_outputs) { + struct ngg_streamout nggso = {}; + + nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false); + + LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid); + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) + continue; + + tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); + nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, ""); + } + + for (unsigned i = 0; i < verts_per_prim; ++i) { + tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), + ""); + tmp = ngg_gs_vertex_ptr(ctx, tmp); + nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); + } + + build_streamout(ctx, &nggso); + } + + /* Write shader query data. */ + if (ctx->screen->use_ngg_streamout) { + tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */ + unsigned num_query_comps = sel->so.num_outputs ? 
8 : 4; + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, + LLVMConstInt(ctx->ac.i32, num_query_comps, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5110); + { + LLVMValueRef offset; + tmp = tid; + if (sel->so.num_outputs) + tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), ""); + offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), ""); + if (sel->so.num_outputs) { + tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), ""); + tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), ""); + offset = LLVMBuildAdd(builder, offset, tmp, ""); + } + + tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), ""); + LLVMValueRef args[] = { + tmp, ngg_get_query_buf(ctx), + offset, LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */ + ctx->ac.i32_0, /* cachepolicy */ + }; + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5, + 0); + } + ac_build_endif(&ctx->ac, 5110); + ac_build_endif(&ctx->ac, 5109); + } + + /* Determine vertex liveness. */ + LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive"); + + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); + ac_build_ifcc(&ctx->ac, tmp, 5120); + { + for (unsigned i = 0; i < verts_per_prim; ++i) { + const LLVMValueRef primidx = + LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), ""); + + if (i > 0) { + tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, ""); + ac_build_ifcc(&ctx->ac, tmp, 5121 + i); + } + + /* Load primitive liveness */ + tmp = ngg_gs_vertex_ptr(ctx, primidx); + tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); + const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + + tmp = LLVMBuildLoad(builder, vertliveptr, ""); + tmp = LLVMBuildOr(builder, tmp, primlive, ""), LLVMBuildStore(builder, tmp, vertliveptr); + + if (i > 0) + ac_build_endif(&ctx->ac, 5121 + i); + } + } + ac_build_endif(&ctx->ac, 5120); + + /* Inclusive scan addition across the current wave. */ + LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, ""); + struct ac_wg_scan vertlive_scan = {}; + vertlive_scan.op = nir_op_iadd; + vertlive_scan.enable_reduce = true; + vertlive_scan.enable_exclusive = true; + vertlive_scan.src = vertlive; + vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0); + vertlive_scan.waveidx = get_wave_id_in_tg(ctx); + vertlive_scan.numwaves = get_tgsize(ctx); + vertlive_scan.maxwaves = 8; + + ac_build_wg_scan(&ctx->ac, &vertlive_scan); + + /* Skip all exports (including index exports) when possible. At least on + * early gfx10 revisions this is also to avoid hangs. + */ + LLVMValueRef have_exports = + LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, ""); + num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, ""); + + /* Allocate export space. Send this message as early as possible, to + * hide the latency of the SQ <-> SPI roundtrip. + * + * Note: We could consider compacting primitives for export as well. + * PA processes 1 non-null prim / clock, but it fetches 4 DW of + * prim data per clock and skips null primitives at no additional + * cost. So compacting primitives can only be beneficial when + * there are 4 or more contiguous null primitives in the export + * (in the common case of single-dword prim exports). 
+ */ + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce, + num_emit_threads); + + /* Setup the reverse vertex compaction permutation. We re-use stream 1 + * of the primitive liveness flags, relying on the fact that each + * threadgroup can have at most 256 threads. */ + ac_build_ifcc(&ctx->ac, vertlive, 5130); + { + tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive); + tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, ""); + LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1)); + } + ac_build_endif(&ctx->ac, 5130); + + ac_build_s_barrier(&ctx->ac); + + /* Export primitive data */ + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); + ac_build_ifcc(&ctx->ac, tmp, 5140); + { + LLVMValueRef flags; + struct ac_ngg_prim prim = {}; + prim.num_vertices = verts_per_prim; + + tmp = ngg_gs_vertex_ptr(ctx, tid); + flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); + prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); + + for (unsigned i = 0; i < verts_per_prim; ++i) { + prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, + LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); + prim.edgeflag[i] = ctx->ac.i1false; + } + + /* Geometry shaders output triangle strips, but NGG expects triangles. */ + if (verts_per_prim == 3) { + LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, ""); + is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, ""); + LLVMValueRef flatshade_first = LLVMBuildICmp( + builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, ""); + + ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index); + } + + ac_build_export_prim(&ctx->ac, &prim); + } + ac_build_endif(&ctx->ac, 5140); + + /* Export position and parameter data */ + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, ""); + ac_build_ifcc(&ctx->ac, tmp, 5145); + { + struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS]; + + tmp = ngg_gs_vertex_ptr(ctx, tid); + tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), ""); + tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); + const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp); + + unsigned out_idx = 0; + for (unsigned i = 0; i < info->num_outputs; i++) { + outputs[i].semantic_name = info->output_semantic_name[i]; + outputs[i].semantic_index = info->output_semantic_index[i]; + + for (unsigned j = 0; j < 4; j++, out_idx++) { + tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx); + tmp = LLVMBuildLoad(builder, tmp, ""); + outputs[i].values[j] = ac_to_float(&ctx->ac, tmp); + outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3; + } + } + + si_llvm_build_vs_exports(ctx, outputs, info->num_outputs); + } + ac_build_endif(&ctx->ac, 5145); } static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, - unsigned min_verts_per_prim, bool use_adjacency) + unsigned min_verts_per_prim, bool use_adjacency) { - unsigned max_reuse = max_esverts - min_verts_per_prim; - if (use_adjacency) - max_reuse /= 2; - *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse); + unsigned max_reuse = max_esverts - min_verts_per_prim; + if (use_adjacency) + max_reuse /= 2; + *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse); } /** @@ -1992,172 +1886,165 @@ static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts */ void 
gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) { - const struct si_shader_selector *gs_sel = shader->selector; - const struct si_shader_selector *es_sel = - shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel; - const enum pipe_shader_type gs_type = gs_sel->type; - const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1); - const unsigned input_prim = si_get_input_prim(gs_sel); - const bool use_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY && - input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; - const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim); - const unsigned min_verts_per_prim = - gs_type == PIPE_SHADER_GEOMETRY ? max_verts_per_prim : 1; - - /* All these are in dwords: */ - /* We can't allow using the whole LDS, because GS waves compete with - * other shader stages for LDS space. - * - * TODO: We should really take the shader's internal LDS use into - * account. The linker will fail if the size is greater than - * 8K dwords. - */ - const unsigned max_lds_size = 8 * 1024 - 768; - const unsigned target_lds_size = max_lds_size; - unsigned esvert_lds_size = 0; - unsigned gsprim_lds_size = 0; - - /* All these are per subgroup: */ - bool max_vert_out_per_gs_instance = false; - unsigned max_gsprims_base = 128; /* default prim group size clamp */ - unsigned max_esverts_base = 128; - - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { - max_gsprims_base = 128 / 3; - max_esverts_base = max_gsprims_base * 3; - } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { - max_gsprims_base = 126; - max_esverts_base = 128; - } - - /* Hardware has the following non-natural restrictions on the value - * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of - * the draw: - * - at most 252 for any line input primitive type - * - at most 251 for any quad input primitive type - * - at most 251 for triangle strips with adjacency (this happens to - * be the natural limit for triangle *lists* with adjacency) - */ - max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1); - - if (gs_type == PIPE_SHADER_GEOMETRY) { - unsigned max_out_verts_per_gsprim = - gs_sel->gs_max_out_vertices * gs_num_invocations; - - if (max_out_verts_per_gsprim <= 256) { - if (max_out_verts_per_gsprim) { - max_gsprims_base = MIN2(max_gsprims_base, - 256 / max_out_verts_per_gsprim); - } - } else { - /* Use special multi-cycling mode in which each GS - * instance gets its own subgroup. Does not work with - * tessellation. */ - max_vert_out_per_gs_instance = true; - max_gsprims_base = 1; - max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices; - } - - esvert_lds_size = es_sel->esgs_itemsize / 4; - gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim; - } else { - /* VS and TES. */ - /* LDS size for passing data from ES to GS. 
*/ - esvert_lds_size = ngg_nogs_vertex_size(shader); - } - - unsigned max_gsprims = max_gsprims_base; - unsigned max_esverts = max_esverts_base; - - if (esvert_lds_size) - max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size); - if (gsprim_lds_size) - max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size); - - max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); - assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); - - if (esvert_lds_size || gsprim_lds_size) { - /* Now that we have a rough proportionality between esverts - * and gsprims based on the primitive type, scale both of them - * down simultaneously based on required LDS space. - * - * We could be smarter about this if we knew how much vertex - * reuse to expect. - */ - unsigned lds_total = max_esverts * esvert_lds_size + - max_gsprims * gsprim_lds_size; - if (lds_total > target_lds_size) { - max_esverts = max_esverts * target_lds_size / lds_total; - max_gsprims = max_gsprims * target_lds_size / lds_total; - - max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - clamp_gsprims_to_esverts(&max_gsprims, max_esverts, - min_verts_per_prim, use_adjacency); - assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); - } - } - - /* Round up towards full wave sizes for better ALU utilization. */ - if (!max_vert_out_per_gs_instance) { - const unsigned wavesize = gs_sel->screen->ge_wave_size; - unsigned orig_max_esverts; - unsigned orig_max_gsprims; - do { - orig_max_esverts = max_esverts; - orig_max_gsprims = max_gsprims; - - max_esverts = align(max_esverts, wavesize); - max_esverts = MIN2(max_esverts, max_esverts_base); - if (esvert_lds_size) - max_esverts = MIN2(max_esverts, - (max_lds_size - max_gsprims * gsprim_lds_size) / - esvert_lds_size); - max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - - max_gsprims = align(max_gsprims, wavesize); - max_gsprims = MIN2(max_gsprims, max_gsprims_base); - if (gsprim_lds_size) - max_gsprims = MIN2(max_gsprims, - (max_lds_size - max_esverts * esvert_lds_size) / - gsprim_lds_size); - clamp_gsprims_to_esverts(&max_gsprims, max_esverts, - min_verts_per_prim, use_adjacency); - assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); - } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims); - } - - /* Hardware restriction: minimum value of max_esverts */ - max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim); - - unsigned max_out_vertices = - max_vert_out_per_gs_instance ? gs_sel->gs_max_out_vertices : - gs_type == PIPE_SHADER_GEOMETRY ? - max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices : - max_esverts; - assert(max_out_vertices <= 256); - - unsigned prim_amp_factor = 1; - if (gs_type == PIPE_SHADER_GEOMETRY) { - /* Number of output primitives per GS input primitive after - * GS instancing. */ - prim_amp_factor = gs_sel->gs_max_out_vertices; - } - - /* The GE only checks against the maximum number of ES verts after - * allocating a full GS primitive. So we need to ensure that whenever - * this check passes, there is enough space for a full primitive without - * vertex reuse. 
- */ - shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1; - shader->ngg.max_gsprims = max_gsprims; - shader->ngg.max_out_verts = max_out_vertices; - shader->ngg.prim_amp_factor = prim_amp_factor; - shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance; - - shader->gs_info.esgs_ring_size = 4 * max_esverts * esvert_lds_size; - shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size; - - assert(shader->ngg.hw_max_esverts >= 24); /* HW limitation */ + const struct si_shader_selector *gs_sel = shader->selector; + const struct si_shader_selector *es_sel = + shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel; + const enum pipe_shader_type gs_type = gs_sel->type; + const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1); + const unsigned input_prim = si_get_input_prim(gs_sel); + const bool use_adjacency = + input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; + const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim); + const unsigned min_verts_per_prim = gs_type == PIPE_SHADER_GEOMETRY ? max_verts_per_prim : 1; + + /* All these are in dwords: */ + /* We can't allow using the whole LDS, because GS waves compete with + * other shader stages for LDS space. + * + * TODO: We should really take the shader's internal LDS use into + * account. The linker will fail if the size is greater than + * 8K dwords. + */ + const unsigned max_lds_size = 8 * 1024 - 768; + const unsigned target_lds_size = max_lds_size; + unsigned esvert_lds_size = 0; + unsigned gsprim_lds_size = 0; + + /* All these are per subgroup: */ + bool max_vert_out_per_gs_instance = false; + unsigned max_gsprims_base = 128; /* default prim group size clamp */ + unsigned max_esverts_base = 128; + + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { + max_gsprims_base = 128 / 3; + max_esverts_base = max_gsprims_base * 3; + } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { + max_gsprims_base = 126; + max_esverts_base = 128; + } + + /* Hardware has the following non-natural restrictions on the value + * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of + * the draw: + * - at most 252 for any line input primitive type + * - at most 251 for any quad input primitive type + * - at most 251 for triangle strips with adjacency (this happens to + * be the natural limit for triangle *lists* with adjacency) + */ + max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1); + + if (gs_type == PIPE_SHADER_GEOMETRY) { + unsigned max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices * gs_num_invocations; + + if (max_out_verts_per_gsprim <= 256) { + if (max_out_verts_per_gsprim) { + max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim); + } + } else { + /* Use special multi-cycling mode in which each GS + * instance gets its own subgroup. Does not work with + * tessellation. */ + max_vert_out_per_gs_instance = true; + max_gsprims_base = 1; + max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices; + } + + esvert_lds_size = es_sel->esgs_itemsize / 4; + gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim; + } else { + /* VS and TES. */ + /* LDS size for passing data from ES to GS. 
*/ + esvert_lds_size = ngg_nogs_vertex_size(shader); + } + + unsigned max_gsprims = max_gsprims_base; + unsigned max_esverts = max_esverts_base; + + if (esvert_lds_size) + max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size); + if (gsprim_lds_size) + max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size); + + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + + if (esvert_lds_size || gsprim_lds_size) { + /* Now that we have a rough proportionality between esverts + * and gsprims based on the primitive type, scale both of them + * down simultaneously based on required LDS space. + * + * We could be smarter about this if we knew how much vertex + * reuse to expect. + */ + unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size; + if (lds_total > target_lds_size) { + max_esverts = max_esverts * target_lds_size / lds_total; + max_gsprims = max_gsprims * target_lds_size / lds_total; + + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + } + } + + /* Round up towards full wave sizes for better ALU utilization. */ + if (!max_vert_out_per_gs_instance) { + const unsigned wavesize = gs_sel->screen->ge_wave_size; + unsigned orig_max_esverts; + unsigned orig_max_gsprims; + do { + orig_max_esverts = max_esverts; + orig_max_gsprims = max_gsprims; + + max_esverts = align(max_esverts, wavesize); + max_esverts = MIN2(max_esverts, max_esverts_base); + if (esvert_lds_size) + max_esverts = + MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size); + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + + max_gsprims = align(max_gsprims, wavesize); + max_gsprims = MIN2(max_gsprims, max_gsprims_base); + if (gsprim_lds_size) + max_gsprims = + MIN2(max_gsprims, (max_lds_size - max_esverts * esvert_lds_size) / gsprim_lds_size); + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims); + } + + /* Hardware restriction: minimum value of max_esverts */ + max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim); + + unsigned max_out_vertices = + max_vert_out_per_gs_instance + ? gs_sel->gs_max_out_vertices + : gs_type == PIPE_SHADER_GEOMETRY + ? max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices + : max_esverts; + assert(max_out_vertices <= 256); + + unsigned prim_amp_factor = 1; + if (gs_type == PIPE_SHADER_GEOMETRY) { + /* Number of output primitives per GS input primitive after + * GS instancing. */ + prim_amp_factor = gs_sel->gs_max_out_vertices; + } + + /* The GE only checks against the maximum number of ES verts after + * allocating a full GS primitive. So we need to ensure that whenever + * this check passes, there is enough space for a full primitive without + * vertex reuse. 
+ */ + shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1; + shader->ngg.max_gsprims = max_gsprims; + shader->ngg.max_out_verts = max_out_vertices; + shader->ngg.prim_amp_factor = prim_amp_factor; + shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance; + + shader->gs_info.esgs_ring_size = 4 * max_esverts * esvert_lds_size; + shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size; + + assert(shader->ngg.hw_max_esverts >= 24); /* HW limitation */ } diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index e662de16127..ab69c7e4ddd 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -23,1346 +23,1220 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_pipe.h" #include "si_compute.h" +#include "si_pipe.h" #include "util/format/u_format.h" #include "util/u_log.h" #include "util/u_surface.h" -enum { - SI_COPY = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | - SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND, +enum +{ + SI_COPY = + SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND, - SI_BLIT = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | - SI_SAVE_FRAGMENT_STATE, + SI_BLIT = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | SI_SAVE_FRAGMENT_STATE, - SI_DECOMPRESS = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE | - SI_DISABLE_RENDER_COND, + SI_DECOMPRESS = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND, - SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE + SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE }; void si_blitter_begin(struct si_context *sctx, enum si_blitter_op op) { - util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso); - util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso); - util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso); - util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso); - util_blitter_save_so_targets(sctx->blitter, sctx->streamout.num_targets, - (struct pipe_stream_output_target**)sctx->streamout.targets); - util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer); - - if (op & SI_SAVE_FRAGMENT_STATE) { - util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend); - util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa); - util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state); - util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso); - util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask); - util_blitter_save_scissor(sctx->blitter, &sctx->scissors[0]); - util_blitter_save_window_rectangles(sctx->blitter, - sctx->window_rectangles_include, - sctx->num_window_rectangles, - sctx->window_rectangles); - } - - if (op & SI_SAVE_FRAMEBUFFER) - util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state); - - if (op & SI_SAVE_TEXTURES) { - util_blitter_save_fragment_sampler_states( - sctx->blitter, 2, - (void**)sctx->samplers[PIPE_SHADER_FRAGMENT].sampler_states); - - util_blitter_save_fragment_sampler_views(sctx->blitter, 2, - sctx->samplers[PIPE_SHADER_FRAGMENT].views); - } - - if (op & SI_DISABLE_RENDER_COND) - sctx->render_cond_force_off = true; - - if (sctx->screen->dpbb_allowed) { - sctx->dpbb_force_off = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - } + util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso); + util_blitter_save_tessctrl_shader(sctx->blitter, 
sctx->tcs_shader.cso); + util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso); + util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso); + util_blitter_save_so_targets(sctx->blitter, sctx->streamout.num_targets, + (struct pipe_stream_output_target **)sctx->streamout.targets); + util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer); + + if (op & SI_SAVE_FRAGMENT_STATE) { + util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend); + util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa); + util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state); + util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso); + util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask); + util_blitter_save_scissor(sctx->blitter, &sctx->scissors[0]); + util_blitter_save_window_rectangles(sctx->blitter, sctx->window_rectangles_include, + sctx->num_window_rectangles, sctx->window_rectangles); + } + + if (op & SI_SAVE_FRAMEBUFFER) + util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state); + + if (op & SI_SAVE_TEXTURES) { + util_blitter_save_fragment_sampler_states( + sctx->blitter, 2, (void **)sctx->samplers[PIPE_SHADER_FRAGMENT].sampler_states); + + util_blitter_save_fragment_sampler_views(sctx->blitter, 2, + sctx->samplers[PIPE_SHADER_FRAGMENT].views); + } + + if (op & SI_DISABLE_RENDER_COND) + sctx->render_cond_force_off = true; + + if (sctx->screen->dpbb_allowed) { + sctx->dpbb_force_off = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + } } void si_blitter_end(struct si_context *sctx) { - if (sctx->screen->dpbb_allowed) { - sctx->dpbb_force_off = false; - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - } - - sctx->render_cond_force_off = false; - - /* Restore shader pointers because the VS blit shader changed all - * non-global VS user SGPRs. */ - sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX); - sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; - sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0; - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + if (sctx->screen->dpbb_allowed) { + sctx->dpbb_force_off = false; + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + } + + sctx->render_cond_force_off = false; + + /* Restore shader pointers because the VS blit shader changed all + * non-global VS user SGPRs. */ + sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX); + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0; + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); } static unsigned u_max_sample(struct pipe_resource *r) { - return r->nr_samples ? r->nr_samples - 1 : 0; + return r->nr_samples ? 
r->nr_samples - 1 : 0; } -static unsigned -si_blit_dbcb_copy(struct si_context *sctx, - struct si_texture *src, - struct si_texture *dst, - unsigned planes, unsigned level_mask, - unsigned first_layer, unsigned last_layer, - unsigned first_sample, unsigned last_sample) +static unsigned si_blit_dbcb_copy(struct si_context *sctx, struct si_texture *src, + struct si_texture *dst, unsigned planes, unsigned level_mask, + unsigned first_layer, unsigned last_layer, unsigned first_sample, + unsigned last_sample) { - struct pipe_surface surf_tmpl = {{0}}; - unsigned layer, sample, checked_last_layer, max_layer; - unsigned fully_copied_levels = 0; + struct pipe_surface surf_tmpl = {{0}}; + unsigned layer, sample, checked_last_layer, max_layer; + unsigned fully_copied_levels = 0; - if (planes & PIPE_MASK_Z) - sctx->dbcb_depth_copy_enabled = true; - if (planes & PIPE_MASK_S) - sctx->dbcb_stencil_copy_enabled = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (planes & PIPE_MASK_Z) + sctx->dbcb_depth_copy_enabled = true; + if (planes & PIPE_MASK_S) + sctx->dbcb_stencil_copy_enabled = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - assert(sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled); + assert(sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled); - sctx->decompression_enabled = true; + sctx->decompression_enabled = true; - while (level_mask) { - unsigned level = u_bit_scan(&level_mask); + while (level_mask) { + unsigned level = u_bit_scan(&level_mask); - /* The smaller the mipmap level, the less layers there are - * as far as 3D textures are concerned. */ - max_layer = util_max_layer(&src->buffer.b.b, level); - checked_last_layer = MIN2(last_layer, max_layer); + /* The smaller the mipmap level, the less layers there are + * as far as 3D textures are concerned. 
*/ + max_layer = util_max_layer(&src->buffer.b.b, level); + checked_last_layer = MIN2(last_layer, max_layer); - surf_tmpl.u.tex.level = level; + surf_tmpl.u.tex.level = level; - for (layer = first_layer; layer <= checked_last_layer; layer++) { - struct pipe_surface *zsurf, *cbsurf; + for (layer = first_layer; layer <= checked_last_layer; layer++) { + struct pipe_surface *zsurf, *cbsurf; - surf_tmpl.format = src->buffer.b.b.format; - surf_tmpl.u.tex.first_layer = layer; - surf_tmpl.u.tex.last_layer = layer; + surf_tmpl.format = src->buffer.b.b.format; + surf_tmpl.u.tex.first_layer = layer; + surf_tmpl.u.tex.last_layer = layer; - zsurf = sctx->b.create_surface(&sctx->b, &src->buffer.b.b, &surf_tmpl); + zsurf = sctx->b.create_surface(&sctx->b, &src->buffer.b.b, &surf_tmpl); - surf_tmpl.format = dst->buffer.b.b.format; - cbsurf = sctx->b.create_surface(&sctx->b, &dst->buffer.b.b, &surf_tmpl); + surf_tmpl.format = dst->buffer.b.b.format; + cbsurf = sctx->b.create_surface(&sctx->b, &dst->buffer.b.b, &surf_tmpl); - for (sample = first_sample; sample <= last_sample; sample++) { - if (sample != sctx->dbcb_copy_sample) { - sctx->dbcb_copy_sample = sample; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } + for (sample = first_sample; sample <= last_sample; sample++) { + if (sample != sctx->dbcb_copy_sample) { + sctx->dbcb_copy_sample = sample; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } - si_blitter_begin(sctx, SI_DECOMPRESS); - util_blitter_custom_depth_stencil(sctx->blitter, zsurf, cbsurf, 1 << sample, - sctx->custom_dsa_flush, 1.0f); - si_blitter_end(sctx); - } + si_blitter_begin(sctx, SI_DECOMPRESS); + util_blitter_custom_depth_stencil(sctx->blitter, zsurf, cbsurf, 1 << sample, + sctx->custom_dsa_flush, 1.0f); + si_blitter_end(sctx); + } - pipe_surface_reference(&zsurf, NULL); - pipe_surface_reference(&cbsurf, NULL); - } + pipe_surface_reference(&zsurf, NULL); + pipe_surface_reference(&cbsurf, NULL); + } - if (first_layer == 0 && last_layer >= max_layer && - first_sample == 0 && last_sample >= u_max_sample(&src->buffer.b.b)) - fully_copied_levels |= 1u << level; - } + if (first_layer == 0 && last_layer >= max_layer && first_sample == 0 && + last_sample >= u_max_sample(&src->buffer.b.b)) + fully_copied_levels |= 1u << level; + } - sctx->decompression_enabled = false; - sctx->dbcb_depth_copy_enabled = false; - sctx->dbcb_stencil_copy_enabled = false; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + sctx->decompression_enabled = false; + sctx->dbcb_depth_copy_enabled = false; + sctx->dbcb_stencil_copy_enabled = false; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - return fully_copied_levels; + return fully_copied_levels; } /* Helper function for si_blit_decompress_zs_in_place. 
*/ -static void -si_blit_decompress_zs_planes_in_place(struct si_context *sctx, - struct si_texture *texture, - unsigned planes, unsigned level_mask, - unsigned first_layer, unsigned last_layer) +static void si_blit_decompress_zs_planes_in_place(struct si_context *sctx, + struct si_texture *texture, unsigned planes, + unsigned level_mask, unsigned first_layer, + unsigned last_layer) { - struct pipe_surface *zsurf, surf_tmpl = {{0}}; - unsigned layer, max_layer, checked_last_layer; - unsigned fully_decompressed_mask = 0; + struct pipe_surface *zsurf, surf_tmpl = {{0}}; + unsigned layer, max_layer, checked_last_layer; + unsigned fully_decompressed_mask = 0; - if (!level_mask) - return; + if (!level_mask) + return; - if (planes & PIPE_MASK_S) - sctx->db_flush_stencil_inplace = true; - if (planes & PIPE_MASK_Z) - sctx->db_flush_depth_inplace = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (planes & PIPE_MASK_S) + sctx->db_flush_stencil_inplace = true; + if (planes & PIPE_MASK_Z) + sctx->db_flush_depth_inplace = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - surf_tmpl.format = texture->buffer.b.b.format; + surf_tmpl.format = texture->buffer.b.b.format; - sctx->decompression_enabled = true; + sctx->decompression_enabled = true; - while (level_mask) { - unsigned level = u_bit_scan(&level_mask); + while (level_mask) { + unsigned level = u_bit_scan(&level_mask); - surf_tmpl.u.tex.level = level; + surf_tmpl.u.tex.level = level; - /* The smaller the mipmap level, the less layers there are - * as far as 3D textures are concerned. */ - max_layer = util_max_layer(&texture->buffer.b.b, level); - checked_last_layer = MIN2(last_layer, max_layer); + /* The smaller the mipmap level, the less layers there are + * as far as 3D textures are concerned. */ + max_layer = util_max_layer(&texture->buffer.b.b, level); + checked_last_layer = MIN2(last_layer, max_layer); - for (layer = first_layer; layer <= checked_last_layer; layer++) { - surf_tmpl.u.tex.first_layer = layer; - surf_tmpl.u.tex.last_layer = layer; + for (layer = first_layer; layer <= checked_last_layer; layer++) { + surf_tmpl.u.tex.first_layer = layer; + surf_tmpl.u.tex.last_layer = layer; - zsurf = sctx->b.create_surface(&sctx->b, &texture->buffer.b.b, &surf_tmpl); + zsurf = sctx->b.create_surface(&sctx->b, &texture->buffer.b.b, &surf_tmpl); - si_blitter_begin(sctx, SI_DECOMPRESS); - util_blitter_custom_depth_stencil(sctx->blitter, zsurf, NULL, ~0, - sctx->custom_dsa_flush, - 1.0f); - si_blitter_end(sctx); + si_blitter_begin(sctx, SI_DECOMPRESS); + util_blitter_custom_depth_stencil(sctx->blitter, zsurf, NULL, ~0, sctx->custom_dsa_flush, + 1.0f); + si_blitter_end(sctx); - pipe_surface_reference(&zsurf, NULL); - } + pipe_surface_reference(&zsurf, NULL); + } - /* The texture will always be dirty if some layers aren't flushed. - * I don't think this case occurs often though. */ - if (first_layer == 0 && last_layer >= max_layer) { - fully_decompressed_mask |= 1u << level; - } - } + /* The texture will always be dirty if some layers aren't flushed. + * I don't think this case occurs often though. 
*/ + if (first_layer == 0 && last_layer >= max_layer) { + fully_decompressed_mask |= 1u << level; + } + } - if (planes & PIPE_MASK_Z) - texture->dirty_level_mask &= ~fully_decompressed_mask; - if (planes & PIPE_MASK_S) - texture->stencil_dirty_level_mask &= ~fully_decompressed_mask; + if (planes & PIPE_MASK_Z) + texture->dirty_level_mask &= ~fully_decompressed_mask; + if (planes & PIPE_MASK_S) + texture->stencil_dirty_level_mask &= ~fully_decompressed_mask; - sctx->decompression_enabled = false; - sctx->db_flush_depth_inplace = false; - sctx->db_flush_stencil_inplace = false; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + sctx->decompression_enabled = false; + sctx->db_flush_depth_inplace = false; + sctx->db_flush_stencil_inplace = false; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); } /* Helper function of si_flush_depth_texture: decompress the given levels * of Z and/or S planes in place. */ -static void -si_blit_decompress_zs_in_place(struct si_context *sctx, - struct si_texture *texture, - unsigned levels_z, unsigned levels_s, - unsigned first_layer, unsigned last_layer) +static void si_blit_decompress_zs_in_place(struct si_context *sctx, struct si_texture *texture, + unsigned levels_z, unsigned levels_s, + unsigned first_layer, unsigned last_layer) { - unsigned both = levels_z & levels_s; - - /* First, do combined Z & S decompresses for levels that need it. */ - if (both) { - si_blit_decompress_zs_planes_in_place( - sctx, texture, PIPE_MASK_Z | PIPE_MASK_S, - both, - first_layer, last_layer); - levels_z &= ~both; - levels_s &= ~both; - } - - /* Now do separate Z and S decompresses. */ - if (levels_z) { - si_blit_decompress_zs_planes_in_place( - sctx, texture, PIPE_MASK_Z, - levels_z, - first_layer, last_layer); - } - - if (levels_s) { - si_blit_decompress_zs_planes_in_place( - sctx, texture, PIPE_MASK_S, - levels_s, - first_layer, last_layer); - } + unsigned both = levels_z & levels_s; + + /* First, do combined Z & S decompresses for levels that need it. */ + if (both) { + si_blit_decompress_zs_planes_in_place(sctx, texture, PIPE_MASK_Z | PIPE_MASK_S, both, + first_layer, last_layer); + levels_z &= ~both; + levels_s &= ~both; + } + + /* Now do separate Z and S decompresses. 
*/ + if (levels_z) { + si_blit_decompress_zs_planes_in_place(sctx, texture, PIPE_MASK_Z, levels_z, first_layer, + last_layer); + } + + if (levels_s) { + si_blit_decompress_zs_planes_in_place(sctx, texture, PIPE_MASK_S, levels_s, first_layer, + last_layer); + } } -static void -si_decompress_depth(struct si_context *sctx, - struct si_texture *tex, - unsigned required_planes, - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer) +static void si_decompress_depth(struct si_context *sctx, struct si_texture *tex, + unsigned required_planes, unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer) { - unsigned inplace_planes = 0; - unsigned copy_planes = 0; - unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1); - unsigned levels_z = 0; - unsigned levels_s = 0; - - if (required_planes & PIPE_MASK_Z) { - levels_z = level_mask & tex->dirty_level_mask; - - if (levels_z) { - if (si_can_sample_zs(tex, false)) - inplace_planes |= PIPE_MASK_Z; - else - copy_planes |= PIPE_MASK_Z; - } - } - if (required_planes & PIPE_MASK_S) { - levels_s = level_mask & tex->stencil_dirty_level_mask; - - if (levels_s) { - if (si_can_sample_zs(tex, true)) - inplace_planes |= PIPE_MASK_S; - else - copy_planes |= PIPE_MASK_S; - } - } - - if (unlikely(sctx->log)) - u_log_printf(sctx->log, - "\n------------------------------------------------\n" - "Decompress Depth (levels %u - %u, levels Z: 0x%x S: 0x%x)\n\n", - first_level, last_level, levels_z, levels_s); - - /* We may have to allocate the flushed texture here when called from - * si_decompress_subresource. - */ - if (copy_planes && - (tex->flushed_depth_texture || - si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b))) { - struct si_texture *dst = tex->flushed_depth_texture; - unsigned fully_copied_levels; - unsigned levels = 0; - - assert(tex->flushed_depth_texture); - - if (util_format_is_depth_and_stencil(dst->buffer.b.b.format)) - copy_planes = PIPE_MASK_Z | PIPE_MASK_S; - - if (copy_planes & PIPE_MASK_Z) { - levels |= levels_z; - levels_z = 0; - } - if (copy_planes & PIPE_MASK_S) { - levels |= levels_s; - levels_s = 0; - } - - fully_copied_levels = si_blit_dbcb_copy( - sctx, tex, dst, copy_planes, levels, - first_layer, last_layer, - 0, u_max_sample(&tex->buffer.b.b)); - - if (copy_planes & PIPE_MASK_Z) - tex->dirty_level_mask &= ~fully_copied_levels; - if (copy_planes & PIPE_MASK_S) - tex->stencil_dirty_level_mask &= ~fully_copied_levels; - } - - if (inplace_planes) { - bool has_htile = si_htile_enabled(tex, first_level, inplace_planes); - bool tc_compat_htile = vi_tc_compat_htile_enabled(tex, first_level, - inplace_planes); - - /* Don't decompress if there is no HTILE or when HTILE is - * TC-compatible. */ - if (has_htile && !tc_compat_htile) { - si_blit_decompress_zs_in_place( - sctx, tex, - levels_z, levels_s, - first_layer, last_layer); - } else { - /* This is only a cache flush. - * - * Only clear the mask that we are flushing, because - * si_make_DB_shader_coherent() treats different levels - * and depth and stencil differently. - */ - if (inplace_planes & PIPE_MASK_Z) - tex->dirty_level_mask &= ~levels_z; - if (inplace_planes & PIPE_MASK_S) - tex->stencil_dirty_level_mask &= ~levels_s; - } - - /* Only in-place decompression needs to flush DB caches, or - * when we don't decompress but TC-compatible planes are dirty. 
- */ - si_make_DB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, - inplace_planes & PIPE_MASK_S, - tc_compat_htile); - } - /* set_framebuffer_state takes care of coherency for single-sample. - * The DB->CB copy uses CB for the final writes. - */ - if (copy_planes && tex->buffer.b.b.nr_samples > 1) - si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, - false, true /* no DCC */); + unsigned inplace_planes = 0; + unsigned copy_planes = 0; + unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1); + unsigned levels_z = 0; + unsigned levels_s = 0; + + if (required_planes & PIPE_MASK_Z) { + levels_z = level_mask & tex->dirty_level_mask; + + if (levels_z) { + if (si_can_sample_zs(tex, false)) + inplace_planes |= PIPE_MASK_Z; + else + copy_planes |= PIPE_MASK_Z; + } + } + if (required_planes & PIPE_MASK_S) { + levels_s = level_mask & tex->stencil_dirty_level_mask; + + if (levels_s) { + if (si_can_sample_zs(tex, true)) + inplace_planes |= PIPE_MASK_S; + else + copy_planes |= PIPE_MASK_S; + } + } + + if (unlikely(sctx->log)) + u_log_printf(sctx->log, + "\n------------------------------------------------\n" + "Decompress Depth (levels %u - %u, levels Z: 0x%x S: 0x%x)\n\n", + first_level, last_level, levels_z, levels_s); + + /* We may have to allocate the flushed texture here when called from + * si_decompress_subresource. + */ + if (copy_planes && + (tex->flushed_depth_texture || si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b))) { + struct si_texture *dst = tex->flushed_depth_texture; + unsigned fully_copied_levels; + unsigned levels = 0; + + assert(tex->flushed_depth_texture); + + if (util_format_is_depth_and_stencil(dst->buffer.b.b.format)) + copy_planes = PIPE_MASK_Z | PIPE_MASK_S; + + if (copy_planes & PIPE_MASK_Z) { + levels |= levels_z; + levels_z = 0; + } + if (copy_planes & PIPE_MASK_S) { + levels |= levels_s; + levels_s = 0; + } + + fully_copied_levels = si_blit_dbcb_copy(sctx, tex, dst, copy_planes, levels, first_layer, + last_layer, 0, u_max_sample(&tex->buffer.b.b)); + + if (copy_planes & PIPE_MASK_Z) + tex->dirty_level_mask &= ~fully_copied_levels; + if (copy_planes & PIPE_MASK_S) + tex->stencil_dirty_level_mask &= ~fully_copied_levels; + } + + if (inplace_planes) { + bool has_htile = si_htile_enabled(tex, first_level, inplace_planes); + bool tc_compat_htile = vi_tc_compat_htile_enabled(tex, first_level, inplace_planes); + + /* Don't decompress if there is no HTILE or when HTILE is + * TC-compatible. */ + if (has_htile && !tc_compat_htile) { + si_blit_decompress_zs_in_place(sctx, tex, levels_z, levels_s, first_layer, last_layer); + } else { + /* This is only a cache flush. + * + * Only clear the mask that we are flushing, because + * si_make_DB_shader_coherent() treats different levels + * and depth and stencil differently. + */ + if (inplace_planes & PIPE_MASK_Z) + tex->dirty_level_mask &= ~levels_z; + if (inplace_planes & PIPE_MASK_S) + tex->stencil_dirty_level_mask &= ~levels_s; + } + + /* Only in-place decompression needs to flush DB caches, or + * when we don't decompress but TC-compatible planes are dirty. + */ + si_make_DB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, inplace_planes & PIPE_MASK_S, + tc_compat_htile); + } + /* set_framebuffer_state takes care of coherency for single-sample. + * The DB->CB copy uses CB for the final writes. 
+ */ + if (copy_planes && tex->buffer.b.b.nr_samples > 1) + si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, false, true /* no DCC */); } -static void -si_decompress_sampler_depth_textures(struct si_context *sctx, - struct si_samplers *textures) +static void si_decompress_sampler_depth_textures(struct si_context *sctx, + struct si_samplers *textures) { - unsigned i; - unsigned mask = textures->needs_depth_decompress_mask; + unsigned i; + unsigned mask = textures->needs_depth_decompress_mask; - while (mask) { - struct pipe_sampler_view *view; - struct si_sampler_view *sview; - struct si_texture *tex; + while (mask) { + struct pipe_sampler_view *view; + struct si_sampler_view *sview; + struct si_texture *tex; - i = u_bit_scan(&mask); + i = u_bit_scan(&mask); - view = textures->views[i]; - assert(view); - sview = (struct si_sampler_view*)view; + view = textures->views[i]; + assert(view); + sview = (struct si_sampler_view *)view; - tex = (struct si_texture *)view->texture; - assert(tex->db_compatible); + tex = (struct si_texture *)view->texture; + assert(tex->db_compatible); - si_decompress_depth(sctx, tex, - sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z, - view->u.tex.first_level, view->u.tex.last_level, - 0, util_max_layer(&tex->buffer.b.b, view->u.tex.first_level)); - } + si_decompress_depth(sctx, tex, sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z, + view->u.tex.first_level, view->u.tex.last_level, 0, + util_max_layer(&tex->buffer.b.b, view->u.tex.first_level)); + } } -static void si_blit_decompress_color(struct si_context *sctx, - struct si_texture *tex, - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - bool need_dcc_decompress, - bool need_fmask_expand) +static void si_blit_decompress_color(struct si_context *sctx, struct si_texture *tex, + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + bool need_dcc_decompress, bool need_fmask_expand) { - void* custom_blend; - unsigned layer, checked_last_layer, max_layer; - unsigned level_mask = - u_bit_consecutive(first_level, last_level - first_level + 1); - - if (!need_dcc_decompress) - level_mask &= tex->dirty_level_mask; - if (!level_mask) - goto expand_fmask; - - if (unlikely(sctx->log)) - u_log_printf(sctx->log, - "\n------------------------------------------------\n" - "Decompress Color (levels %u - %u, mask 0x%x)\n\n", - first_level, last_level, level_mask); - - if (need_dcc_decompress) { - custom_blend = sctx->custom_blend_dcc_decompress; - - assert(tex->surface.dcc_offset); - - /* disable levels without DCC */ - for (int i = first_level; i <= last_level; i++) { - if (!vi_dcc_enabled(tex, i)) - level_mask &= ~(1 << i); - } - } else if (tex->surface.fmask_size) { - custom_blend = sctx->custom_blend_fmask_decompress; - } else { - custom_blend = sctx->custom_blend_eliminate_fastclear; - } - - sctx->decompression_enabled = true; - - while (level_mask) { - unsigned level = u_bit_scan(&level_mask); - - /* The smaller the mipmap level, the less layers there are - * as far as 3D textures are concerned. 
*/ - max_layer = util_max_layer(&tex->buffer.b.b, level); - checked_last_layer = MIN2(last_layer, max_layer); - - for (layer = first_layer; layer <= checked_last_layer; layer++) { - struct pipe_surface *cbsurf, surf_tmpl; - - surf_tmpl.format = tex->buffer.b.b.format; - surf_tmpl.u.tex.level = level; - surf_tmpl.u.tex.first_layer = layer; - surf_tmpl.u.tex.last_layer = layer; - cbsurf = sctx->b.create_surface(&sctx->b, &tex->buffer.b.b, &surf_tmpl); - - /* Required before and after FMASK and DCC_DECOMPRESS. */ - if (custom_blend == sctx->custom_blend_fmask_decompress || - custom_blend == sctx->custom_blend_dcc_decompress) - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; - - si_blitter_begin(sctx, SI_DECOMPRESS); - util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend); - si_blitter_end(sctx); - - if (custom_blend == sctx->custom_blend_fmask_decompress || - custom_blend == sctx->custom_blend_dcc_decompress) - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; - - pipe_surface_reference(&cbsurf, NULL); - } - - /* The texture will always be dirty if some layers aren't flushed. - * I don't think this case occurs often though. */ - if (first_layer == 0 && last_layer >= max_layer) { - tex->dirty_level_mask &= ~(1 << level); - } - } - - sctx->decompression_enabled = false; - si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, - vi_dcc_enabled(tex, first_level), - tex->surface.u.gfx9.dcc.pipe_aligned); + void *custom_blend; + unsigned layer, checked_last_layer, max_layer; + unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1); + + if (!need_dcc_decompress) + level_mask &= tex->dirty_level_mask; + if (!level_mask) + goto expand_fmask; + + if (unlikely(sctx->log)) + u_log_printf(sctx->log, + "\n------------------------------------------------\n" + "Decompress Color (levels %u - %u, mask 0x%x)\n\n", + first_level, last_level, level_mask); + + if (need_dcc_decompress) { + custom_blend = sctx->custom_blend_dcc_decompress; + + assert(tex->surface.dcc_offset); + + /* disable levels without DCC */ + for (int i = first_level; i <= last_level; i++) { + if (!vi_dcc_enabled(tex, i)) + level_mask &= ~(1 << i); + } + } else if (tex->surface.fmask_size) { + custom_blend = sctx->custom_blend_fmask_decompress; + } else { + custom_blend = sctx->custom_blend_eliminate_fastclear; + } + + sctx->decompression_enabled = true; + + while (level_mask) { + unsigned level = u_bit_scan(&level_mask); + + /* The smaller the mipmap level, the less layers there are + * as far as 3D textures are concerned. */ + max_layer = util_max_layer(&tex->buffer.b.b, level); + checked_last_layer = MIN2(last_layer, max_layer); + + for (layer = first_layer; layer <= checked_last_layer; layer++) { + struct pipe_surface *cbsurf, surf_tmpl; + + surf_tmpl.format = tex->buffer.b.b.format; + surf_tmpl.u.tex.level = level; + surf_tmpl.u.tex.first_layer = layer; + surf_tmpl.u.tex.last_layer = layer; + cbsurf = sctx->b.create_surface(&sctx->b, &tex->buffer.b.b, &surf_tmpl); + + /* Required before and after FMASK and DCC_DECOMPRESS. 
*/ + if (custom_blend == sctx->custom_blend_fmask_decompress || + custom_blend == sctx->custom_blend_dcc_decompress) + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; + + si_blitter_begin(sctx, SI_DECOMPRESS); + util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend); + si_blitter_end(sctx); + + if (custom_blend == sctx->custom_blend_fmask_decompress || + custom_blend == sctx->custom_blend_dcc_decompress) + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; + + pipe_surface_reference(&cbsurf, NULL); + } + + /* The texture will always be dirty if some layers aren't flushed. + * I don't think this case occurs often though. */ + if (first_layer == 0 && last_layer >= max_layer) { + tex->dirty_level_mask &= ~(1 << level); + } + } + + sctx->decompression_enabled = false; + si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, vi_dcc_enabled(tex, first_level), + tex->surface.u.gfx9.dcc.pipe_aligned); expand_fmask: - if (need_fmask_expand && tex->surface.fmask_offset && !tex->fmask_is_identity) { - si_compute_expand_fmask(&sctx->b, &tex->buffer.b.b); - tex->fmask_is_identity = true; - } + if (need_fmask_expand && tex->surface.fmask_offset && !tex->fmask_is_identity) { + si_compute_expand_fmask(&sctx->b, &tex->buffer.b.b); + tex->fmask_is_identity = true; + } } -static void -si_decompress_color_texture(struct si_context *sctx, struct si_texture *tex, - unsigned first_level, unsigned last_level, - bool need_fmask_expand) +static void si_decompress_color_texture(struct si_context *sctx, struct si_texture *tex, + unsigned first_level, unsigned last_level, + bool need_fmask_expand) { - /* CMASK or DCC can be discarded and we can still end up here. */ - if (!tex->cmask_buffer && !tex->surface.fmask_size && !tex->surface.dcc_offset) - return; + /* CMASK or DCC can be discarded and we can still end up here. 
*/ + if (!tex->cmask_buffer && !tex->surface.fmask_size && !tex->surface.dcc_offset) + return; - si_blit_decompress_color(sctx, tex, first_level, last_level, 0, - util_max_layer(&tex->buffer.b.b, first_level), - false, need_fmask_expand); + si_blit_decompress_color(sctx, tex, first_level, last_level, 0, + util_max_layer(&tex->buffer.b.b, first_level), false, + need_fmask_expand); } -static void -si_decompress_sampler_color_textures(struct si_context *sctx, - struct si_samplers *textures) +static void si_decompress_sampler_color_textures(struct si_context *sctx, + struct si_samplers *textures) { - unsigned i; - unsigned mask = textures->needs_color_decompress_mask; + unsigned i; + unsigned mask = textures->needs_color_decompress_mask; - while (mask) { - struct pipe_sampler_view *view; - struct si_texture *tex; + while (mask) { + struct pipe_sampler_view *view; + struct si_texture *tex; - i = u_bit_scan(&mask); + i = u_bit_scan(&mask); - view = textures->views[i]; - assert(view); + view = textures->views[i]; + assert(view); - tex = (struct si_texture *)view->texture; + tex = (struct si_texture *)view->texture; - si_decompress_color_texture(sctx, tex, view->u.tex.first_level, - view->u.tex.last_level, false); - } + si_decompress_color_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level, + false); + } } -static void -si_decompress_image_color_textures(struct si_context *sctx, - struct si_images *images) +static void si_decompress_image_color_textures(struct si_context *sctx, struct si_images *images) { - unsigned i; - unsigned mask = images->needs_color_decompress_mask; + unsigned i; + unsigned mask = images->needs_color_decompress_mask; - while (mask) { - const struct pipe_image_view *view; - struct si_texture *tex; + while (mask) { + const struct pipe_image_view *view; + struct si_texture *tex; - i = u_bit_scan(&mask); + i = u_bit_scan(&mask); - view = &images->views[i]; - assert(view->resource->target != PIPE_BUFFER); + view = &images->views[i]; + assert(view->resource->target != PIPE_BUFFER); - tex = (struct si_texture *)view->resource; + tex = (struct si_texture *)view->resource; - si_decompress_color_texture(sctx, tex, view->u.tex.level, - view->u.tex.level, - view->access & PIPE_IMAGE_ACCESS_WRITE); - } + si_decompress_color_texture(sctx, tex, view->u.tex.level, view->u.tex.level, + view->access & PIPE_IMAGE_ACCESS_WRITE); + } } -static void si_check_render_feedback_texture(struct si_context *sctx, - struct si_texture *tex, - unsigned first_level, - unsigned last_level, - unsigned first_layer, - unsigned last_layer) +static void si_check_render_feedback_texture(struct si_context *sctx, struct si_texture *tex, + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer) { - bool render_feedback = false; + bool render_feedback = false; - if (!tex->surface.dcc_offset) - return; + if (!tex->surface.dcc_offset) + return; - for (unsigned j = 0; j < sctx->framebuffer.state.nr_cbufs; ++j) { - struct si_surface * surf; + for (unsigned j = 0; j < sctx->framebuffer.state.nr_cbufs; ++j) { + struct si_surface *surf; - if (!sctx->framebuffer.state.cbufs[j]) - continue; + if (!sctx->framebuffer.state.cbufs[j]) + continue; - surf = (struct si_surface*)sctx->framebuffer.state.cbufs[j]; + surf = (struct si_surface *)sctx->framebuffer.state.cbufs[j]; - if (tex == (struct si_texture *)surf->base.texture && - surf->base.u.tex.level >= first_level && - surf->base.u.tex.level <= last_level && - surf->base.u.tex.first_layer <= last_layer && - 
surf->base.u.tex.last_layer >= first_layer) { - render_feedback = true; - break; - } - } + if (tex == (struct si_texture *)surf->base.texture && surf->base.u.tex.level >= first_level && + surf->base.u.tex.level <= last_level && surf->base.u.tex.first_layer <= last_layer && + surf->base.u.tex.last_layer >= first_layer) { + render_feedback = true; + break; + } + } - if (render_feedback) - si_texture_disable_dcc(sctx, tex); + if (render_feedback) + si_texture_disable_dcc(sctx, tex); } -static void si_check_render_feedback_textures(struct si_context *sctx, - struct si_samplers *textures) +static void si_check_render_feedback_textures(struct si_context *sctx, struct si_samplers *textures) { - uint32_t mask = textures->enabled_mask; + uint32_t mask = textures->enabled_mask; - while (mask) { - const struct pipe_sampler_view *view; - struct si_texture *tex; + while (mask) { + const struct pipe_sampler_view *view; + struct si_texture *tex; - unsigned i = u_bit_scan(&mask); + unsigned i = u_bit_scan(&mask); - view = textures->views[i]; - if(view->texture->target == PIPE_BUFFER) - continue; + view = textures->views[i]; + if (view->texture->target == PIPE_BUFFER) + continue; - tex = (struct si_texture *)view->texture; + tex = (struct si_texture *)view->texture; - si_check_render_feedback_texture(sctx, tex, - view->u.tex.first_level, - view->u.tex.last_level, - view->u.tex.first_layer, - view->u.tex.last_layer); - } + si_check_render_feedback_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level, + view->u.tex.first_layer, view->u.tex.last_layer); + } } -static void si_check_render_feedback_images(struct si_context *sctx, - struct si_images *images) +static void si_check_render_feedback_images(struct si_context *sctx, struct si_images *images) { - uint32_t mask = images->enabled_mask; + uint32_t mask = images->enabled_mask; - while (mask) { - const struct pipe_image_view *view; - struct si_texture *tex; + while (mask) { + const struct pipe_image_view *view; + struct si_texture *tex; - unsigned i = u_bit_scan(&mask); + unsigned i = u_bit_scan(&mask); - view = &images->views[i]; - if (view->resource->target == PIPE_BUFFER) - continue; + view = &images->views[i]; + if (view->resource->target == PIPE_BUFFER) + continue; - tex = (struct si_texture *)view->resource; + tex = (struct si_texture *)view->resource; - si_check_render_feedback_texture(sctx, tex, - view->u.tex.level, - view->u.tex.level, - view->u.tex.first_layer, - view->u.tex.last_layer); - } + si_check_render_feedback_texture(sctx, tex, view->u.tex.level, view->u.tex.level, + view->u.tex.first_layer, view->u.tex.last_layer); + } } static void si_check_render_feedback_resident_textures(struct si_context *sctx) { - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - struct pipe_sampler_view *view; - struct si_texture *tex; - - view = (*tex_handle)->view; - if (view->texture->target == PIPE_BUFFER) - continue; - - tex = (struct si_texture *)view->texture; - - si_check_render_feedback_texture(sctx, tex, - view->u.tex.first_level, - view->u.tex.last_level, - view->u.tex.first_layer, - view->u.tex.last_layer); - } + util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { + struct pipe_sampler_view *view; + struct si_texture *tex; + + view = (*tex_handle)->view; + if (view->texture->target == PIPE_BUFFER) + continue; + + tex = (struct si_texture *)view->texture; + + si_check_render_feedback_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level, + 
view->u.tex.first_layer, view->u.tex.last_layer); + } } static void si_check_render_feedback_resident_images(struct si_context *sctx) { - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - struct pipe_image_view *view; - struct si_texture *tex; - - view = &(*img_handle)->view; - if (view->resource->target == PIPE_BUFFER) - continue; - - tex = (struct si_texture *)view->resource; - - si_check_render_feedback_texture(sctx, tex, - view->u.tex.level, - view->u.tex.level, - view->u.tex.first_layer, - view->u.tex.last_layer); - } + util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) { + struct pipe_image_view *view; + struct si_texture *tex; + + view = &(*img_handle)->view; + if (view->resource->target == PIPE_BUFFER) + continue; + + tex = (struct si_texture *)view->resource; + + si_check_render_feedback_texture(sctx, tex, view->u.tex.level, view->u.tex.level, + view->u.tex.first_layer, view->u.tex.last_layer); + } } static void si_check_render_feedback(struct si_context *sctx) { - if (!sctx->need_check_render_feedback) - return; + if (!sctx->need_check_render_feedback) + return; - /* There is no render feedback if color writes are disabled. - * (e.g. a pixel shader with image stores) - */ - if (!si_get_total_colormask(sctx)) - return; + /* There is no render feedback if color writes are disabled. + * (e.g. a pixel shader with image stores) + */ + if (!si_get_total_colormask(sctx)) + return; - for (int i = 0; i < SI_NUM_SHADERS; ++i) { - si_check_render_feedback_images(sctx, &sctx->images[i]); - si_check_render_feedback_textures(sctx, &sctx->samplers[i]); - } + for (int i = 0; i < SI_NUM_SHADERS; ++i) { + si_check_render_feedback_images(sctx, &sctx->images[i]); + si_check_render_feedback_textures(sctx, &sctx->samplers[i]); + } - si_check_render_feedback_resident_images(sctx); - si_check_render_feedback_resident_textures(sctx); + si_check_render_feedback_resident_images(sctx); + si_check_render_feedback_resident_textures(sctx); - sctx->need_check_render_feedback = false; + sctx->need_check_render_feedback = false; } static void si_decompress_resident_textures(struct si_context *sctx) { - util_dynarray_foreach(&sctx->resident_tex_needs_color_decompress, - struct si_texture_handle *, tex_handle) { - struct pipe_sampler_view *view = (*tex_handle)->view; - struct si_texture *tex = (struct si_texture *)view->texture; - - si_decompress_color_texture(sctx, tex, view->u.tex.first_level, - view->u.tex.last_level, false); - } - - util_dynarray_foreach(&sctx->resident_tex_needs_depth_decompress, - struct si_texture_handle *, tex_handle) { - struct pipe_sampler_view *view = (*tex_handle)->view; - struct si_sampler_view *sview = (struct si_sampler_view *)view; - struct si_texture *tex = (struct si_texture *)view->texture; - - si_decompress_depth(sctx, tex, - sview->is_stencil_sampler ? 
PIPE_MASK_S : PIPE_MASK_Z, - view->u.tex.first_level, view->u.tex.last_level, - 0, util_max_layer(&tex->buffer.b.b, view->u.tex.first_level)); - } + util_dynarray_foreach (&sctx->resident_tex_needs_color_decompress, struct si_texture_handle *, + tex_handle) { + struct pipe_sampler_view *view = (*tex_handle)->view; + struct si_texture *tex = (struct si_texture *)view->texture; + + si_decompress_color_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level, + false); + } + + util_dynarray_foreach (&sctx->resident_tex_needs_depth_decompress, struct si_texture_handle *, + tex_handle) { + struct pipe_sampler_view *view = (*tex_handle)->view; + struct si_sampler_view *sview = (struct si_sampler_view *)view; + struct si_texture *tex = (struct si_texture *)view->texture; + + si_decompress_depth(sctx, tex, sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z, + view->u.tex.first_level, view->u.tex.last_level, 0, + util_max_layer(&tex->buffer.b.b, view->u.tex.first_level)); + } } static void si_decompress_resident_images(struct si_context *sctx) { - util_dynarray_foreach(&sctx->resident_img_needs_color_decompress, - struct si_image_handle *, img_handle) { - struct pipe_image_view *view = &(*img_handle)->view; - struct si_texture *tex = (struct si_texture *)view->resource; - - si_decompress_color_texture(sctx, tex, view->u.tex.level, - view->u.tex.level, - view->access & PIPE_IMAGE_ACCESS_WRITE); - } + util_dynarray_foreach (&sctx->resident_img_needs_color_decompress, struct si_image_handle *, + img_handle) { + struct pipe_image_view *view = &(*img_handle)->view; + struct si_texture *tex = (struct si_texture *)view->resource; + + si_decompress_color_texture(sctx, tex, view->u.tex.level, view->u.tex.level, + view->access & PIPE_IMAGE_ACCESS_WRITE); + } } void si_decompress_textures(struct si_context *sctx, unsigned shader_mask) { - unsigned compressed_colortex_counter, mask; - - if (sctx->blitter->running) - return; - - /* Update the compressed_colortex_mask if necessary. */ - compressed_colortex_counter = p_atomic_read(&sctx->screen->compressed_colortex_counter); - if (compressed_colortex_counter != sctx->last_compressed_colortex_counter) { - sctx->last_compressed_colortex_counter = compressed_colortex_counter; - si_update_needs_color_decompress_masks(sctx); - } - - /* Decompress color & depth textures if needed. 
*/ - mask = sctx->shader_needs_decompress_mask & shader_mask; - while (mask) { - unsigned i = u_bit_scan(&mask); - - if (sctx->samplers[i].needs_depth_decompress_mask) { - si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]); - } - if (sctx->samplers[i].needs_color_decompress_mask) { - si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]); - } - if (sctx->images[i].needs_color_decompress_mask) { - si_decompress_image_color_textures(sctx, &sctx->images[i]); - } - } - - if (shader_mask & u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)) { - if (sctx->uses_bindless_samplers) - si_decompress_resident_textures(sctx); - if (sctx->uses_bindless_images) - si_decompress_resident_images(sctx); - - if (sctx->ps_uses_fbfetch) { - struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; - si_decompress_color_texture(sctx, - (struct si_texture*)cb0->texture, - cb0->u.tex.first_layer, - cb0->u.tex.last_layer, false); - } - - si_check_render_feedback(sctx); - } else if (shader_mask & (1 << PIPE_SHADER_COMPUTE)) { - if (sctx->cs_shader_state.program->sel.info.uses_bindless_samplers) - si_decompress_resident_textures(sctx); - if (sctx->cs_shader_state.program->sel.info.uses_bindless_images) - si_decompress_resident_images(sctx); - } + unsigned compressed_colortex_counter, mask; + + if (sctx->blitter->running) + return; + + /* Update the compressed_colortex_mask if necessary. */ + compressed_colortex_counter = p_atomic_read(&sctx->screen->compressed_colortex_counter); + if (compressed_colortex_counter != sctx->last_compressed_colortex_counter) { + sctx->last_compressed_colortex_counter = compressed_colortex_counter; + si_update_needs_color_decompress_masks(sctx); + } + + /* Decompress color & depth textures if needed. */ + mask = sctx->shader_needs_decompress_mask & shader_mask; + while (mask) { + unsigned i = u_bit_scan(&mask); + + if (sctx->samplers[i].needs_depth_decompress_mask) { + si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]); + } + if (sctx->samplers[i].needs_color_decompress_mask) { + si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]); + } + if (sctx->images[i].needs_color_decompress_mask) { + si_decompress_image_color_textures(sctx, &sctx->images[i]); + } + } + + if (shader_mask & u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)) { + if (sctx->uses_bindless_samplers) + si_decompress_resident_textures(sctx); + if (sctx->uses_bindless_images) + si_decompress_resident_images(sctx); + + if (sctx->ps_uses_fbfetch) { + struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; + si_decompress_color_texture(sctx, (struct si_texture *)cb0->texture, + cb0->u.tex.first_layer, cb0->u.tex.last_layer, false); + } + + si_check_render_feedback(sctx); + } else if (shader_mask & (1 << PIPE_SHADER_COMPUTE)) { + if (sctx->cs_shader_state.program->sel.info.uses_bindless_samplers) + si_decompress_resident_textures(sctx); + if (sctx->cs_shader_state.program->sel.info.uses_bindless_images) + si_decompress_resident_images(sctx); + } } /* Helper for decompressing a portion of a color or depth resource before * blitting if any decompression is needed. * The driver doesn't decompress resources automatically while u_blitter is * rendering. 
*/ -void si_decompress_subresource(struct pipe_context *ctx, - struct pipe_resource *tex, - unsigned planes, unsigned level, - unsigned first_layer, unsigned last_layer) +void si_decompress_subresource(struct pipe_context *ctx, struct pipe_resource *tex, unsigned planes, + unsigned level, unsigned first_layer, unsigned last_layer) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture *stex = (struct si_texture*)tex; - - if (stex->db_compatible) { - planes &= PIPE_MASK_Z | PIPE_MASK_S; - - if (!stex->surface.has_stencil) - planes &= ~PIPE_MASK_S; - - /* If we've rendered into the framebuffer and it's a blitting - * source, make sure the decompression pass is invoked - * by dirtying the framebuffer. - */ - if (sctx->framebuffer.state.zsbuf && - sctx->framebuffer.state.zsbuf->u.tex.level == level && - sctx->framebuffer.state.zsbuf->texture == tex) - si_update_fb_dirtiness_after_rendering(sctx); - - si_decompress_depth(sctx, stex, planes, - level, level, - first_layer, last_layer); - } else if (stex->surface.fmask_size || stex->cmask_buffer || stex->surface.dcc_offset) { - /* If we've rendered into the framebuffer and it's a blitting - * source, make sure the decompression pass is invoked - * by dirtying the framebuffer. - */ - for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (sctx->framebuffer.state.cbufs[i] && - sctx->framebuffer.state.cbufs[i]->u.tex.level == level && - sctx->framebuffer.state.cbufs[i]->texture == tex) { - si_update_fb_dirtiness_after_rendering(sctx); - break; - } - } - - si_blit_decompress_color(sctx, stex, level, level, - first_layer, last_layer, false, false); - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *stex = (struct si_texture *)tex; + + if (stex->db_compatible) { + planes &= PIPE_MASK_Z | PIPE_MASK_S; + + if (!stex->surface.has_stencil) + planes &= ~PIPE_MASK_S; + + /* If we've rendered into the framebuffer and it's a blitting + * source, make sure the decompression pass is invoked + * by dirtying the framebuffer. + */ + if (sctx->framebuffer.state.zsbuf && sctx->framebuffer.state.zsbuf->u.tex.level == level && + sctx->framebuffer.state.zsbuf->texture == tex) + si_update_fb_dirtiness_after_rendering(sctx); + + si_decompress_depth(sctx, stex, planes, level, level, first_layer, last_layer); + } else if (stex->surface.fmask_size || stex->cmask_buffer || stex->surface.dcc_offset) { + /* If we've rendered into the framebuffer and it's a blitting + * source, make sure the decompression pass is invoked + * by dirtying the framebuffer. 
+ */ + for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + if (sctx->framebuffer.state.cbufs[i] && + sctx->framebuffer.state.cbufs[i]->u.tex.level == level && + sctx->framebuffer.state.cbufs[i]->texture == tex) { + si_update_fb_dirtiness_after_rendering(sctx); + break; + } + } + + si_blit_decompress_color(sctx, stex, level, level, first_layer, last_layer, false, false); + } } struct texture_orig_info { - unsigned format; - unsigned width0; - unsigned height0; - unsigned npix_x; - unsigned npix_y; - unsigned npix0_x; - unsigned npix0_y; + unsigned format; + unsigned width0; + unsigned height0; + unsigned npix_x; + unsigned npix_y; + unsigned npix0_x; + unsigned npix0_y; }; -void si_resource_copy_region(struct pipe_context *ctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) +void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst, + unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, unsigned src_level, + const struct pipe_box *src_box) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture *ssrc = (struct si_texture*)src; - struct si_texture *sdst = (struct si_texture*)dst; - struct pipe_surface *dst_view, dst_templ; - struct pipe_sampler_view src_templ, *src_view; - unsigned dst_width, dst_height, src_width0, src_height0; - unsigned dst_width0, dst_height0, src_force_level = 0; - struct pipe_box sbox, dstbox; - - /* Handle buffers first. */ - if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { - si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width); - return; - } - - if (!util_format_is_compressed(src->format) && - !util_format_is_compressed(dst->format) && - !util_format_is_depth_or_stencil(src->format) && - src->nr_samples <= 1 && - !sdst->surface.dcc_offset && - !(dst->target != src->target && - (src->target == PIPE_TEXTURE_1D_ARRAY || dst->target == PIPE_TEXTURE_1D_ARRAY))) { - si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, src_box); - return; - } - - assert(u_max_sample(dst) == u_max_sample(src)); - - /* The driver doesn't decompress resources automatically while - * u_blitter is rendering. 
*/ - si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, - src_box->z, src_box->z + src_box->depth - 1); - - dst_width = u_minify(dst->width0, dst_level); - dst_height = u_minify(dst->height0, dst_level); - dst_width0 = dst->width0; - dst_height0 = dst->height0; - src_width0 = src->width0; - src_height0 = src->height0; - - util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz); - util_blitter_default_src_texture(sctx->blitter, &src_templ, src, src_level); - - if (util_format_is_compressed(src->format) || - util_format_is_compressed(dst->format)) { - unsigned blocksize = ssrc->surface.bpe; - - if (blocksize == 8) - src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */ - else - src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */ - dst_templ.format = src_templ.format; - - dst_width = util_format_get_nblocksx(dst->format, dst_width); - dst_height = util_format_get_nblocksy(dst->format, dst_height); - dst_width0 = util_format_get_nblocksx(dst->format, dst_width0); - dst_height0 = util_format_get_nblocksy(dst->format, dst_height0); - src_width0 = util_format_get_nblocksx(src->format, src_width0); - src_height0 = util_format_get_nblocksy(src->format, src_height0); - - dstx = util_format_get_nblocksx(dst->format, dstx); - dsty = util_format_get_nblocksy(dst->format, dsty); - - sbox.x = util_format_get_nblocksx(src->format, src_box->x); - sbox.y = util_format_get_nblocksy(src->format, src_box->y); - sbox.z = src_box->z; - sbox.width = util_format_get_nblocksx(src->format, src_box->width); - sbox.height = util_format_get_nblocksy(src->format, src_box->height); - sbox.depth = src_box->depth; - src_box = &sbox; - - src_force_level = src_level; - } else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src)) { - if (util_format_is_subsampled_422(src->format)) { - src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT; - dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT; - - dst_width = util_format_get_nblocksx(dst->format, dst_width); - dst_width0 = util_format_get_nblocksx(dst->format, dst_width0); - src_width0 = util_format_get_nblocksx(src->format, src_width0); - - dstx = util_format_get_nblocksx(dst->format, dstx); - - sbox = *src_box; - sbox.x = util_format_get_nblocksx(src->format, src_box->x); - sbox.width = util_format_get_nblocksx(src->format, src_box->width); - src_box = &sbox; - } else { - unsigned blocksize = ssrc->surface.bpe; - - switch (blocksize) { - case 1: - dst_templ.format = PIPE_FORMAT_R8_UNORM; - src_templ.format = PIPE_FORMAT_R8_UNORM; - break; - case 2: - dst_templ.format = PIPE_FORMAT_R8G8_UNORM; - src_templ.format = PIPE_FORMAT_R8G8_UNORM; - break; - case 4: - dst_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM; - src_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM; - break; - case 8: - dst_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; - src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; - break; - case 16: - dst_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; - src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; - break; - default: - fprintf(stderr, "Unhandled format %s with blocksize %u\n", - util_format_short_name(src->format), blocksize); - assert(0); - } - } - } - - /* SNORM8 blitting has precision issues on some chips. Use the SINT - * equivalent instead, which doesn't force DCC decompression. - * Note that some chips avoid this issue by using SDMA. 
- */ - if (util_format_is_snorm8(dst_templ.format)) { - dst_templ.format = src_templ.format = - util_format_snorm8_to_sint8(dst_templ.format); - } - - vi_disable_dcc_if_incompatible_format(sctx, dst, dst_level, - dst_templ.format); - vi_disable_dcc_if_incompatible_format(sctx, src, src_level, - src_templ.format); - - /* Initialize the surface. */ - dst_view = si_create_surface_custom(ctx, dst, &dst_templ, - dst_width0, dst_height0, - dst_width, dst_height); - - /* Initialize the sampler view. */ - src_view = si_create_sampler_view_custom(ctx, src, &src_templ, - src_width0, src_height0, - src_force_level); - - u_box_3d(dstx, dsty, dstz, abs(src_box->width), abs(src_box->height), - abs(src_box->depth), &dstbox); - - /* Copy. */ - si_blitter_begin(sctx, SI_COPY); - util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox, - src_view, src_box, src_width0, src_height0, - PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, - false); - si_blitter_end(sctx); - - pipe_surface_reference(&dst_view, NULL); - pipe_sampler_view_reference(&src_view, NULL); + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *ssrc = (struct si_texture *)src; + struct si_texture *sdst = (struct si_texture *)dst; + struct pipe_surface *dst_view, dst_templ; + struct pipe_sampler_view src_templ, *src_view; + unsigned dst_width, dst_height, src_width0, src_height0; + unsigned dst_width0, dst_height0, src_force_level = 0; + struct pipe_box sbox, dstbox; + + /* Handle buffers first. */ + if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { + si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width); + return; + } + + if (!util_format_is_compressed(src->format) && !util_format_is_compressed(dst->format) && + !util_format_is_depth_or_stencil(src->format) && src->nr_samples <= 1 && + !sdst->surface.dcc_offset && + !(dst->target != src->target && + (src->target == PIPE_TEXTURE_1D_ARRAY || dst->target == PIPE_TEXTURE_1D_ARRAY))) { + si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, src_box); + return; + } + + assert(u_max_sample(dst) == u_max_sample(src)); + + /* The driver doesn't decompress resources automatically while + * u_blitter is rendering. 
*/ + si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z, + src_box->z + src_box->depth - 1); + + dst_width = u_minify(dst->width0, dst_level); + dst_height = u_minify(dst->height0, dst_level); + dst_width0 = dst->width0; + dst_height0 = dst->height0; + src_width0 = src->width0; + src_height0 = src->height0; + + util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz); + util_blitter_default_src_texture(sctx->blitter, &src_templ, src, src_level); + + if (util_format_is_compressed(src->format) || util_format_is_compressed(dst->format)) { + unsigned blocksize = ssrc->surface.bpe; + + if (blocksize == 8) + src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */ + else + src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */ + dst_templ.format = src_templ.format; + + dst_width = util_format_get_nblocksx(dst->format, dst_width); + dst_height = util_format_get_nblocksy(dst->format, dst_height); + dst_width0 = util_format_get_nblocksx(dst->format, dst_width0); + dst_height0 = util_format_get_nblocksy(dst->format, dst_height0); + src_width0 = util_format_get_nblocksx(src->format, src_width0); + src_height0 = util_format_get_nblocksy(src->format, src_height0); + + dstx = util_format_get_nblocksx(dst->format, dstx); + dsty = util_format_get_nblocksy(dst->format, dsty); + + sbox.x = util_format_get_nblocksx(src->format, src_box->x); + sbox.y = util_format_get_nblocksy(src->format, src_box->y); + sbox.z = src_box->z; + sbox.width = util_format_get_nblocksx(src->format, src_box->width); + sbox.height = util_format_get_nblocksy(src->format, src_box->height); + sbox.depth = src_box->depth; + src_box = &sbox; + + src_force_level = src_level; + } else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src)) { + if (util_format_is_subsampled_422(src->format)) { + src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT; + dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT; + + dst_width = util_format_get_nblocksx(dst->format, dst_width); + dst_width0 = util_format_get_nblocksx(dst->format, dst_width0); + src_width0 = util_format_get_nblocksx(src->format, src_width0); + + dstx = util_format_get_nblocksx(dst->format, dstx); + + sbox = *src_box; + sbox.x = util_format_get_nblocksx(src->format, src_box->x); + sbox.width = util_format_get_nblocksx(src->format, src_box->width); + src_box = &sbox; + } else { + unsigned blocksize = ssrc->surface.bpe; + + switch (blocksize) { + case 1: + dst_templ.format = PIPE_FORMAT_R8_UNORM; + src_templ.format = PIPE_FORMAT_R8_UNORM; + break; + case 2: + dst_templ.format = PIPE_FORMAT_R8G8_UNORM; + src_templ.format = PIPE_FORMAT_R8G8_UNORM; + break; + case 4: + dst_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM; + src_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM; + break; + case 8: + dst_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; + src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; + break; + case 16: + dst_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; + src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; + break; + default: + fprintf(stderr, "Unhandled format %s with blocksize %u\n", + util_format_short_name(src->format), blocksize); + assert(0); + } + } + } + + /* SNORM8 blitting has precision issues on some chips. Use the SINT + * equivalent instead, which doesn't force DCC decompression. + * Note that some chips avoid this issue by using SDMA. 
+ */ + if (util_format_is_snorm8(dst_templ.format)) { + dst_templ.format = src_templ.format = util_format_snorm8_to_sint8(dst_templ.format); + } + + vi_disable_dcc_if_incompatible_format(sctx, dst, dst_level, dst_templ.format); + vi_disable_dcc_if_incompatible_format(sctx, src, src_level, src_templ.format); + + /* Initialize the surface. */ + dst_view = si_create_surface_custom(ctx, dst, &dst_templ, dst_width0, dst_height0, dst_width, + dst_height); + + /* Initialize the sampler view. */ + src_view = + si_create_sampler_view_custom(ctx, src, &src_templ, src_width0, src_height0, src_force_level); + + u_box_3d(dstx, dsty, dstz, abs(src_box->width), abs(src_box->height), abs(src_box->depth), + &dstbox); + + /* Copy. */ + si_blitter_begin(sctx, SI_COPY); + util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox, src_view, src_box, src_width0, + src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, false); + si_blitter_end(sctx); + + pipe_surface_reference(&dst_view, NULL); + pipe_sampler_view_reference(&src_view, NULL); } -static void si_do_CB_resolve(struct si_context *sctx, - const struct pipe_blit_info *info, - struct pipe_resource *dst, - unsigned dst_level, unsigned dst_z, - enum pipe_format format) +static void si_do_CB_resolve(struct si_context *sctx, const struct pipe_blit_info *info, + struct pipe_resource *dst, unsigned dst_level, unsigned dst_z, + enum pipe_format format) { - /* Required before and after CB_RESOLVE. */ - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; - - si_blitter_begin(sctx, SI_COLOR_RESOLVE | - (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); - util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z, - info->src.resource, info->src.box.z, - ~0, sctx->custom_blend_resolve, - format); - si_blitter_end(sctx); - - /* Flush caches for possible texturing. */ - si_make_CB_shader_coherent(sctx, 1, false, true /* no DCC */); + /* Required before and after CB_RESOLVE. */ + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; + + si_blitter_begin( + sctx, SI_COLOR_RESOLVE | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); + util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z, info->src.resource, + info->src.box.z, ~0, sctx->custom_blend_resolve, format); + si_blitter_end(sctx); + + /* Flush caches for possible texturing. */ + si_make_CB_shader_coherent(sctx, 1, false, true /* no DCC */); } -static bool do_hardware_msaa_resolve(struct pipe_context *ctx, - const struct pipe_blit_info *info) +static bool do_hardware_msaa_resolve(struct pipe_context *ctx, const struct pipe_blit_info *info) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_texture *src = (struct si_texture*)info->src.resource; - struct si_texture *dst = (struct si_texture*)info->dst.resource; - ASSERTED struct si_texture *stmp; - unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level); - unsigned dst_height = u_minify(info->dst.resource->height0, info->dst.level); - enum pipe_format format = info->src.format; - struct pipe_resource *tmp, templ; - struct pipe_blit_info blit; - - /* Check basic requirements for hw resolve. */ - if (!(info->src.resource->nr_samples > 1 && - info->dst.resource->nr_samples <= 1 && - !util_format_is_pure_integer(format) && - !util_format_is_depth_or_stencil(format) && - util_max_layer(info->src.resource, 0) == 0)) - return false; - - /* Hardware MSAA resolve doesn't work if SPI format = NORM16_ABGR and - * the format is R16G16. Use R16A16, which does work. 
- */ - if (format == PIPE_FORMAT_R16G16_UNORM) - format = PIPE_FORMAT_R16A16_UNORM; - if (format == PIPE_FORMAT_R16G16_SNORM) - format = PIPE_FORMAT_R16A16_SNORM; - - /* Check the remaining requirements for hw resolve. */ - if (util_max_layer(info->dst.resource, info->dst.level) == 0 && - !info->scissor_enable && - (info->mask & PIPE_MASK_RGBA) == PIPE_MASK_RGBA && - util_is_format_compatible(util_format_description(info->src.format), - util_format_description(info->dst.format)) && - dst_width == info->src.resource->width0 && - dst_height == info->src.resource->height0 && - info->dst.box.x == 0 && - info->dst.box.y == 0 && - info->dst.box.width == dst_width && - info->dst.box.height == dst_height && - info->dst.box.depth == 1 && - info->src.box.x == 0 && - info->src.box.y == 0 && - info->src.box.width == dst_width && - info->src.box.height == dst_height && - info->src.box.depth == 1 && - !dst->surface.is_linear && - (!dst->cmask_buffer || !dst->dirty_level_mask)) { /* dst cannot be fast-cleared */ - /* Check the last constraint. */ - if (src->surface.micro_tile_mode != dst->surface.micro_tile_mode) { - /* The next fast clear will switch to this mode to - * get direct hw resolve next time if the mode is - * different now. - * - * TODO-GFX10: This does not work in GFX10 because MSAA - * is restricted to 64KB_R_X and 64KB_Z_X swizzle modes. - * In some cases we could change the swizzle of the - * destination texture instead, but the more general - * solution is to implement compute shader resolve. - */ - src->last_msaa_resolve_target_micro_mode = - dst->surface.micro_tile_mode; - goto resolve_to_temp; - } - - /* Resolving into a surface with DCC is unsupported. Since - * it's being overwritten anyway, clear it to uncompressed. - * This is still the fastest codepath even with this clear. - */ - if (vi_dcc_enabled(dst, info->dst.level)) { - if (!vi_dcc_clear_level(sctx, dst, info->dst.level, - DCC_UNCOMPRESSED)) - goto resolve_to_temp; - - dst->dirty_level_mask &= ~(1 << info->dst.level); - } - - /* Resolve directly from src to dst. */ - si_do_CB_resolve(sctx, info, info->dst.resource, - info->dst.level, info->dst.box.z, format); - return true; - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *src = (struct si_texture *)info->src.resource; + struct si_texture *dst = (struct si_texture *)info->dst.resource; + ASSERTED struct si_texture *stmp; + unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level); + unsigned dst_height = u_minify(info->dst.resource->height0, info->dst.level); + enum pipe_format format = info->src.format; + struct pipe_resource *tmp, templ; + struct pipe_blit_info blit; + + /* Check basic requirements for hw resolve. */ + if (!(info->src.resource->nr_samples > 1 && info->dst.resource->nr_samples <= 1 && + !util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format) && + util_max_layer(info->src.resource, 0) == 0)) + return false; + + /* Hardware MSAA resolve doesn't work if SPI format = NORM16_ABGR and + * the format is R16G16. Use R16A16, which does work. + */ + if (format == PIPE_FORMAT_R16G16_UNORM) + format = PIPE_FORMAT_R16A16_UNORM; + if (format == PIPE_FORMAT_R16G16_SNORM) + format = PIPE_FORMAT_R16A16_SNORM; + + /* Check the remaining requirements for hw resolve. 
*/ + if (util_max_layer(info->dst.resource, info->dst.level) == 0 && !info->scissor_enable && + (info->mask & PIPE_MASK_RGBA) == PIPE_MASK_RGBA && + util_is_format_compatible(util_format_description(info->src.format), + util_format_description(info->dst.format)) && + dst_width == info->src.resource->width0 && dst_height == info->src.resource->height0 && + info->dst.box.x == 0 && info->dst.box.y == 0 && info->dst.box.width == dst_width && + info->dst.box.height == dst_height && info->dst.box.depth == 1 && info->src.box.x == 0 && + info->src.box.y == 0 && info->src.box.width == dst_width && + info->src.box.height == dst_height && info->src.box.depth == 1 && !dst->surface.is_linear && + (!dst->cmask_buffer || !dst->dirty_level_mask)) { /* dst cannot be fast-cleared */ + /* Check the last constraint. */ + if (src->surface.micro_tile_mode != dst->surface.micro_tile_mode) { + /* The next fast clear will switch to this mode to + * get direct hw resolve next time if the mode is + * different now. + * + * TODO-GFX10: This does not work in GFX10 because MSAA + * is restricted to 64KB_R_X and 64KB_Z_X swizzle modes. + * In some cases we could change the swizzle of the + * destination texture instead, but the more general + * solution is to implement compute shader resolve. + */ + src->last_msaa_resolve_target_micro_mode = dst->surface.micro_tile_mode; + goto resolve_to_temp; + } + + /* Resolving into a surface with DCC is unsupported. Since + * it's being overwritten anyway, clear it to uncompressed. + * This is still the fastest codepath even with this clear. + */ + if (vi_dcc_enabled(dst, info->dst.level)) { + if (!vi_dcc_clear_level(sctx, dst, info->dst.level, DCC_UNCOMPRESSED)) + goto resolve_to_temp; + + dst->dirty_level_mask &= ~(1 << info->dst.level); + } + + /* Resolve directly from src to dst. */ + si_do_CB_resolve(sctx, info, info->dst.resource, info->dst.level, info->dst.box.z, format); + return true; + } resolve_to_temp: - /* Shader-based resolve is VERY SLOW. Instead, resolve into - * a temporary texture and blit. - */ - memset(&templ, 0, sizeof(templ)); - templ.target = PIPE_TEXTURE_2D; - templ.format = info->src.resource->format; - templ.width0 = info->src.resource->width0; - templ.height0 = info->src.resource->height0; - templ.depth0 = 1; - templ.array_size = 1; - templ.usage = PIPE_USAGE_DEFAULT; - templ.flags = SI_RESOURCE_FLAG_FORCE_MSAA_TILING | - SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE | - SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(src->surface.micro_tile_mode) | - SI_RESOURCE_FLAG_DISABLE_DCC; - - /* The src and dst microtile modes must be the same. */ - if (sctx->chip_class <= GFX8 && - src->surface.micro_tile_mode == RADEON_MICRO_MODE_DISPLAY) - templ.bind = PIPE_BIND_SCANOUT; - else - templ.bind = 0; - - tmp = ctx->screen->resource_create(ctx->screen, &templ); - if (!tmp) - return false; - stmp = (struct si_texture*)tmp; - - assert(!stmp->surface.is_linear); - assert(src->surface.micro_tile_mode == stmp->surface.micro_tile_mode); - - /* resolve */ - si_do_CB_resolve(sctx, info, tmp, 0, 0, format); - - /* blit */ - blit = *info; - blit.src.resource = tmp; - blit.src.box.z = 0; - - si_blitter_begin(sctx, SI_BLIT | - (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); - util_blitter_blit(sctx->blitter, &blit); - si_blitter_end(sctx); - - pipe_resource_reference(&tmp, NULL); - return true; + /* Shader-based resolve is VERY SLOW. Instead, resolve into + * a temporary texture and blit. 
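For reference, a blit that satisfies the direct-resolve constraints checked above is a whole-surface, unscaled, full-RGBA-mask blit between same-sized color textures. A hedged sketch from the caller's side; the function and parameter names are hypothetical, while pipe_blit_info, u_box_3d and ctx->blit are standard Gallium.

#include <string.h>

#include "pipe/p_context.h"
#include "pipe/p_state.h"
#include "util/u_box.h"

/* Sketch: whole-surface MSAA resolve. No scissor, no scaling, no
 * offsets, single layer -- the shape that can go straight to CB_RESOLVE
 * instead of the resolve_to_temp fallback. */
static void example_msaa_resolve(struct pipe_context *ctx,
                                 struct pipe_resource *msaa_src,  /* nr_samples > 1 */
                                 struct pipe_resource *ss_dst)    /* nr_samples <= 1 */
{
   struct pipe_blit_info blit;

   memset(&blit, 0, sizeof(blit));
   blit.src.resource = msaa_src;
   blit.src.format = msaa_src->format;
   u_box_3d(0, 0, 0, msaa_src->width0, msaa_src->height0, 1, &blit.src.box);

   blit.dst.resource = ss_dst;
   blit.dst.format = ss_dst->format;
   u_box_3d(0, 0, 0, ss_dst->width0, ss_dst->height0, 1, &blit.dst.box);

   blit.mask = PIPE_MASK_RGBA;
   blit.filter = PIPE_TEX_FILTER_NEAREST;

   ctx->blit(ctx, &blit);
}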
+ */ + memset(&templ, 0, sizeof(templ)); + templ.target = PIPE_TEXTURE_2D; + templ.format = info->src.resource->format; + templ.width0 = info->src.resource->width0; + templ.height0 = info->src.resource->height0; + templ.depth0 = 1; + templ.array_size = 1; + templ.usage = PIPE_USAGE_DEFAULT; + templ.flags = SI_RESOURCE_FLAG_FORCE_MSAA_TILING | SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE | + SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(src->surface.micro_tile_mode) | + SI_RESOURCE_FLAG_DISABLE_DCC; + + /* The src and dst microtile modes must be the same. */ + if (sctx->chip_class <= GFX8 && src->surface.micro_tile_mode == RADEON_MICRO_MODE_DISPLAY) + templ.bind = PIPE_BIND_SCANOUT; + else + templ.bind = 0; + + tmp = ctx->screen->resource_create(ctx->screen, &templ); + if (!tmp) + return false; + stmp = (struct si_texture *)tmp; + + assert(!stmp->surface.is_linear); + assert(src->surface.micro_tile_mode == stmp->surface.micro_tile_mode); + + /* resolve */ + si_do_CB_resolve(sctx, info, tmp, 0, 0, format); + + /* blit */ + blit = *info; + blit.src.resource = tmp; + blit.src.box.z = 0; + + si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); + util_blitter_blit(sctx->blitter, &blit); + si_blitter_end(sctx); + + pipe_resource_reference(&tmp, NULL); + return true; } -static void si_blit(struct pipe_context *ctx, - const struct pipe_blit_info *info) +static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_texture *dst = (struct si_texture *)info->dst.resource; - - if (do_hardware_msaa_resolve(ctx, info)) { - return; - } - - /* Using SDMA for copying to a linear texture in GTT is much faster. - * This improves DRI PRIME performance. - * - * resource_copy_region can't do this yet, because dma_copy calls it - * on failure (recursion). - */ - if (dst->surface.is_linear && - util_can_blit_via_copy_region(info, false)) { - sctx->dma_copy(ctx, info->dst.resource, info->dst.level, - info->dst.box.x, info->dst.box.y, - info->dst.box.z, - info->src.resource, info->src.level, - &info->src.box); - return; - } - - assert(util_blitter_is_blit_supported(sctx->blitter, info)); - - /* The driver doesn't decompress resources automatically while - * u_blitter is rendering. */ - vi_disable_dcc_if_incompatible_format(sctx, info->src.resource, - info->src.level, - info->src.format); - vi_disable_dcc_if_incompatible_format(sctx, info->dst.resource, - info->dst.level, - info->dst.format); - si_decompress_subresource(ctx, info->src.resource, PIPE_MASK_RGBAZS, - info->src.level, - info->src.box.z, - info->src.box.z + info->src.box.depth - 1); - - if (sctx->screen->debug_flags & DBG(FORCE_SDMA) && - util_try_blit_via_copy_region(ctx, info)) - return; - - si_blitter_begin(sctx, SI_BLIT | - (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); - util_blitter_blit(sctx->blitter, info); - si_blitter_end(sctx); + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *dst = (struct si_texture *)info->dst.resource; + + if (do_hardware_msaa_resolve(ctx, info)) { + return; + } + + /* Using SDMA for copying to a linear texture in GTT is much faster. + * This improves DRI PRIME performance. + * + * resource_copy_region can't do this yet, because dma_copy calls it + * on failure (recursion). 
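A blit with this shape is what util_can_blit_via_copy_region accepts: same size, same format, no scaling. A rough sketch of building the equivalent straight copy into a linear texture (names are hypothetical; resource_create, resource_copy_region and PIPE_BIND_LINEAR are standard Gallium):

#include <string.h>

#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/u_box.h"
#include "util/u_inlines.h"

/* Sketch: copy a tiled 2D texture into a freshly created linear one of
 * the same format -- the kind of transfer the linear-destination check
 * above reduces to a plain copy. */
static void example_copy_to_linear(struct pipe_context *ctx,
                                   struct pipe_resource *tiled_src)
{
   struct pipe_resource templ;
   struct pipe_resource *linear_dst;
   struct pipe_box box;

   memset(&templ, 0, sizeof(templ));
   templ.target = PIPE_TEXTURE_2D;
   templ.format = tiled_src->format;
   templ.width0 = tiled_src->width0;
   templ.height0 = tiled_src->height0;
   templ.depth0 = 1;
   templ.array_size = 1;
   templ.usage = PIPE_USAGE_DEFAULT;
   templ.bind = PIPE_BIND_LINEAR;   /* ask for a linear layout */

   linear_dst = ctx->screen->resource_create(ctx->screen, &templ);
   if (!linear_dst)
      return;

   u_box_3d(0, 0, 0, tiled_src->width0, tiled_src->height0, 1, &box);
   ctx->resource_copy_region(ctx, linear_dst, 0, 0, 0, 0, tiled_src, 0, &box);

   pipe_resource_reference(&linear_dst, NULL);
}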
+ */ + if (dst->surface.is_linear && util_can_blit_via_copy_region(info, false)) { + sctx->dma_copy(ctx, info->dst.resource, info->dst.level, info->dst.box.x, info->dst.box.y, + info->dst.box.z, info->src.resource, info->src.level, &info->src.box); + return; + } + + assert(util_blitter_is_blit_supported(sctx->blitter, info)); + + /* The driver doesn't decompress resources automatically while + * u_blitter is rendering. */ + vi_disable_dcc_if_incompatible_format(sctx, info->src.resource, info->src.level, + info->src.format); + vi_disable_dcc_if_incompatible_format(sctx, info->dst.resource, info->dst.level, + info->dst.format); + si_decompress_subresource(ctx, info->src.resource, PIPE_MASK_RGBAZS, info->src.level, + info->src.box.z, info->src.box.z + info->src.box.depth - 1); + + if (sctx->screen->debug_flags & DBG(FORCE_SDMA) && util_try_blit_via_copy_region(ctx, info)) + return; + + si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); + util_blitter_blit(sctx->blitter, info); + si_blitter_end(sctx); } -static bool si_generate_mipmap(struct pipe_context *ctx, - struct pipe_resource *tex, - enum pipe_format format, - unsigned base_level, unsigned last_level, - unsigned first_layer, unsigned last_layer) +static bool si_generate_mipmap(struct pipe_context *ctx, struct pipe_resource *tex, + enum pipe_format format, unsigned base_level, unsigned last_level, + unsigned first_layer, unsigned last_layer) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_texture *stex = (struct si_texture *)tex; - - if (!util_blitter_is_copy_supported(sctx->blitter, tex, tex)) - return false; - - /* The driver doesn't decompress resources automatically while - * u_blitter is rendering. */ - vi_disable_dcc_if_incompatible_format(sctx, tex, base_level, - format); - si_decompress_subresource(ctx, tex, PIPE_MASK_RGBAZS, - base_level, first_layer, last_layer); - - /* Clear dirty_level_mask for the levels that will be overwritten. */ - assert(base_level < last_level); - stex->dirty_level_mask &= ~u_bit_consecutive(base_level + 1, - last_level - base_level); - - sctx->generate_mipmap_for_depth = stex->is_depth; - - si_blitter_begin(sctx, SI_BLIT | SI_DISABLE_RENDER_COND); - util_blitter_generate_mipmap(sctx->blitter, tex, format, - base_level, last_level, - first_layer, last_layer); - si_blitter_end(sctx); - - sctx->generate_mipmap_for_depth = false; - return true; + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *stex = (struct si_texture *)tex; + + if (!util_blitter_is_copy_supported(sctx->blitter, tex, tex)) + return false; + + /* The driver doesn't decompress resources automatically while + * u_blitter is rendering. */ + vi_disable_dcc_if_incompatible_format(sctx, tex, base_level, format); + si_decompress_subresource(ctx, tex, PIPE_MASK_RGBAZS, base_level, first_layer, last_layer); + + /* Clear dirty_level_mask for the levels that will be overwritten. 
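A worked example of the mask arithmetic used right after this comment (standalone sketch; only u_bit_consecutive from util/u_math.h is assumed):

#include <assert.h>

#include "util/u_math.h"

/* Sketch: base_level = 0, last_level = 4. Levels 1..4 are regenerated,
 * so their dirty bits must be cleared. u_bit_consecutive(1, 4) builds
 * the mask 0b11110 (0x1e); only bit 0 (the base level) stays dirty. */
static void example_dirty_level_mask(void)
{
   unsigned base_level = 0, last_level = 4;
   unsigned dirty_level_mask = 0x1f;   /* levels 0..4 dirty */

   dirty_level_mask &= ~u_bit_consecutive(base_level + 1, last_level - base_level);
   assert(dirty_level_mask == 0x01);
}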
*/ + assert(base_level < last_level); + stex->dirty_level_mask &= ~u_bit_consecutive(base_level + 1, last_level - base_level); + + sctx->generate_mipmap_for_depth = stex->is_depth; + + si_blitter_begin(sctx, SI_BLIT | SI_DISABLE_RENDER_COND); + util_blitter_generate_mipmap(sctx->blitter, tex, format, base_level, last_level, first_layer, + last_layer); + si_blitter_end(sctx); + + sctx->generate_mipmap_for_depth = false; + return true; } -static void si_flush_resource(struct pipe_context *ctx, - struct pipe_resource *res) +static void si_flush_resource(struct pipe_context *ctx, struct pipe_resource *res) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_texture *tex = (struct si_texture*)res; - - assert(res->target != PIPE_BUFFER); - assert(!tex->dcc_separate_buffer || tex->dcc_gather_statistics); - - /* st/dri calls flush twice per frame (not a bug), this prevents double - * decompression. */ - if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty) - return; - - if (!tex->is_depth && (tex->cmask_buffer || tex->surface.dcc_offset)) { - si_blit_decompress_color(sctx, tex, 0, res->last_level, - 0, util_max_layer(res, 0), - tex->dcc_separate_buffer != NULL, false); - - if (tex->surface.display_dcc_offset && tex->displayable_dcc_dirty) { - si_retile_dcc(sctx, tex); - tex->displayable_dcc_dirty = false; - } - } - - /* Always do the analysis even if DCC is disabled at the moment. */ - if (tex->dcc_gather_statistics) { - bool separate_dcc_dirty = tex->separate_dcc_dirty; - - /* If the color buffer hasn't been unbound and fast clear hasn't - * been used, separate_dcc_dirty is false, but there may have been - * new rendering. Check if the color buffer is bound and assume - * it's dirty. - * - * Note that DRI2 never unbinds window colorbuffers, which means - * the DCC pipeline statistics query would never be re-set and would - * keep adding new results until all free memory is exhausted if we - * didn't do this. - */ - if (!separate_dcc_dirty) { - for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (sctx->framebuffer.state.cbufs[i] && - sctx->framebuffer.state.cbufs[i]->texture == res) { - separate_dcc_dirty = true; - break; - } - } - } - - if (separate_dcc_dirty) { - tex->separate_dcc_dirty = false; - vi_separate_dcc_process_and_reset_stats(ctx, tex); - } - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *tex = (struct si_texture *)res; + + assert(res->target != PIPE_BUFFER); + assert(!tex->dcc_separate_buffer || tex->dcc_gather_statistics); + + /* st/dri calls flush twice per frame (not a bug), this prevents double + * decompression. */ + if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty) + return; + + if (!tex->is_depth && (tex->cmask_buffer || tex->surface.dcc_offset)) { + si_blit_decompress_color(sctx, tex, 0, res->last_level, 0, util_max_layer(res, 0), + tex->dcc_separate_buffer != NULL, false); + + if (tex->surface.display_dcc_offset && tex->displayable_dcc_dirty) { + si_retile_dcc(sctx, tex); + tex->displayable_dcc_dirty = false; + } + } + + /* Always do the analysis even if DCC is disabled at the moment. */ + if (tex->dcc_gather_statistics) { + bool separate_dcc_dirty = tex->separate_dcc_dirty; + + /* If the color buffer hasn't been unbound and fast clear hasn't + * been used, separate_dcc_dirty is false, but there may have been + * new rendering. Check if the color buffer is bound and assume + * it's dirty. 
+ * + * Note that DRI2 never unbinds window colorbuffers, which means + * the DCC pipeline statistics query would never be re-set and would + * keep adding new results until all free memory is exhausted if we + * didn't do this. + */ + if (!separate_dcc_dirty) { + for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + if (sctx->framebuffer.state.cbufs[i] && + sctx->framebuffer.state.cbufs[i]->texture == res) { + separate_dcc_dirty = true; + break; + } + } + } + + if (separate_dcc_dirty) { + tex->separate_dcc_dirty = false; + vi_separate_dcc_process_and_reset_stats(ctx, tex); + } + } } void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex) { - /* If graphics is disabled, we can't decompress DCC, but it shouldn't - * be compressed either. The caller should simply discard it. - */ - if (!tex->surface.dcc_offset || !sctx->has_graphics) - return; - - si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level, - 0, util_max_layer(&tex->buffer.b.b, 0), - true, false); + /* If graphics is disabled, we can't decompress DCC, but it shouldn't + * be compressed either. The caller should simply discard it. + */ + if (!tex->surface.dcc_offset || !sctx->has_graphics) + return; + + si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level, 0, + util_max_layer(&tex->buffer.b.b, 0), true, false); } void si_init_blit_functions(struct si_context *sctx) { - sctx->b.resource_copy_region = si_resource_copy_region; + sctx->b.resource_copy_region = si_resource_copy_region; - if (sctx->has_graphics) { - sctx->b.blit = si_blit; - sctx->b.flush_resource = si_flush_resource; - sctx->b.generate_mipmap = si_generate_mipmap; - } + if (sctx->has_graphics) { + sctx->b.blit = si_blit; + sctx->b.flush_resource = si_flush_resource; + sctx->b.generate_mipmap = si_generate_mipmap; + } } diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c index 38d8e9456c2..eb71636d346 100644 --- a/src/gallium/drivers/radeonsi/si_buffer.c +++ b/src/gallium/drivers/radeonsi/si_buffer.c @@ -24,240 +24,227 @@ #include "radeonsi/si_pipe.h" #include "util/u_memory.h" -#include "util/u_upload_mgr.h" #include "util/u_transfer.h" +#include "util/u_upload_mgr.h" + #include #include -bool si_rings_is_buffer_referenced(struct si_context *sctx, - struct pb_buffer *buf, - enum radeon_bo_usage usage) +bool si_rings_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf, + enum radeon_bo_usage usage) { - if (sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, buf, usage)) { - return true; - } - if (radeon_emitted(sctx->sdma_cs, 0) && - sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, buf, usage)) { - return true; - } - return false; + if (sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, buf, usage)) { + return true; + } + if (radeon_emitted(sctx->sdma_cs, 0) && + sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, buf, usage)) { + return true; + } + return false; } -void *si_buffer_map_sync_with_rings(struct si_context *sctx, - struct si_resource *resource, - unsigned usage) +void *si_buffer_map_sync_with_rings(struct si_context *sctx, struct si_resource *resource, + unsigned usage) { - enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE; - bool busy = false; - - assert(!(resource->flags & RADEON_FLAG_SPARSE)); - - if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) { - return sctx->ws->buffer_map(resource->buf, NULL, usage); - } - - if (!(usage & PIPE_TRANSFER_WRITE)) { - /* have to wait for the last write */ - rusage = RADEON_USAGE_WRITE; - } - - if 
(radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) && - sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, - resource->buf, rusage)) { - if (usage & PIPE_TRANSFER_DONTBLOCK) { - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - return NULL; - } else { - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - busy = true; - } - } - if (radeon_emitted(sctx->sdma_cs, 0) && - sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, - resource->buf, rusage)) { - if (usage & PIPE_TRANSFER_DONTBLOCK) { - si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL); - return NULL; - } else { - si_flush_dma_cs(sctx, 0, NULL); - busy = true; - } - } - - if (busy || !sctx->ws->buffer_wait(resource->buf, 0, rusage)) { - if (usage & PIPE_TRANSFER_DONTBLOCK) { - return NULL; - } else { - /* We will be wait for the GPU. Wait for any offloaded - * CS flush to complete to avoid busy-waiting in the winsys. */ - sctx->ws->cs_sync_flush(sctx->gfx_cs); - if (sctx->sdma_cs) - sctx->ws->cs_sync_flush(sctx->sdma_cs); - } - } - - /* Setting the CS to NULL will prevent doing checks we have done already. */ - return sctx->ws->buffer_map(resource->buf, NULL, usage); + enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE; + bool busy = false; + + assert(!(resource->flags & RADEON_FLAG_SPARSE)); + + if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) { + return sctx->ws->buffer_map(resource->buf, NULL, usage); + } + + if (!(usage & PIPE_TRANSFER_WRITE)) { + /* have to wait for the last write */ + rusage = RADEON_USAGE_WRITE; + } + + if (radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) && + sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, resource->buf, rusage)) { + if (usage & PIPE_TRANSFER_DONTBLOCK) { + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + return NULL; + } else { + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + busy = true; + } + } + if (radeon_emitted(sctx->sdma_cs, 0) && + sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, resource->buf, rusage)) { + if (usage & PIPE_TRANSFER_DONTBLOCK) { + si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL); + return NULL; + } else { + si_flush_dma_cs(sctx, 0, NULL); + busy = true; + } + } + + if (busy || !sctx->ws->buffer_wait(resource->buf, 0, rusage)) { + if (usage & PIPE_TRANSFER_DONTBLOCK) { + return NULL; + } else { + /* We will be wait for the GPU. Wait for any offloaded + * CS flush to complete to avoid busy-waiting in the winsys. */ + sctx->ws->cs_sync_flush(sctx->gfx_cs); + if (sctx->sdma_cs) + sctx->ws->cs_sync_flush(sctx->sdma_cs); + } + } + + /* Setting the CS to NULL will prevent doing checks we have done already. */ + return sctx->ws->buffer_map(resource->buf, NULL, usage); } -void si_init_resource_fields(struct si_screen *sscreen, - struct si_resource *res, - uint64_t size, unsigned alignment) +void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res, uint64_t size, + unsigned alignment) { - struct si_texture *tex = (struct si_texture*)res; - - res->bo_size = size; - res->bo_alignment = alignment; - res->flags = 0; - res->texture_handle_allocated = false; - res->image_handle_allocated = false; - - switch (res->b.b.usage) { - case PIPE_USAGE_STREAM: - res->flags = RADEON_FLAG_GTT_WC; - /* fall through */ - case PIPE_USAGE_STAGING: - /* Transfers are likely to occur more often with these - * resources. 
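From the API side, the PIPE_TRANSFER_DONTBLOCK handling in si_buffer_map_sync_with_rings above is what turns a busy buffer into a NULL map instead of a stall. A hedged usage sketch (names are hypothetical; pipe_buffer_map and pipe_buffer_unmap come from util/u_inlines.h):

#include "pipe/p_context.h"
#include "util/u_inlines.h"

/* Sketch: try a non-blocking read map first; NULL means the buffer is
 * still referenced by an in-flight CS or busy on the GPU, so either do
 * other work for a while or fall back to a blocking map. */
static void example_try_read_map(struct pipe_context *ctx, struct pipe_resource *buf)
{
   struct pipe_transfer *xfer = NULL;
   void *ptr = pipe_buffer_map(ctx, buf,
                               PIPE_TRANSFER_READ | PIPE_TRANSFER_DONTBLOCK, &xfer);

   if (!ptr)
      ptr = pipe_buffer_map(ctx, buf, PIPE_TRANSFER_READ, &xfer);   /* blocking */

   if (ptr) {
      /* ... read the data ... */
      pipe_buffer_unmap(ctx, xfer);
   }
}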
*/ - res->domains = RADEON_DOMAIN_GTT; - break; - case PIPE_USAGE_DYNAMIC: - /* Older kernels didn't always flush the HDP cache before - * CS execution - */ - if (!sscreen->info.kernel_flushes_hdp_before_ib) { - res->domains = RADEON_DOMAIN_GTT; - res->flags |= RADEON_FLAG_GTT_WC; - break; - } - /* fall through */ - case PIPE_USAGE_DEFAULT: - case PIPE_USAGE_IMMUTABLE: - default: - /* Not listing GTT here improves performance in some - * apps. */ - res->domains = RADEON_DOMAIN_VRAM; - res->flags |= RADEON_FLAG_GTT_WC; - break; - } - - if (res->b.b.target == PIPE_BUFFER && - res->b.b.flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) { - /* Use GTT for all persistent mappings with older - * kernels, because they didn't always flush the HDP - * cache before CS execution. - * - * Write-combined CPU mappings are fine, the kernel - * ensures all CPU writes finish before the GPU - * executes a command stream. - * - * radeon doesn't have good BO move throttling, so put all - * persistent buffers into GTT to prevent VRAM CPU page faults. - */ - if (!sscreen->info.kernel_flushes_hdp_before_ib || - !sscreen->info.is_amdgpu) - res->domains = RADEON_DOMAIN_GTT; - } - - /* Tiled textures are unmappable. Always put them in VRAM. */ - if ((res->b.b.target != PIPE_BUFFER && !tex->surface.is_linear) || - res->b.b.flags & SI_RESOURCE_FLAG_UNMAPPABLE) { - res->domains = RADEON_DOMAIN_VRAM; - res->flags |= RADEON_FLAG_NO_CPU_ACCESS | - RADEON_FLAG_GTT_WC; - } - - /* Displayable and shareable surfaces are not suballocated. */ - if (res->b.b.bind & (PIPE_BIND_SHARED | PIPE_BIND_SCANOUT)) - res->flags |= RADEON_FLAG_NO_SUBALLOC; /* shareable */ - else - res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING; - - if (sscreen->debug_flags & DBG(NO_WC)) - res->flags &= ~RADEON_FLAG_GTT_WC; - - if (res->b.b.flags & SI_RESOURCE_FLAG_READ_ONLY) - res->flags |= RADEON_FLAG_READ_ONLY; - - if (res->b.b.flags & SI_RESOURCE_FLAG_32BIT) - res->flags |= RADEON_FLAG_32BIT; - - /* Set expected VRAM and GART usage for the buffer. */ - res->vram_usage = 0; - res->gart_usage = 0; - res->max_forced_staging_uploads = 0; - res->b.max_forced_staging_uploads = 0; - - if (res->domains & RADEON_DOMAIN_VRAM) { - res->vram_usage = size; - - res->max_forced_staging_uploads = - res->b.max_forced_staging_uploads = - sscreen->info.has_dedicated_vram && - size >= sscreen->info.vram_vis_size / 4 ? 1 : 0; - } else if (res->domains & RADEON_DOMAIN_GTT) { - res->gart_usage = size; - } + struct si_texture *tex = (struct si_texture *)res; + + res->bo_size = size; + res->bo_alignment = alignment; + res->flags = 0; + res->texture_handle_allocated = false; + res->image_handle_allocated = false; + + switch (res->b.b.usage) { + case PIPE_USAGE_STREAM: + res->flags = RADEON_FLAG_GTT_WC; + /* fall through */ + case PIPE_USAGE_STAGING: + /* Transfers are likely to occur more often with these + * resources. */ + res->domains = RADEON_DOMAIN_GTT; + break; + case PIPE_USAGE_DYNAMIC: + /* Older kernels didn't always flush the HDP cache before + * CS execution + */ + if (!sscreen->info.kernel_flushes_hdp_before_ib) { + res->domains = RADEON_DOMAIN_GTT; + res->flags |= RADEON_FLAG_GTT_WC; + break; + } + /* fall through */ + case PIPE_USAGE_DEFAULT: + case PIPE_USAGE_IMMUTABLE: + default: + /* Not listing GTT here improves performance in some + * apps. 
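A short sketch of how the usage chosen at creation time feeds this switch (illustrative only; pipe_buffer_create is the util/u_inlines.h helper, and the sizes and bind flags are arbitrary):

#include "pipe/p_screen.h"
#include "util/u_inlines.h"

/* Sketch: PIPE_USAGE_STAGING lands in cacheable GTT (CPU-read friendly),
 * while PIPE_USAGE_DEFAULT lands in VRAM with RADEON_FLAG_GTT_WC set,
 * per the switch above. */
static void example_usage_placement(struct pipe_screen *screen)
{
   struct pipe_resource *staging =
      pipe_buffer_create(screen, 0, PIPE_USAGE_STAGING, 64 * 1024);
   struct pipe_resource *gpu_buf =
      pipe_buffer_create(screen, PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_DEFAULT, 64 * 1024);

   pipe_resource_reference(&staging, NULL);
   pipe_resource_reference(&gpu_buf, NULL);
}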
*/ + res->domains = RADEON_DOMAIN_VRAM; + res->flags |= RADEON_FLAG_GTT_WC; + break; + } + + if (res->b.b.target == PIPE_BUFFER && res->b.b.flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) { + /* Use GTT for all persistent mappings with older + * kernels, because they didn't always flush the HDP + * cache before CS execution. + * + * Write-combined CPU mappings are fine, the kernel + * ensures all CPU writes finish before the GPU + * executes a command stream. + * + * radeon doesn't have good BO move throttling, so put all + * persistent buffers into GTT to prevent VRAM CPU page faults. + */ + if (!sscreen->info.kernel_flushes_hdp_before_ib || !sscreen->info.is_amdgpu) + res->domains = RADEON_DOMAIN_GTT; + } + + /* Tiled textures are unmappable. Always put them in VRAM. */ + if ((res->b.b.target != PIPE_BUFFER && !tex->surface.is_linear) || + res->b.b.flags & SI_RESOURCE_FLAG_UNMAPPABLE) { + res->domains = RADEON_DOMAIN_VRAM; + res->flags |= RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_GTT_WC; + } + + /* Displayable and shareable surfaces are not suballocated. */ + if (res->b.b.bind & (PIPE_BIND_SHARED | PIPE_BIND_SCANOUT)) + res->flags |= RADEON_FLAG_NO_SUBALLOC; /* shareable */ + else + res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING; + + if (sscreen->debug_flags & DBG(NO_WC)) + res->flags &= ~RADEON_FLAG_GTT_WC; + + if (res->b.b.flags & SI_RESOURCE_FLAG_READ_ONLY) + res->flags |= RADEON_FLAG_READ_ONLY; + + if (res->b.b.flags & SI_RESOURCE_FLAG_32BIT) + res->flags |= RADEON_FLAG_32BIT; + + /* Set expected VRAM and GART usage for the buffer. */ + res->vram_usage = 0; + res->gart_usage = 0; + res->max_forced_staging_uploads = 0; + res->b.max_forced_staging_uploads = 0; + + if (res->domains & RADEON_DOMAIN_VRAM) { + res->vram_usage = size; + + res->max_forced_staging_uploads = res->b.max_forced_staging_uploads = + sscreen->info.has_dedicated_vram && size >= sscreen->info.vram_vis_size / 4 ? 1 : 0; + } else if (res->domains & RADEON_DOMAIN_GTT) { + res->gart_usage = size; + } } -bool si_alloc_resource(struct si_screen *sscreen, - struct si_resource *res) +bool si_alloc_resource(struct si_screen *sscreen, struct si_resource *res) { - struct pb_buffer *old_buf, *new_buf; - - /* Allocate a new resource. */ - new_buf = sscreen->ws->buffer_create(sscreen->ws, res->bo_size, - res->bo_alignment, - res->domains, res->flags); - if (!new_buf) { - return false; - } - - /* Replace the pointer such that if res->buf wasn't NULL, it won't be - * NULL. This should prevent crashes with multiple contexts using - * the same buffer where one of the contexts invalidates it while - * the others are using it. */ - old_buf = res->buf; - res->buf = new_buf; /* should be atomic */ - res->gpu_address = sscreen->ws->buffer_get_virtual_address(res->buf); - - if (res->flags & RADEON_FLAG_32BIT) { - uint64_t start = res->gpu_address; - uint64_t last = start + res->bo_size - 1; - (void)start; - (void)last; - - assert((start >> 32) == sscreen->info.address32_hi); - assert((last >> 32) == sscreen->info.address32_hi); - } - - pb_reference(&old_buf, NULL); - - util_range_set_empty(&res->valid_buffer_range); - res->TC_L2_dirty = false; - - /* Print debug information. 
*/ - if (sscreen->debug_flags & DBG(VM) && res->b.b.target == PIPE_BUFFER) { - fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n", - res->gpu_address, res->gpu_address + res->buf->size, - res->buf->size); - } - - if (res->b.b.flags & SI_RESOURCE_FLAG_CLEAR) - si_screen_clear_buffer(sscreen, &res->b.b, 0, res->bo_size, 0); - - return true; + struct pb_buffer *old_buf, *new_buf; + + /* Allocate a new resource. */ + new_buf = sscreen->ws->buffer_create(sscreen->ws, res->bo_size, res->bo_alignment, res->domains, + res->flags); + if (!new_buf) { + return false; + } + + /* Replace the pointer such that if res->buf wasn't NULL, it won't be + * NULL. This should prevent crashes with multiple contexts using + * the same buffer where one of the contexts invalidates it while + * the others are using it. */ + old_buf = res->buf; + res->buf = new_buf; /* should be atomic */ + res->gpu_address = sscreen->ws->buffer_get_virtual_address(res->buf); + + if (res->flags & RADEON_FLAG_32BIT) { + uint64_t start = res->gpu_address; + uint64_t last = start + res->bo_size - 1; + (void)start; + (void)last; + + assert((start >> 32) == sscreen->info.address32_hi); + assert((last >> 32) == sscreen->info.address32_hi); + } + + pb_reference(&old_buf, NULL); + + util_range_set_empty(&res->valid_buffer_range); + res->TC_L2_dirty = false; + + /* Print debug information. */ + if (sscreen->debug_flags & DBG(VM) && res->b.b.target == PIPE_BUFFER) { + fprintf(stderr, "VM start=0x%" PRIX64 " end=0x%" PRIX64 " | Buffer %" PRIu64 " bytes\n", + res->gpu_address, res->gpu_address + res->buf->size, res->buf->size); + } + + if (res->b.b.flags & SI_RESOURCE_FLAG_CLEAR) + si_screen_clear_buffer(sscreen, &res->b.b, 0, res->bo_size, 0); + + return true; } -static void si_buffer_destroy(struct pipe_screen *screen, - struct pipe_resource *buf) +static void si_buffer_destroy(struct pipe_screen *screen, struct pipe_resource *buf) { - struct si_resource *buffer = si_resource(buf); + struct si_resource *buffer = si_resource(buf); - threaded_resource_deinit(buf); - util_range_destroy(&buffer->valid_buffer_range); - pb_reference(&buffer->buf, NULL); - FREE(buffer); + threaded_resource_deinit(buf); + util_range_destroy(&buffer->valid_buffer_range); + pb_reference(&buffer->buf, NULL); + FREE(buffer); } /* Reallocate the buffer a update all resource bindings where the buffer is @@ -266,560 +253,511 @@ static void si_buffer_destroy(struct pipe_screen *screen, * This is used to avoid CPU-GPU synchronizations, because it makes the buffer * idle by discarding its contents. */ -static bool -si_invalidate_buffer(struct si_context *sctx, - struct si_resource *buf) +static bool si_invalidate_buffer(struct si_context *sctx, struct si_resource *buf) { - /* Shared buffers can't be reallocated. */ - if (buf->b.is_shared) - return false; - - /* Sparse buffers can't be reallocated. */ - if (buf->flags & RADEON_FLAG_SPARSE) - return false; - - /* In AMD_pinned_memory, the user pointer association only gets - * broken when the buffer is explicitly re-allocated. - */ - if (buf->b.is_user_ptr) - return false; - - /* Check if mapping this buffer would cause waiting for the GPU. */ - if (si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) || - !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) { - /* Reallocate the buffer in the same pipe_resource. 
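For context, the two API-level triggers that reach this reallocation path are an explicit invalidate and a whole-resource-discard map. A hedged sketch (names are hypothetical; invalidate_resource and the transfer flags are standard Gallium):

#include "pipe/p_context.h"
#include "util/u_inlines.h"

/* Sketch: both calls tell the driver the old contents are dead, so it
 * may swap in fresh, idle storage instead of waiting for the GPU. */
static void example_orphan_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
{
   /* 1) Explicit invalidation (e.g. glInvalidateBufferData). */
   ctx->invalidate_resource(ctx, buf);

   /* 2) Implicit: a write map that discards the whole resource
    *    (e.g. glBufferData orphaning). */
   struct pipe_transfer *xfer;
   void *ptr = pipe_buffer_map(ctx, buf,
                               PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE,
                               &xfer);
   if (ptr)
      pipe_buffer_unmap(ctx, xfer);
}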
*/ - si_alloc_resource(sctx->screen, buf); - si_rebind_buffer(sctx, &buf->b.b); - } else { - util_range_set_empty(&buf->valid_buffer_range); - } - - return true; + /* Shared buffers can't be reallocated. */ + if (buf->b.is_shared) + return false; + + /* Sparse buffers can't be reallocated. */ + if (buf->flags & RADEON_FLAG_SPARSE) + return false; + + /* In AMD_pinned_memory, the user pointer association only gets + * broken when the buffer is explicitly re-allocated. + */ + if (buf->b.is_user_ptr) + return false; + + /* Check if mapping this buffer would cause waiting for the GPU. */ + if (si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) || + !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) { + /* Reallocate the buffer in the same pipe_resource. */ + si_alloc_resource(sctx->screen, buf); + si_rebind_buffer(sctx, &buf->b.b); + } else { + util_range_set_empty(&buf->valid_buffer_range); + } + + return true; } /* Replace the storage of dst with src. */ -void si_replace_buffer_storage(struct pipe_context *ctx, - struct pipe_resource *dst, - struct pipe_resource *src) +void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst, + struct pipe_resource *src) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_resource *sdst = si_resource(dst); - struct si_resource *ssrc = si_resource(src); - - pb_reference(&sdst->buf, ssrc->buf); - sdst->gpu_address = ssrc->gpu_address; - sdst->b.b.bind = ssrc->b.b.bind; - sdst->b.max_forced_staging_uploads = ssrc->b.max_forced_staging_uploads; - sdst->max_forced_staging_uploads = ssrc->max_forced_staging_uploads; - sdst->flags = ssrc->flags; - - assert(sdst->vram_usage == ssrc->vram_usage); - assert(sdst->gart_usage == ssrc->gart_usage); - assert(sdst->bo_size == ssrc->bo_size); - assert(sdst->bo_alignment == ssrc->bo_alignment); - assert(sdst->domains == ssrc->domains); - - si_rebind_buffer(sctx, dst); + struct si_context *sctx = (struct si_context *)ctx; + struct si_resource *sdst = si_resource(dst); + struct si_resource *ssrc = si_resource(src); + + pb_reference(&sdst->buf, ssrc->buf); + sdst->gpu_address = ssrc->gpu_address; + sdst->b.b.bind = ssrc->b.b.bind; + sdst->b.max_forced_staging_uploads = ssrc->b.max_forced_staging_uploads; + sdst->max_forced_staging_uploads = ssrc->max_forced_staging_uploads; + sdst->flags = ssrc->flags; + + assert(sdst->vram_usage == ssrc->vram_usage); + assert(sdst->gart_usage == ssrc->gart_usage); + assert(sdst->bo_size == ssrc->bo_size); + assert(sdst->bo_alignment == ssrc->bo_alignment); + assert(sdst->domains == ssrc->domains); + + si_rebind_buffer(sctx, dst); } -static void si_invalidate_resource(struct pipe_context *ctx, - struct pipe_resource *resource) +static void si_invalidate_resource(struct pipe_context *ctx, struct pipe_resource *resource) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_resource *buf = si_resource(resource); + struct si_context *sctx = (struct si_context *)ctx; + struct si_resource *buf = si_resource(resource); - /* We currently only do anyting here for buffers */ - if (resource->target == PIPE_BUFFER) - (void)si_invalidate_buffer(sctx, buf); + /* We currently only do anyting here for buffers */ + if (resource->target == PIPE_BUFFER) + (void)si_invalidate_buffer(sctx, buf); } -static void *si_buffer_get_transfer(struct pipe_context *ctx, - struct pipe_resource *resource, - unsigned usage, - const struct pipe_box *box, - struct pipe_transfer **ptransfer, - void *data, struct si_resource *staging, - unsigned 
offset) +static void *si_buffer_get_transfer(struct pipe_context *ctx, struct pipe_resource *resource, + unsigned usage, const struct pipe_box *box, + struct pipe_transfer **ptransfer, void *data, + struct si_resource *staging, unsigned offset) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_transfer *transfer; - - if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) - transfer = slab_alloc(&sctx->pool_transfers_unsync); - else - transfer = slab_alloc(&sctx->pool_transfers); - - transfer->b.b.resource = NULL; - pipe_resource_reference(&transfer->b.b.resource, resource); - transfer->b.b.level = 0; - transfer->b.b.usage = usage; - transfer->b.b.box = *box; - transfer->b.b.stride = 0; - transfer->b.b.layer_stride = 0; - transfer->b.staging = NULL; - transfer->offset = offset; - transfer->staging = staging; - *ptransfer = &transfer->b.b; - return data; + struct si_context *sctx = (struct si_context *)ctx; + struct si_transfer *transfer; + + if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) + transfer = slab_alloc(&sctx->pool_transfers_unsync); + else + transfer = slab_alloc(&sctx->pool_transfers); + + transfer->b.b.resource = NULL; + pipe_resource_reference(&transfer->b.b.resource, resource); + transfer->b.b.level = 0; + transfer->b.b.usage = usage; + transfer->b.b.box = *box; + transfer->b.b.stride = 0; + transfer->b.b.layer_stride = 0; + transfer->b.staging = NULL; + transfer->offset = offset; + transfer->staging = staging; + *ptransfer = &transfer->b.b; + return data; } -static void *si_buffer_transfer_map(struct pipe_context *ctx, - struct pipe_resource *resource, - unsigned level, - unsigned usage, - const struct pipe_box *box, - struct pipe_transfer **ptransfer) +static void *si_buffer_transfer_map(struct pipe_context *ctx, struct pipe_resource *resource, + unsigned level, unsigned usage, const struct pipe_box *box, + struct pipe_transfer **ptransfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_resource *buf = si_resource(resource); - uint8_t *data; - - assert(box->x + box->width <= resource->width0); - - /* From GL_AMD_pinned_memory issues: - * - * 4) Is glMapBuffer on a shared buffer guaranteed to return the - * same system address which was specified at creation time? - * - * RESOLVED: NO. The GL implementation might return a different - * virtual mapping of that memory, although the same physical - * page will be used. - * - * So don't ever use staging buffers. - */ - if (buf->b.is_user_ptr) - usage |= PIPE_TRANSFER_PERSISTENT; - - /* See if the buffer range being mapped has never been initialized, - * in which case it can be mapped unsynchronized. */ - if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | - TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) && - usage & PIPE_TRANSFER_WRITE && - !buf->b.is_shared && - !util_ranges_intersect(&buf->valid_buffer_range, box->x, box->x + box->width)) { - usage |= PIPE_TRANSFER_UNSYNCHRONIZED; - } - - /* If discarding the entire range, discard the whole resource instead. */ - if (usage & PIPE_TRANSFER_DISCARD_RANGE && - box->x == 0 && box->width == resource->width0) { - usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE; - } - - /* If a buffer in VRAM is too large and the range is discarded, don't - * map it directly. This makes sure that the buffer stays in VRAM. - */ - bool force_discard_range = false; - if (usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE | - PIPE_TRANSFER_DISCARD_RANGE) && - !(usage & PIPE_TRANSFER_PERSISTENT) && - /* Try not to decrement the counter if it's not positive. 
Still racy, - * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */ - buf->max_forced_staging_uploads > 0 && - p_atomic_dec_return(&buf->max_forced_staging_uploads) >= 0) { - usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE | - PIPE_TRANSFER_UNSYNCHRONIZED); - usage |= PIPE_TRANSFER_DISCARD_RANGE; - force_discard_range = true; - } - - if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE && - !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | - TC_TRANSFER_MAP_NO_INVALIDATE))) { - assert(usage & PIPE_TRANSFER_WRITE); - - if (si_invalidate_buffer(sctx, buf)) { - /* At this point, the buffer is always idle. */ - usage |= PIPE_TRANSFER_UNSYNCHRONIZED; - } else { - /* Fall back to a temporary buffer. */ - usage |= PIPE_TRANSFER_DISCARD_RANGE; - } - } - - if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT && - buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) { - usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED | - PIPE_TRANSFER_PERSISTENT); - usage |= PIPE_TRANSFER_DISCARD_RANGE; - force_discard_range = true; - } - - if (usage & PIPE_TRANSFER_DISCARD_RANGE && - ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | - PIPE_TRANSFER_PERSISTENT))) || - (buf->flags & RADEON_FLAG_SPARSE))) { - assert(usage & PIPE_TRANSFER_WRITE); - - /* Check if mapping this buffer would cause waiting for the GPU. - */ - if (buf->flags & RADEON_FLAG_SPARSE || - force_discard_range || - si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) || - !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) { - /* Do a wait-free write-only transfer using a temporary buffer. */ - struct u_upload_mgr *uploader; - struct si_resource *staging = NULL; - unsigned offset; - - /* If we are not called from the driver thread, we have - * to use the uploader from u_threaded_context, which is - * local to the calling thread. - */ - if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) - uploader = sctx->tc->base.stream_uploader; - else - uploader = sctx->b.stream_uploader; - - u_upload_alloc(uploader, 0, - box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT), - sctx->screen->info.tcc_cache_line_size, - &offset, (struct pipe_resource**)&staging, - (void**)&data); - - if (staging) { - data += box->x % SI_MAP_BUFFER_ALIGNMENT; - return si_buffer_get_transfer(ctx, resource, usage, box, - ptransfer, data, staging, offset); - } else if (buf->flags & RADEON_FLAG_SPARSE) { - return NULL; - } - } else { - /* At this point, the buffer is always idle (we checked it above). */ - usage |= PIPE_TRANSFER_UNSYNCHRONIZED; - } - } - /* Use a staging buffer in cached GTT for reads. */ - else if (((usage & PIPE_TRANSFER_READ) && - !(usage & PIPE_TRANSFER_PERSISTENT) && - (buf->domains & RADEON_DOMAIN_VRAM || - buf->flags & RADEON_FLAG_GTT_WC)) || - (buf->flags & RADEON_FLAG_SPARSE)) { - struct si_resource *staging; - - assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC)); - staging = si_resource(pipe_buffer_create( - ctx->screen, 0, PIPE_USAGE_STAGING, - box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT))); - if (staging) { - /* Copy the VRAM buffer to the staging buffer. 
*/ - si_sdma_copy_buffer(sctx, &staging->b.b, resource, - box->x % SI_MAP_BUFFER_ALIGNMENT, - box->x, box->width); - - data = si_buffer_map_sync_with_rings(sctx, staging, - usage & ~PIPE_TRANSFER_UNSYNCHRONIZED); - if (!data) { - si_resource_reference(&staging, NULL); - return NULL; - } - data += box->x % SI_MAP_BUFFER_ALIGNMENT; - - return si_buffer_get_transfer(ctx, resource, usage, box, - ptransfer, data, staging, 0); - } else if (buf->flags & RADEON_FLAG_SPARSE) { - return NULL; - } - } - - data = si_buffer_map_sync_with_rings(sctx, buf, usage); - if (!data) { - return NULL; - } - data += box->x; - - return si_buffer_get_transfer(ctx, resource, usage, box, - ptransfer, data, NULL, 0); + struct si_context *sctx = (struct si_context *)ctx; + struct si_resource *buf = si_resource(resource); + uint8_t *data; + + assert(box->x + box->width <= resource->width0); + + /* From GL_AMD_pinned_memory issues: + * + * 4) Is glMapBuffer on a shared buffer guaranteed to return the + * same system address which was specified at creation time? + * + * RESOLVED: NO. The GL implementation might return a different + * virtual mapping of that memory, although the same physical + * page will be used. + * + * So don't ever use staging buffers. + */ + if (buf->b.is_user_ptr) + usage |= PIPE_TRANSFER_PERSISTENT; + + /* See if the buffer range being mapped has never been initialized, + * in which case it can be mapped unsynchronized. */ + if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) && + usage & PIPE_TRANSFER_WRITE && !buf->b.is_shared && + !util_ranges_intersect(&buf->valid_buffer_range, box->x, box->x + box->width)) { + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + } + + /* If discarding the entire range, discard the whole resource instead. */ + if (usage & PIPE_TRANSFER_DISCARD_RANGE && box->x == 0 && box->width == resource->width0) { + usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE; + } + + /* If a buffer in VRAM is too large and the range is discarded, don't + * map it directly. This makes sure that the buffer stays in VRAM. + */ + bool force_discard_range = false; + if (usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE | PIPE_TRANSFER_DISCARD_RANGE) && + !(usage & PIPE_TRANSFER_PERSISTENT) && + /* Try not to decrement the counter if it's not positive. Still racy, + * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */ + buf->max_forced_staging_uploads > 0 && + p_atomic_dec_return(&buf->max_forced_staging_uploads) >= 0) { + usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE | PIPE_TRANSFER_UNSYNCHRONIZED); + usage |= PIPE_TRANSFER_DISCARD_RANGE; + force_discard_range = true; + } + + if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE && + !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | TC_TRANSFER_MAP_NO_INVALIDATE))) { + assert(usage & PIPE_TRANSFER_WRITE); + + if (si_invalidate_buffer(sctx, buf)) { + /* At this point, the buffer is always idle. */ + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + } else { + /* Fall back to a temporary buffer. 
*/ + usage |= PIPE_TRANSFER_DISCARD_RANGE; + } + } + + if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT && + buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) { + usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_PERSISTENT); + usage |= PIPE_TRANSFER_DISCARD_RANGE; + force_discard_range = true; + } + + if (usage & PIPE_TRANSFER_DISCARD_RANGE && + ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_PERSISTENT))) || + (buf->flags & RADEON_FLAG_SPARSE))) { + assert(usage & PIPE_TRANSFER_WRITE); + + /* Check if mapping this buffer would cause waiting for the GPU. + */ + if (buf->flags & RADEON_FLAG_SPARSE || force_discard_range || + si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) || + !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) { + /* Do a wait-free write-only transfer using a temporary buffer. */ + struct u_upload_mgr *uploader; + struct si_resource *staging = NULL; + unsigned offset; + + /* If we are not called from the driver thread, we have + * to use the uploader from u_threaded_context, which is + * local to the calling thread. + */ + if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) + uploader = sctx->tc->base.stream_uploader; + else + uploader = sctx->b.stream_uploader; + + u_upload_alloc(uploader, 0, box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT), + sctx->screen->info.tcc_cache_line_size, &offset, + (struct pipe_resource **)&staging, (void **)&data); + + if (staging) { + data += box->x % SI_MAP_BUFFER_ALIGNMENT; + return si_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, staging, + offset); + } else if (buf->flags & RADEON_FLAG_SPARSE) { + return NULL; + } + } else { + /* At this point, the buffer is always idle (we checked it above). */ + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + } + } + /* Use a staging buffer in cached GTT for reads. */ + else if (((usage & PIPE_TRANSFER_READ) && !(usage & PIPE_TRANSFER_PERSISTENT) && + (buf->domains & RADEON_DOMAIN_VRAM || buf->flags & RADEON_FLAG_GTT_WC)) || + (buf->flags & RADEON_FLAG_SPARSE)) { + struct si_resource *staging; + + assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC)); + staging = si_resource(pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_STAGING, + box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT))); + if (staging) { + /* Copy the VRAM buffer to the staging buffer. 
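The staging offset bookkeeping used in both staging paths above reduces to the arithmetic below (standalone sketch; the alignment value is an illustrative stand-in for SI_MAP_BUFFER_ALIGNMENT):

/* Sketch: the staging allocation is padded by (box->x % alignment) and
 * the returned pointer is advanced by the same amount, so the staging
 * range keeps the same sub-alignment offset as the original range. */
static void example_staging_offset(void)
{
   const unsigned alignment = 64;        /* illustrative stand-in */
   unsigned box_x = 100, box_width = 32;

   unsigned pad             = box_x % alignment;  /* 36 */
   unsigned alloc_size      = box_width + pad;    /* 68 bytes allocated */
   unsigned user_ptr_offset = pad;                /* data += 36 before returning */

   (void)alloc_size;
   (void)user_ptr_offset;
}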
*/ + si_sdma_copy_buffer(sctx, &staging->b.b, resource, box->x % SI_MAP_BUFFER_ALIGNMENT, + box->x, box->width); + + data = si_buffer_map_sync_with_rings(sctx, staging, usage & ~PIPE_TRANSFER_UNSYNCHRONIZED); + if (!data) { + si_resource_reference(&staging, NULL); + return NULL; + } + data += box->x % SI_MAP_BUFFER_ALIGNMENT; + + return si_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, staging, 0); + } else if (buf->flags & RADEON_FLAG_SPARSE) { + return NULL; + } + } + + data = si_buffer_map_sync_with_rings(sctx, buf, usage); + if (!data) { + return NULL; + } + data += box->x; + + return si_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, NULL, 0); } -static void si_buffer_do_flush_region(struct pipe_context *ctx, - struct pipe_transfer *transfer, - const struct pipe_box *box) +static void si_buffer_do_flush_region(struct pipe_context *ctx, struct pipe_transfer *transfer, + const struct pipe_box *box) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_transfer *stransfer = (struct si_transfer*)transfer; - struct si_resource *buf = si_resource(transfer->resource); - - if (stransfer->staging) { - unsigned src_offset = stransfer->offset + - transfer->box.x % SI_MAP_BUFFER_ALIGNMENT + - (box->x - transfer->box.x); - - if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) { - /* This should be true for all uploaders. */ - assert(transfer->box.x == 0); - - /* Find a previous upload and extend its range. The last - * upload is likely to be at the end of the list. - */ - for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) { - struct si_sdma_upload *up = &sctx->sdma_uploads[i]; - - if (up->dst != buf) - continue; - - assert(up->src == stransfer->staging); - assert(box->x > up->dst_offset); - up->size = box->x + box->width - up->dst_offset; - return; - } - - /* Enlarge the array if it's full. */ - if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) { - unsigned size; - - sctx->max_sdma_uploads += 4; - size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]); - sctx->sdma_uploads = realloc(sctx->sdma_uploads, size); - } - - /* Add a new upload. */ - struct si_sdma_upload *up = - &sctx->sdma_uploads[sctx->num_sdma_uploads++]; - up->dst = up->src = NULL; - si_resource_reference(&up->dst, buf); - si_resource_reference(&up->src, stransfer->staging); - up->dst_offset = box->x; - up->src_offset = src_offset; - up->size = box->width; - return; - } - - /* Copy the staging buffer into the original one. */ - si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b, - box->x, src_offset, box->width); - } - - util_range_add(&buf->b.b, &buf->valid_buffer_range, box->x, - box->x + box->width); + struct si_context *sctx = (struct si_context *)ctx; + struct si_transfer *stransfer = (struct si_transfer *)transfer; + struct si_resource *buf = si_resource(transfer->resource); + + if (stransfer->staging) { + unsigned src_offset = + stransfer->offset + transfer->box.x % SI_MAP_BUFFER_ALIGNMENT + (box->x - transfer->box.x); + + if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) { + /* This should be true for all uploaders. */ + assert(transfer->box.x == 0); + + /* Find a previous upload and extend its range. The last + * upload is likely to be at the end of the list. 
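The range-extension logic above amounts to the following (standalone arithmetic sketch with made-up numbers):

/* Sketch: an earlier explicit flush queued an SDMA upload for [0, 256).
 * A later flush of [256, 320) on the same destination does not queue a
 * second upload; it just grows the first one to cover [0, 320). */
static void example_extend_sdma_upload(void)
{
   unsigned up_dst_offset = 0;     /* earlier upload */
   unsigned up_size       = 256;

   unsigned box_x = 256, box_width = 64;   /* newly flushed range */

   up_size = box_x + box_width - up_dst_offset;   /* now 320 */
   (void)up_size;
}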
+ */ + for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) { + struct si_sdma_upload *up = &sctx->sdma_uploads[i]; + + if (up->dst != buf) + continue; + + assert(up->src == stransfer->staging); + assert(box->x > up->dst_offset); + up->size = box->x + box->width - up->dst_offset; + return; + } + + /* Enlarge the array if it's full. */ + if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) { + unsigned size; + + sctx->max_sdma_uploads += 4; + size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]); + sctx->sdma_uploads = realloc(sctx->sdma_uploads, size); + } + + /* Add a new upload. */ + struct si_sdma_upload *up = &sctx->sdma_uploads[sctx->num_sdma_uploads++]; + up->dst = up->src = NULL; + si_resource_reference(&up->dst, buf); + si_resource_reference(&up->src, stransfer->staging); + up->dst_offset = box->x; + up->src_offset = src_offset; + up->size = box->width; + return; + } + + /* Copy the staging buffer into the original one. */ + si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b, box->x, src_offset, + box->width); + } + + util_range_add(&buf->b.b, &buf->valid_buffer_range, box->x, box->x + box->width); } -static void si_buffer_flush_region(struct pipe_context *ctx, - struct pipe_transfer *transfer, - const struct pipe_box *rel_box) +static void si_buffer_flush_region(struct pipe_context *ctx, struct pipe_transfer *transfer, + const struct pipe_box *rel_box) { - unsigned required_usage = PIPE_TRANSFER_WRITE | - PIPE_TRANSFER_FLUSH_EXPLICIT; + unsigned required_usage = PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT; - if ((transfer->usage & required_usage) == required_usage) { - struct pipe_box box; + if ((transfer->usage & required_usage) == required_usage) { + struct pipe_box box; - u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box); - si_buffer_do_flush_region(ctx, transfer, &box); - } + u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box); + si_buffer_do_flush_region(ctx, transfer, &box); + } } -static void si_buffer_transfer_unmap(struct pipe_context *ctx, - struct pipe_transfer *transfer) +static void si_buffer_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer *transfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_transfer *stransfer = (struct si_transfer*)transfer; + struct si_context *sctx = (struct si_context *)ctx; + struct si_transfer *stransfer = (struct si_transfer *)transfer; - if (transfer->usage & PIPE_TRANSFER_WRITE && - !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) - si_buffer_do_flush_region(ctx, transfer, &transfer->box); + if (transfer->usage & PIPE_TRANSFER_WRITE && !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) + si_buffer_do_flush_region(ctx, transfer, &transfer->box); - si_resource_reference(&stransfer->staging, NULL); - assert(stransfer->b.staging == NULL); /* for threaded context only */ - pipe_resource_reference(&transfer->resource, NULL); + si_resource_reference(&stransfer->staging, NULL); + assert(stransfer->b.staging == NULL); /* for threaded context only */ + pipe_resource_reference(&transfer->resource, NULL); - /* Don't use pool_transfers_unsync. We are always in the driver - * thread. */ - slab_free(&sctx->pool_transfers, transfer); + /* Don't use pool_transfers_unsync. We are always in the driver + * thread. 
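For reference, the transfer pattern that exercises si_buffer_flush_region above (hedged sketch; names are hypothetical, while transfer_map, transfer_flush_region and transfer_unmap are the standard pipe_context hooks):

#include <stdint.h>
#include <string.h>

#include "pipe/p_context.h"
#include "util/u_box.h"

/* Sketch: with FLUSH_EXPLICIT, only the sub-ranges the caller flushes
 * are written back; without it, unmap flushes the whole mapped box. */
static void example_explicit_flush(struct pipe_context *ctx,
                                   struct pipe_resource *buf, const void *data)
{
   struct pipe_transfer *xfer;
   struct pipe_box box;
   uint8_t *map;

   u_box_1d(0, 4096, &box);
   map = ctx->transfer_map(ctx, buf, 0,
                           PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT,
                           &box, &xfer);
   if (!map)
      return;

   memcpy(map, data, 256);      /* only the first 256 bytes were written */

   u_box_1d(0, 256, &box);      /* flush box is relative to the mapping */
   ctx->transfer_flush_region(ctx, xfer, &box);
   ctx->transfer_unmap(ctx, xfer);
}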
*/ + slab_free(&sctx->pool_transfers, transfer); } -static void si_buffer_subdata(struct pipe_context *ctx, - struct pipe_resource *buffer, - unsigned usage, unsigned offset, - unsigned size, const void *data) +static void si_buffer_subdata(struct pipe_context *ctx, struct pipe_resource *buffer, + unsigned usage, unsigned offset, unsigned size, const void *data) { - struct pipe_transfer *transfer = NULL; - struct pipe_box box; - uint8_t *map = NULL; + struct pipe_transfer *transfer = NULL; + struct pipe_box box; + uint8_t *map = NULL; - usage |= PIPE_TRANSFER_WRITE; + usage |= PIPE_TRANSFER_WRITE; - if (!(usage & PIPE_TRANSFER_MAP_DIRECTLY)) - usage |= PIPE_TRANSFER_DISCARD_RANGE; + if (!(usage & PIPE_TRANSFER_MAP_DIRECTLY)) + usage |= PIPE_TRANSFER_DISCARD_RANGE; - u_box_1d(offset, size, &box); - map = si_buffer_transfer_map(ctx, buffer, 0, usage, &box, &transfer); - if (!map) - return; + u_box_1d(offset, size, &box); + map = si_buffer_transfer_map(ctx, buffer, 0, usage, &box, &transfer); + if (!map) + return; - memcpy(map, data, size); - si_buffer_transfer_unmap(ctx, transfer); + memcpy(map, data, size); + si_buffer_transfer_unmap(ctx, transfer); } -static const struct u_resource_vtbl si_buffer_vtbl = -{ - NULL, /* get_handle */ - si_buffer_destroy, /* resource_destroy */ - si_buffer_transfer_map, /* transfer_map */ - si_buffer_flush_region, /* transfer_flush_region */ - si_buffer_transfer_unmap, /* transfer_unmap */ +static const struct u_resource_vtbl si_buffer_vtbl = { + NULL, /* get_handle */ + si_buffer_destroy, /* resource_destroy */ + si_buffer_transfer_map, /* transfer_map */ + si_buffer_flush_region, /* transfer_flush_region */ + si_buffer_transfer_unmap, /* transfer_unmap */ }; -static struct si_resource * -si_alloc_buffer_struct(struct pipe_screen *screen, - const struct pipe_resource *templ) +static struct si_resource *si_alloc_buffer_struct(struct pipe_screen *screen, + const struct pipe_resource *templ) { - struct si_resource *buf; + struct si_resource *buf; - buf = MALLOC_STRUCT(si_resource); + buf = MALLOC_STRUCT(si_resource); - buf->b.b = *templ; - buf->b.b.next = NULL; - pipe_reference_init(&buf->b.b.reference, 1); - buf->b.b.screen = screen; + buf->b.b = *templ; + buf->b.b.next = NULL; + pipe_reference_init(&buf->b.b.reference, 1); + buf->b.b.screen = screen; - buf->b.vtbl = &si_buffer_vtbl; - threaded_resource_init(&buf->b.b); + buf->b.vtbl = &si_buffer_vtbl; + threaded_resource_init(&buf->b.b); - buf->buf = NULL; - buf->bind_history = 0; - buf->TC_L2_dirty = false; - util_range_init(&buf->valid_buffer_range); - return buf; + buf->buf = NULL; + buf->bind_history = 0; + buf->TC_L2_dirty = false; + util_range_init(&buf->valid_buffer_range); + return buf; } static struct pipe_resource *si_buffer_create(struct pipe_screen *screen, - const struct pipe_resource *templ, - unsigned alignment) + const struct pipe_resource *templ, unsigned alignment) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_resource *buf = si_alloc_buffer_struct(screen, templ); + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_resource *buf = si_alloc_buffer_struct(screen, templ); - if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE) - buf->b.b.flags |= SI_RESOURCE_FLAG_UNMAPPABLE; + if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE) + buf->b.b.flags |= SI_RESOURCE_FLAG_UNMAPPABLE; - si_init_resource_fields(sscreen, buf, templ->width0, alignment); + si_init_resource_fields(sscreen, buf, templ->width0, alignment); - if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE) - buf->flags 
|= RADEON_FLAG_SPARSE; + if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE) + buf->flags |= RADEON_FLAG_SPARSE; - if (!si_alloc_resource(sscreen, buf)) { - FREE(buf); - return NULL; - } - return &buf->b.b; + if (!si_alloc_resource(sscreen, buf)) { + FREE(buf); + return NULL; + } + return &buf->b.b; } -struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, - unsigned flags, unsigned usage, - unsigned size, unsigned alignment) +struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, unsigned flags, + unsigned usage, unsigned size, unsigned alignment) { - struct pipe_resource buffer; - - memset(&buffer, 0, sizeof buffer); - buffer.target = PIPE_BUFFER; - buffer.format = PIPE_FORMAT_R8_UNORM; - buffer.bind = 0; - buffer.usage = usage; - buffer.flags = flags; - buffer.width0 = size; - buffer.height0 = 1; - buffer.depth0 = 1; - buffer.array_size = 1; - return si_buffer_create(screen, &buffer, alignment); + struct pipe_resource buffer; + + memset(&buffer, 0, sizeof buffer); + buffer.target = PIPE_BUFFER; + buffer.format = PIPE_FORMAT_R8_UNORM; + buffer.bind = 0; + buffer.usage = usage; + buffer.flags = flags; + buffer.width0 = size; + buffer.height0 = 1; + buffer.depth0 = 1; + buffer.array_size = 1; + return si_buffer_create(screen, &buffer, alignment); } -struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, - unsigned flags, unsigned usage, - unsigned size, unsigned alignment) +struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags, + unsigned usage, unsigned size, unsigned alignment) { - return si_resource(pipe_aligned_buffer_create(screen, flags, usage, - size, alignment)); + return si_resource(pipe_aligned_buffer_create(screen, flags, usage, size, alignment)); } -static struct pipe_resource * -si_buffer_from_user_memory(struct pipe_screen *screen, - const struct pipe_resource *templ, - void *user_memory) +static struct pipe_resource *si_buffer_from_user_memory(struct pipe_screen *screen, + const struct pipe_resource *templ, + void *user_memory) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct radeon_winsys *ws = sscreen->ws; - struct si_resource *buf = si_alloc_buffer_struct(screen, templ); - - buf->domains = RADEON_DOMAIN_GTT; - buf->flags = 0; - buf->b.is_user_ptr = true; - util_range_add(&buf->b.b, &buf->valid_buffer_range, 0, templ->width0); - util_range_add(&buf->b.b, &buf->b.valid_buffer_range, 0, templ->width0); - - /* Convert a user pointer to a buffer. */ - buf->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0); - if (!buf->buf) { - FREE(buf); - return NULL; - } - - buf->gpu_address = ws->buffer_get_virtual_address(buf->buf); - buf->vram_usage = 0; - buf->gart_usage = templ->width0; - - return &buf->b.b; + struct si_screen *sscreen = (struct si_screen *)screen; + struct radeon_winsys *ws = sscreen->ws; + struct si_resource *buf = si_alloc_buffer_struct(screen, templ); + + buf->domains = RADEON_DOMAIN_GTT; + buf->flags = 0; + buf->b.is_user_ptr = true; + util_range_add(&buf->b.b, &buf->valid_buffer_range, 0, templ->width0); + util_range_add(&buf->b.b, &buf->b.valid_buffer_range, 0, templ->width0); + + /* Convert a user pointer to a buffer. 
*/ + buf->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0); + if (!buf->buf) { + FREE(buf); + return NULL; + } + + buf->gpu_address = ws->buffer_get_virtual_address(buf->buf); + buf->vram_usage = 0; + buf->gart_usage = templ->width0; + + return &buf->b.b; } static struct pipe_resource *si_resource_create(struct pipe_screen *screen, - const struct pipe_resource *templ) + const struct pipe_resource *templ) { - if (templ->target == PIPE_BUFFER) { - return si_buffer_create(screen, templ, 256); - } else { - return si_texture_create(screen, templ); - } + if (templ->target == PIPE_BUFFER) { + return si_buffer_create(screen, templ, 256); + } else { + return si_texture_create(screen, templ); + } } -static bool si_resource_commit(struct pipe_context *pctx, - struct pipe_resource *resource, - unsigned level, struct pipe_box *box, - bool commit) +static bool si_resource_commit(struct pipe_context *pctx, struct pipe_resource *resource, + unsigned level, struct pipe_box *box, bool commit) { - struct si_context *ctx = (struct si_context *)pctx; - struct si_resource *res = si_resource(resource); - - /* - * Since buffer commitment changes cannot be pipelined, we need to - * (a) flush any pending commands that refer to the buffer we're about - * to change, and - * (b) wait for threaded submit to finish, including those that were - * triggered by some other, earlier operation. - */ - if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) && - ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, - res->buf, RADEON_USAGE_READWRITE)) { - si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - } - if (radeon_emitted(ctx->sdma_cs, 0) && - ctx->ws->cs_is_buffer_referenced(ctx->sdma_cs, - res->buf, RADEON_USAGE_READWRITE)) { - si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL); - } - - if (ctx->sdma_cs) - ctx->ws->cs_sync_flush(ctx->sdma_cs); - ctx->ws->cs_sync_flush(ctx->gfx_cs); - - assert(resource->target == PIPE_BUFFER); - - return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit); + struct si_context *ctx = (struct si_context *)pctx; + struct si_resource *res = si_resource(resource); + + /* + * Since buffer commitment changes cannot be pipelined, we need to + * (a) flush any pending commands that refer to the buffer we're about + * to change, and + * (b) wait for threaded submit to finish, including those that were + * triggered by some other, earlier operation. 
+ */ + if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) && + ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, res->buf, RADEON_USAGE_READWRITE)) { + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + } + if (radeon_emitted(ctx->sdma_cs, 0) && + ctx->ws->cs_is_buffer_referenced(ctx->sdma_cs, res->buf, RADEON_USAGE_READWRITE)) { + si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL); + } + + if (ctx->sdma_cs) + ctx->ws->cs_sync_flush(ctx->sdma_cs); + ctx->ws->cs_sync_flush(ctx->gfx_cs); + + assert(resource->target == PIPE_BUFFER); + + return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit); } void si_init_screen_buffer_functions(struct si_screen *sscreen) { - sscreen->b.resource_create = si_resource_create; - sscreen->b.resource_destroy = u_resource_destroy_vtbl; - sscreen->b.resource_from_user_memory = si_buffer_from_user_memory; + sscreen->b.resource_create = si_resource_create; + sscreen->b.resource_destroy = u_resource_destroy_vtbl; + sscreen->b.resource_from_user_memory = si_buffer_from_user_memory; } void si_init_buffer_functions(struct si_context *sctx) { - sctx->b.invalidate_resource = si_invalidate_resource; - sctx->b.transfer_map = u_transfer_map_vtbl; - sctx->b.transfer_flush_region = u_transfer_flush_region_vtbl; - sctx->b.transfer_unmap = u_transfer_unmap_vtbl; - sctx->b.texture_subdata = u_default_texture_subdata; - sctx->b.buffer_subdata = si_buffer_subdata; - sctx->b.resource_commit = si_resource_commit; + sctx->b.invalidate_resource = si_invalidate_resource; + sctx->b.transfer_map = u_transfer_map_vtbl; + sctx->b.transfer_flush_region = u_transfer_flush_region_vtbl; + sctx->b.transfer_unmap = u_transfer_unmap_vtbl; + sctx->b.texture_subdata = u_default_texture_subdata; + sctx->b.buffer_subdata = si_buffer_subdata; + sctx->b.resource_commit = si_resource_commit; } diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index 0b0b64ca13c..8a9b6ea5e34 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -34,131 +34,128 @@ static inline void radeon_set_config_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num) { - assert(reg < SI_CONTEXT_REG_OFFSET); - assert(cs->current.cdw + 2 + num <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0)); - radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2); + assert(reg < SI_CONTEXT_REG_OFFSET); + assert(cs->current.cdw + 2 + num <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0)); + radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2); } static inline void radeon_set_config_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) { - radeon_set_config_reg_seq(cs, reg, 1); - radeon_emit(cs, value); + radeon_set_config_reg_seq(cs, reg, 1); + radeon_emit(cs, value); } static inline void radeon_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num) { - assert(reg >= SI_CONTEXT_REG_OFFSET); - assert(cs->current.cdw + 2 + num <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0)); - radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); + assert(reg >= SI_CONTEXT_REG_OFFSET); + assert(cs->current.cdw + 2 + num <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0)); + radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); } static inline void radeon_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) { - radeon_set_context_reg_seq(cs, reg, 1); - radeon_emit(cs, value); + 
radeon_set_context_reg_seq(cs, reg, 1); + radeon_emit(cs, value); } -static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs, - unsigned reg, unsigned idx, - unsigned value) +static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs, unsigned reg, unsigned idx, + unsigned value) { - assert(reg >= SI_CONTEXT_REG_OFFSET); - assert(cs->current.cdw + 3 <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)); - radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28)); - radeon_emit(cs, value); + assert(reg >= SI_CONTEXT_REG_OFFSET); + assert(cs->current.cdw + 3 <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)); + radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28)); + radeon_emit(cs, value); } static inline void radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num) { - assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END); - assert(cs->current.cdw + 2 + num <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0)); - radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2); + assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END); + assert(cs->current.cdw + 2 + num <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0)); + radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2); } static inline void radeon_set_sh_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) { - radeon_set_sh_reg_seq(cs, reg, 1); - radeon_emit(cs, value); + radeon_set_sh_reg_seq(cs, reg, 1); + radeon_emit(cs, value); } static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num) { - assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); - assert(cs->current.cdw + 2 + num <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0)); - radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2); + assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); + assert(cs->current.cdw + 2 + num <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0)); + radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2); } static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) { - radeon_set_uconfig_reg_seq(cs, reg, 1); - radeon_emit(cs, value); + radeon_set_uconfig_reg_seq(cs, reg, 1); + radeon_emit(cs, value); } -static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs, - struct si_screen *screen, - unsigned reg, unsigned idx, - unsigned value) +static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs, struct si_screen *screen, + unsigned reg, unsigned idx, unsigned value) { - assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); - assert(cs->current.cdw + 3 <= cs->current.max_dw); - assert(idx != 0); - unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX; - if (screen->info.chip_class < GFX9 || - (screen->info.chip_class == GFX9 && screen->info.me_fw_version < 26)) - opcode = PKT3_SET_UCONFIG_REG; - radeon_emit(cs, PKT3(opcode, 1, 0)); - radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28)); - radeon_emit(cs, value); + assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); + assert(cs->current.cdw + 3 <= cs->current.max_dw); + assert(idx != 0); + unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX; + if (screen->info.chip_class < GFX9 || + (screen->info.chip_class == GFX9 && screen->info.me_fw_version < 26)) + opcode = PKT3_SET_UCONFIG_REG; + radeon_emit(cs, PKT3(opcode, 1, 0)); + radeon_emit(cs, (reg - 
CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28)); + radeon_emit(cs, value); } static inline void radeon_set_context_reg_rmw(struct radeon_cmdbuf *cs, unsigned reg, - unsigned value, unsigned mask) + unsigned value, unsigned mask) { - assert(reg >= SI_CONTEXT_REG_OFFSET); - assert(cs->current.cdw + 4 <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0)); - radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); - radeon_emit(cs, mask); - radeon_emit(cs, value); + assert(reg >= SI_CONTEXT_REG_OFFSET); + assert(cs->current.cdw + 4 <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0)); + radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); + radeon_emit(cs, mask); + radeon_emit(cs, value); } /* Emit PKT3_CONTEXT_REG_RMW if the register value is different. */ static inline void radeon_opt_set_context_reg_rmw(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value, - unsigned mask) + enum si_tracked_reg reg, unsigned value, + unsigned mask) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - assert((value & ~mask) == 0); - value &= mask; + assert((value & ~mask) == 0); + value &= mask; - if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || - sctx->tracked_regs.reg_value[reg] != value) { - radeon_set_context_reg_rmw(cs, offset, value, mask); + if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || + sctx->tracked_regs.reg_value[reg] != value) { + radeon_set_context_reg_rmw(cs, offset, value, mask); - sctx->tracked_regs.reg_saved |= 0x1ull << reg; - sctx->tracked_regs.reg_value[reg] = value; - } + sctx->tracked_regs.reg_saved |= 0x1ull << reg; + sctx->tracked_regs.reg_value[reg] = value; + } } /* Emit PKT3_SET_CONTEXT_REG if the register value is different. 
*/ static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value) + enum si_tracked_reg reg, unsigned value) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || - sctx->tracked_regs.reg_value[reg] != value) { - radeon_set_context_reg(cs, offset, value); + if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || + sctx->tracked_regs.reg_value[reg] != value) { + radeon_set_context_reg(cs, offset, value); - sctx->tracked_regs.reg_saved |= 0x1ull << reg; - sctx->tracked_regs.reg_value[reg] = value; - } + sctx->tracked_regs.reg_saved |= 0x1ull << reg; + sctx->tracked_regs.reg_value[reg] = value; + } } /** @@ -168,98 +165,96 @@ static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned * @param value2 is written to second register */ static inline void radeon_opt_set_context_reg2(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value1, - unsigned value2) + enum si_tracked_reg reg, unsigned value1, + unsigned value2) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 || - sctx->tracked_regs.reg_value[reg] != value1 || - sctx->tracked_regs.reg_value[reg+1] != value2) { - radeon_set_context_reg_seq(cs, offset, 2); - radeon_emit(cs, value1); - radeon_emit(cs, value2); - - sctx->tracked_regs.reg_value[reg] = value1; - sctx->tracked_regs.reg_value[reg+1] = value2; - sctx->tracked_regs.reg_saved |= 0x3ull << reg; - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 || + sctx->tracked_regs.reg_value[reg] != value1 || + sctx->tracked_regs.reg_value[reg + 1] != value2) { + radeon_set_context_reg_seq(cs, offset, 2); + radeon_emit(cs, value1); + radeon_emit(cs, value2); + + sctx->tracked_regs.reg_value[reg] = value1; + sctx->tracked_regs.reg_value[reg + 1] = value2; + sctx->tracked_regs.reg_saved |= 0x3ull << reg; + } } /** * Set 3 consecutive registers if any registers value is different. 
*/ static inline void radeon_opt_set_context_reg3(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value1, - unsigned value2, unsigned value3) + enum si_tracked_reg reg, unsigned value1, + unsigned value2, unsigned value3) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 || - sctx->tracked_regs.reg_value[reg] != value1 || - sctx->tracked_regs.reg_value[reg+1] != value2 || - sctx->tracked_regs.reg_value[reg+2] != value3) { - radeon_set_context_reg_seq(cs, offset, 3); - radeon_emit(cs, value1); - radeon_emit(cs, value2); - radeon_emit(cs, value3); - - sctx->tracked_regs.reg_value[reg] = value1; - sctx->tracked_regs.reg_value[reg+1] = value2; - sctx->tracked_regs.reg_value[reg+2] = value3; - sctx->tracked_regs.reg_saved |= 0x7ull << reg; - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 || + sctx->tracked_regs.reg_value[reg] != value1 || + sctx->tracked_regs.reg_value[reg + 1] != value2 || + sctx->tracked_regs.reg_value[reg + 2] != value3) { + radeon_set_context_reg_seq(cs, offset, 3); + radeon_emit(cs, value1); + radeon_emit(cs, value2); + radeon_emit(cs, value3); + + sctx->tracked_regs.reg_value[reg] = value1; + sctx->tracked_regs.reg_value[reg + 1] = value2; + sctx->tracked_regs.reg_value[reg + 2] = value3; + sctx->tracked_regs.reg_saved |= 0x7ull << reg; + } } /** * Set 4 consecutive registers if any registers value is different. */ static inline void radeon_opt_set_context_reg4(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value1, - unsigned value2, unsigned value3, - unsigned value4) + enum si_tracked_reg reg, unsigned value1, + unsigned value2, unsigned value3, unsigned value4) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf || - sctx->tracked_regs.reg_value[reg] != value1 || - sctx->tracked_regs.reg_value[reg+1] != value2 || - sctx->tracked_regs.reg_value[reg+2] != value3 || - sctx->tracked_regs.reg_value[reg+3] != value4) { - radeon_set_context_reg_seq(cs, offset, 4); - radeon_emit(cs, value1); - radeon_emit(cs, value2); - radeon_emit(cs, value3); - radeon_emit(cs, value4); - - sctx->tracked_regs.reg_value[reg] = value1; - sctx->tracked_regs.reg_value[reg+1] = value2; - sctx->tracked_regs.reg_value[reg+2] = value3; - sctx->tracked_regs.reg_value[reg+3] = value4; - sctx->tracked_regs.reg_saved |= 0xfull << reg; - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf || + sctx->tracked_regs.reg_value[reg] != value1 || + sctx->tracked_regs.reg_value[reg + 1] != value2 || + sctx->tracked_regs.reg_value[reg + 2] != value3 || + sctx->tracked_regs.reg_value[reg + 3] != value4) { + radeon_set_context_reg_seq(cs, offset, 4); + radeon_emit(cs, value1); + radeon_emit(cs, value2); + radeon_emit(cs, value3); + radeon_emit(cs, value4); + + sctx->tracked_regs.reg_value[reg] = value1; + sctx->tracked_regs.reg_value[reg + 1] = value2; + sctx->tracked_regs.reg_value[reg + 2] = value3; + sctx->tracked_regs.reg_value[reg + 3] = value4; + sctx->tracked_regs.reg_saved |= 0xfull << reg; + } } /** * Set consecutive registers if any registers value is different. 
*/ static inline void radeon_opt_set_context_regn(struct si_context *sctx, unsigned offset, - unsigned *value, unsigned *saved_val, - unsigned num) + unsigned *value, unsigned *saved_val, unsigned num) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - int i, j; - - for (i = 0; i < num; i++) { - if (saved_val[i] != value[i]) { - radeon_set_context_reg_seq(cs, offset, num); - for (j = 0; j < num; j++) - radeon_emit(cs, value[j]); - - memcpy(saved_val, value, sizeof(uint32_t) * num); - break; - } - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + int i, j; + + for (i = 0; i < num; i++) { + if (saved_val[i] != value[i]) { + radeon_set_context_reg_seq(cs, offset, num); + for (j = 0; j < num; j++) + radeon_emit(cs, value[j]); + + memcpy(saved_val, value, sizeof(uint32_t) * num); + break; + } + } } #endif diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index 2af778b41ad..1e7aa443222 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -24,761 +24,710 @@ #include "si_pipe.h" #include "sid.h" - #include "util/format/u_format.h" #include "util/u_pack_color.h" #include "util/u_surface.h" -enum { - SI_CLEAR = SI_SAVE_FRAGMENT_STATE, - SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE, +enum +{ + SI_CLEAR = SI_SAVE_FRAGMENT_STATE, + SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE, }; -static void si_alloc_separate_cmask(struct si_screen *sscreen, - struct si_texture *tex) +static void si_alloc_separate_cmask(struct si_screen *sscreen, struct si_texture *tex) { - /* CMASK for MSAA is allocated in advance or always disabled - * by "nofmask" option. - */ - if (tex->cmask_buffer || !tex->surface.cmask_size || - tex->buffer.b.b.nr_samples >= 2) - return; - - tex->cmask_buffer = - si_aligned_buffer_create(&sscreen->b, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - tex->surface.cmask_size, - tex->surface.cmask_alignment); - if (tex->cmask_buffer == NULL) - return; - - tex->cmask_base_address_reg = tex->cmask_buffer->gpu_address >> 8; - tex->cb_color_info |= S_028C70_FAST_CLEAR(1); - - p_atomic_inc(&sscreen->compressed_colortex_counter); + /* CMASK for MSAA is allocated in advance or always disabled + * by "nofmask" option. 
+ */ + if (tex->cmask_buffer || !tex->surface.cmask_size || tex->buffer.b.b.nr_samples >= 2) + return; + + tex->cmask_buffer = + si_aligned_buffer_create(&sscreen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, + tex->surface.cmask_size, tex->surface.cmask_alignment); + if (tex->cmask_buffer == NULL) + return; + + tex->cmask_base_address_reg = tex->cmask_buffer->gpu_address >> 8; + tex->cb_color_info |= S_028C70_FAST_CLEAR(1); + + p_atomic_inc(&sscreen->compressed_colortex_counter); } -static bool si_set_clear_color(struct si_texture *tex, - enum pipe_format surface_format, - const union pipe_color_union *color) +static bool si_set_clear_color(struct si_texture *tex, enum pipe_format surface_format, + const union pipe_color_union *color) { - union util_color uc; - - memset(&uc, 0, sizeof(uc)); - - if (tex->surface.bpe == 16) { - /* DCC fast clear only: - * CLEAR_WORD0 = R = G = B - * CLEAR_WORD1 = A - */ - assert(color->ui[0] == color->ui[1] && - color->ui[0] == color->ui[2]); - uc.ui[0] = color->ui[0]; - uc.ui[1] = color->ui[3]; - } else { - util_pack_color_union(surface_format, &uc, color); - } - - if (memcmp(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)) == 0) - return false; - - memcpy(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)); - return true; + union util_color uc; + + memset(&uc, 0, sizeof(uc)); + + if (tex->surface.bpe == 16) { + /* DCC fast clear only: + * CLEAR_WORD0 = R = G = B + * CLEAR_WORD1 = A + */ + assert(color->ui[0] == color->ui[1] && color->ui[0] == color->ui[2]); + uc.ui[0] = color->ui[0]; + uc.ui[1] = color->ui[3]; + } else { + util_pack_color_union(surface_format, &uc, color); + } + + if (memcmp(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)) == 0) + return false; + + memcpy(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)); + return true; } /** Linearize and convert luminace/intensity to red. */ enum pipe_format si_simplify_cb_format(enum pipe_format format) { - format = util_format_linear(format); - format = util_format_luminance_to_red(format); - return util_format_intensity_to_red(format); + format = util_format_linear(format); + format = util_format_luminance_to_red(format); + return util_format_intensity_to_red(format); } bool vi_alpha_is_on_msb(struct si_screen *sscreen, enum pipe_format format) { - format = si_simplify_cb_format(format); - const struct util_format_description *desc = util_format_description(format); + format = si_simplify_cb_format(format); + const struct util_format_description *desc = util_format_description(format); - /* Formats with 3 channels can't have alpha. */ - if (desc->nr_channels == 3) - return true; /* same as xxxA; is any value OK here? */ + /* Formats with 3 channels can't have alpha. */ + if (desc->nr_channels == 3) + return true; /* same as xxxA; is any value OK here? 
*/ - if (sscreen->info.chip_class >= GFX10 && desc->nr_channels == 1) - return desc->swizzle[3] == PIPE_SWIZZLE_X; + if (sscreen->info.chip_class >= GFX10 && desc->nr_channels == 1) + return desc->swizzle[3] == PIPE_SWIZZLE_X; - return si_translate_colorswap(format, false) <= 1; + return si_translate_colorswap(format, false) <= 1; } -static bool vi_get_fast_clear_parameters(struct si_screen *sscreen, - enum pipe_format base_format, - enum pipe_format surface_format, - const union pipe_color_union *color, - uint32_t* clear_value, - bool *eliminate_needed) +static bool vi_get_fast_clear_parameters(struct si_screen *sscreen, enum pipe_format base_format, + enum pipe_format surface_format, + const union pipe_color_union *color, uint32_t *clear_value, + bool *eliminate_needed) { - /* If we want to clear without needing a fast clear eliminate step, we - * can set color and alpha independently to 0 or 1 (or 0/max for integer - * formats). - */ - bool values[4] = {}; /* whether to clear to 0 or 1 */ - bool color_value = false; /* clear color to 0 or 1 */ - bool alpha_value = false; /* clear alpha to 0 or 1 */ - int alpha_channel; /* index of the alpha component */ - bool has_color = false; - bool has_alpha = false; - - const struct util_format_description *desc = - util_format_description(si_simplify_cb_format(surface_format)); - - /* 128-bit fast clear with different R,G,B values is unsupported. */ - if (desc->block.bits == 128 && - (color->ui[0] != color->ui[1] || - color->ui[0] != color->ui[2])) - return false; - - *eliminate_needed = true; - *clear_value = DCC_CLEAR_COLOR_REG; - - if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) - return true; /* need ELIMINATE_FAST_CLEAR */ - - bool base_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, base_format); - bool surf_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, surface_format); - - /* Formats with 3 channels can't have alpha. */ - if (desc->nr_channels == 3) - alpha_channel = -1; - else if (surf_alpha_is_on_msb) - alpha_channel = desc->nr_channels - 1; - else - alpha_channel = 0; - - for (int i = 0; i < 4; ++i) { - if (desc->swizzle[i] >= PIPE_SWIZZLE_0) - continue; - - if (desc->channel[i].pure_integer && - desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { - /* Use the maximum value for clamping the clear color. */ - int max = u_bit_consecutive(0, desc->channel[i].size - 1); - - values[i] = color->i[i] != 0; - if (color->i[i] != 0 && MIN2(color->i[i], max) != max) - return true; /* need ELIMINATE_FAST_CLEAR */ - } else if (desc->channel[i].pure_integer && - desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) { - /* Use the maximum value for clamping the clear color. */ - unsigned max = u_bit_consecutive(0, desc->channel[i].size); - - values[i] = color->ui[i] != 0U; - if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max) - return true; /* need ELIMINATE_FAST_CLEAR */ - } else { - values[i] = color->f[i] != 0.0F; - if (color->f[i] != 0.0F && color->f[i] != 1.0F) - return true; /* need ELIMINATE_FAST_CLEAR */ - } - - if (desc->swizzle[i] == alpha_channel) { - alpha_value = values[i]; - has_alpha = true; - } else { - color_value = values[i]; - has_color = true; - } - } - - /* If alpha isn't present, make it the same as color, and vice versa. */ - if (!has_alpha) - alpha_value = color_value; - else if (!has_color) - color_value = alpha_value; - - if (color_value != alpha_value && - base_alpha_is_on_msb != surf_alpha_is_on_msb) - return true; /* require ELIMINATE_FAST_CLEAR */ - - /* Check if all color values are equal if they are present. 
*/ - for (int i = 0; i < 4; ++i) { - if (desc->swizzle[i] <= PIPE_SWIZZLE_W && - desc->swizzle[i] != alpha_channel && - values[i] != color_value) - return true; /* require ELIMINATE_FAST_CLEAR */ - } - - /* This doesn't need ELIMINATE_FAST_CLEAR. - * On chips predating Raven2, the DCC clear codes and the CB clear - * color registers must match. - */ - *eliminate_needed = false; - - if (color_value) { - if (alpha_value) - *clear_value = DCC_CLEAR_COLOR_1111; - else - *clear_value = DCC_CLEAR_COLOR_1110; - } else { - if (alpha_value) - *clear_value = DCC_CLEAR_COLOR_0001; - else - *clear_value = DCC_CLEAR_COLOR_0000; - } - return true; + /* If we want to clear without needing a fast clear eliminate step, we + * can set color and alpha independently to 0 or 1 (or 0/max for integer + * formats). + */ + bool values[4] = {}; /* whether to clear to 0 or 1 */ + bool color_value = false; /* clear color to 0 or 1 */ + bool alpha_value = false; /* clear alpha to 0 or 1 */ + int alpha_channel; /* index of the alpha component */ + bool has_color = false; + bool has_alpha = false; + + const struct util_format_description *desc = + util_format_description(si_simplify_cb_format(surface_format)); + + /* 128-bit fast clear with different R,G,B values is unsupported. */ + if (desc->block.bits == 128 && (color->ui[0] != color->ui[1] || color->ui[0] != color->ui[2])) + return false; + + *eliminate_needed = true; + *clear_value = DCC_CLEAR_COLOR_REG; + + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) + return true; /* need ELIMINATE_FAST_CLEAR */ + + bool base_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, base_format); + bool surf_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, surface_format); + + /* Formats with 3 channels can't have alpha. */ + if (desc->nr_channels == 3) + alpha_channel = -1; + else if (surf_alpha_is_on_msb) + alpha_channel = desc->nr_channels - 1; + else + alpha_channel = 0; + + for (int i = 0; i < 4; ++i) { + if (desc->swizzle[i] >= PIPE_SWIZZLE_0) + continue; + + if (desc->channel[i].pure_integer && desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { + /* Use the maximum value for clamping the clear color. */ + int max = u_bit_consecutive(0, desc->channel[i].size - 1); + + values[i] = color->i[i] != 0; + if (color->i[i] != 0 && MIN2(color->i[i], max) != max) + return true; /* need ELIMINATE_FAST_CLEAR */ + } else if (desc->channel[i].pure_integer && + desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) { + /* Use the maximum value for clamping the clear color. */ + unsigned max = u_bit_consecutive(0, desc->channel[i].size); + + values[i] = color->ui[i] != 0U; + if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max) + return true; /* need ELIMINATE_FAST_CLEAR */ + } else { + values[i] = color->f[i] != 0.0F; + if (color->f[i] != 0.0F && color->f[i] != 1.0F) + return true; /* need ELIMINATE_FAST_CLEAR */ + } + + if (desc->swizzle[i] == alpha_channel) { + alpha_value = values[i]; + has_alpha = true; + } else { + color_value = values[i]; + has_color = true; + } + } + + /* If alpha isn't present, make it the same as color, and vice versa. */ + if (!has_alpha) + alpha_value = color_value; + else if (!has_color) + color_value = alpha_value; + + if (color_value != alpha_value && base_alpha_is_on_msb != surf_alpha_is_on_msb) + return true; /* require ELIMINATE_FAST_CLEAR */ + + /* Check if all color values are equal if they are present. 
*/ + for (int i = 0; i < 4; ++i) { + if (desc->swizzle[i] <= PIPE_SWIZZLE_W && desc->swizzle[i] != alpha_channel && + values[i] != color_value) + return true; /* require ELIMINATE_FAST_CLEAR */ + } + + /* This doesn't need ELIMINATE_FAST_CLEAR. + * On chips predating Raven2, the DCC clear codes and the CB clear + * color registers must match. + */ + *eliminate_needed = false; + + if (color_value) { + if (alpha_value) + *clear_value = DCC_CLEAR_COLOR_1111; + else + *clear_value = DCC_CLEAR_COLOR_1110; + } else { + if (alpha_value) + *clear_value = DCC_CLEAR_COLOR_0001; + else + *clear_value = DCC_CLEAR_COLOR_0000; + } + return true; } -bool vi_dcc_clear_level(struct si_context *sctx, - struct si_texture *tex, - unsigned level, unsigned clear_value) +bool vi_dcc_clear_level(struct si_context *sctx, struct si_texture *tex, unsigned level, + unsigned clear_value) { - struct pipe_resource *dcc_buffer; - uint64_t dcc_offset, clear_size; - - assert(vi_dcc_enabled(tex, level)); - - if (tex->dcc_separate_buffer) { - dcc_buffer = &tex->dcc_separate_buffer->b.b; - dcc_offset = 0; - } else { - dcc_buffer = &tex->buffer.b.b; - dcc_offset = tex->surface.dcc_offset; - } - - if (sctx->chip_class >= GFX9) { - /* Mipmap level clears aren't implemented. */ - if (tex->buffer.b.b.last_level > 0) - return false; - - /* 4x and 8x MSAA needs a sophisticated compute shader for - * the clear. See AMDVLK. */ - if (tex->buffer.b.b.nr_storage_samples >= 4) - return false; - - clear_size = tex->surface.dcc_size; - } else { - unsigned num_layers = util_num_layers(&tex->buffer.b.b, level); - - /* If this is 0, fast clear isn't possible. (can occur with MSAA) */ - if (!tex->surface.u.legacy.level[level].dcc_fast_clear_size) - return false; - - /* Layered 4x and 8x MSAA DCC fast clears need to clear - * dcc_fast_clear_size bytes for each layer. A compute shader - * would be more efficient than separate per-layer clear operations. - */ - if (tex->buffer.b.b.nr_storage_samples >= 4 && num_layers > 1) - return false; - - dcc_offset += tex->surface.u.legacy.level[level].dcc_offset; - clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size * - num_layers; - } - - si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size, - &clear_value, 4, SI_COHERENCY_CB_META, false); - return true; + struct pipe_resource *dcc_buffer; + uint64_t dcc_offset, clear_size; + + assert(vi_dcc_enabled(tex, level)); + + if (tex->dcc_separate_buffer) { + dcc_buffer = &tex->dcc_separate_buffer->b.b; + dcc_offset = 0; + } else { + dcc_buffer = &tex->buffer.b.b; + dcc_offset = tex->surface.dcc_offset; + } + + if (sctx->chip_class >= GFX9) { + /* Mipmap level clears aren't implemented. */ + if (tex->buffer.b.b.last_level > 0) + return false; + + /* 4x and 8x MSAA needs a sophisticated compute shader for + * the clear. See AMDVLK. */ + if (tex->buffer.b.b.nr_storage_samples >= 4) + return false; + + clear_size = tex->surface.dcc_size; + } else { + unsigned num_layers = util_num_layers(&tex->buffer.b.b, level); + + /* If this is 0, fast clear isn't possible. (can occur with MSAA) */ + if (!tex->surface.u.legacy.level[level].dcc_fast_clear_size) + return false; + + /* Layered 4x and 8x MSAA DCC fast clears need to clear + * dcc_fast_clear_size bytes for each layer. A compute shader + * would be more efficient than separate per-layer clear operations. 
+ */ + if (tex->buffer.b.b.nr_storage_samples >= 4 && num_layers > 1) + return false; + + dcc_offset += tex->surface.u.legacy.level[level].dcc_offset; + clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size * num_layers; + } + + si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size, &clear_value, 4, SI_COHERENCY_CB_META, + false); + return true; } /* Set the same micro tile mode as the destination of the last MSAA resolve. * This allows hitting the MSAA resolve fast path, which requires that both * src and dst micro tile modes match. */ -static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen, - struct si_texture *tex) +static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen, struct si_texture *tex) { - if (sscreen->info.chip_class >= GFX10 || - tex->buffer.b.is_shared || - tex->buffer.b.b.nr_samples <= 1 || - tex->surface.micro_tile_mode == tex->last_msaa_resolve_target_micro_mode) - return; - - assert(sscreen->info.chip_class >= GFX9 || - tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D); - assert(tex->buffer.b.b.last_level == 0); - - if (sscreen->info.chip_class >= GFX9) { - /* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */ - assert(tex->surface.u.gfx9.surf.swizzle_mode >= 4); - - /* If you do swizzle_mode % 4, you'll get: - * 0 = Depth - * 1 = Standard, - * 2 = Displayable - * 3 = Rotated - * - * Depth-sample order isn't allowed: - */ - assert(tex->surface.u.gfx9.surf.swizzle_mode % 4 != 0); - - switch (tex->last_msaa_resolve_target_micro_mode) { - case RADEON_MICRO_MODE_DISPLAY: - tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; - tex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */ - break; - case RADEON_MICRO_MODE_THIN: - tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; - tex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */ - break; - case RADEON_MICRO_MODE_ROTATED: - tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; - tex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */ - break; - default: /* depth */ - assert(!"unexpected micro mode"); - return; - } - } else if (sscreen->info.chip_class >= GFX7) { - /* These magic numbers were copied from addrlib. It doesn't use - * any definitions for them either. They are all 2D_TILED_THIN1 - * modes with different bpp and micro tile mode. 
- */ - switch (tex->last_msaa_resolve_target_micro_mode) { - case RADEON_MICRO_MODE_DISPLAY: - tex->surface.u.legacy.tiling_index[0] = 10; - break; - case RADEON_MICRO_MODE_THIN: - tex->surface.u.legacy.tiling_index[0] = 14; - break; - case RADEON_MICRO_MODE_ROTATED: - tex->surface.u.legacy.tiling_index[0] = 28; - break; - default: /* depth, thick */ - assert(!"unexpected micro mode"); - return; - } - } else { /* GFX6 */ - switch (tex->last_msaa_resolve_target_micro_mode) { - case RADEON_MICRO_MODE_DISPLAY: - switch (tex->surface.bpe) { - case 1: - tex->surface.u.legacy.tiling_index[0] = 10; - break; - case 2: - tex->surface.u.legacy.tiling_index[0] = 11; - break; - default: /* 4, 8 */ - tex->surface.u.legacy.tiling_index[0] = 12; - break; - } - break; - case RADEON_MICRO_MODE_THIN: - switch (tex->surface.bpe) { - case 1: - tex->surface.u.legacy.tiling_index[0] = 14; - break; - case 2: - tex->surface.u.legacy.tiling_index[0] = 15; - break; - case 4: - tex->surface.u.legacy.tiling_index[0] = 16; - break; - default: /* 8, 16 */ - tex->surface.u.legacy.tiling_index[0] = 17; - break; - } - break; - default: /* depth, thick */ - assert(!"unexpected micro mode"); - return; - } - } - - tex->surface.micro_tile_mode = tex->last_msaa_resolve_target_micro_mode; - - p_atomic_inc(&sscreen->dirty_tex_counter); + if (sscreen->info.chip_class >= GFX10 || tex->buffer.b.is_shared || + tex->buffer.b.b.nr_samples <= 1 || + tex->surface.micro_tile_mode == tex->last_msaa_resolve_target_micro_mode) + return; + + assert(sscreen->info.chip_class >= GFX9 || + tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D); + assert(tex->buffer.b.b.last_level == 0); + + if (sscreen->info.chip_class >= GFX9) { + /* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */ + assert(tex->surface.u.gfx9.surf.swizzle_mode >= 4); + + /* If you do swizzle_mode % 4, you'll get: + * 0 = Depth + * 1 = Standard, + * 2 = Displayable + * 3 = Rotated + * + * Depth-sample order isn't allowed: + */ + assert(tex->surface.u.gfx9.surf.swizzle_mode % 4 != 0); + + switch (tex->last_msaa_resolve_target_micro_mode) { + case RADEON_MICRO_MODE_DISPLAY: + tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; + tex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */ + break; + case RADEON_MICRO_MODE_THIN: + tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; + tex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */ + break; + case RADEON_MICRO_MODE_ROTATED: + tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; + tex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */ + break; + default: /* depth */ + assert(!"unexpected micro mode"); + return; + } + } else if (sscreen->info.chip_class >= GFX7) { + /* These magic numbers were copied from addrlib. It doesn't use + * any definitions for them either. They are all 2D_TILED_THIN1 + * modes with different bpp and micro tile mode. 
+ */ + switch (tex->last_msaa_resolve_target_micro_mode) { + case RADEON_MICRO_MODE_DISPLAY: + tex->surface.u.legacy.tiling_index[0] = 10; + break; + case RADEON_MICRO_MODE_THIN: + tex->surface.u.legacy.tiling_index[0] = 14; + break; + case RADEON_MICRO_MODE_ROTATED: + tex->surface.u.legacy.tiling_index[0] = 28; + break; + default: /* depth, thick */ + assert(!"unexpected micro mode"); + return; + } + } else { /* GFX6 */ + switch (tex->last_msaa_resolve_target_micro_mode) { + case RADEON_MICRO_MODE_DISPLAY: + switch (tex->surface.bpe) { + case 1: + tex->surface.u.legacy.tiling_index[0] = 10; + break; + case 2: + tex->surface.u.legacy.tiling_index[0] = 11; + break; + default: /* 4, 8 */ + tex->surface.u.legacy.tiling_index[0] = 12; + break; + } + break; + case RADEON_MICRO_MODE_THIN: + switch (tex->surface.bpe) { + case 1: + tex->surface.u.legacy.tiling_index[0] = 14; + break; + case 2: + tex->surface.u.legacy.tiling_index[0] = 15; + break; + case 4: + tex->surface.u.legacy.tiling_index[0] = 16; + break; + default: /* 8, 16 */ + tex->surface.u.legacy.tiling_index[0] = 17; + break; + } + break; + default: /* depth, thick */ + assert(!"unexpected micro mode"); + return; + } + } + + tex->surface.micro_tile_mode = tex->last_msaa_resolve_target_micro_mode; + + p_atomic_inc(&sscreen->dirty_tex_counter); } -static void si_do_fast_color_clear(struct si_context *sctx, - unsigned *buffers, - const union pipe_color_union *color) +static void si_do_fast_color_clear(struct si_context *sctx, unsigned *buffers, + const union pipe_color_union *color) { - struct pipe_framebuffer_state *fb = &sctx->framebuffer.state; - int i; + struct pipe_framebuffer_state *fb = &sctx->framebuffer.state; + int i; - /* This function is broken in BE, so just disable this path for now */ + /* This function is broken in BE, so just disable this path for now */ #if UTIL_ARCH_BIG_ENDIAN - return; + return; #endif - if (sctx->render_cond) - return; - - for (i = 0; i < fb->nr_cbufs; i++) { - struct si_texture *tex; - unsigned clear_bit = PIPE_CLEAR_COLOR0 << i; - - if (!fb->cbufs[i]) - continue; - - /* if this colorbuffer is not being cleared */ - if (!(*buffers & clear_bit)) - continue; - - unsigned level = fb->cbufs[i]->u.tex.level; - if (level > 0) - continue; - - tex = (struct si_texture *)fb->cbufs[i]->texture; - - /* TODO: GFX9: Implement DCC fast clear for level 0 of - * mipmapped textures. Mipmapped DCC has to clear a rectangular - * area of DCC for level 0 (because the whole miptree is - * organized in a 2D plane). - */ - if (sctx->chip_class >= GFX9 && - tex->buffer.b.b.last_level > 0) - continue; - - /* the clear is allowed if all layers are bound */ - if (fb->cbufs[i]->u.tex.first_layer != 0 || - fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->buffer.b.b, 0)) { - continue; - } - - /* only supported on tiled surfaces */ - if (tex->surface.is_linear) { - continue; - } - - /* shared textures can't use fast clear without an explicit flush, - * because there is no way to communicate the clear color among - * all clients - */ - if (tex->buffer.b.is_shared && - !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) - continue; - - if (sctx->chip_class <= GFX8 && - tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D && - !sctx->screen->info.htile_cmask_support_1d_tiling) - continue; - - /* Use a slow clear for small surfaces where the cost of - * the eliminate pass can be higher than the benefit of fast - * clear. The closed driver does this, but the numbers may differ. 
- * - * This helps on both dGPUs and APUs, even small APUs like Mullins. - */ - bool too_small = tex->buffer.b.b.nr_samples <= 1 && - tex->buffer.b.b.width0 * - tex->buffer.b.b.height0 <= 512 * 512; - bool eliminate_needed = false; - bool fmask_decompress_needed = false; - - /* Fast clear is the most appropriate place to enable DCC for - * displayable surfaces. - */ - if (sctx->family == CHIP_STONEY && !too_small) { - vi_separate_dcc_try_enable(sctx, tex); - - /* RB+ isn't supported with a CMASK clear only on Stoney, - * so all clears are considered to be hypothetically slow - * clears, which is weighed when determining whether to - * enable separate DCC. - */ - if (tex->dcc_gather_statistics) /* only for Stoney */ - tex->num_slow_clears++; - } - - /* Try to clear DCC first, otherwise try CMASK. */ - if (vi_dcc_enabled(tex, 0)) { - uint32_t reset_value; - - if (sctx->screen->debug_flags & DBG(NO_DCC_CLEAR)) - continue; - - if (!vi_get_fast_clear_parameters(sctx->screen, - tex->buffer.b.b.format, - fb->cbufs[i]->format, - color, &reset_value, - &eliminate_needed)) - continue; - - if (eliminate_needed && too_small) - continue; - - /* TODO: This DCC+CMASK clear doesn't work with MSAA. */ - if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer && - eliminate_needed) - continue; - - if (!vi_dcc_clear_level(sctx, tex, 0, reset_value)) - continue; - - tex->separate_dcc_dirty = true; - tex->displayable_dcc_dirty = true; - - /* DCC fast clear with MSAA should clear CMASK to 0xC. */ - if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) { - uint32_t clear_value = 0xCCCCCCCC; - si_clear_buffer(sctx, &tex->cmask_buffer->b.b, - tex->surface.cmask_offset, tex->surface.cmask_size, - &clear_value, 4, SI_COHERENCY_CB_META, false); - fmask_decompress_needed = true; - } - } else { - if (too_small) - continue; - - /* 128-bit formats are unusupported */ - if (tex->surface.bpe > 8) { - continue; - } - - /* RB+ doesn't work with CMASK fast clear on Stoney. */ - if (sctx->family == CHIP_STONEY) - continue; - - /* ensure CMASK is enabled */ - si_alloc_separate_cmask(sctx->screen, tex); - if (!tex->cmask_buffer) - continue; - - /* Do the fast clear. */ - uint32_t clear_value = 0; - si_clear_buffer(sctx, &tex->cmask_buffer->b.b, - tex->surface.cmask_offset, tex->surface.cmask_size, - &clear_value, 4, SI_COHERENCY_CB_META, false); - eliminate_needed = true; - } - - if ((eliminate_needed || fmask_decompress_needed) && - !(tex->dirty_level_mask & (1 << level))) { - tex->dirty_level_mask |= 1 << level; - p_atomic_inc(&sctx->screen->compressed_colortex_counter); - } - - /* We can change the micro tile mode before a full clear. */ - si_set_optimal_micro_tile_mode(sctx->screen, tex); - - *buffers &= ~clear_bit; - - /* Chips with DCC constant encoding don't need to set the clear - * color registers for DCC clear values 0 and 1. 
- */ - if (sctx->screen->info.has_dcc_constant_encode && !eliminate_needed) - continue; - - if (si_set_clear_color(tex, fb->cbufs[i]->format, color)) { - sctx->framebuffer.dirty_cbufs |= 1 << i; - si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); - } - } + if (sctx->render_cond) + return; + + for (i = 0; i < fb->nr_cbufs; i++) { + struct si_texture *tex; + unsigned clear_bit = PIPE_CLEAR_COLOR0 << i; + + if (!fb->cbufs[i]) + continue; + + /* if this colorbuffer is not being cleared */ + if (!(*buffers & clear_bit)) + continue; + + unsigned level = fb->cbufs[i]->u.tex.level; + if (level > 0) + continue; + + tex = (struct si_texture *)fb->cbufs[i]->texture; + + /* TODO: GFX9: Implement DCC fast clear for level 0 of + * mipmapped textures. Mipmapped DCC has to clear a rectangular + * area of DCC for level 0 (because the whole miptree is + * organized in a 2D plane). + */ + if (sctx->chip_class >= GFX9 && tex->buffer.b.b.last_level > 0) + continue; + + /* the clear is allowed if all layers are bound */ + if (fb->cbufs[i]->u.tex.first_layer != 0 || + fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->buffer.b.b, 0)) { + continue; + } + + /* only supported on tiled surfaces */ + if (tex->surface.is_linear) { + continue; + } + + /* shared textures can't use fast clear without an explicit flush, + * because there is no way to communicate the clear color among + * all clients + */ + if (tex->buffer.b.is_shared && + !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) + continue; + + if (sctx->chip_class <= GFX8 && tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D && + !sctx->screen->info.htile_cmask_support_1d_tiling) + continue; + + /* Use a slow clear for small surfaces where the cost of + * the eliminate pass can be higher than the benefit of fast + * clear. The closed driver does this, but the numbers may differ. + * + * This helps on both dGPUs and APUs, even small APUs like Mullins. + */ + bool too_small = tex->buffer.b.b.nr_samples <= 1 && + tex->buffer.b.b.width0 * tex->buffer.b.b.height0 <= 512 * 512; + bool eliminate_needed = false; + bool fmask_decompress_needed = false; + + /* Fast clear is the most appropriate place to enable DCC for + * displayable surfaces. + */ + if (sctx->family == CHIP_STONEY && !too_small) { + vi_separate_dcc_try_enable(sctx, tex); + + /* RB+ isn't supported with a CMASK clear only on Stoney, + * so all clears are considered to be hypothetically slow + * clears, which is weighed when determining whether to + * enable separate DCC. + */ + if (tex->dcc_gather_statistics) /* only for Stoney */ + tex->num_slow_clears++; + } + + /* Try to clear DCC first, otherwise try CMASK. */ + if (vi_dcc_enabled(tex, 0)) { + uint32_t reset_value; + + if (sctx->screen->debug_flags & DBG(NO_DCC_CLEAR)) + continue; + + if (!vi_get_fast_clear_parameters(sctx->screen, tex->buffer.b.b.format, + fb->cbufs[i]->format, color, &reset_value, + &eliminate_needed)) + continue; + + if (eliminate_needed && too_small) + continue; + + /* TODO: This DCC+CMASK clear doesn't work with MSAA. */ + if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer && eliminate_needed) + continue; + + if (!vi_dcc_clear_level(sctx, tex, 0, reset_value)) + continue; + + tex->separate_dcc_dirty = true; + tex->displayable_dcc_dirty = true; + + /* DCC fast clear with MSAA should clear CMASK to 0xC. 
*/ + if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) { + uint32_t clear_value = 0xCCCCCCCC; + si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->surface.cmask_offset, + tex->surface.cmask_size, &clear_value, 4, SI_COHERENCY_CB_META, false); + fmask_decompress_needed = true; + } + } else { + if (too_small) + continue; + + /* 128-bit formats are unusupported */ + if (tex->surface.bpe > 8) { + continue; + } + + /* RB+ doesn't work with CMASK fast clear on Stoney. */ + if (sctx->family == CHIP_STONEY) + continue; + + /* ensure CMASK is enabled */ + si_alloc_separate_cmask(sctx->screen, tex); + if (!tex->cmask_buffer) + continue; + + /* Do the fast clear. */ + uint32_t clear_value = 0; + si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->surface.cmask_offset, + tex->surface.cmask_size, &clear_value, 4, SI_COHERENCY_CB_META, false); + eliminate_needed = true; + } + + if ((eliminate_needed || fmask_decompress_needed) && + !(tex->dirty_level_mask & (1 << level))) { + tex->dirty_level_mask |= 1 << level; + p_atomic_inc(&sctx->screen->compressed_colortex_counter); + } + + /* We can change the micro tile mode before a full clear. */ + si_set_optimal_micro_tile_mode(sctx->screen, tex); + + *buffers &= ~clear_bit; + + /* Chips with DCC constant encoding don't need to set the clear + * color registers for DCC clear values 0 and 1. + */ + if (sctx->screen->info.has_dcc_constant_encode && !eliminate_needed) + continue; + + if (si_set_clear_color(tex, fb->cbufs[i]->format, color)) { + sctx->framebuffer.dirty_cbufs |= 1 << i; + si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + } + } } static void si_clear(struct pipe_context *ctx, unsigned buffers, - const union pipe_color_union *color, - double depth, unsigned stencil) + const union pipe_color_union *color, double depth, unsigned stencil) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_framebuffer_state *fb = &sctx->framebuffer.state; - struct pipe_surface *zsbuf = fb->zsbuf; - struct si_texture *zstex = - zsbuf ? (struct si_texture*)zsbuf->texture : NULL; - bool needs_db_flush = false; - - if (buffers & PIPE_CLEAR_COLOR) { - si_do_fast_color_clear(sctx, &buffers, color); - if (!buffers) - return; /* all buffers have been fast cleared */ - - /* These buffers cannot use fast clear, make sure to disable expansion. */ - for (unsigned i = 0; i < fb->nr_cbufs; i++) { - struct si_texture *tex; - - /* If not clearing this buffer, skip. */ - if (!(buffers & (PIPE_CLEAR_COLOR0 << i)) || !fb->cbufs[i]) - continue; - - tex = (struct si_texture *)fb->cbufs[i]->texture; - if (tex->surface.fmask_size == 0) - tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level); - } - } - - if (zstex && - zsbuf->u.tex.first_layer == 0 && - zsbuf->u.tex.last_layer == util_max_layer(&zstex->buffer.b.b, 0)) { - /* TC-compatible HTILE only supports depth clears to 0 or 1. */ - if (buffers & PIPE_CLEAR_DEPTH && - si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_Z) && - (!zstex->tc_compatible_htile || - depth == 0 || depth == 1)) { - /* Need to disable EXPCLEAR temporarily if clearing - * to a new value. */ - if (!zstex->depth_cleared || zstex->depth_clear_value != depth) { - sctx->db_depth_disable_expclear = true; - } - - if (zstex->depth_clear_value != (float)depth) { - if ((zstex->depth_clear_value != 0) != (depth != 0)) { - /* ZRANGE_PRECISION register of a bound surface will change so we - * must flush the DB caches. */ - needs_db_flush = true; - } - /* Update DB_DEPTH_CLEAR. 
*/ - zstex->depth_clear_value = depth; - sctx->framebuffer.dirty_zsbuf = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); - } - sctx->db_depth_clear = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } - - /* TC-compatible HTILE only supports stencil clears to 0. */ - if (buffers & PIPE_CLEAR_STENCIL && - si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_S) && - (!zstex->tc_compatible_htile || stencil == 0)) { - stencil &= 0xff; - - /* Need to disable EXPCLEAR temporarily if clearing - * to a new value. */ - if (!zstex->stencil_cleared || zstex->stencil_clear_value != stencil) { - sctx->db_stencil_disable_expclear = true; - } - - if (zstex->stencil_clear_value != (uint8_t)stencil) { - /* Update DB_STENCIL_CLEAR. */ - zstex->stencil_clear_value = stencil; - sctx->framebuffer.dirty_zsbuf = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); - } - sctx->db_stencil_clear = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } - - if (needs_db_flush) - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB; - } - - si_blitter_begin(sctx, SI_CLEAR); - util_blitter_clear(sctx->blitter, fb->width, fb->height, - util_framebuffer_get_num_layers(fb), - buffers, color, depth, stencil, - sctx->framebuffer.nr_samples > 1); - si_blitter_end(sctx); - - if (sctx->db_depth_clear) { - sctx->db_depth_clear = false; - sctx->db_depth_disable_expclear = false; - zstex->depth_cleared = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } - - if (sctx->db_stencil_clear) { - sctx->db_stencil_clear = false; - sctx->db_stencil_disable_expclear = false; - zstex->stencil_cleared = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_framebuffer_state *fb = &sctx->framebuffer.state; + struct pipe_surface *zsbuf = fb->zsbuf; + struct si_texture *zstex = zsbuf ? (struct si_texture *)zsbuf->texture : NULL; + bool needs_db_flush = false; + + if (buffers & PIPE_CLEAR_COLOR) { + si_do_fast_color_clear(sctx, &buffers, color); + if (!buffers) + return; /* all buffers have been fast cleared */ + + /* These buffers cannot use fast clear, make sure to disable expansion. */ + for (unsigned i = 0; i < fb->nr_cbufs; i++) { + struct si_texture *tex; + + /* If not clearing this buffer, skip. */ + if (!(buffers & (PIPE_CLEAR_COLOR0 << i)) || !fb->cbufs[i]) + continue; + + tex = (struct si_texture *)fb->cbufs[i]->texture; + if (tex->surface.fmask_size == 0) + tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level); + } + } + + if (zstex && zsbuf->u.tex.first_layer == 0 && + zsbuf->u.tex.last_layer == util_max_layer(&zstex->buffer.b.b, 0)) { + /* TC-compatible HTILE only supports depth clears to 0 or 1. */ + if (buffers & PIPE_CLEAR_DEPTH && si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_Z) && + (!zstex->tc_compatible_htile || depth == 0 || depth == 1)) { + /* Need to disable EXPCLEAR temporarily if clearing + * to a new value. */ + if (!zstex->depth_cleared || zstex->depth_clear_value != depth) { + sctx->db_depth_disable_expclear = true; + } + + if (zstex->depth_clear_value != (float)depth) { + if ((zstex->depth_clear_value != 0) != (depth != 0)) { + /* ZRANGE_PRECISION register of a bound surface will change so we + * must flush the DB caches. */ + needs_db_flush = true; + } + /* Update DB_DEPTH_CLEAR. 
*/ + zstex->depth_clear_value = depth; + sctx->framebuffer.dirty_zsbuf = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + } + sctx->db_depth_clear = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } + + /* TC-compatible HTILE only supports stencil clears to 0. */ + if (buffers & PIPE_CLEAR_STENCIL && + si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_S) && + (!zstex->tc_compatible_htile || stencil == 0)) { + stencil &= 0xff; + + /* Need to disable EXPCLEAR temporarily if clearing + * to a new value. */ + if (!zstex->stencil_cleared || zstex->stencil_clear_value != stencil) { + sctx->db_stencil_disable_expclear = true; + } + + if (zstex->stencil_clear_value != (uint8_t)stencil) { + /* Update DB_STENCIL_CLEAR. */ + zstex->stencil_clear_value = stencil; + sctx->framebuffer.dirty_zsbuf = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + } + sctx->db_stencil_clear = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } + + if (needs_db_flush) + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB; + } + + si_blitter_begin(sctx, SI_CLEAR); + util_blitter_clear(sctx->blitter, fb->width, fb->height, util_framebuffer_get_num_layers(fb), + buffers, color, depth, stencil, sctx->framebuffer.nr_samples > 1); + si_blitter_end(sctx); + + if (sctx->db_depth_clear) { + sctx->db_depth_clear = false; + sctx->db_depth_disable_expclear = false; + zstex->depth_cleared = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } + + if (sctx->db_stencil_clear) { + sctx->db_stencil_clear = false; + sctx->db_stencil_disable_expclear = false; + zstex->stencil_cleared = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } } -static void si_clear_render_target(struct pipe_context *ctx, - struct pipe_surface *dst, - const union pipe_color_union *color, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, - bool render_condition_enabled) +static void si_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dst, + const union pipe_color_union *color, unsigned dstx, + unsigned dsty, unsigned width, unsigned height, + bool render_condition_enabled) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture *sdst = (struct si_texture*)dst->texture; - - if (dst->texture->nr_samples <= 1 && !sdst->surface.dcc_offset) { - si_compute_clear_render_target(ctx, dst, color, dstx, dsty, width, - height, render_condition_enabled); - return; - } - - si_blitter_begin(sctx, SI_CLEAR_SURFACE | - (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND)); - util_blitter_clear_render_target(sctx->blitter, dst, color, - dstx, dsty, width, height); - si_blitter_end(sctx); + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *sdst = (struct si_texture *)dst->texture; + + if (dst->texture->nr_samples <= 1 && !sdst->surface.dcc_offset) { + si_compute_clear_render_target(ctx, dst, color, dstx, dsty, width, height, + render_condition_enabled); + return; + } + + si_blitter_begin(sctx, + SI_CLEAR_SURFACE | (render_condition_enabled ? 
0 : SI_DISABLE_RENDER_COND)); + util_blitter_clear_render_target(sctx->blitter, dst, color, dstx, dsty, width, height); + si_blitter_end(sctx); } -static void si_clear_depth_stencil(struct pipe_context *ctx, - struct pipe_surface *dst, - unsigned clear_flags, - double depth, - unsigned stencil, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, - bool render_condition_enabled) +static void si_clear_depth_stencil(struct pipe_context *ctx, struct pipe_surface *dst, + unsigned clear_flags, double depth, unsigned stencil, + unsigned dstx, unsigned dsty, unsigned width, unsigned height, + bool render_condition_enabled) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - si_blitter_begin(sctx, SI_CLEAR_SURFACE | - (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND)); - util_blitter_clear_depth_stencil(sctx->blitter, dst, clear_flags, depth, stencil, - dstx, dsty, width, height); - si_blitter_end(sctx); + si_blitter_begin(sctx, + SI_CLEAR_SURFACE | (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND)); + util_blitter_clear_depth_stencil(sctx->blitter, dst, clear_flags, depth, stencil, dstx, dsty, + width, height); + si_blitter_end(sctx); } -static void si_clear_texture(struct pipe_context *pipe, - struct pipe_resource *tex, - unsigned level, - const struct pipe_box *box, - const void *data) +static void si_clear_texture(struct pipe_context *pipe, struct pipe_resource *tex, unsigned level, + const struct pipe_box *box, const void *data) { - struct pipe_screen *screen = pipe->screen; - struct si_texture *stex = (struct si_texture*)tex; - struct pipe_surface tmpl = {{0}}; - struct pipe_surface *sf; - - tmpl.format = tex->format; - tmpl.u.tex.first_layer = box->z; - tmpl.u.tex.last_layer = box->z + box->depth - 1; - tmpl.u.tex.level = level; - sf = pipe->create_surface(pipe, tex, &tmpl); - if (!sf) - return; - - if (stex->is_depth) { - unsigned clear; - float depth; - uint8_t stencil = 0; - - /* Depth is always present. */ - clear = PIPE_CLEAR_DEPTH; - util_format_unpack_z_float(tex->format, &depth, data, 1); - - if (stex->surface.has_stencil) { - clear |= PIPE_CLEAR_STENCIL; - util_format_unpack_s_8uint(tex->format, - &stencil, data, 1); - } - - si_clear_depth_stencil(pipe, sf, clear, depth, stencil, - box->x, box->y, - box->width, box->height, false); - } else { - union pipe_color_union color; - - util_format_unpack_rgba(tex->format, color.ui, data, 1); - - if (screen->is_format_supported(screen, tex->format, - tex->target, 0, 0, - PIPE_BIND_RENDER_TARGET)) { - si_clear_render_target(pipe, sf, &color, - box->x, box->y, - box->width, box->height, false); - } else { - /* Software fallback - just for R9G9B9E5_FLOAT */ - util_clear_render_target(pipe, sf, &color, - box->x, box->y, - box->width, box->height); - } - } - pipe_surface_reference(&sf, NULL); + struct pipe_screen *screen = pipe->screen; + struct si_texture *stex = (struct si_texture *)tex; + struct pipe_surface tmpl = {{0}}; + struct pipe_surface *sf; + + tmpl.format = tex->format; + tmpl.u.tex.first_layer = box->z; + tmpl.u.tex.last_layer = box->z + box->depth - 1; + tmpl.u.tex.level = level; + sf = pipe->create_surface(pipe, tex, &tmpl); + if (!sf) + return; + + if (stex->is_depth) { + unsigned clear; + float depth; + uint8_t stencil = 0; + + /* Depth is always present. 
*/ + clear = PIPE_CLEAR_DEPTH; + util_format_unpack_z_float(tex->format, &depth, data, 1); + + if (stex->surface.has_stencil) { + clear |= PIPE_CLEAR_STENCIL; + util_format_unpack_s_8uint(tex->format, &stencil, data, 1); + } + + si_clear_depth_stencil(pipe, sf, clear, depth, stencil, box->x, box->y, box->width, + box->height, false); + } else { + union pipe_color_union color; + + util_format_unpack_rgba(tex->format, color.ui, data, 1); + + if (screen->is_format_supported(screen, tex->format, tex->target, 0, 0, + PIPE_BIND_RENDER_TARGET)) { + si_clear_render_target(pipe, sf, &color, box->x, box->y, box->width, box->height, false); + } else { + /* Software fallback - just for R9G9B9E5_FLOAT */ + util_clear_render_target(pipe, sf, &color, box->x, box->y, box->width, box->height); + } + } + pipe_surface_reference(&sf, NULL); } void si_init_clear_functions(struct si_context *sctx) { - sctx->b.clear_render_target = si_clear_render_target; - sctx->b.clear_texture = si_clear_texture; + sctx->b.clear_render_target = si_clear_render_target; + sctx->b.clear_texture = si_clear_texture; - if (sctx->has_graphics) { - sctx->b.clear = si_clear; - sctx->b.clear_depth_stencil = si_clear_depth_stencil; - } + if (sctx->has_graphics) { + sctx->b.clear = si_clear; + sctx->b.clear_depth_stencil = si_clear_depth_stencil; + } } diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 610c1333597..5dca5730a58 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -23,972 +23,892 @@ * */ -#include "nir/tgsi_to_nir.h" -#include "util/u_async_debug.h" -#include "util/u_memory.h" -#include "util/u_upload_mgr.h" +#include "si_compute.h" #include "ac_rtld.h" #include "amd_kernel_code_t.h" +#include "nir/tgsi_to_nir.h" #include "si_build_pm4.h" -#include "si_compute.h" +#include "util/u_async_debug.h" +#include "util/u_memory.h" +#include "util/u_upload_mgr.h" -#define COMPUTE_DBG(sscreen, fmt, args...) \ - do { \ - if ((sscreen->debug_flags & DBG(COMPUTE))) fprintf(stderr, fmt, ##args); \ - } while (0); +#define COMPUTE_DBG(sscreen, fmt, args...) 
\ + do { \ + if ((sscreen->debug_flags & DBG(COMPUTE))) \ + fprintf(stderr, fmt, ##args); \ + } while (0); struct dispatch_packet { - uint16_t header; - uint16_t setup; - uint16_t workgroup_size_x; - uint16_t workgroup_size_y; - uint16_t workgroup_size_z; - uint16_t reserved0; - uint32_t grid_size_x; - uint32_t grid_size_y; - uint32_t grid_size_z; - uint32_t private_segment_size; - uint32_t group_segment_size; - uint64_t kernel_object; - uint64_t kernarg_address; - uint64_t reserved2; + uint16_t header; + uint16_t setup; + uint16_t workgroup_size_x; + uint16_t workgroup_size_y; + uint16_t workgroup_size_z; + uint16_t reserved0; + uint32_t grid_size_x; + uint32_t grid_size_y; + uint32_t grid_size_z; + uint32_t private_segment_size; + uint32_t group_segment_size; + uint64_t kernel_object; + uint64_t kernarg_address; + uint64_t reserved2; }; -static const amd_kernel_code_t *si_compute_get_code_object( - const struct si_compute *program, - uint64_t symbol_offset) +static const amd_kernel_code_t *si_compute_get_code_object(const struct si_compute *program, + uint64_t symbol_offset) { - const struct si_shader_selector *sel = &program->sel; + const struct si_shader_selector *sel = &program->sel; - if (program->ir_type != PIPE_SHADER_IR_NATIVE) - return NULL; + if (program->ir_type != PIPE_SHADER_IR_NATIVE) + return NULL; - struct ac_rtld_binary rtld; - if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ - .info = &sel->screen->info, - .shader_type = MESA_SHADER_COMPUTE, - .wave_size = sel->screen->compute_wave_size, - .num_parts = 1, - .elf_ptrs = &program->shader.binary.elf_buffer, - .elf_sizes = &program->shader.binary.elf_size })) - return NULL; + struct ac_rtld_binary rtld; + if (!ac_rtld_open(&rtld, + (struct ac_rtld_open_info){.info = &sel->screen->info, + .shader_type = MESA_SHADER_COMPUTE, + .wave_size = sel->screen->compute_wave_size, + .num_parts = 1, + .elf_ptrs = &program->shader.binary.elf_buffer, + .elf_sizes = &program->shader.binary.elf_size})) + return NULL; - const amd_kernel_code_t *result = NULL; - const char *text; - size_t size; - if (!ac_rtld_get_section_by_name(&rtld, ".text", &text, &size)) - goto out; + const amd_kernel_code_t *result = NULL; + const char *text; + size_t size; + if (!ac_rtld_get_section_by_name(&rtld, ".text", &text, &size)) + goto out; - if (symbol_offset + sizeof(amd_kernel_code_t) > size) - goto out; + if (symbol_offset + sizeof(amd_kernel_code_t) > size) + goto out; - result = (const amd_kernel_code_t*)(text + symbol_offset); + result = (const amd_kernel_code_t *)(text + symbol_offset); out: - ac_rtld_close(&rtld); - return result; + ac_rtld_close(&rtld); + return result; } static void code_object_to_config(const amd_kernel_code_t *code_object, - struct ac_shader_config *out_config) { - - uint32_t rsrc1 = code_object->compute_pgm_resource_registers; - uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32; - out_config->num_sgprs = code_object->wavefront_sgpr_count; - out_config->num_vgprs = code_object->workitem_vgpr_count; - out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1); - out_config->rsrc1 = rsrc1; - out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2)); - out_config->rsrc2 = rsrc2; - out_config->scratch_bytes_per_wave = - align(code_object->workitem_private_segment_byte_size * 64, 1024); + struct ac_shader_config *out_config) +{ + + uint32_t rsrc1 = code_object->compute_pgm_resource_registers; + uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32; + out_config->num_sgprs = 
code_object->wavefront_sgpr_count; + out_config->num_vgprs = code_object->workitem_vgpr_count; + out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1); + out_config->rsrc1 = rsrc1; + out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2)); + out_config->rsrc2 = rsrc2; + out_config->scratch_bytes_per_wave = + align(code_object->workitem_private_segment_byte_size * 64, 1024); } /* Asynchronous compute shader compilation. */ static void si_create_compute_state_async(void *job, int thread_index) { - struct si_compute *program = (struct si_compute *)job; - struct si_shader_selector *sel = &program->sel; - struct si_shader *shader = &program->shader; - struct ac_llvm_compiler *compiler; - struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug; - struct si_screen *sscreen = sel->screen; - - assert(!debug->debug_message || debug->async); - assert(thread_index >= 0); - assert(thread_index < ARRAY_SIZE(sscreen->compiler)); - compiler = &sscreen->compiler[thread_index]; - - if (!compiler->passes) - si_init_compiler(sscreen, compiler); - - assert(program->ir_type == PIPE_SHADER_IR_NIR); - si_nir_scan_shader(sel->nir, &sel->info); - - /* Store the declared LDS size into si_shader_info for the shader - * cache to include it. - */ - sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE] = program->local_size; - - si_get_active_slot_masks(&sel->info, - &sel->active_const_and_shader_buffers, - &sel->active_samplers_and_images); - - program->shader.is_monolithic = true; - program->reads_variable_block_size = - sel->info.uses_block_size && - sel->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0; - program->num_cs_user_data_dwords = - sel->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD]; - - unsigned char ir_sha1_cache_key[20]; - si_get_ir_cache_key(sel, false, false, ir_sha1_cache_key); - - /* Try to load the shader from the shader cache. */ - simple_mtx_lock(&sscreen->shader_cache_mutex); - - if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) { - simple_mtx_unlock(&sscreen->shader_cache_mutex); - - si_shader_dump_stats_for_shader_db(sscreen, shader, debug); - si_shader_dump(sscreen, shader, debug, stderr, true); - - if (!si_shader_binary_upload(sscreen, shader, 0)) - program->shader.compilation_failed = true; - } else { - simple_mtx_unlock(&sscreen->shader_cache_mutex); - - if (!si_create_shader_variant(sscreen, compiler, &program->shader, debug)) { - program->shader.compilation_failed = true; - return; - } - - bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0; - unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + - (sel->info.uses_grid_size ? 3 : 0) + - (program->reads_variable_block_size ? 3 : 0) + - program->num_cs_user_data_dwords; - - shader->config.rsrc1 = - S_00B848_VGPRS((shader->config.num_vgprs - 1) / - (sscreen->compute_wave_size == 32 ? 8 : 4)) | - S_00B848_DX10_CLAMP(1) | - S_00B848_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | - S_00B848_WGP_MODE(sscreen->info.chip_class >= GFX10) | - S_00B848_FLOAT_MODE(shader->config.float_mode); - - if (sscreen->info.chip_class < GFX10) { - shader->config.rsrc1 |= - S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8); - } - - shader->config.rsrc2 = - S_00B84C_USER_SGPR(user_sgprs) | - S_00B84C_SCRATCH_EN(scratch_enabled) | - S_00B84C_TGID_X_EN(sel->info.uses_block_id[0]) | - S_00B84C_TGID_Y_EN(sel->info.uses_block_id[1]) | - S_00B84C_TGID_Z_EN(sel->info.uses_block_id[2]) | - S_00B84C_TG_SIZE_EN(sel->info.uses_subgroup_info) | - S_00B84C_TIDIG_COMP_CNT(sel->info.uses_thread_id[2] ? 
2 : - sel->info.uses_thread_id[1] ? 1 : 0) | - S_00B84C_LDS_SIZE(shader->config.lds_size); - - simple_mtx_lock(&sscreen->shader_cache_mutex); - si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, - shader, true); - simple_mtx_unlock(&sscreen->shader_cache_mutex); - } - - ralloc_free(sel->nir); - sel->nir = NULL; + struct si_compute *program = (struct si_compute *)job; + struct si_shader_selector *sel = &program->sel; + struct si_shader *shader = &program->shader; + struct ac_llvm_compiler *compiler; + struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug; + struct si_screen *sscreen = sel->screen; + + assert(!debug->debug_message || debug->async); + assert(thread_index >= 0); + assert(thread_index < ARRAY_SIZE(sscreen->compiler)); + compiler = &sscreen->compiler[thread_index]; + + if (!compiler->passes) + si_init_compiler(sscreen, compiler); + + assert(program->ir_type == PIPE_SHADER_IR_NIR); + si_nir_scan_shader(sel->nir, &sel->info); + + /* Store the declared LDS size into si_shader_info for the shader + * cache to include it. + */ + sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE] = program->local_size; + + si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers, + &sel->active_samplers_and_images); + + program->shader.is_monolithic = true; + program->reads_variable_block_size = + sel->info.uses_block_size && sel->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0; + program->num_cs_user_data_dwords = + sel->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD]; + + unsigned char ir_sha1_cache_key[20]; + si_get_ir_cache_key(sel, false, false, ir_sha1_cache_key); + + /* Try to load the shader from the shader cache. */ + simple_mtx_lock(&sscreen->shader_cache_mutex); + + if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) { + simple_mtx_unlock(&sscreen->shader_cache_mutex); + + si_shader_dump_stats_for_shader_db(sscreen, shader, debug); + si_shader_dump(sscreen, shader, debug, stderr, true); + + if (!si_shader_binary_upload(sscreen, shader, 0)) + program->shader.compilation_failed = true; + } else { + simple_mtx_unlock(&sscreen->shader_cache_mutex); + + if (!si_create_shader_variant(sscreen, compiler, &program->shader, debug)) { + program->shader.compilation_failed = true; + return; + } + + bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0; + unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + (sel->info.uses_grid_size ? 3 : 0) + + (program->reads_variable_block_size ? 3 : 0) + + program->num_cs_user_data_dwords; + + shader->config.rsrc1 = S_00B848_VGPRS((shader->config.num_vgprs - 1) / + (sscreen->compute_wave_size == 32 ? 8 : 4)) | + S_00B848_DX10_CLAMP(1) | + S_00B848_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | + S_00B848_WGP_MODE(sscreen->info.chip_class >= GFX10) | + S_00B848_FLOAT_MODE(shader->config.float_mode); + + if (sscreen->info.chip_class < GFX10) { + shader->config.rsrc1 |= S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8); + } + + shader->config.rsrc2 = S_00B84C_USER_SGPR(user_sgprs) | S_00B84C_SCRATCH_EN(scratch_enabled) | + S_00B84C_TGID_X_EN(sel->info.uses_block_id[0]) | + S_00B84C_TGID_Y_EN(sel->info.uses_block_id[1]) | + S_00B84C_TGID_Z_EN(sel->info.uses_block_id[2]) | + S_00B84C_TG_SIZE_EN(sel->info.uses_subgroup_info) | + S_00B84C_TIDIG_COMP_CNT(sel->info.uses_thread_id[2] + ? 2 + : sel->info.uses_thread_id[1] ? 
1 : 0) | + S_00B84C_LDS_SIZE(shader->config.lds_size); + + simple_mtx_lock(&sscreen->shader_cache_mutex); + si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, true); + simple_mtx_unlock(&sscreen->shader_cache_mutex); + } + + ralloc_free(sel->nir); + sel->nir = NULL; } -static void *si_create_compute_state( - struct pipe_context *ctx, - const struct pipe_compute_state *cso) +static void *si_create_compute_state(struct pipe_context *ctx, const struct pipe_compute_state *cso) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_screen *sscreen = (struct si_screen *)ctx->screen; - struct si_compute *program = CALLOC_STRUCT(si_compute); - struct si_shader_selector *sel = &program->sel; - - pipe_reference_init(&sel->base.reference, 1); - sel->type = PIPE_SHADER_COMPUTE; - sel->screen = sscreen; - program->shader.selector = &program->sel; - program->ir_type = cso->ir_type; - program->local_size = cso->req_local_mem; - program->private_size = cso->req_private_mem; - program->input_size = cso->req_input_mem; - - if (cso->ir_type != PIPE_SHADER_IR_NATIVE) { - if (cso->ir_type == PIPE_SHADER_IR_TGSI) { - program->ir_type = PIPE_SHADER_IR_NIR; - sel->nir = tgsi_to_nir(cso->prog, ctx->screen); - } else { - assert(cso->ir_type == PIPE_SHADER_IR_NIR); - sel->nir = (struct nir_shader *) cso->prog; - } - - sel->compiler_ctx_state.debug = sctx->debug; - sel->compiler_ctx_state.is_debug_context = sctx->is_debug; - p_atomic_inc(&sscreen->num_shaders_created); - - si_schedule_initial_compile(sctx, PIPE_SHADER_COMPUTE, - &sel->ready, - &sel->compiler_ctx_state, - program, si_create_compute_state_async); - } else { - const struct pipe_binary_program_header *header; - header = cso->prog; - - program->shader.binary.elf_size = header->num_bytes; - program->shader.binary.elf_buffer = malloc(header->num_bytes); - if (!program->shader.binary.elf_buffer) { - FREE(program); - return NULL; - } - memcpy((void *)program->shader.binary.elf_buffer, header->blob, header->num_bytes); - - const amd_kernel_code_t *code_object = - si_compute_get_code_object(program, 0); - code_object_to_config(code_object, &program->shader.config); - - si_shader_dump(sctx->screen, &program->shader, &sctx->debug, stderr, true); - if (!si_shader_binary_upload(sctx->screen, &program->shader, 0)) { - fprintf(stderr, "LLVM failed to upload shader\n"); - free((void *)program->shader.binary.elf_buffer); - FREE(program); - return NULL; - } - } - - return program; + struct si_context *sctx = (struct si_context *)ctx; + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + struct si_compute *program = CALLOC_STRUCT(si_compute); + struct si_shader_selector *sel = &program->sel; + + pipe_reference_init(&sel->base.reference, 1); + sel->type = PIPE_SHADER_COMPUTE; + sel->screen = sscreen; + program->shader.selector = &program->sel; + program->ir_type = cso->ir_type; + program->local_size = cso->req_local_mem; + program->private_size = cso->req_private_mem; + program->input_size = cso->req_input_mem; + + if (cso->ir_type != PIPE_SHADER_IR_NATIVE) { + if (cso->ir_type == PIPE_SHADER_IR_TGSI) { + program->ir_type = PIPE_SHADER_IR_NIR; + sel->nir = tgsi_to_nir(cso->prog, ctx->screen); + } else { + assert(cso->ir_type == PIPE_SHADER_IR_NIR); + sel->nir = (struct nir_shader *)cso->prog; + } + + sel->compiler_ctx_state.debug = sctx->debug; + sel->compiler_ctx_state.is_debug_context = sctx->is_debug; + p_atomic_inc(&sscreen->num_shaders_created); + + si_schedule_initial_compile(sctx, PIPE_SHADER_COMPUTE, &sel->ready, 
&sel->compiler_ctx_state, + program, si_create_compute_state_async); + } else { + const struct pipe_binary_program_header *header; + header = cso->prog; + + program->shader.binary.elf_size = header->num_bytes; + program->shader.binary.elf_buffer = malloc(header->num_bytes); + if (!program->shader.binary.elf_buffer) { + FREE(program); + return NULL; + } + memcpy((void *)program->shader.binary.elf_buffer, header->blob, header->num_bytes); + + const amd_kernel_code_t *code_object = si_compute_get_code_object(program, 0); + code_object_to_config(code_object, &program->shader.config); + + si_shader_dump(sctx->screen, &program->shader, &sctx->debug, stderr, true); + if (!si_shader_binary_upload(sctx->screen, &program->shader, 0)) { + fprintf(stderr, "LLVM failed to upload shader\n"); + free((void *)program->shader.binary.elf_buffer); + FREE(program); + return NULL; + } + } + + return program; } static void si_bind_compute_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_compute *program = (struct si_compute*)state; - struct si_shader_selector *sel = &program->sel; - - sctx->cs_shader_state.program = program; - if (!program) - return; - - /* Wait because we need active slot usage masks. */ - if (program->ir_type != PIPE_SHADER_IR_NATIVE) - util_queue_fence_wait(&sel->ready); - - si_set_active_descriptors(sctx, - SI_DESCS_FIRST_COMPUTE + - SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, - sel->active_const_and_shader_buffers); - si_set_active_descriptors(sctx, - SI_DESCS_FIRST_COMPUTE + - SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, - sel->active_samplers_and_images); + struct si_context *sctx = (struct si_context *)ctx; + struct si_compute *program = (struct si_compute *)state; + struct si_shader_selector *sel = &program->sel; + + sctx->cs_shader_state.program = program; + if (!program) + return; + + /* Wait because we need active slot usage masks. 
*/ + if (program->ir_type != PIPE_SHADER_IR_NATIVE) + util_queue_fence_wait(&sel->ready); + + si_set_active_descriptors(sctx, + SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, + sel->active_const_and_shader_buffers); + si_set_active_descriptors(sctx, SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, + sel->active_samplers_and_images); } -static void si_set_global_binding( - struct pipe_context *ctx, unsigned first, unsigned n, - struct pipe_resource **resources, - uint32_t **handles) +static void si_set_global_binding(struct pipe_context *ctx, unsigned first, unsigned n, + struct pipe_resource **resources, uint32_t **handles) { - unsigned i; - struct si_context *sctx = (struct si_context*)ctx; - struct si_compute *program = sctx->cs_shader_state.program; - - if (first + n > program->max_global_buffers) { - unsigned old_max = program->max_global_buffers; - program->max_global_buffers = first + n; - program->global_buffers = - realloc(program->global_buffers, - program->max_global_buffers * - sizeof(program->global_buffers[0])); - if (!program->global_buffers) { - fprintf(stderr, "radeonsi: failed to allocate compute global_buffers\n"); - return; - } - - memset(&program->global_buffers[old_max], 0, - (program->max_global_buffers - old_max) * - sizeof(program->global_buffers[0])); - } - - if (!resources) { - for (i = 0; i < n; i++) { - pipe_resource_reference(&program->global_buffers[first + i], NULL); - } - return; - } - - for (i = 0; i < n; i++) { - uint64_t va; - uint32_t offset; - pipe_resource_reference(&program->global_buffers[first + i], resources[i]); - va = si_resource(resources[i])->gpu_address; - offset = util_le32_to_cpu(*handles[i]); - va += offset; - va = util_cpu_to_le64(va); - memcpy(handles[i], &va, sizeof(va)); - } + unsigned i; + struct si_context *sctx = (struct si_context *)ctx; + struct si_compute *program = sctx->cs_shader_state.program; + + if (first + n > program->max_global_buffers) { + unsigned old_max = program->max_global_buffers; + program->max_global_buffers = first + n; + program->global_buffers = realloc( + program->global_buffers, program->max_global_buffers * sizeof(program->global_buffers[0])); + if (!program->global_buffers) { + fprintf(stderr, "radeonsi: failed to allocate compute global_buffers\n"); + return; + } + + memset(&program->global_buffers[old_max], 0, + (program->max_global_buffers - old_max) * sizeof(program->global_buffers[0])); + } + + if (!resources) { + for (i = 0; i < n; i++) { + pipe_resource_reference(&program->global_buffers[first + i], NULL); + } + return; + } + + for (i = 0; i < n; i++) { + uint64_t va; + uint32_t offset; + pipe_resource_reference(&program->global_buffers[first + i], resources[i]); + va = si_resource(resources[i])->gpu_address; + offset = util_le32_to_cpu(*handles[i]); + va += offset; + va = util_cpu_to_le64(va); + memcpy(handles[i], &va, sizeof(va)); + } } void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs) { - uint64_t bc_va; - - radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); - /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1, - * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. 
*/ - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); - - if (sctx->chip_class >= GFX7) { - /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */ - radeon_set_sh_reg_seq(cs, - R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | - S_00B858_SH1_CU_EN(0xffff)); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | - S_00B858_SH1_CU_EN(0xffff)); - } - - if (sctx->chip_class >= GFX10) - radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0); - - /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID - * and is now per pipe, so it should be handled in the - * kernel if we want to use something other than the default value, - * which is now 0x22f. - */ - if (sctx->chip_class <= GFX6) { - /* XXX: This should be: - * (number of compute units) * 4 * (waves per simd) - 1 */ - - radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, - 0x190 /* Default value */); - } - - /* Set the pointer to border colors. */ - bc_va = sctx->border_color_buffer->gpu_address; - - if (sctx->chip_class >= GFX7) { - radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2); - radeon_emit(cs, bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */ - radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */ - } else { - if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) { - radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, - bc_va >> 8); - } - } + uint64_t bc_va; + + radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); + /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1, + * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */ + radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + + if (sctx->chip_class >= GFX7) { + /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */ + radeon_set_sh_reg_seq(cs, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); + radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + } + + if (sctx->chip_class >= GFX10) + radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0); + + /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID + * and is now per pipe, so it should be handled in the + * kernel if we want to use something other than the default value, + * which is now 0x22f. + */ + if (sctx->chip_class <= GFX6) { + /* XXX: This should be: + * (number of compute units) * 4 * (waves per simd) - 1 */ + + radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */); + } + + /* Set the pointer to border colors. 
*/ + bc_va = sctx->border_color_buffer->gpu_address; + + if (sctx->chip_class >= GFX7) { + radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2); + radeon_emit(cs, bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */ + radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */ + } else { + if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) { + radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8); + } + } } -static bool si_setup_compute_scratch_buffer(struct si_context *sctx, - struct si_shader *shader, +static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader, struct ac_shader_config *config) { - uint64_t scratch_bo_size, scratch_needed; - scratch_bo_size = 0; - scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves; - if (sctx->compute_scratch_buffer) - scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0; + uint64_t scratch_bo_size, scratch_needed; + scratch_bo_size = 0; + scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves; + if (sctx->compute_scratch_buffer) + scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0; - if (scratch_bo_size < scratch_needed) { - si_resource_reference(&sctx->compute_scratch_buffer, NULL); + if (scratch_bo_size < scratch_needed) { + si_resource_reference(&sctx->compute_scratch_buffer, NULL); - sctx->compute_scratch_buffer = - si_aligned_buffer_create(&sctx->screen->b, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - scratch_needed, - sctx->screen->info.pte_fragment_size); + sctx->compute_scratch_buffer = + si_aligned_buffer_create(&sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, + scratch_needed, sctx->screen->info.pte_fragment_size); - if (!sctx->compute_scratch_buffer) - return false; - } + if (!sctx->compute_scratch_buffer) + return false; + } - if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) { - uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address; + if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) { + uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address; - if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) - return false; + if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) + return false; - si_resource_reference(&shader->scratch_bo, - sctx->compute_scratch_buffer); - } + si_resource_reference(&shader->scratch_bo, sctx->compute_scratch_buffer); + } - return true; + return true; } -static bool si_switch_compute_shader(struct si_context *sctx, - struct si_compute *program, - struct si_shader *shader, - const amd_kernel_code_t *code_object, - unsigned offset) +static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute *program, + struct si_shader *shader, const amd_kernel_code_t *code_object, + unsigned offset) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct ac_shader_config inline_config = {0}; - struct ac_shader_config *config; - uint64_t shader_va; - - if (sctx->cs_shader_state.emitted_program == program && - sctx->cs_shader_state.offset == offset) - return true; - - if (program->ir_type != PIPE_SHADER_IR_NATIVE) { - config = &shader->config; - } else { - unsigned lds_blocks; - - config = &inline_config; - code_object_to_config(code_object, config); - - lds_blocks = config->lds_size; - /* XXX: We are over allocating LDS. 
For GFX6, the shader reports - * LDS in blocks of 256 bytes, so if there are 4 bytes lds - * allocated in the shader and 4 bytes allocated by the state - * tracker, then we will set LDS_SIZE to 512 bytes rather than 256. - */ - if (sctx->chip_class <= GFX6) { - lds_blocks += align(program->local_size, 256) >> 8; - } else { - lds_blocks += align(program->local_size, 512) >> 9; - } - - /* TODO: use si_multiwave_lds_size_workaround */ - assert(lds_blocks <= 0xFF); - - config->rsrc2 &= C_00B84C_LDS_SIZE; - config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks); - } - - if (!si_setup_compute_scratch_buffer(sctx, shader, config)) - return false; - - if (shader->scratch_bo) { - COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; " - "Total Scratch: %u bytes\n", sctx->scratch_waves, - config->scratch_bytes_per_wave, - config->scratch_bytes_per_wave * - sctx->scratch_waves); - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - shader->scratch_bo, RADEON_USAGE_READWRITE, - RADEON_PRIO_SCRATCH_BUFFER); - } - - /* Prefetch the compute shader to TC L2. - * - * We should also prefetch graphics shaders if a compute dispatch was - * the last command, and the compute shader if a draw call was the last - * command. However, that would add more complexity and we're likely - * to get a shader state change in that case anyway. - */ - if (sctx->chip_class >= GFX7) { - cik_prefetch_TC_L2_async(sctx, &program->shader.bo->b.b, - 0, program->shader.bo->b.b.width0); - } - - shader_va = shader->bo->gpu_address + offset; - if (program->ir_type == PIPE_SHADER_IR_NATIVE) { - /* Shader code is placed after the amd_kernel_code_t - * struct. */ - shader_va += sizeof(amd_kernel_code_t); - } - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, shader->bo, - RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, shader_va >> 8); - radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); - - radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit(cs, config->rsrc1); - radeon_emit(cs, config->rsrc2); - - COMPUTE_DBG(sctx->screen, "COMPUTE_PGM_RSRC1: 0x%08x " - "COMPUTE_PGM_RSRC2: 0x%08x\n", config->rsrc1, config->rsrc2); - - sctx->max_seen_compute_scratch_bytes_per_wave = - MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, - config->scratch_bytes_per_wave); - - radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(sctx->scratch_waves) - | S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10)); - - sctx->cs_shader_state.emitted_program = program; - sctx->cs_shader_state.offset = offset; - sctx->cs_shader_state.uses_scratch = - config->scratch_bytes_per_wave != 0; - - return true; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct ac_shader_config inline_config = {0}; + struct ac_shader_config *config; + uint64_t shader_va; + + if (sctx->cs_shader_state.emitted_program == program && sctx->cs_shader_state.offset == offset) + return true; + + if (program->ir_type != PIPE_SHADER_IR_NATIVE) { + config = &shader->config; + } else { + unsigned lds_blocks; + + config = &inline_config; + code_object_to_config(code_object, config); + + lds_blocks = config->lds_size; + /* XXX: We are over allocating LDS. For GFX6, the shader reports + * LDS in blocks of 256 bytes, so if there are 4 bytes lds + * allocated in the shader and 4 bytes allocated by the state + * tracker, then we will set LDS_SIZE to 512 bytes rather than 256. 
+ */ + if (sctx->chip_class <= GFX6) { + lds_blocks += align(program->local_size, 256) >> 8; + } else { + lds_blocks += align(program->local_size, 512) >> 9; + } + + /* TODO: use si_multiwave_lds_size_workaround */ + assert(lds_blocks <= 0xFF); + + config->rsrc2 &= C_00B84C_LDS_SIZE; + config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks); + } + + if (!si_setup_compute_scratch_buffer(sctx, shader, config)) + return false; + + if (shader->scratch_bo) { + COMPUTE_DBG(sctx->screen, + "Waves: %u; Scratch per wave: %u bytes; " + "Total Scratch: %u bytes\n", + sctx->scratch_waves, config->scratch_bytes_per_wave, + config->scratch_bytes_per_wave * sctx->scratch_waves); + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, shader->scratch_bo, RADEON_USAGE_READWRITE, + RADEON_PRIO_SCRATCH_BUFFER); + } + + /* Prefetch the compute shader to TC L2. + * + * We should also prefetch graphics shaders if a compute dispatch was + * the last command, and the compute shader if a draw call was the last + * command. However, that would add more complexity and we're likely + * to get a shader state change in that case anyway. + */ + if (sctx->chip_class >= GFX7) { + cik_prefetch_TC_L2_async(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0); + } + + shader_va = shader->bo->gpu_address + offset; + if (program->ir_type == PIPE_SHADER_IR_NATIVE) { + /* Shader code is placed after the amd_kernel_code_t + * struct. */ + shader_va += sizeof(amd_kernel_code_t); + } + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, shader->bo, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_BINARY); + + radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); + radeon_emit(cs, shader_va >> 8); + radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); + + radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); + radeon_emit(cs, config->rsrc1); + radeon_emit(cs, config->rsrc2); + + COMPUTE_DBG(sctx->screen, + "COMPUTE_PGM_RSRC1: 0x%08x " + "COMPUTE_PGM_RSRC2: 0x%08x\n", + config->rsrc1, config->rsrc2); + + sctx->max_seen_compute_scratch_bytes_per_wave = + MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, config->scratch_bytes_per_wave); + + radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, + S_00B860_WAVES(sctx->scratch_waves) | + S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10)); + + sctx->cs_shader_state.emitted_program = program; + sctx->cs_shader_state.offset = offset; + sctx->cs_shader_state.uses_scratch = config->scratch_bytes_per_wave != 0; + + return true; } static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx, - const amd_kernel_code_t *code_object, - unsigned user_sgpr) + const amd_kernel_code_t *code_object, unsigned user_sgpr) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address; - - unsigned max_private_element_size = AMD_HSA_BITS_GET( - code_object->code_properties, - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE); - - uint32_t scratch_dword0 = scratch_va & 0xffffffff; - uint32_t scratch_dword1 = - S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | - S_008F04_SWIZZLE_ENABLE(1); - - /* Disable address clamping */ - uint32_t scratch_dword2 = 0xffffffff; - uint32_t scratch_dword3 = - S_008F0C_INDEX_STRIDE(3) | - S_008F0C_ADD_TID_ENABLE(1); - - if (sctx->chip_class >= GFX9) { - assert(max_private_element_size == 1); /* always 4 bytes on GFX9 */ - } else { - scratch_dword3 |= S_008F0C_ELEMENT_SIZE(max_private_element_size); - - if (sctx->chip_class < GFX8) { - /* BUF_DATA_FORMAT is ignored, but it cannot be - * BUF_DATA_FORMAT_INVALID. 
*/ - scratch_dword3 |= - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8); - } - } - - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + - (user_sgpr * 4), 4); - radeon_emit(cs, scratch_dword0); - radeon_emit(cs, scratch_dword1); - radeon_emit(cs, scratch_dword2); - radeon_emit(cs, scratch_dword3); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address; + + unsigned max_private_element_size = + AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE); + + uint32_t scratch_dword0 = scratch_va & 0xffffffff; + uint32_t scratch_dword1 = + S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1); + + /* Disable address clamping */ + uint32_t scratch_dword2 = 0xffffffff; + uint32_t scratch_dword3 = S_008F0C_INDEX_STRIDE(3) | S_008F0C_ADD_TID_ENABLE(1); + + if (sctx->chip_class >= GFX9) { + assert(max_private_element_size == 1); /* always 4 bytes on GFX9 */ + } else { + scratch_dword3 |= S_008F0C_ELEMENT_SIZE(max_private_element_size); + + if (sctx->chip_class < GFX8) { + /* BUF_DATA_FORMAT is ignored, but it cannot be + * BUF_DATA_FORMAT_INVALID. */ + scratch_dword3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8); + } + } + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4); + radeon_emit(cs, scratch_dword0); + radeon_emit(cs, scratch_dword1); + radeon_emit(cs, scratch_dword2); + radeon_emit(cs, scratch_dword3); } -static void si_setup_user_sgprs_co_v2(struct si_context *sctx, - const amd_kernel_code_t *code_object, - const struct pipe_grid_info *info, - uint64_t kernel_args_va) +static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_code_t *code_object, + const struct pipe_grid_info *info, uint64_t kernel_args_va) { - struct si_compute *program = sctx->cs_shader_state.program; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - static const enum amd_code_property_mask_t workgroup_count_masks [] = { - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z - }; - - unsigned i, user_sgpr = 0; - if (AMD_HSA_BITS_GET(code_object->code_properties, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) { - if (code_object->workitem_private_segment_byte_size > 0) { - setup_scratch_rsrc_user_sgprs(sctx, code_object, - user_sgpr); - } - user_sgpr += 4; - } - - if (AMD_HSA_BITS_GET(code_object->code_properties, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) { - struct dispatch_packet dispatch; - unsigned dispatch_offset; - struct si_resource *dispatch_buf = NULL; - uint64_t dispatch_va; - - /* Upload dispatch ptr */ - memset(&dispatch, 0, sizeof(dispatch)); - - dispatch.workgroup_size_x = util_cpu_to_le16(info->block[0]); - dispatch.workgroup_size_y = util_cpu_to_le16(info->block[1]); - dispatch.workgroup_size_z = util_cpu_to_le16(info->block[2]); - - dispatch.grid_size_x = util_cpu_to_le32(info->grid[0] * info->block[0]); - dispatch.grid_size_y = util_cpu_to_le32(info->grid[1] * info->block[1]); - dispatch.grid_size_z = util_cpu_to_le32(info->grid[2] * info->block[2]); - - dispatch.private_segment_size = util_cpu_to_le32(program->private_size); - dispatch.group_segment_size = util_cpu_to_le32(program->local_size); - - dispatch.kernarg_address = util_cpu_to_le64(kernel_args_va); - - u_upload_data(sctx->b.const_uploader, 0, sizeof(dispatch), - 256, &dispatch, &dispatch_offset, - (struct pipe_resource**)&dispatch_buf); - - if (!dispatch_buf) 
{ - fprintf(stderr, "Error: Failed to allocate dispatch " - "packet."); - } - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dispatch_buf, - RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); - - dispatch_va = dispatch_buf->gpu_address + dispatch_offset; - - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + - (user_sgpr * 4), 2); - radeon_emit(cs, dispatch_va); - radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | - S_008F04_STRIDE(0)); - - si_resource_reference(&dispatch_buf, NULL); - user_sgpr += 2; - } - - if (AMD_HSA_BITS_GET(code_object->code_properties, - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) { - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + - (user_sgpr * 4), 2); - radeon_emit(cs, kernel_args_va); - radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) | - S_008F04_STRIDE(0)); - user_sgpr += 2; - } - - for (i = 0; i < 3 && user_sgpr < 16; i++) { - if (code_object->code_properties & workgroup_count_masks[i]) { - radeon_set_sh_reg_seq(cs, - R_00B900_COMPUTE_USER_DATA_0 + - (user_sgpr * 4), 1); - radeon_emit(cs, info->grid[i]); - user_sgpr += 1; - } - } + struct si_compute *program = sctx->cs_shader_state.program; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + static const enum amd_code_property_mask_t workgroup_count_masks[] = { + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z}; + + unsigned i, user_sgpr = 0; + if (AMD_HSA_BITS_GET(code_object->code_properties, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) { + if (code_object->workitem_private_segment_byte_size > 0) { + setup_scratch_rsrc_user_sgprs(sctx, code_object, user_sgpr); + } + user_sgpr += 4; + } + + if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) { + struct dispatch_packet dispatch; + unsigned dispatch_offset; + struct si_resource *dispatch_buf = NULL; + uint64_t dispatch_va; + + /* Upload dispatch ptr */ + memset(&dispatch, 0, sizeof(dispatch)); + + dispatch.workgroup_size_x = util_cpu_to_le16(info->block[0]); + dispatch.workgroup_size_y = util_cpu_to_le16(info->block[1]); + dispatch.workgroup_size_z = util_cpu_to_le16(info->block[2]); + + dispatch.grid_size_x = util_cpu_to_le32(info->grid[0] * info->block[0]); + dispatch.grid_size_y = util_cpu_to_le32(info->grid[1] * info->block[1]); + dispatch.grid_size_z = util_cpu_to_le32(info->grid[2] * info->block[2]); + + dispatch.private_segment_size = util_cpu_to_le32(program->private_size); + dispatch.group_segment_size = util_cpu_to_le32(program->local_size); + + dispatch.kernarg_address = util_cpu_to_le64(kernel_args_va); + + u_upload_data(sctx->b.const_uploader, 0, sizeof(dispatch), 256, &dispatch, &dispatch_offset, + (struct pipe_resource **)&dispatch_buf); + + if (!dispatch_buf) { + fprintf(stderr, "Error: Failed to allocate dispatch " + "packet."); + } + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dispatch_buf, RADEON_USAGE_READ, + RADEON_PRIO_CONST_BUFFER); + + dispatch_va = dispatch_buf->gpu_address + dispatch_offset; + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); + radeon_emit(cs, dispatch_va); + radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0)); + + si_resource_reference(&dispatch_buf, NULL); + user_sgpr += 2; + } + + if (AMD_HSA_BITS_GET(code_object->code_properties, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) { + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + 
(user_sgpr * 4), 2); + radeon_emit(cs, kernel_args_va); + radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0)); + user_sgpr += 2; + } + + for (i = 0; i < 3 && user_sgpr < 16; i++) { + if (code_object->code_properties & workgroup_count_masks[i]) { + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1); + radeon_emit(cs, info->grid[i]); + user_sgpr += 1; + } + } } -static bool si_upload_compute_input(struct si_context *sctx, - const amd_kernel_code_t *code_object, - const struct pipe_grid_info *info) +static bool si_upload_compute_input(struct si_context *sctx, const amd_kernel_code_t *code_object, + const struct pipe_grid_info *info) { - struct si_compute *program = sctx->cs_shader_state.program; - struct si_resource *input_buffer = NULL; - uint32_t kernel_args_offset = 0; - uint32_t *kernel_args; - void *kernel_args_ptr; - uint64_t kernel_args_va; + struct si_compute *program = sctx->cs_shader_state.program; + struct si_resource *input_buffer = NULL; + uint32_t kernel_args_offset = 0; + uint32_t *kernel_args; + void *kernel_args_ptr; + uint64_t kernel_args_va; - u_upload_alloc(sctx->b.const_uploader, 0, program->input_size, - sctx->screen->info.tcc_cache_line_size, - &kernel_args_offset, - (struct pipe_resource**)&input_buffer, &kernel_args_ptr); + u_upload_alloc(sctx->b.const_uploader, 0, program->input_size, + sctx->screen->info.tcc_cache_line_size, &kernel_args_offset, + (struct pipe_resource **)&input_buffer, &kernel_args_ptr); - if (unlikely(!kernel_args_ptr)) - return false; + if (unlikely(!kernel_args_ptr)) + return false; - kernel_args = (uint32_t*)kernel_args_ptr; - kernel_args_va = input_buffer->gpu_address + kernel_args_offset; + kernel_args = (uint32_t *)kernel_args_ptr; + kernel_args_va = input_buffer->gpu_address + kernel_args_offset; - memcpy(kernel_args, info->input, program->input_size); + memcpy(kernel_args, info->input, program->input_size); - for (unsigned i = 0; i < program->input_size / 4; i++) { - COMPUTE_DBG(sctx->screen, "input %u : %u\n", i, - kernel_args[i]); - } + for (unsigned i = 0; i < program->input_size / 4; i++) { + COMPUTE_DBG(sctx->screen, "input %u : %u\n", i, kernel_args[i]); + } - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, input_buffer, - RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, input_buffer, RADEON_USAGE_READ, + RADEON_PRIO_CONST_BUFFER); - si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va); - si_resource_reference(&input_buffer, NULL); - return true; + si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va); + si_resource_reference(&input_buffer, NULL); + return true; } -static void si_setup_nir_user_data(struct si_context *sctx, - const struct pipe_grid_info *info) +static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_grid_info *info) { - struct si_compute *program = sctx->cs_shader_state.program; - struct si_shader_selector *sel = &program->sel; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 + - 4 * SI_NUM_RESOURCE_SGPRS; - unsigned block_size_reg = grid_size_reg + - /* 12 bytes = 3 dwords. 
*/ - 12 * sel->info.uses_grid_size; - unsigned cs_user_data_reg = block_size_reg + - 12 * program->reads_variable_block_size; - - if (info->indirect) { - if (sel->info.uses_grid_size) { - for (unsigned i = 0; i < 3; ++i) { - si_cp_copy_data(sctx, sctx->gfx_cs, - COPY_DATA_REG, NULL, (grid_size_reg >> 2) + i, - COPY_DATA_SRC_MEM, si_resource(info->indirect), - info->indirect_offset + 4 * i); - } - } - } else { - if (sel->info.uses_grid_size) { - radeon_set_sh_reg_seq(cs, grid_size_reg, 3); - radeon_emit(cs, info->grid[0]); - radeon_emit(cs, info->grid[1]); - radeon_emit(cs, info->grid[2]); - } - if (program->reads_variable_block_size) { - radeon_set_sh_reg_seq(cs, block_size_reg, 3); - radeon_emit(cs, info->block[0]); - radeon_emit(cs, info->block[1]); - radeon_emit(cs, info->block[2]); - } - } - - if (program->num_cs_user_data_dwords) { - radeon_set_sh_reg_seq(cs, cs_user_data_reg, program->num_cs_user_data_dwords); - radeon_emit_array(cs, sctx->cs_user_data, program->num_cs_user_data_dwords); - } + struct si_compute *program = sctx->cs_shader_state.program; + struct si_shader_selector *sel = &program->sel; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 + 4 * SI_NUM_RESOURCE_SGPRS; + unsigned block_size_reg = grid_size_reg + + /* 12 bytes = 3 dwords. */ + 12 * sel->info.uses_grid_size; + unsigned cs_user_data_reg = block_size_reg + 12 * program->reads_variable_block_size; + + if (info->indirect) { + if (sel->info.uses_grid_size) { + for (unsigned i = 0; i < 3; ++i) { + si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_REG, NULL, (grid_size_reg >> 2) + i, + COPY_DATA_SRC_MEM, si_resource(info->indirect), + info->indirect_offset + 4 * i); + } + } + } else { + if (sel->info.uses_grid_size) { + radeon_set_sh_reg_seq(cs, grid_size_reg, 3); + radeon_emit(cs, info->grid[0]); + radeon_emit(cs, info->grid[1]); + radeon_emit(cs, info->grid[2]); + } + if (program->reads_variable_block_size) { + radeon_set_sh_reg_seq(cs, block_size_reg, 3); + radeon_emit(cs, info->block[0]); + radeon_emit(cs, info->block[1]); + radeon_emit(cs, info->block[2]); + } + } + + if (program->num_cs_user_data_dwords) { + radeon_set_sh_reg_seq(cs, cs_user_data_reg, program->num_cs_user_data_dwords); + radeon_emit_array(cs, sctx->cs_user_data, program->num_cs_user_data_dwords); + } } -static void si_emit_dispatch_packets(struct si_context *sctx, - const struct pipe_grid_info *info) +static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_grid_info *info) { - struct si_screen *sscreen = sctx->screen; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off; - unsigned threads_per_threadgroup = - info->block[0] * info->block[1] * info->block[2]; - unsigned waves_per_threadgroup = - DIV_ROUND_UP(threads_per_threadgroup, sscreen->compute_wave_size); - unsigned threadgroups_per_cu = 1; - - if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1) - threadgroups_per_cu = 2; - - radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, - ac_get_compute_resource_limits(&sscreen->info, - waves_per_threadgroup, - sctx->cs_max_waves_per_sh, - threadgroups_per_cu)); - - unsigned dispatch_initiator = - S_00B800_COMPUTE_SHADER_EN(1) | - S_00B800_FORCE_START_AT_000(1) | - /* If the KMD allows it (there is a KMD hw register for it), - * allow launching waves out-of-order. 
(same as Vulkan) */ - S_00B800_ORDER_MODE(sctx->chip_class >= GFX7) | - S_00B800_CS_W32_EN(sscreen->compute_wave_size == 32); - - const uint *last_block = info->last_block; - bool partial_block_en = last_block[0] || last_block[1] || last_block[2]; - - radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); - - if (partial_block_en) { - unsigned partial[3]; - - /* If no partial_block, these should be an entire block size, not 0. */ - partial[0] = last_block[0] ? last_block[0] : info->block[0]; - partial[1] = last_block[1] ? last_block[1] : info->block[1]; - partial[2] = last_block[2] ? last_block[2] : info->block[2]; - - radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) | - S_00B81C_NUM_THREAD_PARTIAL(partial[0])); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]) | - S_00B820_NUM_THREAD_PARTIAL(partial[1])); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]) | - S_00B824_NUM_THREAD_PARTIAL(partial[2])); - - dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); - } else { - radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0])); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1])); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2])); - } - - if (info->indirect) { - uint64_t base_va = si_resource(info->indirect)->gpu_address; - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(info->indirect), - RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); - - radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, 1); - radeon_emit(cs, base_va); - radeon_emit(cs, base_va >> 32); - - radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, info->indirect_offset); - radeon_emit(cs, dispatch_initiator); - } else { - radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, info->grid[0]); - radeon_emit(cs, info->grid[1]); - radeon_emit(cs, info->grid[2]); - radeon_emit(cs, dispatch_initiator); - } + struct si_screen *sscreen = sctx->screen; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off; + unsigned threads_per_threadgroup = info->block[0] * info->block[1] * info->block[2]; + unsigned waves_per_threadgroup = + DIV_ROUND_UP(threads_per_threadgroup, sscreen->compute_wave_size); + unsigned threadgroups_per_cu = 1; + + if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1) + threadgroups_per_cu = 2; + + radeon_set_sh_reg( + cs, R_00B854_COMPUTE_RESOURCE_LIMITS, + ac_get_compute_resource_limits(&sscreen->info, waves_per_threadgroup, + sctx->cs_max_waves_per_sh, threadgroups_per_cu)); + + unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_FORCE_START_AT_000(1) | + /* If the KMD allows it (there is a KMD hw register for it), + * allow launching waves out-of-order. (same as Vulkan) */ + S_00B800_ORDER_MODE(sctx->chip_class >= GFX7) | + S_00B800_CS_W32_EN(sscreen->compute_wave_size == 32); + + const uint *last_block = info->last_block; + bool partial_block_en = last_block[0] || last_block[1] || last_block[2]; + + radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); + + if (partial_block_en) { + unsigned partial[3]; + + /* If no partial_block, these should be an entire block size, not 0. */ + partial[0] = last_block[0] ? last_block[0] : info->block[0]; + partial[1] = last_block[1] ? last_block[1] : info->block[1]; + partial[2] = last_block[2] ? 
last_block[2] : info->block[2]; + + radeon_emit( + cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) | S_00B81C_NUM_THREAD_PARTIAL(partial[0])); + radeon_emit( + cs, S_00B820_NUM_THREAD_FULL(info->block[1]) | S_00B820_NUM_THREAD_PARTIAL(partial[1])); + radeon_emit( + cs, S_00B824_NUM_THREAD_FULL(info->block[2]) | S_00B824_NUM_THREAD_PARTIAL(partial[2])); + + dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); + } else { + radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0])); + radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1])); + radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2])); + } + + if (info->indirect) { + uint64_t base_va = si_resource(info->indirect)->gpu_address; + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(info->indirect), RADEON_USAGE_READ, + RADEON_PRIO_DRAW_INDIRECT); + + radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, 1); + radeon_emit(cs, base_va); + radeon_emit(cs, base_va >> 32); + + radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, info->indirect_offset); + radeon_emit(cs, dispatch_initiator); + } else { + radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, info->grid[0]); + radeon_emit(cs, info->grid[1]); + radeon_emit(cs, info->grid[2]); + radeon_emit(cs, dispatch_initiator); + } } - -static void si_launch_grid( - struct pipe_context *ctx, const struct pipe_grid_info *info) +static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_compute *program = sctx->cs_shader_state.program; - const amd_kernel_code_t *code_object = - si_compute_get_code_object(program, info->pc); - int i; - /* HW bug workaround when CS threadgroups > 256 threads and async - * compute isn't used, i.e. only one compute job can run at a time. - * If async compute is possible, the threadgroup size must be limited - * to 256 threads on all queues to avoid the bug. - * Only GFX6 and certain GFX7 chips are affected. - */ - bool cs_regalloc_hang = - (sctx->chip_class == GFX6 || - sctx->family == CHIP_BONAIRE || - sctx->family == CHIP_KABINI) && - info->block[0] * info->block[1] * info->block[2] > 256; - - if (cs_regalloc_hang) - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; - - if (program->ir_type != PIPE_SHADER_IR_NATIVE && - program->shader.compilation_failed) - return; - - if (sctx->has_graphics) { - if (sctx->last_num_draw_calls != sctx->num_draw_calls) { - si_update_fb_dirtiness_after_rendering(sctx); - sctx->last_num_draw_calls = sctx->num_draw_calls; - } - - si_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE); - } - - /* Add buffer sizes for memory checking in need_cs_space. */ - si_context_add_resource_size(sctx, &program->shader.bo->b.b); - /* TODO: add the scratch buffer */ - - if (info->indirect) { - si_context_add_resource_size(sctx, info->indirect); - - /* Indirect buffers use TC L2 on GFX9, but not older hw. 
*/ - if (sctx->chip_class <= GFX8 && - si_resource(info->indirect)->TC_L2_dirty) { - sctx->flags |= SI_CONTEXT_WB_L2; - si_resource(info->indirect)->TC_L2_dirty = false; - } - } - - si_need_gfx_cs_space(sctx); - - if (sctx->bo_list_add_all_compute_resources) - si_compute_resources_add_all_to_bo_list(sctx); - - if (!sctx->cs_shader_state.initialized) { - si_emit_initial_compute_regs(sctx, sctx->gfx_cs); - - sctx->cs_shader_state.emitted_program = NULL; - sctx->cs_shader_state.initialized = true; - } - - if (sctx->flags) - sctx->emit_cache_flush(sctx); - - if (!si_switch_compute_shader(sctx, program, &program->shader, - code_object, info->pc)) - return; - - si_upload_compute_shader_descriptors(sctx); - si_emit_compute_shader_pointers(sctx); - - if (sctx->has_graphics && - si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) { - sctx->atoms.s.render_cond.emit(sctx); - si_set_atom_dirty(sctx, &sctx->atoms.s.render_cond, false); - } - - if (program->ir_type == PIPE_SHADER_IR_NATIVE && - unlikely(!si_upload_compute_input(sctx, code_object, info))) - return; - - /* Global buffers */ - for (i = 0; i < program->max_global_buffers; i++) { - struct si_resource *buffer = - si_resource(program->global_buffers[i]); - if (!buffer) { - continue; - } - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer, - RADEON_USAGE_READWRITE, - RADEON_PRIO_COMPUTE_GLOBAL); - } - - if (program->ir_type != PIPE_SHADER_IR_NATIVE) - si_setup_nir_user_data(sctx, info); - - si_emit_dispatch_packets(sctx, info); - - if (unlikely(sctx->current_saved_cs)) { - si_trace_emit(sctx); - si_log_compute_state(sctx, sctx->log); - } - - sctx->compute_is_busy = true; - sctx->num_compute_calls++; - if (sctx->cs_shader_state.uses_scratch) - sctx->num_spill_compute_calls++; - - if (cs_regalloc_hang) - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + struct si_context *sctx = (struct si_context *)ctx; + struct si_compute *program = sctx->cs_shader_state.program; + const amd_kernel_code_t *code_object = si_compute_get_code_object(program, info->pc); + int i; + /* HW bug workaround when CS threadgroups > 256 threads and async + * compute isn't used, i.e. only one compute job can run at a time. + * If async compute is possible, the threadgroup size must be limited + * to 256 threads on all queues to avoid the bug. + * Only GFX6 and certain GFX7 chips are affected. + */ + bool cs_regalloc_hang = + (sctx->chip_class == GFX6 || sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KABINI) && + info->block[0] * info->block[1] * info->block[2] > 256; + + if (cs_regalloc_hang) + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; + + if (program->ir_type != PIPE_SHADER_IR_NATIVE && program->shader.compilation_failed) + return; + + if (sctx->has_graphics) { + if (sctx->last_num_draw_calls != sctx->num_draw_calls) { + si_update_fb_dirtiness_after_rendering(sctx); + sctx->last_num_draw_calls = sctx->num_draw_calls; + } + + si_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE); + } + + /* Add buffer sizes for memory checking in need_cs_space. */ + si_context_add_resource_size(sctx, &program->shader.bo->b.b); + /* TODO: add the scratch buffer */ + + if (info->indirect) { + si_context_add_resource_size(sctx, info->indirect); + + /* Indirect buffers use TC L2 on GFX9, but not older hw. 
*/ + if (sctx->chip_class <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) { + sctx->flags |= SI_CONTEXT_WB_L2; + si_resource(info->indirect)->TC_L2_dirty = false; + } + } + + si_need_gfx_cs_space(sctx); + + if (sctx->bo_list_add_all_compute_resources) + si_compute_resources_add_all_to_bo_list(sctx); + + if (!sctx->cs_shader_state.initialized) { + si_emit_initial_compute_regs(sctx, sctx->gfx_cs); + + sctx->cs_shader_state.emitted_program = NULL; + sctx->cs_shader_state.initialized = true; + } + + if (sctx->flags) + sctx->emit_cache_flush(sctx); + + if (!si_switch_compute_shader(sctx, program, &program->shader, code_object, info->pc)) + return; + + si_upload_compute_shader_descriptors(sctx); + si_emit_compute_shader_pointers(sctx); + + if (sctx->has_graphics && si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) { + sctx->atoms.s.render_cond.emit(sctx); + si_set_atom_dirty(sctx, &sctx->atoms.s.render_cond, false); + } + + if (program->ir_type == PIPE_SHADER_IR_NATIVE && + unlikely(!si_upload_compute_input(sctx, code_object, info))) + return; + + /* Global buffers */ + for (i = 0; i < program->max_global_buffers; i++) { + struct si_resource *buffer = si_resource(program->global_buffers[i]); + if (!buffer) { + continue; + } + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer, RADEON_USAGE_READWRITE, + RADEON_PRIO_COMPUTE_GLOBAL); + } + + if (program->ir_type != PIPE_SHADER_IR_NATIVE) + si_setup_nir_user_data(sctx, info); + + si_emit_dispatch_packets(sctx, info); + + if (unlikely(sctx->current_saved_cs)) { + si_trace_emit(sctx); + si_log_compute_state(sctx, sctx->log); + } + + sctx->compute_is_busy = true; + sctx->num_compute_calls++; + if (sctx->cs_shader_state.uses_scratch) + sctx->num_spill_compute_calls++; + + if (cs_regalloc_hang) + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; } void si_destroy_compute(struct si_compute *program) { - struct si_shader_selector *sel = &program->sel; + struct si_shader_selector *sel = &program->sel; - if (program->ir_type != PIPE_SHADER_IR_NATIVE) { - util_queue_drop_job(&sel->screen->shader_compiler_queue, - &sel->ready); - util_queue_fence_destroy(&sel->ready); - } + if (program->ir_type != PIPE_SHADER_IR_NATIVE) { + util_queue_drop_job(&sel->screen->shader_compiler_queue, &sel->ready); + util_queue_fence_destroy(&sel->ready); + } - for (unsigned i = 0; i < program->max_global_buffers; i++) - pipe_resource_reference(&program->global_buffers[i], NULL); - FREE(program->global_buffers); + for (unsigned i = 0; i < program->max_global_buffers; i++) + pipe_resource_reference(&program->global_buffers[i], NULL); + FREE(program->global_buffers); - si_shader_destroy(&program->shader); - ralloc_free(program->sel.nir); - FREE(program); + si_shader_destroy(&program->shader); + ralloc_free(program->sel.nir); + FREE(program); } -static void si_delete_compute_state(struct pipe_context *ctx, void* state){ - struct si_compute *program = (struct si_compute *)state; - struct si_context *sctx = (struct si_context*)ctx; +static void si_delete_compute_state(struct pipe_context *ctx, void *state) +{ + struct si_compute *program = (struct si_compute *)state; + struct si_context *sctx = (struct si_context *)ctx; - if (!state) - return; + if (!state) + return; - if (program == sctx->cs_shader_state.program) - sctx->cs_shader_state.program = NULL; + if (program == sctx->cs_shader_state.program) + sctx->cs_shader_state.program = NULL; - if (program == sctx->cs_shader_state.emitted_program) - sctx->cs_shader_state.emitted_program = NULL; + if (program == 
sctx->cs_shader_state.emitted_program) + sctx->cs_shader_state.emitted_program = NULL; - si_compute_reference(&program, NULL); + si_compute_reference(&program, NULL); } -static void si_set_compute_resources(struct pipe_context * ctx_, - unsigned start, unsigned count, - struct pipe_surface ** surfaces) { } +static void si_set_compute_resources(struct pipe_context *ctx_, unsigned start, unsigned count, + struct pipe_surface **surfaces) +{ +} void si_init_compute_functions(struct si_context *sctx) { - sctx->b.create_compute_state = si_create_compute_state; - sctx->b.delete_compute_state = si_delete_compute_state; - sctx->b.bind_compute_state = si_bind_compute_state; - sctx->b.set_compute_resources = si_set_compute_resources; - sctx->b.set_global_binding = si_set_global_binding; - sctx->b.launch_grid = si_launch_grid; + sctx->b.create_compute_state = si_create_compute_state; + sctx->b.delete_compute_state = si_delete_compute_state; + sctx->b.bind_compute_state = si_bind_compute_state; + sctx->b.set_compute_resources = si_set_compute_resources; + sctx->b.set_global_binding = si_set_global_binding; + sctx->b.launch_grid = si_launch_grid; } diff --git a/src/gallium/drivers/radeonsi/si_compute.h b/src/gallium/drivers/radeonsi/si_compute.h index 14c3c8cb789..7cf06271853 100644 --- a/src/gallium/drivers/radeonsi/si_compute.h +++ b/src/gallium/drivers/radeonsi/si_compute.h @@ -25,35 +25,33 @@ #ifndef SI_COMPUTE_H #define SI_COMPUTE_H -#include "util/u_inlines.h" - #include "si_shader.h" +#include "util/u_inlines.h" struct si_compute { - struct si_shader_selector sel; - struct si_shader shader; + struct si_shader_selector sel; + struct si_shader shader; - unsigned ir_type; - unsigned local_size; - unsigned private_size; - unsigned input_size; + unsigned ir_type; + unsigned local_size; + unsigned private_size; + unsigned input_size; - int max_global_buffers; - struct pipe_resource **global_buffers; + int max_global_buffers; + struct pipe_resource **global_buffers; - bool reads_variable_block_size; - unsigned num_cs_user_data_dwords; + bool reads_variable_block_size; + unsigned num_cs_user_data_dwords; }; void si_destroy_compute(struct si_compute *program); -static inline void -si_compute_reference(struct si_compute **dst, struct si_compute *src) +static inline void si_compute_reference(struct si_compute **dst, struct si_compute *src) { - if (pipe_reference(&(*dst)->sel.base.reference, &src->sel.base.reference)) - si_destroy_compute(*dst); + if (pipe_reference(&(*dst)->sel.base.reference, &src->sel.base.reference)) + si_destroy_compute(*dst); - *dst = src; + *dst = src; } #endif /* SI_COMPUTE_H */ diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index de020bfaf8c..6e3b07cb7c8 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -30,758 +30,705 @@ /* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst * and L2_STREAM for src. */ -static enum si_cache_policy get_cache_policy(struct si_context *sctx, - enum si_coherency coher, - uint64_t size) +static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher, + uint64_t size) { - if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META || - coher == SI_COHERENCY_CP)) || - (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER)) - return size <= 256 * 1024 ? 
L2_LRU : L2_STREAM; + if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META || coher == SI_COHERENCY_CP)) || + (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER)) + return size <= 256 * 1024 ? L2_LRU : L2_STREAM; - return L2_BYPASS; + return L2_BYPASS; } unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, - enum si_cache_policy cache_policy) + enum si_cache_policy cache_policy) { - switch (coher) { - default: - case SI_COHERENCY_NONE: - case SI_COHERENCY_CP: - return 0; - case SI_COHERENCY_SHADER: - return SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE | - (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0); - case SI_COHERENCY_CB_META: - return SI_CONTEXT_FLUSH_AND_INV_CB; - } + switch (coher) { + default: + case SI_COHERENCY_NONE: + case SI_COHERENCY_CP: + return 0; + case SI_COHERENCY_SHADER: + return SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | + (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0); + case SI_COHERENCY_CB_META: + return SI_CONTEXT_FLUSH_AND_INV_CB; + } } -static void si_launch_grid_internal(struct si_context *sctx, - struct pipe_grid_info *info) +static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info) { - /* Set settings for driver-internal compute dispatches. */ - sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; - sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; - sctx->render_cond_force_off = true; - /* Skip decompression to prevent infinite recursion. */ - sctx->blitter->running = true; - - /* Dispatch compute. */ - sctx->b.launch_grid(&sctx->b, info); - - /* Restore default settings. */ - sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; - sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; - sctx->render_cond_force_off = false; - sctx->blitter->running = false; + /* Set settings for driver-internal compute dispatches. */ + sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; + sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; + sctx->render_cond_force_off = true; + /* Skip decompression to prevent infinite recursion. */ + sctx->blitter->running = true; + + /* Dispatch compute. */ + sctx->b.launch_grid(&sctx->b, info); + + /* Restore default settings. 
*/ + sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; + sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; + sctx->render_cond_force_off = false; + sctx->blitter->running = false; } -static void si_compute_clear_12bytes_buffer(struct si_context *sctx, - struct pipe_resource *dst, - unsigned dst_offset, - unsigned size, - const uint32_t *clear_value, - enum si_coherency coher) +static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst, + unsigned dst_offset, unsigned size, + const uint32_t *clear_value, enum si_coherency coher) { - struct pipe_context *ctx = &sctx->b; + struct pipe_context *ctx = &sctx->b; - assert(dst_offset % 4 == 0); - assert(size % 4 == 0); - unsigned size_12 = DIV_ROUND_UP(size, 12); + assert(dst_offset % 4 == 0); + assert(size % 4 == 0); + unsigned size_12 = DIV_ROUND_UP(size, 12); - unsigned data[4] = {0}; - memcpy(data, clear_value, 12); + unsigned data[4] = {0}; + memcpy(data, clear_value, 12); - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | + si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); - struct pipe_shader_buffer saved_sb = {0}; - si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb); + struct pipe_shader_buffer saved_sb = {0}; + si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb); - unsigned saved_writable_mask = 0; - if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & - (1u << si_get_shaderbuf_slot(0))) - saved_writable_mask = 1; + unsigned saved_writable_mask = 0; + if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & + (1u << si_get_shaderbuf_slot(0))) + saved_writable_mask = 1; - struct pipe_constant_buffer saved_cb = {}; - si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + struct pipe_constant_buffer saved_cb = {}; + si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - void *saved_cs = sctx->cs_shader_state.program; + void *saved_cs = sctx->cs_shader_state.program; - struct pipe_constant_buffer cb = {}; - cb.buffer_size = sizeof(data); - cb.user_buffer = data; - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); + struct pipe_constant_buffer cb = {}; + cb.buffer_size = sizeof(data); + cb.user_buffer = data; + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); - struct pipe_shader_buffer sb = {0}; - sb.buffer = dst; - sb.buffer_offset = dst_offset; - sb.buffer_size = size; + struct pipe_shader_buffer sb = {0}; + sb.buffer = dst; + sb.buffer_offset = dst_offset; + sb.buffer_size = size; - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1); + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1); - struct pipe_grid_info info = {0}; + struct pipe_grid_info info = {0}; - if (!sctx->cs_clear_12bytes_buffer) - sctx->cs_clear_12bytes_buffer = - si_clear_12bytes_buffer_shader(ctx); - ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer); - info.block[0] = 64; - info.last_block[0] = size_12 % 64; - info.block[1] = 1; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(size_12, 64); - info.grid[1] = 1; - info.grid[2] = 1; + if (!sctx->cs_clear_12bytes_buffer) + sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(ctx); + ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer); + info.block[0] = 64; + info.last_block[0] = size_12 % 64; + info.block[1] = 1; + info.block[2] = 1; + 
info.grid[0] = DIV_ROUND_UP(size_12, 64); + info.grid[1] = 1; + info.grid[2] = 1; - si_launch_grid_internal(sctx, &info); + si_launch_grid_internal(sctx, &info); - ctx->bind_compute_state(ctx, saved_cs); - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask); - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask); + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - pipe_resource_reference(&saved_sb.buffer, NULL); - pipe_resource_reference(&saved_cb.buffer, NULL); + pipe_resource_reference(&saved_sb.buffer, NULL); + pipe_resource_reference(&saved_cb.buffer, NULL); } -static void si_compute_do_clear_or_copy(struct si_context *sctx, - struct pipe_resource *dst, - unsigned dst_offset, - struct pipe_resource *src, - unsigned src_offset, - unsigned size, - const uint32_t *clear_value, - unsigned clear_value_size, - enum si_coherency coher) +static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst, + unsigned dst_offset, struct pipe_resource *src, + unsigned src_offset, unsigned size, + const uint32_t *clear_value, unsigned clear_value_size, + enum si_coherency coher) { - struct pipe_context *ctx = &sctx->b; - - assert(src_offset % 4 == 0); - assert(dst_offset % 4 == 0); - assert(size % 4 == 0); - - assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0); - assert(!src || src_offset + size <= src->width0); - - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); - - /* Save states. */ - void *saved_cs = sctx->cs_shader_state.program; - struct pipe_shader_buffer saved_sb[2] = {}; - si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb); - - unsigned saved_writable_mask = 0; - for (unsigned i = 0; i < (src ? 2 : 1); i++) { - if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & - (1u << si_get_shaderbuf_slot(i))) - saved_writable_mask |= 1 << i; - } - - /* The memory accesses are coalesced, meaning that the 1st instruction writes - * the 1st contiguous block of data for the whole wave, the 2nd instruction - * writes the 2nd contiguous block of data, etc. - */ - unsigned dwords_per_thread = src ? 
SI_COMPUTE_COPY_DW_PER_THREAD : - SI_COMPUTE_CLEAR_DW_PER_THREAD; - unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4); - unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread; - unsigned wave_size = sctx->screen->compute_wave_size; - unsigned dwords_per_wave = dwords_per_thread * wave_size; - - unsigned num_dwords = size / 4; - unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); - - struct pipe_grid_info info = {}; - info.block[0] = MIN2(wave_size, num_instructions); - info.block[1] = 1; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); - info.grid[1] = 1; - info.grid[2] = 1; - - struct pipe_shader_buffer sb[2] = {}; - sb[0].buffer = dst; - sb[0].buffer_offset = dst_offset; - sb[0].buffer_size = size; - - bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU; - - if (src) { - sb[1].buffer = src; - sb[1].buffer_offset = src_offset; - sb[1].buffer_size = size; - - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1); - - if (!sctx->cs_copy_buffer) { - sctx->cs_copy_buffer = si_create_dma_compute_shader(&sctx->b, - SI_COMPUTE_COPY_DW_PER_THREAD, - shader_dst_stream_policy, true); - } - ctx->bind_compute_state(ctx, sctx->cs_copy_buffer); - } else { - assert(clear_value_size >= 4 && - clear_value_size <= 16 && - util_is_power_of_two_or_zero(clear_value_size)); - - for (unsigned i = 0; i < 4; i++) - sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)]; - - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1); - - if (!sctx->cs_clear_buffer) { - sctx->cs_clear_buffer = si_create_dma_compute_shader(&sctx->b, - SI_COMPUTE_CLEAR_DW_PER_THREAD, - shader_dst_stream_policy, false); - } - ctx->bind_compute_state(ctx, sctx->cs_clear_buffer); - } - - si_launch_grid_internal(sctx, &info); - - enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0); - - if (cache_policy != L2_BYPASS) - si_resource(dst)->TC_L2_dirty = true; - - /* Restore states. */ - ctx->bind_compute_state(ctx, saved_cs); - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, - saved_writable_mask); - for (int i = 0; i < 2; i++) - pipe_resource_reference(&saved_sb[i].buffer, NULL); + struct pipe_context *ctx = &sctx->b; + + assert(src_offset % 4 == 0); + assert(dst_offset % 4 == 0); + assert(size % 4 == 0); + + assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0); + assert(!src || src_offset + size <= src->width0); + + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | + si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); + + /* Save states. */ + void *saved_cs = sctx->cs_shader_state.program; + struct pipe_shader_buffer saved_sb[2] = {}; + si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb); + + unsigned saved_writable_mask = 0; + for (unsigned i = 0; i < (src ? 2 : 1); i++) { + if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & + (1u << si_get_shaderbuf_slot(i))) + saved_writable_mask |= 1 << i; + } + + /* The memory accesses are coalesced, meaning that the 1st instruction writes + * the 1st contiguous block of data for the whole wave, the 2nd instruction + * writes the 2nd contiguous block of data, etc. + */ + unsigned dwords_per_thread = + src ? 
SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD; + unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4); + unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread; + unsigned wave_size = sctx->screen->compute_wave_size; + unsigned dwords_per_wave = dwords_per_thread * wave_size; + + unsigned num_dwords = size / 4; + unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); + + struct pipe_grid_info info = {}; + info.block[0] = MIN2(wave_size, num_instructions); + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); + info.grid[1] = 1; + info.grid[2] = 1; + + struct pipe_shader_buffer sb[2] = {}; + sb[0].buffer = dst; + sb[0].buffer_offset = dst_offset; + sb[0].buffer_size = size; + + bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU; + + if (src) { + sb[1].buffer = src; + sb[1].buffer_offset = src_offset; + sb[1].buffer_size = size; + + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1); + + if (!sctx->cs_copy_buffer) { + sctx->cs_copy_buffer = si_create_dma_compute_shader( + &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true); + } + ctx->bind_compute_state(ctx, sctx->cs_copy_buffer); + } else { + assert(clear_value_size >= 4 && clear_value_size <= 16 && + util_is_power_of_two_or_zero(clear_value_size)); + + for (unsigned i = 0; i < 4; i++) + sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)]; + + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1); + + if (!sctx->cs_clear_buffer) { + sctx->cs_clear_buffer = si_create_dma_compute_shader( + &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false); + } + ctx->bind_compute_state(ctx, sctx->cs_clear_buffer); + } + + si_launch_grid_internal(sctx, &info); + + enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0); + + if (cache_policy != L2_BYPASS) + si_resource(dst)->TC_L2_dirty = true; + + /* Restore states. */ + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, saved_writable_mask); + for (int i = 0; i < 2; i++) + pipe_resource_reference(&saved_sb[i].buffer, NULL); } -void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, - uint64_t offset, uint64_t size, uint32_t *clear_value, - uint32_t clear_value_size, enum si_coherency coher, - bool force_cpdma) +void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, + uint64_t size, uint32_t *clear_value, uint32_t clear_value_size, + enum si_coherency coher, bool force_cpdma) { - if (!size) - return; - - ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4); - - assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */ - assert(offset % clear_alignment == 0); - assert(size % clear_alignment == 0); - assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */ - - /* Reduce a large clear value size if possible. */ - if (clear_value_size > 4) { - bool clear_dword_duplicated = true; - - /* See if we can lower large fills to dword fills. */ - for (unsigned i = 1; i < clear_value_size / 4; i++) { - if (clear_value[0] != clear_value[i]) { - clear_dword_duplicated = false; - break; - } - } - if (clear_dword_duplicated) - clear_value_size = 4; - } - - /* Expand a small clear value size. 
*/ - uint32_t tmp_clear_value; - if (clear_value_size <= 2) { - if (clear_value_size == 1) { - tmp_clear_value = *(uint8_t*)clear_value; - tmp_clear_value |= (tmp_clear_value << 8) | - (tmp_clear_value << 16) | - (tmp_clear_value << 24); - } else { - tmp_clear_value = *(uint16_t*)clear_value; - tmp_clear_value |= tmp_clear_value << 16; - } - clear_value = &tmp_clear_value; - clear_value_size = 4; - } - - if (clear_value_size == 12) { - si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher); - return; - } - - uint64_t aligned_size = size & ~3ull; - if (aligned_size >= 4) { - /* Before GFX9, CP DMA was very slow when clearing GTT, so never - * use CP DMA clears on those chips, because we can't be certain - * about buffer placements. - */ - if (clear_value_size > 4 || - (!force_cpdma && - clear_value_size == 4 && - offset % 4 == 0 && - (size > 32*1024 || sctx->chip_class <= GFX9))) { - si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, - aligned_size, clear_value, - clear_value_size, coher); - } else { - assert(clear_value_size == 4); - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset, - aligned_size, *clear_value, 0, coher, - get_cache_policy(sctx, coher, size)); - } - - offset += aligned_size; - size -= aligned_size; - } - - /* Handle non-dword alignment. */ - if (size) { - assert(dst); - assert(dst->target == PIPE_BUFFER); - assert(size < 4); - - pipe_buffer_write(&sctx->b, dst, offset, size, clear_value); - } + if (!size) + return; + + ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4); + + assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */ + assert(offset % clear_alignment == 0); + assert(size % clear_alignment == 0); + assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */ + + /* Reduce a large clear value size if possible. */ + if (clear_value_size > 4) { + bool clear_dword_duplicated = true; + + /* See if we can lower large fills to dword fills. */ + for (unsigned i = 1; i < clear_value_size / 4; i++) { + if (clear_value[0] != clear_value[i]) { + clear_dword_duplicated = false; + break; + } + } + if (clear_dword_duplicated) + clear_value_size = 4; + } + + /* Expand a small clear value size. */ + uint32_t tmp_clear_value; + if (clear_value_size <= 2) { + if (clear_value_size == 1) { + tmp_clear_value = *(uint8_t *)clear_value; + tmp_clear_value |= + (tmp_clear_value << 8) | (tmp_clear_value << 16) | (tmp_clear_value << 24); + } else { + tmp_clear_value = *(uint16_t *)clear_value; + tmp_clear_value |= tmp_clear_value << 16; + } + clear_value = &tmp_clear_value; + clear_value_size = 4; + } + + if (clear_value_size == 12) { + si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher); + return; + } + + uint64_t aligned_size = size & ~3ull; + if (aligned_size >= 4) { + /* Before GFX9, CP DMA was very slow when clearing GTT, so never + * use CP DMA clears on those chips, because we can't be certain + * about buffer placements. + */ + if (clear_value_size > 4 || (!force_cpdma && clear_value_size == 4 && offset % 4 == 0 && + (size > 32 * 1024 || sctx->chip_class <= GFX9))) { + si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value, + clear_value_size, coher); + } else { + assert(clear_value_size == 4); + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset, aligned_size, *clear_value, 0, + coher, get_cache_policy(sctx, coher, size)); + } + + offset += aligned_size; + size -= aligned_size; + } + + /* Handle non-dword alignment. 
*/ + if (size) { + assert(dst); + assert(dst->target == PIPE_BUFFER); + assert(size < 4); + + pipe_buffer_write(&sctx->b, dst, offset, size, clear_value); + } } -static void si_pipe_clear_buffer(struct pipe_context *ctx, - struct pipe_resource *dst, - unsigned offset, unsigned size, - const void *clear_value, - int clear_value_size) +static void si_pipe_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, + unsigned offset, unsigned size, const void *clear_value, + int clear_value_size) { - si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value, - clear_value_size, SI_COHERENCY_SHADER, false); + si_clear_buffer((struct si_context *)ctx, dst, offset, size, (uint32_t *)clear_value, + clear_value_size, SI_COHERENCY_SHADER, false); } -void si_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size) +void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, + uint64_t dst_offset, uint64_t src_offset, unsigned size) { - if (!size) - return; - - enum si_coherency coher = SI_COHERENCY_SHADER; - enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); - - /* Only use compute for VRAM copies on dGPUs. */ - if (sctx->screen->info.has_dedicated_vram && - si_resource(dst)->domains & RADEON_DOMAIN_VRAM && - si_resource(src)->domains & RADEON_DOMAIN_VRAM && - size > 32 * 1024 && - dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) { - si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, - size, NULL, 0, coher); - } else { - si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, - 0, coher, cache_policy); - } + if (!size) + return; + + enum si_coherency coher = SI_COHERENCY_SHADER; + enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); + + /* Only use compute for VRAM copies on dGPUs. */ + if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM && + si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > 32 * 1024 && dst_offset % 4 == 0 && + src_offset % 4 == 0 && size % 4 == 0) { + si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, coher); + } else { + si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, 0, coher, cache_policy); + } } -void si_compute_copy_image(struct si_context *sctx, - struct pipe_resource *dst, - unsigned dst_level, - struct pipe_resource *src, - unsigned src_level, - unsigned dstx, unsigned dsty, unsigned dstz, - const struct pipe_box *src_box) +void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level, + struct pipe_resource *src, unsigned src_level, unsigned dstx, + unsigned dsty, unsigned dstz, const struct pipe_box *src_box) { - struct pipe_context *ctx = &sctx->b; - unsigned width = src_box->width; - unsigned height = src_box->height; - unsigned depth = src_box->depth; - enum pipe_format src_format = util_format_linear(src->format); - enum pipe_format dst_format = util_format_linear(dst->format); - - assert(util_format_is_subsampled_422(src_format) == - util_format_is_subsampled_422(dst_format)); - - if (util_format_is_subsampled_422(src_format)) { - src_format = dst_format = PIPE_FORMAT_R32_UINT; - /* Interpreting 422 subsampled format (16 bpp) as 32 bpp - * should force us to divide src_box->x, dstx and width by 2. 
- * But given that ac_surface allocates this format as 32 bpp - * and that surf_size is then modified to pack the values - * we must keep the original values to get the correct results. - */ - } - unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, - dstx, dsty, dstz, 0}; - - if (width == 0 || height == 0) - return; - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - - /* The driver doesn't decompress resources automatically here. */ - si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, - dstz, dstz + src_box->depth - 1); - si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, - src_box->z, src_box->z + src_box->depth - 1); - - /* src and dst have the same number of samples. */ - si_make_CB_shader_coherent(sctx, src->nr_samples, true, - /* Only src can have DCC.*/ - ((struct si_texture*)src)->surface.u.gfx9.dcc.pipe_aligned); - - struct pipe_constant_buffer saved_cb = {}; - si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - - struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE]; - struct pipe_image_view saved_image[2] = {0}; - util_copy_image_view(&saved_image[0], &images->views[0]); - util_copy_image_view(&saved_image[1], &images->views[1]); - - void *saved_cs = sctx->cs_shader_state.program; - - struct pipe_constant_buffer cb = {}; - cb.buffer_size = sizeof(data); - cb.user_buffer = data; - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); - - struct pipe_image_view image[2] = {0}; - image[0].resource = src; - image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ; - image[0].format = src_format; - image[0].u.tex.level = src_level; - image[0].u.tex.first_layer = 0; - image[0].u.tex.last_layer = - src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1 - : (unsigned)(src->array_size - 1); - image[1].resource = dst; - image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE; - image[1].format = dst_format; - image[1].u.tex.level = dst_level; - image[1].u.tex.first_layer = 0; - image[1].u.tex.last_layer = - dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1 - : (unsigned)(dst->array_size - 1); - - if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT) - image[0].format = image[1].format = PIPE_FORMAT_R32_UINT; - - /* SNORM8 blitting has precision issues on some chips. Use the SINT - * equivalent instead, which doesn't force DCC decompression. - * Note that some chips avoid this issue by using SDMA. 
- */ - if (util_format_is_snorm8(dst->format)) { - image[0].format = image[1].format = - util_format_snorm8_to_sint8(dst->format); - } - - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image); - - struct pipe_grid_info info = {0}; - - if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) { - if (!sctx->cs_copy_image_1d_array) - sctx->cs_copy_image_1d_array = - si_create_copy_image_compute_shader_1d_array(ctx); - ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array); - info.block[0] = 64; - info.last_block[0] = width % 64; - info.block[1] = 1; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(width, 64); - info.grid[1] = depth; - info.grid[2] = 1; - } else { - if (!sctx->cs_copy_image) - sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx); - ctx->bind_compute_state(ctx, sctx->cs_copy_image); - info.block[0] = 8; - info.last_block[0] = width % 8; - info.block[1] = 8; - info.last_block[1] = height % 8; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(width, 8); - info.grid[1] = DIV_ROUND_UP(height, 8); - info.grid[2] = depth; - } - - si_launch_grid_internal(sctx, &info); - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - ctx->bind_compute_state(ctx, saved_cs); - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image); - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - for (int i = 0; i < 2; i++) - pipe_resource_reference(&saved_image[i].resource, NULL); - pipe_resource_reference(&saved_cb.buffer, NULL); + struct pipe_context *ctx = &sctx->b; + unsigned width = src_box->width; + unsigned height = src_box->height; + unsigned depth = src_box->depth; + enum pipe_format src_format = util_format_linear(src->format); + enum pipe_format dst_format = util_format_linear(dst->format); + + assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format)); + + if (util_format_is_subsampled_422(src_format)) { + src_format = dst_format = PIPE_FORMAT_R32_UINT; + /* Interpreting 422 subsampled format (16 bpp) as 32 bpp + * should force us to divide src_box->x, dstx and width by 2. + * But given that ac_surface allocates this format as 32 bpp + * and that surf_size is then modified to pack the values + * we must keep the original values to get the correct results. + */ + } + unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0}; + + if (width == 0 || height == 0) + return; + + sctx->flags |= + SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + + /* The driver doesn't decompress resources automatically here. */ + si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz, + dstz + src_box->depth - 1); + si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z, + src_box->z + src_box->depth - 1); + + /* src and dst have the same number of samples. 
*/ + si_make_CB_shader_coherent(sctx, src->nr_samples, true, + /* Only src can have DCC.*/ + ((struct si_texture *)src)->surface.u.gfx9.dcc.pipe_aligned); + + struct pipe_constant_buffer saved_cb = {}; + si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + + struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE]; + struct pipe_image_view saved_image[2] = {0}; + util_copy_image_view(&saved_image[0], &images->views[0]); + util_copy_image_view(&saved_image[1], &images->views[1]); + + void *saved_cs = sctx->cs_shader_state.program; + + struct pipe_constant_buffer cb = {}; + cb.buffer_size = sizeof(data); + cb.user_buffer = data; + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); + + struct pipe_image_view image[2] = {0}; + image[0].resource = src; + image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ; + image[0].format = src_format; + image[0].u.tex.level = src_level; + image[0].u.tex.first_layer = 0; + image[0].u.tex.last_layer = src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1 + : (unsigned)(src->array_size - 1); + image[1].resource = dst; + image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE; + image[1].format = dst_format; + image[1].u.tex.level = dst_level; + image[1].u.tex.first_layer = 0; + image[1].u.tex.last_layer = dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1 + : (unsigned)(dst->array_size - 1); + + if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT) + image[0].format = image[1].format = PIPE_FORMAT_R32_UINT; + + /* SNORM8 blitting has precision issues on some chips. Use the SINT + * equivalent instead, which doesn't force DCC decompression. + * Note that some chips avoid this issue by using SDMA. + */ + if (util_format_is_snorm8(dst->format)) { + image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format); + } + + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image); + + struct pipe_grid_info info = {0}; + + if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) { + if (!sctx->cs_copy_image_1d_array) + sctx->cs_copy_image_1d_array = si_create_copy_image_compute_shader_1d_array(ctx); + ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array); + info.block[0] = 64; + info.last_block[0] = width % 64; + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(width, 64); + info.grid[1] = depth; + info.grid[2] = 1; + } else { + if (!sctx->cs_copy_image) + sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx); + ctx->bind_compute_state(ctx, sctx->cs_copy_image); + info.block[0] = 8; + info.last_block[0] = width % 8; + info.block[1] = 8; + info.last_block[1] = height % 8; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(width, 8); + info.grid[1] = DIV_ROUND_UP(height, 8); + info.grid[2] = depth; + } + + si_launch_grid_internal(sctx, &info); + + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? 
SI_CONTEXT_WB_L2 : 0) | + si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image); + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + for (int i = 0; i < 2; i++) + pipe_resource_reference(&saved_image[i].resource, NULL); + pipe_resource_reference(&saved_cb.buffer, NULL); } void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) { - struct pipe_context *ctx = &sctx->b; - - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU); - sctx->emit_cache_flush(sctx); - - /* Save states. */ - void *saved_cs = sctx->cs_shader_state.program; - struct pipe_image_view saved_img[3] = {}; - - for (unsigned i = 0; i < 3; i++) { - util_copy_image_view(&saved_img[i], - &sctx->images[PIPE_SHADER_COMPUTE].views[i]); - } - - /* Set images. */ - bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16; - unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements; - struct pipe_image_view img[3]; - - assert(tex->surface.dcc_retile_map_offset && tex->surface.dcc_retile_map_offset <= UINT_MAX); - assert(tex->surface.dcc_offset && tex->surface.dcc_offset <= UINT_MAX); - assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX); - - for (unsigned i = 0; i < 3; i++) { - img[i].resource = &tex->buffer.b.b; - img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ; - img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER; - } - - img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT : - PIPE_FORMAT_R32G32B32A32_UINT; - img[0].u.buf.offset = tex->surface.dcc_retile_map_offset; - img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4); - - img[1].format = PIPE_FORMAT_R8_UINT; - img[1].u.buf.offset = tex->surface.dcc_offset; - img[1].u.buf.size = tex->surface.dcc_size; - - img[2].format = PIPE_FORMAT_R8_UINT; - img[2].u.buf.offset = tex->surface.display_dcc_offset; - img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size; - - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img); - - /* Bind the compute shader. */ - if (!sctx->cs_dcc_retile) - sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx); - ctx->bind_compute_state(ctx, sctx->cs_dcc_retile); - - /* Dispatch compute. */ - /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */ - unsigned num_threads = num_elements / 4; - - struct pipe_grid_info info = {}; - info.block[0] = 64; - info.block[1] = 1; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */ - info.grid[1] = 1; - info.grid[2] = 1; - info.last_block[0] = num_threads % 64; - - si_launch_grid_internal(sctx, &info); - - /* Don't flush caches or wait. The driver will wait at the end of this IB, - * and L2 will be flushed by the kernel fence. - */ - - /* Restore states. */ - ctx->bind_compute_state(ctx, saved_cs); - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img); - - for (unsigned i = 0; i < 3; i++) { - pipe_resource_reference(&saved_img[i].resource, NULL); - } + struct pipe_context *ctx = &sctx->b; + + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | + si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) | + si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU); + sctx->emit_cache_flush(sctx); + + /* Save states. 
*/ + void *saved_cs = sctx->cs_shader_state.program; + struct pipe_image_view saved_img[3] = {}; + + for (unsigned i = 0; i < 3; i++) { + util_copy_image_view(&saved_img[i], &sctx->images[PIPE_SHADER_COMPUTE].views[i]); + } + + /* Set images. */ + bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16; + unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements; + struct pipe_image_view img[3]; + + assert(tex->surface.dcc_retile_map_offset && tex->surface.dcc_retile_map_offset <= UINT_MAX); + assert(tex->surface.dcc_offset && tex->surface.dcc_offset <= UINT_MAX); + assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX); + + for (unsigned i = 0; i < 3; i++) { + img[i].resource = &tex->buffer.b.b; + img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ; + img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER; + } + + img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT : PIPE_FORMAT_R32G32B32A32_UINT; + img[0].u.buf.offset = tex->surface.dcc_retile_map_offset; + img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4); + + img[1].format = PIPE_FORMAT_R8_UINT; + img[1].u.buf.offset = tex->surface.dcc_offset; + img[1].u.buf.size = tex->surface.dcc_size; + + img[2].format = PIPE_FORMAT_R8_UINT; + img[2].u.buf.offset = tex->surface.display_dcc_offset; + img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size; + + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img); + + /* Bind the compute shader. */ + if (!sctx->cs_dcc_retile) + sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx); + ctx->bind_compute_state(ctx, sctx->cs_dcc_retile); + + /* Dispatch compute. */ + /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */ + unsigned num_threads = num_elements / 4; + + struct pipe_grid_info info = {}; + info.block[0] = 64; + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */ + info.grid[1] = 1; + info.grid[2] = 1; + info.last_block[0] = num_threads % 64; + + si_launch_grid_internal(sctx, &info); + + /* Don't flush caches or wait. The driver will wait at the end of this IB, + * and L2 will be flushed by the kernel fence. + */ + + /* Restore states. */ + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img); + + for (unsigned i = 0; i < 3; i++) { + pipe_resource_reference(&saved_img[i].resource, NULL); + } } /* Expand FMASK to make it identity, so that image stores can ignore it. */ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex) { - struct si_context *sctx = (struct si_context *)ctx; - bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY; - unsigned log_fragments = util_logbase2(tex->nr_storage_samples); - unsigned log_samples = util_logbase2(tex->nr_samples); - assert(tex->nr_samples >= 2); - - /* EQAA FMASK expansion is unimplemented. */ - if (tex->nr_samples != tex->nr_storage_samples) - return; - - /* Flush caches and sync engines. */ - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - si_make_CB_shader_coherent(sctx, tex->nr_samples, true, - true /* DCC is not possible with image stores */); - - /* Save states. */ - void *saved_cs = sctx->cs_shader_state.program; - struct pipe_image_view saved_image = {0}; - util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]); - - /* Bind the image. 
*/ - struct pipe_image_view image = {0}; - image.resource = tex; - /* Don't set WRITE so as not to trigger FMASK expansion, causing - * an infinite loop. */ - image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ; - image.format = util_format_linear(tex->format); - if (is_array) - image.u.tex.last_layer = tex->array_size - 1; - - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image); - - /* Bind the shader. */ - void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array]; - if (!*shader) - *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array); - ctx->bind_compute_state(ctx, *shader); - - /* Dispatch compute. */ - struct pipe_grid_info info = {0}; - info.block[0] = 8; - info.last_block[0] = tex->width0 % 8; - info.block[1] = 8; - info.last_block[1] = tex->height0 % 8; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(tex->width0, 8); - info.grid[1] = DIV_ROUND_UP(tex->height0, 8); - info.grid[2] = is_array ? tex->array_size : 1; - - si_launch_grid_internal(sctx, &info); - - /* Flush caches and sync engines. */ - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - - /* Restore previous states. */ - ctx->bind_compute_state(ctx, saved_cs); - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image); - pipe_resource_reference(&saved_image.resource, NULL); - - /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */ + struct si_context *sctx = (struct si_context *)ctx; + bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY; + unsigned log_fragments = util_logbase2(tex->nr_storage_samples); + unsigned log_samples = util_logbase2(tex->nr_samples); + assert(tex->nr_samples >= 2); + + /* EQAA FMASK expansion is unimplemented. */ + if (tex->nr_samples != tex->nr_storage_samples) + return; + + /* Flush caches and sync engines. */ + sctx->flags |= + SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + si_make_CB_shader_coherent(sctx, tex->nr_samples, true, + true /* DCC is not possible with image stores */); + + /* Save states. */ + void *saved_cs = sctx->cs_shader_state.program; + struct pipe_image_view saved_image = {0}; + util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]); + + /* Bind the image. */ + struct pipe_image_view image = {0}; + image.resource = tex; + /* Don't set WRITE so as not to trigger FMASK expansion, causing + * an infinite loop. */ + image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ; + image.format = util_format_linear(tex->format); + if (is_array) + image.u.tex.last_layer = tex->array_size - 1; + + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image); + + /* Bind the shader. */ + void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array]; + if (!*shader) + *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array); + ctx->bind_compute_state(ctx, *shader); + + /* Dispatch compute. */ + struct pipe_grid_info info = {0}; + info.block[0] = 8; + info.last_block[0] = tex->width0 % 8; + info.block[1] = 8; + info.last_block[1] = tex->height0 % 8; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(tex->width0, 8); + info.grid[1] = DIV_ROUND_UP(tex->height0, 8); + info.grid[2] = is_array ? tex->array_size : 1; + + si_launch_grid_internal(sctx, &info); + + /* Flush caches and sync engines. */ + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? 
SI_CONTEXT_WB_L2 : 0) | + si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + + /* Restore previous states. */ + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image); + pipe_resource_reference(&saved_image.resource, NULL); + + /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */ #define INVALID 0 /* never used */ - static const uint64_t fmask_expand_values[][4] = { - /* samples */ - /* 2 (8 bpp) 4 (8 bpp) 8 (8-32bpp) 16 (16-64bpp) fragments */ - {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE}, /* 1 */ - {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4}, /* 2 */ - {INVALID, 0xE4E4E4E4, 0x44443210, 0x4444444444443210}, /* 4 */ - {INVALID, INVALID, 0x76543210, 0x8888888876543210}, /* 8 */ - }; - - /* Clear FMASK to identity. */ - struct si_texture *stex = (struct si_texture*)tex; - si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size, - (uint32_t*)&fmask_expand_values[log_fragments][log_samples - 1], - 4, SI_COHERENCY_SHADER, false); + static const uint64_t fmask_expand_values[][4] = { + /* samples */ + /* 2 (8 bpp) 4 (8 bpp) 8 (8-32bpp) 16 (16-64bpp) fragments */ + {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE}, /* 1 */ + {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4}, /* 2 */ + {INVALID, 0xE4E4E4E4, 0x44443210, 0x4444444444443210}, /* 4 */ + {INVALID, INVALID, 0x76543210, 0x8888888876543210}, /* 8 */ + }; + + /* Clear FMASK to identity. */ + struct si_texture *stex = (struct si_texture *)tex; + si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size, + (uint32_t *)&fmask_expand_values[log_fragments][log_samples - 1], 4, + SI_COHERENCY_SHADER, false); } void si_init_compute_blit_functions(struct si_context *sctx) { - sctx->b.clear_buffer = si_pipe_clear_buffer; + sctx->b.clear_buffer = si_pipe_clear_buffer; } /* Clear a region of a color surface to a constant value. */ -void si_compute_clear_render_target(struct pipe_context *ctx, - struct pipe_surface *dstsurf, - const union pipe_color_union *color, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, - bool render_condition_enabled) +void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf, + const union pipe_color_union *color, unsigned dstx, + unsigned dsty, unsigned width, unsigned height, + bool render_condition_enabled) { - struct si_context *sctx = (struct si_context *)ctx; - unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1; - unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0}; - - if (width == 0 || height == 0) - return; - - /* The driver doesn't decompress resources automatically here. 
*/ - si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, - dstsurf->u.tex.level, dstsurf->u.tex.first_layer, - dstsurf->u.tex.last_layer); - - if (util_format_is_srgb(dstsurf->format)) { - union pipe_color_union color_srgb; - for (int i = 0; i < 3; i++) - color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]); - color_srgb.f[3] = color->f[3]; - memcpy(data + 4, color_srgb.ui, sizeof(color->ui)); - } else { - memcpy(data + 4, color->ui, sizeof(color->ui)); - } - - sctx->render_cond_force_off = !render_condition_enabled; - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true, - true /* DCC is not possible with image stores */); - - struct pipe_constant_buffer saved_cb = {}; - si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - - struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE]; - struct pipe_image_view saved_image = {0}; - util_copy_image_view(&saved_image, &images->views[0]); - - void *saved_cs = sctx->cs_shader_state.program; - - struct pipe_constant_buffer cb = {}; - cb.buffer_size = sizeof(data); - cb.user_buffer = data; - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); - - struct pipe_image_view image = {0}; - image.resource = dstsurf->texture; - image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE; - image.format = util_format_linear(dstsurf->format); - image.u.tex.level = dstsurf->u.tex.level; - image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */ - image.u.tex.last_layer = dstsurf->u.tex.last_layer; - - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image); - - struct pipe_grid_info info = {0}; - - if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) { - if (!sctx->cs_clear_render_target) - sctx->cs_clear_render_target = si_clear_render_target_shader(ctx); - ctx->bind_compute_state(ctx, sctx->cs_clear_render_target); - info.block[0] = 8; - info.last_block[0] = width % 8; - info.block[1] = 8; - info.last_block[1] = height % 8; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(width, 8); - info.grid[1] = DIV_ROUND_UP(height, 8); - info.grid[2] = num_layers; - } else { - if (!sctx->cs_clear_render_target_1d_array) - sctx->cs_clear_render_target_1d_array = - si_clear_render_target_shader_1d_array(ctx); - ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array); - info.block[0] = 64; - info.last_block[0] = width % 64; - info.block[1] = 1; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(width, 64); - info.grid[1] = num_layers; - info.grid[2] = 1; - } - - si_launch_grid_internal(sctx, &info); - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - ctx->bind_compute_state(ctx, saved_cs); - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image); - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - pipe_resource_reference(&saved_image.resource, NULL); - pipe_resource_reference(&saved_cb.buffer, NULL); + struct si_context *sctx = (struct si_context *)ctx; + unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1; + unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0}; + + if (width == 0 || height == 0) + return; + + /* The driver doesn't decompress resources automatically here. 
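 *
 * Right below, after decompression, the clear offsets and color are packed into
 * a small user constant buffer laid out as {dstx, dsty, first_layer, pad, color[4]}.
 * A minimal host-side sketch of that packing; linear_to_srgb() is a generic
 * reimplementation of the sRGB transfer function standing in for
 * util_format_linear_to_srgb_float(), not the helper itself:
 *
 *    #include <math.h>
 *    #include <stdbool.h>
 *    #include <stdint.h>
 *    #include <string.h>
 *
 *    static float linear_to_srgb(float c)
 *    {
 *       return c <= 0.0031308f ? c * 12.92f : 1.055f * powf(c, 1.0f / 2.4f) - 0.055f;
 *    }
 *
 *    static void pack_clear_constants(uint32_t data[8], unsigned dstx, unsigned dsty,
 *                                     unsigned first_layer, const float color[4],
 *                                     bool dst_is_srgb)
 *    {
 *       float c[4] = {color[0], color[1], color[2], color[3]};
 *
 *       data[0] = dstx;
 *       data[1] = dsty;
 *       data[2] = first_layer;
 *       data[3] = 0;
 *
 *       if (dst_is_srgb) {
 *          for (int i = 0; i < 3; i++)
 *             c[i] = linear_to_srgb(c[i]);   // alpha stays linear
 *       }
 *       memcpy(data + 4, c, sizeof(c));
 *    }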
*/ + si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, dstsurf->u.tex.level, + dstsurf->u.tex.first_layer, dstsurf->u.tex.last_layer); + + if (util_format_is_srgb(dstsurf->format)) { + union pipe_color_union color_srgb; + for (int i = 0; i < 3; i++) + color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]); + color_srgb.f[3] = color->f[3]; + memcpy(data + 4, color_srgb.ui, sizeof(color->ui)); + } else { + memcpy(data + 4, color->ui, sizeof(color->ui)); + } + + sctx->render_cond_force_off = !render_condition_enabled; + + sctx->flags |= + SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true, + true /* DCC is not possible with image stores */); + + struct pipe_constant_buffer saved_cb = {}; + si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + + struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE]; + struct pipe_image_view saved_image = {0}; + util_copy_image_view(&saved_image, &images->views[0]); + + void *saved_cs = sctx->cs_shader_state.program; + + struct pipe_constant_buffer cb = {}; + cb.buffer_size = sizeof(data); + cb.user_buffer = data; + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); + + struct pipe_image_view image = {0}; + image.resource = dstsurf->texture; + image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE; + image.format = util_format_linear(dstsurf->format); + image.u.tex.level = dstsurf->u.tex.level; + image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */ + image.u.tex.last_layer = dstsurf->u.tex.last_layer; + + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image); + + struct pipe_grid_info info = {0}; + + if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) { + if (!sctx->cs_clear_render_target) + sctx->cs_clear_render_target = si_clear_render_target_shader(ctx); + ctx->bind_compute_state(ctx, sctx->cs_clear_render_target); + info.block[0] = 8; + info.last_block[0] = width % 8; + info.block[1] = 8; + info.last_block[1] = height % 8; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(width, 8); + info.grid[1] = DIV_ROUND_UP(height, 8); + info.grid[2] = num_layers; + } else { + if (!sctx->cs_clear_render_target_1d_array) + sctx->cs_clear_render_target_1d_array = si_clear_render_target_shader_1d_array(ctx); + ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array); + info.block[0] = 64; + info.last_block[0] = width % 64; + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(width, 64); + info.grid[1] = num_layers; + info.grid[2] = 1; + } + + si_launch_grid_internal(sctx, &info); + + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? 
SI_CONTEXT_WB_L2 : 0) | + si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image); + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + pipe_resource_reference(&saved_image.resource, NULL); + pipe_resource_reference(&saved_cb.buffer, NULL); } diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c index 7f985ad3c62..389233835eb 100644 --- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c @@ -23,16 +23,15 @@ * */ +#include "ac_llvm_cull.h" +#include "si_build_pm4.h" #include "si_pipe.h" #include "si_shader_internal.h" #include "sid.h" -#include "si_build_pm4.h" -#include "ac_llvm_cull.h" - +#include "util/fast_idiv_by_const.h" #include "util/u_prim.h" #include "util/u_suballoc.h" #include "util/u_upload_mgr.h" -#include "util/fast_idiv_by_const.h" /* Based on: * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf @@ -154,1453 +153,1354 @@ /* At least 256 is needed for the fastest wave launch rate from compute queues * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */ -#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ -#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ -#define MAX_WAVES_PER_SH 0 /* no limit */ -#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ +#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ +#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ +#define MAX_WAVES_PER_SH 0 /* no limit */ +#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ /* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */ -#define CULL_Z 0 +#define CULL_Z 0 /* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */ -#define VERTEX_COUNTER_GDS_MODE 2 -#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */ +#define VERTEX_COUNTER_GDS_MODE 2 +#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */ /* Grouping compute dispatches for small draw calls: How many primitives from multiple * draw calls to process by compute before signaling the gfx IB. This reduces the number * of EOP events + REWIND packets, because they decrease performance. */ -#define PRIMS_PER_BATCH (512 * 1024) +#define PRIMS_PER_BATCH (512 * 1024) /* Draw call splitting at the packet level. This allows signaling the gfx IB * for big draw calls sooner, but doesn't allow context flushes between packets. * Primitive restart is supported. Only implemented for ordered append. */ -#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH +#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH /* If there is not enough ring buffer space for the current IB, split draw calls into * this number of primitives, so that we can flush the context and get free ring space. */ -#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH +#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH /* Derived values. */ -#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) -#define SPLIT_PRIMS_PACKET_LEVEL (VERTEX_COUNTER_GDS_MODE == 2 ? 
\ - SPLIT_PRIMS_PACKET_LEVEL_VALUE : \ - UINT_MAX & ~(THREADGROUP_SIZE - 1)) +#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) +#define SPLIT_PRIMS_PACKET_LEVEL \ + (VERTEX_COUNTER_GDS_MODE == 2 ? SPLIT_PRIMS_PACKET_LEVEL_VALUE \ + : UINT_MAX & ~(THREADGROUP_SIZE - 1)) -#define REWIND_SIGNAL_BIT 0x80000000 +#define REWIND_SIGNAL_BIT 0x80000000 /* For emulating the rewind packet on CI. */ -#define FORCE_REWIND_EMULATION 0 +#define FORCE_REWIND_EMULATION 0 -void si_initialize_prim_discard_tunables(struct si_screen *sscreen, - bool is_aux_context, - unsigned *prim_discard_vertex_count_threshold, - unsigned *index_ring_size_per_ib) +void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, + unsigned *prim_discard_vertex_count_threshold, + unsigned *index_ring_size_per_ib) { - *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ - - if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */ - !sscreen->info.has_gds_ordered_append || - sscreen->debug_flags & DBG(NO_PD) || - is_aux_context) - return; - - /* TODO: enable this after the GDS kernel memory management is fixed */ - bool enable_on_pro_graphics_by_default = false; - - if (sscreen->debug_flags & DBG(ALWAYS_PD) || - sscreen->debug_flags & DBG(PD) || - (enable_on_pro_graphics_by_default && - sscreen->info.is_pro_graphics && - (sscreen->info.family == CHIP_BONAIRE || - sscreen->info.family == CHIP_HAWAII || - sscreen->info.family == CHIP_TONGA || - sscreen->info.family == CHIP_FIJI || - sscreen->info.family == CHIP_POLARIS10 || - sscreen->info.family == CHIP_POLARIS11 || - sscreen->info.family == CHIP_VEGA10 || - sscreen->info.family == CHIP_VEGA20))) { - *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ - - if (sscreen->debug_flags & DBG(ALWAYS_PD)) - *prim_discard_vertex_count_threshold = 0; /* always enable */ - - const uint32_t MB = 1024 * 1024; - const uint64_t GB = 1024 * 1024 * 1024; - - /* The total size is double this per context. - * Greater numbers allow bigger gfx IBs. - */ - if (sscreen->info.vram_size <= 2 * GB) - *index_ring_size_per_ib = 64 * MB; - else if (sscreen->info.vram_size <= 4 * GB) - *index_ring_size_per_ib = 128 * MB; - else - *index_ring_size_per_ib = 256 * MB; - } + *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ + + if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */ + !sscreen->info.has_gds_ordered_append || sscreen->debug_flags & DBG(NO_PD) || is_aux_context) + return; + + /* TODO: enable this after the GDS kernel memory management is fixed */ + bool enable_on_pro_graphics_by_default = false; + + if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) || + (enable_on_pro_graphics_by_default && sscreen->info.is_pro_graphics && + (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_HAWAII || + sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI || + sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 || + sscreen->info.family == CHIP_VEGA10 || sscreen->info.family == CHIP_VEGA20))) { + *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ + + if (sscreen->debug_flags & DBG(ALWAYS_PD)) + *prim_discard_vertex_count_threshold = 0; /* always enable */ + + const uint32_t MB = 1024 * 1024; + const uint64_t GB = 1024 * 1024 * 1024; + + /* The total size is double this per context. + * Greater numbers allow bigger gfx IBs. 
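 *
 * As a self-contained sketch, the ring-size selection below boils down to
 * (sizes in bytes; the thresholds mirror the code that follows):
 *
 *    #include <stdint.h>
 *
 *    static uint32_t index_ring_size_per_ib(uint64_t vram_size)
 *    {
 *       const uint32_t MB = 1024 * 1024;
 *       const uint64_t GB = 1024ull * 1024 * 1024;
 *
 *       if (vram_size <= 2 * GB)
 *          return 64 * MB;
 *       if (vram_size <= 4 * GB)
 *          return 128 * MB;
 *       return 256 * MB;   // per IB; the total is double this per context
 *    }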
+ */ + if (sscreen->info.vram_size <= 2 * GB) + *index_ring_size_per_ib = 64 * MB; + else if (sscreen->info.vram_size <= 4 * GB) + *index_ring_size_per_ib = 128 * MB; + else + *index_ring_size_per_ib = 256 * MB; + } } /* Opcode can be "add" or "swap". */ -static LLVMValueRef -si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode, - LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index, - bool release, bool done) +static LLVMValueRef si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode, + LLVMValueRef m0, LLVMValueRef value, + unsigned ordered_count_index, bool release, bool done) { - if (ctx->screen->info.chip_class >= GFX10) - ordered_count_index |= 1 << 24; /* number of dwords == 1 */ - - LLVMValueRef args[] = { - LLVMBuildIntToPtr(ctx->ac.builder, m0, - LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""), - value, - LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ - ctx->ac.i32_0, /* scope */ - ctx->ac.i1false, /* volatile */ - LLVMConstInt(ctx->ac.i32, ordered_count_index, 0), - LLVMConstInt(ctx->ac.i1, release, 0), - LLVMConstInt(ctx->ac.i1, done, 0), - }; - - char intrinsic[64]; - snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode); - return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0); + if (ctx->screen->info.chip_class >= GFX10) + ordered_count_index |= 1 << 24; /* number of dwords == 1 */ + + LLVMValueRef args[] = { + LLVMBuildIntToPtr(ctx->ac.builder, m0, LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""), + value, + LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ + ctx->ac.i32_0, /* scope */ + ctx->ac.i1false, /* volatile */ + LLVMConstInt(ctx->ac.i32, ordered_count_index, 0), + LLVMConstInt(ctx->ac.i1, release, 0), + LLVMConstInt(ctx->ac.i1, done, 0), + }; + + char intrinsic[64]; + snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode); + return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0); } static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr) { - uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; - ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, ""); - ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), ""); - return LLVMBuildIntToPtr(ctx->ac.builder, ptr, - LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), ""); + uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; + ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, ""); + ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), ""); + return LLVMBuildIntToPtr(ctx->ac.builder, ptr, + LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), ""); } struct si_thread0_section { - struct si_shader_context *ctx; - LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ - LLVMValueRef saved_exec; + struct si_shader_context *ctx; + LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ + LLVMValueRef saved_exec; }; /* Enter a section that only executes on thread 0. 
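 *
 * Typical usage of the enter/exit pair defined below, sketched from the call
 * sites further down (thread_id is the lane index within the wave; the value
 * computed inside the section ends up uniform across the wave via readlane):
 *
 *    struct si_thread0_section section;
 *    LLVMValueRef result;
 *
 *    si_enter_thread0_section(ctx, &section, thread_id);
 *    {
 *       result = ...;   // e.g. one atomic or ordered-count op, done by lane 0 only
 *    }
 *    si_exit_thread0_section(&section, &result);
 *    // result now holds lane 0's value in every lane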
*/ static void si_enter_thread0_section(struct si_shader_context *ctx, - struct si_thread0_section *section, - LLVMValueRef thread_id) + struct si_thread0_section *section, LLVMValueRef thread_id) { - section->ctx = ctx; - section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0"); - - /* This IF has 4 instructions: - * v_and_b32_e32 v, 63, v ; get the thread ID - * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 - * s_and_saveexec_b64 s, vcc - * s_cbranch_execz BB0_4 - * - * It could just be s_and_saveexec_b64 s, 1. - */ - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, - ctx->ac.i32_0, ""), 12601); + section->ctx = ctx; + section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0"); + + /* This IF has 4 instructions: + * v_and_b32_e32 v, 63, v ; get the thread ID + * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 + * s_and_saveexec_b64 s, vcc + * s_cbranch_execz BB0_4 + * + * It could just be s_and_saveexec_b64 s, 1. + */ + ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, ""), + 12601); } /* Exit a section that only executes on thread 0 and broadcast the result * to all threads. */ -static void si_exit_thread0_section(struct si_thread0_section *section, - LLVMValueRef *result) +static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result) { - struct si_shader_context *ctx = section->ctx; + struct si_shader_context *ctx = section->ctx; - LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); + LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); - ac_build_endif(&ctx->ac, 12601); + ac_build_endif(&ctx->ac, 12601); - /* Broadcast the result from thread 0 to all threads. */ - *result = ac_build_readlane(&ctx->ac, - LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); + /* Broadcast the result from thread 0 to all threads. */ + *result = + ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); } void si_build_prim_discard_compute_shader(struct si_shader_context *ctx) { - struct si_shader_key *key = &ctx->shader->key; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef vs = ctx->main_fn; - - /* Always inline the VS function. 
*/ - ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); - LLVMSetLinkage(vs, LLVMPrivateLinkage); - - enum ac_arg_type const_desc_type; - if (ctx->shader->selector->info.const_buffers_declared == 1 && - ctx->shader->selector->info.shader_buffers_declared == 0) - const_desc_type = AC_ARG_CONST_FLOAT_PTR; - else - const_desc_type = AC_ARG_CONST_DESC_PTR; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - struct ac_arg param_index_buffers_and_constants, param_vertex_counter; - struct ac_arg param_vb_desc, param_const_desc; - struct ac_arg param_base_vertex, param_start_instance; - struct ac_arg param_block_id, param_local_id, param_ordered_wave_id; - struct ac_arg param_restart_index, param_smallprim_precision; - struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms; - struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr; - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, - ¶m_index_buffers_and_constants); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_counter); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_last_wave_prim_id); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_count_addr); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, - ¶m_vb_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, - ¶m_const_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, - ¶m_sampler_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_instance); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_multiplier); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_terms); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_restart_index); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, ¶m_smallprim_precision); - - /* Block ID and thread ID inputs. */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_block_id); - if (VERTEX_COUNTER_GDS_MODE == 2) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_ordered_wave_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, ¶m_local_id); - - /* Create the compute shader function. */ - unsigned old_type = ctx->type; - ctx->type = PIPE_SHADER_COMPUTE; - si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE); - ctx->type = old_type; - - if (VERTEX_COUNTER_GDS_MODE == 2) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, - "amdgpu-gds-size", 256); - } else if (VERTEX_COUNTER_GDS_MODE == 1) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", - GDS_SIZE_UNORDERED); - } - - /* Assemble parameters for VS. 
*/ - LLVMValueRef vs_params[16]; - unsigned num_vs_params = 0; - unsigned param_vertex_id, param_instance_id; - - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */ - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc); - vs_params[num_vs_params++] = LLVMConstInt(ctx->ac.i32, - S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance); - vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */ - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc); - - vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ - vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */ - - assert(num_vs_params <= ARRAY_SIZE(vs_params)); - assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); - - /* Load descriptors. (load 8 dwords at once) */ - LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; - - LLVMValueRef index_buffers_and_constants = ac_get_arg(&ctx->ac, param_index_buffers_and_constants); - tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, - ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); - tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0); - - for (unsigned i = 0; i < 8; i++) - desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); - - input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); - output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); - - /* Compute PrimID and InstanceID. */ - LLVMValueRef global_thread_id = - ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id), - LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0), - ac_get_arg(&ctx->ac, param_local_id)); - LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */ - LLVMValueRef instance_id = ctx->ac.i32_0; - - if (key->opt.cs_instancing) { - LLVMValueRef num_prims_udiv_terms = - ac_get_arg(&ctx->ac, param_num_prims_udiv_terms); - LLVMValueRef num_prims_udiv_multiplier = - ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier); - /* Unpack num_prims_udiv_terms. */ - LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms, - LLVMConstInt(ctx->ac.i32, 0x1f, 0), ""); - LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms, - LLVMConstInt(ctx->ac.i32, 5, 0), ""); - /* Divide the total prim_id by the number of prims per instance. */ - instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, - num_prims_udiv_multiplier, - post_shift); - /* Compute the remainder. */ - prim_id = LLVMBuildSub(builder, prim_id, - LLVMBuildMul(builder, instance_id, - prims_per_instance, ""), ""); - } - - /* Generate indices (like a non-indexed draw call). 
*/ - LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)}; - unsigned vertices_per_prim = 3; - - switch (key->opt.cs_prim_type) { - case PIPE_PRIM_TRIANGLES: - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_imad(&ctx->ac, prim_id, - LLVMConstInt(ctx->ac.i32, 3, 0), - LLVMConstInt(ctx->ac.i32, i, 0)); - } - break; - case PIPE_PRIM_TRIANGLE_STRIP: - for (unsigned i = 0; i < 3; i++) { - index[i] = LLVMBuildAdd(builder, prim_id, - LLVMConstInt(ctx->ac.i32, i, 0), ""); - } - break; - case PIPE_PRIM_TRIANGLE_FAN: - /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper - * and rasterizer as a normal triangle, so we need to put the provoking - * vertex into the correct index variable and preserve orientation at the same time. - * gl_VertexID is preserved, because it's equal to the index. - */ - if (key->opt.cs_provoking_vertex_first) { - index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - index[2] = ctx->ac.i32_0; - } else { - index[0] = ctx->ac.i32_0; - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); - index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - } - break; - default: - unreachable("unexpected primitive type"); - } - - /* Fetch indices. */ - if (key->opt.cs_indexed) { - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, - index[i], ctx->ac.i32_0, 1, - 0, true); - index[i] = ac_to_integer(&ctx->ac, index[i]); - } - } - - LLVMValueRef ordered_wave_id = NULL; - - /* Extract the ordered wave ID. */ - if (VERTEX_COUNTER_GDS_MODE == 2) { - ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id); - ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id, - LLVMConstInt(ctx->ac.i32, 6, 0), ""); - ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id, - LLVMConstInt(ctx->ac.i32, 0xfff, 0), ""); - } - LLVMValueRef thread_id = - LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id), - LLVMConstInt(ctx->ac.i32, 63, 0), ""); - - /* Every other triangle in a strip has a reversed vertex order, so we - * need to swap vertices of odd primitives to get the correct primitive - * orientation when converting triangle strips to triangles. Primitive - * restart complicates it, because a strip can start anywhere. - */ - LLVMValueRef prim_restart_accepted = ctx->ac.i1true; - LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter); - - if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { - /* Without primitive restart, odd primitives have reversed orientation. - * Only primitive restart can flip it with respect to the first vertex - * of the draw call. - */ - LLVMValueRef first_is_odd = ctx->ac.i1false; - - /* Handle primitive restart. */ - if (key->opt.cs_primitive_restart) { - /* Get the GDS primitive restart continue flag and clear - * the flag in vertex_counter. This flag is used when the draw - * call was split and we need to load the primitive orientation - * flag from GDS for the first wave too. 
- */ - LLVMValueRef gds_prim_restart_continue = - LLVMBuildLShr(builder, vertex_counter, - LLVMConstInt(ctx->ac.i32, 31, 0), ""); - gds_prim_restart_continue = - LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, ""); - vertex_counter = LLVMBuildAnd(builder, vertex_counter, - LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), ""); - - LLVMValueRef index0_is_reset; - - for (unsigned i = 0; i < 3; i++) { - LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i], - ac_get_arg(&ctx->ac, param_restart_index), - ""); - if (i == 0) - index0_is_reset = LLVMBuildNot(builder, not_reset, ""); - prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, - not_reset, ""); - } - - /* If the previous waves flip the primitive orientation - * of the current triangle strip, it will be stored in GDS. - * - * Sometimes the correct orientation is not needed, in which case - * we don't need to execute this. - */ - if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) { - /* If there are reset indices in this wave, get the thread index - * where the most recent strip starts relative to each thread. - */ - LLVMValueRef preceding_threads_mask = - LLVMBuildSub(builder, - LLVMBuildShl(builder, ctx->ac.i64_1, - LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""), - ctx->ac.i64_1, ""); - - LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset); - LLVMValueRef preceding_reset_threadmask = - LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, ""); - LLVMValueRef strip_start = - ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL); - strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, ""); - - /* This flips the orientatino based on reset indices within this wave only. */ - first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, ""); - - LLVMValueRef last_strip_start, prev_wave_state, ret, tmp; - LLVMValueRef is_first_wave, current_wave_resets_index; - - /* Get the thread index where the last strip starts in this wave. - * - * If the last strip doesn't start in this wave, the thread index - * will be 0. - * - * If the last strip starts in the next wave, the thread index will - * be 64. - */ - last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL); - last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, ""); - - struct si_thread0_section section; - si_enter_thread0_section(ctx, §ion, thread_id); - - /* This must be done in the thread 0 section, because - * we expect PrimID to be 0 for the whole first wave - * in this expression. - * - * NOTE: This will need to be different if we wanna support - * instancing with primitive restart. - */ - is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, ""); - is_first_wave = LLVMBuildAnd(builder, is_first_wave, - LLVMBuildNot(builder, - gds_prim_restart_continue, ""), ""); - current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE, - last_strip_start, ctx->ac.i32_0, ""); - - ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state"); - - /* Save the last strip start primitive index in GDS and read - * the value that previous waves stored. - * - * if (is_first_wave || current_wave_resets_strip) - * // Read the value that previous waves stored and store a new one. - * first_is_odd = ds.ordered.swap(last_strip_start); - * else - * // Just read the value that previous waves stored. 
- * first_is_odd = ds.ordered.add(0); - */ - ac_build_ifcc(&ctx->ac, - LLVMBuildOr(builder, is_first_wave, - current_wave_resets_index, ""), 12602); - { - /* The GDS address is always 0 with ordered append. */ - tmp = si_build_ds_ordered_op(ctx, "swap", - ordered_wave_id, last_strip_start, - 1, true, false); - LLVMBuildStore(builder, tmp, ret); - } - ac_build_else(&ctx->ac, 12603); - { - /* Just read the value from GDS. */ - tmp = si_build_ds_ordered_op(ctx, "add", - ordered_wave_id, ctx->ac.i32_0, - 1, true, false); - LLVMBuildStore(builder, tmp, ret); - } - ac_build_endif(&ctx->ac, 12602); - - prev_wave_state = LLVMBuildLoad(builder, ret, ""); - /* Ignore the return value if this is the first wave. */ - prev_wave_state = LLVMBuildSelect(builder, is_first_wave, - ctx->ac.i32_0, prev_wave_state, ""); - si_exit_thread0_section(§ion, &prev_wave_state); - prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, ""); - - /* If the strip start appears to be on thread 0 for the current primitive - * (meaning the reset index is not present in this wave and might have - * appeared in previous waves), use the value from GDS to determine - * primitive orientation. - * - * If the strip start is in this wave for the current primitive, use - * the value from the current wave to determine primitive orientation. - */ - LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ, - strip_start, ctx->ac.i32_0, ""); - first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, - first_is_odd, ""); - } - } - /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */ - LLVMValueRef prim_is_odd = - LLVMBuildXor(builder, first_is_odd, - LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), ""); - - /* Convert triangle strip indices to triangle indices. */ - ac_build_triangle_strip_indices_to_triangle(&ctx->ac, prim_is_odd, - LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0), - index); - } - - /* Execute the vertex shader for each vertex to get vertex positions. */ - LLVMValueRef pos[3][4]; - for (unsigned i = 0; i < vertices_per_prim; i++) { - vs_params[param_vertex_id] = index[i]; - vs_params[param_instance_id] = instance_id; - - LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params); - for (unsigned chan = 0; chan < 4; chan++) - pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); - } - - /* Divide XYZ by W. */ - for (unsigned i = 0; i < vertices_per_prim; i++) { - for (unsigned chan = 0; chan < 3; chan++) - pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); - } - - /* Load the viewport state. */ - LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, - LLVMConstInt(ctx->ac.i32, 2, 0)); - vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); - LLVMValueRef vp_scale[2], vp_translate[2]; - vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); - vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); - vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); - vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); - - /* Do culling. 
*/ - struct ac_cull_options options = {}; - options.cull_front = key->opt.cs_cull_front; - options.cull_back = key->opt.cs_cull_back; - options.cull_view_xy = true; - options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z; - options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z; - options.cull_small_prims = true; - options.cull_zero_area = true; - options.cull_w = true; - options.use_halfz_clip_space = key->opt.cs_halfz_clip_space; - - LLVMValueRef accepted = - ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, - vp_scale, vp_translate, - ac_get_arg(&ctx->ac, param_smallprim_precision), - &options); - - ac_build_optimization_barrier(&ctx->ac, &accepted); - LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); - - /* Count the number of active threads by doing bitcount(accepted). */ - LLVMValueRef num_prims_accepted = - ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->ac.i64, - &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE); - num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, ""); - - LLVMValueRef start; - - /* Execute atomic_add on the vertex count. */ - struct si_thread0_section section; - si_enter_thread0_section(ctx, §ion, thread_id); - { - if (VERTEX_COUNTER_GDS_MODE == 0) { - LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted, - LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter); - start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, - vertex_counter, num_indices, - LLVMAtomicOrderingMonotonic, false); - } else if (VERTEX_COUNTER_GDS_MODE == 1) { - LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted, - LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter, - LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""); - start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, - vertex_counter, num_indices, - LLVMAtomicOrderingMonotonic, false); - } else if (VERTEX_COUNTER_GDS_MODE == 2) { - LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - - /* If the draw call was split into multiple subdraws, each using - * a separate draw packet, we need to start counting from 0 for - * the first compute wave of the subdraw. - * - * vertex_counter contains the primitive ID of the first thread - * in the first wave. - * - * This is only correct with VERTEX_COUNTER_GDS_MODE == 2: - */ - LLVMValueRef is_first_wave = - LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, - vertex_counter, ""); - - /* Store the primitive count for ordered append, not vertex count. - * The idea is to avoid GDS initialization via CP DMA. The shader - * effectively stores the first count using "swap". - * - * if (first_wave) { - * ds.ordered.swap(num_prims_accepted); // store the first primitive count - * previous = 0; - * } else { - * previous = ds.ordered.add(num_prims_accepted) // add the primitive count - * } - */ - ac_build_ifcc(&ctx->ac, is_first_wave, 12604); - { - /* The GDS address is always 0 with ordered append. 
*/ - si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, - num_prims_accepted, 0, true, true); - LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store); - } - ac_build_else(&ctx->ac, 12605); - { - LLVMBuildStore(builder, - si_build_ds_ordered_op(ctx, "add", ordered_wave_id, - num_prims_accepted, 0, - true, true), - tmp_store); - } - ac_build_endif(&ctx->ac, 12604); - - start = LLVMBuildLoad(builder, tmp_store, ""); - } - } - si_exit_thread0_section(§ion, &start); - - /* Write the final vertex count to memory. An EOS/EOP event could do this, - * but those events are super slow and should be avoided if performance - * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE - * event like this. - */ - if (VERTEX_COUNTER_GDS_MODE == 2) { - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, - ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""), - 12606); - LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, ""); - count = LLVMBuildMul(builder, count, - LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - - /* GFX8 needs to disable caching, so that the CP can see the stored value. - * MTYPE=3 bypasses TC L2. - */ - if (ctx->screen->info.chip_class <= GFX8) { - LLVMValueRef desc[] = { - ac_get_arg(&ctx->ac, param_vertex_count_addr), - LLVMConstInt(ctx->ac.i32, - S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0), - LLVMConstInt(ctx->ac.i32, 4, 0), - LLVMConstInt(ctx->ac.i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | - S_008F0C_MTYPE(3 /* uncached */), 0), - }; - LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4); - ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0, - ctx->ac.i32_0, 0, ac_glc | ac_slc); - } else { - LLVMBuildStore(builder, count, - si_expand_32bit_pointer(ctx, - ac_get_arg(&ctx->ac, - param_vertex_count_addr))); - } - ac_build_endif(&ctx->ac, 12606); - } else { - /* For unordered modes that increment a vertex count instead of - * primitive count, convert it into the primitive index. - */ - start = LLVMBuildUDiv(builder, start, - LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - } - - /* Now we need to store the indices of accepted primitives into - * the output index buffer. - */ - ac_build_ifcc(&ctx->ac, accepted, 16607); - { - /* Get the number of bits set before the index of this thread. */ - LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); - - /* We have lowered instancing. Pack the instance ID into vertex ID. */ - if (key->opt.cs_instancing) { - instance_id = LLVMBuildShl(builder, instance_id, - LLVMConstInt(ctx->ac.i32, 16, 0), ""); - - for (unsigned i = 0; i < vertices_per_prim; i++) - index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); - } - - if (VERTEX_COUNTER_GDS_MODE == 2) { - /* vertex_counter contains the first primitive ID - * for this dispatch. If the draw call was split into - * multiple subdraws, the first primitive ID is > 0 - * for subsequent subdraws. Each subdraw uses a different - * portion of the output index buffer. Offset the store - * vindex by the first primitive ID to get the correct - * store address for the subdraw. - */ - start = LLVMBuildAdd(builder, start, vertex_counter, ""); - } - - /* Write indices for accepted primitives. 
*/ - LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); - LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); - - if (!ac_has_vec3_support(ctx->ac.chip_class, true)) - vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); - - ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, - vindex, ctx->ac.i32_0, 3, - ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); - } - ac_build_endif(&ctx->ac, 16607); - - LLVMBuildRetVoid(builder); + struct si_shader_key *key = &ctx->shader->key; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef vs = ctx->main_fn; + + /* Always inline the VS function. */ + ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); + LLVMSetLinkage(vs, LLVMPrivateLinkage); + + enum ac_arg_type const_desc_type; + if (ctx->shader->selector->info.const_buffers_declared == 1 && + ctx->shader->selector->info.shader_buffers_declared == 0) + const_desc_type = AC_ARG_CONST_FLOAT_PTR; + else + const_desc_type = AC_ARG_CONST_DESC_PTR; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + struct ac_arg param_index_buffers_and_constants, param_vertex_counter; + struct ac_arg param_vb_desc, param_const_desc; + struct ac_arg param_base_vertex, param_start_instance; + struct ac_arg param_block_id, param_local_id, param_ordered_wave_id; + struct ac_arg param_restart_index, param_smallprim_precision; + struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms; + struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr; + + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, + ¶m_index_buffers_and_constants); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_counter); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_last_wave_prim_id); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_count_addr); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, ¶m_vb_desc); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, ¶m_const_desc); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, ¶m_sampler_desc); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_instance); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_multiplier); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_terms); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_restart_index); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, ¶m_smallprim_precision); + + /* Block ID and thread ID inputs. */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_block_id); + if (VERTEX_COUNTER_GDS_MODE == 2) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_ordered_wave_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, ¶m_local_id); + + /* Create the compute shader function. */ + unsigned old_type = ctx->type; + ctx->type = PIPE_SHADER_COMPUTE; + si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE); + ctx->type = old_type; + + if (VERTEX_COUNTER_GDS_MODE == 2) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256); + } else if (VERTEX_COUNTER_GDS_MODE == 1) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", GDS_SIZE_UNORDERED); + } + + /* Assemble parameters for VS. 
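 *
 * Only VertexID and InstanceID vary per call; everything else is filled in
 * once here. The per-vertex call pattern used further below looks like:
 *
 *    vs_params[param_vertex_id] = index[i];
 *    vs_params[param_instance_id] = instance_id;
 *
 *    LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
 *    for (unsigned chan = 0; chan < 4; chan++)
 *       pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");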
*/ + LLVMValueRef vs_params[16]; + unsigned num_vs_params = 0; + unsigned param_vertex_id, param_instance_id; + + vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */ + vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc); + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc); + vs_params[num_vs_params++] = + LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex); + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance); + vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */ + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc); + + vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ + vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */ + vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */ + vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */ + + assert(num_vs_params <= ARRAY_SIZE(vs_params)); + assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); + + /* Load descriptors. (load 8 dwords at once) */ + LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; + + LLVMValueRef index_buffers_and_constants = + ac_get_arg(&ctx->ac, param_index_buffers_and_constants); + tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, + ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); + tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0); + + for (unsigned i = 0; i < 8; i++) + desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); + + input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); + output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); + + /* Compute PrimID and InstanceID. */ + LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id), + LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0), + ac_get_arg(&ctx->ac, param_local_id)); + LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */ + LLVMValueRef instance_id = ctx->ac.i32_0; + + if (key->opt.cs_instancing) { + LLVMValueRef num_prims_udiv_terms = ac_get_arg(&ctx->ac, param_num_prims_udiv_terms); + LLVMValueRef num_prims_udiv_multiplier = + ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier); + /* Unpack num_prims_udiv_terms. */ + LLVMValueRef post_shift = + LLVMBuildAnd(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 0x1f, 0), ""); + LLVMValueRef prims_per_instance = + LLVMBuildLShr(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 5, 0), ""); + /* Divide the total prim_id by the number of prims per instance. */ + instance_id = + ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, post_shift); + /* Compute the remainder. */ + prim_id = LLVMBuildSub(builder, prim_id, + LLVMBuildMul(builder, instance_id, prims_per_instance, ""), ""); + } + + /* Generate indices (like a non-indexed draw call). 
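 *
 * In scalar C, the index generation below amounts to the following sketch
 * (assuming <stdbool.h> and the gallium PIPE_PRIM_* enums; the triangle-fan
 * case mirrors the provoking-vertex handling in the switch):
 *
 *    static void generate_prim_indices(unsigned prim_type, unsigned prim_id,
 *                                      bool provoking_vertex_first, unsigned index[3])
 *    {
 *       switch (prim_type) {
 *       case PIPE_PRIM_TRIANGLES:
 *          for (unsigned i = 0; i < 3; i++)
 *             index[i] = prim_id * 3 + i;
 *          break;
 *       case PIPE_PRIM_TRIANGLE_STRIP:
 *          for (unsigned i = 0; i < 3; i++)
 *             index[i] = prim_id + i;   // odd prims get re-oriented later
 *          break;
 *       case PIPE_PRIM_TRIANGLE_FAN:
 *          index[0] = provoking_vertex_first ? prim_id + 1 : 0;
 *          index[1] = provoking_vertex_first ? prim_id + 2 : prim_id + 1;
 *          index[2] = provoking_vertex_first ? 0 : prim_id + 2;
 *          break;
 *       }
 *    }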
*/ + LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)}; + unsigned vertices_per_prim = 3; + + switch (key->opt.cs_prim_type) { + case PIPE_PRIM_TRIANGLES: + for (unsigned i = 0; i < 3; i++) { + index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0), + LLVMConstInt(ctx->ac.i32, i, 0)); + } + break; + case PIPE_PRIM_TRIANGLE_STRIP: + for (unsigned i = 0; i < 3; i++) { + index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), ""); + } + break; + case PIPE_PRIM_TRIANGLE_FAN: + /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper + * and rasterizer as a normal triangle, so we need to put the provoking + * vertex into the correct index variable and preserve orientation at the same time. + * gl_VertexID is preserved, because it's equal to the index. + */ + if (key->opt.cs_provoking_vertex_first) { + index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + index[2] = ctx->ac.i32_0; + } else { + index[0] = ctx->ac.i32_0; + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); + index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + } + break; + default: + unreachable("unexpected primitive type"); + } + + /* Fetch indices. */ + if (key->opt.cs_indexed) { + for (unsigned i = 0; i < 3; i++) { + index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0, + 1, 0, true); + index[i] = ac_to_integer(&ctx->ac, index[i]); + } + } + + LLVMValueRef ordered_wave_id = NULL; + + /* Extract the ordered wave ID. */ + if (VERTEX_COUNTER_GDS_MODE == 2) { + ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id); + ordered_wave_id = + LLVMBuildLShr(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 6, 0), ""); + ordered_wave_id = + LLVMBuildAnd(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 0xfff, 0), ""); + } + LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id), + LLVMConstInt(ctx->ac.i32, 63, 0), ""); + + /* Every other triangle in a strip has a reversed vertex order, so we + * need to swap vertices of odd primitives to get the correct primitive + * orientation when converting triangle strips to triangles. Primitive + * restart complicates it, because a strip can start anywhere. + */ + LLVMValueRef prim_restart_accepted = ctx->ac.i1true; + LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter); + + if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { + /* Without primitive restart, odd primitives have reversed orientation. + * Only primitive restart can flip it with respect to the first vertex + * of the draw call. + */ + LLVMValueRef first_is_odd = ctx->ac.i1false; + + /* Handle primitive restart. */ + if (key->opt.cs_primitive_restart) { + /* Get the GDS primitive restart continue flag and clear + * the flag in vertex_counter. This flag is used when the draw + * call was split and we need to load the primitive orientation + * flag from GDS for the first wave too. 
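 *
 * In other words, the counter SGPR packs two fields, unpacked right below:
 *
 *    bool     gds_restart_continue = vertex_counter >> 31;         // bit 31
 *    uint32_t counter_value        = vertex_counter & 0x7fffffff;  // bits 0..30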
+ */ + LLVMValueRef gds_prim_restart_continue = + LLVMBuildLShr(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 31, 0), ""); + gds_prim_restart_continue = + LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, ""); + vertex_counter = + LLVMBuildAnd(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), ""); + + LLVMValueRef index0_is_reset; + + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i], + ac_get_arg(&ctx->ac, param_restart_index), ""); + if (i == 0) + index0_is_reset = LLVMBuildNot(builder, not_reset, ""); + prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, not_reset, ""); + } + + /* If the previous waves flip the primitive orientation + * of the current triangle strip, it will be stored in GDS. + * + * Sometimes the correct orientation is not needed, in which case + * we don't need to execute this. + */ + if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) { + /* If there are reset indices in this wave, get the thread index + * where the most recent strip starts relative to each thread. + */ + LLVMValueRef preceding_threads_mask = + LLVMBuildSub(builder, + LLVMBuildShl(builder, ctx->ac.i64_1, + LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""), + ctx->ac.i64_1, ""); + + LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset); + LLVMValueRef preceding_reset_threadmask = + LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, ""); + LLVMValueRef strip_start = ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL); + strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, ""); + + /* This flips the orientatino based on reset indices within this wave only. */ + first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, ""); + + LLVMValueRef last_strip_start, prev_wave_state, ret, tmp; + LLVMValueRef is_first_wave, current_wave_resets_index; + + /* Get the thread index where the last strip starts in this wave. + * + * If the last strip doesn't start in this wave, the thread index + * will be 0. + * + * If the last strip starts in the next wave, the thread index will + * be 64. + */ + last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL); + last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, ""); + + struct si_thread0_section section; + si_enter_thread0_section(ctx, §ion, thread_id); + + /* This must be done in the thread 0 section, because + * we expect PrimID to be 0 for the whole first wave + * in this expression. + * + * NOTE: This will need to be different if we wanna support + * instancing with primitive restart. + */ + is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, ""); + is_first_wave = LLVMBuildAnd(builder, is_first_wave, + LLVMBuildNot(builder, gds_prim_restart_continue, ""), ""); + current_wave_resets_index = + LLVMBuildICmp(builder, LLVMIntNE, last_strip_start, ctx->ac.i32_0, ""); + + ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state"); + + /* Save the last strip start primitive index in GDS and read + * the value that previous waves stored. + * + * if (is_first_wave || current_wave_resets_strip) + * // Read the value that previous waves stored and store a new one. + * first_is_odd = ds.ordered.swap(last_strip_start); + * else + * // Just read the value that previous waves stored. 
+ * first_is_odd = ds.ordered.add(0); + */ + ac_build_ifcc( + &ctx->ac, LLVMBuildOr(builder, is_first_wave, current_wave_resets_index, ""), 12602); + { + /* The GDS address is always 0 with ordered append. */ + tmp = si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, last_strip_start, 1, true, + false); + LLVMBuildStore(builder, tmp, ret); + } + ac_build_else(&ctx->ac, 12603); + { + /* Just read the value from GDS. */ + tmp = si_build_ds_ordered_op(ctx, "add", ordered_wave_id, ctx->ac.i32_0, 1, true, + false); + LLVMBuildStore(builder, tmp, ret); + } + ac_build_endif(&ctx->ac, 12602); + + prev_wave_state = LLVMBuildLoad(builder, ret, ""); + /* Ignore the return value if this is the first wave. */ + prev_wave_state = + LLVMBuildSelect(builder, is_first_wave, ctx->ac.i32_0, prev_wave_state, ""); + si_exit_thread0_section(§ion, &prev_wave_state); + prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, ""); + + /* If the strip start appears to be on thread 0 for the current primitive + * (meaning the reset index is not present in this wave and might have + * appeared in previous waves), use the value from GDS to determine + * primitive orientation. + * + * If the strip start is in this wave for the current primitive, use + * the value from the current wave to determine primitive orientation. + */ + LLVMValueRef strip_start_is0 = + LLVMBuildICmp(builder, LLVMIntEQ, strip_start, ctx->ac.i32_0, ""); + first_is_odd = + LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, first_is_odd, ""); + } + } + /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */ + LLVMValueRef prim_is_odd = LLVMBuildXor( + builder, first_is_odd, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), ""); + + /* Convert triangle strip indices to triangle indices. */ + ac_build_triangle_strip_indices_to_triangle( + &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0), + index); + } + + /* Execute the vertex shader for each vertex to get vertex positions. */ + LLVMValueRef pos[3][4]; + for (unsigned i = 0; i < vertices_per_prim; i++) { + vs_params[param_vertex_id] = index[i]; + vs_params[param_instance_id] = instance_id; + + LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params); + for (unsigned chan = 0; chan < 4; chan++) + pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); + } + + /* Divide XYZ by W. */ + for (unsigned i = 0; i < vertices_per_prim; i++) { + for (unsigned chan = 0; chan < 3; chan++) + pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); + } + + /* Load the viewport state. */ + LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, + LLVMConstInt(ctx->ac.i32, 2, 0)); + vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); + LLVMValueRef vp_scale[2], vp_translate[2]; + vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); + vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); + vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); + vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); + + /* Do culling. 
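 *
 * For intuition, the facedness/zero-area part of ac_cull_triangle reduces to
 * a signed screen-space area test. This is an illustration only, not the
 * ac_llvm_cull implementation; x/y are post-viewport-transform positions and
 * the winding convention is assumed to be CCW = front:
 *
 *    float det = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0);
 *    bool zero_area = det == 0.0f;
 *    bool front_facing = det > 0.0f;
 *    bool culled = zero_area ||
 *                  ( front_facing && options.cull_front) ||
 *                  (!front_facing && options.cull_back);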
*/ + struct ac_cull_options options = {}; + options.cull_front = key->opt.cs_cull_front; + options.cull_back = key->opt.cs_cull_back; + options.cull_view_xy = true; + options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z; + options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z; + options.cull_small_prims = true; + options.cull_zero_area = true; + options.cull_w = true; + options.use_halfz_clip_space = key->opt.cs_halfz_clip_space; + + LLVMValueRef accepted = + ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate, + ac_get_arg(&ctx->ac, param_smallprim_precision), &options); + + ac_build_optimization_barrier(&ctx->ac, &accepted); + LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); + + /* Count the number of active threads by doing bitcount(accepted). */ + LLVMValueRef num_prims_accepted = ac_build_intrinsic( + &ctx->ac, "llvm.ctpop.i64", ctx->ac.i64, &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE); + num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, ""); + + LLVMValueRef start; + + /* Execute atomic_add on the vertex count. */ + struct si_thread0_section section; + si_enter_thread0_section(ctx, §ion, thread_id); + { + if (VERTEX_COUNTER_GDS_MODE == 0) { + LLVMValueRef num_indices = LLVMBuildMul( + builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); + vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter); + start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices, + LLVMAtomicOrderingMonotonic, false); + } else if (VERTEX_COUNTER_GDS_MODE == 1) { + LLVMValueRef num_indices = LLVMBuildMul( + builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); + vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter, + LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""); + start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices, + LLVMAtomicOrderingMonotonic, false); + } else if (VERTEX_COUNTER_GDS_MODE == 2) { + LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + /* If the draw call was split into multiple subdraws, each using + * a separate draw packet, we need to start counting from 0 for + * the first compute wave of the subdraw. + * + * vertex_counter contains the primitive ID of the first thread + * in the first wave. + * + * This is only correct with VERTEX_COUNTER_GDS_MODE == 2: + */ + LLVMValueRef is_first_wave = + LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, vertex_counter, ""); + + /* Store the primitive count for ordered append, not vertex count. + * The idea is to avoid GDS initialization via CP DMA. The shader + * effectively stores the first count using "swap". + * + * if (first_wave) { + * ds.ordered.swap(num_prims_accepted); // store the first primitive count + * previous = 0; + * } else { + * previous = ds.ordered.add(num_prims_accepted) // add the primitive count + * } + */ + ac_build_ifcc(&ctx->ac, is_first_wave, 12604); + { + /* The GDS address is always 0 with ordered append. 
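 *
 * The contract of the swap/add pair, emulated sequentially (one call per wave,
 * in wave launch order; this illustrates the result each wave gets back, not
 * how GDS ordered append works internally):
 *
 *    static uint32_t gds_counter;   // GDS dword at offset 0
 *
 *    static uint32_t ordered_count(bool first_wave, uint32_t num_prims_accepted)
 *    {
 *       uint32_t previous;
 *
 *       if (first_wave) {
 *          previous = 0;                            // like ds.ordered.swap
 *          gds_counter = num_prims_accepted;
 *       } else {
 *          previous = gds_counter;                  // like ds.ordered.add
 *          gds_counter += num_prims_accepted;
 *       }
 *       return previous;   // becomes "start" for this wave's index stores
 *    }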
*/ + si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, num_prims_accepted, 0, true, true); + LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store); + } + ac_build_else(&ctx->ac, 12605); + { + LLVMBuildStore(builder, + si_build_ds_ordered_op(ctx, "add", ordered_wave_id, num_prims_accepted, + 0, true, true), + tmp_store); + } + ac_build_endif(&ctx->ac, 12604); + + start = LLVMBuildLoad(builder, tmp_store, ""); + } + } + si_exit_thread0_section(§ion, &start); + + /* Write the final vertex count to memory. An EOS/EOP event could do this, + * but those events are super slow and should be avoided if performance + * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE + * event like this. + */ + if (VERTEX_COUNTER_GDS_MODE == 2) { + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, + ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""), + 12606); + LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, ""); + count = LLVMBuildMul(builder, count, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); + + /* GFX8 needs to disable caching, so that the CP can see the stored value. + * MTYPE=3 bypasses TC L2. + */ + if (ctx->screen->info.chip_class <= GFX8) { + LLVMValueRef desc[] = { + ac_get_arg(&ctx->ac, param_vertex_count_addr), + LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0), + LLVMConstInt(ctx->ac.i32, 4, 0), + LLVMConstInt( + ctx->ac.i32, + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | S_008F0C_MTYPE(3 /* uncached */), + 0), + }; + LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4); + ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0, ctx->ac.i32_0, 0, + ac_glc | ac_slc); + } else { + LLVMBuildStore( + builder, count, + si_expand_32bit_pointer(ctx, ac_get_arg(&ctx->ac, param_vertex_count_addr))); + } + ac_build_endif(&ctx->ac, 12606); + } else { + /* For unordered modes that increment a vertex count instead of + * primitive count, convert it into the primitive index. + */ + start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); + } + + /* Now we need to store the indices of accepted primitives into + * the output index buffer. + */ + ac_build_ifcc(&ctx->ac, accepted, 16607); + { + /* Get the number of bits set before the index of this thread. */ + LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); + + /* We have lowered instancing. Pack the instance ID into vertex ID. */ + if (key->opt.cs_instancing) { + instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), ""); + + for (unsigned i = 0; i < vertices_per_prim; i++) + index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); + } + + if (VERTEX_COUNTER_GDS_MODE == 2) { + /* vertex_counter contains the first primitive ID + * for this dispatch. If the draw call was split into + * multiple subdraws, the first primitive ID is > 0 + * for subsequent subdraws. Each subdraw uses a different + * portion of the output index buffer. Offset the store + * vindex by the first primitive ID to get the correct + * store address for the subdraw. + */ + start = LLVMBuildAdd(builder, start, vertex_counter, ""); + } + + /* Write indices for accepted primitives. 
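+ * Each accepted primitive stores its 3 vertex indices at element
+ * (start + prim_index) of the output index buffer.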
*/ + LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); + LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); + + if (!ac_has_vec3_support(ctx->ac.chip_class, true)) + vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); + + ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0, 3, + ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); + } + ac_build_endif(&ctx->ac, 16607); + + LLVMBuildRetVoid(builder); } /* Return false if the shader isn't ready. */ static bool si_shader_select_prim_discard_cs(struct si_context *sctx, - const struct pipe_draw_info *info, - bool primitive_restart) + const struct pipe_draw_info *info, + bool primitive_restart) { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_shader_key key; - - /* Primitive restart needs ordered counters. */ - assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2); - assert(!primitive_restart || info->instance_count == 1); - - memset(&key, 0, sizeof(key)); - si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog); - assert(!key.part.vs.prolog.instance_divisor_is_fetched); - - key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0; - key.opt.vs_as_prim_discard_cs = 1; - key.opt.cs_prim_type = info->mode; - key.opt.cs_indexed = info->index_size != 0; - key.opt.cs_instancing = info->instance_count > 1; - key.opt.cs_primitive_restart = primitive_restart; - key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; - - /* Primitive restart with triangle strips needs to preserve primitive - * orientation for cases where front and back primitive orientation matters. - */ - if (primitive_restart) { - struct si_shader_selector *ps = sctx->ps_shader.cso; - - key.opt.cs_need_correct_orientation = - rs->cull_front != rs->cull_back || - ps->info.uses_frontface || - (rs->two_side && ps->info.colors_read); - } - - if (rs->rasterizer_discard) { - /* Just for performance testing and analysis of trivial bottlenecks. - * This should result in a very short compute shader. */ - key.opt.cs_cull_front = 1; - key.opt.cs_cull_back = 1; - } else { - key.opt.cs_cull_front = - sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front; - key.opt.cs_cull_back = - sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back; - } - - if (!rs->depth_clamp_any && CULL_Z) { - key.opt.cs_cull_z = 1; - key.opt.cs_halfz_clip_space = rs->clip_halfz; - } - - sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso; - sctx->cs_prim_discard_state.current = NULL; - - if (!sctx->compiler.passes) - si_init_compiler(sctx->screen, &sctx->compiler); - - struct si_compiler_ctx_state compiler_state; - compiler_state.compiler = &sctx->compiler; - compiler_state.debug = sctx->debug; - compiler_state.is_debug_context = sctx->is_debug; - - return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, - &compiler_state, &key, -1, true) == 0 && - /* Disallow compute shaders using the scratch buffer. */ - sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader_key key; + + /* Primitive restart needs ordered counters. 
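+ * With restart, the orientation of strip primitives can depend on state
+ * carried over from previous waves, which is only available via GDS
+ * ordered append.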
*/ + assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2); + assert(!primitive_restart || info->instance_count == 1); + + memset(&key, 0, sizeof(key)); + si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog); + assert(!key.part.vs.prolog.instance_divisor_is_fetched); + + key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0; + key.opt.vs_as_prim_discard_cs = 1; + key.opt.cs_prim_type = info->mode; + key.opt.cs_indexed = info->index_size != 0; + key.opt.cs_instancing = info->instance_count > 1; + key.opt.cs_primitive_restart = primitive_restart; + key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; + + /* Primitive restart with triangle strips needs to preserve primitive + * orientation for cases where front and back primitive orientation matters. + */ + if (primitive_restart) { + struct si_shader_selector *ps = sctx->ps_shader.cso; + + key.opt.cs_need_correct_orientation = rs->cull_front != rs->cull_back || + ps->info.uses_frontface || + (rs->two_side && ps->info.colors_read); + } + + if (rs->rasterizer_discard) { + /* Just for performance testing and analysis of trivial bottlenecks. + * This should result in a very short compute shader. */ + key.opt.cs_cull_front = 1; + key.opt.cs_cull_back = 1; + } else { + key.opt.cs_cull_front = sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front; + key.opt.cs_cull_back = sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back; + } + + if (!rs->depth_clamp_any && CULL_Z) { + key.opt.cs_cull_z = 1; + key.opt.cs_halfz_clip_space = rs->clip_halfz; + } + + sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso; + sctx->cs_prim_discard_state.current = NULL; + + if (!sctx->compiler.passes) + si_init_compiler(sctx->screen, &sctx->compiler); + + struct si_compiler_ctx_state compiler_state; + compiler_state.compiler = &sctx->compiler; + compiler_state.debug = sctx->debug; + compiler_state.is_debug_context = sctx->is_debug; + + return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state, + &key, -1, true) == 0 && + /* Disallow compute shaders using the scratch buffer. */ + sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; } static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx) { - if (sctx->index_ring) - return true; - - if (!sctx->prim_discard_compute_cs) { - struct radeon_winsys *ws = sctx->ws; - unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : - VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0; - unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 
2 : 0; - - if (gds_size) { - sctx->gds = ws->buffer_create(ws, gds_size, 4, - RADEON_DOMAIN_GDS, 0); - if (!sctx->gds) - return false; - - ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, - RADEON_USAGE_READWRITE, 0, 0); - } - if (num_oa_counters) { - assert(gds_size); - sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, - 1, RADEON_DOMAIN_OA, 0); - if (!sctx->gds_oa) - return false; - - ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, - RADEON_USAGE_READWRITE, 0, 0); - } - - sctx->prim_discard_compute_cs = - ws->cs_add_parallel_compute_ib(sctx->gfx_cs, - num_oa_counters > 0); - if (!sctx->prim_discard_compute_cs) - return false; - } - - if (!sctx->index_ring) { - sctx->index_ring = - si_aligned_buffer_create(sctx->b.screen, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - sctx->index_ring_size_per_ib * 2, - sctx->screen->info.pte_fragment_size); - if (!sctx->index_ring) - return false; - } - return true; + if (sctx->index_ring) + return true; + + if (!sctx->prim_discard_compute_cs) { + struct radeon_winsys *ws = sctx->ws; + unsigned gds_size = + VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0; + unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0; + + if (gds_size) { + sctx->gds = ws->buffer_create(ws, gds_size, 4, RADEON_DOMAIN_GDS, 0); + if (!sctx->gds) + return false; + + ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0); + } + if (num_oa_counters) { + assert(gds_size); + sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, 1, RADEON_DOMAIN_OA, 0); + if (!sctx->gds_oa) + return false; + + ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0); + } + + sctx->prim_discard_compute_cs = + ws->cs_add_parallel_compute_ib(sctx->gfx_cs, num_oa_counters > 0); + if (!sctx->prim_discard_compute_cs) + return false; + } + + if (!sctx->index_ring) { + sctx->index_ring = si_aligned_buffer_create( + sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, + sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size); + if (!sctx->index_ring) + return false; + } + return true; } static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size) { - return sctx->index_ring_offset + - align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= - sctx->index_ring_size_per_ib; + return sctx->index_ring_offset + + align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= + sctx->index_ring_size_per_ib; } enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - bool primitive_restart) +si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, + bool primitive_restart) { - /* If the compute shader compilation isn't finished, this returns false. 
*/ - if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart)) - return SI_PRIM_DISCARD_DISABLED; - - if (!si_initialize_prim_discard_cmdbuf(sctx)) - return SI_PRIM_DISCARD_DISABLED; - - struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; - unsigned prim = info->mode; - unsigned count = info->count; - unsigned instance_count = info->instance_count; - unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count); - unsigned num_prims = num_prims_per_instance * instance_count; - unsigned out_indexbuf_size = num_prims * 12; - bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); - const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL; - - /* Split draws at the draw call level if the ring is full. This makes - * better use of the ring space. - */ - if (ring_full && - num_prims > split_prims_draw_level && - instance_count == 1 && /* TODO: support splitting instanced draws */ - (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | - (1 << PIPE_PRIM_TRIANGLE_STRIP))) { - /* Split draws. */ - struct pipe_draw_info split_draw = *info; - split_draw.primitive_restart = primitive_restart; - - unsigned base_start = split_draw.start; - - if (prim == PIPE_PRIM_TRIANGLES) { - unsigned vert_count_per_subdraw = split_prims_draw_level * 3; - assert(vert_count_per_subdraw < count); - - for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { - split_draw.start = base_start + start; - split_draw.count = MIN2(count - start, vert_count_per_subdraw); - - sctx->b.draw_vbo(&sctx->b, &split_draw); - } - } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { - /* No primitive pair can be split, because strips reverse orientation - * for odd primitives. */ - STATIC_ASSERT(split_prims_draw_level % 2 == 0); - - unsigned vert_count_per_subdraw = split_prims_draw_level; - - for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { - split_draw.start = base_start + start; - split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2); - - sctx->b.draw_vbo(&sctx->b, &split_draw); - - if (start == 0 && - primitive_restart && - sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation) - sctx->preserve_prim_restart_gds_at_flush = true; - } - sctx->preserve_prim_restart_gds_at_flush = false; - } else { - assert(0); - } - - return SI_PRIM_DISCARD_DRAW_SPLIT; - } - - /* Just quit if the draw call doesn't fit into the ring and can't be split. */ - if (out_indexbuf_size > sctx->index_ring_size_per_ib) { - if (SI_PRIM_DISCARD_DEBUG) - puts("PD failed: draw call too big, can't be split"); - return SI_PRIM_DISCARD_DISABLED; - } - - unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL); - unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ + - 24 * (num_subdraws - 1) + /* subdraws */ - 20; /* leave some space at the end */ - unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx); - - if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) - need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */ - else - need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ - - if (ring_full || - (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) || - !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { - /* If the current IB is empty but the size is too small, add a NOP - * packet to force a flush and get a bigger IB. 
- */ - if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && - gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - } - - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - } - - /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ - struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; - ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); - assert(compute_has_space); - assert(si_check_ring_space(sctx, out_indexbuf_size)); - return SI_PRIM_DISCARD_ENABLED; + /* If the compute shader compilation isn't finished, this returns false. */ + if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart)) + return SI_PRIM_DISCARD_DISABLED; + + if (!si_initialize_prim_discard_cmdbuf(sctx)) + return SI_PRIM_DISCARD_DISABLED; + + struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; + unsigned prim = info->mode; + unsigned count = info->count; + unsigned instance_count = info->instance_count; + unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count); + unsigned num_prims = num_prims_per_instance * instance_count; + unsigned out_indexbuf_size = num_prims * 12; + bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); + const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL; + + /* Split draws at the draw call level if the ring is full. This makes + * better use of the ring space. + */ + if (ring_full && num_prims > split_prims_draw_level && + instance_count == 1 && /* TODO: support splitting instanced draws */ + (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP))) { + /* Split draws. */ + struct pipe_draw_info split_draw = *info; + split_draw.primitive_restart = primitive_restart; + + unsigned base_start = split_draw.start; + + if (prim == PIPE_PRIM_TRIANGLES) { + unsigned vert_count_per_subdraw = split_prims_draw_level * 3; + assert(vert_count_per_subdraw < count); + + for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { + split_draw.start = base_start + start; + split_draw.count = MIN2(count - start, vert_count_per_subdraw); + + sctx->b.draw_vbo(&sctx->b, &split_draw); + } + } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { + /* No primitive pair can be split, because strips reverse orientation + * for odd primitives. */ + STATIC_ASSERT(split_prims_draw_level % 2 == 0); + + unsigned vert_count_per_subdraw = split_prims_draw_level; + + for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { + split_draw.start = base_start + start; + split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2); + + sctx->b.draw_vbo(&sctx->b, &split_draw); + + if (start == 0 && primitive_restart && + sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation) + sctx->preserve_prim_restart_gds_at_flush = true; + } + sctx->preserve_prim_restart_gds_at_flush = false; + } else { + assert(0); + } + + return SI_PRIM_DISCARD_DRAW_SPLIT; + } + + /* Just quit if the draw call doesn't fit into the ring and can't be split. 
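+ * (The output index buffer needs num_prims * 3 indices * 4 bytes.)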
*/ + if (out_indexbuf_size > sctx->index_ring_size_per_ib) { + if (SI_PRIM_DISCARD_DEBUG) + puts("PD failed: draw call too big, can't be split"); + return SI_PRIM_DISCARD_DISABLED; + } + + unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL); + unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ + + 24 * (num_subdraws - 1) + /* subdraws */ + 20; /* leave some space at the end */ + unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx); + + if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) + need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */ + else + need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ + + if (ring_full || + (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) || + !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { + /* If the current IB is empty but the size is too small, add a NOP + * packet to force a flush and get a bigger IB. + */ + if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && + gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { + radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(gfx_cs, 0); + } + + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + } + + /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); + assert(compute_has_space); + assert(si_check_ring_space(sctx, out_indexbuf_size)); + return SI_PRIM_DISCARD_ENABLED; } void si_compute_signal_gfx(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; - unsigned writeback_L2_flags = 0; - - /* The writeback L2 flags vary with each chip generation. */ - /* CI needs to flush vertex indices to memory. */ - if (sctx->chip_class <= GFX7) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA; - else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; - - if (!sctx->compute_num_prims_in_batch) - return; - - assert(sctx->compute_rewind_va); - - /* After the queued dispatches are done and vertex counts are written to - * the gfx IB, signal the gfx IB to continue. CP doesn't wait for - * the dispatches to finish, it only adds the CS_DONE event into the event - * queue. - */ - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : - EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, - NULL, - sctx->compute_rewind_va | - ((uint64_t)sctx->screen->info.address32_hi << 32), - REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ - SI_NOT_QUERY); - - sctx->compute_rewind_va = 0; - sctx->compute_num_prims_in_batch = 0; + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + unsigned writeback_L2_flags = 0; + + /* The writeback L2 flags vary with each chip generation. */ + /* CI needs to flush vertex indices to memory. 
*/ + if (sctx->chip_class <= GFX7) + writeback_L2_flags = EVENT_TC_WB_ACTION_ENA; + else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0) + writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; + + if (!sctx->compute_num_prims_in_batch) + return; + + assert(sctx->compute_rewind_va); + + /* After the queued dispatches are done and vertex counts are written to + * the gfx IB, signal the gfx IB to continue. CP doesn't wait for + * the dispatches to finish, it only adds the CS_DONE event into the event + * queue. + */ + si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, + sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, + writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE, + EOP_DATA_SEL_VALUE_32BIT, NULL, + sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32), + REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ + SI_NOT_QUERY); + + sctx->compute_rewind_va = 0; + sctx->compute_num_prims_in_batch = 0; } /* Dispatch a primitive discard compute shader. */ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned index_size, - unsigned base_vertex, - uint64_t input_indexbuf_va, - unsigned input_indexbuf_num_elements) + const struct pipe_draw_info *info, unsigned index_size, + unsigned base_vertex, uint64_t input_indexbuf_va, + unsigned input_indexbuf_num_elements) { - struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; - struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; - unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count); - if (!num_prims_per_instance) - return; - - unsigned num_prims = num_prims_per_instance * info->instance_count; - unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format; - - switch (info->mode) { - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - vertices_per_prim = 3; - output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; - gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT; - break; - default: - unreachable("unsupported primitive type"); - return; - } - - unsigned out_indexbuf_offset; - uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4; - bool first_dispatch = !sctx->prim_discard_compute_ib_initialized; - - /* Initialize the compute IB if it's empty. */ - if (!sctx->prim_discard_compute_ib_initialized) { - /* 1) State initialization. */ - sctx->compute_gds_offset = 0; - sctx->compute_ib_last_shader = NULL; - - if (sctx->last_ib_barrier_fence) { - assert(!sctx->last_ib_barrier_buf); - sctx->ws->cs_add_fence_dependency(gfx_cs, - sctx->last_ib_barrier_fence, - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); - } - - /* 2) IB initialization. */ - - /* This needs to be done at the beginning of IBs due to possible - * TTM buffer moves in the kernel. 
- */ - if (sctx->chip_class >= GFX10) { - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(cs, 0); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - radeon_emit(cs, /* GCR_CNTL */ - S_586_GLI_INV(V_586_GLI_ALL) | - S_586_GLK_INV(1) | S_586_GLV_INV(1) | - S_586_GL1_INV(1) | - S_586_GL2_INV(1) | S_586_GL2_WB(1) | - S_586_GLM_INV(1) | S_586_GLM_WB(1) | - S_586_SEQ(V_586_SEQ_FORWARD)); - } else { - si_emit_surface_sync(sctx, cs, - S_0085F0_TC_ACTION_ENA(1) | - S_0085F0_TCL1_ACTION_ENA(1) | - S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) | - S_0085F0_SH_ICACHE_ACTION_ENA(1) | - S_0085F0_SH_KCACHE_ACTION_ENA(1)); - } - - /* Restore the GDS prim restart counter if needed. */ - if (sctx->preserve_prim_restart_gds_at_flush) { - si_cp_copy_data(sctx, cs, - COPY_DATA_GDS, NULL, 4, - COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4); - } - - si_emit_initial_compute_regs(sctx, cs); - - radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(sctx->scratch_waves) | - S_00B860_WAVESIZE(0)); /* no scratch */ - - /* Only 1D grids are launched. */ - radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | - S_00B820_NUM_THREAD_PARTIAL(1)); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | - S_00B824_NUM_THREAD_PARTIAL(1)); - - radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); - radeon_emit(cs, 0); - radeon_emit(cs, 0); - - /* Disable ordered alloc for OA resources. */ - for (unsigned i = 0; i < 2; i++) { - radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3); - radeon_emit(cs, S_031074_INDEX(i)); - radeon_emit(cs, 0); - radeon_emit(cs, S_03107C_ENABLE(0)); - } - - if (sctx->last_ib_barrier_buf) { - assert(!sctx->last_ib_barrier_fence); - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, - RADEON_USAGE_READ, RADEON_PRIO_FENCE); - si_cp_wait_mem(sctx, cs, - sctx->last_ib_barrier_buf->gpu_address + - sctx->last_ib_barrier_buf_offset, 1, 1, - WAIT_REG_MEM_EQUAL); - } - - sctx->prim_discard_compute_ib_initialized = true; - } - - /* Allocate the output index buffer. */ - output_indexbuf_size = align(output_indexbuf_size, - sctx->screen->info.tcc_cache_line_size); - assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); - out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; - sctx->index_ring_offset += output_indexbuf_size; - - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, - RADEON_PRIO_SHADER_RW_BUFFER); - uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; - - /* Prepare index buffer descriptors. */ - struct si_resource *indexbuf_desc = NULL; - unsigned indexbuf_desc_offset; - unsigned desc_size = 12 * 4; - uint32_t *desc; - - u_upload_alloc(sctx->b.const_uploader, 0, desc_size, - si_optimal_tcc_alignment(sctx, desc_size), - &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc, - (void**)&desc); - radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); - - /* Input index buffer. */ - desc[0] = input_indexbuf_va; - desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | - S_008F04_STRIDE(index_size); - desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? 
index_size : 1); - - if (sctx->chip_class >= GFX10) { - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT : - index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT : - V_008F0C_IMG_FORMAT_32_UINT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 : - index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 : - V_008F0C_BUF_DATA_FORMAT_32); - } - - /* Output index buffer. */ - desc[4] = out_indexbuf_va; - desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | - S_008F04_STRIDE(vertices_per_prim * 4); - desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1); - - if (sctx->chip_class >= GFX10) { - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_FORMAT(gfx10_output_indexbuf_format) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(output_indexbuf_format); - } - - /* Viewport state. */ - struct si_small_prim_cull_info cull_info; - si_get_small_prim_cull_info(sctx, &cull_info); - - desc[8] = fui(cull_info.scale[0]); - desc[9] = fui(cull_info.scale[1]); - desc[10] = fui(cull_info.translate[0]); - desc[11] = fui(cull_info.translate[1]); - - /* Better subpixel precision increases the efficiency of small - * primitive culling. */ - unsigned num_samples = sctx->framebuffer.nr_samples; - unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; - float small_prim_cull_precision; - - if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) - small_prim_cull_precision = num_samples / 4096.0; - else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) - small_prim_cull_precision = num_samples / 1024.0; - else - small_prim_cull_precision = num_samples / 256.0; - - /* Set user data SGPRs. */ - /* This can't be greater than 14 if we want the fastest launch rate. */ - unsigned user_sgprs = 13; - - uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; - unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); - unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); - uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; - uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; - uint64_t vb_desc_va = sctx->vb_descriptors_buffer ? - sctx->vb_descriptors_buffer->gpu_address + - sctx->vb_descriptors_offset : 0; - unsigned gds_offset, gds_size; - struct si_fast_udiv_info32 num_prims_udiv = {}; - - if (info->instance_count > 1) - num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31); - - /* Limitations on how these two are packed in the user SGPR. 
*/ - assert(num_prims_udiv.post_shift < 32); - assert(num_prims_per_instance < 1 << 27); - - si_resource_reference(&indexbuf_desc, NULL); - - bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart; - - if (VERTEX_COUNTER_GDS_MODE == 1) { - gds_offset = sctx->compute_gds_offset; - gds_size = primitive_restart ? 8 : 4; - sctx->compute_gds_offset += gds_size; - - /* Reset the counters in GDS for the first dispatch using WRITE_DATA. - * The remainder of the GDS will be cleared after the dispatch packet - * in parallel with compute shaders. - */ - if (first_dispatch) { - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0)); - radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1)); - radeon_emit(cs, gds_offset); - radeon_emit(cs, 0); - radeon_emit(cs, 0); /* value to write */ - if (gds_size == 8) - radeon_emit(cs, 0); - } - } - - /* Set shader registers. */ - struct si_shader *shader = sctx->cs_prim_discard_state.current; - - if (shader != sctx->compute_ib_last_shader) { - radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_BINARY); - uint64_t shader_va = shader->bo->gpu_address; - - assert(shader->config.scratch_bytes_per_wave == 0); - assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); - - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, shader_va >> 8); - radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); - - radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B848_SGPRS(sctx->chip_class <= GFX9 ? - (shader->config.num_sgprs - 1) / 8 : 0) | - S_00B848_FLOAT_MODE(shader->config.float_mode) | - S_00B848_DX10_CLAMP(1) | - S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) | - S_00B848_WGP_MODE(sctx->chip_class >= GFX10)); - radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | - S_00B84C_USER_SGPR(user_sgprs) | - S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | - S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) | - S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | - S_00B84C_LDS_SIZE(shader->config.lds_size)); - - radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, - ac_get_compute_resource_limits(&sctx->screen->info, - WAVES_PER_TG, - MAX_WAVES_PER_SH, - THREADGROUPS_PER_CU)); - sctx->compute_ib_last_shader = shader; - } - - STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0); - - /* Big draw calls are split into smaller dispatches and draw packets. */ - for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) { - unsigned num_subdraw_prims; - - if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims) - num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL; - else - num_subdraw_prims = num_prims - start_prim; - - /* Small dispatches are executed back to back until a specific primitive - * count is reached. Then, a CS_DONE is inserted to signal the gfx IB - * to start drawing the batch. This batching adds latency to the gfx IB, - * but CS_DONE and REWIND are too slow. 
- */ - if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) - si_compute_signal_gfx(sctx); - - if (sctx->compute_num_prims_in_batch == 0) { - assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); - sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; - - if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) { - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - - si_cp_wait_mem(sctx, gfx_cs, - sctx->compute_rewind_va | - (uint64_t)sctx->screen->info.address32_hi << 32, - REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, - WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP); - - /* Use INDIRECT_BUFFER to chain to a different buffer - * to discard the CP prefetch cache. - */ - sctx->ws->cs_check_space(gfx_cs, 0, true); - } else { - radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); - radeon_emit(gfx_cs, 0); - } - } - - sctx->compute_num_prims_in_batch += num_subdraw_prims; - - uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; - uint64_t index_va = out_indexbuf_va + start_prim * 12; - - /* Emit the draw packet into the gfx IB. */ - radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); - radeon_emit(gfx_cs, num_prims * vertices_per_prim); - radeon_emit(gfx_cs, index_va); - radeon_emit(gfx_cs, index_va >> 32); - radeon_emit(gfx_cs, 0); - radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); - - /* Continue with the compute IB. */ - if (start_prim == 0) { - uint32_t gds_prim_restart_continue_bit = 0; - - if (sctx->preserve_prim_restart_gds_at_flush) { - assert(primitive_restart && - info->mode == PIPE_PRIM_TRIANGLE_STRIP); - assert(start_prim < 1 << 31); - gds_prim_restart_continue_bit = 1 << 31; - } - - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); - radeon_emit(cs, index_buffers_va); - radeon_emit(cs, - VERTEX_COUNTER_GDS_MODE == 0 ? count_va : - VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset : - start_prim | - gds_prim_restart_continue_bit); - radeon_emit(cs, start_prim + num_subdraw_prims - 1); - radeon_emit(cs, count_va); - radeon_emit(cs, vb_desc_va); - radeon_emit(cs, vs_const_desc_va); - radeon_emit(cs, vs_sampler_desc_va); - radeon_emit(cs, base_vertex); - radeon_emit(cs, info->start_instance); - radeon_emit(cs, num_prims_udiv.multiplier); - radeon_emit(cs, num_prims_udiv.post_shift | - (num_prims_per_instance << 5)); - radeon_emit(cs, info->restart_index); - /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ - radeon_emit(cs, fui(small_prim_cull_precision)); - } else { - assert(VERTEX_COUNTER_GDS_MODE == 2); - /* Only update the SGPRs that changed. */ - radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3); - radeon_emit(cs, start_prim); - radeon_emit(cs, start_prim + num_subdraw_prims - 1); - radeon_emit(cs, count_va); - } - - /* Set grid dimensions. 
*/ - unsigned start_block = start_prim / THREADGROUP_SIZE; - unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; - unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; - - radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); - radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, - S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | - S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); - - radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); - radeon_emit(cs, 1); - radeon_emit(cs, 1); - radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | - S_00B800_PARTIAL_TG_EN(!!partial_block_size) | - S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) | - S_00B800_ORDER_MODE(0 /* launch in order */)); - - /* This is only for unordered append. Ordered append writes this from - * the shader. - * - * Note that EOP and EOS events are super slow, so emulating the event - * in a shader is an important optimization. - */ - if (VERTEX_COUNTER_GDS_MODE == 1) { - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - EOP_INT_SEL_NONE, - EOP_DATA_SEL_GDS, - NULL, - count_va | ((uint64_t)sctx->screen->info.address32_hi << 32), - EOP_DATA_GDS(gds_offset / 4, 1), - SI_NOT_QUERY); - - /* Now that compute shaders are running, clear the remainder of GDS. */ - if (first_dispatch) { - unsigned offset = gds_offset + gds_size; - si_cp_dma_clear_buffer(sctx, cs, NULL, offset, - GDS_SIZE_UNORDERED - offset, - 0, - SI_CPDMA_SKIP_CHECK_CS_SPACE | - SI_CPDMA_SKIP_GFX_SYNC | - SI_CPDMA_SKIP_SYNC_BEFORE, - SI_COHERENCY_NONE, L2_BYPASS); - } - } - first_dispatch = false; - - assert(cs->current.cdw <= cs->current.max_dw); - assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); - } + struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count); + if (!num_prims_per_instance) + return; + + unsigned num_prims = num_prims_per_instance * info->instance_count; + unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format; + + switch (info->mode) { + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + vertices_per_prim = 3; + output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; + gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT; + break; + default: + unreachable("unsupported primitive type"); + return; + } + + unsigned out_indexbuf_offset; + uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4; + bool first_dispatch = !sctx->prim_discard_compute_ib_initialized; + + /* Initialize the compute IB if it's empty. */ + if (!sctx->prim_discard_compute_ib_initialized) { + /* 1) State initialization. */ + sctx->compute_gds_offset = 0; + sctx->compute_ib_last_shader = NULL; + + if (sctx->last_ib_barrier_fence) { + assert(!sctx->last_ib_barrier_buf); + sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence, + RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); + } + + /* 2) IB initialization. */ + + /* This needs to be done at the beginning of IBs due to possible + * TTM buffer moves in the kernel. 
+ */ + if (sctx->chip_class >= GFX10) { + radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); + radeon_emit(cs, 0); /* CP_COHER_CNTL */ + radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ + radeon_emit(cs, 0); /* CP_COHER_BASE */ + radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ + radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + radeon_emit(cs, /* GCR_CNTL */ + S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) | + S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | + S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD)); + } else { + si_emit_surface_sync(sctx, cs, + S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | + S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) | + S_0085F0_SH_ICACHE_ACTION_ENA(1) | + S_0085F0_SH_KCACHE_ACTION_ENA(1)); + } + + /* Restore the GDS prim restart counter if needed. */ + if (sctx->preserve_prim_restart_gds_at_flush) { + si_cp_copy_data(sctx, cs, COPY_DATA_GDS, NULL, 4, COPY_DATA_SRC_MEM, + sctx->wait_mem_scratch, 4); + } + + si_emit_initial_compute_regs(sctx, cs); + + radeon_set_sh_reg( + cs, R_00B860_COMPUTE_TMPRING_SIZE, + S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */ + + /* Only 1D grids are launched. */ + radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); + radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1)); + radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1)); + + radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + + /* Disable ordered alloc for OA resources. */ + for (unsigned i = 0; i < 2; i++) { + radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3); + radeon_emit(cs, S_031074_INDEX(i)); + radeon_emit(cs, 0); + radeon_emit(cs, S_03107C_ENABLE(0)); + } + + if (sctx->last_ib_barrier_buf) { + assert(!sctx->last_ib_barrier_fence); + radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ, + RADEON_PRIO_FENCE); + si_cp_wait_mem(sctx, cs, + sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset, + 1, 1, WAIT_REG_MEM_EQUAL); + } + + sctx->prim_discard_compute_ib_initialized = true; + } + + /* Allocate the output index buffer. */ + output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size); + assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); + out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; + sctx->index_ring_offset += output_indexbuf_size; + + radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, + RADEON_PRIO_SHADER_RW_BUFFER); + uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; + + /* Prepare index buffer descriptors. */ + struct si_resource *indexbuf_desc = NULL; + unsigned indexbuf_desc_offset; + unsigned desc_size = 12 * 4; + uint32_t *desc; + + u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size), + &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc); + radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); + + /* Input index buffer. */ + desc[0] = input_indexbuf_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size); + desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? 
index_size : 1); + + if (sctx->chip_class >= GFX10) { + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT + : index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT + : V_008F0C_IMG_FORMAT_32_UINT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[3] = + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 + : index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 + : V_008F0C_BUF_DATA_FORMAT_32); + } + + /* Output index buffer. */ + desc[4] = out_indexbuf_va; + desc[5] = + S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4); + desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1); + + if (sctx->chip_class >= GFX10) { + desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | + S_008F0C_FORMAT(gfx10_output_indexbuf_format) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(output_indexbuf_format); + } + + /* Viewport state. */ + struct si_small_prim_cull_info cull_info; + si_get_small_prim_cull_info(sctx, &cull_info); + + desc[8] = fui(cull_info.scale[0]); + desc[9] = fui(cull_info.scale[1]); + desc[10] = fui(cull_info.translate[0]); + desc[11] = fui(cull_info.translate[1]); + + /* Better subpixel precision increases the efficiency of small + * primitive culling. */ + unsigned num_samples = sctx->framebuffer.nr_samples; + unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; + float small_prim_cull_precision; + + if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) + small_prim_cull_precision = num_samples / 4096.0; + else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) + small_prim_cull_precision = num_samples / 1024.0; + else + small_prim_cull_precision = num_samples / 256.0; + + /* Set user data SGPRs. */ + /* This can't be greater than 14 if we want the fastest launch rate. */ + unsigned user_sgprs = 13; + + uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; + unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); + unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); + uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; + uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; + uint64_t vb_desc_va = sctx->vb_descriptors_buffer + ? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset + : 0; + unsigned gds_offset, gds_size; + struct si_fast_udiv_info32 num_prims_udiv = {}; + + if (info->instance_count > 1) + num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31); + + /* Limitations on how these two are packed in the user SGPR. 
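+ * post_shift goes into bits [4:0] and num_prims_per_instance into bits
+ * [31:5] of a single SGPR, hence the asserts below.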
*/ + assert(num_prims_udiv.post_shift < 32); + assert(num_prims_per_instance < 1 << 27); + + si_resource_reference(&indexbuf_desc, NULL); + + bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart; + + if (VERTEX_COUNTER_GDS_MODE == 1) { + gds_offset = sctx->compute_gds_offset; + gds_size = primitive_restart ? 8 : 4; + sctx->compute_gds_offset += gds_size; + + /* Reset the counters in GDS for the first dispatch using WRITE_DATA. + * The remainder of the GDS will be cleared after the dispatch packet + * in parallel with compute shaders. + */ + if (first_dispatch) { + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size / 4, 0)); + radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1)); + radeon_emit(cs, gds_offset); + radeon_emit(cs, 0); + radeon_emit(cs, 0); /* value to write */ + if (gds_size == 8) + radeon_emit(cs, 0); + } + } + + /* Set shader registers. */ + struct si_shader *shader = sctx->cs_prim_discard_state.current; + + if (shader != sctx->compute_ib_last_shader) { + radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_BINARY); + uint64_t shader_va = shader->bo->gpu_address; + + assert(shader->config.scratch_bytes_per_wave == 0); + assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); + + radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); + radeon_emit(cs, shader_va >> 8); + radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); + + radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); + radeon_emit( + cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) | + S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) | + S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) | + S_00B848_WGP_MODE(sctx->chip_class >= GFX10)); + radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) | + S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | + S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) | + S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | + S_00B84C_LDS_SIZE(shader->config.lds_size)); + + radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, + ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG, + MAX_WAVES_PER_SH, THREADGROUPS_PER_CU)); + sctx->compute_ib_last_shader = shader; + } + + STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0); + + /* Big draw calls are split into smaller dispatches and draw packets. */ + for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) { + unsigned num_subdraw_prims; + + if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims) + num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL; + else + num_subdraw_prims = num_prims - start_prim; + + /* Small dispatches are executed back to back until a specific primitive + * count is reached. Then, a CS_DONE is inserted to signal the gfx IB + * to start drawing the batch. This batching adds latency to the gfx IB, + * but CS_DONE and REWIND are too slow. 
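+ * si_compute_signal_gfx flushes the batch once it would exceed
+ * PRIMS_PER_BATCH primitives.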
+ */ + if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) + si_compute_signal_gfx(sctx); + + if (sctx->compute_num_prims_in_batch == 0) { + assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); + sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; + + if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) { + radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(gfx_cs, 0); + + si_cp_wait_mem( + sctx, gfx_cs, + sctx->compute_rewind_va | (uint64_t)sctx->screen->info.address32_hi << 32, + REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP); + + /* Use INDIRECT_BUFFER to chain to a different buffer + * to discard the CP prefetch cache. + */ + sctx->ws->cs_check_space(gfx_cs, 0, true); + } else { + radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); + radeon_emit(gfx_cs, 0); + } + } + + sctx->compute_num_prims_in_batch += num_subdraw_prims; + + uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; + uint64_t index_va = out_indexbuf_va + start_prim * 12; + + /* Emit the draw packet into the gfx IB. */ + radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); + radeon_emit(gfx_cs, num_prims * vertices_per_prim); + radeon_emit(gfx_cs, index_va); + radeon_emit(gfx_cs, index_va >> 32); + radeon_emit(gfx_cs, 0); + radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); + + /* Continue with the compute IB. */ + if (start_prim == 0) { + uint32_t gds_prim_restart_continue_bit = 0; + + if (sctx->preserve_prim_restart_gds_at_flush) { + assert(primitive_restart && info->mode == PIPE_PRIM_TRIANGLE_STRIP); + assert(start_prim < 1 << 31); + gds_prim_restart_continue_bit = 1 << 31; + } + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); + radeon_emit(cs, index_buffers_va); + radeon_emit(cs, VERTEX_COUNTER_GDS_MODE == 0 + ? count_va + : VERTEX_COUNTER_GDS_MODE == 1 + ? gds_offset + : start_prim | gds_prim_restart_continue_bit); + radeon_emit(cs, start_prim + num_subdraw_prims - 1); + radeon_emit(cs, count_va); + radeon_emit(cs, vb_desc_va); + radeon_emit(cs, vs_const_desc_va); + radeon_emit(cs, vs_sampler_desc_va); + radeon_emit(cs, base_vertex); + radeon_emit(cs, info->start_instance); + radeon_emit(cs, num_prims_udiv.multiplier); + radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5)); + radeon_emit(cs, info->restart_index); + /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ + radeon_emit(cs, fui(small_prim_cull_precision)); + } else { + assert(VERTEX_COUNTER_GDS_MODE == 2); + /* Only update the SGPRs that changed. */ + radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3); + radeon_emit(cs, start_prim); + radeon_emit(cs, start_prim + num_subdraw_prims - 1); + radeon_emit(cs, count_va); + } + + /* Set grid dimensions. 
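+ * One thread processes one primitive: full blocks of THREADGROUP_SIZE
+ * threads plus an optional partial block for the remainder.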
*/ + unsigned start_block = start_prim / THREADGROUP_SIZE; + unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; + unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; + + radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); + radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, + S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | + S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); + + radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); + radeon_emit(cs, 1); + radeon_emit(cs, 1); + radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) | + S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) | + S_00B800_ORDER_MODE(0 /* launch in order */)); + + /* This is only for unordered append. Ordered append writes this from + * the shader. + * + * Note that EOP and EOS events are super slow, so emulating the event + * in a shader is an important optimization. + */ + if (VERTEX_COUNTER_GDS_MODE == 1) { + si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0, + sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, + EOP_INT_SEL_NONE, EOP_DATA_SEL_GDS, NULL, + count_va | ((uint64_t)sctx->screen->info.address32_hi << 32), + EOP_DATA_GDS(gds_offset / 4, 1), SI_NOT_QUERY); + + /* Now that compute shaders are running, clear the remainder of GDS. */ + if (first_dispatch) { + unsigned offset = gds_offset + gds_size; + si_cp_dma_clear_buffer( + sctx, cs, NULL, offset, GDS_SIZE_UNORDERED - offset, 0, + SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_GFX_SYNC | SI_CPDMA_SKIP_SYNC_BEFORE, + SI_COHERENCY_NONE, L2_BYPASS); + } + } + first_dispatch = false; + + assert(cs->current.cdw <= cs->current.max_dw); + assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); + } } diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 2ef41e44ded..391c4f8d50b 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -27,232 +27,221 @@ /* Set this if you want the ME to wait until CP DMA is done. * It should be set on the last CP DMA packet. */ -#define CP_DMA_SYNC (1 << 0) +#define CP_DMA_SYNC (1 << 0) /* Set this if the source data was used as a destination in a previous CP DMA * packet. It's for preventing a read-after-write (RAW) hazard between two * CP DMA packets. */ -#define CP_DMA_RAW_WAIT (1 << 1) -#define CP_DMA_DST_IS_GDS (1 << 2) -#define CP_DMA_CLEAR (1 << 3) -#define CP_DMA_PFP_SYNC_ME (1 << 4) -#define CP_DMA_SRC_IS_GDS (1 << 5) +#define CP_DMA_RAW_WAIT (1 << 1) +#define CP_DMA_DST_IS_GDS (1 << 2) +#define CP_DMA_CLEAR (1 << 3) +#define CP_DMA_PFP_SYNC_ME (1 << 4) +#define CP_DMA_SRC_IS_GDS (1 << 5) /* The max number of bytes that can be copied per packet. */ static inline unsigned cp_dma_max_byte_count(struct si_context *sctx) { - unsigned max = sctx->chip_class >= GFX9 ? - S_414_BYTE_COUNT_GFX9(~0u) : - S_414_BYTE_COUNT_GFX6(~0u); + unsigned max = + sctx->chip_class >= GFX9 ? S_414_BYTE_COUNT_GFX9(~0u) : S_414_BYTE_COUNT_GFX6(~0u); - /* make it aligned for optimal performance */ - return max & ~(SI_CPDMA_ALIGNMENT - 1); + /* make it aligned for optimal performance */ + return max & ~(SI_CPDMA_ALIGNMENT - 1); } - /* Emit a CP DMA packet to do a copy from one buffer to another, or to clear * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, src_va is a 32-bit * clear value. 
*/ -static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, - uint64_t dst_va, uint64_t src_va, unsigned size, - unsigned flags, enum si_cache_policy cache_policy) +static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, uint64_t dst_va, + uint64_t src_va, unsigned size, unsigned flags, + enum si_cache_policy cache_policy) { - uint32_t header = 0, command = 0; - - assert(size <= cp_dma_max_byte_count(sctx)); - assert(sctx->chip_class != GFX6 || cache_policy == L2_BYPASS); - - if (sctx->chip_class >= GFX9) - command |= S_414_BYTE_COUNT_GFX9(size); - else - command |= S_414_BYTE_COUNT_GFX6(size); - - /* Sync flags. */ - if (flags & CP_DMA_SYNC) - header |= S_411_CP_SYNC(1); - else { - if (sctx->chip_class >= GFX9) - command |= S_414_DISABLE_WR_CONFIRM_GFX9(1); - else - command |= S_414_DISABLE_WR_CONFIRM_GFX6(1); - } - - if (flags & CP_DMA_RAW_WAIT) - command |= S_414_RAW_WAIT(1); - - /* Src and dst flags. */ - if (sctx->chip_class >= GFX9 && !(flags & CP_DMA_CLEAR) && - src_va == dst_va) { - header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */ - } else if (flags & CP_DMA_DST_IS_GDS) { - header |= S_411_DST_SEL(V_411_GDS); - /* GDS increments the address, not CP. */ - command |= S_414_DAS(V_414_REGISTER) | - S_414_DAIC(V_414_NO_INCREMENT); - } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) { - header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2) | - S_500_DST_CACHE_POLICY(cache_policy == L2_STREAM); - } - - if (flags & CP_DMA_CLEAR) { - header |= S_411_SRC_SEL(V_411_DATA); - } else if (flags & CP_DMA_SRC_IS_GDS) { - header |= S_411_SRC_SEL(V_411_GDS); - /* Both of these are required for GDS. It does increment the address. */ - command |= S_414_SAS(V_414_REGISTER) | - S_414_SAIC(V_414_NO_INCREMENT); - } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) { - header |= S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | - S_500_SRC_CACHE_POLICY(cache_policy == L2_STREAM); - } - - if (sctx->chip_class >= GFX7) { - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, header); - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ - radeon_emit(cs, command); - } else { - header |= S_411_SRC_ADDR_HI(src_va >> 32); - - radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, header); /* SRC_ADDR_HI [15:0] + flags. */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, command); - } - - /* CP DMA is executed in ME, but index buffers are read by PFP. - * This ensures that ME (CP DMA) is idle before PFP starts fetching - * indices. If we wanted to execute CP DMA in PFP, this packet - * should precede it. - */ - if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) { - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); - } + uint32_t header = 0, command = 0; + + assert(size <= cp_dma_max_byte_count(sctx)); + assert(sctx->chip_class != GFX6 || cache_policy == L2_BYPASS); + + if (sctx->chip_class >= GFX9) + command |= S_414_BYTE_COUNT_GFX9(size); + else + command |= S_414_BYTE_COUNT_GFX6(size); + + /* Sync flags. 
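+ * CP_DMA_SYNC makes the ME wait for this DMA to complete; without it the
+ * write confirmation is disabled.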
*/ + if (flags & CP_DMA_SYNC) + header |= S_411_CP_SYNC(1); + else { + if (sctx->chip_class >= GFX9) + command |= S_414_DISABLE_WR_CONFIRM_GFX9(1); + else + command |= S_414_DISABLE_WR_CONFIRM_GFX6(1); + } + + if (flags & CP_DMA_RAW_WAIT) + command |= S_414_RAW_WAIT(1); + + /* Src and dst flags. */ + if (sctx->chip_class >= GFX9 && !(flags & CP_DMA_CLEAR) && src_va == dst_va) { + header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */ + } else if (flags & CP_DMA_DST_IS_GDS) { + header |= S_411_DST_SEL(V_411_GDS); + /* GDS increments the address, not CP. */ + command |= S_414_DAS(V_414_REGISTER) | S_414_DAIC(V_414_NO_INCREMENT); + } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) { + header |= + S_411_DST_SEL(V_411_DST_ADDR_TC_L2) | S_500_DST_CACHE_POLICY(cache_policy == L2_STREAM); + } + + if (flags & CP_DMA_CLEAR) { + header |= S_411_SRC_SEL(V_411_DATA); + } else if (flags & CP_DMA_SRC_IS_GDS) { + header |= S_411_SRC_SEL(V_411_GDS); + /* Both of these are required for GDS. It does increment the address. */ + command |= S_414_SAS(V_414_REGISTER) | S_414_SAIC(V_414_NO_INCREMENT); + } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) { + header |= + S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | S_500_SRC_CACHE_POLICY(cache_policy == L2_STREAM); + } + + if (sctx->chip_class >= GFX7) { + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, header); + radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ + radeon_emit(cs, command); + } else { + header |= S_411_SRC_ADDR_HI(src_va >> 32); + + radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); + radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(cs, header); /* SRC_ADDR_HI [15:0] + flags. */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ + radeon_emit(cs, command); + } + + /* CP DMA is executed in ME, but index buffers are read by PFP. + * This ensures that ME (CP DMA) is idle before PFP starts fetching + * indices. If we wanted to execute CP DMA in PFP, this packet + * should precede it. + */ + if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) { + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + } } void si_cp_dma_wait_for_idle(struct si_context *sctx) { - /* Issue a dummy DMA that copies zero bytes. - * - * The DMA engine will see that there's no work to do and skip this - * DMA request, however, the CP will see the sync flag and still wait - * for all DMAs to complete. - */ - si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS); + /* Issue a dummy DMA that copies zero bytes. + * + * The DMA engine will see that there's no work to do and skip this + * DMA request, however, the CP will see the sync flag and still wait + * for all DMAs to complete. + */ + si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS); } static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst, - struct pipe_resource *src, unsigned byte_count, - uint64_t remaining_size, unsigned user_flags, - enum si_coherency coher, bool *is_first, - unsigned *packet_flags) + struct pipe_resource *src, unsigned byte_count, + uint64_t remaining_size, unsigned user_flags, enum si_coherency coher, + bool *is_first, unsigned *packet_flags) { - /* Fast exit for a CPDMA prefetch. 
*/ - if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) { - *is_first = false; - return; - } - - if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) { - /* Count memory usage in so that need_cs_space can take it into account. */ - if (dst) - si_context_add_resource_size(sctx, dst); - if (src) - si_context_add_resource_size(sctx, src); - } - - if (!(user_flags & SI_CPDMA_SKIP_CHECK_CS_SPACE)) - si_need_gfx_cs_space(sctx); - - /* This must be done after need_cs_space. */ - if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) { - if (dst) - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(dst), - RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); - if (src) - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(src), - RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); - } - - /* Flush the caches for the first copy only. - * Also wait for the previous CP DMA operations. - */ - if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC) && sctx->flags) - sctx->emit_cache_flush(sctx); - - if (!(user_flags & SI_CPDMA_SKIP_SYNC_BEFORE) && *is_first && - !(*packet_flags & CP_DMA_CLEAR)) - *packet_flags |= CP_DMA_RAW_WAIT; - - *is_first = false; - - /* Do the synchronization after the last dma, so that all data - * is written to memory. - */ - if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) && - byte_count == remaining_size) { - *packet_flags |= CP_DMA_SYNC; - - if (coher == SI_COHERENCY_SHADER) - *packet_flags |= CP_DMA_PFP_SYNC_ME; - } + /* Fast exit for a CPDMA prefetch. */ + if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) { + *is_first = false; + return; + } + + if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) { + /* Count memory usage in so that need_cs_space can take it into account. */ + if (dst) + si_context_add_resource_size(sctx, dst); + if (src) + si_context_add_resource_size(sctx, src); + } + + if (!(user_flags & SI_CPDMA_SKIP_CHECK_CS_SPACE)) + si_need_gfx_cs_space(sctx); + + /* This must be done after need_cs_space. */ + if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) { + if (dst) + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(dst), RADEON_USAGE_WRITE, + RADEON_PRIO_CP_DMA); + if (src) + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(src), RADEON_USAGE_READ, + RADEON_PRIO_CP_DMA); + } + + /* Flush the caches for the first copy only. + * Also wait for the previous CP DMA operations. + */ + if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC) && sctx->flags) + sctx->emit_cache_flush(sctx); + + if (!(user_flags & SI_CPDMA_SKIP_SYNC_BEFORE) && *is_first && !(*packet_flags & CP_DMA_CLEAR)) + *packet_flags |= CP_DMA_RAW_WAIT; + + *is_first = false; + + /* Do the synchronization after the last dma, so that all data + * is written to memory. + */ + if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) && byte_count == remaining_size) { + *packet_flags |= CP_DMA_SYNC; + + if (coher == SI_COHERENCY_SHADER) + *packet_flags |= CP_DMA_PFP_SYNC_ME; + } } void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, - struct pipe_resource *dst, uint64_t offset, - uint64_t size, unsigned value, unsigned user_flags, - enum si_coherency coher, enum si_cache_policy cache_policy) + struct pipe_resource *dst, uint64_t offset, uint64_t size, + unsigned value, unsigned user_flags, enum si_coherency coher, + enum si_cache_policy cache_policy) { - struct si_resource *sdst = si_resource(dst); - uint64_t va = (sdst ? 
sdst->gpu_address : 0) + offset; - bool is_first = true; - - assert(size && size % 4 == 0); - - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - if (sdst) - util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size); - - /* Flush the caches. */ - if (sdst && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) { - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, coher, cache_policy); - } - - while (size) { - unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx)); - unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS); - - si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, user_flags, - coher, &is_first, &dma_flags); - - /* Emit the clear packet. */ - si_emit_cp_dma(sctx, cs, va, value, byte_count, dma_flags, cache_policy); - - size -= byte_count; - va += byte_count; - } - - if (sdst && cache_policy != L2_BYPASS) - sdst->TC_L2_dirty = true; - - /* If it's not a framebuffer fast clear... */ - if (coher == SI_COHERENCY_SHADER) { - sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } + struct si_resource *sdst = si_resource(dst); + uint64_t va = (sdst ? sdst->gpu_address : 0) + offset; + bool is_first = true; + + assert(size && size % 4 == 0); + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + if (sdst) + util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size); + + /* Flush the caches. */ + if (sdst && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) { + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | + si_get_flush_flags(sctx, coher, cache_policy); + } + + while (size) { + unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx)); + unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS); + + si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, user_flags, coher, &is_first, + &dma_flags); + + /* Emit the clear packet. */ + si_emit_cp_dma(sctx, cs, va, value, byte_count, dma_flags, cache_policy); + + size -= byte_count; + va += byte_count; + } + + if (sdst && cache_policy != L2_BYPASS) + sdst->TC_L2_dirty = true; + + /* If it's not a framebuffer fast clear... */ + if (coher == SI_COHERENCY_SHADER) { + sctx->num_cp_dma_calls++; + si_prim_discard_signal_next_compute_ib_start(sctx); + } } /** @@ -261,41 +250,34 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, * * \param size Remaining size to the CP DMA alignment. */ -static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, - unsigned user_flags, enum si_coherency coher, - enum si_cache_policy cache_policy, - bool *is_first) +static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, unsigned user_flags, + enum si_coherency coher, enum si_cache_policy cache_policy, + bool *is_first) { - uint64_t va; - unsigned dma_flags = 0; - unsigned scratch_size = SI_CPDMA_ALIGNMENT * 2; - - assert(size < SI_CPDMA_ALIGNMENT); - - /* Use the scratch buffer as the dummy buffer. The 3D engine should be - * idle at this point. 
- */ - if (!sctx->scratch_buffer || - sctx->scratch_buffer->b.b.width0 < scratch_size) { - si_resource_reference(&sctx->scratch_buffer, NULL); - sctx->scratch_buffer = - si_aligned_buffer_create(&sctx->screen->b, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - scratch_size, 256); - if (!sctx->scratch_buffer) - return; - - si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); - } - - si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b, - &sctx->scratch_buffer->b.b, size, size, user_flags, - coher, is_first, &dma_flags); - - va = sctx->scratch_buffer->gpu_address; - si_emit_cp_dma(sctx, sctx->gfx_cs, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags, - cache_policy); + uint64_t va; + unsigned dma_flags = 0; + unsigned scratch_size = SI_CPDMA_ALIGNMENT * 2; + + assert(size < SI_CPDMA_ALIGNMENT); + + /* Use the scratch buffer as the dummy buffer. The 3D engine should be + * idle at this point. + */ + if (!sctx->scratch_buffer || sctx->scratch_buffer->b.b.width0 < scratch_size) { + si_resource_reference(&sctx->scratch_buffer, NULL); + sctx->scratch_buffer = si_aligned_buffer_create(&sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE, + PIPE_USAGE_DEFAULT, scratch_size, 256); + if (!sctx->scratch_buffer) + return; + + si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); + } + + si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b, &sctx->scratch_buffer->b.b, size, size, + user_flags, coher, is_first, &dma_flags); + + va = sctx->scratch_buffer->gpu_address; + si_emit_cp_dma(sctx, sctx->gfx_cs, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags, cache_policy); } /** @@ -304,141 +286,131 @@ static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, * * \param user_flags bitmask of SI_CPDMA_* */ -void si_cp_dma_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size, - unsigned user_flags, enum si_coherency coher, - enum si_cache_policy cache_policy) +void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, + struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, + unsigned size, unsigned user_flags, enum si_coherency coher, + enum si_cache_policy cache_policy) { - uint64_t main_dst_offset, main_src_offset; - unsigned skipped_size = 0; - unsigned realign_size = 0; - unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) | - (src ? 0 : CP_DMA_SRC_IS_GDS); - bool is_first = true; - - assert(size); - - if (dst) { - /* Skip this for the L2 prefetch. */ - if (dst != src || dst_offset != src_offset) { - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, - dst_offset + size); - } - - dst_offset += si_resource(dst)->gpu_address; - } - if (src) - src_offset += si_resource(src)->gpu_address; - - /* The workarounds aren't needed on Fiji and beyond. */ - if (sctx->family <= CHIP_CARRIZO || - sctx->family == CHIP_STONEY) { - /* If the size is not aligned, we must add a dummy copy at the end - * just to align the internal counter. Otherwise, the DMA engine - * would slow down by an order of magnitude for following copies. - */ - if (size % SI_CPDMA_ALIGNMENT) - realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT); - - /* If the copy begins unaligned, we must start copying from the next - * aligned block and the skipped part should be copied after everything - * else has been copied. 
Only the src alignment matters, not dst. - * - * GDS doesn't need the source address to be aligned. - */ - if (src && src_offset % SI_CPDMA_ALIGNMENT) { - skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT); - /* The main part will be skipped if the size is too small. */ - skipped_size = MIN2(skipped_size, size); - size -= skipped_size; - } - } - - /* Flush the caches. */ - if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) { - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, coher, cache_policy); - } - - /* This is the main part doing the copying. Src is always aligned. */ - main_dst_offset = dst_offset + skipped_size; - main_src_offset = src_offset + skipped_size; - - while (size) { - unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx)); - unsigned dma_flags = gds_flags; - - si_cp_dma_prepare(sctx, dst, src, byte_count, - size + skipped_size + realign_size, - user_flags, coher, &is_first, &dma_flags); - - si_emit_cp_dma(sctx, sctx->gfx_cs, main_dst_offset, main_src_offset, - byte_count, dma_flags, cache_policy); - - size -= byte_count; - main_src_offset += byte_count; - main_dst_offset += byte_count; - } - - /* Copy the part we skipped because src wasn't aligned. */ - if (skipped_size) { - unsigned dma_flags = gds_flags; - - si_cp_dma_prepare(sctx, dst, src, skipped_size, - skipped_size + realign_size, user_flags, - coher, &is_first, &dma_flags); - - si_emit_cp_dma(sctx, sctx->gfx_cs, dst_offset, src_offset, skipped_size, - dma_flags, cache_policy); - } - - /* Finally, realign the engine if the size wasn't aligned. */ - if (realign_size) { - si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher, - cache_policy, &is_first); - } - - if (dst && cache_policy != L2_BYPASS) - si_resource(dst)->TC_L2_dirty = true; - - /* If it's not a prefetch or GDS copy... */ - if (dst && src && (dst != src || dst_offset != src_offset)) { - sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } + uint64_t main_dst_offset, main_src_offset; + unsigned skipped_size = 0; + unsigned realign_size = 0; + unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) | (src ? 0 : CP_DMA_SRC_IS_GDS); + bool is_first = true; + + assert(size); + + if (dst) { + /* Skip this for the L2 prefetch. */ + if (dst != src || dst_offset != src_offset) { + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); + } + + dst_offset += si_resource(dst)->gpu_address; + } + if (src) + src_offset += si_resource(src)->gpu_address; + + /* The workarounds aren't needed on Fiji and beyond. */ + if (sctx->family <= CHIP_CARRIZO || sctx->family == CHIP_STONEY) { + /* If the size is not aligned, we must add a dummy copy at the end + * just to align the internal counter. Otherwise, the DMA engine + * would slow down by an order of magnitude for following copies. + */ + if (size % SI_CPDMA_ALIGNMENT) + realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT); + + /* If the copy begins unaligned, we must start copying from the next + * aligned block and the skipped part should be copied after everything + * else has been copied. Only the src alignment matters, not dst. + * + * GDS doesn't need the source address to be aligned. 
+ */ + if (src && src_offset % SI_CPDMA_ALIGNMENT) { + skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT); + /* The main part will be skipped if the size is too small. */ + skipped_size = MIN2(skipped_size, size); + size -= skipped_size; + } + } + + /* Flush the caches. */ + if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) { + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | + si_get_flush_flags(sctx, coher, cache_policy); + } + + /* This is the main part doing the copying. Src is always aligned. */ + main_dst_offset = dst_offset + skipped_size; + main_src_offset = src_offset + skipped_size; + + while (size) { + unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx)); + unsigned dma_flags = gds_flags; + + si_cp_dma_prepare(sctx, dst, src, byte_count, size + skipped_size + realign_size, user_flags, + coher, &is_first, &dma_flags); + + si_emit_cp_dma(sctx, sctx->gfx_cs, main_dst_offset, main_src_offset, byte_count, dma_flags, + cache_policy); + + size -= byte_count; + main_src_offset += byte_count; + main_dst_offset += byte_count; + } + + /* Copy the part we skipped because src wasn't aligned. */ + if (skipped_size) { + unsigned dma_flags = gds_flags; + + si_cp_dma_prepare(sctx, dst, src, skipped_size, skipped_size + realign_size, user_flags, + coher, &is_first, &dma_flags); + + si_emit_cp_dma(sctx, sctx->gfx_cs, dst_offset, src_offset, skipped_size, dma_flags, + cache_policy); + } + + /* Finally, realign the engine if the size wasn't aligned. */ + if (realign_size) { + si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher, cache_policy, &is_first); + } + + if (dst && cache_policy != L2_BYPASS) + si_resource(dst)->TC_L2_dirty = true; + + /* If it's not a prefetch or GDS copy... */ + if (dst && src && (dst != src || dst_offset != src_offset)) { + sctx->num_cp_dma_calls++; + si_prim_discard_signal_next_compute_ib_start(sctx); + } } -void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, - uint64_t offset, unsigned size) +void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset, + unsigned size) { - assert(sctx->chip_class >= GFX7); + assert(sctx->chip_class >= GFX7); - si_cp_dma_copy_buffer(sctx, buf, buf, offset, offset, size, - SI_CPDMA_SKIP_ALL, SI_COHERENCY_SHADER, L2_LRU); + si_cp_dma_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL, + SI_COHERENCY_SHADER, L2_LRU); } -static void cik_prefetch_shader_async(struct si_context *sctx, - struct si_pm4_state *state) +static void cik_prefetch_shader_async(struct si_context *sctx, struct si_pm4_state *state) { - struct pipe_resource *bo = &state->bo[0]->b.b; - assert(state->nbo == 1); + struct pipe_resource *bo = &state->bo[0]->b.b; + assert(state->nbo == 1); - cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0); + cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0); } static void cik_prefetch_VBO_descriptors(struct si_context *sctx) { - if (!sctx->vertex_elements || !sctx->vertex_elements->vb_desc_list_alloc_size) - return; + if (!sctx->vertex_elements || !sctx->vertex_elements->vb_desc_list_alloc_size) + return; - cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, - sctx->vb_descriptors_offset, - sctx->vertex_elements->vb_desc_list_alloc_size); + cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset, + sctx->vertex_elements->vb_desc_list_alloc_size); } /** @@ -449,191 +421,185 @@ static void cik_prefetch_VBO_descriptors(struct si_context 
*sctx) */ void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only) { - unsigned mask = sctx->prefetch_L2_mask; - assert(mask); - - /* Prefetch shaders and VBO descriptors to TC L2. */ - if (sctx->chip_class >= GFX9) { - /* Choose the right spot for the VBO prefetch. */ - if (sctx->queued.named.hs) { - if (mask & SI_PREFETCH_HS) - cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (mask & SI_PREFETCH_VBO_DESCRIPTORS) - cik_prefetch_VBO_descriptors(sctx); - if (vertex_stage_only) { - sctx->prefetch_L2_mask &= ~(SI_PREFETCH_HS | - SI_PREFETCH_VBO_DESCRIPTORS); - return; - } - - if (mask & SI_PREFETCH_GS) - cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (mask & SI_PREFETCH_VS) - cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - } else if (sctx->queued.named.gs) { - if (mask & SI_PREFETCH_GS) - cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (mask & SI_PREFETCH_VBO_DESCRIPTORS) - cik_prefetch_VBO_descriptors(sctx); - if (vertex_stage_only) { - sctx->prefetch_L2_mask &= ~(SI_PREFETCH_GS | - SI_PREFETCH_VBO_DESCRIPTORS); - return; - } - - if (mask & SI_PREFETCH_VS) - cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - } else { - if (mask & SI_PREFETCH_VS) - cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (mask & SI_PREFETCH_VBO_DESCRIPTORS) - cik_prefetch_VBO_descriptors(sctx); - if (vertex_stage_only) { - sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | - SI_PREFETCH_VBO_DESCRIPTORS); - return; - } - } - } else { - /* GFX6-GFX8 */ - /* Choose the right spot for the VBO prefetch. */ - if (sctx->tes_shader.cso) { - if (mask & SI_PREFETCH_LS) - cik_prefetch_shader_async(sctx, sctx->queued.named.ls); - if (mask & SI_PREFETCH_VBO_DESCRIPTORS) - cik_prefetch_VBO_descriptors(sctx); - if (vertex_stage_only) { - sctx->prefetch_L2_mask &= ~(SI_PREFETCH_LS | - SI_PREFETCH_VBO_DESCRIPTORS); - return; - } - - if (mask & SI_PREFETCH_HS) - cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (mask & SI_PREFETCH_ES) - cik_prefetch_shader_async(sctx, sctx->queued.named.es); - if (mask & SI_PREFETCH_GS) - cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (mask & SI_PREFETCH_VS) - cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - } else if (sctx->gs_shader.cso) { - if (mask & SI_PREFETCH_ES) - cik_prefetch_shader_async(sctx, sctx->queued.named.es); - if (mask & SI_PREFETCH_VBO_DESCRIPTORS) - cik_prefetch_VBO_descriptors(sctx); - if (vertex_stage_only) { - sctx->prefetch_L2_mask &= ~(SI_PREFETCH_ES | - SI_PREFETCH_VBO_DESCRIPTORS); - return; - } - - if (mask & SI_PREFETCH_GS) - cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (mask & SI_PREFETCH_VS) - cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - } else { - if (mask & SI_PREFETCH_VS) - cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (mask & SI_PREFETCH_VBO_DESCRIPTORS) - cik_prefetch_VBO_descriptors(sctx); - if (vertex_stage_only) { - sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | - SI_PREFETCH_VBO_DESCRIPTORS); - return; - } - } - } - - if (mask & SI_PREFETCH_PS) - cik_prefetch_shader_async(sctx, sctx->queued.named.ps); - - sctx->prefetch_L2_mask = 0; + unsigned mask = sctx->prefetch_L2_mask; + assert(mask); + + /* Prefetch shaders and VBO descriptors to TC L2. */ + if (sctx->chip_class >= GFX9) { + /* Choose the right spot for the VBO prefetch. 
*/ + if (sctx->queued.named.hs) { + if (mask & SI_PREFETCH_HS) + cik_prefetch_shader_async(sctx, sctx->queued.named.hs); + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) + cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_HS | SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_GS) + cik_prefetch_shader_async(sctx, sctx->queued.named.gs); + if (mask & SI_PREFETCH_VS) + cik_prefetch_shader_async(sctx, sctx->queued.named.vs); + } else if (sctx->queued.named.gs) { + if (mask & SI_PREFETCH_GS) + cik_prefetch_shader_async(sctx, sctx->queued.named.gs); + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) + cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_GS | SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_VS) + cik_prefetch_shader_async(sctx, sctx->queued.named.vs); + } else { + if (mask & SI_PREFETCH_VS) + cik_prefetch_shader_async(sctx, sctx->queued.named.vs); + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) + cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + } + } else { + /* GFX6-GFX8 */ + /* Choose the right spot for the VBO prefetch. */ + if (sctx->tes_shader.cso) { + if (mask & SI_PREFETCH_LS) + cik_prefetch_shader_async(sctx, sctx->queued.named.ls); + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) + cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_LS | SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_HS) + cik_prefetch_shader_async(sctx, sctx->queued.named.hs); + if (mask & SI_PREFETCH_ES) + cik_prefetch_shader_async(sctx, sctx->queued.named.es); + if (mask & SI_PREFETCH_GS) + cik_prefetch_shader_async(sctx, sctx->queued.named.gs); + if (mask & SI_PREFETCH_VS) + cik_prefetch_shader_async(sctx, sctx->queued.named.vs); + } else if (sctx->gs_shader.cso) { + if (mask & SI_PREFETCH_ES) + cik_prefetch_shader_async(sctx, sctx->queued.named.es); + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) + cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_ES | SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_GS) + cik_prefetch_shader_async(sctx, sctx->queued.named.gs); + if (mask & SI_PREFETCH_VS) + cik_prefetch_shader_async(sctx, sctx->queued.named.vs); + } else { + if (mask & SI_PREFETCH_VS) + cik_prefetch_shader_async(sctx, sctx->queued.named.vs); + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) + cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + } + } + + if (mask & SI_PREFETCH_PS) + cik_prefetch_shader_async(sctx, sctx->queued.named.ps); + + sctx->prefetch_L2_mask = 0; } void si_test_gds(struct si_context *sctx) { - struct pipe_context *ctx = &sctx->b; - struct pipe_resource *src, *dst; - unsigned r[4] = {}; - unsigned offset = debug_get_num_option("OFFSET", 16); - - src = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16); - dst = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16); - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 0, 4, 0xabcdef01, 0, SI_COHERENCY_SHADER, L2_BYPASS); - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 4, 4, 0x23456789, 0, SI_COHERENCY_SHADER, L2_BYPASS); - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 8, 4, 0x87654321, 0, SI_COHERENCY_SHADER, L2_BYPASS); - si_cp_dma_clear_buffer(sctx, 
sctx->gfx_cs, src, 12, 4, 0xfedcba98, 0, SI_COHERENCY_SHADER, L2_BYPASS); - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, 16, 0xdeadbeef, 0, SI_COHERENCY_SHADER, L2_BYPASS); - - si_cp_dma_copy_buffer(sctx, NULL, src, offset, 0, 16, 0, SI_COHERENCY_NONE, L2_BYPASS); - si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS); - - pipe_buffer_read(ctx, dst, 0, sizeof(r), r); - printf("GDS copy = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3], - r[0] == 0xabcdef01 && r[1] == 0x23456789 && - r[2] == 0x87654321 && r[3] == 0xfedcba98 ? "pass" : "fail"); - - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, NULL, offset, 16, 0xc1ea4146, 0, SI_COHERENCY_NONE, L2_BYPASS); - si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS); - - pipe_buffer_read(ctx, dst, 0, sizeof(r), r); - printf("GDS clear = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3], - r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 && - r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146 ? "pass" : "fail"); - - pipe_resource_reference(&src, NULL); - pipe_resource_reference(&dst, NULL); - exit(0); + struct pipe_context *ctx = &sctx->b; + struct pipe_resource *src, *dst; + unsigned r[4] = {}; + unsigned offset = debug_get_num_option("OFFSET", 16); + + src = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16); + dst = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16); + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 0, 4, 0xabcdef01, 0, SI_COHERENCY_SHADER, + L2_BYPASS); + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 4, 4, 0x23456789, 0, SI_COHERENCY_SHADER, + L2_BYPASS); + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 8, 4, 0x87654321, 0, SI_COHERENCY_SHADER, + L2_BYPASS); + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 12, 4, 0xfedcba98, 0, SI_COHERENCY_SHADER, + L2_BYPASS); + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, 16, 0xdeadbeef, 0, SI_COHERENCY_SHADER, + L2_BYPASS); + + si_cp_dma_copy_buffer(sctx, NULL, src, offset, 0, 16, 0, SI_COHERENCY_NONE, L2_BYPASS); + si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS); + + pipe_buffer_read(ctx, dst, 0, sizeof(r), r); + printf("GDS copy = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3], + r[0] == 0xabcdef01 && r[1] == 0x23456789 && r[2] == 0x87654321 && r[3] == 0xfedcba98 + ? "pass" + : "fail"); + + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, NULL, offset, 16, 0xc1ea4146, 0, SI_COHERENCY_NONE, + L2_BYPASS); + si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS); + + pipe_buffer_read(ctx, dst, 0, sizeof(r), r); + printf("GDS clear = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3], + r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 && r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146 + ? 
"pass" + : "fail"); + + pipe_resource_reference(&src, NULL); + pipe_resource_reference(&dst, NULL); + exit(0); } -void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, - unsigned offset, unsigned size, unsigned dst_sel, - unsigned engine, const void *data) +void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset, + unsigned size, unsigned dst_sel, unsigned engine, const void *data) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - assert(offset % 4 == 0); - assert(size % 4 == 0); + assert(offset % 4 == 0); + assert(size % 4 == 0); - if (sctx->chip_class == GFX6 && dst_sel == V_370_MEM) - dst_sel = V_370_MEM_GRBM; + if (sctx->chip_class == GFX6 && dst_sel == V_370_MEM) + dst_sel = V_370_MEM_GRBM; - radeon_add_to_buffer_list(sctx, cs, buf, - RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); - uint64_t va = buf->gpu_address + offset; + radeon_add_to_buffer_list(sctx, cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); + uint64_t va = buf->gpu_address + offset; - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size/4, 0)); - radeon_emit(cs, S_370_DST_SEL(dst_sel) | - S_370_WR_CONFIRM(1) | - S_370_ENGINE_SEL(engine)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit_array(cs, (const uint32_t*)data, size/4); + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0)); + radeon_emit(cs, S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit_array(cs, (const uint32_t *)data, size / 4); } -void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, - unsigned dst_sel, struct si_resource *dst, unsigned dst_offset, - unsigned src_sel, struct si_resource *src, unsigned src_offset) +void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned dst_sel, + struct si_resource *dst, unsigned dst_offset, unsigned src_sel, + struct si_resource *src, unsigned src_offset) { - /* cs can point to the compute IB, which has the buffer list in gfx_cs. */ - if (dst) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dst, - RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); - } - if (src) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, src, - RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); - } - - uint64_t dst_va = (dst ? dst->gpu_address : 0ull) + dst_offset; - uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset; - - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | - COPY_DATA_DST_SEL(dst_sel) | - COPY_DATA_WR_CONFIRM); - radeon_emit(cs, src_va); - radeon_emit(cs, src_va >> 32); - radeon_emit(cs, dst_va); - radeon_emit(cs, dst_va >> 32); + /* cs can point to the compute IB, which has the buffer list in gfx_cs. */ + if (dst) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); + } + if (src) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); + } + + uint64_t dst_va = (dst ? dst->gpu_address : 0ull) + dst_offset; + uint64_t src_va = (src ? 
src->gpu_address : 0ull) + src_offset; + + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM); + radeon_emit(cs, src_va); + radeon_emit(cs, src_va >> 32); + radeon_emit(cs, dst_va); + radeon_emit(cs, dst_va >> 32); } diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index cbd92c02c73..acd86730d0b 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -22,21 +22,20 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_pipe.h" +#include "ac_debug.h" +#include "ac_rtld.h" +#include "driver_ddebug/dd_util.h" #include "si_compute.h" +#include "si_pipe.h" #include "sid.h" #include "sid_tables.h" #include "tgsi/tgsi_from_mesa.h" -#include "driver_ddebug/dd_util.h" #include "util/u_dump.h" #include "util/u_log.h" #include "util/u_memory.h" #include "util/u_string.h" -#include "ac_debug.h" -#include "ac_rtld.h" -static void si_dump_bo_list(struct si_context *sctx, - const struct radeon_saved_cs *saved, FILE *f); +static void si_dump_bo_list(struct si_context *sctx, const struct radeon_saved_cs *saved, FILE *f); DEBUG_GET_ONCE_OPTION(replace_shaders, "RADEON_REPLACE_SHADERS", NULL) @@ -44,155 +43,148 @@ DEBUG_GET_ONCE_OPTION(replace_shaders, "RADEON_REPLACE_SHADERS", NULL) * Store a linearized copy of all chunks of \p cs together with the buffer * list in \p saved. */ -void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, - struct radeon_saved_cs *saved, bool get_buffer_list) +void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, struct radeon_saved_cs *saved, + bool get_buffer_list) { - uint32_t *buf; - unsigned i; - - /* Save the IB chunks. */ - saved->num_dw = cs->prev_dw + cs->current.cdw; - saved->ib = MALLOC(4 * saved->num_dw); - if (!saved->ib) - goto oom; - - buf = saved->ib; - for (i = 0; i < cs->num_prev; ++i) { - memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4); - buf += cs->prev[i].cdw; - } - memcpy(buf, cs->current.buf, cs->current.cdw * 4); - - if (!get_buffer_list) - return; - - /* Save the buffer list. */ - saved->bo_count = ws->cs_get_buffer_list(cs, NULL); - saved->bo_list = CALLOC(saved->bo_count, - sizeof(saved->bo_list[0])); - if (!saved->bo_list) { - FREE(saved->ib); - goto oom; - } - ws->cs_get_buffer_list(cs, saved->bo_list); - - return; + uint32_t *buf; + unsigned i; + + /* Save the IB chunks. */ + saved->num_dw = cs->prev_dw + cs->current.cdw; + saved->ib = MALLOC(4 * saved->num_dw); + if (!saved->ib) + goto oom; + + buf = saved->ib; + for (i = 0; i < cs->num_prev; ++i) { + memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4); + buf += cs->prev[i].cdw; + } + memcpy(buf, cs->current.buf, cs->current.cdw * 4); + + if (!get_buffer_list) + return; + + /* Save the buffer list. 
*/ + saved->bo_count = ws->cs_get_buffer_list(cs, NULL); + saved->bo_list = CALLOC(saved->bo_count, sizeof(saved->bo_list[0])); + if (!saved->bo_list) { + FREE(saved->ib); + goto oom; + } + ws->cs_get_buffer_list(cs, saved->bo_list); + + return; oom: - fprintf(stderr, "%s: out of memory\n", __func__); - memset(saved, 0, sizeof(*saved)); + fprintf(stderr, "%s: out of memory\n", __func__); + memset(saved, 0, sizeof(*saved)); } void si_clear_saved_cs(struct radeon_saved_cs *saved) { - FREE(saved->ib); - FREE(saved->bo_list); + FREE(saved->ib); + FREE(saved->bo_list); - memset(saved, 0, sizeof(*saved)); + memset(saved, 0, sizeof(*saved)); } void si_destroy_saved_cs(struct si_saved_cs *scs) { - si_clear_saved_cs(&scs->gfx); - si_resource_reference(&scs->trace_buf, NULL); - free(scs); + si_clear_saved_cs(&scs->gfx); + si_resource_reference(&scs->trace_buf, NULL); + free(scs); } -static void si_dump_shader(struct si_screen *sscreen, - struct si_shader *shader, FILE *f) +static void si_dump_shader(struct si_screen *sscreen, struct si_shader *shader, FILE *f) { - if (shader->shader_log) - fwrite(shader->shader_log, shader->shader_log_size, 1, f); - else - si_shader_dump(sscreen, shader, NULL, f, false); + if (shader->shader_log) + fwrite(shader->shader_log, shader->shader_log_size, 1, f); + else + si_shader_dump(sscreen, shader, NULL, f, false); - if (shader->bo && sscreen->options.dump_shader_binary) { - unsigned size = shader->bo->b.b.width0; - fprintf(f, "BO: VA=%"PRIx64" Size=%u\n", shader->bo->gpu_address, size); + if (shader->bo && sscreen->options.dump_shader_binary) { + unsigned size = shader->bo->b.b.width0; + fprintf(f, "BO: VA=%" PRIx64 " Size=%u\n", shader->bo->gpu_address, size); - const char *mapped = sscreen->ws->buffer_map(shader->bo->buf, NULL, - PIPE_TRANSFER_UNSYNCHRONIZED | - PIPE_TRANSFER_READ | - RADEON_TRANSFER_TEMPORARY); + const char *mapped = sscreen->ws->buffer_map( + shader->bo->buf, NULL, + PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ | RADEON_TRANSFER_TEMPORARY); - for (unsigned i = 0; i < size; i += 4) { - fprintf(f, " %4x: %08x\n", i, *(uint32_t*)(mapped + i)); - } + for (unsigned i = 0; i < size; i += 4) { + fprintf(f, " %4x: %08x\n", i, *(uint32_t *)(mapped + i)); + } - sscreen->ws->buffer_unmap(shader->bo->buf); + sscreen->ws->buffer_unmap(shader->bo->buf); - fprintf(f, "\n"); - } + fprintf(f, "\n"); + } } struct si_log_chunk_shader { - /* The shader destroy code assumes a current context for unlinking of - * PM4 packets etc. - * - * While we should be able to destroy shaders without a context, doing - * so would happen only very rarely and be therefore likely to fail - * just when you're trying to debug something. Let's just remember the - * current context in the chunk. - */ - struct si_context *ctx; - struct si_shader *shader; - - /* For keep-alive reference counts */ - struct si_shader_selector *sel; - struct si_compute *program; + /* The shader destroy code assumes a current context for unlinking of + * PM4 packets etc. + * + * While we should be able to destroy shaders without a context, doing + * so would happen only very rarely and be therefore likely to fail + * just when you're trying to debug something. Let's just remember the + * current context in the chunk. 
+ */ + struct si_context *ctx; + struct si_shader *shader; + + /* For keep-alive reference counts */ + struct si_shader_selector *sel; + struct si_compute *program; }; -static void -si_log_chunk_shader_destroy(void *data) +static void si_log_chunk_shader_destroy(void *data) { - struct si_log_chunk_shader *chunk = data; - si_shader_selector_reference(chunk->ctx, &chunk->sel, NULL); - si_compute_reference(&chunk->program, NULL); - FREE(chunk); + struct si_log_chunk_shader *chunk = data; + si_shader_selector_reference(chunk->ctx, &chunk->sel, NULL); + si_compute_reference(&chunk->program, NULL); + FREE(chunk); } -static void -si_log_chunk_shader_print(void *data, FILE *f) +static void si_log_chunk_shader_print(void *data, FILE *f) { - struct si_log_chunk_shader *chunk = data; - struct si_screen *sscreen = chunk->ctx->screen; - si_dump_shader(sscreen, chunk->shader, f); + struct si_log_chunk_shader *chunk = data; + struct si_screen *sscreen = chunk->ctx->screen; + si_dump_shader(sscreen, chunk->shader, f); } static struct u_log_chunk_type si_log_chunk_type_shader = { - .destroy = si_log_chunk_shader_destroy, - .print = si_log_chunk_shader_print, + .destroy = si_log_chunk_shader_destroy, + .print = si_log_chunk_shader_print, }; -static void si_dump_gfx_shader(struct si_context *ctx, - const struct si_shader_ctx_state *state, - struct u_log_context *log) +static void si_dump_gfx_shader(struct si_context *ctx, const struct si_shader_ctx_state *state, + struct u_log_context *log) { - struct si_shader *current = state->current; + struct si_shader *current = state->current; - if (!state->cso || !current) - return; + if (!state->cso || !current) + return; - struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader); - chunk->ctx = ctx; - chunk->shader = current; - si_shader_selector_reference(ctx, &chunk->sel, current->selector); - u_log_chunk(log, &si_log_chunk_type_shader, chunk); + struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader); + chunk->ctx = ctx; + chunk->shader = current; + si_shader_selector_reference(ctx, &chunk->sel, current->selector); + u_log_chunk(log, &si_log_chunk_type_shader, chunk); } -static void si_dump_compute_shader(struct si_context *ctx, - struct u_log_context *log) +static void si_dump_compute_shader(struct si_context *ctx, struct u_log_context *log) { - const struct si_cs_shader_state *state = &ctx->cs_shader_state; + const struct si_cs_shader_state *state = &ctx->cs_shader_state; - if (!state->program) - return; + if (!state->program) + return; - struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader); - chunk->ctx = ctx; - chunk->shader = &state->program->shader; - si_compute_reference(&chunk->program, state->program); - u_log_chunk(log, &si_log_chunk_type_shader, chunk); + struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader); + chunk->ctx = ctx; + chunk->shader = &state->program->shader; + si_compute_reference(&chunk->program, state->program); + u_log_chunk(log, &si_log_chunk_type_shader, chunk); } /** @@ -203,724 +195,664 @@ static void si_dump_compute_shader(struct si_context *ctx, */ bool si_replace_shader(unsigned num, struct si_shader_binary *binary) { - const char *p = debug_get_option_replace_shaders(); - const char *semicolon; - char *copy = NULL; - FILE *f; - long filesize, nread; - bool replaced = false; - - if (!p) - return false; - - while (*p) { - unsigned long i; - char *endp; - i = strtoul(p, &endp, 0); - - p = endp; - if (*p != ':') { - fprintf(stderr, "RADEON_REPLACE_SHADERS formatted 
badly.\n"); - exit(1); - } - ++p; - - if (i == num) - break; - - p = strchr(p, ';'); - if (!p) - return false; - ++p; - } - if (!*p) - return false; - - semicolon = strchr(p, ';'); - if (semicolon) { - p = copy = strndup(p, semicolon - p); - if (!copy) { - fprintf(stderr, "out of memory\n"); - return false; - } - } - - fprintf(stderr, "radeonsi: replace shader %u by %s\n", num, p); - - f = fopen(p, "r"); - if (!f) { - perror("radeonsi: failed to open file"); - goto out_free; - } - - if (fseek(f, 0, SEEK_END) != 0) - goto file_error; - - filesize = ftell(f); - if (filesize < 0) - goto file_error; - - if (fseek(f, 0, SEEK_SET) != 0) - goto file_error; - - binary->elf_buffer = MALLOC(filesize); - if (!binary->elf_buffer) { - fprintf(stderr, "out of memory\n"); - goto out_close; - } - - nread = fread((void*)binary->elf_buffer, 1, filesize, f); - if (nread != filesize) { - FREE((void*)binary->elf_buffer); - binary->elf_buffer = NULL; - goto file_error; - } - - binary->elf_size = nread; - replaced = true; + const char *p = debug_get_option_replace_shaders(); + const char *semicolon; + char *copy = NULL; + FILE *f; + long filesize, nread; + bool replaced = false; + + if (!p) + return false; + + while (*p) { + unsigned long i; + char *endp; + i = strtoul(p, &endp, 0); + + p = endp; + if (*p != ':') { + fprintf(stderr, "RADEON_REPLACE_SHADERS formatted badly.\n"); + exit(1); + } + ++p; + + if (i == num) + break; + + p = strchr(p, ';'); + if (!p) + return false; + ++p; + } + if (!*p) + return false; + + semicolon = strchr(p, ';'); + if (semicolon) { + p = copy = strndup(p, semicolon - p); + if (!copy) { + fprintf(stderr, "out of memory\n"); + return false; + } + } + + fprintf(stderr, "radeonsi: replace shader %u by %s\n", num, p); + + f = fopen(p, "r"); + if (!f) { + perror("radeonsi: failed to open file"); + goto out_free; + } + + if (fseek(f, 0, SEEK_END) != 0) + goto file_error; + + filesize = ftell(f); + if (filesize < 0) + goto file_error; + + if (fseek(f, 0, SEEK_SET) != 0) + goto file_error; + + binary->elf_buffer = MALLOC(filesize); + if (!binary->elf_buffer) { + fprintf(stderr, "out of memory\n"); + goto out_close; + } + + nread = fread((void *)binary->elf_buffer, 1, filesize, f); + if (nread != filesize) { + FREE((void *)binary->elf_buffer); + binary->elf_buffer = NULL; + goto file_error; + } + + binary->elf_size = nread; + replaced = true; out_close: - fclose(f); + fclose(f); out_free: - free(copy); - return replaced; + free(copy); + return replaced; file_error: - perror("radeonsi: reading shader"); - goto out_close; + perror("radeonsi: reading shader"); + goto out_close; } /* Parsed IBs are difficult to read without colors. Use "less -R file" to * read them, or use "aha -b -f file" to convert them to html. 
*/ -#define COLOR_RESET "\033[0m" -#define COLOR_RED "\033[31m" -#define COLOR_GREEN "\033[1;32m" -#define COLOR_YELLOW "\033[1;33m" -#define COLOR_CYAN "\033[1;36m" - -static void si_dump_mmapped_reg(struct si_context *sctx, FILE *f, - unsigned offset) +#define COLOR_RESET "\033[0m" +#define COLOR_RED "\033[31m" +#define COLOR_GREEN "\033[1;32m" +#define COLOR_YELLOW "\033[1;33m" +#define COLOR_CYAN "\033[1;36m" + +static void si_dump_mmapped_reg(struct si_context *sctx, FILE *f, unsigned offset) { - struct radeon_winsys *ws = sctx->ws; - uint32_t value; + struct radeon_winsys *ws = sctx->ws; + uint32_t value; - if (ws->read_registers(ws, offset, 1, &value)) - ac_dump_reg(f, sctx->chip_class, offset, value, ~0); + if (ws->read_registers(ws, offset, 1, &value)) + ac_dump_reg(f, sctx->chip_class, offset, value, ~0); } static void si_dump_debug_registers(struct si_context *sctx, FILE *f) { - if (!sctx->screen->info.has_read_registers_query) - return; - - fprintf(f, "Memory-mapped registers:\n"); - si_dump_mmapped_reg(sctx, f, R_008010_GRBM_STATUS); - - /* No other registers can be read on DRM < 3.1.0. */ - if (!sctx->screen->info.is_amdgpu || - sctx->screen->info.drm_minor < 1) { - fprintf(f, "\n"); - return; - } - - si_dump_mmapped_reg(sctx, f, R_008008_GRBM_STATUS2); - si_dump_mmapped_reg(sctx, f, R_008014_GRBM_STATUS_SE0); - si_dump_mmapped_reg(sctx, f, R_008018_GRBM_STATUS_SE1); - si_dump_mmapped_reg(sctx, f, R_008038_GRBM_STATUS_SE2); - si_dump_mmapped_reg(sctx, f, R_00803C_GRBM_STATUS_SE3); - si_dump_mmapped_reg(sctx, f, R_00D034_SDMA0_STATUS_REG); - si_dump_mmapped_reg(sctx, f, R_00D834_SDMA1_STATUS_REG); - if (sctx->chip_class <= GFX8) { - si_dump_mmapped_reg(sctx, f, R_000E50_SRBM_STATUS); - si_dump_mmapped_reg(sctx, f, R_000E4C_SRBM_STATUS2); - si_dump_mmapped_reg(sctx, f, R_000E54_SRBM_STATUS3); - } - si_dump_mmapped_reg(sctx, f, R_008680_CP_STAT); - si_dump_mmapped_reg(sctx, f, R_008674_CP_STALLED_STAT1); - si_dump_mmapped_reg(sctx, f, R_008678_CP_STALLED_STAT2); - si_dump_mmapped_reg(sctx, f, R_008670_CP_STALLED_STAT3); - si_dump_mmapped_reg(sctx, f, R_008210_CP_CPC_STATUS); - si_dump_mmapped_reg(sctx, f, R_008214_CP_CPC_BUSY_STAT); - si_dump_mmapped_reg(sctx, f, R_008218_CP_CPC_STALLED_STAT1); - si_dump_mmapped_reg(sctx, f, R_00821C_CP_CPF_STATUS); - si_dump_mmapped_reg(sctx, f, R_008220_CP_CPF_BUSY_STAT); - si_dump_mmapped_reg(sctx, f, R_008224_CP_CPF_STALLED_STAT1); - fprintf(f, "\n"); + if (!sctx->screen->info.has_read_registers_query) + return; + + fprintf(f, "Memory-mapped registers:\n"); + si_dump_mmapped_reg(sctx, f, R_008010_GRBM_STATUS); + + /* No other registers can be read on DRM < 3.1.0. 
*/ + if (!sctx->screen->info.is_amdgpu || sctx->screen->info.drm_minor < 1) { + fprintf(f, "\n"); + return; + } + + si_dump_mmapped_reg(sctx, f, R_008008_GRBM_STATUS2); + si_dump_mmapped_reg(sctx, f, R_008014_GRBM_STATUS_SE0); + si_dump_mmapped_reg(sctx, f, R_008018_GRBM_STATUS_SE1); + si_dump_mmapped_reg(sctx, f, R_008038_GRBM_STATUS_SE2); + si_dump_mmapped_reg(sctx, f, R_00803C_GRBM_STATUS_SE3); + si_dump_mmapped_reg(sctx, f, R_00D034_SDMA0_STATUS_REG); + si_dump_mmapped_reg(sctx, f, R_00D834_SDMA1_STATUS_REG); + if (sctx->chip_class <= GFX8) { + si_dump_mmapped_reg(sctx, f, R_000E50_SRBM_STATUS); + si_dump_mmapped_reg(sctx, f, R_000E4C_SRBM_STATUS2); + si_dump_mmapped_reg(sctx, f, R_000E54_SRBM_STATUS3); + } + si_dump_mmapped_reg(sctx, f, R_008680_CP_STAT); + si_dump_mmapped_reg(sctx, f, R_008674_CP_STALLED_STAT1); + si_dump_mmapped_reg(sctx, f, R_008678_CP_STALLED_STAT2); + si_dump_mmapped_reg(sctx, f, R_008670_CP_STALLED_STAT3); + si_dump_mmapped_reg(sctx, f, R_008210_CP_CPC_STATUS); + si_dump_mmapped_reg(sctx, f, R_008214_CP_CPC_BUSY_STAT); + si_dump_mmapped_reg(sctx, f, R_008218_CP_CPC_STALLED_STAT1); + si_dump_mmapped_reg(sctx, f, R_00821C_CP_CPF_STATUS); + si_dump_mmapped_reg(sctx, f, R_008220_CP_CPF_BUSY_STAT); + si_dump_mmapped_reg(sctx, f, R_008224_CP_CPF_STALLED_STAT1); + fprintf(f, "\n"); } struct si_log_chunk_cs { - struct si_context *ctx; - struct si_saved_cs *cs; - bool dump_bo_list; - unsigned gfx_begin, gfx_end; - unsigned compute_begin, compute_end; + struct si_context *ctx; + struct si_saved_cs *cs; + bool dump_bo_list; + unsigned gfx_begin, gfx_end; + unsigned compute_begin, compute_end; }; static void si_log_chunk_type_cs_destroy(void *data) { - struct si_log_chunk_cs *chunk = data; - si_saved_cs_reference(&chunk->cs, NULL); - free(chunk); + struct si_log_chunk_cs *chunk = data; + si_saved_cs_reference(&chunk->cs, NULL); + free(chunk); } -static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs, - unsigned begin, unsigned end, - int *last_trace_id, unsigned trace_id_count, - const char *name, enum chip_class chip_class) +static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs, unsigned begin, unsigned end, + int *last_trace_id, unsigned trace_id_count, const char *name, + enum chip_class chip_class) { - unsigned orig_end = end; + unsigned orig_end = end; - assert(begin <= end); + assert(begin <= end); - fprintf(f, "------------------ %s begin (dw = %u) ------------------\n", - name, begin); + fprintf(f, "------------------ %s begin (dw = %u) ------------------\n", name, begin); - for (unsigned prev_idx = 0; prev_idx < cs->num_prev; ++prev_idx) { - struct radeon_cmdbuf_chunk *chunk = &cs->prev[prev_idx]; + for (unsigned prev_idx = 0; prev_idx < cs->num_prev; ++prev_idx) { + struct radeon_cmdbuf_chunk *chunk = &cs->prev[prev_idx]; - if (begin < chunk->cdw) { - ac_parse_ib_chunk(f, chunk->buf + begin, - MIN2(end, chunk->cdw) - begin, - last_trace_id, trace_id_count, - chip_class, NULL, NULL); - } + if (begin < chunk->cdw) { + ac_parse_ib_chunk(f, chunk->buf + begin, MIN2(end, chunk->cdw) - begin, last_trace_id, + trace_id_count, chip_class, NULL, NULL); + } - if (end <= chunk->cdw) - return; + if (end <= chunk->cdw) + return; - if (begin < chunk->cdw) - fprintf(f, "\n---------- Next %s Chunk ----------\n\n", - name); + if (begin < chunk->cdw) + fprintf(f, "\n---------- Next %s Chunk ----------\n\n", name); - begin -= MIN2(begin, chunk->cdw); - end -= chunk->cdw; - } + begin -= MIN2(begin, chunk->cdw); + end -= chunk->cdw; + } - assert(end <= 
cs->current.cdw); + assert(end <= cs->current.cdw); - ac_parse_ib_chunk(f, cs->current.buf + begin, end - begin, last_trace_id, - trace_id_count, chip_class, NULL, NULL); + ac_parse_ib_chunk(f, cs->current.buf + begin, end - begin, last_trace_id, trace_id_count, + chip_class, NULL, NULL); - fprintf(f, "------------------- %s end (dw = %u) -------------------\n\n", - name, orig_end); + fprintf(f, "------------------- %s end (dw = %u) -------------------\n\n", name, orig_end); } static void si_log_chunk_type_cs_print(void *data, FILE *f) { - struct si_log_chunk_cs *chunk = data; - struct si_context *ctx = chunk->ctx; - struct si_saved_cs *scs = chunk->cs; - int last_trace_id = -1; - int last_compute_trace_id = -1; - - /* We are expecting that the ddebug pipe has already - * waited for the context, so this buffer should be idle. - * If the GPU is hung, there is no point in waiting for it. - */ - uint32_t *map = ctx->ws->buffer_map(scs->trace_buf->buf, - NULL, - PIPE_TRANSFER_UNSYNCHRONIZED | - PIPE_TRANSFER_READ); - if (map) { - last_trace_id = map[0]; - last_compute_trace_id = map[1]; - } - - if (chunk->gfx_end != chunk->gfx_begin) { - if (chunk->gfx_begin == 0) { - if (ctx->init_config) - ac_parse_ib(f, ctx->init_config->pm4, ctx->init_config->ndw, - NULL, 0, "IB2: Init config", ctx->chip_class, - NULL, NULL); - - if (ctx->init_config_gs_rings) - ac_parse_ib(f, ctx->init_config_gs_rings->pm4, - ctx->init_config_gs_rings->ndw, - NULL, 0, "IB2: Init GS rings", ctx->chip_class, - NULL, NULL); - } - - if (scs->flushed) { - ac_parse_ib(f, scs->gfx.ib + chunk->gfx_begin, - chunk->gfx_end - chunk->gfx_begin, - &last_trace_id, map ? 1 : 0, "IB", ctx->chip_class, - NULL, NULL); - } else { - si_parse_current_ib(f, ctx->gfx_cs, chunk->gfx_begin, - chunk->gfx_end, &last_trace_id, map ? 1 : 0, - "IB", ctx->chip_class); - } - } - - if (chunk->compute_end != chunk->compute_begin) { - assert(ctx->prim_discard_compute_cs); - - if (scs->flushed) { - ac_parse_ib(f, scs->compute.ib + chunk->compute_begin, - chunk->compute_end - chunk->compute_begin, - &last_compute_trace_id, map ? 1 : 0, "Compute IB", ctx->chip_class, - NULL, NULL); - } else { - si_parse_current_ib(f, ctx->prim_discard_compute_cs, chunk->compute_begin, - chunk->compute_end, &last_compute_trace_id, - map ? 1 : 0, "Compute IB", ctx->chip_class); - } - } - - if (chunk->dump_bo_list) { - fprintf(f, "Flushing. Time: "); - util_dump_ns(f, scs->time_flush); - fprintf(f, "\n\n"); - si_dump_bo_list(ctx, &scs->gfx, f); - } + struct si_log_chunk_cs *chunk = data; + struct si_context *ctx = chunk->ctx; + struct si_saved_cs *scs = chunk->cs; + int last_trace_id = -1; + int last_compute_trace_id = -1; + + /* We are expecting that the ddebug pipe has already + * waited for the context, so this buffer should be idle. + * If the GPU is hung, there is no point in waiting for it. 
+ */ + uint32_t *map = ctx->ws->buffer_map(scs->trace_buf->buf, NULL, + PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ); + if (map) { + last_trace_id = map[0]; + last_compute_trace_id = map[1]; + } + + if (chunk->gfx_end != chunk->gfx_begin) { + if (chunk->gfx_begin == 0) { + if (ctx->init_config) + ac_parse_ib(f, ctx->init_config->pm4, ctx->init_config->ndw, NULL, 0, + "IB2: Init config", ctx->chip_class, NULL, NULL); + + if (ctx->init_config_gs_rings) + ac_parse_ib(f, ctx->init_config_gs_rings->pm4, ctx->init_config_gs_rings->ndw, NULL, 0, + "IB2: Init GS rings", ctx->chip_class, NULL, NULL); + } + + if (scs->flushed) { + ac_parse_ib(f, scs->gfx.ib + chunk->gfx_begin, chunk->gfx_end - chunk->gfx_begin, + &last_trace_id, map ? 1 : 0, "IB", ctx->chip_class, NULL, NULL); + } else { + si_parse_current_ib(f, ctx->gfx_cs, chunk->gfx_begin, chunk->gfx_end, &last_trace_id, + map ? 1 : 0, "IB", ctx->chip_class); + } + } + + if (chunk->compute_end != chunk->compute_begin) { + assert(ctx->prim_discard_compute_cs); + + if (scs->flushed) { + ac_parse_ib(f, scs->compute.ib + chunk->compute_begin, + chunk->compute_end - chunk->compute_begin, &last_compute_trace_id, map ? 1 : 0, + "Compute IB", ctx->chip_class, NULL, NULL); + } else { + si_parse_current_ib(f, ctx->prim_discard_compute_cs, chunk->compute_begin, + chunk->compute_end, &last_compute_trace_id, map ? 1 : 0, "Compute IB", + ctx->chip_class); + } + } + + if (chunk->dump_bo_list) { + fprintf(f, "Flushing. Time: "); + util_dump_ns(f, scs->time_flush); + fprintf(f, "\n\n"); + si_dump_bo_list(ctx, &scs->gfx, f); + } } static const struct u_log_chunk_type si_log_chunk_type_cs = { - .destroy = si_log_chunk_type_cs_destroy, - .print = si_log_chunk_type_cs_print, + .destroy = si_log_chunk_type_cs_destroy, + .print = si_log_chunk_type_cs_print, }; -static void si_log_cs(struct si_context *ctx, struct u_log_context *log, - bool dump_bo_list) +static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool dump_bo_list) { - assert(ctx->current_saved_cs); + assert(ctx->current_saved_cs); - struct si_saved_cs *scs = ctx->current_saved_cs; - unsigned gfx_cur = ctx->gfx_cs->prev_dw + ctx->gfx_cs->current.cdw; - unsigned compute_cur = 0; + struct si_saved_cs *scs = ctx->current_saved_cs; + unsigned gfx_cur = ctx->gfx_cs->prev_dw + ctx->gfx_cs->current.cdw; + unsigned compute_cur = 0; - if (ctx->prim_discard_compute_cs) - compute_cur = ctx->prim_discard_compute_cs->prev_dw + ctx->prim_discard_compute_cs->current.cdw; + if (ctx->prim_discard_compute_cs) + compute_cur = + ctx->prim_discard_compute_cs->prev_dw + ctx->prim_discard_compute_cs->current.cdw; - if (!dump_bo_list && - gfx_cur == scs->gfx_last_dw && - compute_cur == scs->compute_last_dw) - return; + if (!dump_bo_list && gfx_cur == scs->gfx_last_dw && compute_cur == scs->compute_last_dw) + return; - struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); + struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); - chunk->ctx = ctx; - si_saved_cs_reference(&chunk->cs, scs); - chunk->dump_bo_list = dump_bo_list; + chunk->ctx = ctx; + si_saved_cs_reference(&chunk->cs, scs); + chunk->dump_bo_list = dump_bo_list; - chunk->gfx_begin = scs->gfx_last_dw; - chunk->gfx_end = gfx_cur; - scs->gfx_last_dw = gfx_cur; + chunk->gfx_begin = scs->gfx_last_dw; + chunk->gfx_end = gfx_cur; + scs->gfx_last_dw = gfx_cur; - chunk->compute_begin = scs->compute_last_dw; - chunk->compute_end = compute_cur; - scs->compute_last_dw = compute_cur; + chunk->compute_begin = scs->compute_last_dw; + chunk->compute_end 
= compute_cur; + scs->compute_last_dw = compute_cur; - u_log_chunk(log, &si_log_chunk_type_cs, chunk); + u_log_chunk(log, &si_log_chunk_type_cs, chunk); } void si_auto_log_cs(void *data, struct u_log_context *log) { - struct si_context *ctx = (struct si_context *)data; - si_log_cs(ctx, log, false); + struct si_context *ctx = (struct si_context *)data; + si_log_cs(ctx, log, false); } void si_log_hw_flush(struct si_context *sctx) { - if (!sctx->log) - return; - - si_log_cs(sctx, sctx->log, true); - - if (&sctx->b == sctx->screen->aux_context) { - /* The aux context isn't captured by the ddebug wrapper, - * so we dump it on a flush-by-flush basis here. - */ - FILE *f = dd_get_debug_file(false); - if (!f) { - fprintf(stderr, "radeonsi: error opening aux context dump file.\n"); - } else { - dd_write_header(f, &sctx->screen->b, 0); - - fprintf(f, "Aux context dump:\n\n"); - u_log_new_page_print(sctx->log, f); - - fclose(f); - } - } + if (!sctx->log) + return; + + si_log_cs(sctx, sctx->log, true); + + if (&sctx->b == sctx->screen->aux_context) { + /* The aux context isn't captured by the ddebug wrapper, + * so we dump it on a flush-by-flush basis here. + */ + FILE *f = dd_get_debug_file(false); + if (!f) { + fprintf(stderr, "radeonsi: error opening aux context dump file.\n"); + } else { + dd_write_header(f, &sctx->screen->b, 0); + + fprintf(f, "Aux context dump:\n\n"); + u_log_new_page_print(sctx->log, f); + + fclose(f); + } + } } static const char *priority_to_string(enum radeon_bo_priority priority) { #define ITEM(x) [RADEON_PRIO_##x] = #x - static const char *table[64] = { - ITEM(FENCE), - ITEM(TRACE), - ITEM(SO_FILLED_SIZE), - ITEM(QUERY), - ITEM(IB1), - ITEM(IB2), - ITEM(DRAW_INDIRECT), - ITEM(INDEX_BUFFER), - ITEM(CP_DMA), - ITEM(CONST_BUFFER), - ITEM(DESCRIPTORS), - ITEM(BORDER_COLORS), - ITEM(SAMPLER_BUFFER), - ITEM(VERTEX_BUFFER), - ITEM(SHADER_RW_BUFFER), - ITEM(COMPUTE_GLOBAL), - ITEM(SAMPLER_TEXTURE), - ITEM(SHADER_RW_IMAGE), - ITEM(SAMPLER_TEXTURE_MSAA), - ITEM(COLOR_BUFFER), - ITEM(DEPTH_BUFFER), - ITEM(COLOR_BUFFER_MSAA), - ITEM(DEPTH_BUFFER_MSAA), - ITEM(SEPARATE_META), - ITEM(SHADER_BINARY), - ITEM(SHADER_RINGS), - ITEM(SCRATCH_BUFFER), - }; + static const char *table[64] = { + ITEM(FENCE), + ITEM(TRACE), + ITEM(SO_FILLED_SIZE), + ITEM(QUERY), + ITEM(IB1), + ITEM(IB2), + ITEM(DRAW_INDIRECT), + ITEM(INDEX_BUFFER), + ITEM(CP_DMA), + ITEM(CONST_BUFFER), + ITEM(DESCRIPTORS), + ITEM(BORDER_COLORS), + ITEM(SAMPLER_BUFFER), + ITEM(VERTEX_BUFFER), + ITEM(SHADER_RW_BUFFER), + ITEM(COMPUTE_GLOBAL), + ITEM(SAMPLER_TEXTURE), + ITEM(SHADER_RW_IMAGE), + ITEM(SAMPLER_TEXTURE_MSAA), + ITEM(COLOR_BUFFER), + ITEM(DEPTH_BUFFER), + ITEM(COLOR_BUFFER_MSAA), + ITEM(DEPTH_BUFFER_MSAA), + ITEM(SEPARATE_META), + ITEM(SHADER_BINARY), + ITEM(SHADER_RINGS), + ITEM(SCRATCH_BUFFER), + }; #undef ITEM - assert(priority < ARRAY_SIZE(table)); - return table[priority]; + assert(priority < ARRAY_SIZE(table)); + return table[priority]; } static int bo_list_compare_va(const struct radeon_bo_list_item *a, - const struct radeon_bo_list_item *b) + const struct radeon_bo_list_item *b) { - return a->vm_address < b->vm_address ? -1 : - a->vm_address > b->vm_address ? 1 : 0; + return a->vm_address < b->vm_address ? -1 : a->vm_address > b->vm_address ? 
1 : 0; } -static void si_dump_bo_list(struct si_context *sctx, - const struct radeon_saved_cs *saved, FILE *f) +static void si_dump_bo_list(struct si_context *sctx, const struct radeon_saved_cs *saved, FILE *f) { - unsigned i,j; - - if (!saved->bo_list) - return; - - /* Sort the list according to VM adddresses first. */ - qsort(saved->bo_list, saved->bo_count, - sizeof(saved->bo_list[0]), (void*)bo_list_compare_va); - - fprintf(f, "Buffer list (in units of pages = 4kB):\n" - COLOR_YELLOW " Size VM start page " - "VM end page Usage" COLOR_RESET "\n"); - - for (i = 0; i < saved->bo_count; i++) { - /* Note: Buffer sizes are expected to be aligned to 4k by the winsys. */ - const unsigned page_size = sctx->screen->info.gart_page_size; - uint64_t va = saved->bo_list[i].vm_address; - uint64_t size = saved->bo_list[i].bo_size; - bool hit = false; - - /* If there's unused virtual memory between 2 buffers, print it. */ - if (i) { - uint64_t previous_va_end = saved->bo_list[i-1].vm_address + - saved->bo_list[i-1].bo_size; - - if (va > previous_va_end) { - fprintf(f, " %10"PRIu64" -- hole --\n", - (va - previous_va_end) / page_size); - } - } - - /* Print the buffer. */ - fprintf(f, " %10"PRIu64" 0x%013"PRIX64" 0x%013"PRIX64" ", - size / page_size, va / page_size, (va + size) / page_size); - - /* Print the usage. */ - for (j = 0; j < 32; j++) { - if (!(saved->bo_list[i].priority_usage & (1u << j))) - continue; - - fprintf(f, "%s%s", !hit ? "" : ", ", priority_to_string(j)); - hit = true; - } - fprintf(f, "\n"); - } - fprintf(f, "\nNote: The holes represent memory not used by the IB.\n" - " Other buffers can still be allocated there.\n\n"); + unsigned i, j; + + if (!saved->bo_list) + return; + + /* Sort the list according to VM adddresses first. */ + qsort(saved->bo_list, saved->bo_count, sizeof(saved->bo_list[0]), (void *)bo_list_compare_va); + + fprintf(f, "Buffer list (in units of pages = 4kB):\n" COLOR_YELLOW + " Size VM start page " + "VM end page Usage" COLOR_RESET "\n"); + + for (i = 0; i < saved->bo_count; i++) { + /* Note: Buffer sizes are expected to be aligned to 4k by the winsys. */ + const unsigned page_size = sctx->screen->info.gart_page_size; + uint64_t va = saved->bo_list[i].vm_address; + uint64_t size = saved->bo_list[i].bo_size; + bool hit = false; + + /* If there's unused virtual memory between 2 buffers, print it. */ + if (i) { + uint64_t previous_va_end = + saved->bo_list[i - 1].vm_address + saved->bo_list[i - 1].bo_size; + + if (va > previous_va_end) { + fprintf(f, " %10" PRIu64 " -- hole --\n", (va - previous_va_end) / page_size); + } + } + + /* Print the buffer. */ + fprintf(f, " %10" PRIu64 " 0x%013" PRIX64 " 0x%013" PRIX64 " ", + size / page_size, va / page_size, (va + size) / page_size); + + /* Print the usage. */ + for (j = 0; j < 32; j++) { + if (!(saved->bo_list[i].priority_usage & (1u << j))) + continue; + + fprintf(f, "%s%s", !hit ? 
"" : ", ", priority_to_string(j)); + hit = true; + } + fprintf(f, "\n"); + } + fprintf(f, "\nNote: The holes represent memory not used by the IB.\n" + " Other buffers can still be allocated there.\n\n"); } static void si_dump_framebuffer(struct si_context *sctx, struct u_log_context *log) { - struct pipe_framebuffer_state *state = &sctx->framebuffer.state; - struct si_texture *tex; - int i; - - for (i = 0; i < state->nr_cbufs; i++) { - if (!state->cbufs[i]) - continue; - - tex = (struct si_texture*)state->cbufs[i]->texture; - u_log_printf(log, COLOR_YELLOW "Color buffer %i:" COLOR_RESET "\n", i); - si_print_texture_info(sctx->screen, tex, log); - u_log_printf(log, "\n"); - } - - if (state->zsbuf) { - tex = (struct si_texture*)state->zsbuf->texture; - u_log_printf(log, COLOR_YELLOW "Depth-stencil buffer:" COLOR_RESET "\n"); - si_print_texture_info(sctx->screen, tex, log); - u_log_printf(log, "\n"); - } + struct pipe_framebuffer_state *state = &sctx->framebuffer.state; + struct si_texture *tex; + int i; + + for (i = 0; i < state->nr_cbufs; i++) { + if (!state->cbufs[i]) + continue; + + tex = (struct si_texture *)state->cbufs[i]->texture; + u_log_printf(log, COLOR_YELLOW "Color buffer %i:" COLOR_RESET "\n", i); + si_print_texture_info(sctx->screen, tex, log); + u_log_printf(log, "\n"); + } + + if (state->zsbuf) { + tex = (struct si_texture *)state->zsbuf->texture; + u_log_printf(log, COLOR_YELLOW "Depth-stencil buffer:" COLOR_RESET "\n"); + si_print_texture_info(sctx->screen, tex, log); + u_log_printf(log, "\n"); + } } typedef unsigned (*slot_remap_func)(unsigned); struct si_log_chunk_desc_list { - /** Pointer to memory map of buffer where the list is uploader */ - uint32_t *gpu_list; - /** Reference of buffer where the list is uploaded, so that gpu_list - * is kept live. */ - struct si_resource *buf; - - const char *shader_name; - const char *elem_name; - slot_remap_func slot_remap; - enum chip_class chip_class; - unsigned element_dw_size; - unsigned num_elements; - - uint32_t list[0]; + /** Pointer to memory map of buffer where the list is uploader */ + uint32_t *gpu_list; + /** Reference of buffer where the list is uploaded, so that gpu_list + * is kept live. */ + struct si_resource *buf; + + const char *shader_name; + const char *elem_name; + slot_remap_func slot_remap; + enum chip_class chip_class; + unsigned element_dw_size; + unsigned num_elements; + + uint32_t list[0]; }; -static void -si_log_chunk_desc_list_destroy(void *data) +static void si_log_chunk_desc_list_destroy(void *data) { - struct si_log_chunk_desc_list *chunk = data; - si_resource_reference(&chunk->buf, NULL); - FREE(chunk); + struct si_log_chunk_desc_list *chunk = data; + si_resource_reference(&chunk->buf, NULL); + FREE(chunk); } -static void -si_log_chunk_desc_list_print(void *data, FILE *f) +static void si_log_chunk_desc_list_print(void *data, FILE *f) { - struct si_log_chunk_desc_list *chunk = data; - unsigned sq_img_rsrc_word0 = chunk->chip_class >= GFX10 ? R_00A000_SQ_IMG_RSRC_WORD0 - : R_008F10_SQ_IMG_RSRC_WORD0; - - for (unsigned i = 0; i < chunk->num_elements; i++) { - unsigned cpu_dw_offset = i * chunk->element_dw_size; - unsigned gpu_dw_offset = chunk->slot_remap(i) * chunk->element_dw_size; - const char *list_note = chunk->gpu_list ? "GPU list" : "CPU list"; - uint32_t *cpu_list = chunk->list + cpu_dw_offset; - uint32_t *gpu_list = chunk->gpu_list ? 
chunk->gpu_list + gpu_dw_offset : cpu_list; - - fprintf(f, COLOR_GREEN "%s%s slot %u (%s):" COLOR_RESET "\n", - chunk->shader_name, chunk->elem_name, i, list_note); - - switch (chunk->element_dw_size) { - case 4: - for (unsigned j = 0; j < 4; j++) - ac_dump_reg(f, chunk->chip_class, - R_008F00_SQ_BUF_RSRC_WORD0 + j*4, - gpu_list[j], 0xffffffff); - break; - case 8: - for (unsigned j = 0; j < 8; j++) - ac_dump_reg(f, chunk->chip_class, - sq_img_rsrc_word0 + j*4, - gpu_list[j], 0xffffffff); - - fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n"); - for (unsigned j = 0; j < 4; j++) - ac_dump_reg(f, chunk->chip_class, - R_008F00_SQ_BUF_RSRC_WORD0 + j*4, - gpu_list[4+j], 0xffffffff); - break; - case 16: - for (unsigned j = 0; j < 8; j++) - ac_dump_reg(f, chunk->chip_class, - sq_img_rsrc_word0 + j*4, - gpu_list[j], 0xffffffff); - - fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n"); - for (unsigned j = 0; j < 4; j++) - ac_dump_reg(f, chunk->chip_class, - R_008F00_SQ_BUF_RSRC_WORD0 + j*4, - gpu_list[4+j], 0xffffffff); - - fprintf(f, COLOR_CYAN " FMASK:" COLOR_RESET "\n"); - for (unsigned j = 0; j < 8; j++) - ac_dump_reg(f, chunk->chip_class, - sq_img_rsrc_word0 + j*4, - gpu_list[8+j], 0xffffffff); - - fprintf(f, COLOR_CYAN " Sampler state:" COLOR_RESET "\n"); - for (unsigned j = 0; j < 4; j++) - ac_dump_reg(f, chunk->chip_class, - R_008F30_SQ_IMG_SAMP_WORD0 + j*4, - gpu_list[12+j], 0xffffffff); - break; - } - - if (memcmp(gpu_list, cpu_list, chunk->element_dw_size * 4) != 0) { - fprintf(f, COLOR_RED "!!!!! This slot was corrupted in GPU memory !!!!!" - COLOR_RESET "\n"); - } - - fprintf(f, "\n"); - } - + struct si_log_chunk_desc_list *chunk = data; + unsigned sq_img_rsrc_word0 = + chunk->chip_class >= GFX10 ? R_00A000_SQ_IMG_RSRC_WORD0 : R_008F10_SQ_IMG_RSRC_WORD0; + + for (unsigned i = 0; i < chunk->num_elements; i++) { + unsigned cpu_dw_offset = i * chunk->element_dw_size; + unsigned gpu_dw_offset = chunk->slot_remap(i) * chunk->element_dw_size; + const char *list_note = chunk->gpu_list ? "GPU list" : "CPU list"; + uint32_t *cpu_list = chunk->list + cpu_dw_offset; + uint32_t *gpu_list = chunk->gpu_list ? 
chunk->gpu_list + gpu_dw_offset : cpu_list; + + fprintf(f, COLOR_GREEN "%s%s slot %u (%s):" COLOR_RESET "\n", chunk->shader_name, + chunk->elem_name, i, list_note); + + switch (chunk->element_dw_size) { + case 4: + for (unsigned j = 0; j < 4; j++) + ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[j], + 0xffffffff); + break; + case 8: + for (unsigned j = 0; j < 8; j++) + ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[j], 0xffffffff); + + fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n"); + for (unsigned j = 0; j < 4; j++) + ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[4 + j], + 0xffffffff); + break; + case 16: + for (unsigned j = 0; j < 8; j++) + ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[j], 0xffffffff); + + fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n"); + for (unsigned j = 0; j < 4; j++) + ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[4 + j], + 0xffffffff); + + fprintf(f, COLOR_CYAN " FMASK:" COLOR_RESET "\n"); + for (unsigned j = 0; j < 8; j++) + ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[8 + j], + 0xffffffff); + + fprintf(f, COLOR_CYAN " Sampler state:" COLOR_RESET "\n"); + for (unsigned j = 0; j < 4; j++) + ac_dump_reg(f, chunk->chip_class, R_008F30_SQ_IMG_SAMP_WORD0 + j * 4, gpu_list[12 + j], + 0xffffffff); + break; + } + + if (memcmp(gpu_list, cpu_list, chunk->element_dw_size * 4) != 0) { + fprintf(f, COLOR_RED "!!!!! This slot was corrupted in GPU memory !!!!!" COLOR_RESET "\n"); + } + + fprintf(f, "\n"); + } } static const struct u_log_chunk_type si_log_chunk_type_descriptor_list = { - .destroy = si_log_chunk_desc_list_destroy, - .print = si_log_chunk_desc_list_print, + .destroy = si_log_chunk_desc_list_destroy, + .print = si_log_chunk_desc_list_print, }; -static void si_dump_descriptor_list(struct si_screen *screen, - struct si_descriptors *desc, - const char *shader_name, - const char *elem_name, - unsigned element_dw_size, - unsigned num_elements, - slot_remap_func slot_remap, - struct u_log_context *log) +static void si_dump_descriptor_list(struct si_screen *screen, struct si_descriptors *desc, + const char *shader_name, const char *elem_name, + unsigned element_dw_size, unsigned num_elements, + slot_remap_func slot_remap, struct u_log_context *log) { - if (!desc->list) - return; - - /* In some cases, the caller doesn't know how many elements are really - * uploaded. Reduce num_elements to fit in the range of active slots. 
*/ - unsigned active_range_dw_begin = - desc->first_active_slot * desc->element_dw_size; - unsigned active_range_dw_end = - active_range_dw_begin + desc->num_active_slots * desc->element_dw_size; - - while (num_elements > 0) { - int i = slot_remap(num_elements - 1); - unsigned dw_begin = i * element_dw_size; - unsigned dw_end = dw_begin + element_dw_size; - - if (dw_begin >= active_range_dw_begin && dw_end <= active_range_dw_end) - break; - - num_elements--; - } - - struct si_log_chunk_desc_list *chunk = - CALLOC_VARIANT_LENGTH_STRUCT(si_log_chunk_desc_list, - 4 * element_dw_size * num_elements); - chunk->shader_name = shader_name; - chunk->elem_name = elem_name; - chunk->element_dw_size = element_dw_size; - chunk->num_elements = num_elements; - chunk->slot_remap = slot_remap; - chunk->chip_class = screen->info.chip_class; - - si_resource_reference(&chunk->buf, desc->buffer); - chunk->gpu_list = desc->gpu_list; - - for (unsigned i = 0; i < num_elements; ++i) { - memcpy(&chunk->list[i * element_dw_size], - &desc->list[slot_remap(i) * element_dw_size], - 4 * element_dw_size); - } - - u_log_chunk(log, &si_log_chunk_type_descriptor_list, chunk); + if (!desc->list) + return; + + /* In some cases, the caller doesn't know how many elements are really + * uploaded. Reduce num_elements to fit in the range of active slots. */ + unsigned active_range_dw_begin = desc->first_active_slot * desc->element_dw_size; + unsigned active_range_dw_end = + active_range_dw_begin + desc->num_active_slots * desc->element_dw_size; + + while (num_elements > 0) { + int i = slot_remap(num_elements - 1); + unsigned dw_begin = i * element_dw_size; + unsigned dw_end = dw_begin + element_dw_size; + + if (dw_begin >= active_range_dw_begin && dw_end <= active_range_dw_end) + break; + + num_elements--; + } + + struct si_log_chunk_desc_list *chunk = + CALLOC_VARIANT_LENGTH_STRUCT(si_log_chunk_desc_list, 4 * element_dw_size * num_elements); + chunk->shader_name = shader_name; + chunk->elem_name = elem_name; + chunk->element_dw_size = element_dw_size; + chunk->num_elements = num_elements; + chunk->slot_remap = slot_remap; + chunk->chip_class = screen->info.chip_class; + + si_resource_reference(&chunk->buf, desc->buffer); + chunk->gpu_list = desc->gpu_list; + + for (unsigned i = 0; i < num_elements; ++i) { + memcpy(&chunk->list[i * element_dw_size], &desc->list[slot_remap(i) * element_dw_size], + 4 * element_dw_size); + } + + u_log_chunk(log, &si_log_chunk_type_descriptor_list, chunk); } static unsigned si_identity(unsigned slot) { - return slot; + return slot; } -static void si_dump_descriptors(struct si_context *sctx, - enum pipe_shader_type processor, - const struct si_shader_info *info, - struct u_log_context *log) +static void si_dump_descriptors(struct si_context *sctx, enum pipe_shader_type processor, + const struct si_shader_info *info, struct u_log_context *log) { - struct si_descriptors *descs = - &sctx->descriptors[SI_DESCS_FIRST_SHADER + - processor * SI_NUM_SHADER_DESCS]; - static const char *shader_name[] = {"VS", "PS", "GS", "TCS", "TES", "CS"}; - const char *name = shader_name[processor]; - unsigned enabled_constbuf, enabled_shaderbuf, enabled_samplers; - unsigned enabled_images; - - if (info) { - enabled_constbuf = info->const_buffers_declared; - enabled_shaderbuf = info->shader_buffers_declared; - enabled_samplers = info->samplers_declared; - enabled_images = info->images_declared; - } else { - enabled_constbuf = sctx->const_and_shader_buffers[processor].enabled_mask >> - SI_NUM_SHADER_BUFFERS; - 
enabled_shaderbuf = sctx->const_and_shader_buffers[processor].enabled_mask & - u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS); - enabled_shaderbuf = util_bitreverse(enabled_shaderbuf) >> - (32 - SI_NUM_SHADER_BUFFERS); - enabled_samplers = sctx->samplers[processor].enabled_mask; - enabled_images = sctx->images[processor].enabled_mask; - } - - if (processor == PIPE_SHADER_VERTEX && - sctx->vb_descriptors_buffer && - sctx->vb_descriptors_gpu_list && - sctx->vertex_elements) { - assert(info); /* only CS may not have an info struct */ - struct si_descriptors desc = {}; - - desc.buffer = sctx->vb_descriptors_buffer; - desc.list = sctx->vb_descriptors_gpu_list; - desc.gpu_list = sctx->vb_descriptors_gpu_list; - desc.element_dw_size = 4; - desc.num_active_slots = sctx->vertex_elements->vb_desc_list_alloc_size / 16; - - si_dump_descriptor_list(sctx->screen, &desc, name, - " - Vertex buffer", 4, info->num_inputs, - si_identity, log); - } - - si_dump_descriptor_list(sctx->screen, - &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], - name, " - Constant buffer", 4, - util_last_bit(enabled_constbuf), - si_get_constbuf_slot, log); - si_dump_descriptor_list(sctx->screen, - &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], - name, " - Shader buffer", 4, - util_last_bit(enabled_shaderbuf), - si_get_shaderbuf_slot, log); - si_dump_descriptor_list(sctx->screen, - &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], - name, " - Sampler", 16, - util_last_bit(enabled_samplers), - si_get_sampler_slot, log); - si_dump_descriptor_list(sctx->screen, - &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], - name, " - Image", 8, - util_last_bit(enabled_images), - si_get_image_slot, log); + struct si_descriptors *descs = + &sctx->descriptors[SI_DESCS_FIRST_SHADER + processor * SI_NUM_SHADER_DESCS]; + static const char *shader_name[] = {"VS", "PS", "GS", "TCS", "TES", "CS"}; + const char *name = shader_name[processor]; + unsigned enabled_constbuf, enabled_shaderbuf, enabled_samplers; + unsigned enabled_images; + + if (info) { + enabled_constbuf = info->const_buffers_declared; + enabled_shaderbuf = info->shader_buffers_declared; + enabled_samplers = info->samplers_declared; + enabled_images = info->images_declared; + } else { + enabled_constbuf = + sctx->const_and_shader_buffers[processor].enabled_mask >> SI_NUM_SHADER_BUFFERS; + enabled_shaderbuf = sctx->const_and_shader_buffers[processor].enabled_mask & + u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS); + enabled_shaderbuf = util_bitreverse(enabled_shaderbuf) >> (32 - SI_NUM_SHADER_BUFFERS); + enabled_samplers = sctx->samplers[processor].enabled_mask; + enabled_images = sctx->images[processor].enabled_mask; + } + + if (processor == PIPE_SHADER_VERTEX && sctx->vb_descriptors_buffer && + sctx->vb_descriptors_gpu_list && sctx->vertex_elements) { + assert(info); /* only CS may not have an info struct */ + struct si_descriptors desc = {}; + + desc.buffer = sctx->vb_descriptors_buffer; + desc.list = sctx->vb_descriptors_gpu_list; + desc.gpu_list = sctx->vb_descriptors_gpu_list; + desc.element_dw_size = 4; + desc.num_active_slots = sctx->vertex_elements->vb_desc_list_alloc_size / 16; + + si_dump_descriptor_list(sctx->screen, &desc, name, " - Vertex buffer", 4, info->num_inputs, + si_identity, log); + } + + si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], name, + " - Constant buffer", 4, util_last_bit(enabled_constbuf), + si_get_constbuf_slot, log); + si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], name, + " - Shader 
buffer", 4, util_last_bit(enabled_shaderbuf), + si_get_shaderbuf_slot, log); + si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], name, + " - Sampler", 16, util_last_bit(enabled_samplers), si_get_sampler_slot, + log); + si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], name, + " - Image", 8, util_last_bit(enabled_images), si_get_image_slot, log); } static void si_dump_gfx_descriptors(struct si_context *sctx, - const struct si_shader_ctx_state *state, - struct u_log_context *log) + const struct si_shader_ctx_state *state, + struct u_log_context *log) { - if (!state->cso || !state->current) - return; + if (!state->cso || !state->current) + return; - si_dump_descriptors(sctx, state->cso->type, &state->cso->info, log); + si_dump_descriptors(sctx, state->cso->type, &state->cso->info, log); } -static void si_dump_compute_descriptors(struct si_context *sctx, - struct u_log_context *log) +static void si_dump_compute_descriptors(struct si_context *sctx, struct u_log_context *log) { - if (!sctx->cs_shader_state.program) - return; + if (!sctx->cs_shader_state.program) + return; - si_dump_descriptors(sctx, PIPE_SHADER_COMPUTE, NULL, log); + si_dump_descriptors(sctx, PIPE_SHADER_COMPUTE, NULL, log); } struct si_shader_inst { - const char *text; /* start of disassembly for this instruction */ - unsigned textlen; - unsigned size; /* instruction size = 4 or 8 */ - uint64_t addr; /* instruction address */ + const char *text; /* start of disassembly for this instruction */ + unsigned textlen; + unsigned size; /* instruction size = 4 or 8 */ + uint64_t addr; /* instruction address */ }; /** @@ -933,344 +865,323 @@ struct si_shader_inst { * The caller must keep \p rtld_binary alive as long as \p instructions are * used and then close it afterwards. */ -static void si_add_split_disasm(struct si_screen *screen, - struct ac_rtld_binary *rtld_binary, - struct si_shader_binary *binary, - uint64_t *addr, - unsigned *num, - struct si_shader_inst *instructions, - enum pipe_shader_type shader_type, - unsigned wave_size) +static void si_add_split_disasm(struct si_screen *screen, struct ac_rtld_binary *rtld_binary, + struct si_shader_binary *binary, uint64_t *addr, unsigned *num, + struct si_shader_inst *instructions, + enum pipe_shader_type shader_type, unsigned wave_size) { - if (!ac_rtld_open(rtld_binary, (struct ac_rtld_open_info){ - .info = &screen->info, - .shader_type = tgsi_processor_to_shader_stage(shader_type), - .wave_size = wave_size, - .num_parts = 1, - .elf_ptrs = &binary->elf_buffer, - .elf_sizes = &binary->elf_size })) - return; - - const char *disasm; - size_t nbytes; - if (!ac_rtld_get_section_by_name(rtld_binary, ".AMDGPU.disasm", - &disasm, &nbytes)) - return; - - const char *end = disasm + nbytes; - while (disasm < end) { - const char *semicolon = memchr(disasm, ';', end - disasm); - if (!semicolon) - break; - - struct si_shader_inst *inst = &instructions[(*num)++]; - const char *inst_end = memchr(semicolon + 1, '\n', end - semicolon - 1); - if (!inst_end) - inst_end = end; - - inst->text = disasm; - inst->textlen = inst_end - disasm; - - inst->addr = *addr; - /* More than 16 chars after ";" means the instruction is 8 bytes long. */ - inst->size = inst_end - semicolon > 16 ? 
8 : 4; - *addr += inst->size; - - if (inst_end == end) - break; - disasm = inst_end + 1; - } + if (!ac_rtld_open(rtld_binary, (struct ac_rtld_open_info){ + .info = &screen->info, + .shader_type = tgsi_processor_to_shader_stage(shader_type), + .wave_size = wave_size, + .num_parts = 1, + .elf_ptrs = &binary->elf_buffer, + .elf_sizes = &binary->elf_size})) + return; + + const char *disasm; + size_t nbytes; + if (!ac_rtld_get_section_by_name(rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) + return; + + const char *end = disasm + nbytes; + while (disasm < end) { + const char *semicolon = memchr(disasm, ';', end - disasm); + if (!semicolon) + break; + + struct si_shader_inst *inst = &instructions[(*num)++]; + const char *inst_end = memchr(semicolon + 1, '\n', end - semicolon - 1); + if (!inst_end) + inst_end = end; + + inst->text = disasm; + inst->textlen = inst_end - disasm; + + inst->addr = *addr; + /* More than 16 chars after ";" means the instruction is 8 bytes long. */ + inst->size = inst_end - semicolon > 16 ? 8 : 4; + *addr += inst->size; + + if (inst_end == end) + break; + disasm = inst_end + 1; + } } /* If the shader is being executed, print its asm instructions, and annotate * those that are being executed right now with information about waves that * execute them. This is most useful during a GPU hang. */ -static void si_print_annotated_shader(struct si_shader *shader, - struct ac_wave_info *waves, - unsigned num_waves, - FILE *f) +static void si_print_annotated_shader(struct si_shader *shader, struct ac_wave_info *waves, + unsigned num_waves, FILE *f) { - if (!shader) - return; - - struct si_screen *screen = shader->selector->screen; - enum pipe_shader_type shader_type = shader->selector->type; - uint64_t start_addr = shader->bo->gpu_address; - uint64_t end_addr = start_addr + shader->bo->b.b.width0; - unsigned i; - - /* See if any wave executes the shader. */ - for (i = 0; i < num_waves; i++) { - if (start_addr <= waves[i].pc && waves[i].pc <= end_addr) - break; - } - if (i == num_waves) - return; /* the shader is not being executed */ - - /* Remember the first found wave. The waves are sorted according to PC. */ - waves = &waves[i]; - num_waves -= i; - - /* Get the list of instructions. - * Buffer size / 4 is the upper bound of the instruction count. - */ - unsigned num_inst = 0; - uint64_t inst_addr = start_addr; - unsigned wave_size = si_get_shader_wave_size(shader); - struct ac_rtld_binary rtld_binaries[5] = {}; - struct si_shader_inst *instructions = - calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst)); - - if (shader->prolog) { - si_add_split_disasm(screen, &rtld_binaries[0], &shader->prolog->binary, - &inst_addr, &num_inst, instructions, shader_type, wave_size); - } - if (shader->previous_stage) { - si_add_split_disasm(screen, &rtld_binaries[1], &shader->previous_stage->binary, - &inst_addr, &num_inst, instructions, shader_type, wave_size); - } - if (shader->prolog2) { - si_add_split_disasm(screen, &rtld_binaries[2], &shader->prolog2->binary, - &inst_addr, &num_inst, instructions, shader_type, wave_size); - } - si_add_split_disasm(screen, &rtld_binaries[3], &shader->binary, - &inst_addr, &num_inst, instructions, shader_type, wave_size); - if (shader->epilog) { - si_add_split_disasm(screen, &rtld_binaries[4], &shader->epilog->binary, - &inst_addr, &num_inst, instructions, shader_type, wave_size); - } - - fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n", - si_get_shader_name(shader)); - - /* Print instructions with annotations. 
*/ - for (i = 0; i < num_inst; i++) { - struct si_shader_inst *inst = &instructions[i]; - - fprintf(f, "%.*s [PC=0x%"PRIx64", size=%u]\n", - inst->textlen, inst->text, inst->addr, inst->size); - - /* Print which waves execute the instruction right now. */ - while (num_waves && inst->addr == waves->pc) { - fprintf(f, - " " COLOR_GREEN "^ SE%u SH%u CU%u " - "SIMD%u WAVE%u EXEC=%016"PRIx64 " ", - waves->se, waves->sh, waves->cu, waves->simd, - waves->wave, waves->exec); - - if (inst->size == 4) { - fprintf(f, "INST32=%08X" COLOR_RESET "\n", - waves->inst_dw0); - } else { - fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n", - waves->inst_dw0, waves->inst_dw1); - } - - waves->matched = true; - waves = &waves[1]; - num_waves--; - } - } - - fprintf(f, "\n\n"); - free(instructions); - for (unsigned i = 0; i < ARRAY_SIZE(rtld_binaries); ++i) - ac_rtld_close(&rtld_binaries[i]); + if (!shader) + return; + + struct si_screen *screen = shader->selector->screen; + enum pipe_shader_type shader_type = shader->selector->type; + uint64_t start_addr = shader->bo->gpu_address; + uint64_t end_addr = start_addr + shader->bo->b.b.width0; + unsigned i; + + /* See if any wave executes the shader. */ + for (i = 0; i < num_waves; i++) { + if (start_addr <= waves[i].pc && waves[i].pc <= end_addr) + break; + } + if (i == num_waves) + return; /* the shader is not being executed */ + + /* Remember the first found wave. The waves are sorted according to PC. */ + waves = &waves[i]; + num_waves -= i; + + /* Get the list of instructions. + * Buffer size / 4 is the upper bound of the instruction count. + */ + unsigned num_inst = 0; + uint64_t inst_addr = start_addr; + unsigned wave_size = si_get_shader_wave_size(shader); + struct ac_rtld_binary rtld_binaries[5] = {}; + struct si_shader_inst *instructions = + calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst)); + + if (shader->prolog) { + si_add_split_disasm(screen, &rtld_binaries[0], &shader->prolog->binary, &inst_addr, &num_inst, + instructions, shader_type, wave_size); + } + if (shader->previous_stage) { + si_add_split_disasm(screen, &rtld_binaries[1], &shader->previous_stage->binary, &inst_addr, + &num_inst, instructions, shader_type, wave_size); + } + if (shader->prolog2) { + si_add_split_disasm(screen, &rtld_binaries[2], &shader->prolog2->binary, &inst_addr, + &num_inst, instructions, shader_type, wave_size); + } + si_add_split_disasm(screen, &rtld_binaries[3], &shader->binary, &inst_addr, &num_inst, + instructions, shader_type, wave_size); + if (shader->epilog) { + si_add_split_disasm(screen, &rtld_binaries[4], &shader->epilog->binary, &inst_addr, &num_inst, + instructions, shader_type, wave_size); + } + + fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n", + si_get_shader_name(shader)); + + /* Print instructions with annotations. */ + for (i = 0; i < num_inst; i++) { + struct si_shader_inst *inst = &instructions[i]; + + fprintf(f, "%.*s [PC=0x%" PRIx64 ", size=%u]\n", inst->textlen, inst->text, inst->addr, + inst->size); + + /* Print which waves execute the instruction right now. 
*/ + while (num_waves && inst->addr == waves->pc) { + fprintf(f, + " " COLOR_GREEN "^ SE%u SH%u CU%u " + "SIMD%u WAVE%u EXEC=%016" PRIx64 " ", + waves->se, waves->sh, waves->cu, waves->simd, waves->wave, waves->exec); + + if (inst->size == 4) { + fprintf(f, "INST32=%08X" COLOR_RESET "\n", waves->inst_dw0); + } else { + fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n", waves->inst_dw0, waves->inst_dw1); + } + + waves->matched = true; + waves = &waves[1]; + num_waves--; + } + } + + fprintf(f, "\n\n"); + free(instructions); + for (unsigned i = 0; i < ARRAY_SIZE(rtld_binaries); ++i) + ac_rtld_close(&rtld_binaries[i]); } static void si_dump_annotated_shaders(struct si_context *sctx, FILE *f) { - struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]; - unsigned num_waves = ac_get_wave_info(sctx->chip_class, waves); - - fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET - "\n\n", num_waves); - - si_print_annotated_shader(sctx->vs_shader.current, waves, num_waves, f); - si_print_annotated_shader(sctx->tcs_shader.current, waves, num_waves, f); - si_print_annotated_shader(sctx->tes_shader.current, waves, num_waves, f); - si_print_annotated_shader(sctx->gs_shader.current, waves, num_waves, f); - si_print_annotated_shader(sctx->ps_shader.current, waves, num_waves, f); - - /* Print waves executing shaders that are not currently bound. */ - unsigned i; - bool found = false; - for (i = 0; i < num_waves; i++) { - if (waves[i].matched) - continue; - - if (!found) { - fprintf(f, COLOR_CYAN - "Waves not executing currently-bound shaders:" - COLOR_RESET "\n"); - found = true; - } - fprintf(f, " SE%u SH%u CU%u SIMD%u WAVE%u EXEC=%016"PRIx64 - " INST=%08X %08X PC=%"PRIx64"\n", - waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd, - waves[i].wave, waves[i].exec, waves[i].inst_dw0, - waves[i].inst_dw1, waves[i].pc); - } - if (found) - fprintf(f, "\n\n"); + struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]; + unsigned num_waves = ac_get_wave_info(sctx->chip_class, waves); + + fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET "\n\n", num_waves); + + si_print_annotated_shader(sctx->vs_shader.current, waves, num_waves, f); + si_print_annotated_shader(sctx->tcs_shader.current, waves, num_waves, f); + si_print_annotated_shader(sctx->tes_shader.current, waves, num_waves, f); + si_print_annotated_shader(sctx->gs_shader.current, waves, num_waves, f); + si_print_annotated_shader(sctx->ps_shader.current, waves, num_waves, f); + + /* Print waves executing shaders that are not currently bound. 
*/ + unsigned i; + bool found = false; + for (i = 0; i < num_waves; i++) { + if (waves[i].matched) + continue; + + if (!found) { + fprintf(f, COLOR_CYAN "Waves not executing currently-bound shaders:" COLOR_RESET "\n"); + found = true; + } + fprintf(f, + " SE%u SH%u CU%u SIMD%u WAVE%u EXEC=%016" PRIx64 " INST=%08X %08X PC=%" PRIx64 + "\n", + waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd, waves[i].wave, waves[i].exec, + waves[i].inst_dw0, waves[i].inst_dw1, waves[i].pc); + } + if (found) + fprintf(f, "\n\n"); } static void si_dump_command(const char *title, const char *command, FILE *f) { - char line[2000]; + char line[2000]; - FILE *p = popen(command, "r"); - if (!p) - return; + FILE *p = popen(command, "r"); + if (!p) + return; - fprintf(f, COLOR_YELLOW "%s: " COLOR_RESET "\n", title); - while (fgets(line, sizeof(line), p)) - fputs(line, f); - fprintf(f, "\n\n"); - pclose(p); + fprintf(f, COLOR_YELLOW "%s: " COLOR_RESET "\n", title); + while (fgets(line, sizeof(line), p)) + fputs(line, f); + fprintf(f, "\n\n"); + pclose(p); } -static void si_dump_debug_state(struct pipe_context *ctx, FILE *f, - unsigned flags) +static void si_dump_debug_state(struct pipe_context *ctx, FILE *f, unsigned flags) { - struct si_context *sctx = (struct si_context*)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (sctx->log) - u_log_flush(sctx->log); + if (sctx->log) + u_log_flush(sctx->log); - if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) { - si_dump_debug_registers(sctx, f); + if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) { + si_dump_debug_registers(sctx, f); - si_dump_annotated_shaders(sctx, f); - si_dump_command("Active waves (raw data)", "umr -O halt_waves -wa | column -t", f); - si_dump_command("Wave information", "umr -O halt_waves,bits -wa", f); - } + si_dump_annotated_shaders(sctx, f); + si_dump_command("Active waves (raw data)", "umr -O halt_waves -wa | column -t", f); + si_dump_command("Wave information", "umr -O halt_waves,bits -wa", f); + } } void si_log_draw_state(struct si_context *sctx, struct u_log_context *log) { - struct si_shader_ctx_state *tcs_shader; - - if (!log) - return; - - tcs_shader = &sctx->tcs_shader; - if (sctx->tes_shader.cso && !sctx->tcs_shader.cso) - tcs_shader = &sctx->fixed_func_tcs_shader; - - si_dump_framebuffer(sctx, log); - - si_dump_gfx_shader(sctx, &sctx->vs_shader, log); - si_dump_gfx_shader(sctx, tcs_shader, log); - si_dump_gfx_shader(sctx, &sctx->tes_shader, log); - si_dump_gfx_shader(sctx, &sctx->gs_shader, log); - si_dump_gfx_shader(sctx, &sctx->ps_shader, log); - - si_dump_descriptor_list(sctx->screen, - &sctx->descriptors[SI_DESCS_RW_BUFFERS], - "", "RW buffers", 4, - sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots, - si_identity, log); - si_dump_gfx_descriptors(sctx, &sctx->vs_shader, log); - si_dump_gfx_descriptors(sctx, tcs_shader, log); - si_dump_gfx_descriptors(sctx, &sctx->tes_shader, log); - si_dump_gfx_descriptors(sctx, &sctx->gs_shader, log); - si_dump_gfx_descriptors(sctx, &sctx->ps_shader, log); + struct si_shader_ctx_state *tcs_shader; + + if (!log) + return; + + tcs_shader = &sctx->tcs_shader; + if (sctx->tes_shader.cso && !sctx->tcs_shader.cso) + tcs_shader = &sctx->fixed_func_tcs_shader; + + si_dump_framebuffer(sctx, log); + + si_dump_gfx_shader(sctx, &sctx->vs_shader, log); + si_dump_gfx_shader(sctx, tcs_shader, log); + si_dump_gfx_shader(sctx, &sctx->tes_shader, log); + si_dump_gfx_shader(sctx, &sctx->gs_shader, log); + si_dump_gfx_shader(sctx, &sctx->ps_shader, log); + + si_dump_descriptor_list(sctx->screen, 
&sctx->descriptors[SI_DESCS_RW_BUFFERS], "", "RW buffers", + 4, sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots, si_identity, + log); + si_dump_gfx_descriptors(sctx, &sctx->vs_shader, log); + si_dump_gfx_descriptors(sctx, tcs_shader, log); + si_dump_gfx_descriptors(sctx, &sctx->tes_shader, log); + si_dump_gfx_descriptors(sctx, &sctx->gs_shader, log); + si_dump_gfx_descriptors(sctx, &sctx->ps_shader, log); } void si_log_compute_state(struct si_context *sctx, struct u_log_context *log) { - if (!log) - return; + if (!log) + return; - si_dump_compute_shader(sctx, log); - si_dump_compute_descriptors(sctx, log); + si_dump_compute_shader(sctx, log); + si_dump_compute_descriptors(sctx, log); } -static void si_dump_dma(struct si_context *sctx, - struct radeon_saved_cs *saved, FILE *f) +static void si_dump_dma(struct si_context *sctx, struct radeon_saved_cs *saved, FILE *f) { - static const char ib_name[] = "sDMA IB"; - unsigned i; + static const char ib_name[] = "sDMA IB"; + unsigned i; - si_dump_bo_list(sctx, saved, f); + si_dump_bo_list(sctx, saved, f); - fprintf(f, "------------------ %s begin ------------------\n", ib_name); + fprintf(f, "------------------ %s begin ------------------\n", ib_name); - for (i = 0; i < saved->num_dw; ++i) { - fprintf(f, " %08x\n", saved->ib[i]); - } + for (i = 0; i < saved->num_dw; ++i) { + fprintf(f, " %08x\n", saved->ib[i]); + } - fprintf(f, "------------------- %s end -------------------\n", ib_name); - fprintf(f, "\n"); + fprintf(f, "------------------- %s end -------------------\n", ib_name); + fprintf(f, "\n"); - fprintf(f, "SDMA Dump Done.\n"); + fprintf(f, "SDMA Dump Done.\n"); } -void si_check_vm_faults(struct si_context *sctx, - struct radeon_saved_cs *saved, enum ring_type ring) +void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring) { - struct pipe_screen *screen = sctx->b.screen; - FILE *f; - uint64_t addr; - char cmd_line[4096]; - - if (!ac_vm_fault_occured(sctx->chip_class, - &sctx->dmesg_timestamp, &addr)) - return; - - f = dd_get_debug_file(false); - if (!f) - return; - - fprintf(f, "VM fault report.\n\n"); - if (os_get_command_line(cmd_line, sizeof(cmd_line))) - fprintf(f, "Command: %s\n", cmd_line); - fprintf(f, "Driver vendor: %s\n", screen->get_vendor(screen)); - fprintf(f, "Device vendor: %s\n", screen->get_device_vendor(screen)); - fprintf(f, "Device name: %s\n\n", screen->get_name(screen)); - fprintf(f, "Failing VM page: 0x%08"PRIx64"\n\n", addr); - - if (sctx->apitrace_call_number) - fprintf(f, "Last apitrace call: %u\n\n", - sctx->apitrace_call_number); - - switch (ring) { - case RING_GFX: { - struct u_log_context log; - u_log_context_init(&log); - - si_log_draw_state(sctx, &log); - si_log_compute_state(sctx, &log); - si_log_cs(sctx, &log, true); - - u_log_new_page_print(&log, f); - u_log_context_destroy(&log); - break; - } - case RING_DMA: - si_dump_dma(sctx, saved, f); - break; - - default: - break; - } - - fclose(f); - - fprintf(stderr, "Detected a VM fault, exiting...\n"); - exit(0); + struct pipe_screen *screen = sctx->b.screen; + FILE *f; + uint64_t addr; + char cmd_line[4096]; + + if (!ac_vm_fault_occured(sctx->chip_class, &sctx->dmesg_timestamp, &addr)) + return; + + f = dd_get_debug_file(false); + if (!f) + return; + + fprintf(f, "VM fault report.\n\n"); + if (os_get_command_line(cmd_line, sizeof(cmd_line))) + fprintf(f, "Command: %s\n", cmd_line); + fprintf(f, "Driver vendor: %s\n", screen->get_vendor(screen)); + fprintf(f, "Device vendor: %s\n", 
screen->get_device_vendor(screen)); + fprintf(f, "Device name: %s\n\n", screen->get_name(screen)); + fprintf(f, "Failing VM page: 0x%08" PRIx64 "\n\n", addr); + + if (sctx->apitrace_call_number) + fprintf(f, "Last apitrace call: %u\n\n", sctx->apitrace_call_number); + + switch (ring) { + case RING_GFX: { + struct u_log_context log; + u_log_context_init(&log); + + si_log_draw_state(sctx, &log); + si_log_compute_state(sctx, &log); + si_log_cs(sctx, &log, true); + + u_log_new_page_print(&log, f); + u_log_context_destroy(&log); + break; + } + case RING_DMA: + si_dump_dma(sctx, saved, f); + break; + + default: + break; + } + + fclose(f); + + fprintf(stderr, "Detected a VM fault, exiting...\n"); + exit(0); } void si_init_debug_functions(struct si_context *sctx) { - sctx->b.dump_debug_state = si_dump_debug_state; - - /* Set the initial dmesg timestamp for this context, so that - * only new messages will be checked for VM faults. - */ - if (sctx->screen->debug_flags & DBG(CHECK_VM)) - ac_vm_fault_occured(sctx->chip_class, - &sctx->dmesg_timestamp, NULL); + sctx->b.dump_debug_state = si_dump_debug_state; + + /* Set the initial dmesg timestamp for this context, so that + * only new messages will be checked for VM faults. + */ + if (sctx->screen->debug_flags & DBG(CHECK_VM)) + ac_vm_fault_occured(sctx->chip_class, &sctx->dmesg_timestamp, NULL); } diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h index b0e8db8646a..83c7425e094 100644 --- a/src/gallium/drivers/radeonsi/si_debug_options.h +++ b/src/gallium/drivers/radeonsi/si_debug_options.h @@ -1,9 +1,11 @@ OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context") OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)") OPT_BOOL(dump_shader_binary, false, "Dump shader binary as part of ddebug_dumps") -OPT_BOOL(debug_disassembly, false, "Report shader disassembly as part of driver debug messages (for shader db)") +OPT_BOOL(debug_disassembly, false, + "Report shader disassembly as part of driver debug messages (for shader db)") OPT_BOOL(halt_shaders, false, "Halt shaders at the start (will hang)") -OPT_BOOL(vs_fetch_always_opencode, false, "Always open code vertex fetches (less efficient, purely for testing)") +OPT_BOOL(vs_fetch_always_opencode, false, + "Always open code vertex fetches (less efficient, purely for testing)") OPT_BOOL(prim_restart_tri_strips_only, false, "Only enable primitive restart for triangle strips") #undef OPT_BOOL diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index fa2174bac5d..bf3ede49b39 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -55,14 +55,12 @@ #include "si_pipe.h" #include "sid.h" - +#include "util/format/u_format.h" #include "util/hash_table.h" #include "util/u_idalloc.h" -#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_upload_mgr.h" - /* NULL image and buffer descriptor for textures (alpha = 1) and images * (alpha = 0). * @@ -75,221 +73,197 @@ * This is the only reason why the buffer descriptor must be in words [4:7]. 
*/ static uint32_t null_texture_descriptor[8] = { - 0, - 0, - 0, - S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) | - S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D) - /* the rest must contain zeros, which is also used by the buffer - * descriptor */ + 0, 0, 0, S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) | S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D) + /* the rest must contain zeros, which is also used by the buffer + * descriptor */ }; static uint32_t null_image_descriptor[8] = { - 0, - 0, - 0, - S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D) - /* the rest must contain zeros, which is also used by the buffer - * descriptor */ + 0, 0, 0, S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D) + /* the rest must contain zeros, which is also used by the buffer + * descriptor */ }; static uint64_t si_desc_extract_buffer_address(const uint32_t *desc) { - uint64_t va = desc[0] | - ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32); + uint64_t va = desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32); - /* Sign-extend the 48-bit address. */ - va <<= 16; - va = (int64_t)va >> 16; - return va; + /* Sign-extend the 48-bit address. */ + va <<= 16; + va = (int64_t)va >> 16; + return va; } -static void si_init_descriptor_list(uint32_t *desc_list, - unsigned element_dw_size, - unsigned num_elements, - const uint32_t *null_descriptor) +static void si_init_descriptor_list(uint32_t *desc_list, unsigned element_dw_size, + unsigned num_elements, const uint32_t *null_descriptor) { - int i; + int i; - /* Initialize the array to NULL descriptors if the element size is 8. */ - if (null_descriptor) { - assert(element_dw_size % 8 == 0); - for (i = 0; i < num_elements * element_dw_size / 8; i++) - memcpy(desc_list + i * 8, null_descriptor, 8 * 4); - } + /* Initialize the array to NULL descriptors if the element size is 8. */ + if (null_descriptor) { + assert(element_dw_size % 8 == 0); + for (i = 0; i < num_elements * element_dw_size / 8; i++) + memcpy(desc_list + i * 8, null_descriptor, 8 * 4); + } } -static void si_init_descriptors(struct si_descriptors *desc, - short shader_userdata_rel_index, - unsigned element_dw_size, - unsigned num_elements) +static void si_init_descriptors(struct si_descriptors *desc, short shader_userdata_rel_index, + unsigned element_dw_size, unsigned num_elements) { - desc->list = CALLOC(num_elements, element_dw_size * 4); - desc->element_dw_size = element_dw_size; - desc->num_elements = num_elements; - desc->shader_userdata_offset = shader_userdata_rel_index * 4; - desc->slot_index_to_bind_directly = -1; + desc->list = CALLOC(num_elements, element_dw_size * 4); + desc->element_dw_size = element_dw_size; + desc->num_elements = num_elements; + desc->shader_userdata_offset = shader_userdata_rel_index * 4; + desc->slot_index_to_bind_directly = -1; } static void si_release_descriptors(struct si_descriptors *desc) { - si_resource_reference(&desc->buffer, NULL); - FREE(desc->list); + si_resource_reference(&desc->buffer, NULL); + FREE(desc->list); } -static bool si_upload_descriptors(struct si_context *sctx, - struct si_descriptors *desc) +static bool si_upload_descriptors(struct si_context *sctx, struct si_descriptors *desc) { - unsigned slot_size = desc->element_dw_size * 4; - unsigned first_slot_offset = desc->first_active_slot * slot_size; - unsigned upload_size = desc->num_active_slots * slot_size; + unsigned slot_size = desc->element_dw_size * 4; + unsigned first_slot_offset = desc->first_active_slot * slot_size; + unsigned upload_size = desc->num_active_slots * slot_size; - /* Skip the upload if no shader is using the descriptors. 
dirty_mask - * will stay dirty and the descriptors will be uploaded when there is - * a shader using them. - */ - if (!upload_size) - return true; + /* Skip the upload if no shader is using the descriptors. dirty_mask + * will stay dirty and the descriptors will be uploaded when there is + * a shader using them. + */ + if (!upload_size) + return true; - /* If there is just one active descriptor, bind it directly. */ - if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly && - desc->num_active_slots == 1) { - uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly * - desc->element_dw_size]; + /* If there is just one active descriptor, bind it directly. */ + if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly && + desc->num_active_slots == 1) { + uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly * desc->element_dw_size]; - /* The buffer is already in the buffer list. */ - si_resource_reference(&desc->buffer, NULL); - desc->gpu_list = NULL; - desc->gpu_address = si_desc_extract_buffer_address(descriptor); - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - return true; - } + /* The buffer is already in the buffer list. */ + si_resource_reference(&desc->buffer, NULL); + desc->gpu_list = NULL; + desc->gpu_address = si_desc_extract_buffer_address(descriptor); + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + return true; + } - uint32_t *ptr; - unsigned buffer_offset; - u_upload_alloc(sctx->b.const_uploader, first_slot_offset, upload_size, - si_optimal_tcc_alignment(sctx, upload_size), - &buffer_offset, (struct pipe_resource**)&desc->buffer, - (void**)&ptr); - if (!desc->buffer) { - desc->gpu_address = 0; - return false; /* skip the draw call */ - } + uint32_t *ptr; + unsigned buffer_offset; + u_upload_alloc(sctx->b.const_uploader, first_slot_offset, upload_size, + si_optimal_tcc_alignment(sctx, upload_size), &buffer_offset, + (struct pipe_resource **)&desc->buffer, (void **)&ptr); + if (!desc->buffer) { + desc->gpu_address = 0; + return false; /* skip the draw call */ + } - util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset, - upload_size); - desc->gpu_list = ptr - first_slot_offset / 4; + util_memcpy_cpu_to_le32(ptr, (char *)desc->list + first_slot_offset, upload_size); + desc->gpu_list = ptr - first_slot_offset / 4; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, - RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); - /* The shader pointer should point to slot 0. */ - buffer_offset -= first_slot_offset; - desc->gpu_address = desc->buffer->gpu_address + buffer_offset; + /* The shader pointer should point to slot 0. 
*/ + buffer_offset -= first_slot_offset; + desc->gpu_address = desc->buffer->gpu_address + buffer_offset; - assert(desc->buffer->flags & RADEON_FLAG_32BIT); - assert((desc->buffer->gpu_address >> 32) == sctx->screen->info.address32_hi); - assert((desc->gpu_address >> 32) == sctx->screen->info.address32_hi); + assert(desc->buffer->flags & RADEON_FLAG_32BIT); + assert((desc->buffer->gpu_address >> 32) == sctx->screen->info.address32_hi); + assert((desc->gpu_address >> 32) == sctx->screen->info.address32_hi); - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - return true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + return true; } -static void -si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc) +static void si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc) { - if (!desc->buffer) - return; + if (!desc->buffer) + return; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, - RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); } /* SAMPLER VIEWS */ -static inline enum radeon_bo_priority -si_get_sampler_view_priority(struct si_resource *res) +static inline enum radeon_bo_priority si_get_sampler_view_priority(struct si_resource *res) { - if (res->b.b.target == PIPE_BUFFER) - return RADEON_PRIO_SAMPLER_BUFFER; + if (res->b.b.target == PIPE_BUFFER) + return RADEON_PRIO_SAMPLER_BUFFER; - if (res->b.b.nr_samples > 1) - return RADEON_PRIO_SAMPLER_TEXTURE_MSAA; + if (res->b.b.nr_samples > 1) + return RADEON_PRIO_SAMPLER_TEXTURE_MSAA; - return RADEON_PRIO_SAMPLER_TEXTURE; + return RADEON_PRIO_SAMPLER_TEXTURE; } -static struct si_descriptors * -si_sampler_and_image_descriptors(struct si_context *sctx, unsigned shader) +static struct si_descriptors *si_sampler_and_image_descriptors(struct si_context *sctx, + unsigned shader) { - return &sctx->descriptors[si_sampler_and_image_descriptors_idx(shader)]; + return &sctx->descriptors[si_sampler_and_image_descriptors_idx(shader)]; } static void si_release_sampler_views(struct si_samplers *samplers) { - int i; + int i; - for (i = 0; i < ARRAY_SIZE(samplers->views); i++) { - pipe_sampler_view_reference(&samplers->views[i], NULL); - } + for (i = 0; i < ARRAY_SIZE(samplers->views); i++) { + pipe_sampler_view_reference(&samplers->views[i], NULL); + } } -static void si_sampler_view_add_buffer(struct si_context *sctx, - struct pipe_resource *resource, - enum radeon_bo_usage usage, - bool is_stencil_sampler, - bool check_mem) +static void si_sampler_view_add_buffer(struct si_context *sctx, struct pipe_resource *resource, + enum radeon_bo_usage usage, bool is_stencil_sampler, + bool check_mem) { - struct si_texture *tex = (struct si_texture*)resource; - enum radeon_bo_priority priority; + struct si_texture *tex = (struct si_texture *)resource; + enum radeon_bo_priority priority; - if (!resource) - return; + if (!resource) + return; - /* Use the flushed depth texture if direct sampling is unsupported. */ - if (resource->target != PIPE_BUFFER && - tex->is_depth && !si_can_sample_zs(tex, is_stencil_sampler)) - tex = tex->flushed_depth_texture; + /* Use the flushed depth texture if direct sampling is unsupported. 
*/ + if (resource->target != PIPE_BUFFER && tex->is_depth && + !si_can_sample_zs(tex, is_stencil_sampler)) + tex = tex->flushed_depth_texture; - priority = si_get_sampler_view_priority(&tex->buffer); - radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority, - check_mem); + priority = si_get_sampler_view_priority(&tex->buffer); + radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority, check_mem); - if (resource->target == PIPE_BUFFER) - return; + if (resource->target == PIPE_BUFFER) + return; - /* Add separate DCC. */ - if (tex->dcc_separate_buffer) { - radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer, - usage, RADEON_PRIO_SEPARATE_META, check_mem); - } + /* Add separate DCC. */ + if (tex->dcc_separate_buffer) { + radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer, usage, + RADEON_PRIO_SEPARATE_META, check_mem); + } } -static void si_sampler_views_begin_new_cs(struct si_context *sctx, - struct si_samplers *samplers) +static void si_sampler_views_begin_new_cs(struct si_context *sctx, struct si_samplers *samplers) { - unsigned mask = samplers->enabled_mask; + unsigned mask = samplers->enabled_mask; - /* Add buffers to the CS. */ - while (mask) { - int i = u_bit_scan(&mask); - struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[i]; + /* Add buffers to the CS. */ + while (mask) { + int i = u_bit_scan(&mask); + struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[i]; - si_sampler_view_add_buffer(sctx, sview->base.texture, - RADEON_USAGE_READ, - sview->is_stencil_sampler, false); - } + si_sampler_view_add_buffer(sctx, sview->base.texture, RADEON_USAGE_READ, + sview->is_stencil_sampler, false); + } } /* Set buffer descriptor fields that can be changed by reallocations. */ -static void si_set_buf_desc_address(struct si_resource *buf, - uint64_t offset, uint32_t *state) +static void si_set_buf_desc_address(struct si_resource *buf, uint64_t offset, uint32_t *state) { - uint64_t va = buf->gpu_address + offset; + uint64_t va = buf->gpu_address + offset; - state[0] = va; - state[1] &= C_008F04_BASE_ADDRESS_HI; - state[1] |= S_008F04_BASE_ADDRESS_HI(va >> 32); + state[0] = va; + state[1] &= C_008F04_BASE_ADDRESS_HI; + state[1] |= S_008F04_BASE_ADDRESS_HI(va >> 32); } /* Set texture descriptor fields that can be changed by reallocations. @@ -302,1316 +276,1195 @@ static void si_set_buf_desc_address(struct si_resource *buf, * \param is_stencil select between separate Z & Stencil * \param state descriptor to update */ -void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, - struct si_texture *tex, - const struct legacy_surf_level *base_level_info, - unsigned base_level, unsigned first_level, - unsigned block_width, bool is_stencil, - uint32_t *state) -{ - uint64_t va, meta_va = 0; - - if (tex->is_depth && !si_can_sample_zs(tex, is_stencil)) { - tex = tex->flushed_depth_texture; - is_stencil = false; - } - - va = tex->buffer.gpu_address; - - if (sscreen->info.chip_class >= GFX9) { - /* Only stencil_offset needs to be added here. */ - if (is_stencil) - va += tex->surface.u.gfx9.stencil_offset; - else - va += tex->surface.u.gfx9.surf_offset; - } else { - va += base_level_info->offset; - } - - state[0] = va >> 8; - state[1] &= C_008F14_BASE_ADDRESS_HI; - state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); - - /* Only macrotiled modes can set tile swizzle. - * GFX9 doesn't use (legacy) base_level_info. 
- */ - if (sscreen->info.chip_class >= GFX9 || - base_level_info->mode == RADEON_SURF_MODE_2D) - state[0] |= tex->surface.tile_swizzle; - - if (sscreen->info.chip_class >= GFX8) { - state[6] &= C_008F28_COMPRESSION_EN; - - if (vi_dcc_enabled(tex, first_level)) { - meta_va = (!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + - tex->surface.dcc_offset; - - if (sscreen->info.chip_class == GFX8) { - meta_va += base_level_info->dcc_offset; - assert(base_level_info->mode == RADEON_SURF_MODE_2D); - } - - unsigned dcc_tile_swizzle = tex->surface.tile_swizzle << 8; - dcc_tile_swizzle &= tex->surface.dcc_alignment - 1; - meta_va |= dcc_tile_swizzle; - } else if (vi_tc_compat_htile_enabled(tex, first_level, - is_stencil ? PIPE_MASK_S : PIPE_MASK_Z)) { - meta_va = tex->buffer.gpu_address + tex->surface.htile_offset; - } - - if (meta_va) - state[6] |= S_008F28_COMPRESSION_EN(1); - } - - if (sscreen->info.chip_class >= GFX8 && sscreen->info.chip_class <= GFX9) - state[7] = meta_va >> 8; - - if (sscreen->info.chip_class >= GFX10) { - state[3] &= C_00A00C_SW_MODE; - - if (is_stencil) { - state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); - } else { - state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode); - } - - state[6] &= C_00A018_META_DATA_ADDRESS_LO & - C_00A018_META_PIPE_ALIGNED; - - if (meta_va) { - struct gfx9_surf_meta_flags meta; - - if (tex->surface.dcc_offset) - meta = tex->surface.u.gfx9.dcc; - else - meta = tex->surface.u.gfx9.htile; - - state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) | - S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8); - } - - state[7] = meta_va >> 16; - } else if (sscreen->info.chip_class == GFX9) { - state[3] &= C_008F1C_SW_MODE; - state[4] &= C_008F20_PITCH; - - if (is_stencil) { - state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); - state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.stencil.epitch); - } else { - state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode); - state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.surf.epitch); - } - - state[5] &= C_008F24_META_DATA_ADDRESS & - C_008F24_META_PIPE_ALIGNED & - C_008F24_META_RB_ALIGNED; - if (meta_va) { - struct gfx9_surf_meta_flags meta; - - if (tex->surface.dcc_offset) - meta = tex->surface.u.gfx9.dcc; - else - meta = tex->surface.u.gfx9.htile; - - state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) | - S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) | - S_008F24_META_RB_ALIGNED(meta.rb_aligned); - } - } else { - /* GFX6-GFX8 */ - unsigned pitch = base_level_info->nblk_x * block_width; - unsigned index = si_tile_mode_index(tex, base_level, is_stencil); - - state[3] &= C_008F1C_TILING_INDEX; - state[3] |= S_008F1C_TILING_INDEX(index); - state[4] &= C_008F20_PITCH; - state[4] |= S_008F20_PITCH(pitch - 1); - } +void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex, + const struct legacy_surf_level *base_level_info, + unsigned base_level, unsigned first_level, unsigned block_width, + bool is_stencil, uint32_t *state) +{ + uint64_t va, meta_va = 0; + + if (tex->is_depth && !si_can_sample_zs(tex, is_stencil)) { + tex = tex->flushed_depth_texture; + is_stencil = false; + } + + va = tex->buffer.gpu_address; + + if (sscreen->info.chip_class >= GFX9) { + /* Only stencil_offset needs to be added here. 
*/ + if (is_stencil) + va += tex->surface.u.gfx9.stencil_offset; + else + va += tex->surface.u.gfx9.surf_offset; + } else { + va += base_level_info->offset; + } + + state[0] = va >> 8; + state[1] &= C_008F14_BASE_ADDRESS_HI; + state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); + + /* Only macrotiled modes can set tile swizzle. + * GFX9 doesn't use (legacy) base_level_info. + */ + if (sscreen->info.chip_class >= GFX9 || base_level_info->mode == RADEON_SURF_MODE_2D) + state[0] |= tex->surface.tile_swizzle; + + if (sscreen->info.chip_class >= GFX8) { + state[6] &= C_008F28_COMPRESSION_EN; + + if (vi_dcc_enabled(tex, first_level)) { + meta_va = + (!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.dcc_offset; + + if (sscreen->info.chip_class == GFX8) { + meta_va += base_level_info->dcc_offset; + assert(base_level_info->mode == RADEON_SURF_MODE_2D); + } + + unsigned dcc_tile_swizzle = tex->surface.tile_swizzle << 8; + dcc_tile_swizzle &= tex->surface.dcc_alignment - 1; + meta_va |= dcc_tile_swizzle; + } else if (vi_tc_compat_htile_enabled(tex, first_level, + is_stencil ? PIPE_MASK_S : PIPE_MASK_Z)) { + meta_va = tex->buffer.gpu_address + tex->surface.htile_offset; + } + + if (meta_va) + state[6] |= S_008F28_COMPRESSION_EN(1); + } + + if (sscreen->info.chip_class >= GFX8 && sscreen->info.chip_class <= GFX9) + state[7] = meta_va >> 8; + + if (sscreen->info.chip_class >= GFX10) { + state[3] &= C_00A00C_SW_MODE; + + if (is_stencil) { + state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); + } else { + state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode); + } + + state[6] &= C_00A018_META_DATA_ADDRESS_LO & C_00A018_META_PIPE_ALIGNED; + + if (meta_va) { + struct gfx9_surf_meta_flags meta; + + if (tex->surface.dcc_offset) + meta = tex->surface.u.gfx9.dcc; + else + meta = tex->surface.u.gfx9.htile; + + state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) | + S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8); + } + + state[7] = meta_va >> 16; + } else if (sscreen->info.chip_class == GFX9) { + state[3] &= C_008F1C_SW_MODE; + state[4] &= C_008F20_PITCH; + + if (is_stencil) { + state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); + state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.stencil.epitch); + } else { + state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode); + state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.surf.epitch); + } + + state[5] &= + C_008F24_META_DATA_ADDRESS & C_008F24_META_PIPE_ALIGNED & C_008F24_META_RB_ALIGNED; + if (meta_va) { + struct gfx9_surf_meta_flags meta; + + if (tex->surface.dcc_offset) + meta = tex->surface.u.gfx9.dcc; + else + meta = tex->surface.u.gfx9.htile; + + state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) | + S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) | + S_008F24_META_RB_ALIGNED(meta.rb_aligned); + } + } else { + /* GFX6-GFX8 */ + unsigned pitch = base_level_info->nblk_x * block_width; + unsigned index = si_tile_mode_index(tex, base_level, is_stencil); + + state[3] &= C_008F1C_TILING_INDEX; + state[3] |= S_008F1C_TILING_INDEX(index); + state[4] &= C_008F20_PITCH; + state[4] |= S_008F20_PITCH(pitch - 1); + } } static void si_set_sampler_state_desc(struct si_sampler_state *sstate, - struct si_sampler_view *sview, - struct si_texture *tex, - uint32_t *desc) -{ - if (sview && sview->is_integer) - memcpy(desc, sstate->integer_val, 4*4); - else if (tex && tex->upgraded_depth && - (!sview || !sview->is_stencil_sampler)) - memcpy(desc, sstate->upgraded_depth_val, 4*4); - else - memcpy(desc, 
sstate->val, 4*4); -} - -static void si_set_sampler_view_desc(struct si_context *sctx, - struct si_sampler_view *sview, - struct si_sampler_state *sstate, - uint32_t *desc) -{ - struct pipe_sampler_view *view = &sview->base; - struct si_texture *tex = (struct si_texture *)view->texture; - bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER; - - if (unlikely(!is_buffer && sview->dcc_incompatible)) { - if (vi_dcc_enabled(tex, view->u.tex.first_level)) - if (!si_texture_disable_dcc(sctx, tex)) - si_decompress_dcc(sctx, tex); - - sview->dcc_incompatible = false; - } - - assert(tex); /* views with texture == NULL aren't supported */ - memcpy(desc, sview->state, 8*4); - - if (is_buffer) { - si_set_buf_desc_address(&tex->buffer, - sview->base.u.buf.offset, - desc + 4); - } else { - bool is_separate_stencil = tex->db_compatible && - sview->is_stencil_sampler; - - si_set_mutable_tex_desc_fields(sctx->screen, tex, - sview->base_level_info, - sview->base_level, - sview->base.u.tex.first_level, - sview->block_width, - is_separate_stencil, - desc); - } - - if (!is_buffer && tex->surface.fmask_size) { - memcpy(desc + 8, sview->fmask_state, 8*4); - } else { - /* Disable FMASK and bind sampler state in [12:15]. */ - memcpy(desc + 8, null_texture_descriptor, 4*4); - - if (sstate) - si_set_sampler_state_desc(sstate, sview, - is_buffer ? NULL : tex, - desc + 12); - } + struct si_sampler_view *sview, struct si_texture *tex, + uint32_t *desc) +{ + if (sview && sview->is_integer) + memcpy(desc, sstate->integer_val, 4 * 4); + else if (tex && tex->upgraded_depth && (!sview || !sview->is_stencil_sampler)) + memcpy(desc, sstate->upgraded_depth_val, 4 * 4); + else + memcpy(desc, sstate->val, 4 * 4); +} + +static void si_set_sampler_view_desc(struct si_context *sctx, struct si_sampler_view *sview, + struct si_sampler_state *sstate, uint32_t *desc) +{ + struct pipe_sampler_view *view = &sview->base; + struct si_texture *tex = (struct si_texture *)view->texture; + bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER; + + if (unlikely(!is_buffer && sview->dcc_incompatible)) { + if (vi_dcc_enabled(tex, view->u.tex.first_level)) + if (!si_texture_disable_dcc(sctx, tex)) + si_decompress_dcc(sctx, tex); + + sview->dcc_incompatible = false; + } + + assert(tex); /* views with texture == NULL aren't supported */ + memcpy(desc, sview->state, 8 * 4); + + if (is_buffer) { + si_set_buf_desc_address(&tex->buffer, sview->base.u.buf.offset, desc + 4); + } else { + bool is_separate_stencil = tex->db_compatible && sview->is_stencil_sampler; + + si_set_mutable_tex_desc_fields(sctx->screen, tex, sview->base_level_info, sview->base_level, + sview->base.u.tex.first_level, sview->block_width, + is_separate_stencil, desc); + } + + if (!is_buffer && tex->surface.fmask_size) { + memcpy(desc + 8, sview->fmask_state, 8 * 4); + } else { + /* Disable FMASK and bind sampler state in [12:15]. */ + memcpy(desc + 8, null_texture_descriptor, 4 * 4); + + if (sstate) + si_set_sampler_state_desc(sstate, sview, is_buffer ? NULL : tex, desc + 12); + } } static bool color_needs_decompression(struct si_texture *tex) { - return tex->surface.fmask_size || - (tex->dirty_level_mask && - (tex->cmask_buffer || tex->surface.dcc_offset)); + return tex->surface.fmask_size || + (tex->dirty_level_mask && (tex->cmask_buffer || tex->surface.dcc_offset)); } static bool depth_needs_decompression(struct si_texture *tex) { - /* If the depth/stencil texture is TC-compatible, no decompression - * will be done. 
The decompression function will only flush DB caches - * to make it coherent with shaders. That's necessary because the driver - * doesn't flush DB caches in any other case. - */ - return tex->db_compatible; -} - -static void si_set_sampler_view(struct si_context *sctx, - unsigned shader, - unsigned slot, struct pipe_sampler_view *view, - bool disallow_early_out) -{ - struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_sampler_view *sview = (struct si_sampler_view*)view; - struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); - unsigned desc_slot = si_get_sampler_slot(slot); - uint32_t *desc = descs->list + desc_slot * 16; - - if (samplers->views[slot] == view && !disallow_early_out) - return; - - if (view) { - struct si_texture *tex = (struct si_texture *)view->texture; - - si_set_sampler_view_desc(sctx, sview, - samplers->sampler_states[slot], desc); - - if (tex->buffer.b.b.target == PIPE_BUFFER) { - tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW; - samplers->needs_depth_decompress_mask &= ~(1u << slot); - samplers->needs_color_decompress_mask &= ~(1u << slot); - } else { - if (depth_needs_decompression(tex)) { - samplers->needs_depth_decompress_mask |= 1u << slot; - } else { - samplers->needs_depth_decompress_mask &= ~(1u << slot); - } - if (color_needs_decompression(tex)) { - samplers->needs_color_decompress_mask |= 1u << slot; - } else { - samplers->needs_color_decompress_mask &= ~(1u << slot); - } - - if (tex->surface.dcc_offset && - p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; - } - - pipe_sampler_view_reference(&samplers->views[slot], view); - samplers->enabled_mask |= 1u << slot; - - /* Since this can flush, it must be done after enabled_mask is - * updated. */ - si_sampler_view_add_buffer(sctx, view->texture, - RADEON_USAGE_READ, - sview->is_stencil_sampler, true); - } else { - pipe_sampler_view_reference(&samplers->views[slot], NULL); - memcpy(desc, null_texture_descriptor, 8*4); - /* Only clear the lower dwords of FMASK. */ - memcpy(desc + 8, null_texture_descriptor, 4*4); - /* Re-set the sampler state if we are transitioning from FMASK. 
*/ - if (samplers->sampler_states[slot]) - si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, - desc + 12); - - samplers->enabled_mask &= ~(1u << slot); - samplers->needs_depth_decompress_mask &= ~(1u << slot); - samplers->needs_color_decompress_mask &= ~(1u << slot); - } - - sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); -} - -static void si_update_shader_needs_decompress_mask(struct si_context *sctx, - unsigned shader) -{ - struct si_samplers *samplers = &sctx->samplers[shader]; - unsigned shader_bit = 1 << shader; - - if (samplers->needs_depth_decompress_mask || - samplers->needs_color_decompress_mask || - sctx->images[shader].needs_color_decompress_mask) - sctx->shader_needs_decompress_mask |= shader_bit; - else - sctx->shader_needs_decompress_mask &= ~shader_bit; -} - -static void si_set_sampler_views(struct pipe_context *ctx, - enum pipe_shader_type shader, unsigned start, - unsigned count, - struct pipe_sampler_view **views) -{ - struct si_context *sctx = (struct si_context *)ctx; - int i; - - if (!count || shader >= SI_NUM_SHADERS) - return; - - if (views) { - for (i = 0; i < count; i++) - si_set_sampler_view(sctx, shader, start + i, views[i], false); - } else { - for (i = 0; i < count; i++) - si_set_sampler_view(sctx, shader, start + i, NULL, false); - } - - si_update_shader_needs_decompress_mask(sctx, shader); -} - -static void -si_samplers_update_needs_color_decompress_mask(struct si_samplers *samplers) -{ - unsigned mask = samplers->enabled_mask; - - while (mask) { - int i = u_bit_scan(&mask); - struct pipe_resource *res = samplers->views[i]->texture; - - if (res && res->target != PIPE_BUFFER) { - struct si_texture *tex = (struct si_texture *)res; - - if (color_needs_decompression(tex)) { - samplers->needs_color_decompress_mask |= 1u << i; - } else { - samplers->needs_color_decompress_mask &= ~(1u << i); - } - } - } + /* If the depth/stencil texture is TC-compatible, no decompression + * will be done. The decompression function will only flush DB caches + * to make it coherent with shaders. That's necessary because the driver + * doesn't flush DB caches in any other case. 
+ */ + return tex->db_compatible; +} + +static void si_set_sampler_view(struct si_context *sctx, unsigned shader, unsigned slot, + struct pipe_sampler_view *view, bool disallow_early_out) +{ + struct si_samplers *samplers = &sctx->samplers[shader]; + struct si_sampler_view *sview = (struct si_sampler_view *)view; + struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); + unsigned desc_slot = si_get_sampler_slot(slot); + uint32_t *desc = descs->list + desc_slot * 16; + + if (samplers->views[slot] == view && !disallow_early_out) + return; + + if (view) { + struct si_texture *tex = (struct si_texture *)view->texture; + + si_set_sampler_view_desc(sctx, sview, samplers->sampler_states[slot], desc); + + if (tex->buffer.b.b.target == PIPE_BUFFER) { + tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW; + samplers->needs_depth_decompress_mask &= ~(1u << slot); + samplers->needs_color_decompress_mask &= ~(1u << slot); + } else { + if (depth_needs_decompression(tex)) { + samplers->needs_depth_decompress_mask |= 1u << slot; + } else { + samplers->needs_depth_decompress_mask &= ~(1u << slot); + } + if (color_needs_decompression(tex)) { + samplers->needs_color_decompress_mask |= 1u << slot; + } else { + samplers->needs_color_decompress_mask &= ~(1u << slot); + } + + if (tex->surface.dcc_offset && p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + } + + pipe_sampler_view_reference(&samplers->views[slot], view); + samplers->enabled_mask |= 1u << slot; + + /* Since this can flush, it must be done after enabled_mask is + * updated. */ + si_sampler_view_add_buffer(sctx, view->texture, RADEON_USAGE_READ, sview->is_stencil_sampler, + true); + } else { + pipe_sampler_view_reference(&samplers->views[slot], NULL); + memcpy(desc, null_texture_descriptor, 8 * 4); + /* Only clear the lower dwords of FMASK. */ + memcpy(desc + 8, null_texture_descriptor, 4 * 4); + /* Re-set the sampler state if we are transitioning from FMASK. 
*/ + if (samplers->sampler_states[slot]) + si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, desc + 12); + + samplers->enabled_mask &= ~(1u << slot); + samplers->needs_depth_decompress_mask &= ~(1u << slot); + samplers->needs_color_decompress_mask &= ~(1u << slot); + } + + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); +} + +static void si_update_shader_needs_decompress_mask(struct si_context *sctx, unsigned shader) +{ + struct si_samplers *samplers = &sctx->samplers[shader]; + unsigned shader_bit = 1 << shader; + + if (samplers->needs_depth_decompress_mask || samplers->needs_color_decompress_mask || + sctx->images[shader].needs_color_decompress_mask) + sctx->shader_needs_decompress_mask |= shader_bit; + else + sctx->shader_needs_decompress_mask &= ~shader_bit; +} + +static void si_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type shader, + unsigned start, unsigned count, struct pipe_sampler_view **views) +{ + struct si_context *sctx = (struct si_context *)ctx; + int i; + + if (!count || shader >= SI_NUM_SHADERS) + return; + + if (views) { + for (i = 0; i < count; i++) + si_set_sampler_view(sctx, shader, start + i, views[i], false); + } else { + for (i = 0; i < count; i++) + si_set_sampler_view(sctx, shader, start + i, NULL, false); + } + + si_update_shader_needs_decompress_mask(sctx, shader); +} + +static void si_samplers_update_needs_color_decompress_mask(struct si_samplers *samplers) +{ + unsigned mask = samplers->enabled_mask; + + while (mask) { + int i = u_bit_scan(&mask); + struct pipe_resource *res = samplers->views[i]->texture; + + if (res && res->target != PIPE_BUFFER) { + struct si_texture *tex = (struct si_texture *)res; + + if (color_needs_decompression(tex)) { + samplers->needs_color_decompress_mask |= 1u << i; + } else { + samplers->needs_color_decompress_mask &= ~(1u << i); + } + } + } } /* IMAGE VIEWS */ -static void -si_release_image_views(struct si_images *images) +static void si_release_image_views(struct si_images *images) { - unsigned i; + unsigned i; - for (i = 0; i < SI_NUM_IMAGES; ++i) { - struct pipe_image_view *view = &images->views[i]; + for (i = 0; i < SI_NUM_IMAGES; ++i) { + struct pipe_image_view *view = &images->views[i]; - pipe_resource_reference(&view->resource, NULL); - } + pipe_resource_reference(&view->resource, NULL); + } } -static void -si_image_views_begin_new_cs(struct si_context *sctx, struct si_images *images) +static void si_image_views_begin_new_cs(struct si_context *sctx, struct si_images *images) { - uint mask = images->enabled_mask; + uint mask = images->enabled_mask; + + /* Add buffers to the CS. */ + while (mask) { + int i = u_bit_scan(&mask); + struct pipe_image_view *view = &images->views[i]; - /* Add buffers to the CS. 
*/ - while (mask) { - int i = u_bit_scan(&mask); - struct pipe_image_view *view = &images->views[i]; + assert(view->resource); - assert(view->resource); + si_sampler_view_add_buffer(sctx, view->resource, RADEON_USAGE_READWRITE, false, false); + } +} - si_sampler_view_add_buffer(sctx, view->resource, - RADEON_USAGE_READWRITE, false, false); - } -} - -static void -si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot) -{ - struct si_images *images = &ctx->images[shader]; - - if (images->enabled_mask & (1u << slot)) { - struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); - unsigned desc_slot = si_get_image_slot(slot); - - pipe_resource_reference(&images->views[slot].resource, NULL); - images->needs_color_decompress_mask &= ~(1 << slot); - - memcpy(descs->list + desc_slot*8, null_image_descriptor, 8*4); - images->enabled_mask &= ~(1u << slot); - ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); - } -} - -static void -si_mark_image_range_valid(const struct pipe_image_view *view) -{ - struct si_resource *res = si_resource(view->resource); - - if (res->b.b.target != PIPE_BUFFER) - return; - - util_range_add(&res->b.b, &res->valid_buffer_range, - view->u.buf.offset, - view->u.buf.offset + view->u.buf.size); -} - -static void si_set_shader_image_desc(struct si_context *ctx, - const struct pipe_image_view *view, - bool skip_decompress, - uint32_t *desc, uint32_t *fmask_desc) -{ - struct si_screen *screen = ctx->screen; - struct si_resource *res; - - res = si_resource(view->resource); - - if (res->b.b.target == PIPE_BUFFER || - view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { - if (view->access & PIPE_IMAGE_ACCESS_WRITE) - si_mark_image_range_valid(view); - - si_make_buffer_descriptor(screen, res, - view->format, - view->u.buf.offset, - view->u.buf.size, desc); - si_set_buf_desc_address(res, view->u.buf.offset, desc + 4); - } else { - static const unsigned char swizzle[4] = { 0, 1, 2, 3 }; - struct si_texture *tex = (struct si_texture *)res; - unsigned level = view->u.tex.level; - unsigned width, height, depth, hw_level; - bool uses_dcc = vi_dcc_enabled(tex, level); - unsigned access = view->access; - - assert(!tex->is_depth); - assert(fmask_desc || tex->surface.fmask_offset == 0); - - if (uses_dcc && !skip_decompress && - (access & PIPE_IMAGE_ACCESS_WRITE || - !vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) { - /* If DCC can't be disabled, at least decompress it. - * The decompression is relatively cheap if the surface - * has been decompressed already. - */ - if (!si_texture_disable_dcc(ctx, tex)) - si_decompress_dcc(ctx, tex); - } - - if (ctx->chip_class >= GFX9) { - /* Always set the base address. The swizzle modes don't - * allow setting mipmap level offsets as the base. - */ - width = res->b.b.width0; - height = res->b.b.height0; - depth = res->b.b.depth0; - hw_level = level; - } else { - /* Always force the base level to the selected level. - * - * This is required for 3D textures, where otherwise - * selecting a single slice for non-layered bindings - * fails. It doesn't hurt the other targets. 
- */ - width = u_minify(res->b.b.width0, level); - height = u_minify(res->b.b.height0, level); - depth = u_minify(res->b.b.depth0, level); - hw_level = 0; - } - - screen->make_texture_descriptor(screen, tex, - false, res->b.b.target, - view->format, swizzle, - hw_level, hw_level, - view->u.tex.first_layer, - view->u.tex.last_layer, - width, height, depth, - desc, fmask_desc); - si_set_mutable_tex_desc_fields(screen, tex, - &tex->surface.u.legacy.level[level], - level, level, - util_format_get_blockwidth(view->format), - false, desc); - } -} - -static void si_set_shader_image(struct si_context *ctx, - unsigned shader, - unsigned slot, const struct pipe_image_view *view, - bool skip_decompress) -{ - struct si_images *images = &ctx->images[shader]; - struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); - struct si_resource *res; - - if (!view || !view->resource) { - si_disable_shader_image(ctx, shader, slot); - return; - } - - res = si_resource(view->resource); - - if (&images->views[slot] != view) - util_copy_image_view(&images->views[slot], view); - - si_set_shader_image_desc(ctx, view, skip_decompress, - descs->list + si_get_image_slot(slot) * 8, - descs->list + si_get_image_slot(slot + SI_NUM_IMAGES) * 8); - - if (res->b.b.target == PIPE_BUFFER || - view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { - images->needs_color_decompress_mask &= ~(1 << slot); - res->bind_history |= PIPE_BIND_SHADER_IMAGE; - } else { - struct si_texture *tex = (struct si_texture *)res; - unsigned level = view->u.tex.level; - - if (color_needs_decompression(tex)) { - images->needs_color_decompress_mask |= 1 << slot; - } else { - images->needs_color_decompress_mask &= ~(1 << slot); - } - - if (vi_dcc_enabled(tex, level) && - p_atomic_read(&tex->framebuffers_bound)) - ctx->need_check_render_feedback = true; - } - - images->enabled_mask |= 1u << slot; - ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); - - /* Since this can flush, it must be done after enabled_mask is updated. */ - si_sampler_view_add_buffer(ctx, &res->b.b, - (view->access & PIPE_IMAGE_ACCESS_WRITE) ? 
- RADEON_USAGE_READWRITE : RADEON_USAGE_READ, - false, true); -} - -static void -si_set_shader_images(struct pipe_context *pipe, - enum pipe_shader_type shader, - unsigned start_slot, unsigned count, - const struct pipe_image_view *views) -{ - struct si_context *ctx = (struct si_context *)pipe; - unsigned i, slot; - - assert(shader < SI_NUM_SHADERS); - - if (!count) - return; - - assert(start_slot + count <= SI_NUM_IMAGES); - - if (views) { - for (i = 0, slot = start_slot; i < count; ++i, ++slot) - si_set_shader_image(ctx, shader, slot, &views[i], false); - } else { - for (i = 0, slot = start_slot; i < count; ++i, ++slot) - si_set_shader_image(ctx, shader, slot, NULL, false); - } - - si_update_shader_needs_decompress_mask(ctx, shader); -} - -static void -si_images_update_needs_color_decompress_mask(struct si_images *images) +static void si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot) { - unsigned mask = images->enabled_mask; + struct si_images *images = &ctx->images[shader]; + + if (images->enabled_mask & (1u << slot)) { + struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); + unsigned desc_slot = si_get_image_slot(slot); + + pipe_resource_reference(&images->views[slot].resource, NULL); + images->needs_color_decompress_mask &= ~(1 << slot); + + memcpy(descs->list + desc_slot * 8, null_image_descriptor, 8 * 4); + images->enabled_mask &= ~(1u << slot); + ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + } +} - while (mask) { - int i = u_bit_scan(&mask); - struct pipe_resource *res = images->views[i].resource; +static void si_mark_image_range_valid(const struct pipe_image_view *view) +{ + struct si_resource *res = si_resource(view->resource); - if (res && res->target != PIPE_BUFFER) { - struct si_texture *tex = (struct si_texture *)res; + if (res->b.b.target != PIPE_BUFFER) + return; - if (color_needs_decompression(tex)) { - images->needs_color_decompress_mask |= 1 << i; - } else { - images->needs_color_decompress_mask &= ~(1 << i); - } - } - } + util_range_add(&res->b.b, &res->valid_buffer_range, view->u.buf.offset, + view->u.buf.offset + view->u.buf.size); +} + +static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_image_view *view, + bool skip_decompress, uint32_t *desc, uint32_t *fmask_desc) +{ + struct si_screen *screen = ctx->screen; + struct si_resource *res; + + res = si_resource(view->resource); + + if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { + if (view->access & PIPE_IMAGE_ACCESS_WRITE) + si_mark_image_range_valid(view); + + si_make_buffer_descriptor(screen, res, view->format, view->u.buf.offset, view->u.buf.size, + desc); + si_set_buf_desc_address(res, view->u.buf.offset, desc + 4); + } else { + static const unsigned char swizzle[4] = {0, 1, 2, 3}; + struct si_texture *tex = (struct si_texture *)res; + unsigned level = view->u.tex.level; + unsigned width, height, depth, hw_level; + bool uses_dcc = vi_dcc_enabled(tex, level); + unsigned access = view->access; + + assert(!tex->is_depth); + assert(fmask_desc || tex->surface.fmask_offset == 0); + + if (uses_dcc && !skip_decompress && + (access & PIPE_IMAGE_ACCESS_WRITE || + !vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) { + /* If DCC can't be disabled, at least decompress it. + * The decompression is relatively cheap if the surface + * has been decompressed already. 
+ */ + if (!si_texture_disable_dcc(ctx, tex)) + si_decompress_dcc(ctx, tex); + } + + if (ctx->chip_class >= GFX9) { + /* Always set the base address. The swizzle modes don't + * allow setting mipmap level offsets as the base. + */ + width = res->b.b.width0; + height = res->b.b.height0; + depth = res->b.b.depth0; + hw_level = level; + } else { + /* Always force the base level to the selected level. + * + * This is required for 3D textures, where otherwise + * selecting a single slice for non-layered bindings + * fails. It doesn't hurt the other targets. + */ + width = u_minify(res->b.b.width0, level); + height = u_minify(res->b.b.height0, level); + depth = u_minify(res->b.b.depth0, level); + hw_level = 0; + } + + screen->make_texture_descriptor( + screen, tex, false, res->b.b.target, view->format, swizzle, hw_level, hw_level, + view->u.tex.first_layer, view->u.tex.last_layer, width, height, depth, desc, fmask_desc); + si_set_mutable_tex_desc_fields(screen, tex, &tex->surface.u.legacy.level[level], level, level, + util_format_get_blockwidth(view->format), false, desc); + } +} + +static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigned slot, + const struct pipe_image_view *view, bool skip_decompress) +{ + struct si_images *images = &ctx->images[shader]; + struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); + struct si_resource *res; + + if (!view || !view->resource) { + si_disable_shader_image(ctx, shader, slot); + return; + } + + res = si_resource(view->resource); + + if (&images->views[slot] != view) + util_copy_image_view(&images->views[slot], view); + + si_set_shader_image_desc(ctx, view, skip_decompress, descs->list + si_get_image_slot(slot) * 8, + descs->list + si_get_image_slot(slot + SI_NUM_IMAGES) * 8); + + if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { + images->needs_color_decompress_mask &= ~(1 << slot); + res->bind_history |= PIPE_BIND_SHADER_IMAGE; + } else { + struct si_texture *tex = (struct si_texture *)res; + unsigned level = view->u.tex.level; + + if (color_needs_decompression(tex)) { + images->needs_color_decompress_mask |= 1 << slot; + } else { + images->needs_color_decompress_mask &= ~(1 << slot); + } + + if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound)) + ctx->need_check_render_feedback = true; + } + + images->enabled_mask |= 1u << slot; + ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + + /* Since this can flush, it must be done after enabled_mask is updated. */ + si_sampler_view_add_buffer( + ctx, &res->b.b, + (view->access & PIPE_IMAGE_ACCESS_WRITE) ? 
RADEON_USAGE_READWRITE : RADEON_USAGE_READ, false, + true); +} + +static void si_set_shader_images(struct pipe_context *pipe, enum pipe_shader_type shader, + unsigned start_slot, unsigned count, + const struct pipe_image_view *views) +{ + struct si_context *ctx = (struct si_context *)pipe; + unsigned i, slot; + + assert(shader < SI_NUM_SHADERS); + + if (!count) + return; + + assert(start_slot + count <= SI_NUM_IMAGES); + + if (views) { + for (i = 0, slot = start_slot; i < count; ++i, ++slot) + si_set_shader_image(ctx, shader, slot, &views[i], false); + } else { + for (i = 0, slot = start_slot; i < count; ++i, ++slot) + si_set_shader_image(ctx, shader, slot, NULL, false); + } + + si_update_shader_needs_decompress_mask(ctx, shader); +} + +static void si_images_update_needs_color_decompress_mask(struct si_images *images) +{ + unsigned mask = images->enabled_mask; + + while (mask) { + int i = u_bit_scan(&mask); + struct pipe_resource *res = images->views[i].resource; + + if (res && res->target != PIPE_BUFFER) { + struct si_texture *tex = (struct si_texture *)res; + + if (color_needs_decompression(tex)) { + images->needs_color_decompress_mask |= 1 << i; + } else { + images->needs_color_decompress_mask &= ~(1 << i); + } + } + } } void si_update_ps_colorbuf0_slot(struct si_context *sctx) { - struct si_buffer_resources *buffers = &sctx->rw_buffers; - struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; - unsigned slot = SI_PS_IMAGE_COLORBUF0; - struct pipe_surface *surf = NULL; - - /* si_texture_disable_dcc can get us here again. */ - if (sctx->blitter->running) - return; - - /* See whether FBFETCH is used and color buffer 0 is set. */ - if (sctx->ps_shader.cso && - sctx->ps_shader.cso->info.uses_fbfetch && - sctx->framebuffer.state.nr_cbufs && - sctx->framebuffer.state.cbufs[0]) - surf = sctx->framebuffer.state.cbufs[0]; - - /* Return if FBFETCH transitions from disabled to disabled. */ - if (!buffers->buffers[slot] && !surf) - return; - - sctx->ps_uses_fbfetch = surf != NULL; - si_update_ps_iter_samples(sctx); - - if (surf) { - struct si_texture *tex = (struct si_texture*)surf->texture; - struct pipe_image_view view = {0}; - - assert(tex); - assert(!tex->is_depth); - - /* Disable DCC, because the texture is used as both a sampler - * and color buffer. - */ - si_texture_disable_dcc(sctx, tex); - - if (tex->buffer.b.b.nr_samples <= 1 && tex->cmask_buffer) { - /* Disable CMASK. */ - assert(tex->cmask_buffer != &tex->buffer); - si_eliminate_fast_color_clear(sctx, tex); - si_texture_discard_cmask(sctx->screen, tex); - } - - view.resource = surf->texture; - view.format = surf->format; - view.access = PIPE_IMAGE_ACCESS_READ; - view.u.tex.first_layer = surf->u.tex.first_layer; - view.u.tex.last_layer = surf->u.tex.last_layer; - view.u.tex.level = surf->u.tex.level; - - /* Set the descriptor. */ - uint32_t *desc = descs->list + slot*4; - memset(desc, 0, 16 * 4); - si_set_shader_image_desc(sctx, &view, true, desc, desc + 8); - - pipe_resource_reference(&buffers->buffers[slot], &tex->buffer.b.b); - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - &tex->buffer, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_RW_IMAGE); - buffers->enabled_mask |= 1u << slot; - } else { - /* Clear the descriptor. 
*/ - memset(descs->list + slot*4, 0, 8*4); - pipe_resource_reference(&buffers->buffers[slot], NULL); - buffers->enabled_mask &= ~(1u << slot); - } - - sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; + struct si_buffer_resources *buffers = &sctx->rw_buffers; + struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; + unsigned slot = SI_PS_IMAGE_COLORBUF0; + struct pipe_surface *surf = NULL; + + /* si_texture_disable_dcc can get us here again. */ + if (sctx->blitter->running) + return; + + /* See whether FBFETCH is used and color buffer 0 is set. */ + if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_fbfetch && + sctx->framebuffer.state.nr_cbufs && sctx->framebuffer.state.cbufs[0]) + surf = sctx->framebuffer.state.cbufs[0]; + + /* Return if FBFETCH transitions from disabled to disabled. */ + if (!buffers->buffers[slot] && !surf) + return; + + sctx->ps_uses_fbfetch = surf != NULL; + si_update_ps_iter_samples(sctx); + + if (surf) { + struct si_texture *tex = (struct si_texture *)surf->texture; + struct pipe_image_view view = {0}; + + assert(tex); + assert(!tex->is_depth); + + /* Disable DCC, because the texture is used as both a sampler + * and color buffer. + */ + si_texture_disable_dcc(sctx, tex); + + if (tex->buffer.b.b.nr_samples <= 1 && tex->cmask_buffer) { + /* Disable CMASK. */ + assert(tex->cmask_buffer != &tex->buffer); + si_eliminate_fast_color_clear(sctx, tex); + si_texture_discard_cmask(sctx->screen, tex); + } + + view.resource = surf->texture; + view.format = surf->format; + view.access = PIPE_IMAGE_ACCESS_READ; + view.u.tex.first_layer = surf->u.tex.first_layer; + view.u.tex.last_layer = surf->u.tex.last_layer; + view.u.tex.level = surf->u.tex.level; + + /* Set the descriptor. */ + uint32_t *desc = descs->list + slot * 4; + memset(desc, 0, 16 * 4); + si_set_shader_image_desc(sctx, &view, true, desc, desc + 8); + + pipe_resource_reference(&buffers->buffers[slot], &tex->buffer.b.b); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_RW_IMAGE); + buffers->enabled_mask |= 1u << slot; + } else { + /* Clear the descriptor. 
*/ + memset(descs->list + slot * 4, 0, 8 * 4); + pipe_resource_reference(&buffers->buffers[slot], NULL); + buffers->enabled_mask &= ~(1u << slot); + } + + sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; } /* SAMPLER STATES */ -static void si_bind_sampler_states(struct pipe_context *ctx, - enum pipe_shader_type shader, +static void si_bind_sampler_states(struct pipe_context *ctx, enum pipe_shader_type shader, unsigned start, unsigned count, void **states) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, shader); - struct si_sampler_state **sstates = (struct si_sampler_state**)states; - int i; + struct si_context *sctx = (struct si_context *)ctx; + struct si_samplers *samplers = &sctx->samplers[shader]; + struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, shader); + struct si_sampler_state **sstates = (struct si_sampler_state **)states; + int i; - if (!count || shader >= SI_NUM_SHADERS || !sstates) - return; + if (!count || shader >= SI_NUM_SHADERS || !sstates) + return; - for (i = 0; i < count; i++) { - unsigned slot = start + i; - unsigned desc_slot = si_get_sampler_slot(slot); + for (i = 0; i < count; i++) { + unsigned slot = start + i; + unsigned desc_slot = si_get_sampler_slot(slot); - if (!sstates[i] || - sstates[i] == samplers->sampler_states[slot]) - continue; + if (!sstates[i] || sstates[i] == samplers->sampler_states[slot]) + continue; #ifndef NDEBUG - assert(sstates[i]->magic == SI_SAMPLER_STATE_MAGIC); + assert(sstates[i]->magic == SI_SAMPLER_STATE_MAGIC); #endif - samplers->sampler_states[slot] = sstates[i]; + samplers->sampler_states[slot] = sstates[i]; - /* If FMASK is bound, don't overwrite it. - * The sampler state will be set after FMASK is unbound. - */ - struct si_sampler_view *sview = - (struct si_sampler_view *)samplers->views[slot]; + /* If FMASK is bound, don't overwrite it. + * The sampler state will be set after FMASK is unbound. 
+ */ + struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[slot]; - struct si_texture *tex = NULL; + struct si_texture *tex = NULL; - if (sview && sview->base.texture && - sview->base.texture->target != PIPE_BUFFER) - tex = (struct si_texture *)sview->base.texture; + if (sview && sview->base.texture && sview->base.texture->target != PIPE_BUFFER) + tex = (struct si_texture *)sview->base.texture; - if (tex && tex->surface.fmask_size) - continue; + if (tex && tex->surface.fmask_size) + continue; - si_set_sampler_state_desc(sstates[i], sview, tex, - desc->list + desc_slot * 16 + 12); + si_set_sampler_state_desc(sstates[i], sview, tex, desc->list + desc_slot * 16 + 12); - sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); - } + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + } } /* BUFFER RESOURCES */ static void si_init_buffer_resources(struct si_buffer_resources *buffers, - struct si_descriptors *descs, - unsigned num_buffers, - short shader_userdata_rel_index, - enum radeon_bo_priority priority, - enum radeon_bo_priority priority_constbuf) + struct si_descriptors *descs, unsigned num_buffers, + short shader_userdata_rel_index, + enum radeon_bo_priority priority, + enum radeon_bo_priority priority_constbuf) { - buffers->priority = priority; - buffers->priority_constbuf = priority_constbuf; - buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*)); - buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0])); + buffers->priority = priority; + buffers->priority_constbuf = priority_constbuf; + buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource *)); + buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0])); - si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers); + si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers); } static void si_release_buffer_resources(struct si_buffer_resources *buffers, - struct si_descriptors *descs) + struct si_descriptors *descs) { - int i; + int i; - for (i = 0; i < descs->num_elements; i++) { - pipe_resource_reference(&buffers->buffers[i], NULL); - } + for (i = 0; i < descs->num_elements; i++) { + pipe_resource_reference(&buffers->buffers[i], NULL); + } - FREE(buffers->buffers); - FREE(buffers->offsets); + FREE(buffers->buffers); + FREE(buffers->offsets); } static void si_buffer_resources_begin_new_cs(struct si_context *sctx, - struct si_buffer_resources *buffers) + struct si_buffer_resources *buffers) { - unsigned mask = buffers->enabled_mask; + unsigned mask = buffers->enabled_mask; - /* Add buffers to the CS. */ - while (mask) { - int i = u_bit_scan(&mask); + /* Add buffers to the CS. */ + while (mask) { + int i = u_bit_scan(&mask); - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(buffers->buffers[i]), - buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE : - RADEON_USAGE_READ, - i < SI_NUM_SHADER_BUFFERS ? buffers->priority : - buffers->priority_constbuf); - } + radeon_add_to_buffer_list( + sctx, sctx->gfx_cs, si_resource(buffers->buffers[i]), + buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, + i < SI_NUM_SHADER_BUFFERS ? 
buffers->priority : buffers->priority_constbuf); + } } static void si_get_buffer_from_descriptors(struct si_buffer_resources *buffers, - struct si_descriptors *descs, - unsigned idx, struct pipe_resource **buf, - unsigned *offset, unsigned *size) + struct si_descriptors *descs, unsigned idx, + struct pipe_resource **buf, unsigned *offset, + unsigned *size) { - pipe_resource_reference(buf, buffers->buffers[idx]); - if (*buf) { - struct si_resource *res = si_resource(*buf); - const uint32_t *desc = descs->list + idx * 4; - uint64_t va; + pipe_resource_reference(buf, buffers->buffers[idx]); + if (*buf) { + struct si_resource *res = si_resource(*buf); + const uint32_t *desc = descs->list + idx * 4; + uint64_t va; - *size = desc[2]; + *size = desc[2]; - assert(G_008F04_STRIDE(desc[1]) == 0); - va = si_desc_extract_buffer_address(desc); + assert(G_008F04_STRIDE(desc[1]) == 0); + va = si_desc_extract_buffer_address(desc); - assert(va >= res->gpu_address && va + *size <= res->gpu_address + res->bo_size); - *offset = va - res->gpu_address; - } + assert(va >= res->gpu_address && va + *size <= res->gpu_address + res->bo_size); + *offset = va - res->gpu_address; + } } /* VERTEX BUFFERS */ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx) { - int count = sctx->num_vertex_elements; - int i; + int count = sctx->num_vertex_elements; + int i; - for (i = 0; i < count; i++) { - int vb = sctx->vertex_elements->vertex_buffer_index[i]; + for (i = 0; i < count; i++) { + int vb = sctx->vertex_elements->vertex_buffer_index[i]; - if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) - continue; - if (!sctx->vertex_buffer[vb].buffer.resource) - continue; + if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) + continue; + if (!sctx->vertex_buffer[vb].buffer.resource) + continue; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(sctx->vertex_buffer[vb].buffer.resource), - RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); - } + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + si_resource(sctx->vertex_buffer[vb].buffer.resource), + RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); + } - if (!sctx->vb_descriptors_buffer) - return; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - sctx->vb_descriptors_buffer, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); + if (!sctx->vb_descriptors_buffer) + return; + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->vb_descriptors_buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); } bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) { - unsigned i, count = sctx->num_vertex_elements; - uint32_t *ptr; - - if (!sctx->vertex_buffers_dirty || !count) - return true; - - struct si_vertex_elements *velems = sctx->vertex_elements; - unsigned alloc_size = velems->vb_desc_list_alloc_size; - - if (alloc_size) { - /* Vertex buffer descriptors are the only ones which are uploaded - * directly through a staging buffer and don't go through - * the fine-grained upload path. 
- */ - u_upload_alloc(sctx->b.const_uploader, 0, - alloc_size, - si_optimal_tcc_alignment(sctx, alloc_size), - &sctx->vb_descriptors_offset, - (struct pipe_resource**)&sctx->vb_descriptors_buffer, - (void**)&ptr); - if (!sctx->vb_descriptors_buffer) { - sctx->vb_descriptors_offset = 0; - sctx->vb_descriptors_gpu_list = NULL; - return false; - } - - sctx->vb_descriptors_gpu_list = ptr; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - sctx->vb_descriptors_buffer, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); - sctx->vertex_buffer_pointer_dirty = true; - sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; - } else { - si_resource_reference(&sctx->vb_descriptors_buffer, NULL); - sctx->vertex_buffer_pointer_dirty = false; - sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS; - } - - assert(count <= SI_MAX_ATTRIBS); - - unsigned first_vb_use_mask = velems->first_vb_use_mask; - unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs; - - for (i = 0; i < count; i++) { - struct pipe_vertex_buffer *vb; - struct si_resource *buf; - unsigned vbo_index = velems->vertex_buffer_index[i]; - uint32_t *desc = i < num_vbos_in_user_sgprs ? - &sctx->vb_descriptor_user_sgprs[i * 4] : - &ptr[(i - num_vbos_in_user_sgprs) * 4]; - - vb = &sctx->vertex_buffer[vbo_index]; - buf = si_resource(vb->buffer.resource); - if (!buf) { - memset(desc, 0, 16); - continue; - } - - int64_t offset = (int64_t)((int)vb->buffer_offset) + - velems->src_offset[i]; - - if (offset >= buf->b.b.width0) { - assert(offset < buf->b.b.width0); - memset(desc, 0, 16); - continue; - } - - uint64_t va = buf->gpu_address + offset; - - int64_t num_records = (int64_t)buf->b.b.width0 - offset; - if (sctx->chip_class != GFX8 && vb->stride) { - /* Round up by rounding down and adding 1 */ - num_records = (num_records - velems->format_size[i]) / - vb->stride + 1; - } - assert(num_records >= 0 && num_records <= UINT_MAX); - - uint32_t rsrc_word3 = velems->rsrc_word3[i]; - - /* OOB_SELECT chooses the out-of-bounds check: - * - 1: index >= NUM_RECORDS (Structured) - * - 3: offset >= NUM_RECORDS (Raw) - */ - if (sctx->chip_class >= GFX10) - rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW); - - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(vb->stride); - desc[2] = num_records; - desc[3] = rsrc_word3; - - if (first_vb_use_mask & (1 << i)) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(vb->buffer.resource), - RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); - } - } - - /* Don't flush the const cache. It would have a very negative effect - * on performance (confirmed by testing). New descriptors are always - * uploaded to a fresh new buffer, so I don't think flushing the const - * cache is needed. */ - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0; - sctx->vertex_buffers_dirty = false; - return true; + unsigned i, count = sctx->num_vertex_elements; + uint32_t *ptr; + + if (!sctx->vertex_buffers_dirty || !count) + return true; + + struct si_vertex_elements *velems = sctx->vertex_elements; + unsigned alloc_size = velems->vb_desc_list_alloc_size; + + if (alloc_size) { + /* Vertex buffer descriptors are the only ones which are uploaded + * directly through a staging buffer and don't go through + * the fine-grained upload path. 
+ */ + u_upload_alloc(sctx->b.const_uploader, 0, alloc_size, + si_optimal_tcc_alignment(sctx, alloc_size), &sctx->vb_descriptors_offset, + (struct pipe_resource **)&sctx->vb_descriptors_buffer, (void **)&ptr); + if (!sctx->vb_descriptors_buffer) { + sctx->vb_descriptors_offset = 0; + sctx->vb_descriptors_gpu_list = NULL; + return false; + } + + sctx->vb_descriptors_gpu_list = ptr; + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->vb_descriptors_buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); + sctx->vertex_buffer_pointer_dirty = true; + sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; + } else { + si_resource_reference(&sctx->vb_descriptors_buffer, NULL); + sctx->vertex_buffer_pointer_dirty = false; + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS; + } + + assert(count <= SI_MAX_ATTRIBS); + + unsigned first_vb_use_mask = velems->first_vb_use_mask; + unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs; + + for (i = 0; i < count; i++) { + struct pipe_vertex_buffer *vb; + struct si_resource *buf; + unsigned vbo_index = velems->vertex_buffer_index[i]; + uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4] + : &ptr[(i - num_vbos_in_user_sgprs) * 4]; + + vb = &sctx->vertex_buffer[vbo_index]; + buf = si_resource(vb->buffer.resource); + if (!buf) { + memset(desc, 0, 16); + continue; + } + + int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[i]; + + if (offset >= buf->b.b.width0) { + assert(offset < buf->b.b.width0); + memset(desc, 0, 16); + continue; + } + + uint64_t va = buf->gpu_address + offset; + + int64_t num_records = (int64_t)buf->b.b.width0 - offset; + if (sctx->chip_class != GFX8 && vb->stride) { + /* Round up by rounding down and adding 1 */ + num_records = (num_records - velems->format_size[i]) / vb->stride + 1; + } + assert(num_records >= 0 && num_records <= UINT_MAX); + + uint32_t rsrc_word3 = velems->rsrc_word3[i]; + + /* OOB_SELECT chooses the out-of-bounds check: + * - 1: index >= NUM_RECORDS (Structured) + * - 3: offset >= NUM_RECORDS (Raw) + */ + if (sctx->chip_class >= GFX10) + rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED + : V_008F0C_OOB_SELECT_RAW); + + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride); + desc[2] = num_records; + desc[3] = rsrc_word3; + + if (first_vb_use_mask & (1 << i)) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(vb->buffer.resource), + RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); + } + } + + /* Don't flush the const cache. It would have a very negative effect + * on performance (confirmed by testing). New descriptors are always + * uploaded to a fresh new buffer, so I don't think flushing the const + * cache is needed. 
*/ + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0; + sctx->vertex_buffers_dirty = false; + return true; } - /* CONSTANT BUFFERS */ -static struct si_descriptors * -si_const_and_shader_buffer_descriptors(struct si_context *sctx, unsigned shader) -{ - return &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(shader)]; -} - -void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf, - const uint8_t *ptr, unsigned size, uint32_t *const_offset) -{ - void *tmp; - - u_upload_alloc(sctx->b.const_uploader, 0, size, - si_optimal_tcc_alignment(sctx, size), - const_offset, - (struct pipe_resource**)buf, &tmp); - if (*buf) - util_memcpy_cpu_to_le32(tmp, ptr, size); -} - -static void si_set_constant_buffer(struct si_context *sctx, - struct si_buffer_resources *buffers, - unsigned descriptors_idx, - uint slot, const struct pipe_constant_buffer *input) -{ - struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; - assert(slot < descs->num_elements); - pipe_resource_reference(&buffers->buffers[slot], NULL); - - /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD is buggy - * with a NULL buffer). We need to use a dummy buffer instead. */ - if (sctx->chip_class == GFX7 && - (!input || (!input->buffer && !input->user_buffer))) - input = &sctx->null_const_buf; - - if (input && (input->buffer || input->user_buffer)) { - struct pipe_resource *buffer = NULL; - uint64_t va; - unsigned buffer_offset; - - /* Upload the user buffer if needed. */ - if (input->user_buffer) { - si_upload_const_buffer(sctx, - (struct si_resource**)&buffer, input->user_buffer, - input->buffer_size, &buffer_offset); - if (!buffer) { - /* Just unbind on failure. */ - si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL); - return; - } - } else { - pipe_resource_reference(&buffer, input->buffer); - buffer_offset = input->buffer_offset; - } - - va = si_resource(buffer)->gpu_address + buffer_offset; - - /* Set the descriptor. */ - uint32_t *desc = descs->list + slot*4; - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(0); - desc[2] = input->buffer_size; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - } - - buffers->buffers[slot] = buffer; - buffers->offsets[slot] = buffer_offset; - radeon_add_to_gfx_buffer_list_check_mem(sctx, - si_resource(buffer), - RADEON_USAGE_READ, - buffers->priority_constbuf, true); - buffers->enabled_mask |= 1u << slot; - } else { - /* Clear the descriptor. 
*/ - memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4); - buffers->enabled_mask &= ~(1u << slot); - } - - sctx->descriptors_dirty |= 1u << descriptors_idx; -} - -static void si_pipe_set_constant_buffer(struct pipe_context *ctx, - enum pipe_shader_type shader, uint slot, - const struct pipe_constant_buffer *input) -{ - struct si_context *sctx = (struct si_context *)ctx; - - if (shader >= SI_NUM_SHADERS) - return; - - if (slot == 0 && input && input->buffer && - !(si_resource(input->buffer)->flags & RADEON_FLAG_32BIT)) { - assert(!"constant buffer 0 must have a 32-bit VM address, use const_uploader"); - return; - } - - if (input && input->buffer) - si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER; - - slot = si_get_constbuf_slot(slot); - si_set_constant_buffer(sctx, &sctx->const_and_shader_buffers[shader], - si_const_and_shader_buffer_descriptors_idx(shader), - slot, input); -} - -void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, - uint slot, struct pipe_constant_buffer *cbuf) -{ - cbuf->user_buffer = NULL; - si_get_buffer_from_descriptors( - &sctx->const_and_shader_buffers[shader], - si_const_and_shader_buffer_descriptors(sctx, shader), - si_get_constbuf_slot(slot), - &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size); +static struct si_descriptors *si_const_and_shader_buffer_descriptors(struct si_context *sctx, + unsigned shader) +{ + return &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(shader)]; +} + +void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf, const uint8_t *ptr, + unsigned size, uint32_t *const_offset) +{ + void *tmp; + + u_upload_alloc(sctx->b.const_uploader, 0, size, si_optimal_tcc_alignment(sctx, size), + const_offset, (struct pipe_resource **)buf, &tmp); + if (*buf) + util_memcpy_cpu_to_le32(tmp, ptr, size); +} + +static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_resources *buffers, + unsigned descriptors_idx, uint slot, + const struct pipe_constant_buffer *input) +{ + struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; + assert(slot < descs->num_elements); + pipe_resource_reference(&buffers->buffers[slot], NULL); + + /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD is buggy + * with a NULL buffer). We need to use a dummy buffer instead. */ + if (sctx->chip_class == GFX7 && (!input || (!input->buffer && !input->user_buffer))) + input = &sctx->null_const_buf; + + if (input && (input->buffer || input->user_buffer)) { + struct pipe_resource *buffer = NULL; + uint64_t va; + unsigned buffer_offset; + + /* Upload the user buffer if needed. */ + if (input->user_buffer) { + si_upload_const_buffer(sctx, (struct si_resource **)&buffer, input->user_buffer, + input->buffer_size, &buffer_offset); + if (!buffer) { + /* Just unbind on failure. */ + si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL); + return; + } + } else { + pipe_resource_reference(&buffer, input->buffer); + buffer_offset = input->buffer_offset; + } + + va = si_resource(buffer)->gpu_address + buffer_offset; + + /* Set the descriptor. 
*/ + uint32_t *desc = descs->list + slot * 4; + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0); + desc[2] = input->buffer_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (sctx->chip_class >= GFX10) { + desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + + buffers->buffers[slot] = buffer; + buffers->offsets[slot] = buffer_offset; + radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ, + buffers->priority_constbuf, true); + buffers->enabled_mask |= 1u << slot; + } else { + /* Clear the descriptor. */ + memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 4); + buffers->enabled_mask &= ~(1u << slot); + } + + sctx->descriptors_dirty |= 1u << descriptors_idx; +} + +static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shader_type shader, + uint slot, const struct pipe_constant_buffer *input) +{ + struct si_context *sctx = (struct si_context *)ctx; + + if (shader >= SI_NUM_SHADERS) + return; + + if (slot == 0 && input && input->buffer && + !(si_resource(input->buffer)->flags & RADEON_FLAG_32BIT)) { + assert(!"constant buffer 0 must have a 32-bit VM address, use const_uploader"); + return; + } + + if (input && input->buffer) + si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER; + + slot = si_get_constbuf_slot(slot); + si_set_constant_buffer(sctx, &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors_idx(shader), slot, input); +} + +void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot, + struct pipe_constant_buffer *cbuf) +{ + cbuf->user_buffer = NULL; + si_get_buffer_from_descriptors( + &sctx->const_and_shader_buffers[shader], si_const_and_shader_buffer_descriptors(sctx, shader), + si_get_constbuf_slot(slot), &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size); } /* SHADER BUFFERS */ -static void si_set_shader_buffer(struct si_context *sctx, - struct si_buffer_resources *buffers, - unsigned descriptors_idx, - uint slot, const struct pipe_shader_buffer *sbuffer, - bool writable, enum radeon_bo_priority priority) -{ - struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; - uint32_t *desc = descs->list + slot * 4; - - if (!sbuffer || !sbuffer->buffer) { - pipe_resource_reference(&buffers->buffers[slot], NULL); - memset(desc, 0, sizeof(uint32_t) * 4); - buffers->enabled_mask &= ~(1u << slot); - buffers->writable_mask &= ~(1u << slot); - sctx->descriptors_dirty |= 1u << descriptors_idx; - return; - } - - struct si_resource *buf = si_resource(sbuffer->buffer); - uint64_t va = buf->gpu_address + sbuffer->buffer_offset; - - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(0); - desc[2] = sbuffer->buffer_size; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - 
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - } - - pipe_resource_reference(&buffers->buffers[slot], &buf->b.b); - buffers->offsets[slot] = sbuffer->buffer_offset; - radeon_add_to_gfx_buffer_list_check_mem(sctx, buf, - writable ? RADEON_USAGE_READWRITE : - RADEON_USAGE_READ, - priority, true); - if (writable) - buffers->writable_mask |= 1u << slot; - else - buffers->writable_mask &= ~(1u << slot); - - buffers->enabled_mask |= 1u << slot; - sctx->descriptors_dirty |= 1u << descriptors_idx; - - util_range_add(&buf->b.b, &buf->valid_buffer_range, sbuffer->buffer_offset, - sbuffer->buffer_offset + sbuffer->buffer_size); -} - -static void si_set_shader_buffers(struct pipe_context *ctx, - enum pipe_shader_type shader, - unsigned start_slot, unsigned count, - const struct pipe_shader_buffer *sbuffers, - unsigned writable_bitmask) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; - unsigned descriptors_idx = si_const_and_shader_buffer_descriptors_idx(shader); - unsigned i; - - assert(start_slot + count <= SI_NUM_SHADER_BUFFERS); - - for (i = 0; i < count; ++i) { - const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL; - unsigned slot = si_get_shaderbuf_slot(start_slot + i); - - if (sbuffer && sbuffer->buffer) - si_resource(sbuffer->buffer)->bind_history |= PIPE_BIND_SHADER_BUFFER; - - si_set_shader_buffer(sctx, buffers, descriptors_idx, slot, sbuffer, - !!(writable_bitmask & (1u << i)), - buffers->priority); - } -} - -void si_get_shader_buffers(struct si_context *sctx, - enum pipe_shader_type shader, - uint start_slot, uint count, - struct pipe_shader_buffer *sbuf) -{ - struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; - struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader); - - for (unsigned i = 0; i < count; ++i) { - si_get_buffer_from_descriptors( - buffers, descs, - si_get_shaderbuf_slot(start_slot + i), - &sbuf[i].buffer, &sbuf[i].buffer_offset, - &sbuf[i].buffer_size); - } +static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resources *buffers, + unsigned descriptors_idx, uint slot, + const struct pipe_shader_buffer *sbuffer, bool writable, + enum radeon_bo_priority priority) +{ + struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; + uint32_t *desc = descs->list + slot * 4; + + if (!sbuffer || !sbuffer->buffer) { + pipe_resource_reference(&buffers->buffers[slot], NULL); + memset(desc, 0, sizeof(uint32_t) * 4); + buffers->enabled_mask &= ~(1u << slot); + buffers->writable_mask &= ~(1u << slot); + sctx->descriptors_dirty |= 1u << descriptors_idx; + return; + } + + struct si_resource *buf = si_resource(sbuffer->buffer); + uint64_t va = buf->gpu_address + sbuffer->buffer_offset; + + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0); + desc[2] = sbuffer->buffer_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (sctx->chip_class >= GFX10) { + desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + + pipe_resource_reference(&buffers->buffers[slot], &buf->b.b); + buffers->offsets[slot] = sbuffer->buffer_offset; + 
radeon_add_to_gfx_buffer_list_check_mem( + sctx, buf, writable ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, priority, true); + if (writable) + buffers->writable_mask |= 1u << slot; + else + buffers->writable_mask &= ~(1u << slot); + + buffers->enabled_mask |= 1u << slot; + sctx->descriptors_dirty |= 1u << descriptors_idx; + + util_range_add(&buf->b.b, &buf->valid_buffer_range, sbuffer->buffer_offset, + sbuffer->buffer_offset + sbuffer->buffer_size); +} + +static void si_set_shader_buffers(struct pipe_context *ctx, enum pipe_shader_type shader, + unsigned start_slot, unsigned count, + const struct pipe_shader_buffer *sbuffers, + unsigned writable_bitmask) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; + unsigned descriptors_idx = si_const_and_shader_buffer_descriptors_idx(shader); + unsigned i; + + assert(start_slot + count <= SI_NUM_SHADER_BUFFERS); + + for (i = 0; i < count; ++i) { + const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL; + unsigned slot = si_get_shaderbuf_slot(start_slot + i); + + if (sbuffer && sbuffer->buffer) + si_resource(sbuffer->buffer)->bind_history |= PIPE_BIND_SHADER_BUFFER; + + si_set_shader_buffer(sctx, buffers, descriptors_idx, slot, sbuffer, + !!(writable_bitmask & (1u << i)), buffers->priority); + } +} + +void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot, + uint count, struct pipe_shader_buffer *sbuf) +{ + struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; + struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader); + + for (unsigned i = 0; i < count; ++i) { + si_get_buffer_from_descriptors(buffers, descs, si_get_shaderbuf_slot(start_slot + i), + &sbuf[i].buffer, &sbuf[i].buffer_offset, &sbuf[i].buffer_size); + } } /* RING BUFFERS */ -void si_set_rw_buffer(struct si_context *sctx, - uint slot, const struct pipe_constant_buffer *input) +void si_set_rw_buffer(struct si_context *sctx, uint slot, const struct pipe_constant_buffer *input) { - si_set_constant_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, - slot, input); + si_set_constant_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, slot, input); } void si_set_rw_shader_buffer(struct si_context *sctx, uint slot, - const struct pipe_shader_buffer *sbuffer) -{ - si_set_shader_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, - slot, sbuffer, true, RADEON_PRIO_SHADER_RW_BUFFER); -} - -void si_set_ring_buffer(struct si_context *sctx, uint slot, - struct pipe_resource *buffer, - unsigned stride, unsigned num_records, - bool add_tid, bool swizzle, - unsigned element_size, unsigned index_stride, uint64_t offset) -{ - struct si_buffer_resources *buffers = &sctx->rw_buffers; - struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; - - /* The stride field in the resource descriptor has 14 bits */ - assert(stride < (1 << 14)); - - assert(slot < descs->num_elements); - pipe_resource_reference(&buffers->buffers[slot], NULL); - - if (buffer) { - uint64_t va; - - va = si_resource(buffer)->gpu_address + offset; - - switch (element_size) { - default: - assert(!"Unsupported ring buffer element size"); - case 0: - case 2: - element_size = 0; - break; - case 4: - element_size = 1; - break; - case 8: - element_size = 2; - break; - case 16: - element_size = 3; - break; - } - - switch (index_stride) { - default: - assert(!"Unsupported ring buffer index stride"); - case 0: - case 8: - 
index_stride = 0; - break; - case 16: - index_stride = 1; - break; - case 32: - index_stride = 2; - break; - case 64: - index_stride = 3; - break; - } - - if (sctx->chip_class >= GFX8 && stride) - num_records *= stride; - - /* Set the descriptor. */ - uint32_t *desc = descs->list + slot*4; - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(stride) | - S_008F04_SWIZZLE_ENABLE(swizzle); - desc[2] = num_records; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | - S_008F0C_INDEX_STRIDE(index_stride) | - S_008F0C_ADD_TID_ENABLE(add_tid); - - if (sctx->chip_class >= GFX9) - assert(!swizzle || element_size == 1); /* always 4 bytes on GFX9 */ - else - desc[3] |= S_008F0C_ELEMENT_SIZE(element_size); - - if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - } - - pipe_resource_reference(&buffers->buffers[slot], buffer); - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(buffer), - RADEON_USAGE_READWRITE, buffers->priority); - buffers->enabled_mask |= 1u << slot; - } else { - /* Clear the descriptor. */ - memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4); - buffers->enabled_mask &= ~(1u << slot); - } - - sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; + const struct pipe_shader_buffer *sbuffer) +{ + si_set_shader_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, slot, sbuffer, true, + RADEON_PRIO_SHADER_RW_BUFFER); +} + +void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource *buffer, + unsigned stride, unsigned num_records, bool add_tid, bool swizzle, + unsigned element_size, unsigned index_stride, uint64_t offset) +{ + struct si_buffer_resources *buffers = &sctx->rw_buffers; + struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; + + /* The stride field in the resource descriptor has 14 bits */ + assert(stride < (1 << 14)); + + assert(slot < descs->num_elements); + pipe_resource_reference(&buffers->buffers[slot], NULL); + + if (buffer) { + uint64_t va; + + va = si_resource(buffer)->gpu_address + offset; + + switch (element_size) { + default: + assert(!"Unsupported ring buffer element size"); + case 0: + case 2: + element_size = 0; + break; + case 4: + element_size = 1; + break; + case 8: + element_size = 2; + break; + case 16: + element_size = 3; + break; + } + + switch (index_stride) { + default: + assert(!"Unsupported ring buffer index stride"); + case 0: + case 8: + index_stride = 0; + break; + case 16: + index_stride = 1; + break; + case 32: + index_stride = 2; + break; + case 64: + index_stride = 3; + break; + } + + if (sctx->chip_class >= GFX8 && stride) + num_records *= stride; + + /* Set the descriptor. 
*/ + uint32_t *desc = descs->list + slot * 4; + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride) | + S_008F04_SWIZZLE_ENABLE(swizzle); + desc[2] = num_records; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_INDEX_STRIDE(index_stride) | S_008F0C_ADD_TID_ENABLE(add_tid); + + if (sctx->chip_class >= GFX9) + assert(!swizzle || element_size == 1); /* always 4 bytes on GFX9 */ + else + desc[3] |= S_008F0C_ELEMENT_SIZE(element_size); + + if (sctx->chip_class >= GFX10) { + desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + + pipe_resource_reference(&buffers->buffers[slot], buffer); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(buffer), RADEON_USAGE_READWRITE, + buffers->priority); + buffers->enabled_mask |= 1u << slot; + } else { + /* Clear the descriptor. */ + memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 4); + buffers->enabled_mask &= ~(1u << slot); + } + + sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; } /* INTERNAL CONST BUFFERS */ -static void si_set_polygon_stipple(struct pipe_context *ctx, - const struct pipe_poly_stipple *state) +static void si_set_polygon_stipple(struct pipe_context *ctx, const struct pipe_poly_stipple *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_constant_buffer cb = {}; - unsigned stipple[32]; - int i; + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_constant_buffer cb = {}; + unsigned stipple[32]; + int i; - for (i = 0; i < 32; i++) - stipple[i] = util_bitreverse(state->stipple[i]); + for (i = 0; i < 32; i++) + stipple[i] = util_bitreverse(state->stipple[i]); - cb.user_buffer = stipple; - cb.buffer_size = sizeof(stipple); + cb.user_buffer = stipple; + cb.buffer_size = sizeof(stipple); - si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb); + si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb); } /* TEXTURE METADATA ENABLE/DISABLE */ -static void -si_resident_handles_update_needs_color_decompress(struct si_context *sctx) +static void si_resident_handles_update_needs_color_decompress(struct si_context *sctx) { - util_dynarray_clear(&sctx->resident_tex_needs_color_decompress); - util_dynarray_clear(&sctx->resident_img_needs_color_decompress); + util_dynarray_clear(&sctx->resident_tex_needs_color_decompress); + util_dynarray_clear(&sctx->resident_img_needs_color_decompress); - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - struct pipe_resource *res = (*tex_handle)->view->texture; - struct si_texture *tex; + util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { + struct pipe_resource *res = (*tex_handle)->view->texture; + struct si_texture *tex; - if (!res || res->target == PIPE_BUFFER) - continue; + if (!res || res->target == PIPE_BUFFER) + continue; - tex = (struct si_texture *)res; - if (!color_needs_decompression(tex)) - continue; + tex = (struct si_texture *)res; + if (!color_needs_decompression(tex)) + continue; - util_dynarray_append(&sctx->resident_tex_needs_color_decompress, - struct si_texture_handle *, *tex_handle); - } + util_dynarray_append(&sctx->resident_tex_needs_color_decompress, struct 
si_texture_handle *, + *tex_handle); + } - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - struct pipe_image_view *view = &(*img_handle)->view; - struct pipe_resource *res = view->resource; - struct si_texture *tex; + util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) { + struct pipe_image_view *view = &(*img_handle)->view; + struct pipe_resource *res = view->resource; + struct si_texture *tex; - if (!res || res->target == PIPE_BUFFER) - continue; + if (!res || res->target == PIPE_BUFFER) + continue; - tex = (struct si_texture *)res; - if (!color_needs_decompression(tex)) - continue; + tex = (struct si_texture *)res; + if (!color_needs_decompression(tex)) + continue; - util_dynarray_append(&sctx->resident_img_needs_color_decompress, - struct si_image_handle *, *img_handle); - } + util_dynarray_append(&sctx->resident_img_needs_color_decompress, struct si_image_handle *, + *img_handle); + } } /* CMASK can be enabled (for fast clear) and disabled (for texture export) @@ -1620,13 +1473,13 @@ si_resident_handles_update_needs_color_decompress(struct si_context *sctx) */ void si_update_needs_color_decompress_masks(struct si_context *sctx) { - for (int i = 0; i < SI_NUM_SHADERS; ++i) { - si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]); - si_images_update_needs_color_decompress_mask(&sctx->images[i]); - si_update_shader_needs_decompress_mask(sctx, i); - } + for (int i = 0; i < SI_NUM_SHADERS; ++i) { + si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]); + si_images_update_needs_color_decompress_mask(&sctx->images[i]); + si_update_shader_needs_decompress_mask(sctx, i); + } - si_resident_handles_update_needs_color_decompress(sctx); + si_resident_handles_update_needs_color_decompress(sctx); } /* BUFFER DISCARD/INVALIDATION */ @@ -1634,33 +1487,27 @@ void si_update_needs_color_decompress_masks(struct si_context *sctx) /* Reset descriptors of buffer resources after \p buf has been invalidated. * If buf == NULL, reset all descriptors. */ -static void si_reset_buffer_resources(struct si_context *sctx, - struct si_buffer_resources *buffers, - unsigned descriptors_idx, - unsigned slot_mask, - struct pipe_resource *buf, - enum radeon_bo_priority priority) -{ - struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; - unsigned mask = buffers->enabled_mask & slot_mask; - - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_resource *buffer = buffers->buffers[i]; - - if (buffer && (!buf || buffer == buf)) { - si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], - descs->list + i*4); - sctx->descriptors_dirty |= 1u << descriptors_idx; - - radeon_add_to_gfx_buffer_list_check_mem(sctx, - si_resource(buffer), - buffers->writable_mask & (1u << i) ? 
- RADEON_USAGE_READWRITE : - RADEON_USAGE_READ, - priority, true); - } - } +static void si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_resources *buffers, + unsigned descriptors_idx, unsigned slot_mask, + struct pipe_resource *buf, enum radeon_bo_priority priority) +{ + struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; + unsigned mask = buffers->enabled_mask & slot_mask; + + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_resource *buffer = buffers->buffers[i]; + + if (buffer && (!buf || buffer == buf)) { + si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], descs->list + i * 4); + sctx->descriptors_dirty |= 1u << descriptors_idx; + + radeon_add_to_gfx_buffer_list_check_mem( + sctx, si_resource(buffer), + buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, + priority, true); + } + } } /* Update all buffer bindings where the buffer is bound, including @@ -1671,436 +1518,389 @@ static void si_reset_buffer_resources(struct si_context *sctx, */ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf) { - struct si_resource *buffer = si_resource(buf); - unsigned i, shader; - unsigned num_elems = sctx->num_vertex_elements; - - /* We changed the buffer, now we need to bind it where the old one - * was bound. This consists of 2 things: - * 1) Updating the resource descriptor and dirtying it. - * 2) Adding a relocation to the CS, so that it's usable. - */ - - /* Vertex buffers. */ - if (!buffer) { - if (num_elems) - sctx->vertex_buffers_dirty = true; - } else if (buffer->bind_history & PIPE_BIND_VERTEX_BUFFER) { - for (i = 0; i < num_elems; i++) { - int vb = sctx->vertex_elements->vertex_buffer_index[i]; - - if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) - continue; - if (!sctx->vertex_buffer[vb].buffer.resource) - continue; - - if (sctx->vertex_buffer[vb].buffer.resource == buf) { - sctx->vertex_buffers_dirty = true; - break; - } - } - } - - /* Streamout buffers. (other internal buffers can't be invalidated) */ - if (!buffer || buffer->bind_history & PIPE_BIND_STREAM_OUTPUT) { - for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) { - struct si_buffer_resources *buffers = &sctx->rw_buffers; - struct si_descriptors *descs = - &sctx->descriptors[SI_DESCS_RW_BUFFERS]; - struct pipe_resource *buffer = buffers->buffers[i]; - - if (!buffer || (buf && buffer != buf)) - continue; - - si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], - descs->list + i*4); - sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; - - radeon_add_to_gfx_buffer_list_check_mem(sctx, - si_resource(buffer), - RADEON_USAGE_WRITE, - RADEON_PRIO_SHADER_RW_BUFFER, - true); - - /* Update the streamout state. */ - if (sctx->streamout.begin_emitted) - si_emit_streamout_end(sctx); - sctx->streamout.append_bitmask = - sctx->streamout.enabled_mask; - si_streamout_buffers_dirty(sctx); - } - } - - /* Constant and shader buffers. 
*/ - if (!buffer || buffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) { - for (shader = 0; shader < SI_NUM_SHADERS; shader++) - si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], - si_const_and_shader_buffer_descriptors_idx(shader), - u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS), - buf, - sctx->const_and_shader_buffers[shader].priority_constbuf); - } - - if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) { - for (shader = 0; shader < SI_NUM_SHADERS; shader++) - si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], - si_const_and_shader_buffer_descriptors_idx(shader), - u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS), - buf, - sctx->const_and_shader_buffers[shader].priority); - } - - if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) { - /* Texture buffers - update bindings. */ - for (shader = 0; shader < SI_NUM_SHADERS; shader++) { - struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_descriptors *descs = - si_sampler_and_image_descriptors(sctx, shader); - unsigned mask = samplers->enabled_mask; - - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_resource *buffer = samplers->views[i]->texture; - - if (buffer && buffer->target == PIPE_BUFFER && - (!buf || buffer == buf)) { - unsigned desc_slot = si_get_sampler_slot(i); - - si_set_buf_desc_address(si_resource(buffer), - samplers->views[i]->u.buf.offset, - descs->list + desc_slot * 16 + 4); - sctx->descriptors_dirty |= - 1u << si_sampler_and_image_descriptors_idx(shader); - - radeon_add_to_gfx_buffer_list_check_mem( - sctx, si_resource(buffer), - RADEON_USAGE_READ, - RADEON_PRIO_SAMPLER_BUFFER, true); - } - } - } - } - - /* Shader images */ - if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_IMAGE) { - for (shader = 0; shader < SI_NUM_SHADERS; ++shader) { - struct si_images *images = &sctx->images[shader]; - struct si_descriptors *descs = - si_sampler_and_image_descriptors(sctx, shader); - unsigned mask = images->enabled_mask; - - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_resource *buffer = images->views[i].resource; - - if (buffer && buffer->target == PIPE_BUFFER && - (!buf || buffer == buf)) { - unsigned desc_slot = si_get_image_slot(i); - - if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE) - si_mark_image_range_valid(&images->views[i]); - - si_set_buf_desc_address(si_resource(buffer), - images->views[i].u.buf.offset, - descs->list + desc_slot * 8 + 4); - sctx->descriptors_dirty |= - 1u << si_sampler_and_image_descriptors_idx(shader); - - radeon_add_to_gfx_buffer_list_check_mem( - sctx, si_resource(buffer), - RADEON_USAGE_READWRITE, - RADEON_PRIO_SAMPLER_BUFFER, true); - } - } - } - } - - /* Bindless texture handles */ - if (!buffer || buffer->texture_handle_allocated) { - struct si_descriptors *descs = &sctx->bindless_descriptors; - - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - struct pipe_sampler_view *view = (*tex_handle)->view; - unsigned desc_slot = (*tex_handle)->desc_slot; - struct pipe_resource *buffer = view->texture; - - if (buffer && buffer->target == PIPE_BUFFER && - (!buf || buffer == buf)) { - si_set_buf_desc_address(si_resource(buffer), - view->u.buf.offset, - descs->list + - desc_slot * 16 + 4); - - (*tex_handle)->desc_dirty = true; - sctx->bindless_descriptors_dirty = true; - - radeon_add_to_gfx_buffer_list_check_mem( - sctx, si_resource(buffer), - RADEON_USAGE_READ, - RADEON_PRIO_SAMPLER_BUFFER, true); - } - } - } - - /* Bindless 
image handles */ - if (!buffer || buffer->image_handle_allocated) { - struct si_descriptors *descs = &sctx->bindless_descriptors; - - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - struct pipe_image_view *view = &(*img_handle)->view; - unsigned desc_slot = (*img_handle)->desc_slot; - struct pipe_resource *buffer = view->resource; - - if (buffer && buffer->target == PIPE_BUFFER && - (!buf || buffer == buf)) { - if (view->access & PIPE_IMAGE_ACCESS_WRITE) - si_mark_image_range_valid(view); - - si_set_buf_desc_address(si_resource(buffer), - view->u.buf.offset, - descs->list + - desc_slot * 16 + 4); - - (*img_handle)->desc_dirty = true; - sctx->bindless_descriptors_dirty = true; - - radeon_add_to_gfx_buffer_list_check_mem( - sctx, si_resource(buffer), - RADEON_USAGE_READWRITE, - RADEON_PRIO_SAMPLER_BUFFER, true); - } - } - } - - if (buffer) { - /* Do the same for other contexts. They will invoke this function - * with buffer == NULL. - */ - unsigned new_counter = p_atomic_inc_return(&sctx->screen->dirty_buf_counter); - - /* Skip the update for the current context, because we have already updated - * the buffer bindings. - */ - if (new_counter == sctx->last_dirty_buf_counter + 1) - sctx->last_dirty_buf_counter = new_counter; - } -} - -static void si_upload_bindless_descriptor(struct si_context *sctx, - unsigned desc_slot, - unsigned num_dwords) -{ - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot_offset = desc_slot * 16; - uint32_t *data; - uint64_t va; - - data = desc->list + desc_slot_offset; - va = desc->gpu_address + desc_slot_offset * 4; - - si_cp_write_data(sctx, desc->buffer, va - desc->buffer->gpu_address, - num_dwords * 4, V_370_TC_L2, V_370_ME, data); + struct si_resource *buffer = si_resource(buf); + unsigned i, shader; + unsigned num_elems = sctx->num_vertex_elements; + + /* We changed the buffer, now we need to bind it where the old one + * was bound. This consists of 2 things: + * 1) Updating the resource descriptor and dirtying it. + * 2) Adding a relocation to the CS, so that it's usable. + */ + + /* Vertex buffers. */ + if (!buffer) { + if (num_elems) + sctx->vertex_buffers_dirty = true; + } else if (buffer->bind_history & PIPE_BIND_VERTEX_BUFFER) { + for (i = 0; i < num_elems; i++) { + int vb = sctx->vertex_elements->vertex_buffer_index[i]; + + if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) + continue; + if (!sctx->vertex_buffer[vb].buffer.resource) + continue; + + if (sctx->vertex_buffer[vb].buffer.resource == buf) { + sctx->vertex_buffers_dirty = true; + break; + } + } + } + + /* Streamout buffers. (other internal buffers can't be invalidated) */ + if (!buffer || buffer->bind_history & PIPE_BIND_STREAM_OUTPUT) { + for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) { + struct si_buffer_resources *buffers = &sctx->rw_buffers; + struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; + struct pipe_resource *buffer = buffers->buffers[i]; + + if (!buffer || (buf && buffer != buf)) + continue; + + si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], descs->list + i * 4); + sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; + + radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_WRITE, + RADEON_PRIO_SHADER_RW_BUFFER, true); + + /* Update the streamout state. 
*/ + if (sctx->streamout.begin_emitted) + si_emit_streamout_end(sctx); + sctx->streamout.append_bitmask = sctx->streamout.enabled_mask; + si_streamout_buffers_dirty(sctx); + } + } + + /* Constant and shader buffers. */ + if (!buffer || buffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) { + for (shader = 0; shader < SI_NUM_SHADERS; shader++) + si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors_idx(shader), + u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS), + buf, sctx->const_and_shader_buffers[shader].priority_constbuf); + } + + if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) { + for (shader = 0; shader < SI_NUM_SHADERS; shader++) + si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors_idx(shader), + u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS), buf, + sctx->const_and_shader_buffers[shader].priority); + } + + if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) { + /* Texture buffers - update bindings. */ + for (shader = 0; shader < SI_NUM_SHADERS; shader++) { + struct si_samplers *samplers = &sctx->samplers[shader]; + struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); + unsigned mask = samplers->enabled_mask; + + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_resource *buffer = samplers->views[i]->texture; + + if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) { + unsigned desc_slot = si_get_sampler_slot(i); + + si_set_buf_desc_address(si_resource(buffer), samplers->views[i]->u.buf.offset, + descs->list + desc_slot * 16 + 4); + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + + radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ, + RADEON_PRIO_SAMPLER_BUFFER, true); + } + } + } + } + + /* Shader images */ + if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_IMAGE) { + for (shader = 0; shader < SI_NUM_SHADERS; ++shader) { + struct si_images *images = &sctx->images[shader]; + struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); + unsigned mask = images->enabled_mask; + + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_resource *buffer = images->views[i].resource; + + if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) { + unsigned desc_slot = si_get_image_slot(i); + + if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE) + si_mark_image_range_valid(&images->views[i]); + + si_set_buf_desc_address(si_resource(buffer), images->views[i].u.buf.offset, + descs->list + desc_slot * 8 + 4); + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + + radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), + RADEON_USAGE_READWRITE, + RADEON_PRIO_SAMPLER_BUFFER, true); + } + } + } + } + + /* Bindless texture handles */ + if (!buffer || buffer->texture_handle_allocated) { + struct si_descriptors *descs = &sctx->bindless_descriptors; + + util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { + struct pipe_sampler_view *view = (*tex_handle)->view; + unsigned desc_slot = (*tex_handle)->desc_slot; + struct pipe_resource *buffer = view->texture; + + if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) { + si_set_buf_desc_address(si_resource(buffer), view->u.buf.offset, + descs->list + desc_slot * 16 + 4); + + (*tex_handle)->desc_dirty = true; + 
sctx->bindless_descriptors_dirty = true; + + radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ, + RADEON_PRIO_SAMPLER_BUFFER, true); + } + } + } + + /* Bindless image handles */ + if (!buffer || buffer->image_handle_allocated) { + struct si_descriptors *descs = &sctx->bindless_descriptors; + + util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) { + struct pipe_image_view *view = &(*img_handle)->view; + unsigned desc_slot = (*img_handle)->desc_slot; + struct pipe_resource *buffer = view->resource; + + if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) { + if (view->access & PIPE_IMAGE_ACCESS_WRITE) + si_mark_image_range_valid(view); + + si_set_buf_desc_address(si_resource(buffer), view->u.buf.offset, + descs->list + desc_slot * 16 + 4); + + (*img_handle)->desc_dirty = true; + sctx->bindless_descriptors_dirty = true; + + radeon_add_to_gfx_buffer_list_check_mem( + sctx, si_resource(buffer), RADEON_USAGE_READWRITE, RADEON_PRIO_SAMPLER_BUFFER, true); + } + } + } + + if (buffer) { + /* Do the same for other contexts. They will invoke this function + * with buffer == NULL. + */ + unsigned new_counter = p_atomic_inc_return(&sctx->screen->dirty_buf_counter); + + /* Skip the update for the current context, because we have already updated + * the buffer bindings. + */ + if (new_counter == sctx->last_dirty_buf_counter + 1) + sctx->last_dirty_buf_counter = new_counter; + } +} + +static void si_upload_bindless_descriptor(struct si_context *sctx, unsigned desc_slot, + unsigned num_dwords) +{ + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot_offset = desc_slot * 16; + uint32_t *data; + uint64_t va; + + data = desc->list + desc_slot_offset; + va = desc->gpu_address + desc_slot_offset * 4; + + si_cp_write_data(sctx, desc->buffer, va - desc->buffer->gpu_address, num_dwords * 4, V_370_TC_L2, + V_370_ME, data); } static void si_upload_bindless_descriptors(struct si_context *sctx) { - if (!sctx->bindless_descriptors_dirty) - return; + if (!sctx->bindless_descriptors_dirty) + return; - /* Wait for graphics/compute to be idle before updating the resident - * descriptors directly in memory, in case the GPU is using them. - */ - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; - sctx->emit_cache_flush(sctx); + /* Wait for graphics/compute to be idle before updating the resident + * descriptors directly in memory, in case the GPU is using them. 
+ */ + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; + sctx->emit_cache_flush(sctx); - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - unsigned desc_slot = (*tex_handle)->desc_slot; + util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { + unsigned desc_slot = (*tex_handle)->desc_slot; - if (!(*tex_handle)->desc_dirty) - continue; + if (!(*tex_handle)->desc_dirty) + continue; - si_upload_bindless_descriptor(sctx, desc_slot, 16); - (*tex_handle)->desc_dirty = false; - } + si_upload_bindless_descriptor(sctx, desc_slot, 16); + (*tex_handle)->desc_dirty = false; + } - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - unsigned desc_slot = (*img_handle)->desc_slot; + util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) { + unsigned desc_slot = (*img_handle)->desc_slot; - if (!(*img_handle)->desc_dirty) - continue; + if (!(*img_handle)->desc_dirty) + continue; - si_upload_bindless_descriptor(sctx, desc_slot, 8); - (*img_handle)->desc_dirty = false; - } + si_upload_bindless_descriptor(sctx, desc_slot, 8); + (*img_handle)->desc_dirty = false; + } - /* Invalidate L1 because it doesn't know that L2 changed. */ - sctx->flags |= SI_CONTEXT_INV_SCACHE; - sctx->emit_cache_flush(sctx); + /* Invalidate L1 because it doesn't know that L2 changed. */ + sctx->flags |= SI_CONTEXT_INV_SCACHE; + sctx->emit_cache_flush(sctx); - sctx->bindless_descriptors_dirty = false; + sctx->bindless_descriptors_dirty = false; } /* Update mutable image descriptor fields of all resident textures. */ static void si_update_bindless_texture_descriptor(struct si_context *sctx, - struct si_texture_handle *tex_handle) + struct si_texture_handle *tex_handle) { - struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view; - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot_offset = tex_handle->desc_slot * 16; - uint32_t desc_list[16]; + struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view; + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot_offset = tex_handle->desc_slot * 16; + uint32_t desc_list[16]; - if (sview->base.texture->target == PIPE_BUFFER) - return; + if (sview->base.texture->target == PIPE_BUFFER) + return; - memcpy(desc_list, desc->list + desc_slot_offset, sizeof(desc_list)); - si_set_sampler_view_desc(sctx, sview, &tex_handle->sstate, - desc->list + desc_slot_offset); + memcpy(desc_list, desc->list + desc_slot_offset, sizeof(desc_list)); + si_set_sampler_view_desc(sctx, sview, &tex_handle->sstate, desc->list + desc_slot_offset); - if (memcmp(desc_list, desc->list + desc_slot_offset, - sizeof(desc_list))) { - tex_handle->desc_dirty = true; - sctx->bindless_descriptors_dirty = true; - } + if (memcmp(desc_list, desc->list + desc_slot_offset, sizeof(desc_list))) { + tex_handle->desc_dirty = true; + sctx->bindless_descriptors_dirty = true; + } } static void si_update_bindless_image_descriptor(struct si_context *sctx, - struct si_image_handle *img_handle) + struct si_image_handle *img_handle) { - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot_offset = img_handle->desc_slot * 16; - struct pipe_image_view *view = &img_handle->view; - struct pipe_resource *res = view->resource; - uint32_t image_desc[16]; - unsigned desc_size = (res->nr_samples >= 2 ? 
16 : 8) * 4; + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot_offset = img_handle->desc_slot * 16; + struct pipe_image_view *view = &img_handle->view; + struct pipe_resource *res = view->resource; + uint32_t image_desc[16]; + unsigned desc_size = (res->nr_samples >= 2 ? 16 : 8) * 4; - if (res->target == PIPE_BUFFER) - return; + if (res->target == PIPE_BUFFER) + return; - memcpy(image_desc, desc->list + desc_slot_offset, desc_size); - si_set_shader_image_desc(sctx, view, true, - desc->list + desc_slot_offset, - desc->list + desc_slot_offset + 8); + memcpy(image_desc, desc->list + desc_slot_offset, desc_size); + si_set_shader_image_desc(sctx, view, true, desc->list + desc_slot_offset, + desc->list + desc_slot_offset + 8); - if (memcmp(image_desc, desc->list + desc_slot_offset, desc_size)) { - img_handle->desc_dirty = true; - sctx->bindless_descriptors_dirty = true; - } + if (memcmp(image_desc, desc->list + desc_slot_offset, desc_size)) { + img_handle->desc_dirty = true; + sctx->bindless_descriptors_dirty = true; + } } static void si_update_all_resident_texture_descriptors(struct si_context *sctx) { - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - si_update_bindless_texture_descriptor(sctx, *tex_handle); - } + util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { + si_update_bindless_texture_descriptor(sctx, *tex_handle); + } - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - si_update_bindless_image_descriptor(sctx, *img_handle); - } + util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) { + si_update_bindless_image_descriptor(sctx, *img_handle); + } - si_upload_bindless_descriptors(sctx); + si_upload_bindless_descriptors(sctx); } /* Update mutable image descriptor fields of all bound textures. */ void si_update_all_texture_descriptors(struct si_context *sctx) { - unsigned shader; + unsigned shader; - for (shader = 0; shader < SI_NUM_SHADERS; shader++) { - struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_images *images = &sctx->images[shader]; - unsigned mask; + for (shader = 0; shader < SI_NUM_SHADERS; shader++) { + struct si_samplers *samplers = &sctx->samplers[shader]; + struct si_images *images = &sctx->images[shader]; + unsigned mask; - /* Images. */ - mask = images->enabled_mask; - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_image_view *view = &images->views[i]; + /* Images. */ + mask = images->enabled_mask; + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_image_view *view = &images->views[i]; - if (!view->resource || - view->resource->target == PIPE_BUFFER) - continue; + if (!view->resource || view->resource->target == PIPE_BUFFER) + continue; - si_set_shader_image(sctx, shader, i, view, true); - } + si_set_shader_image(sctx, shader, i, view, true); + } - /* Sampler views. */ - mask = samplers->enabled_mask; - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_sampler_view *view = samplers->views[i]; + /* Sampler views. 
*/ + mask = samplers->enabled_mask; + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_sampler_view *view = samplers->views[i]; - if (!view || - !view->texture || - view->texture->target == PIPE_BUFFER) - continue; + if (!view || !view->texture || view->texture->target == PIPE_BUFFER) + continue; - si_set_sampler_view(sctx, shader, i, - samplers->views[i], true); - } + si_set_sampler_view(sctx, shader, i, samplers->views[i], true); + } - si_update_shader_needs_decompress_mask(sctx, shader); - } + si_update_shader_needs_decompress_mask(sctx, shader); + } - si_update_all_resident_texture_descriptors(sctx); - si_update_ps_colorbuf0_slot(sctx); + si_update_all_resident_texture_descriptors(sctx); + si_update_ps_colorbuf0_slot(sctx); } /* SHADER USER DATA */ -static void si_mark_shader_pointers_dirty(struct si_context *sctx, - unsigned shader) +static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shader) { - sctx->shader_pointers_dirty |= - u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, - SI_NUM_SHADER_DESCS); + sctx->shader_pointers_dirty |= + u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, SI_NUM_SHADER_DESCS); - if (shader == PIPE_SHADER_VERTEX) { - sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; - sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 && - sctx->screen->num_vbos_in_user_sgprs; - } + if (shader == PIPE_SHADER_VERTEX) { + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + sctx->vertex_buffer_user_sgprs_dirty = + sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs; + } - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); } static void si_shader_pointers_begin_new_cs(struct si_context *sctx) { - sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS); - sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; - sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 && - sctx->screen->num_vbos_in_user_sgprs; - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; - sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; + sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS); + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + sctx->vertex_buffer_user_sgprs_dirty = + sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs; + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; + sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; } /* Set a base register address for user data constants in the given shader. * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*. 
*/ -static void si_set_user_data_base(struct si_context *sctx, - unsigned shader, uint32_t new_base) +static void si_set_user_data_base(struct si_context *sctx, unsigned shader, uint32_t new_base) { - uint32_t *base = &sctx->shader_pointers.sh_base[shader]; + uint32_t *base = &sctx->shader_pointers.sh_base[shader]; - if (*base != new_base) { - *base = new_base; + if (*base != new_base) { + *base = new_base; - if (new_base) - si_mark_shader_pointers_dirty(sctx, shader); + if (new_base) + si_mark_shader_pointers_dirty(sctx, shader); - /* Any change in enabled shader stages requires re-emitting - * the VS state SGPR, because it contains the clamp_vertex_color - * state, which can be done in VS, TES, and GS. - */ - sctx->last_vs_state = ~0; - } + /* Any change in enabled shader stages requires re-emitting + * the VS state SGPR, because it contains the clamp_vertex_color + * state, which can be done in VS, TES, and GS. + */ + sctx->last_vs_state = ~0; + } } /* This must be called when these are changed between enabled and disabled @@ -2110,922 +1910,822 @@ static void si_set_user_data_base(struct si_context *sctx, */ void si_shader_change_notify(struct si_context *sctx) { - /* VS can be bound as VS, ES, or LS. */ - if (sctx->tes_shader.cso) { - if (sctx->chip_class >= GFX10) { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B430_SPI_SHADER_USER_DATA_HS_0); - } else if (sctx->chip_class == GFX9) { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B430_SPI_SHADER_USER_DATA_LS_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B530_SPI_SHADER_USER_DATA_LS_0); - } - } else if (sctx->chip_class >= GFX10) { - if (sctx->ngg || sctx->gs_shader.cso) { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - } else if (sctx->gs_shader.cso) { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B330_SPI_SHADER_USER_DATA_ES_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - - /* TES can be bound as ES, VS, or not bound. 
*/ - if (sctx->tes_shader.cso) { - if (sctx->chip_class >= GFX10) { - if (sctx->ngg || sctx->gs_shader.cso) { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - } else if (sctx->gs_shader.cso) { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, - R_00B330_SPI_SHADER_USER_DATA_ES_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - } else { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0); - } -} - -static void si_emit_shader_pointer_head(struct radeon_cmdbuf *cs, - unsigned sh_offset, - unsigned pointer_count) -{ - radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count, 0)); - radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2); -} - -static void si_emit_shader_pointer_body(struct si_screen *sscreen, - struct radeon_cmdbuf *cs, - uint64_t va) -{ - radeon_emit(cs, va); - - assert(va == 0 || (va >> 32) == sscreen->info.address32_hi); -} - -static void si_emit_shader_pointer(struct si_context *sctx, - struct si_descriptors *desc, - unsigned sh_base) -{ - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned sh_offset = sh_base + desc->shader_userdata_offset; - - si_emit_shader_pointer_head(cs, sh_offset, 1); - si_emit_shader_pointer_body(sctx->screen, cs, desc->gpu_address); -} - -static void si_emit_consecutive_shader_pointers(struct si_context *sctx, - unsigned pointer_mask, - unsigned sh_base) -{ - if (!sh_base) - return; - - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned mask = sctx->shader_pointers_dirty & pointer_mask; - - while (mask) { - int start, count; - u_bit_scan_consecutive_range(&mask, &start, &count); - - struct si_descriptors *descs = &sctx->descriptors[start]; - unsigned sh_offset = sh_base + descs->shader_userdata_offset; - - si_emit_shader_pointer_head(cs, sh_offset, count); - for (int i = 0; i < count; i++) - si_emit_shader_pointer_body(sctx->screen, cs, - descs[i].gpu_address); - } -} - -static void si_emit_global_shader_pointers(struct si_context *sctx, - struct si_descriptors *descs) -{ - if (sctx->chip_class >= GFX10) { - si_emit_shader_pointer(sctx, descs, - R_00B030_SPI_SHADER_USER_DATA_PS_0); - /* HW VS stage only used in non-NGG mode. */ - si_emit_shader_pointer(sctx, descs, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - si_emit_shader_pointer(sctx, descs, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - si_emit_shader_pointer(sctx, descs, - R_00B430_SPI_SHADER_USER_DATA_HS_0); - return; - } else if (sctx->chip_class == GFX9) { - /* Broadcast it to all shader stages. */ - si_emit_shader_pointer(sctx, descs, - R_00B530_SPI_SHADER_USER_DATA_COMMON_0); - return; - } - - si_emit_shader_pointer(sctx, descs, - R_00B030_SPI_SHADER_USER_DATA_PS_0); - si_emit_shader_pointer(sctx, descs, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - si_emit_shader_pointer(sctx, descs, - R_00B330_SPI_SHADER_USER_DATA_ES_0); - si_emit_shader_pointer(sctx, descs, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - si_emit_shader_pointer(sctx, descs, - R_00B430_SPI_SHADER_USER_DATA_HS_0); - si_emit_shader_pointer(sctx, descs, - R_00B530_SPI_SHADER_USER_DATA_LS_0); + /* VS can be bound as VS, ES, or LS. 
*/ + if (sctx->tes_shader.cso) { + if (sctx->chip_class >= GFX10) { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B430_SPI_SHADER_USER_DATA_HS_0); + } else if (sctx->chip_class == GFX9) { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B430_SPI_SHADER_USER_DATA_LS_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B530_SPI_SHADER_USER_DATA_LS_0); + } + } else if (sctx->chip_class >= GFX10) { + if (sctx->ngg || sctx->gs_shader.cso) { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + } else if (sctx->gs_shader.cso) { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B330_SPI_SHADER_USER_DATA_ES_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + + /* TES can be bound as ES, VS, or not bound. */ + if (sctx->tes_shader.cso) { + if (sctx->chip_class >= GFX10) { + if (sctx->ngg || sctx->gs_shader.cso) { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B230_SPI_SHADER_USER_DATA_GS_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + } else if (sctx->gs_shader.cso) { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B330_SPI_SHADER_USER_DATA_ES_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + } else { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0); + } +} + +static void si_emit_shader_pointer_head(struct radeon_cmdbuf *cs, unsigned sh_offset, + unsigned pointer_count) +{ + radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count, 0)); + radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2); +} + +static void si_emit_shader_pointer_body(struct si_screen *sscreen, struct radeon_cmdbuf *cs, + uint64_t va) +{ + radeon_emit(cs, va); + + assert(va == 0 || (va >> 32) == sscreen->info.address32_hi); +} + +static void si_emit_shader_pointer(struct si_context *sctx, struct si_descriptors *desc, + unsigned sh_base) +{ + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned sh_offset = sh_base + desc->shader_userdata_offset; + + si_emit_shader_pointer_head(cs, sh_offset, 1); + si_emit_shader_pointer_body(sctx->screen, cs, desc->gpu_address); +} + +static void si_emit_consecutive_shader_pointers(struct si_context *sctx, unsigned pointer_mask, + unsigned sh_base) +{ + if (!sh_base) + return; + + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned mask = sctx->shader_pointers_dirty & pointer_mask; + + while (mask) { + int start, count; + u_bit_scan_consecutive_range(&mask, &start, &count); + + struct si_descriptors *descs = &sctx->descriptors[start]; + unsigned sh_offset = sh_base + descs->shader_userdata_offset; + + si_emit_shader_pointer_head(cs, sh_offset, count); + for (int i = 0; i < count; i++) + si_emit_shader_pointer_body(sctx->screen, cs, descs[i].gpu_address); + } +} + +static void si_emit_global_shader_pointers(struct si_context *sctx, struct si_descriptors *descs) +{ + if (sctx->chip_class >= GFX10) { + si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0); + /* HW VS stage only used in non-NGG mode. */ + si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0); + si_emit_shader_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0); + si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0); + return; + } else if (sctx->chip_class == GFX9) { + /* Broadcast it to all shader stages. 
*/ + si_emit_shader_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_COMMON_0); + return; + } + + si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0); + si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0); + si_emit_shader_pointer(sctx, descs, R_00B330_SPI_SHADER_USER_DATA_ES_0); + si_emit_shader_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0); + si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0); + si_emit_shader_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_LS_0); } void si_emit_graphics_shader_pointers(struct si_context *sctx) { - uint32_t *sh_base = sctx->shader_pointers.sh_base; - - if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) { - si_emit_global_shader_pointers(sctx, - &sctx->descriptors[SI_DESCS_RW_BUFFERS]); - } - - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX), - sh_base[PIPE_SHADER_VERTEX]); - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL), - sh_base[PIPE_SHADER_TESS_EVAL]); - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT), - sh_base[PIPE_SHADER_FRAGMENT]); - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL), - sh_base[PIPE_SHADER_TESS_CTRL]); - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY), - sh_base[PIPE_SHADER_GEOMETRY]); - - sctx->shader_pointers_dirty &= - ~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE); - - if (sctx->vertex_buffer_pointer_dirty && sctx->num_vertex_elements) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - /* Find the location of the VB descriptor pointer. */ - unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR; - if (sctx->chip_class >= GFX9) { - if (sctx->tes_shader.cso) - sh_dw_offset = GFX9_TCS_NUM_USER_SGPR; - else if (sctx->gs_shader.cso) - sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR; - } - - unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4; - si_emit_shader_pointer_head(cs, sh_offset, 1); - si_emit_shader_pointer_body(sctx->screen, cs, - sctx->vb_descriptors_buffer->gpu_address + - sctx->vb_descriptors_offset); - sctx->vertex_buffer_pointer_dirty = false; - } - - if (sctx->vertex_buffer_user_sgprs_dirty && - sctx->num_vertex_elements && - sctx->screen->num_vbos_in_user_sgprs) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned num_desc = MIN2(sctx->num_vertex_elements, - sctx->screen->num_vbos_in_user_sgprs); - unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4; - - si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4); - radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4); - sctx->vertex_buffer_user_sgprs_dirty = false; - } - - if (sctx->graphics_bindless_pointer_dirty) { - si_emit_global_shader_pointers(sctx, - &sctx->bindless_descriptors); - sctx->graphics_bindless_pointer_dirty = false; - } + uint32_t *sh_base = sctx->shader_pointers.sh_base; + + if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) { + si_emit_global_shader_pointers(sctx, &sctx->descriptors[SI_DESCS_RW_BUFFERS]); + } + + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX), + sh_base[PIPE_SHADER_VERTEX]); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL), + sh_base[PIPE_SHADER_TESS_EVAL]); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT), + sh_base[PIPE_SHADER_FRAGMENT]); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL), + sh_base[PIPE_SHADER_TESS_CTRL]); + 
si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY), + sh_base[PIPE_SHADER_GEOMETRY]); + + sctx->shader_pointers_dirty &= ~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE); + + if (sctx->vertex_buffer_pointer_dirty && sctx->num_vertex_elements) { + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + /* Find the location of the VB descriptor pointer. */ + unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR; + if (sctx->chip_class >= GFX9) { + if (sctx->tes_shader.cso) + sh_dw_offset = GFX9_TCS_NUM_USER_SGPR; + else if (sctx->gs_shader.cso) + sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR; + } + + unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4; + si_emit_shader_pointer_head(cs, sh_offset, 1); + si_emit_shader_pointer_body( + sctx->screen, cs, sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset); + sctx->vertex_buffer_pointer_dirty = false; + } + + if (sctx->vertex_buffer_user_sgprs_dirty && sctx->num_vertex_elements && + sctx->screen->num_vbos_in_user_sgprs) { + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned num_desc = MIN2(sctx->num_vertex_elements, sctx->screen->num_vbos_in_user_sgprs); + unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4; + + si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4); + radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4); + sctx->vertex_buffer_user_sgprs_dirty = false; + } + + if (sctx->graphics_bindless_pointer_dirty) { + si_emit_global_shader_pointers(sctx, &sctx->bindless_descriptors); + sctx->graphics_bindless_pointer_dirty = false; + } } void si_emit_compute_shader_pointers(struct si_context *sctx) { - unsigned base = R_00B900_COMPUTE_USER_DATA_0; + unsigned base = R_00B900_COMPUTE_USER_DATA_0; - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE), - R_00B900_COMPUTE_USER_DATA_0); - sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE), + R_00B900_COMPUTE_USER_DATA_0); + sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE); - if (sctx->compute_bindless_pointer_dirty) { - si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base); - sctx->compute_bindless_pointer_dirty = false; - } + if (sctx->compute_bindless_pointer_dirty) { + si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base); + sctx->compute_bindless_pointer_dirty = false; + } } /* BINDLESS */ -static void si_init_bindless_descriptors(struct si_context *sctx, - struct si_descriptors *desc, - short shader_userdata_rel_index, - unsigned num_elements) +static void si_init_bindless_descriptors(struct si_context *sctx, struct si_descriptors *desc, + short shader_userdata_rel_index, unsigned num_elements) { - ASSERTED unsigned desc_slot; + ASSERTED unsigned desc_slot; - si_init_descriptors(desc, shader_userdata_rel_index, 16, num_elements); - sctx->bindless_descriptors.num_active_slots = num_elements; + si_init_descriptors(desc, shader_userdata_rel_index, 16, num_elements); + sctx->bindless_descriptors.num_active_slots = num_elements; - /* The first bindless descriptor is stored at slot 1, because 0 is not - * considered to be a valid handle. - */ - sctx->num_bindless_descriptors = 1; + /* The first bindless descriptor is stored at slot 1, because 0 is not + * considered to be a valid handle. + */ + sctx->num_bindless_descriptors = 1; - /* Track which bindless slots are used (or not). 
*/ - util_idalloc_init(&sctx->bindless_used_slots); - util_idalloc_resize(&sctx->bindless_used_slots, num_elements); + /* Track which bindless slots are used (or not). */ + util_idalloc_init(&sctx->bindless_used_slots); + util_idalloc_resize(&sctx->bindless_used_slots, num_elements); - /* Reserve slot 0 because it's an invalid handle for bindless. */ - desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); - assert(desc_slot == 0); + /* Reserve slot 0 because it's an invalid handle for bindless. */ + desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); + assert(desc_slot == 0); } static void si_release_bindless_descriptors(struct si_context *sctx) { - si_release_descriptors(&sctx->bindless_descriptors); - util_idalloc_fini(&sctx->bindless_used_slots); + si_release_descriptors(&sctx->bindless_descriptors); + util_idalloc_fini(&sctx->bindless_used_slots); } static unsigned si_get_first_free_bindless_slot(struct si_context *sctx) { - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot; + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot; - desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); - if (desc_slot >= desc->num_elements) { - /* The array of bindless descriptors is full, resize it. */ - unsigned slot_size = desc->element_dw_size * 4; - unsigned new_num_elements = desc->num_elements * 2; + desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); + if (desc_slot >= desc->num_elements) { + /* The array of bindless descriptors is full, resize it. */ + unsigned slot_size = desc->element_dw_size * 4; + unsigned new_num_elements = desc->num_elements * 2; - desc->list = REALLOC(desc->list, desc->num_elements * slot_size, - new_num_elements * slot_size); - desc->num_elements = new_num_elements; - desc->num_active_slots = new_num_elements; - } + desc->list = + REALLOC(desc->list, desc->num_elements * slot_size, new_num_elements * slot_size); + desc->num_elements = new_num_elements; + desc->num_active_slots = new_num_elements; + } - assert(desc_slot); - return desc_slot; + assert(desc_slot); + return desc_slot; } -static unsigned -si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list, - unsigned size) +static unsigned si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list, + unsigned size) { - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot, desc_slot_offset; + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot, desc_slot_offset; - /* Find a free slot. */ - desc_slot = si_get_first_free_bindless_slot(sctx); + /* Find a free slot. */ + desc_slot = si_get_first_free_bindless_slot(sctx); - /* For simplicity, sampler and image bindless descriptors use fixed - * 16-dword slots for now. Image descriptors only need 8-dword but this - * doesn't really matter because no real apps use image handles. - */ - desc_slot_offset = desc_slot * 16; + /* For simplicity, sampler and image bindless descriptors use fixed + * 16-dword slots for now. Image descriptors only need 8-dword but this + * doesn't really matter because no real apps use image handles. + */ + desc_slot_offset = desc_slot * 16; - /* Copy the descriptor into the array. */ - memcpy(desc->list + desc_slot_offset, desc_list, size); + /* Copy the descriptor into the array. */ + memcpy(desc->list + desc_slot_offset, desc_list, size); - /* Re-upload the whole array of bindless descriptors into a new buffer. 
- */ - if (!si_upload_descriptors(sctx, desc)) - return 0; + /* Re-upload the whole array of bindless descriptors into a new buffer. + */ + if (!si_upload_descriptors(sctx, desc)) + return 0; - /* Make sure to re-emit the shader pointers for all stages. */ - sctx->graphics_bindless_pointer_dirty = true; - sctx->compute_bindless_pointer_dirty = true; + /* Make sure to re-emit the shader pointers for all stages. */ + sctx->graphics_bindless_pointer_dirty = true; + sctx->compute_bindless_pointer_dirty = true; - return desc_slot; + return desc_slot; } -static void si_update_bindless_buffer_descriptor(struct si_context *sctx, - unsigned desc_slot, - struct pipe_resource *resource, - uint64_t offset, - bool *desc_dirty) +static void si_update_bindless_buffer_descriptor(struct si_context *sctx, unsigned desc_slot, + struct pipe_resource *resource, uint64_t offset, + bool *desc_dirty) { - struct si_descriptors *desc = &sctx->bindless_descriptors; - struct si_resource *buf = si_resource(resource); - unsigned desc_slot_offset = desc_slot * 16; - uint32_t *desc_list = desc->list + desc_slot_offset + 4; - uint64_t old_desc_va; + struct si_descriptors *desc = &sctx->bindless_descriptors; + struct si_resource *buf = si_resource(resource); + unsigned desc_slot_offset = desc_slot * 16; + uint32_t *desc_list = desc->list + desc_slot_offset + 4; + uint64_t old_desc_va; - assert(resource->target == PIPE_BUFFER); + assert(resource->target == PIPE_BUFFER); - /* Retrieve the old buffer addr from the descriptor. */ - old_desc_va = si_desc_extract_buffer_address(desc_list); + /* Retrieve the old buffer addr from the descriptor. */ + old_desc_va = si_desc_extract_buffer_address(desc_list); - if (old_desc_va != buf->gpu_address + offset) { - /* The buffer has been invalidated when the handle wasn't - * resident, update the descriptor and the dirty flag. - */ - si_set_buf_desc_address(buf, offset, &desc_list[0]); + if (old_desc_va != buf->gpu_address + offset) { + /* The buffer has been invalidated when the handle wasn't + * resident, update the descriptor and the dirty flag. 
+ */ + si_set_buf_desc_address(buf, offset, &desc_list[0]); - *desc_dirty = true; - } + *desc_dirty = true; + } } -static uint64_t si_create_texture_handle(struct pipe_context *ctx, - struct pipe_sampler_view *view, - const struct pipe_sampler_state *state) +static uint64_t si_create_texture_handle(struct pipe_context *ctx, struct pipe_sampler_view *view, + const struct pipe_sampler_state *state) { - struct si_sampler_view *sview = (struct si_sampler_view *)view; - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture_handle *tex_handle; - struct si_sampler_state *sstate; - uint32_t desc_list[16]; - uint64_t handle; + struct si_sampler_view *sview = (struct si_sampler_view *)view; + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture_handle *tex_handle; + struct si_sampler_state *sstate; + uint32_t desc_list[16]; + uint64_t handle; - tex_handle = CALLOC_STRUCT(si_texture_handle); - if (!tex_handle) - return 0; + tex_handle = CALLOC_STRUCT(si_texture_handle); + if (!tex_handle) + return 0; - memset(desc_list, 0, sizeof(desc_list)); - si_init_descriptor_list(&desc_list[0], 16, 1, null_texture_descriptor); + memset(desc_list, 0, sizeof(desc_list)); + si_init_descriptor_list(&desc_list[0], 16, 1, null_texture_descriptor); - sstate = ctx->create_sampler_state(ctx, state); - if (!sstate) { - FREE(tex_handle); - return 0; - } + sstate = ctx->create_sampler_state(ctx, state); + if (!sstate) { + FREE(tex_handle); + return 0; + } - si_set_sampler_view_desc(sctx, sview, sstate, &desc_list[0]); - memcpy(&tex_handle->sstate, sstate, sizeof(*sstate)); - ctx->delete_sampler_state(ctx, sstate); + si_set_sampler_view_desc(sctx, sview, sstate, &desc_list[0]); + memcpy(&tex_handle->sstate, sstate, sizeof(*sstate)); + ctx->delete_sampler_state(ctx, sstate); - tex_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, - sizeof(desc_list)); - if (!tex_handle->desc_slot) { - FREE(tex_handle); - return 0; - } + tex_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, sizeof(desc_list)); + if (!tex_handle->desc_slot) { + FREE(tex_handle); + return 0; + } - handle = tex_handle->desc_slot; + handle = tex_handle->desc_slot; - if (!_mesa_hash_table_insert(sctx->tex_handles, - (void *)(uintptr_t)handle, - tex_handle)) { - FREE(tex_handle); - return 0; - } + if (!_mesa_hash_table_insert(sctx->tex_handles, (void *)(uintptr_t)handle, tex_handle)) { + FREE(tex_handle); + return 0; + } - pipe_sampler_view_reference(&tex_handle->view, view); + pipe_sampler_view_reference(&tex_handle->view, view); - si_resource(sview->base.texture)->texture_handle_allocated = true; + si_resource(sview->base.texture)->texture_handle_allocated = true; - return handle; + return handle; } static void si_delete_texture_handle(struct pipe_context *ctx, uint64_t handle) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture_handle *tex_handle; - struct hash_entry *entry; - - entry = _mesa_hash_table_search(sctx->tex_handles, - (void *)(uintptr_t)handle); - if (!entry) - return; - - tex_handle = (struct si_texture_handle *)entry->data; - - /* Allow this descriptor slot to be re-used. 
*/ - util_idalloc_free(&sctx->bindless_used_slots, tex_handle->desc_slot); - - pipe_sampler_view_reference(&tex_handle->view, NULL); - _mesa_hash_table_remove(sctx->tex_handles, entry); - FREE(tex_handle); -} - -static void si_make_texture_handle_resident(struct pipe_context *ctx, - uint64_t handle, bool resident) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture_handle *tex_handle; - struct si_sampler_view *sview; - struct hash_entry *entry; - - entry = _mesa_hash_table_search(sctx->tex_handles, - (void *)(uintptr_t)handle); - if (!entry) - return; - - tex_handle = (struct si_texture_handle *)entry->data; - sview = (struct si_sampler_view *)tex_handle->view; - - if (resident) { - if (sview->base.texture->target != PIPE_BUFFER) { - struct si_texture *tex = - (struct si_texture *)sview->base.texture; - - if (depth_needs_decompression(tex)) { - util_dynarray_append( - &sctx->resident_tex_needs_depth_decompress, - struct si_texture_handle *, - tex_handle); - } - - if (color_needs_decompression(tex)) { - util_dynarray_append( - &sctx->resident_tex_needs_color_decompress, - struct si_texture_handle *, - tex_handle); - } - - if (tex->surface.dcc_offset && - p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; - - si_update_bindless_texture_descriptor(sctx, tex_handle); - } else { - si_update_bindless_buffer_descriptor(sctx, - tex_handle->desc_slot, - sview->base.texture, - sview->base.u.buf.offset, - &tex_handle->desc_dirty); - } - - /* Re-upload the descriptor if it has been updated while it - * wasn't resident. - */ - if (tex_handle->desc_dirty) - sctx->bindless_descriptors_dirty = true; - - /* Add the texture handle to the per-context list. */ - util_dynarray_append(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle); - - /* Add the buffers to the current CS in case si_begin_new_cs() - * is not going to be called. - */ - si_sampler_view_add_buffer(sctx, sview->base.texture, - RADEON_USAGE_READ, - sview->is_stencil_sampler, false); - } else { - /* Remove the texture handle from the per-context list. 
*/ - util_dynarray_delete_unordered(&sctx->resident_tex_handles, - struct si_texture_handle *, - tex_handle); - - if (sview->base.texture->target != PIPE_BUFFER) { - util_dynarray_delete_unordered( - &sctx->resident_tex_needs_depth_decompress, - struct si_texture_handle *, tex_handle); - - util_dynarray_delete_unordered( - &sctx->resident_tex_needs_color_decompress, - struct si_texture_handle *, tex_handle); - } - } -} - -static uint64_t si_create_image_handle(struct pipe_context *ctx, - const struct pipe_image_view *view) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_image_handle *img_handle; - uint32_t desc_list[16]; - uint64_t handle; - - if (!view || !view->resource) - return 0; - - img_handle = CALLOC_STRUCT(si_image_handle); - if (!img_handle) - return 0; - - memset(desc_list, 0, sizeof(desc_list)); - si_init_descriptor_list(&desc_list[0], 8, 2, null_image_descriptor); - - si_set_shader_image_desc(sctx, view, false, &desc_list[0], &desc_list[8]); - - img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, - sizeof(desc_list)); - if (!img_handle->desc_slot) { - FREE(img_handle); - return 0; - } - - handle = img_handle->desc_slot; - - if (!_mesa_hash_table_insert(sctx->img_handles, - (void *)(uintptr_t)handle, - img_handle)) { - FREE(img_handle); - return 0; - } - - util_copy_image_view(&img_handle->view, view); - - si_resource(view->resource)->image_handle_allocated = true; - - return handle; + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture_handle *tex_handle; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->tex_handles, (void *)(uintptr_t)handle); + if (!entry) + return; + + tex_handle = (struct si_texture_handle *)entry->data; + + /* Allow this descriptor slot to be re-used. */ + util_idalloc_free(&sctx->bindless_used_slots, tex_handle->desc_slot); + + pipe_sampler_view_reference(&tex_handle->view, NULL); + _mesa_hash_table_remove(sctx->tex_handles, entry); + FREE(tex_handle); +} + +static void si_make_texture_handle_resident(struct pipe_context *ctx, uint64_t handle, + bool resident) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture_handle *tex_handle; + struct si_sampler_view *sview; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->tex_handles, (void *)(uintptr_t)handle); + if (!entry) + return; + + tex_handle = (struct si_texture_handle *)entry->data; + sview = (struct si_sampler_view *)tex_handle->view; + + if (resident) { + if (sview->base.texture->target != PIPE_BUFFER) { + struct si_texture *tex = (struct si_texture *)sview->base.texture; + + if (depth_needs_decompression(tex)) { + util_dynarray_append(&sctx->resident_tex_needs_depth_decompress, + struct si_texture_handle *, tex_handle); + } + + if (color_needs_decompression(tex)) { + util_dynarray_append(&sctx->resident_tex_needs_color_decompress, + struct si_texture_handle *, tex_handle); + } + + if (tex->surface.dcc_offset && p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + + si_update_bindless_texture_descriptor(sctx, tex_handle); + } else { + si_update_bindless_buffer_descriptor(sctx, tex_handle->desc_slot, sview->base.texture, + sview->base.u.buf.offset, &tex_handle->desc_dirty); + } + + /* Re-upload the descriptor if it has been updated while it + * wasn't resident. + */ + if (tex_handle->desc_dirty) + sctx->bindless_descriptors_dirty = true; + + /* Add the texture handle to the per-context list. 
*/ + util_dynarray_append(&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle); + + /* Add the buffers to the current CS in case si_begin_new_cs() + * is not going to be called. + */ + si_sampler_view_add_buffer(sctx, sview->base.texture, RADEON_USAGE_READ, + sview->is_stencil_sampler, false); + } else { + /* Remove the texture handle from the per-context list. */ + util_dynarray_delete_unordered(&sctx->resident_tex_handles, struct si_texture_handle *, + tex_handle); + + if (sview->base.texture->target != PIPE_BUFFER) { + util_dynarray_delete_unordered(&sctx->resident_tex_needs_depth_decompress, + struct si_texture_handle *, tex_handle); + + util_dynarray_delete_unordered(&sctx->resident_tex_needs_color_decompress, + struct si_texture_handle *, tex_handle); + } + } +} + +static uint64_t si_create_image_handle(struct pipe_context *ctx, const struct pipe_image_view *view) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_image_handle *img_handle; + uint32_t desc_list[16]; + uint64_t handle; + + if (!view || !view->resource) + return 0; + + img_handle = CALLOC_STRUCT(si_image_handle); + if (!img_handle) + return 0; + + memset(desc_list, 0, sizeof(desc_list)); + si_init_descriptor_list(&desc_list[0], 8, 2, null_image_descriptor); + + si_set_shader_image_desc(sctx, view, false, &desc_list[0], &desc_list[8]); + + img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, sizeof(desc_list)); + if (!img_handle->desc_slot) { + FREE(img_handle); + return 0; + } + + handle = img_handle->desc_slot; + + if (!_mesa_hash_table_insert(sctx->img_handles, (void *)(uintptr_t)handle, img_handle)) { + FREE(img_handle); + return 0; + } + + util_copy_image_view(&img_handle->view, view); + + si_resource(view->resource)->image_handle_allocated = true; + + return handle; } static void si_delete_image_handle(struct pipe_context *ctx, uint64_t handle) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_image_handle *img_handle; - struct hash_entry *entry; - - entry = _mesa_hash_table_search(sctx->img_handles, - (void *)(uintptr_t)handle); - if (!entry) - return; - - img_handle = (struct si_image_handle *)entry->data; - - util_copy_image_view(&img_handle->view, NULL); - _mesa_hash_table_remove(sctx->img_handles, entry); - FREE(img_handle); -} - -static void si_make_image_handle_resident(struct pipe_context *ctx, - uint64_t handle, unsigned access, - bool resident) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_image_handle *img_handle; - struct pipe_image_view *view; - struct si_resource *res; - struct hash_entry *entry; - - entry = _mesa_hash_table_search(sctx->img_handles, - (void *)(uintptr_t)handle); - if (!entry) - return; - - img_handle = (struct si_image_handle *)entry->data; - view = &img_handle->view; - res = si_resource(view->resource); - - if (resident) { - if (res->b.b.target != PIPE_BUFFER) { - struct si_texture *tex = (struct si_texture *)res; - unsigned level = view->u.tex.level; - - if (color_needs_decompression(tex)) { - util_dynarray_append( - &sctx->resident_img_needs_color_decompress, - struct si_image_handle *, - img_handle); - } - - if (vi_dcc_enabled(tex, level) && - p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; - - si_update_bindless_image_descriptor(sctx, img_handle); - } else { - si_update_bindless_buffer_descriptor(sctx, - img_handle->desc_slot, - view->resource, - view->u.buf.offset, - &img_handle->desc_dirty); - } - - /* Re-upload the descriptor if it has been updated 
while it - * wasn't resident. - */ - if (img_handle->desc_dirty) - sctx->bindless_descriptors_dirty = true; - - /* Add the image handle to the per-context list. */ - util_dynarray_append(&sctx->resident_img_handles, - struct si_image_handle *, img_handle); - - /* Add the buffers to the current CS in case si_begin_new_cs() - * is not going to be called. - */ - si_sampler_view_add_buffer(sctx, view->resource, - (access & PIPE_IMAGE_ACCESS_WRITE) ? - RADEON_USAGE_READWRITE : - RADEON_USAGE_READ, false, false); - } else { - /* Remove the image handle from the per-context list. */ - util_dynarray_delete_unordered(&sctx->resident_img_handles, - struct si_image_handle *, - img_handle); - - if (res->b.b.target != PIPE_BUFFER) { - util_dynarray_delete_unordered( - &sctx->resident_img_needs_color_decompress, - struct si_image_handle *, - img_handle); - } - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_image_handle *img_handle; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->img_handles, (void *)(uintptr_t)handle); + if (!entry) + return; + + img_handle = (struct si_image_handle *)entry->data; + + util_copy_image_view(&img_handle->view, NULL); + _mesa_hash_table_remove(sctx->img_handles, entry); + FREE(img_handle); +} + +static void si_make_image_handle_resident(struct pipe_context *ctx, uint64_t handle, + unsigned access, bool resident) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_image_handle *img_handle; + struct pipe_image_view *view; + struct si_resource *res; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->img_handles, (void *)(uintptr_t)handle); + if (!entry) + return; + + img_handle = (struct si_image_handle *)entry->data; + view = &img_handle->view; + res = si_resource(view->resource); + + if (resident) { + if (res->b.b.target != PIPE_BUFFER) { + struct si_texture *tex = (struct si_texture *)res; + unsigned level = view->u.tex.level; + + if (color_needs_decompression(tex)) { + util_dynarray_append(&sctx->resident_img_needs_color_decompress, + struct si_image_handle *, img_handle); + } + + if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + + si_update_bindless_image_descriptor(sctx, img_handle); + } else { + si_update_bindless_buffer_descriptor(sctx, img_handle->desc_slot, view->resource, + view->u.buf.offset, &img_handle->desc_dirty); + } + + /* Re-upload the descriptor if it has been updated while it + * wasn't resident. + */ + if (img_handle->desc_dirty) + sctx->bindless_descriptors_dirty = true; + + /* Add the image handle to the per-context list. */ + util_dynarray_append(&sctx->resident_img_handles, struct si_image_handle *, img_handle); + + /* Add the buffers to the current CS in case si_begin_new_cs() + * is not going to be called. + */ + si_sampler_view_add_buffer( + sctx, view->resource, + (access & PIPE_IMAGE_ACCESS_WRITE) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, false, + false); + } else { + /* Remove the image handle from the per-context list. 
*/ + util_dynarray_delete_unordered(&sctx->resident_img_handles, struct si_image_handle *, + img_handle); + + if (res->b.b.target != PIPE_BUFFER) { + util_dynarray_delete_unordered(&sctx->resident_img_needs_color_decompress, + struct si_image_handle *, img_handle); + } + } } static void si_resident_buffers_add_all_to_bo_list(struct si_context *sctx) { - unsigned num_resident_tex_handles, num_resident_img_handles; + unsigned num_resident_tex_handles, num_resident_img_handles; - num_resident_tex_handles = sctx->resident_tex_handles.size / - sizeof(struct si_texture_handle *); - num_resident_img_handles = sctx->resident_img_handles.size / - sizeof(struct si_image_handle *); + num_resident_tex_handles = sctx->resident_tex_handles.size / sizeof(struct si_texture_handle *); + num_resident_img_handles = sctx->resident_img_handles.size / sizeof(struct si_image_handle *); - /* Add all resident texture handles. */ - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - struct si_sampler_view *sview = - (struct si_sampler_view *)(*tex_handle)->view; + /* Add all resident texture handles. */ + util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { + struct si_sampler_view *sview = (struct si_sampler_view *)(*tex_handle)->view; - si_sampler_view_add_buffer(sctx, sview->base.texture, - RADEON_USAGE_READ, - sview->is_stencil_sampler, false); - } + si_sampler_view_add_buffer(sctx, sview->base.texture, RADEON_USAGE_READ, + sview->is_stencil_sampler, false); + } - /* Add all resident image handles. */ - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - struct pipe_image_view *view = &(*img_handle)->view; + /* Add all resident image handles. */ + util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) { + struct pipe_image_view *view = &(*img_handle)->view; - si_sampler_view_add_buffer(sctx, view->resource, - RADEON_USAGE_READWRITE, - false, false); - } + si_sampler_view_add_buffer(sctx, view->resource, RADEON_USAGE_READWRITE, false, false); + } - sctx->num_resident_handles += num_resident_tex_handles + - num_resident_img_handles; - assert(sctx->bo_list_add_all_resident_resources); - sctx->bo_list_add_all_resident_resources = false; + sctx->num_resident_handles += num_resident_tex_handles + num_resident_img_handles; + assert(sctx->bo_list_add_all_resident_resources); + sctx->bo_list_add_all_resident_resources = false; } /* INIT/DEINIT/UPLOAD */ void si_init_all_descriptors(struct si_context *sctx) { - int i; - unsigned first_shader = - sctx->has_graphics ? 
0 : PIPE_SHADER_COMPUTE; - - for (i = first_shader; i < SI_NUM_SHADERS; i++) { - bool is_2nd = sctx->chip_class >= GFX9 && - (i == PIPE_SHADER_TESS_CTRL || - i == PIPE_SHADER_GEOMETRY); - unsigned num_sampler_slots = SI_NUM_IMAGE_SLOTS / 2 + SI_NUM_SAMPLERS; - unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS; - int rel_dw_offset; - struct si_descriptors *desc; - - if (is_2nd) { - if (i == PIPE_SHADER_TESS_CTRL) { - rel_dw_offset = (R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS - - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; - } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */ - rel_dw_offset = (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4; - } else { - rel_dw_offset = (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; - } - } else { - rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS; - } - desc = si_const_and_shader_buffer_descriptors(sctx, i); - si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc, - num_buffer_slots, rel_dw_offset, - RADEON_PRIO_SHADER_RW_BUFFER, - RADEON_PRIO_CONST_BUFFER); - desc->slot_index_to_bind_directly = si_get_constbuf_slot(0); - - if (is_2nd) { - if (i == PIPE_SHADER_TESS_CTRL) { - rel_dw_offset = (R_00B40C_SPI_SHADER_USER_DATA_ADDR_HI_HS - - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; - } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */ - rel_dw_offset = (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4; - } else { - rel_dw_offset = (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; - } - } else { - rel_dw_offset = SI_SGPR_SAMPLERS_AND_IMAGES; - } - - desc = si_sampler_and_image_descriptors(sctx, i); - si_init_descriptors(desc, rel_dw_offset, 16, num_sampler_slots); - - int j; - for (j = 0; j < SI_NUM_IMAGE_SLOTS; j++) - memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4); - for (; j < SI_NUM_IMAGE_SLOTS + SI_NUM_SAMPLERS * 2; j++) - memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4); - } - - si_init_buffer_resources(&sctx->rw_buffers, - &sctx->descriptors[SI_DESCS_RW_BUFFERS], - SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, - /* The second priority is used by - * const buffers in RW buffer slots. */ - RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER); - sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS; - - /* Initialize an array of 1024 bindless descriptors, when the limit is - * reached, just make it larger and re-upload the whole array. - */ - si_init_bindless_descriptors(sctx, &sctx->bindless_descriptors, - SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, - 1024); - - sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS); - - /* Set pipe_context functions. */ - sctx->b.bind_sampler_states = si_bind_sampler_states; - sctx->b.set_shader_images = si_set_shader_images; - sctx->b.set_constant_buffer = si_pipe_set_constant_buffer; - sctx->b.set_shader_buffers = si_set_shader_buffers; - sctx->b.set_sampler_views = si_set_sampler_views; - sctx->b.create_texture_handle = si_create_texture_handle; - sctx->b.delete_texture_handle = si_delete_texture_handle; - sctx->b.make_texture_handle_resident = si_make_texture_handle_resident; - sctx->b.create_image_handle = si_create_image_handle; - sctx->b.delete_image_handle = si_delete_image_handle; - sctx->b.make_image_handle_resident = si_make_image_handle_resident; - - if (!sctx->has_graphics) - return; - - sctx->b.set_polygon_stipple = si_set_polygon_stipple; - - /* Shader user data. 
*/ - sctx->atoms.s.shader_pointers.emit = si_emit_graphics_shader_pointers; - - /* Set default and immutable mappings. */ - if (sctx->ngg) { - assert(sctx->chip_class >= GFX10); - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - - if (sctx->chip_class == GFX9) { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, - R_00B430_SPI_SHADER_USER_DATA_LS_0); - si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, - R_00B330_SPI_SHADER_USER_DATA_ES_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, - R_00B430_SPI_SHADER_USER_DATA_HS_0); - si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - } - si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0); + int i; + unsigned first_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE; + + for (i = first_shader; i < SI_NUM_SHADERS; i++) { + bool is_2nd = + sctx->chip_class >= GFX9 && (i == PIPE_SHADER_TESS_CTRL || i == PIPE_SHADER_GEOMETRY); + unsigned num_sampler_slots = SI_NUM_IMAGE_SLOTS / 2 + SI_NUM_SAMPLERS; + unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS; + int rel_dw_offset; + struct si_descriptors *desc; + + if (is_2nd) { + if (i == PIPE_SHADER_TESS_CTRL) { + rel_dw_offset = + (R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; + } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */ + rel_dw_offset = + (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4; + } else { + rel_dw_offset = + (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; + } + } else { + rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS; + } + desc = si_const_and_shader_buffer_descriptors(sctx, i); + si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc, num_buffer_slots, + rel_dw_offset, RADEON_PRIO_SHADER_RW_BUFFER, + RADEON_PRIO_CONST_BUFFER); + desc->slot_index_to_bind_directly = si_get_constbuf_slot(0); + + if (is_2nd) { + if (i == PIPE_SHADER_TESS_CTRL) { + rel_dw_offset = + (R_00B40C_SPI_SHADER_USER_DATA_ADDR_HI_HS - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; + } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */ + rel_dw_offset = + (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4; + } else { + rel_dw_offset = + (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; + } + } else { + rel_dw_offset = SI_SGPR_SAMPLERS_AND_IMAGES; + } + + desc = si_sampler_and_image_descriptors(sctx, i); + si_init_descriptors(desc, rel_dw_offset, 16, num_sampler_slots); + + int j; + for (j = 0; j < SI_NUM_IMAGE_SLOTS; j++) + memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4); + for (; j < SI_NUM_IMAGE_SLOTS + SI_NUM_SAMPLERS * 2; j++) + memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4); + } + + si_init_buffer_resources(&sctx->rw_buffers, &sctx->descriptors[SI_DESCS_RW_BUFFERS], + SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, + /* The second priority is used by + * const buffers in RW buffer slots. */ + RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER); + sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS; + + /* Initialize an array of 1024 bindless descriptors, when the limit is + * reached, just make it larger and re-upload the whole array. 
+ */ + si_init_bindless_descriptors(sctx, &sctx->bindless_descriptors, + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, 1024); + + sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS); + + /* Set pipe_context functions. */ + sctx->b.bind_sampler_states = si_bind_sampler_states; + sctx->b.set_shader_images = si_set_shader_images; + sctx->b.set_constant_buffer = si_pipe_set_constant_buffer; + sctx->b.set_shader_buffers = si_set_shader_buffers; + sctx->b.set_sampler_views = si_set_sampler_views; + sctx->b.create_texture_handle = si_create_texture_handle; + sctx->b.delete_texture_handle = si_delete_texture_handle; + sctx->b.make_texture_handle_resident = si_make_texture_handle_resident; + sctx->b.create_image_handle = si_create_image_handle; + sctx->b.delete_image_handle = si_delete_image_handle; + sctx->b.make_image_handle_resident = si_make_image_handle_resident; + + if (!sctx->has_graphics) + return; + + sctx->b.set_polygon_stipple = si_set_polygon_stipple; + + /* Shader user data. */ + sctx->atoms.s.shader_pointers.emit = si_emit_graphics_shader_pointers; + + /* Set default and immutable mappings. */ + if (sctx->ngg) { + assert(sctx->chip_class >= GFX10); + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + + if (sctx->chip_class == GFX9) { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_LS_0); + si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B330_SPI_SHADER_USER_DATA_ES_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0); + si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0); + } + si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0); } static bool si_upload_shader_descriptors(struct si_context *sctx, unsigned mask) { - unsigned dirty = sctx->descriptors_dirty & mask; + unsigned dirty = sctx->descriptors_dirty & mask; - /* Assume nothing will go wrong: */ - sctx->shader_pointers_dirty |= dirty; + /* Assume nothing will go wrong: */ + sctx->shader_pointers_dirty |= dirty; - while (dirty) { - unsigned i = u_bit_scan(&dirty); + while (dirty) { + unsigned i = u_bit_scan(&dirty); - if (!si_upload_descriptors(sctx, &sctx->descriptors[i])) - return false; - } + if (!si_upload_descriptors(sctx, &sctx->descriptors[i])) + return false; + } - sctx->descriptors_dirty &= ~mask; + sctx->descriptors_dirty &= ~mask; - si_upload_bindless_descriptors(sctx); + si_upload_bindless_descriptors(sctx); - return true; + return true; } bool si_upload_graphics_shader_descriptors(struct si_context *sctx) { - const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE); - return si_upload_shader_descriptors(sctx, mask); + const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE); + return si_upload_shader_descriptors(sctx, mask); } bool si_upload_compute_shader_descriptors(struct si_context *sctx) { - /* Does not update rw_buffers as that is not needed for compute shaders - * and the input buffer is using the same SGPR's anyway. - */ - const unsigned mask = u_bit_consecutive(SI_DESCS_FIRST_COMPUTE, - SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE); - return si_upload_shader_descriptors(sctx, mask); + /* Does not update rw_buffers as that is not needed for compute shaders + * and the input buffer is using the same SGPR's anyway. 
+ */ + const unsigned mask = + u_bit_consecutive(SI_DESCS_FIRST_COMPUTE, SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE); + return si_upload_shader_descriptors(sctx, mask); } void si_release_all_descriptors(struct si_context *sctx) { - int i; + int i; - for (i = 0; i < SI_NUM_SHADERS; i++) { - si_release_buffer_resources(&sctx->const_and_shader_buffers[i], - si_const_and_shader_buffer_descriptors(sctx, i)); - si_release_sampler_views(&sctx->samplers[i]); - si_release_image_views(&sctx->images[i]); - } - si_release_buffer_resources(&sctx->rw_buffers, - &sctx->descriptors[SI_DESCS_RW_BUFFERS]); - for (i = 0; i < SI_NUM_VERTEX_BUFFERS; i++) - pipe_vertex_buffer_unreference(&sctx->vertex_buffer[i]); + for (i = 0; i < SI_NUM_SHADERS; i++) { + si_release_buffer_resources(&sctx->const_and_shader_buffers[i], + si_const_and_shader_buffer_descriptors(sctx, i)); + si_release_sampler_views(&sctx->samplers[i]); + si_release_image_views(&sctx->images[i]); + } + si_release_buffer_resources(&sctx->rw_buffers, &sctx->descriptors[SI_DESCS_RW_BUFFERS]); + for (i = 0; i < SI_NUM_VERTEX_BUFFERS; i++) + pipe_vertex_buffer_unreference(&sctx->vertex_buffer[i]); - for (i = 0; i < SI_NUM_DESCS; ++i) - si_release_descriptors(&sctx->descriptors[i]); + for (i = 0; i < SI_NUM_DESCS; ++i) + si_release_descriptors(&sctx->descriptors[i]); - si_resource_reference(&sctx->vb_descriptors_buffer, NULL); - sctx->vb_descriptors_gpu_list = NULL; /* points into a mapped buffer */ + si_resource_reference(&sctx->vb_descriptors_buffer, NULL); + sctx->vb_descriptors_gpu_list = NULL; /* points into a mapped buffer */ - si_release_bindless_descriptors(sctx); + si_release_bindless_descriptors(sctx); } void si_gfx_resources_add_all_to_bo_list(struct si_context *sctx) { - for (unsigned i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) { - si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[i]); - si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i]); - si_image_views_begin_new_cs(sctx, &sctx->images[i]); - } - si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers); - si_vertex_buffers_begin_new_cs(sctx); + for (unsigned i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) { + si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[i]); + si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i]); + si_image_views_begin_new_cs(sctx, &sctx->images[i]); + } + si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers); + si_vertex_buffers_begin_new_cs(sctx); - if (sctx->bo_list_add_all_resident_resources) - si_resident_buffers_add_all_to_bo_list(sctx); + if (sctx->bo_list_add_all_resident_resources) + si_resident_buffers_add_all_to_bo_list(sctx); - assert(sctx->bo_list_add_all_gfx_resources); - sctx->bo_list_add_all_gfx_resources = false; + assert(sctx->bo_list_add_all_gfx_resources); + sctx->bo_list_add_all_gfx_resources = false; } void si_compute_resources_add_all_to_bo_list(struct si_context *sctx) { - unsigned sh = PIPE_SHADER_COMPUTE; + unsigned sh = PIPE_SHADER_COMPUTE; - si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[sh]); - si_sampler_views_begin_new_cs(sctx, &sctx->samplers[sh]); - si_image_views_begin_new_cs(sctx, &sctx->images[sh]); - si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers); + si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[sh]); + si_sampler_views_begin_new_cs(sctx, &sctx->samplers[sh]); + si_image_views_begin_new_cs(sctx, &sctx->images[sh]); + si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers); - if (sctx->bo_list_add_all_resident_resources) - 
si_resident_buffers_add_all_to_bo_list(sctx); + if (sctx->bo_list_add_all_resident_resources) + si_resident_buffers_add_all_to_bo_list(sctx); - assert(sctx->bo_list_add_all_compute_resources); - sctx->bo_list_add_all_compute_resources = false; + assert(sctx->bo_list_add_all_compute_resources); + sctx->bo_list_add_all_compute_resources = false; } void si_all_descriptors_begin_new_cs(struct si_context *sctx) { - for (unsigned i = 0; i < SI_NUM_DESCS; ++i) - si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]); - si_descriptors_begin_new_cs(sctx, &sctx->bindless_descriptors); + for (unsigned i = 0; i < SI_NUM_DESCS; ++i) + si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]); + si_descriptors_begin_new_cs(sctx, &sctx->bindless_descriptors); - si_shader_pointers_begin_new_cs(sctx); + si_shader_pointers_begin_new_cs(sctx); - sctx->bo_list_add_all_resident_resources = true; - sctx->bo_list_add_all_gfx_resources = true; - sctx->bo_list_add_all_compute_resources = true; + sctx->bo_list_add_all_resident_resources = true; + sctx->bo_list_add_all_gfx_resources = true; + sctx->bo_list_add_all_compute_resources = true; } -void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, - uint64_t new_active_mask) +void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, uint64_t new_active_mask) { - struct si_descriptors *desc = &sctx->descriptors[desc_idx]; + struct si_descriptors *desc = &sctx->descriptors[desc_idx]; - /* Ignore no-op updates and updates that disable all slots. */ - if (!new_active_mask || - new_active_mask == u_bit_consecutive64(desc->first_active_slot, - desc->num_active_slots)) - return; + /* Ignore no-op updates and updates that disable all slots. */ + if (!new_active_mask || + new_active_mask == u_bit_consecutive64(desc->first_active_slot, desc->num_active_slots)) + return; - int first, count; - u_bit_scan_consecutive_range64(&new_active_mask, &first, &count); - assert(new_active_mask == 0); + int first, count; + u_bit_scan_consecutive_range64(&new_active_mask, &first, &count); + assert(new_active_mask == 0); - /* Upload/dump descriptors if slots are being enabled. */ - if (first < desc->first_active_slot || - first + count > desc->first_active_slot + desc->num_active_slots) - sctx->descriptors_dirty |= 1u << desc_idx; + /* Upload/dump descriptors if slots are being enabled. 
*/ + if (first < desc->first_active_slot || + first + count > desc->first_active_slot + desc->num_active_slots) + sctx->descriptors_dirty |= 1u << desc_idx; - desc->first_active_slot = first; - desc->num_active_slots = count; + desc->first_active_slot = first; + desc->num_active_slots = count; } -void si_set_active_descriptors_for_shader(struct si_context *sctx, - struct si_shader_selector *sel) +void si_set_active_descriptors_for_shader(struct si_context *sctx, struct si_shader_selector *sel) { - if (!sel) - return; + if (!sel) + return; - si_set_active_descriptors(sctx, - si_const_and_shader_buffer_descriptors_idx(sel->type), - sel->active_const_and_shader_buffers); - si_set_active_descriptors(sctx, - si_sampler_and_image_descriptors_idx(sel->type), - sel->active_samplers_and_images); + si_set_active_descriptors(sctx, si_const_and_shader_buffer_descriptors_idx(sel->type), + sel->active_const_and_shader_buffers); + si_set_active_descriptors(sctx, si_sampler_and_image_descriptors_idx(sel->type), + sel->active_samplers_and_images); } diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c index c58b2b103be..673c3310a1a 100644 --- a/src/gallium/drivers/radeonsi/si_dma_cs.c +++ b/src/gallium/drivers/radeonsi/si_dma_cs.c @@ -27,304 +27,279 @@ static void si_dma_emit_wait_idle(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; + struct radeon_cmdbuf *cs = sctx->sdma_cs; - /* NOP waits for idle. */ - if (sctx->chip_class >= GFX7) - radeon_emit(cs, 0x00000000); /* NOP */ - else - radeon_emit(cs, 0xf0000000); /* NOP */ + /* NOP waits for idle. */ + if (sctx->chip_class >= GFX7) + radeon_emit(cs, 0x00000000); /* NOP */ + else + radeon_emit(cs, 0xf0000000); /* NOP */ } -void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, - uint64_t offset) +void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - uint64_t va = dst->gpu_address + offset; + struct radeon_cmdbuf *cs = sctx->sdma_cs; + uint64_t va = dst->gpu_address + offset; - if (sctx->chip_class == GFX6) { - unreachable("SI DMA doesn't support the timestamp packet."); - return; - } + if (sctx->chip_class == GFX6) { + unreachable("SI DMA doesn't support the timestamp packet."); + return; + } - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8); + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. 
*/ + util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8); - assert(va % 8 == 0); + assert(va % 8 == 0); - si_need_dma_space(sctx, 4, dst, NULL); - si_dma_emit_wait_idle(sctx); + si_need_dma_space(sctx, 4, dst, NULL); + si_dma_emit_wait_idle(sctx); - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, - SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, - 0)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit( + cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); } -void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned clear_value) +void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, + uint64_t size, unsigned clear_value) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - unsigned i, ncopy, csize; - struct si_resource *sdst = si_resource(dst); - - assert(offset % 4 == 0); - assert(size); - assert(size % 4 == 0); - - if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || - sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS)) { - sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4); - return; - } - - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size); - - offset += sdst->gpu_address; - - if (sctx->chip_class == GFX6) { - /* the same maximum size as for copying */ - ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); - si_need_dma_space(sctx, ncopy * 4, sdst, NULL); - - for (i = 0; i < ncopy; i++) { - csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); - radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, - csize / 4)); - radeon_emit(cs, offset); - radeon_emit(cs, clear_value); - radeon_emit(cs, (offset >> 32) << 16); - offset += csize; - size -= csize; - } - return; - } - - /* The following code is for Sea Islands and later. */ - /* the same maximum size as for copying */ - ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE); - si_need_dma_space(sctx, ncopy * 5, sdst, NULL); - - for (i = 0; i < ncopy; i++) { - csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE); - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, - 0x8000 /* dword copy */)); - radeon_emit(cs, offset); - radeon_emit(cs, offset >> 32); - radeon_emit(cs, clear_value); - /* dw count */ - radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc); - offset += csize; - size -= csize; - } + struct radeon_cmdbuf *cs = sctx->sdma_cs; + unsigned i, ncopy, csize; + struct si_resource *sdst = si_resource(dst); + + assert(offset % 4 == 0); + assert(size); + assert(size % 4 == 0); + + if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || + sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS)) { + sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4); + return; + } + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. 
*/ + util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size); + + offset += sdst->gpu_address; + + if (sctx->chip_class == GFX6) { + /* the same maximum size as for copying */ + ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); + si_need_dma_space(sctx, ncopy * 4, sdst, NULL); + + for (i = 0; i < ncopy; i++) { + csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); + radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, csize / 4)); + radeon_emit(cs, offset); + radeon_emit(cs, clear_value); + radeon_emit(cs, (offset >> 32) << 16); + offset += csize; + size -= csize; + } + return; + } + + /* The following code is for Sea Islands and later. */ + /* the same maximum size as for copying */ + ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE); + si_need_dma_space(sctx, ncopy * 5, sdst, NULL); + + for (i = 0; i < ncopy; i++) { + csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE); + radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, 0x8000 /* dword copy */)); + radeon_emit(cs, offset); + radeon_emit(cs, offset >> 32); + radeon_emit(cs, clear_value); + /* dw count */ + radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc); + offset += csize; + size -= csize; + } } void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, - struct pipe_resource *src, uint64_t dst_offset, - uint64_t src_offset, uint64_t size) + struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, + uint64_t size) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - unsigned i, ncopy, csize; - struct si_resource *sdst = si_resource(dst); - struct si_resource *ssrc = si_resource(src); - - if (!cs || - dst->flags & PIPE_RESOURCE_FLAG_SPARSE || - src->flags & PIPE_RESOURCE_FLAG_SPARSE) { - si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size); - return; - } - - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(dst, &sdst->valid_buffer_range, dst_offset, - dst_offset + size); - - dst_offset += sdst->gpu_address; - src_offset += ssrc->gpu_address; - - if (sctx->chip_class == GFX6) { - unsigned max_size, sub_cmd, shift; - - /* see whether we should use the dword-aligned or byte-aligned copy */ - if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) { - sub_cmd = SI_DMA_COPY_DWORD_ALIGNED; - shift = 2; - max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE; - } else { - sub_cmd = SI_DMA_COPY_BYTE_ALIGNED; - shift = 0; - max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE; - } - - ncopy = DIV_ROUND_UP(size, max_size); - si_need_dma_space(sctx, ncopy * 5, sdst, ssrc); - - for (i = 0; i < ncopy; i++) { - csize = MIN2(size, max_size); - radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, - csize >> shift)); - radeon_emit(cs, dst_offset); - radeon_emit(cs, src_offset); - radeon_emit(cs, (dst_offset >> 32UL) & 0xff); - radeon_emit(cs, (src_offset >> 32UL) & 0xff); - dst_offset += csize; - src_offset += csize; - size -= csize; - } - return; - } - - /* The following code is for CI and later. */ - unsigned align = ~0u; - ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE); - - /* Align copy size to dw if src/dst address are dw aligned */ - if ((src_offset & 0x3) == 0 && - (dst_offset & 0x3) == 0 && - size > 4 && - (size & 3) != 0) { - align = ~0x3u; - ncopy++; - } - - si_need_dma_space(sctx, ncopy * 7, sdst, ssrc); - - for (i = 0; i < ncopy; i++) { - csize = size >= 4 ? 
MIN2(size & align, CIK_SDMA_COPY_MAX_SIZE) : size; - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, - CIK_SDMA_COPY_SUB_OPCODE_LINEAR, - 0)); - radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize); - radeon_emit(cs, 0); /* src/dst endian swap */ - radeon_emit(cs, src_offset); - radeon_emit(cs, src_offset >> 32); - radeon_emit(cs, dst_offset); - radeon_emit(cs, dst_offset >> 32); - dst_offset += csize; - src_offset += csize; - size -= csize; - } + struct radeon_cmdbuf *cs = sctx->sdma_cs; + unsigned i, ncopy, csize; + struct si_resource *sdst = si_resource(dst); + struct si_resource *ssrc = si_resource(src); + + if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || src->flags & PIPE_RESOURCE_FLAG_SPARSE) { + si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size); + return; + } + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(dst, &sdst->valid_buffer_range, dst_offset, dst_offset + size); + + dst_offset += sdst->gpu_address; + src_offset += ssrc->gpu_address; + + if (sctx->chip_class == GFX6) { + unsigned max_size, sub_cmd, shift; + + /* see whether we should use the dword-aligned or byte-aligned copy */ + if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) { + sub_cmd = SI_DMA_COPY_DWORD_ALIGNED; + shift = 2; + max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE; + } else { + sub_cmd = SI_DMA_COPY_BYTE_ALIGNED; + shift = 0; + max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE; + } + + ncopy = DIV_ROUND_UP(size, max_size); + si_need_dma_space(sctx, ncopy * 5, sdst, ssrc); + + for (i = 0; i < ncopy; i++) { + csize = MIN2(size, max_size); + radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize >> shift)); + radeon_emit(cs, dst_offset); + radeon_emit(cs, src_offset); + radeon_emit(cs, (dst_offset >> 32UL) & 0xff); + radeon_emit(cs, (src_offset >> 32UL) & 0xff); + dst_offset += csize; + src_offset += csize; + size -= csize; + } + return; + } + + /* The following code is for CI and later. */ + unsigned align = ~0u; + ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE); + + /* Align copy size to dw if src/dst address are dw aligned */ + if ((src_offset & 0x3) == 0 && (dst_offset & 0x3) == 0 && size > 4 && (size & 3) != 0) { + align = ~0x3u; + ncopy++; + } + + si_need_dma_space(sctx, ncopy * 7, sdst, ssrc); + + for (i = 0; i < ncopy; i++) { + csize = size >= 4 ? MIN2(size & align, CIK_SDMA_COPY_MAX_SIZE) : size; + radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR, 0)); + radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize); + radeon_emit(cs, 0); /* src/dst endian swap */ + radeon_emit(cs, src_offset); + radeon_emit(cs, src_offset >> 32); + radeon_emit(cs, dst_offset); + radeon_emit(cs, dst_offset >> 32); + dst_offset += csize; + src_offset += csize; + size -= csize; + } } -void si_need_dma_space(struct si_context *ctx, unsigned num_dw, - struct si_resource *dst, struct si_resource *src) +void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst, + struct si_resource *src) { - struct radeon_winsys *ws = ctx->ws; - uint64_t vram = ctx->sdma_cs->used_vram; - uint64_t gtt = ctx->sdma_cs->used_gart; - - if (dst) { - vram += dst->vram_usage; - gtt += dst->gart_usage; - } - if (src) { - vram += src->vram_usage; - gtt += src->gart_usage; - } - - /* Flush the GFX IB if DMA depends on it. 
*/ - if (!ctx->sdma_uploads_in_progress && - radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) && - ((dst && - ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf, - RADEON_USAGE_READWRITE)) || - (src && - ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf, - RADEON_USAGE_WRITE)))) - si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - - /* Flush if there's not enough space, or if the memory usage per IB - * is too large. - * - * IBs using too little memory are limited by the IB submission overhead. - * IBs using too much memory are limited by the kernel/TTM overhead. - * Too long IBs create CPU-GPU pipeline bubbles and add latency. - * - * This heuristic makes sure that DMA requests are executed - * very soon after the call is made and lowers memory usage. - * It improves texture upload performance by keeping the DMA - * engine busy while uploads are being submitted. - */ - num_dw++; /* for emit_wait_idle below */ - if (!ctx->sdma_uploads_in_progress && - (!ws->cs_check_space(ctx->sdma_cs, num_dw, false) || - ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 || - !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) { - si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL); - assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw); - } - - /* Wait for idle if either buffer has been used in the IB before to - * prevent read-after-write hazards. - */ - if ((dst && - ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf, - RADEON_USAGE_READWRITE)) || - (src && - ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf, - RADEON_USAGE_WRITE))) - si_dma_emit_wait_idle(ctx); - - unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED; - if (dst) { - ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, - dst->domains, 0); - } - if (src) { - ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, - src->domains, 0); - } - - /* this function is called before all DMA calls, so increment this. */ - ctx->num_dma_calls++; + struct radeon_winsys *ws = ctx->ws; + uint64_t vram = ctx->sdma_cs->used_vram; + uint64_t gtt = ctx->sdma_cs->used_gart; + + if (dst) { + vram += dst->vram_usage; + gtt += dst->gart_usage; + } + if (src) { + vram += src->vram_usage; + gtt += src->gart_usage; + } + + /* Flush the GFX IB if DMA depends on it. */ + if (!ctx->sdma_uploads_in_progress && radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) && + ((dst && ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf, RADEON_USAGE_READWRITE)) || + (src && ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf, RADEON_USAGE_WRITE)))) + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + + /* Flush if there's not enough space, or if the memory usage per IB + * is too large. + * + * IBs using too little memory are limited by the IB submission overhead. + * IBs using too much memory are limited by the kernel/TTM overhead. + * Too long IBs create CPU-GPU pipeline bubbles and add latency. + * + * This heuristic makes sure that DMA requests are executed + * very soon after the call is made and lowers memory usage. + * It improves texture upload performance by keeping the DMA + * engine busy while uploads are being submitted. 
+ */ + num_dw++; /* for emit_wait_idle below */ + if (!ctx->sdma_uploads_in_progress && + (!ws->cs_check_space(ctx->sdma_cs, num_dw, false) || + ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 || + !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) { + si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL); + assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw); + } + + /* Wait for idle if either buffer has been used in the IB before to + * prevent read-after-write hazards. + */ + if ((dst && ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf, RADEON_USAGE_READWRITE)) || + (src && ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf, RADEON_USAGE_WRITE))) + si_dma_emit_wait_idle(ctx); + + unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED; + if (dst) { + ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, dst->domains, 0); + } + if (src) { + ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, src->domains, 0); + } + + /* this function is called before all DMA calls, so increment this. */ + ctx->num_dma_calls++; } -void si_flush_dma_cs(struct si_context *ctx, unsigned flags, - struct pipe_fence_handle **fence) +void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence) { - struct radeon_cmdbuf *cs = ctx->sdma_cs; - struct radeon_saved_cs saved; - bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0; - - if (!radeon_emitted(cs, 0)) { - if (fence) - ctx->ws->fence_reference(fence, ctx->last_sdma_fence); - return; - } - - if (check_vm) - si_save_cs(ctx->ws, cs, &saved, true); - - ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence); - if (fence) - ctx->ws->fence_reference(fence, ctx->last_sdma_fence); - - if (check_vm) { - /* Use conservative timeout 800ms, after which we won't wait any - * longer and assume the GPU is hung. - */ - ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800*1000*1000); - - si_check_vm_faults(ctx, &saved, RING_DMA); - si_clear_saved_cs(&saved); - } + struct radeon_cmdbuf *cs = ctx->sdma_cs; + struct radeon_saved_cs saved; + bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0; + + if (!radeon_emitted(cs, 0)) { + if (fence) + ctx->ws->fence_reference(fence, ctx->last_sdma_fence); + return; + } + + if (check_vm) + si_save_cs(ctx->ws, cs, &saved, true); + + ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence); + if (fence) + ctx->ws->fence_reference(fence, ctx->last_sdma_fence); + + if (check_vm) { + /* Use conservative timeout 800ms, after which we won't wait any + * longer and assume the GPU is hung. 
+ */ + ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800 * 1000 * 1000); + + si_check_vm_faults(ctx, &saved, RING_DMA); + si_clear_saved_cs(&saved); + } } -void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned value) +void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset, + uint64_t size, unsigned value) { - struct si_context *ctx = (struct si_context*)sscreen->aux_context; + struct si_context *ctx = (struct si_context *)sscreen->aux_context; - simple_mtx_lock(&sscreen->aux_context_lock); - si_sdma_clear_buffer(ctx, dst, offset, size, value); - sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); - simple_mtx_unlock(&sscreen->aux_context_lock); + simple_mtx_lock(&sscreen->aux_context_lock); + si_sdma_clear_buffer(ctx, dst, offset, size, value); + sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); + simple_mtx_unlock(&sscreen->aux_context_lock); } diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c index 26b5fc4bdba..91d1bed505d 100644 --- a/src/gallium/drivers/radeonsi/si_fence.c +++ b/src/gallium/drivers/radeonsi/si_fence.c @@ -23,34 +23,33 @@ * */ -#include - +#include "si_build_pm4.h" #include "util/os_time.h" #include "util/u_memory.h" #include "util/u_queue.h" #include "util/u_upload_mgr.h" -#include "si_build_pm4.h" +#include struct si_fine_fence { - struct si_resource *buf; - unsigned offset; + struct si_resource *buf; + unsigned offset; }; struct si_multi_fence { - struct pipe_reference reference; - struct pipe_fence_handle *gfx; - struct pipe_fence_handle *sdma; - struct tc_unflushed_batch_token *tc_token; - struct util_queue_fence ready; - - /* If the context wasn't flushed at fence creation, this is non-NULL. */ - struct { - struct si_context *ctx; - unsigned ib_index; - } gfx_unflushed; - - struct si_fine_fence fine; + struct pipe_reference reference; + struct pipe_fence_handle *gfx; + struct pipe_fence_handle *sdma; + struct tc_unflushed_batch_token *tc_token; + struct util_queue_fence ready; + + /* If the context wasn't flushed at fence creation, this is non-NULL. */ + struct { + struct si_context *ctx; + unsigned ib_index; + } gfx_unflushed; + + struct si_fine_fence fine; }; /** @@ -66,591 +65,554 @@ struct si_multi_fence { * \param old_value Previous fence value (for a bug workaround) * \param new_value Fence value to write for this event. */ -void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, - unsigned event, unsigned event_flags, - unsigned dst_sel, unsigned int_sel, unsigned data_sel, - struct si_resource *buf, uint64_t va, - uint32_t new_fence, unsigned query_type) +void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event, + unsigned event_flags, unsigned dst_sel, unsigned int_sel, unsigned data_sel, + struct si_resource *buf, uint64_t va, uint32_t new_fence, + unsigned query_type) { - unsigned op = EVENT_TYPE(event) | - EVENT_INDEX(event == V_028A90_CS_DONE || - event == V_028A90_PS_DONE ? 6 : 5) | - event_flags; - unsigned sel = EOP_DST_SEL(dst_sel) | - EOP_INT_SEL(int_sel) | - EOP_DATA_SEL(data_sel); - bool compute_ib = !ctx->has_graphics || - cs == ctx->prim_discard_compute_cs; - - if (ctx->chip_class >= GFX9 || - (compute_ib && ctx->chip_class >= GFX7)) { - /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion - * counters) must immediately precede every timestamp event to - * prevent a GPU hang on GFX9. 
- * - * Occlusion queries don't need to do it here, because they - * always do ZPASS_DONE before the timestamp. - */ - if (ctx->chip_class == GFX9 && !compute_ib && - query_type != PIPE_QUERY_OCCLUSION_COUNTER && - query_type != PIPE_QUERY_OCCLUSION_PREDICATE && - query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { - struct si_resource *scratch = ctx->eop_bug_scratch; - - assert(16 * ctx->screen->info.num_render_backends <= - scratch->b.b.width0); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); - radeon_emit(cs, scratch->gpu_address); - radeon_emit(cs, scratch->gpu_address >> 32); - - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch, - RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); - } - - radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 6 : 5, 0)); - radeon_emit(cs, op); - radeon_emit(cs, sel); - radeon_emit(cs, va); /* address lo */ - radeon_emit(cs, va >> 32); /* address hi */ - radeon_emit(cs, new_fence); /* immediate data lo */ - radeon_emit(cs, 0); /* immediate data hi */ - if (ctx->chip_class >= GFX9) - radeon_emit(cs, 0); /* unused */ - } else { - if (ctx->chip_class == GFX7 || - ctx->chip_class == GFX8) { - struct si_resource *scratch = ctx->eop_bug_scratch; - uint64_t va = scratch->gpu_address; - - /* Two EOP events are required to make all engines go idle - * (and optional cache flushes executed) before the timestamp - * is written. - */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, op); - radeon_emit(cs, va); - radeon_emit(cs, ((va >> 32) & 0xffff) | sel); - radeon_emit(cs, 0); /* immediate data */ - radeon_emit(cs, 0); /* unused */ - - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch, - RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); - } - - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, op); - radeon_emit(cs, va); - radeon_emit(cs, ((va >> 32) & 0xffff) | sel); - radeon_emit(cs, new_fence); /* immediate data */ - radeon_emit(cs, 0); /* unused */ - } - - if (buf) { - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_WRITE, - RADEON_PRIO_QUERY); - } + unsigned op = EVENT_TYPE(event) | + EVENT_INDEX(event == V_028A90_CS_DONE || event == V_028A90_PS_DONE ? 6 : 5) | + event_flags; + unsigned sel = EOP_DST_SEL(dst_sel) | EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel); + bool compute_ib = !ctx->has_graphics || cs == ctx->prim_discard_compute_cs; + + if (ctx->chip_class >= GFX9 || (compute_ib && ctx->chip_class >= GFX7)) { + /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion + * counters) must immediately precede every timestamp event to + * prevent a GPU hang on GFX9. + * + * Occlusion queries don't need to do it here, because they + * always do ZPASS_DONE before the timestamp. + */ + if (ctx->chip_class == GFX9 && !compute_ib && query_type != PIPE_QUERY_OCCLUSION_COUNTER && + query_type != PIPE_QUERY_OCCLUSION_PREDICATE && + query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { + struct si_resource *scratch = ctx->eop_bug_scratch; + + assert(16 * ctx->screen->info.num_render_backends <= scratch->b.b.width0); + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(cs, scratch->gpu_address); + radeon_emit(cs, scratch->gpu_address >> 32); + + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); + } + + radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 
6 : 5, 0)); + radeon_emit(cs, op); + radeon_emit(cs, sel); + radeon_emit(cs, va); /* address lo */ + radeon_emit(cs, va >> 32); /* address hi */ + radeon_emit(cs, new_fence); /* immediate data lo */ + radeon_emit(cs, 0); /* immediate data hi */ + if (ctx->chip_class >= GFX9) + radeon_emit(cs, 0); /* unused */ + } else { + if (ctx->chip_class == GFX7 || ctx->chip_class == GFX8) { + struct si_resource *scratch = ctx->eop_bug_scratch; + uint64_t va = scratch->gpu_address; + + /* Two EOP events are required to make all engines go idle + * (and optional cache flushes executed) before the timestamp + * is written. + */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(cs, op); + radeon_emit(cs, va); + radeon_emit(cs, ((va >> 32) & 0xffff) | sel); + radeon_emit(cs, 0); /* immediate data */ + radeon_emit(cs, 0); /* unused */ + + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); + } + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(cs, op); + radeon_emit(cs, va); + radeon_emit(cs, ((va >> 32) & 0xffff) | sel); + radeon_emit(cs, new_fence); /* immediate data */ + radeon_emit(cs, 0); /* unused */ + } + + if (buf) { + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); + } } unsigned si_cp_write_fence_dwords(struct si_screen *screen) { - unsigned dwords = 6; + unsigned dwords = 6; - if (screen->info.chip_class == GFX7 || - screen->info.chip_class == GFX8) - dwords *= 2; + if (screen->info.chip_class == GFX7 || screen->info.chip_class == GFX8) + dwords *= 2; - return dwords; + return dwords; } -void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, - uint64_t va, uint32_t ref, uint32_t mask, unsigned flags) +void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, uint64_t va, uint32_t ref, + uint32_t mask, unsigned flags) { - radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(cs, WAIT_REG_MEM_MEM_SPACE(1) | flags); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, ref); /* reference value */ - radeon_emit(cs, mask); /* mask */ - radeon_emit(cs, 4); /* poll interval */ + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(cs, WAIT_REG_MEM_MEM_SPACE(1) | flags); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, ref); /* reference value */ + radeon_emit(cs, mask); /* mask */ + radeon_emit(cs, 4); /* poll interval */ } -static void si_add_fence_dependency(struct si_context *sctx, - struct pipe_fence_handle *fence) +static void si_add_fence_dependency(struct si_context *sctx, struct pipe_fence_handle *fence) { - struct radeon_winsys *ws = sctx->ws; + struct radeon_winsys *ws = sctx->ws; - if (sctx->sdma_cs) - ws->cs_add_fence_dependency(sctx->sdma_cs, fence, 0); - ws->cs_add_fence_dependency(sctx->gfx_cs, fence, 0); + if (sctx->sdma_cs) + ws->cs_add_fence_dependency(sctx->sdma_cs, fence, 0); + ws->cs_add_fence_dependency(sctx->gfx_cs, fence, 0); } -static void si_add_syncobj_signal(struct si_context *sctx, - struct pipe_fence_handle *fence) +static void si_add_syncobj_signal(struct si_context *sctx, struct pipe_fence_handle *fence) { - sctx->ws->cs_add_syncobj_signal(sctx->gfx_cs, fence); + sctx->ws->cs_add_syncobj_signal(sctx->gfx_cs, fence); } -static void si_fence_reference(struct pipe_screen *screen, - struct pipe_fence_handle **dst, - struct pipe_fence_handle *src) +static void si_fence_reference(struct pipe_screen *screen, struct pipe_fence_handle **dst, + struct pipe_fence_handle *src) { - struct 
radeon_winsys *ws = ((struct si_screen*)screen)->ws; - struct si_multi_fence **sdst = (struct si_multi_fence **)dst; - struct si_multi_fence *ssrc = (struct si_multi_fence *)src; - - if (pipe_reference(&(*sdst)->reference, &ssrc->reference)) { - ws->fence_reference(&(*sdst)->gfx, NULL); - ws->fence_reference(&(*sdst)->sdma, NULL); - tc_unflushed_batch_token_reference(&(*sdst)->tc_token, NULL); - si_resource_reference(&(*sdst)->fine.buf, NULL); - FREE(*sdst); - } - *sdst = ssrc; + struct radeon_winsys *ws = ((struct si_screen *)screen)->ws; + struct si_multi_fence **sdst = (struct si_multi_fence **)dst; + struct si_multi_fence *ssrc = (struct si_multi_fence *)src; + + if (pipe_reference(&(*sdst)->reference, &ssrc->reference)) { + ws->fence_reference(&(*sdst)->gfx, NULL); + ws->fence_reference(&(*sdst)->sdma, NULL); + tc_unflushed_batch_token_reference(&(*sdst)->tc_token, NULL); + si_resource_reference(&(*sdst)->fine.buf, NULL); + FREE(*sdst); + } + *sdst = ssrc; } static struct si_multi_fence *si_create_multi_fence() { - struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence); - if (!fence) - return NULL; + struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence); + if (!fence) + return NULL; - pipe_reference_init(&fence->reference, 1); - util_queue_fence_init(&fence->ready); + pipe_reference_init(&fence->reference, 1); + util_queue_fence_init(&fence->ready); - return fence; + return fence; } struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx, - struct tc_unflushed_batch_token *tc_token) + struct tc_unflushed_batch_token *tc_token) { - struct si_multi_fence *fence = si_create_multi_fence(); - if (!fence) - return NULL; + struct si_multi_fence *fence = si_create_multi_fence(); + if (!fence) + return NULL; - util_queue_fence_reset(&fence->ready); - tc_unflushed_batch_token_reference(&fence->tc_token, tc_token); + util_queue_fence_reset(&fence->ready); + tc_unflushed_batch_token_reference(&fence->tc_token, tc_token); - return (struct pipe_fence_handle *)fence; + return (struct pipe_fence_handle *)fence; } -static bool si_fine_fence_signaled(struct radeon_winsys *rws, - const struct si_fine_fence *fine) +static bool si_fine_fence_signaled(struct radeon_winsys *rws, const struct si_fine_fence *fine) { - char *map = rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ | - PIPE_TRANSFER_UNSYNCHRONIZED); - if (!map) - return false; + char *map = + rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ | PIPE_TRANSFER_UNSYNCHRONIZED); + if (!map) + return false; - uint32_t *fence = (uint32_t*)(map + fine->offset); - return *fence != 0; + uint32_t *fence = (uint32_t *)(map + fine->offset); + return *fence != 0; } -static void si_fine_fence_set(struct si_context *ctx, - struct si_fine_fence *fine, - unsigned flags) +static void si_fine_fence_set(struct si_context *ctx, struct si_fine_fence *fine, unsigned flags) { - uint32_t *fence_ptr; - - assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1); - - /* Use cached system memory for the fence. 
*/ - u_upload_alloc(ctx->cached_gtt_allocator, 0, 4, 4, - &fine->offset, (struct pipe_resource **)&fine->buf, (void **)&fence_ptr); - if (!fine->buf) - return; - - *fence_ptr = 0; - - if (flags & PIPE_FLUSH_TOP_OF_PIPE) { - uint32_t value = 0x80000000; - - si_cp_write_data(ctx, fine->buf, fine->offset, 4, - V_370_MEM, V_370_PFP, &value); - } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) { - uint64_t fence_va = fine->buf->gpu_address + fine->offset; - - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, fine->buf, - RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); - si_cp_release_mem(ctx, ctx->gfx_cs, - V_028A90_BOTTOM_OF_PIPE_TS, 0, - EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, - NULL, fence_va, 0x80000000, - PIPE_QUERY_GPU_FINISHED); - } else { - assert(false); - } + uint32_t *fence_ptr; + + assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1); + + /* Use cached system memory for the fence. */ + u_upload_alloc(ctx->cached_gtt_allocator, 0, 4, 4, &fine->offset, + (struct pipe_resource **)&fine->buf, (void **)&fence_ptr); + if (!fine->buf) + return; + + *fence_ptr = 0; + + if (flags & PIPE_FLUSH_TOP_OF_PIPE) { + uint32_t value = 0x80000000; + + si_cp_write_data(ctx, fine->buf, fine->offset, 4, V_370_MEM, V_370_PFP, &value); + } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) { + uint64_t fence_va = fine->buf->gpu_address + fine->offset; + + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, fine->buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); + si_cp_release_mem(ctx, ctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, + EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, NULL, fence_va, 0x80000000, + PIPE_QUERY_GPU_FINISHED); + } else { + assert(false); + } } -static bool si_fence_finish(struct pipe_screen *screen, - struct pipe_context *ctx, - struct pipe_fence_handle *fence, - uint64_t timeout) +static bool si_fence_finish(struct pipe_screen *screen, struct pipe_context *ctx, + struct pipe_fence_handle *fence, uint64_t timeout) { - struct radeon_winsys *rws = ((struct si_screen*)screen)->ws; - struct si_multi_fence *sfence = (struct si_multi_fence *)fence; - struct si_context *sctx; - int64_t abs_timeout = os_time_get_absolute_timeout(timeout); - - ctx = threaded_context_unwrap_sync(ctx); - sctx = (struct si_context*)(ctx ? ctx : NULL); - - if (!util_queue_fence_is_signalled(&sfence->ready)) { - if (sfence->tc_token) { - /* Ensure that si_flush_from_st will be called for - * this fence, but only if we're in the API thread - * where the context is current. - * - * Note that the batch containing the flush may already - * be in flight in the driver thread, so the fence - * may not be ready yet when this call returns. - */ - threaded_context_flush(ctx, sfence->tc_token, - timeout == 0); - } - - if (!timeout) - return false; - - if (timeout == PIPE_TIMEOUT_INFINITE) { - util_queue_fence_wait(&sfence->ready); - } else { - if (!util_queue_fence_wait_timeout(&sfence->ready, abs_timeout)) - return false; - } - - if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { - int64_t time = os_time_get_nano(); - timeout = abs_timeout > time ? abs_timeout - time : 0; - } - } - - if (sfence->sdma) { - if (!rws->fence_wait(rws, sfence->sdma, timeout)) - return false; - - /* Recompute the timeout after waiting. */ - if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { - int64_t time = os_time_get_nano(); - timeout = abs_timeout > time ? 
abs_timeout - time : 0; - } - } - - if (!sfence->gfx) - return true; - - if (sfence->fine.buf && - si_fine_fence_signaled(rws, &sfence->fine)) { - rws->fence_reference(&sfence->gfx, NULL); - si_resource_reference(&sfence->fine.buf, NULL); - return true; - } - - /* Flush the gfx IB if it hasn't been flushed yet. */ - if (sctx && sfence->gfx_unflushed.ctx == sctx && - sfence->gfx_unflushed.ib_index == sctx->num_gfx_cs_flushes) { - /* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile) - * spec says: - * - * "If the sync object being blocked upon will not be - * signaled in finite time (for example, by an associated - * fence command issued previously, but not yet flushed to - * the graphics pipeline), then ClientWaitSync may hang - * forever. To help prevent this behavior, if - * ClientWaitSync is called and all of the following are - * true: - * - * * the SYNC_FLUSH_COMMANDS_BIT bit is set in flags, - * * sync is unsignaled when ClientWaitSync is called, - * * and the calls to ClientWaitSync and FenceSync were - * issued from the same context, - * - * then the GL will behave as if the equivalent of Flush - * were inserted immediately after the creation of sync." - * - * This means we need to flush for such fences even when we're - * not going to wait. - */ - si_flush_gfx_cs(sctx, - (timeout ? 0 : PIPE_FLUSH_ASYNC) | - RADEON_FLUSH_START_NEXT_GFX_IB_NOW, - NULL); - sfence->gfx_unflushed.ctx = NULL; - - if (!timeout) - return false; - - /* Recompute the timeout after all that. */ - if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { - int64_t time = os_time_get_nano(); - timeout = abs_timeout > time ? abs_timeout - time : 0; - } - } - - if (rws->fence_wait(rws, sfence->gfx, timeout)) - return true; - - /* Re-check in case the GPU is slow or hangs, but the commands before - * the fine-grained fence have completed. */ - if (sfence->fine.buf && - si_fine_fence_signaled(rws, &sfence->fine)) - return true; - - return false; + struct radeon_winsys *rws = ((struct si_screen *)screen)->ws; + struct si_multi_fence *sfence = (struct si_multi_fence *)fence; + struct si_context *sctx; + int64_t abs_timeout = os_time_get_absolute_timeout(timeout); + + ctx = threaded_context_unwrap_sync(ctx); + sctx = (struct si_context *)(ctx ? ctx : NULL); + + if (!util_queue_fence_is_signalled(&sfence->ready)) { + if (sfence->tc_token) { + /* Ensure that si_flush_from_st will be called for + * this fence, but only if we're in the API thread + * where the context is current. + * + * Note that the batch containing the flush may already + * be in flight in the driver thread, so the fence + * may not be ready yet when this call returns. + */ + threaded_context_flush(ctx, sfence->tc_token, timeout == 0); + } + + if (!timeout) + return false; + + if (timeout == PIPE_TIMEOUT_INFINITE) { + util_queue_fence_wait(&sfence->ready); + } else { + if (!util_queue_fence_wait_timeout(&sfence->ready, abs_timeout)) + return false; + } + + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? abs_timeout - time : 0; + } + } + + if (sfence->sdma) { + if (!rws->fence_wait(rws, sfence->sdma, timeout)) + return false; + + /* Recompute the timeout after waiting. */ + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? 
abs_timeout - time : 0; + } + } + + if (!sfence->gfx) + return true; + + if (sfence->fine.buf && si_fine_fence_signaled(rws, &sfence->fine)) { + rws->fence_reference(&sfence->gfx, NULL); + si_resource_reference(&sfence->fine.buf, NULL); + return true; + } + + /* Flush the gfx IB if it hasn't been flushed yet. */ + if (sctx && sfence->gfx_unflushed.ctx == sctx && + sfence->gfx_unflushed.ib_index == sctx->num_gfx_cs_flushes) { + /* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile) + * spec says: + * + * "If the sync object being blocked upon will not be + * signaled in finite time (for example, by an associated + * fence command issued previously, but not yet flushed to + * the graphics pipeline), then ClientWaitSync may hang + * forever. To help prevent this behavior, if + * ClientWaitSync is called and all of the following are + * true: + * + * * the SYNC_FLUSH_COMMANDS_BIT bit is set in flags, + * * sync is unsignaled when ClientWaitSync is called, + * * and the calls to ClientWaitSync and FenceSync were + * issued from the same context, + * + * then the GL will behave as if the equivalent of Flush + * were inserted immediately after the creation of sync." + * + * This means we need to flush for such fences even when we're + * not going to wait. + */ + si_flush_gfx_cs(sctx, (timeout ? 0 : PIPE_FLUSH_ASYNC) | RADEON_FLUSH_START_NEXT_GFX_IB_NOW, + NULL); + sfence->gfx_unflushed.ctx = NULL; + + if (!timeout) + return false; + + /* Recompute the timeout after all that. */ + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? abs_timeout - time : 0; + } + } + + if (rws->fence_wait(rws, sfence->gfx, timeout)) + return true; + + /* Re-check in case the GPU is slow or hangs, but the commands before + * the fine-grained fence have completed. 
*/ + if (sfence->fine.buf && si_fine_fence_signaled(rws, &sfence->fine)) + return true; + + return false; } -static void si_create_fence_fd(struct pipe_context *ctx, - struct pipe_fence_handle **pfence, int fd, - enum pipe_fd_type type) +static void si_create_fence_fd(struct pipe_context *ctx, struct pipe_fence_handle **pfence, int fd, + enum pipe_fd_type type) { - struct si_screen *sscreen = (struct si_screen*)ctx->screen; - struct radeon_winsys *ws = sscreen->ws; - struct si_multi_fence *sfence; + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + struct radeon_winsys *ws = sscreen->ws; + struct si_multi_fence *sfence; - *pfence = NULL; + *pfence = NULL; - sfence = si_create_multi_fence(); - if (!sfence) - return; + sfence = si_create_multi_fence(); + if (!sfence) + return; - switch (type) { - case PIPE_FD_TYPE_NATIVE_SYNC: - if (!sscreen->info.has_fence_to_handle) - goto finish; + switch (type) { + case PIPE_FD_TYPE_NATIVE_SYNC: + if (!sscreen->info.has_fence_to_handle) + goto finish; - sfence->gfx = ws->fence_import_sync_file(ws, fd); - break; + sfence->gfx = ws->fence_import_sync_file(ws, fd); + break; - case PIPE_FD_TYPE_SYNCOBJ: - if (!sscreen->info.has_syncobj) - goto finish; + case PIPE_FD_TYPE_SYNCOBJ: + if (!sscreen->info.has_syncobj) + goto finish; - sfence->gfx = ws->fence_import_syncobj(ws, fd); - break; + sfence->gfx = ws->fence_import_syncobj(ws, fd); + break; - default: - unreachable("bad fence fd type when importing"); - } + default: + unreachable("bad fence fd type when importing"); + } finish: - if (!sfence->gfx) { - FREE(sfence); - return; - } + if (!sfence->gfx) { + FREE(sfence); + return; + } - *pfence = (struct pipe_fence_handle*)sfence; + *pfence = (struct pipe_fence_handle *)sfence; } -static int si_fence_get_fd(struct pipe_screen *screen, - struct pipe_fence_handle *fence) +static int si_fence_get_fd(struct pipe_screen *screen, struct pipe_fence_handle *fence) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct radeon_winsys *ws = sscreen->ws; - struct si_multi_fence *sfence = (struct si_multi_fence *)fence; - int gfx_fd = -1, sdma_fd = -1; - - if (!sscreen->info.has_fence_to_handle) - return -1; - - util_queue_fence_wait(&sfence->ready); - - /* Deferred fences aren't supported. */ - assert(!sfence->gfx_unflushed.ctx); - if (sfence->gfx_unflushed.ctx) - return -1; - - if (sfence->sdma) { - sdma_fd = ws->fence_export_sync_file(ws, sfence->sdma); - if (sdma_fd == -1) - return -1; - } - if (sfence->gfx) { - gfx_fd = ws->fence_export_sync_file(ws, sfence->gfx); - if (gfx_fd == -1) { - if (sdma_fd != -1) - close(sdma_fd); - return -1; - } - } - - /* If we don't have FDs at this point, it means we don't have fences - * either. */ - if (sdma_fd == -1 && gfx_fd == -1) - return ws->export_signalled_sync_file(ws); - if (sdma_fd == -1) - return gfx_fd; - if (gfx_fd == -1) - return sdma_fd; - - /* Get a fence that will be a combination of both fences. */ - sync_accumulate("radeonsi", &gfx_fd, sdma_fd); - close(sdma_fd); - return gfx_fd; + struct si_screen *sscreen = (struct si_screen *)screen; + struct radeon_winsys *ws = sscreen->ws; + struct si_multi_fence *sfence = (struct si_multi_fence *)fence; + int gfx_fd = -1, sdma_fd = -1; + + if (!sscreen->info.has_fence_to_handle) + return -1; + + util_queue_fence_wait(&sfence->ready); + + /* Deferred fences aren't supported. 
*/ + assert(!sfence->gfx_unflushed.ctx); + if (sfence->gfx_unflushed.ctx) + return -1; + + if (sfence->sdma) { + sdma_fd = ws->fence_export_sync_file(ws, sfence->sdma); + if (sdma_fd == -1) + return -1; + } + if (sfence->gfx) { + gfx_fd = ws->fence_export_sync_file(ws, sfence->gfx); + if (gfx_fd == -1) { + if (sdma_fd != -1) + close(sdma_fd); + return -1; + } + } + + /* If we don't have FDs at this point, it means we don't have fences + * either. */ + if (sdma_fd == -1 && gfx_fd == -1) + return ws->export_signalled_sync_file(ws); + if (sdma_fd == -1) + return gfx_fd; + if (gfx_fd == -1) + return sdma_fd; + + /* Get a fence that will be a combination of both fences. */ + sync_accumulate("radeonsi", &gfx_fd, sdma_fd); + close(sdma_fd); + return gfx_fd; } -static void si_flush_from_st(struct pipe_context *ctx, - struct pipe_fence_handle **fence, - unsigned flags) +static void si_flush_from_st(struct pipe_context *ctx, struct pipe_fence_handle **fence, + unsigned flags) { - struct pipe_screen *screen = ctx->screen; - struct si_context *sctx = (struct si_context *)ctx; - struct radeon_winsys *ws = sctx->ws; - struct pipe_fence_handle *gfx_fence = NULL; - struct pipe_fence_handle *sdma_fence = NULL; - bool deferred_fence = false; - struct si_fine_fence fine = {}; - unsigned rflags = PIPE_FLUSH_ASYNC; - - if (flags & PIPE_FLUSH_END_OF_FRAME) - rflags |= PIPE_FLUSH_END_OF_FRAME; - - if (flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) { - assert(flags & PIPE_FLUSH_DEFERRED); - assert(fence); - - si_fine_fence_set(sctx, &fine, flags); - } - - /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */ - if (sctx->sdma_cs) - si_flush_dma_cs(sctx, rflags, fence ? &sdma_fence : NULL); - - if (!radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size)) { - if (fence) - ws->fence_reference(&gfx_fence, sctx->last_gfx_fence); - if (!(flags & PIPE_FLUSH_DEFERRED)) - ws->cs_sync_flush(sctx->gfx_cs); - } else { - /* Instead of flushing, create a deferred fence. Constraints: - * - The state tracker must allow a deferred flush. - * - The state tracker must request a fence. - * - fence_get_fd is not allowed. - * Thread safety in fence_finish must be ensured by the state tracker. - */ - if (flags & PIPE_FLUSH_DEFERRED && - !(flags & PIPE_FLUSH_FENCE_FD) && - fence) { - gfx_fence = sctx->ws->cs_get_next_fence(sctx->gfx_cs); - deferred_fence = true; - } else { - si_flush_gfx_cs(sctx, rflags, fence ? &gfx_fence : NULL); - } - } - - /* Both engines can signal out of order, so we need to keep both fences. */ - if (fence) { - struct si_multi_fence *multi_fence; - - if (flags & TC_FLUSH_ASYNC) { - multi_fence = (struct si_multi_fence *)*fence; - assert(multi_fence); - } else { - multi_fence = si_create_multi_fence(); - if (!multi_fence) { - ws->fence_reference(&sdma_fence, NULL); - ws->fence_reference(&gfx_fence, NULL); - goto finish; - } - - screen->fence_reference(screen, fence, NULL); - *fence = (struct pipe_fence_handle*)multi_fence; - } - - /* If both fences are NULL, fence_finish will always return true. 
*/ - multi_fence->gfx = gfx_fence; - multi_fence->sdma = sdma_fence; - - if (deferred_fence) { - multi_fence->gfx_unflushed.ctx = sctx; - multi_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes; - } - - multi_fence->fine = fine; - fine.buf = NULL; - - if (flags & TC_FLUSH_ASYNC) { - util_queue_fence_signal(&multi_fence->ready); - tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL); - } - } - assert(!fine.buf); + struct pipe_screen *screen = ctx->screen; + struct si_context *sctx = (struct si_context *)ctx; + struct radeon_winsys *ws = sctx->ws; + struct pipe_fence_handle *gfx_fence = NULL; + struct pipe_fence_handle *sdma_fence = NULL; + bool deferred_fence = false; + struct si_fine_fence fine = {}; + unsigned rflags = PIPE_FLUSH_ASYNC; + + if (flags & PIPE_FLUSH_END_OF_FRAME) + rflags |= PIPE_FLUSH_END_OF_FRAME; + + if (flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) { + assert(flags & PIPE_FLUSH_DEFERRED); + assert(fence); + + si_fine_fence_set(sctx, &fine, flags); + } + + /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */ + if (sctx->sdma_cs) + si_flush_dma_cs(sctx, rflags, fence ? &sdma_fence : NULL); + + if (!radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size)) { + if (fence) + ws->fence_reference(&gfx_fence, sctx->last_gfx_fence); + if (!(flags & PIPE_FLUSH_DEFERRED)) + ws->cs_sync_flush(sctx->gfx_cs); + } else { + /* Instead of flushing, create a deferred fence. Constraints: + * - The state tracker must allow a deferred flush. + * - The state tracker must request a fence. + * - fence_get_fd is not allowed. + * Thread safety in fence_finish must be ensured by the state tracker. + */ + if (flags & PIPE_FLUSH_DEFERRED && !(flags & PIPE_FLUSH_FENCE_FD) && fence) { + gfx_fence = sctx->ws->cs_get_next_fence(sctx->gfx_cs); + deferred_fence = true; + } else { + si_flush_gfx_cs(sctx, rflags, fence ? &gfx_fence : NULL); + } + } + + /* Both engines can signal out of order, so we need to keep both fences. */ + if (fence) { + struct si_multi_fence *multi_fence; + + if (flags & TC_FLUSH_ASYNC) { + multi_fence = (struct si_multi_fence *)*fence; + assert(multi_fence); + } else { + multi_fence = si_create_multi_fence(); + if (!multi_fence) { + ws->fence_reference(&sdma_fence, NULL); + ws->fence_reference(&gfx_fence, NULL); + goto finish; + } + + screen->fence_reference(screen, fence, NULL); + *fence = (struct pipe_fence_handle *)multi_fence; + } + + /* If both fences are NULL, fence_finish will always return true. 
*/ + multi_fence->gfx = gfx_fence; + multi_fence->sdma = sdma_fence; + + if (deferred_fence) { + multi_fence->gfx_unflushed.ctx = sctx; + multi_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes; + } + + multi_fence->fine = fine; + fine.buf = NULL; + + if (flags & TC_FLUSH_ASYNC) { + util_queue_fence_signal(&multi_fence->ready); + tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL); + } + } + assert(!fine.buf); finish: - if (!(flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC))) { - if (sctx->sdma_cs) - ws->cs_sync_flush(sctx->sdma_cs); - ws->cs_sync_flush(sctx->gfx_cs); - } + if (!(flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC))) { + if (sctx->sdma_cs) + ws->cs_sync_flush(sctx->sdma_cs); + ws->cs_sync_flush(sctx->gfx_cs); + } } -static void si_fence_server_signal(struct pipe_context *ctx, - struct pipe_fence_handle *fence) +static void si_fence_server_signal(struct pipe_context *ctx, struct pipe_fence_handle *fence) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_multi_fence *sfence = (struct si_multi_fence *)fence; - - /* We should have at least one syncobj to signal */ - assert(sfence->sdma || sfence->gfx); - - if (sfence->sdma) - si_add_syncobj_signal(sctx, sfence->sdma); - if (sfence->gfx) - si_add_syncobj_signal(sctx, sfence->gfx); - - /** - * The spec does not require a flush here. We insert a flush - * because syncobj based signals are not directly placed into - * the command stream. Instead the signal happens when the - * submission associated with the syncobj finishes execution. - * - * Therefore, we must make sure that we flush the pipe to avoid - * new work being emitted and getting executed before the signal - * operation. - * - * Set sctx->initial_gfx_cs_size to force IB submission even if - * it is empty. - */ - sctx->initial_gfx_cs_size = 0; - si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC); + struct si_context *sctx = (struct si_context *)ctx; + struct si_multi_fence *sfence = (struct si_multi_fence *)fence; + + /* We should have at least one syncobj to signal */ + assert(sfence->sdma || sfence->gfx); + + if (sfence->sdma) + si_add_syncobj_signal(sctx, sfence->sdma); + if (sfence->gfx) + si_add_syncobj_signal(sctx, sfence->gfx); + + /** + * The spec does not require a flush here. We insert a flush + * because syncobj based signals are not directly placed into + * the command stream. Instead the signal happens when the + * submission associated with the syncobj finishes execution. + * + * Therefore, we must make sure that we flush the pipe to avoid + * new work being emitted and getting executed before the signal + * operation. + * + * Set sctx->initial_gfx_cs_size to force IB submission even if + * it is empty. + */ + sctx->initial_gfx_cs_size = 0; + si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC); } -static void si_fence_server_sync(struct pipe_context *ctx, - struct pipe_fence_handle *fence) +static void si_fence_server_sync(struct pipe_context *ctx, struct pipe_fence_handle *fence) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_multi_fence *sfence = (struct si_multi_fence *)fence; - - util_queue_fence_wait(&sfence->ready); - - /* Unflushed fences from the same context are no-ops. */ - if (sfence->gfx_unflushed.ctx && - sfence->gfx_unflushed.ctx == sctx) - return; - - /* All unflushed commands will not start execution before - * this fence dependency is signalled. 
- * - * Therefore we must flush before inserting the dependency - */ - si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC); - - if (sfence->sdma) - si_add_fence_dependency(sctx, sfence->sdma); - if (sfence->gfx) - si_add_fence_dependency(sctx, sfence->gfx); + struct si_context *sctx = (struct si_context *)ctx; + struct si_multi_fence *sfence = (struct si_multi_fence *)fence; + + util_queue_fence_wait(&sfence->ready); + + /* Unflushed fences from the same context are no-ops. */ + if (sfence->gfx_unflushed.ctx && sfence->gfx_unflushed.ctx == sctx) + return; + + /* All unflushed commands will not start execution before + * this fence dependency is signalled. + * + * Therefore we must flush before inserting the dependency + */ + si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC); + + if (sfence->sdma) + si_add_fence_dependency(sctx, sfence->sdma); + if (sfence->gfx) + si_add_fence_dependency(sctx, sfence->gfx); } void si_init_fence_functions(struct si_context *ctx) { - ctx->b.flush = si_flush_from_st; - ctx->b.create_fence_fd = si_create_fence_fd; - ctx->b.fence_server_sync = si_fence_server_sync; - ctx->b.fence_server_signal = si_fence_server_signal; + ctx->b.flush = si_flush_from_st; + ctx->b.create_fence_fd = si_create_fence_fd; + ctx->b.fence_server_sync = si_fence_server_sync; + ctx->b.fence_server_signal = si_fence_server_signal; } void si_init_screen_fence_functions(struct si_screen *screen) { - screen->b.fence_finish = si_fence_finish; - screen->b.fence_reference = si_fence_reference; - screen->b.fence_get_fd = si_fence_get_fd; + screen->b.fence_finish = si_fence_finish; + screen->b.fence_reference = si_fence_reference; + screen->b.fence_get_fd = si_fence_get_fd; } diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index f0a00b17e7e..2a4a23cec13 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -22,981 +22,947 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_pipe.h" -#include "radeon/radeon_video.h" -#include "radeon/radeon_vce.h" +#include "compiler/nir/nir.h" #include "radeon/radeon_uvd_enc.h" -#include "vl/vl_decoder.h" -#include "vl/vl_video_buffer.h" +#include "radeon/radeon_vce.h" +#include "radeon/radeon_video.h" +#include "si_pipe.h" #include "util/u_screen.h" #include "util/u_video.h" -#include "compiler/nir/nir.h" - +#include "vl/vl_decoder.h" +#include "vl/vl_video_buffer.h" #include static const char *si_get_vendor(struct pipe_screen *pscreen) { - /* Don't change this. Games such as Alien Isolation are broken if this - * returns "Advanced Micro Devices, Inc." - */ - return "X.Org"; + /* Don't change this. Games such as Alien Isolation are broken if this + * returns "Advanced Micro Devices, Inc." + */ + return "X.Org"; } static const char *si_get_device_vendor(struct pipe_screen *pscreen) { - return "AMD"; + return "AMD"; } static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) { - struct si_screen *sscreen = (struct si_screen *)pscreen; - - switch (param) { - /* Supported features (boolean caps). 
*/ - case PIPE_CAP_ACCELERATED: - case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: - case PIPE_CAP_ANISOTROPIC_FILTER: - case PIPE_CAP_POINT_SPRITE: - case PIPE_CAP_OCCLUSION_QUERY: - case PIPE_CAP_TEXTURE_MIRROR_CLAMP: - case PIPE_CAP_TEXTURE_SHADOW_LOD: - case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE: - case PIPE_CAP_BLEND_EQUATION_SEPARATE: - case PIPE_CAP_TEXTURE_SWIZZLE: - case PIPE_CAP_DEPTH_CLIP_DISABLE: - case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: - case PIPE_CAP_SHADER_STENCIL_EXPORT: - case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: - case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: - case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: - case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: - case PIPE_CAP_VERTEX_SHADER_SATURATE: - case PIPE_CAP_SEAMLESS_CUBE_MAP: - case PIPE_CAP_PRIMITIVE_RESTART: - case PIPE_CAP_CONDITIONAL_RENDER: - case PIPE_CAP_TEXTURE_BARRIER: - case PIPE_CAP_INDEP_BLEND_ENABLE: - case PIPE_CAP_INDEP_BLEND_FUNC: - case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: - case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: - case PIPE_CAP_START_INSTANCE: - case PIPE_CAP_NPOT_TEXTURES: - case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: - case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: - case PIPE_CAP_VERTEX_COLOR_CLAMPED: - case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: - case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: - case PIPE_CAP_TGSI_INSTANCEID: - case PIPE_CAP_COMPUTE: - case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: - case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: - case PIPE_CAP_QUERY_PIPELINE_STATISTICS: - case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: - case PIPE_CAP_CUBE_MAP_ARRAY: - case PIPE_CAP_SAMPLE_SHADING: - case PIPE_CAP_DRAW_INDIRECT: - case PIPE_CAP_CLIP_HALFZ: - case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: - case PIPE_CAP_POLYGON_OFFSET_CLAMP: - case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: - case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: - case PIPE_CAP_TGSI_TEXCOORD: - case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: - case PIPE_CAP_TEXTURE_FLOAT_LINEAR: - case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: - case PIPE_CAP_SHAREABLE_SHADERS: - case PIPE_CAP_DEPTH_BOUNDS_TEST: - case PIPE_CAP_SAMPLER_VIEW_TARGET: - case PIPE_CAP_TEXTURE_QUERY_LOD: - case PIPE_CAP_TEXTURE_GATHER_SM5: - case PIPE_CAP_TGSI_TXQS: - case PIPE_CAP_FORCE_PERSAMPLE_INTERP: - case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: - case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: - case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: - case PIPE_CAP_INVALIDATE_BUFFER: - case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: - case PIPE_CAP_QUERY_BUFFER_OBJECT: - case PIPE_CAP_QUERY_MEMORY_INFO: - case PIPE_CAP_TGSI_PACK_HALF_FLOAT: - case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: - case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: - case PIPE_CAP_GENERATE_MIPMAP: - case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: - case PIPE_CAP_STRING_MARKER: - case PIPE_CAP_CLEAR_TEXTURE: - case PIPE_CAP_CULL_DISTANCE: - case PIPE_CAP_TGSI_ARRAY_COMPONENTS: - case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: - case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: - case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: - case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: - case PIPE_CAP_DOUBLES: - case PIPE_CAP_TGSI_TEX_TXF_LZ: - case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: - case PIPE_CAP_BINDLESS_TEXTURE: - case PIPE_CAP_QUERY_TIMESTAMP: - case PIPE_CAP_QUERY_TIME_ELAPSED: - case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: - case PIPE_CAP_MEMOBJ: - case PIPE_CAP_LOAD_CONSTBUF: - case 
PIPE_CAP_INT64: - case PIPE_CAP_INT64_DIVMOD: - case PIPE_CAP_TGSI_CLOCK: - case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: - case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: - case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET: - case PIPE_CAP_TGSI_BALLOT: - case PIPE_CAP_TGSI_VOTE: - case PIPE_CAP_FBFETCH: - case PIPE_CAP_COMPUTE_GRID_INFO_LAST_BLOCK: - case PIPE_CAP_IMAGE_LOAD_FORMATTED: - case PIPE_CAP_PREFER_COMPUTE_FOR_MULTIMEDIA: - case PIPE_CAP_TGSI_DIV: - case PIPE_CAP_PACKED_UNIFORMS: - case PIPE_CAP_SHADER_SAMPLES_IDENTICAL: - case PIPE_CAP_GL_SPIRV: - case PIPE_CAP_DRAW_INFO_START_WITH_USER_INDICES: - return 1; - - case PIPE_CAP_QUERY_SO_OVERFLOW: - return !sscreen->use_ngg_streamout; - - case PIPE_CAP_POST_DEPTH_COVERAGE: - return sscreen->info.chip_class >= GFX10; - - case PIPE_CAP_GRAPHICS: - return sscreen->info.has_graphics; - - case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: - return !SI_BIG_ENDIAN && sscreen->info.has_userptr; - - case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: - return sscreen->info.has_gpu_reset_status_query; - - case PIPE_CAP_TEXTURE_MULTISAMPLE: - return sscreen->info.has_2d_tiling; - - case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: - return SI_MAP_BUFFER_ALIGNMENT; - - case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - case PIPE_CAP_MAX_VERTEX_STREAMS: - case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_MAX_WINDOW_RECTANGLES: - return 4; - - case PIPE_CAP_GLSL_FEATURE_LEVEL: - case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: - if (!sscreen->info.has_indirect_compute_dispatch) - return 420; - return 460; - - case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: - /* Optimal number for good TexSubImage performance on Polaris10. */ - return 64 * 1024 * 1024; - - case PIPE_CAP_GL_BEGIN_END_BUFFER_SIZE: - return 4096 * 1024; - - case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: - case PIPE_CAP_MAX_SHADER_BUFFER_SIZE: - return MIN2(sscreen->info.max_alloc_size, INT_MAX); - - case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: - return LLVM_VERSION_MAJOR < 9 && !sscreen->info.has_unaligned_shader_loads; - - case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: - return sscreen->info.has_sparse_vm_mappings ? - RADEON_SPARSE_PAGE_SIZE : 0; - - - case PIPE_CAP_UMA: - return 0; - - case PIPE_CAP_FENCE_SIGNAL: - return sscreen->info.has_syncobj; - - case PIPE_CAP_CONSTBUF0_FLAGS: - return SI_RESOURCE_FLAG_32BIT; - - case PIPE_CAP_NATIVE_FENCE_FD: - return sscreen->info.has_fence_to_handle; - - case PIPE_CAP_DRAW_PARAMETERS: - case PIPE_CAP_MULTI_DRAW_INDIRECT: - case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: - return sscreen->has_draw_indirect_multi; - - case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: - return 30; - - case PIPE_CAP_MAX_VARYINGS: - return 32; - - case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: - return sscreen->info.chip_class <= GFX8 ? - PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0; - - /* Stream output. */ - case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - return 32*4; - - /* Geometry shader output. */ - case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: - /* gfx9 has to report 256 to make piglit/gs-max-output pass. - * gfx8 and earlier can do 1024. 
- */ - return 256; - case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: - return 4095; - case PIPE_CAP_MAX_GS_INVOCATIONS: - /* The closed driver exposes 127, but 125 is the greatest - * number that works. */ - return 125; - - case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: - return 2048; - - /* Texturing. */ - case PIPE_CAP_MAX_TEXTURE_2D_SIZE: - return 16384; - case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: - return 15; /* 16384 */ - case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: - if (sscreen->info.chip_class >= GFX10) - return 14; - /* textures support 8192, but layered rendering supports 2048 */ - return 12; - case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: - if (sscreen->info.chip_class >= GFX10) - return 8192; - /* textures support 8192, but layered rendering supports 2048 */ - return 2048; - - /* Viewports and render targets. */ - case PIPE_CAP_MAX_VIEWPORTS: - return SI_MAX_VIEWPORTS; - case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: - case PIPE_CAP_RASTERIZER_SUBPIXEL_BITS: - case PIPE_CAP_MAX_RENDER_TARGETS: - return 8; - case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS: - return sscreen->info.has_eqaa_surface_allocator ? 2 : 0; - - case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MIN_TEXEL_OFFSET: - return -32; - - case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MAX_TEXEL_OFFSET: - return 31; - - case PIPE_CAP_ENDIANNESS: - return PIPE_ENDIAN_LITTLE; - - case PIPE_CAP_VENDOR_ID: - return ATI_VENDOR_ID; - case PIPE_CAP_DEVICE_ID: - return sscreen->info.pci_id; - case PIPE_CAP_VIDEO_MEMORY: - return sscreen->info.vram_size >> 20; - case PIPE_CAP_PCI_GROUP: - return sscreen->info.pci_domain; - case PIPE_CAP_PCI_BUS: - return sscreen->info.pci_bus; - case PIPE_CAP_PCI_DEVICE: - return sscreen->info.pci_dev; - case PIPE_CAP_PCI_FUNCTION: - return sscreen->info.pci_func; - case PIPE_CAP_TGSI_ATOMINC_WRAP: - return LLVM_VERSION_MAJOR >= 10; - - default: - return u_pipe_screen_get_param_defaults(pscreen, param); - } + struct si_screen *sscreen = (struct si_screen *)pscreen; + + switch (param) { + /* Supported features (boolean caps). 
*/ + case PIPE_CAP_ACCELERATED: + case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: + case PIPE_CAP_ANISOTROPIC_FILTER: + case PIPE_CAP_POINT_SPRITE: + case PIPE_CAP_OCCLUSION_QUERY: + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + case PIPE_CAP_TEXTURE_SHADOW_LOD: + case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE: + case PIPE_CAP_BLEND_EQUATION_SEPARATE: + case PIPE_CAP_TEXTURE_SWIZZLE: + case PIPE_CAP_DEPTH_CLIP_DISABLE: + case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: + case PIPE_CAP_SHADER_STENCIL_EXPORT: + case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: + case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: + case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: + case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: + case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: + case PIPE_CAP_VERTEX_SHADER_SATURATE: + case PIPE_CAP_SEAMLESS_CUBE_MAP: + case PIPE_CAP_PRIMITIVE_RESTART: + case PIPE_CAP_CONDITIONAL_RENDER: + case PIPE_CAP_TEXTURE_BARRIER: + case PIPE_CAP_INDEP_BLEND_ENABLE: + case PIPE_CAP_INDEP_BLEND_FUNC: + case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: + case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: + case PIPE_CAP_START_INSTANCE: + case PIPE_CAP_NPOT_TEXTURES: + case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: + case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: + case PIPE_CAP_VERTEX_COLOR_CLAMPED: + case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: + case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: + case PIPE_CAP_TGSI_INSTANCEID: + case PIPE_CAP_COMPUTE: + case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: + case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: + case PIPE_CAP_QUERY_PIPELINE_STATISTICS: + case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: + case PIPE_CAP_CUBE_MAP_ARRAY: + case PIPE_CAP_SAMPLE_SHADING: + case PIPE_CAP_DRAW_INDIRECT: + case PIPE_CAP_CLIP_HALFZ: + case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: + case PIPE_CAP_POLYGON_OFFSET_CLAMP: + case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + case PIPE_CAP_TGSI_TEXCOORD: + case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: + case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_SHAREABLE_SHADERS: + case PIPE_CAP_DEPTH_BOUNDS_TEST: + case PIPE_CAP_SAMPLER_VIEW_TARGET: + case PIPE_CAP_TEXTURE_QUERY_LOD: + case PIPE_CAP_TEXTURE_GATHER_SM5: + case PIPE_CAP_TGSI_TXQS: + case PIPE_CAP_FORCE_PERSAMPLE_INTERP: + case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_INVALIDATE_BUFFER: + case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: + case PIPE_CAP_QUERY_BUFFER_OBJECT: + case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: + case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: + case PIPE_CAP_STRING_MARKER: + case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_CULL_DISTANCE: + case PIPE_CAP_TGSI_ARRAY_COMPONENTS: + case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: + case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: + case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: + case PIPE_CAP_DOUBLES: + case PIPE_CAP_TGSI_TEX_TXF_LZ: + case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: + case PIPE_CAP_BINDLESS_TEXTURE: + case PIPE_CAP_QUERY_TIMESTAMP: + case PIPE_CAP_QUERY_TIME_ELAPSED: + case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: + case PIPE_CAP_MEMOBJ: + case PIPE_CAP_LOAD_CONSTBUF: + case 
PIPE_CAP_INT64: + case PIPE_CAP_INT64_DIVMOD: + case PIPE_CAP_TGSI_CLOCK: + case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: + case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: + case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET: + case PIPE_CAP_TGSI_BALLOT: + case PIPE_CAP_TGSI_VOTE: + case PIPE_CAP_FBFETCH: + case PIPE_CAP_COMPUTE_GRID_INFO_LAST_BLOCK: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: + case PIPE_CAP_PREFER_COMPUTE_FOR_MULTIMEDIA: + case PIPE_CAP_TGSI_DIV: + case PIPE_CAP_PACKED_UNIFORMS: + case PIPE_CAP_SHADER_SAMPLES_IDENTICAL: + case PIPE_CAP_GL_SPIRV: + case PIPE_CAP_DRAW_INFO_START_WITH_USER_INDICES: + return 1; + + case PIPE_CAP_QUERY_SO_OVERFLOW: + return !sscreen->use_ngg_streamout; + + case PIPE_CAP_POST_DEPTH_COVERAGE: + return sscreen->info.chip_class >= GFX10; + + case PIPE_CAP_GRAPHICS: + return sscreen->info.has_graphics; + + case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + return !SI_BIG_ENDIAN && sscreen->info.has_userptr; + + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + return sscreen->info.has_gpu_reset_status_query; + + case PIPE_CAP_TEXTURE_MULTISAMPLE: + return sscreen->info.has_2d_tiling; + + case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: + return SI_MAP_BUFFER_ALIGNMENT; + + case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: + case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: + case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: + case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + case PIPE_CAP_MAX_VERTEX_STREAMS: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: + case PIPE_CAP_MAX_WINDOW_RECTANGLES: + return 4; + + case PIPE_CAP_GLSL_FEATURE_LEVEL: + case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: + if (!sscreen->info.has_indirect_compute_dispatch) + return 420; + return 460; + + case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: + /* Optimal number for good TexSubImage performance on Polaris10. */ + return 64 * 1024 * 1024; + + case PIPE_CAP_GL_BEGIN_END_BUFFER_SIZE: + return 4096 * 1024; + + case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: + case PIPE_CAP_MAX_SHADER_BUFFER_SIZE: + return MIN2(sscreen->info.max_alloc_size, INT_MAX); + + case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: + return LLVM_VERSION_MAJOR < 9 && !sscreen->info.has_unaligned_shader_loads; + + case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: + return sscreen->info.has_sparse_vm_mappings ? RADEON_SPARSE_PAGE_SIZE : 0; + + case PIPE_CAP_UMA: + return 0; + + case PIPE_CAP_FENCE_SIGNAL: + return sscreen->info.has_syncobj; + + case PIPE_CAP_CONSTBUF0_FLAGS: + return SI_RESOURCE_FLAG_32BIT; + + case PIPE_CAP_NATIVE_FENCE_FD: + return sscreen->info.has_fence_to_handle; + + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: + return sscreen->has_draw_indirect_multi; + + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + return 30; + + case PIPE_CAP_MAX_VARYINGS: + return 32; + + case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: + return sscreen->info.chip_class <= GFX8 ? PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0; + + /* Stream output. */ + case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: + case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + return 32 * 4; + + /* Geometry shader output. */ + case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: + /* gfx9 has to report 256 to make piglit/gs-max-output pass. + * gfx8 and earlier can do 1024. 
+ */ + return 256; + case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: + return 4095; + case PIPE_CAP_MAX_GS_INVOCATIONS: + /* The closed driver exposes 127, but 125 is the greatest + * number that works. */ + return 125; + + case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: + return 2048; + + /* Texturing. */ + case PIPE_CAP_MAX_TEXTURE_2D_SIZE: + return 16384; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 15; /* 16384 */ + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + if (sscreen->info.chip_class >= GFX10) + return 14; + /* textures support 8192, but layered rendering supports 2048 */ + return 12; + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + if (sscreen->info.chip_class >= GFX10) + return 8192; + /* textures support 8192, but layered rendering supports 2048 */ + return 2048; + + /* Viewports and render targets. */ + case PIPE_CAP_MAX_VIEWPORTS: + return SI_MAX_VIEWPORTS; + case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: + case PIPE_CAP_RASTERIZER_SUBPIXEL_BITS: + case PIPE_CAP_MAX_RENDER_TARGETS: + return 8; + case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS: + return sscreen->info.has_eqaa_surface_allocator ? 2 : 0; + + case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: + case PIPE_CAP_MIN_TEXEL_OFFSET: + return -32; + + case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: + case PIPE_CAP_MAX_TEXEL_OFFSET: + return 31; + + case PIPE_CAP_ENDIANNESS: + return PIPE_ENDIAN_LITTLE; + + case PIPE_CAP_VENDOR_ID: + return ATI_VENDOR_ID; + case PIPE_CAP_DEVICE_ID: + return sscreen->info.pci_id; + case PIPE_CAP_VIDEO_MEMORY: + return sscreen->info.vram_size >> 20; + case PIPE_CAP_PCI_GROUP: + return sscreen->info.pci_domain; + case PIPE_CAP_PCI_BUS: + return sscreen->info.pci_bus; + case PIPE_CAP_PCI_DEVICE: + return sscreen->info.pci_dev; + case PIPE_CAP_PCI_FUNCTION: + return sscreen->info.pci_func; + case PIPE_CAP_TGSI_ATOMINC_WRAP: + return LLVM_VERSION_MAJOR >= 10; + + default: + return u_pipe_screen_get_param_defaults(pscreen, param); + } } -static float si_get_paramf(struct pipe_screen* pscreen, enum pipe_capf param) +static float si_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param) { - switch (param) { - case PIPE_CAPF_MAX_LINE_WIDTH: - case PIPE_CAPF_MAX_LINE_WIDTH_AA: - /* This depends on the quant mode, though the precise interactions - * are unknown. */ - return 2048; - case PIPE_CAPF_MAX_POINT_WIDTH: - case PIPE_CAPF_MAX_POINT_WIDTH_AA: - return SI_MAX_POINT_SIZE; - case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: - return 16.0f; - case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: - return 16.0f; - case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: - case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: - case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: - return 0.0f; - } - return 0.0f; + switch (param) { + case PIPE_CAPF_MAX_LINE_WIDTH: + case PIPE_CAPF_MAX_LINE_WIDTH_AA: + /* This depends on the quant mode, though the precise interactions + * are unknown. 
*/ + return 2048; + case PIPE_CAPF_MAX_POINT_WIDTH: + case PIPE_CAPF_MAX_POINT_WIDTH_AA: + return SI_MAX_POINT_SIZE; + case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: + return 16.0f; + case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: + return 16.0f; + case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: + return 0.0f; + } + return 0.0f; } -static int si_get_shader_param(struct pipe_screen* pscreen, - enum pipe_shader_type shader, - enum pipe_shader_cap param) +static int si_get_shader_param(struct pipe_screen *pscreen, enum pipe_shader_type shader, + enum pipe_shader_cap param) { - struct si_screen *sscreen = (struct si_screen *)pscreen; - - switch(shader) - { - case PIPE_SHADER_FRAGMENT: - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_GEOMETRY: - case PIPE_SHADER_TESS_CTRL: - case PIPE_SHADER_TESS_EVAL: - break; - case PIPE_SHADER_COMPUTE: - switch (param) { - case PIPE_SHADER_CAP_SUPPORTED_IRS: { - int ir = 1 << PIPE_SHADER_IR_NATIVE; - - if (sscreen->info.has_indirect_compute_dispatch) - ir |= 1 << PIPE_SHADER_IR_NIR; - - return ir; - } - - case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: { - uint64_t max_const_buffer_size; - pscreen->get_compute_param(pscreen, PIPE_SHADER_IR_NIR, - PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, - &max_const_buffer_size); - return MIN2(max_const_buffer_size, INT_MAX); - } - default: - /* If compute shaders don't require a special value - * for this cap, we can return the same value we - * do for other shader types. */ - break; - } - break; - default: - return 0; - } - - switch (param) { - /* Shader limits. */ - case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: - case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: - return 16384; - case PIPE_SHADER_CAP_MAX_INPUTS: - return shader == PIPE_SHADER_VERTEX ? SI_MAX_ATTRIBS : 32; - case PIPE_SHADER_CAP_MAX_OUTPUTS: - return shader == PIPE_SHADER_FRAGMENT ? 8 : 32; - case PIPE_SHADER_CAP_MAX_TEMPS: - return 256; /* Max native temporaries. */ - case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: - return MIN2(sscreen->info.max_alloc_size, INT_MAX - 3); /* aligned to 4 */ - case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: - return SI_NUM_CONST_BUFFERS; - case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: - case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: - return SI_NUM_SAMPLERS; - case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: - return SI_NUM_SHADER_BUFFERS; - case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: - return SI_NUM_IMAGES; - case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: - return 0; - case PIPE_SHADER_CAP_PREFERRED_IR: - return PIPE_SHADER_IR_NIR; - case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: - return 4; - - /* Supported boolean features. */ - case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: - case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: - case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: - case PIPE_SHADER_CAP_INTEGERS: - case PIPE_SHADER_CAP_INT64_ATOMICS: - case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: - case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: - case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: - return 1; - - case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: - /* TODO: Indirect indexing of GS inputs is unimplemented. 
*/ - if (shader == PIPE_SHADER_GEOMETRY) - return 0; - - if (shader == PIPE_SHADER_VERTEX && - !sscreen->llvm_has_working_vgpr_indexing) - return 0; - - /* TCS and TES load inputs directly from LDS or offchip - * memory, so indirect indexing is always supported. - * PS has to support indirect indexing, because we can't - * lower that to TEMPs for INTERP instructions. - */ - return 1; - - case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: - return sscreen->llvm_has_working_vgpr_indexing || - /* TCS stores outputs directly to memory. */ - shader == PIPE_SHADER_TESS_CTRL; - - /* Unsupported boolean features. */ - case PIPE_SHADER_CAP_FP16: - case PIPE_SHADER_CAP_SUBROUTINES: - case PIPE_SHADER_CAP_SUPPORTED_IRS: - case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: - case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: - return 0; - } - return 0; + struct si_screen *sscreen = (struct si_screen *)pscreen; + + switch (shader) { + case PIPE_SHADER_FRAGMENT: + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_GEOMETRY: + case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: + break; + case PIPE_SHADER_COMPUTE: + switch (param) { + case PIPE_SHADER_CAP_SUPPORTED_IRS: { + int ir = 1 << PIPE_SHADER_IR_NATIVE; + + if (sscreen->info.has_indirect_compute_dispatch) + ir |= 1 << PIPE_SHADER_IR_NIR; + + return ir; + } + + case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: { + uint64_t max_const_buffer_size; + pscreen->get_compute_param(pscreen, PIPE_SHADER_IR_NIR, + PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, &max_const_buffer_size); + return MIN2(max_const_buffer_size, INT_MAX); + } + default: + /* If compute shaders don't require a special value + * for this cap, we can return the same value we + * do for other shader types. */ + break; + } + break; + default: + return 0; + } + + switch (param) { + /* Shader limits. */ + case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: + case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: + return 16384; + case PIPE_SHADER_CAP_MAX_INPUTS: + return shader == PIPE_SHADER_VERTEX ? SI_MAX_ATTRIBS : 32; + case PIPE_SHADER_CAP_MAX_OUTPUTS: + return shader == PIPE_SHADER_FRAGMENT ? 8 : 32; + case PIPE_SHADER_CAP_MAX_TEMPS: + return 256; /* Max native temporaries. */ + case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: + return MIN2(sscreen->info.max_alloc_size, INT_MAX - 3); /* aligned to 4 */ + case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: + return SI_NUM_CONST_BUFFERS; + case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: + case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: + return SI_NUM_SAMPLERS; + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: + return SI_NUM_SHADER_BUFFERS; + case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + return SI_NUM_IMAGES; + case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: + return 0; + case PIPE_SHADER_CAP_PREFERRED_IR: + return PIPE_SHADER_IR_NIR; + case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: + return 4; + + /* Supported boolean features. 
*/ + case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: + case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: + case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: + case PIPE_SHADER_CAP_INTEGERS: + case PIPE_SHADER_CAP_INT64_ATOMICS: + case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: + case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: + return 1; + + case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: + /* TODO: Indirect indexing of GS inputs is unimplemented. */ + if (shader == PIPE_SHADER_GEOMETRY) + return 0; + + if (shader == PIPE_SHADER_VERTEX && !sscreen->llvm_has_working_vgpr_indexing) + return 0; + + /* TCS and TES load inputs directly from LDS or offchip + * memory, so indirect indexing is always supported. + * PS has to support indirect indexing, because we can't + * lower that to TEMPs for INTERP instructions. + */ + return 1; + + case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: + return sscreen->llvm_has_working_vgpr_indexing || + /* TCS stores outputs directly to memory. */ + shader == PIPE_SHADER_TESS_CTRL; + + /* Unsupported boolean features. */ + case PIPE_SHADER_CAP_FP16: + case PIPE_SHADER_CAP_SUBROUTINES: + case PIPE_SHADER_CAP_SUPPORTED_IRS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: + return 0; + } + return 0; } static const struct nir_shader_compiler_options nir_options = { - .lower_scmp = true, - .lower_flrp32 = true, - .lower_flrp64 = true, - .lower_fsat = true, - .lower_fdiv = true, - .lower_bitfield_insert_to_bitfield_select = true, - .lower_bitfield_extract = true, - .lower_sub = true, - .fuse_ffma = true, - .lower_fmod = true, - .lower_pack_snorm_4x8 = true, - .lower_pack_unorm_4x8 = true, - .lower_unpack_snorm_2x16 = true, - .lower_unpack_snorm_4x8 = true, - .lower_unpack_unorm_2x16 = true, - .lower_unpack_unorm_4x8 = true, - .lower_extract_byte = true, - .lower_extract_word = true, - .lower_rotate = true, - .lower_to_scalar = true, - .optimize_sample_mask_in = true, - .max_unroll_iterations = 32, - .use_interpolated_input_intrinsics = true, + .lower_scmp = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_fsat = true, + .lower_fdiv = true, + .lower_bitfield_insert_to_bitfield_select = true, + .lower_bitfield_extract = true, + .lower_sub = true, + .fuse_ffma = true, + .lower_fmod = true, + .lower_pack_snorm_4x8 = true, + .lower_pack_unorm_4x8 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_rotate = true, + .lower_to_scalar = true, + .optimize_sample_mask_in = true, + .max_unroll_iterations = 32, + .use_interpolated_input_intrinsics = true, }; -static const void * -si_get_compiler_options(struct pipe_screen *screen, - enum pipe_shader_ir ir, - enum pipe_shader_type shader) +static const void *si_get_compiler_options(struct pipe_screen *screen, enum pipe_shader_ir ir, + enum pipe_shader_type shader) { - assert(ir == PIPE_SHADER_IR_NIR); - return &nir_options; + assert(ir == PIPE_SHADER_IR_NIR); + return &nir_options; } static void si_get_driver_uuid(struct pipe_screen *pscreen, char *uuid) { - ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE); + ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE); } static void si_get_device_uuid(struct pipe_screen *pscreen, 
char *uuid) { - struct si_screen *sscreen = (struct si_screen *)pscreen; + struct si_screen *sscreen = (struct si_screen *)pscreen; - ac_compute_device_uuid(&sscreen->info, uuid, PIPE_UUID_SIZE); + ac_compute_device_uuid(&sscreen->info, uuid, PIPE_UUID_SIZE); } -static const char* si_get_name(struct pipe_screen *pscreen) +static const char *si_get_name(struct pipe_screen *pscreen) { - struct si_screen *sscreen = (struct si_screen*)pscreen; + struct si_screen *sscreen = (struct si_screen *)pscreen; - return sscreen->renderer_string; + return sscreen->renderer_string; } -static int si_get_video_param_no_decode(struct pipe_screen *screen, - enum pipe_video_profile profile, - enum pipe_video_entrypoint entrypoint, - enum pipe_video_cap param) +static int si_get_video_param_no_decode(struct pipe_screen *screen, enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint, + enum pipe_video_cap param) { - switch (param) { - case PIPE_VIDEO_CAP_SUPPORTED: - return vl_profile_supported(screen, profile, entrypoint); - case PIPE_VIDEO_CAP_NPOT_TEXTURES: - return 1; - case PIPE_VIDEO_CAP_MAX_WIDTH: - case PIPE_VIDEO_CAP_MAX_HEIGHT: - return vl_video_buffer_max_size(screen); - case PIPE_VIDEO_CAP_PREFERED_FORMAT: - return PIPE_FORMAT_NV12; - case PIPE_VIDEO_CAP_PREFERS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: - return true; - case PIPE_VIDEO_CAP_MAX_LEVEL: - return vl_level_supported(screen, profile); - default: - return 0; - } + switch (param) { + case PIPE_VIDEO_CAP_SUPPORTED: + return vl_profile_supported(screen, profile, entrypoint); + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: + case PIPE_VIDEO_CAP_MAX_HEIGHT: + return vl_video_buffer_max_size(screen); + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + return PIPE_FORMAT_NV12; + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_MAX_LEVEL: + return vl_level_supported(screen, profile); + default: + return 0; + } } -static int si_get_video_param(struct pipe_screen *screen, - enum pipe_video_profile profile, - enum pipe_video_entrypoint entrypoint, - enum pipe_video_cap param) +static int si_get_video_param(struct pipe_screen *screen, enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint, enum pipe_video_cap param) { - struct si_screen *sscreen = (struct si_screen *)screen; - enum pipe_video_format codec = u_reduce_video_profile(profile); - - if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) { - switch (param) { - case PIPE_VIDEO_CAP_SUPPORTED: - return ((codec == PIPE_VIDEO_FORMAT_MPEG4_AVC && - (sscreen->info.family >= CHIP_RAVEN || - si_vce_is_fw_version_supported(sscreen))) || - (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN && - (sscreen->info.family >= CHIP_RAVEN || - si_radeon_uvd_enc_supported(sscreen))) || - (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10 && - sscreen->info.family >= CHIP_RENOIR)); - case PIPE_VIDEO_CAP_NPOT_TEXTURES: - return 1; - case PIPE_VIDEO_CAP_MAX_WIDTH: - return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096; - case PIPE_VIDEO_CAP_MAX_HEIGHT: - return (sscreen->info.family < CHIP_TONGA) ? 
1152 : 2304; - case PIPE_VIDEO_CAP_PREFERED_FORMAT: - return PIPE_FORMAT_NV12; - case PIPE_VIDEO_CAP_PREFERS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: - return true; - case PIPE_VIDEO_CAP_STACKED_FRAMES: - return (sscreen->info.family < CHIP_TONGA) ? 1 : 2; - default: - return 0; - } - } - - switch (param) { - case PIPE_VIDEO_CAP_SUPPORTED: - switch (codec) { - case PIPE_VIDEO_FORMAT_MPEG12: - return profile != PIPE_VIDEO_PROFILE_MPEG1; - case PIPE_VIDEO_FORMAT_MPEG4: - return 1; - case PIPE_VIDEO_FORMAT_MPEG4_AVC: - if ((sscreen->info.family == CHIP_POLARIS10 || - sscreen->info.family == CHIP_POLARIS11) && - sscreen->info.uvd_fw_version < UVD_FW_1_66_16 ) { - RVID_ERR("POLARIS10/11 firmware version need to be updated.\n"); - return false; - } - return true; - case PIPE_VIDEO_FORMAT_VC1: - return true; - case PIPE_VIDEO_FORMAT_HEVC: - /* Carrizo only supports HEVC Main */ - if (sscreen->info.family >= CHIP_STONEY) - return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN || - profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10); - else if (sscreen->info.family >= CHIP_CARRIZO) - return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; - return false; - case PIPE_VIDEO_FORMAT_JPEG: - if (sscreen->info.family >= CHIP_RAVEN) - return true; - if (sscreen->info.family < CHIP_CARRIZO || sscreen->info.family >= CHIP_VEGA10) - return false; - if (!(sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 19)) { - RVID_ERR("No MJPEG support for the kernel version\n"); - return false; - } - return true; - case PIPE_VIDEO_FORMAT_VP9: - if (sscreen->info.family < CHIP_RAVEN) - return false; - return true; - default: - return false; - } - case PIPE_VIDEO_CAP_NPOT_TEXTURES: - return 1; - case PIPE_VIDEO_CAP_MAX_WIDTH: - switch (codec) { - case PIPE_VIDEO_FORMAT_HEVC: - case PIPE_VIDEO_FORMAT_VP9: - return (sscreen->info.family < CHIP_RENOIR) ? - ((sscreen->info.family < CHIP_TONGA) ? 2048 : 4096) : - 8192; - default: - return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096; - } - case PIPE_VIDEO_CAP_MAX_HEIGHT: - switch (codec) { - case PIPE_VIDEO_FORMAT_HEVC: - case PIPE_VIDEO_FORMAT_VP9: - return (sscreen->info.family < CHIP_RENOIR) ? - ((sscreen->info.family < CHIP_TONGA) ? 1152 : 4096) : - 4352; - default: - return (sscreen->info.family < CHIP_TONGA) ? 1152 : 4096; - } - case PIPE_VIDEO_CAP_PREFERED_FORMAT: - if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) - return PIPE_FORMAT_P010; - else if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2) - return PIPE_FORMAT_P016; - else - return PIPE_FORMAT_NV12; - - case PIPE_VIDEO_CAP_PREFERS_INTERLACED: - case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: { - enum pipe_video_format format = u_reduce_video_profile(profile); - - if (format == PIPE_VIDEO_FORMAT_HEVC) - return false; //The firmware doesn't support interlaced HEVC. 
- else if (format == PIPE_VIDEO_FORMAT_JPEG) - return false; - else if (format == PIPE_VIDEO_FORMAT_VP9) - return false; - return true; - } - case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: - return true; - case PIPE_VIDEO_CAP_MAX_LEVEL: - switch (profile) { - case PIPE_VIDEO_PROFILE_MPEG1: - return 0; - case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE: - case PIPE_VIDEO_PROFILE_MPEG2_MAIN: - return 3; - case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE: - return 3; - case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE: - return 5; - case PIPE_VIDEO_PROFILE_VC1_SIMPLE: - return 1; - case PIPE_VIDEO_PROFILE_VC1_MAIN: - return 2; - case PIPE_VIDEO_PROFILE_VC1_ADVANCED: - return 4; - case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE: - case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN: - case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: - return (sscreen->info.family < CHIP_TONGA) ? 41 : 52; - case PIPE_VIDEO_PROFILE_HEVC_MAIN: - case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: - return 186; - default: - return 0; - } - default: - return 0; - } + struct si_screen *sscreen = (struct si_screen *)screen; + enum pipe_video_format codec = u_reduce_video_profile(profile); + + if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) { + switch (param) { + case PIPE_VIDEO_CAP_SUPPORTED: + return ( + (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC && + (sscreen->info.family >= CHIP_RAVEN || si_vce_is_fw_version_supported(sscreen))) || + (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN && + (sscreen->info.family >= CHIP_RAVEN || si_radeon_uvd_enc_supported(sscreen))) || + (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10 && sscreen->info.family >= CHIP_RENOIR)); + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: + return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096; + case PIPE_VIDEO_CAP_MAX_HEIGHT: + return (sscreen->info.family < CHIP_TONGA) ? 1152 : 2304; + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + return PIPE_FORMAT_NV12; + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_STACKED_FRAMES: + return (sscreen->info.family < CHIP_TONGA) ? 
1 : 2; + default: + return 0; + } + } + + switch (param) { + case PIPE_VIDEO_CAP_SUPPORTED: + switch (codec) { + case PIPE_VIDEO_FORMAT_MPEG12: + return profile != PIPE_VIDEO_PROFILE_MPEG1; + case PIPE_VIDEO_FORMAT_MPEG4: + return 1; + case PIPE_VIDEO_FORMAT_MPEG4_AVC: + if ((sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11) && + sscreen->info.uvd_fw_version < UVD_FW_1_66_16) { + RVID_ERR("POLARIS10/11 firmware version need to be updated.\n"); + return false; + } + return true; + case PIPE_VIDEO_FORMAT_VC1: + return true; + case PIPE_VIDEO_FORMAT_HEVC: + /* Carrizo only supports HEVC Main */ + if (sscreen->info.family >= CHIP_STONEY) + return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN || + profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10); + else if (sscreen->info.family >= CHIP_CARRIZO) + return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; + return false; + case PIPE_VIDEO_FORMAT_JPEG: + if (sscreen->info.family >= CHIP_RAVEN) + return true; + if (sscreen->info.family < CHIP_CARRIZO || sscreen->info.family >= CHIP_VEGA10) + return false; + if (!(sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 19)) { + RVID_ERR("No MJPEG support for the kernel version\n"); + return false; + } + return true; + case PIPE_VIDEO_FORMAT_VP9: + if (sscreen->info.family < CHIP_RAVEN) + return false; + return true; + default: + return false; + } + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: + switch (codec) { + case PIPE_VIDEO_FORMAT_HEVC: + case PIPE_VIDEO_FORMAT_VP9: + return (sscreen->info.family < CHIP_RENOIR) + ? ((sscreen->info.family < CHIP_TONGA) ? 2048 : 4096) + : 8192; + default: + return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096; + } + case PIPE_VIDEO_CAP_MAX_HEIGHT: + switch (codec) { + case PIPE_VIDEO_FORMAT_HEVC: + case PIPE_VIDEO_FORMAT_VP9: + return (sscreen->info.family < CHIP_RENOIR) + ? ((sscreen->info.family < CHIP_TONGA) ? 1152 : 4096) + : 4352; + default: + return (sscreen->info.family < CHIP_TONGA) ? 1152 : 4096; + } + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + return PIPE_FORMAT_P010; + else if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2) + return PIPE_FORMAT_P016; + else + return PIPE_FORMAT_NV12; + + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: { + enum pipe_video_format format = u_reduce_video_profile(profile); + + if (format == PIPE_VIDEO_FORMAT_HEVC) + return false; // The firmware doesn't support interlaced HEVC. + else if (format == PIPE_VIDEO_FORMAT_JPEG) + return false; + else if (format == PIPE_VIDEO_FORMAT_VP9) + return false; + return true; + } + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_MAX_LEVEL: + switch (profile) { + case PIPE_VIDEO_PROFILE_MPEG1: + return 0; + case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE: + case PIPE_VIDEO_PROFILE_MPEG2_MAIN: + return 3; + case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE: + return 3; + case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE: + return 5; + case PIPE_VIDEO_PROFILE_VC1_SIMPLE: + return 1; + case PIPE_VIDEO_PROFILE_VC1_MAIN: + return 2; + case PIPE_VIDEO_PROFILE_VC1_ADVANCED: + return 4; + case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE: + case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN: + case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: + return (sscreen->info.family < CHIP_TONGA) ? 
41 : 52; + case PIPE_VIDEO_PROFILE_HEVC_MAIN: + case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: + return 186; + default: + return 0; + } + default: + return 0; + } } -static bool si_vid_is_format_supported(struct pipe_screen *screen, - enum pipe_format format, - enum pipe_video_profile profile, - enum pipe_video_entrypoint entrypoint) +static bool si_vid_is_format_supported(struct pipe_screen *screen, enum pipe_format format, + enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint) { - /* HEVC 10 bit decoding should use P010 instead of NV12 if possible */ - if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) - return (format == PIPE_FORMAT_NV12) || - (format == PIPE_FORMAT_P010) || - (format == PIPE_FORMAT_P016); - - /* Vp9 profile 2 supports 10 bit decoding using P016 */ - if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2) - return format == PIPE_FORMAT_P016; + /* HEVC 10 bit decoding should use P010 instead of NV12 if possible */ + if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + return (format == PIPE_FORMAT_NV12) || (format == PIPE_FORMAT_P010) || + (format == PIPE_FORMAT_P016); + /* Vp9 profile 2 supports 10 bit decoding using P016 */ + if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2) + return format == PIPE_FORMAT_P016; - /* we can only handle this one with UVD */ - if (profile != PIPE_VIDEO_PROFILE_UNKNOWN) - return format == PIPE_FORMAT_NV12; + /* we can only handle this one with UVD */ + if (profile != PIPE_VIDEO_PROFILE_UNKNOWN) + return format == PIPE_FORMAT_NV12; - return vl_video_buffer_is_format_supported(screen, format, profile, entrypoint); + return vl_video_buffer_is_format_supported(screen, format, profile, entrypoint); } -static unsigned get_max_threads_per_block(struct si_screen *screen, - enum pipe_shader_ir ir_type) +static unsigned get_max_threads_per_block(struct si_screen *screen, enum pipe_shader_ir ir_type) { - if (ir_type == PIPE_SHADER_IR_NATIVE) - return 256; + if (ir_type == PIPE_SHADER_IR_NATIVE) + return 256; - /* LLVM 10 only supports 1024 threads per block. */ - return 1024; + /* LLVM 10 only supports 1024 threads per block. 
*/ + return 1024; } -static int si_get_compute_param(struct pipe_screen *screen, - enum pipe_shader_ir ir_type, - enum pipe_compute_cap param, - void *ret) +static int si_get_compute_param(struct pipe_screen *screen, enum pipe_shader_ir ir_type, + enum pipe_compute_cap param, void *ret) { - struct si_screen *sscreen = (struct si_screen *)screen; - - //TODO: select these params by asic - switch (param) { - case PIPE_COMPUTE_CAP_IR_TARGET: { - const char *gpu, *triple; - - triple = "amdgcn-mesa-mesa3d"; - gpu = ac_get_llvm_processor_name(sscreen->info.family); - if (ret) { - sprintf(ret, "%s-%s", gpu, triple); - } - /* +2 for dash and terminating NIL byte */ - return (strlen(triple) + strlen(gpu) + 2) * sizeof(char); - } - case PIPE_COMPUTE_CAP_GRID_DIMENSION: - if (ret) { - uint64_t *grid_dimension = ret; - grid_dimension[0] = 3; - } - return 1 * sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: - if (ret) { - uint64_t *grid_size = ret; - grid_size[0] = 65535; - grid_size[1] = 65535; - grid_size[2] = 65535; - } - return 3 * sizeof(uint64_t) ; - - case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: - if (ret) { - uint64_t *block_size = ret; - unsigned threads_per_block = get_max_threads_per_block(sscreen, ir_type); - block_size[0] = threads_per_block; - block_size[1] = threads_per_block; - block_size[2] = threads_per_block; - } - return 3 * sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: - if (ret) { - uint64_t *max_threads_per_block = ret; - *max_threads_per_block = get_max_threads_per_block(sscreen, ir_type); - } - return sizeof(uint64_t); - case PIPE_COMPUTE_CAP_ADDRESS_BITS: - if (ret) { - uint32_t *address_bits = ret; - address_bits[0] = 64; - } - return 1 * sizeof(uint32_t); - - case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: - if (ret) { - uint64_t *max_global_size = ret; - uint64_t max_mem_alloc_size; - - si_get_compute_param(screen, ir_type, - PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, - &max_mem_alloc_size); - - /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least - * 1/4 of the MAX_GLOBAL_SIZE. Since the - * MAX_MEM_ALLOC_SIZE is fixed for older kernels, - * make sure we never report more than - * 4 * MAX_MEM_ALLOC_SIZE. - */ - *max_global_size = MIN2(4 * max_mem_alloc_size, - MAX2(sscreen->info.gart_size, - sscreen->info.vram_size)); - } - return sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: - if (ret) { - uint64_t *max_local_size = ret; - /* Value reported by the closed source driver. */ - *max_local_size = 32768; - } - return sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: - if (ret) { - uint64_t *max_input_size = ret; - /* Value reported by the closed source driver. 
*/ - *max_input_size = 1024; - } - return sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: - if (ret) { - uint64_t *max_mem_alloc_size = ret; - - *max_mem_alloc_size = sscreen->info.max_alloc_size; - } - return sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: - if (ret) { - uint32_t *max_clock_frequency = ret; - *max_clock_frequency = sscreen->info.max_shader_clock; - } - return sizeof(uint32_t); - - case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: - if (ret) { - uint32_t *max_compute_units = ret; - *max_compute_units = sscreen->info.num_good_compute_units; - } - return sizeof(uint32_t); - - case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: - if (ret) { - uint32_t *images_supported = ret; - *images_supported = 0; - } - return sizeof(uint32_t); - case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: - break; /* unused */ - case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: - if (ret) { - uint32_t *subgroup_size = ret; - *subgroup_size = sscreen->compute_wave_size; - } - return sizeof(uint32_t); - case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: - if (ret) { - uint64_t *max_variable_threads_per_block = ret; - if (ir_type == PIPE_SHADER_IR_NATIVE) - *max_variable_threads_per_block = 0; - else - *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK; - } - return sizeof(uint64_t); - } - - fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param); - return 0; + struct si_screen *sscreen = (struct si_screen *)screen; + + // TODO: select these params by asic + switch (param) { + case PIPE_COMPUTE_CAP_IR_TARGET: { + const char *gpu, *triple; + + triple = "amdgcn-mesa-mesa3d"; + gpu = ac_get_llvm_processor_name(sscreen->info.family); + if (ret) { + sprintf(ret, "%s-%s", gpu, triple); + } + /* +2 for dash and terminating NIL byte */ + return (strlen(triple) + strlen(gpu) + 2) * sizeof(char); + } + case PIPE_COMPUTE_CAP_GRID_DIMENSION: + if (ret) { + uint64_t *grid_dimension = ret; + grid_dimension[0] = 3; + } + return 1 * sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: + if (ret) { + uint64_t *grid_size = ret; + grid_size[0] = 65535; + grid_size[1] = 65535; + grid_size[2] = 65535; + } + return 3 * sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: + if (ret) { + uint64_t *block_size = ret; + unsigned threads_per_block = get_max_threads_per_block(sscreen, ir_type); + block_size[0] = threads_per_block; + block_size[1] = threads_per_block; + block_size[2] = threads_per_block; + } + return 3 * sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: + if (ret) { + uint64_t *max_threads_per_block = ret; + *max_threads_per_block = get_max_threads_per_block(sscreen, ir_type); + } + return sizeof(uint64_t); + case PIPE_COMPUTE_CAP_ADDRESS_BITS: + if (ret) { + uint32_t *address_bits = ret; + address_bits[0] = 64; + } + return 1 * sizeof(uint32_t); + + case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: + if (ret) { + uint64_t *max_global_size = ret; + uint64_t max_mem_alloc_size; + + si_get_compute_param(screen, ir_type, PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, + &max_mem_alloc_size); + + /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least + * 1/4 of the MAX_GLOBAL_SIZE. Since the + * MAX_MEM_ALLOC_SIZE is fixed for older kernels, + * make sure we never report more than + * 4 * MAX_MEM_ALLOC_SIZE. + */ + *max_global_size = + MIN2(4 * max_mem_alloc_size, MAX2(sscreen->info.gart_size, sscreen->info.vram_size)); + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: + if (ret) { + uint64_t *max_local_size = ret; + /* Value reported by the closed source driver. 
*/ + *max_local_size = 32768; + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: + if (ret) { + uint64_t *max_input_size = ret; + /* Value reported by the closed source driver. */ + *max_input_size = 1024; + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: + if (ret) { + uint64_t *max_mem_alloc_size = ret; + + *max_mem_alloc_size = sscreen->info.max_alloc_size; + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: + if (ret) { + uint32_t *max_clock_frequency = ret; + *max_clock_frequency = sscreen->info.max_shader_clock; + } + return sizeof(uint32_t); + + case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: + if (ret) { + uint32_t *max_compute_units = ret; + *max_compute_units = sscreen->info.num_good_compute_units; + } + return sizeof(uint32_t); + + case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: + if (ret) { + uint32_t *images_supported = ret; + *images_supported = 0; + } + return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: + break; /* unused */ + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + if (ret) { + uint32_t *subgroup_size = ret; + *subgroup_size = sscreen->compute_wave_size; + } + return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: + if (ret) { + uint64_t *max_variable_threads_per_block = ret; + if (ir_type == PIPE_SHADER_IR_NATIVE) + *max_variable_threads_per_block = 0; + else + *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK; + } + return sizeof(uint64_t); + } + + fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param); + return 0; } static uint64_t si_get_timestamp(struct pipe_screen *screen) { - struct si_screen *sscreen = (struct si_screen*)screen; + struct si_screen *sscreen = (struct si_screen *)screen; - return 1000000 * sscreen->ws->query_value(sscreen->ws, RADEON_TIMESTAMP) / - sscreen->info.clock_crystal_freq; + return 1000000 * sscreen->ws->query_value(sscreen->ws, RADEON_TIMESTAMP) / + sscreen->info.clock_crystal_freq; } -static void si_query_memory_info(struct pipe_screen *screen, - struct pipe_memory_info *info) +static void si_query_memory_info(struct pipe_screen *screen, struct pipe_memory_info *info) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct radeon_winsys *ws = sscreen->ws; - unsigned vram_usage, gtt_usage; - - info->total_device_memory = sscreen->info.vram_size / 1024; - info->total_staging_memory = sscreen->info.gart_size / 1024; - - /* The real TTM memory usage is somewhat random, because: - * - * 1) TTM delays freeing memory, because it can only free it after - * fences expire. - * - * 2) The memory usage can be really low if big VRAM evictions are - * taking place, but the real usage is well above the size of VRAM. - * - * Instead, return statistics of this process. - */ - vram_usage = ws->query_value(ws, RADEON_VRAM_USAGE) / 1024; - gtt_usage = ws->query_value(ws, RADEON_GTT_USAGE) / 1024; - - info->avail_device_memory = - vram_usage <= info->total_device_memory ? - info->total_device_memory - vram_usage : 0; - info->avail_staging_memory = - gtt_usage <= info->total_staging_memory ? - info->total_staging_memory - gtt_usage : 0; - - info->device_memory_evicted = - ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024; - - if (sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 4) - info->nr_device_memory_evictions = - ws->query_value(ws, RADEON_NUM_EVICTIONS); - else - /* Just return the number of evicted 64KB pages. 
*/ - info->nr_device_memory_evictions = info->device_memory_evicted / 64; + struct si_screen *sscreen = (struct si_screen *)screen; + struct radeon_winsys *ws = sscreen->ws; + unsigned vram_usage, gtt_usage; + + info->total_device_memory = sscreen->info.vram_size / 1024; + info->total_staging_memory = sscreen->info.gart_size / 1024; + + /* The real TTM memory usage is somewhat random, because: + * + * 1) TTM delays freeing memory, because it can only free it after + * fences expire. + * + * 2) The memory usage can be really low if big VRAM evictions are + * taking place, but the real usage is well above the size of VRAM. + * + * Instead, return statistics of this process. + */ + vram_usage = ws->query_value(ws, RADEON_VRAM_USAGE) / 1024; + gtt_usage = ws->query_value(ws, RADEON_GTT_USAGE) / 1024; + + info->avail_device_memory = + vram_usage <= info->total_device_memory ? info->total_device_memory - vram_usage : 0; + info->avail_staging_memory = + gtt_usage <= info->total_staging_memory ? info->total_staging_memory - gtt_usage : 0; + + info->device_memory_evicted = ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024; + + if (sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 4) + info->nr_device_memory_evictions = ws->query_value(ws, RADEON_NUM_EVICTIONS); + else + /* Just return the number of evicted 64KB pages. */ + info->nr_device_memory_evictions = info->device_memory_evicted / 64; } static struct disk_cache *si_get_disk_shader_cache(struct pipe_screen *pscreen) { - struct si_screen *sscreen = (struct si_screen*)pscreen; + struct si_screen *sscreen = (struct si_screen *)pscreen; - return sscreen->disk_shader_cache; + return sscreen->disk_shader_cache; } static void si_init_renderer_string(struct si_screen *sscreen) { - char first_name[256], second_name[32] = {}, kernel_version[128] = {}; - struct utsname uname_data; - - if (sscreen->info.marketing_name) { - snprintf(first_name, sizeof(first_name), "%s", - sscreen->info.marketing_name); - snprintf(second_name, sizeof(second_name), "%s, ", - sscreen->info.name); - } else { - snprintf(first_name, sizeof(first_name), "AMD %s", - sscreen->info.name); - } - - if (uname(&uname_data) == 0) - snprintf(kernel_version, sizeof(kernel_version), - ", %s", uname_data.release); - - snprintf(sscreen->renderer_string, sizeof(sscreen->renderer_string), - "%s (%sDRM %i.%i.%i%s, LLVM " MESA_LLVM_VERSION_STRING ")", - first_name, second_name, sscreen->info.drm_major, - sscreen->info.drm_minor, sscreen->info.drm_patchlevel, - kernel_version); + char first_name[256], second_name[32] = {}, kernel_version[128] = {}; + struct utsname uname_data; + + if (sscreen->info.marketing_name) { + snprintf(first_name, sizeof(first_name), "%s", sscreen->info.marketing_name); + snprintf(second_name, sizeof(second_name), "%s, ", sscreen->info.name); + } else { + snprintf(first_name, sizeof(first_name), "AMD %s", sscreen->info.name); + } + + if (uname(&uname_data) == 0) + snprintf(kernel_version, sizeof(kernel_version), ", %s", uname_data.release); + + snprintf(sscreen->renderer_string, sizeof(sscreen->renderer_string), + "%s (%sDRM %i.%i.%i%s, LLVM " MESA_LLVM_VERSION_STRING ")", first_name, second_name, + sscreen->info.drm_major, sscreen->info.drm_minor, sscreen->info.drm_patchlevel, + kernel_version); } void si_init_screen_get_functions(struct si_screen *sscreen) { - sscreen->b.get_name = si_get_name; - sscreen->b.get_vendor = si_get_vendor; - sscreen->b.get_device_vendor = si_get_device_vendor; - sscreen->b.get_param = si_get_param; - sscreen->b.get_paramf = 
si_get_paramf; - sscreen->b.get_compute_param = si_get_compute_param; - sscreen->b.get_timestamp = si_get_timestamp; - sscreen->b.get_shader_param = si_get_shader_param; - sscreen->b.get_compiler_options = si_get_compiler_options; - sscreen->b.get_device_uuid = si_get_device_uuid; - sscreen->b.get_driver_uuid = si_get_driver_uuid; - sscreen->b.query_memory_info = si_query_memory_info; - sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache; - - if (sscreen->info.has_hw_decode) { - sscreen->b.get_video_param = si_get_video_param; - sscreen->b.is_video_format_supported = si_vid_is_format_supported; - } else { - sscreen->b.get_video_param = si_get_video_param_no_decode; - sscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported; - } - - si_init_renderer_string(sscreen); + sscreen->b.get_name = si_get_name; + sscreen->b.get_vendor = si_get_vendor; + sscreen->b.get_device_vendor = si_get_device_vendor; + sscreen->b.get_param = si_get_param; + sscreen->b.get_paramf = si_get_paramf; + sscreen->b.get_compute_param = si_get_compute_param; + sscreen->b.get_timestamp = si_get_timestamp; + sscreen->b.get_shader_param = si_get_shader_param; + sscreen->b.get_compiler_options = si_get_compiler_options; + sscreen->b.get_device_uuid = si_get_device_uuid; + sscreen->b.get_driver_uuid = si_get_driver_uuid; + sscreen->b.query_memory_info = si_query_memory_info; + sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache; + + if (sscreen->info.has_hw_decode) { + sscreen->b.get_video_param = si_get_video_param; + sscreen->b.is_video_format_supported = si_vid_is_format_supported; + } else { + sscreen->b.get_video_param = si_get_video_param_no_decode; + sscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported; + } + + si_init_renderer_string(sscreen); } diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 9311b6e6386..30ba6b02f87 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -23,516 +23,499 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_pipe.h" #include "si_build_pm4.h" +#include "si_pipe.h" #include "sid.h" - #include "util/os_time.h" #include "util/u_upload_mgr.h" /* initialize */ void si_need_gfx_cs_space(struct si_context *ctx) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - - /* There is no need to flush the DMA IB here, because - * si_need_dma_space always flushes the GFX IB if there is - * a conflict, which means any unflushed DMA commands automatically - * precede the GFX IB (= they had no dependency on the GFX IB when - * they were submitted). - */ - - /* There are two memory usage counters in the winsys for all buffers - * that have been added (cs_add_buffer) and two counters in the pipe - * driver for those that haven't been added yet. 
- */ - if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs, - ctx->vram, ctx->gtt))) { - ctx->gtt = 0; - ctx->vram = 0; - si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - return; - } - ctx->gtt = 0; - ctx->vram = 0; - - unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx); - if (!ctx->ws->cs_check_space(cs, need_dwords, false)) - si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + struct radeon_cmdbuf *cs = ctx->gfx_cs; + + /* There is no need to flush the DMA IB here, because + * si_need_dma_space always flushes the GFX IB if there is + * a conflict, which means any unflushed DMA commands automatically + * precede the GFX IB (= they had no dependency on the GFX IB when + * they were submitted). + */ + + /* There are two memory usage counters in the winsys for all buffers + * that have been added (cs_add_buffer) and two counters in the pipe + * driver for those that haven't been added yet. + */ + if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs, ctx->vram, ctx->gtt))) { + ctx->gtt = 0; + ctx->vram = 0; + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + return; + } + ctx->gtt = 0; + ctx->vram = 0; + + unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx); + if (!ctx->ws->cs_check_space(cs, need_dwords, false)) + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); } void si_unref_sdma_uploads(struct si_context *sctx) { - for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) { - si_resource_reference(&sctx->sdma_uploads[i].dst, NULL); - si_resource_reference(&sctx->sdma_uploads[i].src, NULL); - } - sctx->num_sdma_uploads = 0; + for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) { + si_resource_reference(&sctx->sdma_uploads[i].dst, NULL); + si_resource_reference(&sctx->sdma_uploads[i].src, NULL); + } + sctx->num_sdma_uploads = 0; } -void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, - struct pipe_fence_handle **fence) +void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - struct radeon_winsys *ws = ctx->ws; - const unsigned wait_ps_cs = SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; - unsigned wait_flags = 0; - - if (ctx->gfx_flush_in_progress) - return; - - if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) { - wait_flags |= wait_ps_cs | - SI_CONTEXT_INV_L2; - } else if (ctx->chip_class == GFX6) { - /* The kernel flushes L2 before shaders are finished. */ - wait_flags |= wait_ps_cs; - } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { - wait_flags |= wait_ps_cs; - } - - /* Drop this flush if it's a no-op. */ - if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) && - (!wait_flags || !ctx->gfx_last_ib_is_busy)) - return; - - if (ctx->b.get_device_reset_status(&ctx->b) != PIPE_NO_RESET) - return; - - if (ctx->screen->debug_flags & DBG(CHECK_VM)) - flags &= ~PIPE_FLUSH_ASYNC; - - ctx->gfx_flush_in_progress = true; - - /* If the state tracker is flushing the GFX IB, si_flush_from_st is - * responsible for flushing the DMA IB and merging the fences from both. - * If the driver flushes the GFX IB internally, and it should never ask - * for a fence handle. - */ - assert(!radeon_emitted(ctx->sdma_cs, 0) || fence == NULL); - - /* Update the sdma_uploads list by flushing the uploader. */ - u_upload_unmap(ctx->b.const_uploader); - - /* Execute SDMA uploads. 
*/ - ctx->sdma_uploads_in_progress = true; - for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) { - struct si_sdma_upload *up = &ctx->sdma_uploads[i]; - - assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 && - up->size % 4 == 0); - - si_sdma_copy_buffer(ctx, &up->dst->b.b, &up->src->b.b, - up->dst_offset, up->src_offset, up->size); - } - ctx->sdma_uploads_in_progress = false; - si_unref_sdma_uploads(ctx); - - /* Flush SDMA (preamble IB). */ - if (radeon_emitted(ctx->sdma_cs, 0)) - si_flush_dma_cs(ctx, flags, NULL); - - if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) { - struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs; - si_compute_signal_gfx(ctx); - - /* Make sure compute shaders are idle before leaving the IB, so that - * the next IB doesn't overwrite GDS that might be in use. */ - radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | - EVENT_INDEX(4)); - - /* Save the GDS prim restart counter if needed. */ - if (ctx->preserve_prim_restart_gds_at_flush) { - si_cp_copy_data(ctx, compute_cs, - COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4, - COPY_DATA_GDS, NULL, 4); - } - } - - if (ctx->has_graphics) { - if (!list_is_empty(&ctx->active_queries)) - si_suspend_queries(ctx); - - ctx->streamout.suspended = false; - if (ctx->streamout.begin_emitted) { - si_emit_streamout_end(ctx); - ctx->streamout.suspended = true; - - /* Since NGG streamout uses GDS, we need to make GDS - * idle when we leave the IB, otherwise another process - * might overwrite it while our shaders are busy. - */ - if (ctx->screen->use_ngg_streamout) - wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; - } - } - - /* Make sure CP DMA is idle at the end of IBs after L2 prefetches - * because the kernel doesn't wait for it. */ - if (ctx->chip_class >= GFX7) - si_cp_dma_wait_for_idle(ctx); - - /* Wait for draw calls to finish if needed. */ - if (wait_flags) { - ctx->flags |= wait_flags; - ctx->emit_cache_flush(ctx); - } - ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs; - - if (ctx->current_saved_cs) { - si_trace_emit(ctx); - - /* Save the IB for debug contexts. */ - si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true); - ctx->current_saved_cs->flushed = true; - ctx->current_saved_cs->time_flush = os_time_get_nano(); - - si_log_hw_flush(ctx); - } - - if (si_compute_prim_discard_enabled(ctx)) { - /* The compute IB can start after the previous gfx IB starts. */ - if (radeon_emitted(ctx->prim_discard_compute_cs, 0) && - ctx->last_gfx_fence) { - ctx->ws->cs_add_fence_dependency(ctx->gfx_cs, - ctx->last_gfx_fence, - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | - RADEON_DEPENDENCY_START_FENCE); - } - - /* Remember the last execution barrier. It's in the IB. - * It will signal the start of the next compute IB. - */ - if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && - ctx->last_pkt3_write_data) { - *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0); - ctx->last_pkt3_write_data = NULL; - - si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf); - ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset; - si_resource_reference(&ctx->barrier_buf, NULL); - - ws->fence_reference(&ctx->last_ib_barrier_fence, NULL); - } - } - - /* Flush the CS. */ - ws->cs_flush(cs, flags, &ctx->last_gfx_fence); - if (fence) - ws->fence_reference(fence, ctx->last_gfx_fence); - - ctx->num_gfx_cs_flushes++; - - if (si_compute_prim_discard_enabled(ctx)) { - /* Remember the last execution barrier, which is the last fence - * in this case. 
- */ - if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { - ctx->last_pkt3_write_data = NULL; - si_resource_reference(&ctx->last_ib_barrier_buf, NULL); - ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence); - } - } - - /* Check VM faults if needed. */ - if (ctx->screen->debug_flags & DBG(CHECK_VM)) { - /* Use conservative timeout 800ms, after which we won't wait any - * longer and assume the GPU is hung. - */ - ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800*1000*1000); - - si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX); - } - - if (ctx->current_saved_cs) - si_saved_cs_reference(&ctx->current_saved_cs, NULL); - - si_begin_new_gfx_cs(ctx); - ctx->gfx_flush_in_progress = false; + struct radeon_cmdbuf *cs = ctx->gfx_cs; + struct radeon_winsys *ws = ctx->ws; + const unsigned wait_ps_cs = SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; + unsigned wait_flags = 0; + + if (ctx->gfx_flush_in_progress) + return; + + if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) { + wait_flags |= wait_ps_cs | SI_CONTEXT_INV_L2; + } else if (ctx->chip_class == GFX6) { + /* The kernel flushes L2 before shaders are finished. */ + wait_flags |= wait_ps_cs; + } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { + wait_flags |= wait_ps_cs; + } + + /* Drop this flush if it's a no-op. */ + if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) && (!wait_flags || !ctx->gfx_last_ib_is_busy)) + return; + + if (ctx->b.get_device_reset_status(&ctx->b) != PIPE_NO_RESET) + return; + + if (ctx->screen->debug_flags & DBG(CHECK_VM)) + flags &= ~PIPE_FLUSH_ASYNC; + + ctx->gfx_flush_in_progress = true; + + /* If the state tracker is flushing the GFX IB, si_flush_from_st is + * responsible for flushing the DMA IB and merging the fences from both. + * If the driver flushes the GFX IB internally, and it should never ask + * for a fence handle. + */ + assert(!radeon_emitted(ctx->sdma_cs, 0) || fence == NULL); + + /* Update the sdma_uploads list by flushing the uploader. */ + u_upload_unmap(ctx->b.const_uploader); + + /* Execute SDMA uploads. */ + ctx->sdma_uploads_in_progress = true; + for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) { + struct si_sdma_upload *up = &ctx->sdma_uploads[i]; + + assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 && up->size % 4 == 0); + + si_sdma_copy_buffer(ctx, &up->dst->b.b, &up->src->b.b, up->dst_offset, up->src_offset, + up->size); + } + ctx->sdma_uploads_in_progress = false; + si_unref_sdma_uploads(ctx); + + /* Flush SDMA (preamble IB). */ + if (radeon_emitted(ctx->sdma_cs, 0)) + si_flush_dma_cs(ctx, flags, NULL); + + if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) { + struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs; + si_compute_signal_gfx(ctx); + + /* Make sure compute shaders are idle before leaving the IB, so that + * the next IB doesn't overwrite GDS that might be in use. */ + radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + + /* Save the GDS prim restart counter if needed. 
*/ + if (ctx->preserve_prim_restart_gds_at_flush) { + si_cp_copy_data(ctx, compute_cs, COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4, + COPY_DATA_GDS, NULL, 4); + } + } + + if (ctx->has_graphics) { + if (!list_is_empty(&ctx->active_queries)) + si_suspend_queries(ctx); + + ctx->streamout.suspended = false; + if (ctx->streamout.begin_emitted) { + si_emit_streamout_end(ctx); + ctx->streamout.suspended = true; + + /* Since NGG streamout uses GDS, we need to make GDS + * idle when we leave the IB, otherwise another process + * might overwrite it while our shaders are busy. + */ + if (ctx->screen->use_ngg_streamout) + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; + } + } + + /* Make sure CP DMA is idle at the end of IBs after L2 prefetches + * because the kernel doesn't wait for it. */ + if (ctx->chip_class >= GFX7) + si_cp_dma_wait_for_idle(ctx); + + /* Wait for draw calls to finish if needed. */ + if (wait_flags) { + ctx->flags |= wait_flags; + ctx->emit_cache_flush(ctx); + } + ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs; + + if (ctx->current_saved_cs) { + si_trace_emit(ctx); + + /* Save the IB for debug contexts. */ + si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true); + ctx->current_saved_cs->flushed = true; + ctx->current_saved_cs->time_flush = os_time_get_nano(); + + si_log_hw_flush(ctx); + } + + if (si_compute_prim_discard_enabled(ctx)) { + /* The compute IB can start after the previous gfx IB starts. */ + if (radeon_emitted(ctx->prim_discard_compute_cs, 0) && ctx->last_gfx_fence) { + ctx->ws->cs_add_fence_dependency( + ctx->gfx_cs, ctx->last_gfx_fence, + RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | RADEON_DEPENDENCY_START_FENCE); + } + + /* Remember the last execution barrier. It's in the IB. + * It will signal the start of the next compute IB. + */ + if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && ctx->last_pkt3_write_data) { + *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0); + ctx->last_pkt3_write_data = NULL; + + si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf); + ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset; + si_resource_reference(&ctx->barrier_buf, NULL); + + ws->fence_reference(&ctx->last_ib_barrier_fence, NULL); + } + } + + /* Flush the CS. */ + ws->cs_flush(cs, flags, &ctx->last_gfx_fence); + if (fence) + ws->fence_reference(fence, ctx->last_gfx_fence); + + ctx->num_gfx_cs_flushes++; + + if (si_compute_prim_discard_enabled(ctx)) { + /* Remember the last execution barrier, which is the last fence + * in this case. + */ + if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { + ctx->last_pkt3_write_data = NULL; + si_resource_reference(&ctx->last_ib_barrier_buf, NULL); + ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence); + } + } + + /* Check VM faults if needed. */ + if (ctx->screen->debug_flags & DBG(CHECK_VM)) { + /* Use conservative timeout 800ms, after which we won't wait any + * longer and assume the GPU is hung. 
+ */ + ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800 * 1000 * 1000); + + si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX); + } + + if (ctx->current_saved_cs) + si_saved_cs_reference(&ctx->current_saved_cs, NULL); + + si_begin_new_gfx_cs(ctx); + ctx->gfx_flush_in_progress = false; } static void si_begin_gfx_cs_debug(struct si_context *ctx) { - static const uint32_t zeros[1]; - assert(!ctx->current_saved_cs); + static const uint32_t zeros[1]; + assert(!ctx->current_saved_cs); - ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs)); - if (!ctx->current_saved_cs) - return; + ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs)); + if (!ctx->current_saved_cs) + return; - pipe_reference_init(&ctx->current_saved_cs->reference, 1); + pipe_reference_init(&ctx->current_saved_cs->reference, 1); - ctx->current_saved_cs->trace_buf = si_resource( - pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8)); - if (!ctx->current_saved_cs->trace_buf) { - free(ctx->current_saved_cs); - ctx->current_saved_cs = NULL; - return; - } + ctx->current_saved_cs->trace_buf = + si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8)); + if (!ctx->current_saved_cs->trace_buf) { + free(ctx->current_saved_cs); + ctx->current_saved_cs = NULL; + return; + } - pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b, - 0, sizeof(zeros), zeros); - ctx->current_saved_cs->trace_id = 0; + pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b, 0, sizeof(zeros), + zeros); + ctx->current_saved_cs->trace_id = 0; - si_trace_emit(ctx); + si_trace_emit(ctx); - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf, - RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf, + RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); } static void si_add_gds_to_buffer_list(struct si_context *sctx) { - if (sctx->gds) { - sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, - RADEON_USAGE_READWRITE, 0, 0); - if (sctx->gds_oa) { - sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, - RADEON_USAGE_READWRITE, 0, 0); - } - } + if (sctx->gds) { + sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0); + if (sctx->gds_oa) { + sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0); + } + } } void si_allocate_gds(struct si_context *sctx) { - struct radeon_winsys *ws = sctx->ws; + struct radeon_winsys *ws = sctx->ws; - if (sctx->gds) - return; + if (sctx->gds) + return; - assert(sctx->screen->use_ngg_streamout); + assert(sctx->screen->use_ngg_streamout); - /* 4 streamout GDS counters. - * We need 256B (64 dw) of GDS, otherwise streamout hangs. - */ - sctx->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, 0); - sctx->gds_oa = ws->buffer_create(ws, 4, 1, RADEON_DOMAIN_OA, 0); + /* 4 streamout GDS counters. + * We need 256B (64 dw) of GDS, otherwise streamout hangs. + */ + sctx->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, 0); + sctx->gds_oa = ws->buffer_create(ws, 4, 1, RADEON_DOMAIN_OA, 0); - assert(sctx->gds && sctx->gds_oa); - si_add_gds_to_buffer_list(sctx); + assert(sctx->gds && sctx->gds_oa); + si_add_gds_to_buffer_list(sctx); } void si_begin_new_gfx_cs(struct si_context *ctx) { - if (ctx->is_debug) - si_begin_gfx_cs_debug(ctx); - - si_add_gds_to_buffer_list(ctx); - - /* Always invalidate caches at the beginning of IBs, because external - * users (e.g. 
BO evictions and SDMA/UVD/VCE IBs) can modify our - * buffers. - * - * Note that the cache flush done by the kernel at the end of GFX IBs - * isn't useful here, because that flush can finish after the following - * IB starts drawing. - * - * TODO: Do we also need to invalidate CB & DB caches? - */ - ctx->flags |= SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_L2 | - SI_CONTEXT_START_PIPELINE_STATS; - - ctx->cs_shader_state.initialized = false; - si_all_descriptors_begin_new_cs(ctx); - - if (!ctx->has_graphics) { - ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw; - return; - } - - /* set all valid group as dirty so they get reemited on - * next draw command - */ - si_pm4_reset_emitted(ctx); - - /* The CS initialization should be emitted before everything else. */ - si_pm4_emit(ctx, ctx->init_config); - if (ctx->init_config_gs_rings) - si_pm4_emit(ctx, ctx->init_config_gs_rings); - - if (ctx->queued.named.ls) - ctx->prefetch_L2_mask |= SI_PREFETCH_LS; - if (ctx->queued.named.hs) - ctx->prefetch_L2_mask |= SI_PREFETCH_HS; - if (ctx->queued.named.es) - ctx->prefetch_L2_mask |= SI_PREFETCH_ES; - if (ctx->queued.named.gs) - ctx->prefetch_L2_mask |= SI_PREFETCH_GS; - if (ctx->queued.named.vs) - ctx->prefetch_L2_mask |= SI_PREFETCH_VS; - if (ctx->queued.named.ps) - ctx->prefetch_L2_mask |= SI_PREFETCH_PS; - if (ctx->vb_descriptors_buffer && ctx->vertex_elements) - ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; - - /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */ - bool has_clear_state = ctx->screen->info.has_clear_state; - if (has_clear_state) { - ctx->framebuffer.dirty_cbufs = - u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs); - /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */ - ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL; - } else { - ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8); - ctx->framebuffer.dirty_zsbuf = true; - } - /* This should always be marked as dirty to set the framebuffer scissor - * at least. */ - si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer); - - si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs); - /* CLEAR_STATE sets zeros. */ - if (!has_clear_state || ctx->clip_state.any_nonzeros) - si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state); - ctx->sample_locs_num_samples = 0; - si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs); - si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config); - /* CLEAR_STATE sets 0xffff. */ - if (!has_clear_state || ctx->sample_mask != 0xffff) - si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask); - si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state); - /* CLEAR_STATE sets zeros. */ - if (!has_clear_state || ctx->blend_color.any_nonzeros) - si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color); - si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state); - if (ctx->chip_class >= GFX9) - si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state); - si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref); - si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map); - if (!ctx->screen->use_ngg_streamout) - si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable); - si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond); - /* CLEAR_STATE disables all window rectangles. 
*/ - if (!has_clear_state || ctx->num_window_rectangles > 0) - si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles); - - si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); - si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); - si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); - - si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state); - if (ctx->scratch_buffer) { - si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b); - } - - if (ctx->streamout.suspended) { - ctx->streamout.append_bitmask = ctx->streamout.enabled_mask; - si_streamout_buffers_dirty(ctx); - } - - if (!list_is_empty(&ctx->active_queries)) - si_resume_queries(ctx); - - assert(!ctx->gfx_cs->prev_dw); - ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw; - - /* Invalidate various draw states so that they are emitted before - * the first draw call. */ - si_invalidate_draw_sh_constants(ctx); - ctx->last_index_size = -1; - ctx->last_primitive_restart_en = -1; - ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN; - ctx->last_prim = -1; - ctx->last_multi_vgt_param = -1; - ctx->last_vs_state = ~0; - ctx->last_ls = NULL; - ctx->last_tcs = NULL; - ctx->last_tes_sh_base = -1; - ctx->last_num_tcs_input_cp = -1; - ctx->last_ls_hs_config = -1; /* impossible value */ - ctx->last_binning_enabled = -1; - ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL; - - ctx->prim_discard_compute_ib_initialized = false; - - /* Compute-based primitive discard: - * The index ring is divided into 2 halves. Switch between the halves - * in the same fashion as doublebuffering. - */ - if (ctx->index_ring_base) - ctx->index_ring_base = 0; - else - ctx->index_ring_base = ctx->index_ring_size_per_ib; - - ctx->index_ring_offset = 0; - - STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8); - - if (has_clear_state) { - ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff; - ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003; - ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000; - 
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x0000001e; /* From GFX8 */ - - /* Set all cleared context registers to saved. */ - ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */ - ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */ - } else { - /* Set all register values to unknown. */ - ctx->tracked_regs.reg_saved = 0; - ctx->last_gs_out_prim = -1; /* unknown */ - } - - /* 0xffffffff is a impossible value to register SPI_PS_INPUT_CNTL_n */ - memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32); + if (ctx->is_debug) + si_begin_gfx_cs_debug(ctx); + + si_add_gds_to_buffer_list(ctx); + + /* Always invalidate caches at the beginning of IBs, because external + * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our + * buffers. 
+ * + * Note that the cache flush done by the kernel at the end of GFX IBs + * isn't useful here, because that flush can finish after the following + * IB starts drawing. + * + * TODO: Do we also need to invalidate CB & DB caches? + */ + ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | SI_CONTEXT_START_PIPELINE_STATS; + + ctx->cs_shader_state.initialized = false; + si_all_descriptors_begin_new_cs(ctx); + + if (!ctx->has_graphics) { + ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw; + return; + } + + /* set all valid group as dirty so they get reemited on + * next draw command + */ + si_pm4_reset_emitted(ctx); + + /* The CS initialization should be emitted before everything else. */ + si_pm4_emit(ctx, ctx->init_config); + if (ctx->init_config_gs_rings) + si_pm4_emit(ctx, ctx->init_config_gs_rings); + + if (ctx->queued.named.ls) + ctx->prefetch_L2_mask |= SI_PREFETCH_LS; + if (ctx->queued.named.hs) + ctx->prefetch_L2_mask |= SI_PREFETCH_HS; + if (ctx->queued.named.es) + ctx->prefetch_L2_mask |= SI_PREFETCH_ES; + if (ctx->queued.named.gs) + ctx->prefetch_L2_mask |= SI_PREFETCH_GS; + if (ctx->queued.named.vs) + ctx->prefetch_L2_mask |= SI_PREFETCH_VS; + if (ctx->queued.named.ps) + ctx->prefetch_L2_mask |= SI_PREFETCH_PS; + if (ctx->vb_descriptors_buffer && ctx->vertex_elements) + ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; + + /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */ + bool has_clear_state = ctx->screen->info.has_clear_state; + if (has_clear_state) { + ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs); + /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */ + ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL; + } else { + ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8); + ctx->framebuffer.dirty_zsbuf = true; + } + /* This should always be marked as dirty to set the framebuffer scissor + * at least. */ + si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer); + + si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs); + /* CLEAR_STATE sets zeros. */ + if (!has_clear_state || ctx->clip_state.any_nonzeros) + si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state); + ctx->sample_locs_num_samples = 0; + si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs); + si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config); + /* CLEAR_STATE sets 0xffff. */ + if (!has_clear_state || ctx->sample_mask != 0xffff) + si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask); + si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state); + /* CLEAR_STATE sets zeros. */ + if (!has_clear_state || ctx->blend_color.any_nonzeros) + si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color); + si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state); + if (ctx->chip_class >= GFX9) + si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state); + si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref); + si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map); + if (!ctx->screen->use_ngg_streamout) + si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable); + si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond); + /* CLEAR_STATE disables all window rectangles. 
*/ + if (!has_clear_state || ctx->num_window_rectangles > 0) + si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles); + + si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); + si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); + si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); + + si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state); + if (ctx->scratch_buffer) { + si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b); + } + + if (ctx->streamout.suspended) { + ctx->streamout.append_bitmask = ctx->streamout.enabled_mask; + si_streamout_buffers_dirty(ctx); + } + + if (!list_is_empty(&ctx->active_queries)) + si_resume_queries(ctx); + + assert(!ctx->gfx_cs->prev_dw); + ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw; + + /* Invalidate various draw states so that they are emitted before + * the first draw call. */ + si_invalidate_draw_sh_constants(ctx); + ctx->last_index_size = -1; + ctx->last_primitive_restart_en = -1; + ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN; + ctx->last_prim = -1; + ctx->last_multi_vgt_param = -1; + ctx->last_vs_state = ~0; + ctx->last_ls = NULL; + ctx->last_tcs = NULL; + ctx->last_tes_sh_base = -1; + ctx->last_num_tcs_input_cp = -1; + ctx->last_ls_hs_config = -1; /* impossible value */ + ctx->last_binning_enabled = -1; + ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL; + + ctx->prim_discard_compute_ib_initialized = false; + + /* Compute-based primitive discard: + * The index ring is divided into 2 halves. Switch between the halves + * in the same fashion as doublebuffering. + */ + if (ctx->index_ring_base) + ctx->index_ring_base = 0; + else + ctx->index_ring_base = ctx->index_ring_size_per_ib; + + ctx->index_ring_offset = 0; + + STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8); + + if (has_clear_state) { + ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff; + ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000; + 
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = + 0x0000001e; /* From GFX8 */ + + /* Set all cleared context registers to saved. */ + ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */ + ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */ + } else { + /* Set all register values to unknown. 
*/ + ctx->tracked_regs.reg_saved = 0; + ctx->last_gs_out_prim = -1; /* unknown */ + } + + /* 0xffffffff is a impossible value to register SPI_PS_INPUT_CNTL_n */ + memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32); } diff --git a/src/gallium/drivers/radeonsi/si_gpu_load.c b/src/gallium/drivers/radeonsi/si_gpu_load.c index 33cd5642230..806f98ad520 100644 --- a/src/gallium/drivers/radeonsi/si_gpu_load.c +++ b/src/gallium/drivers/radeonsi/si_gpu_load.c @@ -40,242 +40,234 @@ * fps (there are too few samples per frame). */ #define SAMPLES_PER_SEC 10000 -#define GRBM_STATUS 0x8010 -#define TA_BUSY(x) (((x) >> 14) & 0x1) -#define GDS_BUSY(x) (((x) >> 15) & 0x1) -#define VGT_BUSY(x) (((x) >> 17) & 0x1) -#define IA_BUSY(x) (((x) >> 19) & 0x1) -#define SX_BUSY(x) (((x) >> 20) & 0x1) -#define WD_BUSY(x) (((x) >> 21) & 0x1) -#define SPI_BUSY(x) (((x) >> 22) & 0x1) -#define BCI_BUSY(x) (((x) >> 23) & 0x1) -#define SC_BUSY(x) (((x) >> 24) & 0x1) -#define PA_BUSY(x) (((x) >> 25) & 0x1) -#define DB_BUSY(x) (((x) >> 26) & 0x1) -#define CP_BUSY(x) (((x) >> 29) & 0x1) -#define CB_BUSY(x) (((x) >> 30) & 0x1) -#define GUI_ACTIVE(x) (((x) >> 31) & 0x1) - -#define SRBM_STATUS2 0x0e4c -#define SDMA_BUSY(x) (((x) >> 5) & 0x1) - -#define CP_STAT 0x8680 -#define PFP_BUSY(x) (((x) >> 15) & 0x1) -#define MEQ_BUSY(x) (((x) >> 16) & 0x1) -#define ME_BUSY(x) (((x) >> 17) & 0x1) -#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1) -#define DMA_BUSY(x) (((x) >> 22) & 0x1) -#define SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1) +#define GRBM_STATUS 0x8010 +#define TA_BUSY(x) (((x) >> 14) & 0x1) +#define GDS_BUSY(x) (((x) >> 15) & 0x1) +#define VGT_BUSY(x) (((x) >> 17) & 0x1) +#define IA_BUSY(x) (((x) >> 19) & 0x1) +#define SX_BUSY(x) (((x) >> 20) & 0x1) +#define WD_BUSY(x) (((x) >> 21) & 0x1) +#define SPI_BUSY(x) (((x) >> 22) & 0x1) +#define BCI_BUSY(x) (((x) >> 23) & 0x1) +#define SC_BUSY(x) (((x) >> 24) & 0x1) +#define PA_BUSY(x) (((x) >> 25) & 0x1) +#define DB_BUSY(x) (((x) >> 26) & 0x1) +#define CP_BUSY(x) (((x) >> 29) & 0x1) +#define CB_BUSY(x) (((x) >> 30) & 0x1) +#define GUI_ACTIVE(x) (((x) >> 31) & 0x1) + +#define SRBM_STATUS2 0x0e4c +#define SDMA_BUSY(x) (((x) >> 5) & 0x1) + +#define CP_STAT 0x8680 +#define PFP_BUSY(x) (((x) >> 15) & 0x1) +#define MEQ_BUSY(x) (((x) >> 16) & 0x1) +#define ME_BUSY(x) (((x) >> 17) & 0x1) +#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1) +#define DMA_BUSY(x) (((x) >> 22) & 0x1) +#define SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1) #define IDENTITY(x) x -#define UPDATE_COUNTER(field, mask) \ - do { \ - if (mask(value)) \ - p_atomic_inc(&counters->named.field.busy); \ - else \ - p_atomic_inc(&counters->named.field.idle); \ - } while (0) +#define UPDATE_COUNTER(field, mask) \ + do { \ + if (mask(value)) \ + p_atomic_inc(&counters->named.field.busy); \ + else \ + p_atomic_inc(&counters->named.field.idle); \ + } while (0) -static void si_update_mmio_counters(struct si_screen *sscreen, - union si_mmio_counters *counters) +static void si_update_mmio_counters(struct si_screen *sscreen, union si_mmio_counters *counters) { - uint32_t value = 0; - bool gui_busy, sdma_busy = false; - - /* GRBM_STATUS */ - sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value); - - UPDATE_COUNTER(ta, TA_BUSY); - UPDATE_COUNTER(gds, GDS_BUSY); - UPDATE_COUNTER(vgt, VGT_BUSY); - UPDATE_COUNTER(ia, IA_BUSY); - UPDATE_COUNTER(sx, SX_BUSY); - UPDATE_COUNTER(wd, WD_BUSY); - UPDATE_COUNTER(spi, SPI_BUSY); - UPDATE_COUNTER(bci, BCI_BUSY); - UPDATE_COUNTER(sc, SC_BUSY); - UPDATE_COUNTER(pa, PA_BUSY); - 
UPDATE_COUNTER(db, DB_BUSY); - UPDATE_COUNTER(cp, CP_BUSY); - UPDATE_COUNTER(cb, CB_BUSY); - UPDATE_COUNTER(gui, GUI_ACTIVE); - gui_busy = GUI_ACTIVE(value); - - if (sscreen->info.chip_class == GFX7 || sscreen->info.chip_class == GFX8) { - /* SRBM_STATUS2 */ - sscreen->ws->read_registers(sscreen->ws, SRBM_STATUS2, 1, &value); - - UPDATE_COUNTER(sdma, SDMA_BUSY); - sdma_busy = SDMA_BUSY(value); - } - - if (sscreen->info.chip_class >= GFX8) { - /* CP_STAT */ - sscreen->ws->read_registers(sscreen->ws, CP_STAT, 1, &value); - - UPDATE_COUNTER(pfp, PFP_BUSY); - UPDATE_COUNTER(meq, MEQ_BUSY); - UPDATE_COUNTER(me, ME_BUSY); - UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY); - UPDATE_COUNTER(cp_dma, DMA_BUSY); - UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY); - } - - value = gui_busy || sdma_busy; - UPDATE_COUNTER(gpu, IDENTITY); + uint32_t value = 0; + bool gui_busy, sdma_busy = false; + + /* GRBM_STATUS */ + sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value); + + UPDATE_COUNTER(ta, TA_BUSY); + UPDATE_COUNTER(gds, GDS_BUSY); + UPDATE_COUNTER(vgt, VGT_BUSY); + UPDATE_COUNTER(ia, IA_BUSY); + UPDATE_COUNTER(sx, SX_BUSY); + UPDATE_COUNTER(wd, WD_BUSY); + UPDATE_COUNTER(spi, SPI_BUSY); + UPDATE_COUNTER(bci, BCI_BUSY); + UPDATE_COUNTER(sc, SC_BUSY); + UPDATE_COUNTER(pa, PA_BUSY); + UPDATE_COUNTER(db, DB_BUSY); + UPDATE_COUNTER(cp, CP_BUSY); + UPDATE_COUNTER(cb, CB_BUSY); + UPDATE_COUNTER(gui, GUI_ACTIVE); + gui_busy = GUI_ACTIVE(value); + + if (sscreen->info.chip_class == GFX7 || sscreen->info.chip_class == GFX8) { + /* SRBM_STATUS2 */ + sscreen->ws->read_registers(sscreen->ws, SRBM_STATUS2, 1, &value); + + UPDATE_COUNTER(sdma, SDMA_BUSY); + sdma_busy = SDMA_BUSY(value); + } + + if (sscreen->info.chip_class >= GFX8) { + /* CP_STAT */ + sscreen->ws->read_registers(sscreen->ws, CP_STAT, 1, &value); + + UPDATE_COUNTER(pfp, PFP_BUSY); + UPDATE_COUNTER(meq, MEQ_BUSY); + UPDATE_COUNTER(me, ME_BUSY); + UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY); + UPDATE_COUNTER(cp_dma, DMA_BUSY); + UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY); + } + + value = gui_busy || sdma_busy; + UPDATE_COUNTER(gpu, IDENTITY); } #undef UPDATE_COUNTER -static int -si_gpu_load_thread(void *param) +static int si_gpu_load_thread(void *param) { - struct si_screen *sscreen = (struct si_screen*)param; - const int period_us = 1000000 / SAMPLES_PER_SEC; - int sleep_us = period_us; - int64_t cur_time, last_time = os_time_get(); - - while (!p_atomic_read(&sscreen->gpu_load_stop_thread)) { - if (sleep_us) - os_time_sleep(sleep_us); - - /* Make sure we sleep the ideal amount of time to match - * the expected frequency. */ - cur_time = os_time_get(); - - if (os_time_timeout(last_time, last_time + period_us, - cur_time)) - sleep_us = MAX2(sleep_us - 1, 1); - else - sleep_us += 1; - - /*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/ - last_time = cur_time; - - /* Update the counters. */ - si_update_mmio_counters(sscreen, &sscreen->mmio_counters); - } - p_atomic_dec(&sscreen->gpu_load_stop_thread); - return 0; + struct si_screen *sscreen = (struct si_screen *)param; + const int period_us = 1000000 / SAMPLES_PER_SEC; + int sleep_us = period_us; + int64_t cur_time, last_time = os_time_get(); + + while (!p_atomic_read(&sscreen->gpu_load_stop_thread)) { + if (sleep_us) + os_time_sleep(sleep_us); + + /* Make sure we sleep the ideal amount of time to match + * the expected frequency. 
*/ + cur_time = os_time_get(); + + if (os_time_timeout(last_time, last_time + period_us, cur_time)) + sleep_us = MAX2(sleep_us - 1, 1); + else + sleep_us += 1; + + /*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/ + last_time = cur_time; + + /* Update the counters. */ + si_update_mmio_counters(sscreen, &sscreen->mmio_counters); + } + p_atomic_dec(&sscreen->gpu_load_stop_thread); + return 0; } void si_gpu_load_kill_thread(struct si_screen *sscreen) { - if (!sscreen->gpu_load_thread) - return; + if (!sscreen->gpu_load_thread) + return; - p_atomic_inc(&sscreen->gpu_load_stop_thread); - thrd_join(sscreen->gpu_load_thread, NULL); - sscreen->gpu_load_thread = 0; + p_atomic_inc(&sscreen->gpu_load_stop_thread); + thrd_join(sscreen->gpu_load_thread, NULL); + sscreen->gpu_load_thread = 0; } -static uint64_t si_read_mmio_counter(struct si_screen *sscreen, - unsigned busy_index) +static uint64_t si_read_mmio_counter(struct si_screen *sscreen, unsigned busy_index) { - /* Start the thread if needed. */ - if (!sscreen->gpu_load_thread) { - simple_mtx_lock(&sscreen->gpu_load_mutex); - /* Check again inside the mutex. */ - if (!sscreen->gpu_load_thread) - sscreen->gpu_load_thread = - u_thread_create(si_gpu_load_thread, sscreen); - simple_mtx_unlock(&sscreen->gpu_load_mutex); - } - - unsigned busy = p_atomic_read(&sscreen->mmio_counters.array[busy_index]); - unsigned idle = p_atomic_read(&sscreen->mmio_counters.array[busy_index + 1]); - - return busy | ((uint64_t)idle << 32); + /* Start the thread if needed. */ + if (!sscreen->gpu_load_thread) { + simple_mtx_lock(&sscreen->gpu_load_mutex); + /* Check again inside the mutex. */ + if (!sscreen->gpu_load_thread) + sscreen->gpu_load_thread = u_thread_create(si_gpu_load_thread, sscreen); + simple_mtx_unlock(&sscreen->gpu_load_mutex); + } + + unsigned busy = p_atomic_read(&sscreen->mmio_counters.array[busy_index]); + unsigned idle = p_atomic_read(&sscreen->mmio_counters.array[busy_index + 1]); + + return busy | ((uint64_t)idle << 32); } -static unsigned si_end_mmio_counter(struct si_screen *sscreen, - uint64_t begin, unsigned busy_index) +static unsigned si_end_mmio_counter(struct si_screen *sscreen, uint64_t begin, unsigned busy_index) { - uint64_t end = si_read_mmio_counter(sscreen, busy_index); - unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff); - unsigned idle = (end >> 32) - (begin >> 32); - - /* Calculate the % of time the busy counter was being incremented. - * - * If no counters were incremented, return the current counter status. - * It's for the case when the load is queried faster than - * the counters are updated. - */ - if (idle || busy) { - return busy*100 / (busy + idle); - } else { - union si_mmio_counters counters; - - memset(&counters, 0, sizeof(counters)); - si_update_mmio_counters(sscreen, &counters); - return counters.array[busy_index] ? 100 : 0; - } + uint64_t end = si_read_mmio_counter(sscreen, busy_index); + unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff); + unsigned idle = (end >> 32) - (begin >> 32); + + /* Calculate the % of time the busy counter was being incremented. + * + * If no counters were incremented, return the current counter status. + * It's for the case when the load is queried faster than + * the counters are updated. + */ + if (idle || busy) { + return busy * 100 / (busy + idle); + } else { + union si_mmio_counters counters; + + memset(&counters, 0, sizeof(counters)); + si_update_mmio_counters(sscreen, &counters); + return counters.array[busy_index] ? 
100 : 0; + } } -#define BUSY_INDEX(sscreen, field) (&sscreen->mmio_counters.named.field.busy - \ - sscreen->mmio_counters.array) +#define BUSY_INDEX(sscreen, field) \ + (&sscreen->mmio_counters.named.field.busy - sscreen->mmio_counters.array) -static unsigned busy_index_from_type(struct si_screen *sscreen, - unsigned type) +static unsigned busy_index_from_type(struct si_screen *sscreen, unsigned type) { - switch (type) { - case SI_QUERY_GPU_LOAD: - return BUSY_INDEX(sscreen, gpu); - case SI_QUERY_GPU_SHADERS_BUSY: - return BUSY_INDEX(sscreen, spi); - case SI_QUERY_GPU_TA_BUSY: - return BUSY_INDEX(sscreen, ta); - case SI_QUERY_GPU_GDS_BUSY: - return BUSY_INDEX(sscreen, gds); - case SI_QUERY_GPU_VGT_BUSY: - return BUSY_INDEX(sscreen, vgt); - case SI_QUERY_GPU_IA_BUSY: - return BUSY_INDEX(sscreen, ia); - case SI_QUERY_GPU_SX_BUSY: - return BUSY_INDEX(sscreen, sx); - case SI_QUERY_GPU_WD_BUSY: - return BUSY_INDEX(sscreen, wd); - case SI_QUERY_GPU_BCI_BUSY: - return BUSY_INDEX(sscreen, bci); - case SI_QUERY_GPU_SC_BUSY: - return BUSY_INDEX(sscreen, sc); - case SI_QUERY_GPU_PA_BUSY: - return BUSY_INDEX(sscreen, pa); - case SI_QUERY_GPU_DB_BUSY: - return BUSY_INDEX(sscreen, db); - case SI_QUERY_GPU_CP_BUSY: - return BUSY_INDEX(sscreen, cp); - case SI_QUERY_GPU_CB_BUSY: - return BUSY_INDEX(sscreen, cb); - case SI_QUERY_GPU_SDMA_BUSY: - return BUSY_INDEX(sscreen, sdma); - case SI_QUERY_GPU_PFP_BUSY: - return BUSY_INDEX(sscreen, pfp); - case SI_QUERY_GPU_MEQ_BUSY: - return BUSY_INDEX(sscreen, meq); - case SI_QUERY_GPU_ME_BUSY: - return BUSY_INDEX(sscreen, me); - case SI_QUERY_GPU_SURF_SYNC_BUSY: - return BUSY_INDEX(sscreen, surf_sync); - case SI_QUERY_GPU_CP_DMA_BUSY: - return BUSY_INDEX(sscreen, cp_dma); - case SI_QUERY_GPU_SCRATCH_RAM_BUSY: - return BUSY_INDEX(sscreen, scratch_ram); - default: - unreachable("invalid query type"); - } + switch (type) { + case SI_QUERY_GPU_LOAD: + return BUSY_INDEX(sscreen, gpu); + case SI_QUERY_GPU_SHADERS_BUSY: + return BUSY_INDEX(sscreen, spi); + case SI_QUERY_GPU_TA_BUSY: + return BUSY_INDEX(sscreen, ta); + case SI_QUERY_GPU_GDS_BUSY: + return BUSY_INDEX(sscreen, gds); + case SI_QUERY_GPU_VGT_BUSY: + return BUSY_INDEX(sscreen, vgt); + case SI_QUERY_GPU_IA_BUSY: + return BUSY_INDEX(sscreen, ia); + case SI_QUERY_GPU_SX_BUSY: + return BUSY_INDEX(sscreen, sx); + case SI_QUERY_GPU_WD_BUSY: + return BUSY_INDEX(sscreen, wd); + case SI_QUERY_GPU_BCI_BUSY: + return BUSY_INDEX(sscreen, bci); + case SI_QUERY_GPU_SC_BUSY: + return BUSY_INDEX(sscreen, sc); + case SI_QUERY_GPU_PA_BUSY: + return BUSY_INDEX(sscreen, pa); + case SI_QUERY_GPU_DB_BUSY: + return BUSY_INDEX(sscreen, db); + case SI_QUERY_GPU_CP_BUSY: + return BUSY_INDEX(sscreen, cp); + case SI_QUERY_GPU_CB_BUSY: + return BUSY_INDEX(sscreen, cb); + case SI_QUERY_GPU_SDMA_BUSY: + return BUSY_INDEX(sscreen, sdma); + case SI_QUERY_GPU_PFP_BUSY: + return BUSY_INDEX(sscreen, pfp); + case SI_QUERY_GPU_MEQ_BUSY: + return BUSY_INDEX(sscreen, meq); + case SI_QUERY_GPU_ME_BUSY: + return BUSY_INDEX(sscreen, me); + case SI_QUERY_GPU_SURF_SYNC_BUSY: + return BUSY_INDEX(sscreen, surf_sync); + case SI_QUERY_GPU_CP_DMA_BUSY: + return BUSY_INDEX(sscreen, cp_dma); + case SI_QUERY_GPU_SCRATCH_RAM_BUSY: + return BUSY_INDEX(sscreen, scratch_ram); + default: + unreachable("invalid query type"); + } } uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type) { - unsigned busy_index = busy_index_from_type(sscreen, type); - return si_read_mmio_counter(sscreen, busy_index); + unsigned busy_index = busy_index_from_type(sscreen, type); 
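A note on the counter arithmetic being reformatted in this hunk: si_read_mmio_counter() returns the busy sample count in the low 32 bits of a uint64_t and the idle count in the high 32 bits, and si_end_mmio_counter() turns two such snapshots into a percentage. A minimal standalone sketch of that arithmetic follows; the helper names are hypothetical and not part of this patch.

#include <stdint.h>

/* Illustrative only: mirrors how si_read_mmio_counter()/si_end_mmio_counter()
 * pack and compare busy/idle sample counts. Names are hypothetical. */
static uint64_t pack_mmio_snapshot(uint32_t busy, uint32_t idle)
{
   return (uint64_t)busy | ((uint64_t)idle << 32);
}

static unsigned mmio_load_percent(uint64_t begin, uint64_t end)
{
   uint32_t busy = (uint32_t)end - (uint32_t)begin;                 /* low 32 bits */
   uint32_t idle = (uint32_t)(end >> 32) - (uint32_t)(begin >> 32); /* high 32 bits */

   /* No samples landed between the two snapshots; the driver re-reads the
    * registers in that case, this sketch simply reports idle. */
   if (!busy && !idle)
      return 0;

   return (unsigned)(busy * 100ull / (busy + idle));
}

si_begin_counter() takes the first snapshot and si_end_counter() feeds both snapshots into this division; the background thread increments exactly one of the two halves per sampling period, so the per-half deltas give the busy ratio directly.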
+ return si_read_mmio_counter(sscreen, busy_index); } -unsigned si_end_counter(struct si_screen *sscreen, unsigned type, - uint64_t begin) +unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin) { - unsigned busy_index = busy_index_from_type(sscreen, type); - return si_end_mmio_counter(sscreen, begin, busy_index); + unsigned busy_index = busy_index_from_type(sscreen, type); + return si_end_mmio_counter(sscreen, begin, busy_index); } diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c index 974ac430c53..ca13ca8a639 100644 --- a/src/gallium/drivers/radeonsi/si_perfcounter.c +++ b/src/gallium/drivers/radeonsi/si_perfcounter.c @@ -26,101 +26,101 @@ #include "si_query.h" #include "util/u_memory.h" +enum si_pc_block_flags +{ + /* This block is part of the shader engine */ + SI_PC_BLOCK_SE = (1 << 0), -enum si_pc_block_flags { - /* This block is part of the shader engine */ - SI_PC_BLOCK_SE = (1 << 0), - - /* Expose per-instance groups instead of summing all instances (within - * an SE). */ - SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1), + /* Expose per-instance groups instead of summing all instances (within + * an SE). */ + SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1), - /* Expose per-SE groups instead of summing instances across SEs. */ - SI_PC_BLOCK_SE_GROUPS = (1 << 2), + /* Expose per-SE groups instead of summing instances across SEs. */ + SI_PC_BLOCK_SE_GROUPS = (1 << 2), - /* Shader block */ - SI_PC_BLOCK_SHADER = (1 << 3), + /* Shader block */ + SI_PC_BLOCK_SHADER = (1 << 3), - /* Non-shader block with perfcounters windowed by shaders. */ - SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4), + /* Non-shader block with perfcounters windowed by shaders. */ + SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4), }; -enum si_pc_reg_layout { - /* All secondary selector dwords follow as one block after the primary - * selector dwords for the counters that have secondary selectors. - */ - SI_PC_MULTI_BLOCK = 0, +enum si_pc_reg_layout +{ + /* All secondary selector dwords follow as one block after the primary + * selector dwords for the counters that have secondary selectors. + */ + SI_PC_MULTI_BLOCK = 0, - /* Each secondary selector dword follows immediately afters the - * corresponding primary. - */ - SI_PC_MULTI_ALTERNATE = 1, + /* Each secondary selector dword follows immediately afters the + * corresponding primary. + */ + SI_PC_MULTI_ALTERNATE = 1, - /* All secondary selector dwords follow as one block after all primary - * selector dwords. - */ - SI_PC_MULTI_TAIL = 2, + /* All secondary selector dwords follow as one block after all primary + * selector dwords. + */ + SI_PC_MULTI_TAIL = 2, - /* Free-form arrangement of selector registers. */ - SI_PC_MULTI_CUSTOM = 3, + /* Free-form arrangement of selector registers. */ + SI_PC_MULTI_CUSTOM = 3, - SI_PC_MULTI_MASK = 3, + SI_PC_MULTI_MASK = 3, - /* Registers are laid out in decreasing rather than increasing order. */ - SI_PC_REG_REVERSE = 4, + /* Registers are laid out in decreasing rather than increasing order. 
*/ + SI_PC_REG_REVERSE = 4, - SI_PC_FAKE = 8, + SI_PC_FAKE = 8, }; struct si_pc_block_base { - const char *name; - unsigned num_counters; - unsigned flags; - - unsigned select_or; - unsigned select0; - unsigned counter0_lo; - unsigned *select; - unsigned *counters; - unsigned num_multi; - unsigned num_prelude; - unsigned layout; + const char *name; + unsigned num_counters; + unsigned flags; + + unsigned select_or; + unsigned select0; + unsigned counter0_lo; + unsigned *select; + unsigned *counters; + unsigned num_multi; + unsigned num_prelude; + unsigned layout; }; struct si_pc_block_gfxdescr { - struct si_pc_block_base *b; - unsigned selectors; - unsigned instances; + struct si_pc_block_base *b; + unsigned selectors; + unsigned instances; }; struct si_pc_block { - const struct si_pc_block_gfxdescr *b; - unsigned num_instances; + const struct si_pc_block_gfxdescr *b; + unsigned num_instances; - unsigned num_groups; - char *group_names; - unsigned group_name_stride; + unsigned num_groups; + char *group_names; + unsigned group_name_stride; - char *selector_names; - unsigned selector_name_stride; + char *selector_names; + unsigned selector_name_stride; }; /* The order is chosen to be compatible with GPUPerfStudio's hardcoding of * performance counter group IDs. */ -static const char * const si_pc_shader_type_suffixes[] = { - "", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS" -}; +static const char *const si_pc_shader_type_suffixes[] = {"", "_ES", "_GS", "_VS", + "_PS", "_LS", "_HS", "_CS"}; static const unsigned si_pc_shader_type_bits[] = { - 0x7f, - S_036780_ES_EN(1), - S_036780_GS_EN(1), - S_036780_VS_EN(1), - S_036780_PS_EN(1), - S_036780_LS_EN(1), - S_036780_HS_EN(1), - S_036780_CS_EN(1), + 0x7f, + S_036780_ES_EN(1), + S_036780_GS_EN(1), + S_036780_VS_EN(1), + S_036780_PS_EN(1), + S_036780_LS_EN(1), + S_036780_HS_EN(1), + S_036780_CS_EN(1), }; /* Max counters per HW block */ @@ -129,277 +129,274 @@ static const unsigned si_pc_shader_type_bits[] = { #define SI_PC_SHADERS_WINDOWING (1u << 31) struct si_query_group { - struct si_query_group *next; - struct si_pc_block *block; - unsigned sub_gid; /* only used during init */ - unsigned result_base; /* only used during init */ - int se; - int instance; - unsigned num_counters; - unsigned selectors[SI_QUERY_MAX_COUNTERS]; + struct si_query_group *next; + struct si_pc_block *block; + unsigned sub_gid; /* only used during init */ + unsigned result_base; /* only used during init */ + int se; + int instance; + unsigned num_counters; + unsigned selectors[SI_QUERY_MAX_COUNTERS]; }; struct si_query_counter { - unsigned base; - unsigned qwords; - unsigned stride; /* in uint64s */ + unsigned base; + unsigned qwords; + unsigned stride; /* in uint64s */ }; struct si_query_pc { - struct si_query b; - struct si_query_buffer buffer; + struct si_query b; + struct si_query_buffer buffer; - /* Size of the results in memory, in bytes. */ - unsigned result_size; + /* Size of the results in memory, in bytes. 
*/ + unsigned result_size; - unsigned shaders; - unsigned num_counters; - struct si_query_counter *counters; - struct si_query_group *groups; + unsigned shaders; + unsigned num_counters; + struct si_query_counter *counters; + struct si_query_group *groups; }; - static struct si_pc_block_base cik_CB = { - .name = "CB", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_037000_CB_PERFCOUNTER_FILTER, - .counter0_lo = R_035018_CB_PERFCOUNTER0_LO, - .num_multi = 1, - .num_prelude = 1, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "CB", + .num_counters = 4, + .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, + + .select0 = R_037000_CB_PERFCOUNTER_FILTER, + .counter0_lo = R_035018_CB_PERFCOUNTER0_LO, + .num_multi = 1, + .num_prelude = 1, + .layout = SI_PC_MULTI_ALTERNATE, }; static unsigned cik_CPC_select[] = { - R_036024_CPC_PERFCOUNTER0_SELECT, - R_036010_CPC_PERFCOUNTER0_SELECT1, - R_03600C_CPC_PERFCOUNTER1_SELECT, + R_036024_CPC_PERFCOUNTER0_SELECT, + R_036010_CPC_PERFCOUNTER0_SELECT1, + R_03600C_CPC_PERFCOUNTER1_SELECT, }; static struct si_pc_block_base cik_CPC = { - .name = "CPC", - .num_counters = 2, + .name = "CPC", + .num_counters = 2, - .select = cik_CPC_select, - .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE, + .select = cik_CPC_select, + .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE, }; static struct si_pc_block_base cik_CPF = { - .name = "CPF", - .num_counters = 2, + .name = "CPF", + .num_counters = 2, - .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT, - .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, + .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT, + .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, }; static struct si_pc_block_base cik_CPG = { - .name = "CPG", - .num_counters = 2, + .name = "CPG", + .num_counters = 2, - .select0 = R_036008_CPG_PERFCOUNTER0_SELECT, - .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, + .select0 = R_036008_CPG_PERFCOUNTER0_SELECT, + .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, }; static struct si_pc_block_base cik_DB = { - .name = "DB", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_037100_DB_PERFCOUNTER0_SELECT, - .counter0_lo = R_035100_DB_PERFCOUNTER0_LO, - .num_multi = 3, // really only 2, but there's a gap between registers - .layout = SI_PC_MULTI_ALTERNATE, + .name = "DB", + .num_counters = 4, + .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, + + .select0 = R_037100_DB_PERFCOUNTER0_SELECT, + .counter0_lo = R_035100_DB_PERFCOUNTER0_LO, + .num_multi = 3, // really only 2, but there's a gap between registers + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_GDS = { - .name = "GDS", - .num_counters = 4, + .name = "GDS", + .num_counters = 4, - .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT, - .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_TAIL, + .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT, + .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_TAIL, }; static unsigned cik_GRBM_counters[] = { - R_034100_GRBM_PERFCOUNTER0_LO, - 
R_03410C_GRBM_PERFCOUNTER1_LO, + R_034100_GRBM_PERFCOUNTER0_LO, + R_03410C_GRBM_PERFCOUNTER1_LO, }; static struct si_pc_block_base cik_GRBM = { - .name = "GRBM", - .num_counters = 2, + .name = "GRBM", + .num_counters = 2, - .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT, - .counters = cik_GRBM_counters, + .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT, + .counters = cik_GRBM_counters, }; static struct si_pc_block_base cik_GRBMSE = { - .name = "GRBMSE", - .num_counters = 4, + .name = "GRBMSE", + .num_counters = 4, - .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT, - .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO, + .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT, + .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO, }; static struct si_pc_block_base cik_IA = { - .name = "IA", - .num_counters = 4, + .name = "IA", + .num_counters = 4, - .select0 = R_036210_IA_PERFCOUNTER0_SELECT, - .counter0_lo = R_034220_IA_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_TAIL, + .select0 = R_036210_IA_PERFCOUNTER0_SELECT, + .counter0_lo = R_034220_IA_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_TAIL, }; static struct si_pc_block_base cik_PA_SC = { - .name = "PA_SC", - .num_counters = 8, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT, - .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "PA_SC", + .num_counters = 8, + .flags = SI_PC_BLOCK_SE, + + .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT, + .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_ALTERNATE, }; /* According to docs, PA_SU counters are only 48 bits wide. */ static struct si_pc_block_base cik_PA_SU = { - .name = "PA_SU", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT, - .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "PA_SU", + .num_counters = 4, + .flags = SI_PC_BLOCK_SE, + + .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT, + .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO, + .num_multi = 2, + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_SPI = { - .name = "SPI", - .num_counters = 6, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036600_SPI_PERFCOUNTER0_SELECT, - .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO, - .num_multi = 4, - .layout = SI_PC_MULTI_BLOCK, + .name = "SPI", + .num_counters = 6, + .flags = SI_PC_BLOCK_SE, + + .select0 = R_036600_SPI_PERFCOUNTER0_SELECT, + .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO, + .num_multi = 4, + .layout = SI_PC_MULTI_BLOCK, }; static struct si_pc_block_base cik_SQ = { - .name = "SQ", - .num_counters = 16, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER, - - .select0 = R_036700_SQ_PERFCOUNTER0_SELECT, - .select_or = S_036700_SQC_BANK_MASK(15) | - S_036700_SQC_CLIENT_MASK(15) | - S_036700_SIMD_MASK(15), - .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO, + .name = "SQ", + .num_counters = 16, + .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER, + + .select0 = R_036700_SQ_PERFCOUNTER0_SELECT, + .select_or = S_036700_SQC_BANK_MASK(15) | S_036700_SQC_CLIENT_MASK(15) | S_036700_SIMD_MASK(15), + .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO, }; static struct si_pc_block_base cik_SX = { - .name = "SX", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036900_SX_PERFCOUNTER0_SELECT, - .counter0_lo = R_034900_SX_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_TAIL, + .name = "SX", + .num_counters = 4, + .flags = SI_PC_BLOCK_SE, + + 
.select0 = R_036900_SX_PERFCOUNTER0_SELECT, + .counter0_lo = R_034900_SX_PERFCOUNTER0_LO, + .num_multi = 2, + .layout = SI_PC_MULTI_TAIL, }; static struct si_pc_block_base cik_TA = { - .name = "TA", - .num_counters = 2, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036B00_TA_PERFCOUNTER0_SELECT, - .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "TA", + .num_counters = 2, + .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, + + .select0 = R_036B00_TA_PERFCOUNTER0_SELECT, + .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_TD = { - .name = "TD", - .num_counters = 2, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036C00_TD_PERFCOUNTER0_SELECT, - .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "TD", + .num_counters = 2, + .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, + + .select0 = R_036C00_TD_PERFCOUNTER0_SELECT, + .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_TCA = { - .name = "TCA", - .num_counters = 4, - .flags = SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "TCA", + .num_counters = 4, + .flags = SI_PC_BLOCK_INSTANCE_GROUPS, + + .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT, + .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO, + .num_multi = 2, + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_TCC = { - .name = "TCC", - .num_counters = 4, - .flags = SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "TCC", + .num_counters = 4, + .flags = SI_PC_BLOCK_INSTANCE_GROUPS, + + .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT, + .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO, + .num_multi = 2, + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_TCP = { - .name = "TCP", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT, - .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "TCP", + .num_counters = 4, + .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, + + .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT, + .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO, + .num_multi = 2, + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_VGT = { - .name = "VGT", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036230_VGT_PERFCOUNTER0_SELECT, - .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_TAIL, + .name = "VGT", + .num_counters = 4, + .flags = SI_PC_BLOCK_SE, + + .select0 = R_036230_VGT_PERFCOUNTER0_SELECT, + .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_TAIL, }; static struct si_pc_block_base cik_WD = { - .name = "WD", - .num_counters = 4, + .name = "WD", + .num_counters = 4, - .select0 = R_036200_WD_PERFCOUNTER0_SELECT, - .counter0_lo = 
R_034200_WD_PERFCOUNTER0_LO, + .select0 = R_036200_WD_PERFCOUNTER0_SELECT, + .counter0_lo = R_034200_WD_PERFCOUNTER0_LO, }; static struct si_pc_block_base cik_MC = { - .name = "MC", - .num_counters = 4, + .name = "MC", + .num_counters = 4, - .layout = SI_PC_FAKE, + .layout = SI_PC_FAKE, }; static struct si_pc_block_base cik_SRBM = { - .name = "SRBM", - .num_counters = 2, + .name = "SRBM", + .num_counters = 2, - .layout = SI_PC_FAKE, + .layout = SI_PC_FAKE, }; /* Both the number of instances and selectors varies between chips of the same @@ -411,947 +408,868 @@ static struct si_pc_block_base cik_SRBM = { * blocks here matters. */ static struct si_pc_block_gfxdescr groups_CIK[] = { - { &cik_CB, 226}, - { &cik_CPF, 17 }, - { &cik_DB, 257}, - { &cik_GRBM, 34 }, - { &cik_GRBMSE, 15 }, - { &cik_PA_SU, 153 }, - { &cik_PA_SC, 395 }, - { &cik_SPI, 186 }, - { &cik_SQ, 252 }, - { &cik_SX, 32 }, - { &cik_TA, 111, 11 }, - { &cik_TCA, 39, 2 }, - { &cik_TCC, 160}, - { &cik_TD, 55, 11 }, - { &cik_TCP, 154, 11 }, - { &cik_GDS, 121 }, - { &cik_VGT, 140 }, - { &cik_IA, 22 }, - { &cik_MC, 22 }, - { &cik_SRBM, 19 }, - { &cik_WD, 22 }, - { &cik_CPG, 46 }, - { &cik_CPC, 22 }, + {&cik_CB, 226}, {&cik_CPF, 17}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15}, + {&cik_PA_SU, 153}, {&cik_PA_SC, 395}, {&cik_SPI, 186}, {&cik_SQ, 252}, {&cik_SX, 32}, + {&cik_TA, 111, 11}, {&cik_TCA, 39, 2}, {&cik_TCC, 160}, {&cik_TD, 55, 11}, {&cik_TCP, 154, 11}, + {&cik_GDS, 121}, {&cik_VGT, 140}, {&cik_IA, 22}, {&cik_MC, 22}, {&cik_SRBM, 19}, + {&cik_WD, 22}, {&cik_CPG, 46}, {&cik_CPC, 22}, }; static struct si_pc_block_gfxdescr groups_VI[] = { - { &cik_CB, 405}, - { &cik_CPF, 19 }, - { &cik_DB, 257}, - { &cik_GRBM, 34 }, - { &cik_GRBMSE, 15 }, - { &cik_PA_SU, 154 }, - { &cik_PA_SC, 397 }, - { &cik_SPI, 197 }, - { &cik_SQ, 273 }, - { &cik_SX, 34 }, - { &cik_TA, 119, 16 }, - { &cik_TCA, 35, 2 }, - { &cik_TCC, 192}, - { &cik_TD, 55, 16 }, - { &cik_TCP, 180, 16 }, - { &cik_GDS, 121 }, - { &cik_VGT, 147 }, - { &cik_IA, 24 }, - { &cik_MC, 22 }, - { &cik_SRBM, 27 }, - { &cik_WD, 37 }, - { &cik_CPG, 48 }, - { &cik_CPC, 24 }, + {&cik_CB, 405}, {&cik_CPF, 19}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15}, + {&cik_PA_SU, 154}, {&cik_PA_SC, 397}, {&cik_SPI, 197}, {&cik_SQ, 273}, {&cik_SX, 34}, + {&cik_TA, 119, 16}, {&cik_TCA, 35, 2}, {&cik_TCC, 192}, {&cik_TD, 55, 16}, {&cik_TCP, 180, 16}, + {&cik_GDS, 121}, {&cik_VGT, 147}, {&cik_IA, 24}, {&cik_MC, 22}, {&cik_SRBM, 27}, + {&cik_WD, 37}, {&cik_CPG, 48}, {&cik_CPC, 24}, }; static struct si_pc_block_gfxdescr groups_gfx9[] = { - { &cik_CB, 438}, - { &cik_CPF, 32 }, - { &cik_DB, 328}, - { &cik_GRBM, 38 }, - { &cik_GRBMSE, 16 }, - { &cik_PA_SU, 292 }, - { &cik_PA_SC, 491 }, - { &cik_SPI, 196 }, - { &cik_SQ, 374 }, - { &cik_SX, 208 }, - { &cik_TA, 119, 16 }, - { &cik_TCA, 35, 2 }, - { &cik_TCC, 256}, - { &cik_TD, 57, 16 }, - { &cik_TCP, 85, 16 }, - { &cik_GDS, 121 }, - { &cik_VGT, 148 }, - { &cik_IA, 32 }, - { &cik_WD, 58 }, - { &cik_CPG, 59 }, - { &cik_CPC, 35 }, + {&cik_CB, 438}, {&cik_CPF, 32}, {&cik_DB, 328}, {&cik_GRBM, 38}, {&cik_GRBMSE, 16}, + {&cik_PA_SU, 292}, {&cik_PA_SC, 491}, {&cik_SPI, 196}, {&cik_SQ, 374}, {&cik_SX, 208}, + {&cik_TA, 119, 16}, {&cik_TCA, 35, 2}, {&cik_TCC, 256}, {&cik_TD, 57, 16}, {&cik_TCP, 85, 16}, + {&cik_GDS, 121}, {&cik_VGT, 148}, {&cik_IA, 32}, {&cik_WD, 58}, {&cik_CPG, 59}, + {&cik_CPC, 35}, }; static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc, - const struct si_pc_block *block) + const struct si_pc_block *block) { - return 
block->b->b->flags & SI_PC_BLOCK_SE_GROUPS || - (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se); + return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS || + (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se); } static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters *pc, - const struct si_pc_block *block) + const struct si_pc_block *block) { - return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS || - (block->num_instances > 1 && pc->separate_instance); + return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS || + (block->num_instances > 1 && pc->separate_instance); } -static struct si_pc_block * -lookup_counter(struct si_perfcounters *pc, unsigned index, - unsigned *base_gid, unsigned *sub_index) +static struct si_pc_block *lookup_counter(struct si_perfcounters *pc, unsigned index, + unsigned *base_gid, unsigned *sub_index) { - struct si_pc_block *block = pc->blocks; - unsigned bid; + struct si_pc_block *block = pc->blocks; + unsigned bid; - *base_gid = 0; - for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { - unsigned total = block->num_groups * block->b->selectors; + *base_gid = 0; + for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { + unsigned total = block->num_groups * block->b->selectors; - if (index < total) { - *sub_index = index; - return block; - } + if (index < total) { + *sub_index = index; + return block; + } - index -= total; - *base_gid += block->num_groups; - } + index -= total; + *base_gid += block->num_groups; + } - return NULL; + return NULL; } -static struct si_pc_block * -lookup_group(struct si_perfcounters *pc, unsigned *index) +static struct si_pc_block *lookup_group(struct si_perfcounters *pc, unsigned *index) { - unsigned bid; - struct si_pc_block *block = pc->blocks; + unsigned bid; + struct si_pc_block *block = pc->blocks; - for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { - if (*index < block->num_groups) - return block; - *index -= block->num_groups; - } + for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { + if (*index < block->num_groups) + return block; + *index -= block->num_groups; + } - return NULL; + return NULL; } -static void si_pc_emit_instance(struct si_context *sctx, - int se, int instance) +static void si_pc_emit_instance(struct si_context *sctx, int se, int instance) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned value = S_030800_SH_BROADCAST_WRITES(1); - - if (se >= 0) { - value |= S_030800_SE_INDEX(se); - } else { - value |= S_030800_SE_BROADCAST_WRITES(1); - } - - if (instance >= 0) { - value |= S_030800_INSTANCE_INDEX(instance); - } else { - value |= S_030800_INSTANCE_BROADCAST_WRITES(1); - } - - radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned value = S_030800_SH_BROADCAST_WRITES(1); + + if (se >= 0) { + value |= S_030800_SE_INDEX(se); + } else { + value |= S_030800_SE_BROADCAST_WRITES(1); + } + + if (instance >= 0) { + value |= S_030800_INSTANCE_INDEX(instance); + } else { + value |= S_030800_INSTANCE_BROADCAST_WRITES(1); + } + + radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value); } -static void si_pc_emit_shaders(struct si_context *sctx, - unsigned shaders) +static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2); - radeon_emit(cs, shaders & 0x7f); - radeon_emit(cs, 0xffffffff); + radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 
2); + radeon_emit(cs, shaders & 0x7f); + radeon_emit(cs, 0xffffffff); } -static void si_pc_emit_select(struct si_context *sctx, - struct si_pc_block *block, - unsigned count, unsigned *selectors) +static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block, unsigned count, + unsigned *selectors) { - struct si_pc_block_base *regs = block->b->b; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned idx; - unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK; - unsigned dw; - - assert(count <= regs->num_counters); - - if (regs->layout & SI_PC_FAKE) - return; - - if (layout_multi == SI_PC_MULTI_BLOCK) { - assert(!(regs->layout & SI_PC_REG_REVERSE)); - - dw = count + regs->num_prelude; - if (count >= regs->num_multi) - dw += regs->num_multi; - radeon_set_uconfig_reg_seq(cs, regs->select0, dw); - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) - radeon_emit(cs, selectors[idx] | regs->select_or); - - if (count < regs->num_multi) { - unsigned select1 = - regs->select0 + 4 * regs->num_multi; - radeon_set_uconfig_reg_seq(cs, select1, count); - } - - for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) - radeon_emit(cs, 0); - - if (count > regs->num_multi) { - for (idx = regs->num_multi; idx < count; ++idx) - radeon_emit(cs, selectors[idx] | regs->select_or); - } - } else if (layout_multi == SI_PC_MULTI_TAIL) { - unsigned select1, select1_count; - - assert(!(regs->layout & SI_PC_REG_REVERSE)); - - radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude); - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - for (idx = 0; idx < count; ++idx) - radeon_emit(cs, selectors[idx] | regs->select_or); - - select1 = regs->select0 + 4 * regs->num_counters; - select1_count = MIN2(count, regs->num_multi); - radeon_set_uconfig_reg_seq(cs, select1, select1_count); - for (idx = 0; idx < select1_count; ++idx) - radeon_emit(cs, 0); - } else if (layout_multi == SI_PC_MULTI_CUSTOM) { - unsigned *reg = regs->select; - for (idx = 0; idx < count; ++idx) { - radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or); - if (idx < regs->num_multi) - radeon_set_uconfig_reg(cs, *reg++, 0); - } - } else { - assert(layout_multi == SI_PC_MULTI_ALTERNATE); - - unsigned reg_base = regs->select0; - unsigned reg_count = count + MIN2(count, regs->num_multi); - reg_count += regs->num_prelude; - - if (!(regs->layout & SI_PC_REG_REVERSE)) { - radeon_set_uconfig_reg_seq(cs, reg_base, reg_count); - - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - for (idx = 0; idx < count; ++idx) { - radeon_emit(cs, selectors[idx] | regs->select_or); - if (idx < regs->num_multi) - radeon_emit(cs, 0); - } - } else { - reg_base -= (reg_count - 1) * 4; - radeon_set_uconfig_reg_seq(cs, reg_base, reg_count); - - for (idx = count; idx > 0; --idx) { - if (idx <= regs->num_multi) - radeon_emit(cs, 0); - radeon_emit(cs, selectors[idx - 1] | regs->select_or); - } - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - } - } + struct si_pc_block_base *regs = block->b->b; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned idx; + unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK; + unsigned dw; + + assert(count <= regs->num_counters); + + if (regs->layout & SI_PC_FAKE) + return; + + if (layout_multi == SI_PC_MULTI_BLOCK) { + assert(!(regs->layout & SI_PC_REG_REVERSE)); + + dw = count + regs->num_prelude; + if (count >= regs->num_multi) + dw += regs->num_multi; + 
radeon_set_uconfig_reg_seq(cs, regs->select0, dw); + for (idx = 0; idx < regs->num_prelude; ++idx) + radeon_emit(cs, 0); + for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) + radeon_emit(cs, selectors[idx] | regs->select_or); + + if (count < regs->num_multi) { + unsigned select1 = regs->select0 + 4 * regs->num_multi; + radeon_set_uconfig_reg_seq(cs, select1, count); + } + + for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) + radeon_emit(cs, 0); + + if (count > regs->num_multi) { + for (idx = regs->num_multi; idx < count; ++idx) + radeon_emit(cs, selectors[idx] | regs->select_or); + } + } else if (layout_multi == SI_PC_MULTI_TAIL) { + unsigned select1, select1_count; + + assert(!(regs->layout & SI_PC_REG_REVERSE)); + + radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude); + for (idx = 0; idx < regs->num_prelude; ++idx) + radeon_emit(cs, 0); + for (idx = 0; idx < count; ++idx) + radeon_emit(cs, selectors[idx] | regs->select_or); + + select1 = regs->select0 + 4 * regs->num_counters; + select1_count = MIN2(count, regs->num_multi); + radeon_set_uconfig_reg_seq(cs, select1, select1_count); + for (idx = 0; idx < select1_count; ++idx) + radeon_emit(cs, 0); + } else if (layout_multi == SI_PC_MULTI_CUSTOM) { + unsigned *reg = regs->select; + for (idx = 0; idx < count; ++idx) { + radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or); + if (idx < regs->num_multi) + radeon_set_uconfig_reg(cs, *reg++, 0); + } + } else { + assert(layout_multi == SI_PC_MULTI_ALTERNATE); + + unsigned reg_base = regs->select0; + unsigned reg_count = count + MIN2(count, regs->num_multi); + reg_count += regs->num_prelude; + + if (!(regs->layout & SI_PC_REG_REVERSE)) { + radeon_set_uconfig_reg_seq(cs, reg_base, reg_count); + + for (idx = 0; idx < regs->num_prelude; ++idx) + radeon_emit(cs, 0); + for (idx = 0; idx < count; ++idx) { + radeon_emit(cs, selectors[idx] | regs->select_or); + if (idx < regs->num_multi) + radeon_emit(cs, 0); + } + } else { + reg_base -= (reg_count - 1) * 4; + radeon_set_uconfig_reg_seq(cs, reg_base, reg_count); + + for (idx = count; idx > 0; --idx) { + if (idx <= regs->num_multi) + radeon_emit(cs, 0); + radeon_emit(cs, selectors[idx - 1] | regs->select_or); + } + for (idx = 0; idx < regs->num_prelude; ++idx) + radeon_emit(cs, 0); + } + } } -static void si_pc_emit_start(struct si_context *sctx, - struct si_resource *buffer, uint64_t va) +static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - si_cp_copy_data(sctx, sctx->gfx_cs, - COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address, - COPY_DATA_IMM, NULL, 1); - - radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, - S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET)); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0)); - radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, - S_036020_PERFMON_STATE(V_036020_START_COUNTING)); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address, + COPY_DATA_IMM, NULL, 1); + + radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, + S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET)); + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0)); + radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, + S_036020_PERFMON_STATE(V_036020_START_COUNTING)); } /* Note: The buffer was 
already added in si_pc_emit_start, so we don't have to * do it again in here. */ -static void si_pc_emit_stop(struct si_context *sctx, - struct si_resource *buffer, uint64_t va) +static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, - EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, - buffer, va, 0, SI_NOT_QUERY); - si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL); - - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0)); - radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, - S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) | - S_036020_PERFMON_SAMPLE_ENABLE(1)); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, + EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY); + si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL); + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0)); + radeon_set_uconfig_reg( + cs, R_036020_CP_PERFMON_CNTL, + S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1)); } -static void si_pc_emit_read(struct si_context *sctx, - struct si_pc_block *block, - unsigned count, uint64_t va) +static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, unsigned count, + uint64_t va) { - struct si_pc_block_base *regs = block->b->b; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned idx; - unsigned reg = regs->counter0_lo; - unsigned reg_delta = 8; - - if (!(regs->layout & SI_PC_FAKE)) { - if (regs->layout & SI_PC_REG_REVERSE) - reg_delta = -reg_delta; - - for (idx = 0; idx < count; ++idx) { - if (regs->counters) - reg = regs->counters[idx]; - - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | - COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | - COPY_DATA_COUNT_SEL); /* 64 bits */ - radeon_emit(cs, reg >> 2); - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - va += sizeof(uint64_t); - reg += reg_delta; - } - } else { - for (idx = 0; idx < count; ++idx) { - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | - COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | - COPY_DATA_COUNT_SEL); - radeon_emit(cs, 0); /* immediate */ - radeon_emit(cs, 0); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - va += sizeof(uint64_t); - } - } + struct si_pc_block_base *regs = block->b->b; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned idx; + unsigned reg = regs->counter0_lo; + unsigned reg_delta = 8; + + if (!(regs->layout & SI_PC_FAKE)) { + if (regs->layout & SI_PC_REG_REVERSE) + reg_delta = -reg_delta; + + for (idx = 0; idx < count; ++idx) { + if (regs->counters) + reg = regs->counters[idx]; + + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + COPY_DATA_COUNT_SEL); /* 64 bits */ + radeon_emit(cs, reg >> 2); + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, va); + 
radeon_emit(cs, va >> 32); + va += sizeof(uint64_t); + reg += reg_delta; + } + } else { + for (idx = 0; idx < count; ++idx) { + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + COPY_DATA_COUNT_SEL); + radeon_emit(cs, 0); /* immediate */ + radeon_emit(cs, 0); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + va += sizeof(uint64_t); + } + } } -static void si_pc_query_destroy(struct si_context *sctx, - struct si_query *squery) +static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery) { - struct si_query_pc *query = (struct si_query_pc *)squery; + struct si_query_pc *query = (struct si_query_pc *)squery; - while (query->groups) { - struct si_query_group *group = query->groups; - query->groups = group->next; - FREE(group); - } + while (query->groups) { + struct si_query_group *group = query->groups; + query->groups = group->next; + FREE(group); + } - FREE(query->counters); + FREE(query->counters); - si_query_buffer_destroy(sctx->screen, &query->buffer); - FREE(query); + si_query_buffer_destroy(sctx->screen, &query->buffer); + FREE(query); } static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery) /* - struct si_query_hw *hwquery, - struct si_resource *buffer, uint64_t va)*/ + struct si_query_hw *hwquery, + struct si_resource *buffer, uint64_t va)*/ { - struct si_query_pc *query = (struct si_query_pc *)squery; - int current_se = -1; - int current_instance = -1; + struct si_query_pc *query = (struct si_query_pc *)squery; + int current_se = -1; + int current_instance = -1; - if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size)) - return; - si_need_gfx_cs_space(sctx); + if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size)) + return; + si_need_gfx_cs_space(sctx); - if (query->shaders) - si_pc_emit_shaders(sctx, query->shaders); + if (query->shaders) + si_pc_emit_shaders(sctx, query->shaders); - for (struct si_query_group *group = query->groups; group; group = group->next) { - struct si_pc_block *block = group->block; + for (struct si_query_group *group = query->groups; group; group = group->next) { + struct si_pc_block *block = group->block; - if (group->se != current_se || group->instance != current_instance) { - current_se = group->se; - current_instance = group->instance; - si_pc_emit_instance(sctx, group->se, group->instance); - } + if (group->se != current_se || group->instance != current_instance) { + current_se = group->se; + current_instance = group->instance; + si_pc_emit_instance(sctx, group->se, group->instance); + } - si_pc_emit_select(sctx, block, group->num_counters, group->selectors); - } + si_pc_emit_select(sctx, block, group->num_counters, group->selectors); + } - if (current_se != -1 || current_instance != -1) - si_pc_emit_instance(sctx, -1, -1); + if (current_se != -1 || current_instance != -1) + si_pc_emit_instance(sctx, -1, -1); - uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end; - si_pc_emit_start(sctx, query->buffer.buf, va); + uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end; + si_pc_emit_start(sctx, query->buffer.buf, va); } static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery) { - struct si_query_pc *query = (struct si_query_pc *)squery; + struct si_query_pc *query = (struct si_query_pc *)squery; - if (!query->buffer.buf) - return; + if (!query->buffer.buf) + return; - uint64_t va = query->buffer.buf->gpu_address + 
query->buffer.results_end; - query->buffer.results_end += query->result_size; + uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end; + query->buffer.results_end += query->result_size; - si_pc_emit_stop(sctx, query->buffer.buf, va); + si_pc_emit_stop(sctx, query->buffer.buf, va); - for (struct si_query_group *group = query->groups; group; group = group->next) { - struct si_pc_block *block = group->block; - unsigned se = group->se >= 0 ? group->se : 0; - unsigned se_end = se + 1; + for (struct si_query_group *group = query->groups; group; group = group->next) { + struct si_pc_block *block = group->block; + unsigned se = group->se >= 0 ? group->se : 0; + unsigned se_end = se + 1; - if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0)) - se_end = sctx->screen->info.max_se; + if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0)) + se_end = sctx->screen->info.max_se; - do { - unsigned instance = group->instance >= 0 ? group->instance : 0; + do { + unsigned instance = group->instance >= 0 ? group->instance : 0; - do { - si_pc_emit_instance(sctx, se, instance); - si_pc_emit_read(sctx, block, group->num_counters, va); - va += sizeof(uint64_t) * group->num_counters; - } while (group->instance < 0 && ++instance < block->num_instances); - } while (++se < se_end); - } + do { + si_pc_emit_instance(sctx, se, instance); + si_pc_emit_read(sctx, block, group->num_counters, va); + va += sizeof(uint64_t) * group->num_counters; + } while (group->instance < 0 && ++instance < block->num_instances); + } while (++se < se_end); + } - si_pc_emit_instance(sctx, -1, -1); + si_pc_emit_instance(sctx, -1, -1); } static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery) { - struct si_query_pc *query = (struct si_query_pc *)squery; + struct si_query_pc *query = (struct si_query_pc *)squery; - si_query_buffer_reset(ctx, &query->buffer); + si_query_buffer_reset(ctx, &query->buffer); - list_addtail(&query->b.active_list, &ctx->active_queries); - ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend; + list_addtail(&query->b.active_list, &ctx->active_queries); + ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend; - si_pc_query_resume(ctx, squery); + si_pc_query_resume(ctx, squery); - return true; + return true; } static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery) { - struct si_query_pc *query = (struct si_query_pc *)squery; + struct si_query_pc *query = (struct si_query_pc *)squery; - si_pc_query_suspend(ctx, squery); + si_pc_query_suspend(ctx, squery); - list_del(&squery->active_list); - ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend; + list_del(&squery->active_list); + ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend; - return query->buffer.buf != NULL; + return query->buffer.buf != NULL; } -static void si_pc_query_add_result(struct si_query_pc *query, - void *buffer, - union pipe_query_result *result) +static void si_pc_query_add_result(struct si_query_pc *query, void *buffer, + union pipe_query_result *result) { - uint64_t *results = buffer; - unsigned i, j; + uint64_t *results = buffer; + unsigned i, j; - for (i = 0; i < query->num_counters; ++i) { - struct si_query_counter *counter = &query->counters[i]; + for (i = 0; i < query->num_counters; ++i) { + struct si_query_counter *counter = &query->counters[i]; - for (j = 0; j < counter->qwords; ++j) { - uint32_t value = results[counter->base + j * counter->stride]; - result->batch[i].u64 += value; - } - } + for (j = 0; j < counter->qwords; ++j) { + 
uint32_t value = results[counter->base + j * counter->stride]; + result->batch[i].u64 += value; + } + } } -static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, - bool wait, union pipe_query_result *result) +static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait, + union pipe_query_result *result) { - struct si_query_pc *query = (struct si_query_pc *)squery; + struct si_query_pc *query = (struct si_query_pc *)squery; - memset(result, 0, sizeof(result->batch[0]) * query->num_counters); + memset(result, 0, sizeof(result->batch[0]) * query->num_counters); - for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - unsigned usage = PIPE_TRANSFER_READ | - (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); - unsigned results_base = 0; - void *map; + for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); + unsigned results_base = 0; + void *map; - if (squery->b.flushed) - map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); - else - map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage); + if (squery->b.flushed) + map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); + else + map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage); - if (!map) - return false; + if (!map) + return false; - while (results_base != qbuf->results_end) { - si_pc_query_add_result(query, map + results_base, result); - results_base += query->result_size; - } - } + while (results_base != qbuf->results_end) { + si_pc_query_add_result(query, map + results_base, result); + results_base += query->result_size; + } + } - return true; + return true; } static const struct si_query_ops batch_query_ops = { - .destroy = si_pc_query_destroy, - .begin = si_pc_query_begin, - .end = si_pc_query_end, - .get_result = si_pc_query_get_result, + .destroy = si_pc_query_destroy, + .begin = si_pc_query_begin, + .end = si_pc_query_end, + .get_result = si_pc_query_get_result, - .suspend = si_pc_query_suspend, - .resume = si_pc_query_resume, + .suspend = si_pc_query_suspend, + .resume = si_pc_query_resume, }; -static struct si_query_group *get_group_state(struct si_screen *screen, - struct si_query_pc *query, - struct si_pc_block *block, - unsigned sub_gid) +static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query, + struct si_pc_block *block, unsigned sub_gid) { - struct si_query_group *group = query->groups; - - while (group) { - if (group->block == block && group->sub_gid == sub_gid) - return group; - group = group->next; - } - - group = CALLOC_STRUCT(si_query_group); - if (!group) - return NULL; - - group->block = block; - group->sub_gid = sub_gid; - - if (block->b->b->flags & SI_PC_BLOCK_SHADER) { - unsigned sub_gids = block->num_instances; - unsigned shader_id; - unsigned shaders; - unsigned query_shaders; - - if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) - sub_gids = sub_gids * screen->info.max_se; - shader_id = sub_gid / sub_gids; - sub_gid = sub_gid % sub_gids; - - shaders = si_pc_shader_type_bits[shader_id]; - - query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING; - if (query_shaders && query_shaders != shaders) { - fprintf(stderr, "si_perfcounter: incompatible shader groups\n"); - FREE(group); - return NULL; - } - query->shaders = shaders; - } - - if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) { - // A non-zero value in query->shaders ensures 
that the shader - // masking is reset unless the user explicitly requests one. - query->shaders = SI_PC_SHADERS_WINDOWING; - } - - if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) { - group->se = sub_gid / block->num_instances; - sub_gid = sub_gid % block->num_instances; - } else { - group->se = -1; - } - - if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) { - group->instance = sub_gid; - } else { - group->instance = -1; - } - - group->next = query->groups; - query->groups = group; - - return group; + struct si_query_group *group = query->groups; + + while (group) { + if (group->block == block && group->sub_gid == sub_gid) + return group; + group = group->next; + } + + group = CALLOC_STRUCT(si_query_group); + if (!group) + return NULL; + + group->block = block; + group->sub_gid = sub_gid; + + if (block->b->b->flags & SI_PC_BLOCK_SHADER) { + unsigned sub_gids = block->num_instances; + unsigned shader_id; + unsigned shaders; + unsigned query_shaders; + + if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) + sub_gids = sub_gids * screen->info.max_se; + shader_id = sub_gid / sub_gids; + sub_gid = sub_gid % sub_gids; + + shaders = si_pc_shader_type_bits[shader_id]; + + query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING; + if (query_shaders && query_shaders != shaders) { + fprintf(stderr, "si_perfcounter: incompatible shader groups\n"); + FREE(group); + return NULL; + } + query->shaders = shaders; + } + + if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) { + // A non-zero value in query->shaders ensures that the shader + // masking is reset unless the user explicitly requests one. + query->shaders = SI_PC_SHADERS_WINDOWING; + } + + if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) { + group->se = sub_gid / block->num_instances; + sub_gid = sub_gid % block->num_instances; + } else { + group->se = -1; + } + + if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) { + group->instance = sub_gid; + } else { + group->instance = -1; + } + + group->next = query->groups; + query->groups = group; + + return group; } -struct pipe_query *si_create_batch_query(struct pipe_context *ctx, - unsigned num_queries, - unsigned *query_types) +struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries, + unsigned *query_types) { - struct si_screen *screen = - (struct si_screen *)ctx->screen; - struct si_perfcounters *pc = screen->perfcounters; - struct si_pc_block *block; - struct si_query_group *group; - struct si_query_pc *query; - unsigned base_gid, sub_gid, sub_index; - unsigned i, j; - - if (!pc) - return NULL; - - query = CALLOC_STRUCT(si_query_pc); - if (!query) - return NULL; - - query->b.ops = &batch_query_ops; - - query->num_counters = num_queries; - - /* Collect selectors per group */ - for (i = 0; i < num_queries; ++i) { - unsigned sub_gid; - - if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER) - goto error; - - block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, - &base_gid, &sub_index); - if (!block) - goto error; - - sub_gid = sub_index / block->b->selectors; - sub_index = sub_index % block->b->selectors; - - group = get_group_state(screen, query, block, sub_gid); - if (!group) - goto error; - - if (group->num_counters >= block->b->b->num_counters) { - fprintf(stderr, - "perfcounter group %s: too many selected\n", - block->b->b->name); - goto error; - } - group->selectors[group->num_counters] = sub_index; - ++group->num_counters; - } - - /* Compute 
result bases and CS size per group */ - query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords; - query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords; - - i = 0; - for (group = query->groups; group; group = group->next) { - struct si_pc_block *block = group->block; - unsigned read_dw; - unsigned instances = 1; - - if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0) - instances = screen->info.max_se; - if (group->instance < 0) - instances *= block->num_instances; - - group->result_base = i; - query->result_size += sizeof(uint64_t) * instances * group->num_counters; - i += instances * group->num_counters; - - read_dw = 6 * group->num_counters; - query->b.num_cs_dw_suspend += instances * read_dw; - query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords; - } - - if (query->shaders) { - if (query->shaders == SI_PC_SHADERS_WINDOWING) - query->shaders = 0xffffffff; - } - - /* Map user-supplied query array to result indices */ - query->counters = CALLOC(num_queries, sizeof(*query->counters)); - for (i = 0; i < num_queries; ++i) { - struct si_query_counter *counter = &query->counters[i]; - struct si_pc_block *block; - - block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, - &base_gid, &sub_index); - - sub_gid = sub_index / block->b->selectors; - sub_index = sub_index % block->b->selectors; - - group = get_group_state(screen, query, block, sub_gid); - assert(group != NULL); - - for (j = 0; j < group->num_counters; ++j) { - if (group->selectors[j] == sub_index) - break; - } - - counter->base = group->result_base + j; - counter->stride = group->num_counters; - - counter->qwords = 1; - if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0) - counter->qwords = screen->info.max_se; - if (group->instance < 0) - counter->qwords *= block->num_instances; - } + struct si_screen *screen = (struct si_screen *)ctx->screen; + struct si_perfcounters *pc = screen->perfcounters; + struct si_pc_block *block; + struct si_query_group *group; + struct si_query_pc *query; + unsigned base_gid, sub_gid, sub_index; + unsigned i, j; + + if (!pc) + return NULL; + + query = CALLOC_STRUCT(si_query_pc); + if (!query) + return NULL; + + query->b.ops = &batch_query_ops; + + query->num_counters = num_queries; + + /* Collect selectors per group */ + for (i = 0; i < num_queries; ++i) { + unsigned sub_gid; + + if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER) + goto error; + + block = + lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index); + if (!block) + goto error; + + sub_gid = sub_index / block->b->selectors; + sub_index = sub_index % block->b->selectors; + + group = get_group_state(screen, query, block, sub_gid); + if (!group) + goto error; + + if (group->num_counters >= block->b->b->num_counters) { + fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name); + goto error; + } + group->selectors[group->num_counters] = sub_index; + ++group->num_counters; + } + + /* Compute result bases and CS size per group */ + query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords; + query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords; + + i = 0; + for (group = query->groups; group; group = group->next) { + struct si_pc_block *block = group->block; + unsigned read_dw; + unsigned instances = 1; + + if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0) + instances = screen->info.max_se; + if (group->instance < 0) + instances *= block->num_instances; + + group->result_base = i; + query->result_size += sizeof(uint64_t) * instances * 
group->num_counters; + i += instances * group->num_counters; + + read_dw = 6 * group->num_counters; + query->b.num_cs_dw_suspend += instances * read_dw; + query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords; + } + + if (query->shaders) { + if (query->shaders == SI_PC_SHADERS_WINDOWING) + query->shaders = 0xffffffff; + } + + /* Map user-supplied query array to result indices */ + query->counters = CALLOC(num_queries, sizeof(*query->counters)); + for (i = 0; i < num_queries; ++i) { + struct si_query_counter *counter = &query->counters[i]; + struct si_pc_block *block; + + block = + lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index); + + sub_gid = sub_index / block->b->selectors; + sub_index = sub_index % block->b->selectors; + + group = get_group_state(screen, query, block, sub_gid); + assert(group != NULL); + + for (j = 0; j < group->num_counters; ++j) { + if (group->selectors[j] == sub_index) + break; + } + + counter->base = group->result_base + j; + counter->stride = group->num_counters; + + counter->qwords = 1; + if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0) + counter->qwords = screen->info.max_se; + if (group->instance < 0) + counter->qwords *= block->num_instances; + } - return (struct pipe_query *)query; + return (struct pipe_query *)query; error: - si_pc_query_destroy((struct si_context *)ctx, &query->b); - return NULL; + si_pc_query_destroy((struct si_context *)ctx, &query->b); + return NULL; } -static bool si_init_block_names(struct si_screen *screen, - struct si_pc_block *block) +static bool si_init_block_names(struct si_screen *screen, struct si_pc_block *block) { - bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block); - bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block); - unsigned i, j, k; - unsigned groups_shader = 1, groups_se = 1, groups_instance = 1; - unsigned namelen; - char *groupname; - char *p; - - if (per_instance_groups) - groups_instance = block->num_instances; - if (per_se_groups) - groups_se = screen->info.max_se; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) - groups_shader = ARRAY_SIZE(si_pc_shader_type_bits); - - namelen = strlen(block->b->b->name); - block->group_name_stride = namelen + 1; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) - block->group_name_stride += 3; - if (per_se_groups) { - assert(groups_se <= 10); - block->group_name_stride += 1; - - if (per_instance_groups) - block->group_name_stride += 1; - } - if (per_instance_groups) { - assert(groups_instance <= 100); - block->group_name_stride += 2; - } - - block->group_names = MALLOC(block->num_groups * block->group_name_stride); - if (!block->group_names) - return false; - - groupname = block->group_names; - for (i = 0; i < groups_shader; ++i) { - const char *shader_suffix = si_pc_shader_type_suffixes[i]; - unsigned shaderlen = strlen(shader_suffix); - for (j = 0; j < groups_se; ++j) { - for (k = 0; k < groups_instance; ++k) { - strcpy(groupname, block->b->b->name); - p = groupname + namelen; - - if (block->b->b->flags & SI_PC_BLOCK_SHADER) { - strcpy(p, shader_suffix); - p += shaderlen; - } - - if (per_se_groups) { - p += sprintf(p, "%d", j); - if (per_instance_groups) - *p++ = '_'; - } - - if (per_instance_groups) - p += sprintf(p, "%d", k); - - groupname += block->group_name_stride; - } - } - } - - assert(block->b->selectors <= 1000); - block->selector_name_stride = block->group_name_stride + 4; - block->selector_names = MALLOC(block->num_groups * 
block->b->selectors * - block->selector_name_stride); - if (!block->selector_names) - return false; - - groupname = block->group_names; - p = block->selector_names; - for (i = 0; i < block->num_groups; ++i) { - for (j = 0; j < block->b->selectors; ++j) { - sprintf(p, "%s_%03d", groupname, j); - p += block->selector_name_stride; - } - groupname += block->group_name_stride; - } - - return true; + bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block); + bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block); + unsigned i, j, k; + unsigned groups_shader = 1, groups_se = 1, groups_instance = 1; + unsigned namelen; + char *groupname; + char *p; + + if (per_instance_groups) + groups_instance = block->num_instances; + if (per_se_groups) + groups_se = screen->info.max_se; + if (block->b->b->flags & SI_PC_BLOCK_SHADER) + groups_shader = ARRAY_SIZE(si_pc_shader_type_bits); + + namelen = strlen(block->b->b->name); + block->group_name_stride = namelen + 1; + if (block->b->b->flags & SI_PC_BLOCK_SHADER) + block->group_name_stride += 3; + if (per_se_groups) { + assert(groups_se <= 10); + block->group_name_stride += 1; + + if (per_instance_groups) + block->group_name_stride += 1; + } + if (per_instance_groups) { + assert(groups_instance <= 100); + block->group_name_stride += 2; + } + + block->group_names = MALLOC(block->num_groups * block->group_name_stride); + if (!block->group_names) + return false; + + groupname = block->group_names; + for (i = 0; i < groups_shader; ++i) { + const char *shader_suffix = si_pc_shader_type_suffixes[i]; + unsigned shaderlen = strlen(shader_suffix); + for (j = 0; j < groups_se; ++j) { + for (k = 0; k < groups_instance; ++k) { + strcpy(groupname, block->b->b->name); + p = groupname + namelen; + + if (block->b->b->flags & SI_PC_BLOCK_SHADER) { + strcpy(p, shader_suffix); + p += shaderlen; + } + + if (per_se_groups) { + p += sprintf(p, "%d", j); + if (per_instance_groups) + *p++ = '_'; + } + + if (per_instance_groups) + p += sprintf(p, "%d", k); + + groupname += block->group_name_stride; + } + } + } + + assert(block->b->selectors <= 1000); + block->selector_name_stride = block->group_name_stride + 4; + block->selector_names = + MALLOC(block->num_groups * block->b->selectors * block->selector_name_stride); + if (!block->selector_names) + return false; + + groupname = block->group_names; + p = block->selector_names; + for (i = 0; i < block->num_groups; ++i) { + for (j = 0; j < block->b->selectors; ++j) { + sprintf(p, "%s_%03d", groupname, j); + p += block->selector_name_stride; + } + groupname += block->group_name_stride; + } + + return true; } -int si_get_perfcounter_info(struct si_screen *screen, - unsigned index, - struct pipe_driver_query_info *info) +int si_get_perfcounter_info(struct si_screen *screen, unsigned index, + struct pipe_driver_query_info *info) { - struct si_perfcounters *pc = screen->perfcounters; - struct si_pc_block *block; - unsigned base_gid, sub; - - if (!pc) - return 0; - - if (!info) { - unsigned bid, num_queries = 0; - - for (bid = 0; bid < pc->num_blocks; ++bid) { - num_queries += pc->blocks[bid].b->selectors * - pc->blocks[bid].num_groups; - } - - return num_queries; - } - - block = lookup_counter(pc, index, &base_gid, &sub); - if (!block) - return 0; - - if (!block->selector_names) { - if (!si_init_block_names(screen, block)) - return 0; - } - info->name = block->selector_names + sub * block->selector_name_stride; - info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index; - 
info->max_value.u64 = 0; - info->type = PIPE_DRIVER_QUERY_TYPE_UINT64; - info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE; - info->group_id = base_gid + sub / block->b->selectors; - info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; - if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups) - info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST; - return 1; + struct si_perfcounters *pc = screen->perfcounters; + struct si_pc_block *block; + unsigned base_gid, sub; + + if (!pc) + return 0; + + if (!info) { + unsigned bid, num_queries = 0; + + for (bid = 0; bid < pc->num_blocks; ++bid) { + num_queries += pc->blocks[bid].b->selectors * pc->blocks[bid].num_groups; + } + + return num_queries; + } + + block = lookup_counter(pc, index, &base_gid, &sub); + if (!block) + return 0; + + if (!block->selector_names) { + if (!si_init_block_names(screen, block)) + return 0; + } + info->name = block->selector_names + sub * block->selector_name_stride; + info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index; + info->max_value.u64 = 0; + info->type = PIPE_DRIVER_QUERY_TYPE_UINT64; + info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE; + info->group_id = base_gid + sub / block->b->selectors; + info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; + if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups) + info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST; + return 1; } -int si_get_perfcounter_group_info(struct si_screen *screen, - unsigned index, - struct pipe_driver_query_group_info *info) +int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index, + struct pipe_driver_query_group_info *info) { - struct si_perfcounters *pc = screen->perfcounters; - struct si_pc_block *block; - - if (!pc) - return 0; - - if (!info) - return pc->num_groups; - - block = lookup_group(pc, &index); - if (!block) - return 0; - - if (!block->group_names) { - if (!si_init_block_names(screen, block)) - return 0; - } - info->name = block->group_names + index * block->group_name_stride; - info->num_queries = block->b->selectors; - info->max_active_queries = block->b->b->num_counters; - return 1; + struct si_perfcounters *pc = screen->perfcounters; + struct si_pc_block *block; + + if (!pc) + return 0; + + if (!info) + return pc->num_groups; + + block = lookup_group(pc, &index); + if (!block) + return 0; + + if (!block->group_names) { + if (!si_init_block_names(screen, block)) + return 0; + } + info->name = block->group_names + index * block->group_name_stride; + info->num_queries = block->b->selectors; + info->max_active_queries = block->b->b->num_counters; + return 1; } void si_destroy_perfcounters(struct si_screen *screen) { - struct si_perfcounters *pc = screen->perfcounters; - unsigned i; - - if (!pc) - return; - - for (i = 0; i < pc->num_blocks; ++i) { - FREE(pc->blocks[i].group_names); - FREE(pc->blocks[i].selector_names); - } - FREE(pc->blocks); - FREE(pc); - screen->perfcounters = NULL; + struct si_perfcounters *pc = screen->perfcounters; + unsigned i; + + if (!pc) + return; + + for (i = 0; i < pc->num_blocks; ++i) { + FREE(pc->blocks[i].group_names); + FREE(pc->blocks[i].selector_names); + } + FREE(pc->blocks); + FREE(pc); + screen->perfcounters = NULL; } void si_init_perfcounters(struct si_screen *screen) { - struct si_perfcounters *pc; - const struct si_pc_block_gfxdescr *blocks; - unsigned num_blocks; - unsigned i; - - switch (screen->info.chip_class) { - case GFX7: - blocks = groups_CIK; - num_blocks = ARRAY_SIZE(groups_CIK); - break; - case GFX8: - blocks = groups_VI; - num_blocks = 
ARRAY_SIZE(groups_VI); - break; - case GFX9: - blocks = groups_gfx9; - num_blocks = ARRAY_SIZE(groups_gfx9); - break; - case GFX6: - default: - return; /* not implemented */ - } - - if (screen->info.max_sh_per_se != 1) { - /* This should not happen on non-GFX6 chips. */ - fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not " - "supported (inaccurate performance counters)\n", - screen->info.max_sh_per_se); - } - - screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters); - if (!pc) - return; - - pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen); - pc->num_instance_cs_dwords = 3; - - pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false); - pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false); - - pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block)); - if (!pc->blocks) - goto error; - pc->num_blocks = num_blocks; - - for (i = 0; i < num_blocks; ++i) { - struct si_pc_block *block = &pc->blocks[i]; - block->b = &blocks[i]; - block->num_instances = MAX2(1, block->b->instances); - - if (!strcmp(block->b->b->name, "CB") || - !strcmp(block->b->b->name, "DB")) - block->num_instances = screen->info.max_se; - else if (!strcmp(block->b->b->name, "TCC")) - block->num_instances = screen->info.num_tcc_blocks; - else if (!strcmp(block->b->b->name, "IA")) - block->num_instances = MAX2(1, screen->info.max_se / 2); - - if (si_pc_block_has_per_instance_groups(pc, block)) { - block->num_groups = block->num_instances; - } else { - block->num_groups = 1; - } - - if (si_pc_block_has_per_se_groups(pc, block)) - block->num_groups *= screen->info.max_se; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) - block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits); - - pc->num_groups += block->num_groups; - } - - return; + struct si_perfcounters *pc; + const struct si_pc_block_gfxdescr *blocks; + unsigned num_blocks; + unsigned i; + + switch (screen->info.chip_class) { + case GFX7: + blocks = groups_CIK; + num_blocks = ARRAY_SIZE(groups_CIK); + break; + case GFX8: + blocks = groups_VI; + num_blocks = ARRAY_SIZE(groups_VI); + break; + case GFX9: + blocks = groups_gfx9; + num_blocks = ARRAY_SIZE(groups_gfx9); + break; + case GFX6: + default: + return; /* not implemented */ + } + + if (screen->info.max_sh_per_se != 1) { + /* This should not happen on non-GFX6 chips. 
*/ + fprintf(stderr, + "si_init_perfcounters: max_sh_per_se = %d not " + "supported (inaccurate performance counters)\n", + screen->info.max_sh_per_se); + } + + screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters); + if (!pc) + return; + + pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen); + pc->num_instance_cs_dwords = 3; + + pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false); + pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false); + + pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block)); + if (!pc->blocks) + goto error; + pc->num_blocks = num_blocks; + + for (i = 0; i < num_blocks; ++i) { + struct si_pc_block *block = &pc->blocks[i]; + block->b = &blocks[i]; + block->num_instances = MAX2(1, block->b->instances); + + if (!strcmp(block->b->b->name, "CB") || !strcmp(block->b->b->name, "DB")) + block->num_instances = screen->info.max_se; + else if (!strcmp(block->b->b->name, "TCC")) + block->num_instances = screen->info.num_tcc_blocks; + else if (!strcmp(block->b->b->name, "IA")) + block->num_instances = MAX2(1, screen->info.max_se / 2); + + if (si_pc_block_has_per_instance_groups(pc, block)) { + block->num_groups = block->num_instances; + } else { + block->num_groups = 1; + } + + if (si_pc_block_has_per_se_groups(pc, block)) + block->num_groups *= screen->info.max_se; + if (block->b->b->flags & SI_PC_BLOCK_SHADER) + block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits); + + pc->num_groups += block->num_groups; + } + + return; error: - si_destroy_perfcounters(screen); + si_destroy_perfcounters(screen); } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index d900467964b..816015d1f82 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -24,12 +24,15 @@ */ #include "si_pipe.h" + +#include "driver_ddebug/dd_util.h" +#include "gallium/winsys/amdgpu/drm/amdgpu_public.h" +#include "gallium/winsys/radeon/drm/radeon_drm_public.h" +#include "radeon/radeon_uvd.h" +#include "si_compute.h" #include "si_public.h" #include "si_shader_internal.h" -#include "si_compute.h" #include "sid.h" - -#include "radeon/radeon_uvd.h" #include "util/disk_cache.h" #include "util/u_log.h" #include "util/u_memory.h" @@ -38,128 +41,124 @@ #include "util/u_upload_mgr.h" #include "util/xmlconfig.h" #include "vl/vl_decoder.h" -#include "driver_ddebug/dd_util.h" -#include "gallium/winsys/radeon/drm/radeon_drm_public.h" -#include "gallium/winsys/amdgpu/drm/amdgpu_public.h" #include -static struct pipe_context *si_create_context(struct pipe_screen *screen, - unsigned flags); +static struct pipe_context *si_create_context(struct pipe_screen *screen, unsigned flags); static const struct debug_named_value debug_options[] = { - /* Shader logging options: */ - { "vs", DBG(VS), "Print vertex shaders" }, - { "ps", DBG(PS), "Print pixel shaders" }, - { "gs", DBG(GS), "Print geometry shaders" }, - { "tcs", DBG(TCS), "Print tessellation control shaders" }, - { "tes", DBG(TES), "Print tessellation evaluation shaders" }, - { "cs", DBG(CS), "Print compute shaders" }, - { "noir", DBG(NO_IR), "Don't print the LLVM IR"}, - { "nonir", DBG(NO_NIR), "Don't print NIR when printing shaders"}, - { "noasm", DBG(NO_ASM), "Don't print disassembled shaders"}, - { "preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations" }, - - /* Shader compiler options the shader cache should be aware of: */ - { "gisel", DBG(GISEL), "Enable LLVM global instruction selector." 
}, - { "w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders." }, - { "w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders." }, - { "w32cs", DBG(W32_CS), "Use Wave32 for computes shaders." }, - { "w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders." }, - { "w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders." }, - { "w64cs", DBG(W64_CS), "Use Wave64 for computes shaders." }, - - /* Shader compiler options (with no effect on the shader cache): */ - { "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" }, - { "mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand" }, - { "nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants." }, - - /* Information logging options: */ - { "info", DBG(INFO), "Print driver information" }, - { "tex", DBG(TEX), "Print texture info" }, - { "compute", DBG(COMPUTE), "Print compute info" }, - { "vm", DBG(VM), "Print virtual addresses when creating resources" }, - { "cache_stats", DBG(CACHE_STATS), "Print shader cache statistics." }, - - /* Driver options: */ - { "forcedma", DBG(FORCE_SDMA), "Use SDMA for all operations when possible." }, - { "nodma", DBG(NO_SDMA), "Disable SDMA" }, - { "nodmaclear", DBG(NO_SDMA_CLEARS), "Disable SDMA clears" }, - { "nodmacopyimage", DBG(NO_SDMA_COPY_IMAGE), "Disable SDMA image copies" }, - { "nowc", DBG(NO_WC), "Disable GTT write combining" }, - { "check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info." }, - { "reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context." }, - { "zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations." }, - - /* 3D engine options: */ - { "nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used." }, - { "nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline." }, - { "nggc", DBG(ALWAYS_NGG_CULLING), "Always use NGG culling even when it can hurt." }, - { "nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling." }, - { "alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader." }, - { "pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls." }, - { "nopd", DBG(NO_PD), "Disable the primitive discard compute shader." }, - { "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." }, - { "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" }, - { "nodpbb", DBG(NO_DPBB), "Disable DPBB." }, - { "nodfsm", DBG(NO_DFSM), "Disable DFSM." }, - { "dpbb", DBG(DPBB), "Enable DPBB." }, - { "dfsm", DBG(DFSM), "Enable DFSM." }, - { "nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z" }, - { "norbplus", DBG(NO_RB_PLUS), "Disable RB+." }, - { "no2d", DBG(NO_2D_TILING), "Disable 2D tiling" }, - { "notiling", DBG(NO_TILING), "Disable tiling" }, - { "nodcc", DBG(NO_DCC), "Disable DCC." }, - { "nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear." 
}, - { "nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer" }, - { "nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA" }, - { "nofmask", DBG(NO_FMASK), "Disable MSAA compression" }, - - DEBUG_NAMED_VALUE_END /* must be last */ + /* Shader logging options: */ + {"vs", DBG(VS), "Print vertex shaders"}, + {"ps", DBG(PS), "Print pixel shaders"}, + {"gs", DBG(GS), "Print geometry shaders"}, + {"tcs", DBG(TCS), "Print tessellation control shaders"}, + {"tes", DBG(TES), "Print tessellation evaluation shaders"}, + {"cs", DBG(CS), "Print compute shaders"}, + {"noir", DBG(NO_IR), "Don't print the LLVM IR"}, + {"nonir", DBG(NO_NIR), "Don't print NIR when printing shaders"}, + {"noasm", DBG(NO_ASM), "Don't print disassembled shaders"}, + {"preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations"}, + + /* Shader compiler options the shader cache should be aware of: */ + {"gisel", DBG(GISEL), "Enable LLVM global instruction selector."}, + {"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."}, + {"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."}, + {"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."}, + {"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."}, + {"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."}, + {"w64cs", DBG(W64_CS), "Use Wave64 for computes shaders."}, + + /* Shader compiler options (with no effect on the shader cache): */ + {"checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR"}, + {"mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand"}, + {"nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants."}, + + /* Information logging options: */ + {"info", DBG(INFO), "Print driver information"}, + {"tex", DBG(TEX), "Print texture info"}, + {"compute", DBG(COMPUTE), "Print compute info"}, + {"vm", DBG(VM), "Print virtual addresses when creating resources"}, + {"cache_stats", DBG(CACHE_STATS), "Print shader cache statistics."}, + + /* Driver options: */ + {"forcedma", DBG(FORCE_SDMA), "Use SDMA for all operations when possible."}, + {"nodma", DBG(NO_SDMA), "Disable SDMA"}, + {"nodmaclear", DBG(NO_SDMA_CLEARS), "Disable SDMA clears"}, + {"nodmacopyimage", DBG(NO_SDMA_COPY_IMAGE), "Disable SDMA image copies"}, + {"nowc", DBG(NO_WC), "Disable GTT write combining"}, + {"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."}, + {"reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context."}, + {"zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations."}, + + /* 3D engine options: */ + {"nogfx", DBG(NO_GFX), "Disable graphics. 
Only multimedia compute paths can be used."}, + {"nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline."}, + {"nggc", DBG(ALWAYS_NGG_CULLING), "Always use NGG culling even when it can hurt."}, + {"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."}, + {"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."}, + {"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."}, + {"nopd", DBG(NO_PD), "Disable the primitive discard compute shader."}, + {"switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet."}, + {"nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization"}, + {"nodpbb", DBG(NO_DPBB), "Disable DPBB."}, + {"nodfsm", DBG(NO_DFSM), "Disable DFSM."}, + {"dpbb", DBG(DPBB), "Enable DPBB."}, + {"dfsm", DBG(DFSM), "Enable DFSM."}, + {"nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z"}, + {"norbplus", DBG(NO_RB_PLUS), "Disable RB+."}, + {"no2d", DBG(NO_2D_TILING), "Disable 2D tiling"}, + {"notiling", DBG(NO_TILING), "Disable tiling"}, + {"nodcc", DBG(NO_DCC), "Disable DCC."}, + {"nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear."}, + {"nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer"}, + {"nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA"}, + {"nofmask", DBG(NO_FMASK), "Disable MSAA compression"}, + + DEBUG_NAMED_VALUE_END /* must be last */ }; static const struct debug_named_value test_options[] = { - /* Tests: */ - { "testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit." }, - { "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit." }, - { "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit." }, - { "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit." }, - { "testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance" }, - { "testgds", DBG(TEST_GDS), "Test GDS." }, - { "testgdsmm", DBG(TEST_GDS_MM), "Test GDS memory management." }, - { "testgdsoamm", DBG(TEST_GDS_OA_MM), "Test GDS OA memory management." }, - - DEBUG_NAMED_VALUE_END /* must be last */ + /* Tests: */ + {"testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit."}, + {"testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit."}, + {"testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit."}, + {"testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit."}, + {"testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance"}, + {"testgds", DBG(TEST_GDS), "Test GDS."}, + {"testgdsmm", DBG(TEST_GDS_MM), "Test GDS memory management."}, + {"testgdsoamm", DBG(TEST_GDS_OA_MM), "Test GDS OA memory management."}, + + DEBUG_NAMED_VALUE_END /* must be last */ }; void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler) { - /* Only create the less-optimizing version of the compiler on APUs - * predating Ryzen (Raven). */ - bool create_low_opt_compiler = !sscreen->info.has_dedicated_vram && - sscreen->info.chip_class <= GFX8; - - enum ac_target_machine_options tm_options = - (sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) | - (sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) | - (sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) | - (!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) | - (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) | - (create_low_opt_compiler ? 
AC_TM_CREATE_LOW_OPT : 0); - - ac_init_llvm_once(); - ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options); - compiler->passes = ac_create_llvm_passes(compiler->tm); - - if (compiler->tm_wave32) - compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32); - if (compiler->low_opt_tm) - compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm); + /* Only create the less-optimizing version of the compiler on APUs + * predating Ryzen (Raven). */ + bool create_low_opt_compiler = + !sscreen->info.has_dedicated_vram && sscreen->info.chip_class <= GFX8; + + enum ac_target_machine_options tm_options = + (sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) | + (sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) | + (sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) | + (!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) | + (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) | + (create_low_opt_compiler ? AC_TM_CREATE_LOW_OPT : 0); + + ac_init_llvm_once(); + ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options); + compiler->passes = ac_create_llvm_passes(compiler->tm); + + if (compiler->tm_wave32) + compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32); + if (compiler->low_opt_tm) + compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm); } static void si_destroy_compiler(struct ac_llvm_compiler *compiler) { - ac_destroy_llvm_compiler(compiler); + ac_destroy_llvm_compiler(compiler); } /* @@ -167,195 +166,191 @@ static void si_destroy_compiler(struct ac_llvm_compiler *compiler) */ static void si_destroy_context(struct pipe_context *context) { - struct si_context *sctx = (struct si_context *)context; - int i; - - /* Unreference the framebuffer normally to disable related logic - * properly. 
- */ - struct pipe_framebuffer_state fb = {}; - if (context->set_framebuffer_state) - context->set_framebuffer_state(context, &fb); - - si_release_all_descriptors(sctx); - - if (sctx->chip_class >= GFX10 && sctx->has_graphics) - gfx10_destroy_query(sctx); - - pipe_resource_reference(&sctx->esgs_ring, NULL); - pipe_resource_reference(&sctx->gsvs_ring, NULL); - pipe_resource_reference(&sctx->tess_rings, NULL); - pipe_resource_reference(&sctx->null_const_buf.buffer, NULL); - pipe_resource_reference(&sctx->sample_pos_buffer, NULL); - si_resource_reference(&sctx->border_color_buffer, NULL); - free(sctx->border_color_table); - si_resource_reference(&sctx->scratch_buffer, NULL); - si_resource_reference(&sctx->compute_scratch_buffer, NULL); - si_resource_reference(&sctx->wait_mem_scratch, NULL); - si_resource_reference(&sctx->small_prim_cull_info_buf, NULL); - - si_pm4_free_state(sctx, sctx->init_config, ~0); - if (sctx->init_config_gs_rings) - si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0); - for (i = 0; i < ARRAY_SIZE(sctx->vgt_shader_config); i++) - si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]); - - if (sctx->fixed_func_tcs_shader.cso) - sctx->b.delete_tcs_state(&sctx->b, sctx->fixed_func_tcs_shader.cso); - if (sctx->custom_dsa_flush) - sctx->b.delete_depth_stencil_alpha_state(&sctx->b, sctx->custom_dsa_flush); - if (sctx->custom_blend_resolve) - sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_resolve); - if (sctx->custom_blend_fmask_decompress) - sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_fmask_decompress); - if (sctx->custom_blend_eliminate_fastclear) - sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_eliminate_fastclear); - if (sctx->custom_blend_dcc_decompress) - sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_dcc_decompress); - if (sctx->vs_blit_pos) - sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos); - if (sctx->vs_blit_pos_layered) - sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos_layered); - if (sctx->vs_blit_color) - sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color); - if (sctx->vs_blit_color_layered) - sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered); - if (sctx->vs_blit_texcoord) - sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord); - if (sctx->cs_clear_buffer) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer); - if (sctx->cs_copy_buffer) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer); - if (sctx->cs_copy_image) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image); - if (sctx->cs_copy_image_1d_array) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array); - if (sctx->cs_clear_render_target) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target); - if (sctx->cs_clear_render_target_1d_array) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array); - if (sctx->cs_clear_12bytes_buffer) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer); - if (sctx->cs_dcc_retile) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile); - - for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_fmask_expand); i++) { - for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_fmask_expand[i]); j++) { - if (sctx->cs_fmask_expand[i][j]) { - sctx->b.delete_compute_state(&sctx->b, - sctx->cs_fmask_expand[i][j]); - } - } - } - - if (sctx->blitter) - util_blitter_destroy(sctx->blitter); - - /* Release DCC stats. 
*/ - for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) { - assert(!sctx->dcc_stats[i].query_active); - - for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++) - if (sctx->dcc_stats[i].ps_stats[j]) - sctx->b.destroy_query(&sctx->b, - sctx->dcc_stats[i].ps_stats[j]); - - si_texture_reference(&sctx->dcc_stats[i].tex, NULL); - } - - if (sctx->query_result_shader) - sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader); - if (sctx->sh_query_result_shader) - sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader); - - if (sctx->gfx_cs) - sctx->ws->cs_destroy(sctx->gfx_cs); - if (sctx->sdma_cs) - sctx->ws->cs_destroy(sctx->sdma_cs); - if (sctx->ctx) - sctx->ws->ctx_destroy(sctx->ctx); - - if (sctx->b.stream_uploader) - u_upload_destroy(sctx->b.stream_uploader); - if (sctx->b.const_uploader) - u_upload_destroy(sctx->b.const_uploader); - if (sctx->cached_gtt_allocator) - u_upload_destroy(sctx->cached_gtt_allocator); - - slab_destroy_child(&sctx->pool_transfers); - slab_destroy_child(&sctx->pool_transfers_unsync); - - if (sctx->allocator_zeroed_memory) - u_suballocator_destroy(sctx->allocator_zeroed_memory); - - sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL); - sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL); - sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL); - si_resource_reference(&sctx->eop_bug_scratch, NULL); - si_resource_reference(&sctx->index_ring, NULL); - si_resource_reference(&sctx->barrier_buf, NULL); - si_resource_reference(&sctx->last_ib_barrier_buf, NULL); - pb_reference(&sctx->gds, NULL); - pb_reference(&sctx->gds_oa, NULL); - - si_destroy_compiler(&sctx->compiler); - - si_saved_cs_reference(&sctx->current_saved_cs, NULL); - - _mesa_hash_table_destroy(sctx->tex_handles, NULL); - _mesa_hash_table_destroy(sctx->img_handles, NULL); - - util_dynarray_fini(&sctx->resident_tex_handles); - util_dynarray_fini(&sctx->resident_img_handles); - util_dynarray_fini(&sctx->resident_tex_needs_color_decompress); - util_dynarray_fini(&sctx->resident_img_needs_color_decompress); - util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress); - si_unref_sdma_uploads(sctx); - free(sctx->sdma_uploads); - FREE(sctx); + struct si_context *sctx = (struct si_context *)context; + int i; + + /* Unreference the framebuffer normally to disable related logic + * properly. 
+ */ + struct pipe_framebuffer_state fb = {}; + if (context->set_framebuffer_state) + context->set_framebuffer_state(context, &fb); + + si_release_all_descriptors(sctx); + + if (sctx->chip_class >= GFX10 && sctx->has_graphics) + gfx10_destroy_query(sctx); + + pipe_resource_reference(&sctx->esgs_ring, NULL); + pipe_resource_reference(&sctx->gsvs_ring, NULL); + pipe_resource_reference(&sctx->tess_rings, NULL); + pipe_resource_reference(&sctx->null_const_buf.buffer, NULL); + pipe_resource_reference(&sctx->sample_pos_buffer, NULL); + si_resource_reference(&sctx->border_color_buffer, NULL); + free(sctx->border_color_table); + si_resource_reference(&sctx->scratch_buffer, NULL); + si_resource_reference(&sctx->compute_scratch_buffer, NULL); + si_resource_reference(&sctx->wait_mem_scratch, NULL); + si_resource_reference(&sctx->small_prim_cull_info_buf, NULL); + + si_pm4_free_state(sctx, sctx->init_config, ~0); + if (sctx->init_config_gs_rings) + si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0); + for (i = 0; i < ARRAY_SIZE(sctx->vgt_shader_config); i++) + si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]); + + if (sctx->fixed_func_tcs_shader.cso) + sctx->b.delete_tcs_state(&sctx->b, sctx->fixed_func_tcs_shader.cso); + if (sctx->custom_dsa_flush) + sctx->b.delete_depth_stencil_alpha_state(&sctx->b, sctx->custom_dsa_flush); + if (sctx->custom_blend_resolve) + sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_resolve); + if (sctx->custom_blend_fmask_decompress) + sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_fmask_decompress); + if (sctx->custom_blend_eliminate_fastclear) + sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_eliminate_fastclear); + if (sctx->custom_blend_dcc_decompress) + sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_dcc_decompress); + if (sctx->vs_blit_pos) + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos); + if (sctx->vs_blit_pos_layered) + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos_layered); + if (sctx->vs_blit_color) + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color); + if (sctx->vs_blit_color_layered) + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered); + if (sctx->vs_blit_texcoord) + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord); + if (sctx->cs_clear_buffer) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer); + if (sctx->cs_copy_buffer) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer); + if (sctx->cs_copy_image) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image); + if (sctx->cs_copy_image_1d_array) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array); + if (sctx->cs_clear_render_target) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target); + if (sctx->cs_clear_render_target_1d_array) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array); + if (sctx->cs_clear_12bytes_buffer) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer); + if (sctx->cs_dcc_retile) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile); + + for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_fmask_expand); i++) { + for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_fmask_expand[i]); j++) { + if (sctx->cs_fmask_expand[i][j]) { + sctx->b.delete_compute_state(&sctx->b, sctx->cs_fmask_expand[i][j]); + } + } + } + + if (sctx->blitter) + util_blitter_destroy(sctx->blitter); + + /* Release DCC stats. 
*/ + for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) { + assert(!sctx->dcc_stats[i].query_active); + + for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++) + if (sctx->dcc_stats[i].ps_stats[j]) + sctx->b.destroy_query(&sctx->b, sctx->dcc_stats[i].ps_stats[j]); + + si_texture_reference(&sctx->dcc_stats[i].tex, NULL); + } + + if (sctx->query_result_shader) + sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader); + if (sctx->sh_query_result_shader) + sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader); + + if (sctx->gfx_cs) + sctx->ws->cs_destroy(sctx->gfx_cs); + if (sctx->sdma_cs) + sctx->ws->cs_destroy(sctx->sdma_cs); + if (sctx->ctx) + sctx->ws->ctx_destroy(sctx->ctx); + + if (sctx->b.stream_uploader) + u_upload_destroy(sctx->b.stream_uploader); + if (sctx->b.const_uploader) + u_upload_destroy(sctx->b.const_uploader); + if (sctx->cached_gtt_allocator) + u_upload_destroy(sctx->cached_gtt_allocator); + + slab_destroy_child(&sctx->pool_transfers); + slab_destroy_child(&sctx->pool_transfers_unsync); + + if (sctx->allocator_zeroed_memory) + u_suballocator_destroy(sctx->allocator_zeroed_memory); + + sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL); + sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL); + sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL); + si_resource_reference(&sctx->eop_bug_scratch, NULL); + si_resource_reference(&sctx->index_ring, NULL); + si_resource_reference(&sctx->barrier_buf, NULL); + si_resource_reference(&sctx->last_ib_barrier_buf, NULL); + pb_reference(&sctx->gds, NULL); + pb_reference(&sctx->gds_oa, NULL); + + si_destroy_compiler(&sctx->compiler); + + si_saved_cs_reference(&sctx->current_saved_cs, NULL); + + _mesa_hash_table_destroy(sctx->tex_handles, NULL); + _mesa_hash_table_destroy(sctx->img_handles, NULL); + + util_dynarray_fini(&sctx->resident_tex_handles); + util_dynarray_fini(&sctx->resident_img_handles); + util_dynarray_fini(&sctx->resident_tex_needs_color_decompress); + util_dynarray_fini(&sctx->resident_img_needs_color_decompress); + util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress); + si_unref_sdma_uploads(sctx); + free(sctx->sdma_uploads); + FREE(sctx); } static enum pipe_reset_status si_get_reset_status(struct pipe_context *ctx) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_screen *sscreen = sctx->screen; - enum pipe_reset_status status = sctx->ws->ctx_query_reset_status(sctx->ctx); - - if (status != PIPE_NO_RESET) { - /* Call the state tracker to set a no-op API dispatch. */ - if (sctx->device_reset_callback.reset) { - sctx->device_reset_callback.reset(sctx->device_reset_callback.data, - status); - } - - /* Re-create the auxiliary context, because it won't submit - * any new IBs due to a GPU reset. - */ - simple_mtx_lock(&sscreen->aux_context_lock); - - struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log; - sscreen->aux_context->set_log_context(sscreen->aux_context, NULL); - sscreen->aux_context->destroy(sscreen->aux_context); - - sscreen->aux_context = si_create_context(&sscreen->b, - (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) | - (sscreen->info.has_graphics ? 
0 : PIPE_CONTEXT_COMPUTE_ONLY)); - sscreen->aux_context->set_log_context(sscreen->aux_context, aux_log); - simple_mtx_unlock(&sscreen->aux_context_lock); - } - return status; + struct si_context *sctx = (struct si_context *)ctx; + struct si_screen *sscreen = sctx->screen; + enum pipe_reset_status status = sctx->ws->ctx_query_reset_status(sctx->ctx); + + if (status != PIPE_NO_RESET) { + /* Call the state tracker to set a no-op API dispatch. */ + if (sctx->device_reset_callback.reset) { + sctx->device_reset_callback.reset(sctx->device_reset_callback.data, status); + } + + /* Re-create the auxiliary context, because it won't submit + * any new IBs due to a GPU reset. + */ + simple_mtx_lock(&sscreen->aux_context_lock); + + struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log; + sscreen->aux_context->set_log_context(sscreen->aux_context, NULL); + sscreen->aux_context->destroy(sscreen->aux_context); + + sscreen->aux_context = si_create_context( + &sscreen->b, (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) | + (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY)); + sscreen->aux_context->set_log_context(sscreen->aux_context, aux_log); + simple_mtx_unlock(&sscreen->aux_context_lock); + } + return status; } static void si_set_device_reset_callback(struct pipe_context *ctx, - const struct pipe_device_reset_callback *cb) + const struct pipe_device_reset_callback *cb) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (cb) - sctx->device_reset_callback = *cb; - else - memset(&sctx->device_reset_callback, 0, - sizeof(sctx->device_reset_callback)); + if (cb) + sctx->device_reset_callback = *cb; + else + memset(&sctx->device_reset_callback, 0, sizeof(sctx->device_reset_callback)); } /* Apitrace profiling: @@ -366,989 +361,895 @@ static void si_set_device_reset_callback(struct pipe_context *ctx, * call and print the results. * 4) glretrace --benchmark --markers .. 
*/ -static void si_emit_string_marker(struct pipe_context *ctx, - const char *string, int len) +static void si_emit_string_marker(struct pipe_context *ctx, const char *string, int len) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number); + dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number); - if (sctx->log) - u_log_printf(sctx->log, "\nString marker: %*s\n", len, string); + if (sctx->log) + u_log_printf(sctx->log, "\nString marker: %*s\n", len, string); } -static void si_set_debug_callback(struct pipe_context *ctx, - const struct pipe_debug_callback *cb) +static void si_set_debug_callback(struct pipe_context *ctx, const struct pipe_debug_callback *cb) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_screen *screen = sctx->screen; + struct si_context *sctx = (struct si_context *)ctx; + struct si_screen *screen = sctx->screen; - util_queue_finish(&screen->shader_compiler_queue); - util_queue_finish(&screen->shader_compiler_queue_low_priority); + util_queue_finish(&screen->shader_compiler_queue); + util_queue_finish(&screen->shader_compiler_queue_low_priority); - if (cb) - sctx->debug = *cb; - else - memset(&sctx->debug, 0, sizeof(sctx->debug)); + if (cb) + sctx->debug = *cb; + else + memset(&sctx->debug, 0, sizeof(sctx->debug)); } -static void si_set_log_context(struct pipe_context *ctx, - struct u_log_context *log) +static void si_set_log_context(struct pipe_context *ctx, struct u_log_context *log) { - struct si_context *sctx = (struct si_context *)ctx; - sctx->log = log; + struct si_context *sctx = (struct si_context *)ctx; + sctx->log = log; - if (log) - u_log_add_auto_logger(log, si_auto_log_cs, sctx); + if (log) + u_log_add_auto_logger(log, si_auto_log_cs, sctx); } -static void si_set_context_param(struct pipe_context *ctx, - enum pipe_context_param param, - unsigned value) +static void si_set_context_param(struct pipe_context *ctx, enum pipe_context_param param, + unsigned value) { - struct radeon_winsys *ws = ((struct si_context *)ctx)->ws; - - switch (param) { - case PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE: - ws->pin_threads_to_L3_cache(ws, value); - break; - default:; - } + struct radeon_winsys *ws = ((struct si_context *)ctx)->ws; + + switch (param) { + case PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE: + ws->pin_threads_to_L3_cache(ws, value); + break; + default:; + } } -static struct pipe_context *si_create_context(struct pipe_screen *screen, - unsigned flags) +static struct pipe_context *si_create_context(struct pipe_screen *screen, unsigned flags) { - struct si_screen* sscreen = (struct si_screen *)screen; - STATIC_ASSERT(DBG_COUNT <= 64); - - /* Don't create a context if it's not compute-only and hw is compute-only. */ - if (!sscreen->info.has_graphics && - !(flags & PIPE_CONTEXT_COMPUTE_ONLY)) - return NULL; - - struct si_context *sctx = CALLOC_STRUCT(si_context); - struct radeon_winsys *ws = sscreen->ws; - int shader, i; - bool stop_exec_on_failure = (flags & PIPE_CONTEXT_LOSE_CONTEXT_ON_RESET) != 0; - - if (!sctx) - return NULL; - - sctx->has_graphics = sscreen->info.chip_class == GFX6 || - !(flags & PIPE_CONTEXT_COMPUTE_ONLY); - - if (flags & PIPE_CONTEXT_DEBUG) - sscreen->record_llvm_ir = true; /* racy but not critical */ - - sctx->b.screen = screen; /* this must be set first */ - sctx->b.priv = NULL; - sctx->b.destroy = si_destroy_context; - sctx->screen = sscreen; /* Easy accessing of screen/winsys. 
*/ - sctx->is_debug = (flags & PIPE_CONTEXT_DEBUG) != 0; - - slab_create_child(&sctx->pool_transfers, &sscreen->pool_transfers); - slab_create_child(&sctx->pool_transfers_unsync, &sscreen->pool_transfers); - - sctx->ws = sscreen->ws; - sctx->family = sscreen->info.family; - sctx->chip_class = sscreen->info.chip_class; - - if (sctx->chip_class == GFX7 || - sctx->chip_class == GFX8 || - sctx->chip_class == GFX9) { - sctx->eop_bug_scratch = si_resource( - pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT, - 16 * sscreen->info.num_render_backends)); - if (!sctx->eop_bug_scratch) - goto fail; - } - - /* Initialize context allocators. */ - sctx->allocator_zeroed_memory = - u_suballocator_create(&sctx->b, 128 * 1024, - 0, PIPE_USAGE_DEFAULT, - SI_RESOURCE_FLAG_UNMAPPABLE | - SI_RESOURCE_FLAG_CLEAR, false); - if (!sctx->allocator_zeroed_memory) - goto fail; - - sctx->b.stream_uploader = u_upload_create(&sctx->b, 1024 * 1024, - 0, PIPE_USAGE_STREAM, - SI_RESOURCE_FLAG_READ_ONLY); - if (!sctx->b.stream_uploader) - goto fail; - - sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024, - 0, PIPE_USAGE_STAGING, 0); - if (!sctx->cached_gtt_allocator) - goto fail; - - sctx->ctx = sctx->ws->ctx_create(sctx->ws); - if (!sctx->ctx) - goto fail; - - if (sscreen->info.num_rings[RING_DMA] && - !(sscreen->debug_flags & DBG(NO_SDMA)) && - /* SDMA causes corruption on RX 580: - * https://gitlab.freedesktop.org/mesa/mesa/issues/1399 - * https://gitlab.freedesktop.org/mesa/mesa/issues/1889 - */ - (sctx->chip_class != GFX8 || sscreen->debug_flags & DBG(FORCE_SDMA)) && - /* SDMA timeouts sometimes on gfx10 so disable it for now. See: - * https://bugs.freedesktop.org/show_bug.cgi?id=111481 - * https://gitlab.freedesktop.org/mesa/mesa/issues/1907 - */ - (sctx->chip_class != GFX10 || sscreen->debug_flags & DBG(FORCE_SDMA))) { - sctx->sdma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA, - (void*)si_flush_dma_cs, - sctx, stop_exec_on_failure); - } - - bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->sdma_cs; - sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024, - 0, PIPE_USAGE_DEFAULT, - SI_RESOURCE_FLAG_32BIT | - (use_sdma_upload ? - SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0)); - if (!sctx->b.const_uploader) - goto fail; - - if (use_sdma_upload) - u_upload_enable_flush_explicit(sctx->b.const_uploader); - - sctx->gfx_cs = ws->cs_create(sctx->ctx, - sctx->has_graphics ? RING_GFX : RING_COMPUTE, - (void*)si_flush_gfx_cs, sctx, stop_exec_on_failure); - - /* Border colors. */ - sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS * - sizeof(*sctx->border_color_table)); - if (!sctx->border_color_table) - goto fail; - - sctx->border_color_buffer = si_resource( - pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, - SI_MAX_BORDER_COLORS * - sizeof(*sctx->border_color_table))); - if (!sctx->border_color_buffer) - goto fail; - - sctx->border_color_map = - ws->buffer_map(sctx->border_color_buffer->buf, - NULL, PIPE_TRANSFER_WRITE); - if (!sctx->border_color_map) - goto fail; - - sctx->ngg = sscreen->use_ngg; - - /* Initialize context functions used by graphics and compute. 
*/ - if (sctx->chip_class >= GFX10) - sctx->emit_cache_flush = gfx10_emit_cache_flush; - else - sctx->emit_cache_flush = si_emit_cache_flush; - - sctx->b.emit_string_marker = si_emit_string_marker; - sctx->b.set_debug_callback = si_set_debug_callback; - sctx->b.set_log_context = si_set_log_context; - sctx->b.set_context_param = si_set_context_param; - sctx->b.get_device_reset_status = si_get_reset_status; - sctx->b.set_device_reset_callback = si_set_device_reset_callback; - - si_init_all_descriptors(sctx); - si_init_buffer_functions(sctx); - si_init_clear_functions(sctx); - si_init_blit_functions(sctx); - si_init_compute_functions(sctx); - si_init_compute_blit_functions(sctx); - si_init_debug_functions(sctx); - si_init_fence_functions(sctx); - si_init_query_functions(sctx); - si_init_state_compute_functions(sctx); - si_init_context_texture_functions(sctx); - - /* Initialize graphics-only context functions. */ - if (sctx->has_graphics) { - if (sctx->chip_class >= GFX10) - gfx10_init_query(sctx); - si_init_msaa_functions(sctx); - si_init_shader_functions(sctx); - si_init_state_functions(sctx); - si_init_streamout_functions(sctx); - si_init_viewport_functions(sctx); - - sctx->blitter = util_blitter_create(&sctx->b); - if (sctx->blitter == NULL) - goto fail; - sctx->blitter->skip_viewport_restore = true; - - /* Some states are expected to be always non-NULL. */ - sctx->noop_blend = util_blitter_get_noop_blend_state(sctx->blitter); - sctx->queued.named.blend = sctx->noop_blend; - - sctx->noop_dsa = util_blitter_get_noop_dsa_state(sctx->blitter); - sctx->queued.named.dsa = sctx->noop_dsa; - - sctx->discard_rasterizer_state = - util_blitter_get_discard_rasterizer_state(sctx->blitter); - sctx->queued.named.rasterizer = sctx->discard_rasterizer_state; - - si_init_draw_functions(sctx); - - /* If aux_context == NULL, we are initializing aux_context right now. */ - bool is_aux_context = !sscreen->aux_context; - si_initialize_prim_discard_tunables(sscreen, is_aux_context, - &sctx->prim_discard_vertex_count_threshold, - &sctx->index_ring_size_per_ib); - } - - /* Initialize SDMA functions. */ - if (sctx->chip_class >= GFX7) - cik_init_sdma_functions(sctx); - else - sctx->dma_copy = si_resource_copy_region; - - if (sscreen->debug_flags & DBG(FORCE_SDMA)) - sctx->b.resource_copy_region = sctx->dma_copy; - - sctx->sample_mask = 0xffff; - - /* Initialize multimedia functions. */ - if (sscreen->info.has_hw_decode) { - sctx->b.create_video_codec = si_uvd_create_decoder; - sctx->b.create_video_buffer = si_video_buffer_create; - } else { - sctx->b.create_video_codec = vl_create_decoder; - sctx->b.create_video_buffer = vl_video_buffer_create; - } - - if (sctx->chip_class >= GFX9 || - si_compute_prim_discard_enabled(sctx)) { - sctx->wait_mem_scratch = si_resource( - pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8)); - if (!sctx->wait_mem_scratch) - goto fail; - - /* Initialize the memory. */ - si_cp_write_data(sctx, sctx->wait_mem_scratch, 0, 4, - V_370_MEM, V_370_ME, &sctx->wait_mem_number); - } - - /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads - * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */ - if (sctx->chip_class == GFX7) { - sctx->null_const_buf.buffer = - pipe_aligned_buffer_create(screen, - SI_RESOURCE_FLAG_32BIT, - PIPE_USAGE_DEFAULT, 16, - sctx->screen->info.tcc_cache_line_size); - if (!sctx->null_const_buf.buffer) - goto fail; - sctx->null_const_buf.buffer_size = sctx->null_const_buf.buffer->width0; - - unsigned start_shader = sctx->has_graphics ? 
0 : PIPE_SHADER_COMPUTE; - for (shader = start_shader; shader < SI_NUM_SHADERS; shader++) { - for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) { - sctx->b.set_constant_buffer(&sctx->b, shader, i, - &sctx->null_const_buf); - } - } - - si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, - &sctx->null_const_buf); - si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, - &sctx->null_const_buf); - si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, - &sctx->null_const_buf); - si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, - &sctx->null_const_buf); - si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, - &sctx->null_const_buf); - } - - uint64_t max_threads_per_block; - screen->get_compute_param(screen, PIPE_SHADER_IR_NIR, - PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK, - &max_threads_per_block); - - /* The maximum number of scratch waves. Scratch space isn't divided - * evenly between CUs. The number is only a function of the number of CUs. - * We can decrease the constant to decrease the scratch buffer size. - * - * sctx->scratch_waves must be >= the maximum posible size of - * 1 threadgroup, so that the hw doesn't hang from being unable - * to start any. - * - * The recommended value is 4 per CU at most. Higher numbers don't - * bring much benefit, but they still occupy chip resources (think - * async compute). I've seen ~2% performance difference between 4 and 32. - */ - sctx->scratch_waves = MAX2(32 * sscreen->info.num_good_compute_units, - max_threads_per_block / 64); - - /* Bindless handles. */ - sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - - util_dynarray_init(&sctx->resident_tex_handles, NULL); - util_dynarray_init(&sctx->resident_img_handles, NULL); - util_dynarray_init(&sctx->resident_tex_needs_color_decompress, NULL); - util_dynarray_init(&sctx->resident_img_needs_color_decompress, NULL); - util_dynarray_init(&sctx->resident_tex_needs_depth_decompress, NULL); - - sctx->sample_pos_buffer = - pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT, - sizeof(sctx->sample_positions)); - pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0, - sizeof(sctx->sample_positions), &sctx->sample_positions); - - /* this must be last */ - si_begin_new_gfx_cs(sctx); - - if (sctx->chip_class == GFX7) { - /* Clear the NULL constant buffer, because loads should return zeros. - * Note that this forces CP DMA to be used, because clover deadlocks - * for some reason when the compute codepath is used. - */ - uint32_t clear_value = 0; - si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0, - sctx->null_const_buf.buffer->width0, - &clear_value, 4, SI_COHERENCY_SHADER, true); - } - return &sctx->b; + struct si_screen *sscreen = (struct si_screen *)screen; + STATIC_ASSERT(DBG_COUNT <= 64); + + /* Don't create a context if it's not compute-only and hw is compute-only. 
*/ + if (!sscreen->info.has_graphics && !(flags & PIPE_CONTEXT_COMPUTE_ONLY)) + return NULL; + + struct si_context *sctx = CALLOC_STRUCT(si_context); + struct radeon_winsys *ws = sscreen->ws; + int shader, i; + bool stop_exec_on_failure = (flags & PIPE_CONTEXT_LOSE_CONTEXT_ON_RESET) != 0; + + if (!sctx) + return NULL; + + sctx->has_graphics = sscreen->info.chip_class == GFX6 || !(flags & PIPE_CONTEXT_COMPUTE_ONLY); + + if (flags & PIPE_CONTEXT_DEBUG) + sscreen->record_llvm_ir = true; /* racy but not critical */ + + sctx->b.screen = screen; /* this must be set first */ + sctx->b.priv = NULL; + sctx->b.destroy = si_destroy_context; + sctx->screen = sscreen; /* Easy accessing of screen/winsys. */ + sctx->is_debug = (flags & PIPE_CONTEXT_DEBUG) != 0; + + slab_create_child(&sctx->pool_transfers, &sscreen->pool_transfers); + slab_create_child(&sctx->pool_transfers_unsync, &sscreen->pool_transfers); + + sctx->ws = sscreen->ws; + sctx->family = sscreen->info.family; + sctx->chip_class = sscreen->info.chip_class; + + if (sctx->chip_class == GFX7 || sctx->chip_class == GFX8 || sctx->chip_class == GFX9) { + sctx->eop_bug_scratch = si_resource(pipe_buffer_create( + &sscreen->b, 0, PIPE_USAGE_DEFAULT, 16 * sscreen->info.num_render_backends)); + if (!sctx->eop_bug_scratch) + goto fail; + } + + /* Initialize context allocators. */ + sctx->allocator_zeroed_memory = + u_suballocator_create(&sctx->b, 128 * 1024, 0, PIPE_USAGE_DEFAULT, + SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_CLEAR, false); + if (!sctx->allocator_zeroed_memory) + goto fail; + + sctx->b.stream_uploader = + u_upload_create(&sctx->b, 1024 * 1024, 0, PIPE_USAGE_STREAM, SI_RESOURCE_FLAG_READ_ONLY); + if (!sctx->b.stream_uploader) + goto fail; + + sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024, 0, PIPE_USAGE_STAGING, 0); + if (!sctx->cached_gtt_allocator) + goto fail; + + sctx->ctx = sctx->ws->ctx_create(sctx->ws); + if (!sctx->ctx) + goto fail; + + if (sscreen->info.num_rings[RING_DMA] && !(sscreen->debug_flags & DBG(NO_SDMA)) && + /* SDMA causes corruption on RX 580: + * https://gitlab.freedesktop.org/mesa/mesa/issues/1399 + * https://gitlab.freedesktop.org/mesa/mesa/issues/1889 + */ + (sctx->chip_class != GFX8 || sscreen->debug_flags & DBG(FORCE_SDMA)) && + /* SDMA timeouts sometimes on gfx10 so disable it for now. See: + * https://bugs.freedesktop.org/show_bug.cgi?id=111481 + * https://gitlab.freedesktop.org/mesa/mesa/issues/1907 + */ + (sctx->chip_class != GFX10 || sscreen->debug_flags & DBG(FORCE_SDMA))) { + sctx->sdma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA, (void *)si_flush_dma_cs, sctx, + stop_exec_on_failure); + } + + bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->sdma_cs; + sctx->b.const_uploader = + u_upload_create(&sctx->b, 256 * 1024, 0, PIPE_USAGE_DEFAULT, + SI_RESOURCE_FLAG_32BIT | + (use_sdma_upload ? SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0)); + if (!sctx->b.const_uploader) + goto fail; + + if (use_sdma_upload) + u_upload_enable_flush_explicit(sctx->b.const_uploader); + + sctx->gfx_cs = ws->cs_create(sctx->ctx, sctx->has_graphics ? RING_GFX : RING_COMPUTE, + (void *)si_flush_gfx_cs, sctx, stop_exec_on_failure); + + /* Border colors. 
*/ + sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS * sizeof(*sctx->border_color_table)); + if (!sctx->border_color_table) + goto fail; + + sctx->border_color_buffer = si_resource(pipe_buffer_create( + screen, 0, PIPE_USAGE_DEFAULT, SI_MAX_BORDER_COLORS * sizeof(*sctx->border_color_table))); + if (!sctx->border_color_buffer) + goto fail; + + sctx->border_color_map = + ws->buffer_map(sctx->border_color_buffer->buf, NULL, PIPE_TRANSFER_WRITE); + if (!sctx->border_color_map) + goto fail; + + sctx->ngg = sscreen->use_ngg; + + /* Initialize context functions used by graphics and compute. */ + if (sctx->chip_class >= GFX10) + sctx->emit_cache_flush = gfx10_emit_cache_flush; + else + sctx->emit_cache_flush = si_emit_cache_flush; + + sctx->b.emit_string_marker = si_emit_string_marker; + sctx->b.set_debug_callback = si_set_debug_callback; + sctx->b.set_log_context = si_set_log_context; + sctx->b.set_context_param = si_set_context_param; + sctx->b.get_device_reset_status = si_get_reset_status; + sctx->b.set_device_reset_callback = si_set_device_reset_callback; + + si_init_all_descriptors(sctx); + si_init_buffer_functions(sctx); + si_init_clear_functions(sctx); + si_init_blit_functions(sctx); + si_init_compute_functions(sctx); + si_init_compute_blit_functions(sctx); + si_init_debug_functions(sctx); + si_init_fence_functions(sctx); + si_init_query_functions(sctx); + si_init_state_compute_functions(sctx); + si_init_context_texture_functions(sctx); + + /* Initialize graphics-only context functions. */ + if (sctx->has_graphics) { + if (sctx->chip_class >= GFX10) + gfx10_init_query(sctx); + si_init_msaa_functions(sctx); + si_init_shader_functions(sctx); + si_init_state_functions(sctx); + si_init_streamout_functions(sctx); + si_init_viewport_functions(sctx); + + sctx->blitter = util_blitter_create(&sctx->b); + if (sctx->blitter == NULL) + goto fail; + sctx->blitter->skip_viewport_restore = true; + + /* Some states are expected to be always non-NULL. */ + sctx->noop_blend = util_blitter_get_noop_blend_state(sctx->blitter); + sctx->queued.named.blend = sctx->noop_blend; + + sctx->noop_dsa = util_blitter_get_noop_dsa_state(sctx->blitter); + sctx->queued.named.dsa = sctx->noop_dsa; + + sctx->discard_rasterizer_state = util_blitter_get_discard_rasterizer_state(sctx->blitter); + sctx->queued.named.rasterizer = sctx->discard_rasterizer_state; + + si_init_draw_functions(sctx); + + /* If aux_context == NULL, we are initializing aux_context right now. */ + bool is_aux_context = !sscreen->aux_context; + si_initialize_prim_discard_tunables(sscreen, is_aux_context, + &sctx->prim_discard_vertex_count_threshold, + &sctx->index_ring_size_per_ib); + } + + /* Initialize SDMA functions. */ + if (sctx->chip_class >= GFX7) + cik_init_sdma_functions(sctx); + else + sctx->dma_copy = si_resource_copy_region; + + if (sscreen->debug_flags & DBG(FORCE_SDMA)) + sctx->b.resource_copy_region = sctx->dma_copy; + + sctx->sample_mask = 0xffff; + + /* Initialize multimedia functions. */ + if (sscreen->info.has_hw_decode) { + sctx->b.create_video_codec = si_uvd_create_decoder; + sctx->b.create_video_buffer = si_video_buffer_create; + } else { + sctx->b.create_video_codec = vl_create_decoder; + sctx->b.create_video_buffer = vl_video_buffer_create; + } + + if (sctx->chip_class >= GFX9 || si_compute_prim_discard_enabled(sctx)) { + sctx->wait_mem_scratch = si_resource(pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8)); + if (!sctx->wait_mem_scratch) + goto fail; + + /* Initialize the memory. 
*/ + si_cp_write_data(sctx, sctx->wait_mem_scratch, 0, 4, V_370_MEM, V_370_ME, + &sctx->wait_mem_number); + } + + /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads + * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */ + if (sctx->chip_class == GFX7) { + sctx->null_const_buf.buffer = + pipe_aligned_buffer_create(screen, SI_RESOURCE_FLAG_32BIT, PIPE_USAGE_DEFAULT, 16, + sctx->screen->info.tcc_cache_line_size); + if (!sctx->null_const_buf.buffer) + goto fail; + sctx->null_const_buf.buffer_size = sctx->null_const_buf.buffer->width0; + + unsigned start_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE; + for (shader = start_shader; shader < SI_NUM_SHADERS; shader++) { + for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) { + sctx->b.set_constant_buffer(&sctx->b, shader, i, &sctx->null_const_buf); + } + } + + si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &sctx->null_const_buf); + si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &sctx->null_const_buf); + si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &sctx->null_const_buf); + si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &sctx->null_const_buf); + si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &sctx->null_const_buf); + } + + uint64_t max_threads_per_block; + screen->get_compute_param(screen, PIPE_SHADER_IR_NIR, PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK, + &max_threads_per_block); + + /* The maximum number of scratch waves. Scratch space isn't divided + * evenly between CUs. The number is only a function of the number of CUs. + * We can decrease the constant to decrease the scratch buffer size. + * + * sctx->scratch_waves must be >= the maximum posible size of + * 1 threadgroup, so that the hw doesn't hang from being unable + * to start any. + * + * The recommended value is 4 per CU at most. Higher numbers don't + * bring much benefit, but they still occupy chip resources (think + * async compute). I've seen ~2% performance difference between 4 and 32. + */ + sctx->scratch_waves = + MAX2(32 * sscreen->info.num_good_compute_units, max_threads_per_block / 64); + + /* Bindless handles. */ + sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); + sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); + + util_dynarray_init(&sctx->resident_tex_handles, NULL); + util_dynarray_init(&sctx->resident_img_handles, NULL); + util_dynarray_init(&sctx->resident_tex_needs_color_decompress, NULL); + util_dynarray_init(&sctx->resident_img_needs_color_decompress, NULL); + util_dynarray_init(&sctx->resident_tex_needs_depth_decompress, NULL); + + sctx->sample_pos_buffer = + pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT, sizeof(sctx->sample_positions)); + pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0, sizeof(sctx->sample_positions), + &sctx->sample_positions); + + /* this must be last */ + si_begin_new_gfx_cs(sctx); + + if (sctx->chip_class == GFX7) { + /* Clear the NULL constant buffer, because loads should return zeros. + * Note that this forces CP DMA to be used, because clover deadlocks + * for some reason when the compute codepath is used. 
+ */ + uint32_t clear_value = 0; + si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0, sctx->null_const_buf.buffer->width0, + &clear_value, 4, SI_COHERENCY_SHADER, true); + } + return &sctx->b; fail: - fprintf(stderr, "radeonsi: Failed to create a context.\n"); - si_destroy_context(&sctx->b); - return NULL; + fprintf(stderr, "radeonsi: Failed to create a context.\n"); + si_destroy_context(&sctx->b); + return NULL; } -static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, - void *priv, unsigned flags) +static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, void *priv, + unsigned flags) { - struct si_screen *sscreen = (struct si_screen *)screen; - struct pipe_context *ctx; + struct si_screen *sscreen = (struct si_screen *)screen; + struct pipe_context *ctx; - if (sscreen->debug_flags & DBG(CHECK_VM)) - flags |= PIPE_CONTEXT_DEBUG; + if (sscreen->debug_flags & DBG(CHECK_VM)) + flags |= PIPE_CONTEXT_DEBUG; - ctx = si_create_context(screen, flags); + ctx = si_create_context(screen, flags); - if (!(flags & PIPE_CONTEXT_PREFER_THREADED)) - return ctx; + if (!(flags & PIPE_CONTEXT_PREFER_THREADED)) + return ctx; - /* Clover (compute-only) is unsupported. */ - if (flags & PIPE_CONTEXT_COMPUTE_ONLY) - return ctx; + /* Clover (compute-only) is unsupported. */ + if (flags & PIPE_CONTEXT_COMPUTE_ONLY) + return ctx; - /* When shaders are logged to stderr, asynchronous compilation is - * disabled too. */ - if (sscreen->debug_flags & DBG_ALL_SHADERS) - return ctx; + /* When shaders are logged to stderr, asynchronous compilation is + * disabled too. */ + if (sscreen->debug_flags & DBG_ALL_SHADERS) + return ctx; - /* Use asynchronous flushes only on amdgpu, since the radeon - * implementation for fence_server_sync is incomplete. */ - return threaded_context_create(ctx, &sscreen->pool_transfers, - si_replace_buffer_storage, - sscreen->info.is_amdgpu ? si_create_fence : NULL, - &((struct si_context*)ctx)->tc); + /* Use asynchronous flushes only on amdgpu, since the radeon + * implementation for fence_server_sync is incomplete. */ + return threaded_context_create(ctx, &sscreen->pool_transfers, si_replace_buffer_storage, + sscreen->info.is_amdgpu ? 
si_create_fence : NULL, + &((struct si_context *)ctx)->tc); } /* * pipe_screen */ -static void si_destroy_screen(struct pipe_screen* pscreen) +static void si_destroy_screen(struct pipe_screen *pscreen) { - struct si_screen *sscreen = (struct si_screen *)pscreen; - struct si_shader_part *parts[] = { - sscreen->vs_prologs, - sscreen->tcs_epilogs, - sscreen->gs_prologs, - sscreen->ps_prologs, - sscreen->ps_epilogs - }; - unsigned i; - - if (!sscreen->ws->unref(sscreen->ws)) - return; - - if (sscreen->debug_flags & DBG(CACHE_STATS)) { - printf("live shader cache: hits = %u, misses = %u\n", - sscreen->live_shader_cache.hits, - sscreen->live_shader_cache.misses); - printf("memory shader cache: hits = %u, misses = %u\n", - sscreen->num_memory_shader_cache_hits, - sscreen->num_memory_shader_cache_misses); - printf("disk shader cache: hits = %u, misses = %u\n", - sscreen->num_disk_shader_cache_hits, - sscreen->num_disk_shader_cache_misses); - } - - simple_mtx_destroy(&sscreen->aux_context_lock); - - struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log; - if (aux_log) { - sscreen->aux_context->set_log_context(sscreen->aux_context, NULL); - u_log_context_destroy(aux_log); - FREE(aux_log); - } - - sscreen->aux_context->destroy(sscreen->aux_context); - - util_queue_destroy(&sscreen->shader_compiler_queue); - util_queue_destroy(&sscreen->shader_compiler_queue_low_priority); - - /* Release the reference on glsl types of the compiler threads. */ - glsl_type_singleton_decref(); - - for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++) - si_destroy_compiler(&sscreen->compiler[i]); - - for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++) - si_destroy_compiler(&sscreen->compiler_lowp[i]); - - /* Free shader parts. */ - for (i = 0; i < ARRAY_SIZE(parts); i++) { - while (parts[i]) { - struct si_shader_part *part = parts[i]; - - parts[i] = part->next; - si_shader_binary_clean(&part->binary); - FREE(part); - } - } - simple_mtx_destroy(&sscreen->shader_parts_mutex); - si_destroy_shader_cache(sscreen); - - si_destroy_perfcounters(sscreen); - si_gpu_load_kill_thread(sscreen); - - simple_mtx_destroy(&sscreen->gpu_load_mutex); - - slab_destroy_parent(&sscreen->pool_transfers); - - disk_cache_destroy(sscreen->disk_shader_cache); - util_live_shader_cache_deinit(&sscreen->live_shader_cache); - sscreen->ws->destroy(sscreen->ws); - FREE(sscreen); + struct si_screen *sscreen = (struct si_screen *)pscreen; + struct si_shader_part *parts[] = {sscreen->vs_prologs, sscreen->tcs_epilogs, sscreen->gs_prologs, + sscreen->ps_prologs, sscreen->ps_epilogs}; + unsigned i; + + if (!sscreen->ws->unref(sscreen->ws)) + return; + + if (sscreen->debug_flags & DBG(CACHE_STATS)) { + printf("live shader cache: hits = %u, misses = %u\n", sscreen->live_shader_cache.hits, + sscreen->live_shader_cache.misses); + printf("memory shader cache: hits = %u, misses = %u\n", sscreen->num_memory_shader_cache_hits, + sscreen->num_memory_shader_cache_misses); + printf("disk shader cache: hits = %u, misses = %u\n", sscreen->num_disk_shader_cache_hits, + sscreen->num_disk_shader_cache_misses); + } + + simple_mtx_destroy(&sscreen->aux_context_lock); + + struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log; + if (aux_log) { + sscreen->aux_context->set_log_context(sscreen->aux_context, NULL); + u_log_context_destroy(aux_log); + FREE(aux_log); + } + + sscreen->aux_context->destroy(sscreen->aux_context); + + util_queue_destroy(&sscreen->shader_compiler_queue); + 
util_queue_destroy(&sscreen->shader_compiler_queue_low_priority); + + /* Release the reference on glsl types of the compiler threads. */ + glsl_type_singleton_decref(); + + for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++) + si_destroy_compiler(&sscreen->compiler[i]); + + for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++) + si_destroy_compiler(&sscreen->compiler_lowp[i]); + + /* Free shader parts. */ + for (i = 0; i < ARRAY_SIZE(parts); i++) { + while (parts[i]) { + struct si_shader_part *part = parts[i]; + + parts[i] = part->next; + si_shader_binary_clean(&part->binary); + FREE(part); + } + } + simple_mtx_destroy(&sscreen->shader_parts_mutex); + si_destroy_shader_cache(sscreen); + + si_destroy_perfcounters(sscreen); + si_gpu_load_kill_thread(sscreen); + + simple_mtx_destroy(&sscreen->gpu_load_mutex); + + slab_destroy_parent(&sscreen->pool_transfers); + + disk_cache_destroy(sscreen->disk_shader_cache); + util_live_shader_cache_deinit(&sscreen->live_shader_cache); + sscreen->ws->destroy(sscreen->ws); + FREE(sscreen); } static void si_init_gs_info(struct si_screen *sscreen) { - sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.chip_class, - sscreen->info.family); + sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.chip_class, sscreen->info.family); } static void si_test_vmfault(struct si_screen *sscreen, uint64_t test_flags) { - struct pipe_context *ctx = sscreen->aux_context; - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_resource *buf = - pipe_buffer_create_const0(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 64); - - if (!buf) { - puts("Buffer allocation failed."); - exit(1); - } - - si_resource(buf)->gpu_address = 0; /* cause a VM fault */ - - if (test_flags & DBG(TEST_VMFAULT_CP)) { - si_cp_dma_copy_buffer(sctx, buf, buf, 0, 4, 4, 0, - SI_COHERENCY_NONE, L2_BYPASS); - ctx->flush(ctx, NULL, 0); - puts("VM fault test: CP - done."); - } - if (test_flags & DBG(TEST_VMFAULT_SDMA)) { - si_sdma_clear_buffer(sctx, buf, 0, 4, 0); - ctx->flush(ctx, NULL, 0); - puts("VM fault test: SDMA - done."); - } - if (test_flags & DBG(TEST_VMFAULT_SHADER)) { - util_test_constant_buffer(ctx, buf); - puts("VM fault test: Shader - done."); - } - exit(0); + struct pipe_context *ctx = sscreen->aux_context; + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_resource *buf = pipe_buffer_create_const0(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 64); + + if (!buf) { + puts("Buffer allocation failed."); + exit(1); + } + + si_resource(buf)->gpu_address = 0; /* cause a VM fault */ + + if (test_flags & DBG(TEST_VMFAULT_CP)) { + si_cp_dma_copy_buffer(sctx, buf, buf, 0, 4, 4, 0, SI_COHERENCY_NONE, L2_BYPASS); + ctx->flush(ctx, NULL, 0); + puts("VM fault test: CP - done."); + } + if (test_flags & DBG(TEST_VMFAULT_SDMA)) { + si_sdma_clear_buffer(sctx, buf, 0, 4, 0); + ctx->flush(ctx, NULL, 0); + puts("VM fault test: SDMA - done."); + } + if (test_flags & DBG(TEST_VMFAULT_SHADER)) { + util_test_constant_buffer(ctx, buf); + puts("VM fault test: Shader - done."); + } + exit(0); } -static void si_test_gds_memory_management(struct si_context *sctx, - unsigned alloc_size, unsigned alignment, - enum radeon_bo_domain domain) +static void si_test_gds_memory_management(struct si_context *sctx, unsigned alloc_size, + unsigned alignment, enum radeon_bo_domain domain) { - struct radeon_winsys *ws = sctx->ws; - struct radeon_cmdbuf *cs[8]; - struct pb_buffer *gds_bo[ARRAY_SIZE(cs)]; - - for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) { - cs[i] = ws->cs_create(sctx->ctx, RING_COMPUTE, - 
NULL, NULL, false); - gds_bo[i] = ws->buffer_create(ws, alloc_size, alignment, domain, 0); - assert(gds_bo[i]); - } - - for (unsigned iterations = 0; iterations < 20000; iterations++) { - for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) { - /* This clears GDS with CP DMA. - * - * We don't care if GDS is present. Just add some packet - * to make the GPU busy for a moment. - */ - si_cp_dma_clear_buffer(sctx, cs[i], NULL, 0, alloc_size, 0, - SI_CPDMA_SKIP_BO_LIST_UPDATE | - SI_CPDMA_SKIP_CHECK_CS_SPACE | - SI_CPDMA_SKIP_GFX_SYNC, 0, 0); - - ws->cs_add_buffer(cs[i], gds_bo[i], domain, - RADEON_USAGE_READWRITE, 0); - ws->cs_flush(cs[i], PIPE_FLUSH_ASYNC, NULL); - } - } - exit(0); + struct radeon_winsys *ws = sctx->ws; + struct radeon_cmdbuf *cs[8]; + struct pb_buffer *gds_bo[ARRAY_SIZE(cs)]; + + for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) { + cs[i] = ws->cs_create(sctx->ctx, RING_COMPUTE, NULL, NULL, false); + gds_bo[i] = ws->buffer_create(ws, alloc_size, alignment, domain, 0); + assert(gds_bo[i]); + } + + for (unsigned iterations = 0; iterations < 20000; iterations++) { + for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) { + /* This clears GDS with CP DMA. + * + * We don't care if GDS is present. Just add some packet + * to make the GPU busy for a moment. + */ + si_cp_dma_clear_buffer( + sctx, cs[i], NULL, 0, alloc_size, 0, + SI_CPDMA_SKIP_BO_LIST_UPDATE | SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_GFX_SYNC, 0, + 0); + + ws->cs_add_buffer(cs[i], gds_bo[i], domain, RADEON_USAGE_READWRITE, 0); + ws->cs_flush(cs[i], PIPE_FLUSH_ASYNC, NULL); + } + } + exit(0); } static void si_disk_cache_create(struct si_screen *sscreen) { - /* Don't use the cache if shader dumping is enabled. */ - if (sscreen->debug_flags & DBG_ALL_SHADERS) - return; - - struct mesa_sha1 ctx; - unsigned char sha1[20]; - char cache_id[20 * 2 + 1]; - - _mesa_sha1_init(&ctx); - - if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx) || - !disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo, - &ctx)) - return; - - _mesa_sha1_final(&ctx, sha1); - disk_cache_format_hex_id(cache_id, sha1, 20 * 2); - - /* These flags affect shader compilation. */ - #define ALL_FLAGS (DBG(GISEL)) - uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS; - - /* Add the high bits of 32-bit addresses, which affects - * how 32-bit addresses are expanded to 64 bits. - */ - STATIC_ASSERT(ALL_FLAGS <= UINT_MAX); - assert((int16_t)sscreen->info.address32_hi == (int32_t)sscreen->info.address32_hi); - shader_debug_flags |= (uint64_t)(sscreen->info.address32_hi & 0xffff) << 32; - - sscreen->disk_shader_cache = - disk_cache_create(sscreen->info.name, - cache_id, - shader_debug_flags); + /* Don't use the cache if shader dumping is enabled. */ + if (sscreen->debug_flags & DBG_ALL_SHADERS) + return; + + struct mesa_sha1 ctx; + unsigned char sha1[20]; + char cache_id[20 * 2 + 1]; + + _mesa_sha1_init(&ctx); + + if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx) || + !disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo, &ctx)) + return; + + _mesa_sha1_final(&ctx, sha1); + disk_cache_format_hex_id(cache_id, sha1, 20 * 2); + +/* These flags affect shader compilation. */ +#define ALL_FLAGS (DBG(GISEL)) + uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS; + + /* Add the high bits of 32-bit addresses, which affects + * how 32-bit addresses are expanded to 64 bits. 
+ */ + STATIC_ASSERT(ALL_FLAGS <= UINT_MAX); + assert((int16_t)sscreen->info.address32_hi == (int32_t)sscreen->info.address32_hi); + shader_debug_flags |= (uint64_t)(sscreen->info.address32_hi & 0xffff) << 32; + + sscreen->disk_shader_cache = disk_cache_create(sscreen->info.name, cache_id, shader_debug_flags); } -static void si_set_max_shader_compiler_threads(struct pipe_screen *screen, - unsigned max_threads) +static void si_set_max_shader_compiler_threads(struct pipe_screen *screen, unsigned max_threads) { - struct si_screen *sscreen = (struct si_screen *)screen; + struct si_screen *sscreen = (struct si_screen *)screen; - /* This function doesn't allow a greater number of threads than - * the queue had at its creation. */ - util_queue_adjust_num_threads(&sscreen->shader_compiler_queue, - max_threads); - /* Don't change the number of threads on the low priority queue. */ + /* This function doesn't allow a greater number of threads than + * the queue had at its creation. */ + util_queue_adjust_num_threads(&sscreen->shader_compiler_queue, max_threads); + /* Don't change the number of threads on the low priority queue. */ } -static bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen, - void *shader, - enum pipe_shader_type shader_type) +static bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *shader, + enum pipe_shader_type shader_type) { - struct si_shader_selector *sel = (struct si_shader_selector *)shader; + struct si_shader_selector *sel = (struct si_shader_selector *)shader; - return util_queue_fence_is_signalled(&sel->ready); + return util_queue_fence_is_signalled(&sel->ready); } -static struct pipe_screen * -radeonsi_screen_create_impl(struct radeon_winsys *ws, - const struct pipe_screen_config *config) +static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, + const struct pipe_screen_config *config) { - struct si_screen *sscreen = CALLOC_STRUCT(si_screen); - unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads; - uint64_t test_flags; - - if (!sscreen) { - return NULL; - } - - sscreen->ws = ws; - ws->query_info(ws, &sscreen->info); - - if (sscreen->info.chip_class == GFX10 && LLVM_VERSION_MAJOR < 9) { - fprintf(stderr, "radeonsi: Navi family support requires LLVM 9 or higher\n"); - FREE(sscreen); - return NULL; - } - - if (sscreen->info.chip_class >= GFX9) { - sscreen->se_tile_repeat = 32 * sscreen->info.max_se; - } else { - ac_get_raster_config(&sscreen->info, - &sscreen->pa_sc_raster_config, - &sscreen->pa_sc_raster_config_1, - &sscreen->se_tile_repeat); - } - - sscreen->debug_flags = debug_get_flags_option("R600_DEBUG", - debug_options, 0); - sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG", - debug_options, 0); - test_flags = debug_get_flags_option("AMD_TEST", - test_options, 0); - - if (sscreen->debug_flags & DBG(NO_GFX)) - sscreen->info.has_graphics = false; - - /* Set functions first. 
*/ - sscreen->b.context_create = si_pipe_create_context; - sscreen->b.destroy = si_destroy_screen; - sscreen->b.set_max_shader_compiler_threads = - si_set_max_shader_compiler_threads; - sscreen->b.is_parallel_shader_compilation_finished = - si_is_parallel_shader_compilation_finished; - sscreen->b.finalize_nir = si_finalize_nir; - - si_init_screen_get_functions(sscreen); - si_init_screen_buffer_functions(sscreen); - si_init_screen_fence_functions(sscreen); - si_init_screen_state_functions(sscreen); - si_init_screen_texture_functions(sscreen); - si_init_screen_query_functions(sscreen); - si_init_screen_live_shader_cache(sscreen); - - /* Set these flags in debug_flags early, so that the shader cache takes - * them into account. - */ - if (driQueryOptionb(config->options, - "glsl_correct_derivatives_after_discard")) - sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL); - - if (sscreen->debug_flags & DBG(INFO)) - ac_print_gpu_info(&sscreen->info); - - slab_create_parent(&sscreen->pool_transfers, - sizeof(struct si_transfer), 64); - - sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1)); - if (sscreen->force_aniso == -1) { - sscreen->force_aniso = MIN2(16, debug_get_num_option("AMD_TEX_ANISO", -1)); - } - - if (sscreen->force_aniso >= 0) { - printf("radeonsi: Forcing anisotropy filter to %ix\n", - /* round down to a power of two */ - 1 << util_logbase2(sscreen->force_aniso)); - } - - (void) simple_mtx_init(&sscreen->aux_context_lock, mtx_plain); - (void) simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain); - - si_init_gs_info(sscreen); - if (!si_init_shader_cache(sscreen)) { - FREE(sscreen); - return NULL; - } - - { -#define OPT_BOOL(name, dflt, description) \ - sscreen->options.name = \ - driQueryOptionb(config->options, "radeonsi_"#name); + struct si_screen *sscreen = CALLOC_STRUCT(si_screen); + unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads; + uint64_t test_flags; + + if (!sscreen) { + return NULL; + } + + sscreen->ws = ws; + ws->query_info(ws, &sscreen->info); + + if (sscreen->info.chip_class == GFX10 && LLVM_VERSION_MAJOR < 9) { + fprintf(stderr, "radeonsi: Navi family support requires LLVM 9 or higher\n"); + FREE(sscreen); + return NULL; + } + + if (sscreen->info.chip_class >= GFX9) { + sscreen->se_tile_repeat = 32 * sscreen->info.max_se; + } else { + ac_get_raster_config(&sscreen->info, &sscreen->pa_sc_raster_config, + &sscreen->pa_sc_raster_config_1, &sscreen->se_tile_repeat); + } + + sscreen->debug_flags = debug_get_flags_option("R600_DEBUG", debug_options, 0); + sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG", debug_options, 0); + test_flags = debug_get_flags_option("AMD_TEST", test_options, 0); + + if (sscreen->debug_flags & DBG(NO_GFX)) + sscreen->info.has_graphics = false; + + /* Set functions first. */ + sscreen->b.context_create = si_pipe_create_context; + sscreen->b.destroy = si_destroy_screen; + sscreen->b.set_max_shader_compiler_threads = si_set_max_shader_compiler_threads; + sscreen->b.is_parallel_shader_compilation_finished = si_is_parallel_shader_compilation_finished; + sscreen->b.finalize_nir = si_finalize_nir; + + si_init_screen_get_functions(sscreen); + si_init_screen_buffer_functions(sscreen); + si_init_screen_fence_functions(sscreen); + si_init_screen_state_functions(sscreen); + si_init_screen_texture_functions(sscreen); + si_init_screen_query_functions(sscreen); + si_init_screen_live_shader_cache(sscreen); + + /* Set these flags in debug_flags early, so that the shader cache takes + * them into account. 
+ */ + if (driQueryOptionb(config->options, "glsl_correct_derivatives_after_discard")) + sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL); + + if (sscreen->debug_flags & DBG(INFO)) + ac_print_gpu_info(&sscreen->info); + + slab_create_parent(&sscreen->pool_transfers, sizeof(struct si_transfer), 64); + + sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1)); + if (sscreen->force_aniso == -1) { + sscreen->force_aniso = MIN2(16, debug_get_num_option("AMD_TEX_ANISO", -1)); + } + + if (sscreen->force_aniso >= 0) { + printf("radeonsi: Forcing anisotropy filter to %ix\n", + /* round down to a power of two */ + 1 << util_logbase2(sscreen->force_aniso)); + } + + (void)simple_mtx_init(&sscreen->aux_context_lock, mtx_plain); + (void)simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain); + + si_init_gs_info(sscreen); + if (!si_init_shader_cache(sscreen)) { + FREE(sscreen); + return NULL; + } + + { +#define OPT_BOOL(name, dflt, description) \ + sscreen->options.name = driQueryOptionb(config->options, "radeonsi_" #name); #include "si_debug_options.h" - } - - si_disk_cache_create(sscreen); - - /* Determine the number of shader compiler threads. */ - hw_threads = sysconf(_SC_NPROCESSORS_ONLN); - - if (hw_threads >= 12) { - num_comp_hi_threads = hw_threads * 3 / 4; - num_comp_lo_threads = hw_threads / 3; - } else if (hw_threads >= 6) { - num_comp_hi_threads = hw_threads - 2; - num_comp_lo_threads = hw_threads / 2; - } else if (hw_threads >= 2) { - num_comp_hi_threads = hw_threads - 1; - num_comp_lo_threads = hw_threads / 2; - } else { - num_comp_hi_threads = 1; - num_comp_lo_threads = 1; - } - - num_comp_hi_threads = MIN2(num_comp_hi_threads, - ARRAY_SIZE(sscreen->compiler)); - num_comp_lo_threads = MIN2(num_comp_lo_threads, - ARRAY_SIZE(sscreen->compiler_lowp)); - - /* Take a reference on the glsl types for the compiler threads. */ - glsl_type_singleton_init_or_ref(); - - if (!util_queue_init(&sscreen->shader_compiler_queue, "sh", - 64, num_comp_hi_threads, - UTIL_QUEUE_INIT_RESIZE_IF_FULL | - UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) { - si_destroy_shader_cache(sscreen); - FREE(sscreen); - glsl_type_singleton_decref(); - return NULL; - } - - if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority, - "shlo", - 64, num_comp_lo_threads, - UTIL_QUEUE_INIT_RESIZE_IF_FULL | - UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY | - UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) { - si_destroy_shader_cache(sscreen); - FREE(sscreen); - glsl_type_singleton_decref(); - return NULL; - } - - if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false)) - si_init_perfcounters(sscreen); - - unsigned prim_discard_vertex_count_threshold, tmp; - si_initialize_prim_discard_tunables(sscreen, false, - &prim_discard_vertex_count_threshold, - &tmp); - /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */ - if (prim_discard_vertex_count_threshold == UINT_MAX) - sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; - - /* Determine tessellation ring info. */ - bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 && - sscreen->info.family != CHIP_CARRIZO && - sscreen->info.family != CHIP_STONEY; - /* This must be one less than the maximum number due to a hw limitation. - * Various hardware bugs need this. - */ - unsigned max_offchip_buffers_per_se; - - if (sscreen->info.chip_class >= GFX10) - max_offchip_buffers_per_se = 256; - /* Only certain chips can use the maximum value. 
*/ - else if (sscreen->info.family == CHIP_VEGA12 || - sscreen->info.family == CHIP_VEGA20) - max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64; - else - max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63; - - unsigned max_offchip_buffers = max_offchip_buffers_per_se * - sscreen->info.max_se; - unsigned offchip_granularity; - - /* Hawaii has a bug with offchip buffers > 256 that can be worked - * around by setting 4K granularity. - */ - if (sscreen->info.family == CHIP_HAWAII) { - sscreen->tess_offchip_block_dw_size = 4096; - offchip_granularity = V_03093C_X_4K_DWORDS; - } else { - sscreen->tess_offchip_block_dw_size = 8192; - offchip_granularity = V_03093C_X_8K_DWORDS; - } - - sscreen->tess_factor_ring_size = 32768 * sscreen->info.max_se; - sscreen->tess_offchip_ring_size = max_offchip_buffers * - sscreen->tess_offchip_block_dw_size * 4; - - if (sscreen->info.chip_class >= GFX7) { - if (sscreen->info.chip_class >= GFX8) - --max_offchip_buffers; - sscreen->vgt_hs_offchip_param = - S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) | - S_03093C_OFFCHIP_GRANULARITY(offchip_granularity); - } else { - assert(offchip_granularity == V_03093C_X_8K_DWORDS); - sscreen->vgt_hs_offchip_param = - S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers); - } - - sscreen->has_draw_indirect_multi = - (sscreen->info.family >= CHIP_POLARIS10) || - (sscreen->info.chip_class == GFX8 && - sscreen->info.pfp_fw_version >= 121 && - sscreen->info.me_fw_version >= 87) || - (sscreen->info.chip_class == GFX7 && - sscreen->info.pfp_fw_version >= 211 && - sscreen->info.me_fw_version >= 173) || - (sscreen->info.chip_class == GFX6 && - sscreen->info.pfp_fw_version >= 79 && - sscreen->info.me_fw_version >= 142); - - sscreen->has_out_of_order_rast = sscreen->info.has_out_of_order_rast && - !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER)); - sscreen->assume_no_z_fights = - driQueryOptionb(config->options, "radeonsi_assume_no_z_fights") || - driQueryOptionb(config->options, "allow_draw_out_of_order"); - sscreen->commutative_blend_add = - driQueryOptionb(config->options, "radeonsi_commutative_blend_add") || - driQueryOptionb(config->options, "allow_draw_out_of_order"); - - sscreen->use_ngg = sscreen->info.chip_class >= GFX10 && - sscreen->info.family != CHIP_NAVI14 && - !(sscreen->debug_flags & DBG(NO_NGG)); - sscreen->use_ngg_culling = sscreen->use_ngg && - !(sscreen->debug_flags & DBG(NO_NGG_CULLING)); - sscreen->always_use_ngg_culling = sscreen->use_ngg_culling && - sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING); - sscreen->use_ngg_streamout = false; - - /* Only enable primitive binning on APUs by default. */ - if (sscreen->info.chip_class >= GFX10) { - sscreen->dpbb_allowed = true; - sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram; - } else if (sscreen->info.chip_class == GFX9) { - sscreen->dpbb_allowed = !sscreen->info.has_dedicated_vram; - sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram; - } - - /* Process DPBB enable flags. */ - if (sscreen->debug_flags & DBG(DPBB)) { - sscreen->dpbb_allowed = true; - if (sscreen->debug_flags & DBG(DFSM)) - sscreen->dfsm_allowed = true; - } - - /* Process DPBB disable flags. */ - if (sscreen->debug_flags & DBG(NO_DPBB)) { - sscreen->dpbb_allowed = false; - sscreen->dfsm_allowed = false; - } else if (sscreen->debug_flags & DBG(NO_DFSM)) { - sscreen->dfsm_allowed = false; - } - - /* While it would be nice not to have this flag, we are constrained - * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9. 
- */ - sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class != GFX9; - - sscreen->dcc_msaa_allowed = - !(sscreen->debug_flags & DBG(NO_DCC_MSAA)); - - (void) simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain); - sscreen->use_monolithic_shaders = - (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0; - - sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE; - if (sscreen->info.chip_class <= GFX8) { - sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2; - sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2; - } - - if (debug_get_bool_option("RADEON_DUMP_SHADERS", false)) - sscreen->debug_flags |= DBG_ALL_SHADERS; - - /* Syntax: - * EQAA=s,z,c - * Example: - * EQAA=8,4,2 - - * That means 8 coverage samples, 4 Z/S samples, and 2 color samples. - * Constraints: - * s >= z >= c (ignoring this only wastes memory) - * s = [2..16] - * z = [2..8] - * c = [2..8] - * - * Only MSAA color and depth buffers are overriden. - */ - if (sscreen->info.has_eqaa_surface_allocator) { - const char *eqaa = debug_get_option("EQAA", NULL); - unsigned s,z,f; - - if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) { - sscreen->eqaa_force_coverage_samples = s; - sscreen->eqaa_force_z_samples = z; - sscreen->eqaa_force_color_samples = f; - } - } - - sscreen->ge_wave_size = 64; - sscreen->ps_wave_size = 64; - sscreen->compute_wave_size = 64; - - if (sscreen->info.chip_class >= GFX10) { - /* Pixels shaders: Wave64 is recommended. - * Compute shaders: There are piglit failures with Wave32. - */ - sscreen->ge_wave_size = 32; - - if (sscreen->debug_flags & DBG(W32_GE)) - sscreen->ge_wave_size = 32; - if (sscreen->debug_flags & DBG(W32_PS)) - sscreen->ps_wave_size = 32; - if (sscreen->debug_flags & DBG(W32_CS)) - sscreen->compute_wave_size = 32; - - if (sscreen->debug_flags & DBG(W64_GE)) - sscreen->ge_wave_size = 64; - if (sscreen->debug_flags & DBG(W64_PS)) - sscreen->ps_wave_size = 64; - if (sscreen->debug_flags & DBG(W64_CS)) - sscreen->compute_wave_size = 64; - } - - /* Create the auxiliary context. This must be done last. */ - sscreen->aux_context = si_create_context(&sscreen->b, - (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) | - (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY)); - if (sscreen->options.aux_debug) { - struct u_log_context *log = CALLOC_STRUCT(u_log_context); - u_log_context_init(log); - sscreen->aux_context->set_log_context(sscreen->aux_context, log); - } - - if (test_flags & DBG(TEST_DMA)) - si_test_dma(sscreen); - - if (test_flags & DBG(TEST_DMA_PERF)) { - si_test_dma_perf(sscreen); - } - - if (test_flags & (DBG(TEST_VMFAULT_CP) | - DBG(TEST_VMFAULT_SDMA) | - DBG(TEST_VMFAULT_SHADER))) - si_test_vmfault(sscreen, test_flags); - - if (test_flags & DBG(TEST_GDS)) - si_test_gds((struct si_context*)sscreen->aux_context); - - if (test_flags & DBG(TEST_GDS_MM)) { - si_test_gds_memory_management((struct si_context*)sscreen->aux_context, - 32 * 1024, 4, RADEON_DOMAIN_GDS); - } - if (test_flags & DBG(TEST_GDS_OA_MM)) { - si_test_gds_memory_management((struct si_context*)sscreen->aux_context, - 4, 1, RADEON_DOMAIN_OA); - } - - STATIC_ASSERT(sizeof(union si_vgt_stages_key) == 4); - return &sscreen->b; + } + + si_disk_cache_create(sscreen); + + /* Determine the number of shader compiler threads. 
*/
+   hw_threads = sysconf(_SC_NPROCESSORS_ONLN);
+
+   if (hw_threads >= 12) {
+      num_comp_hi_threads = hw_threads * 3 / 4;
+      num_comp_lo_threads = hw_threads / 3;
+   } else if (hw_threads >= 6) {
+      num_comp_hi_threads = hw_threads - 2;
+      num_comp_lo_threads = hw_threads / 2;
+   } else if (hw_threads >= 2) {
+      num_comp_hi_threads = hw_threads - 1;
+      num_comp_lo_threads = hw_threads / 2;
+   } else {
+      num_comp_hi_threads = 1;
+      num_comp_lo_threads = 1;
+   }
+
+   num_comp_hi_threads = MIN2(num_comp_hi_threads, ARRAY_SIZE(sscreen->compiler));
+   num_comp_lo_threads = MIN2(num_comp_lo_threads, ARRAY_SIZE(sscreen->compiler_lowp));
+
+   /* Take a reference on the glsl types for the compiler threads. */
+   glsl_type_singleton_init_or_ref();
+
+   if (!util_queue_init(
+          &sscreen->shader_compiler_queue, "sh", 64, num_comp_hi_threads,
+          UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) {
+      si_destroy_shader_cache(sscreen);
+      FREE(sscreen);
+      glsl_type_singleton_decref();
+      return NULL;
+   }
+
+   if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority, "shlo", 64,
+                        num_comp_lo_threads,
+                        UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY |
+                           UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
+      si_destroy_shader_cache(sscreen);
+      FREE(sscreen);
+      glsl_type_singleton_decref();
+      return NULL;
+   }
+
+   if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
+      si_init_perfcounters(sscreen);
+
+   unsigned prim_discard_vertex_count_threshold, tmp;
+   si_initialize_prim_discard_tunables(sscreen, false, &prim_discard_vertex_count_threshold, &tmp);
+   /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */
+   if (prim_discard_vertex_count_threshold == UINT_MAX)
+      sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;
+
+   /* Determine tessellation ring info. */
+   bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 &&
+                                 sscreen->info.family != CHIP_CARRIZO &&
+                                 sscreen->info.family != CHIP_STONEY;
+   /* This must be one less than the maximum number due to a hw limitation.
+    * Various hardware bugs need this.
+    */
+   unsigned max_offchip_buffers_per_se;
+
+   if (sscreen->info.chip_class >= GFX10)
+      max_offchip_buffers_per_se = 256;
+   /* Only certain chips can use the maximum value. */
+   else if (sscreen->info.family == CHIP_VEGA12 || sscreen->info.family == CHIP_VEGA20)
+      max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
+   else
+      max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63;
+
+   unsigned max_offchip_buffers = max_offchip_buffers_per_se * sscreen->info.max_se;
+   unsigned offchip_granularity;
+
+   /* Hawaii has a bug with offchip buffers > 256 that can be worked
+    * around by setting 4K granularity.
+    */
+   if (sscreen->info.family == CHIP_HAWAII) {
+      sscreen->tess_offchip_block_dw_size = 4096;
+      offchip_granularity = V_03093C_X_4K_DWORDS;
+   } else {
+      sscreen->tess_offchip_block_dw_size = 8192;
+      offchip_granularity = V_03093C_X_8K_DWORDS;
+   }
+
+   sscreen->tess_factor_ring_size = 32768 * sscreen->info.max_se;
+   sscreen->tess_offchip_ring_size = max_offchip_buffers * sscreen->tess_offchip_block_dw_size * 4;
+
+   if (sscreen->info.chip_class >= GFX7) {
+      if (sscreen->info.chip_class >= GFX8)
+         --max_offchip_buffers;
+      sscreen->vgt_hs_offchip_param = S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) |
+                                      S_03093C_OFFCHIP_GRANULARITY(offchip_granularity);
+   } else {
+      assert(offchip_granularity == V_03093C_X_8K_DWORDS);
+      sscreen->vgt_hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers);
+   }
+
+   sscreen->has_draw_indirect_multi =
+      (sscreen->info.family >= CHIP_POLARIS10) ||
+      (sscreen->info.chip_class == GFX8 && sscreen->info.pfp_fw_version >= 121 &&
+       sscreen->info.me_fw_version >= 87) ||
+      (sscreen->info.chip_class == GFX7 && sscreen->info.pfp_fw_version >= 211 &&
+       sscreen->info.me_fw_version >= 173) ||
+      (sscreen->info.chip_class == GFX6 && sscreen->info.pfp_fw_version >= 79 &&
+       sscreen->info.me_fw_version >= 142);
+
+   sscreen->has_out_of_order_rast =
+      sscreen->info.has_out_of_order_rast && !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER));
+   sscreen->assume_no_z_fights = driQueryOptionb(config->options, "radeonsi_assume_no_z_fights") ||
+                                 driQueryOptionb(config->options, "allow_draw_out_of_order");
+   sscreen->commutative_blend_add =
+      driQueryOptionb(config->options, "radeonsi_commutative_blend_add") ||
+      driQueryOptionb(config->options, "allow_draw_out_of_order");
+
+   sscreen->use_ngg = sscreen->info.chip_class >= GFX10 && sscreen->info.family != CHIP_NAVI14 &&
+                      !(sscreen->debug_flags & DBG(NO_NGG));
+   sscreen->use_ngg_culling = sscreen->use_ngg && !(sscreen->debug_flags & DBG(NO_NGG_CULLING));
+   sscreen->always_use_ngg_culling =
+      sscreen->use_ngg_culling && sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING);
+   sscreen->use_ngg_streamout = false;
+
+   /* Only enable primitive binning on APUs by default. */
+   if (sscreen->info.chip_class >= GFX10) {
+      sscreen->dpbb_allowed = true;
+      sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram;
+   } else if (sscreen->info.chip_class == GFX9) {
+      sscreen->dpbb_allowed = !sscreen->info.has_dedicated_vram;
+      sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram;
+   }
+
+   /* Process DPBB enable flags. */
+   if (sscreen->debug_flags & DBG(DPBB)) {
+      sscreen->dpbb_allowed = true;
+      if (sscreen->debug_flags & DBG(DFSM))
+         sscreen->dfsm_allowed = true;
+   }
+
+   /* Process DPBB disable flags. */
+   if (sscreen->debug_flags & DBG(NO_DPBB)) {
+      sscreen->dpbb_allowed = false;
+      sscreen->dfsm_allowed = false;
+   } else if (sscreen->debug_flags & DBG(NO_DFSM)) {
+      sscreen->dfsm_allowed = false;
+   }
+
+   /* While it would be nice not to have this flag, we are constrained
+    * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9.
+    */
+   sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class != GFX9;
+
+   sscreen->dcc_msaa_allowed = !(sscreen->debug_flags & DBG(NO_DCC_MSAA));
+
+   (void)simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain);
+   sscreen->use_monolithic_shaders = (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;
+
+   sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
+   if (sscreen->info.chip_class <= GFX8) {
+      sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2;
+      sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2;
+   }
+
+   if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
+      sscreen->debug_flags |= DBG_ALL_SHADERS;
+
+   /* Syntax:
+    * EQAA=s,z,c
+    * Example:
+    * EQAA=8,4,2
+
+    * That means 8 coverage samples, 4 Z/S samples, and 2 color samples.
+    * Constraints:
+    * s >= z >= c (ignoring this only wastes memory)
+    * s = [2..16]
+    * z = [2..8]
+    * c = [2..8]
+    *
+    * Only MSAA color and depth buffers are overriden.
+    */
+   if (sscreen->info.has_eqaa_surface_allocator) {
+      const char *eqaa = debug_get_option("EQAA", NULL);
+      unsigned s, z, f;
+
+      if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) {
+         sscreen->eqaa_force_coverage_samples = s;
+         sscreen->eqaa_force_z_samples = z;
+         sscreen->eqaa_force_color_samples = f;
+      }
+   }
+
+   sscreen->ge_wave_size = 64;
+   sscreen->ps_wave_size = 64;
+   sscreen->compute_wave_size = 64;
+
+   if (sscreen->info.chip_class >= GFX10) {
+      /* Pixels shaders: Wave64 is recommended.
+       * Compute shaders: There are piglit failures with Wave32.
+       */
+      sscreen->ge_wave_size = 32;
+
+      if (sscreen->debug_flags & DBG(W32_GE))
+         sscreen->ge_wave_size = 32;
+      if (sscreen->debug_flags & DBG(W32_PS))
+         sscreen->ps_wave_size = 32;
+      if (sscreen->debug_flags & DBG(W32_CS))
+         sscreen->compute_wave_size = 32;
+
+      if (sscreen->debug_flags & DBG(W64_GE))
+         sscreen->ge_wave_size = 64;
+      if (sscreen->debug_flags & DBG(W64_PS))
+         sscreen->ps_wave_size = 64;
+      if (sscreen->debug_flags & DBG(W64_CS))
+         sscreen->compute_wave_size = 64;
+   }
+
+   /* Create the auxiliary context. This must be done last. */
+   sscreen->aux_context = si_create_context(
+      &sscreen->b, (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) |
+                      (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY));
+   if (sscreen->options.aux_debug) {
+      struct u_log_context *log = CALLOC_STRUCT(u_log_context);
+      u_log_context_init(log);
+      sscreen->aux_context->set_log_context(sscreen->aux_context, log);
+   }
+
+   if (test_flags & DBG(TEST_DMA))
+      si_test_dma(sscreen);
+
+   if (test_flags & DBG(TEST_DMA_PERF)) {
+      si_test_dma_perf(sscreen);
+   }
+
+   if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SDMA) | DBG(TEST_VMFAULT_SHADER)))
+      si_test_vmfault(sscreen, test_flags);
+
+   if (test_flags & DBG(TEST_GDS))
+      si_test_gds((struct si_context *)sscreen->aux_context);
+
+   if (test_flags & DBG(TEST_GDS_MM)) {
+      si_test_gds_memory_management((struct si_context *)sscreen->aux_context, 32 * 1024, 4,
+                                    RADEON_DOMAIN_GDS);
+   }
+   if (test_flags & DBG(TEST_GDS_OA_MM)) {
+      si_test_gds_memory_management((struct si_context *)sscreen->aux_context, 4, 1,
+                                    RADEON_DOMAIN_OA);
+   }
+
+   STATIC_ASSERT(sizeof(union si_vgt_stages_key) == 4);
+   return &sscreen->b;
 }
 
 struct pipe_screen *radeonsi_screen_create(int fd, const struct pipe_screen_config *config)
 {
-	drmVersionPtr version = drmGetVersion(fd);
-	struct radeon_winsys *rw = NULL;
-
-	switch (version->version_major) {
-	case 2:
-		rw = radeon_drm_winsys_create(fd, config, radeonsi_screen_create_impl);
-		break;
-	case 3:
-		rw = amdgpu_winsys_create(fd, config, radeonsi_screen_create_impl);
-		break;
-	}
-
-	drmFreeVersion(version);
-	return rw ? rw->screen : NULL;
+   drmVersionPtr version = drmGetVersion(fd);
+   struct radeon_winsys *rw = NULL;
+
+   switch (version->version_major) {
+   case 2:
+      rw = radeon_drm_winsys_create(fd, config, radeonsi_screen_create_impl);
+      break;
+   case 3:
+      rw = amdgpu_winsys_create(fd, config, radeonsi_screen_create_impl);
+      break;
+   }
+
+   drmFreeVersion(version);
+   return rw ? rw->screen : NULL;
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 400f2152243..30f7832f71c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -27,7 +27,6 @@
 #include "si_shader.h"
 #include "si_state.h"
-
 #include "util/u_dynarray.h"
 #include "util/u_idalloc.h"
 #include "util/u_threaded_context.h"
 
@@ -38,201 +37,207 @@
 #define SI_BIG_ENDIAN 0
 #endif
 
-#define ATI_VENDOR_ID 0x1002
-#define SI_PRIM_DISCARD_DEBUG 0
-#define SI_NOT_QUERY 0xffffffff
+#define ATI_VENDOR_ID         0x1002
+#define SI_PRIM_DISCARD_DEBUG 0
+#define SI_NOT_QUERY          0xffffffff
 
 /* The base vertex and primitive restart can be any number, but we must pick
  * one which will mean "unknown" for the purpose of state tracking and
  * the number shouldn't be a commonly-used one. */
-#define SI_BASE_VERTEX_UNKNOWN INT_MIN
-#define SI_RESTART_INDEX_UNKNOWN INT_MIN
-#define SI_INSTANCE_COUNT_UNKNOWN INT_MIN
-#define SI_NUM_SMOOTH_AA_SAMPLES 8
-#define SI_MAX_POINT_SIZE 2048
-#define SI_GS_PER_ES 128
+#define SI_BASE_VERTEX_UNKNOWN    INT_MIN
+#define SI_RESTART_INDEX_UNKNOWN  INT_MIN
+#define SI_INSTANCE_COUNT_UNKNOWN INT_MIN
+#define SI_NUM_SMOOTH_AA_SAMPLES  8
+#define SI_MAX_POINT_SIZE         2048
+#define SI_GS_PER_ES              128
 /* Alignment for optimal CP DMA performance. */
-#define SI_CPDMA_ALIGNMENT 32
+#define SI_CPDMA_ALIGNMENT 32
 
 /* Tunables for compute-based clear_buffer and copy_buffer: */
-#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
-#define SI_COMPUTE_COPY_DW_PER_THREAD 4
-#define SI_COMPUTE_DST_CACHE_POLICY L2_STREAM
+#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
+#define SI_COMPUTE_COPY_DW_PER_THREAD  4
+#define SI_COMPUTE_DST_CACHE_POLICY    L2_STREAM
 
 /* Pipeline & streamout query controls.
*/ -#define SI_CONTEXT_START_PIPELINE_STATS (1 << 0) -#define SI_CONTEXT_STOP_PIPELINE_STATS (1 << 1) +#define SI_CONTEXT_START_PIPELINE_STATS (1 << 0) +#define SI_CONTEXT_STOP_PIPELINE_STATS (1 << 1) #define SI_CONTEXT_FLUSH_FOR_RENDER_COND (1 << 2) /* Instruction cache. */ -#define SI_CONTEXT_INV_ICACHE (1 << 3) +#define SI_CONTEXT_INV_ICACHE (1 << 3) /* Scalar cache. (GFX6-9: scalar L1; GFX10: scalar L0) * GFX10: This also invalidates the L1 shader array cache. */ -#define SI_CONTEXT_INV_SCACHE (1 << 4) +#define SI_CONTEXT_INV_SCACHE (1 << 4) /* Vector cache. (GFX6-9: vector L1; GFX10: vector L0) * GFX10: This also invalidates the L1 shader array cache. */ -#define SI_CONTEXT_INV_VCACHE (1 << 5) +#define SI_CONTEXT_INV_VCACHE (1 << 5) /* L2 cache + L2 metadata cache writeback & invalidate. * GFX6-8: Used by shaders only. GFX9-10: Used by everything. */ -#define SI_CONTEXT_INV_L2 (1 << 6) +#define SI_CONTEXT_INV_L2 (1 << 6) /* L2 writeback (write dirty L2 lines to memory for non-L2 clients). * Only used for coherency with non-L2 clients like CB, DB, CP on GFX6-8. * GFX6-7 will do complete invalidation, because the writeback is unsupported. */ -#define SI_CONTEXT_WB_L2 (1 << 7) +#define SI_CONTEXT_WB_L2 (1 << 7) /* Writeback & invalidate the L2 metadata cache only. It can only be coupled with * a CB or DB flush. */ -#define SI_CONTEXT_INV_L2_METADATA (1 << 8) +#define SI_CONTEXT_INV_L2_METADATA (1 << 8) /* Framebuffer caches. */ -#define SI_CONTEXT_FLUSH_AND_INV_DB (1 << 9) +#define SI_CONTEXT_FLUSH_AND_INV_DB (1 << 9) #define SI_CONTEXT_FLUSH_AND_INV_DB_META (1 << 10) -#define SI_CONTEXT_FLUSH_AND_INV_CB (1 << 11) +#define SI_CONTEXT_FLUSH_AND_INV_CB (1 << 11) /* Engine synchronization. */ -#define SI_CONTEXT_VS_PARTIAL_FLUSH (1 << 12) -#define SI_CONTEXT_PS_PARTIAL_FLUSH (1 << 13) -#define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14) -#define SI_CONTEXT_VGT_FLUSH (1 << 15) -#define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16) - -#define SI_PREFETCH_VBO_DESCRIPTORS (1 << 0) -#define SI_PREFETCH_LS (1 << 1) -#define SI_PREFETCH_HS (1 << 2) -#define SI_PREFETCH_ES (1 << 3) -#define SI_PREFETCH_GS (1 << 4) -#define SI_PREFETCH_VS (1 << 5) -#define SI_PREFETCH_PS (1 << 6) - -#define SI_MAX_BORDER_COLORS 4096 -#define SI_MAX_VIEWPORTS 16 -#define SIX_BITS 0x3F -#define SI_MAP_BUFFER_ALIGNMENT 64 +#define SI_CONTEXT_VS_PARTIAL_FLUSH (1 << 12) +#define SI_CONTEXT_PS_PARTIAL_FLUSH (1 << 13) +#define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14) +#define SI_CONTEXT_VGT_FLUSH (1 << 15) +#define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16) + +#define SI_PREFETCH_VBO_DESCRIPTORS (1 << 0) +#define SI_PREFETCH_LS (1 << 1) +#define SI_PREFETCH_HS (1 << 2) +#define SI_PREFETCH_ES (1 << 3) +#define SI_PREFETCH_GS (1 << 4) +#define SI_PREFETCH_VS (1 << 5) +#define SI_PREFETCH_PS (1 << 6) + +#define SI_MAX_BORDER_COLORS 4096 +#define SI_MAX_VIEWPORTS 16 +#define SIX_BITS 0x3F +#define SI_MAP_BUFFER_ALIGNMENT 64 #define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024 -#define SI_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) -#define SI_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) +#define SI_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) +#define SI_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) #define SI_RESOURCE_FLAG_FORCE_MSAA_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2) -#define SI_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3) -#define SI_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4) -#define SI_RESOURCE_FLAG_READ_ONLY (PIPE_RESOURCE_FLAG_DRV_PRIV << 5) 
-#define SI_RESOURCE_FLAG_32BIT (PIPE_RESOURCE_FLAG_DRV_PRIV << 6) -#define SI_RESOURCE_FLAG_CLEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 7) +#define SI_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3) +#define SI_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4) +#define SI_RESOURCE_FLAG_READ_ONLY (PIPE_RESOURCE_FLAG_DRV_PRIV << 5) +#define SI_RESOURCE_FLAG_32BIT (PIPE_RESOURCE_FLAG_DRV_PRIV << 6) +#define SI_RESOURCE_FLAG_CLEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 7) /* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */ -#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8) +#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8) /* Set a micro tile mode: */ -#define SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE (PIPE_RESOURCE_FLAG_DRV_PRIV << 9) -#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT (util_logbase2(PIPE_RESOURCE_FLAG_DRV_PRIV) + 10) -#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(x) (((x) & 0x3) << SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) -#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(x) (((x) >> SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) & 0x3) +#define SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE (PIPE_RESOURCE_FLAG_DRV_PRIV << 9) +#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT (util_logbase2(PIPE_RESOURCE_FLAG_DRV_PRIV) + 10) +#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(x) \ + (((x)&0x3) << SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) +#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(x) \ + (((x) >> SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) & 0x3) enum si_clear_code { - DCC_CLEAR_COLOR_0000 = 0x00000000, - DCC_CLEAR_COLOR_0001 = 0x40404040, - DCC_CLEAR_COLOR_1110 = 0x80808080, - DCC_CLEAR_COLOR_1111 = 0xC0C0C0C0, - DCC_CLEAR_COLOR_REG = 0x20202020, - DCC_UNCOMPRESSED = 0xFFFFFFFF, + DCC_CLEAR_COLOR_0000 = 0x00000000, + DCC_CLEAR_COLOR_0001 = 0x40404040, + DCC_CLEAR_COLOR_1110 = 0x80808080, + DCC_CLEAR_COLOR_1111 = 0xC0C0C0C0, + DCC_CLEAR_COLOR_REG = 0x20202020, + DCC_UNCOMPRESSED = 0xFFFFFFFF, }; -#define SI_IMAGE_ACCESS_AS_BUFFER (1 << 7) +#define SI_IMAGE_ACCESS_AS_BUFFER (1 << 7) /* Debug flags. 
*/ -enum { - /* Shader logging options: */ - DBG_VS = PIPE_SHADER_VERTEX, - DBG_PS = PIPE_SHADER_FRAGMENT, - DBG_GS = PIPE_SHADER_GEOMETRY, - DBG_TCS = PIPE_SHADER_TESS_CTRL, - DBG_TES = PIPE_SHADER_TESS_EVAL, - DBG_CS = PIPE_SHADER_COMPUTE, - DBG_NO_IR, - DBG_NO_NIR, - DBG_NO_ASM, - DBG_PREOPT_IR, - - /* Shader compiler options the shader cache should be aware of: */ - DBG_FS_CORRECT_DERIVS_AFTER_KILL, - DBG_GISEL, - DBG_W32_GE, - DBG_W32_PS, - DBG_W32_CS, - DBG_W64_GE, - DBG_W64_PS, - DBG_W64_CS, - - /* Shader compiler options (with no effect on the shader cache): */ - DBG_CHECK_IR, - DBG_MONOLITHIC_SHADERS, - DBG_NO_OPT_VARIANT, - - /* Information logging options: */ - DBG_INFO, - DBG_TEX, - DBG_COMPUTE, - DBG_VM, - DBG_CACHE_STATS, - - /* Driver options: */ - DBG_FORCE_SDMA, - DBG_NO_SDMA, - DBG_NO_SDMA_CLEARS, - DBG_NO_SDMA_COPY_IMAGE, - DBG_NO_WC, - DBG_CHECK_VM, - DBG_RESERVE_VMID, - DBG_ZERO_VRAM, - - /* 3D engine options: */ - DBG_NO_GFX, - DBG_NO_NGG, - DBG_ALWAYS_NGG_CULLING, - DBG_NO_NGG_CULLING, - DBG_ALWAYS_PD, - DBG_PD, - DBG_NO_PD, - DBG_SWITCH_ON_EOP, - DBG_NO_OUT_OF_ORDER, - DBG_NO_DPBB, - DBG_NO_DFSM, - DBG_DPBB, - DBG_DFSM, - DBG_NO_HYPERZ, - DBG_NO_RB_PLUS, - DBG_NO_2D_TILING, - DBG_NO_TILING, - DBG_NO_DCC, - DBG_NO_DCC_CLEAR, - DBG_NO_DCC_FB, - DBG_NO_DCC_MSAA, - DBG_NO_FMASK, - - DBG_COUNT +enum +{ + /* Shader logging options: */ + DBG_VS = PIPE_SHADER_VERTEX, + DBG_PS = PIPE_SHADER_FRAGMENT, + DBG_GS = PIPE_SHADER_GEOMETRY, + DBG_TCS = PIPE_SHADER_TESS_CTRL, + DBG_TES = PIPE_SHADER_TESS_EVAL, + DBG_CS = PIPE_SHADER_COMPUTE, + DBG_NO_IR, + DBG_NO_NIR, + DBG_NO_ASM, + DBG_PREOPT_IR, + + /* Shader compiler options the shader cache should be aware of: */ + DBG_FS_CORRECT_DERIVS_AFTER_KILL, + DBG_GISEL, + DBG_W32_GE, + DBG_W32_PS, + DBG_W32_CS, + DBG_W64_GE, + DBG_W64_PS, + DBG_W64_CS, + + /* Shader compiler options (with no effect on the shader cache): */ + DBG_CHECK_IR, + DBG_MONOLITHIC_SHADERS, + DBG_NO_OPT_VARIANT, + + /* Information logging options: */ + DBG_INFO, + DBG_TEX, + DBG_COMPUTE, + DBG_VM, + DBG_CACHE_STATS, + + /* Driver options: */ + DBG_FORCE_SDMA, + DBG_NO_SDMA, + DBG_NO_SDMA_CLEARS, + DBG_NO_SDMA_COPY_IMAGE, + DBG_NO_WC, + DBG_CHECK_VM, + DBG_RESERVE_VMID, + DBG_ZERO_VRAM, + + /* 3D engine options: */ + DBG_NO_GFX, + DBG_NO_NGG, + DBG_ALWAYS_NGG_CULLING, + DBG_NO_NGG_CULLING, + DBG_ALWAYS_PD, + DBG_PD, + DBG_NO_PD, + DBG_SWITCH_ON_EOP, + DBG_NO_OUT_OF_ORDER, + DBG_NO_DPBB, + DBG_NO_DFSM, + DBG_DPBB, + DBG_DFSM, + DBG_NO_HYPERZ, + DBG_NO_RB_PLUS, + DBG_NO_2D_TILING, + DBG_NO_TILING, + DBG_NO_DCC, + DBG_NO_DCC_CLEAR, + DBG_NO_DCC_FB, + DBG_NO_DCC_MSAA, + DBG_NO_FMASK, + + DBG_COUNT }; -enum { - /* Tests: */ - DBG_TEST_DMA, - DBG_TEST_VMFAULT_CP, - DBG_TEST_VMFAULT_SDMA, - DBG_TEST_VMFAULT_SHADER, - DBG_TEST_DMA_PERF, - DBG_TEST_GDS, - DBG_TEST_GDS_MM, - DBG_TEST_GDS_OA_MM, +enum +{ + /* Tests: */ + DBG_TEST_DMA, + DBG_TEST_VMFAULT_CP, + DBG_TEST_VMFAULT_SDMA, + DBG_TEST_VMFAULT_SHADER, + DBG_TEST_DMA_PERF, + DBG_TEST_GDS, + DBG_TEST_GDS_MM, + DBG_TEST_GDS_OA_MM, }; -#define DBG_ALL_SHADERS (((1 << (DBG_CS + 1)) - 1)) -#define DBG(name) (1ull << DBG_##name) +#define DBG_ALL_SHADERS (((1 << (DBG_CS + 1)) - 1)) +#define DBG(name) (1ull << DBG_##name) -enum si_cache_policy { - L2_BYPASS, - L2_STREAM, /* same as SLC=1 */ - L2_LRU, /* same as SLC=0 */ +enum si_cache_policy +{ + L2_BYPASS, + L2_STREAM, /* same as SLC=1 */ + L2_LRU, /* same as SLC=0 */ }; -enum si_coherency { - SI_COHERENCY_NONE, /* no cache flushes needed */ - SI_COHERENCY_SHADER, - 
SI_COHERENCY_CB_META, - SI_COHERENCY_CP, +enum si_coherency +{ + SI_COHERENCY_NONE, /* no cache flushes needed */ + SI_COHERENCY_SHADER, + SI_COHERENCY_CB_META, + SI_COHERENCY_CP, }; struct si_compute; @@ -244,528 +249,523 @@ struct u_suballocator; * at the moment. */ struct si_resource { - struct threaded_resource b; - - /* Winsys objects. */ - struct pb_buffer *buf; - uint64_t gpu_address; - /* Memory usage if the buffer placement is optimal. */ - uint64_t vram_usage; - uint64_t gart_usage; - - /* Resource properties. */ - uint64_t bo_size; - unsigned bo_alignment; - enum radeon_bo_domain domains; - enum radeon_bo_flag flags; - unsigned bind_history; - int max_forced_staging_uploads; - - /* The buffer range which is initialized (with a write transfer, - * streamout, DMA, or as a random access target). The rest of - * the buffer is considered invalid and can be mapped unsynchronized. - * - * This allows unsychronized mapping of a buffer range which hasn't - * been used yet. It's for applications which forget to use - * the unsynchronized map flag and expect the driver to figure it out. - */ - struct util_range valid_buffer_range; - - /* For buffers only. This indicates that a write operation has been - * performed by TC L2, but the cache hasn't been flushed. - * Any hw block which doesn't use or bypasses TC L2 should check this - * flag and flush the cache before using the buffer. - * - * For example, TC L2 must be flushed if a buffer which has been - * modified by a shader store instruction is about to be used as - * an index buffer. The reason is that VGT DMA index fetching doesn't - * use TC L2. - */ - bool TC_L2_dirty; - - /* Whether this resource is referenced by bindless handles. */ - bool texture_handle_allocated; - bool image_handle_allocated; - - /* Whether the resource has been exported via resource_get_handle. */ - unsigned external_usage; /* PIPE_HANDLE_USAGE_* */ + struct threaded_resource b; + + /* Winsys objects. */ + struct pb_buffer *buf; + uint64_t gpu_address; + /* Memory usage if the buffer placement is optimal. */ + uint64_t vram_usage; + uint64_t gart_usage; + + /* Resource properties. */ + uint64_t bo_size; + unsigned bo_alignment; + enum radeon_bo_domain domains; + enum radeon_bo_flag flags; + unsigned bind_history; + int max_forced_staging_uploads; + + /* The buffer range which is initialized (with a write transfer, + * streamout, DMA, or as a random access target). The rest of + * the buffer is considered invalid and can be mapped unsynchronized. + * + * This allows unsychronized mapping of a buffer range which hasn't + * been used yet. It's for applications which forget to use + * the unsynchronized map flag and expect the driver to figure it out. + */ + struct util_range valid_buffer_range; + + /* For buffers only. This indicates that a write operation has been + * performed by TC L2, but the cache hasn't been flushed. + * Any hw block which doesn't use or bypasses TC L2 should check this + * flag and flush the cache before using the buffer. + * + * For example, TC L2 must be flushed if a buffer which has been + * modified by a shader store instruction is about to be used as + * an index buffer. The reason is that VGT DMA index fetching doesn't + * use TC L2. + */ + bool TC_L2_dirty; + + /* Whether this resource is referenced by bindless handles. */ + bool texture_handle_allocated; + bool image_handle_allocated; + + /* Whether the resource has been exported via resource_get_handle. 
*/ + unsigned external_usage; /* PIPE_HANDLE_USAGE_* */ }; struct si_transfer { - struct threaded_transfer b; - struct si_resource *staging; - unsigned offset; + struct threaded_transfer b; + struct si_resource *staging; + unsigned offset; }; struct si_texture { - struct si_resource buffer; - - struct radeon_surf surface; - struct si_texture *flushed_depth_texture; - - /* One texture allocation can contain these buffers: - * - image (pixel data) - * - FMASK buffer (MSAA compression) - * - CMASK buffer (MSAA compression and/or legacy fast color clear) - * - HTILE buffer (Z/S compression and fast Z/S clear) - * - DCC buffer (color compression and new fast color clear) - * - displayable DCC buffer (if the DCC buffer is not displayable) - * - DCC retile mapping buffer (if the DCC buffer is not displayable) - */ - uint64_t cmask_base_address_reg; - struct si_resource *cmask_buffer; - unsigned cb_color_info; /* fast clear enable bit */ - unsigned color_clear_value[2]; - unsigned last_msaa_resolve_target_micro_mode; - unsigned num_level0_transfers; - unsigned plane_index; /* other planes are different pipe_resources */ - unsigned num_planes; - - /* Depth buffer compression and fast clear. */ - float depth_clear_value; - uint16_t dirty_level_mask; /* each bit says if that mipmap is compressed */ - uint16_t stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */ - enum pipe_format db_render_format:16; - uint8_t stencil_clear_value; - bool fmask_is_identity:1; - bool tc_compatible_htile:1; - bool htile_stencil_disabled:1; - bool depth_cleared:1; /* if it was cleared at least once */ - bool stencil_cleared:1; /* if it was cleared at least once */ - bool upgraded_depth:1; /* upgraded from unorm to Z32_FLOAT */ - bool is_depth:1; - bool db_compatible:1; - bool can_sample_z:1; - bool can_sample_s:1; - - /* We need to track DCC dirtiness, because st/dri usually calls - * flush_resource twice per frame (not a bug) and we don't wanna - * decompress DCC twice. Also, the dirty tracking must be done even - * if DCC isn't used, because it's required by the DCC usage analysis - * for a possible future enablement. - */ - bool separate_dcc_dirty:1; - bool displayable_dcc_dirty:1; - - /* Statistics gathering for the DCC enablement heuristic. */ - bool dcc_gather_statistics:1; - /* Counter that should be non-zero if the texture is bound to a - * framebuffer. - */ - unsigned framebuffers_bound; - /* Whether the texture is a displayable back buffer and needs DCC - * decompression, which is expensive. Therefore, it's enabled only - * if statistics suggest that it will pay off and it's allocated - * separately. It can't be bound as a sampler by apps. Limited to - * target == 2D and last_level == 0. If enabled, dcc_offset contains - * the absolute GPUVM address, not the relative one. - */ - struct si_resource *dcc_separate_buffer; - /* When DCC is temporarily disabled, the separate buffer is here. */ - struct si_resource *last_dcc_separate_buffer; - /* Estimate of how much this color buffer is written to in units of - * full-screen draws: ps_invocations / (width * height) - * Shader kills, late Z, and blending with trivial discards make it - * inaccurate (we need to count CB updates, not PS invocations). - */ - unsigned ps_draw_ratio; - /* The number of clears since the last DCC usage analysis. 
*/ - unsigned num_slow_clears; + struct si_resource buffer; + + struct radeon_surf surface; + struct si_texture *flushed_depth_texture; + + /* One texture allocation can contain these buffers: + * - image (pixel data) + * - FMASK buffer (MSAA compression) + * - CMASK buffer (MSAA compression and/or legacy fast color clear) + * - HTILE buffer (Z/S compression and fast Z/S clear) + * - DCC buffer (color compression and new fast color clear) + * - displayable DCC buffer (if the DCC buffer is not displayable) + * - DCC retile mapping buffer (if the DCC buffer is not displayable) + */ + uint64_t cmask_base_address_reg; + struct si_resource *cmask_buffer; + unsigned cb_color_info; /* fast clear enable bit */ + unsigned color_clear_value[2]; + unsigned last_msaa_resolve_target_micro_mode; + unsigned num_level0_transfers; + unsigned plane_index; /* other planes are different pipe_resources */ + unsigned num_planes; + + /* Depth buffer compression and fast clear. */ + float depth_clear_value; + uint16_t dirty_level_mask; /* each bit says if that mipmap is compressed */ + uint16_t stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */ + enum pipe_format db_render_format : 16; + uint8_t stencil_clear_value; + bool fmask_is_identity : 1; + bool tc_compatible_htile : 1; + bool htile_stencil_disabled : 1; + bool depth_cleared : 1; /* if it was cleared at least once */ + bool stencil_cleared : 1; /* if it was cleared at least once */ + bool upgraded_depth : 1; /* upgraded from unorm to Z32_FLOAT */ + bool is_depth : 1; + bool db_compatible : 1; + bool can_sample_z : 1; + bool can_sample_s : 1; + + /* We need to track DCC dirtiness, because st/dri usually calls + * flush_resource twice per frame (not a bug) and we don't wanna + * decompress DCC twice. Also, the dirty tracking must be done even + * if DCC isn't used, because it's required by the DCC usage analysis + * for a possible future enablement. + */ + bool separate_dcc_dirty : 1; + bool displayable_dcc_dirty : 1; + + /* Statistics gathering for the DCC enablement heuristic. */ + bool dcc_gather_statistics : 1; + /* Counter that should be non-zero if the texture is bound to a + * framebuffer. + */ + unsigned framebuffers_bound; + /* Whether the texture is a displayable back buffer and needs DCC + * decompression, which is expensive. Therefore, it's enabled only + * if statistics suggest that it will pay off and it's allocated + * separately. It can't be bound as a sampler by apps. Limited to + * target == 2D and last_level == 0. If enabled, dcc_offset contains + * the absolute GPUVM address, not the relative one. + */ + struct si_resource *dcc_separate_buffer; + /* When DCC is temporarily disabled, the separate buffer is here. */ + struct si_resource *last_dcc_separate_buffer; + /* Estimate of how much this color buffer is written to in units of + * full-screen draws: ps_invocations / (width * height) + * Shader kills, late Z, and blending with trivial discards make it + * inaccurate (we need to count CB updates, not PS invocations). + */ + unsigned ps_draw_ratio; + /* The number of clears since the last DCC usage analysis. */ + unsigned num_slow_clears; }; struct si_surface { - struct pipe_surface base; - - /* These can vary with block-compressed textures. */ - uint16_t width0; - uint16_t height0; - - bool color_initialized:1; - bool depth_initialized:1; - - /* Misc. color flags. */ - bool color_is_int8:1; - bool color_is_int10:1; - bool dcc_incompatible:1; - - /* Color registers. 
*/ - unsigned cb_color_info; - unsigned cb_color_view; - unsigned cb_color_attrib; - unsigned cb_color_attrib2; /* GFX9 and later */ - unsigned cb_color_attrib3; /* GFX10 and later */ - unsigned cb_dcc_control; /* GFX8 and later */ - unsigned spi_shader_col_format:8; /* no blending, no alpha-to-coverage. */ - unsigned spi_shader_col_format_alpha:8; /* alpha-to-coverage */ - unsigned spi_shader_col_format_blend:8; /* blending without alpha. */ - unsigned spi_shader_col_format_blend_alpha:8; /* blending with alpha. */ - - /* DB registers. */ - uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE */ - uint64_t db_stencil_base; - uint64_t db_htile_data_base; - unsigned db_depth_info; - unsigned db_z_info; - unsigned db_z_info2; /* GFX9 only */ - unsigned db_depth_view; - unsigned db_depth_size; - unsigned db_depth_slice; - unsigned db_stencil_info; - unsigned db_stencil_info2; /* GFX9 only */ - unsigned db_htile_surface; + struct pipe_surface base; + + /* These can vary with block-compressed textures. */ + uint16_t width0; + uint16_t height0; + + bool color_initialized : 1; + bool depth_initialized : 1; + + /* Misc. color flags. */ + bool color_is_int8 : 1; + bool color_is_int10 : 1; + bool dcc_incompatible : 1; + + /* Color registers. */ + unsigned cb_color_info; + unsigned cb_color_view; + unsigned cb_color_attrib; + unsigned cb_color_attrib2; /* GFX9 and later */ + unsigned cb_color_attrib3; /* GFX10 and later */ + unsigned cb_dcc_control; /* GFX8 and later */ + unsigned spi_shader_col_format : 8; /* no blending, no alpha-to-coverage. */ + unsigned spi_shader_col_format_alpha : 8; /* alpha-to-coverage */ + unsigned spi_shader_col_format_blend : 8; /* blending without alpha. */ + unsigned spi_shader_col_format_blend_alpha : 8; /* blending with alpha. */ + + /* DB registers. */ + uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE */ + uint64_t db_stencil_base; + uint64_t db_htile_data_base; + unsigned db_depth_info; + unsigned db_z_info; + unsigned db_z_info2; /* GFX9 only */ + unsigned db_depth_view; + unsigned db_depth_size; + unsigned db_depth_slice; + unsigned db_stencil_info; + unsigned db_stencil_info2; /* GFX9 only */ + unsigned db_htile_surface; }; struct si_mmio_counter { - unsigned busy; - unsigned idle; + unsigned busy; + unsigned idle; }; union si_mmio_counters { - struct { - /* For global GPU load including SDMA. */ - struct si_mmio_counter gpu; - - /* GRBM_STATUS */ - struct si_mmio_counter spi; - struct si_mmio_counter gui; - struct si_mmio_counter ta; - struct si_mmio_counter gds; - struct si_mmio_counter vgt; - struct si_mmio_counter ia; - struct si_mmio_counter sx; - struct si_mmio_counter wd; - struct si_mmio_counter bci; - struct si_mmio_counter sc; - struct si_mmio_counter pa; - struct si_mmio_counter db; - struct si_mmio_counter cp; - struct si_mmio_counter cb; - - /* SRBM_STATUS2 */ - struct si_mmio_counter sdma; - - /* CP_STAT */ - struct si_mmio_counter pfp; - struct si_mmio_counter meq; - struct si_mmio_counter me; - struct si_mmio_counter surf_sync; - struct si_mmio_counter cp_dma; - struct si_mmio_counter scratch_ram; - } named; - unsigned array[0]; + struct { + /* For global GPU load including SDMA. 
*/ + struct si_mmio_counter gpu; + + /* GRBM_STATUS */ + struct si_mmio_counter spi; + struct si_mmio_counter gui; + struct si_mmio_counter ta; + struct si_mmio_counter gds; + struct si_mmio_counter vgt; + struct si_mmio_counter ia; + struct si_mmio_counter sx; + struct si_mmio_counter wd; + struct si_mmio_counter bci; + struct si_mmio_counter sc; + struct si_mmio_counter pa; + struct si_mmio_counter db; + struct si_mmio_counter cp; + struct si_mmio_counter cb; + + /* SRBM_STATUS2 */ + struct si_mmio_counter sdma; + + /* CP_STAT */ + struct si_mmio_counter pfp; + struct si_mmio_counter meq; + struct si_mmio_counter me; + struct si_mmio_counter surf_sync; + struct si_mmio_counter cp_dma; + struct si_mmio_counter scratch_ram; + } named; + unsigned array[0]; }; struct si_memory_object { - struct pipe_memory_object b; - struct pb_buffer *buf; - uint32_t stride; + struct pipe_memory_object b; + struct pb_buffer *buf; + uint32_t stride; }; /* Saved CS data for debugging features. */ struct radeon_saved_cs { - uint32_t *ib; - unsigned num_dw; + uint32_t *ib; + unsigned num_dw; - struct radeon_bo_list_item *bo_list; - unsigned bo_count; + struct radeon_bo_list_item *bo_list; + unsigned bo_count; }; struct si_screen { - struct pipe_screen b; - struct radeon_winsys *ws; - struct disk_cache *disk_shader_cache; - - struct radeon_info info; - uint64_t debug_flags; - char renderer_string[183]; - - void (*make_texture_descriptor)( - struct si_screen *screen, - struct si_texture *tex, - bool sampler, - enum pipe_texture_target target, - enum pipe_format pipe_format, - const unsigned char state_swizzle[4], - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned width, unsigned height, unsigned depth, - uint32_t *state, - uint32_t *fmask_state); - - unsigned num_vbos_in_user_sgprs; - unsigned pa_sc_raster_config; - unsigned pa_sc_raster_config_1; - unsigned se_tile_repeat; - unsigned gs_table_depth; - unsigned tess_offchip_block_dw_size; - unsigned tess_offchip_ring_size; - unsigned tess_factor_ring_size; - unsigned vgt_hs_offchip_param; - unsigned eqaa_force_coverage_samples; - unsigned eqaa_force_z_samples; - unsigned eqaa_force_color_samples; - bool has_draw_indirect_multi; - bool has_out_of_order_rast; - bool assume_no_z_fights; - bool commutative_blend_add; - bool dpbb_allowed; - bool dfsm_allowed; - bool llvm_has_working_vgpr_indexing; - bool use_ngg; - bool use_ngg_culling; - bool always_use_ngg_culling; - bool use_ngg_streamout; - - struct { -#define OPT_BOOL(name, dflt, description) bool name:1; + struct pipe_screen b; + struct radeon_winsys *ws; + struct disk_cache *disk_shader_cache; + + struct radeon_info info; + uint64_t debug_flags; + char renderer_string[183]; + + void (*make_texture_descriptor)(struct si_screen *screen, struct si_texture *tex, bool sampler, + enum pipe_texture_target target, enum pipe_format pipe_format, + const unsigned char state_swizzle[4], unsigned first_level, + unsigned last_level, unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, uint32_t *state, + uint32_t *fmask_state); + + unsigned num_vbos_in_user_sgprs; + unsigned pa_sc_raster_config; + unsigned pa_sc_raster_config_1; + unsigned se_tile_repeat; + unsigned gs_table_depth; + unsigned tess_offchip_block_dw_size; + unsigned tess_offchip_ring_size; + unsigned tess_factor_ring_size; + unsigned vgt_hs_offchip_param; + unsigned eqaa_force_coverage_samples; + unsigned eqaa_force_z_samples; + unsigned eqaa_force_color_samples; + bool 
has_draw_indirect_multi; + bool has_out_of_order_rast; + bool assume_no_z_fights; + bool commutative_blend_add; + bool dpbb_allowed; + bool dfsm_allowed; + bool llvm_has_working_vgpr_indexing; + bool use_ngg; + bool use_ngg_culling; + bool always_use_ngg_culling; + bool use_ngg_streamout; + + struct { +#define OPT_BOOL(name, dflt, description) bool name : 1; #include "si_debug_options.h" - } options; - - /* Whether shaders are monolithic (1-part) or separate (3-part). */ - bool use_monolithic_shaders; - bool record_llvm_ir; - bool dcc_msaa_allowed; - - struct slab_parent_pool pool_transfers; - - /* Texture filter settings. */ - int force_aniso; /* -1 = disabled */ - - /* Auxiliary context. Mainly used to initialize resources. - * It must be locked prior to using and flushed before unlocking. */ - struct pipe_context *aux_context; - simple_mtx_t aux_context_lock; - - /* This must be in the screen, because UE4 uses one context for - * compilation and another one for rendering. - */ - unsigned num_compilations; - /* Along with ST_DEBUG=precompile, this should show if applications - * are loading shaders on demand. This is a monotonic counter. - */ - unsigned num_shaders_created; - unsigned num_memory_shader_cache_hits; - unsigned num_memory_shader_cache_misses; - unsigned num_disk_shader_cache_hits; - unsigned num_disk_shader_cache_misses; - - /* GPU load thread. */ - simple_mtx_t gpu_load_mutex; - thrd_t gpu_load_thread; - union si_mmio_counters mmio_counters; - volatile unsigned gpu_load_stop_thread; /* bool */ - - /* Performance counters. */ - struct si_perfcounters *perfcounters; - - /* If pipe_screen wants to recompute and re-emit the framebuffer, - * sampler, and image states of all contexts, it should atomically - * increment this. - * - * Each context will compare this with its own last known value of - * the counter before drawing and re-emit the states accordingly. - */ - unsigned dirty_tex_counter; - unsigned dirty_buf_counter; - - /* Atomically increment this counter when an existing texture's - * metadata is enabled or disabled in a way that requires changing - * contexts' compressed texture binding masks. - */ - unsigned compressed_colortex_counter; - - struct { - /* Context flags to set so that all writes from earlier jobs - * in the CP are seen by L2 clients. - */ - unsigned cp_to_L2; - - /* Context flags to set so that all writes from earlier jobs - * that end in L2 are seen by CP. - */ - unsigned L2_to_cp; - } barrier_flags; - - simple_mtx_t shader_parts_mutex; - struct si_shader_part *vs_prologs; - struct si_shader_part *tcs_epilogs; - struct si_shader_part *gs_prologs; - struct si_shader_part *ps_prologs; - struct si_shader_part *ps_epilogs; - - /* Shader cache in memory. - * - * Design & limitations: - * - The shader cache is per screen (= per process), never saved to - * disk, and skips redundant shader compilations from NIR to bytecode. - * - It can only be used with one-variant-per-shader support, in which - * case only the main (typically middle) part of shaders is cached. - * - Only VS, TCS, TES, PS are cached, out of which only the hw VS - * variants of VS and TES are cached, so LS and ES aren't. - * - GS and CS aren't cached, but it's certainly possible to cache - * those as well. - */ - simple_mtx_t shader_cache_mutex; - struct hash_table *shader_cache; - - /* Shader cache of live shaders. */ - struct util_live_shader_cache live_shader_cache; - - /* Shader compiler queue for multithreaded compilation. 
*/ - struct util_queue shader_compiler_queue; - /* Use at most 3 normal compiler threads on quadcore and better. - * Hyperthreaded CPUs report the number of threads, but we want - * the number of cores. We only need this many threads for shader-db. */ - struct ac_llvm_compiler compiler[24]; /* used by the queue only */ - - struct util_queue shader_compiler_queue_low_priority; - /* Use at most 2 low priority threads on quadcore and better. - * We want to minimize the impact on multithreaded Mesa. */ - struct ac_llvm_compiler compiler_lowp[10]; - - unsigned compute_wave_size; - unsigned ps_wave_size; - unsigned ge_wave_size; + } options; + + /* Whether shaders are monolithic (1-part) or separate (3-part). */ + bool use_monolithic_shaders; + bool record_llvm_ir; + bool dcc_msaa_allowed; + + struct slab_parent_pool pool_transfers; + + /* Texture filter settings. */ + int force_aniso; /* -1 = disabled */ + + /* Auxiliary context. Mainly used to initialize resources. + * It must be locked prior to using and flushed before unlocking. */ + struct pipe_context *aux_context; + simple_mtx_t aux_context_lock; + + /* This must be in the screen, because UE4 uses one context for + * compilation and another one for rendering. + */ + unsigned num_compilations; + /* Along with ST_DEBUG=precompile, this should show if applications + * are loading shaders on demand. This is a monotonic counter. + */ + unsigned num_shaders_created; + unsigned num_memory_shader_cache_hits; + unsigned num_memory_shader_cache_misses; + unsigned num_disk_shader_cache_hits; + unsigned num_disk_shader_cache_misses; + + /* GPU load thread. */ + simple_mtx_t gpu_load_mutex; + thrd_t gpu_load_thread; + union si_mmio_counters mmio_counters; + volatile unsigned gpu_load_stop_thread; /* bool */ + + /* Performance counters. */ + struct si_perfcounters *perfcounters; + + /* If pipe_screen wants to recompute and re-emit the framebuffer, + * sampler, and image states of all contexts, it should atomically + * increment this. + * + * Each context will compare this with its own last known value of + * the counter before drawing and re-emit the states accordingly. + */ + unsigned dirty_tex_counter; + unsigned dirty_buf_counter; + + /* Atomically increment this counter when an existing texture's + * metadata is enabled or disabled in a way that requires changing + * contexts' compressed texture binding masks. + */ + unsigned compressed_colortex_counter; + + struct { + /* Context flags to set so that all writes from earlier jobs + * in the CP are seen by L2 clients. + */ + unsigned cp_to_L2; + + /* Context flags to set so that all writes from earlier jobs + * that end in L2 are seen by CP. + */ + unsigned L2_to_cp; + } barrier_flags; + + simple_mtx_t shader_parts_mutex; + struct si_shader_part *vs_prologs; + struct si_shader_part *tcs_epilogs; + struct si_shader_part *gs_prologs; + struct si_shader_part *ps_prologs; + struct si_shader_part *ps_epilogs; + + /* Shader cache in memory. + * + * Design & limitations: + * - The shader cache is per screen (= per process), never saved to + * disk, and skips redundant shader compilations from NIR to bytecode. + * - It can only be used with one-variant-per-shader support, in which + * case only the main (typically middle) part of shaders is cached. + * - Only VS, TCS, TES, PS are cached, out of which only the hw VS + * variants of VS and TES are cached, so LS and ES aren't. + * - GS and CS aren't cached, but it's certainly possible to cache + * those as well. 
+ */ + simple_mtx_t shader_cache_mutex; + struct hash_table *shader_cache; + + /* Shader cache of live shaders. */ + struct util_live_shader_cache live_shader_cache; + + /* Shader compiler queue for multithreaded compilation. */ + struct util_queue shader_compiler_queue; + /* Use at most 3 normal compiler threads on quadcore and better. + * Hyperthreaded CPUs report the number of threads, but we want + * the number of cores. We only need this many threads for shader-db. */ + struct ac_llvm_compiler compiler[24]; /* used by the queue only */ + + struct util_queue shader_compiler_queue_low_priority; + /* Use at most 2 low priority threads on quadcore and better. + * We want to minimize the impact on multithreaded Mesa. */ + struct ac_llvm_compiler compiler_lowp[10]; + + unsigned compute_wave_size; + unsigned ps_wave_size; + unsigned ge_wave_size; }; struct si_blend_color { - struct pipe_blend_color state; - bool any_nonzeros; + struct pipe_blend_color state; + bool any_nonzeros; }; struct si_sampler_view { - struct pipe_sampler_view base; - /* [0..7] = image descriptor - * [4..7] = buffer descriptor */ - uint32_t state[8]; - uint32_t fmask_state[8]; - const struct legacy_surf_level *base_level_info; - ubyte base_level; - ubyte block_width; - bool is_stencil_sampler; - bool is_integer; - bool dcc_incompatible; + struct pipe_sampler_view base; + /* [0..7] = image descriptor + * [4..7] = buffer descriptor */ + uint32_t state[8]; + uint32_t fmask_state[8]; + const struct legacy_surf_level *base_level_info; + ubyte base_level; + ubyte block_width; + bool is_stencil_sampler; + bool is_integer; + bool dcc_incompatible; }; #define SI_SAMPLER_STATE_MAGIC 0x34f1c35a struct si_sampler_state { #ifndef NDEBUG - unsigned magic; + unsigned magic; #endif - uint32_t val[4]; - uint32_t integer_val[4]; - uint32_t upgraded_depth_val[4]; + uint32_t val[4]; + uint32_t integer_val[4]; + uint32_t upgraded_depth_val[4]; }; struct si_cs_shader_state { - struct si_compute *program; - struct si_compute *emitted_program; - unsigned offset; - bool initialized; - bool uses_scratch; + struct si_compute *program; + struct si_compute *emitted_program; + unsigned offset; + bool initialized; + bool uses_scratch; }; struct si_samplers { - struct pipe_sampler_view *views[SI_NUM_SAMPLERS]; - struct si_sampler_state *sampler_states[SI_NUM_SAMPLERS]; + struct pipe_sampler_view *views[SI_NUM_SAMPLERS]; + struct si_sampler_state *sampler_states[SI_NUM_SAMPLERS]; - /* The i-th bit is set if that element is enabled (non-NULL resource). */ - unsigned enabled_mask; - uint32_t needs_depth_decompress_mask; - uint32_t needs_color_decompress_mask; + /* The i-th bit is set if that element is enabled (non-NULL resource). 
*/ + unsigned enabled_mask; + uint32_t needs_depth_decompress_mask; + uint32_t needs_color_decompress_mask; }; struct si_images { - struct pipe_image_view views[SI_NUM_IMAGES]; - uint32_t needs_color_decompress_mask; - unsigned enabled_mask; + struct pipe_image_view views[SI_NUM_IMAGES]; + uint32_t needs_color_decompress_mask; + unsigned enabled_mask; }; struct si_framebuffer { - struct pipe_framebuffer_state state; - unsigned colorbuf_enabled_4bit; - unsigned spi_shader_col_format; - unsigned spi_shader_col_format_alpha; - unsigned spi_shader_col_format_blend; - unsigned spi_shader_col_format_blend_alpha; - ubyte nr_samples:5; /* at most 16xAA */ - ubyte log_samples:3; /* at most 4 = 16xAA */ - ubyte nr_color_samples; /* at most 8xAA */ - ubyte compressed_cb_mask; - ubyte uncompressed_cb_mask; - ubyte displayable_dcc_cb_mask; - ubyte color_is_int8; - ubyte color_is_int10; - ubyte dirty_cbufs; - ubyte dcc_overwrite_combiner_watermark; - ubyte min_bytes_per_pixel; - bool dirty_zsbuf; - bool any_dst_linear; - bool CB_has_shader_readable_metadata; - bool DB_has_shader_readable_metadata; - bool all_DCC_pipe_aligned; + struct pipe_framebuffer_state state; + unsigned colorbuf_enabled_4bit; + unsigned spi_shader_col_format; + unsigned spi_shader_col_format_alpha; + unsigned spi_shader_col_format_blend; + unsigned spi_shader_col_format_blend_alpha; + ubyte nr_samples : 5; /* at most 16xAA */ + ubyte log_samples : 3; /* at most 4 = 16xAA */ + ubyte nr_color_samples; /* at most 8xAA */ + ubyte compressed_cb_mask; + ubyte uncompressed_cb_mask; + ubyte displayable_dcc_cb_mask; + ubyte color_is_int8; + ubyte color_is_int10; + ubyte dirty_cbufs; + ubyte dcc_overwrite_combiner_watermark; + ubyte min_bytes_per_pixel; + bool dirty_zsbuf; + bool any_dst_linear; + bool CB_has_shader_readable_metadata; + bool DB_has_shader_readable_metadata; + bool all_DCC_pipe_aligned; }; -enum si_quant_mode { - /* This is the list we want to support. */ - SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH, - SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH, - SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH, +enum si_quant_mode +{ + /* This is the list we want to support. */ + SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH, + SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH, + SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH, }; struct si_signed_scissor { - int minx; - int miny; - int maxx; - int maxy; - enum si_quant_mode quant_mode; + int minx; + int miny; + int maxx; + int maxy; + enum si_quant_mode quant_mode; }; struct si_viewports { - struct pipe_viewport_state states[SI_MAX_VIEWPORTS]; - struct si_signed_scissor as_scissor[SI_MAX_VIEWPORTS]; - bool y_inverted; + struct pipe_viewport_state states[SI_MAX_VIEWPORTS]; + struct si_signed_scissor as_scissor[SI_MAX_VIEWPORTS]; + bool y_inverted; }; struct si_clip_state { - struct pipe_clip_state state; - bool any_nonzeros; + struct pipe_clip_state state; + bool any_nonzeros; }; struct si_streamout_target { - struct pipe_stream_output_target b; + struct pipe_stream_output_target b; - /* The buffer where BUFFER_FILLED_SIZE is stored. */ - struct si_resource *buf_filled_size; - unsigned buf_filled_size_offset; - bool buf_filled_size_valid; + /* The buffer where BUFFER_FILLED_SIZE is stored. 
*/ + struct si_resource *buf_filled_size; + unsigned buf_filled_size_offset; + bool buf_filled_size_valid; - unsigned stride_in_dw; + unsigned stride_in_dw; }; struct si_streamout { - bool begin_emitted; + bool begin_emitted; - unsigned enabled_mask; - unsigned num_targets; - struct si_streamout_target *targets[PIPE_MAX_SO_BUFFERS]; + unsigned enabled_mask; + unsigned num_targets; + struct si_streamout_target *targets[PIPE_MAX_SO_BUFFERS]; - unsigned append_bitmask; - bool suspended; + unsigned append_bitmask; + bool suspended; - /* External state which comes from the vertex shader, - * it must be set explicitly when binding a shader. */ - uint16_t *stride_in_dw; - unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */ + /* External state which comes from the vertex shader, + * it must be set explicitly when binding a shader. */ + uint16_t *stride_in_dw; + unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */ - /* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */ - unsigned hw_enabled_mask; + /* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */ + unsigned hw_enabled_mask; - /* The state of VGT_STRMOUT_(CONFIG|EN). */ - bool streamout_enabled; - bool prims_gen_query_enabled; - int num_prims_gen_queries; + /* The state of VGT_STRMOUT_(CONFIG|EN). */ + bool streamout_enabled; + bool prims_gen_query_enabled; + int num_prims_gen_queries; }; /* A shader state consists of the shader selector, which is a constant state @@ -773,494 +773,488 @@ struct si_streamout { * the current shader variant selected for this context. */ struct si_shader_ctx_state { - struct si_shader_selector *cso; - struct si_shader *current; + struct si_shader_selector *cso; + struct si_shader *current; }; #define SI_NUM_VGT_PARAM_KEY_BITS 12 -#define SI_NUM_VGT_PARAM_STATES (1 << SI_NUM_VGT_PARAM_KEY_BITS) +#define SI_NUM_VGT_PARAM_STATES (1 << SI_NUM_VGT_PARAM_KEY_BITS) /* The IA_MULTI_VGT_PARAM key used to index the table of precomputed values. * Some fields are set by state-change calls, most are set by draw_vbo. 
*/ union si_vgt_param_key { - struct { + struct { #if UTIL_ARCH_LITTLE_ENDIAN - unsigned prim:4; - unsigned uses_instancing:1; - unsigned multi_instances_smaller_than_primgroup:1; - unsigned primitive_restart:1; - unsigned count_from_stream_output:1; - unsigned line_stipple_enabled:1; - unsigned uses_tess:1; - unsigned tess_uses_prim_id:1; - unsigned uses_gs:1; - unsigned _pad:32 - SI_NUM_VGT_PARAM_KEY_BITS; + unsigned prim : 4; + unsigned uses_instancing : 1; + unsigned multi_instances_smaller_than_primgroup : 1; + unsigned primitive_restart : 1; + unsigned count_from_stream_output : 1; + unsigned line_stipple_enabled : 1; + unsigned uses_tess : 1; + unsigned tess_uses_prim_id : 1; + unsigned uses_gs : 1; + unsigned _pad : 32 - SI_NUM_VGT_PARAM_KEY_BITS; #else /* UTIL_ARCH_BIG_ENDIAN */ - unsigned _pad:32 - SI_NUM_VGT_PARAM_KEY_BITS; - unsigned uses_gs:1; - unsigned tess_uses_prim_id:1; - unsigned uses_tess:1; - unsigned line_stipple_enabled:1; - unsigned count_from_stream_output:1; - unsigned primitive_restart:1; - unsigned multi_instances_smaller_than_primgroup:1; - unsigned uses_instancing:1; - unsigned prim:4; + unsigned _pad : 32 - SI_NUM_VGT_PARAM_KEY_BITS; + unsigned uses_gs : 1; + unsigned tess_uses_prim_id : 1; + unsigned uses_tess : 1; + unsigned line_stipple_enabled : 1; + unsigned count_from_stream_output : 1; + unsigned primitive_restart : 1; + unsigned multi_instances_smaller_than_primgroup : 1; + unsigned uses_instancing : 1; + unsigned prim : 4; #endif - } u; - uint32_t index; + } u; + uint32_t index; }; #define SI_NUM_VGT_STAGES_KEY_BITS 6 -#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) +#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) /* The VGT_SHADER_STAGES key used to index the table of precomputed values. * Some fields are set by state-change calls, most are set by draw_vbo. 
*/ union si_vgt_stages_key { - struct { + struct { #if UTIL_ARCH_LITTLE_ENDIAN - unsigned tess:1; - unsigned gs:1; - unsigned ngg_gs_fast_launch:1; - unsigned ngg_passthrough:1; - unsigned ngg:1; /* gfx10+ */ - unsigned streamout:1; /* only used with NGG */ - unsigned _pad:32 - SI_NUM_VGT_STAGES_KEY_BITS; + unsigned tess : 1; + unsigned gs : 1; + unsigned ngg_gs_fast_launch : 1; + unsigned ngg_passthrough : 1; + unsigned ngg : 1; /* gfx10+ */ + unsigned streamout : 1; /* only used with NGG */ + unsigned _pad : 32 - SI_NUM_VGT_STAGES_KEY_BITS; #else /* UTIL_ARCH_BIG_ENDIAN */ - unsigned _pad:32 - SI_NUM_VGT_STAGES_KEY_BITS; - unsigned streamout:1; - unsigned ngg:1; - unsigned ngg_passthrough:1; - unsigned ngg_gs_fast_launch:1; - unsigned gs:1; - unsigned tess:1; + unsigned _pad : 32 - SI_NUM_VGT_STAGES_KEY_BITS; + unsigned streamout : 1; + unsigned ngg : 1; + unsigned ngg_passthrough : 1; + unsigned ngg_gs_fast_launch : 1; + unsigned gs : 1; + unsigned tess : 1; #endif - } u; - uint32_t index; + } u; + uint32_t index; }; -struct si_texture_handle -{ - unsigned desc_slot; - bool desc_dirty; - struct pipe_sampler_view *view; - struct si_sampler_state sstate; +struct si_texture_handle { + unsigned desc_slot; + bool desc_dirty; + struct pipe_sampler_view *view; + struct si_sampler_state sstate; }; -struct si_image_handle -{ - unsigned desc_slot; - bool desc_dirty; - struct pipe_image_view view; +struct si_image_handle { + unsigned desc_slot; + bool desc_dirty; + struct pipe_image_view view; }; struct si_saved_cs { - struct pipe_reference reference; - struct si_context *ctx; - struct radeon_saved_cs gfx; - struct radeon_saved_cs compute; - struct si_resource *trace_buf; - unsigned trace_id; - - unsigned gfx_last_dw; - unsigned compute_last_dw; - bool flushed; - int64_t time_flush; + struct pipe_reference reference; + struct si_context *ctx; + struct radeon_saved_cs gfx; + struct radeon_saved_cs compute; + struct si_resource *trace_buf; + unsigned trace_id; + + unsigned gfx_last_dw; + unsigned compute_last_dw; + bool flushed; + int64_t time_flush; }; struct si_sdma_upload { - struct si_resource *dst; - struct si_resource *src; - unsigned src_offset; - unsigned dst_offset; - unsigned size; + struct si_resource *dst; + struct si_resource *src; + unsigned src_offset; + unsigned dst_offset; + unsigned size; }; struct si_small_prim_cull_info { - float scale[2], translate[2]; + float scale[2], translate[2]; }; struct si_context { - struct pipe_context b; /* base class */ - - enum radeon_family family; - enum chip_class chip_class; - - struct radeon_winsys *ws; - struct radeon_winsys_ctx *ctx; - struct radeon_cmdbuf *gfx_cs; /* compute IB if graphics is disabled */ - struct radeon_cmdbuf *sdma_cs; - struct pipe_fence_handle *last_gfx_fence; - struct pipe_fence_handle *last_sdma_fence; - struct si_resource *eop_bug_scratch; - struct u_upload_mgr *cached_gtt_allocator; - struct threaded_context *tc; - struct u_suballocator *allocator_zeroed_memory; - struct slab_child_pool pool_transfers; - struct slab_child_pool pool_transfers_unsync; /* for threaded_context */ - struct pipe_device_reset_callback device_reset_callback; - struct u_log_context *log; - void *query_result_shader; - void *sh_query_result_shader; - - void (*emit_cache_flush)(struct si_context *ctx); - - struct blitter_context *blitter; - void *noop_blend; - void *noop_dsa; - void *discard_rasterizer_state; - void *custom_dsa_flush; - void *custom_blend_resolve; - void *custom_blend_fmask_decompress; - void *custom_blend_eliminate_fastclear; - 
void *custom_blend_dcc_decompress; - void *vs_blit_pos; - void *vs_blit_pos_layered; - void *vs_blit_color; - void *vs_blit_color_layered; - void *vs_blit_texcoord; - void *cs_clear_buffer; - void *cs_copy_buffer; - void *cs_copy_image; - void *cs_copy_image_1d_array; - void *cs_clear_render_target; - void *cs_clear_render_target_1d_array; - void *cs_clear_12bytes_buffer; - void *cs_dcc_retile; - void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */ - struct si_screen *screen; - struct pipe_debug_callback debug; - struct ac_llvm_compiler compiler; /* only non-threaded compilation */ - struct si_shader_ctx_state fixed_func_tcs_shader; - /* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */ - struct si_resource *wait_mem_scratch; - unsigned wait_mem_number; - uint16_t prefetch_L2_mask; - - bool has_graphics; - bool gfx_flush_in_progress:1; - bool gfx_last_ib_is_busy:1; - bool compute_is_busy:1; - - unsigned num_gfx_cs_flushes; - unsigned initial_gfx_cs_size; - unsigned last_dirty_tex_counter; - unsigned last_dirty_buf_counter; - unsigned last_compressed_colortex_counter; - unsigned last_num_draw_calls; - unsigned flags; /* flush flags */ - /* Current unaccounted memory usage. */ - uint64_t vram; - uint64_t gtt; - - /* Compute-based primitive discard. */ - unsigned prim_discard_vertex_count_threshold; - struct pb_buffer *gds; - struct pb_buffer *gds_oa; - struct radeon_cmdbuf *prim_discard_compute_cs; - unsigned compute_gds_offset; - struct si_shader *compute_ib_last_shader; - uint32_t compute_rewind_va; - unsigned compute_num_prims_in_batch; - bool preserve_prim_restart_gds_at_flush; - /* index_ring is divided into 2 halves for doublebuffering. */ - struct si_resource *index_ring; - unsigned index_ring_base; /* offset of a per-IB portion */ - unsigned index_ring_offset; /* offset within a per-IB portion */ - unsigned index_ring_size_per_ib; /* max available size per IB */ - bool prim_discard_compute_ib_initialized; - /* For tracking the last execution barrier - it can be either - * a WRITE_DATA packet or a fence. */ - uint32_t *last_pkt3_write_data; - struct si_resource *barrier_buf; - unsigned barrier_buf_offset; - struct pipe_fence_handle *last_ib_barrier_fence; - struct si_resource *last_ib_barrier_buf; - unsigned last_ib_barrier_buf_offset; - - /* Atoms (direct states). */ - union si_state_atoms atoms; - unsigned dirty_atoms; /* mask */ - /* PM4 states (precomputed immutable states) */ - unsigned dirty_states; - union si_state queued; - union si_state emitted; - - /* Atom declarations. */ - struct si_framebuffer framebuffer; - unsigned sample_locs_num_samples; - uint16_t sample_mask; - unsigned last_cb_target_mask; - struct si_blend_color blend_color; - struct si_clip_state clip_state; - struct si_shader_data shader_pointers; - struct si_stencil_ref stencil_ref; - struct pipe_scissor_state scissors[SI_MAX_VIEWPORTS]; - struct si_streamout streamout; - struct si_viewports viewports; - unsigned num_window_rectangles; - bool window_rectangles_include; - struct pipe_scissor_state window_rectangles[4]; - - /* Precomputed states. 
*/ - struct si_pm4_state *init_config; - struct si_pm4_state *init_config_gs_rings; - bool init_config_has_vgt_flush; - struct si_pm4_state *vgt_shader_config[SI_NUM_VGT_STAGES_STATES]; - - /* shaders */ - struct si_shader_ctx_state ps_shader; - struct si_shader_ctx_state gs_shader; - struct si_shader_ctx_state vs_shader; - struct si_shader_ctx_state tcs_shader; - struct si_shader_ctx_state tes_shader; - struct si_shader_ctx_state cs_prim_discard_state; - struct si_cs_shader_state cs_shader_state; - - /* shader information */ - struct si_vertex_elements *vertex_elements; - unsigned num_vertex_elements; - unsigned sprite_coord_enable; - unsigned cs_max_waves_per_sh; - bool flatshade; - bool do_update_shaders; - - /* shader descriptors */ - struct si_descriptors descriptors[SI_NUM_DESCS]; - unsigned descriptors_dirty; - unsigned shader_pointers_dirty; - unsigned shader_needs_decompress_mask; - struct si_buffer_resources rw_buffers; - struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS]; - struct si_samplers samplers[SI_NUM_SHADERS]; - struct si_images images[SI_NUM_SHADERS]; - bool bo_list_add_all_resident_resources; - bool bo_list_add_all_gfx_resources; - bool bo_list_add_all_compute_resources; - - /* other shader resources */ - struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on GFX7 */ - struct pipe_resource *esgs_ring; - struct pipe_resource *gsvs_ring; - struct pipe_resource *tess_rings; - union pipe_color_union *border_color_table; /* in CPU memory, any endian */ - struct si_resource *border_color_buffer; - union pipe_color_union *border_color_map; /* in VRAM (slow access), little endian */ - unsigned border_color_count; - unsigned num_vs_blit_sgprs; - uint32_t vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD]; - uint32_t cs_user_data[4]; - - /* Vertex buffers. */ - bool vertex_buffers_dirty; - bool vertex_buffer_pointer_dirty; - bool vertex_buffer_user_sgprs_dirty; - struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS]; - uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */ - uint32_t *vb_descriptors_gpu_list; - struct si_resource *vb_descriptors_buffer; - unsigned vb_descriptors_offset; - unsigned vb_descriptor_user_sgprs[5*4]; - - /* MSAA config state. */ - int ps_iter_samples; - bool ps_uses_fbfetch; - bool smoothing_enabled; - - /* DB render state. */ - unsigned ps_db_shader_control; - unsigned dbcb_copy_sample; - bool dbcb_depth_copy_enabled:1; - bool dbcb_stencil_copy_enabled:1; - bool db_flush_depth_inplace:1; - bool db_flush_stencil_inplace:1; - bool db_depth_clear:1; - bool db_depth_disable_expclear:1; - bool db_stencil_clear:1; - bool db_stencil_disable_expclear:1; - bool occlusion_queries_disabled:1; - bool generate_mipmap_for_depth:1; - - /* Emitted draw state. 
*/ - bool gs_tri_strip_adj_fix:1; - bool ls_vgpr_fix:1; - bool prim_discard_cs_instancing:1; - bool ngg:1; - uint8_t ngg_culling; - int last_index_size; - int last_base_vertex; - int last_start_instance; - int last_instance_count; - int last_drawid; - int last_sh_base_reg; - int last_primitive_restart_en; - int last_restart_index; - int last_prim; - int last_multi_vgt_param; - int last_gs_out_prim; - int last_binning_enabled; - unsigned current_vs_state; - unsigned last_vs_state; - enum pipe_prim_type current_rast_prim; /* primitive type after TES, GS */ - - struct si_small_prim_cull_info last_small_prim_cull_info; - struct si_resource *small_prim_cull_info_buf; - uint64_t small_prim_cull_info_address; - bool small_prim_cull_info_dirty; - - /* Scratch buffer */ - struct si_resource *scratch_buffer; - unsigned scratch_waves; - unsigned spi_tmpring_size; - unsigned max_seen_scratch_bytes_per_wave; - unsigned max_seen_compute_scratch_bytes_per_wave; - - struct si_resource *compute_scratch_buffer; - - /* Emitted derived tessellation state. */ - /* Local shader (VS), or HS if LS-HS are merged. */ - struct si_shader *last_ls; - struct si_shader_selector *last_tcs; - int last_num_tcs_input_cp; - int last_tes_sh_base; - bool last_tess_uses_primid; - unsigned last_num_patches; - int last_ls_hs_config; - - /* Debug state. */ - bool is_debug; - struct si_saved_cs *current_saved_cs; - uint64_t dmesg_timestamp; - unsigned apitrace_call_number; - - /* Other state */ - bool need_check_render_feedback; - bool decompression_enabled; - bool dpbb_force_off; - bool vs_writes_viewport_index; - bool vs_disables_clipping_viewport; - - /* Precomputed IA_MULTI_VGT_PARAM */ - union si_vgt_param_key ia_multi_vgt_param_key; - unsigned ia_multi_vgt_param[SI_NUM_VGT_PARAM_STATES]; - - /* Bindless descriptors. */ - struct si_descriptors bindless_descriptors; - struct util_idalloc bindless_used_slots; - unsigned num_bindless_descriptors; - bool bindless_descriptors_dirty; - bool graphics_bindless_pointer_dirty; - bool compute_bindless_pointer_dirty; - - /* Allocated bindless handles */ - struct hash_table *tex_handles; - struct hash_table *img_handles; - - /* Resident bindless handles */ - struct util_dynarray resident_tex_handles; - struct util_dynarray resident_img_handles; - - /* Resident bindless handles which need decompression */ - struct util_dynarray resident_tex_needs_color_decompress; - struct util_dynarray resident_img_needs_color_decompress; - struct util_dynarray resident_tex_needs_depth_decompress; - - /* Bindless state */ - bool uses_bindless_samplers; - bool uses_bindless_images; - - /* MSAA sample locations. - * The first index is the sample index. - * The second index is the coordinate: X, Y. */ - struct { - float x1[1][2]; - float x2[2][2]; - float x4[4][2]; - float x8[8][2]; - float x16[16][2]; - } sample_positions; - struct pipe_resource *sample_pos_buffer; - - /* Misc stats. 
*/ - unsigned num_draw_calls; - unsigned num_decompress_calls; - unsigned num_mrt_draw_calls; - unsigned num_prim_restart_calls; - unsigned num_spill_draw_calls; - unsigned num_compute_calls; - unsigned num_spill_compute_calls; - unsigned num_dma_calls; - unsigned num_cp_dma_calls; - unsigned num_vs_flushes; - unsigned num_ps_flushes; - unsigned num_cs_flushes; - unsigned num_cb_cache_flushes; - unsigned num_db_cache_flushes; - unsigned num_L2_invalidates; - unsigned num_L2_writebacks; - unsigned num_resident_handles; - uint64_t num_alloc_tex_transfer_bytes; - unsigned last_tex_ps_draw_ratio; /* for query */ - unsigned compute_num_verts_accepted; - unsigned compute_num_verts_rejected; - unsigned compute_num_verts_ineligible; /* due to low vertex count */ - unsigned context_roll; - - /* Queries. */ - /* Maintain the list of active queries for pausing between IBs. */ - int num_occlusion_queries; - int num_perfect_occlusion_queries; - int num_pipeline_stat_queries; - struct list_head active_queries; - unsigned num_cs_dw_queries_suspend; - - /* Render condition. */ - struct pipe_query *render_cond; - unsigned render_cond_mode; - bool render_cond_invert; - bool render_cond_force_off; /* for u_blitter */ - - /* For uploading data via GTT and copy to VRAM on context flush via SDMA. */ - bool sdma_uploads_in_progress; - struct si_sdma_upload *sdma_uploads; - unsigned num_sdma_uploads; - unsigned max_sdma_uploads; - - /* Shader-based queries. */ - struct list_head shader_query_buffers; - unsigned num_active_shader_queries; - - /* Statistics gathering for the DCC enablement heuristic. It can't be - * in si_texture because si_texture can be shared by multiple - * contexts. This is for back buffers only. We shouldn't get too many - * of those. - * - * X11 DRI3 rotates among a finite set of back buffers. They should - * all fit in this array. If they don't, separate DCC might never be - * enabled by DCC stat gathering. - */ - struct { - struct si_texture *tex; - /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */ - struct pipe_query *ps_stats[3]; - /* If all slots are used and another slot is needed, - * the least recently used slot is evicted based on this. */ - int64_t last_use_timestamp; - bool query_active; - } dcc_stats[5]; - - /* Copy one resource to another using async DMA. 
*/ - void (*dma_copy)(struct pipe_context *ctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dst_x, unsigned dst_y, unsigned dst_z, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box); - - struct si_tracked_regs tracked_regs; + struct pipe_context b; /* base class */ + + enum radeon_family family; + enum chip_class chip_class; + + struct radeon_winsys *ws; + struct radeon_winsys_ctx *ctx; + struct radeon_cmdbuf *gfx_cs; /* compute IB if graphics is disabled */ + struct radeon_cmdbuf *sdma_cs; + struct pipe_fence_handle *last_gfx_fence; + struct pipe_fence_handle *last_sdma_fence; + struct si_resource *eop_bug_scratch; + struct u_upload_mgr *cached_gtt_allocator; + struct threaded_context *tc; + struct u_suballocator *allocator_zeroed_memory; + struct slab_child_pool pool_transfers; + struct slab_child_pool pool_transfers_unsync; /* for threaded_context */ + struct pipe_device_reset_callback device_reset_callback; + struct u_log_context *log; + void *query_result_shader; + void *sh_query_result_shader; + + void (*emit_cache_flush)(struct si_context *ctx); + + struct blitter_context *blitter; + void *noop_blend; + void *noop_dsa; + void *discard_rasterizer_state; + void *custom_dsa_flush; + void *custom_blend_resolve; + void *custom_blend_fmask_decompress; + void *custom_blend_eliminate_fastclear; + void *custom_blend_dcc_decompress; + void *vs_blit_pos; + void *vs_blit_pos_layered; + void *vs_blit_color; + void *vs_blit_color_layered; + void *vs_blit_texcoord; + void *cs_clear_buffer; + void *cs_copy_buffer; + void *cs_copy_image; + void *cs_copy_image_1d_array; + void *cs_clear_render_target; + void *cs_clear_render_target_1d_array; + void *cs_clear_12bytes_buffer; + void *cs_dcc_retile; + void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */ + struct si_screen *screen; + struct pipe_debug_callback debug; + struct ac_llvm_compiler compiler; /* only non-threaded compilation */ + struct si_shader_ctx_state fixed_func_tcs_shader; + /* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */ + struct si_resource *wait_mem_scratch; + unsigned wait_mem_number; + uint16_t prefetch_L2_mask; + + bool has_graphics; + bool gfx_flush_in_progress : 1; + bool gfx_last_ib_is_busy : 1; + bool compute_is_busy : 1; + + unsigned num_gfx_cs_flushes; + unsigned initial_gfx_cs_size; + unsigned last_dirty_tex_counter; + unsigned last_dirty_buf_counter; + unsigned last_compressed_colortex_counter; + unsigned last_num_draw_calls; + unsigned flags; /* flush flags */ + /* Current unaccounted memory usage. */ + uint64_t vram; + uint64_t gtt; + + /* Compute-based primitive discard. */ + unsigned prim_discard_vertex_count_threshold; + struct pb_buffer *gds; + struct pb_buffer *gds_oa; + struct radeon_cmdbuf *prim_discard_compute_cs; + unsigned compute_gds_offset; + struct si_shader *compute_ib_last_shader; + uint32_t compute_rewind_va; + unsigned compute_num_prims_in_batch; + bool preserve_prim_restart_gds_at_flush; + /* index_ring is divided into 2 halves for doublebuffering. */ + struct si_resource *index_ring; + unsigned index_ring_base; /* offset of a per-IB portion */ + unsigned index_ring_offset; /* offset within a per-IB portion */ + unsigned index_ring_size_per_ib; /* max available size per IB */ + bool prim_discard_compute_ib_initialized; + /* For tracking the last execution barrier - it can be either + * a WRITE_DATA packet or a fence. 
*/ + uint32_t *last_pkt3_write_data; + struct si_resource *barrier_buf; + unsigned barrier_buf_offset; + struct pipe_fence_handle *last_ib_barrier_fence; + struct si_resource *last_ib_barrier_buf; + unsigned last_ib_barrier_buf_offset; + + /* Atoms (direct states). */ + union si_state_atoms atoms; + unsigned dirty_atoms; /* mask */ + /* PM4 states (precomputed immutable states) */ + unsigned dirty_states; + union si_state queued; + union si_state emitted; + + /* Atom declarations. */ + struct si_framebuffer framebuffer; + unsigned sample_locs_num_samples; + uint16_t sample_mask; + unsigned last_cb_target_mask; + struct si_blend_color blend_color; + struct si_clip_state clip_state; + struct si_shader_data shader_pointers; + struct si_stencil_ref stencil_ref; + struct pipe_scissor_state scissors[SI_MAX_VIEWPORTS]; + struct si_streamout streamout; + struct si_viewports viewports; + unsigned num_window_rectangles; + bool window_rectangles_include; + struct pipe_scissor_state window_rectangles[4]; + + /* Precomputed states. */ + struct si_pm4_state *init_config; + struct si_pm4_state *init_config_gs_rings; + bool init_config_has_vgt_flush; + struct si_pm4_state *vgt_shader_config[SI_NUM_VGT_STAGES_STATES]; + + /* shaders */ + struct si_shader_ctx_state ps_shader; + struct si_shader_ctx_state gs_shader; + struct si_shader_ctx_state vs_shader; + struct si_shader_ctx_state tcs_shader; + struct si_shader_ctx_state tes_shader; + struct si_shader_ctx_state cs_prim_discard_state; + struct si_cs_shader_state cs_shader_state; + + /* shader information */ + struct si_vertex_elements *vertex_elements; + unsigned num_vertex_elements; + unsigned sprite_coord_enable; + unsigned cs_max_waves_per_sh; + bool flatshade; + bool do_update_shaders; + + /* shader descriptors */ + struct si_descriptors descriptors[SI_NUM_DESCS]; + unsigned descriptors_dirty; + unsigned shader_pointers_dirty; + unsigned shader_needs_decompress_mask; + struct si_buffer_resources rw_buffers; + struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS]; + struct si_samplers samplers[SI_NUM_SHADERS]; + struct si_images images[SI_NUM_SHADERS]; + bool bo_list_add_all_resident_resources; + bool bo_list_add_all_gfx_resources; + bool bo_list_add_all_compute_resources; + + /* other shader resources */ + struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on GFX7 */ + struct pipe_resource *esgs_ring; + struct pipe_resource *gsvs_ring; + struct pipe_resource *tess_rings; + union pipe_color_union *border_color_table; /* in CPU memory, any endian */ + struct si_resource *border_color_buffer; + union pipe_color_union *border_color_map; /* in VRAM (slow access), little endian */ + unsigned border_color_count; + unsigned num_vs_blit_sgprs; + uint32_t vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD]; + uint32_t cs_user_data[4]; + + /* Vertex buffers. */ + bool vertex_buffers_dirty; + bool vertex_buffer_pointer_dirty; + bool vertex_buffer_user_sgprs_dirty; + struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS]; + uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */ + uint32_t *vb_descriptors_gpu_list; + struct si_resource *vb_descriptors_buffer; + unsigned vb_descriptors_offset; + unsigned vb_descriptor_user_sgprs[5 * 4]; + + /* MSAA config state. */ + int ps_iter_samples; + bool ps_uses_fbfetch; + bool smoothing_enabled; + + /* DB render state. 
*/ + unsigned ps_db_shader_control; + unsigned dbcb_copy_sample; + bool dbcb_depth_copy_enabled : 1; + bool dbcb_stencil_copy_enabled : 1; + bool db_flush_depth_inplace : 1; + bool db_flush_stencil_inplace : 1; + bool db_depth_clear : 1; + bool db_depth_disable_expclear : 1; + bool db_stencil_clear : 1; + bool db_stencil_disable_expclear : 1; + bool occlusion_queries_disabled : 1; + bool generate_mipmap_for_depth : 1; + + /* Emitted draw state. */ + bool gs_tri_strip_adj_fix : 1; + bool ls_vgpr_fix : 1; + bool prim_discard_cs_instancing : 1; + bool ngg : 1; + uint8_t ngg_culling; + int last_index_size; + int last_base_vertex; + int last_start_instance; + int last_instance_count; + int last_drawid; + int last_sh_base_reg; + int last_primitive_restart_en; + int last_restart_index; + int last_prim; + int last_multi_vgt_param; + int last_gs_out_prim; + int last_binning_enabled; + unsigned current_vs_state; + unsigned last_vs_state; + enum pipe_prim_type current_rast_prim; /* primitive type after TES, GS */ + + struct si_small_prim_cull_info last_small_prim_cull_info; + struct si_resource *small_prim_cull_info_buf; + uint64_t small_prim_cull_info_address; + bool small_prim_cull_info_dirty; + + /* Scratch buffer */ + struct si_resource *scratch_buffer; + unsigned scratch_waves; + unsigned spi_tmpring_size; + unsigned max_seen_scratch_bytes_per_wave; + unsigned max_seen_compute_scratch_bytes_per_wave; + + struct si_resource *compute_scratch_buffer; + + /* Emitted derived tessellation state. */ + /* Local shader (VS), or HS if LS-HS are merged. */ + struct si_shader *last_ls; + struct si_shader_selector *last_tcs; + int last_num_tcs_input_cp; + int last_tes_sh_base; + bool last_tess_uses_primid; + unsigned last_num_patches; + int last_ls_hs_config; + + /* Debug state. */ + bool is_debug; + struct si_saved_cs *current_saved_cs; + uint64_t dmesg_timestamp; + unsigned apitrace_call_number; + + /* Other state */ + bool need_check_render_feedback; + bool decompression_enabled; + bool dpbb_force_off; + bool vs_writes_viewport_index; + bool vs_disables_clipping_viewport; + + /* Precomputed IA_MULTI_VGT_PARAM */ + union si_vgt_param_key ia_multi_vgt_param_key; + unsigned ia_multi_vgt_param[SI_NUM_VGT_PARAM_STATES]; + + /* Bindless descriptors. */ + struct si_descriptors bindless_descriptors; + struct util_idalloc bindless_used_slots; + unsigned num_bindless_descriptors; + bool bindless_descriptors_dirty; + bool graphics_bindless_pointer_dirty; + bool compute_bindless_pointer_dirty; + + /* Allocated bindless handles */ + struct hash_table *tex_handles; + struct hash_table *img_handles; + + /* Resident bindless handles */ + struct util_dynarray resident_tex_handles; + struct util_dynarray resident_img_handles; + + /* Resident bindless handles which need decompression */ + struct util_dynarray resident_tex_needs_color_decompress; + struct util_dynarray resident_img_needs_color_decompress; + struct util_dynarray resident_tex_needs_depth_decompress; + + /* Bindless state */ + bool uses_bindless_samplers; + bool uses_bindless_images; + + /* MSAA sample locations. + * The first index is the sample index. + * The second index is the coordinate: X, Y. */ + struct { + float x1[1][2]; + float x2[2][2]; + float x4[4][2]; + float x8[8][2]; + float x16[16][2]; + } sample_positions; + struct pipe_resource *sample_pos_buffer; + + /* Misc stats. 
*/ + unsigned num_draw_calls; + unsigned num_decompress_calls; + unsigned num_mrt_draw_calls; + unsigned num_prim_restart_calls; + unsigned num_spill_draw_calls; + unsigned num_compute_calls; + unsigned num_spill_compute_calls; + unsigned num_dma_calls; + unsigned num_cp_dma_calls; + unsigned num_vs_flushes; + unsigned num_ps_flushes; + unsigned num_cs_flushes; + unsigned num_cb_cache_flushes; + unsigned num_db_cache_flushes; + unsigned num_L2_invalidates; + unsigned num_L2_writebacks; + unsigned num_resident_handles; + uint64_t num_alloc_tex_transfer_bytes; + unsigned last_tex_ps_draw_ratio; /* for query */ + unsigned compute_num_verts_accepted; + unsigned compute_num_verts_rejected; + unsigned compute_num_verts_ineligible; /* due to low vertex count */ + unsigned context_roll; + + /* Queries. */ + /* Maintain the list of active queries for pausing between IBs. */ + int num_occlusion_queries; + int num_perfect_occlusion_queries; + int num_pipeline_stat_queries; + struct list_head active_queries; + unsigned num_cs_dw_queries_suspend; + + /* Render condition. */ + struct pipe_query *render_cond; + unsigned render_cond_mode; + bool render_cond_invert; + bool render_cond_force_off; /* for u_blitter */ + + /* For uploading data via GTT and copy to VRAM on context flush via SDMA. */ + bool sdma_uploads_in_progress; + struct si_sdma_upload *sdma_uploads; + unsigned num_sdma_uploads; + unsigned max_sdma_uploads; + + /* Shader-based queries. */ + struct list_head shader_query_buffers; + unsigned num_active_shader_queries; + + /* Statistics gathering for the DCC enablement heuristic. It can't be + * in si_texture because si_texture can be shared by multiple + * contexts. This is for back buffers only. We shouldn't get too many + * of those. + * + * X11 DRI3 rotates among a finite set of back buffers. They should + * all fit in this array. If they don't, separate DCC might never be + * enabled by DCC stat gathering. + */ + struct { + struct si_texture *tex; + /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */ + struct pipe_query *ps_stats[3]; + /* If all slots are used and another slot is needed, + * the least recently used slot is evicted based on this. */ + int64_t last_use_timestamp; + bool query_active; + } dcc_stats[5]; + + /* Copy one resource to another using async DMA. 
*/ + void (*dma_copy)(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, + unsigned dst_x, unsigned dst_y, unsigned dst_z, struct pipe_resource *src, + unsigned src_level, const struct pipe_box *src_box); + + struct si_tracked_regs tracked_regs; }; /* cik_sdma.c */ @@ -1269,124 +1263,99 @@ void cik_init_sdma_functions(struct si_context *sctx); /* si_blit.c */ enum si_blitter_op /* bitmask */ { - SI_SAVE_TEXTURES = 1, - SI_SAVE_FRAMEBUFFER = 2, - SI_SAVE_FRAGMENT_STATE = 4, - SI_DISABLE_RENDER_COND = 8, + SI_SAVE_TEXTURES = 1, + SI_SAVE_FRAMEBUFFER = 2, + SI_SAVE_FRAGMENT_STATE = 4, + SI_DISABLE_RENDER_COND = 8, }; void si_blitter_begin(struct si_context *sctx, enum si_blitter_op op); void si_blitter_end(struct si_context *sctx); void si_init_blit_functions(struct si_context *sctx); void si_decompress_textures(struct si_context *sctx, unsigned shader_mask); -void si_decompress_subresource(struct pipe_context *ctx, - struct pipe_resource *tex, - unsigned planes, unsigned level, - unsigned first_layer, unsigned last_layer); -void si_resource_copy_region(struct pipe_context *ctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box); +void si_decompress_subresource(struct pipe_context *ctx, struct pipe_resource *tex, unsigned planes, + unsigned level, unsigned first_layer, unsigned last_layer); +void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst, + unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, unsigned src_level, + const struct pipe_box *src_box); void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex); /* si_buffer.c */ -bool si_rings_is_buffer_referenced(struct si_context *sctx, - struct pb_buffer *buf, - enum radeon_bo_usage usage); -void *si_buffer_map_sync_with_rings(struct si_context *sctx, - struct si_resource *resource, - unsigned usage); -void si_init_resource_fields(struct si_screen *sscreen, - struct si_resource *res, - uint64_t size, unsigned alignment); -bool si_alloc_resource(struct si_screen *sscreen, - struct si_resource *res); -struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, - unsigned flags, unsigned usage, - unsigned size, unsigned alignment); -struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, - unsigned flags, unsigned usage, - unsigned size, unsigned alignment); -void si_replace_buffer_storage(struct pipe_context *ctx, - struct pipe_resource *dst, - struct pipe_resource *src); +bool si_rings_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf, + enum radeon_bo_usage usage); +void *si_buffer_map_sync_with_rings(struct si_context *sctx, struct si_resource *resource, + unsigned usage); +void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res, uint64_t size, + unsigned alignment); +bool si_alloc_resource(struct si_screen *sscreen, struct si_resource *res); +struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, unsigned flags, + unsigned usage, unsigned size, unsigned alignment); +struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags, + unsigned usage, unsigned size, unsigned alignment); +void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst, + struct pipe_resource *src); void si_init_screen_buffer_functions(struct si_screen *sscreen); void 
si_init_buffer_functions(struct si_context *sctx); /* si_clear.c */ enum pipe_format si_simplify_cb_format(enum pipe_format format); bool vi_alpha_is_on_msb(struct si_screen *sscreen, enum pipe_format format); -bool vi_dcc_clear_level(struct si_context *sctx, - struct si_texture *tex, - unsigned level, unsigned clear_value); +bool vi_dcc_clear_level(struct si_context *sctx, struct si_texture *tex, unsigned level, + unsigned clear_value); void si_init_clear_functions(struct si_context *sctx); /* si_compute_blit.c */ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, - enum si_cache_policy cache_policy); -void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, - uint64_t offset, uint64_t size, uint32_t *clear_value, - uint32_t clear_value_size, enum si_coherency coher, - bool force_cpdma); -void si_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size); -void si_compute_copy_image(struct si_context *sctx, - struct pipe_resource *dst, - unsigned dst_level, - struct pipe_resource *src, - unsigned src_level, - unsigned dstx, unsigned dsty, unsigned dstz, - const struct pipe_box *src_box); -void si_compute_clear_render_target(struct pipe_context *ctx, - struct pipe_surface *dstsurf, - const union pipe_color_union *color, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, - bool render_condition_enabled); + enum si_cache_policy cache_policy); +void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, + uint64_t size, uint32_t *clear_value, uint32_t clear_value_size, + enum si_coherency coher, bool force_cpdma); +void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, + uint64_t dst_offset, uint64_t src_offset, unsigned size); +void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level, + struct pipe_resource *src, unsigned src_level, unsigned dstx, + unsigned dsty, unsigned dstz, const struct pipe_box *src_box); +void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf, + const union pipe_color_union *color, unsigned dstx, + unsigned dsty, unsigned width, unsigned height, + bool render_condition_enabled); void si_retile_dcc(struct si_context *sctx, struct si_texture *tex); void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex); void si_init_compute_blit_functions(struct si_context *sctx); /* si_cp_dma.c */ -#define SI_CPDMA_SKIP_CHECK_CS_SPACE (1 << 0) /* don't call need_cs_space */ -#define SI_CPDMA_SKIP_SYNC_AFTER (1 << 1) /* don't wait for DMA after the copy */ -#define SI_CPDMA_SKIP_SYNC_BEFORE (1 << 2) /* don't wait for DMA before the copy (RAW hazards) */ -#define SI_CPDMA_SKIP_GFX_SYNC (1 << 3) /* don't flush caches and don't wait for PS/CS */ -#define SI_CPDMA_SKIP_BO_LIST_UPDATE (1 << 4) /* don't update the BO list */ -#define SI_CPDMA_SKIP_ALL (SI_CPDMA_SKIP_CHECK_CS_SPACE | \ - SI_CPDMA_SKIP_SYNC_AFTER | \ - SI_CPDMA_SKIP_SYNC_BEFORE | \ - SI_CPDMA_SKIP_GFX_SYNC | \ - SI_CPDMA_SKIP_BO_LIST_UPDATE) +#define SI_CPDMA_SKIP_CHECK_CS_SPACE (1 << 0) /* don't call need_cs_space */ +#define SI_CPDMA_SKIP_SYNC_AFTER (1 << 1) /* don't wait for DMA after the copy */ +#define SI_CPDMA_SKIP_SYNC_BEFORE (1 << 2) /* don't wait for DMA before the copy (RAW hazards) */ +#define SI_CPDMA_SKIP_GFX_SYNC (1 << 3) /* don't flush caches and don't wait for PS/CS */ +#define 
SI_CPDMA_SKIP_BO_LIST_UPDATE (1 << 4) /* don't update the BO list */ +#define SI_CPDMA_SKIP_ALL \ + (SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_SYNC_AFTER | SI_CPDMA_SKIP_SYNC_BEFORE | \ + SI_CPDMA_SKIP_GFX_SYNC | SI_CPDMA_SKIP_BO_LIST_UPDATE) void si_cp_dma_wait_for_idle(struct si_context *sctx); void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, - struct pipe_resource *dst, uint64_t offset, - uint64_t size, unsigned value, unsigned user_flags, - enum si_coherency coher, enum si_cache_policy cache_policy); -void si_cp_dma_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size, - unsigned user_flags, enum si_coherency coher, - enum si_cache_policy cache_policy); -void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, - uint64_t offset, unsigned size); + struct pipe_resource *dst, uint64_t offset, uint64_t size, + unsigned value, unsigned user_flags, enum si_coherency coher, + enum si_cache_policy cache_policy); +void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, + struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, + unsigned size, unsigned user_flags, enum si_coherency coher, + enum si_cache_policy cache_policy); +void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset, + unsigned size); void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only); void si_test_gds(struct si_context *sctx); -void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, - unsigned offset, unsigned size, unsigned dst_sel, - unsigned engine, const void *data); -void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, - unsigned dst_sel, struct si_resource *dst, unsigned dst_offset, - unsigned src_sel, struct si_resource *src, unsigned src_offset); +void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset, + unsigned size, unsigned dst_sel, unsigned engine, const void *data); +void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned dst_sel, + struct si_resource *dst, unsigned dst_offset, unsigned src_sel, + struct si_resource *src, unsigned src_offset); /* si_debug.c */ -void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, - struct radeon_saved_cs *saved, bool get_buffer_list); +void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, struct radeon_saved_cs *saved, + bool get_buffer_list); void si_clear_saved_cs(struct radeon_saved_cs *saved); void si_destroy_saved_cs(struct si_saved_cs *scs); void si_auto_log_cs(void *data, struct u_log_context *log); @@ -1394,45 +1363,41 @@ void si_log_hw_flush(struct si_context *sctx); void si_log_draw_state(struct si_context *sctx, struct u_log_context *log); void si_log_compute_state(struct si_context *sctx, struct u_log_context *log); void si_init_debug_functions(struct si_context *sctx); -void si_check_vm_faults(struct si_context *sctx, - struct radeon_saved_cs *saved, enum ring_type ring); +void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, + enum ring_type ring); bool si_replace_shader(unsigned num, struct si_shader_binary *binary); /* si_dma_cs.c */ -void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, - uint64_t offset); -void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned clear_value); 
+void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset); +void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, + uint64_t size, unsigned clear_value); void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, - struct pipe_resource *src, uint64_t dst_offset, - uint64_t src_offset, uint64_t size); -void si_need_dma_space(struct si_context *ctx, unsigned num_dw, - struct si_resource *dst, struct si_resource *src); -void si_flush_dma_cs(struct si_context *ctx, unsigned flags, - struct pipe_fence_handle **fence); -void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned value); + struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, + uint64_t size); +void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst, + struct si_resource *src); +void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence); +void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset, + uint64_t size, unsigned value); /* si_fence.c */ -void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, - unsigned event, unsigned event_flags, - unsigned dst_sel, unsigned int_sel, unsigned data_sel, - struct si_resource *buf, uint64_t va, - uint32_t new_fence, unsigned query_type); +void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event, + unsigned event_flags, unsigned dst_sel, unsigned int_sel, unsigned data_sel, + struct si_resource *buf, uint64_t va, uint32_t new_fence, + unsigned query_type); unsigned si_cp_write_fence_dwords(struct si_screen *screen); -void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, - uint64_t va, uint32_t ref, uint32_t mask, unsigned flags); +void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, uint64_t va, uint32_t ref, + uint32_t mask, unsigned flags); void si_init_fence_functions(struct si_context *ctx); void si_init_screen_fence_functions(struct si_screen *screen); struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx, - struct tc_unflushed_batch_token *tc_token); + struct tc_unflushed_batch_token *tc_token); /* si_get.c */ void si_init_screen_get_functions(struct si_screen *sscreen); /* si_gfx_cs.c */ -void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, - struct pipe_fence_handle **fence); +void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence); void si_allocate_gds(struct si_context *ctx); void si_begin_new_gfx_cs(struct si_context *ctx); void si_need_gfx_cs_space(struct si_context *ctx); @@ -1441,36 +1406,32 @@ void si_unref_sdma_uploads(struct si_context *sctx); /* si_gpu_load.c */ void si_gpu_load_kill_thread(struct si_screen *sscreen); uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type); -unsigned si_end_counter(struct si_screen *sscreen, unsigned type, - uint64_t begin); +unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin); /* si_compute.c */ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs); void si_init_compute_functions(struct si_context *sctx); /* si_compute_prim_discard.c */ -enum si_prim_discard_outcome { - SI_PRIM_DISCARD_ENABLED, - SI_PRIM_DISCARD_DISABLED, - SI_PRIM_DISCARD_DRAW_SPLIT, +enum si_prim_discard_outcome +{ + SI_PRIM_DISCARD_ENABLED, + SI_PRIM_DISCARD_DISABLED, + 
SI_PRIM_DISCARD_DRAW_SPLIT, }; void si_build_prim_discard_compute_shader(struct si_shader_context *ctx); enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - bool primitive_restart); +si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, + bool primitive_restart); void si_compute_signal_gfx(struct si_context *sctx); void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned index_size, - unsigned base_vertex, - uint64_t input_indexbuf_va, - unsigned input_indexbuf_max_elements); -void si_initialize_prim_discard_tunables(struct si_screen *sscreen, - bool is_aux_context, - unsigned *prim_discard_vertex_count_threshold, - unsigned *index_ring_size_per_ib); + const struct pipe_draw_info *info, unsigned index_size, + unsigned base_vertex, uint64_t input_indexbuf_va, + unsigned input_indexbuf_max_elements); +void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, + unsigned *prim_discard_vertex_count_threshold, + unsigned *index_ring_size_per_ib); /* si_pipe.c */ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler); @@ -1487,19 +1448,17 @@ void si_resume_queries(struct si_context *sctx); /* si_shaderlib_tgsi.c */ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, - unsigned num_layers); + unsigned num_layers); void *si_create_fixed_func_tcs(struct si_context *sctx); -void *si_create_dma_compute_shader(struct pipe_context *ctx, - unsigned num_dwords_per_thread, - bool dst_stream_cache_policy, bool is_copy); +void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread, + bool dst_stream_cache_policy, bool is_copy); void *si_create_copy_image_compute_shader(struct pipe_context *ctx); void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx); void *si_clear_render_target_shader(struct pipe_context *ctx); void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx); void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx); void *si_create_dcc_retile_cs(struct pipe_context *ctx); -void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, - bool is_array); +void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, bool is_array); void *si_create_query_result_cs(struct si_context *sctx); void *gfx10_create_sh_query_result_cs(struct si_context *sctx); @@ -1515,370 +1474,317 @@ void si_test_dma_perf(struct si_screen *sscreen); /* si_uvd.c */ struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context, - const struct pipe_video_codec *templ); + const struct pipe_video_codec *templ); struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe, - const struct pipe_video_buffer *tmpl); + const struct pipe_video_buffer *tmpl); /* si_viewport.c */ void si_update_ngg_small_prim_precision(struct si_context *ctx); -void si_get_small_prim_cull_info(struct si_context *sctx, - struct si_small_prim_cull_info *out); +void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out); void si_update_vs_viewport_state(struct si_context *ctx); void si_init_viewport_functions(struct si_context *ctx); /* si_texture.c */ -bool si_prepare_for_dma_blit(struct si_context *sctx, - struct si_texture *dst, - unsigned dst_level, unsigned dstx, - unsigned dsty, unsigned dstz, - struct si_texture 
*src, - unsigned src_level, - const struct pipe_box *src_box); -void si_eliminate_fast_color_clear(struct si_context *sctx, - struct si_texture *tex); -void si_texture_discard_cmask(struct si_screen *sscreen, - struct si_texture *tex); -bool si_init_flushed_depth_texture(struct pipe_context *ctx, - struct pipe_resource *texture); -void si_print_texture_info(struct si_screen *sscreen, - struct si_texture *tex, struct u_log_context *log); +bool si_prepare_for_dma_blit(struct si_context *sctx, struct si_texture *dst, unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, struct si_texture *src, + unsigned src_level, const struct pipe_box *src_box); +void si_eliminate_fast_color_clear(struct si_context *sctx, struct si_texture *tex); +void si_texture_discard_cmask(struct si_screen *sscreen, struct si_texture *tex); +bool si_init_flushed_depth_texture(struct pipe_context *ctx, struct pipe_resource *texture); +void si_print_texture_info(struct si_screen *sscreen, struct si_texture *tex, + struct u_log_context *log); struct pipe_resource *si_texture_create(struct pipe_screen *screen, - const struct pipe_resource *templ); -bool vi_dcc_formats_compatible(struct si_screen *sscreen, - enum pipe_format format1, - enum pipe_format format2); -bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, - unsigned level, - enum pipe_format view_format); -void vi_disable_dcc_if_incompatible_format(struct si_context *sctx, - struct pipe_resource *tex, - unsigned level, - enum pipe_format view_format); + const struct pipe_resource *templ); +bool vi_dcc_formats_compatible(struct si_screen *sscreen, enum pipe_format format1, + enum pipe_format format2); +bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, unsigned level, + enum pipe_format view_format); +void vi_disable_dcc_if_incompatible_format(struct si_context *sctx, struct pipe_resource *tex, + unsigned level, enum pipe_format view_format); struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe, - struct pipe_resource *texture, - const struct pipe_surface *templ, - unsigned width0, unsigned height0, - unsigned width, unsigned height); + struct pipe_resource *texture, + const struct pipe_surface *templ, unsigned width0, + unsigned height0, unsigned width, unsigned height); unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap); -void vi_separate_dcc_try_enable(struct si_context *sctx, - struct si_texture *tex); -void vi_separate_dcc_start_query(struct si_context *sctx, - struct si_texture *tex); -void vi_separate_dcc_stop_query(struct si_context *sctx, - struct si_texture *tex); -void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, - struct si_texture *tex); -bool si_texture_disable_dcc(struct si_context *sctx, - struct si_texture *tex); +void vi_separate_dcc_try_enable(struct si_context *sctx, struct si_texture *tex); +void vi_separate_dcc_start_query(struct si_context *sctx, struct si_texture *tex); +void vi_separate_dcc_stop_query(struct si_context *sctx, struct si_texture *tex); +void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, struct si_texture *tex); +bool si_texture_disable_dcc(struct si_context *sctx, struct si_texture *tex); void si_init_screen_texture_functions(struct si_screen *sscreen); void si_init_context_texture_functions(struct si_context *sctx); - /* * common helpers */ static inline struct si_resource *si_resource(struct pipe_resource *r) { - return (struct si_resource*)r; + return (struct si_resource *)r; } -static inline void 
-si_resource_reference(struct si_resource **ptr, struct si_resource *res) +static inline void si_resource_reference(struct si_resource **ptr, struct si_resource *res) { - pipe_resource_reference((struct pipe_resource **)ptr, - (struct pipe_resource *)res); + pipe_resource_reference((struct pipe_resource **)ptr, (struct pipe_resource *)res); } -static inline void -si_texture_reference(struct si_texture **ptr, struct si_texture *res) +static inline void si_texture_reference(struct si_texture **ptr, struct si_texture *res) { - pipe_resource_reference((struct pipe_resource **)ptr, &res->buffer.b.b); + pipe_resource_reference((struct pipe_resource **)ptr, &res->buffer.b.b); } static inline void si_shader_selector_reference(struct si_context *sctx, /* sctx can optionally be NULL */ - struct si_shader_selector **dst, - struct si_shader_selector *src) + struct si_shader_selector **dst, struct si_shader_selector *src) { - if (*dst == src) - return; + if (*dst == src) + return; - struct si_screen *sscreen = src ? src->screen : (*dst)->screen; - util_shader_reference(&sctx->b, &sscreen->live_shader_cache, - (void**)dst, src); + struct si_screen *sscreen = src ? src->screen : (*dst)->screen; + util_shader_reference(&sctx->b, &sscreen->live_shader_cache, (void **)dst, src); } -static inline bool -vi_dcc_enabled(struct si_texture *tex, unsigned level) +static inline bool vi_dcc_enabled(struct si_texture *tex, unsigned level) { - return tex->surface.dcc_offset && level < tex->surface.num_dcc_levels; + return tex->surface.dcc_offset && level < tex->surface.num_dcc_levels; } -static inline unsigned -si_tile_mode_index(struct si_texture *tex, unsigned level, bool stencil) +static inline unsigned si_tile_mode_index(struct si_texture *tex, unsigned level, bool stencil) { - if (stencil) - return tex->surface.u.legacy.stencil_tiling_index[level]; - else - return tex->surface.u.legacy.tiling_index[level]; + if (stencil) + return tex->surface.u.legacy.stencil_tiling_index[level]; + else + return tex->surface.u.legacy.tiling_index[level]; } -static inline unsigned -si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx) +static inline unsigned si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx) { - /* Don't count the needed CS space exactly and just use an upper bound. - * - * Also reserve space for stopping queries at the end of IB, because - * the number of active queries is unlimited in theory. - */ - return 2048 + sctx->num_cs_dw_queries_suspend; + /* Don't count the needed CS space exactly and just use an upper bound. + * + * Also reserve space for stopping queries at the end of IB, because + * the number of active queries is unlimited in theory. 
+ */ + return 2048 + sctx->num_cs_dw_queries_suspend; } -static inline void -si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r) +static inline void si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r) { - if (r) { - /* Add memory usage for need_gfx_cs_space */ - sctx->vram += si_resource(r)->vram_usage; - sctx->gtt += si_resource(r)->gart_usage; - } + if (r) { + /* Add memory usage for need_gfx_cs_space */ + sctx->vram += si_resource(r)->vram_usage; + sctx->gtt += si_resource(r)->gart_usage; + } } -static inline void -si_invalidate_draw_sh_constants(struct si_context *sctx) +static inline void si_invalidate_draw_sh_constants(struct si_context *sctx) { - sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN; - sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN; + sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN; + sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN; } -static inline unsigned -si_get_atom_bit(struct si_context *sctx, struct si_atom *atom) +static inline unsigned si_get_atom_bit(struct si_context *sctx, struct si_atom *atom) { - return 1 << (atom - sctx->atoms.array); + return 1 << (atom - sctx->atoms.array); } -static inline void -si_set_atom_dirty(struct si_context *sctx, struct si_atom *atom, bool dirty) +static inline void si_set_atom_dirty(struct si_context *sctx, struct si_atom *atom, bool dirty) { - unsigned bit = si_get_atom_bit(sctx, atom); + unsigned bit = si_get_atom_bit(sctx, atom); - if (dirty) - sctx->dirty_atoms |= bit; - else - sctx->dirty_atoms &= ~bit; + if (dirty) + sctx->dirty_atoms |= bit; + else + sctx->dirty_atoms &= ~bit; } -static inline bool -si_is_atom_dirty(struct si_context *sctx, struct si_atom *atom) +static inline bool si_is_atom_dirty(struct si_context *sctx, struct si_atom *atom) { - return (sctx->dirty_atoms & si_get_atom_bit(sctx, atom)) != 0; + return (sctx->dirty_atoms & si_get_atom_bit(sctx, atom)) != 0; } -static inline void -si_mark_atom_dirty(struct si_context *sctx, struct si_atom *atom) +static inline void si_mark_atom_dirty(struct si_context *sctx, struct si_atom *atom) { - si_set_atom_dirty(sctx, atom, true); + si_set_atom_dirty(sctx, atom, true); } static inline struct si_shader_ctx_state *si_get_vs(struct si_context *sctx) { - if (sctx->gs_shader.cso) - return &sctx->gs_shader; - if (sctx->tes_shader.cso) - return &sctx->tes_shader; + if (sctx->gs_shader.cso) + return &sctx->gs_shader; + if (sctx->tes_shader.cso) + return &sctx->tes_shader; - return &sctx->vs_shader; + return &sctx->vs_shader; } static inline struct si_shader_info *si_get_vs_info(struct si_context *sctx) { - struct si_shader_ctx_state *vs = si_get_vs(sctx); + struct si_shader_ctx_state *vs = si_get_vs(sctx); - return vs->cso ? &vs->cso->info : NULL; + return vs->cso ? &vs->cso->info : NULL; } -static inline struct si_shader* si_get_vs_state(struct si_context *sctx) +static inline struct si_shader *si_get_vs_state(struct si_context *sctx) { - if (sctx->gs_shader.cso && - sctx->gs_shader.current && - !sctx->gs_shader.current->key.as_ngg) - return sctx->gs_shader.cso->gs_copy_shader; + if (sctx->gs_shader.cso && sctx->gs_shader.current && !sctx->gs_shader.current->key.as_ngg) + return sctx->gs_shader.cso->gs_copy_shader; - struct si_shader_ctx_state *vs = si_get_vs(sctx); - return vs->current ? vs->current : NULL; + struct si_shader_ctx_state *vs = si_get_vs(sctx); + return vs->current ? 
vs->current : NULL; } -static inline bool si_can_dump_shader(struct si_screen *sscreen, - unsigned processor) +static inline bool si_can_dump_shader(struct si_screen *sscreen, unsigned processor) { - return sscreen->debug_flags & (1 << processor); + return sscreen->debug_flags & (1 << processor); } static inline bool si_get_strmout_en(struct si_context *sctx) { - return sctx->streamout.streamout_enabled || - sctx->streamout.prims_gen_query_enabled; + return sctx->streamout.streamout_enabled || sctx->streamout.prims_gen_query_enabled; } -static inline unsigned -si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size) +static inline unsigned si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size) { - unsigned alignment, tcc_cache_line_size; - - /* If the upload size is less than the cache line size (e.g. 16, 32), - * the whole thing will fit into a cache line if we align it to its size. - * The idea is that multiple small uploads can share a cache line. - * If the upload size is greater, align it to the cache line size. - */ - alignment = util_next_power_of_two(upload_size); - tcc_cache_line_size = sctx->screen->info.tcc_cache_line_size; - return MIN2(alignment, tcc_cache_line_size); + unsigned alignment, tcc_cache_line_size; + + /* If the upload size is less than the cache line size (e.g. 16, 32), + * the whole thing will fit into a cache line if we align it to its size. + * The idea is that multiple small uploads can share a cache line. + * If the upload size is greater, align it to the cache line size. + */ + alignment = util_next_power_of_two(upload_size); + tcc_cache_line_size = sctx->screen->info.tcc_cache_line_size; + return MIN2(alignment, tcc_cache_line_size); } -static inline void -si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src) +static inline void si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src) { - if (pipe_reference(&(*dst)->reference, &src->reference)) - si_destroy_saved_cs(*dst); + if (pipe_reference(&(*dst)->reference, &src->reference)) + si_destroy_saved_cs(*dst); - *dst = src; + *dst = src; } -static inline void -si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples, - bool shaders_read_metadata, bool dcc_pipe_aligned) +static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples, + bool shaders_read_metadata, bool dcc_pipe_aligned) { - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_INV_VCACHE; - - if (sctx->chip_class >= GFX10) { - if (sctx->screen->info.tcc_harvested) - sctx->flags |= SI_CONTEXT_INV_L2; - else if (shaders_read_metadata) - sctx->flags |= SI_CONTEXT_INV_L2_METADATA; - } else if (sctx->chip_class == GFX9) { - /* Single-sample color is coherent with shaders on GFX9, but - * L2 metadata must be flushed if shaders read metadata. - * (DCC, CMASK). - */ - if (num_samples >= 2 || - (shaders_read_metadata && !dcc_pipe_aligned)) - sctx->flags |= SI_CONTEXT_INV_L2; - else if (shaders_read_metadata) - sctx->flags |= SI_CONTEXT_INV_L2_METADATA; - } else { - /* GFX6-GFX8 */ - sctx->flags |= SI_CONTEXT_INV_L2; - } + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_INV_VCACHE; + + if (sctx->chip_class >= GFX10) { + if (sctx->screen->info.tcc_harvested) + sctx->flags |= SI_CONTEXT_INV_L2; + else if (shaders_read_metadata) + sctx->flags |= SI_CONTEXT_INV_L2_METADATA; + } else if (sctx->chip_class == GFX9) { + /* Single-sample color is coherent with shaders on GFX9, but + * L2 metadata must be flushed if shaders read metadata. 
+ * (DCC, CMASK). + */ + if (num_samples >= 2 || (shaders_read_metadata && !dcc_pipe_aligned)) + sctx->flags |= SI_CONTEXT_INV_L2; + else if (shaders_read_metadata) + sctx->flags |= SI_CONTEXT_INV_L2_METADATA; + } else { + /* GFX6-GFX8 */ + sctx->flags |= SI_CONTEXT_INV_L2; + } } -static inline void -si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples, - bool include_stencil, bool shaders_read_metadata) +static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples, + bool include_stencil, bool shaders_read_metadata) { - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB | - SI_CONTEXT_INV_VCACHE; - - if (sctx->chip_class >= GFX10) { - if (sctx->screen->info.tcc_harvested) - sctx->flags |= SI_CONTEXT_INV_L2; - else if (shaders_read_metadata) - sctx->flags |= SI_CONTEXT_INV_L2_METADATA; - } else if (sctx->chip_class == GFX9) { - /* Single-sample depth (not stencil) is coherent with shaders - * on GFX9, but L2 metadata must be flushed if shaders read - * metadata. - */ - if (num_samples >= 2 || include_stencil) - sctx->flags |= SI_CONTEXT_INV_L2; - else if (shaders_read_metadata) - sctx->flags |= SI_CONTEXT_INV_L2_METADATA; - } else { - /* GFX6-GFX8 */ - sctx->flags |= SI_CONTEXT_INV_L2; - } + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_INV_VCACHE; + + if (sctx->chip_class >= GFX10) { + if (sctx->screen->info.tcc_harvested) + sctx->flags |= SI_CONTEXT_INV_L2; + else if (shaders_read_metadata) + sctx->flags |= SI_CONTEXT_INV_L2_METADATA; + } else if (sctx->chip_class == GFX9) { + /* Single-sample depth (not stencil) is coherent with shaders + * on GFX9, but L2 metadata must be flushed if shaders read + * metadata. + */ + if (num_samples >= 2 || include_stencil) + sctx->flags |= SI_CONTEXT_INV_L2; + else if (shaders_read_metadata) + sctx->flags |= SI_CONTEXT_INV_L2_METADATA; + } else { + /* GFX6-GFX8 */ + sctx->flags |= SI_CONTEXT_INV_L2; + } } -static inline bool -si_can_sample_zs(struct si_texture *tex, bool stencil_sampler) +static inline bool si_can_sample_zs(struct si_texture *tex, bool stencil_sampler) { - return (stencil_sampler && tex->can_sample_s) || - (!stencil_sampler && tex->can_sample_z); + return (stencil_sampler && tex->can_sample_s) || (!stencil_sampler && tex->can_sample_z); } -static inline bool -si_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask) +static inline bool si_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask) { - if (zs_mask == PIPE_MASK_S && tex->htile_stencil_disabled) - return false; + if (zs_mask == PIPE_MASK_S && tex->htile_stencil_disabled) + return false; - return tex->surface.htile_offset && level == 0; + return tex->surface.htile_offset && level == 0; } -static inline bool -vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask) +static inline bool vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level, + unsigned zs_mask) { - assert(!tex->tc_compatible_htile || tex->surface.htile_offset); - return tex->tc_compatible_htile && si_htile_enabled(tex, level, zs_mask); + assert(!tex->tc_compatible_htile || tex->surface.htile_offset); + return tex->tc_compatible_htile && si_htile_enabled(tex, level, zs_mask); } static inline unsigned si_get_ps_iter_samples(struct si_context *sctx) { - if (sctx->ps_uses_fbfetch) - return sctx->framebuffer.nr_color_samples; + if (sctx->ps_uses_fbfetch) + return sctx->framebuffer.nr_color_samples; - return MIN2(sctx->ps_iter_samples, sctx->framebuffer.nr_color_samples); + return 
MIN2(sctx->ps_iter_samples, sctx->framebuffer.nr_color_samples); } static inline unsigned si_get_total_colormask(struct si_context *sctx) { - if (sctx->queued.named.rasterizer->rasterizer_discard) - return 0; + if (sctx->queued.named.rasterizer->rasterizer_discard) + return 0; - struct si_shader_selector *ps = sctx->ps_shader.cso; - if (!ps) - return 0; + struct si_shader_selector *ps = sctx->ps_shader.cso; + if (!ps) + return 0; - unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit & - sctx->queued.named.blend->cb_target_mask; + unsigned colormask = + sctx->framebuffer.colorbuf_enabled_4bit & sctx->queued.named.blend->cb_target_mask; - if (!ps->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) - colormask &= ps->colors_written_4bit; - else if (!ps->colors_written_4bit) - colormask = 0; /* color0 writes all cbufs, but it's not written */ + if (!ps->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) + colormask &= ps->colors_written_4bit; + else if (!ps->colors_written_4bit) + colormask = 0; /* color0 writes all cbufs, but it's not written */ - return colormask; + return colormask; } -#define UTIL_ALL_PRIM_LINE_MODES ((1 << PIPE_PRIM_LINES) | \ - (1 << PIPE_PRIM_LINE_LOOP) | \ - (1 << PIPE_PRIM_LINE_STRIP) | \ - (1 << PIPE_PRIM_LINES_ADJACENCY) | \ - (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY)) +#define UTIL_ALL_PRIM_LINE_MODES \ + ((1 << PIPE_PRIM_LINES) | (1 << PIPE_PRIM_LINE_LOOP) | (1 << PIPE_PRIM_LINE_STRIP) | \ + (1 << PIPE_PRIM_LINES_ADJACENCY) | (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY)) static inline bool util_prim_is_lines(unsigned prim) { - return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0; + return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0; } static inline bool util_prim_is_points_or_lines(unsigned prim) { - return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES | - (1 << PIPE_PRIM_POINTS))) != 0; + return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES | (1 << PIPE_PRIM_POINTS))) != 0; } static inline bool util_rast_prim_is_triangles(unsigned prim) { - return ((1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | - (1 << PIPE_PRIM_TRIANGLE_STRIP) | - (1 << PIPE_PRIM_TRIANGLE_FAN) | - (1 << PIPE_PRIM_QUADS) | - (1 << PIPE_PRIM_QUAD_STRIP) | - (1 << PIPE_PRIM_POLYGON) | - (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | - (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY))); + return ((1 << prim) & + ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | + (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) | + (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | + (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY))); } /** @@ -1888,20 +1794,18 @@ static inline bool util_rast_prim_is_triangles(unsigned prim) * \param vram VRAM memory size not added to the buffer list yet * \param gtt GTT memory size not added to the buffer list yet */ -static inline bool -radeon_cs_memory_below_limit(struct si_screen *screen, - struct radeon_cmdbuf *cs, - uint64_t vram, uint64_t gtt) +static inline bool radeon_cs_memory_below_limit(struct si_screen *screen, struct radeon_cmdbuf *cs, + uint64_t vram, uint64_t gtt) { - vram += cs->used_vram; - gtt += cs->used_gart; + vram += cs->used_vram; + gtt += cs->used_gart; - /* Anything that goes above the VRAM size should go to GTT. */ - if (vram > screen->info.vram_size) - gtt += vram - screen->info.vram_size; + /* Anything that goes above the VRAM size should go to GTT. */ + if (vram > screen->info.vram_size) + gtt += vram - screen->info.vram_size; - /* Now we just need to check if we have enough GTT. 
*/ - return gtt < screen->info.gart_size * 0.7; + /* Now we just need to check if we have enough GTT. */ + return gtt < screen->info.gart_size * 0.7; } /** @@ -1914,17 +1818,13 @@ radeon_cs_memory_below_limit(struct si_screen *screen, * The buffer list becomes empty after every context flush and must be * rebuilt. */ -static inline void radeon_add_to_buffer_list(struct si_context *sctx, - struct radeon_cmdbuf *cs, - struct si_resource *bo, - enum radeon_bo_usage usage, - enum radeon_bo_priority priority) +static inline void radeon_add_to_buffer_list(struct si_context *sctx, struct radeon_cmdbuf *cs, + struct si_resource *bo, enum radeon_bo_usage usage, + enum radeon_bo_priority priority) { - assert(usage); - sctx->ws->cs_add_buffer( - cs, bo->buf, - (enum radeon_bo_usage)(usage | RADEON_USAGE_SYNCHRONIZED), - bo->domains, priority); + assert(usage); + sctx->ws->cs_add_buffer(cs, bo->buf, (enum radeon_bo_usage)(usage | RADEON_USAGE_SYNCHRONIZED), + bo->domains, priority); } /** @@ -1944,52 +1844,49 @@ static inline void radeon_add_to_buffer_list(struct si_context *sctx, * - if shader resource "enabled_mask" is not up-to-date or there is * a different constraint disallowing a context flush */ -static inline void -radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx, - struct si_resource *bo, - enum radeon_bo_usage usage, - enum radeon_bo_priority priority, - bool check_mem) +static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx, + struct si_resource *bo, + enum radeon_bo_usage usage, + enum radeon_bo_priority priority, + bool check_mem) { - if (check_mem && - !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs, - sctx->vram + bo->vram_usage, - sctx->gtt + bo->gart_usage)) - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + if (check_mem && + !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs, sctx->vram + bo->vram_usage, + sctx->gtt + bo->gart_usage)) + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority); } static inline bool si_compute_prim_discard_enabled(struct si_context *sctx) { - return sctx->prim_discard_vertex_count_threshold != UINT_MAX; + return sctx->prim_discard_vertex_count_threshold != UINT_MAX; } static inline unsigned si_get_wave_size(struct si_screen *sscreen, - enum pipe_shader_type shader_type, - bool ngg, bool es, bool prim_discard_cs) + enum pipe_shader_type shader_type, bool ngg, bool es, + bool prim_discard_cs) { - if (shader_type == PIPE_SHADER_COMPUTE) - return sscreen->compute_wave_size; - else if (shader_type == PIPE_SHADER_FRAGMENT) - return sscreen->ps_wave_size; - else if ((shader_type == PIPE_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */ - (shader_type == PIPE_SHADER_VERTEX && es && !ngg) || - (shader_type == PIPE_SHADER_TESS_EVAL && es && !ngg) || - (shader_type == PIPE_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */ - return 64; - else - return sscreen->ge_wave_size; + if (shader_type == PIPE_SHADER_COMPUTE) + return sscreen->compute_wave_size; + else if (shader_type == PIPE_SHADER_FRAGMENT) + return sscreen->ps_wave_size; + else if ((shader_type == PIPE_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */ + (shader_type == PIPE_SHADER_VERTEX && es && !ngg) || + (shader_type == PIPE_SHADER_TESS_EVAL && es && !ngg) || + (shader_type == PIPE_SHADER_GEOMETRY && !ngg)) /* legacy GS only 
supports Wave64 */ + return 64; + else + return sscreen->ge_wave_size; } static inline unsigned si_get_shader_wave_size(struct si_shader *shader) { - return si_get_wave_size(shader->selector->screen, shader->selector->type, - shader->key.as_ngg, shader->key.as_es, - shader->key.opt.vs_as_prim_discard_cs); + return si_get_wave_size(shader->selector->screen, shader->selector->type, shader->key.as_ngg, + shader->key.as_es, shader->key.opt.vs_as_prim_discard_cs); } -#define PRINT_ERR(fmt, args...) \ - fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) +#define PRINT_ERR(fmt, args...) \ + fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) #endif diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c index 0b7d53e745d..9b63ba69973 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.c +++ b/src/gallium/drivers/radeonsi/si_pm4.c @@ -22,170 +22,159 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "util/u_memory.h" #include "si_pipe.h" #include "sid.h" +#include "util/u_memory.h" void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode) { - state->last_opcode = opcode; - state->last_pm4 = state->ndw++; + state->last_opcode = opcode; + state->last_pm4 = state->ndw++; } void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw) { - state->pm4[state->ndw++] = dw; + state->pm4[state->ndw++] = dw; } void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate) { - unsigned count; - count = state->ndw - state->last_pm4 - 2; - state->pm4[state->last_pm4] = - PKT3(state->last_opcode, count, predicate); + unsigned count; + count = state->ndw - state->last_pm4 - 2; + state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate); - assert(state->ndw <= SI_PM4_MAX_DW); + assert(state->ndw <= SI_PM4_MAX_DW); } void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val) { - unsigned opcode; + unsigned opcode; - if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) { - opcode = PKT3_SET_CONFIG_REG; - reg -= SI_CONFIG_REG_OFFSET; + if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) { + opcode = PKT3_SET_CONFIG_REG; + reg -= SI_CONFIG_REG_OFFSET; - } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) { - opcode = PKT3_SET_SH_REG; - reg -= SI_SH_REG_OFFSET; + } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) { + opcode = PKT3_SET_SH_REG; + reg -= SI_SH_REG_OFFSET; - } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) { - opcode = PKT3_SET_CONTEXT_REG; - reg -= SI_CONTEXT_REG_OFFSET; + } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) { + opcode = PKT3_SET_CONTEXT_REG; + reg -= SI_CONTEXT_REG_OFFSET; - } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) { - opcode = PKT3_SET_UCONFIG_REG; - reg -= CIK_UCONFIG_REG_OFFSET; + } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) { + opcode = PKT3_SET_UCONFIG_REG; + reg -= CIK_UCONFIG_REG_OFFSET; - } else { - PRINT_ERR("Invalid register offset %08x!\n", reg); - return; - } + } else { + PRINT_ERR("Invalid register offset %08x!\n", reg); + return; + } - reg >>= 2; + reg >>= 2; - if (opcode != state->last_opcode || reg != (state->last_reg + 1)) { - si_pm4_cmd_begin(state, opcode); - si_pm4_cmd_add(state, reg); - } + if (opcode != state->last_opcode || reg != (state->last_reg + 1)) { + si_pm4_cmd_begin(state, opcode); + si_pm4_cmd_add(state, reg); + } - state->last_reg = reg; - si_pm4_cmd_add(state, val); - si_pm4_cmd_end(state, false); + 
state->last_reg = reg; + si_pm4_cmd_add(state, val); + si_pm4_cmd_end(state, false); } -void si_pm4_add_bo(struct si_pm4_state *state, - struct si_resource *bo, - enum radeon_bo_usage usage, - enum radeon_bo_priority priority) +void si_pm4_add_bo(struct si_pm4_state *state, struct si_resource *bo, enum radeon_bo_usage usage, + enum radeon_bo_priority priority) { - unsigned idx = state->nbo++; - assert(idx < SI_PM4_MAX_BO); + unsigned idx = state->nbo++; + assert(idx < SI_PM4_MAX_BO); - si_resource_reference(&state->bo[idx], bo); - state->bo_usage[idx] = usage; - state->bo_priority[idx] = priority; + si_resource_reference(&state->bo[idx], bo); + state->bo_usage[idx] = usage; + state->bo_priority[idx] = priority; } void si_pm4_clear_state(struct si_pm4_state *state) { - for (int i = 0; i < state->nbo; ++i) - si_resource_reference(&state->bo[i], NULL); - si_resource_reference(&state->indirect_buffer, NULL); - state->nbo = 0; - state->ndw = 0; + for (int i = 0; i < state->nbo; ++i) + si_resource_reference(&state->bo[i], NULL); + si_resource_reference(&state->indirect_buffer, NULL); + state->nbo = 0; + state->ndw = 0; } -void si_pm4_free_state(struct si_context *sctx, - struct si_pm4_state *state, - unsigned idx) +void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx) { - if (!state) - return; + if (!state) + return; - if (idx != ~0 && sctx->emitted.array[idx] == state) { - sctx->emitted.array[idx] = NULL; - } + if (idx != ~0 && sctx->emitted.array[idx] == state) { + sctx->emitted.array[idx] = NULL; + } - si_pm4_clear_state(state); - FREE(state); + si_pm4_clear_state(state); + FREE(state); } void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - for (int i = 0; i < state->nbo; ++i) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, state->bo[i], - state->bo_usage[i], state->bo_priority[i]); - } - - if (!state->indirect_buffer) { - radeon_emit_array(cs, state->pm4, state->ndw); - } else { - struct si_resource *ib = state->indirect_buffer; - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, ib, - RADEON_USAGE_READ, - RADEON_PRIO_IB2); - - radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0)); - radeon_emit(cs, ib->gpu_address); - radeon_emit(cs, ib->gpu_address >> 32); - radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff); - } - - if (state->atom.emit) - state->atom.emit(sctx); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + for (int i = 0; i < state->nbo; ++i) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, state->bo[i], state->bo_usage[i], + state->bo_priority[i]); + } + + if (!state->indirect_buffer) { + radeon_emit_array(cs, state->pm4, state->ndw); + } else { + struct si_resource *ib = state->indirect_buffer; + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, ib, RADEON_USAGE_READ, RADEON_PRIO_IB2); + + radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0)); + radeon_emit(cs, ib->gpu_address); + radeon_emit(cs, ib->gpu_address >> 32); + radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff); + } + + if (state->atom.emit) + state->atom.emit(sctx); } void si_pm4_reset_emitted(struct si_context *sctx) { - memset(&sctx->emitted, 0, sizeof(sctx->emitted)); - sctx->dirty_states |= u_bit_consecutive(0, SI_NUM_STATES); + memset(&sctx->emitted, 0, sizeof(sctx->emitted)); + sctx->dirty_states |= u_bit_consecutive(0, SI_NUM_STATES); } -void si_pm4_upload_indirect_buffer(struct si_context *sctx, - struct si_pm4_state *state) +void si_pm4_upload_indirect_buffer(struct si_context *sctx, struct si_pm4_state *state) { - 
struct pipe_screen *screen = sctx->b.screen; - unsigned aligned_ndw = align(state->ndw, 8); - - /* only supported on GFX7 and later */ - if (sctx->chip_class < GFX7) - return; - - assert(state->ndw); - assert(aligned_ndw <= SI_PM4_MAX_DW); - - si_resource_reference(&state->indirect_buffer, NULL); - /* TODO: this hangs with 1024 or higher alignment on GFX9. */ - state->indirect_buffer = - si_aligned_buffer_create(screen, 0, - PIPE_USAGE_DEFAULT, aligned_ndw * 4, - 256); - if (!state->indirect_buffer) - return; - - /* Pad the IB to 8 DWs to meet CP fetch alignment requirements. */ - if (sctx->screen->info.gfx_ib_pad_with_type2) { - for (int i = state->ndw; i < aligned_ndw; i++) - state->pm4[i] = 0x80000000; /* type2 nop packet */ - } else { - for (int i = state->ndw; i < aligned_ndw; i++) - state->pm4[i] = 0xffff1000; /* type3 nop packet */ - } - - pipe_buffer_write(&sctx->b, &state->indirect_buffer->b.b, - 0, aligned_ndw *4, state->pm4); + struct pipe_screen *screen = sctx->b.screen; + unsigned aligned_ndw = align(state->ndw, 8); + + /* only supported on GFX7 and later */ + if (sctx->chip_class < GFX7) + return; + + assert(state->ndw); + assert(aligned_ndw <= SI_PM4_MAX_DW); + + si_resource_reference(&state->indirect_buffer, NULL); + /* TODO: this hangs with 1024 or higher alignment on GFX9. */ + state->indirect_buffer = + si_aligned_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, aligned_ndw * 4, 256); + if (!state->indirect_buffer) + return; + + /* Pad the IB to 8 DWs to meet CP fetch alignment requirements. */ + if (sctx->screen->info.gfx_ib_pad_with_type2) { + for (int i = state->ndw; i < aligned_ndw; i++) + state->pm4[i] = 0x80000000; /* type2 nop packet */ + } else { + for (int i = state->ndw; i < aligned_ndw; i++) + state->pm4[i] = 0xffff1000; /* type3 nop packet */ + } + + pipe_buffer_write(&sctx->b, &state->indirect_buffer->b.b, 0, aligned_ndw * 4, state->pm4); } diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h index c91a90bc638..783833e5a42 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.h +++ b/src/gallium/drivers/radeonsi/si_pm4.h @@ -27,8 +27,8 @@ #include "radeon/radeon_winsys.h" -#define SI_PM4_MAX_DW 176 -#define SI_PM4_MAX_BO 3 +#define SI_PM4_MAX_DW 176 +#define SI_PM4_MAX_BO 3 // forward defines struct si_context; @@ -37,32 +37,31 @@ struct si_context; * command buffer (AKA indirect buffer, AKA IB, AKA command stream, AKA CS). 
*/ struct si_atom { - void (*emit)(struct si_context *ctx); + void (*emit)(struct si_context *ctx); }; -struct si_pm4_state -{ - /* optional indirect buffer */ - struct si_resource *indirect_buffer; +struct si_pm4_state { + /* optional indirect buffer */ + struct si_resource *indirect_buffer; - /* PKT3_SET_*_REG handling */ - unsigned last_opcode; - unsigned last_reg; - unsigned last_pm4; + /* PKT3_SET_*_REG handling */ + unsigned last_opcode; + unsigned last_reg; + unsigned last_pm4; - /* commands for the DE */ - unsigned ndw; - uint32_t pm4[SI_PM4_MAX_DW]; + /* commands for the DE */ + unsigned ndw; + uint32_t pm4[SI_PM4_MAX_DW]; - /* BO's referenced by this state */ - unsigned nbo; - struct si_resource *bo[SI_PM4_MAX_BO]; - enum radeon_bo_usage bo_usage[SI_PM4_MAX_BO]; - enum radeon_bo_priority bo_priority[SI_PM4_MAX_BO]; + /* BO's referenced by this state */ + unsigned nbo; + struct si_resource *bo[SI_PM4_MAX_BO]; + enum radeon_bo_usage bo_usage[SI_PM4_MAX_BO]; + enum radeon_bo_priority bo_priority[SI_PM4_MAX_BO]; - /* For shader states only */ - struct si_shader *shader; - struct si_atom atom; + /* For shader states only */ + struct si_shader *shader; + struct si_atom atom; }; void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode); @@ -70,17 +69,12 @@ void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw); void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate); void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val); -void si_pm4_add_bo(struct si_pm4_state *state, - struct si_resource *bo, - enum radeon_bo_usage usage, - enum radeon_bo_priority priority); -void si_pm4_upload_indirect_buffer(struct si_context *sctx, - struct si_pm4_state *state); +void si_pm4_add_bo(struct si_pm4_state *state, struct si_resource *bo, enum radeon_bo_usage usage, + enum radeon_bo_priority priority); +void si_pm4_upload_indirect_buffer(struct si_context *sctx, struct si_pm4_state *state); void si_pm4_clear_state(struct si_pm4_state *state); -void si_pm4_free_state(struct si_context *sctx, - struct si_pm4_state *state, - unsigned idx); +void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx); void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state); void si_pm4_reset_emitted(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index bf80862e095..6ad293301cb 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -24,1368 +24,1312 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_pipe.h" #include "si_query.h" -#include "util/u_memory.h" -#include "util/u_upload_mgr.h" + +#include "amd/common/sid.h" +#include "si_pipe.h" #include "util/os_time.h" +#include "util/u_memory.h" #include "util/u_suballoc.h" -#include "amd/common/sid.h" +#include "util/u_upload_mgr.h" static const struct si_query_ops query_hw_ops; struct si_hw_query_params { - unsigned start_offset; - unsigned end_offset; - unsigned fence_offset; - unsigned pair_stride; - unsigned pair_count; + unsigned start_offset; + unsigned end_offset; + unsigned fence_offset; + unsigned pair_stride; + unsigned pair_count; }; /* Queries without buffer handling or suspend/resume. 
*/ struct si_query_sw { - struct si_query b; + struct si_query b; - uint64_t begin_result; - uint64_t end_result; + uint64_t begin_result; + uint64_t end_result; - uint64_t begin_time; - uint64_t end_time; + uint64_t begin_time; + uint64_t end_time; - /* Fence for GPU_FINISHED. */ - struct pipe_fence_handle *fence; + /* Fence for GPU_FINISHED. */ + struct pipe_fence_handle *fence; }; -static void si_query_sw_destroy(struct si_context *sctx, - struct si_query *squery) +static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery) { - struct si_query_sw *query = (struct si_query_sw *)squery; + struct si_query_sw *query = (struct si_query_sw *)squery; - sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL); - FREE(query); + sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL); + FREE(query); } static enum radeon_value_id winsys_id_from_type(unsigned type) { - switch (type) { - case SI_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY; - case SI_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY; - case SI_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM; - case SI_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT; - case SI_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS; - case SI_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS; - case SI_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS; - case SI_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS; - case SI_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER; - case SI_QUERY_GFX_IB_SIZE: return RADEON_GFX_IB_SIZE_COUNTER; - case SI_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED; - case SI_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS; - case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS; - case SI_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE; - case SI_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE; - case SI_QUERY_GTT_USAGE: return RADEON_GTT_USAGE; - case SI_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE; - case SI_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK; - case SI_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK; - case SI_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME; - default: unreachable("query type does not correspond to winsys id"); - } + switch (type) { + case SI_QUERY_REQUESTED_VRAM: + return RADEON_REQUESTED_VRAM_MEMORY; + case SI_QUERY_REQUESTED_GTT: + return RADEON_REQUESTED_GTT_MEMORY; + case SI_QUERY_MAPPED_VRAM: + return RADEON_MAPPED_VRAM; + case SI_QUERY_MAPPED_GTT: + return RADEON_MAPPED_GTT; + case SI_QUERY_BUFFER_WAIT_TIME: + return RADEON_BUFFER_WAIT_TIME_NS; + case SI_QUERY_NUM_MAPPED_BUFFERS: + return RADEON_NUM_MAPPED_BUFFERS; + case SI_QUERY_NUM_GFX_IBS: + return RADEON_NUM_GFX_IBS; + case SI_QUERY_NUM_SDMA_IBS: + return RADEON_NUM_SDMA_IBS; + case SI_QUERY_GFX_BO_LIST_SIZE: + return RADEON_GFX_BO_LIST_COUNTER; + case SI_QUERY_GFX_IB_SIZE: + return RADEON_GFX_IB_SIZE_COUNTER; + case SI_QUERY_NUM_BYTES_MOVED: + return RADEON_NUM_BYTES_MOVED; + case SI_QUERY_NUM_EVICTIONS: + return RADEON_NUM_EVICTIONS; + case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: + return RADEON_NUM_VRAM_CPU_PAGE_FAULTS; + case SI_QUERY_VRAM_USAGE: + return RADEON_VRAM_USAGE; + case SI_QUERY_VRAM_VIS_USAGE: + return RADEON_VRAM_VIS_USAGE; + case SI_QUERY_GTT_USAGE: + return RADEON_GTT_USAGE; + case SI_QUERY_GPU_TEMPERATURE: + return RADEON_GPU_TEMPERATURE; + case SI_QUERY_CURRENT_GPU_SCLK: + return RADEON_CURRENT_SCLK; + case SI_QUERY_CURRENT_GPU_MCLK: + return RADEON_CURRENT_MCLK; + case SI_QUERY_CS_THREAD_BUSY: + 
return RADEON_CS_THREAD_TIME; + default: + unreachable("query type does not correspond to winsys id"); + } } static int64_t si_finish_dma_get_cpu_time(struct si_context *sctx) { - struct pipe_fence_handle *fence = NULL; + struct pipe_fence_handle *fence = NULL; - si_flush_dma_cs(sctx, 0, &fence); - if (fence) { - sctx->ws->fence_wait(sctx->ws, fence, PIPE_TIMEOUT_INFINITE); - sctx->ws->fence_reference(&fence, NULL); - } + si_flush_dma_cs(sctx, 0, &fence); + if (fence) { + sctx->ws->fence_wait(sctx->ws, fence, PIPE_TIMEOUT_INFINITE); + sctx->ws->fence_reference(&fence, NULL); + } - return os_time_get_nano(); + return os_time_get_nano(); } -static bool si_query_sw_begin(struct si_context *sctx, - struct si_query *squery) +static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery) { - struct si_query_sw *query = (struct si_query_sw *)squery; - enum radeon_value_id ws_id; - - switch(query->b.type) { - case PIPE_QUERY_TIMESTAMP_DISJOINT: - case PIPE_QUERY_GPU_FINISHED: - break; - case SI_QUERY_TIME_ELAPSED_SDMA_SI: - query->begin_result = si_finish_dma_get_cpu_time(sctx); - break; - case SI_QUERY_DRAW_CALLS: - query->begin_result = sctx->num_draw_calls; - break; - case SI_QUERY_DECOMPRESS_CALLS: - query->begin_result = sctx->num_decompress_calls; - break; - case SI_QUERY_MRT_DRAW_CALLS: - query->begin_result = sctx->num_mrt_draw_calls; - break; - case SI_QUERY_PRIM_RESTART_CALLS: - query->begin_result = sctx->num_prim_restart_calls; - break; - case SI_QUERY_SPILL_DRAW_CALLS: - query->begin_result = sctx->num_spill_draw_calls; - break; - case SI_QUERY_COMPUTE_CALLS: - query->begin_result = sctx->num_compute_calls; - break; - case SI_QUERY_SPILL_COMPUTE_CALLS: - query->begin_result = sctx->num_spill_compute_calls; - break; - case SI_QUERY_DMA_CALLS: - query->begin_result = sctx->num_dma_calls; - break; - case SI_QUERY_CP_DMA_CALLS: - query->begin_result = sctx->num_cp_dma_calls; - break; - case SI_QUERY_NUM_VS_FLUSHES: - query->begin_result = sctx->num_vs_flushes; - break; - case SI_QUERY_NUM_PS_FLUSHES: - query->begin_result = sctx->num_ps_flushes; - break; - case SI_QUERY_NUM_CS_FLUSHES: - query->begin_result = sctx->num_cs_flushes; - break; - case SI_QUERY_NUM_CB_CACHE_FLUSHES: - query->begin_result = sctx->num_cb_cache_flushes; - break; - case SI_QUERY_NUM_DB_CACHE_FLUSHES: - query->begin_result = sctx->num_db_cache_flushes; - break; - case SI_QUERY_NUM_L2_INVALIDATES: - query->begin_result = sctx->num_L2_invalidates; - break; - case SI_QUERY_NUM_L2_WRITEBACKS: - query->begin_result = sctx->num_L2_writebacks; - break; - case SI_QUERY_NUM_RESIDENT_HANDLES: - query->begin_result = sctx->num_resident_handles; - break; - case SI_QUERY_TC_OFFLOADED_SLOTS: - query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0; - break; - case SI_QUERY_TC_DIRECT_SLOTS: - query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0; - break; - case SI_QUERY_TC_NUM_SYNCS: - query->begin_result = sctx->tc ? 
sctx->tc->num_syncs : 0; - break; - case SI_QUERY_REQUESTED_VRAM: - case SI_QUERY_REQUESTED_GTT: - case SI_QUERY_MAPPED_VRAM: - case SI_QUERY_MAPPED_GTT: - case SI_QUERY_VRAM_USAGE: - case SI_QUERY_VRAM_VIS_USAGE: - case SI_QUERY_GTT_USAGE: - case SI_QUERY_GPU_TEMPERATURE: - case SI_QUERY_CURRENT_GPU_SCLK: - case SI_QUERY_CURRENT_GPU_MCLK: - case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO: - case SI_QUERY_NUM_MAPPED_BUFFERS: - query->begin_result = 0; - break; - case SI_QUERY_BUFFER_WAIT_TIME: - case SI_QUERY_GFX_IB_SIZE: - case SI_QUERY_NUM_GFX_IBS: - case SI_QUERY_NUM_SDMA_IBS: - case SI_QUERY_NUM_BYTES_MOVED: - case SI_QUERY_NUM_EVICTIONS: - case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: { - enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); - query->begin_result = sctx->ws->query_value(sctx->ws, ws_id); - break; - } - case SI_QUERY_GFX_BO_LIST_SIZE: - ws_id = winsys_id_from_type(query->b.type); - query->begin_result = sctx->ws->query_value(sctx->ws, ws_id); - query->begin_time = sctx->ws->query_value(sctx->ws, - RADEON_NUM_GFX_IBS); - break; - case SI_QUERY_CS_THREAD_BUSY: - ws_id = winsys_id_from_type(query->b.type); - query->begin_result = sctx->ws->query_value(sctx->ws, ws_id); - query->begin_time = os_time_get_nano(); - break; - case SI_QUERY_GALLIUM_THREAD_BUSY: - query->begin_result = - sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0; - query->begin_time = os_time_get_nano(); - break; - case SI_QUERY_GPU_LOAD: - case SI_QUERY_GPU_SHADERS_BUSY: - case SI_QUERY_GPU_TA_BUSY: - case SI_QUERY_GPU_GDS_BUSY: - case SI_QUERY_GPU_VGT_BUSY: - case SI_QUERY_GPU_IA_BUSY: - case SI_QUERY_GPU_SX_BUSY: - case SI_QUERY_GPU_WD_BUSY: - case SI_QUERY_GPU_BCI_BUSY: - case SI_QUERY_GPU_SC_BUSY: - case SI_QUERY_GPU_PA_BUSY: - case SI_QUERY_GPU_DB_BUSY: - case SI_QUERY_GPU_CP_BUSY: - case SI_QUERY_GPU_CB_BUSY: - case SI_QUERY_GPU_SDMA_BUSY: - case SI_QUERY_GPU_PFP_BUSY: - case SI_QUERY_GPU_MEQ_BUSY: - case SI_QUERY_GPU_ME_BUSY: - case SI_QUERY_GPU_SURF_SYNC_BUSY: - case SI_QUERY_GPU_CP_DMA_BUSY: - case SI_QUERY_GPU_SCRATCH_RAM_BUSY: - query->begin_result = si_begin_counter(sctx->screen, - query->b.type); - break; - case SI_QUERY_NUM_COMPILATIONS: - query->begin_result = p_atomic_read(&sctx->screen->num_compilations); - break; - case SI_QUERY_NUM_SHADERS_CREATED: - query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created); - break; - case SI_QUERY_LIVE_SHADER_CACHE_HITS: - query->begin_result = sctx->screen->live_shader_cache.hits; - break; - case SI_QUERY_LIVE_SHADER_CACHE_MISSES: - query->begin_result = sctx->screen->live_shader_cache.misses; - break; - case SI_QUERY_MEMORY_SHADER_CACHE_HITS: - query->begin_result = sctx->screen->num_memory_shader_cache_hits; - break; - case SI_QUERY_MEMORY_SHADER_CACHE_MISSES: - query->begin_result = sctx->screen->num_memory_shader_cache_misses; - break; - case SI_QUERY_DISK_SHADER_CACHE_HITS: - query->begin_result = sctx->screen->num_disk_shader_cache_hits; - break; - case SI_QUERY_DISK_SHADER_CACHE_MISSES: - query->begin_result = sctx->screen->num_disk_shader_cache_misses; - break; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - query->begin_result = sctx->compute_num_verts_accepted; - break; - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - query->begin_result = sctx->compute_num_verts_rejected; - break; - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - query->begin_result = sctx->compute_num_verts_ineligible; - break; - case SI_QUERY_GPIN_ASIC_ID: - case SI_QUERY_GPIN_NUM_SIMD: - case SI_QUERY_GPIN_NUM_RB: - case SI_QUERY_GPIN_NUM_SPI: - case 
SI_QUERY_GPIN_NUM_SE: - break; - default: - unreachable("si_query_sw_begin: bad query type"); - } - - return true; + struct si_query_sw *query = (struct si_query_sw *)squery; + enum radeon_value_id ws_id; + + switch (query->b.type) { + case PIPE_QUERY_TIMESTAMP_DISJOINT: + case PIPE_QUERY_GPU_FINISHED: + break; + case SI_QUERY_TIME_ELAPSED_SDMA_SI: + query->begin_result = si_finish_dma_get_cpu_time(sctx); + break; + case SI_QUERY_DRAW_CALLS: + query->begin_result = sctx->num_draw_calls; + break; + case SI_QUERY_DECOMPRESS_CALLS: + query->begin_result = sctx->num_decompress_calls; + break; + case SI_QUERY_MRT_DRAW_CALLS: + query->begin_result = sctx->num_mrt_draw_calls; + break; + case SI_QUERY_PRIM_RESTART_CALLS: + query->begin_result = sctx->num_prim_restart_calls; + break; + case SI_QUERY_SPILL_DRAW_CALLS: + query->begin_result = sctx->num_spill_draw_calls; + break; + case SI_QUERY_COMPUTE_CALLS: + query->begin_result = sctx->num_compute_calls; + break; + case SI_QUERY_SPILL_COMPUTE_CALLS: + query->begin_result = sctx->num_spill_compute_calls; + break; + case SI_QUERY_DMA_CALLS: + query->begin_result = sctx->num_dma_calls; + break; + case SI_QUERY_CP_DMA_CALLS: + query->begin_result = sctx->num_cp_dma_calls; + break; + case SI_QUERY_NUM_VS_FLUSHES: + query->begin_result = sctx->num_vs_flushes; + break; + case SI_QUERY_NUM_PS_FLUSHES: + query->begin_result = sctx->num_ps_flushes; + break; + case SI_QUERY_NUM_CS_FLUSHES: + query->begin_result = sctx->num_cs_flushes; + break; + case SI_QUERY_NUM_CB_CACHE_FLUSHES: + query->begin_result = sctx->num_cb_cache_flushes; + break; + case SI_QUERY_NUM_DB_CACHE_FLUSHES: + query->begin_result = sctx->num_db_cache_flushes; + break; + case SI_QUERY_NUM_L2_INVALIDATES: + query->begin_result = sctx->num_L2_invalidates; + break; + case SI_QUERY_NUM_L2_WRITEBACKS: + query->begin_result = sctx->num_L2_writebacks; + break; + case SI_QUERY_NUM_RESIDENT_HANDLES: + query->begin_result = sctx->num_resident_handles; + break; + case SI_QUERY_TC_OFFLOADED_SLOTS: + query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0; + break; + case SI_QUERY_TC_DIRECT_SLOTS: + query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0; + break; + case SI_QUERY_TC_NUM_SYNCS: + query->begin_result = sctx->tc ? 
sctx->tc->num_syncs : 0; + break; + case SI_QUERY_REQUESTED_VRAM: + case SI_QUERY_REQUESTED_GTT: + case SI_QUERY_MAPPED_VRAM: + case SI_QUERY_MAPPED_GTT: + case SI_QUERY_VRAM_USAGE: + case SI_QUERY_VRAM_VIS_USAGE: + case SI_QUERY_GTT_USAGE: + case SI_QUERY_GPU_TEMPERATURE: + case SI_QUERY_CURRENT_GPU_SCLK: + case SI_QUERY_CURRENT_GPU_MCLK: + case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO: + case SI_QUERY_NUM_MAPPED_BUFFERS: + query->begin_result = 0; + break; + case SI_QUERY_BUFFER_WAIT_TIME: + case SI_QUERY_GFX_IB_SIZE: + case SI_QUERY_NUM_GFX_IBS: + case SI_QUERY_NUM_SDMA_IBS: + case SI_QUERY_NUM_BYTES_MOVED: + case SI_QUERY_NUM_EVICTIONS: + case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: { + enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); + query->begin_result = sctx->ws->query_value(sctx->ws, ws_id); + break; + } + case SI_QUERY_GFX_BO_LIST_SIZE: + ws_id = winsys_id_from_type(query->b.type); + query->begin_result = sctx->ws->query_value(sctx->ws, ws_id); + query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS); + break; + case SI_QUERY_CS_THREAD_BUSY: + ws_id = winsys_id_from_type(query->b.type); + query->begin_result = sctx->ws->query_value(sctx->ws, ws_id); + query->begin_time = os_time_get_nano(); + break; + case SI_QUERY_GALLIUM_THREAD_BUSY: + query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0; + query->begin_time = os_time_get_nano(); + break; + case SI_QUERY_GPU_LOAD: + case SI_QUERY_GPU_SHADERS_BUSY: + case SI_QUERY_GPU_TA_BUSY: + case SI_QUERY_GPU_GDS_BUSY: + case SI_QUERY_GPU_VGT_BUSY: + case SI_QUERY_GPU_IA_BUSY: + case SI_QUERY_GPU_SX_BUSY: + case SI_QUERY_GPU_WD_BUSY: + case SI_QUERY_GPU_BCI_BUSY: + case SI_QUERY_GPU_SC_BUSY: + case SI_QUERY_GPU_PA_BUSY: + case SI_QUERY_GPU_DB_BUSY: + case SI_QUERY_GPU_CP_BUSY: + case SI_QUERY_GPU_CB_BUSY: + case SI_QUERY_GPU_SDMA_BUSY: + case SI_QUERY_GPU_PFP_BUSY: + case SI_QUERY_GPU_MEQ_BUSY: + case SI_QUERY_GPU_ME_BUSY: + case SI_QUERY_GPU_SURF_SYNC_BUSY: + case SI_QUERY_GPU_CP_DMA_BUSY: + case SI_QUERY_GPU_SCRATCH_RAM_BUSY: + query->begin_result = si_begin_counter(sctx->screen, query->b.type); + break; + case SI_QUERY_NUM_COMPILATIONS: + query->begin_result = p_atomic_read(&sctx->screen->num_compilations); + break; + case SI_QUERY_NUM_SHADERS_CREATED: + query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created); + break; + case SI_QUERY_LIVE_SHADER_CACHE_HITS: + query->begin_result = sctx->screen->live_shader_cache.hits; + break; + case SI_QUERY_LIVE_SHADER_CACHE_MISSES: + query->begin_result = sctx->screen->live_shader_cache.misses; + break; + case SI_QUERY_MEMORY_SHADER_CACHE_HITS: + query->begin_result = sctx->screen->num_memory_shader_cache_hits; + break; + case SI_QUERY_MEMORY_SHADER_CACHE_MISSES: + query->begin_result = sctx->screen->num_memory_shader_cache_misses; + break; + case SI_QUERY_DISK_SHADER_CACHE_HITS: + query->begin_result = sctx->screen->num_disk_shader_cache_hits; + break; + case SI_QUERY_DISK_SHADER_CACHE_MISSES: + query->begin_result = sctx->screen->num_disk_shader_cache_misses; + break; + case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: + query->begin_result = sctx->compute_num_verts_accepted; + break; + case SI_QUERY_PD_NUM_PRIMS_REJECTED: + query->begin_result = sctx->compute_num_verts_rejected; + break; + case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: + query->begin_result = sctx->compute_num_verts_ineligible; + break; + case SI_QUERY_GPIN_ASIC_ID: + case SI_QUERY_GPIN_NUM_SIMD: + case SI_QUERY_GPIN_NUM_RB: + case SI_QUERY_GPIN_NUM_SPI: + case 
SI_QUERY_GPIN_NUM_SE: + break; + default: + unreachable("si_query_sw_begin: bad query type"); + } + + return true; } -static bool si_query_sw_end(struct si_context *sctx, - struct si_query *squery) +static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery) { - struct si_query_sw *query = (struct si_query_sw *)squery; - enum radeon_value_id ws_id; - - switch(query->b.type) { - case PIPE_QUERY_TIMESTAMP_DISJOINT: - break; - case PIPE_QUERY_GPU_FINISHED: - sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED); - break; - case SI_QUERY_TIME_ELAPSED_SDMA_SI: - query->end_result = si_finish_dma_get_cpu_time(sctx); - break; - case SI_QUERY_DRAW_CALLS: - query->end_result = sctx->num_draw_calls; - break; - case SI_QUERY_DECOMPRESS_CALLS: - query->end_result = sctx->num_decompress_calls; - break; - case SI_QUERY_MRT_DRAW_CALLS: - query->end_result = sctx->num_mrt_draw_calls; - break; - case SI_QUERY_PRIM_RESTART_CALLS: - query->end_result = sctx->num_prim_restart_calls; - break; - case SI_QUERY_SPILL_DRAW_CALLS: - query->end_result = sctx->num_spill_draw_calls; - break; - case SI_QUERY_COMPUTE_CALLS: - query->end_result = sctx->num_compute_calls; - break; - case SI_QUERY_SPILL_COMPUTE_CALLS: - query->end_result = sctx->num_spill_compute_calls; - break; - case SI_QUERY_DMA_CALLS: - query->end_result = sctx->num_dma_calls; - break; - case SI_QUERY_CP_DMA_CALLS: - query->end_result = sctx->num_cp_dma_calls; - break; - case SI_QUERY_NUM_VS_FLUSHES: - query->end_result = sctx->num_vs_flushes; - break; - case SI_QUERY_NUM_PS_FLUSHES: - query->end_result = sctx->num_ps_flushes; - break; - case SI_QUERY_NUM_CS_FLUSHES: - query->end_result = sctx->num_cs_flushes; - break; - case SI_QUERY_NUM_CB_CACHE_FLUSHES: - query->end_result = sctx->num_cb_cache_flushes; - break; - case SI_QUERY_NUM_DB_CACHE_FLUSHES: - query->end_result = sctx->num_db_cache_flushes; - break; - case SI_QUERY_NUM_L2_INVALIDATES: - query->end_result = sctx->num_L2_invalidates; - break; - case SI_QUERY_NUM_L2_WRITEBACKS: - query->end_result = sctx->num_L2_writebacks; - break; - case SI_QUERY_NUM_RESIDENT_HANDLES: - query->end_result = sctx->num_resident_handles; - break; - case SI_QUERY_TC_OFFLOADED_SLOTS: - query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0; - break; - case SI_QUERY_TC_DIRECT_SLOTS: - query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0; - break; - case SI_QUERY_TC_NUM_SYNCS: - query->end_result = sctx->tc ? 
sctx->tc->num_syncs : 0; - break; - case SI_QUERY_REQUESTED_VRAM: - case SI_QUERY_REQUESTED_GTT: - case SI_QUERY_MAPPED_VRAM: - case SI_QUERY_MAPPED_GTT: - case SI_QUERY_VRAM_USAGE: - case SI_QUERY_VRAM_VIS_USAGE: - case SI_QUERY_GTT_USAGE: - case SI_QUERY_GPU_TEMPERATURE: - case SI_QUERY_CURRENT_GPU_SCLK: - case SI_QUERY_CURRENT_GPU_MCLK: - case SI_QUERY_BUFFER_WAIT_TIME: - case SI_QUERY_GFX_IB_SIZE: - case SI_QUERY_NUM_MAPPED_BUFFERS: - case SI_QUERY_NUM_GFX_IBS: - case SI_QUERY_NUM_SDMA_IBS: - case SI_QUERY_NUM_BYTES_MOVED: - case SI_QUERY_NUM_EVICTIONS: - case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: { - enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); - query->end_result = sctx->ws->query_value(sctx->ws, ws_id); - break; - } - case SI_QUERY_GFX_BO_LIST_SIZE: - ws_id = winsys_id_from_type(query->b.type); - query->end_result = sctx->ws->query_value(sctx->ws, ws_id); - query->end_time = sctx->ws->query_value(sctx->ws, - RADEON_NUM_GFX_IBS); - break; - case SI_QUERY_CS_THREAD_BUSY: - ws_id = winsys_id_from_type(query->b.type); - query->end_result = sctx->ws->query_value(sctx->ws, ws_id); - query->end_time = os_time_get_nano(); - break; - case SI_QUERY_GALLIUM_THREAD_BUSY: - query->end_result = - sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0; - query->end_time = os_time_get_nano(); - break; - case SI_QUERY_GPU_LOAD: - case SI_QUERY_GPU_SHADERS_BUSY: - case SI_QUERY_GPU_TA_BUSY: - case SI_QUERY_GPU_GDS_BUSY: - case SI_QUERY_GPU_VGT_BUSY: - case SI_QUERY_GPU_IA_BUSY: - case SI_QUERY_GPU_SX_BUSY: - case SI_QUERY_GPU_WD_BUSY: - case SI_QUERY_GPU_BCI_BUSY: - case SI_QUERY_GPU_SC_BUSY: - case SI_QUERY_GPU_PA_BUSY: - case SI_QUERY_GPU_DB_BUSY: - case SI_QUERY_GPU_CP_BUSY: - case SI_QUERY_GPU_CB_BUSY: - case SI_QUERY_GPU_SDMA_BUSY: - case SI_QUERY_GPU_PFP_BUSY: - case SI_QUERY_GPU_MEQ_BUSY: - case SI_QUERY_GPU_ME_BUSY: - case SI_QUERY_GPU_SURF_SYNC_BUSY: - case SI_QUERY_GPU_CP_DMA_BUSY: - case SI_QUERY_GPU_SCRATCH_RAM_BUSY: - query->end_result = si_end_counter(sctx->screen, - query->b.type, - query->begin_result); - query->begin_result = 0; - break; - case SI_QUERY_NUM_COMPILATIONS: - query->end_result = p_atomic_read(&sctx->screen->num_compilations); - break; - case SI_QUERY_NUM_SHADERS_CREATED: - query->end_result = p_atomic_read(&sctx->screen->num_shaders_created); - break; - case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO: - query->end_result = sctx->last_tex_ps_draw_ratio; - break; - case SI_QUERY_LIVE_SHADER_CACHE_HITS: - query->end_result = sctx->screen->live_shader_cache.hits; - break; - case SI_QUERY_LIVE_SHADER_CACHE_MISSES: - query->end_result = sctx->screen->live_shader_cache.misses; - break; - case SI_QUERY_MEMORY_SHADER_CACHE_HITS: - query->end_result = sctx->screen->num_memory_shader_cache_hits; - break; - case SI_QUERY_MEMORY_SHADER_CACHE_MISSES: - query->end_result = sctx->screen->num_memory_shader_cache_misses; - break; - case SI_QUERY_DISK_SHADER_CACHE_HITS: - query->end_result = sctx->screen->num_disk_shader_cache_hits; - break; - case SI_QUERY_DISK_SHADER_CACHE_MISSES: - query->end_result = sctx->screen->num_disk_shader_cache_misses; - break; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - query->end_result = sctx->compute_num_verts_accepted; - break; - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - query->end_result = sctx->compute_num_verts_rejected; - break; - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - query->end_result = sctx->compute_num_verts_ineligible; - break; - case SI_QUERY_GPIN_ASIC_ID: - case SI_QUERY_GPIN_NUM_SIMD: - case SI_QUERY_GPIN_NUM_RB: - case 
SI_QUERY_GPIN_NUM_SPI: - case SI_QUERY_GPIN_NUM_SE: - break; - default: - unreachable("si_query_sw_end: bad query type"); - } - - return true; + struct si_query_sw *query = (struct si_query_sw *)squery; + enum radeon_value_id ws_id; + + switch (query->b.type) { + case PIPE_QUERY_TIMESTAMP_DISJOINT: + break; + case PIPE_QUERY_GPU_FINISHED: + sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED); + break; + case SI_QUERY_TIME_ELAPSED_SDMA_SI: + query->end_result = si_finish_dma_get_cpu_time(sctx); + break; + case SI_QUERY_DRAW_CALLS: + query->end_result = sctx->num_draw_calls; + break; + case SI_QUERY_DECOMPRESS_CALLS: + query->end_result = sctx->num_decompress_calls; + break; + case SI_QUERY_MRT_DRAW_CALLS: + query->end_result = sctx->num_mrt_draw_calls; + break; + case SI_QUERY_PRIM_RESTART_CALLS: + query->end_result = sctx->num_prim_restart_calls; + break; + case SI_QUERY_SPILL_DRAW_CALLS: + query->end_result = sctx->num_spill_draw_calls; + break; + case SI_QUERY_COMPUTE_CALLS: + query->end_result = sctx->num_compute_calls; + break; + case SI_QUERY_SPILL_COMPUTE_CALLS: + query->end_result = sctx->num_spill_compute_calls; + break; + case SI_QUERY_DMA_CALLS: + query->end_result = sctx->num_dma_calls; + break; + case SI_QUERY_CP_DMA_CALLS: + query->end_result = sctx->num_cp_dma_calls; + break; + case SI_QUERY_NUM_VS_FLUSHES: + query->end_result = sctx->num_vs_flushes; + break; + case SI_QUERY_NUM_PS_FLUSHES: + query->end_result = sctx->num_ps_flushes; + break; + case SI_QUERY_NUM_CS_FLUSHES: + query->end_result = sctx->num_cs_flushes; + break; + case SI_QUERY_NUM_CB_CACHE_FLUSHES: + query->end_result = sctx->num_cb_cache_flushes; + break; + case SI_QUERY_NUM_DB_CACHE_FLUSHES: + query->end_result = sctx->num_db_cache_flushes; + break; + case SI_QUERY_NUM_L2_INVALIDATES: + query->end_result = sctx->num_L2_invalidates; + break; + case SI_QUERY_NUM_L2_WRITEBACKS: + query->end_result = sctx->num_L2_writebacks; + break; + case SI_QUERY_NUM_RESIDENT_HANDLES: + query->end_result = sctx->num_resident_handles; + break; + case SI_QUERY_TC_OFFLOADED_SLOTS: + query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0; + break; + case SI_QUERY_TC_DIRECT_SLOTS: + query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0; + break; + case SI_QUERY_TC_NUM_SYNCS: + query->end_result = sctx->tc ? 
sctx->tc->num_syncs : 0; + break; + case SI_QUERY_REQUESTED_VRAM: + case SI_QUERY_REQUESTED_GTT: + case SI_QUERY_MAPPED_VRAM: + case SI_QUERY_MAPPED_GTT: + case SI_QUERY_VRAM_USAGE: + case SI_QUERY_VRAM_VIS_USAGE: + case SI_QUERY_GTT_USAGE: + case SI_QUERY_GPU_TEMPERATURE: + case SI_QUERY_CURRENT_GPU_SCLK: + case SI_QUERY_CURRENT_GPU_MCLK: + case SI_QUERY_BUFFER_WAIT_TIME: + case SI_QUERY_GFX_IB_SIZE: + case SI_QUERY_NUM_MAPPED_BUFFERS: + case SI_QUERY_NUM_GFX_IBS: + case SI_QUERY_NUM_SDMA_IBS: + case SI_QUERY_NUM_BYTES_MOVED: + case SI_QUERY_NUM_EVICTIONS: + case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: { + enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); + query->end_result = sctx->ws->query_value(sctx->ws, ws_id); + break; + } + case SI_QUERY_GFX_BO_LIST_SIZE: + ws_id = winsys_id_from_type(query->b.type); + query->end_result = sctx->ws->query_value(sctx->ws, ws_id); + query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS); + break; + case SI_QUERY_CS_THREAD_BUSY: + ws_id = winsys_id_from_type(query->b.type); + query->end_result = sctx->ws->query_value(sctx->ws, ws_id); + query->end_time = os_time_get_nano(); + break; + case SI_QUERY_GALLIUM_THREAD_BUSY: + query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0; + query->end_time = os_time_get_nano(); + break; + case SI_QUERY_GPU_LOAD: + case SI_QUERY_GPU_SHADERS_BUSY: + case SI_QUERY_GPU_TA_BUSY: + case SI_QUERY_GPU_GDS_BUSY: + case SI_QUERY_GPU_VGT_BUSY: + case SI_QUERY_GPU_IA_BUSY: + case SI_QUERY_GPU_SX_BUSY: + case SI_QUERY_GPU_WD_BUSY: + case SI_QUERY_GPU_BCI_BUSY: + case SI_QUERY_GPU_SC_BUSY: + case SI_QUERY_GPU_PA_BUSY: + case SI_QUERY_GPU_DB_BUSY: + case SI_QUERY_GPU_CP_BUSY: + case SI_QUERY_GPU_CB_BUSY: + case SI_QUERY_GPU_SDMA_BUSY: + case SI_QUERY_GPU_PFP_BUSY: + case SI_QUERY_GPU_MEQ_BUSY: + case SI_QUERY_GPU_ME_BUSY: + case SI_QUERY_GPU_SURF_SYNC_BUSY: + case SI_QUERY_GPU_CP_DMA_BUSY: + case SI_QUERY_GPU_SCRATCH_RAM_BUSY: + query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result); + query->begin_result = 0; + break; + case SI_QUERY_NUM_COMPILATIONS: + query->end_result = p_atomic_read(&sctx->screen->num_compilations); + break; + case SI_QUERY_NUM_SHADERS_CREATED: + query->end_result = p_atomic_read(&sctx->screen->num_shaders_created); + break; + case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO: + query->end_result = sctx->last_tex_ps_draw_ratio; + break; + case SI_QUERY_LIVE_SHADER_CACHE_HITS: + query->end_result = sctx->screen->live_shader_cache.hits; + break; + case SI_QUERY_LIVE_SHADER_CACHE_MISSES: + query->end_result = sctx->screen->live_shader_cache.misses; + break; + case SI_QUERY_MEMORY_SHADER_CACHE_HITS: + query->end_result = sctx->screen->num_memory_shader_cache_hits; + break; + case SI_QUERY_MEMORY_SHADER_CACHE_MISSES: + query->end_result = sctx->screen->num_memory_shader_cache_misses; + break; + case SI_QUERY_DISK_SHADER_CACHE_HITS: + query->end_result = sctx->screen->num_disk_shader_cache_hits; + break; + case SI_QUERY_DISK_SHADER_CACHE_MISSES: + query->end_result = sctx->screen->num_disk_shader_cache_misses; + break; + case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: + query->end_result = sctx->compute_num_verts_accepted; + break; + case SI_QUERY_PD_NUM_PRIMS_REJECTED: + query->end_result = sctx->compute_num_verts_rejected; + break; + case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: + query->end_result = sctx->compute_num_verts_ineligible; + break; + case SI_QUERY_GPIN_ASIC_ID: + case SI_QUERY_GPIN_NUM_SIMD: + case SI_QUERY_GPIN_NUM_RB: + case 
SI_QUERY_GPIN_NUM_SPI: + case SI_QUERY_GPIN_NUM_SE: + break; + default: + unreachable("si_query_sw_end: bad query type"); + } + + return true; } -static bool si_query_sw_get_result(struct si_context *sctx, - struct si_query *squery, - bool wait, - union pipe_query_result *result) +static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait, + union pipe_query_result *result) { - struct si_query_sw *query = (struct si_query_sw *)squery; - - switch (query->b.type) { - case PIPE_QUERY_TIMESTAMP_DISJOINT: - /* Convert from cycles per millisecond to cycles per second (Hz). */ - result->timestamp_disjoint.frequency = - (uint64_t)sctx->screen->info.clock_crystal_freq * 1000; - result->timestamp_disjoint.disjoint = false; - return true; - case PIPE_QUERY_GPU_FINISHED: { - struct pipe_screen *screen = sctx->b.screen; - struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b; - - result->b = screen->fence_finish(screen, ctx, query->fence, - wait ? PIPE_TIMEOUT_INFINITE : 0); - return result->b; - } - - case SI_QUERY_GFX_BO_LIST_SIZE: - result->u64 = (query->end_result - query->begin_result) / - (query->end_time - query->begin_time); - return true; - case SI_QUERY_CS_THREAD_BUSY: - case SI_QUERY_GALLIUM_THREAD_BUSY: - result->u64 = (query->end_result - query->begin_result) * 100 / - (query->end_time - query->begin_time); - return true; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - result->u64 = ((unsigned)query->end_result - - (unsigned)query->begin_result) / 3; - return true; - case SI_QUERY_GPIN_ASIC_ID: - result->u32 = 0; - return true; - case SI_QUERY_GPIN_NUM_SIMD: - result->u32 = sctx->screen->info.num_good_compute_units; - return true; - case SI_QUERY_GPIN_NUM_RB: - result->u32 = sctx->screen->info.num_render_backends; - return true; - case SI_QUERY_GPIN_NUM_SPI: - result->u32 = 1; /* all supported chips have one SPI per SE */ - return true; - case SI_QUERY_GPIN_NUM_SE: - result->u32 = sctx->screen->info.max_se; - return true; - } - - result->u64 = query->end_result - query->begin_result; - - switch (query->b.type) { - case SI_QUERY_BUFFER_WAIT_TIME: - case SI_QUERY_GPU_TEMPERATURE: - result->u64 /= 1000; - break; - case SI_QUERY_CURRENT_GPU_SCLK: - case SI_QUERY_CURRENT_GPU_MCLK: - result->u64 *= 1000000; - break; - } - - return true; + struct si_query_sw *query = (struct si_query_sw *)squery; + + switch (query->b.type) { + case PIPE_QUERY_TIMESTAMP_DISJOINT: + /* Convert from cycles per millisecond to cycles per second (Hz). */ + result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000; + result->timestamp_disjoint.disjoint = false; + return true; + case PIPE_QUERY_GPU_FINISHED: { + struct pipe_screen *screen = sctx->b.screen; + struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b; + + result->b = screen->fence_finish(screen, ctx, query->fence, wait ? 
PIPE_TIMEOUT_INFINITE : 0); + return result->b; + } + + case SI_QUERY_GFX_BO_LIST_SIZE: + result->u64 = + (query->end_result - query->begin_result) / (query->end_time - query->begin_time); + return true; + case SI_QUERY_CS_THREAD_BUSY: + case SI_QUERY_GALLIUM_THREAD_BUSY: + result->u64 = + (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time); + return true; + case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: + case SI_QUERY_PD_NUM_PRIMS_REJECTED: + case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: + result->u64 = ((unsigned)query->end_result - (unsigned)query->begin_result) / 3; + return true; + case SI_QUERY_GPIN_ASIC_ID: + result->u32 = 0; + return true; + case SI_QUERY_GPIN_NUM_SIMD: + result->u32 = sctx->screen->info.num_good_compute_units; + return true; + case SI_QUERY_GPIN_NUM_RB: + result->u32 = sctx->screen->info.num_render_backends; + return true; + case SI_QUERY_GPIN_NUM_SPI: + result->u32 = 1; /* all supported chips have one SPI per SE */ + return true; + case SI_QUERY_GPIN_NUM_SE: + result->u32 = sctx->screen->info.max_se; + return true; + } + + result->u64 = query->end_result - query->begin_result; + + switch (query->b.type) { + case SI_QUERY_BUFFER_WAIT_TIME: + case SI_QUERY_GPU_TEMPERATURE: + result->u64 /= 1000; + break; + case SI_QUERY_CURRENT_GPU_SCLK: + case SI_QUERY_CURRENT_GPU_MCLK: + result->u64 *= 1000000; + break; + } + + return true; } - -static const struct si_query_ops sw_query_ops = { - .destroy = si_query_sw_destroy, - .begin = si_query_sw_begin, - .end = si_query_sw_end, - .get_result = si_query_sw_get_result, - .get_result_resource = NULL -}; +static const struct si_query_ops sw_query_ops = {.destroy = si_query_sw_destroy, + .begin = si_query_sw_begin, + .end = si_query_sw_end, + .get_result = si_query_sw_get_result, + .get_result_resource = NULL}; static struct pipe_query *si_query_sw_create(unsigned query_type) { - struct si_query_sw *query; + struct si_query_sw *query; - query = CALLOC_STRUCT(si_query_sw); - if (!query) - return NULL; + query = CALLOC_STRUCT(si_query_sw); + if (!query) + return NULL; - query->b.type = query_type; - query->b.ops = &sw_query_ops; + query->b.type = query_type; + query->b.ops = &sw_query_ops; - return (struct pipe_query *)query; + return (struct pipe_query *)query; } void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer) { - struct si_query_buffer *prev = buffer->previous; + struct si_query_buffer *prev = buffer->previous; - /* Release all query buffers. */ - while (prev) { - struct si_query_buffer *qbuf = prev; - prev = prev->previous; - si_resource_reference(&qbuf->buf, NULL); - FREE(qbuf); - } + /* Release all query buffers. */ + while (prev) { + struct si_query_buffer *qbuf = prev; + prev = prev->previous; + si_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); + } - si_resource_reference(&buffer->buf, NULL); + si_resource_reference(&buffer->buf, NULL); } void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer) { - /* Discard all query buffers except for the oldest. */ - while (buffer->previous) { - struct si_query_buffer *qbuf = buffer->previous; - buffer->previous = qbuf->previous; - - si_resource_reference(&buffer->buf, NULL); - buffer->buf = qbuf->buf; /* move ownership */ - FREE(qbuf); - } - buffer->results_end = 0; - - if (!buffer->buf) - return; - - /* Discard even the oldest buffer if it can't be mapped without a stall. 
*/ - if (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) || - !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) { - si_resource_reference(&buffer->buf, NULL); - } else { - buffer->unprepared = true; - } + /* Discard all query buffers except for the oldest. */ + while (buffer->previous) { + struct si_query_buffer *qbuf = buffer->previous; + buffer->previous = qbuf->previous; + + si_resource_reference(&buffer->buf, NULL); + buffer->buf = qbuf->buf; /* move ownership */ + FREE(qbuf); + } + buffer->results_end = 0; + + if (!buffer->buf) + return; + + /* Discard even the oldest buffer if it can't be mapped without a stall. */ + if (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) || + !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) { + si_resource_reference(&buffer->buf, NULL); + } else { + buffer->unprepared = true; + } } bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer, - bool (*prepare_buffer)(struct si_context *, struct si_query_buffer*), - unsigned size) + bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *), + unsigned size) { - bool unprepared = buffer->unprepared; - buffer->unprepared = false; - - if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) { - if (buffer->buf) { - struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer); - memcpy(qbuf, buffer, sizeof(*qbuf)); - buffer->previous = qbuf; - } - buffer->results_end = 0; - - /* Queries are normally read by the CPU after - * being written by the gpu, hence staging is probably a good - * usage pattern. - */ - struct si_screen *screen = sctx->screen; - unsigned buf_size = MAX2(size, screen->info.min_alloc_size); - buffer->buf = si_resource( - pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); - if (unlikely(!buffer->buf)) - return false; - unprepared = true; - } - - if (unprepared && prepare_buffer) { - if (unlikely(!prepare_buffer(sctx, buffer))) { - si_resource_reference(&buffer->buf, NULL); - return false; - } - } - - return true; + bool unprepared = buffer->unprepared; + buffer->unprepared = false; + + if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) { + if (buffer->buf) { + struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer); + memcpy(qbuf, buffer, sizeof(*qbuf)); + buffer->previous = qbuf; + } + buffer->results_end = 0; + + /* Queries are normally read by the CPU after + * being written by the gpu, hence staging is probably a good + * usage pattern. 
+ */ + struct si_screen *screen = sctx->screen; + unsigned buf_size = MAX2(size, screen->info.min_alloc_size); + buffer->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); + if (unlikely(!buffer->buf)) + return false; + unprepared = true; + } + + if (unprepared && prepare_buffer) { + if (unlikely(!prepare_buffer(sctx, buffer))) { + si_resource_reference(&buffer->buf, NULL); + return false; + } + } + + return true; } - void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery) { - struct si_query_hw *query = (struct si_query_hw *)squery; + struct si_query_hw *query = (struct si_query_hw *)squery; - si_query_buffer_destroy(sctx->screen, &query->buffer); - si_resource_reference(&query->workaround_buf, NULL); - FREE(squery); + si_query_buffer_destroy(sctx->screen, &query->buffer); + si_resource_reference(&query->workaround_buf, NULL); + FREE(squery); } -static bool si_query_hw_prepare_buffer(struct si_context *sctx, - struct si_query_buffer *qbuf) +static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf) { - static const struct si_query_hw si_query_hw_s; - struct si_query_hw *query = container_of(qbuf, &si_query_hw_s, buffer); - struct si_screen *screen = sctx->screen; - - /* The caller ensures that the buffer is currently unused by the GPU. */ - uint32_t *results = screen->ws->buffer_map(qbuf->buf->buf, NULL, - PIPE_TRANSFER_WRITE | - PIPE_TRANSFER_UNSYNCHRONIZED); - if (!results) - return false; - - memset(results, 0, qbuf->buf->b.b.width0); - - if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER || - query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE || - query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { - unsigned max_rbs = screen->info.num_render_backends; - unsigned enabled_rb_mask = screen->info.enabled_rb_mask; - unsigned num_results; - unsigned i, j; - - /* Set top bits for unused backends. */ - num_results = qbuf->buf->b.b.width0 / query->result_size; - for (j = 0; j < num_results; j++) { - for (i = 0; i < max_rbs; i++) { - if (!(enabled_rb_mask & (1<<i))) { - results[(i * 4)+1] = 0x80000000; - results[(i * 4)+3] = 0x80000000; - } - } - results += 4 * max_rbs; - } - } - - return true; + static const struct si_query_hw si_query_hw_s; + struct si_query_hw *query = container_of(qbuf, &si_query_hw_s, buffer); + struct si_screen *screen = sctx->screen; + + /* The caller ensures that the buffer is currently unused by the GPU. */ + uint32_t *results = screen->ws->buffer_map(qbuf->buf->buf, NULL, + PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED); + if (!results) + return false; + + memset(results, 0, qbuf->buf->b.b.width0); + + if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER || + query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE || + query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { + unsigned max_rbs = screen->info.num_render_backends; + unsigned enabled_rb_mask = screen->info.enabled_rb_mask; + unsigned num_results; + unsigned i, j; + + /* Set top bits for unused backends. 
*/ + num_results = qbuf->buf->b.b.width0 / query->result_size; + for (j = 0; j < num_results; j++) { + for (i = 0; i < max_rbs; i++) { + if (!(enabled_rb_mask & (1 << i))) { + results[(i * 4) + 1] = 0x80000000; + results[(i * 4) + 3] = 0x80000000; + } + } + results += 4 * max_rbs; + } + } + + return true; } -static void si_query_hw_get_result_resource(struct si_context *sctx, - struct si_query *squery, - bool wait, - enum pipe_query_value_type result_type, - int index, - struct pipe_resource *resource, - unsigned offset); - -static void si_query_hw_do_emit_start(struct si_context *sctx, - struct si_query_hw *query, - struct si_resource *buffer, - uint64_t va); -static void si_query_hw_do_emit_stop(struct si_context *sctx, - struct si_query_hw *query, - struct si_resource *buffer, - uint64_t va); -static void si_query_hw_add_result(struct si_screen *sscreen, - struct si_query_hw *, void *buffer, - union pipe_query_result *result); -static void si_query_hw_clear_result(struct si_query_hw *, - union pipe_query_result *); +static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery, + bool wait, enum pipe_query_value_type result_type, + int index, struct pipe_resource *resource, + unsigned offset); + +static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query, + struct si_resource *buffer, uint64_t va); +static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query, + struct si_resource *buffer, uint64_t va); +static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *, void *buffer, + union pipe_query_result *result); +static void si_query_hw_clear_result(struct si_query_hw *, union pipe_query_result *); static struct si_query_hw_ops query_hw_default_hw_ops = { - .prepare_buffer = si_query_hw_prepare_buffer, - .emit_start = si_query_hw_do_emit_start, - .emit_stop = si_query_hw_do_emit_stop, - .clear_result = si_query_hw_clear_result, - .add_result = si_query_hw_add_result, + .prepare_buffer = si_query_hw_prepare_buffer, + .emit_start = si_query_hw_do_emit_start, + .emit_stop = si_query_hw_do_emit_stop, + .clear_result = si_query_hw_clear_result, + .add_result = si_query_hw_add_result, }; -static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, - unsigned query_type, - unsigned index) +static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type, + unsigned index) { - struct si_query_hw *query = CALLOC_STRUCT(si_query_hw); - if (!query) - return NULL; - - query->b.type = query_type; - query->b.ops = &query_hw_ops; - query->ops = &query_hw_default_hw_ops; - - switch (query_type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - query->result_size = 16 * sscreen->info.num_render_backends; - query->result_size += 16; /* for the fence + alignment */ - query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen); - break; - case SI_QUERY_TIME_ELAPSED_SDMA: - /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. 
*/ - query->result_size = 64; - break; - case PIPE_QUERY_TIME_ELAPSED: - query->result_size = 24; - query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen); - break; - case PIPE_QUERY_TIMESTAMP: - query->result_size = 16; - query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen); - query->flags = SI_QUERY_HW_FLAG_NO_START; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_SO_STATISTICS: - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ - query->result_size = 32; - query->b.num_cs_dw_suspend = 6; - query->stream = index; - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ - query->result_size = 32 * SI_MAX_STREAMS; - query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS; - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - /* 11 values on GCN. */ - query->result_size = 11 * 16; - query->result_size += 8; /* for the fence + alignment */ - query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen); - break; - default: - assert(0); - FREE(query); - return NULL; - } - - return (struct pipe_query *)query; + struct si_query_hw *query = CALLOC_STRUCT(si_query_hw); + if (!query) + return NULL; + + query->b.type = query_type; + query->b.ops = &query_hw_ops; + query->ops = &query_hw_default_hw_ops; + + switch (query_type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + query->result_size = 16 * sscreen->info.num_render_backends; + query->result_size += 16; /* for the fence + alignment */ + query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen); + break; + case SI_QUERY_TIME_ELAPSED_SDMA: + /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */ + query->result_size = 64; + break; + case PIPE_QUERY_TIME_ELAPSED: + query->result_size = 24; + query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen); + break; + case PIPE_QUERY_TIMESTAMP: + query->result_size = 16; + query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen); + query->flags = SI_QUERY_HW_FLAG_NO_START; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ + query->result_size = 32; + query->b.num_cs_dw_suspend = 6; + query->stream = index; + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ + query->result_size = 32 * SI_MAX_STREAMS; + query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + /* 11 values on GCN. 
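+ *
+ * Each counter is sampled as a begin/end pair of 64-bit values, so
+ * (for illustration) 11 * 16 = 176 bytes of samples plus 8 bytes for
+ * the fence, i.e. result_size = 184. The counters feed
+ * pipe_query_data_pipeline_statistics in si_query_hw_add_result():
+ * IA vertices/primitives, VS, HS, DS, GS invocations/primitives,
+ * clipper invocations/primitives, PS and CS invocations.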
*/ + query->result_size = 11 * 16; + query->result_size += 8; /* for the fence + alignment */ + query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen); + break; + default: + assert(0); + FREE(query); + return NULL; + } + + return (struct pipe_query *)query; } -static void si_update_occlusion_query_state(struct si_context *sctx, - unsigned type, int diff) +static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff) { - if (type == PIPE_QUERY_OCCLUSION_COUNTER || - type == PIPE_QUERY_OCCLUSION_PREDICATE || - type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { - bool old_enable = sctx->num_occlusion_queries != 0; - bool old_perfect_enable = - sctx->num_perfect_occlusion_queries != 0; - bool enable, perfect_enable; - - sctx->num_occlusion_queries += diff; - assert(sctx->num_occlusion_queries >= 0); - - if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { - sctx->num_perfect_occlusion_queries += diff; - assert(sctx->num_perfect_occlusion_queries >= 0); - } - - enable = sctx->num_occlusion_queries != 0; - perfect_enable = sctx->num_perfect_occlusion_queries != 0; - - if (enable != old_enable || perfect_enable != old_perfect_enable) { - si_set_occlusion_query_state(sctx, old_perfect_enable); - } - } + if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE || + type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { + bool old_enable = sctx->num_occlusion_queries != 0; + bool old_perfect_enable = sctx->num_perfect_occlusion_queries != 0; + bool enable, perfect_enable; + + sctx->num_occlusion_queries += diff; + assert(sctx->num_occlusion_queries >= 0); + + if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { + sctx->num_perfect_occlusion_queries += diff; + assert(sctx->num_perfect_occlusion_queries >= 0); + } + + enable = sctx->num_occlusion_queries != 0; + perfect_enable = sctx->num_perfect_occlusion_queries != 0; + + if (enable != old_enable || perfect_enable != old_perfect_enable) { + si_set_occlusion_query_state(sctx, old_perfect_enable); + } + } } static unsigned event_type_for_stream(unsigned stream) { - switch (stream) { - default: - case 0: return V_028A90_SAMPLE_STREAMOUTSTATS; - case 1: return V_028A90_SAMPLE_STREAMOUTSTATS1; - case 2: return V_028A90_SAMPLE_STREAMOUTSTATS2; - case 3: return V_028A90_SAMPLE_STREAMOUTSTATS3; - } + switch (stream) { + default: + case 0: + return V_028A90_SAMPLE_STREAMOUTSTATS; + case 1: + return V_028A90_SAMPLE_STREAMOUTSTATS1; + case 2: + return V_028A90_SAMPLE_STREAMOUTSTATS2; + case 3: + return V_028A90_SAMPLE_STREAMOUTSTATS3; + } } -static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, - unsigned stream) +static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); } -static void si_query_hw_do_emit_start(struct si_context *sctx, - struct si_query_hw *query, - struct si_resource *buffer, - uint64_t va) +static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query, + struct si_resource *buffer, uint64_t va) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - switch (query->b.type) { - case SI_QUERY_TIME_ELAPSED_SDMA: - 
si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address); - return; - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_SO_STATISTICS: - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - emit_sample_streamout(cs, va, query->stream); - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) - emit_sample_streamout(cs, va + 32 * stream, stream); - break; - case PIPE_QUERY_TIME_ELAPSED: - si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, - EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_TIMESTAMP, NULL, va, - 0, query->b.type); - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - break; - default: - assert(0); - } - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE, - RADEON_PRIO_QUERY); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + switch (query->b.type) { + case SI_QUERY_TIME_ELAPSED_SDMA: + si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address); + return; + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + emit_sample_streamout(cs, va, query->stream); + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) + emit_sample_streamout(cs, va + 32 * stream, stream); + break; + case PIPE_QUERY_TIME_ELAPSED: + si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, + EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type); + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + break; + default: + assert(0); + } + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); } -static void si_query_hw_emit_start(struct si_context *sctx, - struct si_query_hw *query) +static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query) { - uint64_t va; + uint64_t va; - if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, - query->result_size)) - return; + if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size)) + return; - si_update_occlusion_query_state(sctx, query->b.type, 1); - si_update_prims_generated_query_state(sctx, query->b.type, 1); + si_update_occlusion_query_state(sctx, query->b.type, 1); + si_update_prims_generated_query_state(sctx, query->b.type, 1); - if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS) - sctx->num_pipeline_stat_queries++; + if (query->b.type == 
PIPE_QUERY_PIPELINE_STATISTICS) + sctx->num_pipeline_stat_queries++; - if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA) - si_need_gfx_cs_space(sctx); + if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA) + si_need_gfx_cs_space(sctx); - va = query->buffer.buf->gpu_address + query->buffer.results_end; - query->ops->emit_start(sctx, query, query->buffer.buf, va); + va = query->buffer.buf->gpu_address + query->buffer.results_end; + query->ops->emit_start(sctx, query, query->buffer.buf, va); } -static void si_query_hw_do_emit_stop(struct si_context *sctx, - struct si_query_hw *query, - struct si_resource *buffer, - uint64_t va) +static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query, + struct si_resource *buffer, uint64_t va) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - uint64_t fence_va = 0; - - switch (query->b.type) { - case SI_QUERY_TIME_ELAPSED_SDMA: - si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address); - return; - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - va += 8; - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - - fence_va = va + sctx->screen->info.num_render_backends * 16 - 8; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_SO_STATISTICS: - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - va += 16; - emit_sample_streamout(cs, va, query->stream); - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - va += 16; - for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) - emit_sample_streamout(cs, va + 32 * stream, stream); - break; - case PIPE_QUERY_TIME_ELAPSED: - va += 8; - /* fall through */ - case PIPE_QUERY_TIMESTAMP: - si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, - EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_TIMESTAMP, NULL, va, - 0, query->b.type); - fence_va = va + 8; - break; - case PIPE_QUERY_PIPELINE_STATISTICS: { - unsigned sample_size = (query->result_size - 8) / 2; - - va += sample_size; - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - - fence_va = va + sample_size; - break; - } - default: - assert(0); - } - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE, - RADEON_PRIO_QUERY); - - if (fence_va) { - si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, - EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, - query->buffer.buf, fence_va, 0x80000000, - query->b.type); - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + uint64_t fence_va = 0; + + switch (query->b.type) { + case SI_QUERY_TIME_ELAPSED_SDMA: + si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address); + return; + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + va += 8; + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + + fence_va = va + sctx->screen->info.num_render_backends * 16 - 8; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + va += 16; + emit_sample_streamout(cs, va, query->stream); + 
break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + va += 16; + for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) + emit_sample_streamout(cs, va + 32 * stream, stream); + break; + case PIPE_QUERY_TIME_ELAPSED: + va += 8; + /* fall through */ + case PIPE_QUERY_TIMESTAMP: + si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, + EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type); + fence_va = va + 8; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: { + unsigned sample_size = (query->result_size - 8) / 2; + + va += sample_size; + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + + fence_va = va + sample_size; + break; + } + default: + assert(0); + } + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); + + if (fence_va) { + si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, + EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000, + query->b.type); + } } -static void si_query_hw_emit_stop(struct si_context *sctx, - struct si_query_hw *query) +static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query) { - uint64_t va; + uint64_t va; - /* The queries which need begin already called this in begin_query. */ - if (query->flags & SI_QUERY_HW_FLAG_NO_START) { - si_need_gfx_cs_space(sctx); - if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, - query->result_size)) - return; - } + /* The queries which need begin already called this in begin_query. */ + if (query->flags & SI_QUERY_HW_FLAG_NO_START) { + si_need_gfx_cs_space(sctx); + if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, + query->result_size)) + return; + } - if (!query->buffer.buf) - return; // previous buffer allocation failure + if (!query->buffer.buf) + return; // previous buffer allocation failure - /* emit end query */ - va = query->buffer.buf->gpu_address + query->buffer.results_end; + /* emit end query */ + va = query->buffer.buf->gpu_address + query->buffer.results_end; - query->ops->emit_stop(sctx, query, query->buffer.buf, va); + query->ops->emit_stop(sctx, query, query->buffer.buf, va); - query->buffer.results_end += query->result_size; + query->buffer.results_end += query->result_size; - si_update_occlusion_query_state(sctx, query->b.type, -1); - si_update_prims_generated_query_state(sctx, query->b.type, -1); + si_update_occlusion_query_state(sctx, query->b.type, -1); + si_update_prims_generated_query_state(sctx, query->b.type, -1); - if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS) - sctx->num_pipeline_stat_queries--; + if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS) + sctx->num_pipeline_stat_queries--; } -static void emit_set_predicate(struct si_context *ctx, - struct si_resource *buf, uint64_t va, - uint32_t op) +static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va, + uint32_t op) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - - if (ctx->chip_class >= GFX9) { - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0)); - radeon_emit(cs, op); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - } else { - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); - radeon_emit(cs, va); - radeon_emit(cs, op | ((va >> 32) & 0xFF)); - } - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_READ, - RADEON_PRIO_QUERY); + struct 
radeon_cmdbuf *cs = ctx->gfx_cs; + + if (ctx->chip_class >= GFX9) { + radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0)); + radeon_emit(cs, op); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + } else { + radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); + radeon_emit(cs, va); + radeon_emit(cs, op | ((va >> 32) & 0xFF)); + } + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_READ, RADEON_PRIO_QUERY); } static void si_emit_query_predication(struct si_context *ctx) { - struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond; - struct si_query_buffer *qbuf; - uint32_t op; - bool flag_wait, invert; - - if (!query) - return; - - if (ctx->screen->use_ngg_streamout && - (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || - query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) { - assert(!"not implemented"); - } - - invert = ctx->render_cond_invert; - flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT || - ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT; - - if (query->workaround_buf) { - op = PRED_OP(PREDICATION_OP_BOOL64); - } else { - switch (query->b.type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - op = PRED_OP(PREDICATION_OP_ZPASS); - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - op = PRED_OP(PREDICATION_OP_PRIMCOUNT); - invert = !invert; - break; - default: - assert(0); - return; - } - } - - /* if true then invert, see GL_ARB_conditional_render_inverted */ - if (invert) - op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */ - else - op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */ - - /* Use the value written by compute shader as a workaround. Note that - * the wait flag does not apply in this predication mode. - * - * The shader outputs the result value to L2. Workarounds only affect GFX8 - * and later, where the CP reads data from L2, so we don't need an - * additional flush. - */ - if (query->workaround_buf) { - uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset; - emit_set_predicate(ctx, query->workaround_buf, va, op); - return; - } - - op |= flag_wait ? 
PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; - - /* emit predicate packets for all data blocks */ - for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - unsigned results_base = 0; - uint64_t va_base = qbuf->buf->gpu_address; - - while (results_base < qbuf->results_end) { - uint64_t va = va_base + results_base; - - if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { - for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { - emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op); - - /* set CONTINUE bit for all packets except the first */ - op |= PREDICATION_CONTINUE; - } - } else { - emit_set_predicate(ctx, qbuf->buf, va, op); - op |= PREDICATION_CONTINUE; - } - - results_base += query->result_size; - } - } + struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond; + struct si_query_buffer *qbuf; + uint32_t op; + bool flag_wait, invert; + + if (!query) + return; + + if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) { + assert(!"not implemented"); + } + + invert = ctx->render_cond_invert; + flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT || + ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT; + + if (query->workaround_buf) { + op = PRED_OP(PREDICATION_OP_BOOL64); + } else { + switch (query->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + op = PRED_OP(PREDICATION_OP_ZPASS); + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + op = PRED_OP(PREDICATION_OP_PRIMCOUNT); + invert = !invert; + break; + default: + assert(0); + return; + } + } + + /* if true then invert, see GL_ARB_conditional_render_inverted */ + if (invert) + op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */ + else + op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */ + + /* Use the value written by compute shader as a workaround. Note that + * the wait flag does not apply in this predication mode. + * + * The shader outputs the result value to L2. Workarounds only affect GFX8 + * and later, where the CP reads data from L2, so we don't need an + * additional flush. + */ + if (query->workaround_buf) { + uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset; + emit_set_predicate(ctx, query->workaround_buf, va, op); + return; + } + + op |= flag_wait ? 
PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; + + /* emit predicate packets for all data blocks */ + for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned results_base = 0; + uint64_t va_base = qbuf->buf->gpu_address; + + while (results_base < qbuf->results_end) { + uint64_t va = va_base + results_base; + + if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { + for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { + emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op); + + /* set CONTINUE bit for all packets except the first */ + op |= PREDICATION_CONTINUE; + } + } else { + emit_set_predicate(ctx, qbuf->buf, va, op); + op |= PREDICATION_CONTINUE; + } + + results_base += query->result_size; + } + } } -static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index) +static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type, + unsigned index) { - struct si_screen *sscreen = - (struct si_screen *)ctx->screen; - - if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || - query_type == PIPE_QUERY_GPU_FINISHED || - (query_type >= PIPE_QUERY_DRIVER_SPECIFIC && - query_type != SI_QUERY_TIME_ELAPSED_SDMA)) - return si_query_sw_create(query_type); - - if (sscreen->use_ngg_streamout && - (query_type == PIPE_QUERY_PRIMITIVES_EMITTED || - query_type == PIPE_QUERY_PRIMITIVES_GENERATED || - query_type == PIPE_QUERY_SO_STATISTICS || - query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || - query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) - return gfx10_sh_query_create(sscreen, query_type, index); - - return si_query_hw_create(sscreen, query_type, index); + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + + if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED || + (query_type >= PIPE_QUERY_DRIVER_SPECIFIC && query_type != SI_QUERY_TIME_ELAPSED_SDMA)) + return si_query_sw_create(query_type); + + if (sscreen->use_ngg_streamout && + (query_type == PIPE_QUERY_PRIMITIVES_EMITTED || + query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS || + query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) + return gfx10_sh_query_create(sscreen, query_type, index); + + return si_query_hw_create(sscreen, query_type, index); } static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_query *squery = (struct si_query *)query; + struct si_context *sctx = (struct si_context *)ctx; + struct si_query *squery = (struct si_query *)query; - squery->ops->destroy(sctx, squery); + squery->ops->destroy(sctx, squery); } -static bool si_begin_query(struct pipe_context *ctx, - struct pipe_query *query) +static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_query *squery = (struct si_query *)query; + struct si_context *sctx = (struct si_context *)ctx; + struct si_query *squery = (struct si_query *)query; - return squery->ops->begin(sctx, squery); + return squery->ops->begin(sctx, squery); } -bool si_query_hw_begin(struct si_context *sctx, - struct si_query *squery) +bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery) { - struct si_query_hw *query = (struct si_query_hw *)squery; + struct si_query_hw *query = (struct si_query_hw *)squery; - if (query->flags & SI_QUERY_HW_FLAG_NO_START) { - assert(0); 
- return false; - } + if (query->flags & SI_QUERY_HW_FLAG_NO_START) { + assert(0); + return false; + } - if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES)) - si_query_buffer_reset(sctx, &query->buffer); + if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES)) + si_query_buffer_reset(sctx, &query->buffer); - si_resource_reference(&query->workaround_buf, NULL); + si_resource_reference(&query->workaround_buf, NULL); - si_query_hw_emit_start(sctx, query); - if (!query->buffer.buf) - return false; + si_query_hw_emit_start(sctx, query); + if (!query->buffer.buf) + return false; - list_addtail(&query->b.active_list, &sctx->active_queries); - sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend; - return true; + list_addtail(&query->b.active_list, &sctx->active_queries); + sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend; + return true; } static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_query *squery = (struct si_query *)query; + struct si_context *sctx = (struct si_context *)ctx; + struct si_query *squery = (struct si_query *)query; - return squery->ops->end(sctx, squery); + return squery->ops->end(sctx, squery); } -bool si_query_hw_end(struct si_context *sctx, - struct si_query *squery) +bool si_query_hw_end(struct si_context *sctx, struct si_query *squery) { - struct si_query_hw *query = (struct si_query_hw *)squery; + struct si_query_hw *query = (struct si_query_hw *)squery; - if (query->flags & SI_QUERY_HW_FLAG_NO_START) - si_query_buffer_reset(sctx, &query->buffer); + if (query->flags & SI_QUERY_HW_FLAG_NO_START) + si_query_buffer_reset(sctx, &query->buffer); - si_query_hw_emit_stop(sctx, query); + si_query_hw_emit_stop(sctx, query); - if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) { - list_delinit(&query->b.active_list); - sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend; - } + if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) { + list_delinit(&query->b.active_list); + sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend; + } - if (!query->buffer.buf) - return false; + if (!query->buffer.buf) + return false; - return true; + return true; } -static void si_get_hw_query_params(struct si_context *sctx, - struct si_query_hw *squery, int index, - struct si_hw_query_params *params) +static void si_get_hw_query_params(struct si_context *sctx, struct si_query_hw *squery, int index, + struct si_hw_query_params *params) { - unsigned max_rbs = sctx->screen->info.num_render_backends; - - params->pair_stride = 0; - params->pair_count = 1; - - switch (squery->b.type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - params->start_offset = 0; - params->end_offset = 8; - params->fence_offset = max_rbs * 16; - params->pair_stride = 16; - params->pair_count = max_rbs; - break; - case PIPE_QUERY_TIME_ELAPSED: - params->start_offset = 0; - params->end_offset = 8; - params->fence_offset = 16; - break; - case PIPE_QUERY_TIMESTAMP: - params->start_offset = 0; - params->end_offset = 0; - params->fence_offset = 8; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - params->start_offset = 8; - params->end_offset = 24; - params->fence_offset = params->end_offset + 4; - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - params->start_offset = 0; - params->end_offset = 16; - params->fence_offset = params->end_offset + 4; - break; - case PIPE_QUERY_SO_STATISTICS: - params->start_offset = 8 - index * 8; - 
params->end_offset = 24 - index * 8; - params->fence_offset = params->end_offset + 4; - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - params->pair_count = SI_MAX_STREAMS; - params->pair_stride = 32; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - params->start_offset = 0; - params->end_offset = 16; - - /* We can re-use the high dword of the last 64-bit value as a - * fence: it is initialized as 0, and the high bit is set by - * the write of the streamout stats event. - */ - params->fence_offset = squery->result_size - 4; - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - { - static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80}; - params->start_offset = offsets[index]; - params->end_offset = 88 + offsets[index]; - params->fence_offset = 2 * 88; - break; - } - default: - unreachable("si_get_hw_query_params unsupported"); - } + unsigned max_rbs = sctx->screen->info.num_render_backends; + + params->pair_stride = 0; + params->pair_count = 1; + + switch (squery->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + params->start_offset = 0; + params->end_offset = 8; + params->fence_offset = max_rbs * 16; + params->pair_stride = 16; + params->pair_count = max_rbs; + break; + case PIPE_QUERY_TIME_ELAPSED: + params->start_offset = 0; + params->end_offset = 8; + params->fence_offset = 16; + break; + case PIPE_QUERY_TIMESTAMP: + params->start_offset = 0; + params->end_offset = 0; + params->fence_offset = 8; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + params->start_offset = 8; + params->end_offset = 24; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + params->start_offset = 0; + params->end_offset = 16; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_SO_STATISTICS: + params->start_offset = 8 - index * 8; + params->end_offset = 24 - index * 8; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + params->pair_count = SI_MAX_STREAMS; + params->pair_stride = 32; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + params->start_offset = 0; + params->end_offset = 16; + + /* We can re-use the high dword of the last 64-bit value as a + * fence: it is initialized as 0, and the high bit is set by + * the write of the streamout stats event. 
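+ *
+ * Each stream sample is four u64 counters, 32 bytes in total: the
+ * begin pair written at offset 0 and the end pair written at offset 16
+ * by emit_sample_streamout(), so result_size - 4 is the high dword of
+ * the last counter of the last (or only) stream (a sketch based on the
+ * offsets used in this file).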
+ */ + params->fence_offset = squery->result_size - 4; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: { + static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80}; + params->start_offset = offsets[index]; + params->end_offset = 88 + offsets[index]; + params->fence_offset = 2 * 88; + break; + } + default: + unreachable("si_get_hw_query_params unsupported"); + } } static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index, - bool test_status_bit) + bool test_status_bit) { - uint32_t *current_result = (uint32_t*)map; - uint64_t start, end; - - start = (uint64_t)current_result[start_index] | - (uint64_t)current_result[start_index+1] << 32; - end = (uint64_t)current_result[end_index] | - (uint64_t)current_result[end_index+1] << 32; - - if (!test_status_bit || - ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) { - return end - start; - } - return 0; + uint32_t *current_result = (uint32_t *)map; + uint64_t start, end; + + start = (uint64_t)current_result[start_index] | (uint64_t)current_result[start_index + 1] << 32; + end = (uint64_t)current_result[end_index] | (uint64_t)current_result[end_index + 1] << 32; + + if (!test_status_bit || ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) { + return end - start; + } + return 0; } -static void si_query_hw_add_result(struct si_screen *sscreen, - struct si_query_hw *query, - void *buffer, - union pipe_query_result *result) +static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query, + void *buffer, union pipe_query_result *result) { - unsigned max_rbs = sscreen->info.num_render_backends; - - switch (query->b.type) { - case PIPE_QUERY_OCCLUSION_COUNTER: { - for (unsigned i = 0; i < max_rbs; ++i) { - unsigned results_base = i * 16; - result->u64 += - si_query_read_result(buffer + results_base, 0, 2, true); - } - break; - } - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: { - for (unsigned i = 0; i < max_rbs; ++i) { - unsigned results_base = i * 16; - result->b = result->b || - si_query_read_result(buffer + results_base, 0, 2, true) != 0; - } - break; - } - case PIPE_QUERY_TIME_ELAPSED: - result->u64 += si_query_read_result(buffer, 0, 2, false); - break; - case SI_QUERY_TIME_ELAPSED_SDMA: - result->u64 += si_query_read_result(buffer, 0, 32/4, false); - break; - case PIPE_QUERY_TIMESTAMP: - result->u64 = *(uint64_t*)buffer; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - /* SAMPLE_STREAMOUTSTATS stores this structure: - * { - * u64 NumPrimitivesWritten; - * u64 PrimitiveStorageNeeded; - * } - * We only need NumPrimitivesWritten here. */ - result->u64 += si_query_read_result(buffer, 2, 6, true); - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - /* Here we read PrimitiveStorageNeeded. 
*/ - result->u64 += si_query_read_result(buffer, 0, 4, true); - break; - case PIPE_QUERY_SO_STATISTICS: - result->so_statistics.num_primitives_written += - si_query_read_result(buffer, 2, 6, true); - result->so_statistics.primitives_storage_needed += - si_query_read_result(buffer, 0, 4, true); - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - result->b = result->b || - si_query_read_result(buffer, 2, 6, true) != - si_query_read_result(buffer, 0, 4, true); - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { - result->b = result->b || - si_query_read_result(buffer, 2, 6, true) != - si_query_read_result(buffer, 0, 4, true); - buffer = (char *)buffer + 32; - } - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - result->pipeline_statistics.ps_invocations += - si_query_read_result(buffer, 0, 22, false); - result->pipeline_statistics.c_primitives += - si_query_read_result(buffer, 2, 24, false); - result->pipeline_statistics.c_invocations += - si_query_read_result(buffer, 4, 26, false); - result->pipeline_statistics.vs_invocations += - si_query_read_result(buffer, 6, 28, false); - result->pipeline_statistics.gs_invocations += - si_query_read_result(buffer, 8, 30, false); - result->pipeline_statistics.gs_primitives += - si_query_read_result(buffer, 10, 32, false); - result->pipeline_statistics.ia_primitives += - si_query_read_result(buffer, 12, 34, false); - result->pipeline_statistics.ia_vertices += - si_query_read_result(buffer, 14, 36, false); - result->pipeline_statistics.hs_invocations += - si_query_read_result(buffer, 16, 38, false); - result->pipeline_statistics.ds_invocations += - si_query_read_result(buffer, 18, 40, false); - result->pipeline_statistics.cs_invocations += - si_query_read_result(buffer, 20, 42, false); + unsigned max_rbs = sscreen->info.num_render_backends; + + switch (query->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: { + for (unsigned i = 0; i < max_rbs; ++i) { + unsigned results_base = i * 16; + result->u64 += si_query_read_result(buffer + results_base, 0, 2, true); + } + break; + } + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: { + for (unsigned i = 0; i < max_rbs; ++i) { + unsigned results_base = i * 16; + result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0; + } + break; + } + case PIPE_QUERY_TIME_ELAPSED: + result->u64 += si_query_read_result(buffer, 0, 2, false); + break; + case SI_QUERY_TIME_ELAPSED_SDMA: + result->u64 += si_query_read_result(buffer, 0, 32 / 4, false); + break; + case PIPE_QUERY_TIMESTAMP: + result->u64 = *(uint64_t *)buffer; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + /* SAMPLE_STREAMOUTSTATS stores this structure: + * { + * u64 NumPrimitivesWritten; + * u64 PrimitiveStorageNeeded; + * } + * We only need NumPrimitivesWritten here. */ + result->u64 += si_query_read_result(buffer, 2, 6, true); + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + /* Here we read PrimitiveStorageNeeded. 
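+ *
+ * The (0, 4) arguments are dword indices into the 32-byte sample:
+ * dwords 0-1 hold the begin value, dwords 4-5 the end value, and
+ * si_query_read_result() returns end - start once bit 63 of both has
+ * been set by the SAMPLE_STREAMOUTSTATS writes. The (2, 6) pair used
+ * above selects NumPrimitivesWritten instead (worked example of the
+ * index scheme, nothing more).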
*/ + result->u64 += si_query_read_result(buffer, 0, 4, true); + break; + case PIPE_QUERY_SO_STATISTICS: + result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true); + result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true); + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + result->b = result->b || si_query_read_result(buffer, 2, 6, true) != + si_query_read_result(buffer, 0, 4, true); + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { + result->b = result->b || si_query_read_result(buffer, 2, 6, true) != + si_query_read_result(buffer, 0, 4, true); + buffer = (char *)buffer + 32; + } + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + result->pipeline_statistics.ps_invocations += si_query_read_result(buffer, 0, 22, false); + result->pipeline_statistics.c_primitives += si_query_read_result(buffer, 2, 24, false); + result->pipeline_statistics.c_invocations += si_query_read_result(buffer, 4, 26, false); + result->pipeline_statistics.vs_invocations += si_query_read_result(buffer, 6, 28, false); + result->pipeline_statistics.gs_invocations += si_query_read_result(buffer, 8, 30, false); + result->pipeline_statistics.gs_primitives += si_query_read_result(buffer, 10, 32, false); + result->pipeline_statistics.ia_primitives += si_query_read_result(buffer, 12, 34, false); + result->pipeline_statistics.ia_vertices += si_query_read_result(buffer, 14, 36, false); + result->pipeline_statistics.hs_invocations += si_query_read_result(buffer, 16, 38, false); + result->pipeline_statistics.ds_invocations += si_query_read_result(buffer, 18, 40, false); + result->pipeline_statistics.cs_invocations += si_query_read_result(buffer, 20, 42, false); #if 0 /* for testing */ printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, " "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, " @@ -1402,444 +1346,416 @@ static void si_query_hw_add_result(struct si_screen *sscreen, result->pipeline_statistics.ps_invocations, result->pipeline_statistics.cs_invocations); #endif - break; - default: - assert(0); - } + break; + default: + assert(0); + } } void si_query_hw_suspend(struct si_context *sctx, struct si_query *query) { - si_query_hw_emit_stop(sctx, (struct si_query_hw *)query); + si_query_hw_emit_stop(sctx, (struct si_query_hw *)query); } void si_query_hw_resume(struct si_context *sctx, struct si_query *query) { - si_query_hw_emit_start(sctx, (struct si_query_hw *)query); + si_query_hw_emit_start(sctx, (struct si_query_hw *)query); } static const struct si_query_ops query_hw_ops = { - .destroy = si_query_hw_destroy, - .begin = si_query_hw_begin, - .end = si_query_hw_end, - .get_result = si_query_hw_get_result, - .get_result_resource = si_query_hw_get_result_resource, - - .suspend = si_query_hw_suspend, - .resume = si_query_hw_resume, + .destroy = si_query_hw_destroy, + .begin = si_query_hw_begin, + .end = si_query_hw_end, + .get_result = si_query_hw_get_result, + .get_result_resource = si_query_hw_get_result_resource, + + .suspend = si_query_hw_suspend, + .resume = si_query_hw_resume, }; -static bool si_get_query_result(struct pipe_context *ctx, - struct pipe_query *query, bool wait, - union pipe_query_result *result) +static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait, + union pipe_query_result *result) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_query *squery = (struct si_query *)query; + struct si_context *sctx 
= (struct si_context *)ctx; + struct si_query *squery = (struct si_query *)query; - return squery->ops->get_result(sctx, squery, wait, result); + return squery->ops->get_result(sctx, squery, wait, result); } -static void si_get_query_result_resource(struct pipe_context *ctx, - struct pipe_query *query, - bool wait, - enum pipe_query_value_type result_type, - int index, - struct pipe_resource *resource, - unsigned offset) +static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query, + bool wait, enum pipe_query_value_type result_type, + int index, struct pipe_resource *resource, unsigned offset) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_query *squery = (struct si_query *)query; + struct si_context *sctx = (struct si_context *)ctx; + struct si_query *squery = (struct si_query *)query; - squery->ops->get_result_resource(sctx, squery, wait, result_type, index, - resource, offset); + squery->ops->get_result_resource(sctx, squery, wait, result_type, index, resource, offset); } -static void si_query_hw_clear_result(struct si_query_hw *query, - union pipe_query_result *result) +static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result) { - util_query_clear_result(result, query->b.type); + util_query_clear_result(result, query->b.type); } -bool si_query_hw_get_result(struct si_context *sctx, - struct si_query *squery, - bool wait, union pipe_query_result *result) +bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait, + union pipe_query_result *result) { - struct si_screen *sscreen = sctx->screen; - struct si_query_hw *query = (struct si_query_hw *)squery; - struct si_query_buffer *qbuf; - - query->ops->clear_result(query, result); - - for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - unsigned usage = PIPE_TRANSFER_READ | - (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); - unsigned results_base = 0; - void *map; - - if (squery->b.flushed) - map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); - else - map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage); - - if (!map) - return false; - - while (results_base != qbuf->results_end) { - query->ops->add_result(sscreen, query, map + results_base, - result); - results_base += query->result_size; - } - } - - /* Convert the time to expected units. */ - if (squery->type == PIPE_QUERY_TIME_ELAPSED || - squery->type == SI_QUERY_TIME_ELAPSED_SDMA || - squery->type == PIPE_QUERY_TIMESTAMP) { - result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq; - } - return true; + struct si_screen *sscreen = sctx->screen; + struct si_query_hw *query = (struct si_query_hw *)squery; + struct si_query_buffer *qbuf; + + query->ops->clear_result(query, result); + + for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); + unsigned results_base = 0; + void *map; + + if (squery->b.flushed) + map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); + else + map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage); + + if (!map) + return false; + + while (results_base != qbuf->results_end) { + query->ops->add_result(sscreen, query, map + results_base, result); + results_base += query->result_size; + } + } + + /* Convert the time to expected units. 
*/ + if (squery->type == PIPE_QUERY_TIME_ELAPSED || squery->type == SI_QUERY_TIME_ELAPSED_SDMA || + squery->type == PIPE_QUERY_TIMESTAMP) { + result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq; + } + return true; } -static void si_query_hw_get_result_resource(struct si_context *sctx, - struct si_query *squery, - bool wait, - enum pipe_query_value_type result_type, - int index, - struct pipe_resource *resource, - unsigned offset) +static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery, + bool wait, enum pipe_query_value_type result_type, + int index, struct pipe_resource *resource, + unsigned offset) { - struct si_query_hw *query = (struct si_query_hw *)squery; - struct si_query_buffer *qbuf; - struct si_query_buffer *qbuf_prev; - struct pipe_resource *tmp_buffer = NULL; - unsigned tmp_buffer_offset = 0; - struct si_qbo_state saved_state = {}; - struct pipe_grid_info grid = {}; - struct pipe_constant_buffer constant_buffer = {}; - struct pipe_shader_buffer ssbo[3]; - struct si_hw_query_params params; - struct { - uint32_t end_offset; - uint32_t result_stride; - uint32_t result_count; - uint32_t config; - uint32_t fence_offset; - uint32_t pair_stride; - uint32_t pair_count; - } consts; - - if (!sctx->query_result_shader) { - sctx->query_result_shader = si_create_query_result_cs(sctx); - if (!sctx->query_result_shader) - return; - } - - if (query->buffer.previous) { - u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, - &tmp_buffer_offset, &tmp_buffer); - if (!tmp_buffer) - return; - } - - si_save_qbo_state(sctx, &saved_state); - - si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, ¶ms); - consts.end_offset = params.end_offset - params.start_offset; - consts.fence_offset = params.fence_offset - params.start_offset; - consts.result_stride = query->result_size; - consts.pair_stride = params.pair_stride; - consts.pair_count = params.pair_count; - - constant_buffer.buffer_size = sizeof(consts); - constant_buffer.user_buffer = &consts; - - ssbo[1].buffer = tmp_buffer; - ssbo[1].buffer_offset = tmp_buffer_offset; - ssbo[1].buffer_size = 16; - - ssbo[2] = ssbo[1]; - - sctx->b.bind_compute_state(&sctx->b, sctx->query_result_shader); - - grid.block[0] = 1; - grid.block[1] = 1; - grid.block[2] = 1; - grid.grid[0] = 1; - grid.grid[1] = 1; - grid.grid[2] = 1; - - consts.config = 0; - if (index < 0) - consts.config |= 4; - if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE || - query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) - consts.config |= 8; - else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || - query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) - consts.config |= 8 | 256; - else if (query->b.type == PIPE_QUERY_TIMESTAMP || - query->b.type == PIPE_QUERY_TIME_ELAPSED) - consts.config |= 32; - - switch (result_type) { - case PIPE_QUERY_TYPE_U64: - case PIPE_QUERY_TYPE_I64: - consts.config |= 64; - break; - case PIPE_QUERY_TYPE_I32: - consts.config |= 128; - break; - case PIPE_QUERY_TYPE_U32: - break; - } - - sctx->flags |= sctx->screen->barrier_flags.cp_to_L2; - - for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) { - if (query->b.type != PIPE_QUERY_TIMESTAMP) { - qbuf_prev = qbuf->previous; - consts.result_count = qbuf->results_end / query->result_size; - consts.config &= ~3; - if (qbuf != &query->buffer) - consts.config |= 1; - if (qbuf->previous) - consts.config |= 2; - } else { - /* Only read the last timestamp. 
*/ - qbuf_prev = NULL; - consts.result_count = 0; - consts.config |= 16; - params.start_offset += qbuf->results_end - query->result_size; - } - - sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer); - - ssbo[0].buffer = &qbuf->buf->b.b; - ssbo[0].buffer_offset = params.start_offset; - ssbo[0].buffer_size = qbuf->results_end - params.start_offset; - - if (!qbuf->previous) { - ssbo[2].buffer = resource; - ssbo[2].buffer_offset = offset; - ssbo[2].buffer_size = 8; - - si_resource(resource)->TC_L2_dirty = true; - } - - sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, - 1 << 2); - - if (wait && qbuf == &query->buffer) { - uint64_t va; - - /* Wait for result availability. Wait only for readiness - * of the last entry, since the fence writes should be - * serialized in the CP. - */ - va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size; - va += params.fence_offset; - - si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x80000000, - 0x80000000, WAIT_REG_MEM_EQUAL); - } - - sctx->b.launch_grid(&sctx->b, &grid); - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; - } - - si_restore_qbo_state(sctx, &saved_state); - pipe_resource_reference(&tmp_buffer, NULL); + struct si_query_hw *query = (struct si_query_hw *)squery; + struct si_query_buffer *qbuf; + struct si_query_buffer *qbuf_prev; + struct pipe_resource *tmp_buffer = NULL; + unsigned tmp_buffer_offset = 0; + struct si_qbo_state saved_state = {}; + struct pipe_grid_info grid = {}; + struct pipe_constant_buffer constant_buffer = {}; + struct pipe_shader_buffer ssbo[3]; + struct si_hw_query_params params; + struct { + uint32_t end_offset; + uint32_t result_stride; + uint32_t result_count; + uint32_t config; + uint32_t fence_offset; + uint32_t pair_stride; + uint32_t pair_count; + } consts; + + if (!sctx->query_result_shader) { + sctx->query_result_shader = si_create_query_result_cs(sctx); + if (!sctx->query_result_shader) + return; + } + + if (query->buffer.previous) { + u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer); + if (!tmp_buffer) + return; + } + + si_save_qbo_state(sctx, &saved_state); + + si_get_hw_query_params(sctx, query, index >= 0 ? 
index : 0, &params);
+ consts.end_offset = params.end_offset - params.start_offset;
+ consts.fence_offset = params.fence_offset - params.start_offset;
+ consts.result_stride = query->result_size;
+ consts.pair_stride = params.pair_stride;
+ consts.pair_count = params.pair_count;
+
+ constant_buffer.buffer_size = sizeof(consts);
+ constant_buffer.user_buffer = &consts;
+
+ ssbo[1].buffer = tmp_buffer;
+ ssbo[1].buffer_offset = tmp_buffer_offset;
+ ssbo[1].buffer_size = 16;
+
+ ssbo[2] = ssbo[1];
+
+ sctx->b.bind_compute_state(&sctx->b, sctx->query_result_shader);
+
+ grid.block[0] = 1;
+ grid.block[1] = 1;
+ grid.block[2] = 1;
+ grid.grid[0] = 1;
+ grid.grid[1] = 1;
+ grid.grid[2] = 1;
+
+ consts.config = 0;
+ if (index < 0)
+ consts.config |= 4;
+ if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+ query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
+ consts.config |= 8;
+ else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+ consts.config |= 8 | 256;
+ else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED)
+ consts.config |= 32;
+
+ switch (result_type) {
+ case PIPE_QUERY_TYPE_U64:
+ case PIPE_QUERY_TYPE_I64:
+ consts.config |= 64;
+ break;
+ case PIPE_QUERY_TYPE_I32:
+ consts.config |= 128;
+ break;
+ case PIPE_QUERY_TYPE_U32:
+ break;
+ }
+
+ sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
+
+ for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
+ if (query->b.type != PIPE_QUERY_TIMESTAMP) {
+ qbuf_prev = qbuf->previous;
+ consts.result_count = qbuf->results_end / query->result_size;
+ consts.config &= ~3;
+ if (qbuf != &query->buffer)
+ consts.config |= 1;
+ if (qbuf->previous)
+ consts.config |= 2;
+ } else {
+ /* Only read the last timestamp. */
+ qbuf_prev = NULL;
+ consts.result_count = 0;
+ consts.config |= 16;
+ params.start_offset += qbuf->results_end - query->result_size;
+ }
+
+ sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+
+ ssbo[0].buffer = &qbuf->buf->b.b;
+ ssbo[0].buffer_offset = params.start_offset;
+ ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
+
+ if (!qbuf->previous) {
+ ssbo[2].buffer = resource;
+ ssbo[2].buffer_offset = offset;
+ ssbo[2].buffer_size = 8;
+
+ si_resource(resource)->TC_L2_dirty = true;
+ }
+
+ sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 1 << 2);
+
+ if (wait && qbuf == &query->buffer) {
+ uint64_t va;
+
+ /* Wait for result availability. Wait only for readiness
+ * of the last entry, since the fence writes should be
+ * serialized in the CP.
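+ *
+ * Concretely (sketch of the flow): si_query_hw_do_emit_stop() releases
+ * a 32-bit fence value of 0x80000000 at fence_offset inside each
+ * sample, so a single WAIT_REG_MEM on that dword of the newest sample
+ * is enough to cover the older ones as well.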
+ */ + va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size; + va += params.fence_offset; + + si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL); + } + + sctx->b.launch_grid(&sctx->b, &grid); + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + si_restore_qbo_state(sctx, &saved_state); + pipe_resource_reference(&tmp_buffer, NULL); } -static void si_render_condition(struct pipe_context *ctx, - struct pipe_query *query, - bool condition, - enum pipe_render_cond_flag mode) +static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition, + enum pipe_render_cond_flag mode) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_query_hw *squery = (struct si_query_hw *)query; - struct si_atom *atom = &sctx->atoms.s.render_cond; - - if (query) { - bool needs_workaround = false; - - /* There was a firmware regression in GFX8 which causes successive - * SET_PREDICATION packets to give the wrong answer for - * non-inverted stream overflow predication. - */ - if (((sctx->chip_class == GFX8 && sctx->screen->info.pfp_fw_feature < 49) || - (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) && - !condition && - (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE || - (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE && - (squery->buffer.previous || - squery->buffer.results_end > squery->result_size)))) { - needs_workaround = true; - } - - if (needs_workaround && !squery->workaround_buf) { - bool old_force_off = sctx->render_cond_force_off; - sctx->render_cond_force_off = true; - - u_suballocator_alloc( - sctx->allocator_zeroed_memory, 8, 8, - &squery->workaround_offset, - (struct pipe_resource **)&squery->workaround_buf); - - /* Reset to NULL to avoid a redundant SET_PREDICATION - * from launching the compute grid. - */ - sctx->render_cond = NULL; - - ctx->get_query_result_resource( - ctx, query, true, PIPE_QUERY_TYPE_U64, 0, - &squery->workaround_buf->b.b, squery->workaround_offset); - - /* Settings this in the render cond atom is too late, - * so set it here. */ - sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | - SI_CONTEXT_FLUSH_FOR_RENDER_COND; - - sctx->render_cond_force_off = old_force_off; - } - } - - sctx->render_cond = query; - sctx->render_cond_invert = condition; - sctx->render_cond_mode = mode; - - si_set_atom_dirty(sctx, atom, query != NULL); + struct si_context *sctx = (struct si_context *)ctx; + struct si_query_hw *squery = (struct si_query_hw *)query; + struct si_atom *atom = &sctx->atoms.s.render_cond; + + if (query) { + bool needs_workaround = false; + + /* There was a firmware regression in GFX8 which causes successive + * SET_PREDICATION packets to give the wrong answer for + * non-inverted stream overflow predication. 
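+ *
+ * Sketch of the workaround that follows: fold the overflow result
+ * into a single 64-bit boolean first, by running the compute resolve
+ * (get_query_result_resource) into an 8-byte zeroed suballocation,
+ * and then predicate on that value with PREDICATION_OP_BOOL64 in
+ * si_emit_query_predication() instead of PREDICATION_OP_PRIMCOUNT.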
+ */ + if (((sctx->chip_class == GFX8 && sctx->screen->info.pfp_fw_feature < 49) || + (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) && + !condition && + (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE || + (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE && + (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) { + needs_workaround = true; + } + + if (needs_workaround && !squery->workaround_buf) { + bool old_force_off = sctx->render_cond_force_off; + sctx->render_cond_force_off = true; + + u_suballocator_alloc(sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset, + (struct pipe_resource **)&squery->workaround_buf); + + /* Reset to NULL to avoid a redundant SET_PREDICATION + * from launching the compute grid. + */ + sctx->render_cond = NULL; + + ctx->get_query_result_resource(ctx, query, true, PIPE_QUERY_TYPE_U64, 0, + &squery->workaround_buf->b.b, squery->workaround_offset); + + /* Settings this in the render cond atom is too late, + * so set it here. */ + sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND; + + sctx->render_cond_force_off = old_force_off; + } + } + + sctx->render_cond = query; + sctx->render_cond_invert = condition; + sctx->render_cond_mode = mode; + + si_set_atom_dirty(sctx, atom, query != NULL); } void si_suspend_queries(struct si_context *sctx) { - struct si_query *query; + struct si_query *query; - LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, active_list) - query->ops->suspend(sctx, query); + LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list) + query->ops->suspend(sctx, query); } void si_resume_queries(struct si_context *sctx) { - struct si_query *query; + struct si_query *query; - /* Check CS space here. Resuming must not be interrupted by flushes. */ - si_need_gfx_cs_space(sctx); + /* Check CS space here. Resuming must not be interrupted by flushes. 
*/ + si_need_gfx_cs_space(sctx); - LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, active_list) - query->ops->resume(sctx, query); + LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list) + query->ops->resume(sctx, query); } -#define XFULL(name_, query_type_, type_, result_type_, group_id_) \ - { \ - .name = name_, \ - .query_type = SI_QUERY_##query_type_, \ - .type = PIPE_DRIVER_QUERY_TYPE_##type_, \ - .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \ - .group_id = group_id_ \ - } +#define XFULL(name_, query_type_, type_, result_type_, group_id_) \ + { \ + .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \ + .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_ \ + } -#define X(name_, query_type_, type_, result_type_) \ - XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0) +#define X(name_, query_type_, type_, result_type_) \ + XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0) -#define XG(group_, name_, query_type_, type_, result_type_) \ - XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_) +#define XG(group_, name_, query_type_, type_, result_type_) \ + XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_) static struct pipe_driver_query_info si_driver_query_list[] = { - X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE), - X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE), - X("draw-calls", DRAW_CALLS, UINT64, AVERAGE), - X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE), - X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE), - X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE), - X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE), - X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE), - X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE), - X("dma-calls", DMA_CALLS, UINT64, AVERAGE), - X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE), - X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE), - X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE), - X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE), - X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE), - X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE), - X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE), - X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE), - X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE), - X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE), - X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE), - X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE), - X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE), - X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE), - X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE), - X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE), - X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE), - X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE), - X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE), - X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE), - X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE), - X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE), - X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE), - X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE), - X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE), - X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE), - X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE), - X("VRAM-usage", VRAM_USAGE, 
BYTES, AVERAGE), - X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE), - X("GTT-usage", GTT_USAGE, BYTES, AVERAGE), - X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE), - X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE), - X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE), - X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE), - X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE), - X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE), - X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE), - - /* GPIN queries are for the benefit of old versions of GPUPerfStudio, - * which use it as a fallback path to detect the GPU type. - * - * Note: The names of these queries are significant for GPUPerfStudio - * (and possibly their order as well). */ - XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE), - XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE), - XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE), - XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE), - XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE), - - X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE), - X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE), - X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE), - - /* The following queries must be at the end of the list because their - * availability is adjusted dynamically based on the DRM version. */ - X("GPU-load", GPU_LOAD, UINT64, AVERAGE), - X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE), - X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE), - X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE), - X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE), - X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE), - X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE), - X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE), - X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE), - X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE), - X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE), - X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE), - X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE), - X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE), - - /* SRBM_STATUS2 */ - X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE), - - /* CP_STAT */ - X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE), - X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE), - X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE), - X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE), - X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE), - X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE), - - X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE), - X("pd-num-prims-rejected", PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE), - X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE,UINT64, AVERAGE), + X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE), + X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE), + X("draw-calls", DRAW_CALLS, UINT64, AVERAGE), + X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE), + X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE), + X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE), + X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE), + X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE), + X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE), + X("dma-calls", DMA_CALLS, UINT64, AVERAGE), + X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE), + X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE), + X("num-ps-flushes", 
NUM_PS_FLUSHES, UINT64, AVERAGE), + X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE), + X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE), + X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE), + X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE), + X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE), + X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE), + X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE), + X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE), + X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE), + X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE), + X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE), + X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE), + X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE), + X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE), + X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE), + X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE), + X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE), + X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE), + X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE), + X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE), + X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE), + X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE), + X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE), + X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE), + X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE), + X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE), + X("GTT-usage", GTT_USAGE, BYTES, AVERAGE), + X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE), + X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE), + X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE), + X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE), + X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE), + X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE), + X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE), + + /* GPIN queries are for the benefit of old versions of GPUPerfStudio, + * which use it as a fallback path to detect the GPU type. + * + * Note: The names of these queries are significant for GPUPerfStudio + * (and possibly their order as well). */ + XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE), + XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE), + XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE), + XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE), + XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE), + + X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE), + X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE), + X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE), + + /* The following queries must be at the end of the list because their + * availability is adjusted dynamically based on the DRM version. 
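+    *
+    * The adjustment is done by si_get_num_queries() further down, which only
+    * ever drops entries from the tail of this array; rough sketch of that
+    * pattern ("trimmed" is shorthand here, the exact counts live in that
+    * function):
+    *
+    *    unsigned n = ARRAY_SIZE(si_driver_query_list);
+    *    if (!sscreen->info.is_amdgpu && !sscreen->info.has_read_registers_query)
+    *       return n - 21;   // none of the GPU utilization queries are available
+    *    return n - trimmed; // 0, 6 or 7 depending on the winsys and chip_class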
*/ + X("GPU-load", GPU_LOAD, UINT64, AVERAGE), + X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE), + X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE), + X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE), + X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE), + X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE), + X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE), + X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE), + X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE), + X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE), + X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE), + X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE), + X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE), + X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE), + + /* SRBM_STATUS2 */ + X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE), + + /* CP_STAT */ + X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE), + X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE), + X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE), + X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE), + X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE), + X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE), + + X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE), + X("pd-num-prims-rejected", PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE), + X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE, UINT64, AVERAGE), }; #undef X @@ -1848,119 +1764,116 @@ static struct pipe_driver_query_info si_driver_query_list[] = { static unsigned si_get_num_queries(struct si_screen *sscreen) { - /* amdgpu */ - if (sscreen->info.is_amdgpu) { - if (sscreen->info.chip_class >= GFX8) - return ARRAY_SIZE(si_driver_query_list); - else - return ARRAY_SIZE(si_driver_query_list) - 7; - } - - /* radeon */ - if (sscreen->info.has_read_registers_query) { - if (sscreen->info.chip_class == GFX7) - return ARRAY_SIZE(si_driver_query_list) - 6; - else - return ARRAY_SIZE(si_driver_query_list) - 7; - } - - return ARRAY_SIZE(si_driver_query_list) - 21; + /* amdgpu */ + if (sscreen->info.is_amdgpu) { + if (sscreen->info.chip_class >= GFX8) + return ARRAY_SIZE(si_driver_query_list); + else + return ARRAY_SIZE(si_driver_query_list) - 7; + } + + /* radeon */ + if (sscreen->info.has_read_registers_query) { + if (sscreen->info.chip_class == GFX7) + return ARRAY_SIZE(si_driver_query_list) - 6; + else + return ARRAY_SIZE(si_driver_query_list) - 7; + } + + return ARRAY_SIZE(si_driver_query_list) - 21; } -static int si_get_driver_query_info(struct pipe_screen *screen, - unsigned index, - struct pipe_driver_query_info *info) +static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index, + struct pipe_driver_query_info *info) { - struct si_screen *sscreen = (struct si_screen*)screen; - unsigned num_queries = si_get_num_queries(sscreen); - - if (!info) { - unsigned num_perfcounters = - si_get_perfcounter_info(sscreen, 0, NULL); - - return num_queries + num_perfcounters; - } - - if (index >= num_queries) - return si_get_perfcounter_info(sscreen, index - num_queries, info); - - *info = si_driver_query_list[index]; - - switch (info->query_type) { - case SI_QUERY_REQUESTED_VRAM: - case SI_QUERY_VRAM_USAGE: - case SI_QUERY_MAPPED_VRAM: - info->max_value.u64 = sscreen->info.vram_size; - break; - case SI_QUERY_REQUESTED_GTT: - case SI_QUERY_GTT_USAGE: - case SI_QUERY_MAPPED_GTT: - info->max_value.u64 = sscreen->info.gart_size; - break; - case SI_QUERY_GPU_TEMPERATURE: - info->max_value.u64 = 125; - break; - case SI_QUERY_VRAM_VIS_USAGE: - info->max_value.u64 = sscreen->info.vram_vis_size; - 
break; - } - - if (info->group_id != ~(unsigned)0 && sscreen->perfcounters) - info->group_id += sscreen->perfcounters->num_groups; - - return 1; + struct si_screen *sscreen = (struct si_screen *)screen; + unsigned num_queries = si_get_num_queries(sscreen); + + if (!info) { + unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL); + + return num_queries + num_perfcounters; + } + + if (index >= num_queries) + return si_get_perfcounter_info(sscreen, index - num_queries, info); + + *info = si_driver_query_list[index]; + + switch (info->query_type) { + case SI_QUERY_REQUESTED_VRAM: + case SI_QUERY_VRAM_USAGE: + case SI_QUERY_MAPPED_VRAM: + info->max_value.u64 = sscreen->info.vram_size; + break; + case SI_QUERY_REQUESTED_GTT: + case SI_QUERY_GTT_USAGE: + case SI_QUERY_MAPPED_GTT: + info->max_value.u64 = sscreen->info.gart_size; + break; + case SI_QUERY_GPU_TEMPERATURE: + info->max_value.u64 = 125; + break; + case SI_QUERY_VRAM_VIS_USAGE: + info->max_value.u64 = sscreen->info.vram_vis_size; + break; + } + + if (info->group_id != ~(unsigned)0 && sscreen->perfcounters) + info->group_id += sscreen->perfcounters->num_groups; + + return 1; } /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware * performance counter groups, so be careful when changing this and related * functions. */ -static int si_get_driver_query_group_info(struct pipe_screen *screen, - unsigned index, - struct pipe_driver_query_group_info *info) +static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index, + struct pipe_driver_query_group_info *info) { - struct si_screen *sscreen = (struct si_screen *)screen; - unsigned num_pc_groups = 0; + struct si_screen *sscreen = (struct si_screen *)screen; + unsigned num_pc_groups = 0; - if (sscreen->perfcounters) - num_pc_groups = sscreen->perfcounters->num_groups; + if (sscreen->perfcounters) + num_pc_groups = sscreen->perfcounters->num_groups; - if (!info) - return num_pc_groups + SI_NUM_SW_QUERY_GROUPS; + if (!info) + return num_pc_groups + SI_NUM_SW_QUERY_GROUPS; - if (index < num_pc_groups) - return si_get_perfcounter_group_info(sscreen, index, info); + if (index < num_pc_groups) + return si_get_perfcounter_group_info(sscreen, index, info); - index -= num_pc_groups; - if (index >= SI_NUM_SW_QUERY_GROUPS) - return 0; + index -= num_pc_groups; + if (index >= SI_NUM_SW_QUERY_GROUPS) + return 0; - info->name = "GPIN"; - info->max_active_queries = 5; - info->num_queries = 5; - return 1; + info->name = "GPIN"; + info->max_active_queries = 5; + info->num_queries = 5; + return 1; } void si_init_query_functions(struct si_context *sctx) { - sctx->b.create_query = si_create_query; - sctx->b.create_batch_query = si_create_batch_query; - sctx->b.destroy_query = si_destroy_query; - sctx->b.begin_query = si_begin_query; - sctx->b.end_query = si_end_query; - sctx->b.get_query_result = si_get_query_result; - sctx->b.get_query_result_resource = si_get_query_result_resource; - - if (sctx->has_graphics) { - sctx->atoms.s.render_cond.emit = si_emit_query_predication; - sctx->b.render_condition = si_render_condition; - } - - list_inithead(&sctx->active_queries); + sctx->b.create_query = si_create_query; + sctx->b.create_batch_query = si_create_batch_query; + sctx->b.destroy_query = si_destroy_query; + sctx->b.begin_query = si_begin_query; + sctx->b.end_query = si_end_query; + sctx->b.get_query_result = si_get_query_result; + sctx->b.get_query_result_resource = si_get_query_result_resource; + + if (sctx->has_graphics) { + sctx->atoms.s.render_cond.emit = 
si_emit_query_predication; + sctx->b.render_condition = si_render_condition; + } + + list_inithead(&sctx->active_queries); } void si_init_screen_query_functions(struct si_screen *sscreen) { - sscreen->b.get_driver_query_info = si_get_driver_query_info; - sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info; + sscreen->b.get_driver_query_info = si_get_driver_query_info; + sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info; } diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index 6c4386451cc..1eaa3b255a6 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -40,236 +40,220 @@ struct si_resource; #define SI_MAX_STREAMS 4 -enum { - SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC, - SI_QUERY_DECOMPRESS_CALLS, - SI_QUERY_MRT_DRAW_CALLS, - SI_QUERY_PRIM_RESTART_CALLS, - SI_QUERY_SPILL_DRAW_CALLS, - SI_QUERY_COMPUTE_CALLS, - SI_QUERY_SPILL_COMPUTE_CALLS, - SI_QUERY_DMA_CALLS, - SI_QUERY_CP_DMA_CALLS, - SI_QUERY_NUM_VS_FLUSHES, - SI_QUERY_NUM_PS_FLUSHES, - SI_QUERY_NUM_CS_FLUSHES, - SI_QUERY_NUM_CB_CACHE_FLUSHES, - SI_QUERY_NUM_DB_CACHE_FLUSHES, - SI_QUERY_NUM_L2_INVALIDATES, - SI_QUERY_NUM_L2_WRITEBACKS, - SI_QUERY_NUM_RESIDENT_HANDLES, - SI_QUERY_TC_OFFLOADED_SLOTS, - SI_QUERY_TC_DIRECT_SLOTS, - SI_QUERY_TC_NUM_SYNCS, - SI_QUERY_CS_THREAD_BUSY, - SI_QUERY_GALLIUM_THREAD_BUSY, - SI_QUERY_REQUESTED_VRAM, - SI_QUERY_REQUESTED_GTT, - SI_QUERY_MAPPED_VRAM, - SI_QUERY_MAPPED_GTT, - SI_QUERY_BUFFER_WAIT_TIME, - SI_QUERY_NUM_MAPPED_BUFFERS, - SI_QUERY_NUM_GFX_IBS, - SI_QUERY_NUM_SDMA_IBS, - SI_QUERY_GFX_BO_LIST_SIZE, - SI_QUERY_GFX_IB_SIZE, - SI_QUERY_NUM_BYTES_MOVED, - SI_QUERY_NUM_EVICTIONS, - SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS, - SI_QUERY_VRAM_USAGE, - SI_QUERY_VRAM_VIS_USAGE, - SI_QUERY_GTT_USAGE, - SI_QUERY_GPU_TEMPERATURE, - SI_QUERY_CURRENT_GPU_SCLK, - SI_QUERY_CURRENT_GPU_MCLK, - SI_QUERY_GPU_LOAD, - SI_QUERY_GPU_SHADERS_BUSY, - SI_QUERY_GPU_TA_BUSY, - SI_QUERY_GPU_GDS_BUSY, - SI_QUERY_GPU_VGT_BUSY, - SI_QUERY_GPU_IA_BUSY, - SI_QUERY_GPU_SX_BUSY, - SI_QUERY_GPU_WD_BUSY, - SI_QUERY_GPU_BCI_BUSY, - SI_QUERY_GPU_SC_BUSY, - SI_QUERY_GPU_PA_BUSY, - SI_QUERY_GPU_DB_BUSY, - SI_QUERY_GPU_CP_BUSY, - SI_QUERY_GPU_CB_BUSY, - SI_QUERY_GPU_SDMA_BUSY, - SI_QUERY_GPU_PFP_BUSY, - SI_QUERY_GPU_MEQ_BUSY, - SI_QUERY_GPU_ME_BUSY, - SI_QUERY_GPU_SURF_SYNC_BUSY, - SI_QUERY_GPU_CP_DMA_BUSY, - SI_QUERY_GPU_SCRATCH_RAM_BUSY, - SI_QUERY_NUM_COMPILATIONS, - SI_QUERY_NUM_SHADERS_CREATED, - SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO, - SI_QUERY_GPIN_ASIC_ID, - SI_QUERY_GPIN_NUM_SIMD, - SI_QUERY_GPIN_NUM_RB, - SI_QUERY_GPIN_NUM_SPI, - SI_QUERY_GPIN_NUM_SE, - SI_QUERY_TIME_ELAPSED_SDMA, - SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */ - SI_QUERY_PD_NUM_PRIMS_ACCEPTED, - SI_QUERY_PD_NUM_PRIMS_REJECTED, - SI_QUERY_PD_NUM_PRIMS_INELIGIBLE, - SI_QUERY_LIVE_SHADER_CACHE_HITS, - SI_QUERY_LIVE_SHADER_CACHE_MISSES, - SI_QUERY_MEMORY_SHADER_CACHE_HITS, - SI_QUERY_MEMORY_SHADER_CACHE_MISSES, - SI_QUERY_DISK_SHADER_CACHE_HITS, - SI_QUERY_DISK_SHADER_CACHE_MISSES, - - SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100, +enum +{ + SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC, + SI_QUERY_DECOMPRESS_CALLS, + SI_QUERY_MRT_DRAW_CALLS, + SI_QUERY_PRIM_RESTART_CALLS, + SI_QUERY_SPILL_DRAW_CALLS, + SI_QUERY_COMPUTE_CALLS, + SI_QUERY_SPILL_COMPUTE_CALLS, + SI_QUERY_DMA_CALLS, + SI_QUERY_CP_DMA_CALLS, + SI_QUERY_NUM_VS_FLUSHES, + SI_QUERY_NUM_PS_FLUSHES, + SI_QUERY_NUM_CS_FLUSHES, + 
SI_QUERY_NUM_CB_CACHE_FLUSHES, + SI_QUERY_NUM_DB_CACHE_FLUSHES, + SI_QUERY_NUM_L2_INVALIDATES, + SI_QUERY_NUM_L2_WRITEBACKS, + SI_QUERY_NUM_RESIDENT_HANDLES, + SI_QUERY_TC_OFFLOADED_SLOTS, + SI_QUERY_TC_DIRECT_SLOTS, + SI_QUERY_TC_NUM_SYNCS, + SI_QUERY_CS_THREAD_BUSY, + SI_QUERY_GALLIUM_THREAD_BUSY, + SI_QUERY_REQUESTED_VRAM, + SI_QUERY_REQUESTED_GTT, + SI_QUERY_MAPPED_VRAM, + SI_QUERY_MAPPED_GTT, + SI_QUERY_BUFFER_WAIT_TIME, + SI_QUERY_NUM_MAPPED_BUFFERS, + SI_QUERY_NUM_GFX_IBS, + SI_QUERY_NUM_SDMA_IBS, + SI_QUERY_GFX_BO_LIST_SIZE, + SI_QUERY_GFX_IB_SIZE, + SI_QUERY_NUM_BYTES_MOVED, + SI_QUERY_NUM_EVICTIONS, + SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS, + SI_QUERY_VRAM_USAGE, + SI_QUERY_VRAM_VIS_USAGE, + SI_QUERY_GTT_USAGE, + SI_QUERY_GPU_TEMPERATURE, + SI_QUERY_CURRENT_GPU_SCLK, + SI_QUERY_CURRENT_GPU_MCLK, + SI_QUERY_GPU_LOAD, + SI_QUERY_GPU_SHADERS_BUSY, + SI_QUERY_GPU_TA_BUSY, + SI_QUERY_GPU_GDS_BUSY, + SI_QUERY_GPU_VGT_BUSY, + SI_QUERY_GPU_IA_BUSY, + SI_QUERY_GPU_SX_BUSY, + SI_QUERY_GPU_WD_BUSY, + SI_QUERY_GPU_BCI_BUSY, + SI_QUERY_GPU_SC_BUSY, + SI_QUERY_GPU_PA_BUSY, + SI_QUERY_GPU_DB_BUSY, + SI_QUERY_GPU_CP_BUSY, + SI_QUERY_GPU_CB_BUSY, + SI_QUERY_GPU_SDMA_BUSY, + SI_QUERY_GPU_PFP_BUSY, + SI_QUERY_GPU_MEQ_BUSY, + SI_QUERY_GPU_ME_BUSY, + SI_QUERY_GPU_SURF_SYNC_BUSY, + SI_QUERY_GPU_CP_DMA_BUSY, + SI_QUERY_GPU_SCRATCH_RAM_BUSY, + SI_QUERY_NUM_COMPILATIONS, + SI_QUERY_NUM_SHADERS_CREATED, + SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO, + SI_QUERY_GPIN_ASIC_ID, + SI_QUERY_GPIN_NUM_SIMD, + SI_QUERY_GPIN_NUM_RB, + SI_QUERY_GPIN_NUM_SPI, + SI_QUERY_GPIN_NUM_SE, + SI_QUERY_TIME_ELAPSED_SDMA, + SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */ + SI_QUERY_PD_NUM_PRIMS_ACCEPTED, + SI_QUERY_PD_NUM_PRIMS_REJECTED, + SI_QUERY_PD_NUM_PRIMS_INELIGIBLE, + SI_QUERY_LIVE_SHADER_CACHE_HITS, + SI_QUERY_LIVE_SHADER_CACHE_MISSES, + SI_QUERY_MEMORY_SHADER_CACHE_HITS, + SI_QUERY_MEMORY_SHADER_CACHE_MISSES, + SI_QUERY_DISK_SHADER_CACHE_HITS, + SI_QUERY_DISK_SHADER_CACHE_MISSES, + + SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100, }; -enum { - SI_QUERY_GROUP_GPIN = 0, - SI_NUM_SW_QUERY_GROUPS +enum +{ + SI_QUERY_GROUP_GPIN = 0, + SI_NUM_SW_QUERY_GROUPS }; struct si_query_ops { - void (*destroy)(struct si_context *, struct si_query *); - bool (*begin)(struct si_context *, struct si_query *); - bool (*end)(struct si_context *, struct si_query *); - bool (*get_result)(struct si_context *, - struct si_query *, bool wait, - union pipe_query_result *result); - void (*get_result_resource)(struct si_context *, - struct si_query *, bool wait, - enum pipe_query_value_type result_type, - int index, - struct pipe_resource *resource, - unsigned offset); - - void (*suspend)(struct si_context *, struct si_query *); - void (*resume)(struct si_context *, struct si_query *); + void (*destroy)(struct si_context *, struct si_query *); + bool (*begin)(struct si_context *, struct si_query *); + bool (*end)(struct si_context *, struct si_query *); + bool (*get_result)(struct si_context *, struct si_query *, bool wait, + union pipe_query_result *result); + void (*get_result_resource)(struct si_context *, struct si_query *, bool wait, + enum pipe_query_value_type result_type, int index, + struct pipe_resource *resource, unsigned offset); + + void (*suspend)(struct si_context *, struct si_query *); + void (*resume)(struct si_context *, struct si_query *); }; struct si_query { - struct threaded_query b; - const struct si_query_ops *ops; + struct threaded_query b; + const struct si_query_ops *ops; - /* The PIPE_QUERY_xxx type 
of query */ - unsigned type; + /* The PIPE_QUERY_xxx type of query */ + unsigned type; - /* The number of dwords for suspend. */ - unsigned num_cs_dw_suspend; + /* The number of dwords for suspend. */ + unsigned num_cs_dw_suspend; - /* Linked list of queries that must be suspended at end of CS. */ - struct list_head active_list; + /* Linked list of queries that must be suspended at end of CS. */ + struct list_head active_list; }; -enum { - SI_QUERY_HW_FLAG_NO_START = (1 << 0), - /* gap */ - /* whether begin_query doesn't clear the result */ - SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2), +enum +{ + SI_QUERY_HW_FLAG_NO_START = (1 << 0), + /* gap */ + /* whether begin_query doesn't clear the result */ + SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2), }; struct si_query_hw_ops { - bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *); - void (*emit_start)(struct si_context *, - struct si_query_hw *, - struct si_resource *buffer, uint64_t va); - void (*emit_stop)(struct si_context *, - struct si_query_hw *, - struct si_resource *buffer, uint64_t va); - void (*clear_result)(struct si_query_hw *, union pipe_query_result *); - void (*add_result)(struct si_screen *screen, - struct si_query_hw *, void *buffer, - union pipe_query_result *result); + bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *); + void (*emit_start)(struct si_context *, struct si_query_hw *, struct si_resource *buffer, + uint64_t va); + void (*emit_stop)(struct si_context *, struct si_query_hw *, struct si_resource *buffer, + uint64_t va); + void (*clear_result)(struct si_query_hw *, union pipe_query_result *); + void (*add_result)(struct si_screen *screen, struct si_query_hw *, void *buffer, + union pipe_query_result *result); }; struct si_query_buffer { - /* The buffer where query results are stored. */ - struct si_resource *buf; - /* If a query buffer is full, a new buffer is created and the old one - * is put in here. When we calculate the result, we sum up the samples - * from all buffers. */ - struct si_query_buffer *previous; - /* Offset of the next free result after current query data */ - unsigned results_end; - bool unprepared; + /* The buffer where query results are stored. */ + struct si_resource *buf; + /* If a query buffer is full, a new buffer is created and the old one + * is put in here. When we calculate the result, we sum up the samples + * from all buffers. */ + struct si_query_buffer *previous; + /* Offset of the next free result after current query data */ + unsigned results_end; + bool unprepared; }; void si_query_buffer_destroy(struct si_screen *sctx, struct si_query_buffer *buffer); void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer); bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer, - bool (*prepare_buffer)(struct si_context *, struct si_query_buffer*), - unsigned size); - + bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *), + unsigned size); struct si_query_hw { - struct si_query b; - struct si_query_hw_ops *ops; - unsigned flags; - - /* The query buffer and how many results are in it. */ - struct si_query_buffer buffer; - /* Size of the result in memory for both begin_query and end_query, - * this can be one or two numbers, or it could even be a size of a structure. 
*/ - unsigned result_size; - /* For transform feedback: which stream the query is for */ - unsigned stream; - - /* Workaround via compute shader */ - struct si_resource *workaround_buf; - unsigned workaround_offset; + struct si_query b; + struct si_query_hw_ops *ops; + unsigned flags; + + /* The query buffer and how many results are in it. */ + struct si_query_buffer buffer; + /* Size of the result in memory for both begin_query and end_query, + * this can be one or two numbers, or it could even be a size of a structure. */ + unsigned result_size; + /* For transform feedback: which stream the query is for */ + unsigned stream; + + /* Workaround via compute shader */ + struct si_resource *workaround_buf; + unsigned workaround_offset; }; -void si_query_hw_destroy(struct si_context *sctx, - struct si_query *squery); -bool si_query_hw_begin(struct si_context *sctx, - struct si_query *squery); -bool si_query_hw_end(struct si_context *sctx, - struct si_query *squery); -bool si_query_hw_get_result(struct si_context *sctx, - struct si_query *squery, - bool wait, - union pipe_query_result *result); +void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery); +bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery); +bool si_query_hw_end(struct si_context *sctx, struct si_query *squery); +bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait, + union pipe_query_result *result); void si_query_hw_suspend(struct si_context *sctx, struct si_query *query); void si_query_hw_resume(struct si_context *sctx, struct si_query *query); - /* Shader-based queries */ -struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, - enum pipe_query_type query_type, - unsigned index); - +struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type, + unsigned index); /* Performance counters */ struct si_perfcounters { - unsigned num_groups; - unsigned num_blocks; - struct si_pc_block *blocks; + unsigned num_groups; + unsigned num_blocks; + struct si_pc_block *blocks; - unsigned num_stop_cs_dwords; - unsigned num_instance_cs_dwords; + unsigned num_stop_cs_dwords; + unsigned num_instance_cs_dwords; - bool separate_se; - bool separate_instance; + bool separate_se; + bool separate_instance; }; -struct pipe_query *si_create_batch_query(struct pipe_context *ctx, - unsigned num_queries, - unsigned *query_types); +struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries, + unsigned *query_types); -int si_get_perfcounter_info(struct si_screen *, - unsigned index, - struct pipe_driver_query_info *info); -int si_get_perfcounter_group_info(struct si_screen *, - unsigned index, - struct pipe_driver_query_group_info *info); +int si_get_perfcounter_info(struct si_screen *, unsigned index, + struct pipe_driver_query_info *info); +int si_get_perfcounter_group_info(struct si_screen *, unsigned index, + struct pipe_driver_query_group_info *info); struct si_qbo_state { - void *saved_compute; - struct pipe_constant_buffer saved_const0; - struct pipe_shader_buffer saved_ssbo[3]; - unsigned saved_ssbo_writable_mask; + void *saved_compute; + struct pipe_constant_buffer saved_const0; + struct pipe_shader_buffer saved_ssbo[3]; + unsigned saved_ssbo_writable_mask; }; #endif /* SI_QUERY_H */ diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index f0e60087dbf..e615b81c293 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ 
b/src/gallium/drivers/radeonsi/si_shader.c @@ -22,43 +22,38 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "util/u_memory.h" -#include "tgsi/tgsi_strings.h" -#include "tgsi/tgsi_from_mesa.h" - #include "ac_exp_param.h" #include "ac_rtld.h" -#include "si_shader_internal.h" -#include "si_pipe.h" -#include "sid.h" - #include "compiler/nir/nir.h" #include "compiler/nir/nir_serialize.h" +#include "si_pipe.h" +#include "si_shader_internal.h" +#include "sid.h" +#include "tgsi/tgsi_from_mesa.h" +#include "tgsi/tgsi_strings.h" +#include "util/u_memory.h" -static const char scratch_rsrc_dword0_symbol[] = - "SCRATCH_RSRC_DWORD0"; +static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0"; -static const char scratch_rsrc_dword1_symbol[] = - "SCRATCH_RSRC_DWORD1"; +static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1"; static void si_dump_shader_key(const struct si_shader *shader, FILE *f); /** Whether the shader runs as a combination of multiple API shaders */ bool si_is_multi_part_shader(struct si_shader *shader) { - if (shader->selector->screen->info.chip_class <= GFX8) - return false; + if (shader->selector->screen->info.chip_class <= GFX8) + return false; - return shader->key.as_ls || - shader->key.as_es || - shader->selector->type == PIPE_SHADER_TESS_CTRL || - shader->selector->type == PIPE_SHADER_GEOMETRY; + return shader->key.as_ls || shader->key.as_es || + shader->selector->type == PIPE_SHADER_TESS_CTRL || + shader->selector->type == PIPE_SHADER_GEOMETRY; } /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */ bool si_is_merged_shader(struct si_shader *shader) { - return shader->key.as_ngg || si_is_multi_part_shader(shader); + return shader->key.as_ngg || si_is_multi_part_shader(shader); } /** @@ -68,19 +63,19 @@ bool si_is_merged_shader(struct si_shader *shader) */ unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index) { - switch (semantic_name) { - case TGSI_SEMANTIC_TESSOUTER: - return 0; - case TGSI_SEMANTIC_TESSINNER: - return 1; - case TGSI_SEMANTIC_PATCH: - assert(index < 30); - return 2 + index; - - default: - assert(!"invalid semantic name"); - return 0; - } + switch (semantic_name) { + case TGSI_SEMANTIC_TESSOUTER: + return 0; + case TGSI_SEMANTIC_TESSINNER: + return 1; + case TGSI_SEMANTIC_PATCH: + assert(index < 30); + return 2 + index; + + default: + assert(!"invalid semantic name"); + return 0; + } } /** @@ -88,1527 +83,1420 @@ unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned in * less than 64, so that a 64-bit bitmask of used inputs or outputs can be * calculated. */ -unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, - unsigned is_varying) +unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, unsigned is_varying) { - switch (semantic_name) { - case TGSI_SEMANTIC_POSITION: - return 0; - case TGSI_SEMANTIC_GENERIC: - /* Since some shader stages use the the highest used IO index - * to determine the size to allocate for inputs/outputs - * (in LDS, tess and GS rings). GENERIC should be placed right - * after POSITION to make that size as small as possible. - */ - if (index < SI_MAX_IO_GENERIC) - return 1 + index; - - assert(!"invalid generic index"); - return 0; - case TGSI_SEMANTIC_FOG: - return SI_MAX_IO_GENERIC + 1; - case TGSI_SEMANTIC_COLOR: - assert(index < 2); - return SI_MAX_IO_GENERIC + 2 + index; - case TGSI_SEMANTIC_BCOLOR: - assert(index < 2); - /* If it's a varying, COLOR and BCOLOR alias. 
*/ - if (is_varying) - return SI_MAX_IO_GENERIC + 2 + index; - else - return SI_MAX_IO_GENERIC + 4 + index; - case TGSI_SEMANTIC_TEXCOORD: - assert(index < 8); - return SI_MAX_IO_GENERIC + 6 + index; - - /* These are rarely used between LS and HS or ES and GS. */ - case TGSI_SEMANTIC_CLIPDIST: - assert(index < 2); - return SI_MAX_IO_GENERIC + 6 + 8 + index; - case TGSI_SEMANTIC_CLIPVERTEX: - return SI_MAX_IO_GENERIC + 6 + 8 + 2; - case TGSI_SEMANTIC_PSIZE: - return SI_MAX_IO_GENERIC + 6 + 8 + 3; - - /* These can't be written by LS, HS, and ES. */ - case TGSI_SEMANTIC_LAYER: - return SI_MAX_IO_GENERIC + 6 + 8 + 4; - case TGSI_SEMANTIC_VIEWPORT_INDEX: - return SI_MAX_IO_GENERIC + 6 + 8 + 5; - case TGSI_SEMANTIC_PRIMID: - STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63); - return SI_MAX_IO_GENERIC + 6 + 8 + 6; - default: - fprintf(stderr, "invalid semantic name = %u\n", semantic_name); - assert(!"invalid semantic name"); - return 0; - } + switch (semantic_name) { + case TGSI_SEMANTIC_POSITION: + return 0; + case TGSI_SEMANTIC_GENERIC: + /* Since some shader stages use the the highest used IO index + * to determine the size to allocate for inputs/outputs + * (in LDS, tess and GS rings). GENERIC should be placed right + * after POSITION to make that size as small as possible. + */ + if (index < SI_MAX_IO_GENERIC) + return 1 + index; + + assert(!"invalid generic index"); + return 0; + case TGSI_SEMANTIC_FOG: + return SI_MAX_IO_GENERIC + 1; + case TGSI_SEMANTIC_COLOR: + assert(index < 2); + return SI_MAX_IO_GENERIC + 2 + index; + case TGSI_SEMANTIC_BCOLOR: + assert(index < 2); + /* If it's a varying, COLOR and BCOLOR alias. */ + if (is_varying) + return SI_MAX_IO_GENERIC + 2 + index; + else + return SI_MAX_IO_GENERIC + 4 + index; + case TGSI_SEMANTIC_TEXCOORD: + assert(index < 8); + return SI_MAX_IO_GENERIC + 6 + index; + + /* These are rarely used between LS and HS or ES and GS. */ + case TGSI_SEMANTIC_CLIPDIST: + assert(index < 2); + return SI_MAX_IO_GENERIC + 6 + 8 + index; + case TGSI_SEMANTIC_CLIPVERTEX: + return SI_MAX_IO_GENERIC + 6 + 8 + 2; + case TGSI_SEMANTIC_PSIZE: + return SI_MAX_IO_GENERIC + 6 + 8 + 3; + + /* These can't be written by LS, HS, and ES. */ + case TGSI_SEMANTIC_LAYER: + return SI_MAX_IO_GENERIC + 6 + 8 + 4; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + return SI_MAX_IO_GENERIC + 6 + 8 + 5; + case TGSI_SEMANTIC_PRIMID: + STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63); + return SI_MAX_IO_GENERIC + 6 + 8 + 6; + default: + fprintf(stderr, "invalid semantic name = %u\n", semantic_name); + assert(!"invalid semantic name"); + return 0; + } } static void si_dump_streamout(struct pipe_stream_output_info *so) { - unsigned i; - - if (so->num_outputs) - fprintf(stderr, "STREAMOUT\n"); - - for (i = 0; i < so->num_outputs; i++) { - unsigned mask = ((1 << so->output[i].num_components) - 1) << - so->output[i].start_component; - fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", - i, so->output[i].output_buffer, - so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, - so->output[i].register_index, - mask & 1 ? "x" : "", - mask & 2 ? "y" : "", - mask & 4 ? "z" : "", - mask & 8 ? 
"w" : ""); - } + unsigned i; + + if (so->num_outputs) + fprintf(stderr, "STREAMOUT\n"); + + for (i = 0; i < so->num_outputs; i++) { + unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component; + fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", i, so->output[i].output_buffer, + so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, + so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "", + mask & 4 ? "z" : "", mask & 8 ? "w" : ""); + } } static void declare_streamout_params(struct si_shader_context *ctx, - struct pipe_stream_output_info *so) + struct pipe_stream_output_info *so) { - if (ctx->screen->use_ngg_streamout) { - if (ctx->type == PIPE_SHADER_TESS_EVAL) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - return; - } - - /* Streamout SGPRs. */ - if (so->num_outputs) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index); - } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - } - - /* A streamout buffer offset is loaded if the stride is non-zero. */ - for (int i = 0; i < 4; i++) { - if (!so->stride[i]) - continue; - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]); - } + if (ctx->screen->use_ngg_streamout) { + if (ctx->type == PIPE_SHADER_TESS_EVAL) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + return; + } + + /* Streamout SGPRs. */ + if (so->num_outputs) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index); + } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + } + + /* A streamout buffer offset is loaded if the stride is non-zero. */ + for (int i = 0; i < 4; i++) { + if (!so->stride[i]) + continue; + + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]); + } } unsigned si_get_max_workgroup_size(const struct si_shader *shader) { - switch (shader->selector->type) { - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_TESS_EVAL: - return shader->key.as_ngg ? 128 : 0; - - case PIPE_SHADER_TESS_CTRL: - /* Return this so that LLVM doesn't remove s_barrier - * instructions on chips where we use s_barrier. */ - return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0; - - case PIPE_SHADER_GEOMETRY: - return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0; - - case PIPE_SHADER_COMPUTE: - break; /* see below */ - - default: - return 0; - } - - const unsigned *properties = shader->selector->info.properties; - unsigned max_work_group_size = - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; - - if (!max_work_group_size) { - /* This is a variable group size compute shader, - * compile it for the maximum possible group size. - */ - max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; - } - return max_work_group_size; + switch (shader->selector->type) { + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_TESS_EVAL: + return shader->key.as_ngg ? 128 : 0; + + case PIPE_SHADER_TESS_CTRL: + /* Return this so that LLVM doesn't remove s_barrier + * instructions on chips where we use s_barrier. */ + return shader->selector->screen->info.chip_class >= GFX7 ? 
128 : 0; + + case PIPE_SHADER_GEOMETRY: + return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0; + + case PIPE_SHADER_COMPUTE: + break; /* see below */ + + default: + return 0; + } + + const unsigned *properties = shader->selector->info.properties; + unsigned max_work_group_size = properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; + + if (!max_work_group_size) { + /* This is a variable group size compute shader, + * compile it for the maximum possible group size. + */ + max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; + } + return max_work_group_size; } -static void declare_const_and_shader_buffers(struct si_shader_context *ctx, - bool assign_params) +static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params) { - enum ac_arg_type const_shader_buf_type; + enum ac_arg_type const_shader_buf_type; - if (ctx->shader->selector->info.const_buffers_declared == 1 && - ctx->shader->selector->info.shader_buffers_declared == 0) - const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR; - else - const_shader_buf_type = AC_ARG_CONST_DESC_PTR; + if (ctx->shader->selector->info.const_buffers_declared == 1 && + ctx->shader->selector->info.shader_buffers_declared == 0) + const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR; + else + const_shader_buf_type = AC_ARG_CONST_DESC_PTR; - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type, - assign_params ? &ctx->const_and_shader_buffers : - &ctx->other_const_and_shader_buffers); + ac_add_arg( + &ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type, + assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers); } -static void declare_samplers_and_images(struct si_shader_context *ctx, - bool assign_params) +static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, - assign_params ? &ctx->samplers_and_images : - &ctx->other_samplers_and_images); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, + assign_params ? 
&ctx->samplers_and_images : &ctx->other_samplers_and_images); } -static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, - bool assign_params) +static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params) { - declare_const_and_shader_buffers(ctx, assign_params); - declare_samplers_and_images(ctx, assign_params); + declare_const_and_shader_buffers(ctx, assign_params); + declare_samplers_and_images(ctx, assign_params); } static void declare_global_desc_pointers(struct si_shader_context *ctx) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, - &ctx->rw_buffers); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, - &ctx->bindless_samplers_and_images); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->rw_buffers); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, + &ctx->bindless_samplers_and_images); } static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); - if (!ctx->shader->is_gs_copy_shader) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id); - } + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); + if (!ctx->shader->is_gs_copy_shader) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id); + } } static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers); - unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; - if (num_vbos_in_user_sgprs) { - unsigned user_sgprs = ctx->args.num_sgprs_used; + unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; + if (num_vbos_in_user_sgprs) { + unsigned user_sgprs = ctx->args.num_sgprs_used; - if (si_is_merged_shader(ctx->shader)) - user_sgprs -= 8; - assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST); + if (si_is_merged_shader(ctx->shader)) + user_sgprs -= 8; + assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST); - /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */ - for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). 
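+       *
+       * Worked example with made-up numbers: if user_sgprs were 13 here and
+       * SI_SGPR_VS_VB_DESCRIPTOR_FIRST were 16, three dummy AC_ARG_INT SGPR args
+       * would be declared so that the first 4-dword descriptor starts on a
+       * 4-SGPR boundary:
+       *
+       *    for (unsigned i = 13; i < 16; i++)
+       *       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+       *    ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[0]);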
*/ + for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ - assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors)); - for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]); - } + assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors)); + for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]); + } } -static void declare_vs_input_vgprs(struct si_shader_context *ctx, - unsigned *num_prolog_vgprs, - bool ngg_cull_shader) +static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs, + bool ngg_cull_shader) { - struct si_shader *shader = ctx->shader; - - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id); - if (shader->key.as_ls) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id); - if (ctx->screen->info.chip_class >= GFX10) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); - } else { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ - } - } else if (ctx->screen->info.chip_class >= GFX10) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); - } else { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ - } - - if (!shader->is_gs_copy_shader) { - if (shader->key.opt.ngg_culling && !ngg_cull_shader) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->ngg_old_thread_id); - } - - /* Vertex load indices. 
*/ - if (shader->selector->info.num_inputs) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->vertex_index0); - for (unsigned i = 1; i < shader->selector->info.num_inputs; i++) - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); - } - *num_prolog_vgprs += shader->selector->info.num_inputs; - } + struct si_shader *shader = ctx->shader; + + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id); + if (shader->key.as_ls) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id); + if (ctx->screen->info.chip_class >= GFX10) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); + } else { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ + } + } else if (ctx->screen->info.chip_class >= GFX10) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, + &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); + } else { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ + } + + if (!shader->is_gs_copy_shader) { + if (shader->key.opt.ngg_culling && !ngg_cull_shader) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id); + } + + /* Vertex load indices. */ + if (shader->selector->info.num_inputs) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0); + for (unsigned i = 1; i < shader->selector->info.num_inputs; i++) + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); + } + *num_prolog_vgprs += shader->selector->info.num_inputs; + } } -static void declare_vs_blit_inputs(struct si_shader_context *ctx, - unsigned vs_blit_property) +static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->vs_blit_inputs); /* i16 x1, y1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */ - - if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */ - } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */ - } + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, 
NULL); /* depth */ + + if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */ + } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */ + } } static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id); - - if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->ngg_old_thread_id); - } + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id); + + if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id); + } } -enum { - /* Convenient merged shader definitions. */ - SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES, - SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY, +enum +{ + /* Convenient merged shader definitions. */ + SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES, + SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY, }; -void si_add_arg_checked(struct ac_shader_args *args, - enum ac_arg_regfile file, - unsigned registers, enum ac_arg_type type, - struct ac_arg *arg, - unsigned idx) +void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers, + enum ac_arg_type type, struct ac_arg *arg, unsigned idx) { - assert(args->arg_count == idx); - ac_add_arg(args, file, registers, type, arg); + assert(args->arg_count == idx); + ac_add_arg(args, file, registers, type, arg); } void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader) { - struct si_shader *shader = ctx->shader; - LLVMTypeRef returns[AC_MAX_ARGS]; - unsigned i, num_return_sgprs; - unsigned num_returns = 0; - unsigned num_prolog_vgprs = 0; - unsigned type = ctx->type; - unsigned vs_blit_property = - shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - /* Set MERGED shaders. 
*/ - if (ctx->screen->info.chip_class >= GFX9) { - if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL) - type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ - else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY) - type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; - } - - switch (type) { - case PIPE_SHADER_VERTEX: - declare_global_desc_pointers(ctx); - - if (vs_blit_property) { - declare_vs_blit_inputs(ctx, vs_blit_property); - - /* VGPRs */ - declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); - break; - } - - declare_per_stage_desc_pointers(ctx, true); - declare_vs_specific_input_sgprs(ctx); - if (!shader->is_gs_copy_shader) - declare_vb_descriptor_input_sgprs(ctx); - - if (shader->key.as_es) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->es2gs_offset); - } else if (shader->key.as_ls) { - /* no extra parameters */ - } else { - /* The locations of the other parameters are assigned dynamically. */ - declare_streamout_params(ctx, &shader->selector->so); - } - - /* VGPRs */ - declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); - - /* Return values */ - if (shader->key.opt.vs_as_prim_discard_cs) { - for (i = 0; i < 4; i++) - returns[num_returns++] = ctx->ac.f32; /* VGPRs */ - } - break; - - case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */ - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); - - /* VGPRs */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); - - /* param_tcs_offchip_offset and param_tcs_factor_offset are - * placed after the user SGPRs. - */ - for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) - returns[num_returns++] = ctx->ac.i32; /* SGPRs */ - for (i = 0; i < 11; i++) - returns[num_returns++] = ctx->ac.f32; /* VGPRs */ - break; - - case SI_SHADER_MERGED_VERTEX_TESSCTRL: - /* Merged stages have 8 system SGPRs at the beginning. 
*/ - /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */ - declare_per_stage_desc_pointers(ctx, - ctx->type == PIPE_SHADER_TESS_CTRL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ - - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, - ctx->type == PIPE_SHADER_VERTEX); - declare_vs_specific_input_sgprs(ctx); - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); - declare_vb_descriptor_input_sgprs(ctx); - - /* VGPRs (first TCS, then VS) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); - - if (ctx->type == PIPE_SHADER_VERTEX) { - declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); - - /* LS return values are inputs to the TCS main shader part. */ - for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) - returns[num_returns++] = ctx->ac.i32; /* SGPRs */ - for (i = 0; i < 2; i++) - returns[num_returns++] = ctx->ac.f32; /* VGPRs */ - } else { - /* TCS return values are inputs to the TCS epilog. - * - * param_tcs_offchip_offset, param_tcs_factor_offset, - * param_tcs_offchip_layout, and param_rw_buffers - * should be passed to the epilog. - */ - for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++) - returns[num_returns++] = ctx->ac.i32; /* SGPRs */ - for (i = 0; i < 11; i++) - returns[num_returns++] = ctx->ac.f32; /* VGPRs */ - } - break; - - case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: - /* Merged stages have 8 system SGPRs at the beginning. 
*/ - /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ - declare_per_stage_desc_pointers(ctx, - ctx->type == PIPE_SHADER_GEOMETRY); - - if (ctx->shader->key.as_ngg) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info); - else - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset); - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, - &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ - - declare_global_desc_pointers(ctx); - if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) { - declare_per_stage_desc_pointers(ctx, - (ctx->type == PIPE_SHADER_VERTEX || - ctx->type == PIPE_SHADER_TESS_EVAL)); - } - - if (ctx->type == PIPE_SHADER_VERTEX) { - if (vs_blit_property) - declare_vs_blit_inputs(ctx, vs_blit_property); - else - declare_vs_specific_input_sgprs(ctx); - } else { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); - /* Declare as many input SGPRs as the VS has. */ - } - - if (ctx->type == PIPE_SHADER_VERTEX) - declare_vb_descriptor_input_sgprs(ctx); - - /* VGPRs (first GS, then VS/TES) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset); - - if (ctx->type == PIPE_SHADER_VERTEX) { - declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); - } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { - declare_tes_input_vgprs(ctx, ngg_cull_shader); - } - - if ((ctx->shader->key.as_es || ngg_cull_shader) && - (ctx->type == PIPE_SHADER_VERTEX || - ctx->type == PIPE_SHADER_TESS_EVAL)) { - unsigned num_user_sgprs, num_vgprs; - - if (ctx->type == PIPE_SHADER_VERTEX) { - /* For the NGG cull shader, add 1 SGPR to hold - * the vertex buffer pointer. - */ - num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader; - - if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) { - assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST); - num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST + - shader->selector->num_vbos_in_user_sgprs * 4; - } - } else { - num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; - } - - /* The NGG cull shader has to return all 9 VGPRs + the old thread ID. - * - * The normal merged ESGS shader only has to return the 5 VGPRs - * for the GS stage. - */ - num_vgprs = ngg_cull_shader ? 10 : 5; - - /* ES return values are inputs to GS. 
*/ - for (i = 0; i < 8 + num_user_sgprs; i++) - returns[num_returns++] = ctx->ac.i32; /* SGPRs */ - for (i = 0; i < num_vgprs; i++) - returns[num_returns++] = ctx->ac.f32; /* VGPRs */ - } - break; - - case PIPE_SHADER_TESS_EVAL: - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); - - if (shader->key.as_es) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset); - } else { - declare_streamout_params(ctx, &shader->selector->so); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); - } - - /* VGPRs */ - declare_tes_input_vgprs(ctx, ngg_cull_shader); - break; - - case PIPE_SHADER_GEOMETRY: - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id); - - /* VGPRs */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); - break; - - case PIPE_SHADER_FRAGMENT: - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, - SI_PARAM_ALPHA_REF); - si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->args.prim_mask, SI_PARAM_PRIM_MASK); - - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample, - SI_PARAM_PERSP_SAMPLE); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, - &ctx->args.persp_center, SI_PARAM_PERSP_CENTER); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, - &ctx->args.persp_centroid, SI_PARAM_PERSP_CENTROID); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, - NULL, SI_PARAM_PERSP_PULL_MODEL); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, - &ctx->args.linear_sample, SI_PARAM_LINEAR_SAMPLE); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, - &ctx->args.linear_center, SI_PARAM_LINEAR_CENTER); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, - &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, - NULL, SI_PARAM_LINE_STIPPLE_TEX); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, - &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, - &ctx->args.frag_pos[1], SI_PARAM_POS_Y_FLOAT); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, - &ctx->args.frag_pos[2], SI_PARAM_POS_Z_FLOAT); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, - &ctx->args.frag_pos[3], 
SI_PARAM_POS_W_FLOAT); - shader->info.face_vgpr_index = ctx->args.num_vgprs_used; - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->args.front_face, SI_PARAM_FRONT_FACE); - shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used; - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->args.ancillary, SI_PARAM_ANCILLARY); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, - &ctx->args.sample_coverage, SI_PARAM_SAMPLE_COVERAGE); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->pos_fixed_pt, SI_PARAM_POS_FIXED_PT); - - /* Color inputs from the prolog. */ - if (shader->selector->info.colors_read) { - unsigned num_color_elements = - util_bitcount(shader->selector->info.colors_read); - - for (i = 0; i < num_color_elements; i++) - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); - - num_prolog_vgprs += num_color_elements; - } - - /* Outputs for the epilog. */ - num_return_sgprs = SI_SGPR_ALPHA_REF + 1; - num_returns = - num_return_sgprs + - util_bitcount(shader->selector->info.colors_written) * 4 + - shader->selector->info.writes_z + - shader->selector->info.writes_stencil + - shader->selector->info.writes_samplemask + - 1 /* SampleMaskIn */; - - num_returns = MAX2(num_returns, - num_return_sgprs + - PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); - - for (i = 0; i < num_return_sgprs; i++) - returns[i] = ctx->ac.i32; - for (; i < num_returns; i++) - returns[i] = ctx->ac.f32; - break; - - case PIPE_SHADER_COMPUTE: - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - if (shader->selector->info.uses_grid_size) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, - &ctx->args.num_work_groups); - if (shader->selector->info.uses_block_size && - shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size); - - unsigned cs_user_data_dwords = - shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD]; - if (cs_user_data_dwords) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, - &ctx->cs_user_data); - } - - /* Hardware SGPRs. */ - for (i = 0; i < 3; i++) { - if (shader->selector->info.uses_block_id[i]) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->args.workgroup_ids[i]); - } - } - if (shader->selector->info.uses_subgroup_info) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size); - - /* Hardware VGPRs. */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, - &ctx->args.local_invocation_ids); - break; - default: - assert(0 && "unimplemented shader"); - return; - } - - si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", - returns, num_returns, si_get_max_workgroup_size(shader)); - - /* Reserve register locations for VGPR inputs the PS prolog may need. 
*/ - if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, - "InitialPSInputAddr", - S_0286D0_PERSP_SAMPLE_ENA(1) | - S_0286D0_PERSP_CENTER_ENA(1) | - S_0286D0_PERSP_CENTROID_ENA(1) | - S_0286D0_LINEAR_SAMPLE_ENA(1) | - S_0286D0_LINEAR_CENTER_ENA(1) | - S_0286D0_LINEAR_CENTROID_ENA(1) | - S_0286D0_FRONT_FACE_ENA(1) | - S_0286D0_ANCILLARY_ENA(1) | - S_0286D0_POS_FIXED_PT_ENA(1)); - } - - shader->info.num_input_sgprs = ctx->args.num_sgprs_used; - shader->info.num_input_vgprs = ctx->args.num_vgprs_used; - - assert(shader->info.num_input_vgprs >= num_prolog_vgprs); - shader->info.num_input_vgprs -= num_prolog_vgprs; - - if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) { - if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { - /* The LSHS size is not known until draw time, so we append it - * at the end of whatever LDS use there may be in the rest of - * the shader (currently none, unless LLVM decides to do its - * own LDS-based lowering). - */ - ctx->ac.lds = LLVMAddGlobalInAddressSpace( - ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), - "__lds_end", AC_ADDR_SPACE_LDS); - LLVMSetAlignment(ctx->ac.lds, 256); - } else { - ac_declare_lds_as_pointer(&ctx->ac); - } - } - - /* Unlike radv, we override these arguments in the prolog, so to the - * API shader they appear as normal arguments. - */ - if (ctx->type == PIPE_SHADER_VERTEX) { - ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id); - ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id); - } else if (ctx->type == PIPE_SHADER_FRAGMENT) { - ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid); - ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid); - } + struct si_shader *shader = ctx->shader; + LLVMTypeRef returns[AC_MAX_ARGS]; + unsigned i, num_return_sgprs; + unsigned num_returns = 0; + unsigned num_prolog_vgprs = 0; + unsigned type = ctx->type; + unsigned vs_blit_property = shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + /* Set MERGED shaders. */ + if (ctx->screen->info.chip_class >= GFX9) { + if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL) + type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ + else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY) + type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; + } + + switch (type) { + case PIPE_SHADER_VERTEX: + declare_global_desc_pointers(ctx); + + if (vs_blit_property) { + declare_vs_blit_inputs(ctx, vs_blit_property); + + /* VGPRs */ + declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + break; + } + + declare_per_stage_desc_pointers(ctx, true); + declare_vs_specific_input_sgprs(ctx); + if (!shader->is_gs_copy_shader) + declare_vb_descriptor_input_sgprs(ctx); + + if (shader->key.as_es) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset); + } else if (shader->key.as_ls) { + /* no extra parameters */ + } else { + /* The locations of the other parameters are assigned dynamically. 
*/ + declare_streamout_params(ctx, &shader->selector->so); + } + + /* VGPRs */ + declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + + /* Return values */ + if (shader->key.opt.vs_as_prim_discard_cs) { + for (i = 0; i < 4; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + } + break; + + case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */ + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); + + /* VGPRs */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); + + /* param_tcs_offchip_offset and param_tcs_factor_offset are + * placed after the user SGPRs. + */ + for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) + returns[num_returns++] = ctx->ac.i32; /* SGPRs */ + for (i = 0; i < 11; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + break; + + case SI_SHADER_MERGED_VERTEX_TESSCTRL: + /* Merged stages have 8 system SGPRs at the beginning. */ + /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */ + declare_per_stage_desc_pointers(ctx, ctx->type == PIPE_SHADER_TESS_CTRL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, ctx->type == PIPE_SHADER_VERTEX); + declare_vs_specific_input_sgprs(ctx); + + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); + declare_vb_descriptor_input_sgprs(ctx); + + /* VGPRs (first TCS, then VS) */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); + + if (ctx->type == PIPE_SHADER_VERTEX) { + declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + + /* LS return values are inputs to the TCS main shader part. */ + for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) + returns[num_returns++] = ctx->ac.i32; /* SGPRs */ + for (i = 0; i < 2; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + } else { + /* TCS return values are inputs to the TCS epilog. + * + * param_tcs_offchip_offset, param_tcs_factor_offset, + * param_tcs_offchip_layout, and param_rw_buffers + * should be passed to the epilog. 
+ */ + for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++) + returns[num_returns++] = ctx->ac.i32; /* SGPRs */ + for (i = 0; i < 11; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + } + break; + + case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: + /* Merged stages have 8 system SGPRs at the beginning. */ + /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ + declare_per_stage_desc_pointers(ctx, ctx->type == PIPE_SHADER_GEOMETRY); + + if (ctx->shader->key.as_ngg) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info); + else + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset); + + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, + &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ + + declare_global_desc_pointers(ctx); + if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) { + declare_per_stage_desc_pointers( + ctx, (ctx->type == PIPE_SHADER_VERTEX || ctx->type == PIPE_SHADER_TESS_EVAL)); + } + + if (ctx->type == PIPE_SHADER_VERTEX) { + if (vs_blit_property) + declare_vs_blit_inputs(ctx, vs_blit_property); + else + declare_vs_specific_input_sgprs(ctx); + } else { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); + /* Declare as many input SGPRs as the VS has. */ + } + + if (ctx->type == PIPE_SHADER_VERTEX) + declare_vb_descriptor_input_sgprs(ctx); + + /* VGPRs (first GS, then VS/TES) */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset); + + if (ctx->type == PIPE_SHADER_VERTEX) { + declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { + declare_tes_input_vgprs(ctx, ngg_cull_shader); + } + + if ((ctx->shader->key.as_es || ngg_cull_shader) && + (ctx->type == PIPE_SHADER_VERTEX || ctx->type == PIPE_SHADER_TESS_EVAL)) { + unsigned num_user_sgprs, num_vgprs; + + if (ctx->type == PIPE_SHADER_VERTEX) { + /* For the NGG cull shader, add 1 SGPR to hold + * the vertex buffer pointer. + */ + num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader; + + if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) { + assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST); + num_user_sgprs = + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4; + } + } else { + num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; + } + + /* The NGG cull shader has to return all 9 VGPRs + the old thread ID. + * + * The normal merged ESGS shader only has to return the 5 VGPRs + * for the GS stage. + */ + num_vgprs = ngg_cull_shader ? 10 : 5; + + /* ES return values are inputs to GS. 
*/ + for (i = 0; i < 8 + num_user_sgprs; i++) + returns[num_returns++] = ctx->ac.i32; /* SGPRs */ + for (i = 0; i < num_vgprs; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + } + break; + + case PIPE_SHADER_TESS_EVAL: + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); + + if (shader->key.as_es) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset); + } else { + declare_streamout_params(ctx, &shader->selector->so); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + } + + /* VGPRs */ + declare_tes_input_vgprs(ctx, ngg_cull_shader); + break; + + case PIPE_SHADER_GEOMETRY: + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id); + + /* VGPRs */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); + break; + + case PIPE_SHADER_FRAGMENT: + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF); + si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask, + SI_PARAM_PRIM_MASK); + + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample, + SI_PARAM_PERSP_SAMPLE); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center, + SI_PARAM_PERSP_CENTER); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid, + SI_PARAM_PERSP_CENTROID); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample, + SI_PARAM_LINEAR_SAMPLE); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center, + SI_PARAM_LINEAR_CENTER); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid, + SI_PARAM_LINEAR_CENTROID); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0], + SI_PARAM_POS_X_FLOAT); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1], + SI_PARAM_POS_Y_FLOAT); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2], + SI_PARAM_POS_Z_FLOAT); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3], + 
SI_PARAM_POS_W_FLOAT); + shader->info.face_vgpr_index = ctx->args.num_vgprs_used; + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face, + SI_PARAM_FRONT_FACE); + shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used; + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary, + SI_PARAM_ANCILLARY); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage, + SI_PARAM_SAMPLE_COVERAGE); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt, + SI_PARAM_POS_FIXED_PT); + + /* Color inputs from the prolog. */ + if (shader->selector->info.colors_read) { + unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read); + + for (i = 0; i < num_color_elements; i++) + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); + + num_prolog_vgprs += num_color_elements; + } + + /* Outputs for the epilog. */ + num_return_sgprs = SI_SGPR_ALPHA_REF + 1; + num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 + + shader->selector->info.writes_z + shader->selector->info.writes_stencil + + shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */; + + num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + for (i = 0; i < num_return_sgprs; i++) + returns[i] = ctx->ac.i32; + for (; i < num_returns; i++) + returns[i] = ctx->ac.f32; + break; + + case PIPE_SHADER_COMPUTE: + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + if (shader->selector->info.uses_grid_size) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups); + if (shader->selector->info.uses_block_size && + shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size); + + unsigned cs_user_data_dwords = + shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD]; + if (cs_user_data_dwords) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data); + } + + /* Hardware SGPRs. */ + for (i = 0; i < 3; i++) { + if (shader->selector->info.uses_block_id[i]) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]); + } + } + if (shader->selector->info.uses_subgroup_info) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size); + + /* Hardware VGPRs. */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids); + break; + default: + assert(0 && "unimplemented shader"); + return; + } + + si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns, num_returns, + si_get_max_workgroup_size(shader)); + + /* Reserve register locations for VGPR inputs the PS prolog may need. 
*/ + if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) { + ac_llvm_add_target_dep_function_attr( + ctx->main_fn, "InitialPSInputAddr", + S_0286D0_PERSP_SAMPLE_ENA(1) | S_0286D0_PERSP_CENTER_ENA(1) | + S_0286D0_PERSP_CENTROID_ENA(1) | S_0286D0_LINEAR_SAMPLE_ENA(1) | + S_0286D0_LINEAR_CENTER_ENA(1) | S_0286D0_LINEAR_CENTROID_ENA(1) | + S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_ANCILLARY_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1)); + } + + shader->info.num_input_sgprs = ctx->args.num_sgprs_used; + shader->info.num_input_vgprs = ctx->args.num_vgprs_used; + + assert(shader->info.num_input_vgprs >= num_prolog_vgprs); + shader->info.num_input_vgprs -= num_prolog_vgprs; + + if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) { + if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { + /* The LSHS size is not known until draw time, so we append it + * at the end of whatever LDS use there may be in the rest of + * the shader (currently none, unless LLVM decides to do its + * own LDS-based lowering). + */ + ctx->ac.lds = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), + "__lds_end", AC_ADDR_SPACE_LDS); + LLVMSetAlignment(ctx->ac.lds, 256); + } else { + ac_declare_lds_as_pointer(&ctx->ac); + } + } + + /* Unlike radv, we override these arguments in the prolog, so to the + * API shader they appear as normal arguments. + */ + if (ctx->type == PIPE_SHADER_VERTEX) { + ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id); + ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id); + } else if (ctx->type == PIPE_SHADER_FRAGMENT) { + ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid); + ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid); + } } /* For the UMR disassembler. 
*/ -#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ -#define DEBUGGER_NUM_MARKERS 5 +#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ +#define DEBUGGER_NUM_MARKERS 5 -static bool si_shader_binary_open(struct si_screen *screen, - struct si_shader *shader, - struct ac_rtld_binary *rtld) +static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader, + struct ac_rtld_binary *rtld) { - const struct si_shader_selector *sel = shader->selector; - const char *part_elfs[5]; - size_t part_sizes[5]; - unsigned num_parts = 0; - -#define add_part(shader_or_part) \ - if (shader_or_part) { \ - part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \ - part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \ - num_parts++; \ - } - - add_part(shader->prolog); - add_part(shader->previous_stage); - add_part(shader->prolog2); - add_part(shader); - add_part(shader->epilog); + const struct si_shader_selector *sel = shader->selector; + const char *part_elfs[5]; + size_t part_sizes[5]; + unsigned num_parts = 0; + +#define add_part(shader_or_part) \ + if (shader_or_part) { \ + part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \ + part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \ + num_parts++; \ + } + + add_part(shader->prolog); + add_part(shader->previous_stage); + add_part(shader->prolog2); + add_part(shader); + add_part(shader->epilog); #undef add_part - struct ac_rtld_symbol lds_symbols[2]; - unsigned num_lds_symbols = 0; - - if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader && - (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) { - /* We add this symbol even on LLVM <= 8 to ensure that - * shader->config.lds_size is set correctly below. - */ - struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; - sym->name = "esgs_ring"; - sym->size = shader->gs_info.esgs_ring_size; - sym->align = 64 * 1024; - } - - if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) { - struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; - sym->name = "ngg_emit"; - sym->size = shader->ngg.ngg_emit_size * 4; - sym->align = 4; - } - - bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){ - .info = &screen->info, - .options = { - .halt_at_entry = screen->options.halt_shaders, - }, - .shader_type = tgsi_processor_to_shader_stage(sel->type), - .wave_size = si_get_shader_wave_size(shader), - .num_parts = num_parts, - .elf_ptrs = part_elfs, - .elf_sizes = part_sizes, - .num_shared_lds_symbols = num_lds_symbols, - .shared_lds_symbols = lds_symbols }); - - if (rtld->lds_size > 0) { - unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256; - shader->config.lds_size = - align(rtld->lds_size, alloc_granularity) / alloc_granularity; - } - - return ok; + struct ac_rtld_symbol lds_symbols[2]; + unsigned num_lds_symbols = 0; + + if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader && + (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) { + /* We add this symbol even on LLVM <= 8 to ensure that + * shader->config.lds_size is set correctly below. 
+ */ + struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; + sym->name = "esgs_ring"; + sym->size = shader->gs_info.esgs_ring_size; + sym->align = 64 * 1024; + } + + if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) { + struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; + sym->name = "ngg_emit"; + sym->size = shader->ngg.ngg_emit_size * 4; + sym->align = 4; + } + + bool ok = ac_rtld_open( + rtld, (struct ac_rtld_open_info){.info = &screen->info, + .options = + { + .halt_at_entry = screen->options.halt_shaders, + }, + .shader_type = tgsi_processor_to_shader_stage(sel->type), + .wave_size = si_get_shader_wave_size(shader), + .num_parts = num_parts, + .elf_ptrs = part_elfs, + .elf_sizes = part_sizes, + .num_shared_lds_symbols = num_lds_symbols, + .shared_lds_symbols = lds_symbols}); + + if (rtld->lds_size > 0) { + unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256; + shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity; + } + + return ok; } static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader) { - struct ac_rtld_binary rtld; - si_shader_binary_open(screen, shader, &rtld); - return rtld.exec_size; + struct ac_rtld_binary rtld; + si_shader_binary_open(screen, shader, &rtld); + return rtld.exec_size; } static bool si_get_external_symbol(void *data, const char *name, uint64_t *value) { - uint64_t *scratch_va = data; - - if (!strcmp(scratch_rsrc_dword0_symbol, name)) { - *value = (uint32_t)*scratch_va; - return true; - } - if (!strcmp(scratch_rsrc_dword1_symbol, name)) { - /* Enable scratch coalescing. */ - *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | - S_008F04_SWIZZLE_ENABLE(1); - return true; - } - - return false; + uint64_t *scratch_va = data; + + if (!strcmp(scratch_rsrc_dword0_symbol, name)) { + *value = (uint32_t)*scratch_va; + return true; + } + if (!strcmp(scratch_rsrc_dword1_symbol, name)) { + /* Enable scratch coalescing. */ + *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1); + return true; + } + + return false; } bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader, - uint64_t scratch_va) + uint64_t scratch_va) { - struct ac_rtld_binary binary; - if (!si_shader_binary_open(sscreen, shader, &binary)) - return false; - - si_resource_reference(&shader->bo, NULL); - shader->bo = si_aligned_buffer_create(&sscreen->b, - sscreen->info.cpdma_prefetch_writes_memory ? - 0 : SI_RESOURCE_FLAG_READ_ONLY, - PIPE_USAGE_IMMUTABLE, - align(binary.rx_size, SI_CPDMA_ALIGNMENT), - 256); - if (!shader->bo) - return false; - - /* Upload. */ - struct ac_rtld_upload_info u = {}; - u.binary = &binary; - u.get_external_symbol = si_get_external_symbol; - u.cb_data = &scratch_va; - u.rx_va = shader->bo->gpu_address; - u.rx_ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL, - PIPE_TRANSFER_READ_WRITE | - PIPE_TRANSFER_UNSYNCHRONIZED | - RADEON_TRANSFER_TEMPORARY); - if (!u.rx_ptr) - return false; - - bool ok = ac_rtld_upload(&u); - - sscreen->ws->buffer_unmap(shader->bo->buf); - ac_rtld_close(&binary); - - return ok; + struct ac_rtld_binary binary; + if (!si_shader_binary_open(sscreen, shader, &binary)) + return false; + + si_resource_reference(&shader->bo, NULL); + shader->bo = si_aligned_buffer_create( + &sscreen->b, sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY, + PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256); + if (!shader->bo) + return false; + + /* Upload. 
*/ + struct ac_rtld_upload_info u = {}; + u.binary = &binary; + u.get_external_symbol = si_get_external_symbol; + u.cb_data = &scratch_va; + u.rx_va = shader->bo->gpu_address; + u.rx_ptr = sscreen->ws->buffer_map( + shader->bo->buf, NULL, + PIPE_TRANSFER_READ_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED | RADEON_TRANSFER_TEMPORARY); + if (!u.rx_ptr) + return false; + + bool ok = ac_rtld_upload(&u); + + sscreen->ws->buffer_unmap(shader->bo->buf); + ac_rtld_close(&binary); + + return ok; } static void si_shader_dump_disassembly(struct si_screen *screen, - const struct si_shader_binary *binary, - enum pipe_shader_type shader_type, - unsigned wave_size, - struct pipe_debug_callback *debug, - const char *name, FILE *file) + const struct si_shader_binary *binary, + enum pipe_shader_type shader_type, unsigned wave_size, + struct pipe_debug_callback *debug, const char *name, + FILE *file) { - struct ac_rtld_binary rtld_binary; - - if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){ - .info = &screen->info, - .shader_type = tgsi_processor_to_shader_stage(shader_type), - .wave_size = wave_size, - .num_parts = 1, - .elf_ptrs = &binary->elf_buffer, - .elf_sizes = &binary->elf_size })) - return; - - const char *disasm; - size_t nbytes; - - if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) - goto out; - - if (nbytes > INT_MAX) - goto out; - - if (debug && debug->debug_message) { - /* Very long debug messages are cut off, so send the - * disassembly one line at a time. This causes more - * overhead, but on the plus side it simplifies - * parsing of resulting logs. - */ - pipe_debug_message(debug, SHADER_INFO, - "Shader Disassembly Begin"); - - uint64_t line = 0; - while (line < nbytes) { - int count = nbytes - line; - const char *nl = memchr(disasm + line, '\n', nbytes - line); - if (nl) - count = nl - (disasm + line); - - if (count) { - pipe_debug_message(debug, SHADER_INFO, - "%.*s", count, disasm + line); - } - - line += count + 1; - } - - pipe_debug_message(debug, SHADER_INFO, - "Shader Disassembly End"); - } - - if (file) { - fprintf(file, "Shader %s disassembly:\n", name); - fprintf(file, "%*s", (int)nbytes, disasm); - } + struct ac_rtld_binary rtld_binary; + + if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){ + .info = &screen->info, + .shader_type = tgsi_processor_to_shader_stage(shader_type), + .wave_size = wave_size, + .num_parts = 1, + .elf_ptrs = &binary->elf_buffer, + .elf_sizes = &binary->elf_size})) + return; + + const char *disasm; + size_t nbytes; + + if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) + goto out; + + if (nbytes > INT_MAX) + goto out; + + if (debug && debug->debug_message) { + /* Very long debug messages are cut off, so send the + * disassembly one line at a time. This causes more + * overhead, but on the plus side it simplifies + * parsing of resulting logs. 
+ */ + pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin"); + + uint64_t line = 0; + while (line < nbytes) { + int count = nbytes - line; + const char *nl = memchr(disasm + line, '\n', nbytes - line); + if (nl) + count = nl - (disasm + line); + + if (count) { + pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line); + } + + line += count + 1; + } + + pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End"); + } + + if (file) { + fprintf(file, "Shader %s disassembly:\n", name); + fprintf(file, "%*s", (int)nbytes, disasm); + } out: - ac_rtld_close(&rtld_binary); + ac_rtld_close(&rtld_binary); } static void si_calculate_max_simd_waves(struct si_shader *shader) { - struct si_screen *sscreen = shader->selector->screen; - struct ac_shader_config *conf = &shader->config; - unsigned num_inputs = shader->selector->info.num_inputs; - unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256; - unsigned lds_per_wave = 0; - unsigned max_simd_waves; - - max_simd_waves = sscreen->info.max_wave64_per_simd; - - /* Compute LDS usage for PS. */ - switch (shader->selector->type) { - case PIPE_SHADER_FRAGMENT: - /* The minimum usage per wave is (num_inputs * 48). The maximum - * usage is (num_inputs * 48 * 16). - * We can get anything in between and it varies between waves. - * - * The 48 bytes per input for a single primitive is equal to - * 4 bytes/component * 4 components/input * 3 points. - * - * Other stages don't know the size at compile time or don't - * allocate LDS per wave, but instead they do it per thread group. - */ - lds_per_wave = conf->lds_size * lds_increment + - align(num_inputs * 48, lds_increment); - break; - case PIPE_SHADER_COMPUTE: - if (shader->selector) { - unsigned max_workgroup_size = - si_get_max_workgroup_size(shader); - lds_per_wave = (conf->lds_size * lds_increment) / - DIV_ROUND_UP(max_workgroup_size, - sscreen->compute_wave_size); - } - break; - default:; - } - - /* Compute the per-SIMD wave counts. */ - if (conf->num_sgprs) { - max_simd_waves = - MIN2(max_simd_waves, - sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs); - } - - if (conf->num_vgprs) { - /* Always print wave limits as Wave64, so that we can compare - * Wave32 and Wave64 with shader-db fairly. */ - unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd; - max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs); - } - - unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4; - if (lds_per_wave) - max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave); - - shader->info.max_simd_waves = max_simd_waves; + struct si_screen *sscreen = shader->selector->screen; + struct ac_shader_config *conf = &shader->config; + unsigned num_inputs = shader->selector->info.num_inputs; + unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256; + unsigned lds_per_wave = 0; + unsigned max_simd_waves; + + max_simd_waves = sscreen->info.max_wave64_per_simd; + + /* Compute LDS usage for PS. */ + switch (shader->selector->type) { + case PIPE_SHADER_FRAGMENT: + /* The minimum usage per wave is (num_inputs * 48). The maximum + * usage is (num_inputs * 48 * 16). + * We can get anything in between and it varies between waves. + * + * The 48 bytes per input for a single primitive is equal to + * 4 bytes/component * 4 components/input * 3 points. + * + * Other stages don't know the size at compile time or don't + * allocate LDS per wave, but instead they do it per thread group. 
+ */ + lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment); + break; + case PIPE_SHADER_COMPUTE: + if (shader->selector) { + unsigned max_workgroup_size = si_get_max_workgroup_size(shader); + lds_per_wave = (conf->lds_size * lds_increment) / + DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size); + } + break; + default:; + } + + /* Compute the per-SIMD wave counts. */ + if (conf->num_sgprs) { + max_simd_waves = + MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs); + } + + if (conf->num_vgprs) { + /* Always print wave limits as Wave64, so that we can compare + * Wave32 and Wave64 with shader-db fairly. */ + unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd; + max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs); + } + + unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4; + if (lds_per_wave) + max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave); + + shader->info.max_simd_waves = max_simd_waves; } -void si_shader_dump_stats_for_shader_db(struct si_screen *screen, - struct si_shader *shader, - struct pipe_debug_callback *debug) +void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader, + struct pipe_debug_callback *debug) { - const struct ac_shader_config *conf = &shader->config; - - if (screen->options.debug_disassembly) - si_shader_dump_disassembly(screen, &shader->binary, - shader->selector->type, - si_get_shader_wave_size(shader), - debug, "main", NULL); - - pipe_debug_message(debug, SHADER_INFO, - "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " - "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d " - "Spilled VGPRs: %d PrivMem VGPRs: %d", - conf->num_sgprs, conf->num_vgprs, - si_get_shader_binary_size(screen, shader), - conf->lds_size, conf->scratch_bytes_per_wave, - shader->info.max_simd_waves, conf->spilled_sgprs, - conf->spilled_vgprs, shader->info.private_mem_vgprs); + const struct ac_shader_config *conf = &shader->config; + + if (screen->options.debug_disassembly) + si_shader_dump_disassembly(screen, &shader->binary, shader->selector->type, + si_get_shader_wave_size(shader), debug, "main", NULL); + + pipe_debug_message(debug, SHADER_INFO, + "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " + "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d " + "Spilled VGPRs: %d PrivMem VGPRs: %d", + conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader), + conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves, + conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs); } -static void si_shader_dump_stats(struct si_screen *sscreen, - struct si_shader *shader, - FILE *file, - bool check_debug_option) +static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file, + bool check_debug_option) { - const struct ac_shader_config *conf = &shader->config; - - if (!check_debug_option || - si_can_dump_shader(sscreen, shader->selector->type)) { - if (shader->selector->type == PIPE_SHADER_FRAGMENT) { - fprintf(file, "*** SHADER CONFIG ***\n" - "SPI_PS_INPUT_ADDR = 0x%04x\n" - "SPI_PS_INPUT_ENA = 0x%04x\n", - conf->spi_ps_input_addr, conf->spi_ps_input_ena); - } - - fprintf(file, "*** SHADER STATS ***\n" - "SGPRS: %d\n" - "VGPRS: %d\n" - "Spilled SGPRs: %d\n" - "Spilled VGPRs: %d\n" - "Private memory VGPRs: %d\n" - "Code Size: %d bytes\n" - "LDS: %d blocks\n" - "Scratch: %d bytes per wave\n" - "Max Waves: %d\n" - 
"********************\n\n\n", - conf->num_sgprs, conf->num_vgprs, - conf->spilled_sgprs, conf->spilled_vgprs, - shader->info.private_mem_vgprs, - si_get_shader_binary_size(sscreen, shader), - conf->lds_size, conf->scratch_bytes_per_wave, - shader->info.max_simd_waves); - } + const struct ac_shader_config *conf = &shader->config; + + if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->type)) { + if (shader->selector->type == PIPE_SHADER_FRAGMENT) { + fprintf(file, + "*** SHADER CONFIG ***\n" + "SPI_PS_INPUT_ADDR = 0x%04x\n" + "SPI_PS_INPUT_ENA = 0x%04x\n", + conf->spi_ps_input_addr, conf->spi_ps_input_ena); + } + + fprintf(file, + "*** SHADER STATS ***\n" + "SGPRS: %d\n" + "VGPRS: %d\n" + "Spilled SGPRs: %d\n" + "Spilled VGPRs: %d\n" + "Private memory VGPRs: %d\n" + "Code Size: %d bytes\n" + "LDS: %d blocks\n" + "Scratch: %d bytes per wave\n" + "Max Waves: %d\n" + "********************\n\n\n", + conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs, + shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader), + conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves); + } } const char *si_get_shader_name(const struct si_shader *shader) { - switch (shader->selector->type) { - case PIPE_SHADER_VERTEX: - if (shader->key.as_es) - return "Vertex Shader as ES"; - else if (shader->key.as_ls) - return "Vertex Shader as LS"; - else if (shader->key.opt.vs_as_prim_discard_cs) - return "Vertex Shader as Primitive Discard CS"; - else if (shader->key.as_ngg) - return "Vertex Shader as ESGS"; - else - return "Vertex Shader as VS"; - case PIPE_SHADER_TESS_CTRL: - return "Tessellation Control Shader"; - case PIPE_SHADER_TESS_EVAL: - if (shader->key.as_es) - return "Tessellation Evaluation Shader as ES"; - else if (shader->key.as_ngg) - return "Tessellation Evaluation Shader as ESGS"; - else - return "Tessellation Evaluation Shader as VS"; - case PIPE_SHADER_GEOMETRY: - if (shader->is_gs_copy_shader) - return "GS Copy Shader as VS"; - else - return "Geometry Shader"; - case PIPE_SHADER_FRAGMENT: - return "Pixel Shader"; - case PIPE_SHADER_COMPUTE: - return "Compute Shader"; - default: - return "Unknown Shader"; - } + switch (shader->selector->type) { + case PIPE_SHADER_VERTEX: + if (shader->key.as_es) + return "Vertex Shader as ES"; + else if (shader->key.as_ls) + return "Vertex Shader as LS"; + else if (shader->key.opt.vs_as_prim_discard_cs) + return "Vertex Shader as Primitive Discard CS"; + else if (shader->key.as_ngg) + return "Vertex Shader as ESGS"; + else + return "Vertex Shader as VS"; + case PIPE_SHADER_TESS_CTRL: + return "Tessellation Control Shader"; + case PIPE_SHADER_TESS_EVAL: + if (shader->key.as_es) + return "Tessellation Evaluation Shader as ES"; + else if (shader->key.as_ngg) + return "Tessellation Evaluation Shader as ESGS"; + else + return "Tessellation Evaluation Shader as VS"; + case PIPE_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + return "GS Copy Shader as VS"; + else + return "Geometry Shader"; + case PIPE_SHADER_FRAGMENT: + return "Pixel Shader"; + case PIPE_SHADER_COMPUTE: + return "Compute Shader"; + default: + return "Unknown Shader"; + } } void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, - struct pipe_debug_callback *debug, - FILE *file, bool check_debug_option) + struct pipe_debug_callback *debug, FILE *file, bool check_debug_option) { - enum pipe_shader_type shader_type = shader->selector->type; - - if (!check_debug_option || - si_can_dump_shader(sscreen, shader_type)) 
- si_dump_shader_key(shader, file); - - if (!check_debug_option && shader->binary.llvm_ir_string) { - if (shader->previous_stage && - shader->previous_stage->binary.llvm_ir_string) { - fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", - si_get_shader_name(shader)); - fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string); - } - - fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", - si_get_shader_name(shader)); - fprintf(file, "%s\n", shader->binary.llvm_ir_string); - } - - if (!check_debug_option || - (si_can_dump_shader(sscreen, shader_type) && - !(sscreen->debug_flags & DBG(NO_ASM)))) { - unsigned wave_size = si_get_shader_wave_size(shader); - - fprintf(file, "\n%s:\n", si_get_shader_name(shader)); - - if (shader->prolog) - si_shader_dump_disassembly(sscreen, &shader->prolog->binary, - shader_type, wave_size, debug, "prolog", file); - if (shader->previous_stage) - si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, - shader_type, wave_size, debug, "previous stage", file); - if (shader->prolog2) - si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, - shader_type, wave_size, debug, "prolog2", file); - - si_shader_dump_disassembly(sscreen, &shader->binary, shader_type, - wave_size, debug, "main", file); - - if (shader->epilog) - si_shader_dump_disassembly(sscreen, &shader->epilog->binary, - shader_type, wave_size, debug, "epilog", file); - fprintf(file, "\n"); - } - - si_shader_dump_stats(sscreen, shader, file, check_debug_option); + enum pipe_shader_type shader_type = shader->selector->type; + + if (!check_debug_option || si_can_dump_shader(sscreen, shader_type)) + si_dump_shader_key(shader, file); + + if (!check_debug_option && shader->binary.llvm_ir_string) { + if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) { + fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader)); + fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string); + } + + fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader)); + fprintf(file, "%s\n", shader->binary.llvm_ir_string); + } + + if (!check_debug_option || + (si_can_dump_shader(sscreen, shader_type) && !(sscreen->debug_flags & DBG(NO_ASM)))) { + unsigned wave_size = si_get_shader_wave_size(shader); + + fprintf(file, "\n%s:\n", si_get_shader_name(shader)); + + if (shader->prolog) + si_shader_dump_disassembly(sscreen, &shader->prolog->binary, shader_type, wave_size, debug, + "prolog", file); + if (shader->previous_stage) + si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, shader_type, + wave_size, debug, "previous stage", file); + if (shader->prolog2) + si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, shader_type, wave_size, + debug, "prolog2", file); + + si_shader_dump_disassembly(sscreen, &shader->binary, shader_type, wave_size, debug, "main", + file); + + if (shader->epilog) + si_shader_dump_disassembly(sscreen, &shader->epilog->binary, shader_type, wave_size, debug, + "epilog", file); + fprintf(file, "\n"); + } + + si_shader_dump_stats(sscreen, shader, file, check_debug_option); } static void si_dump_shader_key_vs(const struct si_shader_key *key, - const struct si_vs_prolog_bits *prolog, - const char *prefix, FILE *f) + const struct si_vs_prolog_bits *prolog, const char *prefix, + FILE *f) { - fprintf(f, " %s.instance_divisor_is_one = %u\n", - prefix, prolog->instance_divisor_is_one); - fprintf(f, " %s.instance_divisor_is_fetched = %u\n", - prefix, prolog->instance_divisor_is_fetched); - 
fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", - prefix, prolog->unpack_instance_id_from_vertex_id); - fprintf(f, " %s.ls_vgpr_fix = %u\n", - prefix, prolog->ls_vgpr_fix); - - fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode); - fprintf(f, " mono.vs.fix_fetch = {"); - for (int i = 0; i < SI_MAX_ATTRIBS; i++) { - union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i]; - if (i) - fprintf(f, ", "); - if (!fix.bits) - fprintf(f, "0"); - else - fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, - fix.u.num_channels_m1, fix.u.format); - } - fprintf(f, "}\n"); + fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one); + fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix, + prolog->instance_divisor_is_fetched); + fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", prefix, + prolog->unpack_instance_id_from_vertex_id); + fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix); + + fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode); + fprintf(f, " mono.vs.fix_fetch = {"); + for (int i = 0; i < SI_MAX_ATTRIBS; i++) { + union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i]; + if (i) + fprintf(f, ", "); + if (!fix.bits) + fprintf(f, "0"); + else + fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1, + fix.u.format); + } + fprintf(f, "}\n"); } static void si_dump_shader_key(const struct si_shader *shader, FILE *f) { - const struct si_shader_key *key = &shader->key; - enum pipe_shader_type shader_type = shader->selector->type; - - fprintf(f, "SHADER KEY\n"); - - switch (shader_type) { - case PIPE_SHADER_VERTEX: - si_dump_shader_key_vs(key, &key->part.vs.prolog, - "part.vs.prolog", f); - fprintf(f, " as_es = %u\n", key->as_es); - fprintf(f, " as_ls = %u\n", key->as_ls); - fprintf(f, " as_ngg = %u\n", key->as_ngg); - fprintf(f, " mono.u.vs_export_prim_id = %u\n", - key->mono.u.vs_export_prim_id); - fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", - key->opt.vs_as_prim_discard_cs); - fprintf(f, " opt.cs_prim_type = %s\n", - tgsi_primitive_names[key->opt.cs_prim_type]); - fprintf(f, " opt.cs_indexed = %u\n", - key->opt.cs_indexed); - fprintf(f, " opt.cs_instancing = %u\n", - key->opt.cs_instancing); - fprintf(f, " opt.cs_primitive_restart = %u\n", - key->opt.cs_primitive_restart); - fprintf(f, " opt.cs_provoking_vertex_first = %u\n", - key->opt.cs_provoking_vertex_first); - fprintf(f, " opt.cs_need_correct_orientation = %u\n", - key->opt.cs_need_correct_orientation); - fprintf(f, " opt.cs_cull_front = %u\n", - key->opt.cs_cull_front); - fprintf(f, " opt.cs_cull_back = %u\n", - key->opt.cs_cull_back); - fprintf(f, " opt.cs_cull_z = %u\n", - key->opt.cs_cull_z); - fprintf(f, " opt.cs_halfz_clip_space = %u\n", - key->opt.cs_halfz_clip_space); - break; - - case PIPE_SHADER_TESS_CTRL: - if (shader->selector->screen->info.chip_class >= GFX9) { - si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, - "part.tcs.ls_prolog", f); - } - fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); - fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy); - break; - - case PIPE_SHADER_TESS_EVAL: - fprintf(f, " as_es = %u\n", key->as_es); - fprintf(f, " as_ngg = %u\n", key->as_ngg); - fprintf(f, " mono.u.vs_export_prim_id = %u\n", - key->mono.u.vs_export_prim_id); - break; - - case PIPE_SHADER_GEOMETRY: - if (shader->is_gs_copy_shader) - break; - - if (shader->selector->screen->info.chip_class >= GFX9 && - 
key->part.gs.es->type == PIPE_SHADER_VERTEX) { - si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, - "part.gs.vs_prolog", f); - } - fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix); - fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs); - fprintf(f, " as_ngg = %u\n", key->as_ngg); - break; - - case PIPE_SHADER_COMPUTE: - break; - - case PIPE_SHADER_FRAGMENT: - fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side); - fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors); - fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple); - fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp); - fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp); - fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp); - fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp); - fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp); - fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear); - fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", key->part.ps.prolog.samplemask_log_ps_iter); - fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format); - fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8); - fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10); - fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf); - fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func); - fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one); - fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing); - fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color); - fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", key->mono.u.ps.interpolate_at_sample_force_center); - fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa); - fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D); - fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered); - break; - - default: - assert(0); - } - - if ((shader_type == PIPE_SHADER_GEOMETRY || - shader_type == PIPE_SHADER_TESS_EVAL || - shader_type == PIPE_SHADER_VERTEX) && - !key->as_es && !key->as_ls) { - fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs); - fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable); - if (shader_type != PIPE_SHADER_GEOMETRY) - fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling); - } + const struct si_shader_key *key = &shader->key; + enum pipe_shader_type shader_type = shader->selector->type; + + fprintf(f, "SHADER KEY\n"); + + switch (shader_type) { + case PIPE_SHADER_VERTEX: + si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f); + fprintf(f, " as_es = %u\n", key->as_es); + fprintf(f, " as_ls = %u\n", key->as_ls); + fprintf(f, " as_ngg = %u\n", key->as_ngg); + fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); + fprintf(f, " 
opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs); + fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]); + fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed); + fprintf(f, " opt.cs_instancing = %u\n", key->opt.cs_instancing); + fprintf(f, " opt.cs_primitive_restart = %u\n", key->opt.cs_primitive_restart); + fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first); + fprintf(f, " opt.cs_need_correct_orientation = %u\n", key->opt.cs_need_correct_orientation); + fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front); + fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back); + fprintf(f, " opt.cs_cull_z = %u\n", key->opt.cs_cull_z); + fprintf(f, " opt.cs_halfz_clip_space = %u\n", key->opt.cs_halfz_clip_space); + break; + + case PIPE_SHADER_TESS_CTRL: + if (shader->selector->screen->info.chip_class >= GFX9) { + si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, "part.tcs.ls_prolog", f); + } + fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); + fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n", + key->mono.u.ff_tcs_inputs_to_copy); + break; + + case PIPE_SHADER_TESS_EVAL: + fprintf(f, " as_es = %u\n", key->as_es); + fprintf(f, " as_ngg = %u\n", key->as_ngg); + fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); + break; + + case PIPE_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + break; + + if (shader->selector->screen->info.chip_class >= GFX9 && + key->part.gs.es->type == PIPE_SHADER_VERTEX) { + si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f); + } + fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", + key->part.gs.prolog.tri_strip_adj_fix); + fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs); + fprintf(f, " as_ngg = %u\n", key->as_ngg); + break; + + case PIPE_SHADER_COMPUTE: + break; + + case PIPE_SHADER_FRAGMENT: + fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side); + fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors); + fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple); + fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", + key->part.ps.prolog.force_persp_sample_interp); + fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", + key->part.ps.prolog.force_linear_sample_interp); + fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", + key->part.ps.prolog.force_persp_center_interp); + fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", + key->part.ps.prolog.force_linear_center_interp); + fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", + key->part.ps.prolog.bc_optimize_for_persp); + fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", + key->part.ps.prolog.bc_optimize_for_linear); + fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", + key->part.ps.prolog.samplemask_log_ps_iter); + fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", + key->part.ps.epilog.spi_shader_col_format); + fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8); + fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10); + fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf); + fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func); + fprintf(f, " part.ps.epilog.alpha_to_one = 
%u\n", key->part.ps.epilog.alpha_to_one); + fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", + key->part.ps.epilog.poly_line_smoothing); + fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color); + fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", + key->mono.u.ps.interpolate_at_sample_force_center); + fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa); + fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D); + fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered); + break; + + default: + assert(0); + } + + if ((shader_type == PIPE_SHADER_GEOMETRY || shader_type == PIPE_SHADER_TESS_EVAL || + shader_type == PIPE_SHADER_VERTEX) && + !key->as_es && !key->as_ls) { + fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs); + fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable); + if (shader_type != PIPE_SHADER_GEOMETRY) + fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling); + } } static void si_optimize_vs_outputs(struct si_shader_context *ctx) { - struct si_shader *shader = ctx->shader; - struct si_shader_info *info = &shader->selector->info; - - if ((ctx->type != PIPE_SHADER_VERTEX && - ctx->type != PIPE_SHADER_TESS_EVAL) || - shader->key.as_ls || - shader->key.as_es) - return; - - ac_optimize_vs_outputs(&ctx->ac, - ctx->main_fn, - shader->info.vs_output_param_offset, - info->num_outputs, - &shader->info.nr_param_exports); + struct si_shader *shader = ctx->shader; + struct si_shader_info *info = &shader->selector->info; + + if ((ctx->type != PIPE_SHADER_VERTEX && ctx->type != PIPE_SHADER_TESS_EVAL) || + shader->key.as_ls || shader->key.as_es) + return; + + ac_optimize_vs_outputs(&ctx->ac, ctx->main_fn, shader->info.vs_output_param_offset, + info->num_outputs, &shader->info.nr_param_exports); } static bool si_vs_needs_prolog(const struct si_shader_selector *sel, - const struct si_vs_prolog_bits *prolog_key, - const struct si_shader_key *key, - bool ngg_cull_shader) + const struct si_vs_prolog_bits *prolog_key, + const struct si_shader_key *key, bool ngg_cull_shader) { - /* VGPR initialization fixup for Vega10 and Raven is always done in the - * VS prolog. */ - return sel->vs_needs_prolog || - prolog_key->ls_vgpr_fix || - prolog_key->unpack_instance_id_from_vertex_id || - (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); + /* VGPR initialization fixup for Vega10 and Raven is always done in the + * VS prolog. 
*/ + return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix || + prolog_key->unpack_instance_id_from_vertex_id || + (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); } -static bool si_build_main_function(struct si_shader_context *ctx, - struct si_shader *shader, - struct nir_shader *nir, bool free_nir, - bool ngg_cull_shader) +static bool si_build_main_function(struct si_shader_context *ctx, struct si_shader *shader, + struct nir_shader *nir, bool free_nir, bool ngg_cull_shader) { - struct si_shader_selector *sel = shader->selector; - const struct si_shader_info *info = &sel->info; - - ctx->shader = shader; - ctx->type = sel->type; - - ctx->num_const_buffers = util_last_bit(info->const_buffers_declared); - ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared); - - ctx->num_samplers = util_last_bit(info->samplers_declared); - ctx->num_images = util_last_bit(info->images_declared); - - si_llvm_init_resource_callbacks(ctx); - - switch (ctx->type) { - case PIPE_SHADER_VERTEX: - si_llvm_init_vs_callbacks(ctx, ngg_cull_shader); - break; - case PIPE_SHADER_TESS_CTRL: - si_llvm_init_tcs_callbacks(ctx); - break; - case PIPE_SHADER_TESS_EVAL: - si_llvm_init_tes_callbacks(ctx, ngg_cull_shader); - break; - case PIPE_SHADER_GEOMETRY: - si_llvm_init_gs_callbacks(ctx); - break; - case PIPE_SHADER_FRAGMENT: - si_llvm_init_ps_callbacks(ctx); - break; - case PIPE_SHADER_COMPUTE: - ctx->abi.load_local_group_size = si_llvm_get_block_size; - break; - default: - assert(!"Unsupported shader type"); - return false; - } - - si_create_function(ctx, ngg_cull_shader); - - if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY) - si_preload_esgs_ring(ctx); - - if (ctx->type == PIPE_SHADER_GEOMETRY) - si_preload_gs_rings(ctx); - else if (ctx->type == PIPE_SHADER_TESS_EVAL) - si_llvm_preload_tes_rings(ctx); - - if (ctx->type == PIPE_SHADER_TESS_CTRL && - sel->info.tessfactors_are_def_in_all_invocs) { - for (unsigned i = 0; i < 6; i++) { - ctx->invoc0_tess_factors[i] = - ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - } - } - - if (ctx->type == PIPE_SHADER_GEOMETRY) { - for (unsigned i = 0; i < 4; i++) { - ctx->gs_next_vertex[i] = - ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); - } - if (shader->key.as_ngg) { - for (unsigned i = 0; i < 4; ++i) { - ctx->gs_curprim_verts[i] = - ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); - ctx->gs_generated_prims[i] = - ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); - } - - unsigned scratch_size = 8; - if (sel->so.num_outputs) - scratch_size = 44; - - assert(!ctx->gs_ngg_scratch); - LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, scratch_size); - ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module, - ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); - LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32)); - LLVMSetAlignment(ctx->gs_ngg_scratch, 4); - - ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module, - LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); - LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage); - LLVMSetAlignment(ctx->gs_ngg_emit, 4); - } - } - - if (ctx->type != PIPE_SHADER_GEOMETRY && - (shader->key.as_ngg && !shader->key.as_es)) { - /* Unconditionally declare scratch space base for streamout and - * vertex compaction. Whether space is actually allocated is - * determined during linking / PM4 creation. - * - * Add an extra dword per vertex to ensure an odd stride, which - * avoids bank conflicts for SoA accesses. 
- */ - if (!gfx10_is_ngg_passthrough(shader)) - si_llvm_declare_esgs_ring(ctx); - - /* This is really only needed when streamout and / or vertex - * compaction is enabled. - */ - if (!ctx->gs_ngg_scratch && - (sel->so.num_outputs || shader->key.opt.ngg_culling)) { - LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, 8); - ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module, - asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); - LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32)); - LLVMSetAlignment(ctx->gs_ngg_scratch, 4); - } - } - - /* For GFX9 merged shaders: - * - Set EXEC for the first shader. If the prolog is present, set - * EXEC there instead. - * - Add a barrier before the second shader. - * - In the second shader, reset EXEC to ~0 and wrap the main part in - * an if-statement. This is required for correctness in geometry - * shaders, to ensure that empty GS waves do not send GS_EMIT and - * GS_CUT messages. - * - * For monolithic merged shaders, the first shader is wrapped in an - * if-block together with its prolog in si_build_wrapper_function. - * - * NGG vertex and tess eval shaders running as the last - * vertex/geometry stage handle execution explicitly using - * if-statements. - */ - if (ctx->screen->info.chip_class >= GFX9) { - if (!shader->is_monolithic && - (shader->key.as_es || shader->key.as_ls) && - (ctx->type == PIPE_SHADER_TESS_EVAL || - (ctx->type == PIPE_SHADER_VERTEX && - !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, - &shader->key, ngg_cull_shader)))) { - si_init_exec_from_input(ctx, - ctx->merged_wave_info, 0); - } else if (ctx->type == PIPE_SHADER_TESS_CTRL || - ctx->type == PIPE_SHADER_GEOMETRY || - (shader->key.as_ngg && !shader->key.as_es)) { - LLVMValueRef thread_enabled; - bool nested_barrier; - - if (!shader->is_monolithic || - (ctx->type == PIPE_SHADER_TESS_EVAL && - shader->key.as_ngg && !shader->key.as_es && - !shader->key.opt.ngg_culling)) - ac_init_exec_full_mask(&ctx->ac); - - if ((ctx->type == PIPE_SHADER_VERTEX || - ctx->type == PIPE_SHADER_TESS_EVAL) && - shader->key.as_ngg && !shader->key.as_es && - !shader->key.opt.ngg_culling) { - gfx10_ngg_build_sendmsg_gs_alloc_req(ctx); - - /* Build the primitive export at the beginning - * of the shader if possible. - */ - if (gfx10_ngg_export_prim_early(shader)) - gfx10_ngg_build_export_prim(ctx, NULL, NULL); - } - - if (ctx->type == PIPE_SHADER_TESS_CTRL || - ctx->type == PIPE_SHADER_GEOMETRY) { - if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) { - gfx10_ngg_gs_emit_prologue(ctx); - nested_barrier = false; - } else { - nested_barrier = true; - } - - thread_enabled = si_is_gs_thread(ctx); - } else { - thread_enabled = si_is_es_thread(ctx); - nested_barrier = false; - } - - ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder); - ctx->merged_wrap_if_label = 11500; - ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label); - - if (nested_barrier) { - /* Execute a barrier before the second shader in - * a merged shader. - * - * Execute the barrier inside the conditional block, - * so that empty waves can jump directly to s_endpgm, - * which will also signal the barrier. - * - * This is possible in gfx9, because an empty wave - * for the second shader does not participate in - * the epilogue. With NGG, empty waves may still - * be required to export data (e.g. GS output vertices), - * so we cannot let them exit early. - * - * If the shader is TCS and the TCS epilog is present - * and contains a barrier, it will wait there and then - * reach s_endpgm. 
- */ - si_llvm_emit_barrier(ctx); - } - } - } - - bool success = si_nir_build_llvm(ctx, nir); - if (free_nir) - ralloc_free(nir); - if (!success) { - fprintf(stderr, "Failed to translate shader from NIR to LLVM\n"); - return false; - } - - si_llvm_build_ret(ctx, ctx->return_value); - return true; + struct si_shader_selector *sel = shader->selector; + const struct si_shader_info *info = &sel->info; + + ctx->shader = shader; + ctx->type = sel->type; + + ctx->num_const_buffers = util_last_bit(info->const_buffers_declared); + ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared); + + ctx->num_samplers = util_last_bit(info->samplers_declared); + ctx->num_images = util_last_bit(info->images_declared); + + si_llvm_init_resource_callbacks(ctx); + + switch (ctx->type) { + case PIPE_SHADER_VERTEX: + si_llvm_init_vs_callbacks(ctx, ngg_cull_shader); + break; + case PIPE_SHADER_TESS_CTRL: + si_llvm_init_tcs_callbacks(ctx); + break; + case PIPE_SHADER_TESS_EVAL: + si_llvm_init_tes_callbacks(ctx, ngg_cull_shader); + break; + case PIPE_SHADER_GEOMETRY: + si_llvm_init_gs_callbacks(ctx); + break; + case PIPE_SHADER_FRAGMENT: + si_llvm_init_ps_callbacks(ctx); + break; + case PIPE_SHADER_COMPUTE: + ctx->abi.load_local_group_size = si_llvm_get_block_size; + break; + default: + assert(!"Unsupported shader type"); + return false; + } + + si_create_function(ctx, ngg_cull_shader); + + if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY) + si_preload_esgs_ring(ctx); + + if (ctx->type == PIPE_SHADER_GEOMETRY) + si_preload_gs_rings(ctx); + else if (ctx->type == PIPE_SHADER_TESS_EVAL) + si_llvm_preload_tes_rings(ctx); + + if (ctx->type == PIPE_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) { + for (unsigned i = 0; i < 6; i++) { + ctx->invoc0_tess_factors[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + } + } + + if (ctx->type == PIPE_SHADER_GEOMETRY) { + for (unsigned i = 0; i < 4; i++) { + ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); + } + if (shader->key.as_ngg) { + for (unsigned i = 0; i < 4; ++i) { + ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); + ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); + } + + unsigned scratch_size = 8; + if (sel->so.num_outputs) + scratch_size = 44; + + assert(!ctx->gs_ngg_scratch); + LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, scratch_size); + ctx->gs_ngg_scratch = + LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); + LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32)); + LLVMSetAlignment(ctx->gs_ngg_scratch, 4); + + ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace( + ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); + LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage); + LLVMSetAlignment(ctx->gs_ngg_emit, 4); + } + } + + if (ctx->type != PIPE_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) { + /* Unconditionally declare scratch space base for streamout and + * vertex compaction. Whether space is actually allocated is + * determined during linking / PM4 creation. + * + * Add an extra dword per vertex to ensure an odd stride, which + * avoids bank conflicts for SoA accesses. + */ + if (!gfx10_is_ngg_passthrough(shader)) + si_llvm_declare_esgs_ring(ctx); + + /* This is really only needed when streamout and / or vertex + * compaction is enabled. 
+ */ + if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) { + LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, 8); + ctx->gs_ngg_scratch = + LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); + LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32)); + LLVMSetAlignment(ctx->gs_ngg_scratch, 4); + } + } + + /* For GFX9 merged shaders: + * - Set EXEC for the first shader. If the prolog is present, set + * EXEC there instead. + * - Add a barrier before the second shader. + * - In the second shader, reset EXEC to ~0 and wrap the main part in + * an if-statement. This is required for correctness in geometry + * shaders, to ensure that empty GS waves do not send GS_EMIT and + * GS_CUT messages. + * + * For monolithic merged shaders, the first shader is wrapped in an + * if-block together with its prolog in si_build_wrapper_function. + * + * NGG vertex and tess eval shaders running as the last + * vertex/geometry stage handle execution explicitly using + * if-statements. + */ + if (ctx->screen->info.chip_class >= GFX9) { + if (!shader->is_monolithic && (shader->key.as_es || shader->key.as_ls) && + (ctx->type == PIPE_SHADER_TESS_EVAL || + (ctx->type == PIPE_SHADER_VERTEX && + !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) { + si_init_exec_from_input(ctx, ctx->merged_wave_info, 0); + } else if (ctx->type == PIPE_SHADER_TESS_CTRL || ctx->type == PIPE_SHADER_GEOMETRY || + (shader->key.as_ngg && !shader->key.as_es)) { + LLVMValueRef thread_enabled; + bool nested_barrier; + + if (!shader->is_monolithic || (ctx->type == PIPE_SHADER_TESS_EVAL && shader->key.as_ngg && + !shader->key.as_es && !shader->key.opt.ngg_culling)) + ac_init_exec_full_mask(&ctx->ac); + + if ((ctx->type == PIPE_SHADER_VERTEX || ctx->type == PIPE_SHADER_TESS_EVAL) && + shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) { + gfx10_ngg_build_sendmsg_gs_alloc_req(ctx); + + /* Build the primitive export at the beginning + * of the shader if possible. + */ + if (gfx10_ngg_export_prim_early(shader)) + gfx10_ngg_build_export_prim(ctx, NULL, NULL); + } + + if (ctx->type == PIPE_SHADER_TESS_CTRL || ctx->type == PIPE_SHADER_GEOMETRY) { + if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) { + gfx10_ngg_gs_emit_prologue(ctx); + nested_barrier = false; + } else { + nested_barrier = true; + } + + thread_enabled = si_is_gs_thread(ctx); + } else { + thread_enabled = si_is_es_thread(ctx); + nested_barrier = false; + } + + ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder); + ctx->merged_wrap_if_label = 11500; + ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label); + + if (nested_barrier) { + /* Execute a barrier before the second shader in + * a merged shader. + * + * Execute the barrier inside the conditional block, + * so that empty waves can jump directly to s_endpgm, + * which will also signal the barrier. + * + * This is possible in gfx9, because an empty wave + * for the second shader does not participate in + * the epilogue. With NGG, empty waves may still + * be required to export data (e.g. GS output vertices), + * so we cannot let them exit early. + * + * If the shader is TCS and the TCS epilog is present + * and contains a barrier, it will wait there and then + * reach s_endpgm. 
+ */ + si_llvm_emit_barrier(ctx); + } + } + } + + bool success = si_nir_build_llvm(ctx, nir); + if (free_nir) + ralloc_free(nir); + if (!success) { + fprintf(stderr, "Failed to translate shader from NIR to LLVM\n"); + return false; + } + + si_llvm_build_ret(ctx, ctx->return_value); + return true; } /** @@ -1622,425 +1510,385 @@ static bool si_build_main_function(struct si_shader_context *ctx, * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS. * \param key Output shader part key. */ -static void si_get_vs_prolog_key(const struct si_shader_info *info, - unsigned num_input_sgprs, - bool ngg_cull_shader, - const struct si_vs_prolog_bits *prolog_key, - struct si_shader *shader_out, - union si_shader_part_key *key) +static void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs, + bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key, + struct si_shader *shader_out, union si_shader_part_key *key) { - memset(key, 0, sizeof(*key)); - key->vs_prolog.states = *prolog_key; - key->vs_prolog.num_input_sgprs = num_input_sgprs; - key->vs_prolog.num_inputs = info->num_inputs; - key->vs_prolog.as_ls = shader_out->key.as_ls; - key->vs_prolog.as_es = shader_out->key.as_es; - key->vs_prolog.as_ngg = shader_out->key.as_ngg; - key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs; - - if (ngg_cull_shader) { - key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling & - SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST); - key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling & - SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP); - } else { - key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling; - } - - if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) { - key->vs_prolog.as_ls = 1; - key->vs_prolog.num_merged_next_stage_vgprs = 2; - } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) { - key->vs_prolog.as_es = 1; - key->vs_prolog.num_merged_next_stage_vgprs = 5; - } else if (shader_out->key.as_ngg) { - key->vs_prolog.num_merged_next_stage_vgprs = 5; - } - - /* Only one of these combinations can be set. as_ngg can be set with as_es. */ - assert(key->vs_prolog.as_ls + - key->vs_prolog.as_ngg + - (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + - key->vs_prolog.as_prim_discard_cs <= 1); - - /* Enable loading the InstanceID VGPR. 
*/ - uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); - - if ((key->vs_prolog.states.instance_divisor_is_one | - key->vs_prolog.states.instance_divisor_is_fetched) & input_mask) - shader_out->info.uses_instanceid = true; + memset(key, 0, sizeof(*key)); + key->vs_prolog.states = *prolog_key; + key->vs_prolog.num_input_sgprs = num_input_sgprs; + key->vs_prolog.num_inputs = info->num_inputs; + key->vs_prolog.as_ls = shader_out->key.as_ls; + key->vs_prolog.as_es = shader_out->key.as_es; + key->vs_prolog.as_ngg = shader_out->key.as_ngg; + key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs; + + if (ngg_cull_shader) { + key->vs_prolog.gs_fast_launch_tri_list = + !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST); + key->vs_prolog.gs_fast_launch_tri_strip = + !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP); + } else { + key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling; + } + + if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) { + key->vs_prolog.as_ls = 1; + key->vs_prolog.num_merged_next_stage_vgprs = 2; + } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) { + key->vs_prolog.as_es = 1; + key->vs_prolog.num_merged_next_stage_vgprs = 5; + } else if (shader_out->key.as_ngg) { + key->vs_prolog.num_merged_next_stage_vgprs = 5; + } + + /* Only one of these combinations can be set. as_ngg can be set with as_es. */ + assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg + + (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <= + 1); + + /* Enable loading the InstanceID VGPR. */ + uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); + + if ((key->vs_prolog.states.instance_divisor_is_one | + key->vs_prolog.states.instance_divisor_is_fetched) & + input_mask) + shader_out->info.uses_instanceid = true; } static bool si_should_optimize_less(struct ac_llvm_compiler *compiler, - struct si_shader_selector *sel) + struct si_shader_selector *sel) { - if (!compiler->low_opt_passes) - return false; + if (!compiler->low_opt_passes) + return false; - /* Assume a slow CPU. */ - assert(!sel->screen->info.has_dedicated_vram && - sel->screen->info.chip_class <= GFX8); + /* Assume a slow CPU. */ + assert(!sel->screen->info.has_dedicated_vram && sel->screen->info.chip_class <= GFX8); - /* For a crazy dEQP test containing 2597 memory opcodes, mostly - * buffer stores. */ - return sel->type == PIPE_SHADER_COMPUTE && - sel->info.num_memory_instructions > 1000; + /* For a crazy dEQP test containing 2597 memory opcodes, mostly + * buffer stores. 
*/ + return sel->type == PIPE_SHADER_COMPUTE && sel->info.num_memory_instructions > 1000; } -static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, - bool *free_nir) +static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, bool *free_nir) { - *free_nir = false; - - if (sel->nir) { - return sel->nir; - } else if (sel->nir_binary) { - struct pipe_screen *screen = &sel->screen->b; - const void *options = - screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, - sel->type); - - struct blob_reader blob_reader; - blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size); - *free_nir = true; - return nir_deserialize(NULL, options, &blob_reader); - } - return NULL; + *free_nir = false; + + if (sel->nir) { + return sel->nir; + } else if (sel->nir_binary) { + struct pipe_screen *screen = &sel->screen->b; + const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, sel->type); + + struct blob_reader blob_reader; + blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size); + *free_nir = true; + return nir_deserialize(NULL, options, &blob_reader); + } + return NULL; } -static bool si_llvm_compile_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug, - struct nir_shader *nir, - bool free_nir) +static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug, + struct nir_shader *nir, bool free_nir) { - struct si_shader_selector *sel = shader->selector; - struct si_shader_context ctx; - - si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader)); - - LLVMValueRef ngg_cull_main_fn = NULL; - if (shader->key.opt.ngg_culling) { - if (!si_build_main_function(&ctx, shader, nir, false, true)) { - si_llvm_dispose(&ctx); - return false; - } - ngg_cull_main_fn = ctx.main_fn; - ctx.main_fn = NULL; - } - - if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) { - si_llvm_dispose(&ctx); - return false; - } - - if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) { - LLVMValueRef parts[4]; - unsigned num_parts = 0; - bool has_prolog = false; - LLVMValueRef main_fn = ctx.main_fn; - - if (ngg_cull_main_fn) { - if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, - &shader->key, true)) { - union si_shader_part_key prolog_key; - si_get_vs_prolog_key(&sel->info, - shader->info.num_input_sgprs, - true, - &shader->key.part.vs.prolog, - shader, &prolog_key); - prolog_key.vs_prolog.is_monolithic = true; - si_llvm_build_vs_prolog(&ctx, &prolog_key); - parts[num_parts++] = ctx.main_fn; - has_prolog = true; - } - parts[num_parts++] = ngg_cull_main_fn; - } - - if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, - &shader->key, false)) { - union si_shader_part_key prolog_key; - si_get_vs_prolog_key(&sel->info, - shader->info.num_input_sgprs, - false, - &shader->key.part.vs.prolog, - shader, &prolog_key); - prolog_key.vs_prolog.is_monolithic = true; - si_llvm_build_vs_prolog(&ctx, &prolog_key); - parts[num_parts++] = ctx.main_fn; - has_prolog = true; - } - parts[num_parts++] = main_fn; - - si_build_wrapper_function(&ctx, parts, num_parts, - has_prolog ? 
1 : 0, 0); - - if (ctx.shader->key.opt.vs_as_prim_discard_cs) - si_build_prim_discard_compute_shader(&ctx); - } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL && - ngg_cull_main_fn) { - LLVMValueRef parts[2]; - - parts[0] = ngg_cull_main_fn; - parts[1] = ctx.main_fn; - - si_build_wrapper_function(&ctx, parts, 2, 0, 0); - } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { - if (sscreen->info.chip_class >= GFX9) { - struct si_shader_selector *ls = shader->key.part.tcs.ls; - LLVMValueRef parts[4]; - bool vs_needs_prolog = - si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, - &shader->key, false); - - /* TCS main part */ - parts[2] = ctx.main_fn; - - /* TCS epilog */ - union si_shader_part_key tcs_epilog_key; - memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key)); - tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; - si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key); - parts[3] = ctx.main_fn; - - /* VS as LS main part */ - nir = get_nir_shader(ls, &free_nir); - struct si_shader shader_ls = {}; - shader_ls.selector = ls; - shader_ls.key.as_ls = 1; - shader_ls.key.mono = shader->key.mono; - shader_ls.key.opt = shader->key.opt; - shader_ls.is_monolithic = true; - - if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) { - si_llvm_dispose(&ctx); - return false; - } - shader->info.uses_instanceid |= ls->info.uses_instanceid; - parts[1] = ctx.main_fn; - - /* LS prolog */ - if (vs_needs_prolog) { - union si_shader_part_key vs_prolog_key; - si_get_vs_prolog_key(&ls->info, - shader_ls.info.num_input_sgprs, - false, - &shader->key.part.tcs.ls_prolog, - shader, &vs_prolog_key); - vs_prolog_key.vs_prolog.is_monolithic = true; - si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); - parts[0] = ctx.main_fn; - } - - /* Reset the shader context. */ - ctx.shader = shader; - ctx.type = PIPE_SHADER_TESS_CTRL; - - si_build_wrapper_function(&ctx, - parts + !vs_needs_prolog, - 4 - !vs_needs_prolog, vs_needs_prolog, - vs_needs_prolog ? 
2 : 1); - } else { - LLVMValueRef parts[2]; - union si_shader_part_key epilog_key; - - parts[0] = ctx.main_fn; - - memset(&epilog_key, 0, sizeof(epilog_key)); - epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; - si_llvm_build_tcs_epilog(&ctx, &epilog_key); - parts[1] = ctx.main_fn; - - si_build_wrapper_function(&ctx, parts, 2, 0, 0); - } - } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) { - if (ctx.screen->info.chip_class >= GFX9) { - struct si_shader_selector *es = shader->key.part.gs.es; - LLVMValueRef es_prolog = NULL; - LLVMValueRef es_main = NULL; - LLVMValueRef gs_prolog = NULL; - LLVMValueRef gs_main = ctx.main_fn; - - /* GS prolog */ - union si_shader_part_key gs_prolog_key; - memset(&gs_prolog_key, 0, sizeof(gs_prolog_key)); - gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - gs_prolog_key.gs_prolog.is_monolithic = true; - gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; - si_llvm_build_gs_prolog(&ctx, &gs_prolog_key); - gs_prolog = ctx.main_fn; - - /* ES main part */ - nir = get_nir_shader(es, &free_nir); - struct si_shader shader_es = {}; - shader_es.selector = es; - shader_es.key.as_es = 1; - shader_es.key.as_ngg = shader->key.as_ngg; - shader_es.key.mono = shader->key.mono; - shader_es.key.opt = shader->key.opt; - shader_es.is_monolithic = true; - - if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) { - si_llvm_dispose(&ctx); - return false; - } - shader->info.uses_instanceid |= es->info.uses_instanceid; - es_main = ctx.main_fn; - - /* ES prolog */ - if (es->type == PIPE_SHADER_VERTEX && - si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, - &shader->key, false)) { - union si_shader_part_key vs_prolog_key; - si_get_vs_prolog_key(&es->info, - shader_es.info.num_input_sgprs, - false, - &shader->key.part.gs.vs_prolog, - shader, &vs_prolog_key); - vs_prolog_key.vs_prolog.is_monolithic = true; - si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); - es_prolog = ctx.main_fn; - } - - /* Reset the shader context. */ - ctx.shader = shader; - ctx.type = PIPE_SHADER_GEOMETRY; - - /* Prepare the array of shader parts. */ - LLVMValueRef parts[4]; - unsigned num_parts = 0, main_part, next_first_part; - - if (es_prolog) - parts[num_parts++] = es_prolog; - - parts[main_part = num_parts++] = es_main; - parts[next_first_part = num_parts++] = gs_prolog; - parts[num_parts++] = gs_main; - - si_build_wrapper_function(&ctx, parts, num_parts, - main_part, next_first_part); - } else { - LLVMValueRef parts[2]; - union si_shader_part_key prolog_key; - - parts[1] = ctx.main_fn; - - memset(&prolog_key, 0, sizeof(prolog_key)); - prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - si_llvm_build_gs_prolog(&ctx, &prolog_key); - parts[0] = ctx.main_fn; - - si_build_wrapper_function(&ctx, parts, 2, 1, 0); - } - } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) { - si_llvm_build_monolithic_ps(&ctx, shader); - } - - si_llvm_optimize_module(&ctx); - - /* Post-optimization transformations and analysis. */ - si_optimize_vs_outputs(&ctx); - - if ((debug && debug->debug_message) || - si_can_dump_shader(sscreen, ctx.type)) { - ctx.shader->info.private_mem_vgprs = - ac_count_scratch_private_memory(ctx.main_fn); - } - - /* Make sure the input is a pointer and not integer followed by inttoptr. */ - assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == - LLVMPointerTypeKind); - - /* Compile to bytecode. 
*/ - if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, - &ctx.ac, debug, ctx.type, si_get_shader_name(shader), - si_should_optimize_less(compiler, shader->selector))) { - si_llvm_dispose(&ctx); - fprintf(stderr, "LLVM failed to compile shader\n"); - return false; - } - - si_llvm_dispose(&ctx); - return true; + struct si_shader_selector *sel = shader->selector; + struct si_shader_context ctx; + + si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader)); + + LLVMValueRef ngg_cull_main_fn = NULL; + if (shader->key.opt.ngg_culling) { + if (!si_build_main_function(&ctx, shader, nir, false, true)) { + si_llvm_dispose(&ctx); + return false; + } + ngg_cull_main_fn = ctx.main_fn; + ctx.main_fn = NULL; + } + + if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) { + si_llvm_dispose(&ctx); + return false; + } + + if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) { + LLVMValueRef parts[4]; + unsigned num_parts = 0; + bool has_prolog = false; + LLVMValueRef main_fn = ctx.main_fn; + + if (ngg_cull_main_fn) { + if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, true)) { + union si_shader_part_key prolog_key; + si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true, + &shader->key.part.vs.prolog, shader, &prolog_key); + prolog_key.vs_prolog.is_monolithic = true; + si_llvm_build_vs_prolog(&ctx, &prolog_key); + parts[num_parts++] = ctx.main_fn; + has_prolog = true; + } + parts[num_parts++] = ngg_cull_main_fn; + } + + if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, false)) { + union si_shader_part_key prolog_key; + si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false, + &shader->key.part.vs.prolog, shader, &prolog_key); + prolog_key.vs_prolog.is_monolithic = true; + si_llvm_build_vs_prolog(&ctx, &prolog_key); + parts[num_parts++] = ctx.main_fn; + has_prolog = true; + } + parts[num_parts++] = main_fn; + + si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 
1 : 0, 0); + + if (ctx.shader->key.opt.vs_as_prim_discard_cs) + si_build_prim_discard_compute_shader(&ctx); + } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL && ngg_cull_main_fn) { + LLVMValueRef parts[2]; + + parts[0] = ngg_cull_main_fn; + parts[1] = ctx.main_fn; + + si_build_wrapper_function(&ctx, parts, 2, 0, 0); + } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { + if (sscreen->info.chip_class >= GFX9) { + struct si_shader_selector *ls = shader->key.part.tcs.ls; + LLVMValueRef parts[4]; + bool vs_needs_prolog = + si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, &shader->key, false); + + /* TCS main part */ + parts[2] = ctx.main_fn; + + /* TCS epilog */ + union si_shader_part_key tcs_epilog_key; + memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key)); + tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; + si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key); + parts[3] = ctx.main_fn; + + /* VS as LS main part */ + nir = get_nir_shader(ls, &free_nir); + struct si_shader shader_ls = {}; + shader_ls.selector = ls; + shader_ls.key.as_ls = 1; + shader_ls.key.mono = shader->key.mono; + shader_ls.key.opt = shader->key.opt; + shader_ls.is_monolithic = true; + + if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) { + si_llvm_dispose(&ctx); + return false; + } + shader->info.uses_instanceid |= ls->info.uses_instanceid; + parts[1] = ctx.main_fn; + + /* LS prolog */ + if (vs_needs_prolog) { + union si_shader_part_key vs_prolog_key; + si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false, + &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key); + vs_prolog_key.vs_prolog.is_monolithic = true; + si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); + parts[0] = ctx.main_fn; + } + + /* Reset the shader context. */ + ctx.shader = shader; + ctx.type = PIPE_SHADER_TESS_CTRL; + + si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog, + vs_needs_prolog, vs_needs_prolog ? 
2 : 1); + } else { + LLVMValueRef parts[2]; + union si_shader_part_key epilog_key; + + parts[0] = ctx.main_fn; + + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; + si_llvm_build_tcs_epilog(&ctx, &epilog_key); + parts[1] = ctx.main_fn; + + si_build_wrapper_function(&ctx, parts, 2, 0, 0); + } + } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) { + if (ctx.screen->info.chip_class >= GFX9) { + struct si_shader_selector *es = shader->key.part.gs.es; + LLVMValueRef es_prolog = NULL; + LLVMValueRef es_main = NULL; + LLVMValueRef gs_prolog = NULL; + LLVMValueRef gs_main = ctx.main_fn; + + /* GS prolog */ + union si_shader_part_key gs_prolog_key; + memset(&gs_prolog_key, 0, sizeof(gs_prolog_key)); + gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog; + gs_prolog_key.gs_prolog.is_monolithic = true; + gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; + si_llvm_build_gs_prolog(&ctx, &gs_prolog_key); + gs_prolog = ctx.main_fn; + + /* ES main part */ + nir = get_nir_shader(es, &free_nir); + struct si_shader shader_es = {}; + shader_es.selector = es; + shader_es.key.as_es = 1; + shader_es.key.as_ngg = shader->key.as_ngg; + shader_es.key.mono = shader->key.mono; + shader_es.key.opt = shader->key.opt; + shader_es.is_monolithic = true; + + if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) { + si_llvm_dispose(&ctx); + return false; + } + shader->info.uses_instanceid |= es->info.uses_instanceid; + es_main = ctx.main_fn; + + /* ES prolog */ + if (es->type == PIPE_SHADER_VERTEX && + si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, &shader->key, false)) { + union si_shader_part_key vs_prolog_key; + si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false, + &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key); + vs_prolog_key.vs_prolog.is_monolithic = true; + si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); + es_prolog = ctx.main_fn; + } + + /* Reset the shader context. */ + ctx.shader = shader; + ctx.type = PIPE_SHADER_GEOMETRY; + + /* Prepare the array of shader parts. */ + LLVMValueRef parts[4]; + unsigned num_parts = 0, main_part, next_first_part; + + if (es_prolog) + parts[num_parts++] = es_prolog; + + parts[main_part = num_parts++] = es_main; + parts[next_first_part = num_parts++] = gs_prolog; + parts[num_parts++] = gs_main; + + si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part); + } else { + LLVMValueRef parts[2]; + union si_shader_part_key prolog_key; + + parts[1] = ctx.main_fn; + + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.gs_prolog.states = shader->key.part.gs.prolog; + si_llvm_build_gs_prolog(&ctx, &prolog_key); + parts[0] = ctx.main_fn; + + si_build_wrapper_function(&ctx, parts, 2, 1, 0); + } + } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) { + si_llvm_build_monolithic_ps(&ctx, shader); + } + + si_llvm_optimize_module(&ctx); + + /* Post-optimization transformations and analysis. */ + si_optimize_vs_outputs(&ctx); + + if ((debug && debug->debug_message) || si_can_dump_shader(sscreen, ctx.type)) { + ctx.shader->info.private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_fn); + } + + /* Make sure the input is a pointer and not integer followed by inttoptr. */ + assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind); + + /* Compile to bytecode. 
*/ + if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug, + ctx.type, si_get_shader_name(shader), + si_should_optimize_less(compiler, shader->selector))) { + si_llvm_dispose(&ctx); + fprintf(stderr, "LLVM failed to compile shader\n"); + return false; + } + + si_llvm_dispose(&ctx); + return true; } -bool si_compile_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug) { - struct si_shader_selector *sel = shader->selector; - bool free_nir; - struct nir_shader *nir = get_nir_shader(sel, &free_nir); - - /* Dump NIR before doing NIR->LLVM conversion in case the - * conversion fails. */ - if (si_can_dump_shader(sscreen, sel->type) && - !(sscreen->debug_flags & DBG(NO_NIR))) { - nir_print_shader(nir, stderr); - si_dump_streamout(&sel->so); - } - - memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, - sizeof(shader->info.vs_output_param_offset)); - - shader->info.uses_instanceid = sel->info.uses_instanceid; - - /* TODO: ACO could compile non-monolithic shaders here (starting - * with PS and NGG VS), but monolithic shaders should be compiled - * by LLVM due to more complicated compilation. - */ - if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir)) - return false; - - /* Validate SGPR and VGPR usage for compute to detect compiler bugs. - * LLVM 3.9svn has this bug. - */ - if (sel->type == PIPE_SHADER_COMPUTE) { - unsigned wave_size = sscreen->compute_wave_size; - unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd * - (wave_size == 32 ? 2 : 1); - unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd; - unsigned max_sgprs_per_wave = 128; - unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */ - unsigned threads_per_tg = si_get_max_workgroup_size(shader); - unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size); - unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg); - - max_vgprs = max_vgprs / waves_per_simd; - max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave); - - if (shader->config.num_sgprs > max_sgprs || - shader->config.num_vgprs > max_vgprs) { - fprintf(stderr, "LLVM failed to compile a shader correctly: " - "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n", - shader->config.num_sgprs, shader->config.num_vgprs, - max_sgprs, max_vgprs); - - /* Just terminate the process, because dependent - * shaders can hang due to bad input data, but use - * the env var to allow shader-db to work. - */ - if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false)) - abort(); - } - } - - /* Add the scratch offset to input SGPRs. */ - if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader)) - shader->info.num_input_sgprs += 1; /* scratch byte offset */ - - /* Calculate the number of fragment input VGPRs. 
*/ - if (sel->type == PIPE_SHADER_FRAGMENT) { - shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(&shader->config, - &shader->info.face_vgpr_index, - &shader->info.ancillary_vgpr_index); - } - - si_calculate_max_simd_waves(shader); - si_shader_dump_stats_for_shader_db(sscreen, shader, debug); - return true; + struct si_shader_selector *sel = shader->selector; + bool free_nir; + struct nir_shader *nir = get_nir_shader(sel, &free_nir); + + /* Dump NIR before doing NIR->LLVM conversion in case the + * conversion fails. */ + if (si_can_dump_shader(sscreen, sel->type) && !(sscreen->debug_flags & DBG(NO_NIR))) { + nir_print_shader(nir, stderr); + si_dump_streamout(&sel->so); + } + + memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, + sizeof(shader->info.vs_output_param_offset)); + + shader->info.uses_instanceid = sel->info.uses_instanceid; + + /* TODO: ACO could compile non-monolithic shaders here (starting + * with PS and NGG VS), but monolithic shaders should be compiled + * by LLVM due to more complicated compilation. + */ + if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir)) + return false; + + /* Validate SGPR and VGPR usage for compute to detect compiler bugs. + * LLVM 3.9svn has this bug. + */ + if (sel->type == PIPE_SHADER_COMPUTE) { + unsigned wave_size = sscreen->compute_wave_size; + unsigned max_vgprs = + sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1); + unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd; + unsigned max_sgprs_per_wave = 128; + unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */ + unsigned threads_per_tg = si_get_max_workgroup_size(shader); + unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size); + unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg); + + max_vgprs = max_vgprs / waves_per_simd; + max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave); + + if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) { + fprintf(stderr, + "LLVM failed to compile a shader correctly: " + "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n", + shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs); + + /* Just terminate the process, because dependent + * shaders can hang due to bad input data, but use + * the env var to allow shader-db to work. + */ + if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false)) + abort(); + } + } + + /* Add the scratch offset to input SGPRs. */ + if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader)) + shader->info.num_input_sgprs += 1; /* scratch byte offset */ + + /* Calculate the number of fragment input VGPRs. 
*/ + if (sel->type == PIPE_SHADER_FRAGMENT) { + shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt( + &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index); + } + + si_calculate_max_simd_waves(shader); + si_shader_dump_stats_for_shader_db(sscreen, shader, debug); + return true; } /** @@ -2057,335 +1905,300 @@ bool si_compile_shader(struct si_screen *sscreen, * \return non-NULL on success */ static struct si_shader_part * -si_get_shader_part(struct si_screen *sscreen, - struct si_shader_part **list, - enum pipe_shader_type type, - bool prolog, - union si_shader_part_key *key, - struct ac_llvm_compiler *compiler, - struct pipe_debug_callback *debug, - void (*build)(struct si_shader_context *, - union si_shader_part_key *), - const char *name) +si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, + enum pipe_shader_type type, bool prolog, union si_shader_part_key *key, + struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug, + void (*build)(struct si_shader_context *, union si_shader_part_key *), + const char *name) { - struct si_shader_part *result; - - simple_mtx_lock(&sscreen->shader_parts_mutex); - - /* Find existing. */ - for (result = *list; result; result = result->next) { - if (memcmp(&result->key, key, sizeof(*key)) == 0) { - simple_mtx_unlock(&sscreen->shader_parts_mutex); - return result; - } - } - - /* Compile a new one. */ - result = CALLOC_STRUCT(si_shader_part); - result->key = *key; - - struct si_shader_selector sel = {}; - sel.screen = sscreen; - - struct si_shader shader = {}; - shader.selector = &sel; - - switch (type) { - case PIPE_SHADER_VERTEX: - shader.key.as_ls = key->vs_prolog.as_ls; - shader.key.as_es = key->vs_prolog.as_es; - shader.key.as_ngg = key->vs_prolog.as_ngg; - shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs; - break; - case PIPE_SHADER_TESS_CTRL: - assert(!prolog); - shader.key.part.tcs.epilog = key->tcs_epilog.states; - break; - case PIPE_SHADER_GEOMETRY: - assert(prolog); - shader.key.as_ngg = key->gs_prolog.as_ngg; - break; - case PIPE_SHADER_FRAGMENT: - if (prolog) - shader.key.part.ps.prolog = key->ps_prolog.states; - else - shader.key.part.ps.epilog = key->ps_epilog.states; - break; - default: - unreachable("bad shader part"); - } - - struct si_shader_context ctx; - si_llvm_context_init(&ctx, sscreen, compiler, - si_get_wave_size(sscreen, type, shader.key.as_ngg, - shader.key.as_es, - shader.key.opt.vs_as_prim_discard_cs)); - ctx.shader = &shader; - ctx.type = type; - - build(&ctx, key); - - /* Compile. */ - si_llvm_optimize_module(&ctx); - - if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, - &ctx.ac, debug, ctx.type, name, false)) { - FREE(result); - result = NULL; - goto out; - } - - result->next = *list; - *list = result; + struct si_shader_part *result; + + simple_mtx_lock(&sscreen->shader_parts_mutex); + + /* Find existing. */ + for (result = *list; result; result = result->next) { + if (memcmp(&result->key, key, sizeof(*key)) == 0) { + simple_mtx_unlock(&sscreen->shader_parts_mutex); + return result; + } + } + + /* Compile a new one. 
*/ + result = CALLOC_STRUCT(si_shader_part); + result->key = *key; + + struct si_shader_selector sel = {}; + sel.screen = sscreen; + + struct si_shader shader = {}; + shader.selector = &sel; + + switch (type) { + case PIPE_SHADER_VERTEX: + shader.key.as_ls = key->vs_prolog.as_ls; + shader.key.as_es = key->vs_prolog.as_es; + shader.key.as_ngg = key->vs_prolog.as_ngg; + shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs; + break; + case PIPE_SHADER_TESS_CTRL: + assert(!prolog); + shader.key.part.tcs.epilog = key->tcs_epilog.states; + break; + case PIPE_SHADER_GEOMETRY: + assert(prolog); + shader.key.as_ngg = key->gs_prolog.as_ngg; + break; + case PIPE_SHADER_FRAGMENT: + if (prolog) + shader.key.part.ps.prolog = key->ps_prolog.states; + else + shader.key.part.ps.epilog = key->ps_epilog.states; + break; + default: + unreachable("bad shader part"); + } + + struct si_shader_context ctx; + si_llvm_context_init(&ctx, sscreen, compiler, + si_get_wave_size(sscreen, type, shader.key.as_ngg, shader.key.as_es, + shader.key.opt.vs_as_prim_discard_cs)); + ctx.shader = &shader; + ctx.type = type; + + build(&ctx, key); + + /* Compile. */ + si_llvm_optimize_module(&ctx); + + if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug, + ctx.type, name, false)) { + FREE(result); + result = NULL; + goto out; + } + + result->next = *list; + *list = result; out: - si_llvm_dispose(&ctx); - simple_mtx_unlock(&sscreen->shader_parts_mutex); - return result; + si_llvm_dispose(&ctx); + simple_mtx_unlock(&sscreen->shader_parts_mutex); + return result; } -static bool si_get_vs_prolog(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug, - struct si_shader *main_part, - const struct si_vs_prolog_bits *key) +static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug, + struct si_shader *main_part, const struct si_vs_prolog_bits *key) { - struct si_shader_selector *vs = main_part->selector; - - if (!si_vs_needs_prolog(vs, key, &shader->key, false)) - return true; - - /* Get the prolog. */ - union si_shader_part_key prolog_key; - si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, - key, shader, &prolog_key); - - shader->prolog = - si_get_shader_part(sscreen, &sscreen->vs_prologs, - PIPE_SHADER_VERTEX, true, &prolog_key, compiler, - debug, si_llvm_build_vs_prolog, - "Vertex Shader Prolog"); - return shader->prolog != NULL; + struct si_shader_selector *vs = main_part->selector; + + if (!si_vs_needs_prolog(vs, key, &shader->key, false)) + return true; + + /* Get the prolog. */ + union si_shader_part_key prolog_key; + si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader, + &prolog_key); + + shader->prolog = + si_get_shader_part(sscreen, &sscreen->vs_prologs, PIPE_SHADER_VERTEX, true, &prolog_key, + compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog"); + return shader->prolog != NULL; } /** * Select and compile (or reuse) vertex shader parts (prolog & epilog). 
*/ -static bool si_shader_select_vs_parts(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug) { - return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, - &shader->key.part.vs.prolog); + return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog); } /** * Select and compile (or reuse) TCS parts (epilog). */ -static bool si_shader_select_tcs_parts(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug) { - if (sscreen->info.chip_class >= GFX9) { - struct si_shader *ls_main_part = - shader->key.part.tcs.ls->main_shader_part_ls; - - if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part, - &shader->key.part.tcs.ls_prolog)) - return false; - - shader->previous_stage = ls_main_part; - } - - /* Get the epilog. */ - union si_shader_part_key epilog_key; - memset(&epilog_key, 0, sizeof(epilog_key)); - epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; - - shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, - PIPE_SHADER_TESS_CTRL, false, - &epilog_key, compiler, debug, - si_llvm_build_tcs_epilog, - "Tessellation Control Shader Epilog"); - return shader->epilog != NULL; + if (sscreen->info.chip_class >= GFX9) { + struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls; + + if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part, + &shader->key.part.tcs.ls_prolog)) + return false; + + shader->previous_stage = ls_main_part; + } + + /* Get the epilog. */ + union si_shader_part_key epilog_key; + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; + + shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, PIPE_SHADER_TESS_CTRL, false, + &epilog_key, compiler, debug, si_llvm_build_tcs_epilog, + "Tessellation Control Shader Epilog"); + return shader->epilog != NULL; } /** * Select and compile (or reuse) GS parts (prolog). 
*/ -static bool si_shader_select_gs_parts(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug) { - if (sscreen->info.chip_class >= GFX9) { - struct si_shader *es_main_part; - enum pipe_shader_type es_type = shader->key.part.gs.es->type; - - if (shader->key.as_ngg) - es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es; - else - es_main_part = shader->key.part.gs.es->main_shader_part_es; - - if (es_type == PIPE_SHADER_VERTEX && - !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part, - &shader->key.part.gs.vs_prolog)) - return false; - - shader->previous_stage = es_main_part; - } - - if (!shader->key.part.gs.prolog.tri_strip_adj_fix) - return true; - - union si_shader_part_key prolog_key; - memset(&prolog_key, 0, sizeof(prolog_key)); - prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; - - shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs, - PIPE_SHADER_GEOMETRY, true, - &prolog_key, compiler, debug, - si_llvm_build_gs_prolog, - "Geometry Shader Prolog"); - return shader->prolog2 != NULL; + if (sscreen->info.chip_class >= GFX9) { + struct si_shader *es_main_part; + enum pipe_shader_type es_type = shader->key.part.gs.es->type; + + if (shader->key.as_ngg) + es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es; + else + es_main_part = shader->key.part.gs.es->main_shader_part_es; + + if (es_type == PIPE_SHADER_VERTEX && + !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part, + &shader->key.part.gs.vs_prolog)) + return false; + + shader->previous_stage = es_main_part; + } + + if (!shader->key.part.gs.prolog.tri_strip_adj_fix) + return true; + + union si_shader_part_key prolog_key; + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.gs_prolog.states = shader->key.part.gs.prolog; + prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; + + shader->prolog2 = + si_get_shader_part(sscreen, &sscreen->gs_prologs, PIPE_SHADER_GEOMETRY, true, &prolog_key, + compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog"); + return shader->prolog2 != NULL; } /** * Compute the PS prolog key, which contains all the information needed to * build the PS prolog function, and set related bits in shader->config. 
*/ -void si_get_ps_prolog_key(struct si_shader *shader, - union si_shader_part_key *key, - bool separate_prolog) +void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key, + bool separate_prolog) { - struct si_shader_info *info = &shader->selector->info; - - memset(key, 0, sizeof(*key)); - key->ps_prolog.states = shader->key.part.ps.prolog; - key->ps_prolog.colors_read = info->colors_read; - key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; - key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; - key->ps_prolog.wqm = info->uses_derivatives && - (key->ps_prolog.colors_read || - key->ps_prolog.states.force_persp_sample_interp || - key->ps_prolog.states.force_linear_sample_interp || - key->ps_prolog.states.force_persp_center_interp || - key->ps_prolog.states.force_linear_center_interp || - key->ps_prolog.states.bc_optimize_for_persp || - key->ps_prolog.states.bc_optimize_for_linear); - key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index; - - if (info->colors_read) { - unsigned *color = shader->selector->color_attr_index; - - if (shader->key.part.ps.prolog.color_two_side) { - /* BCOLORs are stored after the last input. */ - key->ps_prolog.num_interp_inputs = info->num_inputs; - key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; - if (separate_prolog) - shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); - } - - for (unsigned i = 0; i < 2; i++) { - unsigned interp = info->input_interpolate[color[i]]; - unsigned location = info->input_interpolate_loc[color[i]]; - - if (!(info->colors_read & (0xf << i*4))) - continue; - - key->ps_prolog.color_attr_index[i] = color[i]; - - if (shader->key.part.ps.prolog.flatshade_colors && - interp == TGSI_INTERPOLATE_COLOR) - interp = TGSI_INTERPOLATE_CONSTANT; - - switch (interp) { - case TGSI_INTERPOLATE_CONSTANT: - key->ps_prolog.color_interp_vgpr_index[i] = -1; - break; - case TGSI_INTERPOLATE_PERSPECTIVE: - case TGSI_INTERPOLATE_COLOR: - /* Force the interpolation location for colors here. */ - if (shader->key.part.ps.prolog.force_persp_sample_interp) - location = TGSI_INTERPOLATE_LOC_SAMPLE; - if (shader->key.part.ps.prolog.force_persp_center_interp) - location = TGSI_INTERPOLATE_LOC_CENTER; - - switch (location) { - case TGSI_INTERPOLATE_LOC_SAMPLE: - key->ps_prolog.color_interp_vgpr_index[i] = 0; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_PERSP_SAMPLE_ENA(1); - } - break; - case TGSI_INTERPOLATE_LOC_CENTER: - key->ps_prolog.color_interp_vgpr_index[i] = 2; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_PERSP_CENTER_ENA(1); - } - break; - case TGSI_INTERPOLATE_LOC_CENTROID: - key->ps_prolog.color_interp_vgpr_index[i] = 4; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_PERSP_CENTROID_ENA(1); - } - break; - default: - assert(0); - } - break; - case TGSI_INTERPOLATE_LINEAR: - /* Force the interpolation location for colors here. */ - if (shader->key.part.ps.prolog.force_linear_sample_interp) - location = TGSI_INTERPOLATE_LOC_SAMPLE; - if (shader->key.part.ps.prolog.force_linear_center_interp) - location = TGSI_INTERPOLATE_LOC_CENTER; - - /* The VGPR assignment for non-monolithic shaders - * works because InitialPSInputAddr is set on the - * main shader and PERSP_PULL_MODEL is never used. - */ - switch (location) { - case TGSI_INTERPOLATE_LOC_SAMPLE: - key->ps_prolog.color_interp_vgpr_index[i] = - separate_prolog ? 
6 : 9; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_LINEAR_SAMPLE_ENA(1); - } - break; - case TGSI_INTERPOLATE_LOC_CENTER: - key->ps_prolog.color_interp_vgpr_index[i] = - separate_prolog ? 8 : 11; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_LINEAR_CENTER_ENA(1); - } - break; - case TGSI_INTERPOLATE_LOC_CENTROID: - key->ps_prolog.color_interp_vgpr_index[i] = - separate_prolog ? 10 : 13; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_LINEAR_CENTROID_ENA(1); - } - break; - default: - assert(0); - } - break; - default: - assert(0); - } - } - } + struct si_shader_info *info = &shader->selector->info; + + memset(key, 0, sizeof(*key)); + key->ps_prolog.states = shader->key.part.ps.prolog; + key->ps_prolog.colors_read = info->colors_read; + key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; + key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; + key->ps_prolog.wqm = + info->uses_derivatives && + (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp || + key->ps_prolog.states.force_linear_sample_interp || + key->ps_prolog.states.force_persp_center_interp || + key->ps_prolog.states.force_linear_center_interp || + key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear); + key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index; + + if (info->colors_read) { + unsigned *color = shader->selector->color_attr_index; + + if (shader->key.part.ps.prolog.color_two_side) { + /* BCOLORs are stored after the last input. */ + key->ps_prolog.num_interp_inputs = info->num_inputs; + key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; + if (separate_prolog) + shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); + } + + for (unsigned i = 0; i < 2; i++) { + unsigned interp = info->input_interpolate[color[i]]; + unsigned location = info->input_interpolate_loc[color[i]]; + + if (!(info->colors_read & (0xf << i * 4))) + continue; + + key->ps_prolog.color_attr_index[i] = color[i]; + + if (shader->key.part.ps.prolog.flatshade_colors && interp == TGSI_INTERPOLATE_COLOR) + interp = TGSI_INTERPOLATE_CONSTANT; + + switch (interp) { + case TGSI_INTERPOLATE_CONSTANT: + key->ps_prolog.color_interp_vgpr_index[i] = -1; + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + case TGSI_INTERPOLATE_COLOR: + /* Force the interpolation location for colors here. */ + if (shader->key.part.ps.prolog.force_persp_sample_interp) + location = TGSI_INTERPOLATE_LOC_SAMPLE; + if (shader->key.part.ps.prolog.force_persp_center_interp) + location = TGSI_INTERPOLATE_LOC_CENTER; + + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + key->ps_prolog.color_interp_vgpr_index[i] = 0; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); + } + break; + case TGSI_INTERPOLATE_LOC_CENTER: + key->ps_prolog.color_interp_vgpr_index[i] = 2; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); + } + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + key->ps_prolog.color_interp_vgpr_index[i] = 4; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1); + } + break; + default: + assert(0); + } + break; + case TGSI_INTERPOLATE_LINEAR: + /* Force the interpolation location for colors here. 
*/ + if (shader->key.part.ps.prolog.force_linear_sample_interp) + location = TGSI_INTERPOLATE_LOC_SAMPLE; + if (shader->key.part.ps.prolog.force_linear_center_interp) + location = TGSI_INTERPOLATE_LOC_CENTER; + + /* The VGPR assignment for non-monolithic shaders + * works because InitialPSInputAddr is set on the + * main shader and PERSP_PULL_MODEL is never used. + */ + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); + } + break; + case TGSI_INTERPOLATE_LOC_CENTER: + key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); + } + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1); + } + break; + default: + assert(0); + } + break; + default: + assert(0); + } + } + } } /** @@ -2393,331 +2206,308 @@ void si_get_ps_prolog_key(struct si_shader *shader, */ bool si_need_ps_prolog(const union si_shader_part_key *key) { - return key->ps_prolog.colors_read || - key->ps_prolog.states.force_persp_sample_interp || - key->ps_prolog.states.force_linear_sample_interp || - key->ps_prolog.states.force_persp_center_interp || - key->ps_prolog.states.force_linear_center_interp || - key->ps_prolog.states.bc_optimize_for_persp || - key->ps_prolog.states.bc_optimize_for_linear || - key->ps_prolog.states.poly_stipple || - key->ps_prolog.states.samplemask_log_ps_iter; + return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp || + key->ps_prolog.states.force_linear_sample_interp || + key->ps_prolog.states.force_persp_center_interp || + key->ps_prolog.states.force_linear_center_interp || + key->ps_prolog.states.bc_optimize_for_persp || + key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple || + key->ps_prolog.states.samplemask_log_ps_iter; } /** * Compute the PS epilog key, which contains all the information needed to * build the PS epilog function. */ -void si_get_ps_epilog_key(struct si_shader *shader, - union si_shader_part_key *key) +void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key) { - struct si_shader_info *info = &shader->selector->info; - memset(key, 0, sizeof(*key)); - key->ps_epilog.colors_written = info->colors_written; - key->ps_epilog.writes_z = info->writes_z; - key->ps_epilog.writes_stencil = info->writes_stencil; - key->ps_epilog.writes_samplemask = info->writes_samplemask; - key->ps_epilog.states = shader->key.part.ps.epilog; + struct si_shader_info *info = &shader->selector->info; + memset(key, 0, sizeof(*key)); + key->ps_epilog.colors_written = info->colors_written; + key->ps_epilog.writes_z = info->writes_z; + key->ps_epilog.writes_stencil = info->writes_stencil; + key->ps_epilog.writes_samplemask = info->writes_samplemask; + key->ps_epilog.states = shader->key.part.ps.epilog; } /** * Select and compile (or reuse) pixel shader parts (prolog & epilog). 
*/ -static bool si_shader_select_ps_parts(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug) { - union si_shader_part_key prolog_key; - union si_shader_part_key epilog_key; - - /* Get the prolog. */ - si_get_ps_prolog_key(shader, &prolog_key, true); - - /* The prolog is a no-op if these aren't set. */ - if (si_need_ps_prolog(&prolog_key)) { - shader->prolog = - si_get_shader_part(sscreen, &sscreen->ps_prologs, - PIPE_SHADER_FRAGMENT, true, - &prolog_key, compiler, debug, - si_llvm_build_ps_prolog, - "Fragment Shader Prolog"); - if (!shader->prolog) - return false; - } - - /* Get the epilog. */ - si_get_ps_epilog_key(shader, &epilog_key); - - shader->epilog = - si_get_shader_part(sscreen, &sscreen->ps_epilogs, - PIPE_SHADER_FRAGMENT, false, - &epilog_key, compiler, debug, - si_llvm_build_ps_epilog, - "Fragment Shader Epilog"); - if (!shader->epilog) - return false; - - /* Enable POS_FIXED_PT if polygon stippling is enabled. */ - if (shader->key.part.ps.prolog.poly_stipple) { - shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); - assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); - } - - /* Set up the enable bits for per-sample shading if needed. */ - if (shader->key.part.ps.prolog.force_persp_sample_interp && - (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || - G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { - shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; - shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; - shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); - } - if (shader->key.part.ps.prolog.force_linear_sample_interp && - (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || - G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { - shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; - shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; - shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); - } - if (shader->key.part.ps.prolog.force_persp_center_interp && - (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || - G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { - shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; - shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; - shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); - } - if (shader->key.part.ps.prolog.force_linear_center_interp && - (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || - G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { - shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; - shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; - shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); - } - - /* POW_W_FLOAT requires that one of the perspective weights is enabled. */ - if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && - !(shader->config.spi_ps_input_ena & 0xf)) { - shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); - assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); - } - - /* At least one pair of interpolation weights must be enabled. 
*/ - if (!(shader->config.spi_ps_input_ena & 0x7f)) { - shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); - assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); - } - - /* Samplemask fixup requires the sample ID. */ - if (shader->key.part.ps.prolog.samplemask_log_ps_iter) { - shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); - assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)); - } - - /* The sample mask input is always enabled, because the API shader always - * passes it through to the epilog. Disable it here if it's unused. - */ - if (!shader->key.part.ps.epilog.poly_line_smoothing && - !shader->selector->info.reads_samplemask) - shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; - - return true; + union si_shader_part_key prolog_key; + union si_shader_part_key epilog_key; + + /* Get the prolog. */ + si_get_ps_prolog_key(shader, &prolog_key, true); + + /* The prolog is a no-op if these aren't set. */ + if (si_need_ps_prolog(&prolog_key)) { + shader->prolog = + si_get_shader_part(sscreen, &sscreen->ps_prologs, PIPE_SHADER_FRAGMENT, true, &prolog_key, + compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog"); + if (!shader->prolog) + return false; + } + + /* Get the epilog. */ + si_get_ps_epilog_key(shader, &epilog_key); + + shader->epilog = + si_get_shader_part(sscreen, &sscreen->ps_epilogs, PIPE_SHADER_FRAGMENT, false, &epilog_key, + compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog"); + if (!shader->epilog) + return false; + + /* Enable POS_FIXED_PT if polygon stippling is enabled. */ + if (shader->key.part.ps.prolog.poly_stipple) { + shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); + assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); + } + + /* Set up the enable bits for per-sample shading if needed. */ + if (shader->key.part.ps.prolog.force_persp_sample_interp && + (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); + } + if (shader->key.part.ps.prolog.force_linear_sample_interp && + (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); + } + if (shader->key.part.ps.prolog.force_persp_center_interp && + (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); + } + if (shader->key.part.ps.prolog.force_linear_center_interp && + (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); + } + + /* POW_W_FLOAT requires that one of the perspective weights is enabled. 
*/ + if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && + !(shader->config.spi_ps_input_ena & 0xf)) { + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); + assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); + } + + /* At least one pair of interpolation weights must be enabled. */ + if (!(shader->config.spi_ps_input_ena & 0x7f)) { + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); + assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); + } + + /* Samplemask fixup requires the sample ID. */ + if (shader->key.part.ps.prolog.samplemask_log_ps_iter) { + shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); + assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)); + } + + /* The sample mask input is always enabled, because the API shader always + * passes it through to the epilog. Disable it here if it's unused. + */ + if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask) + shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; + + return true; } -void si_multiwave_lds_size_workaround(struct si_screen *sscreen, - unsigned *lds_size) +void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size) { - /* If tessellation is all offchip and on-chip GS isn't used, this - * workaround is not needed. - */ - return; - - /* SPI barrier management bug: - * Make sure we have at least 4k of LDS in use to avoid the bug. - * It applies to workgroup sizes of more than one wavefront. - */ - if (sscreen->info.family == CHIP_BONAIRE || - sscreen->info.family == CHIP_KABINI) - *lds_size = MAX2(*lds_size, 8); + /* If tessellation is all offchip and on-chip GS isn't used, this + * workaround is not needed. + */ + return; + + /* SPI barrier management bug: + * Make sure we have at least 4k of LDS in use to avoid the bug. + * It applies to workgroup sizes of more than one wavefront. + */ + if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI) + *lds_size = MAX2(*lds_size, 8); } void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader) { - unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ + unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ - shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); + shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); - if (shader->selector->type == PIPE_SHADER_COMPUTE && - si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) { - si_multiwave_lds_size_workaround(sscreen, - &shader->config.lds_size); - } + if (shader->selector->type == PIPE_SHADER_COMPUTE && + si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) { + si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size); + } } -bool si_create_shader_variant(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug) { - struct si_shader_selector *sel = shader->selector; - struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key); - - /* LS, ES, VS are compiled on demand if the main part hasn't been - * compiled for that stage. - * - * GS are compiled on demand if the main part hasn't been compiled - * for the chosen NGG-ness. 
- * - * Vertex shaders are compiled on demand when a vertex fetch - * workaround must be applied. - */ - if (shader->is_monolithic) { - /* Monolithic shader (compiled as a whole, has many variants, - * may take a long time to compile). - */ - if (!si_compile_shader(sscreen, compiler, shader, debug)) - return false; - } else { - /* The shader consists of several parts: - * - * - the middle part is the user shader, it has 1 variant only - * and it was compiled during the creation of the shader - * selector - * - the prolog part is inserted at the beginning - * - the epilog part is inserted at the end - * - * The prolog and epilog have many (but simple) variants. - * - * Starting with gfx9, geometry and tessellation control - * shaders also contain the prolog and user shader parts of - * the previous shader stage. - */ - - if (!mainp) - return false; - - /* Copy the compiled shader data over. */ - shader->is_binary_shared = true; - shader->binary = mainp->binary; - shader->config = mainp->config; - shader->info.num_input_sgprs = mainp->info.num_input_sgprs; - shader->info.num_input_vgprs = mainp->info.num_input_vgprs; - shader->info.face_vgpr_index = mainp->info.face_vgpr_index; - shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; - memcpy(shader->info.vs_output_param_offset, - mainp->info.vs_output_param_offset, - sizeof(mainp->info.vs_output_param_offset)); - shader->info.uses_instanceid = mainp->info.uses_instanceid; - shader->info.nr_pos_exports = mainp->info.nr_pos_exports; - shader->info.nr_param_exports = mainp->info.nr_param_exports; - - /* Select prologs and/or epilogs. */ - switch (sel->type) { - case PIPE_SHADER_VERTEX: - if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug)) - return false; - break; - case PIPE_SHADER_TESS_CTRL: - if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug)) - return false; - break; - case PIPE_SHADER_TESS_EVAL: - break; - case PIPE_SHADER_GEOMETRY: - if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug)) - return false; - break; - case PIPE_SHADER_FRAGMENT: - if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug)) - return false; - - /* Make sure we have at least as many VGPRs as there - * are allocated inputs. - */ - shader->config.num_vgprs = MAX2(shader->config.num_vgprs, - shader->info.num_input_vgprs); - break; - default:; - } - - /* Update SGPR and VGPR counts. 
*/ - if (shader->prolog) { - shader->config.num_sgprs = MAX2(shader->config.num_sgprs, - shader->prolog->config.num_sgprs); - shader->config.num_vgprs = MAX2(shader->config.num_vgprs, - shader->prolog->config.num_vgprs); - } - if (shader->previous_stage) { - shader->config.num_sgprs = MAX2(shader->config.num_sgprs, - shader->previous_stage->config.num_sgprs); - shader->config.num_vgprs = MAX2(shader->config.num_vgprs, - shader->previous_stage->config.num_vgprs); - shader->config.spilled_sgprs = - MAX2(shader->config.spilled_sgprs, - shader->previous_stage->config.spilled_sgprs); - shader->config.spilled_vgprs = - MAX2(shader->config.spilled_vgprs, - shader->previous_stage->config.spilled_vgprs); - shader->info.private_mem_vgprs = - MAX2(shader->info.private_mem_vgprs, - shader->previous_stage->info.private_mem_vgprs); - shader->config.scratch_bytes_per_wave = - MAX2(shader->config.scratch_bytes_per_wave, - shader->previous_stage->config.scratch_bytes_per_wave); - shader->info.uses_instanceid |= - shader->previous_stage->info.uses_instanceid; - } - if (shader->prolog2) { - shader->config.num_sgprs = MAX2(shader->config.num_sgprs, - shader->prolog2->config.num_sgprs); - shader->config.num_vgprs = MAX2(shader->config.num_vgprs, - shader->prolog2->config.num_vgprs); - } - if (shader->epilog) { - shader->config.num_sgprs = MAX2(shader->config.num_sgprs, - shader->epilog->config.num_sgprs); - shader->config.num_vgprs = MAX2(shader->config.num_vgprs, - shader->epilog->config.num_vgprs); - } - si_calculate_max_simd_waves(shader); - } - - if (shader->key.as_ngg) { - assert(!shader->key.as_es && !shader->key.as_ls); - gfx10_ngg_calculate_subgroup_info(shader); - } else if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) { - gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info); - } - - si_fix_resource_usage(sscreen, shader); - si_shader_dump(sscreen, shader, debug, stderr, true); - - /* Upload. */ - if (!si_shader_binary_upload(sscreen, shader, 0)) { - fprintf(stderr, "LLVM failed to upload shader\n"); - return false; - } - - return true; + struct si_shader_selector *sel = shader->selector; + struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key); + + /* LS, ES, VS are compiled on demand if the main part hasn't been + * compiled for that stage. + * + * GS are compiled on demand if the main part hasn't been compiled + * for the chosen NGG-ness. + * + * Vertex shaders are compiled on demand when a vertex fetch + * workaround must be applied. + */ + if (shader->is_monolithic) { + /* Monolithic shader (compiled as a whole, has many variants, + * may take a long time to compile). + */ + if (!si_compile_shader(sscreen, compiler, shader, debug)) + return false; + } else { + /* The shader consists of several parts: + * + * - the middle part is the user shader, it has 1 variant only + * and it was compiled during the creation of the shader + * selector + * - the prolog part is inserted at the beginning + * - the epilog part is inserted at the end + * + * The prolog and epilog have many (but simple) variants. + * + * Starting with gfx9, geometry and tessellation control + * shaders also contain the prolog and user shader parts of + * the previous shader stage. + */ + + if (!mainp) + return false; + + /* Copy the compiled shader data over. 
*/ + shader->is_binary_shared = true; + shader->binary = mainp->binary; + shader->config = mainp->config; + shader->info.num_input_sgprs = mainp->info.num_input_sgprs; + shader->info.num_input_vgprs = mainp->info.num_input_vgprs; + shader->info.face_vgpr_index = mainp->info.face_vgpr_index; + shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; + memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset, + sizeof(mainp->info.vs_output_param_offset)); + shader->info.uses_instanceid = mainp->info.uses_instanceid; + shader->info.nr_pos_exports = mainp->info.nr_pos_exports; + shader->info.nr_param_exports = mainp->info.nr_param_exports; + + /* Select prologs and/or epilogs. */ + switch (sel->type) { + case PIPE_SHADER_VERTEX: + if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug)) + return false; + break; + case PIPE_SHADER_TESS_CTRL: + if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug)) + return false; + break; + case PIPE_SHADER_TESS_EVAL: + break; + case PIPE_SHADER_GEOMETRY: + if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug)) + return false; + break; + case PIPE_SHADER_FRAGMENT: + if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug)) + return false; + + /* Make sure we have at least as many VGPRs as there + * are allocated inputs. + */ + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs); + break; + default:; + } + + /* Update SGPR and VGPR counts. */ + if (shader->prolog) { + shader->config.num_sgprs = + MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs); + shader->config.num_vgprs = + MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs); + } + if (shader->previous_stage) { + shader->config.num_sgprs = + MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs); + shader->config.num_vgprs = + MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs); + shader->config.spilled_sgprs = + MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs); + shader->config.spilled_vgprs = + MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs); + shader->info.private_mem_vgprs = + MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs); + shader->config.scratch_bytes_per_wave = + MAX2(shader->config.scratch_bytes_per_wave, + shader->previous_stage->config.scratch_bytes_per_wave); + shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid; + } + if (shader->prolog2) { + shader->config.num_sgprs = + MAX2(shader->config.num_sgprs, shader->prolog2->config.num_sgprs); + shader->config.num_vgprs = + MAX2(shader->config.num_vgprs, shader->prolog2->config.num_vgprs); + } + if (shader->epilog) { + shader->config.num_sgprs = + MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs); + shader->config.num_vgprs = + MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs); + } + si_calculate_max_simd_waves(shader); + } + + if (shader->key.as_ngg) { + assert(!shader->key.as_es && !shader->key.as_ls); + gfx10_ngg_calculate_subgroup_info(shader); + } else if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) { + gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info); + } + + si_fix_resource_usage(sscreen, shader); + si_shader_dump(sscreen, shader, debug, stderr, true); + + /* Upload. 
*/ + if (!si_shader_binary_upload(sscreen, shader, 0)) { + fprintf(stderr, "LLVM failed to upload shader\n"); + return false; + } + + return true; } void si_shader_binary_clean(struct si_shader_binary *binary) { - free((void *)binary->elf_buffer); - binary->elf_buffer = NULL; + free((void *)binary->elf_buffer); + binary->elf_buffer = NULL; - free(binary->llvm_ir_string); - binary->llvm_ir_string = NULL; + free(binary->llvm_ir_string); + binary->llvm_ir_string = NULL; } void si_shader_destroy(struct si_shader *shader) { - if (shader->scratch_bo) - si_resource_reference(&shader->scratch_bo, NULL); + if (shader->scratch_bo) + si_resource_reference(&shader->scratch_bo, NULL); - si_resource_reference(&shader->bo, NULL); + si_resource_reference(&shader->bo, NULL); - if (!shader->is_binary_shared) - si_shader_binary_clean(&shader->binary); + if (!shader->is_binary_shared) + si_shader_binary_clean(&shader->binary); - free(shader->shader_log); + free(shader->shader_log); } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index ef571a5d684..4b3bdf4a30e 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -131,14 +131,13 @@ #ifndef SI_SHADER_H #define SI_SHADER_H -#include "util/u_inlines.h" -#include "util/u_live_shader_cache.h" -#include "util/u_queue.h" -#include "util/simple_mtx.h" - #include "ac_binary.h" #include "ac_llvm_build.h" #include "ac_llvm_util.h" +#include "util/simple_mtx.h" +#include "util/u_inlines.h" +#include "util/u_live_shader_cache.h" +#include "util/u_queue.h" #include @@ -150,136 +149,139 @@ struct nir_shader; struct si_shader; struct si_context; -#define SI_MAX_ATTRIBS 16 -#define SI_MAX_VS_OUTPUTS 40 +#define SI_MAX_ATTRIBS 16 +#define SI_MAX_VS_OUTPUTS 40 /* Shader IO unique indices are supported for TGSI_SEMANTIC_GENERIC with an * index smaller than this. */ -#define SI_MAX_IO_GENERIC 32 +#define SI_MAX_IO_GENERIC 32 #define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29)) /* SGPR user data indices */ -enum { - SI_SGPR_RW_BUFFERS, /* rings (& stream-out, VS only) */ - SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, - SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */ - SI_SGPR_SAMPLERS_AND_IMAGES, - SI_NUM_RESOURCE_SGPRS, - - /* API VS, TES without GS, GS copy shader */ - SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS, - SI_NUM_VS_STATE_RESOURCE_SGPRS, - - /* all VS variants */ - SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS, - SI_SGPR_START_INSTANCE, - SI_SGPR_DRAWID, - SI_VS_NUM_USER_SGPR, - - SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS, - - /* TES */ - SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS, - SI_SGPR_TES_OFFCHIP_ADDR, - SI_TES_NUM_USER_SGPR, - - /* GFX6-8: TCS only */ - GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS, - GFX6_SGPR_TCS_OUT_OFFSETS, - GFX6_SGPR_TCS_OUT_LAYOUT, - GFX6_SGPR_TCS_IN_LAYOUT, - GFX6_TCS_NUM_USER_SGPR, - - /* GFX9: Merged shaders. */ - /* 2ND_CONST_AND_SHADER_BUFFERS is set in USER_DATA_ADDR_LO (SGPR0). */ - /* 2ND_SAMPLERS_AND_IMAGES is set in USER_DATA_ADDR_HI (SGPR1). */ - GFX9_MERGED_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, - - /* GFX9: Merged LS-HS (VS-TCS) only. 
*/ - GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR, - GFX9_SGPR_TCS_OUT_OFFSETS, - GFX9_SGPR_TCS_OUT_LAYOUT, - GFX9_TCS_NUM_USER_SGPR, - - /* GS limits */ - GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS, - GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, - GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR, - SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS, - - /* PS only */ - SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, - SI_PS_NUM_USER_SGPR, - - /* The value has to be 12, because the hw requires that descriptors - * are aligned to 4 SGPRs. - */ - SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12, +enum +{ + SI_SGPR_RW_BUFFERS, /* rings (& stream-out, VS only) */ + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, + SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */ + SI_SGPR_SAMPLERS_AND_IMAGES, + SI_NUM_RESOURCE_SGPRS, + + /* API VS, TES without GS, GS copy shader */ + SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS, + SI_NUM_VS_STATE_RESOURCE_SGPRS, + + /* all VS variants */ + SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS, + SI_SGPR_START_INSTANCE, + SI_SGPR_DRAWID, + SI_VS_NUM_USER_SGPR, + + SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS, + + /* TES */ + SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS, + SI_SGPR_TES_OFFCHIP_ADDR, + SI_TES_NUM_USER_SGPR, + + /* GFX6-8: TCS only */ + GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS, + GFX6_SGPR_TCS_OUT_OFFSETS, + GFX6_SGPR_TCS_OUT_LAYOUT, + GFX6_SGPR_TCS_IN_LAYOUT, + GFX6_TCS_NUM_USER_SGPR, + + /* GFX9: Merged shaders. */ + /* 2ND_CONST_AND_SHADER_BUFFERS is set in USER_DATA_ADDR_LO (SGPR0). */ + /* 2ND_SAMPLERS_AND_IMAGES is set in USER_DATA_ADDR_HI (SGPR1). */ + GFX9_MERGED_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, + + /* GFX9: Merged LS-HS (VS-TCS) only. */ + GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR, + GFX9_SGPR_TCS_OUT_OFFSETS, + GFX9_SGPR_TCS_OUT_LAYOUT, + GFX9_TCS_NUM_USER_SGPR, + + /* GS limits */ + GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS, + GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, + GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR, + SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS, + + /* PS only */ + SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, + SI_PS_NUM_USER_SGPR, + + /* The value has to be 12, because the hw requires that descriptors + * are aligned to 4 SGPRs. 
+ */ + SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12, }; /* LLVM function parameter indices */ -enum { - SI_NUM_RESOURCE_PARAMS = 4, - - /* PS only parameters */ - SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS, - SI_PARAM_PRIM_MASK, - SI_PARAM_PERSP_SAMPLE, - SI_PARAM_PERSP_CENTER, - SI_PARAM_PERSP_CENTROID, - SI_PARAM_PERSP_PULL_MODEL, - SI_PARAM_LINEAR_SAMPLE, - SI_PARAM_LINEAR_CENTER, - SI_PARAM_LINEAR_CENTROID, - SI_PARAM_LINE_STIPPLE_TEX, - SI_PARAM_POS_X_FLOAT, - SI_PARAM_POS_Y_FLOAT, - SI_PARAM_POS_Z_FLOAT, - SI_PARAM_POS_W_FLOAT, - SI_PARAM_FRONT_FACE, - SI_PARAM_ANCILLARY, - SI_PARAM_SAMPLE_COVERAGE, - SI_PARAM_POS_FIXED_PT, - - SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */ +enum +{ + SI_NUM_RESOURCE_PARAMS = 4, + + /* PS only parameters */ + SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS, + SI_PARAM_PRIM_MASK, + SI_PARAM_PERSP_SAMPLE, + SI_PARAM_PERSP_CENTER, + SI_PARAM_PERSP_CENTROID, + SI_PARAM_PERSP_PULL_MODEL, + SI_PARAM_LINEAR_SAMPLE, + SI_PARAM_LINEAR_CENTER, + SI_PARAM_LINEAR_CENTROID, + SI_PARAM_LINE_STIPPLE_TEX, + SI_PARAM_POS_X_FLOAT, + SI_PARAM_POS_Y_FLOAT, + SI_PARAM_POS_Z_FLOAT, + SI_PARAM_POS_W_FLOAT, + SI_PARAM_FRONT_FACE, + SI_PARAM_ANCILLARY, + SI_PARAM_SAMPLE_COVERAGE, + SI_PARAM_POS_FIXED_PT, + + SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */ }; /* Fields of driver-defined VS state SGPR. */ -#define S_VS_STATE_CLAMP_VERTEX_COLOR(x) (((unsigned)(x) & 0x1) << 0) -#define C_VS_STATE_CLAMP_VERTEX_COLOR 0xFFFFFFFE -#define S_VS_STATE_INDEXED(x) (((unsigned)(x) & 0x1) << 1) -#define C_VS_STATE_INDEXED 0xFFFFFFFD -#define S_VS_STATE_OUTPRIM(x) (((unsigned)(x) & 0x3) << 2) -#define C_VS_STATE_OUTPRIM 0xFFFFFFF3 -#define S_VS_STATE_PROVOKING_VTX_INDEX(x) (((unsigned)(x) & 0x3) << 4) -#define C_VS_STATE_PROVOKING_VTX_INDEX 0xFFFFFFCF -#define S_VS_STATE_STREAMOUT_QUERY_ENABLED(x) (((unsigned)(x) & 0x1) << 6) -#define C_VS_STATE_STREAMOUT_QUERY_ENABLED 0xFFFFFFBF -#define S_VS_STATE_SMALL_PRIM_PRECISION(x) (((unsigned)(x) & 0xF) << 7) -#define C_VS_STATE_SMALL_PRIM_PRECISION 0xFFFFF87F -#define S_VS_STATE_LS_OUT_PATCH_SIZE(x) (((unsigned)(x) & 0x1FFF) << 11) -#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFF0007FF -#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x) & 0xFF) << 24) -#define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF - -enum { - /* Use a property enum that CS wouldn't use. */ - TGSI_PROPERTY_CS_LOCAL_SIZE = TGSI_PROPERTY_FS_COORD_ORIGIN, - - /* These represent the number of SGPRs the shader uses. 
*/ - SI_VS_BLIT_SGPRS_POS = 3, - SI_VS_BLIT_SGPRS_POS_COLOR = 7, - SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, +#define S_VS_STATE_CLAMP_VERTEX_COLOR(x) (((unsigned)(x)&0x1) << 0) +#define C_VS_STATE_CLAMP_VERTEX_COLOR 0xFFFFFFFE +#define S_VS_STATE_INDEXED(x) (((unsigned)(x)&0x1) << 1) +#define C_VS_STATE_INDEXED 0xFFFFFFFD +#define S_VS_STATE_OUTPRIM(x) (((unsigned)(x)&0x3) << 2) +#define C_VS_STATE_OUTPRIM 0xFFFFFFF3 +#define S_VS_STATE_PROVOKING_VTX_INDEX(x) (((unsigned)(x)&0x3) << 4) +#define C_VS_STATE_PROVOKING_VTX_INDEX 0xFFFFFFCF +#define S_VS_STATE_STREAMOUT_QUERY_ENABLED(x) (((unsigned)(x)&0x1) << 6) +#define C_VS_STATE_STREAMOUT_QUERY_ENABLED 0xFFFFFFBF +#define S_VS_STATE_SMALL_PRIM_PRECISION(x) (((unsigned)(x)&0xF) << 7) +#define C_VS_STATE_SMALL_PRIM_PRECISION 0xFFFFF87F +#define S_VS_STATE_LS_OUT_PATCH_SIZE(x) (((unsigned)(x)&0x1FFF) << 11) +#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFF0007FF +#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x)&0xFF) << 24) +#define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF + +enum +{ + /* Use a property enum that CS wouldn't use. */ + TGSI_PROPERTY_CS_LOCAL_SIZE = TGSI_PROPERTY_FS_COORD_ORIGIN, + + /* These represent the number of SGPRs the shader uses. */ + SI_VS_BLIT_SGPRS_POS = 3, + SI_VS_BLIT_SGPRS_POS_COLOR = 7, + SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, }; -#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */ -#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ -#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0x3 << 3) /* GS fast launch (both prim types) */ +#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */ +#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ +#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ +#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */ +#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */ +#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0x3 << 3) /* GS fast launch (both prim types) */ /** * For VS shader keys, describe any fixups required for vertex fetch. @@ -292,190 +294,190 @@ enum { * buffer_load_format_xyzw). */ union si_vs_fix_fetch { - struct { - uint8_t log_size : 2; /* 1, 2, 4, 8 or bytes per channel */ - uint8_t num_channels_m1 : 2; /* number of channels minus 1 */ - uint8_t format : 3; /* AC_FETCH_FORMAT_xxx */ - uint8_t reverse : 1; /* reverse XYZ channels */ - } u; - uint8_t bits; + struct { + uint8_t log_size : 2; /* 1, 2, 4, 8 or bytes per channel */ + uint8_t num_channels_m1 : 2; /* number of channels minus 1 */ + uint8_t format : 3; /* AC_FETCH_FORMAT_xxx */ + uint8_t reverse : 1; /* reverse XYZ channels */ + } u; + uint8_t bits; }; struct si_shader; /* State of the context creating the shader object. */ struct si_compiler_ctx_state { - /* Should only be used by si_init_shader_selector_async and - * si_build_shader_variant if thread_index == -1 (non-threaded). */ - struct ac_llvm_compiler *compiler; + /* Should only be used by si_init_shader_selector_async and + * si_build_shader_variant if thread_index == -1 (non-threaded). */ + struct ac_llvm_compiler *compiler; - /* Used if thread_index == -1 or if debug.async is true. */ - struct pipe_debug_callback debug; + /* Used if thread_index == -1 or if debug.async is true. 
*/ + struct pipe_debug_callback debug; - /* Used for creating the log string for gallium/ddebug. */ - bool is_debug_context; + /* Used for creating the log string for gallium/ddebug. */ + bool is_debug_context; }; struct si_shader_info { - ubyte num_inputs; - ubyte num_outputs; - ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */ - ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS]; - ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; - ubyte input_interpolate_loc[PIPE_MAX_SHADER_INPUTS]; - ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; - ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */ - ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; - ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; - ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; - - ubyte processor; - - int constbuf0_num_slots; - unsigned const_buffers_declared; /**< bitmask of declared const buffers */ - unsigned samplers_declared; /**< bitmask of declared samplers */ - ubyte num_stream_output_components[4]; - - uint num_memory_instructions; /**< sampler, buffer, and image instructions */ - - /** - * If a tessellation control shader reads outputs, this describes which ones. - */ - bool reads_pervertex_outputs; - bool reads_perpatch_outputs; - bool reads_tessfactor_outputs; - - ubyte colors_read; /**< which color components are read by the FS */ - ubyte colors_written; - bool reads_samplemask; /**< does fragment shader read sample mask? */ - bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */ - bool writes_z; /**< does fragment shader write Z value? */ - bool writes_stencil; /**< does fragment shader write stencil value? */ - bool writes_samplemask; /**< does fragment shader write sample mask? */ - bool writes_edgeflag; /**< vertex shader outputs edgeflag */ - bool uses_kill; /**< KILL or KILL_IF instruction used? */ - bool uses_persp_center; - bool uses_persp_centroid; - bool uses_persp_sample; - bool uses_linear_center; - bool uses_linear_centroid; - bool uses_linear_sample; - bool uses_persp_opcode_interp_sample; - bool uses_linear_opcode_interp_sample; - bool uses_instanceid; - bool uses_vertexid; - bool uses_vertexid_nobase; - bool uses_basevertex; - bool uses_drawid; - bool uses_primid; - bool uses_frontface; - bool uses_invocationid; - bool uses_thread_id[3]; - bool uses_block_id[3]; - bool uses_block_size; - bool uses_grid_size; - bool uses_subgroup_info; - bool writes_position; - bool writes_psize; - bool writes_clipvertex; - bool writes_primid; - bool writes_viewport_index; - bool writes_layer; - bool writes_memory; /**< contains stores or atomics to buffers or images */ - bool uses_derivatives; - bool uses_bindless_samplers; - bool uses_bindless_images; - bool uses_fbfetch; - unsigned clipdist_writemask; - unsigned culldist_writemask; - unsigned num_written_culldistance; - unsigned num_written_clipdistance; - - unsigned images_declared; /**< bitmask of declared images */ - unsigned msaa_images_declared; /**< bitmask of declared MSAA images */ - unsigned shader_buffers_declared; /**< bitmask of declared shader buffers */ - - unsigned properties[TGSI_PROPERTY_COUNT]; /* index with TGSI_PROPERTY_ */ - - /** Whether all codepaths write tess factors in all invocations. 
*/ - bool tessfactors_are_def_in_all_invocs; + ubyte num_inputs; + ubyte num_outputs; + ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */ + ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS]; + ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; + ubyte input_interpolate_loc[PIPE_MAX_SHADER_INPUTS]; + ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; + ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */ + ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; + ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; + ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; + + ubyte processor; + + int constbuf0_num_slots; + unsigned const_buffers_declared; /**< bitmask of declared const buffers */ + unsigned samplers_declared; /**< bitmask of declared samplers */ + ubyte num_stream_output_components[4]; + + uint num_memory_instructions; /**< sampler, buffer, and image instructions */ + + /** + * If a tessellation control shader reads outputs, this describes which ones. + */ + bool reads_pervertex_outputs; + bool reads_perpatch_outputs; + bool reads_tessfactor_outputs; + + ubyte colors_read; /**< which color components are read by the FS */ + ubyte colors_written; + bool reads_samplemask; /**< does fragment shader read sample mask? */ + bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */ + bool writes_z; /**< does fragment shader write Z value? */ + bool writes_stencil; /**< does fragment shader write stencil value? */ + bool writes_samplemask; /**< does fragment shader write sample mask? */ + bool writes_edgeflag; /**< vertex shader outputs edgeflag */ + bool uses_kill; /**< KILL or KILL_IF instruction used? */ + bool uses_persp_center; + bool uses_persp_centroid; + bool uses_persp_sample; + bool uses_linear_center; + bool uses_linear_centroid; + bool uses_linear_sample; + bool uses_persp_opcode_interp_sample; + bool uses_linear_opcode_interp_sample; + bool uses_instanceid; + bool uses_vertexid; + bool uses_vertexid_nobase; + bool uses_basevertex; + bool uses_drawid; + bool uses_primid; + bool uses_frontface; + bool uses_invocationid; + bool uses_thread_id[3]; + bool uses_block_id[3]; + bool uses_block_size; + bool uses_grid_size; + bool uses_subgroup_info; + bool writes_position; + bool writes_psize; + bool writes_clipvertex; + bool writes_primid; + bool writes_viewport_index; + bool writes_layer; + bool writes_memory; /**< contains stores or atomics to buffers or images */ + bool uses_derivatives; + bool uses_bindless_samplers; + bool uses_bindless_images; + bool uses_fbfetch; + unsigned clipdist_writemask; + unsigned culldist_writemask; + unsigned num_written_culldistance; + unsigned num_written_clipdistance; + + unsigned images_declared; /**< bitmask of declared images */ + unsigned msaa_images_declared; /**< bitmask of declared MSAA images */ + unsigned shader_buffers_declared; /**< bitmask of declared shader buffers */ + + unsigned properties[TGSI_PROPERTY_COUNT]; /* index with TGSI_PROPERTY_ */ + + /** Whether all codepaths write tess factors in all invocations. */ + bool tessfactors_are_def_in_all_invocs; }; /* A shader selector is a gallium CSO and contains shader variants and * binaries for one NIR program. This can be shared by multiple contexts. 
*/ struct si_shader_selector { - struct util_live_shader base; - struct si_screen *screen; - struct util_queue_fence ready; - struct si_compiler_ctx_state compiler_ctx_state; - - simple_mtx_t mutex; - struct si_shader *first_variant; /* immutable after the first variant */ - struct si_shader *last_variant; /* mutable */ - - /* The compiled NIR shader without a prolog and/or epilog (not - * uploaded to a buffer object). - */ - struct si_shader *main_shader_part; - struct si_shader *main_shader_part_ls; /* as_ls is set in the key */ - struct si_shader *main_shader_part_es; /* as_es is set in the key */ - struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */ - struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */ - - struct si_shader *gs_copy_shader; - - struct nir_shader *nir; - void *nir_binary; - unsigned nir_size; - - struct pipe_stream_output_info so; - struct si_shader_info info; - - /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */ - enum pipe_shader_type type; - bool vs_needs_prolog; - bool prim_discard_cs_allowed; - bool ngg_culling_allowed; - unsigned num_vs_inputs; - unsigned num_vbos_in_user_sgprs; - unsigned pa_cl_vs_out_cntl; - ubyte clipdist_mask; - ubyte culldist_mask; - unsigned rast_prim; - - /* ES parameters. */ - unsigned esgs_itemsize; /* vertex stride */ - unsigned lshs_vertex_stride; - - /* GS parameters. */ - unsigned gs_input_verts_per_prim; - unsigned gs_output_prim; - unsigned gs_max_out_vertices; - unsigned gs_num_invocations; - unsigned max_gs_stream; /* count - 1 */ - unsigned gsvs_vertex_size; - unsigned max_gsvs_emit_size; - unsigned enabled_streamout_buffer_mask; - bool tess_turns_off_ngg; - - /* PS parameters. */ - unsigned color_attr_index[2]; - unsigned db_shader_control; - /* Set 0xf or 0x0 (4 bits) per each written output. - * ANDed with spi_shader_col_format. - */ - unsigned colors_written_4bit; - - uint64_t outputs_written_before_ps; /* "get_unique_index" bits */ - uint64_t outputs_written; /* "get_unique_index" bits */ - uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */ - - uint64_t inputs_read; /* "get_unique_index" bits */ - - /* bitmasks of used descriptor slots */ - uint32_t active_const_and_shader_buffers; - uint64_t active_samplers_and_images; + struct util_live_shader base; + struct si_screen *screen; + struct util_queue_fence ready; + struct si_compiler_ctx_state compiler_ctx_state; + + simple_mtx_t mutex; + struct si_shader *first_variant; /* immutable after the first variant */ + struct si_shader *last_variant; /* mutable */ + + /* The compiled NIR shader without a prolog and/or epilog (not + * uploaded to a buffer object). + */ + struct si_shader *main_shader_part; + struct si_shader *main_shader_part_ls; /* as_ls is set in the key */ + struct si_shader *main_shader_part_es; /* as_es is set in the key */ + struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */ + struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */ + + struct si_shader *gs_copy_shader; + + struct nir_shader *nir; + void *nir_binary; + unsigned nir_size; + + struct pipe_stream_output_info so; + struct si_shader_info info; + + /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */ + enum pipe_shader_type type; + bool vs_needs_prolog; + bool prim_discard_cs_allowed; + bool ngg_culling_allowed; + unsigned num_vs_inputs; + unsigned num_vbos_in_user_sgprs; + unsigned pa_cl_vs_out_cntl; + ubyte clipdist_mask; + ubyte culldist_mask; + unsigned rast_prim; + + /* ES parameters. 
*/ + unsigned esgs_itemsize; /* vertex stride */ + unsigned lshs_vertex_stride; + + /* GS parameters. */ + unsigned gs_input_verts_per_prim; + unsigned gs_output_prim; + unsigned gs_max_out_vertices; + unsigned gs_num_invocations; + unsigned max_gs_stream; /* count - 1 */ + unsigned gsvs_vertex_size; + unsigned max_gsvs_emit_size; + unsigned enabled_streamout_buffer_mask; + bool tess_turns_off_ngg; + + /* PS parameters. */ + unsigned color_attr_index[2]; + unsigned db_shader_control; + /* Set 0xf or 0x0 (4 bits) per each written output. + * ANDed with spi_shader_col_format. + */ + unsigned colors_written_4bit; + + uint64_t outputs_written_before_ps; /* "get_unique_index" bits */ + uint64_t outputs_written; /* "get_unique_index" bits */ + uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */ + + uint64_t inputs_read; /* "get_unique_index" bits */ + + /* bitmasks of used descriptor slots */ + uint32_t active_const_and_shader_buffers; + uint64_t active_samplers_and_images; }; /* Valid shader configurations: @@ -506,184 +508,184 @@ struct si_shader_selector { /* Common VS bits between the shader key and the prolog key. */ struct si_vs_prolog_bits { - /* - If neither "is_one" nor "is_fetched" has a bit set, the instance - * divisor is 0. - * - If "is_one" has a bit set, the instance divisor is 1. - * - If "is_fetched" has a bit set, the instance divisor will be loaded - * from the constant buffer. - */ - uint16_t instance_divisor_is_one; /* bitmask of inputs */ - uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ - unsigned ls_vgpr_fix:1; - unsigned unpack_instance_id_from_vertex_id:1; + /* - If neither "is_one" nor "is_fetched" has a bit set, the instance + * divisor is 0. + * - If "is_one" has a bit set, the instance divisor is 1. + * - If "is_fetched" has a bit set, the instance divisor will be loaded + * from the constant buffer. + */ + uint16_t instance_divisor_is_one; /* bitmask of inputs */ + uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ + unsigned ls_vgpr_fix : 1; + unsigned unpack_instance_id_from_vertex_id : 1; }; /* Common TCS bits between the shader key and the epilog key. */ struct si_tcs_epilog_bits { - unsigned prim_mode:3; - unsigned invoc0_tess_factors_are_def:1; - unsigned tes_reads_tess_factors:1; + unsigned prim_mode : 3; + unsigned invoc0_tess_factors_are_def : 1; + unsigned tes_reads_tess_factors : 1; }; struct si_gs_prolog_bits { - unsigned tri_strip_adj_fix:1; - unsigned gfx9_prev_is_vs:1; + unsigned tri_strip_adj_fix : 1; + unsigned gfx9_prev_is_vs : 1; }; /* Common PS bits between the shader key and the prolog key. */ struct si_ps_prolog_bits { - unsigned color_two_side:1; - unsigned flatshade_colors:1; - unsigned poly_stipple:1; - unsigned force_persp_sample_interp:1; - unsigned force_linear_sample_interp:1; - unsigned force_persp_center_interp:1; - unsigned force_linear_center_interp:1; - unsigned bc_optimize_for_persp:1; - unsigned bc_optimize_for_linear:1; - unsigned samplemask_log_ps_iter:3; + unsigned color_two_side : 1; + unsigned flatshade_colors : 1; + unsigned poly_stipple : 1; + unsigned force_persp_sample_interp : 1; + unsigned force_linear_sample_interp : 1; + unsigned force_persp_center_interp : 1; + unsigned force_linear_center_interp : 1; + unsigned bc_optimize_for_persp : 1; + unsigned bc_optimize_for_linear : 1; + unsigned samplemask_log_ps_iter : 3; }; /* Common PS bits between the shader key and the epilog key. 
*/ struct si_ps_epilog_bits { - unsigned spi_shader_col_format; - unsigned color_is_int8:8; - unsigned color_is_int10:8; - unsigned last_cbuf:3; - unsigned alpha_func:3; - unsigned alpha_to_one:1; - unsigned poly_line_smoothing:1; - unsigned clamp_color:1; + unsigned spi_shader_col_format; + unsigned color_is_int8 : 8; + unsigned color_is_int10 : 8; + unsigned last_cbuf : 3; + unsigned alpha_func : 3; + unsigned alpha_to_one : 1; + unsigned poly_line_smoothing : 1; + unsigned clamp_color : 1; }; union si_shader_part_key { - struct { - struct si_vs_prolog_bits states; - unsigned num_input_sgprs:6; - /* For merged stages such as LS-HS, HS input VGPRs are first. */ - unsigned num_merged_next_stage_vgprs:3; - unsigned num_inputs:5; - unsigned as_ls:1; - unsigned as_es:1; - unsigned as_ngg:1; - unsigned as_prim_discard_cs:1; - unsigned has_ngg_cull_inputs:1; /* from the NGG cull shader */ - unsigned gs_fast_launch_tri_list:1; /* for NGG culling */ - unsigned gs_fast_launch_tri_strip:1; /* for NGG culling */ - /* Prologs for monolithic shaders shouldn't set EXEC. */ - unsigned is_monolithic:1; - } vs_prolog; - struct { - struct si_tcs_epilog_bits states; - } tcs_epilog; - struct { - struct si_gs_prolog_bits states; - /* Prologs of monolithic shaders shouldn't set EXEC. */ - unsigned is_monolithic:1; - unsigned as_ngg:1; - } gs_prolog; - struct { - struct si_ps_prolog_bits states; - unsigned num_input_sgprs:6; - unsigned num_input_vgprs:5; - /* Color interpolation and two-side color selection. */ - unsigned colors_read:8; /* color input components read */ - unsigned num_interp_inputs:5; /* BCOLOR is at this location */ - unsigned face_vgpr_index:5; - unsigned ancillary_vgpr_index:5; - unsigned wqm:1; - char color_attr_index[2]; - signed char color_interp_vgpr_index[2]; /* -1 == constant */ - } ps_prolog; - struct { - struct si_ps_epilog_bits states; - unsigned colors_written:8; - unsigned writes_z:1; - unsigned writes_stencil:1; - unsigned writes_samplemask:1; - } ps_epilog; + struct { + struct si_vs_prolog_bits states; + unsigned num_input_sgprs : 6; + /* For merged stages such as LS-HS, HS input VGPRs are first. */ + unsigned num_merged_next_stage_vgprs : 3; + unsigned num_inputs : 5; + unsigned as_ls : 1; + unsigned as_es : 1; + unsigned as_ngg : 1; + unsigned as_prim_discard_cs : 1; + unsigned has_ngg_cull_inputs : 1; /* from the NGG cull shader */ + unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */ + unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */ + /* Prologs for monolithic shaders shouldn't set EXEC. */ + unsigned is_monolithic : 1; + } vs_prolog; + struct { + struct si_tcs_epilog_bits states; + } tcs_epilog; + struct { + struct si_gs_prolog_bits states; + /* Prologs of monolithic shaders shouldn't set EXEC. */ + unsigned is_monolithic : 1; + unsigned as_ngg : 1; + } gs_prolog; + struct { + struct si_ps_prolog_bits states; + unsigned num_input_sgprs : 6; + unsigned num_input_vgprs : 5; + /* Color interpolation and two-side color selection. 
*/ + unsigned colors_read : 8; /* color input components read */ + unsigned num_interp_inputs : 5; /* BCOLOR is at this location */ + unsigned face_vgpr_index : 5; + unsigned ancillary_vgpr_index : 5; + unsigned wqm : 1; + char color_attr_index[2]; + signed char color_interp_vgpr_index[2]; /* -1 == constant */ + } ps_prolog; + struct { + struct si_ps_epilog_bits states; + unsigned colors_written : 8; + unsigned writes_z : 1; + unsigned writes_stencil : 1; + unsigned writes_samplemask : 1; + } ps_epilog; }; struct si_shader_key { - /* Prolog and epilog flags. */ - union { - struct { - struct si_vs_prolog_bits prolog; - } vs; - struct { - struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */ - struct si_shader_selector *ls; /* for merged LS-HS */ - struct si_tcs_epilog_bits epilog; - } tcs; /* tessellation control shader */ - struct { - struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */ - struct si_shader_selector *es; /* for merged ES-GS */ - struct si_gs_prolog_bits prolog; - } gs; - struct { - struct si_ps_prolog_bits prolog; - struct si_ps_epilog_bits epilog; - } ps; - } part; - - /* These three are initially set according to the NEXT_SHADER property, - * or guessed if the property doesn't seem correct. - */ - unsigned as_es:1; /* export shader, which precedes GS */ - unsigned as_ls:1; /* local shader, which precedes TCS */ - unsigned as_ngg:1; /* VS, TES, or GS compiled as NGG primitive shader */ - - /* Flags for monolithic compilation only. */ - struct { - /* Whether fetch should be opencoded according to vs_fix_fetch. - * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw - * with minimal fixups is used. */ - uint16_t vs_fetch_opencode; - union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS]; - - union { - uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */ - /* When PS needs PrimID and GS is disabled. */ - unsigned vs_export_prim_id:1; - struct { - unsigned interpolate_at_sample_force_center:1; - unsigned fbfetch_msaa:1; - unsigned fbfetch_is_1D:1; - unsigned fbfetch_layered:1; - } ps; - } u; - } mono; - - /* Optimization flags for asynchronous compilation only. */ - struct { - /* For HW VS (it can be VS, TES, GS) */ - uint64_t kill_outputs; /* "get_unique_index" bits */ - unsigned clip_disable:1; - - /* For NGG VS and TES. */ - unsigned ngg_culling:5; /* SI_NGG_CULL_* */ - - /* For shaders where monolithic variants have better code. - * - * This is a flag that has no effect on code generation, - * but forces monolithic shaders to be used as soon as - * possible, because it's in the "opt" group. - */ - unsigned prefer_mono:1; - - /* Primitive discard compute shader. */ - unsigned vs_as_prim_discard_cs:1; - unsigned cs_prim_type:4; - unsigned cs_indexed:1; - unsigned cs_instancing:1; - unsigned cs_primitive_restart:1; - unsigned cs_provoking_vertex_first:1; - unsigned cs_need_correct_orientation:1; - unsigned cs_cull_front:1; - unsigned cs_cull_back:1; - unsigned cs_cull_z:1; - unsigned cs_halfz_clip_space:1; - } opt; + /* Prolog and epilog flags. 
*/ + union { + struct { + struct si_vs_prolog_bits prolog; + } vs; + struct { + struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */ + struct si_shader_selector *ls; /* for merged LS-HS */ + struct si_tcs_epilog_bits epilog; + } tcs; /* tessellation control shader */ + struct { + struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */ + struct si_shader_selector *es; /* for merged ES-GS */ + struct si_gs_prolog_bits prolog; + } gs; + struct { + struct si_ps_prolog_bits prolog; + struct si_ps_epilog_bits epilog; + } ps; + } part; + + /* These three are initially set according to the NEXT_SHADER property, + * or guessed if the property doesn't seem correct. + */ + unsigned as_es : 1; /* export shader, which precedes GS */ + unsigned as_ls : 1; /* local shader, which precedes TCS */ + unsigned as_ngg : 1; /* VS, TES, or GS compiled as NGG primitive shader */ + + /* Flags for monolithic compilation only. */ + struct { + /* Whether fetch should be opencoded according to vs_fix_fetch. + * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw + * with minimal fixups is used. */ + uint16_t vs_fetch_opencode; + union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS]; + + union { + uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */ + /* When PS needs PrimID and GS is disabled. */ + unsigned vs_export_prim_id : 1; + struct { + unsigned interpolate_at_sample_force_center : 1; + unsigned fbfetch_msaa : 1; + unsigned fbfetch_is_1D : 1; + unsigned fbfetch_layered : 1; + } ps; + } u; + } mono; + + /* Optimization flags for asynchronous compilation only. */ + struct { + /* For HW VS (it can be VS, TES, GS) */ + uint64_t kill_outputs; /* "get_unique_index" bits */ + unsigned clip_disable : 1; + + /* For NGG VS and TES. */ + unsigned ngg_culling : 5; /* SI_NGG_CULL_* */ + + /* For shaders where monolithic variants have better code. + * + * This is a flag that has no effect on code generation, + * but forces monolithic shaders to be used as soon as + * possible, because it's in the "opt" group. + */ + unsigned prefer_mono : 1; + + /* Primitive discard compute shader. */ + unsigned vs_as_prim_discard_cs : 1; + unsigned cs_prim_type : 4; + unsigned cs_indexed : 1; + unsigned cs_instancing : 1; + unsigned cs_primitive_restart : 1; + unsigned cs_provoking_vertex_first : 1; + unsigned cs_need_correct_orientation : 1; + unsigned cs_cull_front : 1; + unsigned cs_cull_back : 1; + unsigned cs_cull_z : 1; + unsigned cs_halfz_clip_space : 1; + } opt; }; /* Restore the pack alignment to default. */ @@ -691,232 +693,214 @@ struct si_shader_key { /* GCN-specific shader info. 
*/ struct si_shader_binary_info { - ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; - ubyte num_input_sgprs; - ubyte num_input_vgprs; - signed char face_vgpr_index; - signed char ancillary_vgpr_index; - bool uses_instanceid; - ubyte nr_pos_exports; - ubyte nr_param_exports; - unsigned private_mem_vgprs; - unsigned max_simd_waves; + ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; + ubyte num_input_sgprs; + ubyte num_input_vgprs; + signed char face_vgpr_index; + signed char ancillary_vgpr_index; + bool uses_instanceid; + ubyte nr_pos_exports; + ubyte nr_param_exports; + unsigned private_mem_vgprs; + unsigned max_simd_waves; }; struct si_shader_binary { - const char *elf_buffer; - size_t elf_size; + const char *elf_buffer; + size_t elf_size; - char *llvm_ir_string; + char *llvm_ir_string; }; struct gfx9_gs_info { - unsigned es_verts_per_subgroup; - unsigned gs_prims_per_subgroup; - unsigned gs_inst_prims_in_subgroup; - unsigned max_prims_per_subgroup; - unsigned esgs_ring_size; /* in bytes */ + unsigned es_verts_per_subgroup; + unsigned gs_prims_per_subgroup; + unsigned gs_inst_prims_in_subgroup; + unsigned max_prims_per_subgroup; + unsigned esgs_ring_size; /* in bytes */ }; struct si_shader { - struct si_compiler_ctx_state compiler_ctx_state; - - struct si_shader_selector *selector; - struct si_shader_selector *previous_stage_sel; /* for refcounting */ - struct si_shader *next_variant; - - struct si_shader_part *prolog; - struct si_shader *previous_stage; /* for GFX9 */ - struct si_shader_part *prolog2; - struct si_shader_part *epilog; - - struct si_pm4_state *pm4; - struct si_resource *bo; - struct si_resource *scratch_bo; - struct si_shader_key key; - struct util_queue_fence ready; - bool compilation_failed; - bool is_monolithic; - bool is_optimized; - bool is_binary_shared; - bool is_gs_copy_shader; - - /* The following data is all that's needed for binary shaders. */ - struct si_shader_binary binary; - struct ac_shader_config config; - struct si_shader_binary_info info; - - struct { - uint16_t ngg_emit_size; /* in dwords */ - uint16_t hw_max_esverts; - uint16_t max_gsprims; - uint16_t max_out_verts; - uint16_t prim_amp_factor; - bool max_vert_out_per_gs_instance; - } ngg; - - /* Shader key + LLVM IR + disassembly + statistics. - * Generated for debug contexts only. - */ - char *shader_log; - size_t shader_log_size; - - struct gfx9_gs_info gs_info; - - /* For save precompute context registers values. 
*/ - union { - struct { - unsigned vgt_gsvs_ring_offset_1; - unsigned vgt_gsvs_ring_offset_2; - unsigned vgt_gsvs_ring_offset_3; - unsigned vgt_gsvs_ring_itemsize; - unsigned vgt_gs_max_vert_out; - unsigned vgt_gs_vert_itemsize; - unsigned vgt_gs_vert_itemsize_1; - unsigned vgt_gs_vert_itemsize_2; - unsigned vgt_gs_vert_itemsize_3; - unsigned vgt_gs_instance_cnt; - unsigned vgt_gs_onchip_cntl; - unsigned vgt_gs_max_prims_per_subgroup; - unsigned vgt_esgs_ring_itemsize; - } gs; - - struct { - unsigned ge_max_output_per_subgroup; - unsigned ge_ngg_subgrp_cntl; - unsigned vgt_primitiveid_en; - unsigned vgt_gs_onchip_cntl; - unsigned vgt_gs_instance_cnt; - unsigned vgt_esgs_ring_itemsize; - unsigned spi_vs_out_config; - unsigned spi_shader_idx_format; - unsigned spi_shader_pos_format; - unsigned pa_cl_vte_cntl; - unsigned pa_cl_ngg_cntl; - unsigned vgt_gs_max_vert_out; /* for API GS */ - unsigned ge_pc_alloc; /* uconfig register */ - } ngg; - - struct { - unsigned vgt_gs_mode; - unsigned vgt_primitiveid_en; - unsigned vgt_reuse_off; - unsigned spi_vs_out_config; - unsigned spi_shader_pos_format; - unsigned pa_cl_vte_cntl; - unsigned ge_pc_alloc; /* uconfig register */ - } vs; - - struct { - unsigned spi_ps_input_ena; - unsigned spi_ps_input_addr; - unsigned spi_baryc_cntl; - unsigned spi_ps_in_control; - unsigned spi_shader_z_format; - unsigned spi_shader_col_format; - unsigned cb_shader_mask; - } ps; - } ctx_reg; - - /*For save precompute registers value */ - unsigned vgt_tf_param; /* VGT_TF_PARAM */ - unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */ - unsigned pa_cl_vs_out_cntl; - unsigned ge_cntl; + struct si_compiler_ctx_state compiler_ctx_state; + + struct si_shader_selector *selector; + struct si_shader_selector *previous_stage_sel; /* for refcounting */ + struct si_shader *next_variant; + + struct si_shader_part *prolog; + struct si_shader *previous_stage; /* for GFX9 */ + struct si_shader_part *prolog2; + struct si_shader_part *epilog; + + struct si_pm4_state *pm4; + struct si_resource *bo; + struct si_resource *scratch_bo; + struct si_shader_key key; + struct util_queue_fence ready; + bool compilation_failed; + bool is_monolithic; + bool is_optimized; + bool is_binary_shared; + bool is_gs_copy_shader; + + /* The following data is all that's needed for binary shaders. */ + struct si_shader_binary binary; + struct ac_shader_config config; + struct si_shader_binary_info info; + + struct { + uint16_t ngg_emit_size; /* in dwords */ + uint16_t hw_max_esverts; + uint16_t max_gsprims; + uint16_t max_out_verts; + uint16_t prim_amp_factor; + bool max_vert_out_per_gs_instance; + } ngg; + + /* Shader key + LLVM IR + disassembly + statistics. + * Generated for debug contexts only. + */ + char *shader_log; + size_t shader_log_size; + + struct gfx9_gs_info gs_info; + + /* For save precompute context registers values. 
*/ + union { + struct { + unsigned vgt_gsvs_ring_offset_1; + unsigned vgt_gsvs_ring_offset_2; + unsigned vgt_gsvs_ring_offset_3; + unsigned vgt_gsvs_ring_itemsize; + unsigned vgt_gs_max_vert_out; + unsigned vgt_gs_vert_itemsize; + unsigned vgt_gs_vert_itemsize_1; + unsigned vgt_gs_vert_itemsize_2; + unsigned vgt_gs_vert_itemsize_3; + unsigned vgt_gs_instance_cnt; + unsigned vgt_gs_onchip_cntl; + unsigned vgt_gs_max_prims_per_subgroup; + unsigned vgt_esgs_ring_itemsize; + } gs; + + struct { + unsigned ge_max_output_per_subgroup; + unsigned ge_ngg_subgrp_cntl; + unsigned vgt_primitiveid_en; + unsigned vgt_gs_onchip_cntl; + unsigned vgt_gs_instance_cnt; + unsigned vgt_esgs_ring_itemsize; + unsigned spi_vs_out_config; + unsigned spi_shader_idx_format; + unsigned spi_shader_pos_format; + unsigned pa_cl_vte_cntl; + unsigned pa_cl_ngg_cntl; + unsigned vgt_gs_max_vert_out; /* for API GS */ + unsigned ge_pc_alloc; /* uconfig register */ + } ngg; + + struct { + unsigned vgt_gs_mode; + unsigned vgt_primitiveid_en; + unsigned vgt_reuse_off; + unsigned spi_vs_out_config; + unsigned spi_shader_pos_format; + unsigned pa_cl_vte_cntl; + unsigned ge_pc_alloc; /* uconfig register */ + } vs; + + struct { + unsigned spi_ps_input_ena; + unsigned spi_ps_input_addr; + unsigned spi_baryc_cntl; + unsigned spi_ps_in_control; + unsigned spi_shader_z_format; + unsigned spi_shader_col_format; + unsigned cb_shader_mask; + } ps; + } ctx_reg; + + /*For save precompute registers value */ + unsigned vgt_tf_param; /* VGT_TF_PARAM */ + unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */ + unsigned pa_cl_vs_out_cntl; + unsigned ge_cntl; }; struct si_shader_part { - struct si_shader_part *next; - union si_shader_part_key key; - struct si_shader_binary binary; - struct ac_shader_config config; + struct si_shader_part *next; + union si_shader_part_key key; + struct si_shader_binary binary; + struct ac_shader_config config; }; /* si_shader.c */ -bool si_compile_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug); -bool si_create_shader_variant(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug); +bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug); +bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug); void si_shader_destroy(struct si_shader *shader); unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index); -unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, - unsigned is_varying); +unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, unsigned is_varying); bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader, - uint64_t scratch_va); + uint64_t scratch_va); void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, - struct pipe_debug_callback *debug, - FILE *f, bool check_debug_option); -void si_shader_dump_stats_for_shader_db(struct si_screen *screen, - struct si_shader *shader, - struct pipe_debug_callback *debug); -void si_multiwave_lds_size_workaround(struct si_screen *sscreen, - unsigned *lds_size); + struct pipe_debug_callback *debug, FILE *f, bool check_debug_option); +void si_shader_dump_stats_for_shader_db(struct si_screen 
*screen, struct si_shader *shader, + struct pipe_debug_callback *debug); +void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size); const char *si_get_shader_name(const struct si_shader *shader); void si_shader_binary_clean(struct si_shader_binary *binary); /* si_shader_llvm_gs.c */ -struct si_shader * -si_generate_gs_copy_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader_selector *gs_selector, - struct pipe_debug_callback *debug); +struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader_selector *gs_selector, + struct pipe_debug_callback *debug); /* si_shader_nir.c */ -void si_nir_scan_shader(const struct nir_shader *nir, - struct si_shader_info *info); +void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info); void si_nir_adjust_driver_locations(struct nir_shader *nir); void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize); /* si_state_shaders.c */ -void gfx9_get_gs_info(struct si_shader_selector *es, - struct si_shader_selector *gs, - struct gfx9_gs_info *out); +void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, + struct gfx9_gs_info *out); /* Inline helpers. */ /* Return the pointer to the main shader part's pointer. */ -static inline struct si_shader ** -si_get_main_shader_part(struct si_shader_selector *sel, - struct si_shader_key *key) +static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel, + struct si_shader_key *key) { - if (key->as_ls) - return &sel->main_shader_part_ls; - if (key->as_es && key->as_ngg) - return &sel->main_shader_part_ngg_es; - if (key->as_es) - return &sel->main_shader_part_es; - if (key->as_ngg) - return &sel->main_shader_part_ngg; - return &sel->main_shader_part; + if (key->as_ls) + return &sel->main_shader_part_ls; + if (key->as_es && key->as_ngg) + return &sel->main_shader_part_ngg_es; + if (key->as_es) + return &sel->main_shader_part_es; + if (key->as_ngg) + return &sel->main_shader_part_ngg; + return &sel->main_shader_part; } -static inline bool -gfx10_is_ngg_passthrough(struct si_shader *shader) +static inline bool gfx10_is_ngg_passthrough(struct si_shader *shader) { - struct si_shader_selector *sel = shader->selector; - - return sel->type != PIPE_SHADER_GEOMETRY && - !sel->so.num_outputs && - !sel->info.writes_edgeflag && - !shader->key.opt.ngg_culling && - (sel->type != PIPE_SHADER_VERTEX || - !shader->key.mono.u.vs_export_prim_id); + struct si_shader_selector *sel = shader->selector; + + return sel->type != PIPE_SHADER_GEOMETRY && !sel->so.num_outputs && !sel->info.writes_edgeflag && + !shader->key.opt.ngg_culling && + (sel->type != PIPE_SHADER_VERTEX || !shader->key.mono.u.vs_export_prim_id); } -static inline bool -si_shader_uses_bindless_samplers(struct si_shader_selector *selector) +static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector) { - return selector ? selector->info.uses_bindless_samplers : false; + return selector ? selector->info.uses_bindless_samplers : false; } -static inline bool -si_shader_uses_bindless_images(struct si_shader_selector *selector) +static inline bool si_shader_uses_bindless_images(struct si_shader_selector *selector) { - return selector ? selector->info.uses_bindless_images : false; + return selector ? 
selector->info.uses_bindless_images : false; } #endif diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 47173142d44..2191604b706 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -25,8 +25,8 @@ #ifndef SI_SHADER_PRIVATE_H #define SI_SHADER_PRIVATE_H -#include "si_shader.h" #include "ac_shader_abi.h" +#include "si_shader.h" struct pipe_debug_callback; @@ -38,275 +38,245 @@ struct pipe_debug_callback; #define PS_EPILOG_SAMPLEMASK_MIN_LOC 14 struct si_shader_output_values { - LLVMValueRef values[4]; - unsigned semantic_name; - unsigned semantic_index; - ubyte vertex_stream[4]; + LLVMValueRef values[4]; + unsigned semantic_name; + unsigned semantic_index; + ubyte vertex_stream[4]; }; struct si_shader_context { - struct ac_llvm_context ac; - struct si_shader *shader; - struct si_screen *screen; - - unsigned type; /* PIPE_SHADER_* specifies the type of shader. */ - - /* For clamping the non-constant index in resource indexing: */ - unsigned num_const_buffers; - unsigned num_shader_buffers; - unsigned num_images; - unsigned num_samplers; - - struct ac_shader_args args; - struct ac_shader_abi abi; - - LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS]; - - LLVMBasicBlockRef merged_wrap_if_entry_block; - int merged_wrap_if_label; - - LLVMValueRef main_fn; - LLVMTypeRef return_type; - - struct ac_arg const_and_shader_buffers; - struct ac_arg samplers_and_images; - - /* For merged shaders, the per-stage descriptors for the stage other - * than the one we're processing, used to pass them through from the - * first stage to the second. - */ - struct ac_arg other_const_and_shader_buffers; - struct ac_arg other_samplers_and_images; - - struct ac_arg rw_buffers; - struct ac_arg bindless_samplers_and_images; - /* Common inputs for merged shaders. */ - struct ac_arg merged_wave_info; - struct ac_arg merged_scratch_offset; - struct ac_arg small_prim_cull_info; - /* API VS */ - struct ac_arg vertex_buffers; - struct ac_arg vb_descriptors[5]; - struct ac_arg rel_auto_id; - struct ac_arg vs_prim_id; - struct ac_arg vertex_index0; - /* VS states and layout of LS outputs / TCS inputs at the end - * [0] = clamp vertex color - * [1] = indexed - * [2:3] = NGG: output primitive type - * [4:5] = NGG: provoking vertex index - * [6] = NGG: streamout queries enabled - * [7:10] = NGG: small prim filter precision = num_samples / quant_mode, - * but in reality it's: 1/2^n, from 1/16 to 1/4096 = 1/2^4 to 1/2^12 - * Only the first 4 bits of the exponent are stored. 
- * Set it like this: (fui(num_samples / quant_mode) >> 23) - * Expand to FP32 like this: ((0x70 | value) << 23); - * With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15) - * = 1/2^(15 - value) in FP32 - * [11:23] = stride between patches in DW = num_inputs * num_vertices * 4 - * max = 32*32*4 + 32*4 - * [24:31] = stride between vertices in DW = num_inputs * 4 - * max = 32*4 - */ - struct ac_arg vs_state_bits; - struct ac_arg vs_blit_inputs; - struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */ - /* HW VS */ - struct ac_arg streamout_config; - struct ac_arg streamout_write_index; - struct ac_arg streamout_offset[4]; - - /* API TCS & TES */ - /* Layout of TCS outputs in the offchip buffer - * # 6 bits - * [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40) - * # 6 bits - * [6:11] = the number of output vertices per patch, max = 32 - * # 20 bits - * [12:31] = the offset of per patch attributes in the buffer in bytes. - * max = NUM_PATCHES*32*32*16 - */ - struct ac_arg tcs_offchip_layout; - - /* API TCS */ - /* Offsets where TCS outputs and TCS patch outputs live in LDS: - * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 - * [16:31] = TCS output patch0 offset for per-patch / 16 - * max = (NUM_PATCHES + 1) * 32*32 - */ - struct ac_arg tcs_out_lds_offsets; - /* Layout of TCS outputs / TES inputs: - * [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4 - * max = 32*32*4 + 32*4 - * [13:18] = gl_PatchVerticesIn, max = 32 - * [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers - */ - struct ac_arg tcs_out_lds_layout; - struct ac_arg tcs_offchip_offset; - struct ac_arg tcs_factor_offset; - - /* API TES */ - struct ac_arg tes_offchip_addr; - struct ac_arg tes_u; - struct ac_arg tes_v; - struct ac_arg tes_rel_patch_id; - /* HW ES */ - struct ac_arg es2gs_offset; - /* HW GS */ - /* On gfx10: - * - bits 0..11: ordered_wave_id - * - bits 12..20: number of vertices in group - * - bits 22..30: number of primitives in group - */ - struct ac_arg gs_tg_info; - /* API GS */ - struct ac_arg gs2vs_offset; - struct ac_arg gs_wave_id; /* GFX6 */ - struct ac_arg gs_vtx_offset[6]; /* in dwords (GFX6) */ - struct ac_arg gs_vtx01_offset; /* in dwords (GFX9) */ - struct ac_arg gs_vtx23_offset; /* in dwords (GFX9) */ - struct ac_arg gs_vtx45_offset; /* in dwords (GFX9) */ - /* PS */ - struct ac_arg pos_fixed_pt; - /* CS */ - struct ac_arg block_size; - struct ac_arg cs_user_data; - - struct ac_llvm_compiler *compiler; - - /* Preloaded descriptors. */ - LLVMValueRef esgs_ring; - LLVMValueRef gsvs_ring[4]; - LLVMValueRef tess_offchip_ring; - - LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */ - LLVMValueRef gs_next_vertex[4]; - LLVMValueRef gs_curprim_verts[4]; - LLVMValueRef gs_generated_prims[4]; - LLVMValueRef gs_ngg_emit; - LLVMValueRef gs_ngg_scratch; - LLVMValueRef postponed_kill; - LLVMValueRef return_value; + struct ac_llvm_context ac; + struct si_shader *shader; + struct si_screen *screen; + + unsigned type; /* PIPE_SHADER_* specifies the type of shader. 
*/ + + /* For clamping the non-constant index in resource indexing: */ + unsigned num_const_buffers; + unsigned num_shader_buffers; + unsigned num_images; + unsigned num_samplers; + + struct ac_shader_args args; + struct ac_shader_abi abi; + + LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS]; + + LLVMBasicBlockRef merged_wrap_if_entry_block; + int merged_wrap_if_label; + + LLVMValueRef main_fn; + LLVMTypeRef return_type; + + struct ac_arg const_and_shader_buffers; + struct ac_arg samplers_and_images; + + /* For merged shaders, the per-stage descriptors for the stage other + * than the one we're processing, used to pass them through from the + * first stage to the second. + */ + struct ac_arg other_const_and_shader_buffers; + struct ac_arg other_samplers_and_images; + + struct ac_arg rw_buffers; + struct ac_arg bindless_samplers_and_images; + /* Common inputs for merged shaders. */ + struct ac_arg merged_wave_info; + struct ac_arg merged_scratch_offset; + struct ac_arg small_prim_cull_info; + /* API VS */ + struct ac_arg vertex_buffers; + struct ac_arg vb_descriptors[5]; + struct ac_arg rel_auto_id; + struct ac_arg vs_prim_id; + struct ac_arg vertex_index0; + /* VS states and layout of LS outputs / TCS inputs at the end + * [0] = clamp vertex color + * [1] = indexed + * [2:3] = NGG: output primitive type + * [4:5] = NGG: provoking vertex index + * [6] = NGG: streamout queries enabled + * [7:10] = NGG: small prim filter precision = num_samples / quant_mode, + * but in reality it's: 1/2^n, from 1/16 to 1/4096 = 1/2^4 to 1/2^12 + * Only the first 4 bits of the exponent are stored. + * Set it like this: (fui(num_samples / quant_mode) >> 23) + * Expand to FP32 like this: ((0x70 | value) << 23); + * With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15) + * = 1/2^(15 - value) in FP32 + * [11:23] = stride between patches in DW = num_inputs * num_vertices * 4 + * max = 32*32*4 + 32*4 + * [24:31] = stride between vertices in DW = num_inputs * 4 + * max = 32*4 + */ + struct ac_arg vs_state_bits; + struct ac_arg vs_blit_inputs; + struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */ + /* HW VS */ + struct ac_arg streamout_config; + struct ac_arg streamout_write_index; + struct ac_arg streamout_offset[4]; + + /* API TCS & TES */ + /* Layout of TCS outputs in the offchip buffer + * # 6 bits + * [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40) + * # 6 bits + * [6:11] = the number of output vertices per patch, max = 32 + * # 20 bits + * [12:31] = the offset of per patch attributes in the buffer in bytes. 
+ * max = NUM_PATCHES*32*32*16 + */ + struct ac_arg tcs_offchip_layout; + + /* API TCS */ + /* Offsets where TCS outputs and TCS patch outputs live in LDS: + * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 + * [16:31] = TCS output patch0 offset for per-patch / 16 + * max = (NUM_PATCHES + 1) * 32*32 + */ + struct ac_arg tcs_out_lds_offsets; + /* Layout of TCS outputs / TES inputs: + * [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4 + * max = 32*32*4 + 32*4 + * [13:18] = gl_PatchVerticesIn, max = 32 + * [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers + */ + struct ac_arg tcs_out_lds_layout; + struct ac_arg tcs_offchip_offset; + struct ac_arg tcs_factor_offset; + + /* API TES */ + struct ac_arg tes_offchip_addr; + struct ac_arg tes_u; + struct ac_arg tes_v; + struct ac_arg tes_rel_patch_id; + /* HW ES */ + struct ac_arg es2gs_offset; + /* HW GS */ + /* On gfx10: + * - bits 0..11: ordered_wave_id + * - bits 12..20: number of vertices in group + * - bits 22..30: number of primitives in group + */ + struct ac_arg gs_tg_info; + /* API GS */ + struct ac_arg gs2vs_offset; + struct ac_arg gs_wave_id; /* GFX6 */ + struct ac_arg gs_vtx_offset[6]; /* in dwords (GFX6) */ + struct ac_arg gs_vtx01_offset; /* in dwords (GFX9) */ + struct ac_arg gs_vtx23_offset; /* in dwords (GFX9) */ + struct ac_arg gs_vtx45_offset; /* in dwords (GFX9) */ + /* PS */ + struct ac_arg pos_fixed_pt; + /* CS */ + struct ac_arg block_size; + struct ac_arg cs_user_data; + + struct ac_llvm_compiler *compiler; + + /* Preloaded descriptors. */ + LLVMValueRef esgs_ring; + LLVMValueRef gsvs_ring[4]; + LLVMValueRef tess_offchip_ring; + + LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */ + LLVMValueRef gs_next_vertex[4]; + LLVMValueRef gs_curprim_verts[4]; + LLVMValueRef gs_generated_prims[4]; + LLVMValueRef gs_ngg_emit; + LLVMValueRef gs_ngg_scratch; + LLVMValueRef postponed_kill; + LLVMValueRef return_value; }; -static inline struct si_shader_context * -si_shader_context_from_abi(struct ac_shader_abi *abi) +static inline struct si_shader_context *si_shader_context_from_abi(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = NULL; - return container_of(abi, ctx, abi); + struct si_shader_context *ctx = NULL; + return container_of(abi, ctx, abi); } bool si_is_multi_part_shader(struct si_shader *shader); bool si_is_merged_shader(struct si_shader *shader); -void si_add_arg_checked(struct ac_shader_args *args, - enum ac_arg_regfile file, - unsigned registers, enum ac_arg_type type, - struct ac_arg *arg, - unsigned idx); +void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers, + enum ac_arg_type type, struct ac_arg *arg, unsigned idx); unsigned si_get_max_workgroup_size(const struct si_shader *shader); bool si_need_ps_prolog(const union si_shader_part_key *key); -void si_get_ps_prolog_key(struct si_shader *shader, - union si_shader_part_key *key, - bool separate_prolog); -void si_get_ps_epilog_key(struct si_shader *shader, - union si_shader_part_key *key); +void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key, + bool separate_prolog); +void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key); void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader); void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader); bool gfx10_ngg_export_prim_early(struct si_shader *shader); void 
gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx); -void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, - LLVMValueRef user_edgeflags[3], - LLVMValueRef prim_passthrough); -void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs); -void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs); -void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, - unsigned stream, - LLVMValueRef *addrs); +void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3], + LLVMValueRef prim_passthrough); +void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs); +void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs); void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx); void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx); void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader); /* si_shader_llvm.c */ -bool si_compile_llvm(struct si_screen *sscreen, - struct si_shader_binary *binary, - struct ac_shader_config *conf, - struct ac_llvm_compiler *compiler, - struct ac_llvm_context *ac, - struct pipe_debug_callback *debug, - enum pipe_shader_type shader_type, - const char *name, - bool less_optimized); -void si_llvm_context_init(struct si_shader_context *ctx, - struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - unsigned wave_size); -void si_llvm_create_func(struct si_shader_context *ctx, const char *name, - LLVMTypeRef *return_types, unsigned num_return_elems, - unsigned max_workgroup_size); +bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary, + struct ac_shader_config *conf, struct ac_llvm_compiler *compiler, + struct ac_llvm_context *ac, struct pipe_debug_callback *debug, + enum pipe_shader_type shader_type, const char *name, bool less_optimized); +void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, unsigned wave_size); +void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types, + unsigned num_return_elems, unsigned max_workgroup_size); void si_llvm_optimize_module(struct si_shader_context *ctx); void si_llvm_dispose(struct si_shader_context *ctx); -LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, - LLVMValueRef resource, LLVMValueRef offset); +LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource, + LLVMValueRef offset); void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret); LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index); + struct ac_arg param, unsigned return_index); LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index); + struct ac_arg param, unsigned return_index); LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index); + struct ac_arg param, unsigned return_index); LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx); -LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, - LLVMTypeRef type, LLVMValueRef val1, - LLVMValueRef 
val2); +LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type, + LLVMValueRef val1, LLVMValueRef val2); void si_llvm_emit_barrier(struct si_shader_context *ctx); void si_llvm_declare_esgs_ring(struct si_shader_context *ctx); void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, - unsigned bitoffset); -LLVMValueRef si_unpack_param(struct si_shader_context *ctx, - struct ac_arg param, unsigned rshift, - unsigned bitwidth); -LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, - unsigned swizzle); + unsigned bitoffset); +LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift, + unsigned bitwidth); +LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle); LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi); void si_llvm_declare_compute_memory(struct si_shader_context *ctx); bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir); void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts, - unsigned num_parts, unsigned main_part, - unsigned next_shader_first_part); + unsigned num_parts, unsigned main_part, + unsigned next_shader_first_part); /* si_shader_llvm_gs.c */ LLVMValueRef si_is_es_thread(struct si_shader_context *ctx); LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx); -void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs); +void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); void si_preload_esgs_ring(struct si_shader_context *ctx); void si_preload_gs_rings(struct si_shader_context *ctx); -void si_llvm_build_gs_prolog(struct si_shader_context *ctx, - union si_shader_part_key *key); +void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_gs_callbacks(struct si_shader_context *ctx); /* si_shader_llvm_tess.c */ void si_llvm_preload_tes_rings(struct si_shader_context *ctx); -void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs); -void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, - union si_shader_part_key *key); +void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx); void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); /* si_shader_llvm_ps.c */ LLVMValueRef si_get_sample_id(struct si_shader_context *ctx); -void si_llvm_build_ps_prolog(struct si_shader_context *ctx, - union si_shader_part_key *key); -void si_llvm_build_ps_epilog(struct si_shader_context *ctx, - union si_shader_part_key *key); -void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, - struct si_shader *shader); +void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key); +void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key); +void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader *shader); void si_llvm_init_ps_callbacks(struct si_shader_context *ctx); /* si_shader_llvm_resources.c */ @@ -314,21 +284,16 @@ void si_llvm_init_resource_callbacks(struct si_shader_context *ctx); /* si_shader_llvm_vs.c */ void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir); -void 
si_llvm_streamout_store_output(struct si_shader_context *ctx, - LLVMValueRef const *so_buffers, - LLVMValueRef const *so_write_offsets, - struct pipe_stream_output *stream_out, - struct si_shader_output_values *shader_out); -void si_llvm_emit_streamout(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput, unsigned stream); +void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers, + LLVMValueRef const *so_write_offsets, + struct pipe_stream_output *stream_out, + struct si_shader_output_values *shader_out); +void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs, + unsigned noutput, unsigned stream); void si_llvm_build_vs_exports(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput); -void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs); -void si_llvm_build_vs_prolog(struct si_shader_context *ctx, - union si_shader_part_key *key); + struct si_shader_output_values *outputs, unsigned noutput); +void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); #endif diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index dca604afe40..d8bcb4ad55c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -22,298 +22,272 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_shader_internal.h" -#include "si_pipe.h" -#include "ac_rtld.h" #include "ac_nir_to_llvm.h" +#include "ac_rtld.h" +#include "si_pipe.h" +#include "si_shader_internal.h" #include "sid.h" - #include "tgsi/tgsi_from_mesa.h" #include "util/u_memory.h" struct si_llvm_diagnostics { - struct pipe_debug_callback *debug; - unsigned retval; + struct pipe_debug_callback *debug; + unsigned retval; }; static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context) { - struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context; - LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); - const char *severity_str = NULL; - - switch (severity) { - case LLVMDSError: - severity_str = "error"; - break; - case LLVMDSWarning: - severity_str = "warning"; - break; - case LLVMDSRemark: - case LLVMDSNote: - default: - return; - } - - char *description = LLVMGetDiagInfoDescription(di); - - pipe_debug_message(diag->debug, SHADER_INFO, - "LLVM diagnostic (%s): %s", severity_str, description); - - if (severity == LLVMDSError) { - diag->retval = 1; - fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", description); - } - - LLVMDisposeMessage(description); + struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context; + LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); + const char *severity_str = NULL; + + switch (severity) { + case LLVMDSError: + severity_str = "error"; + break; + case LLVMDSWarning: + severity_str = "warning"; + break; + case LLVMDSRemark: + case LLVMDSNote: + default: + return; + } + + char *description = LLVMGetDiagInfoDescription(di); + + pipe_debug_message(diag->debug, SHADER_INFO, "LLVM diagnostic (%s): %s", severity_str, + description); + + if (severity == LLVMDSError) { + diag->retval = 1; + fprintf(stderr, "LLVM triggered 
Diagnostic Handler: %s\n", description); + } + + LLVMDisposeMessage(description); } -bool si_compile_llvm(struct si_screen *sscreen, - struct si_shader_binary *binary, - struct ac_shader_config *conf, - struct ac_llvm_compiler *compiler, - struct ac_llvm_context *ac, - struct pipe_debug_callback *debug, - enum pipe_shader_type shader_type, - const char *name, - bool less_optimized) +bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary, + struct ac_shader_config *conf, struct ac_llvm_compiler *compiler, + struct ac_llvm_context *ac, struct pipe_debug_callback *debug, + enum pipe_shader_type shader_type, const char *name, bool less_optimized) { - unsigned count = p_atomic_inc_return(&sscreen->num_compilations); - - if (si_can_dump_shader(sscreen, shader_type)) { - fprintf(stderr, "radeonsi: Compiling shader %d\n", count); - - if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { - fprintf(stderr, "%s LLVM IR:\n\n", name); - ac_dump_module(ac->module); - fprintf(stderr, "\n"); - } - } - - if (sscreen->record_llvm_ir) { - char *ir = LLVMPrintModuleToString(ac->module); - binary->llvm_ir_string = strdup(ir); - LLVMDisposeMessage(ir); - } - - if (!si_replace_shader(count, binary)) { - struct ac_compiler_passes *passes = compiler->passes; - - if (ac->wave_size == 32) - passes = compiler->passes_wave32; - else if (less_optimized && compiler->low_opt_passes) - passes = compiler->low_opt_passes; - - struct si_llvm_diagnostics diag = {debug}; - LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag); - - if (!ac_compile_module_to_elf(passes, ac->module, - (char **)&binary->elf_buffer, - &binary->elf_size)) - diag.retval = 1; - - if (diag.retval != 0) { - pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed"); - return false; - } - } - - struct ac_rtld_binary rtld; - if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ - .info = &sscreen->info, - .shader_type = tgsi_processor_to_shader_stage(shader_type), - .wave_size = ac->wave_size, - .num_parts = 1, - .elf_ptrs = &binary->elf_buffer, - .elf_sizes = &binary->elf_size })) - return false; - - bool ok = ac_rtld_read_config(&rtld, conf); - ac_rtld_close(&rtld); - return ok; + unsigned count = p_atomic_inc_return(&sscreen->num_compilations); + + if (si_can_dump_shader(sscreen, shader_type)) { + fprintf(stderr, "radeonsi: Compiling shader %d\n", count); + + if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { + fprintf(stderr, "%s LLVM IR:\n\n", name); + ac_dump_module(ac->module); + fprintf(stderr, "\n"); + } + } + + if (sscreen->record_llvm_ir) { + char *ir = LLVMPrintModuleToString(ac->module); + binary->llvm_ir_string = strdup(ir); + LLVMDisposeMessage(ir); + } + + if (!si_replace_shader(count, binary)) { + struct ac_compiler_passes *passes = compiler->passes; + + if (ac->wave_size == 32) + passes = compiler->passes_wave32; + else if (less_optimized && compiler->low_opt_passes) + passes = compiler->low_opt_passes; + + struct si_llvm_diagnostics diag = {debug}; + LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag); + + if (!ac_compile_module_to_elf(passes, ac->module, (char **)&binary->elf_buffer, + &binary->elf_size)) + diag.retval = 1; + + if (diag.retval != 0) { + pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed"); + return false; + } + } + + struct ac_rtld_binary rtld; + if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ + .info = &sscreen->info, + .shader_type = tgsi_processor_to_shader_stage(shader_type), + .wave_size = 
ac->wave_size, + .num_parts = 1, + .elf_ptrs = &binary->elf_buffer, + .elf_sizes = &binary->elf_size})) + return false; + + bool ok = ac_rtld_read_config(&rtld, conf); + ac_rtld_close(&rtld); + return ok; } -void si_llvm_context_init(struct si_shader_context *ctx, - struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - unsigned wave_size) +void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, unsigned wave_size) { - memset(ctx, 0, sizeof(*ctx)); - ctx->screen = sscreen; - ctx->compiler = compiler; - - ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, - sscreen->info.family, - AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, - wave_size, 64); + memset(ctx, 0, sizeof(*ctx)); + ctx->screen = sscreen; + ctx->compiler = compiler; + + ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, sscreen->info.family, + AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, wave_size, 64); } -void si_llvm_create_func(struct si_shader_context *ctx, const char *name, - LLVMTypeRef *return_types, unsigned num_return_elems, - unsigned max_workgroup_size) +void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types, + unsigned num_return_elems, unsigned max_workgroup_size) { - LLVMTypeRef ret_type; - enum ac_llvm_calling_convention call_conv; - enum pipe_shader_type real_shader_type; - - if (num_return_elems) - ret_type = LLVMStructTypeInContext(ctx->ac.context, - return_types, - num_return_elems, true); - else - ret_type = ctx->ac.voidt; - - real_shader_type = ctx->type; - - /* LS is merged into HS (TCS), and ES is merged into GS. */ - if (ctx->screen->info.chip_class >= GFX9) { - if (ctx->shader->key.as_ls) - real_shader_type = PIPE_SHADER_TESS_CTRL; - else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg) - real_shader_type = PIPE_SHADER_GEOMETRY; - } - - switch (real_shader_type) { - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_TESS_EVAL: - call_conv = AC_LLVM_AMDGPU_VS; - break; - case PIPE_SHADER_TESS_CTRL: - call_conv = AC_LLVM_AMDGPU_HS; - break; - case PIPE_SHADER_GEOMETRY: - call_conv = AC_LLVM_AMDGPU_GS; - break; - case PIPE_SHADER_FRAGMENT: - call_conv = AC_LLVM_AMDGPU_PS; - break; - case PIPE_SHADER_COMPUTE: - call_conv = AC_LLVM_AMDGPU_CS; - break; - default: - unreachable("Unhandle shader type"); - } - - /* Setup the function */ - ctx->return_type = ret_type; - ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, - ret_type, ctx->ac.module); - ctx->return_value = LLVMGetUndef(ctx->return_type); - - if (ctx->screen->info.address32_hi) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, - "amdgpu-32bit-address-high-bits", - ctx->screen->info.address32_hi); - } - - LLVMAddTargetDependentFunctionAttr(ctx->main_fn, - "no-signed-zeros-fp-math", - "true"); - - ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size); + LLVMTypeRef ret_type; + enum ac_llvm_calling_convention call_conv; + enum pipe_shader_type real_shader_type; + + if (num_return_elems) + ret_type = LLVMStructTypeInContext(ctx->ac.context, return_types, num_return_elems, true); + else + ret_type = ctx->ac.voidt; + + real_shader_type = ctx->type; + + /* LS is merged into HS (TCS), and ES is merged into GS. 
*/ + if (ctx->screen->info.chip_class >= GFX9) { + if (ctx->shader->key.as_ls) + real_shader_type = PIPE_SHADER_TESS_CTRL; + else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg) + real_shader_type = PIPE_SHADER_GEOMETRY; + } + + switch (real_shader_type) { + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_TESS_EVAL: + call_conv = AC_LLVM_AMDGPU_VS; + break; + case PIPE_SHADER_TESS_CTRL: + call_conv = AC_LLVM_AMDGPU_HS; + break; + case PIPE_SHADER_GEOMETRY: + call_conv = AC_LLVM_AMDGPU_GS; + break; + case PIPE_SHADER_FRAGMENT: + call_conv = AC_LLVM_AMDGPU_PS; + break; + case PIPE_SHADER_COMPUTE: + call_conv = AC_LLVM_AMDGPU_CS; + break; + default: + unreachable("Unhandle shader type"); + } + + /* Setup the function */ + ctx->return_type = ret_type; + ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, ret_type, ctx->ac.module); + ctx->return_value = LLVMGetUndef(ctx->return_type); + + if (ctx->screen->info.address32_hi) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-32bit-address-high-bits", + ctx->screen->info.address32_hi); + } + + LLVMAddTargetDependentFunctionAttr(ctx->main_fn, "no-signed-zeros-fp-math", "true"); + + ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size); } void si_llvm_optimize_module(struct si_shader_context *ctx) { - /* Dump LLVM IR before any optimization passes */ - if (ctx->screen->debug_flags & DBG(PREOPT_IR) && - si_can_dump_shader(ctx->screen, ctx->type)) - LLVMDumpModule(ctx->ac.module); - - /* Run the pass */ - LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module); - LLVMDisposeBuilder(ctx->ac.builder); + /* Dump LLVM IR before any optimization passes */ + if (ctx->screen->debug_flags & DBG(PREOPT_IR) && si_can_dump_shader(ctx->screen, ctx->type)) + LLVMDumpModule(ctx->ac.module); + + /* Run the pass */ + LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module); + LLVMDisposeBuilder(ctx->ac.builder); } void si_llvm_dispose(struct si_shader_context *ctx) { - LLVMDisposeModule(ctx->ac.module); - LLVMContextDispose(ctx->ac.context); - ac_llvm_context_dispose(&ctx->ac); + LLVMDisposeModule(ctx->ac.module); + LLVMContextDispose(ctx->ac.context); + ac_llvm_context_dispose(&ctx->ac); } /** * Load a dword from a constant buffer. 
*/ -LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, - LLVMValueRef resource, LLVMValueRef offset) +LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource, + LLVMValueRef offset) { - return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, - 0, 0, true, true); + return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, 0, 0, true, true); } void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) { - if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) - LLVMBuildRetVoid(ctx->ac.builder); - else - LLVMBuildRet(ctx->ac.builder, ret); + if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) + LLVMBuildRetVoid(ctx->ac.builder); + else + LLVMBuildRet(ctx->ac.builder, ret); } LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index) + struct ac_arg param, unsigned return_index) { - return LLVMBuildInsertValue(ctx->ac.builder, ret, - ac_get_arg(&ctx->ac, param), - return_index, ""); + return LLVMBuildInsertValue(ctx->ac.builder, ret, ac_get_arg(&ctx->ac, param), return_index, ""); } LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index) + struct ac_arg param, unsigned return_index) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef p = ac_get_arg(&ctx->ac, param); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef p = ac_get_arg(&ctx->ac, param); - return LLVMBuildInsertValue(builder, ret, - ac_to_float(&ctx->ac, p), - return_index, ""); + return LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, p), return_index, ""); } LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index) + struct ac_arg param, unsigned return_index) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef ptr = ac_get_arg(&ctx->ac, param); - ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, ""); - return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef ptr = ac_get_arg(&ctx->ac, param); + ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, ""); + return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); } LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx) { - LLVMValueRef ptr[2], list; - bool merged_shader = si_is_merged_shader(ctx->shader); + LLVMValueRef ptr[2], list; + bool merged_shader = si_is_merged_shader(ctx->shader); - ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS); - list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], - ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); - return list; + ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 
8 : 0) + SI_SGPR_RW_BUFFERS); + list = + LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + return list; } -LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, - LLVMTypeRef type, LLVMValueRef val1, - LLVMValueRef val2) +LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type, + LLVMValueRef val1, LLVMValueRef val2) { - LLVMValueRef values[2] = { - ac_to_integer(&ctx->ac, val1), - ac_to_integer(&ctx->ac, val2), - }; - LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); - return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); + LLVMValueRef values[2] = { + ac_to_integer(&ctx->ac, val1), + ac_to_integer(&ctx->ac, val2), + }; + LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); + return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); } void si_llvm_emit_barrier(struct si_shader_context *ctx) { - /* GFX6 only (thanks to a hw bug workaround): - * The real barrier instruction isn’t needed, because an entire patch - * always fits into a single wave. - */ - if (ctx->screen->info.chip_class == GFX6 && - ctx->type == PIPE_SHADER_TESS_CTRL) { - ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); - return; - } - - ac_build_s_barrier(&ctx->ac); + /* GFX6 only (thanks to a hw bug workaround): + * The real barrier instruction isn’t needed, because an entire patch + * always fits into a single wave. + */ + if (ctx->screen->info.chip_class == GFX6 && ctx->type == PIPE_SHADER_TESS_CTRL) { + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); + return; + } + + ac_build_s_barrier(&ctx->ac); } /* Ensure that the esgs ring is declared. @@ -323,187 +297,169 @@ void si_llvm_emit_barrier(struct si_shader_context *ctx) */ void si_llvm_declare_esgs_ring(struct si_shader_context *ctx) { - if (ctx->esgs_ring) - return; + if (ctx->esgs_ring) + return; - assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); + assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); - ctx->esgs_ring = LLVMAddGlobalInAddressSpace( - ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), - "esgs_ring", - AC_ADDR_SPACE_LDS); - LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); - LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); + ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), + "esgs_ring", AC_ADDR_SPACE_LDS); + LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); + LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); } -void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, - unsigned bitoffset) +void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, unsigned bitoffset) { - LLVMValueRef args[] = { - ac_get_arg(&ctx->ac, param), - LLVMConstInt(ctx->ac.i32, bitoffset, 0), - }; - ac_build_intrinsic(&ctx->ac, - "llvm.amdgcn.init.exec.from.input", - ctx->ac.voidt, args, 2, AC_FUNC_ATTR_CONVERGENT); + LLVMValueRef args[] = { + ac_get_arg(&ctx->ac, param), + LLVMConstInt(ctx->ac.i32, bitoffset, 0), + }; + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.init.exec.from.input", ctx->ac.voidt, args, 2, + AC_FUNC_ATTR_CONVERGENT); } /** * Get the value of a shader input parameter and extract a bitfield. 
*/ -static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, - LLVMValueRef value, unsigned rshift, - unsigned bitwidth) +static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, LLVMValueRef value, + unsigned rshift, unsigned bitwidth) { - if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) - value = ac_to_integer(&ctx->ac, value); + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) + value = ac_to_integer(&ctx->ac, value); - if (rshift) - value = LLVMBuildLShr(ctx->ac.builder, value, - LLVMConstInt(ctx->ac.i32, rshift, 0), ""); + if (rshift) + value = LLVMBuildLShr(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, rshift, 0), ""); - if (rshift + bitwidth < 32) { - unsigned mask = (1 << bitwidth) - 1; - value = LLVMBuildAnd(ctx->ac.builder, value, - LLVMConstInt(ctx->ac.i32, mask, 0), ""); - } + if (rshift + bitwidth < 32) { + unsigned mask = (1 << bitwidth) - 1; + value = LLVMBuildAnd(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, mask, 0), ""); + } - return value; + return value; } -LLVMValueRef si_unpack_param(struct si_shader_context *ctx, - struct ac_arg param, unsigned rshift, - unsigned bitwidth) +LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift, + unsigned bitwidth) { - LLVMValueRef value = ac_get_arg(&ctx->ac, param); + LLVMValueRef value = ac_get_arg(&ctx->ac, param); - return unpack_llvm_param(ctx, value, rshift, bitwidth); + return unpack_llvm_param(ctx, value, rshift, bitwidth); } -LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, - unsigned swizzle) +LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle) { - if (swizzle > 0) - return ctx->ac.i32_0; - - switch (ctx->type) { - case PIPE_SHADER_VERTEX: - return ac_get_arg(&ctx->ac, ctx->vs_prim_id); - case PIPE_SHADER_TESS_CTRL: - return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id); - case PIPE_SHADER_TESS_EVAL: - return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id); - case PIPE_SHADER_GEOMETRY: - return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id); - default: - assert(0); - return ctx->ac.i32_0; - } + if (swizzle > 0) + return ctx->ac.i32_0; + + switch (ctx->type) { + case PIPE_SHADER_VERTEX: + return ac_get_arg(&ctx->ac, ctx->vs_prim_id); + case PIPE_SHADER_TESS_CTRL: + return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id); + case PIPE_SHADER_TESS_EVAL: + return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id); + case PIPE_SHADER_GEOMETRY: + return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id); + default: + assert(0); + return ctx->ac.i32_0; + } } LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef values[3]; - LLVMValueRef result; - unsigned i; - unsigned *properties = ctx->shader->selector->info.properties; + LLVMValueRef values[3]; + LLVMValueRef result; + unsigned i; + unsigned *properties = ctx->shader->selector->info.properties; - if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { - unsigned sizes[3] = { - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] - }; + if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { + unsigned sizes[3] = {properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]}; - for (i = 0; i < 3; ++i) - values[i] = 
LLVMConstInt(ctx->ac.i32, sizes[i], 0); + for (i = 0; i < 3; ++i) + values[i] = LLVMConstInt(ctx->ac.i32, sizes[i], 0); - result = ac_build_gather_values(&ctx->ac, values, 3); - } else { - result = ac_get_arg(&ctx->ac, ctx->block_size); - } + result = ac_build_gather_values(&ctx->ac, values, 3); + } else { + result = ac_get_arg(&ctx->ac, ctx->block_size); + } - return result; + return result; } void si_llvm_declare_compute_memory(struct si_shader_context *ctx) { - struct si_shader_selector *sel = ctx->shader->selector; - unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]; + struct si_shader_selector *sel = ctx->shader->selector; + unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]; - LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); - LLVMValueRef var; + LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); + LLVMValueRef var; - assert(!ctx->ac.lds); + assert(!ctx->ac.lds); - var = LLVMAddGlobalInAddressSpace(ctx->ac.module, - LLVMArrayType(ctx->ac.i8, lds_size), - "compute_lds", - AC_ADDR_SPACE_LDS); - LLVMSetAlignment(var, 64 * 1024); + var = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i8, lds_size), + "compute_lds", AC_ADDR_SPACE_LDS); + LLVMSetAlignment(var, 64 * 1024); - ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); + ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); } bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) { - if (nir->info.stage == MESA_SHADER_VERTEX) { - si_llvm_load_vs_inputs(ctx, nir); - } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { - unsigned colors_read = - ctx->shader->selector->info.colors_read; - LLVMValueRef main_fn = ctx->main_fn; - - LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32); - - unsigned offset = SI_PARAM_POS_FIXED_PT + 1; - - if (colors_read & 0x0f) { - unsigned mask = colors_read & 0x0f; - LLVMValueRef values[4]; - values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; - values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; - values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; - values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; - ctx->abi.color0 = - ac_to_integer(&ctx->ac, - ac_build_gather_values(&ctx->ac, values, 4)); - } - if (colors_read & 0xf0) { - unsigned mask = (colors_read & 0xf0) >> 4; - LLVMValueRef values[4]; - values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; - values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; - values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; - values[3] = mask & 0x8 ? 
LLVMGetParam(main_fn, offset++) : undef; - ctx->abi.color1 = - ac_to_integer(&ctx->ac, - ac_build_gather_values(&ctx->ac, values, 4)); - } - - ctx->abi.interp_at_sample_force_center = - ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center; - } else if (nir->info.stage == MESA_SHADER_COMPUTE) { - if (nir->info.cs.user_data_components_amd) { - ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data); - ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data, - nir->info.cs.user_data_components_amd); - } - } - - ctx->abi.inputs = &ctx->inputs[0]; - ctx->abi.clamp_shadow_reference = true; - ctx->abi.robust_buffer_access = true; - - if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) { - assert(gl_shader_stage_is_compute(nir->info.stage)); - si_llvm_declare_compute_memory(ctx); - } - ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir); - - return true; + if (nir->info.stage == MESA_SHADER_VERTEX) { + si_llvm_load_vs_inputs(ctx, nir); + } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { + unsigned colors_read = ctx->shader->selector->info.colors_read; + LLVMValueRef main_fn = ctx->main_fn; + + LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32); + + unsigned offset = SI_PARAM_POS_FIXED_PT + 1; + + if (colors_read & 0x0f) { + unsigned mask = colors_read & 0x0f; + LLVMValueRef values[4]; + values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; + values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; + values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; + values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; + ctx->abi.color0 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4)); + } + if (colors_read & 0xf0) { + unsigned mask = (colors_read & 0xf0) >> 4; + LLVMValueRef values[4]; + values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; + values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; + values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; + values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; + ctx->abi.color1 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4)); + } + + ctx->abi.interp_at_sample_force_center = + ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center; + } else if (nir->info.stage == MESA_SHADER_COMPUTE) { + if (nir->info.cs.user_data_components_amd) { + ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data); + ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data, + nir->info.cs.user_data_components_amd); + } + } + + ctx->abi.inputs = &ctx->inputs[0]; + ctx->abi.clamp_shadow_reference = true; + ctx->abi.robust_buffer_access = true; + + if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) { + assert(gl_shader_stage_is_compute(nir->info.stage)); + si_llvm_declare_compute_memory(ctx); + } + ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir); + + return true; } /** @@ -511,278 +467,270 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) * runs them in sequence to form a monolithic shader. */ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts, - unsigned num_parts, unsigned main_part, - unsigned next_shader_first_part) + unsigned num_parts, unsigned main_part, + unsigned next_shader_first_part) { - LLVMBuilderRef builder = ctx->ac.builder; - /* PS epilog has one arg per color component; gfx9 merged shader - * prologs need to forward 40 SGPRs. 
- */ - LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS]; - LLVMTypeRef function_type; - unsigned num_first_params; - unsigned num_out, initial_num_out; - ASSERTED unsigned num_out_sgpr; /* used in debug checks */ - ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */ - unsigned num_sgprs, num_vgprs; - unsigned gprs; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - for (unsigned i = 0; i < num_parts; ++i) { - ac_add_function_attr(ctx->ac.context, parts[i], -1, - AC_FUNC_ATTR_ALWAYSINLINE); - LLVMSetLinkage(parts[i], LLVMPrivateLinkage); - } - - /* The parameters of the wrapper function correspond to those of the - * first part in terms of SGPRs and VGPRs, but we use the types of the - * main part to get the right types. This is relevant for the - * dereferenceable attribute on descriptor table pointers. - */ - num_sgprs = 0; - num_vgprs = 0; - - function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); - num_first_params = LLVMCountParamTypes(function_type); - - for (unsigned i = 0; i < num_first_params; ++i) { - LLVMValueRef param = LLVMGetParam(parts[0], i); - - if (ac_is_sgpr_param(param)) { - assert(num_vgprs == 0); - num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; - } else { - num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; - } - } - - gprs = 0; - while (gprs < num_sgprs + num_vgprs) { - LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count); - LLVMTypeRef type = LLVMTypeOf(param); - unsigned size = ac_get_type_size(type) / 4; - - /* This is going to get casted anyways, so we don't have to - * have the exact same type. But we do have to preserve the - * pointer-ness so that LLVM knows about it. - */ - enum ac_arg_type arg_type = AC_ARG_INT; - if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { - type = LLVMGetElementType(type); - - if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { - if (LLVMGetVectorSize(type) == 4) - arg_type = AC_ARG_CONST_DESC_PTR; - else if (LLVMGetVectorSize(type) == 8) - arg_type = AC_ARG_CONST_IMAGE_PTR; - else - assert(0); - } else if (type == ctx->ac.f32) { - arg_type = AC_ARG_CONST_FLOAT_PTR; - } else { - assert(0); - } - } - - ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR, - size, arg_type, NULL); - - assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); - assert(gprs + size <= num_sgprs + num_vgprs && - (gprs >= num_sgprs || gprs + size <= num_sgprs)); - - gprs += size; - } - - /* Prepare the return type. */ - unsigned num_returns = 0; - LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type; - - last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1])); - return_type = LLVMGetReturnType(last_func_type); - - switch (LLVMGetTypeKind(return_type)) { - case LLVMStructTypeKind: - num_returns = LLVMCountStructElementTypes(return_type); - assert(num_returns <= ARRAY_SIZE(returns)); - LLVMGetStructElementTypes(return_type, returns); - break; - case LLVMVoidTypeKind: - break; - default: - unreachable("unexpected type"); - } - - si_llvm_create_func(ctx, "wrapper", returns, num_returns, - si_get_max_workgroup_size(ctx->shader)); - - if (si_is_merged_shader(ctx->shader)) - ac_init_exec_full_mask(&ctx->ac); - - /* Record the arguments of the function as if they were an output of - * a previous part. - */ - num_out = 0; - num_out_sgpr = 0; - - for (unsigned i = 0; i < ctx->args.arg_count; ++i) { - LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); - LLVMTypeRef param_type = LLVMTypeOf(param); - LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? 
ctx->ac.i32 : ctx->ac.f32; - unsigned size = ac_get_type_size(param_type) / 4; - - if (size == 1) { - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, ""); - param_type = ctx->ac.i32; - } - - if (param_type != out_type) - param = LLVMBuildBitCast(builder, param, out_type, ""); - out[num_out++] = param; - } else { - LLVMTypeRef vector_type = LLVMVectorType(out_type, size); - - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, ""); - param_type = ctx->ac.i64; - } - - if (param_type != vector_type) - param = LLVMBuildBitCast(builder, param, vector_type, ""); - - for (unsigned j = 0; j < size; ++j) - out[num_out++] = LLVMBuildExtractElement( - builder, param, LLVMConstInt(ctx->ac.i32, j, 0), ""); - } - - if (ctx->args.args[i].file == AC_ARG_SGPR) - num_out_sgpr = num_out; - } - - memcpy(initial, out, sizeof(out)); - initial_num_out = num_out; - initial_num_out_sgpr = num_out_sgpr; - - /* Now chain the parts. */ - LLVMValueRef ret = NULL; - for (unsigned part = 0; part < num_parts; ++part) { - LLVMValueRef in[AC_MAX_ARGS]; - LLVMTypeRef ret_type; - unsigned out_idx = 0; - unsigned num_params = LLVMCountParams(parts[part]); - - /* Merged shaders are executed conditionally depending - * on the number of enabled threads passed in the input SGPRs. */ - if (si_is_multi_part_shader(ctx->shader) && part == 0) { - LLVMValueRef ena, count = initial[3]; - - count = LLVMBuildAnd(builder, count, - LLVMConstInt(ctx->ac.i32, 0x7f, 0), ""); - ena = LLVMBuildICmp(builder, LLVMIntULT, - ac_get_thread_id(&ctx->ac), count, ""); - ac_build_ifcc(&ctx->ac, ena, 6506); - } - - /* Derive arguments for the next part from outputs of the - * previous one. - */ - for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { - LLVMValueRef param; - LLVMTypeRef param_type; - bool is_sgpr; - unsigned param_size; - LLVMValueRef arg = NULL; - - param = LLVMGetParam(parts[part], param_idx); - param_type = LLVMTypeOf(param); - param_size = ac_get_type_size(param_type) / 4; - is_sgpr = ac_is_sgpr_param(param); - - if (is_sgpr) { - ac_add_function_attr(ctx->ac.context, parts[part], - param_idx + 1, AC_FUNC_ATTR_INREG); - } else if (out_idx < num_out_sgpr) { - /* Skip returned SGPRs the current part doesn't - * declare on the input. */ - out_idx = num_out_sgpr; - } - - assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); - - if (param_size == 1) - arg = out[out_idx]; - else - arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size); - - if (LLVMTypeOf(arg) != param_type) { - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - if (LLVMGetPointerAddressSpace(param_type) == - AC_ADDR_SPACE_CONST_32BIT) { - arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, ""); - arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); - } else { - arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, ""); - arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); - } - } else { - arg = LLVMBuildBitCast(builder, arg, param_type, ""); - } - } - - in[param_idx] = arg; - out_idx += param_size; - } - - ret = ac_build_call(&ctx->ac, parts[part], in, num_params); - - if (si_is_multi_part_shader(ctx->shader) && - part + 1 == next_shader_first_part) { - ac_build_endif(&ctx->ac, 6506); - - /* The second half of the merged shader should use - * the inputs from the toplevel (wrapper) function, - * not the return value from the last call. 
- * - * That's because the last call was executed condi- - * tionally, so we can't consume it in the main - * block. - */ - memcpy(out, initial, sizeof(initial)); - num_out = initial_num_out; - num_out_sgpr = initial_num_out_sgpr; - continue; - } - - /* Extract the returned GPRs. */ - ret_type = LLVMTypeOf(ret); - num_out = 0; - num_out_sgpr = 0; - - if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { - assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); - - unsigned ret_size = LLVMCountStructElementTypes(ret_type); - - for (unsigned i = 0; i < ret_size; ++i) { - LLVMValueRef val = - LLVMBuildExtractValue(builder, ret, i, ""); - - assert(num_out < ARRAY_SIZE(out)); - out[num_out++] = val; - - if (LLVMTypeOf(val) == ctx->ac.i32) { - assert(num_out_sgpr + 1 == num_out); - num_out_sgpr = num_out; - } - } - } - } - - /* Return the value from the last part. */ - if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) - LLVMBuildRetVoid(builder); - else - LLVMBuildRet(builder, ret); + LLVMBuilderRef builder = ctx->ac.builder; + /* PS epilog has one arg per color component; gfx9 merged shader + * prologs need to forward 40 SGPRs. + */ + LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS]; + LLVMTypeRef function_type; + unsigned num_first_params; + unsigned num_out, initial_num_out; + ASSERTED unsigned num_out_sgpr; /* used in debug checks */ + ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */ + unsigned num_sgprs, num_vgprs; + unsigned gprs; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + for (unsigned i = 0; i < num_parts; ++i) { + ac_add_function_attr(ctx->ac.context, parts[i], -1, AC_FUNC_ATTR_ALWAYSINLINE); + LLVMSetLinkage(parts[i], LLVMPrivateLinkage); + } + + /* The parameters of the wrapper function correspond to those of the + * first part in terms of SGPRs and VGPRs, but we use the types of the + * main part to get the right types. This is relevant for the + * dereferenceable attribute on descriptor table pointers. + */ + num_sgprs = 0; + num_vgprs = 0; + + function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); + num_first_params = LLVMCountParamTypes(function_type); + + for (unsigned i = 0; i < num_first_params; ++i) { + LLVMValueRef param = LLVMGetParam(parts[0], i); + + if (ac_is_sgpr_param(param)) { + assert(num_vgprs == 0); + num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; + } else { + num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; + } + } + + gprs = 0; + while (gprs < num_sgprs + num_vgprs) { + LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count); + LLVMTypeRef type = LLVMTypeOf(param); + unsigned size = ac_get_type_size(type) / 4; + + /* This is going to get casted anyways, so we don't have to + * have the exact same type. But we do have to preserve the + * pointer-ness so that LLVM knows about it. + */ + enum ac_arg_type arg_type = AC_ARG_INT; + if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { + type = LLVMGetElementType(type); + + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { + if (LLVMGetVectorSize(type) == 4) + arg_type = AC_ARG_CONST_DESC_PTR; + else if (LLVMGetVectorSize(type) == 8) + arg_type = AC_ARG_CONST_IMAGE_PTR; + else + assert(0); + } else if (type == ctx->ac.f32) { + arg_type = AC_ARG_CONST_FLOAT_PTR; + } else { + assert(0); + } + } + + ac_add_arg(&ctx->args, gprs < num_sgprs ? 
AC_ARG_SGPR : AC_ARG_VGPR, size, arg_type, NULL); + + assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); + assert(gprs + size <= num_sgprs + num_vgprs && + (gprs >= num_sgprs || gprs + size <= num_sgprs)); + + gprs += size; + } + + /* Prepare the return type. */ + unsigned num_returns = 0; + LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type; + + last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1])); + return_type = LLVMGetReturnType(last_func_type); + + switch (LLVMGetTypeKind(return_type)) { + case LLVMStructTypeKind: + num_returns = LLVMCountStructElementTypes(return_type); + assert(num_returns <= ARRAY_SIZE(returns)); + LLVMGetStructElementTypes(return_type, returns); + break; + case LLVMVoidTypeKind: + break; + default: + unreachable("unexpected type"); + } + + si_llvm_create_func(ctx, "wrapper", returns, num_returns, + si_get_max_workgroup_size(ctx->shader)); + + if (si_is_merged_shader(ctx->shader)) + ac_init_exec_full_mask(&ctx->ac); + + /* Record the arguments of the function as if they were an output of + * a previous part. + */ + num_out = 0; + num_out_sgpr = 0; + + for (unsigned i = 0; i < ctx->args.arg_count; ++i) { + LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); + LLVMTypeRef param_type = LLVMTypeOf(param); + LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32; + unsigned size = ac_get_type_size(param_type) / 4; + + if (size == 1) { + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, ""); + param_type = ctx->ac.i32; + } + + if (param_type != out_type) + param = LLVMBuildBitCast(builder, param, out_type, ""); + out[num_out++] = param; + } else { + LLVMTypeRef vector_type = LLVMVectorType(out_type, size); + + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, ""); + param_type = ctx->ac.i64; + } + + if (param_type != vector_type) + param = LLVMBuildBitCast(builder, param, vector_type, ""); + + for (unsigned j = 0; j < size; ++j) + out[num_out++] = + LLVMBuildExtractElement(builder, param, LLVMConstInt(ctx->ac.i32, j, 0), ""); + } + + if (ctx->args.args[i].file == AC_ARG_SGPR) + num_out_sgpr = num_out; + } + + memcpy(initial, out, sizeof(out)); + initial_num_out = num_out; + initial_num_out_sgpr = num_out_sgpr; + + /* Now chain the parts. */ + LLVMValueRef ret = NULL; + for (unsigned part = 0; part < num_parts; ++part) { + LLVMValueRef in[AC_MAX_ARGS]; + LLVMTypeRef ret_type; + unsigned out_idx = 0; + unsigned num_params = LLVMCountParams(parts[part]); + + /* Merged shaders are executed conditionally depending + * on the number of enabled threads passed in the input SGPRs. */ + if (si_is_multi_part_shader(ctx->shader) && part == 0) { + LLVMValueRef ena, count = initial[3]; + + count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), ""); + ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, ""); + ac_build_ifcc(&ctx->ac, ena, 6506); + } + + /* Derive arguments for the next part from outputs of the + * previous one. 
+ */ + for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { + LLVMValueRef param; + LLVMTypeRef param_type; + bool is_sgpr; + unsigned param_size; + LLVMValueRef arg = NULL; + + param = LLVMGetParam(parts[part], param_idx); + param_type = LLVMTypeOf(param); + param_size = ac_get_type_size(param_type) / 4; + is_sgpr = ac_is_sgpr_param(param); + + if (is_sgpr) { + ac_add_function_attr(ctx->ac.context, parts[part], param_idx + 1, AC_FUNC_ATTR_INREG); + } else if (out_idx < num_out_sgpr) { + /* Skip returned SGPRs the current part doesn't + * declare on the input. */ + out_idx = num_out_sgpr; + } + + assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); + + if (param_size == 1) + arg = out[out_idx]; + else + arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size); + + if (LLVMTypeOf(arg) != param_type) { + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + if (LLVMGetPointerAddressSpace(param_type) == AC_ADDR_SPACE_CONST_32BIT) { + arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, ""); + arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); + } else { + arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, ""); + arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); + } + } else { + arg = LLVMBuildBitCast(builder, arg, param_type, ""); + } + } + + in[param_idx] = arg; + out_idx += param_size; + } + + ret = ac_build_call(&ctx->ac, parts[part], in, num_params); + + if (si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) { + ac_build_endif(&ctx->ac, 6506); + + /* The second half of the merged shader should use + * the inputs from the toplevel (wrapper) function, + * not the return value from the last call. + * + * That's because the last call was executed condi- + * tionally, so we can't consume it in the main + * block. + */ + memcpy(out, initial, sizeof(initial)); + num_out = initial_num_out; + num_out_sgpr = initial_num_out_sgpr; + continue; + } + + /* Extract the returned GPRs. */ + ret_type = LLVMTypeOf(ret); + num_out = 0; + num_out_sgpr = 0; + + if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { + assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); + + unsigned ret_size = LLVMCountStructElementTypes(ret_type); + + for (unsigned i = 0; i < ret_size; ++i) { + LLVMValueRef val = LLVMBuildExtractValue(builder, ret, i, ""); + + assert(num_out < ARRAY_SIZE(out)); + out[num_out++] = val; + + if (LLVMTypeOf(val) == ctx->ac.i32) { + assert(num_out_sgpr + 1 == num_out); + num_out_sgpr = num_out; + } + } + } + } + + /* Return the value from the last part. */ + if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) + LLVMBuildRetVoid(builder); + else + LLVMBuildRet(builder, ret); } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index 99ffdd2e980..2a609572d84 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -22,759 +22,693 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_shader_internal.h" #include "si_pipe.h" +#include "si_shader_internal.h" #include "sid.h" #include "util/u_memory.h" LLVMValueRef si_is_es_thread(struct si_shader_context *ctx) { - /* Return true if the current thread should execute an ES thread. */ - return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, - ac_get_thread_id(&ctx->ac), - si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), ""); + /* Return true if the current thread should execute an ES thread. 
*/ + return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), + si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), ""); } LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx) { - /* Return true if the current thread should execute a GS thread. */ - return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, - ac_get_thread_id(&ctx->ac), - si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), ""); + /* Return true if the current thread should execute a GS thread. */ + return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), + si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), ""); } -static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, - unsigned input_index, - unsigned vtx_offset_param, - LLVMTypeRef type, - unsigned swizzle) +static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned input_index, + unsigned vtx_offset_param, LLVMTypeRef type, + unsigned swizzle) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *shader = ctx->shader; - LLVMValueRef vtx_offset, soffset; - struct si_shader_info *info = &shader->selector->info; - unsigned semantic_name = info->input_semantic_name[input_index]; - unsigned semantic_index = info->input_semantic_index[input_index]; - unsigned param; - LLVMValueRef value; - - param = si_shader_io_get_unique_index(semantic_name, semantic_index, false); - - /* GFX9 has the ESGS ring in LDS. */ - if (ctx->screen->info.chip_class >= GFX9) { - unsigned index = vtx_offset_param; - - switch (index / 2) { - case 0: - vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, - index % 2 ? 16 : 0, 16); - break; - case 1: - vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, - index % 2 ? 16 : 0, 16); - break; - case 2: - vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, - index % 2 ? 16 : 0, 16); - break; - default: - assert(0); - return NULL; - } - - unsigned offset = param * 4 + swizzle; - vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset, - LLVMConstInt(ctx->ac.i32, offset, false), ""); - - LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset); - LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, ""); - if (ac_get_type_size(type) == 8) { - ptr = LLVMBuildGEP(ctx->ac.builder, ptr, - &ctx->ac.i32_1, 1, ""); - LLVMValueRef values[2] = { - value, - LLVMBuildLoad(ctx->ac.builder, ptr, "") - }; - value = ac_build_gather_values(&ctx->ac, values, 2); - } - return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); - } - - /* GFX6: input load from the ESGS ring in memory. */ - if (swizzle == ~0) { - LLVMValueRef values[4]; - unsigned chan; - for (chan = 0; chan < 4; chan++) { - values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, - type, chan); - } - return ac_build_gather_values(&ctx->ac, values, 4); - } - - /* Get the vertex offset parameter on GFX6. 
*/ - LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, - ctx->gs_vtx_offset[vtx_offset_param]); - - vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, - LLVMConstInt(ctx->ac.i32, 4, 0), ""); - - soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0); - - value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, - vtx_offset, soffset, 0, ac_glc, true, false); - if (ac_get_type_size(type) == 8) { - LLVMValueRef value2; - soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0); - - value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, - ctx->ac.i32_0, vtx_offset, soffset, - 0, ac_glc, true, false); - return si_build_gather_64bit(ctx, type, value, value2); - } - return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *shader = ctx->shader; + LLVMValueRef vtx_offset, soffset; + struct si_shader_info *info = &shader->selector->info; + unsigned semantic_name = info->input_semantic_name[input_index]; + unsigned semantic_index = info->input_semantic_index[input_index]; + unsigned param; + LLVMValueRef value; + + param = si_shader_io_get_unique_index(semantic_name, semantic_index, false); + + /* GFX9 has the ESGS ring in LDS. */ + if (ctx->screen->info.chip_class >= GFX9) { + unsigned index = vtx_offset_param; + + switch (index / 2) { + case 0: + vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, index % 2 ? 16 : 0, 16); + break; + case 1: + vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, index % 2 ? 16 : 0, 16); + break; + case 2: + vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, index % 2 ? 16 : 0, 16); + break; + default: + assert(0); + return NULL; + } + + unsigned offset = param * 4 + swizzle; + vtx_offset = + LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), ""); + + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset); + LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + if (ac_get_type_size(type) == 8) { + ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &ctx->ac.i32_1, 1, ""); + LLVMValueRef values[2] = {value, LLVMBuildLoad(ctx->ac.builder, ptr, "")}; + value = ac_build_gather_values(&ctx->ac, values, 2); + } + return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); + } + + /* GFX6: input load from the ESGS ring in memory. */ + if (swizzle == ~0) { + LLVMValueRef values[4]; + unsigned chan; + for (chan = 0; chan < 4; chan++) { + values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, type, chan); + } + return ac_build_gather_values(&ctx->ac, values, 4); + } + + /* Get the vertex offset parameter on GFX6. 
*/ + LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->gs_vtx_offset[vtx_offset_param]); + + vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), ""); + + soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0); + + value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0, + ac_glc, true, false); + if (ac_get_type_size(type) == 8) { + LLVMValueRef value2; + soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0); + + value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, + 0, ac_glc, true, false); + return si_build_gather_64bit(ctx, type, value, value2); + } + return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); } -static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - unsigned vertex_index, - unsigned const_index, - LLVMTypeRef type) +static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, unsigned location, + unsigned driver_location, unsigned component, + unsigned num_components, unsigned vertex_index, + unsigned const_index, LLVMTypeRef type) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef value[4]; - for (unsigned i = 0; i < num_components; i++) { - unsigned offset = i; - if (ac_get_type_size(type) == 8) - offset *= 2; + LLVMValueRef value[4]; + for (unsigned i = 0; i < num_components; i++) { + unsigned offset = i; + if (ac_get_type_size(type) == 8) + offset *= 2; - offset += component; - value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index, - vertex_index, type, offset); - } + offset += component; + value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index, + vertex_index, type, offset); + } - return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); + return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); } /* Pass GS inputs from ES to GS on GFX9. 
*/ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) { - LLVMValueRef ret = ctx->return_value; - - ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); - ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1); - if (ctx->shader->key.as_ngg) - ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2); - else - ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2); - ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3); - ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5); - - ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, - 8 + SI_SGPR_RW_BUFFERS); - ret = si_insert_input_ptr(ctx, ret, - ctx->bindless_samplers_and_images, - 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); - if (ctx->screen->use_ngg) { - ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, - 8 + SI_SGPR_VS_STATE_BITS); - } - - unsigned vgpr; - if (ctx->type == PIPE_SHADER_VERTEX) - vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR; - else - vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; - - ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++); - ctx->return_value = ret; + LLVMValueRef ret = ctx->return_value; + + ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); + ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1); + if (ctx->shader->key.as_ngg) + ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2); + else + ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2); + ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3); + ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5); + + ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS); + ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images, + 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); + if (ctx->screen->use_ngg) { + ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); + } + + unsigned vgpr; + if (ctx->type == PIPE_SHADER_VERTEX) + vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR; + else + vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; + + ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++); + ctx->return_value = ret; } -void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs) +void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *es = ctx->shader; - struct si_shader_info *info = &es->selector->info; - LLVMValueRef lds_base = NULL; - unsigned chan; - int i; - - if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) { - unsigned itemsize_dw = es->selector->esgs_itemsize / 4; - LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); - LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4); - vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, - 
LLVMBuildMul(ctx->ac.builder, wave_idx, - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""), ""); - lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx, - LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), ""); - } - - for (i = 0; i < info->num_outputs; i++) { - int param; - - if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || - info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) - continue; - - param = si_shader_io_get_unique_index(info->output_semantic_name[i], - info->output_semantic_index[i], false); - - for (chan = 0; chan < 4; chan++) { - if (!(info->output_usagemask[i] & (1 << chan))) - continue; - - LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - out_val = ac_to_integer(&ctx->ac, out_val); - - /* GFX9 has the ESGS ring in LDS. */ - if (ctx->screen->info.chip_class >= GFX9) { - LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false); - idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, ""); - ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val); - continue; - } - - ac_build_buffer_store_dword(&ctx->ac, - ctx->esgs_ring, - out_val, 1, NULL, - ac_get_arg(&ctx->ac, ctx->es2gs_offset), - (4 * param + chan) * 4, - ac_glc | ac_slc | ac_swizzled); - } - } - - if (ctx->screen->info.chip_class >= GFX9) - si_set_es_return_value_for_gs(ctx); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *es = ctx->shader; + struct si_shader_info *info = &es->selector->info; + LLVMValueRef lds_base = NULL; + unsigned chan; + int i; + + if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) { + unsigned itemsize_dw = es->selector->esgs_itemsize / 4; + LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); + LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4); + vertex_idx = + LLVMBuildOr(ctx->ac.builder, vertex_idx, + LLVMBuildMul(ctx->ac.builder, wave_idx, + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""), + ""); + lds_base = + LLVMBuildMul(ctx->ac.builder, vertex_idx, LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), ""); + } + + for (i = 0; i < info->num_outputs; i++) { + int param; + + if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || + info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) + continue; + + param = si_shader_io_get_unique_index(info->output_semantic_name[i], + info->output_semantic_index[i], false); + + for (chan = 0; chan < 4; chan++) { + if (!(info->output_usagemask[i] & (1 << chan))) + continue; + + LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + out_val = ac_to_integer(&ctx->ac, out_val); + + /* GFX9 has the ESGS ring in LDS. 
*/ + if (ctx->screen->info.chip_class >= GFX9) { + LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false); + idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, ""); + ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val); + continue; + } + + ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL, + ac_get_arg(&ctx->ac, ctx->es2gs_offset), + (4 * param + chan) * 4, ac_glc | ac_slc | ac_swizzled); + } + } + + if (ctx->screen->info.chip_class >= GFX9) + si_set_es_return_value_for_gs(ctx); } static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx) { - if (ctx->screen->info.chip_class >= GFX9) - return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8); - else - return ac_get_arg(&ctx->ac, ctx->gs_wave_id); + if (ctx->screen->info.chip_class >= GFX9) + return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8); + else + return ac_get_arg(&ctx->ac, ctx->gs_wave_id); } static void emit_gs_epilogue(struct si_shader_context *ctx) { - if (ctx->shader->key.as_ngg) { - gfx10_ngg_gs_emit_epilogue(ctx); - return; - } + if (ctx->shader->key.as_ngg) { + gfx10_ngg_gs_emit_epilogue(ctx); + return; + } - if (ctx->screen->info.chip_class >= GFX10) - LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, ""); + if (ctx->screen->info.chip_class >= GFX10) + LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, ""); - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, - si_get_gs_wave_id(ctx)); + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx)); - if (ctx->screen->info.chip_class >= GFX9) - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + if (ctx->screen->info.chip_class >= GFX9) + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); } -static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) +static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info UNUSED *info = &ctx->shader->selector->info; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info UNUSED *info = &ctx->shader->selector->info; - assert(info->num_outputs <= max_outputs); + assert(info->num_outputs <= max_outputs); - emit_gs_epilogue(ctx); + emit_gs_epilogue(ctx); } /* Emit one vertex from the geometry shader */ -static void si_llvm_emit_vertex(struct ac_shader_abi *abi, - unsigned stream, - LLVMValueRef *addrs) +static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - if (ctx->shader->key.as_ngg) { - gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); - return; - } - - struct si_shader_info *info = &ctx->shader->selector->info; - struct si_shader *shader = ctx->shader; - LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset); - LLVMValueRef gs_next_vertex; - LLVMValueRef can_emit; - unsigned chan, offset; - int i; - - /* Write vertex attribute values to GSVS ring */ - gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, - ctx->gs_next_vertex[stream], - ""); - - /* If this thread has already emitted the declared maximum number of - * vertices, skip the write: excessive vertex emissions are not - * supposed to have any effect. - * - * If the shader has no writes to memory, kill it instead. 
This skips - * further memory loads and may allow LLVM to skip to the end - * altogether. - */ - can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, - LLVMConstInt(ctx->ac.i32, - shader->selector->gs_max_out_vertices, 0), ""); - - bool use_kill = !info->writes_memory; - if (use_kill) { - ac_build_kill_if_false(&ctx->ac, can_emit); - } else { - ac_build_ifcc(&ctx->ac, can_emit, 6505); - } - - offset = 0; - for (i = 0; i < info->num_outputs; i++) { - for (chan = 0; chan < 4; chan++) { - if (!(info->output_usagemask[i] & (1 << chan)) || - ((info->output_streams[i] >> (2 * chan)) & 3) != stream) - continue; - - LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - LLVMValueRef voffset = - LLVMConstInt(ctx->ac.i32, offset * - shader->selector->gs_max_out_vertices, 0); - offset++; - - voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); - voffset = LLVMBuildMul(ctx->ac.builder, voffset, - LLVMConstInt(ctx->ac.i32, 4, 0), ""); - - out_val = ac_to_integer(&ctx->ac, out_val); - - ac_build_buffer_store_dword(&ctx->ac, - ctx->gsvs_ring[stream], - out_val, 1, - voffset, soffset, 0, - ac_glc | ac_slc | ac_swizzled); - } - } - - gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, ""); - LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); - - /* Signal vertex emission if vertex data was written. */ - if (offset) { - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), - si_get_gs_wave_id(ctx)); - } - - if (!use_kill) - ac_build_endif(&ctx->ac, 6505); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + if (ctx->shader->key.as_ngg) { + gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); + return; + } + + struct si_shader_info *info = &ctx->shader->selector->info; + struct si_shader *shader = ctx->shader; + LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset); + LLVMValueRef gs_next_vertex; + LLVMValueRef can_emit; + unsigned chan, offset; + int i; + + /* Write vertex attribute values to GSVS ring */ + gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], ""); + + /* If this thread has already emitted the declared maximum number of + * vertices, skip the write: excessive vertex emissions are not + * supposed to have any effect. + * + * If the shader has no writes to memory, kill it instead. This skips + * further memory loads and may allow LLVM to skip to the end + * altogether. 
+ */ + can_emit = + LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, + LLVMConstInt(ctx->ac.i32, shader->selector->gs_max_out_vertices, 0), ""); + + bool use_kill = !info->writes_memory; + if (use_kill) { + ac_build_kill_if_false(&ctx->ac, can_emit); + } else { + ac_build_ifcc(&ctx->ac, can_emit, 6505); + } + + offset = 0; + for (i = 0; i < info->num_outputs; i++) { + for (chan = 0; chan < 4; chan++) { + if (!(info->output_usagemask[i] & (1 << chan)) || + ((info->output_streams[i] >> (2 * chan)) & 3) != stream) + continue; + + LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + LLVMValueRef voffset = + LLVMConstInt(ctx->ac.i32, offset * shader->selector->gs_max_out_vertices, 0); + offset++; + + voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); + voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, 0), ""); + + out_val = ac_to_integer(&ctx->ac, out_val); + + ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, soffset, + 0, ac_glc | ac_slc | ac_swizzled); + } + } + + gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, ""); + LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); + + /* Signal vertex emission if vertex data was written. */ + if (offset) { + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), + si_get_gs_wave_id(ctx)); + } + + if (!use_kill) + ac_build_endif(&ctx->ac, 6505); } /* Cut one primitive from the geometry shader */ -static void si_llvm_emit_primitive(struct ac_shader_abi *abi, - unsigned stream) +static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); - if (ctx->shader->key.as_ngg) { - LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); - return; - } + if (ctx->shader->key.as_ngg) { + LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); + return; + } - /* Signal primitive cut */ - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), - si_get_gs_wave_id(ctx)); + /* Signal primitive cut */ + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), + si_get_gs_wave_id(ctx)); } void si_preload_esgs_ring(struct si_shader_context *ctx) { - if (ctx->screen->info.chip_class <= GFX8) { - unsigned ring = - ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS - : SI_ES_RING_ESGS; - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0); - LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - - ctx->esgs_ring = - ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - } else { - if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { - /* Declare the ESGS ring as an explicit LDS symbol. */ - si_llvm_declare_esgs_ring(ctx); - } else { - ac_declare_lds_as_pointer(&ctx->ac); - ctx->esgs_ring = ctx->ac.lds; - } - } + if (ctx->screen->info.chip_class <= GFX8) { + unsigned ring = ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS : SI_ES_RING_ESGS; + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0); + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + + ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + } else { + if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { + /* Declare the ESGS ring as an explicit LDS symbol. 
*/ + si_llvm_declare_esgs_ring(ctx); + } else { + ac_declare_lds_as_pointer(&ctx->ac); + ctx->esgs_ring = ctx->ac.lds; + } + } } void si_preload_gs_rings(struct si_shader_context *ctx) { - const struct si_shader_selector *sel = ctx->shader->selector; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0); - LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - - /* The conceptual layout of the GSVS ring is - * v0c0 .. vLv0 v0c1 .. vLc1 .. - * but the real memory layout is swizzled across - * threads: - * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL - * t16v0c0 .. - * Override the buffer descriptor accordingly. - */ - LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2); - uint64_t stream_offset = 0; - - for (unsigned stream = 0; stream < 4; ++stream) { - unsigned num_components; - unsigned stride; - unsigned num_records; - LLVMValueRef ring, tmp; - - num_components = sel->info.num_stream_output_components[stream]; - if (!num_components) - continue; - - stride = 4 * num_components * sel->gs_max_out_vertices; - - /* Limit on the stride field for <= GFX7. */ - assert(stride < (1 << 14)); - - num_records = ctx->ac.wave_size; - - ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); - tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, ""); - tmp = LLVMBuildAdd(builder, tmp, - LLVMConstInt(ctx->ac.i64, - stream_offset, 0), ""); - stream_offset += stride * ctx->ac.wave_size; - - ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, ""); - ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, ""); - tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, ""); - tmp = LLVMBuildOr(builder, tmp, - LLVMConstInt(ctx->ac.i32, - S_008F04_STRIDE(stride) | - S_008F04_SWIZZLE_ENABLE(1), 0), ""); - ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, ""); - ring = LLVMBuildInsertElement(builder, ring, - LLVMConstInt(ctx->ac.i32, num_records, 0), - LLVMConstInt(ctx->ac.i32, 2, 0), ""); - - uint32_t rsrc3 = - S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | - S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ - S_008F0C_ADD_TID_ENABLE(1); - - if (ctx->ac.chip_class >= GFX10) { - rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | - S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */ - } - - ring = LLVMBuildInsertElement(builder, ring, - LLVMConstInt(ctx->ac.i32, rsrc3, false), - LLVMConstInt(ctx->ac.i32, 3, 0), ""); - - ctx->gsvs_ring[stream] = ring; - } + const struct si_shader_selector *sel = ctx->shader->selector; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0); + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + + /* The conceptual layout of the GSVS ring is + * v0c0 .. vLv0 v0c1 .. vLc1 .. + * but the real memory layout is swizzled across + * threads: + * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL + * t16v0c0 .. + * Override the buffer descriptor accordingly. 
+ */ + LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2); + uint64_t stream_offset = 0; + + for (unsigned stream = 0; stream < 4; ++stream) { + unsigned num_components; + unsigned stride; + unsigned num_records; + LLVMValueRef ring, tmp; + + num_components = sel->info.num_stream_output_components[stream]; + if (!num_components) + continue; + + stride = 4 * num_components * sel->gs_max_out_vertices; + + /* Limit on the stride field for <= GFX7. */ + assert(stride < (1 << 14)); + + num_records = ctx->ac.wave_size; + + ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); + tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, ""); + tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), ""); + stream_offset += stride * ctx->ac.wave_size; + + ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, ""); + ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, ""); + tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, ""); + tmp = LLVMBuildOr( + builder, tmp, + LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), ""); + ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, ""); + ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, num_records, 0), + LLVMConstInt(ctx->ac.i32, 2, 0), ""); + + uint32_t rsrc3 = + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ + S_008F0C_ADD_TID_ENABLE(1); + + if (ctx->ac.chip_class >= GFX10) { + rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1); + } else { + rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */ + } + + ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, rsrc3, false), + LLVMConstInt(ctx->ac.i32, 3, 0), ""); + + ctx->gsvs_ring[stream] = ring; + } } /* Generate code for the hardware VS shader stage to go with a geometry shader */ -struct si_shader * -si_generate_gs_copy_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader_selector *gs_selector, - struct pipe_debug_callback *debug) +struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader_selector *gs_selector, + struct pipe_debug_callback *debug) { - struct si_shader_context ctx; - struct si_shader *shader; - LLVMBuilderRef builder; - struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS]; - struct si_shader_info *gsinfo = &gs_selector->info; - int i; - - - shader = CALLOC_STRUCT(si_shader); - if (!shader) - return NULL; - - /* We can leave the fence as permanently signaled because the GS copy - * shader only becomes visible globally after it has been compiled. 
*/ - util_queue_fence_init(&shader->ready); - - shader->selector = gs_selector; - shader->is_gs_copy_shader = true; + struct si_shader_context ctx; + struct si_shader *shader; + LLVMBuilderRef builder; + struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS]; + struct si_shader_info *gsinfo = &gs_selector->info; + int i; + + shader = CALLOC_STRUCT(si_shader); + if (!shader) + return NULL; + + /* We can leave the fence as permanently signaled because the GS copy + * shader only becomes visible globally after it has been compiled. */ + util_queue_fence_init(&shader->ready); + + shader->selector = gs_selector; + shader->is_gs_copy_shader = true; + + si_llvm_context_init(&ctx, sscreen, compiler, + si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false, false)); + ctx.shader = shader; + ctx.type = PIPE_SHADER_VERTEX; + + builder = ctx.ac.builder; + + si_create_function(&ctx, false); + + LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers); + ctx.gsvs_ring[0] = + ac_build_load_to_sgpr(&ctx.ac, buf_ptr, LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0)); + + LLVMValueRef voffset = + LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, LLVMConstInt(ctx.ac.i32, 4, 0), ""); + + /* Fetch the vertex stream ID.*/ + LLVMValueRef stream_id; + + if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) + stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2); + else + stream_id = ctx.ac.i32_0; + + /* Fill in output information. */ + for (i = 0; i < gsinfo->num_outputs; ++i) { + outputs[i].semantic_name = gsinfo->output_semantic_name[i]; + outputs[i].semantic_index = gsinfo->output_semantic_index[i]; + + for (int chan = 0; chan < 4; chan++) { + outputs[i].vertex_stream[chan] = (gsinfo->output_streams[i] >> (2 * chan)) & 3; + } + } + + LLVMBasicBlockRef end_bb; + LLVMValueRef switch_inst; + + end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end"); + switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); + + for (int stream = 0; stream < 4; stream++) { + LLVMBasicBlockRef bb; + unsigned offset; + + if (!gsinfo->num_stream_output_components[stream]) + continue; + + if (stream > 0 && !gs_selector->so.num_outputs) + continue; + + bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out"); + LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb); + LLVMPositionBuilderAtEnd(builder, bb); + + /* Fetch vertex data from GSVS ring */ + offset = 0; + for (i = 0; i < gsinfo->num_outputs; ++i) { + for (unsigned chan = 0; chan < 4; chan++) { + if (!(gsinfo->output_usagemask[i] & (1 << chan)) || + outputs[i].vertex_stream[chan] != stream) { + outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32); + continue; + } + + LLVMValueRef soffset = + LLVMConstInt(ctx.ac.i32, offset * gs_selector->gs_max_out_vertices * 16 * 4, 0); + offset++; + + outputs[i].values[chan] = + ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, ctx.ac.i32_0, voffset, soffset, 0, + ac_glc | ac_slc, true, false); + } + } + + /* Streamout and exports. 
*/ + if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) { + si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream); + } + + if (stream == 0) + si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs); + + LLVMBuildBr(builder, end_bb); + } + + LLVMPositionBuilderAtEnd(builder, end_bb); + + LLVMBuildRetVoid(ctx.ac.builder); + + ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */ + si_llvm_optimize_module(&ctx); + + bool ok = false; + if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, &ctx.ac, + debug, PIPE_SHADER_GEOMETRY, "GS Copy Shader", false)) { + if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) + fprintf(stderr, "GS Copy Shader:\n"); + si_shader_dump(sscreen, ctx.shader, debug, stderr, true); + + if (!ctx.shader->config.scratch_bytes_per_wave) + ok = si_shader_binary_upload(sscreen, ctx.shader, 0); + else + ok = true; + } - si_llvm_context_init(&ctx, sscreen, compiler, - si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, - false, false, false)); - ctx.shader = shader; - ctx.type = PIPE_SHADER_VERTEX; - - builder = ctx.ac.builder; - - si_create_function(&ctx, false); - - LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers); - ctx.gsvs_ring[0] = ac_build_load_to_sgpr(&ctx.ac, buf_ptr, - LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0)); - - LLVMValueRef voffset = - LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, - LLVMConstInt(ctx.ac.i32, 4, 0), ""); - - /* Fetch the vertex stream ID.*/ - LLVMValueRef stream_id; - - if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) - stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2); - else - stream_id = ctx.ac.i32_0; - - /* Fill in output information. */ - for (i = 0; i < gsinfo->num_outputs; ++i) { - outputs[i].semantic_name = gsinfo->output_semantic_name[i]; - outputs[i].semantic_index = gsinfo->output_semantic_index[i]; - - for (int chan = 0; chan < 4; chan++) { - outputs[i].vertex_stream[chan] = - (gsinfo->output_streams[i] >> (2 * chan)) & 3; - } - } - - LLVMBasicBlockRef end_bb; - LLVMValueRef switch_inst; - - end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end"); - switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); - - for (int stream = 0; stream < 4; stream++) { - LLVMBasicBlockRef bb; - unsigned offset; - - if (!gsinfo->num_stream_output_components[stream]) - continue; - - if (stream > 0 && !gs_selector->so.num_outputs) - continue; - - bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out"); - LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb); - LLVMPositionBuilderAtEnd(builder, bb); - - /* Fetch vertex data from GSVS ring */ - offset = 0; - for (i = 0; i < gsinfo->num_outputs; ++i) { - for (unsigned chan = 0; chan < 4; chan++) { - if (!(gsinfo->output_usagemask[i] & (1 << chan)) || - outputs[i].vertex_stream[chan] != stream) { - outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32); - continue; - } - - LLVMValueRef soffset = LLVMConstInt(ctx.ac.i32, - offset * gs_selector->gs_max_out_vertices * 16 * 4, 0); - offset++; - - outputs[i].values[chan] = - ac_build_buffer_load(&ctx.ac, - ctx.gsvs_ring[0], 1, - ctx.ac.i32_0, voffset, - soffset, 0, ac_glc | ac_slc, - true, false); - } - } - - /* Streamout and exports. 
*/ - if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) { - si_llvm_emit_streamout(&ctx, outputs, - gsinfo->num_outputs, - stream); - } - - if (stream == 0) - si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs); - - LLVMBuildBr(builder, end_bb); - } - - LLVMPositionBuilderAtEnd(builder, end_bb); - - LLVMBuildRetVoid(ctx.ac.builder); - - ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */ - si_llvm_optimize_module(&ctx); - - bool ok = false; - if (si_compile_llvm(sscreen, &ctx.shader->binary, - &ctx.shader->config, ctx.compiler, &ctx.ac, - debug, PIPE_SHADER_GEOMETRY, - "GS Copy Shader", false)) { - if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) - fprintf(stderr, "GS Copy Shader:\n"); - si_shader_dump(sscreen, ctx.shader, debug, stderr, true); - - if (!ctx.shader->config.scratch_bytes_per_wave) - ok = si_shader_binary_upload(sscreen, ctx.shader, 0); - else - ok = true; - } - - si_llvm_dispose(&ctx); - - if (!ok) { - FREE(shader); - shader = NULL; - } else { - si_fix_resource_usage(sscreen, shader); - } - return shader; + si_llvm_dispose(&ctx); + + if (!ok) { + FREE(shader); + shader = NULL; + } else { + si_fix_resource_usage(sscreen, shader); + } + return shader; } /** * Build the GS prolog function. Rotate the input vertices for triangle strips * with adjacency. */ -void si_llvm_build_gs_prolog(struct si_shader_context *ctx, - union si_shader_part_key *key) +void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key) { - unsigned num_sgprs, num_vgprs; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMTypeRef returns[AC_MAX_ARGS]; - LLVMValueRef func, ret; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - if (ctx->screen->info.chip_class >= GFX9) { - if (key->gs_prolog.states.gfx9_prev_is_vs) - num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR; - else - num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR; - num_vgprs = 5; /* ES inputs are not needed by GS */ - } else { - num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; - num_vgprs = 8; - } - - for (unsigned i = 0; i < num_sgprs; ++i) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - returns[i] = ctx->ac.i32; - } - - for (unsigned i = 0; i < num_vgprs; ++i) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); - returns[num_sgprs + i] = ctx->ac.f32; - } - - /* Create the function. */ - si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0); - func = ctx->main_fn; - - /* Set the full EXEC mask for the prolog, because we are only fiddling - * with registers here. The main shader part will set the correct EXEC - * mask. - */ - if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic) - ac_init_exec_full_mask(&ctx->ac); - - /* Copy inputs to outputs. This should be no-op, as the registers match, - * but it will prevent the compiler from overwriting them unintentionally. - */ - ret = ctx->return_value; - for (unsigned i = 0; i < num_sgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(builder, ret, p, i, ""); - } - for (unsigned i = 0; i < num_vgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); - p = ac_to_float(&ctx->ac, p); - ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); - } - - if (key->gs_prolog.states.tri_strip_adj_fix) { - /* Remap the input vertices for every other primitive. 
*/ - const struct ac_arg gfx6_vtx_params[6] = { - { .used = true, .arg_index = num_sgprs }, - { .used = true, .arg_index = num_sgprs + 1 }, - { .used = true, .arg_index = num_sgprs + 3 }, - { .used = true, .arg_index = num_sgprs + 4 }, - { .used = true, .arg_index = num_sgprs + 5 }, - { .used = true, .arg_index = num_sgprs + 6 }, - }; - const struct ac_arg gfx9_vtx_params[3] = { - { .used = true, .arg_index = num_sgprs }, - { .used = true, .arg_index = num_sgprs + 1 }, - { .used = true, .arg_index = num_sgprs + 4 }, - }; - LLVMValueRef vtx_in[6], vtx_out[6]; - LLVMValueRef prim_id, rotate; - - if (ctx->screen->info.chip_class >= GFX9) { - for (unsigned i = 0; i < 3; i++) { - vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16); - vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16); - } - } else { - for (unsigned i = 0; i < 6; i++) - vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]); - } - - prim_id = LLVMGetParam(func, num_sgprs + 2); - rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, ""); - - for (unsigned i = 0; i < 6; ++i) { - LLVMValueRef base, rotated; - base = vtx_in[i]; - rotated = vtx_in[(i + 4) % 6]; - vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, ""); - } - - if (ctx->screen->info.chip_class >= GFX9) { - for (unsigned i = 0; i < 3; i++) { - LLVMValueRef hi, out; - - hi = LLVMBuildShl(builder, vtx_out[i*2+1], - LLVMConstInt(ctx->ac.i32, 16, 0), ""); - out = LLVMBuildOr(builder, vtx_out[i*2], hi, ""); - out = ac_to_float(&ctx->ac, out); - ret = LLVMBuildInsertValue(builder, ret, out, - gfx9_vtx_params[i].arg_index, ""); - } - } else { - for (unsigned i = 0; i < 6; i++) { - LLVMValueRef out; - - out = ac_to_float(&ctx->ac, vtx_out[i]); - ret = LLVMBuildInsertValue(builder, ret, out, - gfx6_vtx_params[i].arg_index, ""); - } - } - } - - LLVMBuildRet(builder, ret); + unsigned num_sgprs, num_vgprs; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMTypeRef returns[AC_MAX_ARGS]; + LLVMValueRef func, ret; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + if (ctx->screen->info.chip_class >= GFX9) { + if (key->gs_prolog.states.gfx9_prev_is_vs) + num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR; + else + num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR; + num_vgprs = 5; /* ES inputs are not needed by GS */ + } else { + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; + num_vgprs = 8; + } + + for (unsigned i = 0; i < num_sgprs; ++i) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + returns[i] = ctx->ac.i32; + } + + for (unsigned i = 0; i < num_vgprs; ++i) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); + returns[num_sgprs + i] = ctx->ac.f32; + } + + /* Create the function. */ + si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0); + func = ctx->main_fn; + + /* Set the full EXEC mask for the prolog, because we are only fiddling + * with registers here. The main shader part will set the correct EXEC + * mask. + */ + if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic) + ac_init_exec_full_mask(&ctx->ac); + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. 
+ */ + ret = ctx->return_value; + for (unsigned i = 0; i < num_sgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(builder, ret, p, i, ""); + } + for (unsigned i = 0; i < num_vgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); + p = ac_to_float(&ctx->ac, p); + ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); + } + + if (key->gs_prolog.states.tri_strip_adj_fix) { + /* Remap the input vertices for every other primitive. */ + const struct ac_arg gfx6_vtx_params[6] = { + {.used = true, .arg_index = num_sgprs}, {.used = true, .arg_index = num_sgprs + 1}, + {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4}, + {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6}, + }; + const struct ac_arg gfx9_vtx_params[3] = { + {.used = true, .arg_index = num_sgprs}, + {.used = true, .arg_index = num_sgprs + 1}, + {.used = true, .arg_index = num_sgprs + 4}, + }; + LLVMValueRef vtx_in[6], vtx_out[6]; + LLVMValueRef prim_id, rotate; + + if (ctx->screen->info.chip_class >= GFX9) { + for (unsigned i = 0; i < 3; i++) { + vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16); + vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16); + } + } else { + for (unsigned i = 0; i < 6; i++) + vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]); + } + + prim_id = LLVMGetParam(func, num_sgprs + 2); + rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, ""); + + for (unsigned i = 0; i < 6; ++i) { + LLVMValueRef base, rotated; + base = vtx_in[i]; + rotated = vtx_in[(i + 4) % 6]; + vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, ""); + } + + if (ctx->screen->info.chip_class >= GFX9) { + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef hi, out; + + hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), ""); + out = LLVMBuildOr(builder, vtx_out[i * 2], hi, ""); + out = ac_to_float(&ctx->ac, out); + ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, ""); + } + } else { + for (unsigned i = 0; i < 6; i++) { + LLVMValueRef out; + + out = ac_to_float(&ctx->ac, vtx_out[i]); + ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, ""); + } + } + } + + LLVMBuildRet(builder, ret); } void si_llvm_init_gs_callbacks(struct si_shader_context *ctx) { - ctx->abi.load_inputs = si_nir_load_input_gs; - ctx->abi.emit_vertex = si_llvm_emit_vertex; - ctx->abi.emit_primitive = si_llvm_emit_primitive; - ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue; + ctx->abi.load_inputs = si_nir_load_input_gs; + ctx->abi.emit_vertex = si_llvm_emit_vertex; + ctx->abi.emit_primitive = si_llvm_emit_primitive; + ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue; } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c index c2efcc88e99..6e4d5d429c7 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c @@ -22,117 +22,108 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include "si_shader_internal.h" #include "si_pipe.h" +#include "si_shader_internal.h" #include "sid.h" LLVMValueRef si_get_sample_id(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->args.ancillary, 8, 4); + return si_unpack_param(ctx, ctx->args.ancillary, 8, 4); } static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - return ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.sample_coverage)); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + return ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.sample_coverage)); } static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef desc = ac_get_arg(&ctx->ac, ctx->rw_buffers); - LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_SAMPLE_POSITIONS, 0); - LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index); - - /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ - LLVMValueRef offset0 = LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->ac.i32, 8, 0), ""); - LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->ac.i32, 4, 0), ""); - - LLVMValueRef pos[4] = { - si_buffer_load_const(ctx, resource, offset0), - si_buffer_load_const(ctx, resource, offset1), - LLVMConstReal(ctx->ac.f32, 0), - LLVMConstReal(ctx->ac.f32, 0) - }; - - return ac_build_gather_values(&ctx->ac, pos, 4); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMValueRef desc = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_SAMPLE_POSITIONS, 0); + LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index); + + /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ + LLVMValueRef offset0 = + LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->ac.i32, 8, 0), ""); + LLVMValueRef offset1 = + LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->ac.i32, 4, 0), ""); + + LLVMValueRef pos[4] = {si_buffer_load_const(ctx, resource, offset0), + si_buffer_load_const(ctx, resource, offset1), + LLVMConstReal(ctx->ac.f32, 0), LLVMConstReal(ctx->ac.f32, 0)}; + + return ac_build_gather_values(&ctx->ac, pos, 4); } static LLVMValueRef si_nir_emit_fbfetch(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct ac_image_args args = {}; - LLVMValueRef ptr, image, fmask; - - /* Ignore src0, because KHR_blend_func_extended disallows multiple render - * targets. - */ - - /* Load the image descriptor. */ - STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0); - ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr, - ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); - image = ac_build_load_to_sgpr(&ctx->ac, ptr, - LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0 / 2, 0)); - - unsigned chan = 0; - - args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 0, 16); - - if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D) - args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 16, 16); - - /* Get the current render target layer index. 
*/ - if (ctx->shader->key.mono.u.ps.fbfetch_layered) - args.coords[chan++] = si_unpack_param(ctx, ctx->args.ancillary, 16, 11); - - if (ctx->shader->key.mono.u.ps.fbfetch_msaa) - args.coords[chan++] = si_get_sample_id(ctx); - - if (ctx->shader->key.mono.u.ps.fbfetch_msaa && - !(ctx->screen->debug_flags & DBG(NO_FMASK))) { - fmask = ac_build_load_to_sgpr(&ctx->ac, ptr, - LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0)); - - ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords, - ctx->shader->key.mono.u.ps.fbfetch_layered); - } - - args.opcode = ac_image_load; - args.resource = image; - args.dmask = 0xf; - args.attributes = AC_FUNC_ATTR_READNONE; - - if (ctx->shader->key.mono.u.ps.fbfetch_msaa) - args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? - ac_image_2darraymsaa : ac_image_2dmsaa; - else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D) - args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? - ac_image_1darray : ac_image_1d; - else - args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? - ac_image_2darray : ac_image_2d; - - return ac_build_image_opcode(&ctx->ac, &args); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct ac_image_args args = {}; + LLVMValueRef ptr, image, fmask; + + /* Ignore src0, because KHR_blend_func_extended disallows multiple render + * targets. + */ + + /* Load the image descriptor. */ + STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0); + ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + ptr = + LLVMBuildPointerCast(ctx->ac.builder, ptr, ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); + image = + ac_build_load_to_sgpr(&ctx->ac, ptr, LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0 / 2, 0)); + + unsigned chan = 0; + + args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 0, 16); + + if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D) + args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 16, 16); + + /* Get the current render target layer index. */ + if (ctx->shader->key.mono.u.ps.fbfetch_layered) + args.coords[chan++] = si_unpack_param(ctx, ctx->args.ancillary, 16, 11); + + if (ctx->shader->key.mono.u.ps.fbfetch_msaa) + args.coords[chan++] = si_get_sample_id(ctx); + + if (ctx->shader->key.mono.u.ps.fbfetch_msaa && !(ctx->screen->debug_flags & DBG(NO_FMASK))) { + fmask = ac_build_load_to_sgpr(&ctx->ac, ptr, + LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0)); + + ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords, + ctx->shader->key.mono.u.ps.fbfetch_layered); + } + + args.opcode = ac_image_load; + args.resource = image; + args.dmask = 0xf; + args.attributes = AC_FUNC_ATTR_READNONE; + + if (ctx->shader->key.mono.u.ps.fbfetch_msaa) + args.dim = + ctx->shader->key.mono.u.ps.fbfetch_layered ? ac_image_2darraymsaa : ac_image_2dmsaa; + else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D) + args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? ac_image_1darray : ac_image_1d; + else + args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? 
ac_image_2darray : ac_image_2d; + + return ac_build_image_opcode(&ctx->ac, &args); } -static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, - unsigned attr_index, unsigned chan, - LLVMValueRef prim_mask, - LLVMValueRef i, LLVMValueRef j) +static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, unsigned attr_index, + unsigned chan, LLVMValueRef prim_mask, LLVMValueRef i, + LLVMValueRef j) { - if (i || j) { - return ac_build_fs_interp(&ctx->ac, - LLVMConstInt(ctx->ac.i32, chan, 0), - LLVMConstInt(ctx->ac.i32, attr_index, 0), - prim_mask, i, j); - } - return ac_build_fs_interp_mov(&ctx->ac, - LLVMConstInt(ctx->ac.i32, 2, 0), /* P0 */ - LLVMConstInt(ctx->ac.i32, chan, 0), - LLVMConstInt(ctx->ac.i32, attr_index, 0), - prim_mask); + if (i || j) { + return ac_build_fs_interp(&ctx->ac, LLVMConstInt(ctx->ac.i32, chan, 0), + LLVMConstInt(ctx->ac.i32, attr_index, 0), prim_mask, i, j); + } + return ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, 2, 0), /* P0 */ + LLVMConstInt(ctx->ac.i32, chan, 0), + LLVMConstInt(ctx->ac.i32, attr_index, 0), prim_mask); } /** @@ -149,345 +140,300 @@ static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, * @param face SI_PARAM_FRONT_FACE * @param result the return value (4 components) */ -static void interp_fs_color(struct si_shader_context *ctx, - unsigned input_index, - unsigned semantic_index, - unsigned num_interp_inputs, - unsigned colors_read_mask, - LLVMValueRef interp_param, - LLVMValueRef prim_mask, - LLVMValueRef face, - LLVMValueRef result[4]) +static void interp_fs_color(struct si_shader_context *ctx, unsigned input_index, + unsigned semantic_index, unsigned num_interp_inputs, + unsigned colors_read_mask, LLVMValueRef interp_param, + LLVMValueRef prim_mask, LLVMValueRef face, LLVMValueRef result[4]) { - LLVMValueRef i = NULL, j = NULL; - unsigned chan; - - /* fs.constant returns the param from the middle vertex, so it's not - * really useful for flat shading. It's meant to be used for custom - * interpolation (but the intrinsic can't fetch from the other two - * vertices). - * - * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state - * to do the right thing. The only reason we use fs.constant is that - * fs.interp cannot be used on integers, because they can be equal - * to NaN. - * - * When interp is false we will use fs.constant or for newer llvm, - * amdgcn.interp.mov. - */ - bool interp = interp_param != NULL; - - if (interp) { - interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param, - LLVMVectorType(ctx->ac.f32, 2), ""); - - i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, - ctx->ac.i32_0, ""); - j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, - ctx->ac.i32_1, ""); - } - - if (ctx->shader->key.part.ps.prolog.color_two_side) { - LLVMValueRef is_face_positive; - - /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", - * otherwise it's at offset "num_inputs". 
- */ - unsigned back_attr_offset = num_interp_inputs; - if (semantic_index == 1 && colors_read_mask & 0xf) - back_attr_offset += 1; - - is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, - face, ctx->ac.i32_0, ""); - - for (chan = 0; chan < 4; chan++) { - LLVMValueRef front, back; - - front = si_build_fs_interp(ctx, - input_index, chan, - prim_mask, i, j); - back = si_build_fs_interp(ctx, - back_attr_offset, chan, - prim_mask, i, j); - - result[chan] = LLVMBuildSelect(ctx->ac.builder, - is_face_positive, - front, - back, - ""); - } - } else { - for (chan = 0; chan < 4; chan++) { - result[chan] = si_build_fs_interp(ctx, - input_index, chan, - prim_mask, i, j); - } - } + LLVMValueRef i = NULL, j = NULL; + unsigned chan; + + /* fs.constant returns the param from the middle vertex, so it's not + * really useful for flat shading. It's meant to be used for custom + * interpolation (but the intrinsic can't fetch from the other two + * vertices). + * + * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state + * to do the right thing. The only reason we use fs.constant is that + * fs.interp cannot be used on integers, because they can be equal + * to NaN. + * + * When interp is false we will use fs.constant or for newer llvm, + * amdgcn.interp.mov. + */ + bool interp = interp_param != NULL; + + if (interp) { + interp_param = + LLVMBuildBitCast(ctx->ac.builder, interp_param, LLVMVectorType(ctx->ac.f32, 2), ""); + + i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_0, ""); + j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_1, ""); + } + + if (ctx->shader->key.part.ps.prolog.color_two_side) { + LLVMValueRef is_face_positive; + + /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", + * otherwise it's at offset "num_inputs". 
+ */ + unsigned back_attr_offset = num_interp_inputs; + if (semantic_index == 1 && colors_read_mask & 0xf) + back_attr_offset += 1; + + is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, face, ctx->ac.i32_0, ""); + + for (chan = 0; chan < 4; chan++) { + LLVMValueRef front, back; + + front = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j); + back = si_build_fs_interp(ctx, back_attr_offset, chan, prim_mask, i, j); + + result[chan] = LLVMBuildSelect(ctx->ac.builder, is_face_positive, front, back, ""); + } + } else { + for (chan = 0; chan < 4; chan++) { + result[chan] = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j); + } + } } static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha) { - if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { - static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = { - [PIPE_FUNC_LESS] = LLVMRealOLT, - [PIPE_FUNC_EQUAL] = LLVMRealOEQ, - [PIPE_FUNC_LEQUAL] = LLVMRealOLE, - [PIPE_FUNC_GREATER] = LLVMRealOGT, - [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, - [PIPE_FUNC_GEQUAL] = LLVMRealOGE, - }; - LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func]; - assert(cond); - - LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, - SI_PARAM_ALPHA_REF); - LLVMValueRef alpha_pass = - LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, ""); - ac_build_kill_if_false(&ctx->ac, alpha_pass); - } else { - ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false); - } + if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { + static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = { + [PIPE_FUNC_LESS] = LLVMRealOLT, [PIPE_FUNC_EQUAL] = LLVMRealOEQ, + [PIPE_FUNC_LEQUAL] = LLVMRealOLE, [PIPE_FUNC_GREATER] = LLVMRealOGT, + [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, [PIPE_FUNC_GEQUAL] = LLVMRealOGE, + }; + LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func]; + assert(cond); + + LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF); + LLVMValueRef alpha_pass = LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, ""); + ac_build_kill_if_false(&ctx->ac, alpha_pass); + } else { + ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false); + } } -static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx, - LLVMValueRef alpha, - unsigned samplemask_param) +static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx, LLVMValueRef alpha, + unsigned samplemask_param) { - LLVMValueRef coverage; + LLVMValueRef coverage; - /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ - coverage = LLVMGetParam(ctx->main_fn, - samplemask_param); - coverage = ac_to_integer(&ctx->ac, coverage); + /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ + coverage = LLVMGetParam(ctx->main_fn, samplemask_param); + coverage = ac_to_integer(&ctx->ac, coverage); - coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32", - ctx->ac.i32, - &coverage, 1, AC_FUNC_ATTR_READNONE); + coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32", ctx->ac.i32, &coverage, 1, + AC_FUNC_ATTR_READNONE); - coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage, - ctx->ac.f32, ""); + coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage, ctx->ac.f32, ""); - coverage = LLVMBuildFMul(ctx->ac.builder, coverage, - LLVMConstReal(ctx->ac.f32, - 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), ""); + coverage = LLVMBuildFMul(ctx->ac.builder, coverage, + LLVMConstReal(ctx->ac.f32, 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), ""); - return LLVMBuildFMul(ctx->ac.builder, alpha, 
coverage, ""); + return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, ""); } struct si_ps_exports { - unsigned num; - struct ac_export_args args[10]; + unsigned num; + struct ac_export_args args[10]; }; -static void si_export_mrt_z(struct si_shader_context *ctx, - LLVMValueRef depth, LLVMValueRef stencil, - LLVMValueRef samplemask, struct si_ps_exports *exp) +static void si_export_mrt_z(struct si_shader_context *ctx, LLVMValueRef depth, LLVMValueRef stencil, + LLVMValueRef samplemask, struct si_ps_exports *exp) { - struct ac_export_args args; + struct ac_export_args args; - ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args); + ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args); - memcpy(&exp->args[exp->num++], &args, sizeof(args)); + memcpy(&exp->args[exp->num++], &args, sizeof(args)); } /* Initialize arguments for the shader export intrinsic */ -static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, - LLVMValueRef *values, - unsigned target, - struct ac_export_args *args) +static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValueRef *values, + unsigned target, struct ac_export_args *args) { - const struct si_shader_key *key = &ctx->shader->key; - unsigned col_formats = key->part.ps.epilog.spi_shader_col_format; - LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32); - unsigned spi_shader_col_format; - unsigned chan; - bool is_int8, is_int10; - int cbuf = target - V_008DFC_SQ_EXP_MRT; - - assert(cbuf >= 0 && cbuf < 8); - - spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; - is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1; - is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1; - - /* Default is 0xf. Adjusted below depending on the format. */ - args->enabled_channels = 0xf; /* writemask */ - - /* Specify whether the EXEC mask represents the valid mask */ - args->valid_mask = 0; - - /* Specify whether this is the last export */ - args->done = 0; - - /* Specify the target we are exporting */ - args->target = target; - - args->compr = false; - args->out[0] = f32undef; - args->out[1] = f32undef; - args->out[2] = f32undef; - args->out[3] = f32undef; - - LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL; - LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2], - unsigned bits, bool hi) = NULL; - - switch (spi_shader_col_format) { - case V_028714_SPI_SHADER_ZERO: - args->enabled_channels = 0; /* writemask */ - args->target = V_008DFC_SQ_EXP_NULL; - break; - - case V_028714_SPI_SHADER_32_R: - args->enabled_channels = 1; /* writemask */ - args->out[0] = values[0]; - break; - - case V_028714_SPI_SHADER_32_GR: - args->enabled_channels = 0x3; /* writemask */ - args->out[0] = values[0]; - args->out[1] = values[1]; - break; - - case V_028714_SPI_SHADER_32_AR: - if (ctx->screen->info.chip_class >= GFX10) { - args->enabled_channels = 0x3; /* writemask */ - args->out[0] = values[0]; - args->out[1] = values[3]; - } else { - args->enabled_channels = 0x9; /* writemask */ - args->out[0] = values[0]; - args->out[3] = values[3]; - } - break; - - case V_028714_SPI_SHADER_FP16_ABGR: - packf = ac_build_cvt_pkrtz_f16; - break; - - case V_028714_SPI_SHADER_UNORM16_ABGR: - packf = ac_build_cvt_pknorm_u16; - break; - - case V_028714_SPI_SHADER_SNORM16_ABGR: - packf = ac_build_cvt_pknorm_i16; - break; - - case V_028714_SPI_SHADER_UINT16_ABGR: - packi = ac_build_cvt_pk_u16; - break; - - case V_028714_SPI_SHADER_SINT16_ABGR: - packi = ac_build_cvt_pk_i16; - break; - - case 
V_028714_SPI_SHADER_32_ABGR: - memcpy(&args->out[0], values, sizeof(values[0]) * 4); - break; - } - - /* Pack f16 or norm_i16/u16. */ - if (packf) { - for (chan = 0; chan < 2; chan++) { - LLVMValueRef pack_args[2] = { - values[2 * chan], - values[2 * chan + 1] - }; - LLVMValueRef packed; - - packed = packf(&ctx->ac, pack_args); - args->out[chan] = ac_to_float(&ctx->ac, packed); - } - args->compr = 1; /* COMPR flag */ - } - /* Pack i16/u16. */ - if (packi) { - for (chan = 0; chan < 2; chan++) { - LLVMValueRef pack_args[2] = { - ac_to_integer(&ctx->ac, values[2 * chan]), - ac_to_integer(&ctx->ac, values[2 * chan + 1]) - }; - LLVMValueRef packed; - - packed = packi(&ctx->ac, pack_args, - is_int8 ? 8 : is_int10 ? 10 : 16, - chan == 1); - args->out[chan] = ac_to_float(&ctx->ac, packed); - } - args->compr = 1; /* COMPR flag */ - } + const struct si_shader_key *key = &ctx->shader->key; + unsigned col_formats = key->part.ps.epilog.spi_shader_col_format; + LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32); + unsigned spi_shader_col_format; + unsigned chan; + bool is_int8, is_int10; + int cbuf = target - V_008DFC_SQ_EXP_MRT; + + assert(cbuf >= 0 && cbuf < 8); + + spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; + is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1; + is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1; + + /* Default is 0xf. Adjusted below depending on the format. */ + args->enabled_channels = 0xf; /* writemask */ + + /* Specify whether the EXEC mask represents the valid mask */ + args->valid_mask = 0; + + /* Specify whether this is the last export */ + args->done = 0; + + /* Specify the target we are exporting */ + args->target = target; + + args->compr = false; + args->out[0] = f32undef; + args->out[1] = f32undef; + args->out[2] = f32undef; + args->out[3] = f32undef; + + LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL; + LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits, + bool hi) = NULL; + + switch (spi_shader_col_format) { + case V_028714_SPI_SHADER_ZERO: + args->enabled_channels = 0; /* writemask */ + args->target = V_008DFC_SQ_EXP_NULL; + break; + + case V_028714_SPI_SHADER_32_R: + args->enabled_channels = 1; /* writemask */ + args->out[0] = values[0]; + break; + + case V_028714_SPI_SHADER_32_GR: + args->enabled_channels = 0x3; /* writemask */ + args->out[0] = values[0]; + args->out[1] = values[1]; + break; + + case V_028714_SPI_SHADER_32_AR: + if (ctx->screen->info.chip_class >= GFX10) { + args->enabled_channels = 0x3; /* writemask */ + args->out[0] = values[0]; + args->out[1] = values[3]; + } else { + args->enabled_channels = 0x9; /* writemask */ + args->out[0] = values[0]; + args->out[3] = values[3]; + } + break; + + case V_028714_SPI_SHADER_FP16_ABGR: + packf = ac_build_cvt_pkrtz_f16; + break; + + case V_028714_SPI_SHADER_UNORM16_ABGR: + packf = ac_build_cvt_pknorm_u16; + break; + + case V_028714_SPI_SHADER_SNORM16_ABGR: + packf = ac_build_cvt_pknorm_i16; + break; + + case V_028714_SPI_SHADER_UINT16_ABGR: + packi = ac_build_cvt_pk_u16; + break; + + case V_028714_SPI_SHADER_SINT16_ABGR: + packi = ac_build_cvt_pk_i16; + break; + + case V_028714_SPI_SHADER_32_ABGR: + memcpy(&args->out[0], values, sizeof(values[0]) * 4); + break; + } + + /* Pack f16 or norm_i16/u16. 
*/ + if (packf) { + for (chan = 0; chan < 2; chan++) { + LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]}; + LLVMValueRef packed; + + packed = packf(&ctx->ac, pack_args); + args->out[chan] = ac_to_float(&ctx->ac, packed); + } + args->compr = 1; /* COMPR flag */ + } + /* Pack i16/u16. */ + if (packi) { + for (chan = 0; chan < 2; chan++) { + LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]), + ac_to_integer(&ctx->ac, values[2 * chan + 1])}; + LLVMValueRef packed; + + packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1); + args->out[chan] = ac_to_float(&ctx->ac, packed); + } + args->compr = 1; /* COMPR flag */ + } } -static void si_export_mrt_color(struct si_shader_context *ctx, - LLVMValueRef *color, unsigned index, - unsigned samplemask_param, - bool is_last, struct si_ps_exports *exp) +static void si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *color, unsigned index, + unsigned samplemask_param, bool is_last, struct si_ps_exports *exp) { - int i; - - /* Clamp color */ - if (ctx->shader->key.part.ps.epilog.clamp_color) - for (i = 0; i < 4; i++) - color[i] = ac_build_clamp(&ctx->ac, color[i]); - - /* Alpha to one */ - if (ctx->shader->key.part.ps.epilog.alpha_to_one) - color[3] = ctx->ac.f32_1; - - /* Alpha test */ - if (index == 0 && - ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) - si_alpha_test(ctx, color[3]); - - /* Line & polygon smoothing */ - if (ctx->shader->key.part.ps.epilog.poly_line_smoothing) - color[3] = si_scale_alpha_by_sample_mask(ctx, color[3], - samplemask_param); - - /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) { - struct ac_export_args args[8]; - int c, last = -1; - - /* Get the export arguments, also find out what the last one is. */ - for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { - si_llvm_init_ps_export_args(ctx, color, - V_008DFC_SQ_EXP_MRT + c, &args[c]); - if (args[c].enabled_channels) - last = c; - } - - /* Emit all exports. */ - for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { - if (is_last && last == c) { - args[c].valid_mask = 1; /* whether the EXEC mask is valid */ - args[c].done = 1; /* DONE bit */ - } else if (!args[c].enabled_channels) - continue; /* unnecessary NULL export */ - - memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c])); - } - } else { - struct ac_export_args args; - - /* Export */ - si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index, - &args); - if (is_last) { - args.valid_mask = 1; /* whether the EXEC mask is valid */ - args.done = 1; /* DONE bit */ - } else if (!args.enabled_channels) - return; /* unnecessary NULL export */ - - memcpy(&exp->args[exp->num++], &args, sizeof(args)); - } + int i; + + /* Clamp color */ + if (ctx->shader->key.part.ps.epilog.clamp_color) + for (i = 0; i < 4; i++) + color[i] = ac_build_clamp(&ctx->ac, color[i]); + + /* Alpha to one */ + if (ctx->shader->key.part.ps.epilog.alpha_to_one) + color[3] = ctx->ac.f32_1; + + /* Alpha test */ + if (index == 0 && ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) + si_alpha_test(ctx, color[3]); + + /* Line & polygon smoothing */ + if (ctx->shader->key.part.ps.epilog.poly_line_smoothing) + color[3] = si_scale_alpha_by_sample_mask(ctx, color[3], samplemask_param); + + /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. 
*/ + if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) { + struct ac_export_args args[8]; + int c, last = -1; + + /* Get the export arguments, also find out what the last one is. */ + for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { + si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + c, &args[c]); + if (args[c].enabled_channels) + last = c; + } + + /* Emit all exports. */ + for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { + if (is_last && last == c) { + args[c].valid_mask = 1; /* whether the EXEC mask is valid */ + args[c].done = 1; /* DONE bit */ + } else if (!args[c].enabled_channels) + continue; /* unnecessary NULL export */ + + memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c])); + } + } else { + struct ac_export_args args; + + /* Export */ + si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index, &args); + if (is_last) { + args.valid_mask = 1; /* whether the EXEC mask is valid */ + args.done = 1; /* DONE bit */ + } else if (!args.enabled_channels) + return; /* unnecessary NULL export */ + + memcpy(&exp->args[exp->num++], &args, sizeof(args)); + } } -static void si_emit_ps_exports(struct si_shader_context *ctx, - struct si_ps_exports *exp) +static void si_emit_ps_exports(struct si_shader_context *ctx, struct si_ps_exports *exp) { - for (unsigned i = 0; i < exp->num; i++) - ac_build_export(&ctx->ac, &exp->args[i]); + for (unsigned i = 0; i < exp->num; i++) + ac_build_export(&ctx->ac, &exp->args[i]); } /** @@ -503,117 +449,108 @@ static void si_emit_ps_exports(struct si_shader_context *ctx, * * The alpha-ref SGPR is returned via its original location. */ -static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) +static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *shader = ctx->shader; - struct si_shader_info *info = &shader->selector->info; - LLVMBuilderRef builder = ctx->ac.builder; - unsigned i, j, first_vgpr, vgpr; - - LLVMValueRef color[8][4] = {}; - LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; - LLVMValueRef ret; - - if (ctx->postponed_kill) - ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, "")); - - /* Read the output values. */ - for (i = 0; i < info->num_outputs; i++) { - unsigned semantic_name = info->output_semantic_name[i]; - unsigned semantic_index = info->output_semantic_index[i]; - - switch (semantic_name) { - case TGSI_SEMANTIC_COLOR: - assert(semantic_index < 8); - for (j = 0; j < 4; j++) { - LLVMValueRef ptr = addrs[4 * i + j]; - LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); - color[semantic_index][j] = result; - } - break; - case TGSI_SEMANTIC_POSITION: - depth = LLVMBuildLoad(builder, - addrs[4 * i + 0], ""); - break; - case TGSI_SEMANTIC_STENCIL: - stencil = LLVMBuildLoad(builder, - addrs[4 * i + 0], ""); - break; - case TGSI_SEMANTIC_SAMPLEMASK: - samplemask = LLVMBuildLoad(builder, - addrs[4 * i + 0], ""); - break; - default: - fprintf(stderr, "Warning: GFX6 unhandled fs output type:%d\n", - semantic_name); - } - } - - /* Fill the return structure. */ - ret = ctx->return_value; - - /* Set SGPRs. 
*/ - ret = LLVMBuildInsertValue(builder, ret, - ac_to_integer(&ctx->ac, - LLVMGetParam(ctx->main_fn, - SI_PARAM_ALPHA_REF)), - SI_SGPR_ALPHA_REF, ""); - - /* Set VGPRs */ - first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; - for (i = 0; i < ARRAY_SIZE(color); i++) { - if (!color[i][0]) - continue; - - for (j = 0; j < 4; j++) - ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); - } - if (depth) - ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); - if (stencil) - ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); - if (samplemask) - ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); - - /* Add the input sample mask for smoothing at the end. */ - if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) - vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; - ret = LLVMBuildInsertValue(builder, ret, - LLVMGetParam(ctx->main_fn, - SI_PARAM_SAMPLE_COVERAGE), vgpr++, ""); - - ctx->return_value = ret; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *shader = ctx->shader; + struct si_shader_info *info = &shader->selector->info; + LLVMBuilderRef builder = ctx->ac.builder; + unsigned i, j, first_vgpr, vgpr; + + LLVMValueRef color[8][4] = {}; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + LLVMValueRef ret; + + if (ctx->postponed_kill) + ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, "")); + + /* Read the output values. */ + for (i = 0; i < info->num_outputs; i++) { + unsigned semantic_name = info->output_semantic_name[i]; + unsigned semantic_index = info->output_semantic_index[i]; + + switch (semantic_name) { + case TGSI_SEMANTIC_COLOR: + assert(semantic_index < 8); + for (j = 0; j < 4; j++) { + LLVMValueRef ptr = addrs[4 * i + j]; + LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); + color[semantic_index][j] = result; + } + break; + case TGSI_SEMANTIC_POSITION: + depth = LLVMBuildLoad(builder, addrs[4 * i + 0], ""); + break; + case TGSI_SEMANTIC_STENCIL: + stencil = LLVMBuildLoad(builder, addrs[4 * i + 0], ""); + break; + case TGSI_SEMANTIC_SAMPLEMASK: + samplemask = LLVMBuildLoad(builder, addrs[4 * i + 0], ""); + break; + default: + fprintf(stderr, "Warning: GFX6 unhandled fs output type:%d\n", semantic_name); + } + } + + /* Fill the return structure. */ + ret = ctx->return_value; + + /* Set SGPRs. */ + ret = LLVMBuildInsertValue( + builder, ret, ac_to_integer(&ctx->ac, LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF)), + SI_SGPR_ALPHA_REF, ""); + + /* Set VGPRs */ + first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; + for (i = 0; i < ARRAY_SIZE(color); i++) { + if (!color[i][0]) + continue; + + for (j = 0; j < 4; j++) + ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); + } + if (depth) + ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); + if (stencil) + ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); + if (samplemask) + ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); + + /* Add the input sample mask for smoothing at the end. 
*/ + if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) + vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; + ret = LLVMBuildInsertValue(builder, ret, LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE), + vgpr++, ""); + + ctx->return_value = ret; } static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, - LLVMValueRef param_rw_buffers, - struct ac_arg param_pos_fixed_pt) + LLVMValueRef param_rw_buffers, + struct ac_arg param_pos_fixed_pt) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef slot, desc, offset, row, bit, address[2]; - - /* Use the fixed-point gl_FragCoord input. - * Since the stipple pattern is 32x32 and it repeats, just get 5 bits - * per coordinate to get the repeating effect. - */ - address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5); - address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5); - - /* Load the buffer descriptor. */ - slot = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_POLY_STIPPLE, 0); - desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot); - - /* The stipple pattern is 32x32, each row has 32 bits. */ - offset = LLVMBuildMul(builder, address[1], - LLVMConstInt(ctx->ac.i32, 4, 0), ""); - row = si_buffer_load_const(ctx, desc, offset); - row = ac_to_integer(&ctx->ac, row); - bit = LLVMBuildLShr(builder, row, address[0], ""); - bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, ""); - ac_build_kill_if_false(&ctx->ac, bit); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef slot, desc, offset, row, bit, address[2]; + + /* Use the fixed-point gl_FragCoord input. + * Since the stipple pattern is 32x32 and it repeats, just get 5 bits + * per coordinate to get the repeating effect. + */ + address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5); + address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5); + + /* Load the buffer descriptor. */ + slot = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_POLY_STIPPLE, 0); + desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot); + + /* The stipple pattern is 32x32, each row has 32 bits. */ + offset = LLVMBuildMul(builder, address[1], LLVMConstInt(ctx->ac.i32, 4, 0), ""); + row = si_buffer_load_const(ctx, desc, offset); + row = ac_to_integer(&ctx->ac, row); + bit = LLVMBuildLShr(builder, row, address[0], ""); + bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, ""); + ac_build_kill_if_false(&ctx->ac, bit); } /** @@ -626,416 +563,372 @@ static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, * overriden by other states. (e.g. per-sample interpolation) * Interpolated colors are stored after the preloaded VGPRs. */ -void si_llvm_build_ps_prolog(struct si_shader_context *ctx, - union si_shader_part_key *key) +void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key) { - LLVMValueRef ret, func; - int num_returns, i, num_color_channels; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - /* Declare inputs. 
*/ - LLVMTypeRef return_types[AC_MAX_ARGS]; - num_returns = 0; - num_color_channels = util_bitcount(key->ps_prolog.colors_read); - assert(key->ps_prolog.num_input_sgprs + - key->ps_prolog.num_input_vgprs + - num_color_channels <= AC_MAX_ARGS); - for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - return_types[num_returns++] = ctx->ac.i32; - - } - - struct ac_arg pos_fixed_pt; - struct ac_arg ancillary; - struct ac_arg param_sample_mask; - for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) { - struct ac_arg *arg = NULL; - if (i == key->ps_prolog.ancillary_vgpr_index) { - arg = &ancillary; - } else if (i == key->ps_prolog.ancillary_vgpr_index + 1) { - arg = ¶m_sample_mask; - } else if (i == key->ps_prolog.num_input_vgprs - 1) { - /* POS_FIXED_PT is always last. */ - arg = &pos_fixed_pt; - } - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, arg); - return_types[num_returns++] = ctx->ac.f32; - } - - /* Declare outputs (same as inputs + add colors if needed) */ - for (i = 0; i < num_color_channels; i++) - return_types[num_returns++] = ctx->ac.f32; - - /* Create the function. */ - si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0); - func = ctx->main_fn; - - /* Copy inputs to outputs. This should be no-op, as the registers match, - * but it will prevent the compiler from overwriting them unintentionally. - */ - ret = ctx->return_value; - for (i = 0; i < ctx->args.arg_count; i++) { - LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); - } - - /* Polygon stippling. */ - if (key->ps_prolog.states.poly_stipple) { - LLVMValueRef list = si_prolog_get_rw_buffers(ctx); - - si_llvm_emit_polygon_stipple(ctx, list, pos_fixed_pt); - } - - if (key->ps_prolog.states.bc_optimize_for_persp || - key->ps_prolog.states.bc_optimize_for_linear) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef center[2], centroid[2], tmp, bc_optimize; - - /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; - * The hw doesn't compute CENTROID if the whole wave only - * contains fully-covered quads. - * - * PRIM_MASK is after user SGPRs. - */ - bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); - bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize, - LLVMConstInt(ctx->ac.i32, 31, 0), ""); - bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, - ctx->ac.i1, ""); - - if (key->ps_prolog.states.bc_optimize_for_persp) { - /* Read PERSP_CENTER. */ - for (i = 0; i < 2; i++) - center[i] = LLVMGetParam(func, base + 2 + i); - /* Read PERSP_CENTROID. */ - for (i = 0; i < 2; i++) - centroid[i] = LLVMGetParam(func, base + 4 + i); - /* Select PERSP_CENTROID. */ - for (i = 0; i < 2; i++) { - tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, - center[i], centroid[i], ""); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - tmp, base + 4 + i, ""); - } - } - if (key->ps_prolog.states.bc_optimize_for_linear) { - /* Read LINEAR_CENTER. */ - for (i = 0; i < 2; i++) - center[i] = LLVMGetParam(func, base + 8 + i); - /* Read LINEAR_CENTROID. */ - for (i = 0; i < 2; i++) - centroid[i] = LLVMGetParam(func, base + 10 + i); - /* Select LINEAR_CENTROID. */ - for (i = 0; i < 2; i++) { - tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, - center[i], centroid[i], ""); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - tmp, base + 10 + i, ""); - } - } - } - - /* Force per-sample interpolation. 
*/ - if (key->ps_prolog.states.force_persp_sample_interp) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef persp_sample[2]; - - /* Read PERSP_SAMPLE. */ - for (i = 0; i < 2; i++) - persp_sample[i] = LLVMGetParam(func, base + i); - /* Overwrite PERSP_CENTER. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - persp_sample[i], base + 2 + i, ""); - /* Overwrite PERSP_CENTROID. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - persp_sample[i], base + 4 + i, ""); - } - if (key->ps_prolog.states.force_linear_sample_interp) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef linear_sample[2]; - - /* Read LINEAR_SAMPLE. */ - for (i = 0; i < 2; i++) - linear_sample[i] = LLVMGetParam(func, base + 6 + i); - /* Overwrite LINEAR_CENTER. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - linear_sample[i], base + 8 + i, ""); - /* Overwrite LINEAR_CENTROID. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - linear_sample[i], base + 10 + i, ""); - } - - /* Force center interpolation. */ - if (key->ps_prolog.states.force_persp_center_interp) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef persp_center[2]; - - /* Read PERSP_CENTER. */ - for (i = 0; i < 2; i++) - persp_center[i] = LLVMGetParam(func, base + 2 + i); - /* Overwrite PERSP_SAMPLE. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - persp_center[i], base + i, ""); - /* Overwrite PERSP_CENTROID. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - persp_center[i], base + 4 + i, ""); - } - if (key->ps_prolog.states.force_linear_center_interp) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef linear_center[2]; - - /* Read LINEAR_CENTER. */ - for (i = 0; i < 2; i++) - linear_center[i] = LLVMGetParam(func, base + 8 + i); - /* Overwrite LINEAR_SAMPLE. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - linear_center[i], base + 6 + i, ""); - /* Overwrite LINEAR_CENTROID. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - linear_center[i], base + 10 + i, ""); - } - - /* Interpolate colors. */ - unsigned color_out_idx = 0; - for (i = 0; i < 2; i++) { - unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; - unsigned face_vgpr = key->ps_prolog.num_input_sgprs + - key->ps_prolog.face_vgpr_index; - LLVMValueRef interp[2], color[4]; - LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; - - if (!writemask) - continue; - - /* If the interpolation qualifier is not CONSTANT (-1). */ - if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { - unsigned interp_vgpr = key->ps_prolog.num_input_sgprs + - key->ps_prolog.color_interp_vgpr_index[i]; - - /* Get the (i,j) updated by bc_optimize handling. */ - interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret, - interp_vgpr, ""); - interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret, - interp_vgpr + 1, ""); - interp_ij = ac_build_gather_values(&ctx->ac, interp, 2); - } - - /* Use the absolute location of the input. 
*/ - prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); - - if (key->ps_prolog.states.color_two_side) { - face = LLVMGetParam(func, face_vgpr); - face = ac_to_integer(&ctx->ac, face); - } - - interp_fs_color(ctx, - key->ps_prolog.color_attr_index[i], i, - key->ps_prolog.num_interp_inputs, - key->ps_prolog.colors_read, interp_ij, - prim_mask, face, color); - - while (writemask) { - unsigned chan = u_bit_scan(&writemask); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan], - ctx->args.arg_count + color_out_idx++, ""); - } - } - - /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec - * says: - * - * "When per-sample shading is active due to the use of a fragment - * input qualified by sample or due to the use of the gl_SampleID - * or gl_SamplePosition variables, only the bit for the current - * sample is set in gl_SampleMaskIn. When state specifies multiple - * fragment shader invocations for a given fragment, the sample - * mask for any single fragment shader invocation may specify a - * subset of the covered samples for the fragment. In this case, - * the bit corresponding to each covered sample will be set in - * exactly one fragment shader invocation." - * - * The samplemask loaded by hardware is always the coverage of the - * entire pixel/fragment, so mask bits out based on the sample ID. - */ - if (key->ps_prolog.states.samplemask_log_ps_iter) { - /* The bit pattern matches that used by fixed function fragment - * processing. */ - static const uint16_t ps_iter_masks[] = { - 0xffff, /* not used */ - 0x5555, - 0x1111, - 0x0101, - 0x0001, - }; - assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks)); - - uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter]; - LLVMValueRef sampleid = si_unpack_param(ctx, ancillary, 8, 4); - LLVMValueRef samplemask = ac_get_arg(&ctx->ac, param_sample_mask); - - samplemask = ac_to_integer(&ctx->ac, samplemask); - samplemask = LLVMBuildAnd( - ctx->ac.builder, - samplemask, - LLVMBuildShl(ctx->ac.builder, - LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), - sampleid, ""), - ""); - samplemask = ac_to_float(&ctx->ac, samplemask); - - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask, - param_sample_mask.arg_index, ""); - } - - /* Tell LLVM to insert WQM instruction sequence when needed. */ - if (key->ps_prolog.wqm) { - LLVMAddTargetDependentFunctionAttr(func, - "amdgpu-ps-wqm-outputs", ""); - } - - si_llvm_build_ret(ctx, ret); + LLVMValueRef ret, func; + int num_returns, i, num_color_channels; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + /* Declare inputs. */ + LLVMTypeRef return_types[AC_MAX_ARGS]; + num_returns = 0; + num_color_channels = util_bitcount(key->ps_prolog.colors_read); + assert(key->ps_prolog.num_input_sgprs + key->ps_prolog.num_input_vgprs + num_color_channels <= + AC_MAX_ARGS); + for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + return_types[num_returns++] = ctx->ac.i32; + } + + struct ac_arg pos_fixed_pt; + struct ac_arg ancillary; + struct ac_arg param_sample_mask; + for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) { + struct ac_arg *arg = NULL; + if (i == key->ps_prolog.ancillary_vgpr_index) { + arg = &ancillary; + } else if (i == key->ps_prolog.ancillary_vgpr_index + 1) { + arg = ¶m_sample_mask; + } else if (i == key->ps_prolog.num_input_vgprs - 1) { + /* POS_FIXED_PT is always last. 
*/ + arg = &pos_fixed_pt; + } + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, arg); + return_types[num_returns++] = ctx->ac.f32; + } + + /* Declare outputs (same as inputs + add colors if needed) */ + for (i = 0; i < num_color_channels; i++) + return_types[num_returns++] = ctx->ac.f32; + + /* Create the function. */ + si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0); + func = ctx->main_fn; + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx->return_value; + for (i = 0; i < ctx->args.arg_count; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); + } + + /* Polygon stippling. */ + if (key->ps_prolog.states.poly_stipple) { + LLVMValueRef list = si_prolog_get_rw_buffers(ctx); + + si_llvm_emit_polygon_stipple(ctx, list, pos_fixed_pt); + } + + if (key->ps_prolog.states.bc_optimize_for_persp || + key->ps_prolog.states.bc_optimize_for_linear) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef center[2], centroid[2], tmp, bc_optimize; + + /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; + * The hw doesn't compute CENTROID if the whole wave only + * contains fully-covered quads. + * + * PRIM_MASK is after user SGPRs. + */ + bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); + bc_optimize = + LLVMBuildLShr(ctx->ac.builder, bc_optimize, LLVMConstInt(ctx->ac.i32, 31, 0), ""); + bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, ctx->ac.i1, ""); + + if (key->ps_prolog.states.bc_optimize_for_persp) { + /* Read PERSP_CENTER. */ + for (i = 0; i < 2; i++) + center[i] = LLVMGetParam(func, base + 2 + i); + /* Read PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + centroid[i] = LLVMGetParam(func, base + 4 + i); + /* Select PERSP_CENTROID. */ + for (i = 0; i < 2; i++) { + tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center[i], centroid[i], ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, tmp, base + 4 + i, ""); + } + } + if (key->ps_prolog.states.bc_optimize_for_linear) { + /* Read LINEAR_CENTER. */ + for (i = 0; i < 2; i++) + center[i] = LLVMGetParam(func, base + 8 + i); + /* Read LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + centroid[i] = LLVMGetParam(func, base + 10 + i); + /* Select LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) { + tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center[i], centroid[i], ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, tmp, base + 10 + i, ""); + } + } + } + + /* Force per-sample interpolation. */ + if (key->ps_prolog.states.force_persp_sample_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef persp_sample[2]; + + /* Read PERSP_SAMPLE. */ + for (i = 0; i < 2; i++) + persp_sample[i] = LLVMGetParam(func, base + i); + /* Overwrite PERSP_CENTER. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_sample[i], base + 2 + i, ""); + /* Overwrite PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_sample[i], base + 4 + i, ""); + } + if (key->ps_prolog.states.force_linear_sample_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef linear_sample[2]; + + /* Read LINEAR_SAMPLE. */ + for (i = 0; i < 2; i++) + linear_sample[i] = LLVMGetParam(func, base + 6 + i); + /* Overwrite LINEAR_CENTER. 
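For reference, a minimal scalar model of the bc_optimize selection above, outside the patch: prim_mask, center and centroid stand in for the corresponding SGPR/VGPR pairs, and the helper name is illustrative, not driver code.

#include <stdbool.h>
#include <stdint.h>

/* Scalar model of bc_optimize: when bit 31 of PRIM_MASK is set, the wave
 * only contains fully covered quads and the hardware skipped CENTROID,
 * so the prolog substitutes CENTER for CENTROID. */
static void bc_optimize_model(uint32_t prim_mask,
                              const float center[2], float centroid[2])
{
   bool whole_wave_covered = (prim_mask >> 31) & 1;

   for (int i = 0; i < 2; i++) {
      if (whole_wave_covered)
         centroid[i] = center[i];
   }
}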
*/ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_sample[i], base + 8 + i, ""); + /* Overwrite LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_sample[i], base + 10 + i, ""); + } + + /* Force center interpolation. */ + if (key->ps_prolog.states.force_persp_center_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef persp_center[2]; + + /* Read PERSP_CENTER. */ + for (i = 0; i < 2; i++) + persp_center[i] = LLVMGetParam(func, base + 2 + i); + /* Overwrite PERSP_SAMPLE. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_center[i], base + i, ""); + /* Overwrite PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_center[i], base + 4 + i, ""); + } + if (key->ps_prolog.states.force_linear_center_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef linear_center[2]; + + /* Read LINEAR_CENTER. */ + for (i = 0; i < 2; i++) + linear_center[i] = LLVMGetParam(func, base + 8 + i); + /* Overwrite LINEAR_SAMPLE. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_center[i], base + 6 + i, ""); + /* Overwrite LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_center[i], base + 10 + i, ""); + } + + /* Interpolate colors. */ + unsigned color_out_idx = 0; + for (i = 0; i < 2; i++) { + unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; + unsigned face_vgpr = key->ps_prolog.num_input_sgprs + key->ps_prolog.face_vgpr_index; + LLVMValueRef interp[2], color[4]; + LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; + + if (!writemask) + continue; + + /* If the interpolation qualifier is not CONSTANT (-1). */ + if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { + unsigned interp_vgpr = + key->ps_prolog.num_input_sgprs + key->ps_prolog.color_interp_vgpr_index[i]; + + /* Get the (i,j) updated by bc_optimize handling. */ + interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret, interp_vgpr, ""); + interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret, interp_vgpr + 1, ""); + interp_ij = ac_build_gather_values(&ctx->ac, interp, 2); + } + + /* Use the absolute location of the input. */ + prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); + + if (key->ps_prolog.states.color_two_side) { + face = LLVMGetParam(func, face_vgpr); + face = ac_to_integer(&ctx->ac, face); + } + + interp_fs_color(ctx, key->ps_prolog.color_attr_index[i], i, key->ps_prolog.num_interp_inputs, + key->ps_prolog.colors_read, interp_ij, prim_mask, face, color); + + while (writemask) { + unsigned chan = u_bit_scan(&writemask); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan], + ctx->args.arg_count + color_out_idx++, ""); + } + } + + /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec + * says: + * + * "When per-sample shading is active due to the use of a fragment + * input qualified by sample or due to the use of the gl_SampleID + * or gl_SamplePosition variables, only the bit for the current + * sample is set in gl_SampleMaskIn. When state specifies multiple + * fragment shader invocations for a given fragment, the sample + * mask for any single fragment shader invocation may specify a + * subset of the covered samples for the fragment. In this case, + * the bit corresponding to each covered sample will be set in + * exactly one fragment shader invocation." 
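The forced-interpolation paths above all reuse one fixed slot layout (PERSP_SAMPLE at base+0, PERSP_CENTER at base+2, PERSP_CENTROID at base+4, and the LINEAR trio at base+6/+8/+10). A sketch of that overwrite as a plain array copy, with vgpr[] standing in for the prolog's input VGPRs (illustrative only):

/* Force per-sample interpolation by copying the PERSP_SAMPLE (i,j) pair
 * over the CENTER and CENTROID slots. The same pattern with base offsets
 * 6/8/10 handles the LINEAR barycentrics, and the force-center variants
 * copy base+2 over base+0 and base+4 instead. */
static void force_persp_sample(float *vgpr, unsigned base)
{
   for (unsigned i = 0; i < 2; i++) {
      vgpr[base + 2 + i] = vgpr[base + i]; /* PERSP_CENTER   <- PERSP_SAMPLE */
      vgpr[base + 4 + i] = vgpr[base + i]; /* PERSP_CENTROID <- PERSP_SAMPLE */
   }
}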
+ * + * The samplemask loaded by hardware is always the coverage of the + * entire pixel/fragment, so mask bits out based on the sample ID. + */ + if (key->ps_prolog.states.samplemask_log_ps_iter) { + /* The bit pattern matches that used by fixed function fragment + * processing. */ + static const uint16_t ps_iter_masks[] = { + 0xffff, /* not used */ + 0x5555, 0x1111, 0x0101, 0x0001, + }; + assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks)); + + uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter]; + LLVMValueRef sampleid = si_unpack_param(ctx, ancillary, 8, 4); + LLVMValueRef samplemask = ac_get_arg(&ctx->ac, param_sample_mask); + + samplemask = ac_to_integer(&ctx->ac, samplemask); + samplemask = + LLVMBuildAnd(ctx->ac.builder, samplemask, + LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), + sampleid, ""), + ""); + samplemask = ac_to_float(&ctx->ac, samplemask); + + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask, param_sample_mask.arg_index, ""); + } + + /* Tell LLVM to insert WQM instruction sequence when needed. */ + if (key->ps_prolog.wqm) { + LLVMAddTargetDependentFunctionAttr(func, "amdgpu-ps-wqm-outputs", ""); + } + + si_llvm_build_ret(ctx, ret); } /** * Build the pixel shader epilog function. This handles everything that must be * emulated for pixel shader exports. (alpha-test, format conversions, etc) */ -void si_llvm_build_ps_epilog(struct si_shader_context *ctx, - union si_shader_part_key *key) +void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key) { - LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; - int i; - struct si_ps_exports exp = {}; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - /* Declare input SGPRs. */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->rw_buffers); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->bindless_samplers_and_images); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->const_and_shader_buffers); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->samplers_and_images); - si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, - NULL, SI_PARAM_ALPHA_REF); - - /* Declare input VGPRs. */ - unsigned required_num_params = - ctx->args.num_sgprs_used + - util_bitcount(key->ps_epilog.colors_written) * 4 + - key->ps_epilog.writes_z + - key->ps_epilog.writes_stencil + - key->ps_epilog.writes_samplemask; - - required_num_params = MAX2(required_num_params, - ctx->args.num_sgprs_used + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); - - while (ctx->args.arg_count < required_num_params) - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); - - /* Create the function. */ - si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0); - /* Disable elimination of unused inputs. */ - ac_llvm_add_target_dep_function_attr(ctx->main_fn, - "InitialPSInputAddr", 0xffffff); - - /* Process colors. */ - unsigned vgpr = ctx->args.num_sgprs_used; - unsigned colors_written = key->ps_epilog.colors_written; - int last_color_export = -1; - - /* Find the last color export. */ - if (!key->ps_epilog.writes_z && - !key->ps_epilog.writes_stencil && - !key->ps_epilog.writes_samplemask) { - unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; - - /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { - /* Just set this if any of the colorbuffers are enabled. 
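A scalar model of the gl_SampleMaskIn narrowing done with ps_iter_masks above; narrow_sample_mask is a hypothetical helper, not part of the driver, and the table mirrors the one in the patch.

#include <assert.h>
#include <stdint.h>

/* Narrow the hardware coverage mask (whole pixel) to the bits owned by
 * this invocation: every 2^log_ps_iter-th bit, shifted to the sample ID. */
static uint32_t narrow_sample_mask(uint32_t coverage, unsigned log_ps_iter,
                                   unsigned sample_id)
{
   static const uint16_t ps_iter_masks[] = {0xffff, 0x5555, 0x1111, 0x0101, 0x0001};

   assert(log_ps_iter < 5);
   return coverage & ((uint32_t)ps_iter_masks[log_ps_iter] << sample_id);
}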
*/ - if (spi_format & - ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) - last_color_export = 0; - } else { - for (i = 0; i < 8; i++) - if (colors_written & (1 << i) && - (spi_format >> (i * 4)) & 0xf) - last_color_export = i; - } - } - - while (colors_written) { - LLVMValueRef color[4]; - int mrt = u_bit_scan(&colors_written); - - for (i = 0; i < 4; i++) - color[i] = LLVMGetParam(ctx->main_fn, vgpr++); - - si_export_mrt_color(ctx, color, mrt, - ctx->args.arg_count - 1, - mrt == last_color_export, &exp); - } - - /* Process depth, stencil, samplemask. */ - if (key->ps_epilog.writes_z) - depth = LLVMGetParam(ctx->main_fn, vgpr++); - if (key->ps_epilog.writes_stencil) - stencil = LLVMGetParam(ctx->main_fn, vgpr++); - if (key->ps_epilog.writes_samplemask) - samplemask = LLVMGetParam(ctx->main_fn, vgpr++); - - if (depth || stencil || samplemask) - si_export_mrt_z(ctx, depth, stencil, samplemask, &exp); - else if (last_color_export == -1) - ac_build_export_null(&ctx->ac); - - if (exp.num) - si_emit_ps_exports(ctx, &exp); - - /* Compile. */ - LLVMBuildRetVoid(ctx->ac.builder); + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + int i; + struct si_ps_exports exp = {}; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + /* Declare input SGPRs. */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->rw_buffers); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->bindless_samplers_and_images); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->const_and_shader_buffers); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->samplers_and_images); + si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_ALPHA_REF); + + /* Declare input VGPRs. */ + unsigned required_num_params = + ctx->args.num_sgprs_used + util_bitcount(key->ps_epilog.colors_written) * 4 + + key->ps_epilog.writes_z + key->ps_epilog.writes_stencil + key->ps_epilog.writes_samplemask; + + required_num_params = + MAX2(required_num_params, ctx->args.num_sgprs_used + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + while (ctx->args.arg_count < required_num_params) + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); + + /* Create the function. */ + si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0); + /* Disable elimination of unused inputs. */ + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "InitialPSInputAddr", 0xffffff); + + /* Process colors. */ + unsigned vgpr = ctx->args.num_sgprs_used; + unsigned colors_written = key->ps_epilog.colors_written; + int last_color_export = -1; + + /* Find the last color export. */ + if (!key->ps_epilog.writes_z && !key->ps_epilog.writes_stencil && + !key->ps_epilog.writes_samplemask) { + unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; + + /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ + if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { + /* Just set this if any of the colorbuffers are enabled. */ + if (spi_format & ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) + last_color_export = 0; + } else { + for (i = 0; i < 8; i++) + if (colors_written & (1 << i) && (spi_format >> (i * 4)) & 0xf) + last_color_export = i; + } + } + + while (colors_written) { + LLVMValueRef color[4]; + int mrt = u_bit_scan(&colors_written); + + for (i = 0; i < 4; i++) + color[i] = LLVMGetParam(ctx->main_fn, vgpr++); + + si_export_mrt_color(ctx, color, mrt, ctx->args.arg_count - 1, mrt == last_color_export, &exp); + } + + /* Process depth, stencil, samplemask. 
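The last-export search in the epilog boils down to plain bit scanning. A sketch under the same packing assumptions (4 bits of SPI_SHADER_COL_FORMAT per color buffer, one colors_written bit per MRT); the function name is illustrative.

/* Return the index of the last MRT that will really be exported, or -1
 * if no color export survives (only used when there is no Z/stencil/
 * samplemask export). */
static int find_last_color_export(unsigned colors_written, unsigned spi_format,
                                  unsigned last_cbuf)
{
   int last = -1;

   if (colors_written == 0x1 && last_cbuf > 0) {
      /* FS_COLOR0_WRITES_ALL_CBUFS: one export covers every enabled buffer. */
      if (spi_format & ((1ull << (4 * (last_cbuf + 1))) - 1))
         last = 0;
   } else {
      for (int i = 0; i < 8; i++)
         if ((colors_written & (1u << i)) && ((spi_format >> (i * 4)) & 0xf))
            last = i;
   }
   return last;
}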
*/ + if (key->ps_epilog.writes_z) + depth = LLVMGetParam(ctx->main_fn, vgpr++); + if (key->ps_epilog.writes_stencil) + stencil = LLVMGetParam(ctx->main_fn, vgpr++); + if (key->ps_epilog.writes_samplemask) + samplemask = LLVMGetParam(ctx->main_fn, vgpr++); + + if (depth || stencil || samplemask) + si_export_mrt_z(ctx, depth, stencil, samplemask, &exp); + else if (last_color_export == -1) + ac_build_export_null(&ctx->ac); + + if (exp.num) + si_emit_ps_exports(ctx, &exp); + + /* Compile. */ + LLVMBuildRetVoid(ctx->ac.builder); } -void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, - struct si_shader *shader) +void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader *shader) { - LLVMValueRef parts[3]; - unsigned num_parts = 0, main_index; + LLVMValueRef parts[3]; + unsigned num_parts = 0, main_index; - union si_shader_part_key prolog_key; - si_get_ps_prolog_key(shader, &prolog_key, false); + union si_shader_part_key prolog_key; + si_get_ps_prolog_key(shader, &prolog_key, false); - if (si_need_ps_prolog(&prolog_key)) { - si_llvm_build_ps_prolog(ctx, &prolog_key); - parts[num_parts++] = ctx->main_fn; - } + if (si_need_ps_prolog(&prolog_key)) { + si_llvm_build_ps_prolog(ctx, &prolog_key); + parts[num_parts++] = ctx->main_fn; + } - main_index = num_parts; - parts[num_parts++] = ctx->main_fn; + main_index = num_parts; + parts[num_parts++] = ctx->main_fn; - union si_shader_part_key epilog_key; - si_get_ps_epilog_key(shader, &epilog_key); - si_llvm_build_ps_epilog(ctx, &epilog_key); - parts[num_parts++] = ctx->main_fn; + union si_shader_part_key epilog_key; + si_get_ps_epilog_key(shader, &epilog_key); + si_llvm_build_ps_epilog(ctx, &epilog_key); + parts[num_parts++] = ctx->main_fn; - si_build_wrapper_function(ctx, parts, num_parts, main_index, 0); + si_build_wrapper_function(ctx, parts, num_parts, main_index, 0); } void si_llvm_init_ps_callbacks(struct si_shader_context *ctx) { - ctx->abi.emit_outputs = si_llvm_return_fs_outputs; - ctx->abi.load_sample_position = load_sample_position; - ctx->abi.load_sample_mask_in = load_sample_mask_in; - ctx->abi.emit_fbfetch = si_nir_emit_fbfetch; + ctx->abi.emit_outputs = si_llvm_return_fs_outputs; + ctx->abi.load_sample_position = load_sample_position; + ctx->abi.load_sample_mask_in = load_sample_mask_in; + ctx->abi.emit_fbfetch = si_nir_emit_fbfetch; } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c b/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c index cb06aa99ca7..122e6976261 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c @@ -22,111 +22,98 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_shader_internal.h" #include "si_pipe.h" +#include "si_shader_internal.h" #include "sid.h" /** * Return a value that is equal to the given i32 \p index if it lies in [0,num) * or an undefined value in the same interval otherwise. */ -static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, - LLVMValueRef index, - unsigned num) +static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, LLVMValueRef index, + unsigned num) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0); - LLVMValueRef cc; - - if (util_is_power_of_two_or_zero(num)) { - index = LLVMBuildAnd(builder, index, c_max, ""); - } else { - /* In theory, this MAX pattern should result in code that is - * as good as the bit-wise AND above. 
- * - * In practice, LLVM generates worse code (at the time of - * writing), because its value tracking is not strong enough. - */ - cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, ""); - index = LLVMBuildSelect(builder, cc, index, c_max, ""); - } - - return index; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0); + LLVMValueRef cc; + + if (util_is_power_of_two_or_zero(num)) { + index = LLVMBuildAnd(builder, index, c_max, ""); + } else { + /* In theory, this MAX pattern should result in code that is + * as good as the bit-wise AND above. + * + * In practice, LLVM generates worse code (at the time of + * writing), because its value tracking is not strong enough. + */ + cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, ""); + index = LLVMBuildSelect(builder, cc, index, c_max, ""); + } + + return index; } static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx) { - LLVMValueRef ptr = - ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); - struct si_shader_selector *sel = ctx->shader->selector; - - /* Do the bounds checking with a descriptor, because - * doing computation and manual bounds checking of 64-bit - * addresses generates horrible VALU code with very high - * VGPR usage and very low SIMD occupancy. - */ - ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, ""); - - LLVMValueRef desc0, desc1; - desc0 = ptr; - desc1 = LLVMConstInt(ctx->ac.i32, - S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); - - uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (ctx->screen->info.chip_class >= GFX10) - rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); - else - rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - - LLVMValueRef desc_elems[] = { - desc0, - desc1, - LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0), - LLVMConstInt(ctx->ac.i32, rsrc3, false) - }; - - return ac_build_gather_values(&ctx->ac, desc_elems, 4); + LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); + struct si_shader_selector *sel = ctx->shader->selector; + + /* Do the bounds checking with a descriptor, because + * doing computation and manual bounds checking of 64-bit + * addresses generates horrible VALU code with very high + * VGPR usage and very low SIMD occupancy. 
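A scalar equivalent of si_llvm_bound_index, for illustration only; the real code emits LLVM IR so the clamp stays branch-free on the GPU.

#include <stdbool.h>

static bool is_power_of_two_or_zero(unsigned n)
{
   return (n & (n - 1)) == 0;
}

/* Clamp a resource index into [0, num): power-of-two sizes can use a
 * cheap AND with num - 1, otherwise fall back to an unsigned min. */
static unsigned bound_index(unsigned index, unsigned num)
{
   if (is_power_of_two_or_zero(num))
      return index & (num - 1);
   return index <= num - 1 ? index : num - 1;
}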
+ */ + ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, ""); + + LLVMValueRef desc0, desc1; + desc0 = ptr; + desc1 = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); + + uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (ctx->screen->info.chip_class >= GFX10) + rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + else + rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + + LLVMValueRef desc_elems[] = {desc0, desc1, + LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0), + LLVMConstInt(ctx->ac.i32, rsrc3, false)}; + + return ac_build_gather_values(&ctx->ac, desc_elems, 4); } static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_selector *sel = ctx->shader->selector; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_selector *sel = ctx->shader->selector; - LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); + LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); - if (sel->info.const_buffers_declared == 1 && - sel->info.shader_buffers_declared == 0) { - return load_const_buffer_desc_fast_path(ctx); - } + if (sel->info.const_buffers_declared == 1 && sel->info.shader_buffers_declared == 0) { + return load_const_buffer_desc_fast_path(ctx); + } - index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers); - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), ""); + index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers); + index = + LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), ""); - return ac_build_load_to_sgpr(&ctx->ac, ptr, index); + return ac_build_load_to_sgpr(&ctx->ac, ptr, index); } -static LLVMValueRef -load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write) +static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, - ctx->const_and_shader_buffers); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); - index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers); - index = LLVMBuildSub(ctx->ac.builder, - LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0), - index, ""); + index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers); + index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0), + index, ""); - return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index); + return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index); } /** @@ -140,181 +127,167 @@ load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write) * nicer: disabling DCC in the shader still leads to undefined results but * avoids the lockup. 
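The four dwords packed by the constant-buffer fast path above, written out as plain C for reference. This assumes the register field macros from sid.h are available; va_lo/address32_hi/constbuf0_num_slots are stand-ins for the values the driver feeds in, and the helper itself is not driver code.

#include <stdbool.h>
#include <stdint.h>
#include "sid.h"

/* Sketch of the fast-path constant-buffer V# (4-dword buffer descriptor). */
static void build_const_buffer_fast_path_desc(uint32_t desc[4], uint32_t va_lo,
                                              uint32_t address32_hi,
                                              unsigned constbuf0_num_slots,
                                              bool gfx10plus)
{
   desc[0] = va_lo;                                /* BASE_ADDRESS[31:0]      */
   desc[1] = S_008F04_BASE_ADDRESS_HI(address32_hi);
   desc[2] = constbuf0_num_slots * 16;             /* NUM_RECORDS, 16 B/vec4  */
   desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
             S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
             (gfx10plus ? S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
                             S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
                             S_008F0C_RESOURCE_LEVEL(1)
                        : S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                             S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32));
}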
*/ -static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, - LLVMValueRef rsrc) +static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rsrc) { - if (ctx->screen->info.chip_class <= GFX7) { - return rsrc; - } else { - LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0); - LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0); - LLVMValueRef tmp; - - tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, ""); - tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, ""); - return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, ""); - } + if (ctx->screen->info.chip_class <= GFX7) { + return rsrc; + } else { + LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0); + LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0); + LLVMValueRef tmp; + + tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, ""); + tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, ""); + return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, ""); + } } /* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should * adjust "index" to point to FMASK. */ -static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, - LLVMValueRef list, LLVMValueRef index, - enum ac_descriptor_type desc_type, - bool uses_store, bool bindless) +static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list, + LLVMValueRef index, enum ac_descriptor_type desc_type, + bool uses_store, bool bindless) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef rsrc; - - if (desc_type == AC_DESC_BUFFER) { - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), - ctx->ac.i32_1); - list = LLVMBuildPointerCast(builder, list, - ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); - } else { - assert(desc_type == AC_DESC_IMAGE || - desc_type == AC_DESC_FMASK); - } - - if (bindless) - rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index); - else - rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index); - - if (desc_type == AC_DESC_IMAGE && uses_store) - rsrc = force_dcc_off(ctx, rsrc); - return rsrc; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef rsrc; + + if (desc_type == AC_DESC_BUFFER) { + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1); + list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + } else { + assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK); + } + + if (bindless) + rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index); + else + rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index); + + if (desc_type == AC_DESC_IMAGE && uses_store) + rsrc = force_dcc_off(ctx, rsrc); + return rsrc; } /** * Load an image view, fmask view. or sampler state descriptor. */ -static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, - LLVMValueRef list, LLVMValueRef index, - enum ac_descriptor_type type) +static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, LLVMValueRef list, + LLVMValueRef index, enum ac_descriptor_type type) { - LLVMBuilderRef builder = ctx->ac.builder; - - switch (type) { - case AC_DESC_IMAGE: - /* The image is at [0:7]. */ - index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - break; - case AC_DESC_BUFFER: - /* The buffer is in [4:7]. 
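A scalar model of force_dcc_off above: it clears the compression-enable field in dword 6 of the 8-dword image descriptor. C_008F28_COMPRESSION_EN is the inverted field mask from sid.h; the helper name is illustrative.

#include <stdbool.h>
#include <stdint.h>
#include "sid.h"

/* On GFX8+ the DCC enable lives in dword 6 of the image descriptor;
 * AND-ing with the inverted field mask turns it off so that shader
 * stores bypass DCC. GFX7 and older leave the descriptor untouched. */
static void force_dcc_off_model(uint32_t desc[8], bool gfx8plus)
{
   if (gfx8plus)
      desc[6] &= C_008F28_COMPRESSION_EN;
}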
*/ - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), - ctx->ac.i32_1); - list = LLVMBuildPointerCast(builder, list, - ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); - break; - case AC_DESC_FMASK: - /* The FMASK is at [8:15]. */ - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), - ctx->ac.i32_1); - break; - case AC_DESC_SAMPLER: - /* The sampler state is at [12:15]. */ - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), - LLVMConstInt(ctx->ac.i32, 3, 0)); - list = LLVMBuildPointerCast(builder, list, - ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); - break; - case AC_DESC_PLANE_0: - case AC_DESC_PLANE_1: - case AC_DESC_PLANE_2: - /* Only used for the multiplane image support for Vulkan. Should - * never be reached in radeonsi. - */ - unreachable("Plane descriptor requested in radeonsi."); - } - - return ac_build_load_to_sgpr(&ctx->ac, list, index); + LLVMBuilderRef builder = ctx->ac.builder; + + switch (type) { + case AC_DESC_IMAGE: + /* The image is at [0:7]. */ + index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + break; + case AC_DESC_BUFFER: + /* The buffer is in [4:7]. */ + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1); + list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + break; + case AC_DESC_FMASK: + /* The FMASK is at [8:15]. */ + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1); + break; + case AC_DESC_SAMPLER: + /* The sampler state is at [12:15]. */ + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), + LLVMConstInt(ctx->ac.i32, 3, 0)); + list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + break; + case AC_DESC_PLANE_0: + case AC_DESC_PLANE_1: + case AC_DESC_PLANE_2: + /* Only used for the multiplane image support for Vulkan. Should + * never be reached in radeonsi. + */ + unreachable("Plane descriptor requested in radeonsi."); + } + + return ac_build_load_to_sgpr(&ctx->ac, list, index); } -static LLVMValueRef -si_nir_load_sampler_desc(struct ac_shader_abi *abi, - unsigned descriptor_set, unsigned base_index, - unsigned constant_index, LLVMValueRef dynamic_index, - enum ac_descriptor_type desc_type, bool image, - bool write, bool bindless) +static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, + unsigned base_index, unsigned constant_index, + LLVMValueRef dynamic_index, + enum ac_descriptor_type desc_type, bool image, + bool write, bool bindless) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMBuilderRef builder = ctx->ac.builder; - unsigned const_index = base_index + constant_index; - - assert(!descriptor_set); - assert(desc_type <= AC_DESC_BUFFER); - - if (bindless) { - LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images); - - /* dynamic_index is the bindless handle */ - if (image) { - /* Bindless image descriptors use 16-dword slots. */ - dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index, - LLVMConstInt(ctx->ac.i64, 2, 0), ""); - /* FMASK is right after the image. 
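The index arithmetic in si_load_sampler_desc amounts to fixed dword offsets inside one 16-dword combined image+sampler slot. An illustrative helper (not driver code) summarizing the [0:7]/[4:7]/[8:15]/[12:15] layout described in the comments:

/* Dword offset of each descriptor inside a 16-dword combined slot. */
enum desc_kind { DESC_IMAGE, DESC_BUFFER, DESC_FMASK, DESC_SAMPLER };

static unsigned combined_slot_dword_offset(unsigned slot, enum desc_kind kind)
{
   switch (kind) {
   case DESC_IMAGE:   return slot * 16 + 0;  /* image view:  dwords [0:7]   */
   case DESC_BUFFER:  return slot * 16 + 4;  /* buffer view: dwords [4:7]   */
   case DESC_FMASK:   return slot * 16 + 8;  /* FMASK view:  dwords [8:15]  */
   case DESC_SAMPLER: return slot * 16 + 12; /* sampler:     dwords [12:15] */
   }
   return 0;
}

The index*2 / index*4+1 / index*4+3 multipliers in the code express exactly these offsets, measured in 8-dword (v8i32) or 4-dword (v4i32) list elements respectively.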
*/ - if (desc_type == AC_DESC_FMASK) { - dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, - ctx->ac.i32_1, ""); - } - - return si_load_image_desc(ctx, list, dynamic_index, desc_type, - write, true); - } - - /* Since bindless handle arithmetic can contain an unsigned integer - * wraparound and si_load_sampler_desc assumes there isn't any, - * use GEP without "inbounds" (inside ac_build_pointer_add) - * to prevent incorrect code generation and hangs. - */ - dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index, - LLVMConstInt(ctx->ac.i64, 2, 0), ""); - list = ac_build_pointer_add(&ctx->ac, list, dynamic_index); - return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type); - } - - unsigned num_slots = image ? ctx->num_images : ctx->num_samplers; - assert(const_index < num_slots || dynamic_index); - - LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images); - LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false); - - if (dynamic_index) { - index = LLVMBuildAdd(builder, index, dynamic_index, ""); - - /* From the GL_ARB_shader_image_load_store extension spec: - * - * If a shader performs an image load, store, or atomic - * operation using an image variable declared as an array, - * and if the index used to select an individual element is - * negative or greater than or equal to the size of the - * array, the results of the operation are undefined but may - * not lead to termination. - */ - index = si_llvm_bound_index(ctx, index, num_slots); - } - - if (image) { - /* FMASKs are separate from images. */ - if (desc_type == AC_DESC_FMASK) { - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), ""); - } - index = LLVMBuildSub(ctx->ac.builder, - LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0), - index, ""); - return si_load_image_desc(ctx, list, index, desc_type, write, false); - } - - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), ""); - return si_load_sampler_desc(ctx, list, index, desc_type); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMBuilderRef builder = ctx->ac.builder; + unsigned const_index = base_index + constant_index; + + assert(!descriptor_set); + assert(desc_type <= AC_DESC_BUFFER); + + if (bindless) { + LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images); + + /* dynamic_index is the bindless handle */ + if (image) { + /* Bindless image descriptors use 16-dword slots. */ + dynamic_index = + LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), ""); + /* FMASK is right after the image. */ + if (desc_type == AC_DESC_FMASK) { + dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, ctx->ac.i32_1, ""); + } + + return si_load_image_desc(ctx, list, dynamic_index, desc_type, write, true); + } + + /* Since bindless handle arithmetic can contain an unsigned integer + * wraparound and si_load_sampler_desc assumes there isn't any, + * use GEP without "inbounds" (inside ac_build_pointer_add) + * to prevent incorrect code generation and hangs. + */ + dynamic_index = + LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), ""); + list = ac_build_pointer_add(&ctx->ac, list, dynamic_index); + return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type); + } + + unsigned num_slots = image ? 
ctx->num_images : ctx->num_samplers; + assert(const_index < num_slots || dynamic_index); + + LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images); + LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false); + + if (dynamic_index) { + index = LLVMBuildAdd(builder, index, dynamic_index, ""); + + /* From the GL_ARB_shader_image_load_store extension spec: + * + * If a shader performs an image load, store, or atomic + * operation using an image variable declared as an array, + * and if the index used to select an individual element is + * negative or greater than or equal to the size of the + * array, the results of the operation are undefined but may + * not lead to termination. + */ + index = si_llvm_bound_index(ctx, index, num_slots); + } + + if (image) { + /* FMASKs are separate from images. */ + if (desc_type == AC_DESC_FMASK) { + index = + LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), ""); + } + index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0), + index, ""); + return si_load_image_desc(ctx, list, index, desc_type, write, false); + } + + index = LLVMBuildAdd(ctx->ac.builder, index, + LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), ""); + return si_load_sampler_desc(ctx, list, index, desc_type); } void si_llvm_init_resource_callbacks(struct si_shader_context *ctx) { - ctx->abi.load_ubo = load_ubo; - ctx->abi.load_ssbo = load_ssbo; - ctx->abi.load_sampler_desc = si_nir_load_sampler_desc; + ctx->abi.load_ubo = load_ubo; + ctx->abi.load_ssbo = load_ssbo; + ctx->abi.load_sampler_desc = si_nir_load_sampler_desc; } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 116e06e5af1..5dba9859988 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -22,23 +22,23 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_shader_internal.h" #include "si_pipe.h" +#include "si_shader_internal.h" #include "sid.h" static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx) { - switch (ctx->type) { - case PIPE_SHADER_TESS_CTRL: - return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8); + switch (ctx->type) { + case PIPE_SHADER_TESS_CTRL: + return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8); - case PIPE_SHADER_TESS_EVAL: - return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id); + case PIPE_SHADER_TESS_EVAL: + return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id); - default: - assert(0); - return NULL; - } + default: + assert(0); + return NULL; + } } /* Tessellation shaders pass outputs to the next shader using LDS. @@ -62,151 +62,134 @@ static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx) * All three shaders VS(LS), TCS, TES share the same LDS space. 
*/ -static LLVMValueRef -get_tcs_in_patch_stride(struct si_shader_context *ctx) +static LLVMValueRef get_tcs_in_patch_stride(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->vs_state_bits, 11, 13); + return si_unpack_param(ctx, ctx->vs_state_bits, 11, 13); } static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx) { - assert(ctx->type == PIPE_SHADER_TESS_CTRL); + assert(ctx->type == PIPE_SHADER_TESS_CTRL); - if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) - return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4; + if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) + return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4; - return util_last_bit64(ctx->shader->selector->outputs_written) * 4; + return util_last_bit64(ctx->shader->selector->outputs_written) * 4; } static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx) { - unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx); + unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx); - return LLVMConstInt(ctx->ac.i32, stride, 0); + return LLVMConstInt(ctx->ac.i32, stride, 0); } static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx) { - if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) - return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13); - - const struct si_shader_info *info = &ctx->shader->selector->info; - unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; - unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx); - unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written); - unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + - num_patch_outputs * 4; - return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0); + if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) + return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13); + + const struct si_shader_info *info = &ctx->shader->selector->info; + unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; + unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx); + unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written); + unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + num_patch_outputs * 4; + return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0); } -static LLVMValueRef -get_tcs_out_patch0_offset(struct si_shader_context *ctx) +static LLVMValueRef get_tcs_out_patch0_offset(struct si_shader_context *ctx) { - return LLVMBuildMul(ctx->ac.builder, - si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16), - LLVMConstInt(ctx->ac.i32, 4, 0), ""); + return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16), + LLVMConstInt(ctx->ac.i32, 4, 0), ""); } -static LLVMValueRef -get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) +static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) { - return LLVMBuildMul(ctx->ac.builder, - si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16), - LLVMConstInt(ctx->ac.i32, 4, 0), ""); + return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16), + LLVMConstInt(ctx->ac.i32, 4, 0), ""); } -static LLVMValueRef -get_tcs_in_current_patch_offset(struct si_shader_context *ctx) +static LLVMValueRef get_tcs_in_current_patch_offset(struct si_shader_context *ctx) { - LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx); - LLVMValueRef rel_patch_id = 
get_rel_patch_id(ctx); + LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, ""); + return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, ""); } -static LLVMValueRef -get_tcs_out_current_patch_offset(struct si_shader_context *ctx) +static LLVMValueRef get_tcs_out_current_patch_offset(struct si_shader_context *ctx) { - LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx); - LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); - LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); + LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx); + LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset); + return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset); } -static LLVMValueRef -get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) +static LLVMValueRef get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) { - LLVMValueRef patch0_patch_data_offset = - get_tcs_out_patch0_patch_data_offset(ctx); - LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); - LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); + LLVMValueRef patch0_patch_data_offset = get_tcs_out_patch0_patch_data_offset(ctx); + LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset); + return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset); } static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx) { - unsigned tcs_out_vertices = - ctx->shader->selector ? - ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0; + unsigned tcs_out_vertices = + ctx->shader->selector ? ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] + : 0; - /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */ - if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices) - return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0); + /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. 
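A dword-address model of the LDS helpers above, under the layout just described (input patches first, then TCS output patches). The struct is only an illustrative container; at runtime the strides and patch-0 bases come from SGPRs.

/* All quantities are in dwords. */
struct lshs_lds_layout {
   unsigned in_patch_stride;    /* LS/VS outputs per input patch            */
   unsigned out_patch0_offset;  /* where the TCS per-vertex outputs start   */
   unsigned out_vertex_stride;  /* dwords per TCS output vertex             */
   unsigned tcs_out_vertices;
   unsigned num_patch_outputs;
};

static unsigned tcs_in_patch_offset(const struct lshs_lds_layout *l,
                                    unsigned rel_patch_id)
{
   return rel_patch_id * l->in_patch_stride;
}

static unsigned tcs_out_patch_offset(const struct lshs_lds_layout *l,
                                     unsigned rel_patch_id)
{
   unsigned out_patch_stride =
      l->tcs_out_vertices * l->out_vertex_stride + l->num_patch_outputs * 4;
   return l->out_patch0_offset + rel_patch_id * out_patch_stride;
}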
*/ + if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices) + return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0); - return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6); + return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6); } static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) { - unsigned stride; - - switch (ctx->type) { - case PIPE_SHADER_VERTEX: - stride = ctx->shader->selector->lshs_vertex_stride / 4; - return LLVMConstInt(ctx->ac.i32, stride, 0); - - case PIPE_SHADER_TESS_CTRL: - if (ctx->screen->info.chip_class >= GFX9 && - ctx->shader->is_monolithic) { - stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4; - return LLVMConstInt(ctx->ac.i32, stride, 0); - } - return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8); - - default: - assert(0); - return NULL; - } + unsigned stride; + + switch (ctx->type) { + case PIPE_SHADER_VERTEX: + stride = ctx->shader->selector->lshs_vertex_stride / 4; + return LLVMConstInt(ctx->ac.i32, stride, 0); + + case PIPE_SHADER_TESS_CTRL: + if (ctx->screen->info.chip_class >= GFX9 && ctx->shader->is_monolithic) { + stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4; + return LLVMConstInt(ctx->ac.i32, stride, 0); + } + return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8); + + default: + assert(0); + return NULL; + } } -static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx, - LLVMValueRef vertex_dw_stride, - LLVMValueRef base_addr, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - ubyte name, ubyte index) +static LLVMValueRef +get_dw_address_from_generic_indices(struct si_shader_context *ctx, LLVMValueRef vertex_dw_stride, + LLVMValueRef base_addr, LLVMValueRef vertex_index, + LLVMValueRef param_index, ubyte name, ubyte index) { - if (vertex_dw_stride) { - base_addr = ac_build_imad(&ctx->ac, vertex_index, - vertex_dw_stride, base_addr); - } - - if (param_index) { - base_addr = ac_build_imad(&ctx->ac, param_index, - LLVMConstInt(ctx->ac.i32, 4, 0), base_addr); - } - - int param = name == TGSI_SEMANTIC_PATCH || - name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER ? - si_shader_io_get_unique_index_patch(name, index) : - si_shader_io_get_unique_index(name, index, false); - - /* Add the base address of the element. */ - return LLVMBuildAdd(ctx->ac.builder, base_addr, - LLVMConstInt(ctx->ac.i32, param * 4, 0), ""); + if (vertex_dw_stride) { + base_addr = ac_build_imad(&ctx->ac, vertex_index, vertex_dw_stride, base_addr); + } + + if (param_index) { + base_addr = ac_build_imad(&ctx->ac, param_index, LLVMConstInt(ctx->ac.i32, 4, 0), base_addr); + } + + int param = name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER + ? si_shader_io_get_unique_index_patch(name, index) + : si_shader_io_get_unique_index(name, index, false); + + /* Add the base address of the element. */ + return LLVMBuildAdd(ctx->ac.builder, base_addr, LLVMConstInt(ctx->ac.i32, param * 4, 0), ""); } /* The offchip buffer layout for TCS->TES is @@ -228,98 +211,88 @@ static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context * Note that every attribute has 4 components. 
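The addressing that falls out of this off-chip layout, as plain arithmetic (each attribute is one 16-byte vec4, per-vertex attributes first, per-patch data after patch_data_offset). A hedged sketch; the function and parameter names are stand-ins for the values unpacked from tcs_offchip_layout at runtime.

/* Byte offset of one attribute in the TCS->TES off-chip buffer.
 * vertex_index < 0 selects per-patch data. */
static unsigned tcs_tes_buffer_offset(unsigned rel_patch_id, unsigned num_patches,
                                      unsigned vertices_per_patch,
                                      unsigned param_index, int vertex_index,
                                      unsigned patch_data_offset)
{
   unsigned base, param_stride;

   if (vertex_index >= 0) {
      base = rel_patch_id * vertices_per_patch + vertex_index;
      param_stride = vertices_per_patch * num_patches; /* total vertices */
   } else {
      base = rel_patch_id;
      param_stride = num_patches;
   }

   unsigned offset = (param_index * param_stride + base) * 16;
   if (vertex_index < 0)
      offset += patch_data_offset;
   return offset;
}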
*/ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, - LLVMValueRef rel_patch_id, - LLVMValueRef vertex_index, + LLVMValueRef rel_patch_id, LLVMValueRef vertex_index, LLVMValueRef param_index) { - LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices; - LLVMValueRef param_stride, constant16; - - vertices_per_patch = get_num_tcs_out_vertices(ctx); - num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6); - total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, - num_patches, ""); - - constant16 = LLVMConstInt(ctx->ac.i32, 16, 0); - if (vertex_index) { - base_addr = ac_build_imad(&ctx->ac, rel_patch_id, - vertices_per_patch, vertex_index); - param_stride = total_vertices; - } else { - base_addr = rel_patch_id; - param_stride = num_patches; - } - - base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr); - base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, ""); - - if (!vertex_index) { - LLVMValueRef patch_data_offset = - si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20); - - base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, - patch_data_offset, ""); - } - return base_addr; + LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices; + LLVMValueRef param_stride, constant16; + + vertices_per_patch = get_num_tcs_out_vertices(ctx); + num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6); + total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, num_patches, ""); + + constant16 = LLVMConstInt(ctx->ac.i32, 16, 0); + if (vertex_index) { + base_addr = ac_build_imad(&ctx->ac, rel_patch_id, vertices_per_patch, vertex_index); + param_stride = total_vertices; + } else { + base_addr = rel_patch_id; + param_stride = num_patches; + } + + base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr); + base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, ""); + + if (!vertex_index) { + LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20); + + base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, patch_data_offset, ""); + } + return base_addr; } -static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices( - struct si_shader_context *ctx, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - ubyte name, ubyte index) +static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(struct si_shader_context *ctx, + LLVMValueRef vertex_index, + LLVMValueRef param_index, + ubyte name, ubyte index) { - unsigned param_index_base; - - param_index_base = name == TGSI_SEMANTIC_PATCH || - name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER ? - si_shader_io_get_unique_index_patch(name, index) : - si_shader_io_get_unique_index(name, index, false); - - if (param_index) { - param_index = LLVMBuildAdd(ctx->ac.builder, param_index, - LLVMConstInt(ctx->ac.i32, param_index_base, 0), - ""); - } else { - param_index = LLVMConstInt(ctx->ac.i32, param_index_base, 0); - } - - return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), - vertex_index, param_index); + unsigned param_index_base; + + param_index_base = name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER + ? 
si_shader_io_get_unique_index_patch(name, index) + : si_shader_io_get_unique_index(name, index, false); + + if (param_index) { + param_index = LLVMBuildAdd(ctx->ac.builder, param_index, + LLVMConstInt(ctx->ac.i32, param_index_base, 0), ""); + } else { + param_index = LLVMConstInt(ctx->ac.i32, param_index_base, 0); + } + + return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), vertex_index, param_index); } -static LLVMValueRef buffer_load(struct si_shader_context *ctx, - LLVMTypeRef type, unsigned swizzle, - LLVMValueRef buffer, LLVMValueRef offset, - LLVMValueRef base, bool can_speculate) +static LLVMValueRef buffer_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle, + LLVMValueRef buffer, LLVMValueRef offset, LLVMValueRef base, + bool can_speculate) { - LLVMValueRef value, value2; - LLVMTypeRef vec_type = LLVMVectorType(type, 4); + LLVMValueRef value, value2; + LLVMTypeRef vec_type = LLVMVectorType(type, 4); - if (swizzle == ~0) { - value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, - 0, ac_glc, can_speculate, false); + if (swizzle == ~0) { + value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, ac_glc, + can_speculate, false); - return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); - } + return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); + } - if (ac_get_type_size(type) != 8) { - value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, - 0, ac_glc, can_speculate, false); + if (ac_get_type_size(type) != 8) { + value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, ac_glc, + can_speculate, false); - value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); - return LLVMBuildExtractElement(ctx->ac.builder, value, - LLVMConstInt(ctx->ac.i32, swizzle, 0), ""); - } + value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); + return LLVMBuildExtractElement(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, swizzle, 0), + ""); + } - value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, - swizzle * 4, ac_glc, can_speculate, false); + value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4, ac_glc, + can_speculate, false); - value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, - swizzle * 4 + 4, ac_glc, can_speculate, false); + value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4 + 4, ac_glc, + can_speculate, false); - return si_build_gather_64bit(ctx, type, value, value2); + return si_build_gather_64bit(ctx, type, value, value2); } /** @@ -329,36 +302,34 @@ static LLVMValueRef buffer_load(struct si_shader_context *ctx, * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4 * \param dw_addr address in dwords */ -static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, - LLVMTypeRef type, unsigned swizzle, - LLVMValueRef dw_addr) +static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle, + LLVMValueRef dw_addr) { - LLVMValueRef value; + LLVMValueRef value; - if (swizzle == ~0) { - LLVMValueRef values[4]; + if (swizzle == ~0) { + LLVMValueRef values[4]; - for (unsigned chan = 0; chan < 4; chan++) - values[chan] = lshs_lds_load(ctx, type, chan, dw_addr); + for (unsigned chan = 0; chan < 4; chan++) + values[chan] = lshs_lds_load(ctx, type, chan, dw_addr); - return ac_build_gather_values(&ctx->ac, values, 4); - } + return ac_build_gather_values(&ctx->ac, values, 4); + } - /* Split 64-bit loads. 
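A scalar model of the 64-bit splitting performed by buffer_load() and lshs_lds_load(): a double component at dword swizzle s is fetched as the two dwords at s and s+1 and reassembled. This assumes si_build_gather_64bit combines them low dword first; the helper is illustrative, not driver code.

#include <stdint.h>
#include <string.h>

/* 'dwords' stands in for the LDS words or buffer contents. */
static double load_f64_component(const uint32_t *dwords, unsigned swizzle)
{
   uint64_t bits = (uint64_t)dwords[swizzle] | ((uint64_t)dwords[swizzle + 1] << 32);
   double d;
   memcpy(&d, &bits, sizeof(d));
   return d;
}

The swizzle == ~0 path is the analogous vec4 case: one 4-dword load that is then bitcast to the requested component type.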
*/ - if (ac_get_type_size(type) == 8) { - LLVMValueRef lo, hi; + /* Split 64-bit loads. */ + if (ac_get_type_size(type) == 8) { + LLVMValueRef lo, hi; - lo = lshs_lds_load(ctx, ctx->ac.i32, swizzle, dw_addr); - hi = lshs_lds_load(ctx, ctx->ac.i32, swizzle + 1, dw_addr); - return si_build_gather_64bit(ctx, type, lo, hi); - } + lo = lshs_lds_load(ctx, ctx->ac.i32, swizzle, dw_addr); + hi = lshs_lds_load(ctx, ctx->ac.i32, swizzle + 1, dw_addr); + return si_build_gather_64bit(ctx, type, lo, hi); + } - dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, - LLVMConstInt(ctx->ac.i32, swizzle, 0), ""); + dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, swizzle, 0), ""); - value = ac_lds_load(&ctx->ac, dw_addr); + value = ac_lds_load(&ctx->ac, dw_addr); - return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); + return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); } /** @@ -368,423 +339,367 @@ static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, * \param dw_addr address in dwords * \param value value to store */ -static void lshs_lds_store(struct si_shader_context *ctx, - unsigned dw_offset_imm, LLVMValueRef dw_addr, - LLVMValueRef value) +static void lshs_lds_store(struct si_shader_context *ctx, unsigned dw_offset_imm, + LLVMValueRef dw_addr, LLVMValueRef value) { - dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, - LLVMConstInt(ctx->ac.i32, dw_offset_imm, 0), ""); + dw_addr = + LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, dw_offset_imm, 0), ""); - ac_lds_store(&ctx->ac, dw_addr, value); + ac_lds_store(&ctx->ac, dw_addr, value); } -enum si_tess_ring { - TCS_FACTOR_RING, - TESS_OFFCHIP_RING_TCS, - TESS_OFFCHIP_RING_TES, +enum si_tess_ring +{ + TCS_FACTOR_RING, + TESS_OFFCHIP_RING_TCS, + TESS_OFFCHIP_RING_TES, }; -static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, - enum si_tess_ring ring) +static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, enum si_tess_ring ring) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef addr = ac_get_arg(&ctx->ac, - ring == TESS_OFFCHIP_RING_TES ? - ctx->tes_offchip_addr : - ctx->tcs_out_lds_layout); - - /* TCS only receives high 13 bits of the address. */ - if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) { - addr = LLVMBuildAnd(builder, addr, - LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), ""); - } - - if (ring == TCS_FACTOR_RING) { - unsigned tf_offset = ctx->screen->tess_offchip_ring_size; - addr = LLVMBuildAdd(builder, addr, - LLVMConstInt(ctx->ac.i32, tf_offset, 0), ""); - } - - uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (ctx->screen->info.chip_class >= GFX10) - rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); - else - rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - - LLVMValueRef desc[4]; - desc[0] = addr; - desc[1] = LLVMConstInt(ctx->ac.i32, - S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); - desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0); - desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false); - - return ac_build_gather_values(&ctx->ac, desc, 4); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef addr = ac_get_arg( + &ctx->ac, ring == TESS_OFFCHIP_RING_TES ? 
ctx->tes_offchip_addr : ctx->tcs_out_lds_layout); + + /* TCS only receives high 13 bits of the address. */ + if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) { + addr = LLVMBuildAnd(builder, addr, LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), ""); + } + + if (ring == TCS_FACTOR_RING) { + unsigned tf_offset = ctx->screen->tess_offchip_ring_size; + addr = LLVMBuildAdd(builder, addr, LLVMConstInt(ctx->ac.i32, tf_offset, 0), ""); + } + + uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (ctx->screen->info.chip_class >= GFX10) + rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + else + rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + + LLVMValueRef desc[4]; + desc[0] = addr; + desc[1] = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); + desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0); + desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false); + + return ac_build_gather_values(&ctx->ac, desc, 4); } void si_llvm_preload_tes_rings(struct si_shader_context *ctx) { - ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES); + ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES); } -static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, - LLVMTypeRef type, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - bool is_patch, - bool is_compact, - bool load_input) +static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMTypeRef type, + LLVMValueRef vertex_index, LLVMValueRef param_index, + unsigned const_index, unsigned location, + unsigned driver_location, unsigned component, + unsigned num_components, bool is_patch, + bool is_compact, bool load_input) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - LLVMValueRef dw_addr, stride; - ubyte name, index; - - driver_location = driver_location / 4; - - if (load_input) { - name = info->input_semantic_name[driver_location]; - index = info->input_semantic_index[driver_location]; - } else { - name = info->output_semantic_name[driver_location]; - index = info->output_semantic_index[driver_location]; - } - - assert((name == TGSI_SEMANTIC_PATCH || - name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER) == is_patch); - - if (load_input) { - stride = get_tcs_in_vertex_dw_stride(ctx); - dw_addr = get_tcs_in_current_patch_offset(ctx); - } else { - if (is_patch) { - stride = NULL; - dw_addr = get_tcs_out_current_patch_data_offset(ctx); - } else { - stride = get_tcs_out_vertex_dw_stride(ctx); - dw_addr = get_tcs_out_current_patch_offset(ctx); - } - } - - if (!param_index) { - param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); - } - - dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, - vertex_index, param_index, - name, index); - - LLVMValueRef value[4]; - for (unsigned i = 0; i < num_components; i++) { - unsigned offset = i; - if (ac_get_type_size(type) == 8) - offset *= 2; - - offset += component; - value[i + component] = lshs_lds_load(ctx, type, offset, dw_addr); - } - - return 
ac_build_varying_gather_values(&ctx->ac, value, num_components, component); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + LLVMValueRef dw_addr, stride; + ubyte name, index; + + driver_location = driver_location / 4; + + if (load_input) { + name = info->input_semantic_name[driver_location]; + index = info->input_semantic_index[driver_location]; + } else { + name = info->output_semantic_name[driver_location]; + index = info->output_semantic_index[driver_location]; + } + + assert((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER) == is_patch); + + if (load_input) { + stride = get_tcs_in_vertex_dw_stride(ctx); + dw_addr = get_tcs_in_current_patch_offset(ctx); + } else { + if (is_patch) { + stride = NULL; + dw_addr = get_tcs_out_current_patch_data_offset(ctx); + } else { + stride = get_tcs_out_vertex_dw_stride(ctx); + dw_addr = get_tcs_out_current_patch_offset(ctx); + } + } + + if (!param_index) { + param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); + } + + dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index, + name, index); + + LLVMValueRef value[4]; + for (unsigned i = 0; i < num_components; i++) { + unsigned offset = i; + if (ac_get_type_size(type) == 8) + offset *= 2; + + offset += component; + value[i + component] = lshs_lds_load(ctx, type, offset, dw_addr); + } + + return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); } -static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, - LLVMTypeRef type, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - bool is_patch, - bool is_compact, - bool load_input) +static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, LLVMTypeRef type, + LLVMValueRef vertex_index, LLVMValueRef param_index, + unsigned const_index, unsigned location, + unsigned driver_location, unsigned component, + unsigned num_components, bool is_patch, bool is_compact, + bool load_input) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - LLVMValueRef base, addr; - - driver_location = driver_location / 4; - ubyte name = info->input_semantic_name[driver_location]; - ubyte index = info->input_semantic_index[driver_location]; - - assert((name == TGSI_SEMANTIC_PATCH || - name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER) == is_patch); - - base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); - - if (!param_index) { - param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); - } - - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, - param_index, - name, index); - - /* TODO: This will generate rather ordinary llvm code, although it - * should be easy for the optimiser to fix up. In future we might want - * to refactor buffer_load(). 
- */ - LLVMValueRef value[4]; - for (unsigned i = 0; i < num_components; i++) { - unsigned offset = i; - if (ac_get_type_size(type) == 8) { - offset *= 2; - if (offset == 4) { - ubyte name = info->input_semantic_name[driver_location + 1]; - ubyte index = info->input_semantic_index[driver_location + 1]; - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, - vertex_index, - param_index, - name, index); - } - - offset = offset % 4; - } - - offset += component; - value[i + component] = buffer_load(ctx, type, offset, - ctx->tess_offchip_ring, base, addr, true); - } - - return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + LLVMValueRef base, addr; + + driver_location = driver_location / 4; + ubyte name = info->input_semantic_name[driver_location]; + ubyte index = info->input_semantic_index[driver_location]; + + assert((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER) == is_patch); + + base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + + if (!param_index) { + param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); + } + + addr = + get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, name, index); + + /* TODO: This will generate rather ordinary llvm code, although it + * should be easy for the optimiser to fix up. In future we might want + * to refactor buffer_load(). + */ + LLVMValueRef value[4]; + for (unsigned i = 0; i < num_components; i++) { + unsigned offset = i; + if (ac_get_type_size(type) == 8) { + offset *= 2; + if (offset == 4) { + ubyte name = info->input_semantic_name[driver_location + 1]; + ubyte index = info->input_semantic_index[driver_location + 1]; + addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, + name, index); + } + + offset = offset % 4; + } + + offset += component; + value[i + component] = + buffer_load(ctx, type, offset, ctx->tess_offchip_ring, base, addr, true); + } + + return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); } -static void si_nir_store_output_tcs(struct ac_shader_abi *abi, - const struct nir_variable *var, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - LLVMValueRef src, - unsigned writemask) +static void si_nir_store_output_tcs(struct ac_shader_abi *abi, const struct nir_variable *var, + LLVMValueRef vertex_index, LLVMValueRef param_index, + unsigned const_index, LLVMValueRef src, unsigned writemask) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - const unsigned component = var->data.location_frac; - unsigned driver_location = var->data.driver_location; - LLVMValueRef dw_addr, stride; - LLVMValueRef buffer, base, addr; - LLVMValueRef values[8]; - bool skip_lds_store; - bool is_tess_factor = false, is_tess_inner = false; - - driver_location = driver_location / 4; - ubyte name = info->output_semantic_name[driver_location]; - ubyte index = info->output_semantic_index[driver_location]; - - bool is_const = !param_index; - if (!param_index) - param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); - - const bool is_patch = var->data.patch || - var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || - var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER; - - /* Invalid SPIR-V can cause this. 
*/ - if ((name == TGSI_SEMANTIC_PATCH || - name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER) != is_patch) - return; - - if (!is_patch) { - stride = get_tcs_out_vertex_dw_stride(ctx); - dw_addr = get_tcs_out_current_patch_offset(ctx); - dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, - vertex_index, param_index, - name, index); - - skip_lds_store = !info->reads_pervertex_outputs; - } else { - dw_addr = get_tcs_out_current_patch_data_offset(ctx); - dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, - vertex_index, param_index, - name, index); - - skip_lds_store = !info->reads_perpatch_outputs; - - if (is_const && const_index == 0) { - int name = info->output_semantic_name[driver_location]; - - /* Always write tess factors into LDS for the TCS epilog. */ - if (name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER) { - /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ - skip_lds_store = !info->reads_tessfactor_outputs && - ctx->shader->selector->info.tessfactors_are_def_in_all_invocs; - is_tess_factor = true; - is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; - } - } - } - - buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); - - base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); - - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, - param_index, name, index); - - for (unsigned chan = component; chan < 8; chan++) { - if (!(writemask & (1 << chan))) - continue; - LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); - - unsigned buffer_store_offset = chan % 4; - if (chan == 4) { - ubyte name = info->output_semantic_name[driver_location + 1]; - ubyte index = info->output_semantic_index[driver_location + 1]; - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, - vertex_index, - param_index, - name, index); - } - - /* Skip LDS stores if there is no LDS read of this output. */ - if (!skip_lds_store) - lshs_lds_store(ctx, chan, dw_addr, value); - - value = ac_to_integer(&ctx->ac, value); - values[chan] = value; - - if (writemask != 0xF && !is_tess_factor) { - ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, - addr, base, - 4 * buffer_store_offset, - ac_glc); - } - - /* Write tess factors into VGPRs for the epilog. 
*/ - if (is_tess_factor && - ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) { - if (!is_tess_inner) { - LLVMBuildStore(ctx->ac.builder, value, /* outer */ - ctx->invoc0_tess_factors[chan]); - } else if (chan < 2) { - LLVMBuildStore(ctx->ac.builder, value, /* inner */ - ctx->invoc0_tess_factors[4 + chan]); - } - } - } - - if (writemask == 0xF && !is_tess_factor) { - LLVMValueRef value = ac_build_gather_values(&ctx->ac, - values, 4); - ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr, - base, 0, ac_glc); - } + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + const unsigned component = var->data.location_frac; + unsigned driver_location = var->data.driver_location; + LLVMValueRef dw_addr, stride; + LLVMValueRef buffer, base, addr; + LLVMValueRef values[8]; + bool skip_lds_store; + bool is_tess_factor = false, is_tess_inner = false; + + driver_location = driver_location / 4; + ubyte name = info->output_semantic_name[driver_location]; + ubyte index = info->output_semantic_index[driver_location]; + + bool is_const = !param_index; + if (!param_index) + param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); + + const bool is_patch = var->data.patch || var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || + var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER; + + /* Invalid SPIR-V can cause this. */ + if ((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER) != is_patch) + return; + + if (!is_patch) { + stride = get_tcs_out_vertex_dw_stride(ctx); + dw_addr = get_tcs_out_current_patch_offset(ctx); + dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index, + name, index); + + skip_lds_store = !info->reads_pervertex_outputs; + } else { + dw_addr = get_tcs_out_current_patch_data_offset(ctx); + dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, vertex_index, param_index, + name, index); + + skip_lds_store = !info->reads_perpatch_outputs; + + if (is_const && const_index == 0) { + int name = info->output_semantic_name[driver_location]; + + /* Always write tess factors into LDS for the TCS epilog. */ + if (name == TGSI_SEMANTIC_TESSINNER || name == TGSI_SEMANTIC_TESSOUTER) { + /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ + skip_lds_store = !info->reads_tessfactor_outputs && + ctx->shader->selector->info.tessfactors_are_def_in_all_invocs; + is_tess_factor = true; + is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; + } + } + } + + buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); + + base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + + addr = + get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, name, index); + + for (unsigned chan = component; chan < 8; chan++) { + if (!(writemask & (1 << chan))) + continue; + LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); + + unsigned buffer_store_offset = chan % 4; + if (chan == 4) { + ubyte name = info->output_semantic_name[driver_location + 1]; + ubyte index = info->output_semantic_index[driver_location + 1]; + addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, + name, index); + } + + /* Skip LDS stores if there is no LDS read of this output. 
*/ + if (!skip_lds_store) + lshs_lds_store(ctx, chan, dw_addr, value); + + value = ac_to_integer(&ctx->ac, value); + values[chan] = value; + + if (writemask != 0xF && !is_tess_factor) { + ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, addr, base, + 4 * buffer_store_offset, ac_glc); + } + + /* Write tess factors into VGPRs for the epilog. */ + if (is_tess_factor && ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) { + if (!is_tess_inner) { + LLVMBuildStore(ctx->ac.builder, value, /* outer */ + ctx->invoc0_tess_factors[chan]); + } else if (chan < 2) { + LLVMBuildStore(ctx->ac.builder, value, /* inner */ + ctx->invoc0_tess_factors[4 + chan]); + } + } + } + + if (writemask == 0xF && !is_tess_factor) { + LLVMValueRef value = ac_build_gather_values(&ctx->ac, values, 4); + ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr, base, 0, ac_glc); + } } static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef coord[4] = { - ac_get_arg(&ctx->ac, ctx->tes_u), - ac_get_arg(&ctx->ac, ctx->tes_v), - ctx->ac.f32_0, - ctx->ac.f32_0 - }; - - /* For triangles, the vector should be (u, v, 1-u-v). */ - if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == - PIPE_PRIM_TRIANGLES) { - coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, - LLVMBuildFAdd(ctx->ac.builder, - coord[0], coord[1], ""), ""); - } - return ac_build_gather_values(&ctx->ac, coord, 4); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMValueRef coord[4] = {ac_get_arg(&ctx->ac, ctx->tes_u), ac_get_arg(&ctx->ac, ctx->tes_v), + ctx->ac.f32_0, ctx->ac.f32_0}; + + /* For triangles, the vector should be (u, v, 1-u-v). */ + if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_TRIANGLES) { + coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, + LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), ""); + } + return ac_build_gather_values(&ctx->ac, coord, 4); } -static LLVMValueRef load_tess_level(struct si_shader_context *ctx, - unsigned semantic_name) +static LLVMValueRef load_tess_level(struct si_shader_context *ctx, unsigned semantic_name) { - LLVMValueRef base, addr; - - int param = si_shader_io_get_unique_index_patch(semantic_name, 0); + LLVMValueRef base, addr; - base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); - addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL, - LLVMConstInt(ctx->ac.i32, param, 0)); + int param = si_shader_io_get_unique_index_patch(semantic_name, 0); - return buffer_load(ctx, ctx->ac.f32, - ~0, ctx->tess_offchip_ring, base, addr, true); + base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL, + LLVMConstInt(ctx->ac.i32, param, 0)); + return buffer_load(ctx, ctx->ac.f32, ~0, ctx->tess_offchip_ring, base, addr, true); } -static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx, - unsigned semantic_name) +static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx, unsigned semantic_name) { - LLVMValueRef buf, slot, val[4]; - int i, offset; - - slot = LLVMConstInt(ctx->ac.i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0); - buf = ac_get_arg(&ctx->ac, ctx->rw_buffers); - buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot); - offset = semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 
4 : 0; - - for (i = 0; i < 4; i++) - val[i] = si_buffer_load_const(ctx, buf, - LLVMConstInt(ctx->ac.i32, (offset + i) * 4, 0)); - return ac_build_gather_values(&ctx->ac, val, 4); + LLVMValueRef buf, slot, val[4]; + int i, offset; + + slot = LLVMConstInt(ctx->ac.i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0); + buf = ac_get_arg(&ctx->ac, ctx->rw_buffers); + buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot); + offset = semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 4 : 0; + + for (i = 0; i < 4; i++) + val[i] = si_buffer_load_const(ctx, buf, LLVMConstInt(ctx->ac.i32, (offset + i) * 4, 0)); + return ac_build_gather_values(&ctx->ac, val, 4); } -static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, - unsigned varying_id, - bool load_default_state) +static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, unsigned varying_id, + bool load_default_state) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - unsigned semantic_name; - - if (load_default_state) { - switch (varying_id) { - case VARYING_SLOT_TESS_LEVEL_INNER: - semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL; - break; - case VARYING_SLOT_TESS_LEVEL_OUTER: - semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL; - break; - default: - unreachable("unknown tess level"); - } - return load_tess_level_default(ctx, semantic_name); - } - - switch (varying_id) { - case VARYING_SLOT_TESS_LEVEL_INNER: - semantic_name = TGSI_SEMANTIC_TESSINNER; - break; - case VARYING_SLOT_TESS_LEVEL_OUTER: - semantic_name = TGSI_SEMANTIC_TESSOUTER; - break; - default: - unreachable("unknown tess level"); - } - - return load_tess_level(ctx, semantic_name); - + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + unsigned semantic_name; + + if (load_default_state) { + switch (varying_id) { + case VARYING_SLOT_TESS_LEVEL_INNER: + semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL; + break; + case VARYING_SLOT_TESS_LEVEL_OUTER: + semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL; + break; + default: + unreachable("unknown tess level"); + } + return load_tess_level_default(ctx, semantic_name); + } + + switch (varying_id) { + case VARYING_SLOT_TESS_LEVEL_INNER: + semantic_name = TGSI_SEMANTIC_TESSINNER; + break; + case VARYING_SLOT_TESS_LEVEL_OUTER: + semantic_name = TGSI_SEMANTIC_TESSOUTER; + break; + default: + unreachable("unknown tess level"); + } + + return load_tess_level(ctx, semantic_name); } static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - if (ctx->type == PIPE_SHADER_TESS_CTRL) - return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6); - else if (ctx->type == PIPE_SHADER_TESS_EVAL) - return get_num_tcs_out_vertices(ctx); - else - unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN"); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + if (ctx->type == PIPE_SHADER_TESS_CTRL) + return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6); + else if (ctx->type == PIPE_SHADER_TESS_EVAL) + return get_num_tcs_out_vertices(ctx); + else + unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN"); } /** @@ -793,503 +708,450 @@ static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi) */ static void si_copy_tcs_inputs(struct si_shader_context *ctx) { - LLVMValueRef invocation_id, buffer, buffer_offset; - LLVMValueRef lds_vertex_stride, lds_base; - uint64_t inputs; + LLVMValueRef invocation_id, buffer, buffer_offset; + LLVMValueRef lds_vertex_stride, 
lds_base; + uint64_t inputs; - invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5); - buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); - buffer_offset = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5); + buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); + buffer_offset = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); - lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx); - lds_base = get_tcs_in_current_patch_offset(ctx); - lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, - lds_base); + lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx); + lds_base = get_tcs_in_current_patch_offset(ctx); + lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, lds_base); - inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy; - while (inputs) { - unsigned i = u_bit_scan64(&inputs); + inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy; + while (inputs) { + unsigned i = u_bit_scan64(&inputs); - LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base, - LLVMConstInt(ctx->ac.i32, 4 * i, 0), - ""); + LLVMValueRef lds_ptr = + LLVMBuildAdd(ctx->ac.builder, lds_base, LLVMConstInt(ctx->ac.i32, 4 * i, 0), ""); - LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx, - get_rel_patch_id(ctx), - invocation_id, - LLVMConstInt(ctx->ac.i32, i, 0)); + LLVMValueRef buffer_addr = get_tcs_tes_buffer_address( + ctx, get_rel_patch_id(ctx), invocation_id, LLVMConstInt(ctx->ac.i32, i, 0)); - LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr); + LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr); - ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, - buffer_offset, 0, ac_glc); - } + ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, buffer_offset, 0, + ac_glc); + } } -static void si_write_tess_factors(struct si_shader_context *ctx, - LLVMValueRef rel_patch_id, - LLVMValueRef invocation_id, - LLVMValueRef tcs_out_current_patch_data_offset, - LLVMValueRef invoc0_tf_outer[4], - LLVMValueRef invoc0_tf_inner[2]) +static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef rel_patch_id, + LLVMValueRef invocation_id, + LLVMValueRef tcs_out_current_patch_data_offset, + LLVMValueRef invoc0_tf_outer[4], LLVMValueRef invoc0_tf_inner[2]) { - struct si_shader *shader = ctx->shader; - unsigned tess_inner_index, tess_outer_index; - LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer; - LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; - unsigned stride, outer_comps, inner_comps, i, offset; - - /* Add a barrier before loading tess factors from LDS. */ - if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) - si_llvm_emit_barrier(ctx); - - /* Do this only for invocation 0, because the tess levels are per-patch, - * not per-vertex. - * - * This can't jump, because invocation 0 executes this. It should - * at least mask out the loads and stores for other invocations. - */ - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - invocation_id, ctx->ac.i32_0, ""), 6503); - - /* Determine the layout of one tess factor element in the buffer. 
*/ - switch (shader->key.part.tcs.epilog.prim_mode) { - case PIPE_PRIM_LINES: - stride = 2; /* 2 dwords, 1 vec2 store */ - outer_comps = 2; - inner_comps = 0; - break; - case PIPE_PRIM_TRIANGLES: - stride = 4; /* 4 dwords, 1 vec4 store */ - outer_comps = 3; - inner_comps = 1; - break; - case PIPE_PRIM_QUADS: - stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */ - outer_comps = 4; - inner_comps = 2; - break; - default: - assert(0); - return; - } - - for (i = 0; i < 4; i++) { - inner[i] = LLVMGetUndef(ctx->ac.i32); - outer[i] = LLVMGetUndef(ctx->ac.i32); - } - - if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) { - /* Tess factors are in VGPRs. */ - for (i = 0; i < outer_comps; i++) - outer[i] = out[i] = invoc0_tf_outer[i]; - for (i = 0; i < inner_comps; i++) - inner[i] = out[outer_comps+i] = invoc0_tf_inner[i]; - } else { - /* Load tess_inner and tess_outer from LDS. - * Any invocation can write them, so we can't get them from a temporary. - */ - tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); - tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); - - lds_base = tcs_out_current_patch_data_offset; - lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base, - LLVMConstInt(ctx->ac.i32, - tess_inner_index * 4, 0), ""); - lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base, - LLVMConstInt(ctx->ac.i32, - tess_outer_index * 4, 0), ""); - - for (i = 0; i < outer_comps; i++) { - outer[i] = out[i] = - lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer); - } - for (i = 0; i < inner_comps; i++) { - inner[i] = out[outer_comps+i] = - lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner); - } - } - - if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { - /* For isolines, the hardware expects tess factors in the - * reverse order from what NIR specifies. - */ - LLVMValueRef tmp = out[0]; - out[0] = out[1]; - out[1] = tmp; - } - - /* Convert the outputs to vectors for stores. */ - vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4)); - vec1 = NULL; - - if (stride > 4) - vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4); - - /* Get the buffer. */ - buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING); - - /* Get the offset. */ - tf_base = ac_get_arg(&ctx->ac, - ctx->tcs_factor_offset); - byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id, - LLVMConstInt(ctx->ac.i32, 4 * stride, 0), ""); - - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - rel_patch_id, ctx->ac.i32_0, ""), 6504); - - /* Store the dynamic HS control word. */ - offset = 0; - if (ctx->screen->info.chip_class <= GFX8) { - ac_build_buffer_store_dword(&ctx->ac, buffer, - LLVMConstInt(ctx->ac.i32, 0x80000000, 0), - 1, ctx->ac.i32_0, tf_base, - offset, ac_glc); - offset += 4; - } - - ac_build_endif(&ctx->ac, 6504); - - /* Store the tessellation factors. */ - ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, - MIN2(stride, 4), byteoffset, tf_base, - offset, ac_glc); - offset += 16; - if (vec1) - ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, - stride - 4, byteoffset, tf_base, - offset, ac_glc); - - /* Store the tess factors into the offchip buffer if TES reads them. 
*/ - if (shader->key.part.tcs.epilog.tes_reads_tess_factors) { - LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset; - LLVMValueRef tf_inner_offset; - unsigned param_outer, param_inner; - - buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); - base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); - - param_outer = si_shader_io_get_unique_index_patch( - TGSI_SEMANTIC_TESSOUTER, 0); - tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, - LLVMConstInt(ctx->ac.i32, param_outer, 0)); - - unsigned outer_vec_size = - ac_has_vec3_support(ctx->screen->info.chip_class, false) ? - outer_comps : util_next_power_of_two(outer_comps); - outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size); - - ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, - outer_comps, tf_outer_offset, - base, 0, ac_glc); - if (inner_comps) { - param_inner = si_shader_io_get_unique_index_patch( - TGSI_SEMANTIC_TESSINNER, 0); - tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, - LLVMConstInt(ctx->ac.i32, param_inner, 0)); - - inner_vec = inner_comps == 1 ? inner[0] : - ac_build_gather_values(&ctx->ac, inner, inner_comps); - ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, - inner_comps, tf_inner_offset, - base, 0, ac_glc); - } - } - - ac_build_endif(&ctx->ac, 6503); + struct si_shader *shader = ctx->shader; + unsigned tess_inner_index, tess_outer_index; + LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer; + LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; + unsigned stride, outer_comps, inner_comps, i, offset; + + /* Add a barrier before loading tess factors from LDS. */ + if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) + si_llvm_emit_barrier(ctx); + + /* Do this only for invocation 0, because the tess levels are per-patch, + * not per-vertex. + * + * This can't jump, because invocation 0 executes this. It should + * at least mask out the loads and stores for other invocations. + */ + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, invocation_id, ctx->ac.i32_0, ""), 6503); + + /* Determine the layout of one tess factor element in the buffer. */ + switch (shader->key.part.tcs.epilog.prim_mode) { + case PIPE_PRIM_LINES: + stride = 2; /* 2 dwords, 1 vec2 store */ + outer_comps = 2; + inner_comps = 0; + break; + case PIPE_PRIM_TRIANGLES: + stride = 4; /* 4 dwords, 1 vec4 store */ + outer_comps = 3; + inner_comps = 1; + break; + case PIPE_PRIM_QUADS: + stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */ + outer_comps = 4; + inner_comps = 2; + break; + default: + assert(0); + return; + } + + for (i = 0; i < 4; i++) { + inner[i] = LLVMGetUndef(ctx->ac.i32); + outer[i] = LLVMGetUndef(ctx->ac.i32); + } + + if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) { + /* Tess factors are in VGPRs. */ + for (i = 0; i < outer_comps; i++) + outer[i] = out[i] = invoc0_tf_outer[i]; + for (i = 0; i < inner_comps; i++) + inner[i] = out[outer_comps + i] = invoc0_tf_inner[i]; + } else { + /* Load tess_inner and tess_outer from LDS. + * Any invocation can write them, so we can't get them from a temporary. 
+ */ + tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); + tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); + + lds_base = tcs_out_current_patch_data_offset; + lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base, + LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, 0), ""); + lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base, + LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, 0), ""); + + for (i = 0; i < outer_comps; i++) { + outer[i] = out[i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer); + } + for (i = 0; i < inner_comps; i++) { + inner[i] = out[outer_comps + i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner); + } + } + + if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { + /* For isolines, the hardware expects tess factors in the + * reverse order from what NIR specifies. + */ + LLVMValueRef tmp = out[0]; + out[0] = out[1]; + out[1] = tmp; + } + + /* Convert the outputs to vectors for stores. */ + vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4)); + vec1 = NULL; + + if (stride > 4) + vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4); + + /* Get the buffer. */ + buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING); + + /* Get the offset. */ + tf_base = ac_get_arg(&ctx->ac, ctx->tcs_factor_offset); + byteoffset = + LLVMBuildMul(ctx->ac.builder, rel_patch_id, LLVMConstInt(ctx->ac.i32, 4 * stride, 0), ""); + + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, ""), 6504); + + /* Store the dynamic HS control word. */ + offset = 0; + if (ctx->screen->info.chip_class <= GFX8) { + ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, 0), 1, + ctx->ac.i32_0, tf_base, offset, ac_glc); + offset += 4; + } + + ac_build_endif(&ctx->ac, 6504); + + /* Store the tessellation factors. */ + ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, offset, + ac_glc); + offset += 16; + if (vec1) + ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, stride - 4, byteoffset, tf_base, offset, + ac_glc); + + /* Store the tess factors into the offchip buffer if TES reads them. */ + if (shader->key.part.tcs.epilog.tes_reads_tess_factors) { + LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset; + LLVMValueRef tf_inner_offset; + unsigned param_outer, param_inner; + + buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); + base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + + param_outer = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); + tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, + LLVMConstInt(ctx->ac.i32, param_outer, 0)); + + unsigned outer_vec_size = ac_has_vec3_support(ctx->screen->info.chip_class, false) + ? outer_comps + : util_next_power_of_two(outer_comps); + outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size); + + ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, outer_comps, tf_outer_offset, base, 0, + ac_glc); + if (inner_comps) { + param_inner = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); + tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, + LLVMConstInt(ctx->ac.i32, param_inner, 0)); + + inner_vec = + inner_comps == 1 ? 
inner[0] : ac_build_gather_values(&ctx->ac, inner, inner_comps); + ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, inner_comps, tf_inner_offset, base, + 0, ac_glc); + } + } + + ac_build_endif(&ctx->ac, 6503); } /* This only writes the tessellation factor levels. */ -static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) +static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; - - si_copy_tcs_inputs(ctx); - - rel_patch_id = get_rel_patch_id(ctx); - invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5); - tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); - - if (ctx->screen->info.chip_class >= GFX9) { - LLVMBasicBlockRef blocks[2] = { - LLVMGetInsertBlock(builder), - ctx->merged_wrap_if_entry_block - }; - LLVMValueRef values[2]; - - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); - - values[0] = rel_patch_id; - values[1] = LLVMGetUndef(ctx->ac.i32); - rel_patch_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); - - values[0] = tf_lds_offset; - values[1] = LLVMGetUndef(ctx->ac.i32); - tf_lds_offset = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); - - values[0] = invocation_id; - values[1] = ctx->ac.i32_1; /* cause the epilog to skip threads */ - invocation_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); - } - - /* Return epilog parameters from this function. */ - LLVMValueRef ret = ctx->return_value; - unsigned vgpr; - - if (ctx->screen->info.chip_class >= GFX9) { - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, - 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, - 8 + GFX9_SGPR_TCS_OUT_LAYOUT); - /* Tess offchip and tess factor offsets are at the beginning. */ - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4); - vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1; - } else { - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, - GFX6_SGPR_TCS_OFFCHIP_LAYOUT); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, - GFX6_SGPR_TCS_OUT_LAYOUT); - /* Tess offchip and tess factor offsets are after user SGPRs. */ - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, - GFX6_TCS_NUM_USER_SGPR); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, - GFX6_TCS_NUM_USER_SGPR + 1); - vgpr = GFX6_TCS_NUM_USER_SGPR + 2; - } - - /* VGPRs */ - rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id); - invocation_id = ac_to_float(&ctx->ac, invocation_id); - tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset); - - /* Leave a hole corresponding to the two input VGPRs. This ensures that - * the invocation_id output does not alias the tcs_rel_ids input, - * which saves a V_MOV on gfx9. 
- */ - vgpr += 2; - - ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); - ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); - - if (ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) { - vgpr++; /* skip the tess factor LDS offset */ - for (unsigned i = 0; i < 6; i++) { - LLVMValueRef value = - LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], ""); - value = ac_to_float(&ctx->ac, value); - ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, ""); - } - } else { - ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); - } - ctx->return_value = ret; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; + + si_copy_tcs_inputs(ctx); + + rel_patch_id = get_rel_patch_id(ctx); + invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5); + tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); + + if (ctx->screen->info.chip_class >= GFX9) { + LLVMBasicBlockRef blocks[2] = {LLVMGetInsertBlock(builder), ctx->merged_wrap_if_entry_block}; + LLVMValueRef values[2]; + + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + + values[0] = rel_patch_id; + values[1] = LLVMGetUndef(ctx->ac.i32); + rel_patch_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); + + values[0] = tf_lds_offset; + values[1] = LLVMGetUndef(ctx->ac.i32); + tf_lds_offset = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); + + values[0] = invocation_id; + values[1] = ctx->ac.i32_1; /* cause the epilog to skip threads */ + invocation_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); + } + + /* Return epilog parameters from this function. */ + LLVMValueRef ret = ctx->return_value; + unsigned vgpr; + + if (ctx->screen->info.chip_class >= GFX9) { + ret = + si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT); + /* Tess offchip and tess factor offsets are at the beginning. */ + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4); + vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1; + } else { + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, GFX6_SGPR_TCS_OFFCHIP_LAYOUT); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, GFX6_SGPR_TCS_OUT_LAYOUT); + /* Tess offchip and tess factor offsets are after user SGPRs. */ + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, GFX6_TCS_NUM_USER_SGPR); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, GFX6_TCS_NUM_USER_SGPR + 1); + vgpr = GFX6_TCS_NUM_USER_SGPR + 2; + } + + /* VGPRs */ + rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id); + invocation_id = ac_to_float(&ctx->ac, invocation_id); + tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset); + + /* Leave a hole corresponding to the two input VGPRs. This ensures that + * the invocation_id output does not alias the tcs_rel_ids input, + * which saves a V_MOV on gfx9. 
+ */ + vgpr += 2; + + ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); + ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); + + if (ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) { + vgpr++; /* skip the tess factor LDS offset */ + for (unsigned i = 0; i < 6; i++) { + LLVMValueRef value = LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], ""); + value = ac_to_float(&ctx->ac, value); + ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, ""); + } + } else { + ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); + } + ctx->return_value = ret; } /* Pass TCS inputs from LS to TCS on GFX9. */ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx) { - LLVMValueRef ret = ctx->return_value; - - ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); - ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2); - ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4); - ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5); - - ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, - 8 + SI_SGPR_RW_BUFFERS); - ret = si_insert_input_ptr(ctx, ret, - ctx->bindless_samplers_and_images, - 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); - - ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits, - 8 + SI_SGPR_VS_STATE_BITS); - - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, - 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets, - 8 + GFX9_SGPR_TCS_OUT_OFFSETS); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, - 8 + GFX9_SGPR_TCS_OUT_LAYOUT); - - unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR; - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - ac_to_float(&ctx->ac, - ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)), - vgpr++, ""); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - ac_to_float(&ctx->ac, - ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)), - vgpr++, ""); - ctx->return_value = ret; + LLVMValueRef ret = ctx->return_value; + + ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); + ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2); + ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4); + ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5); + + ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS); + ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images, + 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); + + ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); + + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets, 8 + GFX9_SGPR_TCS_OUT_OFFSETS); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT); + + unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR; + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)), + vgpr++, ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)), + vgpr++, ""); + ctx->return_value = ret; } -void 
si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs) +void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *shader = ctx->shader; - struct si_shader_info *info = &shader->selector->info; - unsigned i, chan; - LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id); - LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx); - LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, - vertex_dw_stride, ""); - - /* Write outputs to LDS. The next shader (TCS aka HS) will read - * its inputs from it. */ - for (i = 0; i < info->num_outputs; i++) { - unsigned name = info->output_semantic_name[i]; - unsigned index = info->output_semantic_index[i]; - - /* The ARB_shader_viewport_layer_array spec contains the - * following issue: - * - * 2) What happens if gl_ViewportIndex or gl_Layer is - * written in the vertex shader and a geometry shader is - * present? - * - * RESOLVED: The value written by the last vertex processing - * stage is used. If the last vertex processing stage - * (vertex, tessellation evaluation or geometry) does not - * statically assign to gl_ViewportIndex or gl_Layer, index - * or layer zero is assumed. - * - * So writes to those outputs in VS-as-LS are simply ignored. - */ - if (name == TGSI_SEMANTIC_LAYER || - name == TGSI_SEMANTIC_VIEWPORT_INDEX) - continue; - - int param = si_shader_io_get_unique_index(name, index, false); - LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, - LLVMConstInt(ctx->ac.i32, param * 4, 0), ""); - - for (chan = 0; chan < 4; chan++) { - if (!(info->output_usagemask[i] & (1 << chan))) - continue; - - lshs_lds_store(ctx, chan, dw_addr, - LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "")); - } - } - - if (ctx->screen->info.chip_class >= GFX9) - si_set_ls_return_value_for_tcs(ctx); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *shader = ctx->shader; + struct si_shader_info *info = &shader->selector->info; + unsigned i, chan; + LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id); + LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx); + LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, ""); + + /* Write outputs to LDS. The next shader (TCS aka HS) will read + * its inputs from it. */ + for (i = 0; i < info->num_outputs; i++) { + unsigned name = info->output_semantic_name[i]; + unsigned index = info->output_semantic_index[i]; + + /* The ARB_shader_viewport_layer_array spec contains the + * following issue: + * + * 2) What happens if gl_ViewportIndex or gl_Layer is + * written in the vertex shader and a geometry shader is + * present? + * + * RESOLVED: The value written by the last vertex processing + * stage is used. If the last vertex processing stage + * (vertex, tessellation evaluation or geometry) does not + * statically assign to gl_ViewportIndex or gl_Layer, index + * or layer zero is assumed. + * + * So writes to those outputs in VS-as-LS are simply ignored. 
+ */ + if (name == TGSI_SEMANTIC_LAYER || name == TGSI_SEMANTIC_VIEWPORT_INDEX) + continue; + + int param = si_shader_io_get_unique_index(name, index, false); + LLVMValueRef dw_addr = + LLVMBuildAdd(ctx->ac.builder, base_dw_addr, LLVMConstInt(ctx->ac.i32, param * 4, 0), ""); + + for (chan = 0; chan < 4; chan++) { + if (!(info->output_usagemask[i] & (1 << chan))) + continue; + + lshs_lds_store(ctx, chan, dw_addr, + LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "")); + } + } + + if (ctx->screen->info.chip_class >= GFX9) + si_set_ls_return_value_for_tcs(ctx); } /** * Compile the TCS epilog function. This writes tesselation factors to memory * based on the output primitive type of the tesselator (determined by TES). */ -void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, - union si_shader_part_key *key) +void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key) { - memset(&ctx->args, 0, sizeof(ctx->args)); - - if (ctx->screen->info.chip_class >= GFX9) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_factor_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_out_lds_layout); - } else { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_out_lds_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_factor_offset); - } - - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */ - struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id); - struct ac_arg invocation_id; /* invocation ID within the patch */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id); - struct ac_arg tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, 
AC_ARG_INT, - &tcs_out_current_patch_data_offset); - - struct ac_arg tess_factors[6]; - for (unsigned i = 0; i < 6; i++) - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]); - - /* Create the function. */ - si_llvm_create_func(ctx, "tcs_epilog", NULL, 0, - ctx->screen->info.chip_class >= GFX7 ? 128 : 0); - ac_declare_lds_as_pointer(&ctx->ac); - - LLVMValueRef invoc0_tess_factors[6]; - for (unsigned i = 0; i < 6; i++) - invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]); - - si_write_tess_factors(ctx, - ac_get_arg(&ctx->ac, rel_patch_id), - ac_get_arg(&ctx->ac, invocation_id), - ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset), - invoc0_tess_factors, invoc0_tess_factors + 4); - - LLVMBuildRetVoid(ctx->ac.builder); + memset(&ctx->args, 0, sizeof(ctx->args)); + + if (ctx->screen->info.chip_class >= GFX9) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); + } else { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); + } + + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */ + struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id); + struct ac_arg invocation_id; /* invocation ID within the patch */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id); + struct ac_arg + tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tcs_out_current_patch_data_offset); + + struct ac_arg tess_factors[6]; + for (unsigned i = 0; i < 6; i++) + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, 
&tess_factors[i]); + + /* Create the function. */ + si_llvm_create_func(ctx, "tcs_epilog", NULL, 0, ctx->screen->info.chip_class >= GFX7 ? 128 : 0); + ac_declare_lds_as_pointer(&ctx->ac); + + LLVMValueRef invoc0_tess_factors[6]; + for (unsigned i = 0; i < 6; i++) + invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]); + + si_write_tess_factors(ctx, ac_get_arg(&ctx->ac, rel_patch_id), + ac_get_arg(&ctx->ac, invocation_id), + ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset), + invoc0_tess_factors, invoc0_tess_factors + 4); + + LLVMBuildRetVoid(ctx->ac.builder); } void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx) { - ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings; - ctx->abi.load_tess_level = si_load_tess_level; - ctx->abi.store_tcs_outputs = si_nir_store_output_tcs; - ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue; - ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; + ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings; + ctx->abi.load_tess_level = si_load_tess_level; + ctx->abi.store_tcs_outputs = si_nir_store_output_tcs; + ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue; + ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; } void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader) { - ctx->abi.load_tess_varyings = si_nir_load_input_tes; - ctx->abi.load_tess_coord = si_load_tess_coord; - ctx->abi.load_tess_level = si_load_tess_level; - ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; - - if (ctx->shader->key.as_es) - ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; - else if (ngg_cull_shader) - ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; - else if (ctx->shader->key.as_ngg) - ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; - else - ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; + ctx->abi.load_tess_varyings = si_nir_load_input_tes; + ctx->abi.load_tess_coord = si_load_tess_coord; + ctx->abi.load_tess_level = si_load_tess_level; + ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; + + if (ctx->shader->key.as_es) + ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; + else if (ngg_cull_shader) + ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; + else if (ctx->shader->key.as_ngg) + ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; + else + ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index 39c06f41ece..8640150b18c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -22,518 +22,463 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include "si_shader_internal.h" #include "si_pipe.h" +#include "si_shader_internal.h" #include "sid.h" #include "util/u_memory.h" -static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, - LLVMValueRef i32, unsigned index) +static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index) { - assert(index <= 1); + assert(index <= 1); - if (index == 1) - return LLVMBuildAShr(ctx->ac.builder, i32, - LLVMConstInt(ctx->ac.i32, 16, 0), ""); + if (index == 1) + return LLVMBuildAShr(ctx->ac.builder, i32, LLVMConstInt(ctx->ac.i32, 16, 0), ""); - return LLVMBuildSExt(ctx->ac.builder, - LLVMBuildTrunc(ctx->ac.builder, i32, - ctx->ac.i16, ""), - ctx->ac.i32, ""); + return LLVMBuildSExt(ctx->ac.builder, LLVMBuildTrunc(ctx->ac.builder, i32, ctx->ac.i16, ""), + ctx->ac.i32, ""); } -static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, - LLVMValueRef out[4]) +static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4]) { - const struct si_shader_info *info = &ctx->shader->selector->info; - unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; - - if (vs_blit_property) { - LLVMValueRef vertex_id = ctx->abi.vertex_id; - LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder, - LLVMIntULE, vertex_id, - ctx->ac.i32_1, ""); - /* Use LLVMIntNE, because we have 3 vertices and only - * the middle one should use y2. - */ - LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, - LLVMIntNE, vertex_id, - ctx->ac.i32_1, ""); - - unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index; - if (input_index == 0) { - /* Position: */ - LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs); - LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 1); - - LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0); - LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1); - LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0); - LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1); - - LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, - x1, x2, ""); - LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, - y1, y2, ""); - - out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, ""); - out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, ""); - out[2] = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 2); - out[3] = ctx->ac.f32_1; - return; - } - - /* Color or texture coordinates: */ - assert(input_index == 1); - - if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { - for (int i = 0; i < 4; i++) { - out[i] = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 3 + i); - } - } else { - assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD); - LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 3); - LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 4); - LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 5); - LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 6); - - out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, - x1, x2, ""); - out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, - y1, y2, ""); - out[2] = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 7); - out[3] = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 8); - } - return; - } - - unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; - union si_vs_fix_fetch fix_fetch; - LLVMValueRef vb_desc; - LLVMValueRef vertex_index; - LLVMValueRef tmp; - - if (input_index < num_vbos_in_user_sgprs) { - vb_desc = 
ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]); - } else { - unsigned index= input_index - num_vbos_in_user_sgprs; - vb_desc = ac_build_load_to_sgpr(&ctx->ac, - ac_get_arg(&ctx->ac, ctx->vertex_buffers), - LLVMConstInt(ctx->ac.i32, index, 0)); - } - - vertex_index = LLVMGetParam(ctx->main_fn, - ctx->vertex_index0.arg_index + - input_index); - - /* Use the open-coded implementation for all loads of doubles and - * of dword-sized data that needs fixups. We need to insert conversion - * code anyway, and the amd/common code does it for us. - * - * Note: On LLVM <= 8, we can only open-code formats with - * channel size >= 4 bytes. - */ - bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index); - fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits; - if (opencode || - (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) || - (fix_fetch.u.log_size == 2)) { - tmp = ac_build_opencoded_load_format( - &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1, - fix_fetch.u.format, fix_fetch.u.reverse, !opencode, - vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true); - for (unsigned i = 0; i < 4; ++i) - out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), ""); - return; - } - - /* Do multiple loads for special formats. */ - unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]); - LLVMValueRef fetches[4]; - unsigned num_fetches; - unsigned fetch_stride; - unsigned channels_per_fetch; - - if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) { - num_fetches = MIN2(required_channels, 3); - fetch_stride = 1 << fix_fetch.u.log_size; - channels_per_fetch = 1; - } else { - num_fetches = 1; - fetch_stride = 0; - channels_per_fetch = required_channels; - } - - for (unsigned i = 0; i < num_fetches; ++i) { - LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0); - fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset, - channels_per_fetch, 0, true); - } - - if (num_fetches == 1 && channels_per_fetch > 1) { - LLVMValueRef fetch = fetches[0]; - for (unsigned i = 0; i < channels_per_fetch; ++i) { - tmp = LLVMConstInt(ctx->ac.i32, i, false); - fetches[i] = LLVMBuildExtractElement( - ctx->ac.builder, fetch, tmp, ""); - } - num_fetches = channels_per_fetch; - channels_per_fetch = 1; - } - - for (unsigned i = num_fetches; i < 4; ++i) - fetches[i] = LLVMGetUndef(ctx->ac.f32); - - if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && - required_channels == 4) { - if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT) - fetches[3] = ctx->ac.i32_1; - else - fetches[3] = ctx->ac.f32_1; - } else if (fix_fetch.u.log_size == 3 && - (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM || - fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED || - fix_fetch.u.format == AC_FETCH_FORMAT_SINT) && - required_channels == 4) { - /* For 2_10_10_10, the hardware returns an unsigned value; - * convert it to a signed one. - */ - LLVMValueRef tmp = fetches[3]; - LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0); - - /* First, recover the sign-extended signed integer value. */ - if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) - tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, ""); - else - tmp = ac_to_integer(&ctx->ac, tmp); - - /* For the integer-like cases, do a natural sign extension. 
- * - * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 - * and happen to contain 0, 1, 2, 3 as the two LSBs of the - * exponent. - */ - tmp = LLVMBuildShl(ctx->ac.builder, tmp, - fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? - LLVMConstInt(ctx->ac.i32, 7, 0) : c30, ""); - tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); - - /* Convert back to the right type. */ - if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) { - LLVMValueRef clamp; - LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0); - tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); - clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); - tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); - } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) { - tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); - } - - fetches[3] = tmp; - } - - for (unsigned i = 0; i < 4; ++i) - out[i] = ac_to_float(&ctx->ac, fetches[i]); + const struct si_shader_info *info = &ctx->shader->selector->info; + unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; + + if (vs_blit_property) { + LLVMValueRef vertex_id = ctx->abi.vertex_id; + LLVMValueRef sel_x1 = + LLVMBuildICmp(ctx->ac.builder, LLVMIntULE, vertex_id, ctx->ac.i32_1, ""); + /* Use LLVMIntNE, because we have 3 vertices and only + * the middle one should use y2. + */ + LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, vertex_id, ctx->ac.i32_1, ""); + + unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index; + if (input_index == 0) { + /* Position: */ + LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs); + LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 1); + + LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0); + LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1); + LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0); + LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1); + + LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, ""); + LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, ""); + + out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, ""); + out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, ""); + out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 2); + out[3] = ctx->ac.f32_1; + return; + } + + /* Color or texture coordinates: */ + assert(input_index == 1); + + if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { + for (int i = 0; i < 4; i++) { + out[i] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3 + i); + } + } else { + assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD); + LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3); + LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 4); + LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 5); + LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 6); + + out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, ""); + out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, ""); + out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 7); + out[3] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 8); + } + return; + } + + unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; + union si_vs_fix_fetch fix_fetch; + LLVMValueRef vb_desc; + LLVMValueRef vertex_index; + LLVMValueRef tmp; + + if (input_index < num_vbos_in_user_sgprs) { + vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]); + } else { + unsigned index = input_index - 
num_vbos_in_user_sgprs; + vb_desc = ac_build_load_to_sgpr(&ctx->ac, ac_get_arg(&ctx->ac, ctx->vertex_buffers), + LLVMConstInt(ctx->ac.i32, index, 0)); + } + + vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index); + + /* Use the open-coded implementation for all loads of doubles and + * of dword-sized data that needs fixups. We need to insert conversion + * code anyway, and the amd/common code does it for us. + * + * Note: On LLVM <= 8, we can only open-code formats with + * channel size >= 4 bytes. + */ + bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index); + fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits; + if (opencode || (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) || + (fix_fetch.u.log_size == 2)) { + tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size, + fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format, + fix_fetch.u.reverse, !opencode, vb_desc, vertex_index, + ctx->ac.i32_0, ctx->ac.i32_0, 0, true); + for (unsigned i = 0; i < 4; ++i) + out[i] = + LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), ""); + return; + } + + /* Do multiple loads for special formats. */ + unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]); + LLVMValueRef fetches[4]; + unsigned num_fetches; + unsigned fetch_stride; + unsigned channels_per_fetch; + + if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) { + num_fetches = MIN2(required_channels, 3); + fetch_stride = 1 << fix_fetch.u.log_size; + channels_per_fetch = 1; + } else { + num_fetches = 1; + fetch_stride = 0; + channels_per_fetch = required_channels; + } + + for (unsigned i = 0; i < num_fetches; ++i) { + LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0); + fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset, + channels_per_fetch, 0, true); + } + + if (num_fetches == 1 && channels_per_fetch > 1) { + LLVMValueRef fetch = fetches[0]; + for (unsigned i = 0; i < channels_per_fetch; ++i) { + tmp = LLVMConstInt(ctx->ac.i32, i, false); + fetches[i] = LLVMBuildExtractElement(ctx->ac.builder, fetch, tmp, ""); + } + num_fetches = channels_per_fetch; + channels_per_fetch = 1; + } + + for (unsigned i = num_fetches; i < 4; ++i) + fetches[i] = LLVMGetUndef(ctx->ac.f32); + + if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && required_channels == 4) { + if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT) + fetches[3] = ctx->ac.i32_1; + else + fetches[3] = ctx->ac.f32_1; + } else if (fix_fetch.u.log_size == 3 && + (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM || + fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED || + fix_fetch.u.format == AC_FETCH_FORMAT_SINT) && + required_channels == 4) { + /* For 2_10_10_10, the hardware returns an unsigned value; + * convert it to a signed one. + */ + LLVMValueRef tmp = fetches[3]; + LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0); + + /* First, recover the sign-extended signed integer value. */ + if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) + tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, ""); + else + tmp = ac_to_integer(&ctx->ac, tmp); + + /* For the integer-like cases, do a natural sign extension. + * + * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 + * and happen to contain 0, 1, 2, 3 as the two LSBs of the + * exponent. 
+ */ + tmp = LLVMBuildShl( + ctx->ac.builder, tmp, + fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, ""); + tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); + + /* Convert back to the right type. */ + if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) { + LLVMValueRef clamp; + LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0); + tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); + clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); + tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); + } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) { + tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); + } + + fetches[3] = tmp; + } + + for (unsigned i = 0; i < 4; ++i) + out[i] = ac_to_float(&ctx->ac, fetches[i]); } static void declare_input_vs(struct si_shader_context *ctx, unsigned input_index) { - LLVMValueRef input[4]; + LLVMValueRef input[4]; - load_input_vs(ctx, input_index / 4, input); + load_input_vs(ctx, input_index / 4, input); - for (unsigned chan = 0; chan < 4; chan++) { - ctx->inputs[input_index + chan] = - LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, ""); - } + for (unsigned chan = 0; chan < 4; chan++) { + ctx->inputs[input_index + chan] = + LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, ""); + } } void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir) { - uint64_t processed_inputs = 0; - - nir_foreach_variable(variable, &nir->inputs) { - unsigned attrib_count = glsl_count_attribute_slots(variable->type, - true); - unsigned input_idx = variable->data.driver_location; - unsigned loc = variable->data.location; - - for (unsigned i = 0; i < attrib_count; i++) { - /* Packed components share the same location so skip - * them if we have already processed the location. - */ - if (processed_inputs & ((uint64_t)1 << (loc + i))) { - input_idx += 4; - continue; - } - - declare_input_vs(ctx, input_idx); - if (glsl_type_is_dual_slot(variable->type)) { - input_idx += 4; - declare_input_vs(ctx, input_idx); - } - - processed_inputs |= ((uint64_t)1 << (loc + i)); - input_idx += 4; - } - } + uint64_t processed_inputs = 0; + + nir_foreach_variable (variable, &nir->inputs) { + unsigned attrib_count = glsl_count_attribute_slots(variable->type, true); + unsigned input_idx = variable->data.driver_location; + unsigned loc = variable->data.location; + + for (unsigned i = 0; i < attrib_count; i++) { + /* Packed components share the same location so skip + * them if we have already processed the location. 
+ */ + if (processed_inputs & ((uint64_t)1 << (loc + i))) { + input_idx += 4; + continue; + } + + declare_input_vs(ctx, input_idx); + if (glsl_type_is_dual_slot(variable->type)) { + input_idx += 4; + declare_input_vs(ctx, input_idx); + } + + processed_inputs |= ((uint64_t)1 << (loc + i)); + input_idx += 4; + } + } } -void si_llvm_streamout_store_output(struct si_shader_context *ctx, - LLVMValueRef const *so_buffers, - LLVMValueRef const *so_write_offsets, - struct pipe_stream_output *stream_out, - struct si_shader_output_values *shader_out) +void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers, + LLVMValueRef const *so_write_offsets, + struct pipe_stream_output *stream_out, + struct si_shader_output_values *shader_out) { - unsigned buf_idx = stream_out->output_buffer; - unsigned start = stream_out->start_component; - unsigned num_comps = stream_out->num_components; - LLVMValueRef out[4]; - - assert(num_comps && num_comps <= 4); - if (!num_comps || num_comps > 4) - return; - - /* Load the output as int. */ - for (int j = 0; j < num_comps; j++) { - assert(stream_out->stream == shader_out->vertex_stream[start + j]); - - out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]); - } - - /* Pack the output. */ - LLVMValueRef vdata = NULL; - - switch (num_comps) { - case 1: /* as i32 */ - vdata = out[0]; - break; - case 2: /* as v2i32 */ - case 3: /* as v3i32 */ - if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) { - vdata = ac_build_gather_values(&ctx->ac, out, num_comps); - break; - } - /* as v4i32 (aligned to 4) */ - out[3] = LLVMGetUndef(ctx->ac.i32); - /* fall through */ - case 4: /* as v4i32 */ - vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps)); - break; - } - - ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], - vdata, num_comps, - so_write_offsets[buf_idx], - ctx->ac.i32_0, - stream_out->dst_offset * 4, ac_glc | ac_slc); + unsigned buf_idx = stream_out->output_buffer; + unsigned start = stream_out->start_component; + unsigned num_comps = stream_out->num_components; + LLVMValueRef out[4]; + + assert(num_comps && num_comps <= 4); + if (!num_comps || num_comps > 4) + return; + + /* Load the output as int. */ + for (int j = 0; j < num_comps; j++) { + assert(stream_out->stream == shader_out->vertex_stream[start + j]); + + out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]); + } + + /* Pack the output. */ + LLVMValueRef vdata = NULL; + + switch (num_comps) { + case 1: /* as i32 */ + vdata = out[0]; + break; + case 2: /* as v2i32 */ + case 3: /* as v3i32 */ + if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) { + vdata = ac_build_gather_values(&ctx->ac, out, num_comps); + break; + } + /* as v4i32 (aligned to 4) */ + out[3] = LLVMGetUndef(ctx->ac.i32); + /* fall through */ + case 4: /* as v4i32 */ + vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps)); + break; + } + + ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], vdata, num_comps, + so_write_offsets[buf_idx], ctx->ac.i32_0, stream_out->dst_offset * 4, + ac_glc | ac_slc); } /** * Write streamout data to buffers for vertex stream @p stream (different * vertex streams can occur for GS copy shaders). 
*/ -void si_llvm_emit_streamout(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput, unsigned stream) +void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs, + unsigned noutput, unsigned stream) { - struct si_shader_selector *sel = ctx->shader->selector; - struct pipe_stream_output_info *so = &sel->so; - LLVMBuilderRef builder = ctx->ac.builder; - int i; - - /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ - LLVMValueRef so_vtx_count = - si_unpack_param(ctx, ctx->streamout_config, 16, 7); - - LLVMValueRef tid = ac_get_thread_id(&ctx->ac); - - /* can_emit = tid < so_vtx_count; */ - LLVMValueRef can_emit = - LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); - - /* Emit the streamout code conditionally. This actually avoids - * out-of-bounds buffer access. The hw tells us via the SGPR - * (so_vtx_count) which threads are allowed to emit streamout data. */ - ac_build_ifcc(&ctx->ac, can_emit, 6501); - { - /* The buffer offset is computed as follows: - * ByteOffset = streamout_offset[buffer_id]*4 + - * (streamout_write_index + thread_id)*stride[buffer_id] + - * attrib_offset - */ - - LLVMValueRef so_write_index = - ac_get_arg(&ctx->ac, - ctx->streamout_write_index); - - /* Compute (streamout_write_index + thread_id). */ - so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); - - /* Load the descriptor and compute the write offset for each - * enabled buffer. */ - LLVMValueRef so_write_offset[4] = {}; - LLVMValueRef so_buffers[4]; - LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, - ctx->rw_buffers); - - for (i = 0; i < 4; i++) { - if (!so->stride[i]) - continue; - - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, - SI_VS_STREAMOUT_BUF0 + i, 0); - - so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - - LLVMValueRef so_offset = ac_get_arg(&ctx->ac, - ctx->streamout_offset[i]); - so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), ""); - - so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index, - LLVMConstInt(ctx->ac.i32, so->stride[i]*4, 0), - so_offset); - } - - /* Write streamout data. */ - for (i = 0; i < so->num_outputs; i++) { - unsigned reg = so->output[i].register_index; - - if (reg >= noutput) - continue; - - if (stream != so->output[i].stream) - continue; - - si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, - &so->output[i], &outputs[reg]); - } - } - ac_build_endif(&ctx->ac, 6501); + struct si_shader_selector *sel = ctx->shader->selector; + struct pipe_stream_output_info *so = &sel->so; + LLVMBuilderRef builder = ctx->ac.builder; + int i; + + /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ + LLVMValueRef so_vtx_count = si_unpack_param(ctx, ctx->streamout_config, 16, 7); + + LLVMValueRef tid = ac_get_thread_id(&ctx->ac); + + /* can_emit = tid < so_vtx_count; */ + LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); + + /* Emit the streamout code conditionally. This actually avoids + * out-of-bounds buffer access. The hw tells us via the SGPR + * (so_vtx_count) which threads are allowed to emit streamout data. */ + ac_build_ifcc(&ctx->ac, can_emit, 6501); + { + /* The buffer offset is computed as follows: + * ByteOffset = streamout_offset[buffer_id]*4 + + * (streamout_write_index + thread_id)*stride[buffer_id] + + * attrib_offset + */ + + LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->streamout_write_index); + + /* Compute (streamout_write_index + thread_id). 
*/ + so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); + + /* Load the descriptor and compute the write offset for each + * enabled buffer. */ + LLVMValueRef so_write_offset[4] = {}; + LLVMValueRef so_buffers[4]; + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + + for (i = 0; i < 4; i++) { + if (!so->stride[i]) + continue; + + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + i, 0); + + so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + + LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->streamout_offset[i]); + so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), ""); + + so_write_offset[i] = ac_build_imad( + &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, so->stride[i] * 4, 0), so_offset); + } + + /* Write streamout data. */ + for (i = 0; i < so->num_outputs; i++) { + unsigned reg = so->output[i].register_index; + + if (reg >= noutput) + continue; + + if (stream != so->output[i].stream) + continue; + + si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, &so->output[i], + &outputs[reg]); + } + } + ac_build_endif(&ctx->ac, 6501); } -static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, - struct ac_export_args *pos, LLVMValueRef *out_elts) +static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, struct ac_export_args *pos, + LLVMValueRef *out_elts) { - unsigned reg_index; - unsigned chan; - unsigned const_chan; - LLVMValueRef base_elt; - LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, - SI_VS_CONST_CLIP_PLANES, 0); - LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); - - for (reg_index = 0; reg_index < 2; reg_index ++) { - struct ac_export_args *args = &pos[2 + reg_index]; - - args->out[0] = - args->out[1] = - args->out[2] = - args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f); - - /* Compute dot products of position and user clip plane vectors */ - for (chan = 0; chan < 4; chan++) { - for (const_chan = 0; const_chan < 4; const_chan++) { - LLVMValueRef addr = - LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + - const_chan) * 4, 0); - base_elt = si_buffer_load_const(ctx, const_resource, - addr); - args->out[chan] = ac_build_fmad(&ctx->ac, base_elt, - out_elts[const_chan], args->out[chan]); - } - } - - args->enabled_channels = 0xf; - args->valid_mask = 0; - args->done = 0; - args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index; - args->compr = 0; - } + unsigned reg_index; + unsigned chan; + unsigned const_chan; + LLVMValueRef base_elt; + LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0); + LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); + + for (reg_index = 0; reg_index < 2; reg_index++) { + struct ac_export_args *args = &pos[2 + reg_index]; + + args->out[0] = args->out[1] = args->out[2] = args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f); + + /* Compute dot products of position and user clip plane vectors */ + for (chan = 0; chan < 4; chan++) { + for (const_chan = 0; const_chan < 4; const_chan++) { + LLVMValueRef addr = + LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + const_chan) * 4, 0); + base_elt = si_buffer_load_const(ctx, const_resource, addr); + args->out[chan] = + ac_build_fmad(&ctx->ac, base_elt, out_elts[const_chan], args->out[chan]); + } + } + + args->enabled_channels = 0xf; + args->valid_mask = 0; + args->done = 
0; + args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index; + args->compr = 0; + } } /* Initialize arguments for the shader export intrinsic */ -static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, - LLVMValueRef *values, - unsigned target, - struct ac_export_args *args) +static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, LLVMValueRef *values, + unsigned target, struct ac_export_args *args) { - args->enabled_channels = 0xf; /* writemask - default is 0xf */ - args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */ - args->done = 0; /* Specify whether this is the last export */ - args->target = target; /* Specify the target we are exporting */ - args->compr = false; + args->enabled_channels = 0xf; /* writemask - default is 0xf */ + args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */ + args->done = 0; /* Specify whether this is the last export */ + args->target = target; /* Specify the target we are exporting */ + args->compr = false; - memcpy(&args->out[0], values, sizeof(values[0]) * 4); + memcpy(&args->out[0], values, sizeof(values[0]) * 4); } -static void si_export_param(struct si_shader_context *ctx, unsigned index, - LLVMValueRef *values) +static void si_export_param(struct si_shader_context *ctx, unsigned index, LLVMValueRef *values) { - struct ac_export_args args; + struct ac_export_args args; - si_llvm_init_vs_export_args(ctx, values, - V_008DFC_SQ_EXP_PARAM + index, &args); - ac_build_export(&ctx->ac, &args); + si_llvm_init_vs_export_args(ctx, values, V_008DFC_SQ_EXP_PARAM + index, &args); + ac_build_export(&ctx->ac, &args); } static void si_build_param_exports(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput) + struct si_shader_output_values *outputs, unsigned noutput) { - struct si_shader *shader = ctx->shader; - unsigned param_count = 0; - - for (unsigned i = 0; i < noutput; i++) { - unsigned semantic_name = outputs[i].semantic_name; - unsigned semantic_index = outputs[i].semantic_index; - - if (outputs[i].vertex_stream[0] != 0 && - outputs[i].vertex_stream[1] != 0 && - outputs[i].vertex_stream[2] != 0 && - outputs[i].vertex_stream[3] != 0) - continue; - - switch (semantic_name) { - case TGSI_SEMANTIC_LAYER: - case TGSI_SEMANTIC_VIEWPORT_INDEX: - case TGSI_SEMANTIC_CLIPDIST: - case TGSI_SEMANTIC_COLOR: - case TGSI_SEMANTIC_BCOLOR: - case TGSI_SEMANTIC_PRIMID: - case TGSI_SEMANTIC_FOG: - case TGSI_SEMANTIC_TEXCOORD: - case TGSI_SEMANTIC_GENERIC: - break; - default: - continue; - } - - if ((semantic_name != TGSI_SEMANTIC_GENERIC || - semantic_index < SI_MAX_IO_GENERIC) && - shader->key.opt.kill_outputs & - (1ull << si_shader_io_get_unique_index(semantic_name, - semantic_index, true))) - continue; - - si_export_param(ctx, param_count, outputs[i].values); - - assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); - shader->info.vs_output_param_offset[i] = param_count++; - } - - shader->info.nr_param_exports = param_count; + struct si_shader *shader = ctx->shader; + unsigned param_count = 0; + + for (unsigned i = 0; i < noutput; i++) { + unsigned semantic_name = outputs[i].semantic_name; + unsigned semantic_index = outputs[i].semantic_index; + + if (outputs[i].vertex_stream[0] != 0 && outputs[i].vertex_stream[1] != 0 && + outputs[i].vertex_stream[2] != 0 && outputs[i].vertex_stream[3] != 0) + continue; + + switch (semantic_name) { + case TGSI_SEMANTIC_LAYER: + case TGSI_SEMANTIC_VIEWPORT_INDEX: + case TGSI_SEMANTIC_CLIPDIST: + case 
TGSI_SEMANTIC_COLOR: + case TGSI_SEMANTIC_BCOLOR: + case TGSI_SEMANTIC_PRIMID: + case TGSI_SEMANTIC_FOG: + case TGSI_SEMANTIC_TEXCOORD: + case TGSI_SEMANTIC_GENERIC: + break; + default: + continue; + } + + if ((semantic_name != TGSI_SEMANTIC_GENERIC || semantic_index < SI_MAX_IO_GENERIC) && + shader->key.opt.kill_outputs & + (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index, true))) + continue; + + si_export_param(ctx, param_count, outputs[i].values); + + assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); + shader->info.vs_output_param_offset[i] = param_count++; + } + + shader->info.nr_param_exports = param_count; } /** @@ -544,296 +489,272 @@ static void si_build_param_exports(struct si_shader_context *ctx, * is true. */ static void si_vertex_color_clamping(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput) + struct si_shader_output_values *outputs, unsigned noutput) { - LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4]; - bool has_colors = false; - - /* Store original colors to alloca variables. */ - for (unsigned i = 0; i < noutput; i++) { - if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && - outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) - continue; - - for (unsigned j = 0; j < 4; j++) { - addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); - LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]); - } - has_colors = true; - } - - if (!has_colors) - return; - - /* The state is in the first bit of the user SGPR. */ - LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits); - cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, ""); - - ac_build_ifcc(&ctx->ac, cond, 6502); - - /* Store clamped colors to alloca variables within the conditional block. */ - for (unsigned i = 0; i < noutput; i++) { - if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && - outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) - continue; - - for (unsigned j = 0; j < 4; j++) { - LLVMBuildStore(ctx->ac.builder, - ac_build_clamp(&ctx->ac, outputs[i].values[j]), - addr[i][j]); - } - } - ac_build_endif(&ctx->ac, 6502); - - /* Load clamped colors */ - for (unsigned i = 0; i < noutput; i++) { - if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && - outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) - continue; - - for (unsigned j = 0; j < 4; j++) { - outputs[i].values[j] = - LLVMBuildLoad(ctx->ac.builder, addr[i][j], ""); - } - } + LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4]; + bool has_colors = false; + + /* Store original colors to alloca variables. */ + for (unsigned i = 0; i < noutput; i++) { + if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && + outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) + continue; + + for (unsigned j = 0; j < 4; j++) { + addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); + LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]); + } + has_colors = true; + } + + if (!has_colors) + return; + + /* The state is in the first bit of the user SGPR. */ + LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits); + cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, ""); + + ac_build_ifcc(&ctx->ac, cond, 6502); + + /* Store clamped colors to alloca variables within the conditional block. 
*/ + for (unsigned i = 0; i < noutput; i++) { + if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && + outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) + continue; + + for (unsigned j = 0; j < 4; j++) { + LLVMBuildStore(ctx->ac.builder, ac_build_clamp(&ctx->ac, outputs[i].values[j]), + addr[i][j]); + } + } + ac_build_endif(&ctx->ac, 6502); + + /* Load clamped colors */ + for (unsigned i = 0; i < noutput; i++) { + if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && + outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) + continue; + + for (unsigned j = 0; j < 4; j++) { + outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addr[i][j], ""); + } + } } /* Generate export instructions for hardware VS shader stage or NGG GS stage * (position and parameter data only). */ void si_llvm_build_vs_exports(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput) + struct si_shader_output_values *outputs, unsigned noutput) { - struct si_shader *shader = ctx->shader; - struct ac_export_args pos_args[4] = {}; - LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; - unsigned pos_idx; - int i; - - si_vertex_color_clamping(ctx, outputs, noutput); - - /* Build position exports. */ - for (i = 0; i < noutput; i++) { - switch (outputs[i].semantic_name) { - case TGSI_SEMANTIC_POSITION: - si_llvm_init_vs_export_args(ctx, outputs[i].values, - V_008DFC_SQ_EXP_POS, &pos_args[0]); - break; - case TGSI_SEMANTIC_PSIZE: - psize_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_LAYER: - layer_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_VIEWPORT_INDEX: - viewport_index_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_EDGEFLAG: - edgeflag_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_CLIPDIST: - if (!shader->key.opt.clip_disable) { - unsigned index = 2 + outputs[i].semantic_index; - si_llvm_init_vs_export_args(ctx, outputs[i].values, - V_008DFC_SQ_EXP_POS + index, - &pos_args[index]); - } - break; - case TGSI_SEMANTIC_CLIPVERTEX: - if (!shader->key.opt.clip_disable) { - si_llvm_emit_clipvertex(ctx, pos_args, - outputs[i].values); - } - break; - } - } - - /* We need to add the position output manually if it's missing. */ - if (!pos_args[0].out[0]) { - pos_args[0].enabled_channels = 0xf; /* writemask */ - pos_args[0].valid_mask = 0; /* EXEC mask */ - pos_args[0].done = 0; /* last export? */ - pos_args[0].target = V_008DFC_SQ_EXP_POS; - pos_args[0].compr = 0; /* COMPR flag */ - pos_args[0].out[0] = ctx->ac.f32_0; /* X */ - pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ - pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ - pos_args[0].out[3] = ctx->ac.f32_1; /* W */ - } - - bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && - !shader->key.as_ngg; - - /* Write the misc vector (point size, edgeflag, layer, viewport). */ - if (shader->selector->info.writes_psize || - pos_writes_edgeflag || - shader->selector->info.writes_viewport_index || - shader->selector->info.writes_layer) { - pos_args[1].enabled_channels = shader->selector->info.writes_psize | - (pos_writes_edgeflag << 1) | - (shader->selector->info.writes_layer << 2); - - pos_args[1].valid_mask = 0; /* EXEC mask */ - pos_args[1].done = 0; /* last export? 
*/ - pos_args[1].target = V_008DFC_SQ_EXP_POS + 1; - pos_args[1].compr = 0; /* COMPR flag */ - pos_args[1].out[0] = ctx->ac.f32_0; /* X */ - pos_args[1].out[1] = ctx->ac.f32_0; /* Y */ - pos_args[1].out[2] = ctx->ac.f32_0; /* Z */ - pos_args[1].out[3] = ctx->ac.f32_0; /* W */ - - if (shader->selector->info.writes_psize) - pos_args[1].out[0] = psize_value; - - if (pos_writes_edgeflag) { - /* The output is a float, but the hw expects an integer - * with the first bit containing the edge flag. */ - edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, - edgeflag_value, - ctx->ac.i32, ""); - edgeflag_value = ac_build_umin(&ctx->ac, - edgeflag_value, - ctx->ac.i32_1); - - /* The LLVM intrinsic expects a float. */ - pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value); - } - - if (ctx->screen->info.chip_class >= GFX9) { - /* GFX9 has the layer in out.z[10:0] and the viewport - * index in out.z[19:16]. - */ - if (shader->selector->info.writes_layer) - pos_args[1].out[2] = layer_value; - - if (shader->selector->info.writes_viewport_index) { - LLVMValueRef v = viewport_index_value; - - v = ac_to_integer(&ctx->ac, v); - v = LLVMBuildShl(ctx->ac.builder, v, - LLVMConstInt(ctx->ac.i32, 16, 0), ""); - v = LLVMBuildOr(ctx->ac.builder, v, - ac_to_integer(&ctx->ac, pos_args[1].out[2]), ""); - pos_args[1].out[2] = ac_to_float(&ctx->ac, v); - pos_args[1].enabled_channels |= 1 << 2; - } - } else { - if (shader->selector->info.writes_layer) - pos_args[1].out[2] = layer_value; - - if (shader->selector->info.writes_viewport_index) { - pos_args[1].out[3] = viewport_index_value; - pos_args[1].enabled_channels |= 1 << 3; - } - } - } - - for (i = 0; i < 4; i++) - if (pos_args[i].out[0]) - shader->info.nr_pos_exports++; - - /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. - * Setting valid_mask=1 prevents it and has no other effect. - */ - if (ctx->screen->info.family == CHIP_NAVI10 || - ctx->screen->info.family == CHIP_NAVI12 || - ctx->screen->info.family == CHIP_NAVI14) - pos_args[0].valid_mask = 1; - - pos_idx = 0; - for (i = 0; i < 4; i++) { - if (!pos_args[i].out[0]) - continue; - - /* Specify the target we are exporting */ - pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++; - - if (pos_idx == shader->info.nr_pos_exports) - /* Specify that this is the last export */ - pos_args[i].done = 1; - - ac_build_export(&ctx->ac, &pos_args[i]); - } - - /* Build parameter exports. */ - si_build_param_exports(ctx, outputs, noutput); + struct si_shader *shader = ctx->shader; + struct ac_export_args pos_args[4] = {}; + LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, + viewport_index_value = NULL; + unsigned pos_idx; + int i; + + si_vertex_color_clamping(ctx, outputs, noutput); + + /* Build position exports. 
*/ + for (i = 0; i < noutput; i++) { + switch (outputs[i].semantic_name) { + case TGSI_SEMANTIC_POSITION: + si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS, &pos_args[0]); + break; + case TGSI_SEMANTIC_PSIZE: + psize_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_LAYER: + layer_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + viewport_index_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_EDGEFLAG: + edgeflag_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_CLIPDIST: + if (!shader->key.opt.clip_disable) { + unsigned index = 2 + outputs[i].semantic_index; + si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS + index, + &pos_args[index]); + } + break; + case TGSI_SEMANTIC_CLIPVERTEX: + if (!shader->key.opt.clip_disable) { + si_llvm_emit_clipvertex(ctx, pos_args, outputs[i].values); + } + break; + } + } + + /* We need to add the position output manually if it's missing. */ + if (!pos_args[0].out[0]) { + pos_args[0].enabled_channels = 0xf; /* writemask */ + pos_args[0].valid_mask = 0; /* EXEC mask */ + pos_args[0].done = 0; /* last export? */ + pos_args[0].target = V_008DFC_SQ_EXP_POS; + pos_args[0].compr = 0; /* COMPR flag */ + pos_args[0].out[0] = ctx->ac.f32_0; /* X */ + pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ + pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ + pos_args[0].out[3] = ctx->ac.f32_1; /* W */ + } + + bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && !shader->key.as_ngg; + + /* Write the misc vector (point size, edgeflag, layer, viewport). */ + if (shader->selector->info.writes_psize || pos_writes_edgeflag || + shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) { + pos_args[1].enabled_channels = shader->selector->info.writes_psize | + (pos_writes_edgeflag << 1) | + (shader->selector->info.writes_layer << 2); + + pos_args[1].valid_mask = 0; /* EXEC mask */ + pos_args[1].done = 0; /* last export? */ + pos_args[1].target = V_008DFC_SQ_EXP_POS + 1; + pos_args[1].compr = 0; /* COMPR flag */ + pos_args[1].out[0] = ctx->ac.f32_0; /* X */ + pos_args[1].out[1] = ctx->ac.f32_0; /* Y */ + pos_args[1].out[2] = ctx->ac.f32_0; /* Z */ + pos_args[1].out[3] = ctx->ac.f32_0; /* W */ + + if (shader->selector->info.writes_psize) + pos_args[1].out[0] = psize_value; + + if (pos_writes_edgeflag) { + /* The output is a float, but the hw expects an integer + * with the first bit containing the edge flag. */ + edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, edgeflag_value, ctx->ac.i32, ""); + edgeflag_value = ac_build_umin(&ctx->ac, edgeflag_value, ctx->ac.i32_1); + + /* The LLVM intrinsic expects a float. */ + pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value); + } + + if (ctx->screen->info.chip_class >= GFX9) { + /* GFX9 has the layer in out.z[10:0] and the viewport + * index in out.z[19:16]. 
+ */ + if (shader->selector->info.writes_layer) + pos_args[1].out[2] = layer_value; + + if (shader->selector->info.writes_viewport_index) { + LLVMValueRef v = viewport_index_value; + + v = ac_to_integer(&ctx->ac, v); + v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, 0), ""); + v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), ""); + pos_args[1].out[2] = ac_to_float(&ctx->ac, v); + pos_args[1].enabled_channels |= 1 << 2; + } + } else { + if (shader->selector->info.writes_layer) + pos_args[1].out[2] = layer_value; + + if (shader->selector->info.writes_viewport_index) { + pos_args[1].out[3] = viewport_index_value; + pos_args[1].enabled_channels |= 1 << 3; + } + } + } + + for (i = 0; i < 4; i++) + if (pos_args[i].out[0]) + shader->info.nr_pos_exports++; + + /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. + * Setting valid_mask=1 prevents it and has no other effect. + */ + if (ctx->screen->info.family == CHIP_NAVI10 || ctx->screen->info.family == CHIP_NAVI12 || + ctx->screen->info.family == CHIP_NAVI14) + pos_args[0].valid_mask = 1; + + pos_idx = 0; + for (i = 0; i < 4; i++) { + if (!pos_args[i].out[0]) + continue; + + /* Specify the target we are exporting */ + pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++; + + if (pos_idx == shader->info.nr_pos_exports) + /* Specify that this is the last export */ + pos_args[i].done = 1; + + ac_build_export(&ctx->ac, &pos_args[i]); + } + + /* Build parameter exports. */ + si_build_param_exports(ctx, outputs, noutput); } -void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs) +void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - struct si_shader_output_values *outputs = NULL; - int i,j; - - assert(!ctx->shader->is_gs_copy_shader); - assert(info->num_outputs <= max_outputs); - - outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); - - for (i = 0; i < info->num_outputs; i++) { - outputs[i].semantic_name = info->output_semantic_name[i]; - outputs[i].semantic_index = info->output_semantic_index[i]; - - for (j = 0; j < 4; j++) { - outputs[i].values[j] = - LLVMBuildLoad(ctx->ac.builder, - addrs[4 * i + j], - ""); - outputs[i].vertex_stream[j] = - (info->output_streams[i] >> (2 * j)) & 3; - } - } - - if (!ctx->screen->use_ngg_streamout && - ctx->shader->selector->so.num_outputs) - si_llvm_emit_streamout(ctx, outputs, i, 0); - - /* Export PrimitiveID. 
*/ - if (ctx->shader->key.mono.u.vs_export_prim_id) { - outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; - outputs[i].semantic_index = 0; - outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0)); - for (j = 1; j < 4; j++) - outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0); - - memset(outputs[i].vertex_stream, 0, - sizeof(outputs[i].vertex_stream)); - i++; - } - - si_llvm_build_vs_exports(ctx, outputs, i); - FREE(outputs); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + struct si_shader_output_values *outputs = NULL; + int i, j; + + assert(!ctx->shader->is_gs_copy_shader); + assert(info->num_outputs <= max_outputs); + + outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); + + for (i = 0; i < info->num_outputs; i++) { + outputs[i].semantic_name = info->output_semantic_name[i]; + outputs[i].semantic_index = info->output_semantic_index[i]; + + for (j = 0; j < 4; j++) { + outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], ""); + outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3; + } + } + + if (!ctx->screen->use_ngg_streamout && ctx->shader->selector->so.num_outputs) + si_llvm_emit_streamout(ctx, outputs, i, 0); + + /* Export PrimitiveID. */ + if (ctx->shader->key.mono.u.vs_export_prim_id) { + outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; + outputs[i].semantic_index = 0; + outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0)); + for (j = 1; j < 4; j++) + outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0); + + memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream)); + i++; + } + + si_llvm_build_vs_exports(ctx, outputs, i); + FREE(outputs); } -static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) +static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - LLVMValueRef pos[4] = {}; - - assert(info->num_outputs <= max_outputs); - - for (unsigned i = 0; i < info->num_outputs; i++) { - if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION) - continue; - - for (unsigned chan = 0; chan < 4; chan++) - pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - break; - } - assert(pos[0] != NULL); - - /* Return the position output. */ - LLVMValueRef ret = ctx->return_value; - for (unsigned chan = 0; chan < 4; chan++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, ""); - ctx->return_value = ret; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + LLVMValueRef pos[4] = {}; + + assert(info->num_outputs <= max_outputs); + + for (unsigned i = 0; i < info->num_outputs; i++) { + if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION) + continue; + + for (unsigned chan = 0; chan < 4; chan++) + pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + break; + } + assert(pos[0] != NULL); + + /* Return the position output. 
*/ + LLVMValueRef ret = ctx->return_value; + for (unsigned chan = 0; chan < 4; chan++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, ""); + ctx->return_value = ret; } /** @@ -852,280 +773,252 @@ static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, * (InstanceID + StartInstance), * (InstanceID / 2 + StartInstance) */ -void si_llvm_build_vs_prolog(struct si_shader_context *ctx, - union si_shader_part_key *key) +void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key) { - LLVMTypeRef *returns; - LLVMValueRef ret, func; - int num_returns, i; - unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; - unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4 + - (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0); - struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs]; - struct ac_arg input_vgpr_param[10]; - LLVMValueRef input_vgprs[10]; - unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + - num_input_vgprs; - unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ - returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * - sizeof(LLVMTypeRef)); - num_returns = 0; - - /* Declare input and output SGPRs. */ - for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &input_sgpr_param[i]); - returns[num_returns++] = ctx->ac.i32; - } - - struct ac_arg merged_wave_info = input_sgpr_param[3]; - - /* Preloaded VGPRs (outputs must be floats) */ - for (i = 0; i < num_input_vgprs; i++) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]); - returns[num_returns++] = ctx->ac.f32; - } - - /* Vertex load indices. */ - for (i = 0; i < key->vs_prolog.num_inputs; i++) - returns[num_returns++] = ctx->ac.f32; - - /* Create the function. */ - si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0); - func = ctx->main_fn; - - for (i = 0; i < num_input_vgprs; i++) { - input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]); - } - - if (key->vs_prolog.num_merged_next_stage_vgprs) { - if (!key->vs_prolog.is_monolithic) - si_init_exec_from_input(ctx, merged_wave_info, 0); - - if (key->vs_prolog.as_ls && - ctx->screen->info.has_ls_vgpr_init_bug) { - /* If there are no HS threads, SPI loads the LS VGPRs - * starting at VGPR 0. Shift them back to where they - * belong. - */ - LLVMValueRef has_hs_threads = - LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, - si_unpack_param(ctx, input_sgpr_param[3], 8, 8), - ctx->ac.i32_0, ""); - - for (i = 4; i > 0; --i) { - input_vgprs[i + 1] = - LLVMBuildSelect(ctx->ac.builder, has_hs_threads, - input_vgprs[i + 1], - input_vgprs[i - 1], ""); - } - } - } - - if (key->vs_prolog.gs_fast_launch_tri_list || - key->vs_prolog.gs_fast_launch_tri_strip) { - LLVMValueRef wave_id, thread_id_in_tg; - - wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4); - thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id, - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), - ac_get_thread_id(&ctx->ac)); - - /* The GS fast launch initializes all VGPRs to the value of - * the first thread, so we have to add the thread ID. - * - * Only these are initialized by the hw: - * VGPR2: Base Primitive ID - * VGPR5: Base Vertex ID - * VGPR6: Instance ID - */ - - /* Put the vertex thread IDs into VGPRs as-is instead of packing them. 
- * The NGG cull shader will read them from there. - */ - if (key->vs_prolog.gs_fast_launch_tri_list) { - input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */ - LLVMConstInt(ctx->ac.i32, 0, 0)); - input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */ - LLVMConstInt(ctx->ac.i32, 1, 0)); - input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */ - LLVMConstInt(ctx->ac.i32, 2, 0)); - } else { - assert(key->vs_prolog.gs_fast_launch_tri_strip); - LLVMBuilderRef builder = ctx->ac.builder; - /* Triangle indices: */ - LLVMValueRef index[3] = { - thread_id_in_tg, - LLVMBuildAdd(builder, thread_id_in_tg, - LLVMConstInt(ctx->ac.i32, 1, 0), ""), - LLVMBuildAdd(builder, thread_id_in_tg, - LLVMConstInt(ctx->ac.i32, 2, 0), ""), - }; - LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, - thread_id_in_tg, ctx->ac.i1, ""); - LLVMValueRef flatshade_first = - LLVMBuildICmp(builder, LLVMIntEQ, - si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), - ctx->ac.i32_0, ""); - - ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, - flatshade_first, index); - input_vgprs[0] = index[0]; - input_vgprs[1] = index[1]; - input_vgprs[4] = index[2]; - } - - /* Triangles always have all edge flags set initially. */ - input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0); - - input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], - thread_id_in_tg, ""); /* PrimID */ - input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], - thread_id_in_tg, ""); /* VertexID */ - input_vgprs[8] = input_vgprs[6]; /* InstanceID */ - } - - unsigned vertex_id_vgpr = first_vs_vgpr; - unsigned instance_id_vgpr = - ctx->screen->info.chip_class >= GFX10 ? - first_vs_vgpr + 3 : - first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1); - - ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr]; - ctx->abi.instance_id = input_vgprs[instance_id_vgpr]; - - /* InstanceID = VertexID >> 16; - * VertexID = VertexID & 0xffff; - */ - if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) { - ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, - LLVMConstInt(ctx->ac.i32, 16, 0), ""); - ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id, - LLVMConstInt(ctx->ac.i32, 0xffff, 0), ""); - } - - /* Copy inputs to outputs. This should be no-op, as the registers match, - * but it will prevent the compiler from overwriting them unintentionally. - */ - ret = ctx->return_value; - for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); - } - for (i = 0; i < num_input_vgprs; i++) { - LLVMValueRef p = input_vgprs[i]; - - if (i == vertex_id_vgpr) - p = ctx->abi.vertex_id; - else if (i == instance_id_vgpr) - p = ctx->abi.instance_id; - - p = ac_to_float(&ctx->ac, p); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, - key->vs_prolog.num_input_sgprs + i, ""); - } - - /* Compute vertex load indices from instance divisors. 
*/ - LLVMValueRef instance_divisor_constbuf = NULL; - - if (key->vs_prolog.states.instance_divisor_is_fetched) { - LLVMValueRef list = si_prolog_get_rw_buffers(ctx); - LLVMValueRef buf_index = - LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0); - instance_divisor_constbuf = - ac_build_load_to_sgpr(&ctx->ac, list, buf_index); - } - - for (i = 0; i < key->vs_prolog.num_inputs; i++) { - bool divisor_is_one = - key->vs_prolog.states.instance_divisor_is_one & (1u << i); - bool divisor_is_fetched = - key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); - LLVMValueRef index = NULL; - - if (divisor_is_one) { - index = ctx->abi.instance_id; - } else if (divisor_is_fetched) { - LLVMValueRef udiv_factors[4]; - - for (unsigned j = 0; j < 4; j++) { - udiv_factors[j] = - si_buffer_load_const(ctx, instance_divisor_constbuf, - LLVMConstInt(ctx->ac.i32, i*16 + j*4, 0)); - udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); - } - /* The faster NUW version doesn't work when InstanceID == UINT_MAX. - * Such InstanceID might not be achievable in a reasonable time though. - */ - index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, - udiv_factors[0], udiv_factors[1], - udiv_factors[2], udiv_factors[3]); - } - - if (divisor_is_one || divisor_is_fetched) { - /* Add StartInstance. */ - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMGetParam(ctx->main_fn, user_sgpr_base + - SI_SGPR_START_INSTANCE), ""); - } else { - /* VertexID + BaseVertex */ - index = LLVMBuildAdd(ctx->ac.builder, - ctx->abi.vertex_id, - LLVMGetParam(func, user_sgpr_base + - SI_SGPR_BASE_VERTEX), ""); - } - - index = ac_to_float(&ctx->ac, index); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, - ctx->args.arg_count + i, ""); - } - - si_llvm_build_ret(ctx, ret); + LLVMTypeRef *returns; + LLVMValueRef ret, func; + int num_returns, i; + unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; + unsigned num_input_vgprs = + key->vs_prolog.num_merged_next_stage_vgprs + 4 + (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0); + struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs]; + struct ac_arg input_vgpr_param[10]; + LLVMValueRef input_vgprs[10]; + unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + num_input_vgprs; + unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ + returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * sizeof(LLVMTypeRef)); + num_returns = 0; + + /* Declare input and output SGPRs. */ + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &input_sgpr_param[i]); + returns[num_returns++] = ctx->ac.i32; + } + + struct ac_arg merged_wave_info = input_sgpr_param[3]; + + /* Preloaded VGPRs (outputs must be floats) */ + for (i = 0; i < num_input_vgprs; i++) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]); + returns[num_returns++] = ctx->ac.f32; + } + + /* Vertex load indices. */ + for (i = 0; i < key->vs_prolog.num_inputs; i++) + returns[num_returns++] = ctx->ac.f32; + + /* Create the function. 
*/ + si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0); + func = ctx->main_fn; + + for (i = 0; i < num_input_vgprs; i++) { + input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]); + } + + if (key->vs_prolog.num_merged_next_stage_vgprs) { + if (!key->vs_prolog.is_monolithic) + si_init_exec_from_input(ctx, merged_wave_info, 0); + + if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) { + /* If there are no HS threads, SPI loads the LS VGPRs + * starting at VGPR 0. Shift them back to where they + * belong. + */ + LLVMValueRef has_hs_threads = + LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, + si_unpack_param(ctx, input_sgpr_param[3], 8, 8), ctx->ac.i32_0, ""); + + for (i = 4; i > 0; --i) { + input_vgprs[i + 1] = LLVMBuildSelect(ctx->ac.builder, has_hs_threads, + input_vgprs[i + 1], input_vgprs[i - 1], ""); + } + } + } + + if (key->vs_prolog.gs_fast_launch_tri_list || key->vs_prolog.gs_fast_launch_tri_strip) { + LLVMValueRef wave_id, thread_id_in_tg; + + wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4); + thread_id_in_tg = + ac_build_imad(&ctx->ac, wave_id, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), + ac_get_thread_id(&ctx->ac)); + + /* The GS fast launch initializes all VGPRs to the value of + * the first thread, so we have to add the thread ID. + * + * Only these are initialized by the hw: + * VGPR2: Base Primitive ID + * VGPR5: Base Vertex ID + * VGPR6: Instance ID + */ + + /* Put the vertex thread IDs into VGPRs as-is instead of packing them. + * The NGG cull shader will read them from there. + */ + if (key->vs_prolog.gs_fast_launch_tri_list) { + input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */ + LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */ + LLVMConstInt(ctx->ac.i32, 0, 0)); + input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */ + LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */ + LLVMConstInt(ctx->ac.i32, 1, 0)); + input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */ + LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */ + LLVMConstInt(ctx->ac.i32, 2, 0)); + } else { + assert(key->vs_prolog.gs_fast_launch_tri_strip); + LLVMBuilderRef builder = ctx->ac.builder; + /* Triangle indices: */ + LLVMValueRef index[3] = { + thread_id_in_tg, + LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 1, 0), ""), + LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 2, 0), ""), + }; + LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, thread_id_in_tg, ctx->ac.i1, ""); + LLVMValueRef flatshade_first = LLVMBuildICmp( + builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, ""); + + ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, index); + input_vgprs[0] = index[0]; + input_vgprs[1] = index[1]; + input_vgprs[4] = index[2]; + } + + /* Triangles always have all edge flags set initially. */ + input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0); + + input_vgprs[2] = + LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], thread_id_in_tg, ""); /* PrimID */ + input_vgprs[5] = + LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */ + input_vgprs[8] = input_vgprs[6]; /* InstanceID */ + } + + unsigned vertex_id_vgpr = first_vs_vgpr; + unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10 + ? first_vs_vgpr + 3 + : first_vs_vgpr + (key->vs_prolog.as_ls ? 
2 : 1); + + ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr]; + ctx->abi.instance_id = input_vgprs[instance_id_vgpr]; + + /* InstanceID = VertexID >> 16; + * VertexID = VertexID & 0xffff; + */ + if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) { + ctx->abi.instance_id = + LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, LLVMConstInt(ctx->ac.i32, 16, 0), ""); + ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id, + LLVMConstInt(ctx->ac.i32, 0xffff, 0), ""); + } + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx->return_value; + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); + } + for (i = 0; i < num_input_vgprs; i++) { + LLVMValueRef p = input_vgprs[i]; + + if (i == vertex_id_vgpr) + p = ctx->abi.vertex_id; + else if (i == instance_id_vgpr) + p = ctx->abi.instance_id; + + p = ac_to_float(&ctx->ac, p); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, key->vs_prolog.num_input_sgprs + i, ""); + } + + /* Compute vertex load indices from instance divisors. */ + LLVMValueRef instance_divisor_constbuf = NULL; + + if (key->vs_prolog.states.instance_divisor_is_fetched) { + LLVMValueRef list = si_prolog_get_rw_buffers(ctx); + LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0); + instance_divisor_constbuf = ac_build_load_to_sgpr(&ctx->ac, list, buf_index); + } + + for (i = 0; i < key->vs_prolog.num_inputs; i++) { + bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i); + bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); + LLVMValueRef index = NULL; + + if (divisor_is_one) { + index = ctx->abi.instance_id; + } else if (divisor_is_fetched) { + LLVMValueRef udiv_factors[4]; + + for (unsigned j = 0; j < 4; j++) { + udiv_factors[j] = si_buffer_load_const(ctx, instance_divisor_constbuf, + LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0)); + udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); + } + /* The faster NUW version doesn't work when InstanceID == UINT_MAX. + * Such InstanceID might not be achievable in a reasonable time though. + */ + index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, udiv_factors[0], + udiv_factors[1], udiv_factors[2], udiv_factors[3]); + } + + if (divisor_is_one || divisor_is_fetched) { + /* Add StartInstance. */ + index = + LLVMBuildAdd(ctx->ac.builder, index, + LLVMGetParam(ctx->main_fn, user_sgpr_base + SI_SGPR_START_INSTANCE), ""); + } else { + /* VertexID + BaseVertex */ + index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id, + LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), ""); + } + + index = ac_to_float(&ctx->ac, index); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, ctx->args.arg_count + i, ""); + } + + si_llvm_build_ret(ctx, ret); } static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - /* For non-indexed draws, the base vertex set by the driver - * (for direct draws) or the CP (for indirect draws) is the - * first vertex ID, but GLSL expects 0 to be returned. 
- */ - LLVMValueRef vs_state = ac_get_arg(&ctx->ac, - ctx->vs_state_bits); - LLVMValueRef indexed; - - indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, ""); - indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, ""); - - return LLVMBuildSelect(ctx->ac.builder, indexed, - ac_get_arg(&ctx->ac, ctx->args.base_vertex), - ctx->ac.i32_0, ""); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + /* For non-indexed draws, the base vertex set by the driver + * (for direct draws) or the CP (for indirect draws) is the + * first vertex ID, but GLSL expects 0 to be returned. + */ + LLVMValueRef vs_state = ac_get_arg(&ctx->ac, ctx->vs_state_bits); + LLVMValueRef indexed; + + indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, ""); + indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, ""); + + return LLVMBuildSelect(ctx->ac.builder, indexed, ac_get_arg(&ctx->ac, ctx->args.base_vertex), + ctx->ac.i32_0, ""); } void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader) { - struct si_shader *shader = ctx->shader; - - if (shader->key.as_ls) - ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; - else if (shader->key.as_es) - ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; - else if (shader->key.opt.vs_as_prim_discard_cs) - ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; - else if (ngg_cull_shader) - ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; - else if (shader->key.as_ngg) - ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; - else - ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; - - ctx->abi.load_base_vertex = get_base_vertex; + struct si_shader *shader = ctx->shader; + + if (shader->key.as_ls) + ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; + else if (shader->key.as_es) + ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; + else if (shader->key.opt.vs_as_prim_discard_cs) + ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; + else if (ngg_cull_shader) + ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; + else if (shader->key.as_ngg) + ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; + else + ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; + + ctx->abi.load_base_vertex = get_base_vertex; } diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 49393af3abd..ddbb5c5c9c7 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -22,914 +22,865 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include "si_shader_internal.h" -#include "si_pipe.h" - #include "ac_nir_to_llvm.h" - -#include "tgsi/tgsi_from_mesa.h" - #include "compiler/nir/nir.h" -#include "compiler/nir_types.h" #include "compiler/nir/nir_builder.h" #include "compiler/nir/nir_deref.h" +#include "compiler/nir_types.h" +#include "si_pipe.h" +#include "si_shader_internal.h" +#include "tgsi/tgsi_from_mesa.h" static const nir_deref_instr *tex_get_texture_deref(nir_tex_instr *instr) { - for (unsigned i = 0; i < instr->num_srcs; i++) { - switch (instr->src[i].src_type) { - case nir_tex_src_texture_deref: - return nir_src_as_deref(instr->src[i].src); - default: - break; - } - } - - return NULL; + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_texture_deref: + return nir_src_as_deref(instr->src[i].src); + default: + break; + } + } + + return NULL; } -static nir_variable* intrinsic_get_var(nir_intrinsic_instr *instr) +static nir_variable *intrinsic_get_var(nir_intrinsic_instr *instr) { - return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[0])); + return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[0])); } -static void gather_usage_helper(const nir_deref_instr **deref_ptr, - unsigned location, - uint8_t mask, - uint8_t *usage_mask) +static void gather_usage_helper(const nir_deref_instr **deref_ptr, unsigned location, uint8_t mask, + uint8_t *usage_mask) { - for (; *deref_ptr; deref_ptr++) { - const nir_deref_instr *deref = *deref_ptr; - switch (deref->deref_type) { - case nir_deref_type_array: { - unsigned elem_size = - glsl_count_attribute_slots(deref->type, false); - if (nir_src_is_const(deref->arr.index)) { - location += elem_size * nir_src_as_uint(deref->arr.index); - } else { - unsigned array_elems = - glsl_get_length(deref_ptr[-1]->type); - for (unsigned i = 0; i < array_elems; i++) { - gather_usage_helper(deref_ptr + 1, - location + elem_size * i, - mask, usage_mask); - } - return; - } - break; - } - case nir_deref_type_struct: { - const struct glsl_type *parent_type = - deref_ptr[-1]->type; - unsigned index = deref->strct.index; - for (unsigned i = 0; i < index; i++) { - const struct glsl_type *ft = glsl_get_struct_field(parent_type, i); - location += glsl_count_attribute_slots(ft, false); - } - break; - } - default: - unreachable("Unhandled deref type in gather_components_used_helper"); - } - } - - usage_mask[location] |= mask & 0xf; - if (mask & 0xf0) - usage_mask[location + 1] |= (mask >> 4) & 0xf; + for (; *deref_ptr; deref_ptr++) { + const nir_deref_instr *deref = *deref_ptr; + switch (deref->deref_type) { + case nir_deref_type_array: { + unsigned elem_size = glsl_count_attribute_slots(deref->type, false); + if (nir_src_is_const(deref->arr.index)) { + location += elem_size * nir_src_as_uint(deref->arr.index); + } else { + unsigned array_elems = glsl_get_length(deref_ptr[-1]->type); + for (unsigned i = 0; i < array_elems; i++) { + gather_usage_helper(deref_ptr + 1, location + elem_size * i, mask, usage_mask); + } + return; + } + break; + } + case nir_deref_type_struct: { + const struct glsl_type *parent_type = deref_ptr[-1]->type; + unsigned index = deref->strct.index; + for (unsigned i = 0; i < index; i++) { + const struct glsl_type *ft = glsl_get_struct_field(parent_type, i); + location += glsl_count_attribute_slots(ft, false); + } + break; + } + default: + unreachable("Unhandled deref type in gather_components_used_helper"); + } + } + + usage_mask[location] |= mask & 0xf; + if (mask & 0xf0) + usage_mask[location + 1] |= (mask >> 
4) & 0xf; } -static void gather_usage(const nir_deref_instr *deref, - uint8_t mask, - uint8_t *usage_mask) +static void gather_usage(const nir_deref_instr *deref, uint8_t mask, uint8_t *usage_mask) { - nir_deref_path path; - nir_deref_path_init(&path, (nir_deref_instr *)deref, NULL); - - unsigned location_frac = path.path[0]->var->data.location_frac; - if (glsl_type_is_64bit(deref->type)) { - uint8_t new_mask = 0; - for (unsigned i = 0; i < 4; i++) { - if (mask & (1 << i)) - new_mask |= 0x3 << (2 * i); - } - mask = new_mask << location_frac; - } else { - mask <<= location_frac; - mask &= 0xf; - } - - gather_usage_helper((const nir_deref_instr **)&path.path[1], - path.path[0]->var->data.driver_location, - mask, usage_mask); - - nir_deref_path_finish(&path); + nir_deref_path path; + nir_deref_path_init(&path, (nir_deref_instr *)deref, NULL); + + unsigned location_frac = path.path[0]->var->data.location_frac; + if (glsl_type_is_64bit(deref->type)) { + uint8_t new_mask = 0; + for (unsigned i = 0; i < 4; i++) { + if (mask & (1 << i)) + new_mask |= 0x3 << (2 * i); + } + mask = new_mask << location_frac; + } else { + mask <<= location_frac; + mask &= 0xf; + } + + gather_usage_helper((const nir_deref_instr **)&path.path[1], + path.path[0]->var->data.driver_location, mask, usage_mask); + + nir_deref_path_finish(&path); } static void gather_intrinsic_load_deref_input_info(const nir_shader *nir, - const nir_intrinsic_instr *instr, - const nir_deref_instr *deref, - struct si_shader_info *info) + const nir_intrinsic_instr *instr, + const nir_deref_instr *deref, + struct si_shader_info *info) { - switch (nir->info.stage) { - case MESA_SHADER_VERTEX: - gather_usage(deref, nir_ssa_def_components_read(&instr->dest.ssa), - info->input_usage_mask); - default:; - } + switch (nir->info.stage) { + case MESA_SHADER_VERTEX: + gather_usage(deref, nir_ssa_def_components_read(&instr->dest.ssa), info->input_usage_mask); + default:; + } } static void gather_intrinsic_load_deref_output_info(const nir_shader *nir, - const nir_intrinsic_instr *instr, - nir_variable *var, - struct si_shader_info *info) + const nir_intrinsic_instr *instr, + nir_variable *var, struct si_shader_info *info) { - assert(var && var->data.mode == nir_var_shader_out); - - switch (nir->info.stage) { - case MESA_SHADER_TESS_CTRL: - if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || - var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER) - info->reads_tessfactor_outputs = true; - else if (var->data.patch) - info->reads_perpatch_outputs = true; - else - info->reads_pervertex_outputs = true; - break; - - case MESA_SHADER_FRAGMENT: - if (var->data.fb_fetch_output) - info->uses_fbfetch = true; - break; - default:; - } + assert(var && var->data.mode == nir_var_shader_out); + + switch (nir->info.stage) { + case MESA_SHADER_TESS_CTRL: + if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || + var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER) + info->reads_tessfactor_outputs = true; + else if (var->data.patch) + info->reads_perpatch_outputs = true; + else + info->reads_pervertex_outputs = true; + break; + + case MESA_SHADER_FRAGMENT: + if (var->data.fb_fetch_output) + info->uses_fbfetch = true; + break; + default:; + } } static void gather_intrinsic_store_deref_output_info(const nir_shader *nir, - const nir_intrinsic_instr *instr, - const nir_deref_instr *deref, - struct si_shader_info *info) + const nir_intrinsic_instr *instr, + const nir_deref_instr *deref, + struct si_shader_info *info) { - switch (nir->info.stage) { - case 
MESA_SHADER_VERTEX: /* needed by LS, ES */ - case MESA_SHADER_TESS_EVAL: /* needed by ES */ - case MESA_SHADER_GEOMETRY: - gather_usage(deref, nir_intrinsic_write_mask(instr), - info->output_usagemask); - break; - default:; - } + switch (nir->info.stage) { + case MESA_SHADER_VERTEX: /* needed by LS, ES */ + case MESA_SHADER_TESS_EVAL: /* needed by ES */ + case MESA_SHADER_GEOMETRY: + gather_usage(deref, nir_intrinsic_write_mask(instr), info->output_usagemask); + break; + default:; + } } -static void scan_instruction(const struct nir_shader *nir, - struct si_shader_info *info, - nir_instr *instr) +static void scan_instruction(const struct nir_shader *nir, struct si_shader_info *info, + nir_instr *instr) { - if (instr->type == nir_instr_type_alu) { - nir_alu_instr *alu = nir_instr_as_alu(instr); - - switch (alu->op) { - case nir_op_fddx: - case nir_op_fddy: - case nir_op_fddx_fine: - case nir_op_fddy_fine: - case nir_op_fddx_coarse: - case nir_op_fddy_coarse: - info->uses_derivatives = true; - break; - default: - break; - } - } else if (instr->type == nir_instr_type_tex) { - nir_tex_instr *tex = nir_instr_as_tex(instr); - const nir_deref_instr *deref = tex_get_texture_deref(tex); - nir_variable *var = deref ? nir_deref_instr_get_variable(deref) : NULL; - - if (!var) { - info->samplers_declared |= - u_bit_consecutive(tex->sampler_index, 1); - } else { - if (deref->mode != nir_var_uniform || var->data.bindless) - info->uses_bindless_samplers = true; - } - - switch (tex->op) { - case nir_texop_tex: - case nir_texop_txb: - case nir_texop_lod: - info->uses_derivatives = true; - break; - default: - break; - } - } else if (instr->type == nir_instr_type_intrinsic) { - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_front_face: - info->uses_frontface = 1; - break; - case nir_intrinsic_load_instance_id: - info->uses_instanceid = 1; - break; - case nir_intrinsic_load_invocation_id: - info->uses_invocationid = true; - break; - case nir_intrinsic_load_num_work_groups: - info->uses_grid_size = true; - break; - case nir_intrinsic_load_local_invocation_index: - case nir_intrinsic_load_subgroup_id: - case nir_intrinsic_load_num_subgroups: - info->uses_subgroup_info = true; - break; - case nir_intrinsic_load_local_group_size: - /* The block size is translated to IMM with a fixed block size. 
*/ - if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) - info->uses_block_size = true; - break; - case nir_intrinsic_load_local_invocation_id: - case nir_intrinsic_load_work_group_id: { - unsigned mask = nir_ssa_def_components_read(&intr->dest.ssa); - while (mask) { - unsigned i = u_bit_scan(&mask); - - if (intr->intrinsic == nir_intrinsic_load_work_group_id) - info->uses_block_id[i] = true; - else - info->uses_thread_id[i] = true; - } - break; - } - case nir_intrinsic_load_vertex_id: - info->uses_vertexid = 1; - break; - case nir_intrinsic_load_vertex_id_zero_base: - info->uses_vertexid_nobase = 1; - break; - case nir_intrinsic_load_base_vertex: - info->uses_basevertex = 1; - break; - case nir_intrinsic_load_draw_id: - info->uses_drawid = 1; - break; - case nir_intrinsic_load_primitive_id: - info->uses_primid = 1; - break; - case nir_intrinsic_load_sample_mask_in: - info->reads_samplemask = true; - break; - case nir_intrinsic_load_tess_level_inner: - case nir_intrinsic_load_tess_level_outer: - info->reads_tess_factors = true; - break; - case nir_intrinsic_bindless_image_load: - case nir_intrinsic_bindless_image_size: - case nir_intrinsic_bindless_image_samples: - info->uses_bindless_images = true; - break; - case nir_intrinsic_bindless_image_store: - info->uses_bindless_images = true; - info->writes_memory = true; - info->num_memory_instructions++; /* we only care about stores */ - break; - case nir_intrinsic_image_deref_store: - info->writes_memory = true; - info->num_memory_instructions++; /* we only care about stores */ - break; - case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_bindless_image_atomic_imin: - case nir_intrinsic_bindless_image_atomic_umin: - case nir_intrinsic_bindless_image_atomic_imax: - case nir_intrinsic_bindless_image_atomic_umax: - case nir_intrinsic_bindless_image_atomic_and: - case nir_intrinsic_bindless_image_atomic_or: - case nir_intrinsic_bindless_image_atomic_xor: - case nir_intrinsic_bindless_image_atomic_exchange: - case nir_intrinsic_bindless_image_atomic_comp_swap: - info->uses_bindless_images = true; - info->writes_memory = true; - info->num_memory_instructions++; /* we only care about stores */ - break; - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_imin: - case nir_intrinsic_image_deref_atomic_umin: - case nir_intrinsic_image_deref_atomic_imax: - case nir_intrinsic_image_deref_atomic_umax: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: - case nir_intrinsic_image_deref_atomic_inc_wrap: - case nir_intrinsic_image_deref_atomic_dec_wrap: - info->writes_memory = true; - info->num_memory_instructions++; /* we only care about stores */ - break; - case nir_intrinsic_store_ssbo: - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - info->writes_memory = true; - info->num_memory_instructions++; /* we only care about stores */ - break; - case nir_intrinsic_load_color0: - case nir_intrinsic_load_color1: { - unsigned index = intr->intrinsic == nir_intrinsic_load_color1; - 
uint8_t mask = nir_ssa_def_components_read(&intr->dest.ssa); - info->colors_read |= mask << (index * 4); - break; - } - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_at_offset: /* uses center */ - case nir_intrinsic_load_barycentric_at_sample: { /* uses center */ - unsigned mode = nir_intrinsic_interp_mode(intr); - - if (mode == INTERP_MODE_FLAT) - break; - - if (mode == INTERP_MODE_NOPERSPECTIVE) { - if (intr->intrinsic == nir_intrinsic_load_barycentric_sample) - info->uses_linear_sample = true; - else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid) - info->uses_linear_centroid = true; - else - info->uses_linear_center = true; - - if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample) - info->uses_linear_opcode_interp_sample = true; - } else { - if (intr->intrinsic == nir_intrinsic_load_barycentric_sample) - info->uses_persp_sample = true; - else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid) - info->uses_persp_centroid = true; - else - info->uses_persp_center = true; - - if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample) - info->uses_persp_opcode_interp_sample = true; - } - break; - } - case nir_intrinsic_load_deref: { - nir_variable *var = intrinsic_get_var(intr); - nir_variable_mode mode = var->data.mode; - - if (mode == nir_var_shader_in) { - /* PS inputs use the interpolated load intrinsics. */ - assert(nir->info.stage != MESA_SHADER_FRAGMENT); - gather_intrinsic_load_deref_input_info(nir, intr, - nir_src_as_deref(intr->src[0]), info); - } else if (mode == nir_var_shader_out) { - gather_intrinsic_load_deref_output_info(nir, intr, var, info); - } - break; - } - case nir_intrinsic_store_deref: { - nir_variable *var = intrinsic_get_var(intr); - - if (var->data.mode == nir_var_shader_out) - gather_intrinsic_store_deref_output_info(nir, intr, - nir_src_as_deref(intr->src[0]), info); - break; - } - case nir_intrinsic_interp_deref_at_centroid: - case nir_intrinsic_interp_deref_at_sample: - case nir_intrinsic_interp_deref_at_offset: - unreachable("interp opcodes should have been lowered"); - break; - default: - break; - } - } + if (instr->type == nir_instr_type_alu) { + nir_alu_instr *alu = nir_instr_as_alu(instr); + + switch (alu->op) { + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_fine: + case nir_op_fddy_fine: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: + info->uses_derivatives = true; + break; + default: + break; + } + } else if (instr->type == nir_instr_type_tex) { + nir_tex_instr *tex = nir_instr_as_tex(instr); + const nir_deref_instr *deref = tex_get_texture_deref(tex); + nir_variable *var = deref ? 
nir_deref_instr_get_variable(deref) : NULL; + + if (!var) { + info->samplers_declared |= u_bit_consecutive(tex->sampler_index, 1); + } else { + if (deref->mode != nir_var_uniform || var->data.bindless) + info->uses_bindless_samplers = true; + } + + switch (tex->op) { + case nir_texop_tex: + case nir_texop_txb: + case nir_texop_lod: + info->uses_derivatives = true; + break; + default: + break; + } + } else if (instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_front_face: + info->uses_frontface = 1; + break; + case nir_intrinsic_load_instance_id: + info->uses_instanceid = 1; + break; + case nir_intrinsic_load_invocation_id: + info->uses_invocationid = true; + break; + case nir_intrinsic_load_num_work_groups: + info->uses_grid_size = true; + break; + case nir_intrinsic_load_local_invocation_index: + case nir_intrinsic_load_subgroup_id: + case nir_intrinsic_load_num_subgroups: + info->uses_subgroup_info = true; + break; + case nir_intrinsic_load_local_group_size: + /* The block size is translated to IMM with a fixed block size. */ + if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) + info->uses_block_size = true; + break; + case nir_intrinsic_load_local_invocation_id: + case nir_intrinsic_load_work_group_id: { + unsigned mask = nir_ssa_def_components_read(&intr->dest.ssa); + while (mask) { + unsigned i = u_bit_scan(&mask); + + if (intr->intrinsic == nir_intrinsic_load_work_group_id) + info->uses_block_id[i] = true; + else + info->uses_thread_id[i] = true; + } + break; + } + case nir_intrinsic_load_vertex_id: + info->uses_vertexid = 1; + break; + case nir_intrinsic_load_vertex_id_zero_base: + info->uses_vertexid_nobase = 1; + break; + case nir_intrinsic_load_base_vertex: + info->uses_basevertex = 1; + break; + case nir_intrinsic_load_draw_id: + info->uses_drawid = 1; + break; + case nir_intrinsic_load_primitive_id: + info->uses_primid = 1; + break; + case nir_intrinsic_load_sample_mask_in: + info->reads_samplemask = true; + break; + case nir_intrinsic_load_tess_level_inner: + case nir_intrinsic_load_tess_level_outer: + info->reads_tess_factors = true; + break; + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_size: + case nir_intrinsic_bindless_image_samples: + info->uses_bindless_images = true; + break; + case nir_intrinsic_bindless_image_store: + info->uses_bindless_images = true; + info->writes_memory = true; + info->num_memory_instructions++; /* we only care about stores */ + break; + case nir_intrinsic_image_deref_store: + info->writes_memory = true; + info->num_memory_instructions++; /* we only care about stores */ + break; + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_bindless_image_atomic_comp_swap: + info->uses_bindless_images = true; + info->writes_memory = true; + info->num_memory_instructions++; /* we only care about stores */ + break; + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case 
nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_inc_wrap: + case nir_intrinsic_image_deref_atomic_dec_wrap: + info->writes_memory = true; + info->num_memory_instructions++; /* we only care about stores */ + break; + case nir_intrinsic_store_ssbo: + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + info->writes_memory = true; + info->num_memory_instructions++; /* we only care about stores */ + break; + case nir_intrinsic_load_color0: + case nir_intrinsic_load_color1: { + unsigned index = intr->intrinsic == nir_intrinsic_load_color1; + uint8_t mask = nir_ssa_def_components_read(&intr->dest.ssa); + info->colors_read |= mask << (index * 4); + break; + } + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_offset: /* uses center */ + case nir_intrinsic_load_barycentric_at_sample: { /* uses center */ + unsigned mode = nir_intrinsic_interp_mode(intr); + + if (mode == INTERP_MODE_FLAT) + break; + + if (mode == INTERP_MODE_NOPERSPECTIVE) { + if (intr->intrinsic == nir_intrinsic_load_barycentric_sample) + info->uses_linear_sample = true; + else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid) + info->uses_linear_centroid = true; + else + info->uses_linear_center = true; + + if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample) + info->uses_linear_opcode_interp_sample = true; + } else { + if (intr->intrinsic == nir_intrinsic_load_barycentric_sample) + info->uses_persp_sample = true; + else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid) + info->uses_persp_centroid = true; + else + info->uses_persp_center = true; + + if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample) + info->uses_persp_opcode_interp_sample = true; + } + break; + } + case nir_intrinsic_load_deref: { + nir_variable *var = intrinsic_get_var(intr); + nir_variable_mode mode = var->data.mode; + + if (mode == nir_var_shader_in) { + /* PS inputs use the interpolated load intrinsics. 
*/ + assert(nir->info.stage != MESA_SHADER_FRAGMENT); + gather_intrinsic_load_deref_input_info(nir, intr, nir_src_as_deref(intr->src[0]), info); + } else if (mode == nir_var_shader_out) { + gather_intrinsic_load_deref_output_info(nir, intr, var, info); + } + break; + } + case nir_intrinsic_store_deref: { + nir_variable *var = intrinsic_get_var(intr); + + if (var->data.mode == nir_var_shader_out) + gather_intrinsic_store_deref_output_info(nir, intr, nir_src_as_deref(intr->src[0]), + info); + break; + } + case nir_intrinsic_interp_deref_at_centroid: + case nir_intrinsic_interp_deref_at_sample: + case nir_intrinsic_interp_deref_at_offset: + unreachable("interp opcodes should have been lowered"); + break; + default: + break; + } + } } -static void scan_output_slot(const nir_variable *var, - unsigned var_idx, - unsigned component, unsigned num_components, - struct si_shader_info *info) +static void scan_output_slot(const nir_variable *var, unsigned var_idx, unsigned component, + unsigned num_components, struct si_shader_info *info) { - assert(component + num_components <= 4); - assert(component < 4); - - unsigned semantic_name, semantic_index; - - unsigned location = var->data.location + var_idx; - unsigned drv_location = var->data.driver_location + var_idx; - - if (info->processor == PIPE_SHADER_FRAGMENT) { - tgsi_get_gl_frag_result_semantic(location, - &semantic_name, &semantic_index); - - /* Adjust for dual source blending */ - if (var->data.index > 0) { - semantic_index++; - } - } else { - tgsi_get_gl_varying_semantic(location, true, - &semantic_name, &semantic_index); - } - - ubyte usagemask = ((1 << num_components) - 1) << component; - - unsigned gs_out_streams; - if (var->data.stream & NIR_STREAM_PACKED) { - gs_out_streams = var->data.stream & ~NIR_STREAM_PACKED; - } else { - assert(var->data.stream < 4); - gs_out_streams = 0; - for (unsigned j = 0; j < num_components; ++j) - gs_out_streams |= var->data.stream << (2 * (component + j)); - } - - unsigned streamx = gs_out_streams & 3; - unsigned streamy = (gs_out_streams >> 2) & 3; - unsigned streamz = (gs_out_streams >> 4) & 3; - unsigned streamw = (gs_out_streams >> 6) & 3; - - if (usagemask & TGSI_WRITEMASK_X) { - info->output_streams[drv_location] |= streamx; - info->num_stream_output_components[streamx]++; - } - if (usagemask & TGSI_WRITEMASK_Y) { - info->output_streams[drv_location] |= streamy << 2; - info->num_stream_output_components[streamy]++; - } - if (usagemask & TGSI_WRITEMASK_Z) { - info->output_streams[drv_location] |= streamz << 4; - info->num_stream_output_components[streamz]++; - } - if (usagemask & TGSI_WRITEMASK_W) { - info->output_streams[drv_location] |= streamw << 6; - info->num_stream_output_components[streamw]++; - } - - info->output_semantic_name[drv_location] = semantic_name; - info->output_semantic_index[drv_location] = semantic_index; - - switch (semantic_name) { - case TGSI_SEMANTIC_PRIMID: - info->writes_primid = true; - break; - case TGSI_SEMANTIC_VIEWPORT_INDEX: - info->writes_viewport_index = true; - break; - case TGSI_SEMANTIC_LAYER: - info->writes_layer = true; - break; - case TGSI_SEMANTIC_PSIZE: - info->writes_psize = true; - break; - case TGSI_SEMANTIC_CLIPVERTEX: - info->writes_clipvertex = true; - break; - case TGSI_SEMANTIC_COLOR: - info->colors_written |= 1 << semantic_index; - break; - case TGSI_SEMANTIC_STENCIL: - info->writes_stencil = true; - break; - case TGSI_SEMANTIC_SAMPLEMASK: - info->writes_samplemask = true; - break; - case TGSI_SEMANTIC_EDGEFLAG: - info->writes_edgeflag = true; - break; 
- case TGSI_SEMANTIC_POSITION: - if (info->processor == PIPE_SHADER_FRAGMENT) - info->writes_z = true; - else - info->writes_position = true; - break; - } + assert(component + num_components <= 4); + assert(component < 4); + + unsigned semantic_name, semantic_index; + + unsigned location = var->data.location + var_idx; + unsigned drv_location = var->data.driver_location + var_idx; + + if (info->processor == PIPE_SHADER_FRAGMENT) { + tgsi_get_gl_frag_result_semantic(location, &semantic_name, &semantic_index); + + /* Adjust for dual source blending */ + if (var->data.index > 0) { + semantic_index++; + } + } else { + tgsi_get_gl_varying_semantic(location, true, &semantic_name, &semantic_index); + } + + ubyte usagemask = ((1 << num_components) - 1) << component; + + unsigned gs_out_streams; + if (var->data.stream & NIR_STREAM_PACKED) { + gs_out_streams = var->data.stream & ~NIR_STREAM_PACKED; + } else { + assert(var->data.stream < 4); + gs_out_streams = 0; + for (unsigned j = 0; j < num_components; ++j) + gs_out_streams |= var->data.stream << (2 * (component + j)); + } + + unsigned streamx = gs_out_streams & 3; + unsigned streamy = (gs_out_streams >> 2) & 3; + unsigned streamz = (gs_out_streams >> 4) & 3; + unsigned streamw = (gs_out_streams >> 6) & 3; + + if (usagemask & TGSI_WRITEMASK_X) { + info->output_streams[drv_location] |= streamx; + info->num_stream_output_components[streamx]++; + } + if (usagemask & TGSI_WRITEMASK_Y) { + info->output_streams[drv_location] |= streamy << 2; + info->num_stream_output_components[streamy]++; + } + if (usagemask & TGSI_WRITEMASK_Z) { + info->output_streams[drv_location] |= streamz << 4; + info->num_stream_output_components[streamz]++; + } + if (usagemask & TGSI_WRITEMASK_W) { + info->output_streams[drv_location] |= streamw << 6; + info->num_stream_output_components[streamw]++; + } + + info->output_semantic_name[drv_location] = semantic_name; + info->output_semantic_index[drv_location] = semantic_index; + + switch (semantic_name) { + case TGSI_SEMANTIC_PRIMID: + info->writes_primid = true; + break; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + info->writes_viewport_index = true; + break; + case TGSI_SEMANTIC_LAYER: + info->writes_layer = true; + break; + case TGSI_SEMANTIC_PSIZE: + info->writes_psize = true; + break; + case TGSI_SEMANTIC_CLIPVERTEX: + info->writes_clipvertex = true; + break; + case TGSI_SEMANTIC_COLOR: + info->colors_written |= 1 << semantic_index; + break; + case TGSI_SEMANTIC_STENCIL: + info->writes_stencil = true; + break; + case TGSI_SEMANTIC_SAMPLEMASK: + info->writes_samplemask = true; + break; + case TGSI_SEMANTIC_EDGEFLAG: + info->writes_edgeflag = true; + break; + case TGSI_SEMANTIC_POSITION: + if (info->processor == PIPE_SHADER_FRAGMENT) + info->writes_z = true; + else + info->writes_position = true; + break; + } } -static void scan_output_helper(const nir_variable *var, - unsigned location, - const struct glsl_type *type, - struct si_shader_info *info) +static void scan_output_helper(const nir_variable *var, unsigned location, + const struct glsl_type *type, struct si_shader_info *info) { - if (glsl_type_is_struct(type) || glsl_type_is_interface(type)) { - for (unsigned i = 0; i < glsl_get_length(type); i++) { - const struct glsl_type *ft = glsl_get_struct_field(type, i); - scan_output_helper(var, location, ft, info); - location += glsl_count_attribute_slots(ft, false); - } - } else if (glsl_type_is_array_or_matrix(type)) { - const struct glsl_type *elem_type = - glsl_get_array_element(type); - unsigned num_elems = 
glsl_get_length(type); - if (var->data.compact) { - assert(glsl_type_is_scalar(elem_type)); - assert(glsl_get_bit_size(elem_type) == 32); - unsigned component = var->data.location_frac; - scan_output_slot(var, location, component, - MIN2(num_elems, 4 - component), info); - if (component + num_elems > 4) { - scan_output_slot(var, location + 1, 0, - component + num_elems - 4, info); - } - - } else { - unsigned elem_count = glsl_count_attribute_slots(elem_type, false); - for (unsigned i = 0; i < num_elems; i++) { - scan_output_helper(var, location, elem_type, info); - location += elem_count; - } - } - } else if (glsl_type_is_dual_slot(type)) { - unsigned component = var->data.location_frac; - scan_output_slot(var, location, component, 4 - component, info); - scan_output_slot(var, location + 1, 0, component + 2 * glsl_get_components(type) - 4, - info); - } else { - unsigned component = var->data.location_frac; - assert(glsl_type_is_vector_or_scalar(type)); - unsigned num_components = glsl_get_components(type); - if (glsl_type_is_64bit(type)) - num_components *= 2; - scan_output_slot(var, location, component, num_components, info); - } + if (glsl_type_is_struct(type) || glsl_type_is_interface(type)) { + for (unsigned i = 0; i < glsl_get_length(type); i++) { + const struct glsl_type *ft = glsl_get_struct_field(type, i); + scan_output_helper(var, location, ft, info); + location += glsl_count_attribute_slots(ft, false); + } + } else if (glsl_type_is_array_or_matrix(type)) { + const struct glsl_type *elem_type = glsl_get_array_element(type); + unsigned num_elems = glsl_get_length(type); + if (var->data.compact) { + assert(glsl_type_is_scalar(elem_type)); + assert(glsl_get_bit_size(elem_type) == 32); + unsigned component = var->data.location_frac; + scan_output_slot(var, location, component, MIN2(num_elems, 4 - component), info); + if (component + num_elems > 4) { + scan_output_slot(var, location + 1, 0, component + num_elems - 4, info); + } + + } else { + unsigned elem_count = glsl_count_attribute_slots(elem_type, false); + for (unsigned i = 0; i < num_elems; i++) { + scan_output_helper(var, location, elem_type, info); + location += elem_count; + } + } + } else if (glsl_type_is_dual_slot(type)) { + unsigned component = var->data.location_frac; + scan_output_slot(var, location, component, 4 - component, info); + scan_output_slot(var, location + 1, 0, component + 2 * glsl_get_components(type) - 4, info); + } else { + unsigned component = var->data.location_frac; + assert(glsl_type_is_vector_or_scalar(type)); + unsigned num_components = glsl_get_components(type); + if (glsl_type_is_64bit(type)) + num_components *= 2; + scan_output_slot(var, location, component, num_components, info); + } } -void si_nir_scan_shader(const struct nir_shader *nir, - struct si_shader_info *info) +void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info) { - nir_function *func; - unsigned i; - - info->processor = pipe_shader_type_from_mesa(nir->info.stage); - - info->properties[TGSI_PROPERTY_NEXT_SHADER] = - pipe_shader_type_from_mesa(nir->info.next_stage); - - if (nir->info.stage == MESA_SHADER_VERTEX) { - info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] = - nir->info.vs.window_space_position; - info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] = - nir->info.vs.blit_sgprs_amd; - } - - if (nir->info.stage == MESA_SHADER_TESS_CTRL) { - info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT] = - nir->info.tess.tcs_vertices_out; - } - - if (nir->info.stage == MESA_SHADER_TESS_EVAL) { - if 
(nir->info.tess.primitive_mode == GL_ISOLINES) - info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = PIPE_PRIM_LINES; - else - info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = nir->info.tess.primitive_mode; - - STATIC_ASSERT((TESS_SPACING_EQUAL + 1) % 3 == PIPE_TESS_SPACING_EQUAL); - STATIC_ASSERT((TESS_SPACING_FRACTIONAL_ODD + 1) % 3 == - PIPE_TESS_SPACING_FRACTIONAL_ODD); - STATIC_ASSERT((TESS_SPACING_FRACTIONAL_EVEN + 1) % 3 == - PIPE_TESS_SPACING_FRACTIONAL_EVEN); - - info->properties[TGSI_PROPERTY_TES_SPACING] = (nir->info.tess.spacing + 1) % 3; - info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW] = !nir->info.tess.ccw; - info->properties[TGSI_PROPERTY_TES_POINT_MODE] = nir->info.tess.point_mode; - } - - if (nir->info.stage == MESA_SHADER_GEOMETRY) { - info->properties[TGSI_PROPERTY_GS_INPUT_PRIM] = nir->info.gs.input_primitive; - info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM] = nir->info.gs.output_primitive; - info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] = nir->info.gs.vertices_out; - info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = nir->info.gs.invocations; - } - - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] = - nir->info.fs.early_fragment_tests | nir->info.fs.post_depth_coverage; - info->properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE] = nir->info.fs.post_depth_coverage; - - if (nir->info.fs.pixel_center_integer) { - info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] = - TGSI_FS_COORD_PIXEL_CENTER_INTEGER; - } - - if (nir->info.fs.depth_layout != FRAG_DEPTH_LAYOUT_NONE) { - switch (nir->info.fs.depth_layout) { - case FRAG_DEPTH_LAYOUT_ANY: - info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_ANY; - break; - case FRAG_DEPTH_LAYOUT_GREATER: - info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_GREATER; - break; - case FRAG_DEPTH_LAYOUT_LESS: - info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_LESS; - break; - case FRAG_DEPTH_LAYOUT_UNCHANGED: - info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_UNCHANGED; - break; - default: - unreachable("Unknow depth layout"); - } - } - } - - if (gl_shader_stage_is_compute(nir->info.stage)) { - info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] = nir->info.cs.local_size[0]; - info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] = nir->info.cs.local_size[1]; - info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] = nir->info.cs.local_size[2]; - info->properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD] = nir->info.cs.user_data_components_amd; - } - - i = 0; - uint64_t processed_inputs = 0; - nir_foreach_variable(variable, &nir->inputs) { - unsigned semantic_name, semantic_index; - - const struct glsl_type *type = variable->type; - if (nir_is_per_vertex_io(variable, nir->info.stage)) { - assert(glsl_type_is_array(type)); - type = glsl_get_array_element(type); - } - - unsigned attrib_count = glsl_count_attribute_slots(type, - nir->info.stage == MESA_SHADER_VERTEX); - - i = variable->data.driver_location; - - /* Vertex shader inputs don't have semantics. The state - * tracker has already mapped them to attributes via - * variable->data.driver_location. 
- */ - if (nir->info.stage == MESA_SHADER_VERTEX) - continue; - - for (unsigned j = 0; j < attrib_count; j++, i++) { - - if (processed_inputs & ((uint64_t)1 << i)) - continue; - - processed_inputs |= ((uint64_t)1 << i); - - tgsi_get_gl_varying_semantic(variable->data.location + j, true, - &semantic_name, &semantic_index); - - info->input_semantic_name[i] = semantic_name; - info->input_semantic_index[i] = semantic_index; - - if (semantic_name == TGSI_SEMANTIC_PRIMID) - info->uses_primid = true; - - if (semantic_name == TGSI_SEMANTIC_COLOR) { - /* We only need this for color inputs. */ - if (variable->data.sample) - info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_SAMPLE; - else if (variable->data.centroid) - info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTROID; - else - info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTER; - } - - enum glsl_base_type base_type = - glsl_get_base_type(glsl_without_array(variable->type)); - - switch (variable->data.interpolation) { - case INTERP_MODE_NONE: - if (glsl_base_type_is_integer(base_type)) { - info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT; - break; - } - - if (semantic_name == TGSI_SEMANTIC_COLOR) { - info->input_interpolate[i] = TGSI_INTERPOLATE_COLOR; - break; - } - /* fall-through */ - - case INTERP_MODE_SMOOTH: - assert(!glsl_base_type_is_integer(base_type)); - - info->input_interpolate[i] = TGSI_INTERPOLATE_PERSPECTIVE; - break; - - case INTERP_MODE_NOPERSPECTIVE: - assert(!glsl_base_type_is_integer(base_type)); - - info->input_interpolate[i] = TGSI_INTERPOLATE_LINEAR; - break; - - case INTERP_MODE_FLAT: - info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT; - break; - } - } - } - - nir_foreach_variable(variable, &nir->outputs) { - const struct glsl_type *type = variable->type; - if (nir_is_per_vertex_io(variable, nir->info.stage)) { - assert(glsl_type_is_array(type)); - type = glsl_get_array_element(type); - } - - ASSERTED unsigned attrib_count = glsl_count_attribute_slots(type, false); - scan_output_helper(variable, 0, type, info); - - unsigned loc = variable->data.location; - if (nir->info.stage == MESA_SHADER_FRAGMENT && - loc == FRAG_RESULT_COLOR && - nir->info.outputs_written & (1ull << loc)) { - assert(attrib_count == 1); - info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] = true; - } - } - - info->num_inputs = nir->num_inputs; - info->num_outputs = nir->num_outputs; - - info->constbuf0_num_slots = nir->num_uniforms; - info->shader_buffers_declared = u_bit_consecutive(0, nir->info.num_ssbos); - info->const_buffers_declared = u_bit_consecutive(1, nir->info.num_ubos); - if (nir->num_uniforms > 0) - info->const_buffers_declared |= 1; - info->images_declared = u_bit_consecutive(0, nir->info.num_images); - info->msaa_images_declared = u_bit_consecutive(0, nir->info.last_msaa_image + 1); - info->samplers_declared = nir->info.textures_used; - - info->num_written_clipdistance = nir->info.clip_distance_array_size; - info->num_written_culldistance = nir->info.cull_distance_array_size; - info->clipdist_writemask = u_bit_consecutive(0, info->num_written_clipdistance); - info->culldist_writemask = u_bit_consecutive(0, info->num_written_culldistance); - - if (info->processor == PIPE_SHADER_FRAGMENT) - info->uses_kill = nir->info.fs.uses_discard; - - if (nir->info.stage == MESA_SHADER_TESS_CTRL) { - info->tessfactors_are_def_in_all_invocs = - ac_are_tessfactors_def_in_all_invocs(nir); - } - - func = (struct nir_function *)exec_list_get_head_const(&nir->functions); - nir_foreach_block(block, func->impl) { - 
nir_foreach_instr(instr, block) - scan_instruction(nir, info, instr); - } + nir_function *func; + unsigned i; + + info->processor = pipe_shader_type_from_mesa(nir->info.stage); + + info->properties[TGSI_PROPERTY_NEXT_SHADER] = pipe_shader_type_from_mesa(nir->info.next_stage); + + if (nir->info.stage == MESA_SHADER_VERTEX) { + info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] = nir->info.vs.window_space_position; + info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] = nir->info.vs.blit_sgprs_amd; + } + + if (nir->info.stage == MESA_SHADER_TESS_CTRL) { + info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT] = nir->info.tess.tcs_vertices_out; + } + + if (nir->info.stage == MESA_SHADER_TESS_EVAL) { + if (nir->info.tess.primitive_mode == GL_ISOLINES) + info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = PIPE_PRIM_LINES; + else + info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = nir->info.tess.primitive_mode; + + STATIC_ASSERT((TESS_SPACING_EQUAL + 1) % 3 == PIPE_TESS_SPACING_EQUAL); + STATIC_ASSERT((TESS_SPACING_FRACTIONAL_ODD + 1) % 3 == PIPE_TESS_SPACING_FRACTIONAL_ODD); + STATIC_ASSERT((TESS_SPACING_FRACTIONAL_EVEN + 1) % 3 == PIPE_TESS_SPACING_FRACTIONAL_EVEN); + + info->properties[TGSI_PROPERTY_TES_SPACING] = (nir->info.tess.spacing + 1) % 3; + info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW] = !nir->info.tess.ccw; + info->properties[TGSI_PROPERTY_TES_POINT_MODE] = nir->info.tess.point_mode; + } + + if (nir->info.stage == MESA_SHADER_GEOMETRY) { + info->properties[TGSI_PROPERTY_GS_INPUT_PRIM] = nir->info.gs.input_primitive; + info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM] = nir->info.gs.output_primitive; + info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] = nir->info.gs.vertices_out; + info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = nir->info.gs.invocations; + } + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] = + nir->info.fs.early_fragment_tests | nir->info.fs.post_depth_coverage; + info->properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE] = nir->info.fs.post_depth_coverage; + + if (nir->info.fs.pixel_center_integer) { + info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] = TGSI_FS_COORD_PIXEL_CENTER_INTEGER; + } + + if (nir->info.fs.depth_layout != FRAG_DEPTH_LAYOUT_NONE) { + switch (nir->info.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_ANY: + info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_ANY; + break; + case FRAG_DEPTH_LAYOUT_GREATER: + info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_GREATER; + break; + case FRAG_DEPTH_LAYOUT_LESS: + info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_LESS; + break; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_UNCHANGED; + break; + default: + unreachable("Unknow depth layout"); + } + } + } + + if (gl_shader_stage_is_compute(nir->info.stage)) { + info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] = nir->info.cs.local_size[0]; + info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] = nir->info.cs.local_size[1]; + info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] = nir->info.cs.local_size[2]; + info->properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD] = + nir->info.cs.user_data_components_amd; + } + + i = 0; + uint64_t processed_inputs = 0; + nir_foreach_variable (variable, &nir->inputs) { + unsigned semantic_name, semantic_index; + + const struct glsl_type *type = variable->type; + if (nir_is_per_vertex_io(variable, nir->info.stage)) { + 
assert(glsl_type_is_array(type)); + type = glsl_get_array_element(type); + } + + unsigned attrib_count = + glsl_count_attribute_slots(type, nir->info.stage == MESA_SHADER_VERTEX); + + i = variable->data.driver_location; + + /* Vertex shader inputs don't have semantics. The state + * tracker has already mapped them to attributes via + * variable->data.driver_location. + */ + if (nir->info.stage == MESA_SHADER_VERTEX) + continue; + + for (unsigned j = 0; j < attrib_count; j++, i++) { + + if (processed_inputs & ((uint64_t)1 << i)) + continue; + + processed_inputs |= ((uint64_t)1 << i); + + tgsi_get_gl_varying_semantic(variable->data.location + j, true, &semantic_name, + &semantic_index); + + info->input_semantic_name[i] = semantic_name; + info->input_semantic_index[i] = semantic_index; + + if (semantic_name == TGSI_SEMANTIC_PRIMID) + info->uses_primid = true; + + if (semantic_name == TGSI_SEMANTIC_COLOR) { + /* We only need this for color inputs. */ + if (variable->data.sample) + info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_SAMPLE; + else if (variable->data.centroid) + info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTROID; + else + info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTER; + } + + enum glsl_base_type base_type = glsl_get_base_type(glsl_without_array(variable->type)); + + switch (variable->data.interpolation) { + case INTERP_MODE_NONE: + if (glsl_base_type_is_integer(base_type)) { + info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT; + break; + } + + if (semantic_name == TGSI_SEMANTIC_COLOR) { + info->input_interpolate[i] = TGSI_INTERPOLATE_COLOR; + break; + } + /* fall-through */ + + case INTERP_MODE_SMOOTH: + assert(!glsl_base_type_is_integer(base_type)); + + info->input_interpolate[i] = TGSI_INTERPOLATE_PERSPECTIVE; + break; + + case INTERP_MODE_NOPERSPECTIVE: + assert(!glsl_base_type_is_integer(base_type)); + + info->input_interpolate[i] = TGSI_INTERPOLATE_LINEAR; + break; + + case INTERP_MODE_FLAT: + info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT; + break; + } + } + } + + nir_foreach_variable (variable, &nir->outputs) { + const struct glsl_type *type = variable->type; + if (nir_is_per_vertex_io(variable, nir->info.stage)) { + assert(glsl_type_is_array(type)); + type = glsl_get_array_element(type); + } + + ASSERTED unsigned attrib_count = glsl_count_attribute_slots(type, false); + scan_output_helper(variable, 0, type, info); + + unsigned loc = variable->data.location; + if (nir->info.stage == MESA_SHADER_FRAGMENT && loc == FRAG_RESULT_COLOR && + nir->info.outputs_written & (1ull << loc)) { + assert(attrib_count == 1); + info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] = true; + } + } + + info->num_inputs = nir->num_inputs; + info->num_outputs = nir->num_outputs; + + info->constbuf0_num_slots = nir->num_uniforms; + info->shader_buffers_declared = u_bit_consecutive(0, nir->info.num_ssbos); + info->const_buffers_declared = u_bit_consecutive(1, nir->info.num_ubos); + if (nir->num_uniforms > 0) + info->const_buffers_declared |= 1; + info->images_declared = u_bit_consecutive(0, nir->info.num_images); + info->msaa_images_declared = u_bit_consecutive(0, nir->info.last_msaa_image + 1); + info->samplers_declared = nir->info.textures_used; + + info->num_written_clipdistance = nir->info.clip_distance_array_size; + info->num_written_culldistance = nir->info.cull_distance_array_size; + info->clipdist_writemask = u_bit_consecutive(0, info->num_written_clipdistance); + info->culldist_writemask = u_bit_consecutive(0, 
info->num_written_culldistance); + + if (info->processor == PIPE_SHADER_FRAGMENT) + info->uses_kill = nir->info.fs.uses_discard; + + if (nir->info.stage == MESA_SHADER_TESS_CTRL) { + info->tessfactors_are_def_in_all_invocs = ac_are_tessfactors_def_in_all_invocs(nir); + } + + func = (struct nir_function *)exec_list_get_head_const(&nir->functions); + nir_foreach_block (block, func->impl) { + nir_foreach_instr (instr, block) + scan_instruction(nir, info, instr); + } } -static void -si_nir_opts(struct nir_shader *nir) +static void si_nir_opts(struct nir_shader *nir) { - bool progress; - - do { - progress = false; - - NIR_PASS_V(nir, nir_lower_vars_to_ssa); - - NIR_PASS(progress, nir, nir_opt_copy_prop_vars); - NIR_PASS(progress, nir, nir_opt_dead_write_vars); - - NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); - NIR_PASS_V(nir, nir_lower_phis_to_scalar); - - /* (Constant) copy propagation is needed for txf with offsets. */ - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_remove_phis); - NIR_PASS(progress, nir, nir_opt_dce); - if (nir_opt_trivial_continues(nir)) { - progress = true; - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - } - NIR_PASS(progress, nir, nir_opt_if, true); - NIR_PASS(progress, nir, nir_opt_dead_cf); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true); - - /* Needed for algebraic lowering */ - NIR_PASS(progress, nir, nir_opt_algebraic); - NIR_PASS(progress, nir, nir_opt_constant_folding); - - if (!nir->info.flrp_lowered) { - unsigned lower_flrp = - (nir->options->lower_flrp16 ? 16 : 0) | - (nir->options->lower_flrp32 ? 32 : 0) | - (nir->options->lower_flrp64 ? 64 : 0); - assert(lower_flrp); - bool lower_flrp_progress = false; - - NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, - lower_flrp, - false /* always_precise */, - nir->options->lower_ffma); - if (lower_flrp_progress) { - NIR_PASS(progress, nir, - nir_opt_constant_folding); - progress = true; - } - - /* Nothing should rematerialize any flrps, so we only - * need to do this lowering once. - */ - nir->info.flrp_lowered = true; - } - - NIR_PASS(progress, nir, nir_opt_undef); - NIR_PASS(progress, nir, nir_opt_conditional_discard); - if (nir->options->max_unroll_iterations) { - NIR_PASS(progress, nir, nir_opt_loop_unroll, 0); - } - } while (progress); + bool progress; + + do { + progress = false; + + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + + NIR_PASS(progress, nir, nir_opt_copy_prop_vars); + NIR_PASS(progress, nir, nir_opt_dead_write_vars); + + NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS_V(nir, nir_lower_phis_to_scalar); + + /* (Constant) copy propagation is needed for txf with offsets. */ + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_remove_phis); + NIR_PASS(progress, nir, nir_opt_dce); + if (nir_opt_trivial_continues(nir)) { + progress = true; + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + } + NIR_PASS(progress, nir, nir_opt_if, true); + NIR_PASS(progress, nir, nir_opt_dead_cf); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true); + + /* Needed for algebraic lowering */ + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + + if (!nir->info.flrp_lowered) { + unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) | + (nir->options->lower_flrp32 ? 32 : 0) | + (nir->options->lower_flrp64 ? 
64 : 0); + assert(lower_flrp); + bool lower_flrp_progress = false; + + NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */, + nir->options->lower_ffma); + if (lower_flrp_progress) { + NIR_PASS(progress, nir, nir_opt_constant_folding); + progress = true; + } + + /* Nothing should rematerialize any flrps, so we only + * need to do this lowering once. + */ + nir->info.flrp_lowered = true; + } + + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_opt_conditional_discard); + if (nir->options->max_unroll_iterations) { + NIR_PASS(progress, nir, nir_opt_loop_unroll, 0); + } + } while (progress); } -static int -type_size_vec4(const struct glsl_type *type, bool bindless) +static int type_size_vec4(const struct glsl_type *type, bool bindless) { - return glsl_count_attribute_slots(type, false); + return glsl_count_attribute_slots(type, false); } -static void -si_nir_lower_color(nir_shader *nir) +static void si_nir_lower_color(nir_shader *nir) { - nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir); - - nir_builder b; - nir_builder_init(&b, entrypoint); - - nir_foreach_block(block, entrypoint) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = - nir_instr_as_intrinsic(instr); - - if (intrin->intrinsic != nir_intrinsic_load_deref) - continue; - - nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); - if (deref->mode != nir_var_shader_in) - continue; - - b.cursor = nir_before_instr(instr); - nir_variable *var = nir_deref_instr_get_variable(deref); - nir_ssa_def *def; - - if (var->data.location == VARYING_SLOT_COL0) { - def = nir_load_color0(&b); - } else if (var->data.location == VARYING_SLOT_COL1) { - def = nir_load_color1(&b); - } else { - continue; - } - - nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(def)); - nir_instr_remove(instr); - } - } + nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir); + + nir_builder b; + nir_builder_init(&b, entrypoint); + + nir_foreach_block (block, entrypoint) { + nir_foreach_instr_safe (instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic != nir_intrinsic_load_deref) + continue; + + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + if (deref->mode != nir_var_shader_in) + continue; + + b.cursor = nir_before_instr(instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + nir_ssa_def *def; + + if (var->data.location == VARYING_SLOT_COL0) { + def = nir_load_color0(&b); + } else if (var->data.location == VARYING_SLOT_COL1) { + def = nir_load_color1(&b); + } else { + continue; + } + + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(def)); + nir_instr_remove(instr); + } + } } static void si_nir_lower_ps_inputs(struct nir_shader *nir) { - if (nir->info.stage != MESA_SHADER_FRAGMENT) - return; - - NIR_PASS_V(nir, nir_lower_io_to_temporaries, - nir_shader_get_entrypoint(nir), false, true); - - /* Since we're doing nir_lower_io_to_temporaries late, we need - * to lower all the copy_deref's introduced by - * lower_io_to_temporaries before calling nir_lower_io. 
- */ - NIR_PASS_V(nir, nir_split_var_copies); - NIR_PASS_V(nir, nir_lower_var_copies); - NIR_PASS_V(nir, nir_lower_global_vars_to_local); - - si_nir_lower_color(nir); - NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0); - - /* This pass needs actual constants */ - NIR_PASS_V(nir, nir_opt_constant_folding); - NIR_PASS_V(nir, nir_io_add_const_offset_to_base, - nir_var_shader_in); + if (nir->info.stage != MESA_SHADER_FRAGMENT) + return; + + NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), false, true); + + /* Since we're doing nir_lower_io_to_temporaries late, we need + * to lower all the copy_deref's introduced by + * lower_io_to_temporaries before calling nir_lower_io. + */ + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_global_vars_to_local); + + si_nir_lower_color(nir); + NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0); + + /* This pass needs actual constants */ + NIR_PASS_V(nir, nir_opt_constant_folding); + NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in); } void si_nir_adjust_driver_locations(struct nir_shader *nir) { - /* Adjust the driver location of inputs and outputs. The state tracker - * interprets them as slots, while the ac/nir backend interprets them - * as individual components. - */ - if (nir->info.stage != MESA_SHADER_FRAGMENT) { - nir_foreach_variable(variable, &nir->inputs) - variable->data.driver_location *= 4; - } - - nir_foreach_variable(variable, &nir->outputs) - variable->data.driver_location *= 4; + /* Adjust the driver location of inputs and outputs. The state tracker + * interprets them as slots, while the ac/nir backend interprets them + * as individual components. + */ + if (nir->info.stage != MESA_SHADER_FRAGMENT) { + nir_foreach_variable (variable, &nir->inputs) + variable->data.driver_location *= 4; + } + + nir_foreach_variable (variable, &nir->outputs) + variable->data.driver_location *= 4; } /** @@ -938,65 +889,64 @@ void si_nir_adjust_driver_locations(struct nir_shader *nir) */ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir) { - /* Perform lowerings (and optimizations) of code. - * - * Performance considerations aside, we must: - * - lower certain ALU operations - * - ensure constant offsets for texture instructions are folded - * and copy-propagated - */ - - static const struct nir_lower_tex_options lower_tex_options = { - .lower_txp = ~0u, - }; - NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options); - - const nir_lower_subgroups_options subgroups_options = { - .subgroup_size = 64, - .ballot_bit_size = 64, - .lower_to_scalar = true, - .lower_subgroup_masks = true, - .lower_vote_trivial = false, - .lower_vote_eq_to_ballot = true, - }; - NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options); - - /* Lower load constants to scalar and then clean up the mess */ - NIR_PASS_V(nir, nir_lower_load_const_to_scalar); - NIR_PASS_V(nir, nir_lower_var_copies); - NIR_PASS_V(nir, nir_lower_pack); - NIR_PASS_V(nir, nir_opt_access); - si_nir_opts(nir); - - /* Lower large variables that are always constant with load_constant - * intrinsics, which get turned into PC-relative loads from a data - * section next to the shader. - * - * st/mesa calls finalize_nir twice, but we can't call this pass twice. 
- */ - bool changed = false; - if (!nir->constant_data) { - NIR_PASS(changed, nir, nir_opt_large_constants, - glsl_get_natural_size_align_bytes, 16); - } - - changed |= ac_lower_indirect_derefs(nir, sscreen->info.chip_class); - if (changed) - si_nir_opts(nir); - - NIR_PASS_V(nir, nir_lower_bool_to_int32); - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp); - - if (sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) - NIR_PASS_V(nir, nir_lower_discard_to_demote); + /* Perform lowerings (and optimizations) of code. + * + * Performance considerations aside, we must: + * - lower certain ALU operations + * - ensure constant offsets for texture instructions are folded + * and copy-propagated + */ + + static const struct nir_lower_tex_options lower_tex_options = { + .lower_txp = ~0u, + }; + NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options); + + const nir_lower_subgroups_options subgroups_options = { + .subgroup_size = 64, + .ballot_bit_size = 64, + .lower_to_scalar = true, + .lower_subgroup_masks = true, + .lower_vote_trivial = false, + .lower_vote_eq_to_ballot = true, + }; + NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options); + + /* Lower load constants to scalar and then clean up the mess */ + NIR_PASS_V(nir, nir_lower_load_const_to_scalar); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_pack); + NIR_PASS_V(nir, nir_opt_access); + si_nir_opts(nir); + + /* Lower large variables that are always constant with load_constant + * intrinsics, which get turned into PC-relative loads from a data + * section next to the shader. + * + * st/mesa calls finalize_nir twice, but we can't call this pass twice. + */ + bool changed = false; + if (!nir->constant_data) { + NIR_PASS(changed, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16); + } + + changed |= ac_lower_indirect_derefs(nir, sscreen->info.chip_class); + if (changed) + si_nir_opts(nir); + + NIR_PASS_V(nir, nir_lower_bool_to_int32); + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp); + + if (sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) + NIR_PASS_V(nir, nir_lower_discard_to_demote); } void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize) { - struct si_screen *sscreen = (struct si_screen *)screen; - struct nir_shader *nir = (struct nir_shader *)nirptr; + struct si_screen *sscreen = (struct si_screen *)screen; + struct nir_shader *nir = (struct nir_shader *)nirptr; - nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); - si_nir_lower_ps_inputs(nir); - si_lower_nir(sscreen, nir); + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + si_nir_lower_ps_inputs(nir); + si_lower_nir(sscreen, nir); } diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c index 30cca361ac4..e5fd089b59f 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c @@ -26,68 +26,59 @@ #include "tgsi/tgsi_text.h" #include "tgsi/tgsi_ureg.h" -void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, - unsigned num_layers) +void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers) { - unsigned vs_blit_property; - void **vs; - - switch (type) { - case UTIL_BLITTER_ATTRIB_NONE: - vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : - &sctx->vs_blit_pos; - vs_blit_property = SI_VS_BLIT_SGPRS_POS; - break; - case UTIL_BLITTER_ATTRIB_COLOR: - vs = num_layers > 1 ? 
&sctx->vs_blit_color_layered : - &sctx->vs_blit_color; - vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR; - break; - case UTIL_BLITTER_ATTRIB_TEXCOORD_XY: - case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW: - assert(num_layers == 1); - vs = &sctx->vs_blit_texcoord; - vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD; - break; - default: - assert(0); - return NULL; - } - if (*vs) - return *vs; - - struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX); - if (!ureg) - return NULL; - - /* Tell the shader to load VS inputs from SGPRs: */ - ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS_AMD, vs_blit_property); - ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true); - - /* This is just a pass-through shader with 1-3 MOV instructions. */ - ureg_MOV(ureg, - ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), - ureg_DECL_vs_input(ureg, 0)); - - if (type != UTIL_BLITTER_ATTRIB_NONE) { - ureg_MOV(ureg, - ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0), - ureg_DECL_vs_input(ureg, 1)); - } - - if (num_layers > 1) { - struct ureg_src instance_id = - ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0); - struct ureg_dst layer = - ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0); - - ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X), - ureg_scalar(instance_id, TGSI_SWIZZLE_X)); - } - ureg_END(ureg); - - *vs = ureg_create_shader_and_destroy(ureg, &sctx->b); - return *vs; + unsigned vs_blit_property; + void **vs; + + switch (type) { + case UTIL_BLITTER_ATTRIB_NONE: + vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : &sctx->vs_blit_pos; + vs_blit_property = SI_VS_BLIT_SGPRS_POS; + break; + case UTIL_BLITTER_ATTRIB_COLOR: + vs = num_layers > 1 ? &sctx->vs_blit_color_layered : &sctx->vs_blit_color; + vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR; + break; + case UTIL_BLITTER_ATTRIB_TEXCOORD_XY: + case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW: + assert(num_layers == 1); + vs = &sctx->vs_blit_texcoord; + vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD; + break; + default: + assert(0); + return NULL; + } + if (*vs) + return *vs; + + struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX); + if (!ureg) + return NULL; + + /* Tell the shader to load VS inputs from SGPRs: */ + ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS_AMD, vs_blit_property); + ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true); + + /* This is just a pass-through shader with 1-3 MOV instructions. 
*/ + ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), ureg_DECL_vs_input(ureg, 0)); + + if (type != UTIL_BLITTER_ATTRIB_NONE) { + ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0), ureg_DECL_vs_input(ureg, 1)); + } + + if (num_layers > 1) { + struct ureg_src instance_id = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0); + struct ureg_dst layer = ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0); + + ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X), + ureg_scalar(instance_id, TGSI_SWIZZLE_X)); + } + ureg_END(ureg); + + *vs = ureg_create_shader_and_destroy(ureg, &sctx->b); + return *vs; } /** @@ -97,137 +88,128 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, */ void *si_create_fixed_func_tcs(struct si_context *sctx) { - struct ureg_src outer, inner; - struct ureg_dst tessouter, tessinner; - struct ureg_program *ureg = ureg_create(PIPE_SHADER_TESS_CTRL); + struct ureg_src outer, inner; + struct ureg_dst tessouter, tessinner; + struct ureg_program *ureg = ureg_create(PIPE_SHADER_TESS_CTRL); - if (!ureg) - return NULL; + if (!ureg) + return NULL; - outer = ureg_DECL_system_value(ureg, - TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL, 0); - inner = ureg_DECL_system_value(ureg, - TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL, 0); + outer = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL, 0); + inner = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL, 0); - tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0); - tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0); + tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0); + tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0); - ureg_MOV(ureg, tessouter, outer); - ureg_MOV(ureg, tessinner, inner); - ureg_END(ureg); + ureg_MOV(ureg, tessouter, outer); + ureg_MOV(ureg, tessinner, inner); + ureg_END(ureg); - return ureg_create_shader_and_destroy(ureg, &sctx->b); + return ureg_create_shader_and_destroy(ureg, &sctx->b); } /* Create a compute shader implementing clear_buffer or copy_buffer. */ -void *si_create_dma_compute_shader(struct pipe_context *ctx, - unsigned num_dwords_per_thread, - bool dst_stream_cache_policy, bool is_copy) +void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread, + bool dst_stream_cache_policy, bool is_copy) { - struct si_screen *sscreen = (struct si_screen *)ctx->screen; - assert(util_is_power_of_two_nonzero(num_dwords_per_thread)); - - unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT; - if (dst_stream_cache_policy) - store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY; - - /* Don't cache loads, because there is no reuse. 
*/ - unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY; - - unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4); - unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned)); - - for (unsigned i = 0; i < num_mem_ops; i++) { - if (i*4 < num_dwords_per_thread) - inst_dwords[i] = MIN2(4, num_dwords_per_thread - i*4); - } - - struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); - if (!ureg) - return NULL; - - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, sscreen->compute_wave_size); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); - - struct ureg_src value; - if (!is_copy) { - ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]); - value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0); - } - - struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); - struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); - struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); - struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); - struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false)); - struct ureg_src srcbuf; - struct ureg_src *values = NULL; - - if (is_copy) { - srcbuf = ureg_DECL_buffer(ureg, 1, false); - values = malloc(num_mem_ops * sizeof(struct ureg_src)); - } - - /* If there are multiple stores, the first store writes into 0*wavesize+tid, - * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc. - */ - ureg_UMAD(ureg, store_addr, blk, - ureg_imm1u(ureg, sscreen->compute_wave_size * num_mem_ops), tid); - /* Convert from a "store size unit" into bytes. */ - ureg_UMUL(ureg, store_addr, ureg_src(store_addr), - ureg_imm1u(ureg, 4 * inst_dwords[0])); - ureg_MOV(ureg, load_addr, ureg_src(store_addr)); - - /* Distance between a load and a store for latency hiding. */ - unsigned load_store_distance = is_copy ? 8 : 0; - - for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) { - int d = i - load_store_distance; - - if (is_copy && i < num_mem_ops) { - if (i) { - ureg_UADD(ureg, load_addr, ureg_src(load_addr), - ureg_imm1u(ureg, 4 * inst_dwords[i] * - sscreen->compute_wave_size)); - } - - values[i] = ureg_src(ureg_DECL_temporary(ureg)); - struct ureg_dst dst = - ureg_writemask(ureg_dst(values[i]), - u_bit_consecutive(0, inst_dwords[i])); - struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)}; - ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, - load_qualifier, TGSI_TEXTURE_BUFFER, 0); - } - - if (d >= 0) { - if (d) { - ureg_UADD(ureg, store_addr, ureg_src(store_addr), - ureg_imm1u(ureg, 4 * inst_dwords[d] * - sscreen->compute_wave_size)); - } - - struct ureg_dst dst = - ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d])); - struct ureg_src srcs[] = - {ureg_src(store_addr), is_copy ? 
values[d] : value}; - ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, - store_qualifier, TGSI_TEXTURE_BUFFER, 0); - } - } - ureg_END(ureg); - - struct pipe_compute_state state = {}; - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = ureg_get_tokens(ureg, NULL); - - void *cs = ctx->create_compute_state(ctx, &state); - ureg_destroy(ureg); - ureg_free_tokens(state.prog); - - free(values); - return cs; + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + assert(util_is_power_of_two_nonzero(num_dwords_per_thread)); + + unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT; + if (dst_stream_cache_policy) + store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY; + + /* Don't cache loads, because there is no reuse. */ + unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY; + + unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4); + unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned)); + + for (unsigned i = 0; i < num_mem_ops; i++) { + if (i * 4 < num_dwords_per_thread) + inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4); + } + + struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); + if (!ureg) + return NULL; + + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, sscreen->compute_wave_size); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); + + struct ureg_src value; + if (!is_copy) { + ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]); + value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0); + } + + struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); + struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); + struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); + struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); + struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false)); + struct ureg_src srcbuf; + struct ureg_src *values = NULL; + + if (is_copy) { + srcbuf = ureg_DECL_buffer(ureg, 1, false); + values = malloc(num_mem_ops * sizeof(struct ureg_src)); + } + + /* If there are multiple stores, the first store writes into 0*wavesize+tid, + * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc. + */ + ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, sscreen->compute_wave_size * num_mem_ops), + tid); + /* Convert from a "store size unit" into bytes. */ + ureg_UMUL(ureg, store_addr, ureg_src(store_addr), ureg_imm1u(ureg, 4 * inst_dwords[0])); + ureg_MOV(ureg, load_addr, ureg_src(store_addr)); + + /* Distance between a load and a store for latency hiding. */ + unsigned load_store_distance = is_copy ? 
8 : 0; + + for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) { + int d = i - load_store_distance; + + if (is_copy && i < num_mem_ops) { + if (i) { + ureg_UADD(ureg, load_addr, ureg_src(load_addr), + ureg_imm1u(ureg, 4 * inst_dwords[i] * sscreen->compute_wave_size)); + } + + values[i] = ureg_src(ureg_DECL_temporary(ureg)); + struct ureg_dst dst = + ureg_writemask(ureg_dst(values[i]), u_bit_consecutive(0, inst_dwords[i])); + struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)}; + ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, load_qualifier, + TGSI_TEXTURE_BUFFER, 0); + } + + if (d >= 0) { + if (d) { + ureg_UADD(ureg, store_addr, ureg_src(store_addr), + ureg_imm1u(ureg, 4 * inst_dwords[d] * sscreen->compute_wave_size)); + } + + struct ureg_dst dst = ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d])); + struct ureg_src srcs[] = {ureg_src(store_addr), is_copy ? values[d] : value}; + ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, store_qualifier, + TGSI_TEXTURE_BUFFER, 0); + } + } + ureg_END(ureg); + + struct pipe_compute_state state = {}; + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = ureg_get_tokens(ureg, NULL); + + void *cs = ctx->create_compute_state(ctx, &state); + ureg_destroy(ureg); + ureg_free_tokens(state.prog); + + free(values); + return cs; } /* Create a compute shader that copies DCC from one buffer to another @@ -240,67 +222,63 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx, */ void *si_create_dcc_retile_cs(struct pipe_context *ctx) { - struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); - if (!ureg) - return NULL; - - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); - - /* Compute the global thread ID (in idx). */ - struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); - struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); - struct ureg_dst idx = ureg_writemask(ureg_DECL_temporary(ureg), - TGSI_WRITEMASK_X); - ureg_UMAD(ureg, idx, blk, ureg_imm1u(ureg, 64), tid); - - /* Load 2 pairs of offsets for DCC load & store. 
*/ - struct ureg_src map = ureg_DECL_image(ureg, 0, TGSI_TEXTURE_BUFFER, 0, false, false); - struct ureg_dst offsets = ureg_DECL_temporary(ureg); - struct ureg_src map_load_args[] = {map, ureg_src(idx)}; - - ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &offsets, 1, map_load_args, 2, - TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0); - - struct ureg_src dcc_src = ureg_DECL_image(ureg, 1, TGSI_TEXTURE_BUFFER, - 0, false, false); - struct ureg_dst dcc_dst = ureg_dst(ureg_DECL_image(ureg, 2, TGSI_TEXTURE_BUFFER, - 0, true, false)); - struct ureg_dst dcc_value[2]; - - /* Copy DCC values: - * dst[offsets.y] = src[offsets.x]; - * dst[offsets.w] = src[offsets.z]; - */ - for (unsigned i = 0; i < 2; i++) { - dcc_value[i] = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); - - struct ureg_src load_args[] = - {dcc_src, ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_X + i*2)}; - ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dcc_value[i], 1, load_args, 2, - TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0); - } - - dcc_dst = ureg_writemask(dcc_dst, TGSI_WRITEMASK_X); - - for (unsigned i = 0; i < 2; i++) { - struct ureg_src store_args[] = { - ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_Y + i*2), - ureg_src(dcc_value[i]) - }; - ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dcc_dst, 1, store_args, 2, - TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0); - } - ureg_END(ureg); - - struct pipe_compute_state state = {}; - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = ureg_get_tokens(ureg, NULL); - - void *cs = ctx->create_compute_state(ctx, &state); - ureg_destroy(ureg); - return cs; + struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); + if (!ureg) + return NULL; + + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); + + /* Compute the global thread ID (in idx). */ + struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); + struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); + struct ureg_dst idx = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); + ureg_UMAD(ureg, idx, blk, ureg_imm1u(ureg, 64), tid); + + /* Load 2 pairs of offsets for DCC load & store. 
*/ + struct ureg_src map = ureg_DECL_image(ureg, 0, TGSI_TEXTURE_BUFFER, 0, false, false); + struct ureg_dst offsets = ureg_DECL_temporary(ureg); + struct ureg_src map_load_args[] = {map, ureg_src(idx)}; + + ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &offsets, 1, map_load_args, 2, TGSI_MEMORY_RESTRICT, + TGSI_TEXTURE_BUFFER, 0); + + struct ureg_src dcc_src = ureg_DECL_image(ureg, 1, TGSI_TEXTURE_BUFFER, 0, false, false); + struct ureg_dst dcc_dst = + ureg_dst(ureg_DECL_image(ureg, 2, TGSI_TEXTURE_BUFFER, 0, true, false)); + struct ureg_dst dcc_value[2]; + + /* Copy DCC values: + * dst[offsets.y] = src[offsets.x]; + * dst[offsets.w] = src[offsets.z]; + */ + for (unsigned i = 0; i < 2; i++) { + dcc_value[i] = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); + + struct ureg_src load_args[] = {dcc_src, + ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_X + i * 2)}; + ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dcc_value[i], 1, load_args, 2, TGSI_MEMORY_RESTRICT, + TGSI_TEXTURE_BUFFER, 0); + } + + dcc_dst = ureg_writemask(dcc_dst, TGSI_WRITEMASK_X); + + for (unsigned i = 0; i < 2; i++) { + struct ureg_src store_args[] = {ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_Y + i * 2), + ureg_src(dcc_value[i])}; + ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dcc_dst, 1, store_args, 2, TGSI_MEMORY_RESTRICT, + TGSI_TEXTURE_BUFFER, 0); + } + ureg_END(ureg); + + struct pipe_compute_state state = {}; + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = ureg_get_tokens(ureg, NULL); + + void *cs = ctx->create_compute_state(ctx, &state); + ureg_destroy(ureg); + return cs; } /* Create the compute shader that is used to collect the results. @@ -337,186 +315,185 @@ void *si_create_dcc_retile_cs(struct pipe_context *ctx) */ void *si_create_query_result_cs(struct si_context *sctx) { - /* TEMP[0].xy = accumulated result so far - * TEMP[0].z = result not available - * - * TEMP[1].x = current result index - * TEMP[1].y = current pair index - */ - static const char text_tmpl[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL BUFFER[0]\n" - "DCL BUFFER[1]\n" - "DCL BUFFER[2]\n" - "DCL CONST[0][0..1]\n" - "DCL TEMP[0..5]\n" - "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n" - "IMM[1] UINT32 {1, 2, 4, 8}\n" - "IMM[2] UINT32 {16, 32, 64, 128}\n" - "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */ - "IMM[4] UINT32 {256, 0, 0, 0}\n" - - "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n" - "UIF TEMP[5]\n" - /* Check result availability. */ - "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n" - "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n" - "MOV TEMP[1], TEMP[0].zzzz\n" - "NOT TEMP[0].z, TEMP[0].zzzz\n" - - /* Load result if available. */ - "UIF TEMP[1]\n" - "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n" - "ENDIF\n" - "ELSE\n" - /* Load previously accumulated result if requested. */ - "MOV TEMP[0], IMM[0].xxxx\n" - "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n" - "UIF TEMP[4]\n" - "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n" - "ENDIF\n" - - "MOV TEMP[1].x, IMM[0].xxxx\n" - "BGNLOOP\n" - /* Break if accumulated result so far is not available. */ - "UIF TEMP[0].zzzz\n" - "BRK\n" - "ENDIF\n" - - /* Break if result_index >= result_count. 
*/ - "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n" - "UIF TEMP[5]\n" - "BRK\n" - "ENDIF\n" - - /* Load fence and check result availability */ - "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n" - "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" - "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n" - "NOT TEMP[0].z, TEMP[0].zzzz\n" - "UIF TEMP[0].zzzz\n" - "BRK\n" - "ENDIF\n" - - "MOV TEMP[1].y, IMM[0].xxxx\n" - "BGNLOOP\n" - /* Load start and end. */ - "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n" - "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n" - "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" - - "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n" - "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" - - "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n" - - "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n" - "UIF TEMP[5].zzzz\n" - /* Load second start/end half-pair and - * take the difference - */ - "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n" - "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" - "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" - - "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n" - "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n" - "ENDIF\n" - - "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n" - - /* Increment pair index */ - "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n" - "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n" - "UIF TEMP[5]\n" - "BRK\n" - "ENDIF\n" - "ENDLOOP\n" - - /* Increment result index */ - "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n" - "ENDLOOP\n" - "ENDIF\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n" - "UIF TEMP[4]\n" - /* Store accumulated data for chaining. */ - "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n" - "ELSE\n" - "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n" - "UIF TEMP[4]\n" - /* Store result availability. */ - "NOT TEMP[0].z, TEMP[0]\n" - "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n" - "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" - "UIF TEMP[4]\n" - "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n" - "ENDIF\n" - "ELSE\n" - /* Store result if it is available. */ - "NOT TEMP[4], TEMP[0].zzzz\n" - "UIF TEMP[4]\n" - /* Apply timestamp conversion */ - "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n" - "UIF TEMP[4]\n" - "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n" - "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n" - "ENDIF\n" - - /* Convert to boolean */ - "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n" - "UIF TEMP[4]\n" - "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n" - "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n" - "MOV TEMP[0].y, IMM[0].xxxx\n" - "ENDIF\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" - "UIF TEMP[4]\n" - "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n" - "ELSE\n" - /* Clamping */ - "UIF TEMP[0].yyyy\n" - "MOV TEMP[0].x, IMM[0].wwww\n" - "ENDIF\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n" - "UIF TEMP[4]\n" - "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n" - "ENDIF\n" - - "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" - "ENDIF\n" - "ENDIF\n" - "ENDIF\n" - "ENDIF\n" - - "END\n"; - - char text[sizeof(text_tmpl) + 32]; - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {}; - - /* Hard code the frequency into the shader so that the backend can - * use the full range of optimizations for divide-by-constant. 
- */ - snprintf(text, sizeof(text), text_tmpl, - sctx->screen->info.clock_crystal_freq); - - if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return sctx->b.create_compute_state(&sctx->b, &state); + /* TEMP[0].xy = accumulated result so far + * TEMP[0].z = result not available + * + * TEMP[1].x = current result index + * TEMP[1].y = current pair index + */ + static const char text_tmpl[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL BUFFER[0]\n" + "DCL BUFFER[1]\n" + "DCL BUFFER[2]\n" + "DCL CONST[0][0..1]\n" + "DCL TEMP[0..5]\n" + "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n" + "IMM[1] UINT32 {1, 2, 4, 8}\n" + "IMM[2] UINT32 {16, 32, 64, 128}\n" + "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */ + "IMM[4] UINT32 {256, 0, 0, 0}\n" + + "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n" + "UIF TEMP[5]\n" + /* Check result availability. */ + "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n" + "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n" + "MOV TEMP[1], TEMP[0].zzzz\n" + "NOT TEMP[0].z, TEMP[0].zzzz\n" + + /* Load result if available. */ + "UIF TEMP[1]\n" + "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n" + "ENDIF\n" + "ELSE\n" + /* Load previously accumulated result if requested. */ + "MOV TEMP[0], IMM[0].xxxx\n" + "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n" + "UIF TEMP[4]\n" + "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n" + "ENDIF\n" + + "MOV TEMP[1].x, IMM[0].xxxx\n" + "BGNLOOP\n" + /* Break if accumulated result so far is not available. */ + "UIF TEMP[0].zzzz\n" + "BRK\n" + "ENDIF\n" + + /* Break if result_index >= result_count. */ + "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + + /* Load fence and check result availability */ + "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" + "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n" + "NOT TEMP[0].z, TEMP[0].zzzz\n" + "UIF TEMP[0].zzzz\n" + "BRK\n" + "ENDIF\n" + + "MOV TEMP[1].y, IMM[0].xxxx\n" + "BGNLOOP\n" + /* Load start and end. */ + "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n" + "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n" + "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" + + "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n" + "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" + + "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n" + + "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n" + "UIF TEMP[5].zzzz\n" + /* Load second start/end half-pair and + * take the difference + */ + "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n" + "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" + "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" + + "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n" + "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n" + "ENDIF\n" + + "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n" + + /* Increment pair index */ + "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n" + "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "ENDLOOP\n" + + /* Increment result index */ + "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n" + "ENDLOOP\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n" + "UIF TEMP[4]\n" + /* Store accumulated data for chaining. */ + "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n" + "ELSE\n" + "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n" + "UIF TEMP[4]\n" + /* Store result availability. 
*/ + "NOT TEMP[0].z, TEMP[0]\n" + "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n" + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n" + + "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" + "UIF TEMP[4]\n" + "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n" + "ENDIF\n" + "ELSE\n" + /* Store result if it is available. */ + "NOT TEMP[4], TEMP[0].zzzz\n" + "UIF TEMP[4]\n" + /* Apply timestamp conversion */ + "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n" + "UIF TEMP[4]\n" + "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n" + "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n" + "ENDIF\n" + + /* Convert to boolean */ + "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n" + "UIF TEMP[4]\n" + "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n" + "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n" + "MOV TEMP[0].y, IMM[0].xxxx\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" + "UIF TEMP[4]\n" + "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n" + "ELSE\n" + /* Clamping */ + "UIF TEMP[0].yyyy\n" + "MOV TEMP[0].x, IMM[0].wwww\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n" + "UIF TEMP[4]\n" + "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n" + "ENDIF\n" + + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + + "END\n"; + + char text[sizeof(text_tmpl) + 32]; + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {}; + + /* Hard code the frequency into the shader so that the backend can + * use the full range of optimizations for divide-by-constant. + */ + snprintf(text, sizeof(text), text_tmpl, sctx->screen->info.clock_crystal_freq); + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return sctx->b.create_compute_state(&sctx->b, &state); } /* Create a compute shader implementing copy_image. 
@@ -524,247 +501,238 @@ void *si_create_query_result_cs(struct si_context *sctx) */ void *si_create_copy_image_compute_shader(struct pipe_context *ctx) { - static const char text[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL SV[0], THREAD_ID\n" - "DCL SV[1], BLOCK_ID\n" - "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw - "DCL TEMP[0..4], LOCAL\n" - "IMM[0] UINT32 {8, 1, 0, 0}\n" - "MOV TEMP[0].xyz, CONST[0][0].xyzw\n" - "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n" - "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n" - "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - "MOV TEMP[4].xyz, CONST[0][1].xyzw\n" - "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n" - "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {0}; - - if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return ctx->create_compute_state(ctx, &state); + static const char text[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw + "DCL TEMP[0..4], LOCAL\n" + "IMM[0] UINT32 {8, 1, 0, 0}\n" + "MOV TEMP[0].xyz, CONST[0][0].xyzw\n" + "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n" + "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n" + "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "MOV TEMP[4].xyz, CONST[0][1].xyzw\n" + "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n" + "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); } void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx) { - static const char text[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL SV[0], THREAD_ID\n" - "DCL SV[1], BLOCK_ID\n" - "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL IMAGE[1], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw - "DCL TEMP[0..4], LOCAL\n" - "IMM[0] UINT32 {64, 1, 0, 0}\n" - "MOV TEMP[0].xy, CONST[0][0].xzzw\n" - "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" - "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n" - "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - "MOV TEMP[4].xy, CONST[0][1].xzzw\n" - "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[4].xyzx\n" - "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {0}; - - if (!tgsi_text_translate(text, tokens, 
ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return ctx->create_compute_state(ctx, &state); + static const char text[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL IMAGE[1], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw + "DCL TEMP[0..4], LOCAL\n" + "IMM[0] UINT32 {64, 1, 0, 0}\n" + "MOV TEMP[0].xy, CONST[0][0].xzzw\n" + "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" + "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n" + "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "MOV TEMP[4].xy, CONST[0][1].xzzw\n" + "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[4].xyzx\n" + "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); } void *si_clear_render_target_shader(struct pipe_context *ctx) { - static const char text[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL SV[0], THREAD_ID\n" - "DCL SV[1], BLOCK_ID\n" - "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw - "DCL TEMP[0..3], LOCAL\n" - "IMM[0] UINT32 {8, 1, 0, 0}\n" - "MOV TEMP[0].xyz, CONST[0][0].xyzw\n" - "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n" - "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n" - "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n" - "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {0}; - - if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return ctx->create_compute_state(ctx, &state); + static const char text[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw + "DCL TEMP[0..3], LOCAL\n" + "IMM[0] UINT32 {8, 1, 0, 0}\n" + "MOV TEMP[0].xyz, CONST[0][0].xyzw\n" + "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n" + "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n" + "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n" + "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); } /* TODO: Didn't really test 1D_ARRAY */ void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx) { - static const char text[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" - 
"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL SV[0], THREAD_ID\n" - "DCL SV[1], BLOCK_ID\n" - "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw - "DCL TEMP[0..3], LOCAL\n" - "IMM[0] UINT32 {64, 1, 0, 0}\n" - "MOV TEMP[0].xy, CONST[0][0].xzzw\n" - "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" - "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n" - "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n" - "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {0}; - - if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return ctx->create_compute_state(ctx, &state); + static const char text[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw + "DCL TEMP[0..3], LOCAL\n" + "IMM[0] UINT32 {64, 1, 0, 0}\n" + "MOV TEMP[0].xy, CONST[0][0].xzzw\n" + "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" + "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n" + "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n" + "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); } void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx) { - static const char text[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL SV[0], THREAD_ID\n" - "DCL SV[1], BLOCK_ID\n" - "DCL BUFFER[0]\n" - "DCL CONST[0][0..0]\n" // 0:xyzw - "DCL TEMP[0..0]\n" - "IMM[0] UINT32 {64, 1, 12, 0}\n" - "UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" - "UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" //12 bytes - "STORE BUFFER[0].xyz, TEMP[0].xxxx, CONST[0][0].xyzw\n" - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {0}; - - if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return ctx->create_compute_state(ctx, &state); + static const char text[] = "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL BUFFER[0]\n" + "DCL CONST[0][0..0]\n" // 0:xyzw + "DCL TEMP[0..0]\n" + "IMM[0] UINT32 {64, 1, 12, 0}\n" + "UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" + "UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" // 12 bytes + "STORE BUFFER[0].xyz, TEMP[0].xxxx, CONST[0][0].xyzw\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); } - /* Load samples from the image, and copy them to the same image. 
This looks like * a no-op, but it's not. Loads use FMASK, while stores don't, so samples are * reordered to match expanded FMASK. * * After the shader finishes, FMASK should be cleared to identity. */ -void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, - bool is_array) +void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, bool is_array) { - enum tgsi_texture_type target = is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : - TGSI_TEXTURE_2D_MSAA; - struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); - if (!ureg) - return NULL; - - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 8); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 8); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); - - /* Compute the image coordinates. */ - struct ureg_src image = ureg_DECL_image(ureg, 0, target, 0, true, false); - struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); - struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); - struct ureg_dst coord = ureg_writemask(ureg_DECL_temporary(ureg), - TGSI_WRITEMASK_XYZW); - ureg_UMAD(ureg, ureg_writemask(coord, TGSI_WRITEMASK_XY), - ureg_swizzle(blk, 0, 1, 1, 1), ureg_imm2u(ureg, 8, 8), - ureg_swizzle(tid, 0, 1, 1, 1)); - if (is_array) { - ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_Z), - ureg_scalar(blk, TGSI_SWIZZLE_Z)); - } - - /* Load samples, resolving FMASK. */ - struct ureg_dst sample[8]; - assert(num_samples <= ARRAY_SIZE(sample)); - - for (unsigned i = 0; i < num_samples; i++) { - sample[i] = ureg_DECL_temporary(ureg); - - ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), - ureg_imm1u(ureg, i)); - - struct ureg_src srcs[] = {image, ureg_src(coord)}; - ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &sample[i], 1, srcs, 2, - TGSI_MEMORY_RESTRICT, target, 0); - } - - /* Store samples, ignoring FMASK. */ - for (unsigned i = 0; i < num_samples; i++) { - ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), - ureg_imm1u(ureg, i)); - - struct ureg_dst dst_image = ureg_dst(image); - struct ureg_src srcs[] = {ureg_src(coord), ureg_src(sample[i])}; - ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst_image, 1, srcs, 2, - TGSI_MEMORY_RESTRICT, target, 0); - } - ureg_END(ureg); - - struct pipe_compute_state state = {}; - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = ureg_get_tokens(ureg, NULL); - - void *cs = ctx->create_compute_state(ctx, &state); - ureg_destroy(ureg); - return cs; + enum tgsi_texture_type target = is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA; + struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); + if (!ureg) + return NULL; + + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 8); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 8); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); + + /* Compute the image coordinates. */ + struct ureg_src image = ureg_DECL_image(ureg, 0, target, 0, true, false); + struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); + struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); + struct ureg_dst coord = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZW); + ureg_UMAD(ureg, ureg_writemask(coord, TGSI_WRITEMASK_XY), ureg_swizzle(blk, 0, 1, 1, 1), + ureg_imm2u(ureg, 8, 8), ureg_swizzle(tid, 0, 1, 1, 1)); + if (is_array) { + ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_Z), ureg_scalar(blk, TGSI_SWIZZLE_Z)); + } + + /* Load samples, resolving FMASK. 
*/ + struct ureg_dst sample[8]; + assert(num_samples <= ARRAY_SIZE(sample)); + + for (unsigned i = 0; i < num_samples; i++) { + sample[i] = ureg_DECL_temporary(ureg); + + ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i)); + + struct ureg_src srcs[] = {image, ureg_src(coord)}; + ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &sample[i], 1, srcs, 2, TGSI_MEMORY_RESTRICT, target, + 0); + } + + /* Store samples, ignoring FMASK. */ + for (unsigned i = 0; i < num_samples; i++) { + ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i)); + + struct ureg_dst dst_image = ureg_dst(image); + struct ureg_src srcs[] = {ureg_src(coord), ureg_src(sample[i])}; + ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst_image, 1, srcs, 2, TGSI_MEMORY_RESTRICT, + target, 0); + } + ureg_END(ureg); + + struct pipe_compute_state state = {}; + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = ureg_get_tokens(ureg, NULL); + + void *cs = ctx->create_compute_state(ctx, &state); + ureg_destroy(ureg); + return cs; } /* Create the compute shader that is used to collect the results of gfx10+ @@ -798,196 +766,192 @@ void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, */ void *gfx10_create_sh_query_result_cs(struct si_context *sctx) { - /* TEMP[0].x = accumulated result so far - * TEMP[0].y = result missing - * TEMP[0].z = whether we're in overflow mode - */ - static const char text_tmpl[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL BUFFER[0]\n" - "DCL BUFFER[1]\n" - "DCL BUFFER[2]\n" - "DCL CONST[0][0..0]\n" - "DCL TEMP[0..5]\n" - "IMM[0] UINT32 {0, 7, 0, 4294967295}\n" - "IMM[1] UINT32 {1, 2, 4, 8}\n" - "IMM[2] UINT32 {16, 32, 64, 128}\n" - - /* - acc_result = 0; - acc_missing = 0; - if (chain & 1) { - acc_result = buffer[1][0]; - acc_missing = buffer[1][1]; - } - */ - "MOV TEMP[0].xy, IMM[0].xxxx\n" - "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n" - "UIF TEMP[5]\n" - "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n" - "ENDIF\n" - - /* - is_overflow (TEMP[0].z) = (config & 7) >= 2; - result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 : result_count; - base_offset (TEMP[1].y) = 0; - for (;;) { - if (!result_remaining) - break; - result_remaining--; - */ - "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" - "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n" - - "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n" - "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n" - "MOV TEMP[1].y, IMM[0].xxxx\n" - - "BGNLOOP\n" - "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n" - "UIF TEMP[5]\n" - "BRK\n" - "ENDIF\n" - "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n" - - /* - fence = buffer[0]@(base_offset + 32); - if (!fence) { - acc_missing = ~0u; - break; - } - */ - "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n" - "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" - "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" - "UIF TEMP[5]\n" - "MOV TEMP[0].y, TEMP[5].xxxx\n" - "BRK\n" - "ENDIF\n" - - /* - stream_offset (TEMP[2].x) = base_offset + offset; - - if (!(config & 7)) { - acc_result += buffer[0]@stream_offset; - } - */ - "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n" - - "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" - "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" - "UIF TEMP[5]\n" - "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n" - "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n" - "ENDIF\n" - - /* - if ((config & 7) >= 2) { - count (TEMP[2].y) = (config & 1) ? 
4 : 1; - */ - "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" - "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n" - "UIF TEMP[5]\n" - "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n" - "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n" - - /* - do { - generated = buffer[0]@stream_offset; - emitted = buffer[0]@(stream_offset + 16); - if (generated != emitted) { - acc_result = 1; - result_remaining = 0; - break; - } - - stream_offset += 4; - } while (--count); - */ - "BGNLOOP\n" - "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n" - "LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n" - "LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n" - "USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n" - "UIF TEMP[5]\n" - "MOV TEMP[0].x, IMM[1].xxxx\n" - "MOV TEMP[1].y, IMM[0].xxxx\n" - "BRK\n" - "ENDIF\n" - - "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n" - "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n" - "UIF TEMP[5]\n" - "BRK\n" - "ENDIF\n" - "UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n" - "ENDLOOP\n" - "ENDIF\n" - - /* - base_offset += 64; - } // end outer loop - */ - "UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n" - "ENDLOOP\n" - - /* - if (chain & 2) { - buffer[2][0] = acc_result; - buffer[2][1] = acc_missing; - } else { - */ - "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n" - "UIF TEMP[5]\n" - "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n" - "ELSE\n" - - /* - if ((config & 7) == 1) { - acc_result = acc_missing ? 0 : 1; - acc_missing = 0; - } - */ - "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n" - "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n" - "UIF TEMP[5]\n" - "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n" - "MOV TEMP[0].y, IMM[0].xxxx\n" - "ENDIF\n" - - /* - if (!acc_missing) { - buffer[2][0] = acc_result; - if (config & 8) - buffer[2][1] = 0; - } - */ - "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n" - "UIF TEMP[5]\n" - "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" - - "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n" - "UIF TEMP[5]\n" - "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n" - "ENDIF\n" - "ENDIF\n" - "ENDIF\n" - - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {}; - - if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return sctx->b.create_compute_state(&sctx->b, &state); + /* TEMP[0].x = accumulated result so far + * TEMP[0].y = result missing + * TEMP[0].z = whether we're in overflow mode + */ + static const char text_tmpl[] = "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL BUFFER[0]\n" + "DCL BUFFER[1]\n" + "DCL BUFFER[2]\n" + "DCL CONST[0][0..0]\n" + "DCL TEMP[0..5]\n" + "IMM[0] UINT32 {0, 7, 0, 4294967295}\n" + "IMM[1] UINT32 {1, 2, 4, 8}\n" + "IMM[2] UINT32 {16, 32, 64, 128}\n" + + /* + acc_result = 0; + acc_missing = 0; + if (chain & 1) { + acc_result = buffer[1][0]; + acc_missing = buffer[1][1]; + } + */ + "MOV TEMP[0].xy, IMM[0].xxxx\n" + "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n" + "UIF TEMP[5]\n" + "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n" + "ENDIF\n" + + /* + is_overflow (TEMP[0].z) = (config & 7) >= 2; + result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 
0 : + result_count; base_offset (TEMP[1].y) = 0; for (;;) { if + (!result_remaining) break; result_remaining--; + */ + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" + "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n" + + "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n" + "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n" + "MOV TEMP[1].y, IMM[0].xxxx\n" + + "BGNLOOP\n" + "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n" + + /* + fence = buffer[0]@(base_offset + 32); + if (!fence) { + acc_missing = ~0u; + break; + } + */ + "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" + "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "MOV TEMP[0].y, TEMP[5].xxxx\n" + "BRK\n" + "ENDIF\n" + + /* + stream_offset (TEMP[2].x) = base_offset + offset; + + if (!(config & 7)) { + acc_result += buffer[0]@stream_offset; + } + */ + "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n" + + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" + "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n" + "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n" + "ENDIF\n" + + /* + if ((config & 7) >= 2) { + count (TEMP[2].y) = (config & 1) ? 4 : 1; + */ + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" + "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n" + "UIF TEMP[5]\n" + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n" + "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n" + + /* + do { + generated = buffer[0]@stream_offset; + emitted = buffer[0]@(stream_offset + 16); + if (generated != emitted) { + acc_result = 1; + result_remaining = 0; + break; + } + + stream_offset += 4; + } while (--count); + */ + "BGNLOOP\n" + "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n" + "LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n" + "LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n" + "USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n" + "UIF TEMP[5]\n" + "MOV TEMP[0].x, IMM[1].xxxx\n" + "MOV TEMP[1].y, IMM[0].xxxx\n" + "BRK\n" + "ENDIF\n" + + "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n" + "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n" + "ENDLOOP\n" + "ENDIF\n" + + /* + base_offset += 64; + } // end outer loop + */ + "UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n" + "ENDLOOP\n" + + /* + if (chain & 2) { + buffer[2][0] = acc_result; + buffer[2][1] = acc_missing; + } else { + */ + "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n" + "UIF TEMP[5]\n" + "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n" + "ELSE\n" + + /* + if ((config & 7) == 1) { + acc_result = acc_missing ? 
0 : 1; + acc_missing = 0; + } + */ + "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n" + "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n" + "UIF TEMP[5]\n" + "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n" + "MOV TEMP[0].y, IMM[0].xxxx\n" + "ENDIF\n" + + /* + if (!acc_missing) { + buffer[2][0] = acc_result; + if (config & 8) + buffer[2][1] = 0; + } + */ + "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" + + "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n" + "UIF TEMP[5]\n" + "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {}; + + if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return sctx->b.create_compute_state(&sctx->b, &state); } diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 46d7c71b2de..60aa0865502 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -23,51 +23,49 @@ */ #include "si_build_pm4.h" -#include "sid.h" #include "si_query.h" - -#include "util/u_dual_blend.h" +#include "sid.h" +#include "util/fast_idiv_by_const.h" #include "util/format/u_format.h" #include "util/format/u_format_s3tc.h" +#include "util/u_dual_blend.h" #include "util/u_memory.h" #include "util/u_resource.h" #include "util/u_upload_mgr.h" -#include "util/fast_idiv_by_const.h" struct gfx10_format { - unsigned img_format:9; + unsigned img_format : 9; - /* Various formats are only supported with workarounds for vertex fetch, - * and some 32_32_32 formats are supported natively, but only for buffers - * (possibly with some image support, actually, but no filtering). */ - bool buffers_only:1; + /* Various formats are only supported with workarounds for vertex fetch, + * and some 32_32_32 formats are supported natively, but only for buffers + * (possibly with some image support, actually, but no filtering). */ + bool buffers_only : 1; }; #include "gfx10_format_table.h" static unsigned si_map_swizzle(unsigned swizzle) { - switch (swizzle) { - case PIPE_SWIZZLE_Y: - return V_008F0C_SQ_SEL_Y; - case PIPE_SWIZZLE_Z: - return V_008F0C_SQ_SEL_Z; - case PIPE_SWIZZLE_W: - return V_008F0C_SQ_SEL_W; - case PIPE_SWIZZLE_0: - return V_008F0C_SQ_SEL_0; - case PIPE_SWIZZLE_1: - return V_008F0C_SQ_SEL_1; - default: /* PIPE_SWIZZLE_X */ - return V_008F0C_SQ_SEL_X; - } + switch (swizzle) { + case PIPE_SWIZZLE_Y: + return V_008F0C_SQ_SEL_Y; + case PIPE_SWIZZLE_Z: + return V_008F0C_SQ_SEL_Z; + case PIPE_SWIZZLE_W: + return V_008F0C_SQ_SEL_W; + case PIPE_SWIZZLE_0: + return V_008F0C_SQ_SEL_0; + case PIPE_SWIZZLE_1: + return V_008F0C_SQ_SEL_1; + default: /* PIPE_SWIZZLE_X */ + return V_008F0C_SQ_SEL_X; + } } /* 12.4 fixed-point */ static unsigned si_pack_float_12p4(float x) { - return x <= 0 ? 0 : - x >= 4096 ? 0xffff : x * 16; + return x <= 0 ? 0 : x >= 4096 ? 0xffff : x * 16; } /* @@ -78,202 +76,191 @@ static unsigned si_pack_float_12p4(float x) */ static void si_emit_cb_render_state(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct si_state_blend *blend = sctx->queued.named.blend; - /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers, - * but you never know. 
*/ - uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit & - blend->cb_target_mask; - unsigned i; - - /* Avoid a hang that happens when dual source blending is enabled - * but there is not enough color outputs. This is undefined behavior, - * so disable color writes completely. - * - * Reproducible with Unigine Heaven 4.0 and drirc missing. - */ - if (blend->dual_src_blend && - sctx->ps_shader.cso && - (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3) - cb_target_mask = 0; - - /* GFX9: Flush DFSM when CB_TARGET_MASK changes. - * I think we don't have to do anything between IBs. - */ - if (sctx->screen->dpbb_allowed && - sctx->last_cb_target_mask != cb_target_mask) { - sctx->last_cb_target_mask = cb_target_mask; - - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); - } - - unsigned initial_cdw = cs->current.cdw; - radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, - SI_TRACKED_CB_TARGET_MASK, cb_target_mask); - - if (sctx->chip_class >= GFX8) { - /* DCC MSAA workaround. - * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_- - * COMBINER_DISABLE, but that would be more complicated. - */ - bool oc_disable = blend->dcc_msaa_corruption_4bit & cb_target_mask && - sctx->framebuffer.nr_samples >= 2; - unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark; - - radeon_opt_set_context_reg( - sctx, R_028424_CB_DCC_CONTROL, - SI_TRACKED_CB_DCC_CONTROL, - S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) | - S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) | - S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) | - S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode)); - } - - /* RB+ register settings. */ - if (sctx->screen->info.rbplus_allowed) { - unsigned spi_shader_col_format = - sctx->ps_shader.cso ? - sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format : 0; - unsigned sx_ps_downconvert = 0; - unsigned sx_blend_opt_epsilon = 0; - unsigned sx_blend_opt_control = 0; - - for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - struct si_surface *surf = - (struct si_surface*)sctx->framebuffer.state.cbufs[i]; - unsigned format, swap, spi_format, colormask; - bool has_alpha, has_rgb; - - if (!surf) { - /* If the color buffer is not set, the driver sets 32_R - * as the SPI color format, because the hw doesn't allow - * holes between color outputs, so also set this to - * enable RB+. - */ - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); - continue; - } - - format = G_028C70_FORMAT(surf->cb_color_info); - swap = G_028C70_COMP_SWAP(surf->cb_color_info); - spi_format = (spi_shader_col_format >> (i * 4)) & 0xf; - colormask = (cb_target_mask >> (i * 4)) & 0xf; - - /* Set if RGB and A are present. */ - has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib); - - if (format == V_028C70_COLOR_8 || - format == V_028C70_COLOR_16 || - format == V_028C70_COLOR_32) - has_rgb = !has_alpha; - else - has_rgb = true; - - /* Check the colormask and export format. */ - if (!(colormask & (PIPE_MASK_RGBA & ~PIPE_MASK_A))) - has_rgb = false; - if (!(colormask & PIPE_MASK_A)) - has_alpha = false; - - if (spi_format == V_028714_SPI_SHADER_ZERO) { - has_rgb = false; - has_alpha = false; - } - - /* Disable value checking for disabled channels. 
*/ - if (!has_rgb) - sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4); - if (!has_alpha) - sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4); - - /* Enable down-conversion for 32bpp and smaller formats. */ - switch (format) { - case V_028C70_COLOR_8: - case V_028C70_COLOR_8_8: - case V_028C70_COLOR_8_8_8_8: - /* For 1 and 2-channel formats, use the superset thereof. */ - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR || - spi_format == V_028714_SPI_SHADER_UINT16_ABGR || - spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4); - sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4); - } - break; - - case V_028C70_COLOR_5_6_5: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4); - sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4); - } - break; - - case V_028C70_COLOR_1_5_5_5: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4); - sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4); - } - break; - - case V_028C70_COLOR_4_4_4_4: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4); - sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4); - } - break; - - case V_028C70_COLOR_32: - if (swap == V_028C70_SWAP_STD && - spi_format == V_028714_SPI_SHADER_32_R) - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); - else if (swap == V_028C70_SWAP_ALT_REV && - spi_format == V_028714_SPI_SHADER_32_AR) - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4); - break; - - case V_028C70_COLOR_16: - case V_028C70_COLOR_16_16: - /* For 1-channel formats, use the superset thereof. */ - if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR || - spi_format == V_028714_SPI_SHADER_SNORM16_ABGR || - spi_format == V_028714_SPI_SHADER_UINT16_ABGR || - spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { - if (swap == V_028C70_SWAP_STD || - swap == V_028C70_SWAP_STD_REV) - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4); - else - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4); - } - break; - - case V_028C70_COLOR_10_11_11: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4); - break; - - case V_028C70_COLOR_2_10_10_10: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4); - sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4); - } - break; - } - } - - /* If there are no color outputs, the first color export is - * always enabled as 32_R, so also set this to enable RB+. - */ - if (!sx_ps_downconvert) - sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R; - - /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */ - radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, - SI_TRACKED_SX_PS_DOWNCONVERT, - sx_ps_downconvert, sx_blend_opt_epsilon, - sx_blend_opt_control); - } - if (initial_cdw != cs->current.cdw) - sctx->context_roll = true; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct si_state_blend *blend = sctx->queued.named.blend; + /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers, + * but you never know. 
*/ + uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_mask; + unsigned i; + + /* Avoid a hang that happens when dual source blending is enabled + * but there is not enough color outputs. This is undefined behavior, + * so disable color writes completely. + * + * Reproducible with Unigine Heaven 4.0 and drirc missing. + */ + if (blend->dual_src_blend && sctx->ps_shader.cso && + (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3) + cb_target_mask = 0; + + /* GFX9: Flush DFSM when CB_TARGET_MASK changes. + * I think we don't have to do anything between IBs. + */ + if (sctx->screen->dpbb_allowed && sctx->last_cb_target_mask != cb_target_mask) { + sctx->last_cb_target_mask = cb_target_mask; + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } + + unsigned initial_cdw = cs->current.cdw; + radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK, + cb_target_mask); + + if (sctx->chip_class >= GFX8) { + /* DCC MSAA workaround. + * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_- + * COMBINER_DISABLE, but that would be more complicated. + */ + bool oc_disable = + blend->dcc_msaa_corruption_4bit & cb_target_mask && sctx->framebuffer.nr_samples >= 2; + unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark; + + radeon_opt_set_context_reg( + sctx, R_028424_CB_DCC_CONTROL, SI_TRACKED_CB_DCC_CONTROL, + S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) | + S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) | + S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) | + S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode)); + } + + /* RB+ register settings. */ + if (sctx->screen->info.rbplus_allowed) { + unsigned spi_shader_col_format = + sctx->ps_shader.cso ? sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format + : 0; + unsigned sx_ps_downconvert = 0; + unsigned sx_blend_opt_epsilon = 0; + unsigned sx_blend_opt_control = 0; + + for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + struct si_surface *surf = (struct si_surface *)sctx->framebuffer.state.cbufs[i]; + unsigned format, swap, spi_format, colormask; + bool has_alpha, has_rgb; + + if (!surf) { + /* If the color buffer is not set, the driver sets 32_R + * as the SPI color format, because the hw doesn't allow + * holes between color outputs, so also set this to + * enable RB+. + */ + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); + continue; + } + + format = G_028C70_FORMAT(surf->cb_color_info); + swap = G_028C70_COMP_SWAP(surf->cb_color_info); + spi_format = (spi_shader_col_format >> (i * 4)) & 0xf; + colormask = (cb_target_mask >> (i * 4)) & 0xf; + + /* Set if RGB and A are present. */ + has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib); + + if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || + format == V_028C70_COLOR_32) + has_rgb = !has_alpha; + else + has_rgb = true; + + /* Check the colormask and export format. */ + if (!(colormask & (PIPE_MASK_RGBA & ~PIPE_MASK_A))) + has_rgb = false; + if (!(colormask & PIPE_MASK_A)) + has_alpha = false; + + if (spi_format == V_028714_SPI_SHADER_ZERO) { + has_rgb = false; + has_alpha = false; + } + + /* Disable value checking for disabled channels. 
*/ + if (!has_rgb) + sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4); + if (!has_alpha) + sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4); + + /* Enable down-conversion for 32bpp and smaller formats. */ + switch (format) { + case V_028C70_COLOR_8: + case V_028C70_COLOR_8_8: + case V_028C70_COLOR_8_8_8_8: + /* For 1 and 2-channel formats, use the superset thereof. */ + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR || + spi_format == V_028714_SPI_SHADER_UINT16_ABGR || + spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4); + sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4); + } + break; + + case V_028C70_COLOR_5_6_5: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4); + sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4); + } + break; + + case V_028C70_COLOR_1_5_5_5: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4); + sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4); + } + break; + + case V_028C70_COLOR_4_4_4_4: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4); + sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4); + } + break; + + case V_028C70_COLOR_32: + if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R) + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); + else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR) + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4); + break; + + case V_028C70_COLOR_16: + case V_028C70_COLOR_16_16: + /* For 1-channel formats, use the superset thereof. */ + if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR || + spi_format == V_028714_SPI_SHADER_SNORM16_ABGR || + spi_format == V_028714_SPI_SHADER_UINT16_ABGR || + spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { + if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV) + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4); + else + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4); + } + break; + + case V_028C70_COLOR_10_11_11: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4); + break; + + case V_028C70_COLOR_2_10_10_10: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4); + sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4); + } + break; + } + } + + /* If there are no color outputs, the first color export is + * always enabled as 32_R, so also set this to enable RB+. 
+ */ + if (!sx_ps_downconvert) + sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R; + + /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */ + radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT, + sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control); + } + if (initial_cdw != cs->current.cdw) + sctx->context_roll = true; } /* @@ -282,551 +269,507 @@ static void si_emit_cb_render_state(struct si_context *sctx) static uint32_t si_translate_blend_function(int blend_func) { - switch (blend_func) { - case PIPE_BLEND_ADD: - return V_028780_COMB_DST_PLUS_SRC; - case PIPE_BLEND_SUBTRACT: - return V_028780_COMB_SRC_MINUS_DST; - case PIPE_BLEND_REVERSE_SUBTRACT: - return V_028780_COMB_DST_MINUS_SRC; - case PIPE_BLEND_MIN: - return V_028780_COMB_MIN_DST_SRC; - case PIPE_BLEND_MAX: - return V_028780_COMB_MAX_DST_SRC; - default: - PRINT_ERR("Unknown blend function %d\n", blend_func); - assert(0); - break; - } - return 0; + switch (blend_func) { + case PIPE_BLEND_ADD: + return V_028780_COMB_DST_PLUS_SRC; + case PIPE_BLEND_SUBTRACT: + return V_028780_COMB_SRC_MINUS_DST; + case PIPE_BLEND_REVERSE_SUBTRACT: + return V_028780_COMB_DST_MINUS_SRC; + case PIPE_BLEND_MIN: + return V_028780_COMB_MIN_DST_SRC; + case PIPE_BLEND_MAX: + return V_028780_COMB_MAX_DST_SRC; + default: + PRINT_ERR("Unknown blend function %d\n", blend_func); + assert(0); + break; + } + return 0; } static uint32_t si_translate_blend_factor(int blend_fact) { - switch (blend_fact) { - case PIPE_BLENDFACTOR_ONE: - return V_028780_BLEND_ONE; - case PIPE_BLENDFACTOR_SRC_COLOR: - return V_028780_BLEND_SRC_COLOR; - case PIPE_BLENDFACTOR_SRC_ALPHA: - return V_028780_BLEND_SRC_ALPHA; - case PIPE_BLENDFACTOR_DST_ALPHA: - return V_028780_BLEND_DST_ALPHA; - case PIPE_BLENDFACTOR_DST_COLOR: - return V_028780_BLEND_DST_COLOR; - case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - return V_028780_BLEND_SRC_ALPHA_SATURATE; - case PIPE_BLENDFACTOR_CONST_COLOR: - return V_028780_BLEND_CONSTANT_COLOR; - case PIPE_BLENDFACTOR_CONST_ALPHA: - return V_028780_BLEND_CONSTANT_ALPHA; - case PIPE_BLENDFACTOR_ZERO: - return V_028780_BLEND_ZERO; - case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return V_028780_BLEND_ONE_MINUS_SRC_COLOR; - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return V_028780_BLEND_ONE_MINUS_SRC_ALPHA; - case PIPE_BLENDFACTOR_INV_DST_ALPHA: - return V_028780_BLEND_ONE_MINUS_DST_ALPHA; - case PIPE_BLENDFACTOR_INV_DST_COLOR: - return V_028780_BLEND_ONE_MINUS_DST_COLOR; - case PIPE_BLENDFACTOR_INV_CONST_COLOR: - return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR; - case PIPE_BLENDFACTOR_INV_CONST_ALPHA: - return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA; - case PIPE_BLENDFACTOR_SRC1_COLOR: - return V_028780_BLEND_SRC1_COLOR; - case PIPE_BLENDFACTOR_SRC1_ALPHA: - return V_028780_BLEND_SRC1_ALPHA; - case PIPE_BLENDFACTOR_INV_SRC1_COLOR: - return V_028780_BLEND_INV_SRC1_COLOR; - case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: - return V_028780_BLEND_INV_SRC1_ALPHA; - default: - PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact); - assert(0); - break; - } - return 0; + switch (blend_fact) { + case PIPE_BLENDFACTOR_ONE: + return V_028780_BLEND_ONE; + case PIPE_BLENDFACTOR_SRC_COLOR: + return V_028780_BLEND_SRC_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return V_028780_BLEND_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: + return V_028780_BLEND_DST_ALPHA; + case PIPE_BLENDFACTOR_DST_COLOR: + return V_028780_BLEND_DST_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return V_028780_BLEND_SRC_ALPHA_SATURATE; + case 
PIPE_BLENDFACTOR_CONST_COLOR: + return V_028780_BLEND_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return V_028780_BLEND_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_ZERO: + return V_028780_BLEND_ZERO; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return V_028780_BLEND_ONE_MINUS_SRC_COLOR; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return V_028780_BLEND_ONE_MINUS_SRC_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return V_028780_BLEND_ONE_MINUS_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return V_028780_BLEND_ONE_MINUS_DST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_SRC1_COLOR: + return V_028780_BLEND_SRC1_COLOR; + case PIPE_BLENDFACTOR_SRC1_ALPHA: + return V_028780_BLEND_SRC1_ALPHA; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + return V_028780_BLEND_INV_SRC1_COLOR; + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return V_028780_BLEND_INV_SRC1_ALPHA; + default: + PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact); + assert(0); + break; + } + return 0; } static uint32_t si_translate_blend_opt_function(int blend_func) { - switch (blend_func) { - case PIPE_BLEND_ADD: - return V_028760_OPT_COMB_ADD; - case PIPE_BLEND_SUBTRACT: - return V_028760_OPT_COMB_SUBTRACT; - case PIPE_BLEND_REVERSE_SUBTRACT: - return V_028760_OPT_COMB_REVSUBTRACT; - case PIPE_BLEND_MIN: - return V_028760_OPT_COMB_MIN; - case PIPE_BLEND_MAX: - return V_028760_OPT_COMB_MAX; - default: - return V_028760_OPT_COMB_BLEND_DISABLED; - } + switch (blend_func) { + case PIPE_BLEND_ADD: + return V_028760_OPT_COMB_ADD; + case PIPE_BLEND_SUBTRACT: + return V_028760_OPT_COMB_SUBTRACT; + case PIPE_BLEND_REVERSE_SUBTRACT: + return V_028760_OPT_COMB_REVSUBTRACT; + case PIPE_BLEND_MIN: + return V_028760_OPT_COMB_MIN; + case PIPE_BLEND_MAX: + return V_028760_OPT_COMB_MAX; + default: + return V_028760_OPT_COMB_BLEND_DISABLED; + } } static uint32_t si_translate_blend_opt_factor(int blend_fact, bool is_alpha) { - switch (blend_fact) { - case PIPE_BLENDFACTOR_ZERO: - return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL; - case PIPE_BLENDFACTOR_ONE: - return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE; - case PIPE_BLENDFACTOR_SRC_COLOR: - return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0 - : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0; - case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1 - : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1; - case PIPE_BLENDFACTOR_SRC_ALPHA: - return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0; - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1; - case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE - : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; - default: - return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; - } + switch (blend_fact) { + case PIPE_BLENDFACTOR_ZERO: + return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL; + case PIPE_BLENDFACTOR_ONE: + return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE; + case PIPE_BLENDFACTOR_SRC_COLOR: + return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0 + : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return is_alpha ? 
V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1 + : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE + : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; + default: + return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; + } } -static void si_blend_check_commutativity(struct si_screen *sscreen, - struct si_state_blend *blend, - enum pipe_blend_func func, - enum pipe_blendfactor src, - enum pipe_blendfactor dst, - unsigned chanmask) +static void si_blend_check_commutativity(struct si_screen *sscreen, struct si_state_blend *blend, + enum pipe_blend_func func, enum pipe_blendfactor src, + enum pipe_blendfactor dst, unsigned chanmask) { - /* Src factor is allowed when it does not depend on Dst */ - static const uint32_t src_allowed = - (1u << PIPE_BLENDFACTOR_ONE) | - (1u << PIPE_BLENDFACTOR_SRC_COLOR) | - (1u << PIPE_BLENDFACTOR_SRC_ALPHA) | - (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) | - (1u << PIPE_BLENDFACTOR_CONST_COLOR) | - (1u << PIPE_BLENDFACTOR_CONST_ALPHA) | - (1u << PIPE_BLENDFACTOR_SRC1_COLOR) | - (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) | - (1u << PIPE_BLENDFACTOR_ZERO) | - (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) | - (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) | - (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) | - (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) | - (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) | - (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA); - - if (dst == PIPE_BLENDFACTOR_ONE && - (src_allowed & (1u << src))) { - /* Addition is commutative, but floating point addition isn't - * associative: subtle changes can be introduced via different - * rounding. - * - * Out-of-order is also non-deterministic, which means that - * this breaks OpenGL invariance requirements. So only enable - * out-of-order additive blending if explicitly allowed by a - * setting. - */ - if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN || - (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add)) - blend->commutative_4bit |= chanmask; - } + /* Src factor is allowed when it does not depend on Dst */ + static const uint32_t src_allowed = + (1u << PIPE_BLENDFACTOR_ONE) | (1u << PIPE_BLENDFACTOR_SRC_COLOR) | + (1u << PIPE_BLENDFACTOR_SRC_ALPHA) | (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) | + (1u << PIPE_BLENDFACTOR_CONST_COLOR) | (1u << PIPE_BLENDFACTOR_CONST_ALPHA) | + (1u << PIPE_BLENDFACTOR_SRC1_COLOR) | (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) | + (1u << PIPE_BLENDFACTOR_ZERO) | (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) | + (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) | (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) | + (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) | (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) | + (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA); + + if (dst == PIPE_BLENDFACTOR_ONE && (src_allowed & (1u << src))) { + /* Addition is commutative, but floating point addition isn't + * associative: subtle changes can be introduced via different + * rounding. + * + * Out-of-order is also non-deterministic, which means that + * this breaks OpenGL invariance requirements. So only enable + * out-of-order additive blending if explicitly allowed by a + * setting. 
+ */ + if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN || + (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add)) + blend->commutative_4bit |= chanmask; + } } /** * Get rid of DST in the blend factors by commuting the operands: * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC) */ -static void si_blend_remove_dst(unsigned *func, unsigned *src_factor, - unsigned *dst_factor, unsigned expected_dst, - unsigned replacement_src) +static void si_blend_remove_dst(unsigned *func, unsigned *src_factor, unsigned *dst_factor, + unsigned expected_dst, unsigned replacement_src) { - if (*src_factor == expected_dst && - *dst_factor == PIPE_BLENDFACTOR_ZERO) { - *src_factor = PIPE_BLENDFACTOR_ZERO; - *dst_factor = replacement_src; - - /* Commuting the operands requires reversing subtractions. */ - if (*func == PIPE_BLEND_SUBTRACT) - *func = PIPE_BLEND_REVERSE_SUBTRACT; - else if (*func == PIPE_BLEND_REVERSE_SUBTRACT) - *func = PIPE_BLEND_SUBTRACT; - } + if (*src_factor == expected_dst && *dst_factor == PIPE_BLENDFACTOR_ZERO) { + *src_factor = PIPE_BLENDFACTOR_ZERO; + *dst_factor = replacement_src; + + /* Commuting the operands requires reversing subtractions. */ + if (*func == PIPE_BLEND_SUBTRACT) + *func = PIPE_BLEND_REVERSE_SUBTRACT; + else if (*func == PIPE_BLEND_REVERSE_SUBTRACT) + *func = PIPE_BLEND_SUBTRACT; + } } static bool si_blend_factor_uses_dst(unsigned factor) { - return factor == PIPE_BLENDFACTOR_DST_COLOR || - factor == PIPE_BLENDFACTOR_DST_ALPHA || - factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || - factor == PIPE_BLENDFACTOR_INV_DST_ALPHA || - factor == PIPE_BLENDFACTOR_INV_DST_COLOR; + return factor == PIPE_BLENDFACTOR_DST_COLOR || factor == PIPE_BLENDFACTOR_DST_ALPHA || + factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || + factor == PIPE_BLENDFACTOR_INV_DST_ALPHA || factor == PIPE_BLENDFACTOR_INV_DST_COLOR; } static void *si_create_blend_state_mode(struct pipe_context *ctx, - const struct pipe_blend_state *state, - unsigned mode) + const struct pipe_blend_state *state, unsigned mode) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend); - struct si_pm4_state *pm4 = &blend->pm4; - uint32_t sx_mrt_blend_opt[8] = {0}; - uint32_t color_control = 0; - bool logicop_enable = state->logicop_enable && - state->logicop_func != PIPE_LOGICOP_COPY; - - if (!blend) - return NULL; - - blend->alpha_to_coverage = state->alpha_to_coverage; - blend->alpha_to_one = state->alpha_to_one; - blend->dual_src_blend = util_blend_state_is_dual(state, 0); - blend->logicop_enable = logicop_enable; - - if (logicop_enable) { - color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4)); - } else { - color_control |= S_028808_ROP3(0xcc); - } - - si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK, - S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) | - S_028B70_ALPHA_TO_MASK_OFFSET0(3) | - S_028B70_ALPHA_TO_MASK_OFFSET1(1) | - S_028B70_ALPHA_TO_MASK_OFFSET2(0) | - S_028B70_ALPHA_TO_MASK_OFFSET3(2) | - S_028B70_OFFSET_ROUND(1)); - - if (state->alpha_to_coverage) - blend->need_src_alpha_4bit |= 0xf; - - blend->cb_target_mask = 0; - blend->cb_target_enabled_4bit = 0; - - for (int i = 0; i < 8; i++) { - /* state->rt entries > 0 only written if independent blending */ - const int j = state->independent_blend_enable ? 
i : 0; - - unsigned eqRGB = state->rt[j].rgb_func; - unsigned srcRGB = state->rt[j].rgb_src_factor; - unsigned dstRGB = state->rt[j].rgb_dst_factor; - unsigned eqA = state->rt[j].alpha_func; - unsigned srcA = state->rt[j].alpha_src_factor; - unsigned dstA = state->rt[j].alpha_dst_factor; - - unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt; - unsigned blend_cntl = 0; - - sx_mrt_blend_opt[i] = - S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | - S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED); - - /* Only set dual source blending for MRT0 to avoid a hang. */ - if (i >= 1 && blend->dual_src_blend) { - /* Vulkan does this for dual source blending. */ - if (i == 1) - blend_cntl |= S_028780_ENABLE(1); - - si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); - continue; - } - - /* Only addition and subtraction equations are supported with - * dual source blending. - */ - if (blend->dual_src_blend && - (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX || - eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX)) { - assert(!"Unsupported equation for dual source blending"); - si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); - continue; - } - - /* cb_render_state will disable unused ones */ - blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i); - if (state->rt[j].colormask) - blend->cb_target_enabled_4bit |= 0xf << (4 * i); - - if (!state->rt[j].colormask || !state->rt[j].blend_enable) { - si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); - continue; - } - - si_blend_check_commutativity(sctx->screen, blend, - eqRGB, srcRGB, dstRGB, 0x7 << (4 * i)); - si_blend_check_commutativity(sctx->screen, blend, - eqA, srcA, dstA, 0x8 << (4 * i)); - - /* Blending optimizations for RB+. - * These transformations don't change the behavior. - * - * First, get rid of DST in the blend factors: - * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC) - */ - si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, - PIPE_BLENDFACTOR_DST_COLOR, - PIPE_BLENDFACTOR_SRC_COLOR); - si_blend_remove_dst(&eqA, &srcA, &dstA, - PIPE_BLENDFACTOR_DST_COLOR, - PIPE_BLENDFACTOR_SRC_COLOR); - si_blend_remove_dst(&eqA, &srcA, &dstA, - PIPE_BLENDFACTOR_DST_ALPHA, - PIPE_BLENDFACTOR_SRC_ALPHA); - - /* Look up the ideal settings from tables. */ - srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false); - dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false); - srcA_opt = si_translate_blend_opt_factor(srcA, true); - dstA_opt = si_translate_blend_opt_factor(dstA, true); - - /* Handle interdependencies. */ - if (si_blend_factor_uses_dst(srcRGB)) - dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; - if (si_blend_factor_uses_dst(srcA)) - dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; - - if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE && - (dstRGB == PIPE_BLENDFACTOR_ZERO || - dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || - dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE)) - dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; - - /* Set the final value. */ - sx_mrt_blend_opt[i] = - S_028760_COLOR_SRC_OPT(srcRGB_opt) | - S_028760_COLOR_DST_OPT(dstRGB_opt) | - S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) | - S_028760_ALPHA_SRC_OPT(srcA_opt) | - S_028760_ALPHA_DST_OPT(dstA_opt) | - S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA)); - - /* Set blend state. 
*/ - blend_cntl |= S_028780_ENABLE(1); - blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB)); - blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB)); - blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB)); - - if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) { - blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1); - blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA)); - blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA)); - blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA)); - } - si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); - - blend->blend_enable_4bit |= 0xfu << (i * 4); - - if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14) - blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4); - - /* This is only important for formats without alpha. */ - if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA || - dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || - srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || - dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || - srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA || - dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA) - blend->need_src_alpha_4bit |= 0xfu << (i * 4); - } - - if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14 && logicop_enable) - blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit; - - if (blend->cb_target_mask) { - color_control |= S_028808_MODE(mode); - } else { - color_control |= S_028808_MODE(V_028808_CB_DISABLE); - } - - if (sctx->screen->info.rbplus_allowed) { - /* Disable RB+ blend optimizations for dual source blending. - * Vulkan does this. - */ - if (blend->dual_src_blend) { - for (int i = 0; i < 8; i++) { - sx_mrt_blend_opt[i] = - S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | - S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE); - } - } - - for (int i = 0; i < 8; i++) - si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, - sx_mrt_blend_opt[i]); - - /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. 
*/ - if (blend->dual_src_blend || logicop_enable || - mode == V_028808_CB_RESOLVE) - color_control |= S_028808_DISABLE_DUAL_QUAD(1); - } - - si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control); - return blend; + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend); + struct si_pm4_state *pm4 = &blend->pm4; + uint32_t sx_mrt_blend_opt[8] = {0}; + uint32_t color_control = 0; + bool logicop_enable = state->logicop_enable && state->logicop_func != PIPE_LOGICOP_COPY; + + if (!blend) + return NULL; + + blend->alpha_to_coverage = state->alpha_to_coverage; + blend->alpha_to_one = state->alpha_to_one; + blend->dual_src_blend = util_blend_state_is_dual(state, 0); + blend->logicop_enable = logicop_enable; + + if (logicop_enable) { + color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4)); + } else { + color_control |= S_028808_ROP3(0xcc); + } + + si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK, + S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) | + S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) | + S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) | + S_028B70_OFFSET_ROUND(1)); + + if (state->alpha_to_coverage) + blend->need_src_alpha_4bit |= 0xf; + + blend->cb_target_mask = 0; + blend->cb_target_enabled_4bit = 0; + + for (int i = 0; i < 8; i++) { + /* state->rt entries > 0 only written if independent blending */ + const int j = state->independent_blend_enable ? i : 0; + + unsigned eqRGB = state->rt[j].rgb_func; + unsigned srcRGB = state->rt[j].rgb_src_factor; + unsigned dstRGB = state->rt[j].rgb_dst_factor; + unsigned eqA = state->rt[j].alpha_func; + unsigned srcA = state->rt[j].alpha_src_factor; + unsigned dstA = state->rt[j].alpha_dst_factor; + + unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt; + unsigned blend_cntl = 0; + + sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | + S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED); + + /* Only set dual source blending for MRT0 to avoid a hang. */ + if (i >= 1 && blend->dual_src_blend) { + /* Vulkan does this for dual source blending. */ + if (i == 1) + blend_cntl |= S_028780_ENABLE(1); + + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + continue; + } + + /* Only addition and subtraction equations are supported with + * dual source blending. + */ + if (blend->dual_src_blend && (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX || + eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX)) { + assert(!"Unsupported equation for dual source blending"); + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + continue; + } + + /* cb_render_state will disable unused ones */ + blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i); + if (state->rt[j].colormask) + blend->cb_target_enabled_4bit |= 0xf << (4 * i); + + if (!state->rt[j].colormask || !state->rt[j].blend_enable) { + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + continue; + } + + si_blend_check_commutativity(sctx->screen, blend, eqRGB, srcRGB, dstRGB, 0x7 << (4 * i)); + si_blend_check_commutativity(sctx->screen, blend, eqA, srcA, dstA, 0x8 << (4 * i)); + + /* Blending optimizations for RB+. + * These transformations don't change the behavior. 
+ * + * First, get rid of DST in the blend factors: + * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC) + */ + si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, PIPE_BLENDFACTOR_DST_COLOR, + PIPE_BLENDFACTOR_SRC_COLOR); + si_blend_remove_dst(&eqA, &srcA, &dstA, PIPE_BLENDFACTOR_DST_COLOR, + PIPE_BLENDFACTOR_SRC_COLOR); + si_blend_remove_dst(&eqA, &srcA, &dstA, PIPE_BLENDFACTOR_DST_ALPHA, + PIPE_BLENDFACTOR_SRC_ALPHA); + + /* Look up the ideal settings from tables. */ + srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false); + dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false); + srcA_opt = si_translate_blend_opt_factor(srcA, true); + dstA_opt = si_translate_blend_opt_factor(dstA, true); + + /* Handle interdependencies. */ + if (si_blend_factor_uses_dst(srcRGB)) + dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; + if (si_blend_factor_uses_dst(srcA)) + dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; + + if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE && + (dstRGB == PIPE_BLENDFACTOR_ZERO || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || + dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE)) + dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; + + /* Set the final value. */ + sx_mrt_blend_opt[i] = S_028760_COLOR_SRC_OPT(srcRGB_opt) | + S_028760_COLOR_DST_OPT(dstRGB_opt) | + S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) | + S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) | + S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA)); + + /* Set blend state. */ + blend_cntl |= S_028780_ENABLE(1); + blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB)); + blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB)); + blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB)); + + if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) { + blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1); + blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA)); + blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA)); + blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA)); + } + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + + blend->blend_enable_4bit |= 0xfu << (i * 4); + + if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14) + blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4); + + /* This is only important for formats without alpha. */ + if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || + srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || + dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || + srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA) + blend->need_src_alpha_4bit |= 0xfu << (i * 4); + } + + if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14 && logicop_enable) + blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit; + + if (blend->cb_target_mask) { + color_control |= S_028808_MODE(mode); + } else { + color_control |= S_028808_MODE(V_028808_CB_DISABLE); + } + + if (sctx->screen->info.rbplus_allowed) { + /* Disable RB+ blend optimizations for dual source blending. + * Vulkan does this. 
+ */ + if (blend->dual_src_blend) { + for (int i = 0; i < 8; i++) { + sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | + S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE); + } + } + + for (int i = 0; i < 8; i++) + si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, sx_mrt_blend_opt[i]); + + /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */ + if (blend->dual_src_blend || logicop_enable || mode == V_028808_CB_RESOLVE) + color_control |= S_028808_DISABLE_DUAL_QUAD(1); + } + + si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control); + return blend; } -static void *si_create_blend_state(struct pipe_context *ctx, - const struct pipe_blend_state *state) +static void *si_create_blend_state(struct pipe_context *ctx, const struct pipe_blend_state *state) { - return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL); + return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL); } static void si_bind_blend_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_blend *old_blend = sctx->queued.named.blend; - struct si_state_blend *blend = (struct si_state_blend *)state; - - if (!blend) - blend = (struct si_state_blend *)sctx->noop_blend; - - si_pm4_bind_state(sctx, blend, blend); - - if (old_blend->cb_target_mask != blend->cb_target_mask || - old_blend->dual_src_blend != blend->dual_src_blend || - (old_blend->dcc_msaa_corruption_4bit != blend->dcc_msaa_corruption_4bit && - sctx->framebuffer.nr_samples >= 2 && - sctx->screen->dcc_msaa_allowed)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - - if (old_blend->cb_target_mask != blend->cb_target_mask || - old_blend->alpha_to_coverage != blend->alpha_to_coverage || - old_blend->alpha_to_one != blend->alpha_to_one || - old_blend->dual_src_blend != blend->dual_src_blend || - old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) - sctx->do_update_shaders = true; - - if (sctx->screen->dpbb_allowed && - (old_blend->alpha_to_coverage != blend->alpha_to_coverage || - old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - - if (sctx->screen->has_out_of_order_rast && - ((old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit || - old_blend->commutative_4bit != blend->commutative_4bit || - old_blend->logicop_enable != blend->logicop_enable))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_blend *old_blend = sctx->queued.named.blend; + struct si_state_blend *blend = (struct si_state_blend *)state; + + if (!blend) + blend = (struct si_state_blend *)sctx->noop_blend; + + si_pm4_bind_state(sctx, blend, blend); + + if (old_blend->cb_target_mask != blend->cb_target_mask || + old_blend->dual_src_blend != blend->dual_src_blend || + (old_blend->dcc_msaa_corruption_4bit != blend->dcc_msaa_corruption_4bit && + sctx->framebuffer.nr_samples >= 2 && sctx->screen->dcc_msaa_allowed)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + + if (old_blend->cb_target_mask != blend->cb_target_mask || + old_blend->alpha_to_coverage != blend->alpha_to_coverage || + old_blend->alpha_to_one != blend->alpha_to_one || + old_blend->dual_src_blend != blend->dual_src_blend || + 
old_blend->blend_enable_4bit != blend->blend_enable_4bit || + old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) + sctx->do_update_shaders = true; + + if (sctx->screen->dpbb_allowed && + (old_blend->alpha_to_coverage != blend->alpha_to_coverage || + old_blend->blend_enable_4bit != blend->blend_enable_4bit || + old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + + if (sctx->screen->has_out_of_order_rast && + ((old_blend->blend_enable_4bit != blend->blend_enable_4bit || + old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit || + old_blend->commutative_4bit != blend->commutative_4bit || + old_blend->logicop_enable != blend->logicop_enable))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } static void si_delete_blend_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (sctx->queued.named.blend == state) - si_bind_blend_state(ctx, sctx->noop_blend); + if (sctx->queued.named.blend == state) + si_bind_blend_state(ctx, sctx->noop_blend); - si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state); + si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state); } -static void si_set_blend_color(struct pipe_context *ctx, - const struct pipe_blend_color *state) +static void si_set_blend_color(struct pipe_context *ctx, const struct pipe_blend_color *state) { - struct si_context *sctx = (struct si_context *)ctx; - static const struct pipe_blend_color zeros; + struct si_context *sctx = (struct si_context *)ctx; + static const struct pipe_blend_color zeros; - sctx->blend_color.state = *state; - sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0; - si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color); + sctx->blend_color.state = *state; + sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0; + si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color); } static void si_emit_blend_color(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); - radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4); + radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); + radeon_emit_array(cs, (uint32_t *)sctx->blend_color.state.color, 4); } /* * Clipping */ -static void si_set_clip_state(struct pipe_context *ctx, - const struct pipe_clip_state *state) +static void si_set_clip_state(struct pipe_context *ctx, const struct pipe_clip_state *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_constant_buffer cb; - static const struct pipe_clip_state zeros; - - if (memcmp(&sctx->clip_state.state, state, sizeof(*state)) == 0) - return; - - sctx->clip_state.state = *state; - sctx->clip_state.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0; - si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state); - - cb.buffer = NULL; - cb.user_buffer = state->ucp; - cb.buffer_offset = 0; - cb.buffer_size = 4*4*8; - si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb); - pipe_resource_reference(&cb.buffer, NULL); + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_constant_buffer cb; + static const struct pipe_clip_state zeros; + + if (memcmp(&sctx->clip_state.state, state, sizeof(*state)) == 0) + return; + + sctx->clip_state.state = *state; + sctx->clip_state.any_nonzeros = memcmp(state, &zeros, 
sizeof(*state)) != 0; + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state); + + cb.buffer = NULL; + cb.user_buffer = state->ucp; + cb.buffer_offset = 0; + cb.buffer_size = 4 * 4 * 8; + si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb); + pipe_resource_reference(&cb.buffer, NULL); } static void si_emit_clip_state(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4); - radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4); + radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6 * 4); + radeon_emit_array(cs, (uint32_t *)sctx->clip_state.state.ucp, 6 * 4); } static void si_emit_clip_regs(struct si_context *sctx) { - struct si_shader *vs = si_get_vs_state(sctx); - struct si_shader_selector *vs_sel = vs->selector; - struct si_shader_info *info = &vs_sel->info; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned window_space = - info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; - unsigned clipdist_mask = vs_sel->clipdist_mask; - unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS; - unsigned culldist_mask = vs_sel->culldist_mask; - unsigned total_mask; - - if (vs->key.opt.clip_disable) { - assert(!info->culldist_writemask); - clipdist_mask = 0; - culldist_mask = 0; - } - total_mask = clipdist_mask | culldist_mask; - - /* Clip distances on points have no effect, so need to be implemented - * as cull distances. This applies for the clipvertex case as well. - * - * Setting this for primitives other than points should have no adverse - * effects. - */ - clipdist_mask &= rs->clip_plane_enable; - culldist_mask |= clipdist_mask; - - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) | - S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | - clipdist_mask | (culldist_mask << 8); - - if (sctx->chip_class >= GFX10) { - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, - pa_cl_cntl, - ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); - } else { - radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, - vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl); - } - radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, - SI_TRACKED_PA_CL_CLIP_CNTL, - rs->pa_cl_clip_cntl | - ucp_mask | - S_028810_CLIP_DISABLE(window_space)); - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; + struct si_shader *vs = si_get_vs_state(sctx); + struct si_shader_selector *vs_sel = vs->selector; + struct si_shader_info *info = &vs_sel->info; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + unsigned clipdist_mask = vs_sel->clipdist_mask; + unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS; + unsigned culldist_mask = vs_sel->culldist_mask; + unsigned total_mask; + + if (vs->key.opt.clip_disable) { + assert(!info->culldist_writemask); + clipdist_mask = 0; + culldist_mask = 0; + } + total_mask = clipdist_mask | culldist_mask; + + /* Clip distances on points have no effect, so need to be implemented + * as cull distances. This applies for the clipvertex case as well. + * + * Setting this for primitives other than points should have no adverse + * effects. 
+ */ + clipdist_mask &= rs->clip_plane_enable; + culldist_mask |= clipdist_mask; + + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) | + S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | clipdist_mask | + (culldist_mask << 8); + + if (sctx->chip_class >= GFX10) { + radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, pa_cl_cntl, + ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); + } else { + radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, + vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl); + } + radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, + rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space)); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; } /* @@ -834,28 +777,28 @@ static void si_emit_clip_regs(struct si_context *sctx) */ static void si_update_poly_offset_state(struct si_context *sctx) { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - - if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) { - si_pm4_bind_state(sctx, poly_offset, NULL); - return; - } - - /* Use the user format, not db_render_format, so that the polygon - * offset behaves as expected by applications. - */ - switch (sctx->framebuffer.state.zsbuf->texture->format) { - case PIPE_FORMAT_Z16_UNORM: - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]); - break; - default: /* 24-bit */ - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]); - break; - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]); - break; - } + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) { + si_pm4_bind_state(sctx, poly_offset, NULL); + return; + } + + /* Use the user format, not db_render_format, so that the polygon + * offset behaves as expected by applications. 
+ */ + switch (sctx->framebuffer.state.zsbuf->texture->format) { + case PIPE_FORMAT_Z16_UNORM: + si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]); + break; + default: /* 24-bit */ + si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]); + break; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]); + break; + } } /* @@ -864,245 +807,228 @@ static void si_update_poly_offset_state(struct si_context *sctx) static uint32_t si_translate_fill(uint32_t func) { - switch(func) { - case PIPE_POLYGON_MODE_FILL: - return V_028814_X_DRAW_TRIANGLES; - case PIPE_POLYGON_MODE_LINE: - return V_028814_X_DRAW_LINES; - case PIPE_POLYGON_MODE_POINT: - return V_028814_X_DRAW_POINTS; - default: - assert(0); - return V_028814_X_DRAW_POINTS; - } + switch (func) { + case PIPE_POLYGON_MODE_FILL: + return V_028814_X_DRAW_TRIANGLES; + case PIPE_POLYGON_MODE_LINE: + return V_028814_X_DRAW_LINES; + case PIPE_POLYGON_MODE_POINT: + return V_028814_X_DRAW_POINTS; + default: + assert(0); + return V_028814_X_DRAW_POINTS; + } } -static void *si_create_rs_state(struct pipe_context *ctx, - const struct pipe_rasterizer_state *state) +static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rasterizer_state *state) { - struct si_screen *sscreen = ((struct si_context *)ctx)->screen; - struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer); - struct si_pm4_state *pm4 = &rs->pm4; - unsigned tmp, i; - float psize_min, psize_max; - - if (!rs) { - return NULL; - } - - if (!state->front_ccw) { - rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT); - rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK); - } else { - rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT); - rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK); - } - rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far; - rs->provoking_vertex_first = state->flatshade_first; - rs->scissor_enable = state->scissor; - rs->clip_halfz = state->clip_halfz; - rs->two_side = state->light_twoside; - rs->multisample_enable = state->multisample; - rs->force_persample_interp = state->force_persample_interp; - rs->clip_plane_enable = state->clip_plane_enable; - rs->half_pixel_center = state->half_pixel_center; - rs->line_stipple_enable = state->line_stipple_enable; - rs->poly_stipple_enable = state->poly_stipple_enable; - rs->line_smooth = state->line_smooth; - rs->line_width = state->line_width; - rs->poly_smooth = state->poly_smooth; - rs->uses_poly_offset = state->offset_point || state->offset_line || - state->offset_tri; - rs->clamp_fragment_color = state->clamp_fragment_color; - rs->clamp_vertex_color = state->clamp_vertex_color; - rs->flatshade = state->flatshade; - rs->flatshade_first = state->flatshade_first; - rs->sprite_coord_enable = state->sprite_coord_enable; - rs->rasterizer_discard = state->rasterizer_discard; - rs->polygon_mode_enabled = (state->fill_front != PIPE_POLYGON_MODE_FILL && - !(state->cull_face & PIPE_FACE_FRONT)) || - (state->fill_back != PIPE_POLYGON_MODE_FILL && - !(state->cull_face & PIPE_FACE_BACK)); - rs->polygon_mode_is_lines = (state->fill_front == PIPE_POLYGON_MODE_LINE && - !(state->cull_face & PIPE_FACE_FRONT)) || - (state->fill_back == PIPE_POLYGON_MODE_LINE && - !(state->cull_face & PIPE_FACE_BACK)); - rs->pa_sc_line_stipple = state->line_stipple_enable ? 
- S_028A0C_LINE_PATTERN(state->line_stipple_pattern) | - S_028A0C_REPEAT_COUNT(state->line_stipple_factor) : 0; - rs->pa_cl_clip_cntl = - S_028810_DX_CLIP_SPACE_DEF(state->clip_halfz) | - S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip_near) | - S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip_far) | - S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) | - S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); - - si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0, - S_0286D4_FLAT_SHADE_ENA(1) | - S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) | - S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | - S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | - S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | - S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) | - S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT)); - - /* point size 12.4 fixed point */ - tmp = (unsigned)(state->point_size * 8.0); - si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp)); - - if (state->point_size_per_vertex) { - psize_min = util_get_min_point_size(state); - psize_max = SI_MAX_POINT_SIZE; - } else { - /* Force the point size to be as if the vertex output was disabled. */ - psize_min = state->point_size; - psize_max = state->point_size; - } - rs->max_point_size = psize_max; - - /* Divide by two, because 0.5 = 1 pixel. */ - si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX, - S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min/2)) | - S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max/2))); - - si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL, - S_028A08_WIDTH(si_pack_float_12p4(state->line_width/2))); - si_pm4_set_reg(pm4, R_028A48_PA_SC_MODE_CNTL_0, - S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) | - S_028A48_MSAA_ENABLE(state->multisample || - state->poly_smooth || - state->line_smooth) | - S_028A48_VPORT_SCISSOR_ENABLE(1) | - S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9)); - - si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); - si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL, - S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) | - S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | - S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | - S_028814_FACE(!state->front_ccw) | - S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | - S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | - S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | - S_028814_POLY_MODE(rs->polygon_mode_enabled) | - S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | - S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back))); - - if (!rs->uses_poly_offset) - return rs; - - rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state)); - if (!rs->pm4_poly_offset) { - FREE(rs); - return NULL; - } - - /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. 
*/ - for (i = 0; i < 3; i++) { - struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i]; - float offset_units = state->offset_units; - float offset_scale = state->offset_scale * 16.0f; - uint32_t pa_su_poly_offset_db_fmt_cntl = 0; - - if (!state->offset_units_unscaled) { - switch (i) { - case 0: /* 16-bit zbuffer */ - offset_units *= 4.0f; - pa_su_poly_offset_db_fmt_cntl = - S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16); - break; - case 1: /* 24-bit zbuffer */ - offset_units *= 2.0f; - pa_su_poly_offset_db_fmt_cntl = - S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24); - break; - case 2: /* 32-bit zbuffer */ - offset_units *= 1.0f; - pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | - S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1); - break; - } - } - - si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, - fui(offset_scale)); - si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, - fui(offset_units)); - si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, - fui(offset_scale)); - si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, - fui(offset_units)); - si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, - pa_su_poly_offset_db_fmt_cntl); - } - - return rs; + struct si_screen *sscreen = ((struct si_context *)ctx)->screen; + struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer); + struct si_pm4_state *pm4 = &rs->pm4; + unsigned tmp, i; + float psize_min, psize_max; + + if (!rs) { + return NULL; + } + + if (!state->front_ccw) { + rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT); + rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK); + } else { + rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT); + rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK); + } + rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far; + rs->provoking_vertex_first = state->flatshade_first; + rs->scissor_enable = state->scissor; + rs->clip_halfz = state->clip_halfz; + rs->two_side = state->light_twoside; + rs->multisample_enable = state->multisample; + rs->force_persample_interp = state->force_persample_interp; + rs->clip_plane_enable = state->clip_plane_enable; + rs->half_pixel_center = state->half_pixel_center; + rs->line_stipple_enable = state->line_stipple_enable; + rs->poly_stipple_enable = state->poly_stipple_enable; + rs->line_smooth = state->line_smooth; + rs->line_width = state->line_width; + rs->poly_smooth = state->poly_smooth; + rs->uses_poly_offset = state->offset_point || state->offset_line || state->offset_tri; + rs->clamp_fragment_color = state->clamp_fragment_color; + rs->clamp_vertex_color = state->clamp_vertex_color; + rs->flatshade = state->flatshade; + rs->flatshade_first = state->flatshade_first; + rs->sprite_coord_enable = state->sprite_coord_enable; + rs->rasterizer_discard = state->rasterizer_discard; + rs->polygon_mode_enabled = + (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) || + (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK)); + rs->polygon_mode_is_lines = + (state->fill_front == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_FRONT)) || + (state->fill_back == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_BACK)); + rs->pa_sc_line_stipple = state->line_stipple_enable + ? 
S_028A0C_LINE_PATTERN(state->line_stipple_pattern) | + S_028A0C_REPEAT_COUNT(state->line_stipple_factor) + : 0; + rs->pa_cl_clip_cntl = S_028810_DX_CLIP_SPACE_DEF(state->clip_halfz) | + S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip_near) | + S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip_far) | + S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) | + S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); + + si_pm4_set_reg( + pm4, R_0286D4_SPI_INTERP_CONTROL_0, + S_0286D4_FLAT_SHADE_ENA(1) | S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) | + S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | + S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | + S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | + S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) | + S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT)); + + /* point size 12.4 fixed point */ + tmp = (unsigned)(state->point_size * 8.0); + si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp)); + + if (state->point_size_per_vertex) { + psize_min = util_get_min_point_size(state); + psize_max = SI_MAX_POINT_SIZE; + } else { + /* Force the point size to be as if the vertex output was disabled. */ + psize_min = state->point_size; + psize_max = state->point_size; + } + rs->max_point_size = psize_max; + + /* Divide by two, because 0.5 = 1 pixel. */ + si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX, + S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min / 2)) | + S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max / 2))); + + si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL, + S_028A08_WIDTH(si_pack_float_12p4(state->line_width / 2))); + si_pm4_set_reg( + pm4, R_028A48_PA_SC_MODE_CNTL_0, + S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) | + S_028A48_MSAA_ENABLE(state->multisample || state->poly_smooth || state->line_smooth) | + S_028A48_VPORT_SCISSOR_ENABLE(1) | + S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9)); + + si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); + si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL, + S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) | + S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | + S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | + S_028814_FACE(!state->front_ccw) | + S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | + S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | + S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | + S_028814_POLY_MODE(rs->polygon_mode_enabled) | + S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | + S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back))); + + if (!rs->uses_poly_offset) + return rs; + + rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state)); + if (!rs->pm4_poly_offset) { + FREE(rs); + return NULL; + } + + /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. 
*/ + for (i = 0; i < 3; i++) { + struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i]; + float offset_units = state->offset_units; + float offset_scale = state->offset_scale * 16.0f; + uint32_t pa_su_poly_offset_db_fmt_cntl = 0; + + if (!state->offset_units_unscaled) { + switch (i) { + case 0: /* 16-bit zbuffer */ + offset_units *= 4.0f; + pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16); + break; + case 1: /* 24-bit zbuffer */ + offset_units *= 2.0f; + pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24); + break; + case 2: /* 32-bit zbuffer */ + offset_units *= 1.0f; + pa_su_poly_offset_db_fmt_cntl = + S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1); + break; + } + } + + si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, fui(offset_scale)); + si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units)); + si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale)); + si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units)); + si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl); + } + + return rs; } static void si_bind_rs_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_rasterizer *old_rs = - (struct si_state_rasterizer*)sctx->queued.named.rasterizer; - struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; - - if (!rs) - rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state; - - if (old_rs->multisample_enable != rs->multisample_enable) { - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - - /* Update the small primitive filter workaround if necessary. */ - if (sctx->screen->info.has_msaa_sample_loc_bug && - sctx->framebuffer.nr_samples > 1) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); - } - - sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR; - sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color); - - si_pm4_bind_state(sctx, rasterizer, rs); - si_update_poly_offset_state(sctx); - - if (old_rs->scissor_enable != rs->scissor_enable) - si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors); - - if (old_rs->line_width != rs->line_width || - old_rs->max_point_size != rs->max_point_size || - old_rs->half_pixel_center != rs->half_pixel_center) - si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); - - if (old_rs->clip_halfz != rs->clip_halfz) - si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports); - - if (old_rs->clip_plane_enable != rs->clip_plane_enable || - old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl) - si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); - - if (old_rs->clip_plane_enable != rs->clip_plane_enable || - old_rs->rasterizer_discard != rs->rasterizer_discard || - old_rs->sprite_coord_enable != rs->sprite_coord_enable || - old_rs->flatshade != rs->flatshade || - old_rs->two_side != rs->two_side || - old_rs->multisample_enable != rs->multisample_enable || - old_rs->poly_stipple_enable != rs->poly_stipple_enable || - old_rs->poly_smooth != rs->poly_smooth || - old_rs->line_smooth != rs->line_smooth || - old_rs->clamp_fragment_color != rs->clamp_fragment_color || - old_rs->force_persample_interp != rs->force_persample_interp) - sctx->do_update_shaders = true; + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_rasterizer *old_rs = (struct si_state_rasterizer *)sctx->queued.named.rasterizer; + struct si_state_rasterizer *rs = (struct 
si_state_rasterizer *)state; + + if (!rs) + rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state; + + if (old_rs->multisample_enable != rs->multisample_enable) { + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + + /* Update the small primitive filter workaround if necessary. */ + if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + } + + sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR; + sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color); + + si_pm4_bind_state(sctx, rasterizer, rs); + si_update_poly_offset_state(sctx); + + if (old_rs->scissor_enable != rs->scissor_enable) + si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors); + + if (old_rs->line_width != rs->line_width || old_rs->max_point_size != rs->max_point_size || + old_rs->half_pixel_center != rs->half_pixel_center) + si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); + + if (old_rs->clip_halfz != rs->clip_halfz) + si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports); + + if (old_rs->clip_plane_enable != rs->clip_plane_enable || + old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl) + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + + if (old_rs->clip_plane_enable != rs->clip_plane_enable || + old_rs->rasterizer_discard != rs->rasterizer_discard || + old_rs->sprite_coord_enable != rs->sprite_coord_enable || + old_rs->flatshade != rs->flatshade || old_rs->two_side != rs->two_side || + old_rs->multisample_enable != rs->multisample_enable || + old_rs->poly_stipple_enable != rs->poly_stipple_enable || + old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth || + old_rs->clamp_fragment_color != rs->clamp_fragment_color || + old_rs->force_persample_interp != rs->force_persample_interp) + sctx->do_update_shaders = true; } static void si_delete_rs_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; - if (sctx->queued.named.rasterizer == state) - si_bind_rs_state(ctx, sctx->discard_rasterizer_state); + if (sctx->queued.named.rasterizer == state) + si_bind_rs_state(ctx, sctx->discard_rasterizer_state); - FREE(rs->pm4_poly_offset); - si_pm4_delete_state(sctx, rasterizer, rs); + FREE(rs->pm4_poly_offset); + si_pm4_delete_state(sctx, rasterizer, rs); } /* @@ -1110,81 +1036,75 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state) */ static void si_emit_stencil_ref(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct pipe_stencil_ref *ref = &sctx->stencil_ref.state; - struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; - - radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2); - radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) | - S_028430_STENCILMASK(dsa->valuemask[0]) | - S_028430_STENCILWRITEMASK(dsa->writemask[0]) | - S_028430_STENCILOPVAL(1)); - radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | - S_028434_STENCILMASK_BF(dsa->valuemask[1]) | - S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | - S_028434_STENCILOPVAL_BF(1)); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct pipe_stencil_ref *ref = &sctx->stencil_ref.state; + struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; + + radeon_set_context_reg_seq(cs, 
R_028430_DB_STENCILREFMASK, 2); + radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) | + S_028430_STENCILMASK(dsa->valuemask[0]) | + S_028430_STENCILWRITEMASK(dsa->writemask[0]) | S_028430_STENCILOPVAL(1)); + radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | + S_028434_STENCILMASK_BF(dsa->valuemask[1]) | + S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | + S_028434_STENCILOPVAL_BF(1)); } -static void si_set_stencil_ref(struct pipe_context *ctx, - const struct pipe_stencil_ref *state) +static void si_set_stencil_ref(struct pipe_context *ctx, const struct pipe_stencil_ref *state) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (memcmp(&sctx->stencil_ref.state, state, sizeof(*state)) == 0) - return; + if (memcmp(&sctx->stencil_ref.state, state, sizeof(*state)) == 0) + return; - sctx->stencil_ref.state = *state; - si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); + sctx->stencil_ref.state = *state; + si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); } - /* * DSA */ static uint32_t si_translate_stencil_op(int s_op) { - switch (s_op) { - case PIPE_STENCIL_OP_KEEP: - return V_02842C_STENCIL_KEEP; - case PIPE_STENCIL_OP_ZERO: - return V_02842C_STENCIL_ZERO; - case PIPE_STENCIL_OP_REPLACE: - return V_02842C_STENCIL_REPLACE_TEST; - case PIPE_STENCIL_OP_INCR: - return V_02842C_STENCIL_ADD_CLAMP; - case PIPE_STENCIL_OP_DECR: - return V_02842C_STENCIL_SUB_CLAMP; - case PIPE_STENCIL_OP_INCR_WRAP: - return V_02842C_STENCIL_ADD_WRAP; - case PIPE_STENCIL_OP_DECR_WRAP: - return V_02842C_STENCIL_SUB_WRAP; - case PIPE_STENCIL_OP_INVERT: - return V_02842C_STENCIL_INVERT; - default: - PRINT_ERR("Unknown stencil op %d", s_op); - assert(0); - break; - } - return 0; + switch (s_op) { + case PIPE_STENCIL_OP_KEEP: + return V_02842C_STENCIL_KEEP; + case PIPE_STENCIL_OP_ZERO: + return V_02842C_STENCIL_ZERO; + case PIPE_STENCIL_OP_REPLACE: + return V_02842C_STENCIL_REPLACE_TEST; + case PIPE_STENCIL_OP_INCR: + return V_02842C_STENCIL_ADD_CLAMP; + case PIPE_STENCIL_OP_DECR: + return V_02842C_STENCIL_SUB_CLAMP; + case PIPE_STENCIL_OP_INCR_WRAP: + return V_02842C_STENCIL_ADD_WRAP; + case PIPE_STENCIL_OP_DECR_WRAP: + return V_02842C_STENCIL_SUB_WRAP; + case PIPE_STENCIL_OP_INVERT: + return V_02842C_STENCIL_INVERT; + default: + PRINT_ERR("Unknown stencil op %d", s_op); + assert(0); + break; + } + return 0; } static bool si_dsa_writes_stencil(const struct pipe_stencil_state *s) { - return s->enabled && s->writemask && - (s->fail_op != PIPE_STENCIL_OP_KEEP || - s->zfail_op != PIPE_STENCIL_OP_KEEP || - s->zpass_op != PIPE_STENCIL_OP_KEEP); + return s->enabled && s->writemask && + (s->fail_op != PIPE_STENCIL_OP_KEEP || s->zfail_op != PIPE_STENCIL_OP_KEEP || + s->zpass_op != PIPE_STENCIL_OP_KEEP); } static bool si_order_invariant_stencil_op(enum pipe_stencil_op op) { - /* REPLACE is normally order invariant, except when the stencil - * reference value is written by the fragment shader. Tracking this - * interaction does not seem worth the effort, so be conservative. */ - return op != PIPE_STENCIL_OP_INCR && - op != PIPE_STENCIL_OP_DECR && - op != PIPE_STENCIL_OP_REPLACE; + /* REPLACE is normally order invariant, except when the stencil + * reference value is written by the fragment shader. Tracking this + * interaction does not seem worth the effort, so be conservative. 
*/ + return op != PIPE_STENCIL_OP_INCR && op != PIPE_STENCIL_OP_DECR && op != PIPE_STENCIL_OP_REPLACE; } /* Compute whether, assuming Z writes are disabled, this stencil state is order @@ -1192,325 +1112,304 @@ static bool si_order_invariant_stencil_op(enum pipe_stencil_op op) * final stencil buffer result does not depend on the order of fragments. */ static bool si_order_invariant_stencil_state(const struct pipe_stencil_state *state) { - return !state->enabled || !state->writemask || - /* The following assumes that Z writes are disabled. */ - (state->func == PIPE_FUNC_ALWAYS && - si_order_invariant_stencil_op(state->zpass_op) && - si_order_invariant_stencil_op(state->zfail_op)) || - (state->func == PIPE_FUNC_NEVER && - si_order_invariant_stencil_op(state->fail_op)); + return !state->enabled || !state->writemask || + /* The following assumes that Z writes are disabled. */ + (state->func == PIPE_FUNC_ALWAYS && si_order_invariant_stencil_op(state->zpass_op) && + si_order_invariant_stencil_op(state->zfail_op)) || + (state->func == PIPE_FUNC_NEVER && si_order_invariant_stencil_op(state->fail_op)); } static void *si_create_dsa_state(struct pipe_context *ctx, - const struct pipe_depth_stencil_alpha_state *state) + const struct pipe_depth_stencil_alpha_state *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa); - struct si_pm4_state *pm4 = &dsa->pm4; - unsigned db_depth_control; - uint32_t db_stencil_control = 0; - - if (!dsa) { - return NULL; - } - - dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask; - dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask; - dsa->stencil_ref.writemask[0] = state->stencil[0].writemask; - dsa->stencil_ref.writemask[1] = state->stencil[1].writemask; - - db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) | - S_028800_Z_WRITE_ENABLE(state->depth.writemask) | - S_028800_ZFUNC(state->depth.func) | - S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test); - - /* stencil */ - if (state->stencil[0].enabled) { - db_depth_control |= S_028800_STENCIL_ENABLE(1); - db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func); - db_stencil_control |= S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op)); - db_stencil_control |= S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op)); - db_stencil_control |= S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op)); - - if (state->stencil[1].enabled) { - db_depth_control |= S_028800_BACKFACE_ENABLE(1); - db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func); - db_stencil_control |= S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op)); - db_stencil_control |= S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op)); - db_stencil_control |= S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op)); - } - } - - /* alpha */ - if (state->alpha.enabled) { - dsa->alpha_func = state->alpha.func; - - si_pm4_set_reg(pm4, R_00B030_SPI_SHADER_USER_DATA_PS_0 + - SI_SGPR_ALPHA_REF * 4, fui(state->alpha.ref_value)); - } else { - dsa->alpha_func = PIPE_FUNC_ALWAYS; - } - - si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control); - if (state->stencil[0].enabled) - si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control); - if (state->depth.bounds_test) { - si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min)); - si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, 
fui(state->depth.bounds_max)); - } - - dsa->depth_enabled = state->depth.enabled; - dsa->depth_write_enabled = state->depth.enabled && - state->depth.writemask; - dsa->stencil_enabled = state->stencil[0].enabled; - dsa->stencil_write_enabled = state->stencil[0].enabled && - (si_dsa_writes_stencil(&state->stencil[0]) || - si_dsa_writes_stencil(&state->stencil[1])); - dsa->db_can_write = dsa->depth_write_enabled || - dsa->stencil_write_enabled; - - bool zfunc_is_ordered = - state->depth.func == PIPE_FUNC_NEVER || - state->depth.func == PIPE_FUNC_LESS || - state->depth.func == PIPE_FUNC_LEQUAL || - state->depth.func == PIPE_FUNC_GREATER || - state->depth.func == PIPE_FUNC_GEQUAL; - - bool nozwrite_and_order_invariant_stencil = - !dsa->db_can_write || - (!dsa->depth_write_enabled && - si_order_invariant_stencil_state(&state->stencil[0]) && - si_order_invariant_stencil_state(&state->stencil[1])); - - dsa->order_invariance[1].zs = - nozwrite_and_order_invariant_stencil || - (!dsa->stencil_write_enabled && zfunc_is_ordered); - dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered; - - dsa->order_invariance[1].pass_set = - nozwrite_and_order_invariant_stencil || - (!dsa->stencil_write_enabled && - (state->depth.func == PIPE_FUNC_ALWAYS || - state->depth.func == PIPE_FUNC_NEVER)); - dsa->order_invariance[0].pass_set = - !dsa->depth_write_enabled || - (state->depth.func == PIPE_FUNC_ALWAYS || - state->depth.func == PIPE_FUNC_NEVER); - - dsa->order_invariance[1].pass_last = - sctx->screen->assume_no_z_fights && - !dsa->stencil_write_enabled && - dsa->depth_write_enabled && zfunc_is_ordered; - dsa->order_invariance[0].pass_last = - sctx->screen->assume_no_z_fights && - dsa->depth_write_enabled && zfunc_is_ordered; - - return dsa; + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa); + struct si_pm4_state *pm4 = &dsa->pm4; + unsigned db_depth_control; + uint32_t db_stencil_control = 0; + + if (!dsa) { + return NULL; + } + + dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask; + dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask; + dsa->stencil_ref.writemask[0] = state->stencil[0].writemask; + dsa->stencil_ref.writemask[1] = state->stencil[1].writemask; + + db_depth_control = + S_028800_Z_ENABLE(state->depth.enabled) | S_028800_Z_WRITE_ENABLE(state->depth.writemask) | + S_028800_ZFUNC(state->depth.func) | S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test); + + /* stencil */ + if (state->stencil[0].enabled) { + db_depth_control |= S_028800_STENCIL_ENABLE(1); + db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func); + db_stencil_control |= + S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op)); + db_stencil_control |= + S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op)); + db_stencil_control |= + S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op)); + + if (state->stencil[1].enabled) { + db_depth_control |= S_028800_BACKFACE_ENABLE(1); + db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func); + db_stencil_control |= + S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op)); + db_stencil_control |= + S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op)); + db_stencil_control |= + S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op)); + } + } + + /* alpha */ + if (state->alpha.enabled) { + dsa->alpha_func = state->alpha.func; + + si_pm4_set_reg(pm4, 
R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4, + fui(state->alpha.ref_value)); + } else { + dsa->alpha_func = PIPE_FUNC_ALWAYS; + } + + si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control); + if (state->stencil[0].enabled) + si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control); + if (state->depth.bounds_test) { + si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min)); + si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max)); + } + + dsa->depth_enabled = state->depth.enabled; + dsa->depth_write_enabled = state->depth.enabled && state->depth.writemask; + dsa->stencil_enabled = state->stencil[0].enabled; + dsa->stencil_write_enabled = + state->stencil[0].enabled && + (si_dsa_writes_stencil(&state->stencil[0]) || si_dsa_writes_stencil(&state->stencil[1])); + dsa->db_can_write = dsa->depth_write_enabled || dsa->stencil_write_enabled; + + bool zfunc_is_ordered = + state->depth.func == PIPE_FUNC_NEVER || state->depth.func == PIPE_FUNC_LESS || + state->depth.func == PIPE_FUNC_LEQUAL || state->depth.func == PIPE_FUNC_GREATER || + state->depth.func == PIPE_FUNC_GEQUAL; + + bool nozwrite_and_order_invariant_stencil = + !dsa->db_can_write || + (!dsa->depth_write_enabled && si_order_invariant_stencil_state(&state->stencil[0]) && + si_order_invariant_stencil_state(&state->stencil[1])); + + dsa->order_invariance[1].zs = + nozwrite_and_order_invariant_stencil || (!dsa->stencil_write_enabled && zfunc_is_ordered); + dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered; + + dsa->order_invariance[1].pass_set = + nozwrite_and_order_invariant_stencil || + (!dsa->stencil_write_enabled && + (state->depth.func == PIPE_FUNC_ALWAYS || state->depth.func == PIPE_FUNC_NEVER)); + dsa->order_invariance[0].pass_set = + !dsa->depth_write_enabled || + (state->depth.func == PIPE_FUNC_ALWAYS || state->depth.func == PIPE_FUNC_NEVER); + + dsa->order_invariance[1].pass_last = sctx->screen->assume_no_z_fights && + !dsa->stencil_write_enabled && dsa->depth_write_enabled && + zfunc_is_ordered; + dsa->order_invariance[0].pass_last = + sctx->screen->assume_no_z_fights && dsa->depth_write_enabled && zfunc_is_ordered; + + return dsa; } static void si_bind_dsa_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_dsa *old_dsa = sctx->queued.named.dsa; - struct si_state_dsa *dsa = state; - - if (!dsa) - dsa = (struct si_state_dsa *)sctx->noop_dsa; - - si_pm4_bind_state(sctx, dsa, dsa); - - if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part, - sizeof(struct si_dsa_stencil_ref_part)) != 0) { - sctx->stencil_ref.dsa_part = dsa->stencil_ref; - si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); - } - - if (old_dsa->alpha_func != dsa->alpha_func) - sctx->do_update_shaders = true; - - if (sctx->screen->dpbb_allowed && - ((old_dsa->depth_enabled != dsa->depth_enabled || - old_dsa->stencil_enabled != dsa->stencil_enabled || - old_dsa->db_can_write != dsa->db_can_write))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - - if (sctx->screen->has_out_of_order_rast && - (memcmp(old_dsa->order_invariance, dsa->order_invariance, - sizeof(old_dsa->order_invariance)))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_dsa *old_dsa = sctx->queued.named.dsa; + struct si_state_dsa *dsa = state; + + if (!dsa) + dsa = (struct si_state_dsa *)sctx->noop_dsa; + + si_pm4_bind_state(sctx, dsa, 
dsa); + + if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part, + sizeof(struct si_dsa_stencil_ref_part)) != 0) { + sctx->stencil_ref.dsa_part = dsa->stencil_ref; + si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); + } + + if (old_dsa->alpha_func != dsa->alpha_func) + sctx->do_update_shaders = true; + + if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled || + old_dsa->stencil_enabled != dsa->stencil_enabled || + old_dsa->db_can_write != dsa->db_can_write))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + + if (sctx->screen->has_out_of_order_rast && + (memcmp(old_dsa->order_invariance, dsa->order_invariance, + sizeof(old_dsa->order_invariance)))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } static void si_delete_dsa_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (sctx->queued.named.dsa == state) - si_bind_dsa_state(ctx, sctx->noop_dsa); + if (sctx->queued.named.dsa == state) + si_bind_dsa_state(ctx, sctx->noop_dsa); - si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state); + si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state); } static void *si_create_db_flush_dsa(struct si_context *sctx) { - struct pipe_depth_stencil_alpha_state dsa = {}; + struct pipe_depth_stencil_alpha_state dsa = {}; - return sctx->b.create_depth_stencil_alpha_state(&sctx->b, &dsa); + return sctx->b.create_depth_stencil_alpha_state(&sctx->b, &dsa); } /* DB RENDER STATE */ static void si_set_active_query_state(struct pipe_context *ctx, bool enable) { - struct si_context *sctx = (struct si_context*)ctx; - - /* Pipeline stat & streamout queries. */ - if (enable) { - sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; - sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; - } else { - sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; - sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; - } - - /* Occlusion queries. */ - if (sctx->occlusion_queries_disabled != !enable) { - sctx->occlusion_queries_disabled = !enable; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } + struct si_context *sctx = (struct si_context *)ctx; + + /* Pipeline stat & streamout queries. */ + if (enable) { + sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; + sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; + } else { + sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; + sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; + } + + /* Occlusion queries. 
*/ + if (sctx->occlusion_queries_disabled != !enable) { + sctx->occlusion_queries_disabled = !enable; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } } -void si_set_occlusion_query_state(struct si_context *sctx, - bool old_perfect_enable) +void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enable) { - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - bool perfect_enable = sctx->num_perfect_occlusion_queries != 0; + bool perfect_enable = sctx->num_perfect_occlusion_queries != 0; - if (perfect_enable != old_perfect_enable) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + if (perfect_enable != old_perfect_enable) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st) { - st->saved_compute = sctx->cs_shader_state.program; + st->saved_compute = sctx->cs_shader_state.program; - si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); - si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo); + si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); + si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo); - st->saved_ssbo_writable_mask = 0; + st->saved_ssbo_writable_mask = 0; - for (unsigned i = 0; i < 3; i++) { - if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & - (1u << si_get_shaderbuf_slot(i))) - st->saved_ssbo_writable_mask |= 1 << i; - } + for (unsigned i = 0; i < 3; i++) { + if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & + (1u << si_get_shaderbuf_slot(i))) + st->saved_ssbo_writable_mask |= 1 << i; + } } void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st) { - sctx->b.bind_compute_state(&sctx->b, st->saved_compute); + sctx->b.bind_compute_state(&sctx->b, st->saved_compute); - sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); - pipe_resource_reference(&st->saved_const0.buffer, NULL); + sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); + pipe_resource_reference(&st->saved_const0.buffer, NULL); - sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo, - st->saved_ssbo_writable_mask); - for (unsigned i = 0; i < 3; ++i) - pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL); + sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo, + st->saved_ssbo_writable_mask); + for (unsigned i = 0; i < 3; ++i) + pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL); } static void si_emit_db_render_state(struct si_context *sctx) { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned db_shader_control, db_render_control, db_count_control; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - - /* DB_RENDER_CONTROL */ - if (sctx->dbcb_depth_copy_enabled || - sctx->dbcb_stencil_copy_enabled) { - db_render_control = - S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) | - S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) | - S_028000_COPY_CENTROID(1) | - S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample); - } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) { - db_render_control = - S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) | - S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace); - } else { - db_render_control = - S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) | - 
S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear); - } - - /* DB_COUNT_CONTROL (occlusion queries) */ - if (sctx->num_occlusion_queries > 0 && - !sctx->occlusion_queries_disabled) { - bool perfect = sctx->num_perfect_occlusion_queries > 0; - bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect; - - if (sctx->chip_class >= GFX7) { - unsigned log_sample_rate = sctx->framebuffer.log_samples; - - /* Stoney doesn't increment occlusion query counters - * if the sample rate is 16x. Use 8x sample rate instead. - */ - if (sctx->family == CHIP_STONEY) - log_sample_rate = MIN2(log_sample_rate, 3); - - db_count_control = - S_028004_PERFECT_ZPASS_COUNTS(perfect) | - S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | - S_028004_SAMPLE_RATE(log_sample_rate) | - S_028004_ZPASS_ENABLE(1) | - S_028004_SLICE_EVEN_ENABLE(1) | - S_028004_SLICE_ODD_ENABLE(1); - } else { - db_count_control = - S_028004_PERFECT_ZPASS_COUNTS(perfect) | - S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples); - } - } else { - /* Disable occlusion queries. */ - if (sctx->chip_class >= GFX7) { - db_count_control = 0; - } else { - db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1); - } - } - - radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, - SI_TRACKED_DB_RENDER_CONTROL, db_render_control, - db_count_control); - - /* DB_RENDER_OVERRIDE2 */ - radeon_opt_set_context_reg(sctx, R_028010_DB_RENDER_OVERRIDE2, - SI_TRACKED_DB_RENDER_OVERRIDE2, - S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) | - S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) | - S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4)); - - db_shader_control = sctx->ps_db_shader_control; - - /* Bug workaround for smoothing (overrasterization) on GFX6. */ - if (sctx->chip_class == GFX6 && sctx->smoothing_enabled) { - db_shader_control &= C_02880C_Z_ORDER; - db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z); - } - - /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. 
*/ - if (!rs->multisample_enable) - db_shader_control &= C_02880C_MASK_EXPORT_ENABLE; - - if (sctx->screen->info.has_rbplus && - !sctx->screen->info.rbplus_allowed) - db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1); - - radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, - SI_TRACKED_DB_SHADER_CONTROL, db_shader_control); - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + unsigned db_shader_control, db_render_control, db_count_control; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + + /* DB_RENDER_CONTROL */ + if (sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled) { + db_render_control = S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) | + S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) | + S_028000_COPY_CENTROID(1) | S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample); + } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) { + db_render_control = S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) | + S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace); + } else { + db_render_control = S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) | + S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear); + } + + /* DB_COUNT_CONTROL (occlusion queries) */ + if (sctx->num_occlusion_queries > 0 && !sctx->occlusion_queries_disabled) { + bool perfect = sctx->num_perfect_occlusion_queries > 0; + bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect; + + if (sctx->chip_class >= GFX7) { + unsigned log_sample_rate = sctx->framebuffer.log_samples; + + /* Stoney doesn't increment occlusion query counters + * if the sample rate is 16x. Use 8x sample rate instead. + */ + if (sctx->family == CHIP_STONEY) + log_sample_rate = MIN2(log_sample_rate, 3); + + db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) | + S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | + S_028004_SAMPLE_RATE(log_sample_rate) | S_028004_ZPASS_ENABLE(1) | + S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1); + } else { + db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) | + S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples); + } + } else { + /* Disable occlusion queries. */ + if (sctx->chip_class >= GFX7) { + db_count_control = 0; + } else { + db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1); + } + } + + radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL, + db_render_control, db_count_control); + + /* DB_RENDER_OVERRIDE2 */ + radeon_opt_set_context_reg( + sctx, R_028010_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_RENDER_OVERRIDE2, + S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) | + S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) | + S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4)); + + db_shader_control = sctx->ps_db_shader_control; + + /* Bug workaround for smoothing (overrasterization) on GFX6. */ + if (sctx->chip_class == GFX6 && sctx->smoothing_enabled) { + db_shader_control &= C_02880C_Z_ORDER; + db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z); + } + + /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. 
*/ + if (!rs->multisample_enable) + db_shader_control &= C_02880C_MASK_EXPORT_ENABLE; + + if (sctx->screen->info.has_rbplus && !sctx->screen->info.rbplus_allowed) + db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1); + + radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL, + db_shader_control); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; } /* @@ -1518,514 +1417,500 @@ static void si_emit_db_render_state(struct si_context *sctx) */ static uint32_t si_translate_colorformat(enum pipe_format format) { - const struct util_format_description *desc = util_format_description(format); - if (!desc) - return V_028C70_COLOR_INVALID; - -#define HAS_SIZE(x,y,z,w) \ - (desc->channel[0].size == (x) && desc->channel[1].size == (y) && \ - desc->channel[2].size == (z) && desc->channel[3].size == (w)) - - if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ - return V_028C70_COLOR_10_11_11; - - if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) - return V_028C70_COLOR_INVALID; - - /* hw cannot support mixed formats (except depth/stencil, since - * stencil is not written to). */ - if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) - return V_028C70_COLOR_INVALID; - - switch (desc->nr_channels) { - case 1: - switch (desc->channel[0].size) { - case 8: - return V_028C70_COLOR_8; - case 16: - return V_028C70_COLOR_16; - case 32: - return V_028C70_COLOR_32; - } - break; - case 2: - if (desc->channel[0].size == desc->channel[1].size) { - switch (desc->channel[0].size) { - case 8: - return V_028C70_COLOR_8_8; - case 16: - return V_028C70_COLOR_16_16; - case 32: - return V_028C70_COLOR_32_32; - } - } else if (HAS_SIZE(8,24,0,0)) { - return V_028C70_COLOR_24_8; - } else if (HAS_SIZE(24,8,0,0)) { - return V_028C70_COLOR_8_24; - } - break; - case 3: - if (HAS_SIZE(5,6,5,0)) { - return V_028C70_COLOR_5_6_5; - } else if (HAS_SIZE(32,8,24,0)) { - return V_028C70_COLOR_X24_8_32_FLOAT; - } - break; - case 4: - if (desc->channel[0].size == desc->channel[1].size && - desc->channel[0].size == desc->channel[2].size && - desc->channel[0].size == desc->channel[3].size) { - switch (desc->channel[0].size) { - case 4: - return V_028C70_COLOR_4_4_4_4; - case 8: - return V_028C70_COLOR_8_8_8_8; - case 16: - return V_028C70_COLOR_16_16_16_16; - case 32: - return V_028C70_COLOR_32_32_32_32; - } - } else if (HAS_SIZE(5,5,5,1)) { - return V_028C70_COLOR_1_5_5_5; - } else if (HAS_SIZE(1,5,5,5)) { - return V_028C70_COLOR_5_5_5_1; - } else if (HAS_SIZE(10,10,10,2)) { - return V_028C70_COLOR_2_10_10_10; - } - break; - } - return V_028C70_COLOR_INVALID; + const struct util_format_description *desc = util_format_description(format); + if (!desc) + return V_028C70_COLOR_INVALID; + +#define HAS_SIZE(x, y, z, w) \ + (desc->channel[0].size == (x) && desc->channel[1].size == (y) && \ + desc->channel[2].size == (z) && desc->channel[3].size == (w)) + + if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ + return V_028C70_COLOR_10_11_11; + + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) + return V_028C70_COLOR_INVALID; + + /* hw cannot support mixed formats (except depth/stencil, since + * stencil is not written to). 
*/ + if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) + return V_028C70_COLOR_INVALID; + + switch (desc->nr_channels) { + case 1: + switch (desc->channel[0].size) { + case 8: + return V_028C70_COLOR_8; + case 16: + return V_028C70_COLOR_16; + case 32: + return V_028C70_COLOR_32; + } + break; + case 2: + if (desc->channel[0].size == desc->channel[1].size) { + switch (desc->channel[0].size) { + case 8: + return V_028C70_COLOR_8_8; + case 16: + return V_028C70_COLOR_16_16; + case 32: + return V_028C70_COLOR_32_32; + } + } else if (HAS_SIZE(8, 24, 0, 0)) { + return V_028C70_COLOR_24_8; + } else if (HAS_SIZE(24, 8, 0, 0)) { + return V_028C70_COLOR_8_24; + } + break; + case 3: + if (HAS_SIZE(5, 6, 5, 0)) { + return V_028C70_COLOR_5_6_5; + } else if (HAS_SIZE(32, 8, 24, 0)) { + return V_028C70_COLOR_X24_8_32_FLOAT; + } + break; + case 4: + if (desc->channel[0].size == desc->channel[1].size && + desc->channel[0].size == desc->channel[2].size && + desc->channel[0].size == desc->channel[3].size) { + switch (desc->channel[0].size) { + case 4: + return V_028C70_COLOR_4_4_4_4; + case 8: + return V_028C70_COLOR_8_8_8_8; + case 16: + return V_028C70_COLOR_16_16_16_16; + case 32: + return V_028C70_COLOR_32_32_32_32; + } + } else if (HAS_SIZE(5, 5, 5, 1)) { + return V_028C70_COLOR_1_5_5_5; + } else if (HAS_SIZE(1, 5, 5, 5)) { + return V_028C70_COLOR_5_5_5_1; + } else if (HAS_SIZE(10, 10, 10, 2)) { + return V_028C70_COLOR_2_10_10_10; + } + break; + } + return V_028C70_COLOR_INVALID; } static uint32_t si_colorformat_endian_swap(uint32_t colorformat) { - if (SI_BIG_ENDIAN) { - switch(colorformat) { - /* 8-bit buffers. */ - case V_028C70_COLOR_8: - return V_028C70_ENDIAN_NONE; - - /* 16-bit buffers. */ - case V_028C70_COLOR_5_6_5: - case V_028C70_COLOR_1_5_5_5: - case V_028C70_COLOR_4_4_4_4: - case V_028C70_COLOR_16: - case V_028C70_COLOR_8_8: - return V_028C70_ENDIAN_8IN16; - - /* 32-bit buffers. */ - case V_028C70_COLOR_8_8_8_8: - case V_028C70_COLOR_2_10_10_10: - case V_028C70_COLOR_8_24: - case V_028C70_COLOR_24_8: - case V_028C70_COLOR_16_16: - return V_028C70_ENDIAN_8IN32; - - /* 64-bit buffers. */ - case V_028C70_COLOR_16_16_16_16: - return V_028C70_ENDIAN_8IN16; - - case V_028C70_COLOR_32_32: - return V_028C70_ENDIAN_8IN32; - - /* 128-bit buffers. */ - case V_028C70_COLOR_32_32_32_32: - return V_028C70_ENDIAN_8IN32; - default: - return V_028C70_ENDIAN_NONE; /* Unsupported. */ - } - } else { - return V_028C70_ENDIAN_NONE; - } + if (SI_BIG_ENDIAN) { + switch (colorformat) { + /* 8-bit buffers. */ + case V_028C70_COLOR_8: + return V_028C70_ENDIAN_NONE; + + /* 16-bit buffers. */ + case V_028C70_COLOR_5_6_5: + case V_028C70_COLOR_1_5_5_5: + case V_028C70_COLOR_4_4_4_4: + case V_028C70_COLOR_16: + case V_028C70_COLOR_8_8: + return V_028C70_ENDIAN_8IN16; + + /* 32-bit buffers. */ + case V_028C70_COLOR_8_8_8_8: + case V_028C70_COLOR_2_10_10_10: + case V_028C70_COLOR_8_24: + case V_028C70_COLOR_24_8: + case V_028C70_COLOR_16_16: + return V_028C70_ENDIAN_8IN32; + + /* 64-bit buffers. */ + case V_028C70_COLOR_16_16_16_16: + return V_028C70_ENDIAN_8IN16; + + case V_028C70_COLOR_32_32: + return V_028C70_ENDIAN_8IN32; + + /* 128-bit buffers. */ + case V_028C70_COLOR_32_32_32_32: + return V_028C70_ENDIAN_8IN32; + default: + return V_028C70_ENDIAN_NONE; /* Unsupported. 
*/ + } + } else { + return V_028C70_ENDIAN_NONE; + } } static uint32_t si_translate_dbformat(enum pipe_format format) { - switch (format) { - case PIPE_FORMAT_Z16_UNORM: - return V_028040_Z_16; - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return V_028040_Z_24; /* deprecated on AMD GCN */ - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return V_028040_Z_32_FLOAT; - default: - return V_028040_Z_INVALID; - } + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return V_028040_Z_16; + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return V_028040_Z_24; /* deprecated on AMD GCN */ + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return V_028040_Z_32_FLOAT; + default: + return V_028040_Z_INVALID; + } } /* * Texture translation */ -static uint32_t si_translate_texformat(struct pipe_screen *screen, - enum pipe_format format, - const struct util_format_description *desc, - int first_non_void) +static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_format format, + const struct util_format_description *desc, + int first_non_void) { - struct si_screen *sscreen = (struct si_screen*)screen; - bool uniform = true; - int i; - - assert(sscreen->info.chip_class <= GFX9); - - /* Colorspace (return non-RGB formats directly). */ - switch (desc->colorspace) { - /* Depth stencil formats */ - case UTIL_FORMAT_COLORSPACE_ZS: - switch (format) { - case PIPE_FORMAT_Z16_UNORM: - return V_008F14_IMG_DATA_FORMAT_16; - case PIPE_FORMAT_X24S8_UINT: - case PIPE_FORMAT_S8X24_UINT: - /* - * Implemented as an 8_8_8_8 data format to fix texture - * gathers in stencil sampling. This affects at least - * GL45-CTS.texture_cube_map_array.sampling on GFX8. 
- */ - if (sscreen->info.chip_class <= GFX8) - return V_008F14_IMG_DATA_FORMAT_8_8_8_8; - - if (format == PIPE_FORMAT_X24S8_UINT) - return V_008F14_IMG_DATA_FORMAT_8_24; - else - return V_008F14_IMG_DATA_FORMAT_24_8; - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return V_008F14_IMG_DATA_FORMAT_8_24; - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - return V_008F14_IMG_DATA_FORMAT_24_8; - case PIPE_FORMAT_S8_UINT: - return V_008F14_IMG_DATA_FORMAT_8; - case PIPE_FORMAT_Z32_FLOAT: - return V_008F14_IMG_DATA_FORMAT_32; - case PIPE_FORMAT_X32_S8X24_UINT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return V_008F14_IMG_DATA_FORMAT_X24_8_32; - default: - goto out_unknown; - } - - case UTIL_FORMAT_COLORSPACE_YUV: - goto out_unknown; /* TODO */ - - case UTIL_FORMAT_COLORSPACE_SRGB: - if (desc->nr_channels != 4 && desc->nr_channels != 1) - goto out_unknown; - break; - - default: - break; - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) { - if (!sscreen->info.has_format_bc1_through_bc7) - goto out_unknown; - - switch (format) { - case PIPE_FORMAT_RGTC1_SNORM: - case PIPE_FORMAT_LATC1_SNORM: - case PIPE_FORMAT_RGTC1_UNORM: - case PIPE_FORMAT_LATC1_UNORM: - return V_008F14_IMG_DATA_FORMAT_BC4; - case PIPE_FORMAT_RGTC2_SNORM: - case PIPE_FORMAT_LATC2_SNORM: - case PIPE_FORMAT_RGTC2_UNORM: - case PIPE_FORMAT_LATC2_UNORM: - return V_008F14_IMG_DATA_FORMAT_BC5; - default: - goto out_unknown; - } - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_ETC && - (sscreen->info.family == CHIP_STONEY || - sscreen->info.family == CHIP_VEGA10 || - sscreen->info.family == CHIP_RAVEN)) { - switch (format) { - case PIPE_FORMAT_ETC1_RGB8: - case PIPE_FORMAT_ETC2_RGB8: - case PIPE_FORMAT_ETC2_SRGB8: - return V_008F14_IMG_DATA_FORMAT_ETC2_RGB; - case PIPE_FORMAT_ETC2_RGB8A1: - case PIPE_FORMAT_ETC2_SRGB8A1: - return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1; - case PIPE_FORMAT_ETC2_RGBA8: - case PIPE_FORMAT_ETC2_SRGBA8: - return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA; - case PIPE_FORMAT_ETC2_R11_UNORM: - case PIPE_FORMAT_ETC2_R11_SNORM: - return V_008F14_IMG_DATA_FORMAT_ETC2_R; - case PIPE_FORMAT_ETC2_RG11_UNORM: - case PIPE_FORMAT_ETC2_RG11_SNORM: - return V_008F14_IMG_DATA_FORMAT_ETC2_RG; - default: - goto out_unknown; - } - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) { - if (!sscreen->info.has_format_bc1_through_bc7) - goto out_unknown; - - switch (format) { - case PIPE_FORMAT_BPTC_RGBA_UNORM: - case PIPE_FORMAT_BPTC_SRGBA: - return V_008F14_IMG_DATA_FORMAT_BC7; - case PIPE_FORMAT_BPTC_RGB_FLOAT: - case PIPE_FORMAT_BPTC_RGB_UFLOAT: - return V_008F14_IMG_DATA_FORMAT_BC6; - default: - goto out_unknown; - } - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { - switch (format) { - case PIPE_FORMAT_R8G8_B8G8_UNORM: - case PIPE_FORMAT_G8R8_B8R8_UNORM: - return V_008F14_IMG_DATA_FORMAT_GB_GR; - case PIPE_FORMAT_G8R8_G8B8_UNORM: - case PIPE_FORMAT_R8G8_R8B8_UNORM: - return V_008F14_IMG_DATA_FORMAT_BG_RG; - default: - goto out_unknown; - } - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { - if (!sscreen->info.has_format_bc1_through_bc7) - goto out_unknown; - - switch (format) { - case PIPE_FORMAT_DXT1_RGB: - case PIPE_FORMAT_DXT1_RGBA: - case PIPE_FORMAT_DXT1_SRGB: - case PIPE_FORMAT_DXT1_SRGBA: - return V_008F14_IMG_DATA_FORMAT_BC1; - case PIPE_FORMAT_DXT3_RGBA: - case PIPE_FORMAT_DXT3_SRGBA: - return V_008F14_IMG_DATA_FORMAT_BC2; - case PIPE_FORMAT_DXT5_RGBA: - case PIPE_FORMAT_DXT5_SRGBA: - return V_008F14_IMG_DATA_FORMAT_BC3; - default: - goto out_unknown; - } - } - - 
if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) { - return V_008F14_IMG_DATA_FORMAT_5_9_9_9; - } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { - return V_008F14_IMG_DATA_FORMAT_10_11_11; - } - - /* R8G8Bx_SNORM - TODO CxV8U8 */ - - /* hw cannot support mixed formats (except depth/stencil, since only - * depth is read).*/ - if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) - goto out_unknown; - - /* See whether the components are of the same size. */ - for (i = 1; i < desc->nr_channels; i++) { - uniform = uniform && desc->channel[0].size == desc->channel[i].size; - } - - /* Non-uniform formats. */ - if (!uniform) { - switch(desc->nr_channels) { - case 3: - if (desc->channel[0].size == 5 && - desc->channel[1].size == 6 && - desc->channel[2].size == 5) { - return V_008F14_IMG_DATA_FORMAT_5_6_5; - } - goto out_unknown; - case 4: - if (desc->channel[0].size == 5 && - desc->channel[1].size == 5 && - desc->channel[2].size == 5 && - desc->channel[3].size == 1) { - return V_008F14_IMG_DATA_FORMAT_1_5_5_5; - } - if (desc->channel[0].size == 1 && - desc->channel[1].size == 5 && - desc->channel[2].size == 5 && - desc->channel[3].size == 5) { - return V_008F14_IMG_DATA_FORMAT_5_5_5_1; - } - if (desc->channel[0].size == 10 && - desc->channel[1].size == 10 && - desc->channel[2].size == 10 && - desc->channel[3].size == 2) { - return V_008F14_IMG_DATA_FORMAT_2_10_10_10; - } - goto out_unknown; - } - goto out_unknown; - } - - if (first_non_void < 0 || first_non_void > 3) - goto out_unknown; - - /* uniform formats */ - switch (desc->channel[first_non_void].size) { - case 4: - switch (desc->nr_channels) { + struct si_screen *sscreen = (struct si_screen *)screen; + bool uniform = true; + int i; + + assert(sscreen->info.chip_class <= GFX9); + + /* Colorspace (return non-RGB formats directly). */ + switch (desc->colorspace) { + /* Depth stencil formats */ + case UTIL_FORMAT_COLORSPACE_ZS: + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return V_008F14_IMG_DATA_FORMAT_16; + case PIPE_FORMAT_X24S8_UINT: + case PIPE_FORMAT_S8X24_UINT: + /* + * Implemented as an 8_8_8_8 data format to fix texture + * gathers in stencil sampling. This affects at least + * GL45-CTS.texture_cube_map_array.sampling on GFX8. 
+ */ + if (sscreen->info.chip_class <= GFX8) + return V_008F14_IMG_DATA_FORMAT_8_8_8_8; + + if (format == PIPE_FORMAT_X24S8_UINT) + return V_008F14_IMG_DATA_FORMAT_8_24; + else + return V_008F14_IMG_DATA_FORMAT_24_8; + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return V_008F14_IMG_DATA_FORMAT_8_24; + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + return V_008F14_IMG_DATA_FORMAT_24_8; + case PIPE_FORMAT_S8_UINT: + return V_008F14_IMG_DATA_FORMAT_8; + case PIPE_FORMAT_Z32_FLOAT: + return V_008F14_IMG_DATA_FORMAT_32; + case PIPE_FORMAT_X32_S8X24_UINT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return V_008F14_IMG_DATA_FORMAT_X24_8_32; + default: + goto out_unknown; + } + + case UTIL_FORMAT_COLORSPACE_YUV: + goto out_unknown; /* TODO */ + + case UTIL_FORMAT_COLORSPACE_SRGB: + if (desc->nr_channels != 4 && desc->nr_channels != 1) + goto out_unknown; + break; + + default: + break; + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) { + if (!sscreen->info.has_format_bc1_through_bc7) + goto out_unknown; + + switch (format) { + case PIPE_FORMAT_RGTC1_SNORM: + case PIPE_FORMAT_LATC1_SNORM: + case PIPE_FORMAT_RGTC1_UNORM: + case PIPE_FORMAT_LATC1_UNORM: + return V_008F14_IMG_DATA_FORMAT_BC4; + case PIPE_FORMAT_RGTC2_SNORM: + case PIPE_FORMAT_LATC2_SNORM: + case PIPE_FORMAT_RGTC2_UNORM: + case PIPE_FORMAT_LATC2_UNORM: + return V_008F14_IMG_DATA_FORMAT_BC5; + default: + goto out_unknown; + } + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_ETC && + (sscreen->info.family == CHIP_STONEY || sscreen->info.family == CHIP_VEGA10 || + sscreen->info.family == CHIP_RAVEN)) { + switch (format) { + case PIPE_FORMAT_ETC1_RGB8: + case PIPE_FORMAT_ETC2_RGB8: + case PIPE_FORMAT_ETC2_SRGB8: + return V_008F14_IMG_DATA_FORMAT_ETC2_RGB; + case PIPE_FORMAT_ETC2_RGB8A1: + case PIPE_FORMAT_ETC2_SRGB8A1: + return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1; + case PIPE_FORMAT_ETC2_RGBA8: + case PIPE_FORMAT_ETC2_SRGBA8: + return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA; + case PIPE_FORMAT_ETC2_R11_UNORM: + case PIPE_FORMAT_ETC2_R11_SNORM: + return V_008F14_IMG_DATA_FORMAT_ETC2_R; + case PIPE_FORMAT_ETC2_RG11_UNORM: + case PIPE_FORMAT_ETC2_RG11_SNORM: + return V_008F14_IMG_DATA_FORMAT_ETC2_RG; + default: + goto out_unknown; + } + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) { + if (!sscreen->info.has_format_bc1_through_bc7) + goto out_unknown; + + switch (format) { + case PIPE_FORMAT_BPTC_RGBA_UNORM: + case PIPE_FORMAT_BPTC_SRGBA: + return V_008F14_IMG_DATA_FORMAT_BC7; + case PIPE_FORMAT_BPTC_RGB_FLOAT: + case PIPE_FORMAT_BPTC_RGB_UFLOAT: + return V_008F14_IMG_DATA_FORMAT_BC6; + default: + goto out_unknown; + } + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { + switch (format) { + case PIPE_FORMAT_R8G8_B8G8_UNORM: + case PIPE_FORMAT_G8R8_B8R8_UNORM: + return V_008F14_IMG_DATA_FORMAT_GB_GR; + case PIPE_FORMAT_G8R8_G8B8_UNORM: + case PIPE_FORMAT_R8G8_R8B8_UNORM: + return V_008F14_IMG_DATA_FORMAT_BG_RG; + default: + goto out_unknown; + } + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + if (!sscreen->info.has_format_bc1_through_bc7) + goto out_unknown; + + switch (format) { + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT1_SRGB: + case PIPE_FORMAT_DXT1_SRGBA: + return V_008F14_IMG_DATA_FORMAT_BC1; + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT3_SRGBA: + return V_008F14_IMG_DATA_FORMAT_BC2; + case PIPE_FORMAT_DXT5_RGBA: + case PIPE_FORMAT_DXT5_SRGBA: + return V_008F14_IMG_DATA_FORMAT_BC3; + default: + goto out_unknown; + } + } + + if 
(format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + return V_008F14_IMG_DATA_FORMAT_5_9_9_9; + } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { + return V_008F14_IMG_DATA_FORMAT_10_11_11; + } + + /* R8G8Bx_SNORM - TODO CxV8U8 */ + + /* hw cannot support mixed formats (except depth/stencil, since only + * depth is read).*/ + if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) + goto out_unknown; + + /* See whether the components are of the same size. */ + for (i = 1; i < desc->nr_channels; i++) { + uniform = uniform && desc->channel[0].size == desc->channel[i].size; + } + + /* Non-uniform formats. */ + if (!uniform) { + switch (desc->nr_channels) { + case 3: + if (desc->channel[0].size == 5 && desc->channel[1].size == 6 && + desc->channel[2].size == 5) { + return V_008F14_IMG_DATA_FORMAT_5_6_5; + } + goto out_unknown; + case 4: + if (desc->channel[0].size == 5 && desc->channel[1].size == 5 && + desc->channel[2].size == 5 && desc->channel[3].size == 1) { + return V_008F14_IMG_DATA_FORMAT_1_5_5_5; + } + if (desc->channel[0].size == 1 && desc->channel[1].size == 5 && + desc->channel[2].size == 5 && desc->channel[3].size == 5) { + return V_008F14_IMG_DATA_FORMAT_5_5_5_1; + } + if (desc->channel[0].size == 10 && desc->channel[1].size == 10 && + desc->channel[2].size == 10 && desc->channel[3].size == 2) { + return V_008F14_IMG_DATA_FORMAT_2_10_10_10; + } + goto out_unknown; + } + goto out_unknown; + } + + if (first_non_void < 0 || first_non_void > 3) + goto out_unknown; + + /* uniform formats */ + switch (desc->channel[first_non_void].size) { + case 4: + switch (desc->nr_channels) { #if 0 /* Not supported for render targets */ case 2: return V_008F14_IMG_DATA_FORMAT_4_4; #endif - case 4: - return V_008F14_IMG_DATA_FORMAT_4_4_4_4; - } - break; - case 8: - switch (desc->nr_channels) { - case 1: - return V_008F14_IMG_DATA_FORMAT_8; - case 2: - return V_008F14_IMG_DATA_FORMAT_8_8; - case 4: - return V_008F14_IMG_DATA_FORMAT_8_8_8_8; - } - break; - case 16: - switch (desc->nr_channels) { - case 1: - return V_008F14_IMG_DATA_FORMAT_16; - case 2: - return V_008F14_IMG_DATA_FORMAT_16_16; - case 4: - return V_008F14_IMG_DATA_FORMAT_16_16_16_16; - } - break; - case 32: - switch (desc->nr_channels) { - case 1: - return V_008F14_IMG_DATA_FORMAT_32; - case 2: - return V_008F14_IMG_DATA_FORMAT_32_32; + case 4: + return V_008F14_IMG_DATA_FORMAT_4_4_4_4; + } + break; + case 8: + switch (desc->nr_channels) { + case 1: + return V_008F14_IMG_DATA_FORMAT_8; + case 2: + return V_008F14_IMG_DATA_FORMAT_8_8; + case 4: + return V_008F14_IMG_DATA_FORMAT_8_8_8_8; + } + break; + case 16: + switch (desc->nr_channels) { + case 1: + return V_008F14_IMG_DATA_FORMAT_16; + case 2: + return V_008F14_IMG_DATA_FORMAT_16_16; + case 4: + return V_008F14_IMG_DATA_FORMAT_16_16_16_16; + } + break; + case 32: + switch (desc->nr_channels) { + case 1: + return V_008F14_IMG_DATA_FORMAT_32; + case 2: + return V_008F14_IMG_DATA_FORMAT_32_32; #if 0 /* Not supported for render targets */ case 3: return V_008F14_IMG_DATA_FORMAT_32_32_32; #endif - case 4: - return V_008F14_IMG_DATA_FORMAT_32_32_32_32; - } - } + case 4: + return V_008F14_IMG_DATA_FORMAT_32_32_32_32; + } + } out_unknown: - return ~0; + return ~0; } static unsigned si_tex_wrap(unsigned wrap) { - switch (wrap) { - default: - case PIPE_TEX_WRAP_REPEAT: - return V_008F30_SQ_TEX_WRAP; - case PIPE_TEX_WRAP_CLAMP: - return V_008F30_SQ_TEX_CLAMP_HALF_BORDER; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - 
return V_008F30_SQ_TEX_CLAMP_BORDER; - case PIPE_TEX_WRAP_MIRROR_REPEAT: - return V_008F30_SQ_TEX_MIRROR; - case PIPE_TEX_WRAP_MIRROR_CLAMP: - return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER; - } + switch (wrap) { + default: + case PIPE_TEX_WRAP_REPEAT: + return V_008F30_SQ_TEX_WRAP; + case PIPE_TEX_WRAP_CLAMP: + return V_008F30_SQ_TEX_CLAMP_HALF_BORDER; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return V_008F30_SQ_TEX_CLAMP_BORDER; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return V_008F30_SQ_TEX_MIRROR; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER; + } } static unsigned si_tex_mipfilter(unsigned filter) { - switch (filter) { - case PIPE_TEX_MIPFILTER_NEAREST: - return V_008F38_SQ_TEX_Z_FILTER_POINT; - case PIPE_TEX_MIPFILTER_LINEAR: - return V_008F38_SQ_TEX_Z_FILTER_LINEAR; - default: - case PIPE_TEX_MIPFILTER_NONE: - return V_008F38_SQ_TEX_Z_FILTER_NONE; - } + switch (filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + return V_008F38_SQ_TEX_Z_FILTER_POINT; + case PIPE_TEX_MIPFILTER_LINEAR: + return V_008F38_SQ_TEX_Z_FILTER_LINEAR; + default: + case PIPE_TEX_MIPFILTER_NONE: + return V_008F38_SQ_TEX_Z_FILTER_NONE; + } } static unsigned si_tex_compare(unsigned compare) { - switch (compare) { - default: - case PIPE_FUNC_NEVER: - return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER; - case PIPE_FUNC_LESS: - return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS; - case PIPE_FUNC_EQUAL: - return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL; - case PIPE_FUNC_LEQUAL: - return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL; - case PIPE_FUNC_GREATER: - return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER; - case PIPE_FUNC_NOTEQUAL: - return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL; - case PIPE_FUNC_GEQUAL: - return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL; - case PIPE_FUNC_ALWAYS: - return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS; - } + switch (compare) { + default: + case PIPE_FUNC_NEVER: + return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER; + case PIPE_FUNC_LESS: + return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS; + case PIPE_FUNC_EQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL; + case PIPE_FUNC_LEQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL; + case PIPE_FUNC_GREATER: + return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER; + case PIPE_FUNC_NOTEQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL; + case PIPE_FUNC_GEQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL; + case PIPE_FUNC_ALWAYS: + return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS; + } } -static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, - unsigned view_target, unsigned nr_samples) +static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, unsigned view_target, + unsigned nr_samples) { - unsigned res_target = tex->buffer.b.b.target; - - if (view_target == PIPE_TEXTURE_CUBE || - view_target == PIPE_TEXTURE_CUBE_ARRAY) - res_target = view_target; - /* If interpreting cubemaps as something else, set 2D_ARRAY. */ - else if (res_target == PIPE_TEXTURE_CUBE || - res_target == PIPE_TEXTURE_CUBE_ARRAY) - res_target = PIPE_TEXTURE_2D_ARRAY; - - /* GFX9 allocates 1D textures as 2D. 
*/ - if ((res_target == PIPE_TEXTURE_1D || - res_target == PIPE_TEXTURE_1D_ARRAY) && - sscreen->info.chip_class == GFX9 && - tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) { - if (res_target == PIPE_TEXTURE_1D) - res_target = PIPE_TEXTURE_2D; - else - res_target = PIPE_TEXTURE_2D_ARRAY; - } - - switch (res_target) { - default: - case PIPE_TEXTURE_1D: - return V_008F1C_SQ_RSRC_IMG_1D; - case PIPE_TEXTURE_1D_ARRAY: - return V_008F1C_SQ_RSRC_IMG_1D_ARRAY; - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_RECT: - return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA : - V_008F1C_SQ_RSRC_IMG_2D; - case PIPE_TEXTURE_2D_ARRAY: - return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY : - V_008F1C_SQ_RSRC_IMG_2D_ARRAY; - case PIPE_TEXTURE_3D: - return V_008F1C_SQ_RSRC_IMG_3D; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - return V_008F1C_SQ_RSRC_IMG_CUBE; - } + unsigned res_target = tex->buffer.b.b.target; + + if (view_target == PIPE_TEXTURE_CUBE || view_target == PIPE_TEXTURE_CUBE_ARRAY) + res_target = view_target; + /* If interpreting cubemaps as something else, set 2D_ARRAY. */ + else if (res_target == PIPE_TEXTURE_CUBE || res_target == PIPE_TEXTURE_CUBE_ARRAY) + res_target = PIPE_TEXTURE_2D_ARRAY; + + /* GFX9 allocates 1D textures as 2D. */ + if ((res_target == PIPE_TEXTURE_1D || res_target == PIPE_TEXTURE_1D_ARRAY) && + sscreen->info.chip_class == GFX9 && + tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) { + if (res_target == PIPE_TEXTURE_1D) + res_target = PIPE_TEXTURE_2D; + else + res_target = PIPE_TEXTURE_2D_ARRAY; + } + + switch (res_target) { + default: + case PIPE_TEXTURE_1D: + return V_008F1C_SQ_RSRC_IMG_1D; + case PIPE_TEXTURE_1D_ARRAY: + return V_008F1C_SQ_RSRC_IMG_1D_ARRAY; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA : V_008F1C_SQ_RSRC_IMG_2D; + case PIPE_TEXTURE_2D_ARRAY: + return nr_samples > 1 ? 
V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY : V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + case PIPE_TEXTURE_3D: + return V_008F1C_SQ_RSRC_IMG_3D; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return V_008F1C_SQ_RSRC_IMG_CUBE; + } } /* @@ -2034,1748 +1919,1663 @@ static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, static bool si_is_sampler_format_supported(struct pipe_screen *screen, enum pipe_format format) { - struct si_screen *sscreen = (struct si_screen *)screen; + struct si_screen *sscreen = (struct si_screen *)screen; - if (sscreen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = &gfx10_format_table[format]; - if (!fmt->img_format || fmt->buffers_only) - return false; - return true; - } + if (sscreen->info.chip_class >= GFX10) { + const struct gfx10_format *fmt = &gfx10_format_table[format]; + if (!fmt->img_format || fmt->buffers_only) + return false; + return true; + } - const struct util_format_description *desc = util_format_description(format); - if (!desc) - return false; + const struct util_format_description *desc = util_format_description(format); + if (!desc) + return false; - return si_translate_texformat(screen, format, desc, - util_format_get_first_non_void_channel(format)) != ~0U; + return si_translate_texformat(screen, format, desc, + util_format_get_first_non_void_channel(format)) != ~0U; } static uint32_t si_translate_buffer_dataformat(struct pipe_screen *screen, - const struct util_format_description *desc, - int first_non_void) + const struct util_format_description *desc, + int first_non_void) { - int i; - - assert(((struct si_screen *)screen)->info.chip_class <= GFX9); - - if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) - return V_008F0C_BUF_DATA_FORMAT_10_11_11; - - assert(first_non_void >= 0); - - if (desc->nr_channels == 4 && - desc->channel[0].size == 10 && - desc->channel[1].size == 10 && - desc->channel[2].size == 10 && - desc->channel[3].size == 2) - return V_008F0C_BUF_DATA_FORMAT_2_10_10_10; - - /* See whether the components are of the same size. */ - for (i = 0; i < desc->nr_channels; i++) { - if (desc->channel[first_non_void].size != desc->channel[i].size) - return V_008F0C_BUF_DATA_FORMAT_INVALID; - } - - switch (desc->channel[first_non_void].size) { - case 8: - switch (desc->nr_channels) { - case 1: - case 3: /* 3 loads */ - return V_008F0C_BUF_DATA_FORMAT_8; - case 2: - return V_008F0C_BUF_DATA_FORMAT_8_8; - case 4: - return V_008F0C_BUF_DATA_FORMAT_8_8_8_8; - } - break; - case 16: - switch (desc->nr_channels) { - case 1: - case 3: /* 3 loads */ - return V_008F0C_BUF_DATA_FORMAT_16; - case 2: - return V_008F0C_BUF_DATA_FORMAT_16_16; - case 4: - return V_008F0C_BUF_DATA_FORMAT_16_16_16_16; - } - break; - case 32: - switch (desc->nr_channels) { - case 1: - return V_008F0C_BUF_DATA_FORMAT_32; - case 2: - return V_008F0C_BUF_DATA_FORMAT_32_32; - case 3: - return V_008F0C_BUF_DATA_FORMAT_32_32_32; - case 4: - return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; - } - break; - case 64: - /* Legacy double formats. 
*/ - switch (desc->nr_channels) { - case 1: /* 1 load */ - return V_008F0C_BUF_DATA_FORMAT_32_32; - case 2: /* 1 load */ - return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; - case 3: /* 3 loads */ - return V_008F0C_BUF_DATA_FORMAT_32_32; - case 4: /* 2 loads */ - return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; - } - break; - } - - return V_008F0C_BUF_DATA_FORMAT_INVALID; + int i; + + assert(((struct si_screen *)screen)->info.chip_class <= GFX9); + + if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) + return V_008F0C_BUF_DATA_FORMAT_10_11_11; + + assert(first_non_void >= 0); + + if (desc->nr_channels == 4 && desc->channel[0].size == 10 && desc->channel[1].size == 10 && + desc->channel[2].size == 10 && desc->channel[3].size == 2) + return V_008F0C_BUF_DATA_FORMAT_2_10_10_10; + + /* See whether the components are of the same size. */ + for (i = 0; i < desc->nr_channels; i++) { + if (desc->channel[first_non_void].size != desc->channel[i].size) + return V_008F0C_BUF_DATA_FORMAT_INVALID; + } + + switch (desc->channel[first_non_void].size) { + case 8: + switch (desc->nr_channels) { + case 1: + case 3: /* 3 loads */ + return V_008F0C_BUF_DATA_FORMAT_8; + case 2: + return V_008F0C_BUF_DATA_FORMAT_8_8; + case 4: + return V_008F0C_BUF_DATA_FORMAT_8_8_8_8; + } + break; + case 16: + switch (desc->nr_channels) { + case 1: + case 3: /* 3 loads */ + return V_008F0C_BUF_DATA_FORMAT_16; + case 2: + return V_008F0C_BUF_DATA_FORMAT_16_16; + case 4: + return V_008F0C_BUF_DATA_FORMAT_16_16_16_16; + } + break; + case 32: + switch (desc->nr_channels) { + case 1: + return V_008F0C_BUF_DATA_FORMAT_32; + case 2: + return V_008F0C_BUF_DATA_FORMAT_32_32; + case 3: + return V_008F0C_BUF_DATA_FORMAT_32_32_32; + case 4: + return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; + } + break; + case 64: + /* Legacy double formats. 
*/ + switch (desc->nr_channels) { + case 1: /* 1 load */ + return V_008F0C_BUF_DATA_FORMAT_32_32; + case 2: /* 1 load */ + return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; + case 3: /* 3 loads */ + return V_008F0C_BUF_DATA_FORMAT_32_32; + case 4: /* 2 loads */ + return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; + } + break; + } + + return V_008F0C_BUF_DATA_FORMAT_INVALID; } static uint32_t si_translate_buffer_numformat(struct pipe_screen *screen, - const struct util_format_description *desc, - int first_non_void) + const struct util_format_description *desc, + int first_non_void) { - assert(((struct si_screen *)screen)->info.chip_class <= GFX9); - - if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) - return V_008F0C_BUF_NUM_FORMAT_FLOAT; - - assert(first_non_void >= 0); - - switch (desc->channel[first_non_void].type) { - case UTIL_FORMAT_TYPE_SIGNED: - case UTIL_FORMAT_TYPE_FIXED: - if (desc->channel[first_non_void].size >= 32 || - desc->channel[first_non_void].pure_integer) - return V_008F0C_BUF_NUM_FORMAT_SINT; - else if (desc->channel[first_non_void].normalized) - return V_008F0C_BUF_NUM_FORMAT_SNORM; - else - return V_008F0C_BUF_NUM_FORMAT_SSCALED; - break; - case UTIL_FORMAT_TYPE_UNSIGNED: - if (desc->channel[first_non_void].size >= 32 || - desc->channel[first_non_void].pure_integer) - return V_008F0C_BUF_NUM_FORMAT_UINT; - else if (desc->channel[first_non_void].normalized) - return V_008F0C_BUF_NUM_FORMAT_UNORM; - else - return V_008F0C_BUF_NUM_FORMAT_USCALED; - break; - case UTIL_FORMAT_TYPE_FLOAT: - default: - return V_008F0C_BUF_NUM_FORMAT_FLOAT; - } + assert(((struct si_screen *)screen)->info.chip_class <= GFX9); + + if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) + return V_008F0C_BUF_NUM_FORMAT_FLOAT; + + assert(first_non_void >= 0); + + switch (desc->channel[first_non_void].type) { + case UTIL_FORMAT_TYPE_SIGNED: + case UTIL_FORMAT_TYPE_FIXED: + if (desc->channel[first_non_void].size >= 32 || desc->channel[first_non_void].pure_integer) + return V_008F0C_BUF_NUM_FORMAT_SINT; + else if (desc->channel[first_non_void].normalized) + return V_008F0C_BUF_NUM_FORMAT_SNORM; + else + return V_008F0C_BUF_NUM_FORMAT_SSCALED; + break; + case UTIL_FORMAT_TYPE_UNSIGNED: + if (desc->channel[first_non_void].size >= 32 || desc->channel[first_non_void].pure_integer) + return V_008F0C_BUF_NUM_FORMAT_UINT; + else if (desc->channel[first_non_void].normalized) + return V_008F0C_BUF_NUM_FORMAT_UNORM; + else + return V_008F0C_BUF_NUM_FORMAT_USCALED; + break; + case UTIL_FORMAT_TYPE_FLOAT: + default: + return V_008F0C_BUF_NUM_FORMAT_FLOAT; + } } -static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, - enum pipe_format format, - unsigned usage) +static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, enum pipe_format format, + unsigned usage) { - struct si_screen *sscreen = (struct si_screen *)screen; - const struct util_format_description *desc; - int first_non_void; - unsigned data_format; - - assert((usage & ~(PIPE_BIND_SHADER_IMAGE | - PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_VERTEX_BUFFER)) == 0); - - desc = util_format_description(format); - if (!desc) - return 0; - - /* There are no native 8_8_8 or 16_16_16 data formats, and we currently - * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well - * for read-only access (with caveats surrounding bounds checks), but - * obviously fails for write access which we have to implement for - * shader images. 
Luckily, OpenGL doesn't expect this to be supported - * anyway, and so the only impact is on PBO uploads / downloads, which - * shouldn't be expected to be fast for GL_RGB anyway. - */ - if (desc->block.bits == 3 * 8 || - desc->block.bits == 3 * 16) { - if (usage & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW)) { - usage &= ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW); - if (!usage) - return 0; - } - } - - if (sscreen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = &gfx10_format_table[format]; - if (!fmt->img_format || fmt->img_format >= 128) - return 0; - return usage; - } - - first_non_void = util_format_get_first_non_void_channel(format); - data_format = si_translate_buffer_dataformat(screen, desc, first_non_void); - if (data_format == V_008F0C_BUF_DATA_FORMAT_INVALID) - return 0; - - return usage; + struct si_screen *sscreen = (struct si_screen *)screen; + const struct util_format_description *desc; + int first_non_void; + unsigned data_format; + + assert((usage & ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_VERTEX_BUFFER)) == + 0); + + desc = util_format_description(format); + if (!desc) + return 0; + + /* There are no native 8_8_8 or 16_16_16 data formats, and we currently + * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well + * for read-only access (with caveats surrounding bounds checks), but + * obviously fails for write access which we have to implement for + * shader images. Luckily, OpenGL doesn't expect this to be supported + * anyway, and so the only impact is on PBO uploads / downloads, which + * shouldn't be expected to be fast for GL_RGB anyway. + */ + if (desc->block.bits == 3 * 8 || desc->block.bits == 3 * 16) { + if (usage & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW)) { + usage &= ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW); + if (!usage) + return 0; + } + } + + if (sscreen->info.chip_class >= GFX10) { + const struct gfx10_format *fmt = &gfx10_format_table[format]; + if (!fmt->img_format || fmt->img_format >= 128) + return 0; + return usage; + } + + first_non_void = util_format_get_first_non_void_channel(format); + data_format = si_translate_buffer_dataformat(screen, desc, first_non_void); + if (data_format == V_008F0C_BUF_DATA_FORMAT_INVALID) + return 0; + + return usage; } static bool si_is_colorbuffer_format_supported(enum pipe_format format) { - return si_translate_colorformat(format) != V_028C70_COLOR_INVALID && - si_translate_colorswap(format, false) != ~0U; + return si_translate_colorformat(format) != V_028C70_COLOR_INVALID && + si_translate_colorswap(format, false) != ~0U; } static bool si_is_zs_format_supported(enum pipe_format format) { - return si_translate_dbformat(format) != V_028040_Z_INVALID; + return si_translate_dbformat(format) != V_028040_Z_INVALID; } -static bool si_is_format_supported(struct pipe_screen *screen, - enum pipe_format format, - enum pipe_texture_target target, - unsigned sample_count, - unsigned storage_sample_count, - unsigned usage) +static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format format, + enum pipe_texture_target target, unsigned sample_count, + unsigned storage_sample_count, unsigned usage) { - struct si_screen *sscreen = (struct si_screen *)screen; - unsigned retval = 0; - - if (target >= PIPE_MAX_TEXTURE_TYPES) { - PRINT_ERR("radeonsi: unsupported texture type %d\n", target); - return false; - } - - if (MAX2(1, sample_count) < MAX2(1, storage_sample_count)) - return false; - - if (sample_count > 1) { - if 
(!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE)) - return false; - - /* Only power-of-two sample counts are supported. */ - if (!util_is_power_of_two_or_zero(sample_count) || - !util_is_power_of_two_or_zero(storage_sample_count)) - return false; - - /* MSAA support without framebuffer attachments. */ - if (format == PIPE_FORMAT_NONE && sample_count <= 16) - return true; - - if (!sscreen->info.has_eqaa_surface_allocator || - util_format_is_depth_or_stencil(format)) { - /* Color without EQAA or depth/stencil. */ - if (sample_count > 8 || - sample_count != storage_sample_count) - return false; - } else { - /* Color with EQAA. */ - if (sample_count > 16 || - storage_sample_count > 8) - return false; - } - } - - if (usage & (PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_SHADER_IMAGE)) { - if (target == PIPE_BUFFER) { - retval |= si_is_vertex_format_supported( - screen, format, usage & (PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_SHADER_IMAGE)); - } else { - if (si_is_sampler_format_supported(screen, format)) - retval |= usage & (PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_SHADER_IMAGE); - } - } - - if ((usage & (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED | - PIPE_BIND_BLENDABLE)) && - si_is_colorbuffer_format_supported(format)) { - retval |= usage & - (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED); - if (!util_format_is_pure_integer(format) && - !util_format_is_depth_or_stencil(format)) - retval |= usage & PIPE_BIND_BLENDABLE; - } - - if ((usage & PIPE_BIND_DEPTH_STENCIL) && - si_is_zs_format_supported(format)) { - retval |= PIPE_BIND_DEPTH_STENCIL; - } - - if (usage & PIPE_BIND_VERTEX_BUFFER) { - retval |= si_is_vertex_format_supported(screen, format, - PIPE_BIND_VERTEX_BUFFER); - } - - if ((usage & PIPE_BIND_LINEAR) && - !util_format_is_compressed(format) && - !(usage & PIPE_BIND_DEPTH_STENCIL)) - retval |= PIPE_BIND_LINEAR; - - return retval == usage; + struct si_screen *sscreen = (struct si_screen *)screen; + unsigned retval = 0; + + if (target >= PIPE_MAX_TEXTURE_TYPES) { + PRINT_ERR("radeonsi: unsupported texture type %d\n", target); + return false; + } + + if (MAX2(1, sample_count) < MAX2(1, storage_sample_count)) + return false; + + if (sample_count > 1) { + if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE)) + return false; + + /* Only power-of-two sample counts are supported. */ + if (!util_is_power_of_two_or_zero(sample_count) || + !util_is_power_of_two_or_zero(storage_sample_count)) + return false; + + /* MSAA support without framebuffer attachments. */ + if (format == PIPE_FORMAT_NONE && sample_count <= 16) + return true; + + if (!sscreen->info.has_eqaa_surface_allocator || util_format_is_depth_or_stencil(format)) { + /* Color without EQAA or depth/stencil. */ + if (sample_count > 8 || sample_count != storage_sample_count) + return false; + } else { + /* Color with EQAA. 
*/ + if (sample_count > 16 || storage_sample_count > 8) + return false; + } + } + + if (usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) { + if (target == PIPE_BUFFER) { + retval |= si_is_vertex_format_supported( + screen, format, usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)); + } else { + if (si_is_sampler_format_supported(screen, format)) + retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE); + } + } + + if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | + PIPE_BIND_SHARED | PIPE_BIND_BLENDABLE)) && + si_is_colorbuffer_format_supported(format)) { + retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | + PIPE_BIND_SHARED); + if (!util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format)) + retval |= usage & PIPE_BIND_BLENDABLE; + } + + if ((usage & PIPE_BIND_DEPTH_STENCIL) && si_is_zs_format_supported(format)) { + retval |= PIPE_BIND_DEPTH_STENCIL; + } + + if (usage & PIPE_BIND_VERTEX_BUFFER) { + retval |= si_is_vertex_format_supported(screen, format, PIPE_BIND_VERTEX_BUFFER); + } + + if ((usage & PIPE_BIND_LINEAR) && !util_format_is_compressed(format) && + !(usage & PIPE_BIND_DEPTH_STENCIL)) + retval |= PIPE_BIND_LINEAR; + + return retval == usage; } /* * framebuffer handling */ -static void si_choose_spi_color_formats(struct si_surface *surf, - unsigned format, unsigned swap, - unsigned ntype, bool is_depth) +static void si_choose_spi_color_formats(struct si_surface *surf, unsigned format, unsigned swap, + unsigned ntype, bool is_depth) { - /* Alpha is needed for alpha-to-coverage. - * Blending may be with or without alpha. - */ - unsigned normal = 0; /* most optimal, may not support blending or export alpha */ - unsigned alpha = 0; /* exports alpha, but may not support blending */ - unsigned blend = 0; /* supports blending, but may not export alpha */ - unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */ - - /* Choose the SPI color formats. These are required values for RB+. - * Other chips have multiple choices, though they are not necessarily better. - */ - switch (format) { - case V_028C70_COLOR_5_6_5: - case V_028C70_COLOR_1_5_5_5: - case V_028C70_COLOR_5_5_5_1: - case V_028C70_COLOR_4_4_4_4: - case V_028C70_COLOR_10_11_11: - case V_028C70_COLOR_11_11_10: - case V_028C70_COLOR_8: - case V_028C70_COLOR_8_8: - case V_028C70_COLOR_8_8_8_8: - case V_028C70_COLOR_10_10_10_2: - case V_028C70_COLOR_2_10_10_10: - if (ntype == V_028C70_NUMBER_UINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; - else if (ntype == V_028C70_NUMBER_SINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; - else - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; - break; - - case V_028C70_COLOR_16: - case V_028C70_COLOR_16_16: - case V_028C70_COLOR_16_16_16_16: - if (ntype == V_028C70_NUMBER_UNORM || - ntype == V_028C70_NUMBER_SNORM) { - /* UNORM16 and SNORM16 don't support blending */ - if (ntype == V_028C70_NUMBER_UNORM) - normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR; - else - normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR; - - /* Use 32 bits per channel for blending. 
*/ - if (format == V_028C70_COLOR_16) { - if (swap == V_028C70_SWAP_STD) { /* R */ - blend = V_028714_SPI_SHADER_32_R; - blend_alpha = V_028714_SPI_SHADER_32_AR; - } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ - blend = blend_alpha = V_028714_SPI_SHADER_32_AR; - else - assert(0); - } else if (format == V_028C70_COLOR_16_16) { - if (swap == V_028C70_SWAP_STD) { /* RG */ - blend = V_028714_SPI_SHADER_32_GR; - blend_alpha = V_028714_SPI_SHADER_32_ABGR; - } else if (swap == V_028C70_SWAP_ALT) /* RA */ - blend = blend_alpha = V_028714_SPI_SHADER_32_AR; - else - assert(0); - } else /* 16_16_16_16 */ - blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR; - } else if (ntype == V_028C70_NUMBER_UINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; - else if (ntype == V_028C70_NUMBER_SINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; - else if (ntype == V_028C70_NUMBER_FLOAT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; - else - assert(0); - break; - - case V_028C70_COLOR_32: - if (swap == V_028C70_SWAP_STD) { /* R */ - blend = normal = V_028714_SPI_SHADER_32_R; - alpha = blend_alpha = V_028714_SPI_SHADER_32_AR; - } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; - else - assert(0); - break; - - case V_028C70_COLOR_32_32: - if (swap == V_028C70_SWAP_STD) { /* RG */ - blend = normal = V_028714_SPI_SHADER_32_GR; - alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR; - } else if (swap == V_028C70_SWAP_ALT) /* RA */ - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; - else - assert(0); - break; - - case V_028C70_COLOR_32_32_32_32: - case V_028C70_COLOR_8_24: - case V_028C70_COLOR_24_8: - case V_028C70_COLOR_X24_8_32_FLOAT: - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; - break; - - default: - assert(0); - return; - } - - /* The DB->CB copy needs 32_ABGR. */ - if (is_depth) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; - - surf->spi_shader_col_format = normal; - surf->spi_shader_col_format_alpha = alpha; - surf->spi_shader_col_format_blend = blend; - surf->spi_shader_col_format_blend_alpha = blend_alpha; + /* Alpha is needed for alpha-to-coverage. + * Blending may be with or without alpha. + */ + unsigned normal = 0; /* most optimal, may not support blending or export alpha */ + unsigned alpha = 0; /* exports alpha, but may not support blending */ + unsigned blend = 0; /* supports blending, but may not export alpha */ + unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */ + + /* Choose the SPI color formats. These are required values for RB+. + * Other chips have multiple choices, though they are not necessarily better. 
+ */ + switch (format) { + case V_028C70_COLOR_5_6_5: + case V_028C70_COLOR_1_5_5_5: + case V_028C70_COLOR_5_5_5_1: + case V_028C70_COLOR_4_4_4_4: + case V_028C70_COLOR_10_11_11: + case V_028C70_COLOR_11_11_10: + case V_028C70_COLOR_8: + case V_028C70_COLOR_8_8: + case V_028C70_COLOR_8_8_8_8: + case V_028C70_COLOR_10_10_10_2: + case V_028C70_COLOR_2_10_10_10: + if (ntype == V_028C70_NUMBER_UINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; + else if (ntype == V_028C70_NUMBER_SINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; + else + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; + break; + + case V_028C70_COLOR_16: + case V_028C70_COLOR_16_16: + case V_028C70_COLOR_16_16_16_16: + if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) { + /* UNORM16 and SNORM16 don't support blending */ + if (ntype == V_028C70_NUMBER_UNORM) + normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR; + else + normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR; + + /* Use 32 bits per channel for blending. */ + if (format == V_028C70_COLOR_16) { + if (swap == V_028C70_SWAP_STD) { /* R */ + blend = V_028714_SPI_SHADER_32_R; + blend_alpha = V_028714_SPI_SHADER_32_AR; + } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ + blend = blend_alpha = V_028714_SPI_SHADER_32_AR; + else + assert(0); + } else if (format == V_028C70_COLOR_16_16) { + if (swap == V_028C70_SWAP_STD) { /* RG */ + blend = V_028714_SPI_SHADER_32_GR; + blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (swap == V_028C70_SWAP_ALT) /* RA */ + blend = blend_alpha = V_028714_SPI_SHADER_32_AR; + else + assert(0); + } else /* 16_16_16_16 */ + blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (ntype == V_028C70_NUMBER_UINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; + else if (ntype == V_028C70_NUMBER_SINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; + else if (ntype == V_028C70_NUMBER_FLOAT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; + else + assert(0); + break; + + case V_028C70_COLOR_32: + if (swap == V_028C70_SWAP_STD) { /* R */ + blend = normal = V_028714_SPI_SHADER_32_R; + alpha = blend_alpha = V_028714_SPI_SHADER_32_AR; + } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; + else + assert(0); + break; + + case V_028C70_COLOR_32_32: + if (swap == V_028C70_SWAP_STD) { /* RG */ + blend = normal = V_028714_SPI_SHADER_32_GR; + alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (swap == V_028C70_SWAP_ALT) /* RA */ + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; + else + assert(0); + break; + + case V_028C70_COLOR_32_32_32_32: + case V_028C70_COLOR_8_24: + case V_028C70_COLOR_24_8: + case V_028C70_COLOR_X24_8_32_FLOAT: + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; + break; + + default: + assert(0); + return; + } + + /* The DB->CB copy needs 32_ABGR. 
*/ + if (is_depth) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; + + surf->spi_shader_col_format = normal; + surf->spi_shader_col_format_alpha = alpha; + surf->spi_shader_col_format_blend = blend; + surf->spi_shader_col_format_blend_alpha = blend_alpha; } -static void si_initialize_color_surface(struct si_context *sctx, - struct si_surface *surf) +static void si_initialize_color_surface(struct si_context *sctx, struct si_surface *surf) { - struct si_texture *tex = (struct si_texture*)surf->base.texture; - unsigned color_info, color_attrib; - unsigned format, swap, ntype, endian; - const struct util_format_description *desc; - int firstchan; - unsigned blend_clamp = 0, blend_bypass = 0; - - desc = util_format_description(surf->base.format); - for (firstchan = 0; firstchan < 4; firstchan++) { - if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) { - break; - } - } - if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) { - ntype = V_028C70_NUMBER_FLOAT; - } else { - ntype = V_028C70_NUMBER_UNORM; - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) - ntype = V_028C70_NUMBER_SRGB; - else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) { - if (desc->channel[firstchan].pure_integer) { - ntype = V_028C70_NUMBER_SINT; - } else { - assert(desc->channel[firstchan].normalized); - ntype = V_028C70_NUMBER_SNORM; - } - } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) { - if (desc->channel[firstchan].pure_integer) { - ntype = V_028C70_NUMBER_UINT; - } else { - assert(desc->channel[firstchan].normalized); - ntype = V_028C70_NUMBER_UNORM; - } - } - } - - format = si_translate_colorformat(surf->base.format); - if (format == V_028C70_COLOR_INVALID) { - PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format); - } - assert(format != V_028C70_COLOR_INVALID); - swap = si_translate_colorswap(surf->base.format, false); - endian = si_colorformat_endian_swap(format); - - /* blend clamp should be set for all NORM/SRGB types */ - if (ntype == V_028C70_NUMBER_UNORM || - ntype == V_028C70_NUMBER_SNORM || - ntype == V_028C70_NUMBER_SRGB) - blend_clamp = 1; - - /* set blend bypass according to docs if SINT/UINT or - 8/24 COLOR variants */ - if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT || - format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 || - format == V_028C70_COLOR_X24_8_32_FLOAT) { - blend_clamp = 0; - blend_bypass = 1; - } - - if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) { - if (format == V_028C70_COLOR_8 || - format == V_028C70_COLOR_8_8 || - format == V_028C70_COLOR_8_8_8_8) - surf->color_is_int8 = true; - else if (format == V_028C70_COLOR_10_10_10_2 || - format == V_028C70_COLOR_2_10_10_10) - surf->color_is_int10 = true; - } - - color_info = S_028C70_FORMAT(format) | - S_028C70_COMP_SWAP(swap) | - S_028C70_BLEND_CLAMP(blend_clamp) | - S_028C70_BLEND_BYPASS(blend_bypass) | - S_028C70_SIMPLE_FLOAT(1) | - S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM && - ntype != V_028C70_NUMBER_SNORM && - ntype != V_028C70_NUMBER_SRGB && - format != V_028C70_COLOR_8_24 && - format != V_028C70_COLOR_24_8) | - S_028C70_NUMBER_TYPE(ntype) | - S_028C70_ENDIAN(endian); - - /* Intensity is implemented as Red, so treat it that way. 
*/ - color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 || - util_format_is_intensity(surf->base.format)); - - if (tex->buffer.b.b.nr_samples > 1) { - unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples); - unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples); - - color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | - S_028C74_NUM_FRAGMENTS(log_fragments); - - if (tex->surface.fmask_offset) { - color_info |= S_028C70_COMPRESSION(1); - unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh); - - if (sctx->chip_class == GFX6) { - /* due to a hw bug, FMASK_BANK_HEIGHT must be set on GFX6 too */ - color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh); - } - } - } - - if (sctx->chip_class >= GFX10) { - unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B; - - /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and - 64 for APU because all of our APUs to date use DIMMs which have - a request granularity size of 64B while all other chips have a - 32B request size */ - if (!sctx->screen->info.has_dedicated_vram) - min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B; - - surf->cb_dcc_control = - S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | - S_028C78_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | - S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | - S_028C78_INDEPENDENT_64B_BLOCKS(0) | - S_028C78_INDEPENDENT_128B_BLOCKS(1); - } else if (sctx->chip_class >= GFX8) { - unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B; - unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B; - - /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and - 64 for APU because all of our APUs to date use DIMMs which have - a request granularity size of 64B while all other chips have a - 32B request size */ - if (!sctx->screen->info.has_dedicated_vram) - min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B; - - if (tex->buffer.b.b.nr_storage_samples > 1) { - if (tex->surface.bpe == 1) - max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; - else if (tex->surface.bpe == 2) - max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B; - } - - surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) | - S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | - S_028C78_INDEPENDENT_64B_BLOCKS(1); - } - - /* This must be set for fast clear to work without FMASK. 
*/ - if (!tex->surface.fmask_size && sctx->chip_class == GFX6) { - unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh); - color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh); - } - - /* GFX10 field has the same base shift as the GFX6 field */ - unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) | - S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer); - unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0); - - if (sctx->chip_class >= GFX10) { - color_view |= S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level); - - surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) | - S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) | - S_028EE0_RESOURCE_LEVEL(1); - } else if (sctx->chip_class == GFX9) { - color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level); - color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) | - S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type); - } - - if (sctx->chip_class >= GFX9) { - surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) | - S_028C68_MIP0_HEIGHT(surf->height0 - 1) | - S_028C68_MAX_MIP(tex->buffer.b.b.last_level); - } - - surf->cb_color_view = color_view; - surf->cb_color_info = color_info; - surf->cb_color_attrib = color_attrib; - - /* Determine pixel shader export format */ - si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth); - - surf->color_initialized = true; + struct si_texture *tex = (struct si_texture *)surf->base.texture; + unsigned color_info, color_attrib; + unsigned format, swap, ntype, endian; + const struct util_format_description *desc; + int firstchan; + unsigned blend_clamp = 0, blend_bypass = 0; + + desc = util_format_description(surf->base.format); + for (firstchan = 0; firstchan < 4; firstchan++) { + if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) { + break; + } + } + if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) { + ntype = V_028C70_NUMBER_FLOAT; + } else { + ntype = V_028C70_NUMBER_UNORM; + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + ntype = V_028C70_NUMBER_SRGB; + else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) { + if (desc->channel[firstchan].pure_integer) { + ntype = V_028C70_NUMBER_SINT; + } else { + assert(desc->channel[firstchan].normalized); + ntype = V_028C70_NUMBER_SNORM; + } + } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) { + if (desc->channel[firstchan].pure_integer) { + ntype = V_028C70_NUMBER_UINT; + } else { + assert(desc->channel[firstchan].normalized); + ntype = V_028C70_NUMBER_UNORM; + } + } + } + + format = si_translate_colorformat(surf->base.format); + if (format == V_028C70_COLOR_INVALID) { + PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format); + } + assert(format != V_028C70_COLOR_INVALID); + swap = si_translate_colorswap(surf->base.format, false); + endian = si_colorformat_endian_swap(format); + + /* blend clamp should be set for all NORM/SRGB types */ + if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM || + ntype == V_028C70_NUMBER_SRGB) + blend_clamp = 1; + + /* set blend bypass according to docs if SINT/UINT or + 8/24 COLOR variants */ + if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT || + format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 || + format == V_028C70_COLOR_X24_8_32_FLOAT) { + blend_clamp = 0; + blend_bypass = 1; + } + + if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) { + if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_8_8 || + 
format == V_028C70_COLOR_8_8_8_8) + surf->color_is_int8 = true; + else if (format == V_028C70_COLOR_10_10_10_2 || format == V_028C70_COLOR_2_10_10_10) + surf->color_is_int10 = true; + } + + color_info = + S_028C70_FORMAT(format) | S_028C70_COMP_SWAP(swap) | S_028C70_BLEND_CLAMP(blend_clamp) | + S_028C70_BLEND_BYPASS(blend_bypass) | S_028C70_SIMPLE_FLOAT(1) | + S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM && ntype != V_028C70_NUMBER_SNORM && + ntype != V_028C70_NUMBER_SRGB && format != V_028C70_COLOR_8_24 && + format != V_028C70_COLOR_24_8) | + S_028C70_NUMBER_TYPE(ntype) | S_028C70_ENDIAN(endian); + + /* Intensity is implemented as Red, so treat it that way. */ + color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 || + util_format_is_intensity(surf->base.format)); + + if (tex->buffer.b.b.nr_samples > 1) { + unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples); + unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples); + + color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | S_028C74_NUM_FRAGMENTS(log_fragments); + + if (tex->surface.fmask_offset) { + color_info |= S_028C70_COMPRESSION(1); + unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh); + + if (sctx->chip_class == GFX6) { + /* due to a hw bug, FMASK_BANK_HEIGHT must be set on GFX6 too */ + color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh); + } + } + } + + if (sctx->chip_class >= GFX10) { + unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B; + + /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and + 64 for APU because all of our APUs to date use DIMMs which have + a request granularity size of 64B while all other chips have a + 32B request size */ + if (!sctx->screen->info.has_dedicated_vram) + min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B; + + surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | + S_028C78_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | + S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | + S_028C78_INDEPENDENT_64B_BLOCKS(0) | + S_028C78_INDEPENDENT_128B_BLOCKS(1); + } else if (sctx->chip_class >= GFX8) { + unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B; + unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B; + + /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and + 64 for APU because all of our APUs to date use DIMMs which have + a request granularity size of 64B while all other chips have a + 32B request size */ + if (!sctx->screen->info.has_dedicated_vram) + min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B; + + if (tex->buffer.b.b.nr_storage_samples > 1) { + if (tex->surface.bpe == 1) + max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; + else if (tex->surface.bpe == 2) + max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B; + } + + surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) | + S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | + S_028C78_INDEPENDENT_64B_BLOCKS(1); + } + + /* This must be set for fast clear to work without FMASK. 
*/ + if (!tex->surface.fmask_size && sctx->chip_class == GFX6) { + unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh); + color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh); + } + + /* GFX10 field has the same base shift as the GFX6 field */ + unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) | + S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer); + unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0); + + if (sctx->chip_class >= GFX10) { + color_view |= S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level); + + surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) | + S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) | + S_028EE0_RESOURCE_LEVEL(1); + } else if (sctx->chip_class == GFX9) { + color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level); + color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) | + S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type); + } + + if (sctx->chip_class >= GFX9) { + surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) | + S_028C68_MIP0_HEIGHT(surf->height0 - 1) | + S_028C68_MAX_MIP(tex->buffer.b.b.last_level); + } + + surf->cb_color_view = color_view; + surf->cb_color_info = color_info; + surf->cb_color_attrib = color_attrib; + + /* Determine pixel shader export format */ + si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth); + + surf->color_initialized = true; } -static void si_init_depth_surface(struct si_context *sctx, - struct si_surface *surf) +static void si_init_depth_surface(struct si_context *sctx, struct si_surface *surf) { - struct si_texture *tex = (struct si_texture*)surf->base.texture; - unsigned level = surf->base.u.tex.level; - unsigned format, stencil_format; - uint32_t z_info, s_info; - - format = si_translate_dbformat(tex->db_render_format); - stencil_format = tex->surface.has_stencil ? 
- V_028044_STENCIL_8 : V_028044_STENCIL_INVALID; - - assert(format != V_028040_Z_INVALID); - if (format == V_028040_Z_INVALID) - PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format); - - surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) | - S_028008_SLICE_MAX(surf->base.u.tex.last_layer); - surf->db_htile_data_base = 0; - surf->db_htile_surface = 0; - - if (sctx->chip_class >= GFX10) { - surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) | - S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11); - } - - if (sctx->chip_class >= GFX9) { - assert(tex->surface.u.gfx9.surf_offset == 0); - surf->db_depth_base = tex->buffer.gpu_address >> 8; - surf->db_stencil_base = (tex->buffer.gpu_address + - tex->surface.u.gfx9.stencil_offset) >> 8; - z_info = S_028038_FORMAT(format) | - S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) | - S_028038_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | - S_028038_MAXMIP(tex->buffer.b.b.last_level); - s_info = S_02803C_FORMAT(stencil_format) | - S_02803C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); - - if (sctx->chip_class == GFX9) { - surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.surf.epitch); - surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.stencil.epitch); - } - surf->db_depth_view |= S_028008_MIPID(level); - surf->db_depth_size = S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | - S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1); - - if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { - z_info |= S_028038_TILE_SURFACE_ENABLE(1) | - S_028038_ALLOW_EXPCLEAR(1); - - if (tex->tc_compatible_htile) { - unsigned max_zplanes = 4; - - if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && - tex->buffer.b.b.nr_samples > 1) - max_zplanes = 2; - - z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1); - - if (sctx->chip_class >= GFX10) { - z_info |= S_028040_ITERATE_FLUSH(1); - s_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled); - } else { - z_info |= S_028038_ITERATE_FLUSH(1); - s_info |= S_02803C_ITERATE_FLUSH(1); - } - } - - if (tex->surface.has_stencil && !tex->htile_stencil_disabled) { - /* Stencil buffer workaround ported from the GFX6-GFX8 code. - * See that for explanation. - */ - s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1); - } else { - /* Use all HTILE for depth if there's no stencil. 
*/ - s_info |= S_02803C_TILE_STENCIL_DISABLE(1); - } - - surf->db_htile_data_base = (tex->buffer.gpu_address + - tex->surface.htile_offset) >> 8; - surf->db_htile_surface = S_028ABC_FULL_CACHE(1) | - S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned); - if (sctx->chip_class == GFX9) { - surf->db_htile_surface |= - S_028ABC_RB_ALIGNED(tex->surface.u.gfx9.htile.rb_aligned); - } - } - } else { - /* GFX6-GFX8 */ - struct legacy_surf_level *levelinfo = &tex->surface.u.legacy.level[level]; - - assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0); - - surf->db_depth_base = (tex->buffer.gpu_address + - tex->surface.u.legacy.level[level].offset) >> 8; - surf->db_stencil_base = (tex->buffer.gpu_address + - tex->surface.u.legacy.stencil_level[level].offset) >> 8; - - z_info = S_028040_FORMAT(format) | - S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)); - s_info = S_028044_FORMAT(stencil_format); - surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile); - - if (sctx->chip_class >= GFX7) { - struct radeon_info *info = &sctx->screen->info; - unsigned index = tex->surface.u.legacy.tiling_index[level]; - unsigned stencil_index = tex->surface.u.legacy.stencil_tiling_index[level]; - unsigned macro_index = tex->surface.u.legacy.macro_tile_index; - unsigned tile_mode = info->si_tile_mode_array[index]; - unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index]; - unsigned macro_mode = info->cik_macrotile_mode_array[macro_index]; - - surf->db_depth_info |= - S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) | - S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) | - S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) | - S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) | - S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) | - S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode)); - z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode)); - s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode)); - } else { - unsigned tile_mode_index = si_tile_mode_index(tex, level, false); - z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index); - tile_mode_index = si_tile_mode_index(tex, level, true); - s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index); - } - - surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) | - S_028058_HEIGHT_TILE_MAX((levelinfo->nblk_y / 8) - 1); - surf->db_depth_slice = S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x * - levelinfo->nblk_y) / 64 - 1); - - if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { - z_info |= S_028040_TILE_SURFACE_ENABLE(1) | - S_028040_ALLOW_EXPCLEAR(1); - - if (tex->surface.has_stencil) { - /* Workaround: For a not yet understood reason, the - * combination of MSAA, fast stencil clear and stencil - * decompress messes with subsequent stencil buffer - * uses. Problem was reproduced on Verde, Bonaire, - * Tonga, and Carrizo. - * - * Disabling EXPCLEAR works around the problem. - * - * Check piglit's arb_texture_multisample-stencil-clear - * test if you want to try changing this. - */ - if (tex->buffer.b.b.nr_samples <= 1) - s_info |= S_028044_ALLOW_EXPCLEAR(1); - } else if (!tex->tc_compatible_htile) { - /* Use all of the htile_buffer for depth if there's no stencil. - * This must not be set when TC-compatible HTILE is enabled - * due to a hw bug. 
- */ - s_info |= S_028044_TILE_STENCIL_DISABLE(1); - } - - surf->db_htile_data_base = (tex->buffer.gpu_address + - tex->surface.htile_offset) >> 8; - surf->db_htile_surface = S_028ABC_FULL_CACHE(1); - - if (tex->tc_compatible_htile) { - surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1); - - /* 0 = full compression. N = only compress up to N-1 Z planes. */ - if (tex->buffer.b.b.nr_samples <= 1) - z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5); - else if (tex->buffer.b.b.nr_samples <= 4) - z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3); - else - z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2); - } - } - } - - surf->db_z_info = z_info; - surf->db_stencil_info = s_info; - - surf->depth_initialized = true; + struct si_texture *tex = (struct si_texture *)surf->base.texture; + unsigned level = surf->base.u.tex.level; + unsigned format, stencil_format; + uint32_t z_info, s_info; + + format = si_translate_dbformat(tex->db_render_format); + stencil_format = tex->surface.has_stencil ? V_028044_STENCIL_8 : V_028044_STENCIL_INVALID; + + assert(format != V_028040_Z_INVALID); + if (format == V_028040_Z_INVALID) + PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format); + + surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) | + S_028008_SLICE_MAX(surf->base.u.tex.last_layer); + surf->db_htile_data_base = 0; + surf->db_htile_surface = 0; + + if (sctx->chip_class >= GFX10) { + surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) | + S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11); + } + + if (sctx->chip_class >= GFX9) { + assert(tex->surface.u.gfx9.surf_offset == 0); + surf->db_depth_base = tex->buffer.gpu_address >> 8; + surf->db_stencil_base = (tex->buffer.gpu_address + tex->surface.u.gfx9.stencil_offset) >> 8; + z_info = S_028038_FORMAT(format) | + S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) | + S_028038_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | + S_028038_MAXMIP(tex->buffer.b.b.last_level); + s_info = S_02803C_FORMAT(stencil_format) | + S_02803C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); + + if (sctx->chip_class == GFX9) { + surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.surf.epitch); + surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.stencil.epitch); + } + surf->db_depth_view |= S_028008_MIPID(level); + surf->db_depth_size = + S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1); + + if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { + z_info |= S_028038_TILE_SURFACE_ENABLE(1) | S_028038_ALLOW_EXPCLEAR(1); + + if (tex->tc_compatible_htile) { + unsigned max_zplanes = 4; + + if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && tex->buffer.b.b.nr_samples > 1) + max_zplanes = 2; + + z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1); + + if (sctx->chip_class >= GFX10) { + z_info |= S_028040_ITERATE_FLUSH(1); + s_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled); + } else { + z_info |= S_028038_ITERATE_FLUSH(1); + s_info |= S_02803C_ITERATE_FLUSH(1); + } + } + + if (tex->surface.has_stencil && !tex->htile_stencil_disabled) { + /* Stencil buffer workaround ported from the GFX6-GFX8 code. + * See that for explanation. + */ + s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1); + } else { + /* Use all HTILE for depth if there's no stencil. 
*/ + s_info |= S_02803C_TILE_STENCIL_DISABLE(1); + } + + surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8; + surf->db_htile_surface = + S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned); + if (sctx->chip_class == GFX9) { + surf->db_htile_surface |= S_028ABC_RB_ALIGNED(tex->surface.u.gfx9.htile.rb_aligned); + } + } + } else { + /* GFX6-GFX8 */ + struct legacy_surf_level *levelinfo = &tex->surface.u.legacy.level[level]; + + assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0); + + surf->db_depth_base = + (tex->buffer.gpu_address + tex->surface.u.legacy.level[level].offset) >> 8; + surf->db_stencil_base = + (tex->buffer.gpu_address + tex->surface.u.legacy.stencil_level[level].offset) >> 8; + + z_info = + S_028040_FORMAT(format) | S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)); + s_info = S_028044_FORMAT(stencil_format); + surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile); + + if (sctx->chip_class >= GFX7) { + struct radeon_info *info = &sctx->screen->info; + unsigned index = tex->surface.u.legacy.tiling_index[level]; + unsigned stencil_index = tex->surface.u.legacy.stencil_tiling_index[level]; + unsigned macro_index = tex->surface.u.legacy.macro_tile_index; + unsigned tile_mode = info->si_tile_mode_array[index]; + unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index]; + unsigned macro_mode = info->cik_macrotile_mode_array[macro_index]; + + surf->db_depth_info |= S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) | + S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) | + S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) | + S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) | + S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) | + S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode)); + z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode)); + s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode)); + } else { + unsigned tile_mode_index = si_tile_mode_index(tex, level, false); + z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index); + tile_mode_index = si_tile_mode_index(tex, level, true); + s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index); + } + + surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) | + S_028058_HEIGHT_TILE_MAX((levelinfo->nblk_y / 8) - 1); + surf->db_depth_slice = + S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x * levelinfo->nblk_y) / 64 - 1); + + if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { + z_info |= S_028040_TILE_SURFACE_ENABLE(1) | S_028040_ALLOW_EXPCLEAR(1); + + if (tex->surface.has_stencil) { + /* Workaround: For a not yet understood reason, the + * combination of MSAA, fast stencil clear and stencil + * decompress messes with subsequent stencil buffer + * uses. Problem was reproduced on Verde, Bonaire, + * Tonga, and Carrizo. + * + * Disabling EXPCLEAR works around the problem. + * + * Check piglit's arb_texture_multisample-stencil-clear + * test if you want to try changing this. + */ + if (tex->buffer.b.b.nr_samples <= 1) + s_info |= S_028044_ALLOW_EXPCLEAR(1); + } else if (!tex->tc_compatible_htile) { + /* Use all of the htile_buffer for depth if there's no stencil. + * This must not be set when TC-compatible HTILE is enabled + * due to a hw bug. 
+ */ + s_info |= S_028044_TILE_STENCIL_DISABLE(1); + } + + surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8; + surf->db_htile_surface = S_028ABC_FULL_CACHE(1); + + if (tex->tc_compatible_htile) { + surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1); + + /* 0 = full compression. N = only compress up to N-1 Z planes. */ + if (tex->buffer.b.b.nr_samples <= 1) + z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5); + else if (tex->buffer.b.b.nr_samples <= 4) + z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3); + else + z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2); + } + } + } + + surf->db_z_info = z_info; + surf->db_stencil_info = s_info; + + surf->depth_initialized = true; } void si_update_fb_dirtiness_after_rendering(struct si_context *sctx) { - if (sctx->decompression_enabled) - return; - - if (sctx->framebuffer.state.zsbuf) { - struct pipe_surface *surf = sctx->framebuffer.state.zsbuf; - struct si_texture *tex = (struct si_texture *)surf->texture; - - tex->dirty_level_mask |= 1 << surf->u.tex.level; - - if (tex->surface.has_stencil) - tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level; - } - - unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask; - while (compressed_cb_mask) { - unsigned i = u_bit_scan(&compressed_cb_mask); - struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i]; - struct si_texture *tex = (struct si_texture*)surf->texture; - - if (tex->surface.fmask_offset) { - tex->dirty_level_mask |= 1 << surf->u.tex.level; - tex->fmask_is_identity = false; - } - if (tex->dcc_gather_statistics) - tex->separate_dcc_dirty = true; - } + if (sctx->decompression_enabled) + return; + + if (sctx->framebuffer.state.zsbuf) { + struct pipe_surface *surf = sctx->framebuffer.state.zsbuf; + struct si_texture *tex = (struct si_texture *)surf->texture; + + tex->dirty_level_mask |= 1 << surf->u.tex.level; + + if (tex->surface.has_stencil) + tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level; + } + + unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask; + while (compressed_cb_mask) { + unsigned i = u_bit_scan(&compressed_cb_mask); + struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i]; + struct si_texture *tex = (struct si_texture *)surf->texture; + + if (tex->surface.fmask_offset) { + tex->dirty_level_mask |= 1 << surf->u.tex.level; + tex->fmask_is_identity = false; + } + if (tex->dcc_gather_statistics) + tex->separate_dcc_dirty = true; + } } static void si_dec_framebuffer_counters(const struct pipe_framebuffer_state *state) { - for (int i = 0; i < state->nr_cbufs; ++i) { - struct si_surface *surf = NULL; - struct si_texture *tex; + for (int i = 0; i < state->nr_cbufs; ++i) { + struct si_surface *surf = NULL; + struct si_texture *tex; - if (!state->cbufs[i]) - continue; - surf = (struct si_surface*)state->cbufs[i]; - tex = (struct si_texture*)surf->base.texture; + if (!state->cbufs[i]) + continue; + surf = (struct si_surface *)state->cbufs[i]; + tex = (struct si_texture *)surf->base.texture; - p_atomic_dec(&tex->framebuffers_bound); - } + p_atomic_dec(&tex->framebuffers_bound); + } } static void si_set_framebuffer_state(struct pipe_context *ctx, - const struct pipe_framebuffer_state *state) + const struct pipe_framebuffer_state *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_surface *surf = NULL; - struct si_texture *tex; - bool old_any_dst_linear = sctx->framebuffer.any_dst_linear; - unsigned old_nr_samples = sctx->framebuffer.nr_samples; - unsigned old_colorbuf_enabled_4bit = 
sctx->framebuffer.colorbuf_enabled_4bit; - bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf; - bool old_has_stencil = - old_has_zsbuf && - ((struct si_texture*)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil; - bool unbound = false; - int i; - - /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs - * when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0. - * We could implement the full workaround here, but it's a useless case. - */ - if ((!state->width || !state->height) && (state->nr_cbufs || state->zsbuf)) { - unreachable("the framebuffer shouldn't have zero area"); - return; - } - - si_update_fb_dirtiness_after_rendering(sctx); - - for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (!sctx->framebuffer.state.cbufs[i]) - continue; - - tex = (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture; - if (tex->dcc_gather_statistics) - vi_separate_dcc_stop_query(sctx, tex); - } - - /* Disable DCC if the formats are incompatible. */ - for (i = 0; i < state->nr_cbufs; i++) { - if (!state->cbufs[i]) - continue; - - surf = (struct si_surface*)state->cbufs[i]; - tex = (struct si_texture*)surf->base.texture; - - if (!surf->dcc_incompatible) - continue; - - /* Since the DCC decompression calls back into set_framebuffer- - * _state, we need to unbind the framebuffer, so that - * vi_separate_dcc_stop_query isn't called twice with the same - * color buffer. - */ - if (!unbound) { - util_copy_framebuffer_state(&sctx->framebuffer.state, NULL); - unbound = true; - } - - if (vi_dcc_enabled(tex, surf->base.u.tex.level)) - if (!si_texture_disable_dcc(sctx, tex)) - si_decompress_dcc(sctx, tex); - - surf->dcc_incompatible = false; - } - - /* Only flush TC when changing the framebuffer state, because - * the only client not using TC that can change textures is - * the framebuffer. - * - * Wait for compute shaders because of possible transitions: - * - FB write -> shader read - * - shader write -> FB read - * - * DB caches are flushed on demand (using si_decompress_textures). - * - * When MSAA is enabled, CB and TC caches are flushed on demand - * (after FMASK decompression). Shader write -> FB read transitions - * cannot happen for MSAA textures, because MSAA shader images are - * not supported. - * - * Only flush and wait for CB if there is actually a bound color buffer. - */ - if (sctx->framebuffer.uncompressed_cb_mask) { - si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, - sctx->framebuffer.CB_has_shader_readable_metadata, - sctx->framebuffer.all_DCC_pipe_aligned); - } - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; - - /* u_blitter doesn't invoke depth decompression when it does multiple - * blits in a row, but the only case when it matters for DB is when - * doing generate_mipmap. So here we flush DB manually between - * individual generate_mipmap blits. - * Note that lower mipmap levels aren't compressed. - */ - if (sctx->generate_mipmap_for_depth) { - si_make_DB_shader_coherent(sctx, 1, false, - sctx->framebuffer.DB_has_shader_readable_metadata); - } else if (sctx->chip_class == GFX9) { - /* It appears that DB metadata "leaks" in a sequence of: - * - depth clear - * - DCC decompress for shader image writes (with DB disabled) - * - render with DEPTH_BEFORE_SHADER=1 - * Flushing DB metadata works around the problem. - */ - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META; - } - - /* Take the maximum of the old and new count. If the new count is lower, - * dirtying is needed to disable the unbound colorbuffers. 
- */ - sctx->framebuffer.dirty_cbufs |= - (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1; - sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf; - - si_dec_framebuffer_counters(&sctx->framebuffer.state); - util_copy_framebuffer_state(&sctx->framebuffer.state, state); - - sctx->framebuffer.colorbuf_enabled_4bit = 0; - sctx->framebuffer.spi_shader_col_format = 0; - sctx->framebuffer.spi_shader_col_format_alpha = 0; - sctx->framebuffer.spi_shader_col_format_blend = 0; - sctx->framebuffer.spi_shader_col_format_blend_alpha = 0; - sctx->framebuffer.color_is_int8 = 0; - sctx->framebuffer.color_is_int10 = 0; - - sctx->framebuffer.compressed_cb_mask = 0; - sctx->framebuffer.uncompressed_cb_mask = 0; - sctx->framebuffer.displayable_dcc_cb_mask = 0; - sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state); - sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples; - sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples); - sctx->framebuffer.any_dst_linear = false; - sctx->framebuffer.CB_has_shader_readable_metadata = false; - sctx->framebuffer.DB_has_shader_readable_metadata = false; - sctx->framebuffer.all_DCC_pipe_aligned = true; - sctx->framebuffer.min_bytes_per_pixel = 0; - - for (i = 0; i < state->nr_cbufs; i++) { - if (!state->cbufs[i]) - continue; - - surf = (struct si_surface*)state->cbufs[i]; - tex = (struct si_texture*)surf->base.texture; - - if (!surf->color_initialized) { - si_initialize_color_surface(sctx, surf); - } - - sctx->framebuffer.colorbuf_enabled_4bit |= 0xf << (i * 4); - sctx->framebuffer.spi_shader_col_format |= - surf->spi_shader_col_format << (i * 4); - sctx->framebuffer.spi_shader_col_format_alpha |= - surf->spi_shader_col_format_alpha << (i * 4); - sctx->framebuffer.spi_shader_col_format_blend |= - surf->spi_shader_col_format_blend << (i * 4); - sctx->framebuffer.spi_shader_col_format_blend_alpha |= - surf->spi_shader_col_format_blend_alpha << (i * 4); - - if (surf->color_is_int8) - sctx->framebuffer.color_is_int8 |= 1 << i; - if (surf->color_is_int10) - sctx->framebuffer.color_is_int10 |= 1 << i; - - if (tex->surface.fmask_offset) - sctx->framebuffer.compressed_cb_mask |= 1 << i; - else - sctx->framebuffer.uncompressed_cb_mask |= 1 << i; - - if (tex->surface.dcc_offset) - sctx->framebuffer.displayable_dcc_cb_mask |= 1 << i; - - /* Don't update nr_color_samples for non-AA buffers. - * (e.g. destination of MSAA resolve) - */ - if (tex->buffer.b.b.nr_samples >= 2 && - tex->buffer.b.b.nr_storage_samples < tex->buffer.b.b.nr_samples) { - sctx->framebuffer.nr_color_samples = - MIN2(sctx->framebuffer.nr_color_samples, - tex->buffer.b.b.nr_storage_samples); - sctx->framebuffer.nr_color_samples = - MAX2(1, sctx->framebuffer.nr_color_samples); - } - - if (tex->surface.is_linear) - sctx->framebuffer.any_dst_linear = true; - - if (vi_dcc_enabled(tex, surf->base.u.tex.level)) { - sctx->framebuffer.CB_has_shader_readable_metadata = true; - - if (sctx->chip_class >= GFX9 && - !tex->surface.u.gfx9.dcc.pipe_aligned) - sctx->framebuffer.all_DCC_pipe_aligned = false; - } - - si_context_add_resource_size(sctx, surf->base.texture); - - p_atomic_inc(&tex->framebuffers_bound); - - if (tex->dcc_gather_statistics) { - /* Dirty tracking must be enabled for DCC usage analysis. */ - sctx->framebuffer.compressed_cb_mask |= 1 << i; - vi_separate_dcc_start_query(sctx, tex); - } - - /* Update the minimum but don't keep 0. 
*/ - if (!sctx->framebuffer.min_bytes_per_pixel || - tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) - sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe; - } - - /* For optimal DCC performance. */ - if (sctx->chip_class >= GFX10) - sctx->framebuffer.dcc_overwrite_combiner_watermark = 6; - else - sctx->framebuffer.dcc_overwrite_combiner_watermark = 4; - - struct si_texture *zstex = NULL; - - if (state->zsbuf) { - surf = (struct si_surface*)state->zsbuf; - zstex = (struct si_texture*)surf->base.texture; - - if (!surf->depth_initialized) { - si_init_depth_surface(sctx, surf); - } - - if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, - PIPE_MASK_ZS)) - sctx->framebuffer.DB_has_shader_readable_metadata = true; - - si_context_add_resource_size(sctx, surf->base.texture); - - /* Update the minimum but don't keep 0. */ - if (!sctx->framebuffer.min_bytes_per_pixel || - zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) - sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe; - } - - si_update_ps_colorbuf0_slot(sctx); - si_update_poly_offset_state(sctx); - si_update_ngg_small_prim_precision(sctx); - si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); - - if (sctx->screen->dpbb_allowed) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - - if (sctx->framebuffer.any_dst_linear != old_any_dst_linear) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - - if (sctx->screen->has_out_of_order_rast && - (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit || - !!sctx->framebuffer.state.zsbuf != old_has_zsbuf || - (zstex && zstex->surface.has_stencil != old_has_stencil))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - - if (sctx->framebuffer.nr_samples != old_nr_samples) { - struct pipe_constant_buffer constbuf = {0}; - - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - - constbuf.buffer = sctx->sample_pos_buffer; - - /* Set sample locations as fragment shader constants. */ - switch (sctx->framebuffer.nr_samples) { - case 1: - constbuf.buffer_offset = 0; - break; - case 2: - constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x2 - - (ubyte*)sctx->sample_positions.x1; - break; - case 4: - constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x4 - - (ubyte*)sctx->sample_positions.x1; - break; - case 8: - constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x8 - - (ubyte*)sctx->sample_positions.x1; - break; - case 16: - constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x16 - - (ubyte*)sctx->sample_positions.x1; - break; - default: - PRINT_ERR("Requested an invalid number of samples %i.\n", - sctx->framebuffer.nr_samples); - assert(0); - } - constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4; - si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf); - - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); - } - - sctx->do_update_shaders = true; - - if (!sctx->decompression_enabled) { - /* Prevent textures decompression when the framebuffer state - * changes come from the decompression passes themselves. 
- */ - sctx->need_check_render_feedback = true; - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_surface *surf = NULL; + struct si_texture *tex; + bool old_any_dst_linear = sctx->framebuffer.any_dst_linear; + unsigned old_nr_samples = sctx->framebuffer.nr_samples; + unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit; + bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf; + bool old_has_stencil = + old_has_zsbuf && + ((struct si_texture *)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil; + bool unbound = false; + int i; + + /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs + * when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0. + * We could implement the full workaround here, but it's a useless case. + */ + if ((!state->width || !state->height) && (state->nr_cbufs || state->zsbuf)) { + unreachable("the framebuffer shouldn't have zero area"); + return; + } + + si_update_fb_dirtiness_after_rendering(sctx); + + for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + if (!sctx->framebuffer.state.cbufs[i]) + continue; + + tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture; + if (tex->dcc_gather_statistics) + vi_separate_dcc_stop_query(sctx, tex); + } + + /* Disable DCC if the formats are incompatible. */ + for (i = 0; i < state->nr_cbufs; i++) { + if (!state->cbufs[i]) + continue; + + surf = (struct si_surface *)state->cbufs[i]; + tex = (struct si_texture *)surf->base.texture; + + if (!surf->dcc_incompatible) + continue; + + /* Since the DCC decompression calls back into set_framebuffer- + * _state, we need to unbind the framebuffer, so that + * vi_separate_dcc_stop_query isn't called twice with the same + * color buffer. + */ + if (!unbound) { + util_copy_framebuffer_state(&sctx->framebuffer.state, NULL); + unbound = true; + } + + if (vi_dcc_enabled(tex, surf->base.u.tex.level)) + if (!si_texture_disable_dcc(sctx, tex)) + si_decompress_dcc(sctx, tex); + + surf->dcc_incompatible = false; + } + + /* Only flush TC when changing the framebuffer state, because + * the only client not using TC that can change textures is + * the framebuffer. + * + * Wait for compute shaders because of possible transitions: + * - FB write -> shader read + * - shader write -> FB read + * + * DB caches are flushed on demand (using si_decompress_textures). + * + * When MSAA is enabled, CB and TC caches are flushed on demand + * (after FMASK decompression). Shader write -> FB read transitions + * cannot happen for MSAA textures, because MSAA shader images are + * not supported. + * + * Only flush and wait for CB if there is actually a bound color buffer. + */ + if (sctx->framebuffer.uncompressed_cb_mask) { + si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, + sctx->framebuffer.CB_has_shader_readable_metadata, + sctx->framebuffer.all_DCC_pipe_aligned); + } + + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + + /* u_blitter doesn't invoke depth decompression when it does multiple + * blits in a row, but the only case when it matters for DB is when + * doing generate_mipmap. So here we flush DB manually between + * individual generate_mipmap blits. + * Note that lower mipmap levels aren't compressed. 
+ */ + if (sctx->generate_mipmap_for_depth) { + si_make_DB_shader_coherent(sctx, 1, false, sctx->framebuffer.DB_has_shader_readable_metadata); + } else if (sctx->chip_class == GFX9) { + /* It appears that DB metadata "leaks" in a sequence of: + * - depth clear + * - DCC decompress for shader image writes (with DB disabled) + * - render with DEPTH_BEFORE_SHADER=1 + * Flushing DB metadata works around the problem. + */ + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META; + } + + /* Take the maximum of the old and new count. If the new count is lower, + * dirtying is needed to disable the unbound colorbuffers. + */ + sctx->framebuffer.dirty_cbufs |= + (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1; + sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf; + + si_dec_framebuffer_counters(&sctx->framebuffer.state); + util_copy_framebuffer_state(&sctx->framebuffer.state, state); + + sctx->framebuffer.colorbuf_enabled_4bit = 0; + sctx->framebuffer.spi_shader_col_format = 0; + sctx->framebuffer.spi_shader_col_format_alpha = 0; + sctx->framebuffer.spi_shader_col_format_blend = 0; + sctx->framebuffer.spi_shader_col_format_blend_alpha = 0; + sctx->framebuffer.color_is_int8 = 0; + sctx->framebuffer.color_is_int10 = 0; + + sctx->framebuffer.compressed_cb_mask = 0; + sctx->framebuffer.uncompressed_cb_mask = 0; + sctx->framebuffer.displayable_dcc_cb_mask = 0; + sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state); + sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples; + sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples); + sctx->framebuffer.any_dst_linear = false; + sctx->framebuffer.CB_has_shader_readable_metadata = false; + sctx->framebuffer.DB_has_shader_readable_metadata = false; + sctx->framebuffer.all_DCC_pipe_aligned = true; + sctx->framebuffer.min_bytes_per_pixel = 0; + + for (i = 0; i < state->nr_cbufs; i++) { + if (!state->cbufs[i]) + continue; + + surf = (struct si_surface *)state->cbufs[i]; + tex = (struct si_texture *)surf->base.texture; + + if (!surf->color_initialized) { + si_initialize_color_surface(sctx, surf); + } + + sctx->framebuffer.colorbuf_enabled_4bit |= 0xf << (i * 4); + sctx->framebuffer.spi_shader_col_format |= surf->spi_shader_col_format << (i * 4); + sctx->framebuffer.spi_shader_col_format_alpha |= surf->spi_shader_col_format_alpha << (i * 4); + sctx->framebuffer.spi_shader_col_format_blend |= surf->spi_shader_col_format_blend << (i * 4); + sctx->framebuffer.spi_shader_col_format_blend_alpha |= surf->spi_shader_col_format_blend_alpha + << (i * 4); + + if (surf->color_is_int8) + sctx->framebuffer.color_is_int8 |= 1 << i; + if (surf->color_is_int10) + sctx->framebuffer.color_is_int10 |= 1 << i; + + if (tex->surface.fmask_offset) + sctx->framebuffer.compressed_cb_mask |= 1 << i; + else + sctx->framebuffer.uncompressed_cb_mask |= 1 << i; + + if (tex->surface.dcc_offset) + sctx->framebuffer.displayable_dcc_cb_mask |= 1 << i; + + /* Don't update nr_color_samples for non-AA buffers. + * (e.g. 
destination of MSAA resolve) + */ + if (tex->buffer.b.b.nr_samples >= 2 && + tex->buffer.b.b.nr_storage_samples < tex->buffer.b.b.nr_samples) { + sctx->framebuffer.nr_color_samples = + MIN2(sctx->framebuffer.nr_color_samples, tex->buffer.b.b.nr_storage_samples); + sctx->framebuffer.nr_color_samples = MAX2(1, sctx->framebuffer.nr_color_samples); + } + + if (tex->surface.is_linear) + sctx->framebuffer.any_dst_linear = true; + + if (vi_dcc_enabled(tex, surf->base.u.tex.level)) { + sctx->framebuffer.CB_has_shader_readable_metadata = true; + + if (sctx->chip_class >= GFX9 && !tex->surface.u.gfx9.dcc.pipe_aligned) + sctx->framebuffer.all_DCC_pipe_aligned = false; + } + + si_context_add_resource_size(sctx, surf->base.texture); + + p_atomic_inc(&tex->framebuffers_bound); + + if (tex->dcc_gather_statistics) { + /* Dirty tracking must be enabled for DCC usage analysis. */ + sctx->framebuffer.compressed_cb_mask |= 1 << i; + vi_separate_dcc_start_query(sctx, tex); + } + + /* Update the minimum but don't keep 0. */ + if (!sctx->framebuffer.min_bytes_per_pixel || + tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) + sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe; + } + + /* For optimal DCC performance. */ + if (sctx->chip_class >= GFX10) + sctx->framebuffer.dcc_overwrite_combiner_watermark = 6; + else + sctx->framebuffer.dcc_overwrite_combiner_watermark = 4; + + struct si_texture *zstex = NULL; + + if (state->zsbuf) { + surf = (struct si_surface *)state->zsbuf; + zstex = (struct si_texture *)surf->base.texture; + + if (!surf->depth_initialized) { + si_init_depth_surface(sctx, surf); + } + + if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, PIPE_MASK_ZS)) + sctx->framebuffer.DB_has_shader_readable_metadata = true; + + si_context_add_resource_size(sctx, surf->base.texture); + + /* Update the minimum but don't keep 0. */ + if (!sctx->framebuffer.min_bytes_per_pixel || + zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) + sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe; + } + + si_update_ps_colorbuf0_slot(sctx); + si_update_poly_offset_state(sctx); + si_update_ngg_small_prim_precision(sctx); + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + + if (sctx->framebuffer.any_dst_linear != old_any_dst_linear) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + + if (sctx->screen->has_out_of_order_rast && + (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit || + !!sctx->framebuffer.state.zsbuf != old_has_zsbuf || + (zstex && zstex->surface.has_stencil != old_has_stencil))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + + if (sctx->framebuffer.nr_samples != old_nr_samples) { + struct pipe_constant_buffer constbuf = {0}; + + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + + constbuf.buffer = sctx->sample_pos_buffer; + + /* Set sample locations as fragment shader constants. 
*/ + switch (sctx->framebuffer.nr_samples) { + case 1: + constbuf.buffer_offset = 0; + break; + case 2: + constbuf.buffer_offset = + (ubyte *)sctx->sample_positions.x2 - (ubyte *)sctx->sample_positions.x1; + break; + case 4: + constbuf.buffer_offset = + (ubyte *)sctx->sample_positions.x4 - (ubyte *)sctx->sample_positions.x1; + break; + case 8: + constbuf.buffer_offset = + (ubyte *)sctx->sample_positions.x8 - (ubyte *)sctx->sample_positions.x1; + break; + case 16: + constbuf.buffer_offset = + (ubyte *)sctx->sample_positions.x16 - (ubyte *)sctx->sample_positions.x1; + break; + default: + PRINT_ERR("Requested an invalid number of samples %i.\n", sctx->framebuffer.nr_samples); + assert(0); + } + constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4; + si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf); + + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + } + + sctx->do_update_shaders = true; + + if (!sctx->decompression_enabled) { + /* Prevent textures decompression when the framebuffer state + * changes come from the decompression passes themselves. + */ + sctx->need_check_render_feedback = true; + } } static void si_emit_framebuffer_state(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct pipe_framebuffer_state *state = &sctx->framebuffer.state; - unsigned i, nr_cbufs = state->nr_cbufs; - struct si_texture *tex = NULL; - struct si_surface *cb = NULL; - unsigned cb_color_info = 0; - - /* Colorbuffers. */ - for (i = 0; i < nr_cbufs; i++) { - uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base; - unsigned cb_color_attrib; - - if (!(sctx->framebuffer.dirty_cbufs & (1 << i))) - continue; - - cb = (struct si_surface*)state->cbufs[i]; - if (!cb) { - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, - S_028C70_FORMAT(V_028C70_COLOR_INVALID)); - continue; - } - - tex = (struct si_texture *)cb->base.texture; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - &tex->buffer, RADEON_USAGE_READWRITE, - tex->buffer.b.b.nr_samples > 1 ? - RADEON_PRIO_COLOR_BUFFER_MSAA : - RADEON_PRIO_COLOR_BUFFER); - - if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - tex->cmask_buffer, RADEON_USAGE_READWRITE, - RADEON_PRIO_SEPARATE_META); - } - - if (tex->dcc_separate_buffer) - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - tex->dcc_separate_buffer, - RADEON_USAGE_READWRITE, - RADEON_PRIO_SEPARATE_META); - - /* Compute mutable surface parameters. */ - cb_color_base = tex->buffer.gpu_address >> 8; - cb_color_fmask = 0; - cb_color_cmask = tex->cmask_base_address_reg; - cb_dcc_base = 0; - cb_color_info = cb->cb_color_info | tex->cb_color_info; - cb_color_attrib = cb->cb_color_attrib; - - if (cb->base.u.tex.level > 0) - cb_color_info &= C_028C70_FAST_CLEAR; - - if (tex->surface.fmask_offset) { - cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8; - cb_color_fmask |= tex->surface.fmask_tile_swizzle; - } - - /* Set up DCC. */ - if (vi_dcc_enabled(tex, cb->base.u.tex.level)) { - bool is_msaa_resolve_dst = state->cbufs[0] && - state->cbufs[0]->texture->nr_samples > 1 && - state->cbufs[1] == &cb->base && - state->cbufs[1]->texture->nr_samples <= 1; - - if (!is_msaa_resolve_dst) - cb_color_info |= S_028C70_DCC_ENABLE(1); - - cb_dcc_base = ((!tex->dcc_separate_buffer ? 
tex->buffer.gpu_address : 0) + - tex->surface.dcc_offset) >> 8; - - unsigned dcc_tile_swizzle = tex->surface.tile_swizzle; - dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8; - cb_dcc_base |= dcc_tile_swizzle; - } - - if (sctx->chip_class >= GFX10) { - unsigned cb_color_attrib3; - - /* Set mutable surface parameters. */ - cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; - cb_color_base |= tex->surface.tile_swizzle; - if (!tex->surface.fmask_offset) - cb_color_fmask = cb_color_base; - if (cb->base.u.tex.level > 0) - cb_color_cmask = cb_color_base; - - cb_color_attrib3 = cb->cb_color_attrib3 | - S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | - S_028EE0_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | - S_028EE0_CMASK_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) | - S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.dcc.pipe_aligned); - - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - - radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, - cb_color_base >> 32); - radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, - cb_color_cmask >> 32); - radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, - cb_color_fmask >> 32); - radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, - cb_dcc_base >> 32); - radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, - cb->cb_color_attrib2); - radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, - cb_color_attrib3); - } else if (sctx->chip_class == GFX9) { - struct gfx9_surf_meta_flags meta; - - if (tex->surface.dcc_offset) - meta = tex->surface.u.gfx9.dcc; - else - meta = tex->surface.u.gfx9.cmask; - - /* Set mutable surface parameters. 
*/ - cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; - cb_color_base |= tex->surface.tile_swizzle; - if (!tex->surface.fmask_offset) - cb_color_fmask = cb_color_base; - if (cb->base.u.tex.level > 0) - cb_color_cmask = cb_color_base; - cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | - S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | - S_028C74_RB_ALIGNED(meta.rb_aligned) | - S_028C74_PIPE_ALIGNED(meta.pipe_aligned); - - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ - radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ - - radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4, - S_0287A0_EPITCH(tex->surface.u.gfx9.surf.epitch)); - } else { - /* Compute mutable surface parameters (GFX6-GFX8). */ - const struct legacy_surf_level *level_info = - &tex->surface.u.legacy.level[cb->base.u.tex.level]; - unsigned pitch_tile_max, slice_tile_max, tile_mode_index; - unsigned cb_color_pitch, cb_color_slice, cb_color_fmask_slice; - - cb_color_base += level_info->offset >> 8; - /* Only macrotiled modes can set tile swizzle. */ - if (level_info->mode == RADEON_SURF_MODE_2D) - cb_color_base |= tex->surface.tile_swizzle; - - if (!tex->surface.fmask_offset) - cb_color_fmask = cb_color_base; - if (cb->base.u.tex.level > 0) - cb_color_cmask = cb_color_base; - if (cb_dcc_base) - cb_dcc_base += level_info->dcc_offset >> 8; - - pitch_tile_max = level_info->nblk_x / 8 - 1; - slice_tile_max = level_info->nblk_x * - level_info->nblk_y / 64 - 1; - tile_mode_index = si_tile_mode_index(tex, cb->base.u.tex.level, false); - - cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index); - cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max); - cb_color_slice = S_028C68_TILE_MAX(slice_tile_max); - - if (tex->surface.fmask_offset) { - if (sctx->chip_class >= GFX7) - cb_color_pitch |= S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1); - cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index); - cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max); - } else { - /* This must be set for fast clear to work without FMASK. */ - if (sctx->chip_class >= GFX7) - cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max); - cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index); - cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max); - } - - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, - sctx->chip_class >= GFX8 ? 
14 : 13); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */ - radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, tex->surface.u.legacy.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - - if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */ - radeon_emit(cs, cb_dcc_base); - } - } - for (; i < 8 ; i++) - if (sctx->framebuffer.dirty_cbufs & (1 << i)) - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); - - /* ZS buffer. */ - if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { - struct si_surface *zb = (struct si_surface*)state->zsbuf; - struct si_texture *tex = (struct si_texture*)zb->base.texture; - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - &tex->buffer, RADEON_USAGE_READWRITE, - zb->base.texture->nr_samples > 1 ? - RADEON_PRIO_DEPTH_BUFFER_MSAA : - RADEON_PRIO_DEPTH_BUFFER); - - if (sctx->chip_class >= GFX10) { - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); - - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7); - radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ - radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0)); - radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - - radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5); - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ - } else if (sctx->chip_class == GFX9) { - radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3); - radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ - radeon_emit(cs, S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ - - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10); - radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0)); - radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, 
zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ - - radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2); - radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */ - radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ - } else { - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9); - radeon_emit(cs, zb->db_depth_info); /* DB_DEPTH_INFO */ - radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ - S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0)); - radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ - radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */ - } - - radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); - radeon_emit(cs, tex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */ - radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */ - - radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view); - radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface); - } else if (sctx->framebuffer.dirty_zsbuf) { - if (sctx->chip_class == GFX9) - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2); - else - radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2); - - radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ - radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ - } - - /* Framebuffer dimensions. */ - /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_config() */ - radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, - S_028208_BR_X(state->width) | S_028208_BR_Y(state->height)); - - if (sctx->screen->dfsm_allowed) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); - } - - sctx->framebuffer.dirty_cbufs = 0; - sctx->framebuffer.dirty_zsbuf = false; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct pipe_framebuffer_state *state = &sctx->framebuffer.state; + unsigned i, nr_cbufs = state->nr_cbufs; + struct si_texture *tex = NULL; + struct si_surface *cb = NULL; + unsigned cb_color_info = 0; + + /* Colorbuffers. */ + for (i = 0; i < nr_cbufs; i++) { + uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base; + unsigned cb_color_attrib; + + if (!(sctx->framebuffer.dirty_cbufs & (1 << i))) + continue; + + cb = (struct si_surface *)state->cbufs[i]; + if (!cb) { + radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, + S_028C70_FORMAT(V_028C70_COLOR_INVALID)); + continue; + } + + tex = (struct si_texture *)cb->base.texture; + radeon_add_to_buffer_list( + sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE, + tex->buffer.b.b.nr_samples > 1 ? 
RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER); + + if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, tex->cmask_buffer, RADEON_USAGE_READWRITE, + RADEON_PRIO_SEPARATE_META); + } + + if (tex->dcc_separate_buffer) + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, tex->dcc_separate_buffer, + RADEON_USAGE_READWRITE, RADEON_PRIO_SEPARATE_META); + + /* Compute mutable surface parameters. */ + cb_color_base = tex->buffer.gpu_address >> 8; + cb_color_fmask = 0; + cb_color_cmask = tex->cmask_base_address_reg; + cb_dcc_base = 0; + cb_color_info = cb->cb_color_info | tex->cb_color_info; + cb_color_attrib = cb->cb_color_attrib; + + if (cb->base.u.tex.level > 0) + cb_color_info &= C_028C70_FAST_CLEAR; + + if (tex->surface.fmask_offset) { + cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8; + cb_color_fmask |= tex->surface.fmask_tile_swizzle; + } + + /* Set up DCC. */ + if (vi_dcc_enabled(tex, cb->base.u.tex.level)) { + bool is_msaa_resolve_dst = state->cbufs[0] && state->cbufs[0]->texture->nr_samples > 1 && + state->cbufs[1] == &cb->base && + state->cbufs[1]->texture->nr_samples <= 1; + + if (!is_msaa_resolve_dst) + cb_color_info |= S_028C70_DCC_ENABLE(1); + + cb_dcc_base = + ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.dcc_offset) >> + 8; + + unsigned dcc_tile_swizzle = tex->surface.tile_swizzle; + dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8; + cb_dcc_base |= dcc_tile_swizzle; + } + + if (sctx->chip_class >= GFX10) { + unsigned cb_color_attrib3; + + /* Set mutable surface parameters. */ + cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; + cb_color_base |= tex->surface.tile_swizzle; + if (!tex->surface.fmask_offset) + cb_color_fmask = cb_color_base; + if (cb->base.u.tex.level > 0) + cb_color_cmask = cb_color_base; + + cb_color_attrib3 = cb->cb_color_attrib3 | + S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | + S_028EE0_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | + S_028EE0_CMASK_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) | + S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.dcc.pipe_aligned); + + radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); + radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(cs, 0); /* hole */ + radeon_emit(cs, 0); /* hole */ + radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(cs, 0); /* hole */ + radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(cs, 0); /* hole */ + radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + + radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); + radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, + cb_color_cmask >> 32); + radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, + cb_color_fmask >> 32); + radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32); + radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2); + radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, 
cb_color_attrib3); + } else if (sctx->chip_class == GFX9) { + struct gfx9_surf_meta_flags meta; + + if (tex->surface.dcc_offset) + meta = tex->surface.u.gfx9.dcc; + else + meta = tex->surface.u.gfx9.cmask; + + /* Set mutable surface parameters. */ + cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; + cb_color_base |= tex->surface.tile_swizzle; + if (!tex->surface.fmask_offset) + cb_color_fmask = cb_color_base; + if (cb->base.u.tex.level > 0) + cb_color_cmask = cb_color_base; + cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | + S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | + S_028C74_RB_ALIGNED(meta.rb_aligned) | + S_028C74_PIPE_ALIGNED(meta.pipe_aligned); + + radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); + radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ + radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ + radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ + radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ + radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ + + radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4, + S_0287A0_EPITCH(tex->surface.u.gfx9.surf.epitch)); + } else { + /* Compute mutable surface parameters (GFX6-GFX8). */ + const struct legacy_surf_level *level_info = + &tex->surface.u.legacy.level[cb->base.u.tex.level]; + unsigned pitch_tile_max, slice_tile_max, tile_mode_index; + unsigned cb_color_pitch, cb_color_slice, cb_color_fmask_slice; + + cb_color_base += level_info->offset >> 8; + /* Only macrotiled modes can set tile swizzle. */ + if (level_info->mode == RADEON_SURF_MODE_2D) + cb_color_base |= tex->surface.tile_swizzle; + + if (!tex->surface.fmask_offset) + cb_color_fmask = cb_color_base; + if (cb->base.u.tex.level > 0) + cb_color_cmask = cb_color_base; + if (cb_dcc_base) + cb_dcc_base += level_info->dcc_offset >> 8; + + pitch_tile_max = level_info->nblk_x / 8 - 1; + slice_tile_max = level_info->nblk_x * level_info->nblk_y / 64 - 1; + tile_mode_index = si_tile_mode_index(tex, cb->base.u.tex.level, false); + + cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index); + cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max); + cb_color_slice = S_028C68_TILE_MAX(slice_tile_max); + + if (tex->surface.fmask_offset) { + if (sctx->chip_class >= GFX7) + cb_color_pitch |= + S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1); + cb_color_attrib |= + S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index); + cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max); + } else { + /* This must be set for fast clear to work without FMASK. 
*/ + if (sctx->chip_class >= GFX7) + cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max); + cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index); + cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max); + } + + radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, + sctx->chip_class >= GFX8 ? 14 : 13); + radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */ + radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */ + radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(cs, tex->surface.u.legacy.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ + radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ + radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + + if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */ + radeon_emit(cs, cb_dcc_base); + } + } + for (; i < 8; i++) + if (sctx->framebuffer.dirty_cbufs & (1 << i)) + radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); + + /* ZS buffer. */ + if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { + struct si_surface *zb = (struct si_surface *)state->zsbuf; + struct si_texture *tex = (struct si_texture *)zb->base.texture; + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE, + zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA + : RADEON_PRIO_DEPTH_BUFFER); + + if (sctx->chip_class >= GFX10) { + radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); + + radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7); + radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ + radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0)); + radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + + radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5); + radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ + radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ + radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ + } else if (sctx->chip_class == GFX9) { + radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3); + radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ + radeon_emit(cs, + S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ + radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ + + radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10); + radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0)); + radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ + 
radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(cs, + S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ + + radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2); + radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */ + radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ + } else { + radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + + radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9); + radeon_emit(cs, zb->db_depth_info); /* DB_DEPTH_INFO */ + radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ + S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0)); + radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ + radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */ + } + + radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); + radeon_emit(cs, tex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */ + radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */ + + radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view); + radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface); + } else if (sctx->framebuffer.dirty_zsbuf) { + if (sctx->chip_class == GFX9) + radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2); + else + radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2); + + radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ + radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ + } + + /* Framebuffer dimensions. */ + /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_config() */ + radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, + S_028208_BR_X(state->width) | S_028208_BR_Y(state->height)); + + if (sctx->screen->dfsm_allowed) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } + + sctx->framebuffer.dirty_cbufs = 0; + sctx->framebuffer.dirty_zsbuf = false; } static void si_emit_msaa_sample_locs(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned nr_samples = sctx->framebuffer.nr_samples; - bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug; - - /* Smoothing (only possible with nr_samples == 1) uses the same - * sample locations as the MSAA it simulates. - */ - if (nr_samples <= 1 && sctx->smoothing_enabled) - nr_samples = SI_NUM_SMOOTH_AA_SAMPLES; - - /* On Polaris, the small primitive filter uses the sample locations - * even when MSAA is off, so we need to make sure they're set to 0. - * - * GFX10 uses sample locations unconditionally, so they always need - * to be set up. 
- */ - if ((nr_samples >= 2 || has_msaa_sample_loc_bug || - sctx->chip_class >= GFX10) && - nr_samples != sctx->sample_locs_num_samples) { - sctx->sample_locs_num_samples = nr_samples; - si_emit_sample_locations(cs, nr_samples); - } - - if (sctx->family >= CHIP_POLARIS10) { - unsigned small_prim_filter_cntl = - S_028830_SMALL_PRIM_FILTER_ENABLE(1) | - /* line bug */ - S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12); - - /* The alternative of setting sample locations to 0 would - * require a DB flush to avoid Z errors, see - * https://bugs.freedesktop.org/show_bug.cgi?id=96908 - */ - if (has_msaa_sample_loc_bug && - sctx->framebuffer.nr_samples > 1 && - !rs->multisample_enable) - small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE; - - radeon_opt_set_context_reg(sctx, - R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL, - SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, - small_prim_filter_cntl); - } - - /* The exclusion bits can be set to improve rasterization efficiency - * if no sample lies on the pixel boundary (-8 sample offset). - */ - bool exclusion = sctx->chip_class >= GFX7 && - (!rs->multisample_enable || nr_samples != 16); - radeon_opt_set_context_reg(sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL, - SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, - S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | - S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion)); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + unsigned nr_samples = sctx->framebuffer.nr_samples; + bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug; + + /* Smoothing (only possible with nr_samples == 1) uses the same + * sample locations as the MSAA it simulates. + */ + if (nr_samples <= 1 && sctx->smoothing_enabled) + nr_samples = SI_NUM_SMOOTH_AA_SAMPLES; + + /* On Polaris, the small primitive filter uses the sample locations + * even when MSAA is off, so we need to make sure they're set to 0. + * + * GFX10 uses sample locations unconditionally, so they always need + * to be set up. + */ + if ((nr_samples >= 2 || has_msaa_sample_loc_bug || sctx->chip_class >= GFX10) && + nr_samples != sctx->sample_locs_num_samples) { + sctx->sample_locs_num_samples = nr_samples; + si_emit_sample_locations(cs, nr_samples); + } + + if (sctx->family >= CHIP_POLARIS10) { + unsigned small_prim_filter_cntl = + S_028830_SMALL_PRIM_FILTER_ENABLE(1) | + /* line bug */ + S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12); + + /* The alternative of setting sample locations to 0 would + * require a DB flush to avoid Z errors, see + * https://bugs.freedesktop.org/show_bug.cgi?id=96908 + */ + if (has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1 && !rs->multisample_enable) + small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE; + + radeon_opt_set_context_reg(sctx, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL, + SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, small_prim_filter_cntl); + } + + /* The exclusion bits can be set to improve rasterization efficiency + * if no sample lies on the pixel boundary (-8 sample offset). 
+ */ + bool exclusion = sctx->chip_class >= GFX7 && (!rs->multisample_enable || nr_samples != 16); + radeon_opt_set_context_reg( + sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL, SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, + S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion)); } static bool si_out_of_order_rasterization(struct si_context *sctx) { - struct si_state_blend *blend = sctx->queued.named.blend; - struct si_state_dsa *dsa = sctx->queued.named.dsa; + struct si_state_blend *blend = sctx->queued.named.blend; + struct si_state_dsa *dsa = sctx->queued.named.dsa; - if (!sctx->screen->has_out_of_order_rast) - return false; + if (!sctx->screen->has_out_of_order_rast) + return false; - unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit; + unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit; - colormask &= blend->cb_target_enabled_4bit; + colormask &= blend->cb_target_enabled_4bit; - /* Conservative: No logic op. */ - if (colormask && blend->logicop_enable) - return false; + /* Conservative: No logic op. */ + if (colormask && blend->logicop_enable) + return false; - struct si_dsa_order_invariance dsa_order_invariant = { - .zs = true, .pass_set = true, .pass_last = false - }; + struct si_dsa_order_invariance dsa_order_invariant = {.zs = true, + .pass_set = true, + .pass_last = false}; - if (sctx->framebuffer.state.zsbuf) { - struct si_texture *zstex = - (struct si_texture*)sctx->framebuffer.state.zsbuf->texture; - bool has_stencil = zstex->surface.has_stencil; - dsa_order_invariant = dsa->order_invariance[has_stencil]; - if (!dsa_order_invariant.zs) - return false; + if (sctx->framebuffer.state.zsbuf) { + struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture; + bool has_stencil = zstex->surface.has_stencil; + dsa_order_invariant = dsa->order_invariance[has_stencil]; + if (!dsa_order_invariant.zs) + return false; - /* The set of PS invocations is always order invariant, - * except when early Z/S tests are requested. */ - if (sctx->ps_shader.cso && - sctx->ps_shader.cso->info.writes_memory && - sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] && - !dsa_order_invariant.pass_set) - return false; + /* The set of PS invocations is always order invariant, + * except when early Z/S tests are requested. */ + if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.writes_memory && + sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] && + !dsa_order_invariant.pass_set) + return false; - if (sctx->num_perfect_occlusion_queries != 0 && - !dsa_order_invariant.pass_set) - return false; - } + if (sctx->num_perfect_occlusion_queries != 0 && !dsa_order_invariant.pass_set) + return false; + } - if (!colormask) - return true; + if (!colormask) + return true; - unsigned blendmask = colormask & blend->blend_enable_4bit; + unsigned blendmask = colormask & blend->blend_enable_4bit; - if (blendmask) { - /* Only commutative blending. */ - if (blendmask & ~blend->commutative_4bit) - return false; + if (blendmask) { + /* Only commutative blending. 
*/ + if (blendmask & ~blend->commutative_4bit) + return false; - if (!dsa_order_invariant.pass_set) - return false; - } + if (!dsa_order_invariant.pass_set) + return false; + } - if (colormask & ~blendmask) { - if (!dsa_order_invariant.pass_last) - return false; - } + if (colormask & ~blendmask) { + if (!dsa_order_invariant.pass_last) + return false; + } - return true; + return true; } static void si_emit_msaa_config(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes; - /* 33% faster rendering to linear color buffers */ - bool dst_is_linear = sctx->framebuffer.any_dst_linear; - bool out_of_order_rast = si_out_of_order_rasterization(sctx); - unsigned sc_mode_cntl_1 = - S_028A4C_WALK_SIZE(dst_is_linear) | - S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) | - S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) | - S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) | - S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) | - /* always 1: */ - S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | - S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | - S_028A4C_TILE_WALK_ORDER_ENABLE(1) | - S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) | - S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | - S_028A4C_FORCE_EOV_REZ_ENABLE(1); - unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | - S_028804_INCOHERENT_EQAA_READS(1) | - S_028804_INTERPOLATE_COMP_Z(1) | - S_028804_STATIC_ANCHOR_ASSOCIATIONS(1); - unsigned coverage_samples, color_samples, z_samples; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - - /* S: Coverage samples (up to 16x): - * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES) - * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES) - * - * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples): - * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES) - * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES) - * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or - * # from the closest defined sample if Z is uncompressed (same quality as the number of - * # Z samples). - * - * F: Color samples (up to 8x, must be <= coverage samples): - * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS) - * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES) - * - * Can be anything between coverage and color samples: - * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES) - * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES) - * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES) - * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE) - * # All are currently set the same as coverage samples. - * - * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown" - * flag for undefined color samples. A shader-based resolve must handle unknowns - * or mask them out with AND. Unknowns can also be guessed from neighbors via - * an edge-detect shader-based resolve, which is required to make "color samples = 1" - * useful. The CB resolve always drops unknowns. 
- * - * Sensible AA configurations: - * EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed - * EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed - * EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed - * EQAA 8s 8z 8f = 8x MSAA - * EQAA 8s 8z 4f - might look the same as 8x MSAA - * EQAA 8s 8z 2f - might look the same as 8x MSAA with low-density geometry - * EQAA 8s 4z 4f - might look the same as 8x MSAA if Z is compressed - * EQAA 8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed - * EQAA 4s 4z 4f = 4x MSAA - * EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry - * EQAA 2s 2z 2f = 2x MSAA - */ - if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) { - coverage_samples = sctx->framebuffer.nr_samples; - color_samples = sctx->framebuffer.nr_color_samples; - - if (sctx->framebuffer.state.zsbuf) { - z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples; - z_samples = MAX2(1, z_samples); - } else { - z_samples = coverage_samples; - } - } else if (sctx->smoothing_enabled) { - coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES; - } else { - coverage_samples = color_samples = z_samples = 1; - } - - /* Required by OpenGL line rasterization. - * - * TODO: We should also enable perpendicular endcaps for AA lines, - * but that requires implementing line stippling in the pixel - * shader. SC can only do line stippling with axis-aligned - * endcaps. - */ - unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1); - unsigned sc_aa_config = 0; - - if (coverage_samples > 1) { - /* distance from the pixel center, indexed by log2(nr_samples) */ - static unsigned max_dist[] = { - 0, /* unused */ - 4, /* 2x MSAA */ - 6, /* 4x MSAA */ - 7, /* 8x MSAA */ - 8, /* 16x MSAA */ - }; - unsigned log_samples = util_logbase2(coverage_samples); - unsigned log_z_samples = util_logbase2(z_samples); - unsigned ps_iter_samples = si_get_ps_iter_samples(sctx); - unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples); - - sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1); - sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) | - S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) | - S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples); - - if (sctx->framebuffer.nr_samples > 1) { - db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | - S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) | - S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | - S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples); - sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1); - } else if (sctx->smoothing_enabled) { - db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples); - } - } - - unsigned initial_cdw = cs->current.cdw; - - /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */ - radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, - SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl, - sc_aa_config); - /* R_028804_DB_EQAA */ - radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, - db_eqaa); - /* R_028A4C_PA_SC_MODE_CNTL_1 */ - radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, - SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1); - - if (initial_cdw != cs->current.cdw) { - sctx->context_roll = true; - - /* GFX9: Flush DFSM when the AA mode changes. 
*/ - if (sctx->screen->dfsm_allowed) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); - } - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes; + /* 33% faster rendering to linear color buffers */ + bool dst_is_linear = sctx->framebuffer.any_dst_linear; + bool out_of_order_rast = si_out_of_order_rasterization(sctx); + unsigned sc_mode_cntl_1 = + S_028A4C_WALK_SIZE(dst_is_linear) | S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) | + S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) | + S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) | + S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) | + /* always 1: */ + S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | + S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) | + S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1); + unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) | + S_028804_INTERPOLATE_COMP_Z(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1); + unsigned coverage_samples, color_samples, z_samples; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + /* S: Coverage samples (up to 16x): + * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES) + * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES) + * + * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples): + * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES) + * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES) + * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or + * # from the closest defined sample if Z is uncompressed (same quality as the number of + * # Z samples). + * + * F: Color samples (up to 8x, must be <= coverage samples): + * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS) + * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES) + * + * Can be anything between coverage and color samples: + * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES) + * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES) + * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES) + * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE) + * # All are currently set the same as coverage samples. + * + * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown" + * flag for undefined color samples. A shader-based resolve must handle unknowns + * or mask them out with AND. Unknowns can also be guessed from neighbors via + * an edge-detect shader-based resolve, which is required to make "color samples = 1" + * useful. The CB resolve always drops unknowns. 
+ * + * Sensible AA configurations: + * EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed + * EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed + * EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed + * EQAA 8s 8z 8f = 8x MSAA + * EQAA 8s 8z 4f - might look the same as 8x MSAA + * EQAA 8s 8z 2f - might look the same as 8x MSAA with low-density geometry + * EQAA 8s 4z 4f - might look the same as 8x MSAA if Z is compressed + * EQAA 8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed + * EQAA 4s 4z 4f = 4x MSAA + * EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry + * EQAA 2s 2z 2f = 2x MSAA + */ + if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) { + coverage_samples = sctx->framebuffer.nr_samples; + color_samples = sctx->framebuffer.nr_color_samples; + + if (sctx->framebuffer.state.zsbuf) { + z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples; + z_samples = MAX2(1, z_samples); + } else { + z_samples = coverage_samples; + } + } else if (sctx->smoothing_enabled) { + coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES; + } else { + coverage_samples = color_samples = z_samples = 1; + } + + /* Required by OpenGL line rasterization. + * + * TODO: We should also enable perpendicular endcaps for AA lines, + * but that requires implementing line stippling in the pixel + * shader. SC can only do line stippling with axis-aligned + * endcaps. + */ + unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1); + unsigned sc_aa_config = 0; + + if (coverage_samples > 1) { + /* distance from the pixel center, indexed by log2(nr_samples) */ + static unsigned max_dist[] = { + 0, /* unused */ + 4, /* 2x MSAA */ + 6, /* 4x MSAA */ + 7, /* 8x MSAA */ + 8, /* 16x MSAA */ + }; + unsigned log_samples = util_logbase2(coverage_samples); + unsigned log_z_samples = util_logbase2(z_samples); + unsigned ps_iter_samples = si_get_ps_iter_samples(sctx); + unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples); + + sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1); + sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) | + S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) | + S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples); + + if (sctx->framebuffer.nr_samples > 1) { + db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | + S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) | + S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | + S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples); + sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1); + } else if (sctx->smoothing_enabled) { + db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples); + } + } + + unsigned initial_cdw = cs->current.cdw; + + /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */ + radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL, + sc_line_cntl, sc_aa_config); + /* R_028804_DB_EQAA */ + radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, db_eqaa); + /* R_028A4C_PA_SC_MODE_CNTL_1 */ + radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1, + sc_mode_cntl_1); + + if (initial_cdw != cs->current.cdw) { + sctx->context_roll = true; + + /* GFX9: Flush DFSM when the AA mode changes. 
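
/* Illustrative sketch (standalone program, hypothetical sample counts; not
 * taken from the driver): the DB_EQAA and PA_SC_AA_CONFIG fields programmed
 * above store log2 of the sample counts, so an "8s 4z" configuration becomes
 * MSAA_NUM_SAMPLES = 3 and MAX_ANCHOR_SAMPLES = 2.
 */
#include <stdio.h>

/* Local stand-in for Mesa's util_logbase2(), same result for n >= 1. */
static unsigned util_logbase2_local(unsigned n)
{
   unsigned result = 0;

   while (n >>= 1)
      result++;
   return result;
}

int main(void)
{
   unsigned coverage_samples = 8; /* "8s": scan-conversion / FMASK samples */
   unsigned z_samples = 4;        /* "4z": depth/stencil samples */

   printf("MSAA_NUM_SAMPLES = %u, MAX_ANCHOR_SAMPLES = %u\n",
          util_logbase2_local(coverage_samples), util_logbase2_local(z_samples));
   return 0;
}
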
*/ + if (sctx->screen->dfsm_allowed) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); + } + } } void si_update_ps_iter_samples(struct si_context *sctx) { - if (sctx->framebuffer.nr_samples > 1) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - if (sctx->screen->dpbb_allowed) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + if (sctx->framebuffer.nr_samples > 1) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); } static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - /* The hardware can only do sample shading with 2^n samples. */ - min_samples = util_next_power_of_two(min_samples); + /* The hardware can only do sample shading with 2^n samples. */ + min_samples = util_next_power_of_two(min_samples); - if (sctx->ps_iter_samples == min_samples) - return; + if (sctx->ps_iter_samples == min_samples) + return; - sctx->ps_iter_samples = min_samples; - sctx->do_update_shaders = true; + sctx->ps_iter_samples = min_samples; + sctx->do_update_shaders = true; - si_update_ps_iter_samples(sctx); + si_update_ps_iter_samples(sctx); } /* @@ -3786,650 +3586,607 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) * Build the sampler view descriptor for a buffer texture. * @param state 256-bit descriptor; only the high 128 bits are filled in */ -void -si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf, - enum pipe_format format, - unsigned offset, unsigned size, - uint32_t *state) +void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf, + enum pipe_format format, unsigned offset, unsigned size, + uint32_t *state) { - const struct util_format_description *desc; - unsigned stride; - unsigned num_records; - - desc = util_format_description(format); - stride = desc->block.bits / 8; - - num_records = size / stride; - num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride); - - /* The NUM_RECORDS field has a different meaning depending on the chip, - * instruction type, STRIDE, and SWIZZLE_ENABLE. - * - * GFX6-7,10: - * - If STRIDE == 0, it's in byte units. - * - If STRIDE != 0, it's in units of STRIDE, used with inst.IDXEN. - * - * GFX8: - * - For SMEM and STRIDE == 0, it's in byte units. - * - For SMEM and STRIDE != 0, it's in units of STRIDE. - * - For VMEM and STRIDE == 0 or SWIZZLE_ENABLE == 0, it's in byte units. - * - For VMEM and STRIDE != 0 and SWIZZLE_ENABLE == 1, it's in units of STRIDE. - * NOTE: There is incompatibility between VMEM and SMEM opcodes due to SWIZZLE_- - * ENABLE. The workaround is to set STRIDE = 0 if SWIZZLE_ENABLE == 0 when - * using SMEM. This can be done in the shader by clearing STRIDE with s_and. - * That way the same descriptor can be used by both SMEM and VMEM. - * - * GFX9: - * - For SMEM and STRIDE == 0, it's in byte units. - * - For SMEM and STRIDE != 0, it's in units of STRIDE. - * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units. - * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE. 
- */ - if (screen->info.chip_class == GFX8) - num_records *= stride; - - state[4] = 0; - state[5] = S_008F04_STRIDE(stride); - state[6] = num_records; - state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | - S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | - S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | - S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); - - if (screen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = &gfx10_format_table[format]; - - /* OOB_SELECT chooses the out-of-bounds check: - * - 0: (index >= NUM_RECORDS) || (offset >= STRIDE) - * - 1: index >= NUM_RECORDS - * - 2: NUM_RECORDS == 0 - * - 3: if SWIZZLE_ENABLE == 0: offset >= NUM_RECORDS - * else: swizzle_address >= NUM_RECORDS - */ - state[7] |= S_008F0C_FORMAT(fmt->img_format) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - int first_non_void; - unsigned num_format, data_format; - - first_non_void = util_format_get_first_non_void_channel(format); - num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void); - data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void); - - state[7] |= S_008F0C_NUM_FORMAT(num_format) | - S_008F0C_DATA_FORMAT(data_format); - } + const struct util_format_description *desc; + unsigned stride; + unsigned num_records; + + desc = util_format_description(format); + stride = desc->block.bits / 8; + + num_records = size / stride; + num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride); + + /* The NUM_RECORDS field has a different meaning depending on the chip, + * instruction type, STRIDE, and SWIZZLE_ENABLE. + * + * GFX6-7,10: + * - If STRIDE == 0, it's in byte units. + * - If STRIDE != 0, it's in units of STRIDE, used with inst.IDXEN. + * + * GFX8: + * - For SMEM and STRIDE == 0, it's in byte units. + * - For SMEM and STRIDE != 0, it's in units of STRIDE. + * - For VMEM and STRIDE == 0 or SWIZZLE_ENABLE == 0, it's in byte units. + * - For VMEM and STRIDE != 0 and SWIZZLE_ENABLE == 1, it's in units of STRIDE. + * NOTE: There is incompatibility between VMEM and SMEM opcodes due to SWIZZLE_- + * ENABLE. The workaround is to set STRIDE = 0 if SWIZZLE_ENABLE == 0 when + * using SMEM. This can be done in the shader by clearing STRIDE with s_and. + * That way the same descriptor can be used by both SMEM and VMEM. + * + * GFX9: + * - For SMEM and STRIDE == 0, it's in byte units. + * - For SMEM and STRIDE != 0, it's in units of STRIDE. + * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units. + * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE. 
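
/* Worked illustration of the NUM_RECORDS rules above (standalone sketch with
 * made-up buffer sizes): a 64 KiB buffer of 16-byte elements gives 4096
 * records in element units; on GFX8, where the unswizzled VMEM case uses byte
 * units, the descriptor code that follows converts by multiplying by the
 * stride, giving 65536.
 */
#include <stdio.h>

int main(void)
{
   unsigned size = 65536;  /* bytes */
   unsigned stride = 16;   /* bytes per element, e.g. a vec4 of floats */
   unsigned num_records = size / stride;

   /* Conversion applied when chip_class == GFX8. */
   unsigned gfx8_num_records = num_records * stride;

   printf("element units: %u, GFX8 byte units: %u\n", num_records, gfx8_num_records);
   return 0;
}
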
+ */ + if (screen->info.chip_class == GFX8) + num_records *= stride; + + state[4] = 0; + state[5] = S_008F04_STRIDE(stride); + state[6] = num_records; + state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | + S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); + + if (screen->info.chip_class >= GFX10) { + const struct gfx10_format *fmt = &gfx10_format_table[format]; + + /* OOB_SELECT chooses the out-of-bounds check: + * - 0: (index >= NUM_RECORDS) || (offset >= STRIDE) + * - 1: index >= NUM_RECORDS + * - 2: NUM_RECORDS == 0 + * - 3: if SWIZZLE_ENABLE == 0: offset >= NUM_RECORDS + * else: swizzle_address >= NUM_RECORDS + */ + state[7] |= S_008F0C_FORMAT(fmt->img_format) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + int first_non_void; + unsigned num_format, data_format; + + first_non_void = util_format_get_first_non_void_channel(format); + num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void); + data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void); + + state[7] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format); + } } static unsigned gfx9_border_color_swizzle(const unsigned char swizzle[4]) { - unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; - - if (swizzle[3] == PIPE_SWIZZLE_X) { - /* For the pre-defined border color values (white, opaque - * black, transparent black), the only thing that matters is - * that the alpha channel winds up in the correct place - * (because the RGB channels are all the same) so either of - * these enumerations will work. - */ - if (swizzle[2] == PIPE_SWIZZLE_Y) - bc_swizzle = V_008F20_BC_SWIZZLE_WZYX; - else - bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ; - } else if (swizzle[0] == PIPE_SWIZZLE_X) { - if (swizzle[1] == PIPE_SWIZZLE_Y) - bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; - else - bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ; - } else if (swizzle[1] == PIPE_SWIZZLE_X) { - bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ; - } else if (swizzle[2] == PIPE_SWIZZLE_X) { - bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW; - } - - return bc_swizzle; + unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; + + if (swizzle[3] == PIPE_SWIZZLE_X) { + /* For the pre-defined border color values (white, opaque + * black, transparent black), the only thing that matters is + * that the alpha channel winds up in the correct place + * (because the RGB channels are all the same) so either of + * these enumerations will work. + */ + if (swizzle[2] == PIPE_SWIZZLE_Y) + bc_swizzle = V_008F20_BC_SWIZZLE_WZYX; + else + bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ; + } else if (swizzle[0] == PIPE_SWIZZLE_X) { + if (swizzle[1] == PIPE_SWIZZLE_Y) + bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; + else + bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ; + } else if (swizzle[1] == PIPE_SWIZZLE_X) { + bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ; + } else if (swizzle[2] == PIPE_SWIZZLE_X) { + bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW; + } + + return bc_swizzle; } /** * Build the sampler view descriptor for a texture. 
*/ -static void -gfx10_make_texture_descriptor(struct si_screen *screen, - struct si_texture *tex, - bool sampler, - enum pipe_texture_target target, - enum pipe_format pipe_format, - const unsigned char state_swizzle[4], - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned width, unsigned height, unsigned depth, - uint32_t *state, - uint32_t *fmask_state) +static void gfx10_make_texture_descriptor( + struct si_screen *screen, struct si_texture *tex, bool sampler, enum pipe_texture_target target, + enum pipe_format pipe_format, const unsigned char state_swizzle[4], unsigned first_level, + unsigned last_level, unsigned first_layer, unsigned last_layer, unsigned width, unsigned height, + unsigned depth, uint32_t *state, uint32_t *fmask_state) { - struct pipe_resource *res = &tex->buffer.b.b; - const struct util_format_description *desc; - unsigned img_format; - unsigned char swizzle[4]; - unsigned type; - uint64_t va; - - desc = util_format_description(pipe_format); - img_format = gfx10_format_table[pipe_format].img_format; - - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { - const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; - const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1}; - const unsigned char swizzle_wwww[4] = {3, 3, 3, 3}; - bool is_stencil = false; - - switch (pipe_format) { - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - case PIPE_FORMAT_X32_S8X24_UINT: - case PIPE_FORMAT_X8Z24_UNORM: - util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); - is_stencil = true; - break; - case PIPE_FORMAT_X24S8_UINT: - /* - * X24S8 is implemented as an 8_8_8_8 data format, to - * fix texture gathers. This affects at least - * GL45-CTS.texture_cube_map_array.sampling on GFX8. - */ - util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); - is_stencil = true; - break; - default: - util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle); - is_stencil = pipe_format == PIPE_FORMAT_S8_UINT; - } - - if (tex->upgraded_depth && !is_stencil) { - assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT); - img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP; - } - } else { - util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); - } - - if (!sampler && - (res->target == PIPE_TEXTURE_CUBE || - res->target == PIPE_TEXTURE_CUBE_ARRAY)) { - /* For the purpose of shader images, treat cube maps as 2D - * arrays. - */ - type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY; - } else { - type = si_tex_dim(screen, tex, target, res->nr_samples); - } - - if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { - height = 1; - depth = res->array_size; - } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || - type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { - if (sampler || res->target != PIPE_TEXTURE_3D) - depth = res->array_size; - } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) - depth = res->array_size / 6; - - state[0] = 0; - state[1] = S_00A004_FORMAT(img_format) | - S_00A004_WIDTH_LO(width - 1); - state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | - S_00A008_HEIGHT(height - 1) | - S_00A008_RESOURCE_LEVEL(1); - state[3] = S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) | - S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | - S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | - S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) | - S_00A00C_BASE_LEVEL(res->nr_samples > 1 ? - 0 : first_level) | - S_00A00C_LAST_LEVEL(res->nr_samples > 1 ? 
- util_logbase2(res->nr_samples) : - last_level) | - S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | - S_00A00C_TYPE(type); - /* Depth is the the last accessible layer on gfx9+. The hw doesn't need - * to know the total number of layers. - */ - state[4] = S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) - ? depth - 1 : last_layer) | - S_00A010_BASE_ARRAY(first_layer); - state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) | - S_00A014_MAX_MIP(res->nr_samples > 1 ? - util_logbase2(res->nr_samples) : - tex->buffer.b.b.last_level) | - S_00A014_PERF_MOD(4); - state[6] = 0; - state[7] = 0; - - if (tex->surface.dcc_offset) { - state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | - S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | - S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); - } - - /* Initialize the sampler view for FMASK. */ - if (tex->surface.fmask_offset) { - uint32_t format; - - va = tex->buffer.gpu_address + tex->surface.fmask_offset; - -#define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) - switch (FMASK(res->nr_samples, res->nr_storage_samples)) { - case FMASK(2,1): - format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1; - break; - case FMASK(2,2): - format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2; - break; - case FMASK(4,1): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1; - break; - case FMASK(4,2): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2; - break; - case FMASK(4,4): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4; - break; - case FMASK(8,1): - format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1; - break; - case FMASK(8,2): - format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2; - break; - case FMASK(8,4): - format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4; - break; - case FMASK(8,8): - format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8; - break; - case FMASK(16,1): - format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1; - break; - case FMASK(16,2): - format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2; - break; - case FMASK(16,4): - format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4; - break; - case FMASK(16,8): - format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8; - break; - default: - unreachable("invalid nr_samples"); - } + struct pipe_resource *res = &tex->buffer.b.b; + const struct util_format_description *desc; + unsigned img_format; + unsigned char swizzle[4]; + unsigned type; + uint64_t va; + + desc = util_format_description(pipe_format); + img_format = gfx10_format_table[pipe_format].img_format; + + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; + const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1}; + const unsigned char swizzle_wwww[4] = {3, 3, 3, 3}; + bool is_stencil = false; + + switch (pipe_format) { + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + case PIPE_FORMAT_X32_S8X24_UINT: + case PIPE_FORMAT_X8Z24_UNORM: + util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); + is_stencil = true; + break; + case PIPE_FORMAT_X24S8_UINT: + /* + * X24S8 is implemented as an 8_8_8_8 data format, to + * fix texture gathers. This affects at least + * GL45-CTS.texture_cube_map_array.sampling on GFX8. 
+ */ + util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); + is_stencil = true; + break; + default: + util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle); + is_stencil = pipe_format == PIPE_FORMAT_S8_UINT; + } + + if (tex->upgraded_depth && !is_stencil) { + assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT); + img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP; + } + } else { + util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); + } + + if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY)) { + /* For the purpose of shader images, treat cube maps as 2D + * arrays. + */ + type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + } else { + type = si_tex_dim(screen, tex, target, res->nr_samples); + } + + if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { + height = 1; + depth = res->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + if (sampler || res->target != PIPE_TEXTURE_3D) + depth = res->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) + depth = res->array_size / 6; + + state[0] = 0; + state[1] = S_00A004_FORMAT(img_format) | S_00A004_WIDTH_LO(width - 1); + state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) | + S_00A008_RESOURCE_LEVEL(1); + state[3] = + S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) | + S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | + S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | + S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) | + S_00A00C_BASE_LEVEL(res->nr_samples > 1 ? 0 : first_level) | + S_00A00C_LAST_LEVEL(res->nr_samples > 1 ? util_logbase2(res->nr_samples) : last_level) | + S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | S_00A00C_TYPE(type); + /* Depth is the the last accessible layer on gfx9+. The hw doesn't need + * to know the total number of layers. + */ + state[4] = + S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) ? depth - 1 : last_layer) | + S_00A010_BASE_ARRAY(first_layer); + state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) | + S_00A014_MAX_MIP(res->nr_samples > 1 ? util_logbase2(res->nr_samples) + : tex->buffer.b.b.last_level) | + S_00A014_PERF_MOD(4); + state[6] = 0; + state[7] = 0; + + if (tex->surface.dcc_offset) { + state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | + S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | + S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); + } + + /* Initialize the sampler view for FMASK. 
*/ + if (tex->surface.fmask_offset) { + uint32_t format; + + va = tex->buffer.gpu_address + tex->surface.fmask_offset; + +#define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) + switch (FMASK(res->nr_samples, res->nr_storage_samples)) { + case FMASK(2, 1): + format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1; + break; + case FMASK(2, 2): + format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2; + break; + case FMASK(4, 1): + format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1; + break; + case FMASK(4, 2): + format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2; + break; + case FMASK(4, 4): + format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4; + break; + case FMASK(8, 1): + format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1; + break; + case FMASK(8, 2): + format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2; + break; + case FMASK(8, 4): + format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4; + break; + case FMASK(8, 8): + format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8; + break; + case FMASK(16, 1): + format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1; + break; + case FMASK(16, 2): + format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2; + break; + case FMASK(16, 4): + format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4; + break; + case FMASK(16, 8): + format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8; + break; + default: + unreachable("invalid nr_samples"); + } #undef FMASK - fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle; - fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) | - S_00A004_FORMAT(format) | - S_00A004_WIDTH_LO(width - 1); - fmask_state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | - S_00A008_HEIGHT(height - 1) | - S_00A008_RESOURCE_LEVEL(1); - fmask_state[3] = S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) | - S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | - S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | - S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) | - S_00A00C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | - S_00A00C_TYPE(si_tex_dim(screen, tex, target, 0)); - fmask_state[4] = S_00A010_DEPTH(last_layer) | - S_00A010_BASE_ARRAY(first_layer); - fmask_state[5] = 0; - fmask_state[6] = S_00A018_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned); - fmask_state[7] = 0; - } + fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle; + fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) | S_00A004_FORMAT(format) | + S_00A004_WIDTH_LO(width - 1); + fmask_state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) | + S_00A008_RESOURCE_LEVEL(1); + fmask_state[3] = + S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | + S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) | + S_00A00C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | + S_00A00C_TYPE(si_tex_dim(screen, tex, target, 0)); + fmask_state[4] = S_00A010_DEPTH(last_layer) | S_00A010_BASE_ARRAY(first_layer); + fmask_state[5] = 0; + fmask_state[6] = S_00A018_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned); + fmask_state[7] = 0; + } } /** * Build the sampler view descriptor for a texture (SI-GFX9). 
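
/* Illustrative sketch (standalone; values chosen for clarity): the FMASK(s, f)
 * key used by the descriptor builders packs coverage samples and stored
 * fragments into one integer, so 8 samples with 4 fragments selects the
 * *_S8_F4 format via key 8 * 16 + 4 = 132.
 */
#include <stdio.h>

#define MAX2(a, b)  ((a) > (b) ? (a) : (b))
#define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))

int main(void)
{
   printf("FMASK(8, 4) = %u\n", FMASK(8, 4)); /* 132 */
   printf("FMASK(2, 1) = %u\n", FMASK(2, 1)); /* 33 */
   return 0;
}
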
*/ -static void -si_make_texture_descriptor(struct si_screen *screen, - struct si_texture *tex, - bool sampler, - enum pipe_texture_target target, - enum pipe_format pipe_format, - const unsigned char state_swizzle[4], - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned width, unsigned height, unsigned depth, - uint32_t *state, - uint32_t *fmask_state) +static void si_make_texture_descriptor(struct si_screen *screen, struct si_texture *tex, + bool sampler, enum pipe_texture_target target, + enum pipe_format pipe_format, + const unsigned char state_swizzle[4], unsigned first_level, + unsigned last_level, unsigned first_layer, + unsigned last_layer, unsigned width, unsigned height, + unsigned depth, uint32_t *state, uint32_t *fmask_state) { - struct pipe_resource *res = &tex->buffer.b.b; - const struct util_format_description *desc; - unsigned char swizzle[4]; - int first_non_void; - unsigned num_format, data_format, type, num_samples; - uint64_t va; - - desc = util_format_description(pipe_format); - - num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ? - MAX2(1, res->nr_samples) : - MAX2(1, res->nr_storage_samples); - - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { - const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; - const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1}; - const unsigned char swizzle_wwww[4] = {3, 3, 3, 3}; - - switch (pipe_format) { - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - case PIPE_FORMAT_X32_S8X24_UINT: - case PIPE_FORMAT_X8Z24_UNORM: - util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); - break; - case PIPE_FORMAT_X24S8_UINT: - /* - * X24S8 is implemented as an 8_8_8_8 data format, to - * fix texture gathers. This affects at least - * GL45-CTS.texture_cube_map_array.sampling on GFX8. 
- */ - if (screen->info.chip_class <= GFX8) - util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); - else - util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); - break; - default: - util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle); - } - } else { - util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); - } - - first_non_void = util_format_get_first_non_void_channel(pipe_format); - - switch (pipe_format) { - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - break; - default: - if (first_non_void < 0) { - if (util_format_is_compressed(pipe_format)) { - switch (pipe_format) { - case PIPE_FORMAT_DXT1_SRGB: - case PIPE_FORMAT_DXT1_SRGBA: - case PIPE_FORMAT_DXT3_SRGBA: - case PIPE_FORMAT_DXT5_SRGBA: - case PIPE_FORMAT_BPTC_SRGBA: - case PIPE_FORMAT_ETC2_SRGB8: - case PIPE_FORMAT_ETC2_SRGB8A1: - case PIPE_FORMAT_ETC2_SRGBA8: - num_format = V_008F14_IMG_NUM_FORMAT_SRGB; - break; - case PIPE_FORMAT_RGTC1_SNORM: - case PIPE_FORMAT_LATC1_SNORM: - case PIPE_FORMAT_RGTC2_SNORM: - case PIPE_FORMAT_LATC2_SNORM: - case PIPE_FORMAT_ETC2_R11_SNORM: - case PIPE_FORMAT_ETC2_RG11_SNORM: - /* implies float, so use SNORM/UNORM to determine - whether data is signed or not */ - case PIPE_FORMAT_BPTC_RGB_FLOAT: - num_format = V_008F14_IMG_NUM_FORMAT_SNORM; - break; - default: - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - break; - } - } else if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - } else { - num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; - } - } else if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { - num_format = V_008F14_IMG_NUM_FORMAT_SRGB; - } else { - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - - switch (desc->channel[first_non_void].type) { - case UTIL_FORMAT_TYPE_FLOAT: - num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; - break; - case UTIL_FORMAT_TYPE_SIGNED: - if (desc->channel[first_non_void].normalized) - num_format = V_008F14_IMG_NUM_FORMAT_SNORM; - else if (desc->channel[first_non_void].pure_integer) - num_format = V_008F14_IMG_NUM_FORMAT_SINT; - else - num_format = V_008F14_IMG_NUM_FORMAT_SSCALED; - break; - case UTIL_FORMAT_TYPE_UNSIGNED: - if (desc->channel[first_non_void].normalized) - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - else if (desc->channel[first_non_void].pure_integer) - num_format = V_008F14_IMG_NUM_FORMAT_UINT; - else - num_format = V_008F14_IMG_NUM_FORMAT_USCALED; - } - } - } - - data_format = si_translate_texformat(&screen->b, pipe_format, desc, first_non_void); - if (data_format == ~0) { - data_format = 0; - } - - /* S8 with Z32 HTILE needs a special format. */ - if (screen->info.chip_class == GFX9 && - pipe_format == PIPE_FORMAT_S8_UINT && - tex->tc_compatible_htile) - data_format = V_008F14_IMG_DATA_FORMAT_S8_32; - - if (!sampler && - (res->target == PIPE_TEXTURE_CUBE || - res->target == PIPE_TEXTURE_CUBE_ARRAY || - (screen->info.chip_class <= GFX8 && - res->target == PIPE_TEXTURE_3D))) { - /* For the purpose of shader images, treat cube maps and 3D - * textures as 2D arrays. For 3D textures, the address - * calculations for mipmaps are different, so we rely on the - * caller to effectively disable mipmaps. 
- */ - type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY; - - assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0)); - } else { - type = si_tex_dim(screen, tex, target, num_samples); - } - - if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { - height = 1; - depth = res->array_size; - } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || - type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { - if (sampler || res->target != PIPE_TEXTURE_3D) - depth = res->array_size; - } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) - depth = res->array_size / 6; - - state[0] = 0; - state[1] = (S_008F14_DATA_FORMAT(data_format) | - S_008F14_NUM_FORMAT(num_format)); - state[2] = (S_008F18_WIDTH(width - 1) | - S_008F18_HEIGHT(height - 1) | - S_008F18_PERF_MOD(4)); - state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) | - S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | - S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | - S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) | - S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) | - S_008F1C_LAST_LEVEL(num_samples > 1 ? - util_logbase2(num_samples) : - last_level) | - S_008F1C_TYPE(type)); - state[4] = 0; - state[5] = S_008F24_BASE_ARRAY(first_layer); - state[6] = 0; - state[7] = 0; - - if (screen->info.chip_class == GFX9) { - unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle); - - /* Depth is the the last accessible layer on Gfx9. - * The hw doesn't need to know the total number of layers. - */ - if (type == V_008F1C_SQ_RSRC_IMG_3D) - state[4] |= S_008F20_DEPTH(depth - 1); - else - state[4] |= S_008F20_DEPTH(last_layer); - - state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle); - state[5] |= S_008F24_MAX_MIP(num_samples > 1 ? - util_logbase2(num_samples) : - tex->buffer.b.b.last_level); - } else { - state[3] |= S_008F1C_POW2_PAD(res->last_level > 0); - state[4] |= S_008F20_DEPTH(depth - 1); - state[5] |= S_008F24_LAST_ARRAY(last_layer); - } - - if (tex->surface.dcc_offset) { - state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); - } else { - /* The last dword is unused by hw. The shader uses it to clear - * bits in the first dword of sampler state. - */ - if (screen->info.chip_class <= GFX7 && res->nr_samples <= 1) { - if (first_level == last_level) - state[7] = C_008F30_MAX_ANISO_RATIO; - else - state[7] = 0xffffffff; - } - } - - /* Initialize the sampler view for FMASK. 
*/ - if (tex->surface.fmask_offset) { - uint32_t data_format, num_format; - - va = tex->buffer.gpu_address + tex->surface.fmask_offset; - -#define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) - if (screen->info.chip_class == GFX9) { - data_format = V_008F14_IMG_DATA_FORMAT_FMASK; - switch (FMASK(res->nr_samples, res->nr_storage_samples)) { - case FMASK(2,1): - num_format = V_008F14_IMG_FMASK_8_2_1; - break; - case FMASK(2,2): - num_format = V_008F14_IMG_FMASK_8_2_2; - break; - case FMASK(4,1): - num_format = V_008F14_IMG_FMASK_8_4_1; - break; - case FMASK(4,2): - num_format = V_008F14_IMG_FMASK_8_4_2; - break; - case FMASK(4,4): - num_format = V_008F14_IMG_FMASK_8_4_4; - break; - case FMASK(8,1): - num_format = V_008F14_IMG_FMASK_8_8_1; - break; - case FMASK(8,2): - num_format = V_008F14_IMG_FMASK_16_8_2; - break; - case FMASK(8,4): - num_format = V_008F14_IMG_FMASK_32_8_4; - break; - case FMASK(8,8): - num_format = V_008F14_IMG_FMASK_32_8_8; - break; - case FMASK(16,1): - num_format = V_008F14_IMG_FMASK_16_16_1; - break; - case FMASK(16,2): - num_format = V_008F14_IMG_FMASK_32_16_2; - break; - case FMASK(16,4): - num_format = V_008F14_IMG_FMASK_64_16_4; - break; - case FMASK(16,8): - num_format = V_008F14_IMG_FMASK_64_16_8; - break; - default: - unreachable("invalid nr_samples"); - } - } else { - switch (FMASK(res->nr_samples, res->nr_storage_samples)) { - case FMASK(2,1): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1; - break; - case FMASK(2,2): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2; - break; - case FMASK(4,1): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1; - break; - case FMASK(4,2): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2; - break; - case FMASK(4,4): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4; - break; - case FMASK(8,1): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1; - break; - case FMASK(8,2): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2; - break; - case FMASK(8,4): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4; - break; - case FMASK(8,8): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8; - break; - case FMASK(16,1): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1; - break; - case FMASK(16,2): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2; - break; - case FMASK(16,4): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4; - break; - case FMASK(16,8): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8; - break; - default: - unreachable("invalid nr_samples"); - } - num_format = V_008F14_IMG_NUM_FORMAT_UINT; - } + struct pipe_resource *res = &tex->buffer.b.b; + const struct util_format_description *desc; + unsigned char swizzle[4]; + int first_non_void; + unsigned num_format, data_format, type, num_samples; + uint64_t va; + + desc = util_format_description(pipe_format); + + num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ? MAX2(1, res->nr_samples) + : MAX2(1, res->nr_storage_samples); + + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; + const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1}; + const unsigned char swizzle_wwww[4] = {3, 3, 3, 3}; + + switch (pipe_format) { + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + case PIPE_FORMAT_X32_S8X24_UINT: + case PIPE_FORMAT_X8Z24_UNORM: + util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); + break; + case PIPE_FORMAT_X24S8_UINT: + /* + * X24S8 is implemented as an 8_8_8_8 data format, to + * fix texture gathers. 
This affects at least + * GL45-CTS.texture_cube_map_array.sampling on GFX8. + */ + if (screen->info.chip_class <= GFX8) + util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); + else + util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); + break; + default: + util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle); + } + } else { + util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); + } + + first_non_void = util_format_get_first_non_void_channel(pipe_format); + + switch (pipe_format) { + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + break; + default: + if (first_non_void < 0) { + if (util_format_is_compressed(pipe_format)) { + switch (pipe_format) { + case PIPE_FORMAT_DXT1_SRGB: + case PIPE_FORMAT_DXT1_SRGBA: + case PIPE_FORMAT_DXT3_SRGBA: + case PIPE_FORMAT_DXT5_SRGBA: + case PIPE_FORMAT_BPTC_SRGBA: + case PIPE_FORMAT_ETC2_SRGB8: + case PIPE_FORMAT_ETC2_SRGB8A1: + case PIPE_FORMAT_ETC2_SRGBA8: + num_format = V_008F14_IMG_NUM_FORMAT_SRGB; + break; + case PIPE_FORMAT_RGTC1_SNORM: + case PIPE_FORMAT_LATC1_SNORM: + case PIPE_FORMAT_RGTC2_SNORM: + case PIPE_FORMAT_LATC2_SNORM: + case PIPE_FORMAT_ETC2_R11_SNORM: + case PIPE_FORMAT_ETC2_RG11_SNORM: + /* implies float, so use SNORM/UNORM to determine + whether data is signed or not */ + case PIPE_FORMAT_BPTC_RGB_FLOAT: + num_format = V_008F14_IMG_NUM_FORMAT_SNORM; + break; + default: + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + break; + } + } else if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + } else { + num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; + } + } else if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { + num_format = V_008F14_IMG_NUM_FORMAT_SRGB; + } else { + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + + switch (desc->channel[first_non_void].type) { + case UTIL_FORMAT_TYPE_FLOAT: + num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; + break; + case UTIL_FORMAT_TYPE_SIGNED: + if (desc->channel[first_non_void].normalized) + num_format = V_008F14_IMG_NUM_FORMAT_SNORM; + else if (desc->channel[first_non_void].pure_integer) + num_format = V_008F14_IMG_NUM_FORMAT_SINT; + else + num_format = V_008F14_IMG_NUM_FORMAT_SSCALED; + break; + case UTIL_FORMAT_TYPE_UNSIGNED: + if (desc->channel[first_non_void].normalized) + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + else if (desc->channel[first_non_void].pure_integer) + num_format = V_008F14_IMG_NUM_FORMAT_UINT; + else + num_format = V_008F14_IMG_NUM_FORMAT_USCALED; + } + } + } + + data_format = si_translate_texformat(&screen->b, pipe_format, desc, first_non_void); + if (data_format == ~0) { + data_format = 0; + } + + /* S8 with Z32 HTILE needs a special format. */ + if (screen->info.chip_class == GFX9 && pipe_format == PIPE_FORMAT_S8_UINT && + tex->tc_compatible_htile) + data_format = V_008F14_IMG_DATA_FORMAT_S8_32; + + if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY || + (screen->info.chip_class <= GFX8 && res->target == PIPE_TEXTURE_3D))) { + /* For the purpose of shader images, treat cube maps and 3D + * textures as 2D arrays. For 3D textures, the address + * calculations for mipmaps are different, so we rely on the + * caller to effectively disable mipmaps. 
+ */ + type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + + assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0)); + } else { + type = si_tex_dim(screen, tex, target, num_samples); + } + + if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { + height = 1; + depth = res->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + if (sampler || res->target != PIPE_TEXTURE_3D) + depth = res->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) + depth = res->array_size / 6; + + state[0] = 0; + state[1] = (S_008F14_DATA_FORMAT(data_format) | S_008F14_NUM_FORMAT(num_format)); + state[2] = (S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1) | S_008F18_PERF_MOD(4)); + state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) | + S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | + S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | + S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) | + S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) | + S_008F1C_LAST_LEVEL(num_samples > 1 ? util_logbase2(num_samples) : last_level) | + S_008F1C_TYPE(type)); + state[4] = 0; + state[5] = S_008F24_BASE_ARRAY(first_layer); + state[6] = 0; + state[7] = 0; + + if (screen->info.chip_class == GFX9) { + unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle); + + /* Depth is the the last accessible layer on Gfx9. + * The hw doesn't need to know the total number of layers. + */ + if (type == V_008F1C_SQ_RSRC_IMG_3D) + state[4] |= S_008F20_DEPTH(depth - 1); + else + state[4] |= S_008F20_DEPTH(last_layer); + + state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle); + state[5] |= S_008F24_MAX_MIP(num_samples > 1 ? util_logbase2(num_samples) + : tex->buffer.b.b.last_level); + } else { + state[3] |= S_008F1C_POW2_PAD(res->last_level > 0); + state[4] |= S_008F20_DEPTH(depth - 1); + state[5] |= S_008F24_LAST_ARRAY(last_layer); + } + + if (tex->surface.dcc_offset) { + state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); + } else { + /* The last dword is unused by hw. The shader uses it to clear + * bits in the first dword of sampler state. + */ + if (screen->info.chip_class <= GFX7 && res->nr_samples <= 1) { + if (first_level == last_level) + state[7] = C_008F30_MAX_ANISO_RATIO; + else + state[7] = 0xffffffff; + } + } + + /* Initialize the sampler view for FMASK. 
*/ + if (tex->surface.fmask_offset) { + uint32_t data_format, num_format; + + va = tex->buffer.gpu_address + tex->surface.fmask_offset; + +#define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) + if (screen->info.chip_class == GFX9) { + data_format = V_008F14_IMG_DATA_FORMAT_FMASK; + switch (FMASK(res->nr_samples, res->nr_storage_samples)) { + case FMASK(2, 1): + num_format = V_008F14_IMG_FMASK_8_2_1; + break; + case FMASK(2, 2): + num_format = V_008F14_IMG_FMASK_8_2_2; + break; + case FMASK(4, 1): + num_format = V_008F14_IMG_FMASK_8_4_1; + break; + case FMASK(4, 2): + num_format = V_008F14_IMG_FMASK_8_4_2; + break; + case FMASK(4, 4): + num_format = V_008F14_IMG_FMASK_8_4_4; + break; + case FMASK(8, 1): + num_format = V_008F14_IMG_FMASK_8_8_1; + break; + case FMASK(8, 2): + num_format = V_008F14_IMG_FMASK_16_8_2; + break; + case FMASK(8, 4): + num_format = V_008F14_IMG_FMASK_32_8_4; + break; + case FMASK(8, 8): + num_format = V_008F14_IMG_FMASK_32_8_8; + break; + case FMASK(16, 1): + num_format = V_008F14_IMG_FMASK_16_16_1; + break; + case FMASK(16, 2): + num_format = V_008F14_IMG_FMASK_32_16_2; + break; + case FMASK(16, 4): + num_format = V_008F14_IMG_FMASK_64_16_4; + break; + case FMASK(16, 8): + num_format = V_008F14_IMG_FMASK_64_16_8; + break; + default: + unreachable("invalid nr_samples"); + } + } else { + switch (FMASK(res->nr_samples, res->nr_storage_samples)) { + case FMASK(2, 1): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1; + break; + case FMASK(2, 2): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2; + break; + case FMASK(4, 1): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1; + break; + case FMASK(4, 2): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2; + break; + case FMASK(4, 4): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4; + break; + case FMASK(8, 1): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1; + break; + case FMASK(8, 2): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2; + break; + case FMASK(8, 4): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4; + break; + case FMASK(8, 8): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8; + break; + case FMASK(16, 1): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1; + break; + case FMASK(16, 2): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2; + break; + case FMASK(16, 4): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4; + break; + case FMASK(16, 8): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8; + break; + default: + unreachable("invalid nr_samples"); + } + num_format = V_008F14_IMG_NUM_FORMAT_UINT; + } #undef FMASK - fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle; - fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | - S_008F14_DATA_FORMAT(data_format) | - S_008F14_NUM_FORMAT(num_format); - fmask_state[2] = S_008F18_WIDTH(width - 1) | - S_008F18_HEIGHT(height - 1); - fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) | - S_008F1C_TYPE(si_tex_dim(screen, tex, target, 0)); - fmask_state[4] = 0; - fmask_state[5] = S_008F24_BASE_ARRAY(first_layer); - fmask_state[6] = 0; - fmask_state[7] = 0; - - if (screen->info.chip_class == GFX9) { - fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode); - fmask_state[4] |= S_008F20_DEPTH(last_layer) | - S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch); - fmask_state[5] |= 
S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) | - S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned); - } else { - fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index); - fmask_state[4] |= S_008F20_DEPTH(depth - 1) | - S_008F20_PITCH(tex->surface.u.legacy.fmask.pitch_in_pixels - 1); - fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer); - } - } + fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle; + fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | S_008F14_DATA_FORMAT(data_format) | + S_008F14_NUM_FORMAT(num_format); + fmask_state[2] = S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1); + fmask_state[3] = + S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) | + S_008F1C_TYPE(si_tex_dim(screen, tex, target, 0)); + fmask_state[4] = 0; + fmask_state[5] = S_008F24_BASE_ARRAY(first_layer); + fmask_state[6] = 0; + fmask_state[7] = 0; + + if (screen->info.chip_class == GFX9) { + fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode); + fmask_state[4] |= + S_008F20_DEPTH(last_layer) | S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch); + fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) | + S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned); + } else { + fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index); + fmask_state[4] |= S_008F20_DEPTH(depth - 1) | + S_008F20_PITCH(tex->surface.u.legacy.fmask.pitch_in_pixels - 1); + fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer); + } + } } /** @@ -4442,1282 +4199,1195 @@ si_make_texture_descriptor(struct si_screen *screen, * @param height0 height0 override (for compressed textures as int) * @param force_level set the base address to the level (for compressed textures) */ -struct pipe_sampler_view * -si_create_sampler_view_custom(struct pipe_context *ctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *state, - unsigned width0, unsigned height0, - unsigned force_level) +struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state, + unsigned width0, unsigned height0, + unsigned force_level) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); - struct si_texture *tex = (struct si_texture*)texture; - unsigned base_level, first_level, last_level; - unsigned char state_swizzle[4]; - unsigned height, depth, width; - unsigned last_layer = state->u.tex.last_layer; - enum pipe_format pipe_format; - const struct legacy_surf_level *surflevel; - - if (!view) - return NULL; - - /* initialize base object */ - view->base = *state; - view->base.texture = NULL; - view->base.reference.count = 1; - view->base.context = ctx; - - assert(texture); - pipe_resource_reference(&view->base.texture, texture); - - if (state->format == PIPE_FORMAT_X24S8_UINT || - state->format == PIPE_FORMAT_S8X24_UINT || - state->format == PIPE_FORMAT_X32_S8X24_UINT || - state->format == PIPE_FORMAT_S8_UINT) - view->is_stencil_sampler = true; - - /* Buffer resource. 
*/ - if (texture->target == PIPE_BUFFER) { - si_make_buffer_descriptor(sctx->screen, - si_resource(texture), - state->format, - state->u.buf.offset, - state->u.buf.size, - view->state); - return &view->base; - } - - state_swizzle[0] = state->swizzle_r; - state_swizzle[1] = state->swizzle_g; - state_swizzle[2] = state->swizzle_b; - state_swizzle[3] = state->swizzle_a; - - base_level = 0; - first_level = state->u.tex.first_level; - last_level = state->u.tex.last_level; - width = width0; - height = height0; - depth = texture->depth0; - - if (sctx->chip_class <= GFX8 && force_level) { - assert(force_level == first_level && - force_level == last_level); - base_level = force_level; - first_level = 0; - last_level = 0; - width = u_minify(width, force_level); - height = u_minify(height, force_level); - depth = u_minify(depth, force_level); - } - - /* This is not needed if state trackers set last_layer correctly. */ - if (state->target == PIPE_TEXTURE_1D || - state->target == PIPE_TEXTURE_2D || - state->target == PIPE_TEXTURE_RECT || - state->target == PIPE_TEXTURE_CUBE) - last_layer = state->u.tex.first_layer; - - /* Texturing with separate depth and stencil. */ - pipe_format = state->format; - - /* Depth/stencil texturing sometimes needs separate texture. */ - if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) { - if (!tex->flushed_depth_texture && - !si_init_flushed_depth_texture(ctx, texture)) { - pipe_resource_reference(&view->base.texture, NULL); - FREE(view); - return NULL; - } - - assert(tex->flushed_depth_texture); - - /* Override format for the case where the flushed texture - * contains only Z or only S. - */ - if (tex->flushed_depth_texture->buffer.b.b.format != tex->buffer.b.b.format) - pipe_format = tex->flushed_depth_texture->buffer.b.b.format; - - tex = tex->flushed_depth_texture; - } - - surflevel = tex->surface.u.legacy.level; - - if (tex->db_compatible) { - if (!view->is_stencil_sampler) - pipe_format = tex->db_render_format; - - switch (pipe_format) { - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - pipe_format = PIPE_FORMAT_Z32_FLOAT; - break; - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - /* Z24 is always stored like this for DB - * compatibility. 
- */ - pipe_format = PIPE_FORMAT_Z24X8_UNORM; - break; - case PIPE_FORMAT_X24S8_UINT: - case PIPE_FORMAT_S8X24_UINT: - case PIPE_FORMAT_X32_S8X24_UINT: - pipe_format = PIPE_FORMAT_S8_UINT; - surflevel = tex->surface.u.legacy.stencil_level; - break; - default:; - } - } - - view->dcc_incompatible = - vi_dcc_formats_are_incompatible(texture, - state->u.tex.first_level, - state->format); - - sctx->screen->make_texture_descriptor(sctx->screen, tex, true, - state->target, pipe_format, state_swizzle, - first_level, last_level, - state->u.tex.first_layer, last_layer, - width, height, depth, - view->state, view->fmask_state); - - const struct util_format_description *desc = util_format_description(pipe_format); - view->is_integer = false; - - for (unsigned i = 0; i < desc->nr_channels; ++i) { - if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) - continue; - - /* Whether the number format is {U,S}{SCALED,INT} */ - view->is_integer = - (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED || - desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) && - (desc->channel[i].pure_integer || !desc->channel[i].normalized); - break; - } - - view->base_level_info = &surflevel[base_level]; - view->base_level = base_level; - view->block_width = util_format_get_blockwidth(pipe_format); - return &view->base; + struct si_context *sctx = (struct si_context *)ctx; + struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); + struct si_texture *tex = (struct si_texture *)texture; + unsigned base_level, first_level, last_level; + unsigned char state_swizzle[4]; + unsigned height, depth, width; + unsigned last_layer = state->u.tex.last_layer; + enum pipe_format pipe_format; + const struct legacy_surf_level *surflevel; + + if (!view) + return NULL; + + /* initialize base object */ + view->base = *state; + view->base.texture = NULL; + view->base.reference.count = 1; + view->base.context = ctx; + + assert(texture); + pipe_resource_reference(&view->base.texture, texture); + + if (state->format == PIPE_FORMAT_X24S8_UINT || state->format == PIPE_FORMAT_S8X24_UINT || + state->format == PIPE_FORMAT_X32_S8X24_UINT || state->format == PIPE_FORMAT_S8_UINT) + view->is_stencil_sampler = true; + + /* Buffer resource. */ + if (texture->target == PIPE_BUFFER) { + si_make_buffer_descriptor(sctx->screen, si_resource(texture), state->format, + state->u.buf.offset, state->u.buf.size, view->state); + return &view->base; + } + + state_swizzle[0] = state->swizzle_r; + state_swizzle[1] = state->swizzle_g; + state_swizzle[2] = state->swizzle_b; + state_swizzle[3] = state->swizzle_a; + + base_level = 0; + first_level = state->u.tex.first_level; + last_level = state->u.tex.last_level; + width = width0; + height = height0; + depth = texture->depth0; + + if (sctx->chip_class <= GFX8 && force_level) { + assert(force_level == first_level && force_level == last_level); + base_level = force_level; + first_level = 0; + last_level = 0; + width = u_minify(width, force_level); + height = u_minify(height, force_level); + depth = u_minify(depth, force_level); + } + + /* This is not needed if state trackers set last_layer correctly. */ + if (state->target == PIPE_TEXTURE_1D || state->target == PIPE_TEXTURE_2D || + state->target == PIPE_TEXTURE_RECT || state->target == PIPE_TEXTURE_CUBE) + last_layer = state->u.tex.first_layer; + + /* Texturing with separate depth and stencil. */ + pipe_format = state->format; + + /* Depth/stencil texturing sometimes needs separate texture. 
*/ + if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) { + if (!tex->flushed_depth_texture && !si_init_flushed_depth_texture(ctx, texture)) { + pipe_resource_reference(&view->base.texture, NULL); + FREE(view); + return NULL; + } + + assert(tex->flushed_depth_texture); + + /* Override format for the case where the flushed texture + * contains only Z or only S. + */ + if (tex->flushed_depth_texture->buffer.b.b.format != tex->buffer.b.b.format) + pipe_format = tex->flushed_depth_texture->buffer.b.b.format; + + tex = tex->flushed_depth_texture; + } + + surflevel = tex->surface.u.legacy.level; + + if (tex->db_compatible) { + if (!view->is_stencil_sampler) + pipe_format = tex->db_render_format; + + switch (pipe_format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + pipe_format = PIPE_FORMAT_Z32_FLOAT; + break; + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + /* Z24 is always stored like this for DB + * compatibility. + */ + pipe_format = PIPE_FORMAT_Z24X8_UNORM; + break; + case PIPE_FORMAT_X24S8_UINT: + case PIPE_FORMAT_S8X24_UINT: + case PIPE_FORMAT_X32_S8X24_UINT: + pipe_format = PIPE_FORMAT_S8_UINT; + surflevel = tex->surface.u.legacy.stencil_level; + break; + default:; + } + } + + view->dcc_incompatible = + vi_dcc_formats_are_incompatible(texture, state->u.tex.first_level, state->format); + + sctx->screen->make_texture_descriptor( + sctx->screen, tex, true, state->target, pipe_format, state_swizzle, first_level, last_level, + state->u.tex.first_layer, last_layer, width, height, depth, view->state, view->fmask_state); + + const struct util_format_description *desc = util_format_description(pipe_format); + view->is_integer = false; + + for (unsigned i = 0; i < desc->nr_channels; ++i) { + if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) + continue; + + /* Whether the number format is {U,S}{SCALED,INT} */ + view->is_integer = (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED || + desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) && + (desc->channel[i].pure_integer || !desc->channel[i].normalized); + break; + } + + view->base_level_info = &surflevel[base_level]; + view->base_level = base_level; + view->block_width = util_format_get_blockwidth(pipe_format); + return &view->base; } -static struct pipe_sampler_view * -si_create_sampler_view(struct pipe_context *ctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *state) +static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state) { - return si_create_sampler_view_custom(ctx, texture, state, - texture ? texture->width0 : 0, - texture ? texture->height0 : 0, 0); + return si_create_sampler_view_custom(ctx, texture, state, texture ? texture->width0 : 0, + texture ? 
texture->height0 : 0, 0); } -static void si_sampler_view_destroy(struct pipe_context *ctx, - struct pipe_sampler_view *state) +static void si_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sampler_view *state) { - struct si_sampler_view *view = (struct si_sampler_view *)state; + struct si_sampler_view *view = (struct si_sampler_view *)state; - pipe_resource_reference(&state->texture, NULL); - FREE(view); + pipe_resource_reference(&state->texture, NULL); + FREE(view); } static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter) { - return wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER || - wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER || - (linear_filter && - (wrap == PIPE_TEX_WRAP_CLAMP || - wrap == PIPE_TEX_WRAP_MIRROR_CLAMP)); + return wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER || wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER || + (linear_filter && (wrap == PIPE_TEX_WRAP_CLAMP || wrap == PIPE_TEX_WRAP_MIRROR_CLAMP)); } static uint32_t si_translate_border_color(struct si_context *sctx, - const struct pipe_sampler_state *state, - const union pipe_color_union *color, - bool is_integer) + const struct pipe_sampler_state *state, + const union pipe_color_union *color, bool is_integer) { - bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST || - state->mag_img_filter != PIPE_TEX_FILTER_NEAREST; - - if (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) && - !wrap_mode_uses_border_color(state->wrap_t, linear_filter) && - !wrap_mode_uses_border_color(state->wrap_r, linear_filter)) - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); - -#define simple_border_types(elt) \ -do { \ - if (color->elt[0] == 0 && color->elt[1] == 0 && \ - color->elt[2] == 0 && color->elt[3] == 0) \ - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); \ - if (color->elt[0] == 0 && color->elt[1] == 0 && \ - color->elt[2] == 0 && color->elt[3] == 1) \ - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK); \ - if (color->elt[0] == 1 && color->elt[1] == 1 && \ - color->elt[2] == 1 && color->elt[3] == 1) \ - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE); \ -} while (false) - - if (is_integer) - simple_border_types(ui); - else - simple_border_types(f); + bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST || + state->mag_img_filter != PIPE_TEX_FILTER_NEAREST; + + if (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) && + !wrap_mode_uses_border_color(state->wrap_t, linear_filter) && + !wrap_mode_uses_border_color(state->wrap_r, linear_filter)) + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); + +#define simple_border_types(elt) \ + do { \ + if (color->elt[0] == 0 && color->elt[1] == 0 && color->elt[2] == 0 && color->elt[3] == 0) \ + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); \ + if (color->elt[0] == 0 && color->elt[1] == 0 && color->elt[2] == 0 && color->elt[3] == 1) \ + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK); \ + if (color->elt[0] == 1 && color->elt[1] == 1 && color->elt[2] == 1 && color->elt[3] == 1) \ + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE); \ + } while (false) + + if (is_integer) + simple_border_types(ui); + else + simple_border_types(f); #undef simple_border_types - int i; - - /* Check if the border has been uploaded already. 
*/ - for (i = 0; i < sctx->border_color_count; i++) - if (memcmp(&sctx->border_color_table[i], color, - sizeof(*color)) == 0) - break; - - if (i >= SI_MAX_BORDER_COLORS) { - /* Getting 4096 unique border colors is very unlikely. */ - fprintf(stderr, "radeonsi: The border color table is full. " - "Any new border colors will be just black. " - "Please file a bug.\n"); - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); - } - - if (i == sctx->border_color_count) { - /* Upload a new border color. */ - memcpy(&sctx->border_color_table[i], color, - sizeof(*color)); - util_memcpy_cpu_to_le32(&sctx->border_color_map[i], - color, sizeof(*color)); - sctx->border_color_count++; - } - - return S_008F3C_BORDER_COLOR_PTR(i) | - S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER); + int i; + + /* Check if the border has been uploaded already. */ + for (i = 0; i < sctx->border_color_count; i++) + if (memcmp(&sctx->border_color_table[i], color, sizeof(*color)) == 0) + break; + + if (i >= SI_MAX_BORDER_COLORS) { + /* Getting 4096 unique border colors is very unlikely. */ + fprintf(stderr, "radeonsi: The border color table is full. " + "Any new border colors will be just black. " + "Please file a bug.\n"); + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); + } + + if (i == sctx->border_color_count) { + /* Upload a new border color. */ + memcpy(&sctx->border_color_table[i], color, sizeof(*color)); + util_memcpy_cpu_to_le32(&sctx->border_color_map[i], color, sizeof(*color)); + sctx->border_color_count++; + } + + return S_008F3C_BORDER_COLOR_PTR(i) | + S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER); } static inline int S_FIXED(float value, unsigned frac_bits) { - return value * (1 << frac_bits); + return value * (1 << frac_bits); } static inline unsigned si_tex_filter(unsigned filter, unsigned max_aniso) { - if (filter == PIPE_TEX_FILTER_LINEAR) - return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR - : V_008F38_SQ_TEX_XY_FILTER_BILINEAR; - else - return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT - : V_008F38_SQ_TEX_XY_FILTER_POINT; + if (filter == PIPE_TEX_FILTER_LINEAR) + return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR + : V_008F38_SQ_TEX_XY_FILTER_BILINEAR; + else + return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT + : V_008F38_SQ_TEX_XY_FILTER_POINT; } static inline unsigned si_tex_aniso_filter(unsigned filter) { - if (filter < 2) - return 0; - if (filter < 4) - return 1; - if (filter < 8) - return 2; - if (filter < 16) - return 3; - return 4; + if (filter < 2) + return 0; + if (filter < 4) + return 1; + if (filter < 8) + return 2; + if (filter < 16) + return 3; + return 4; } static void *si_create_sampler_state(struct pipe_context *ctx, - const struct pipe_sampler_state *state) + const struct pipe_sampler_state *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_screen *sscreen = sctx->screen; - struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state); - unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso - : state->max_anisotropy; - unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso); - union pipe_color_union clamped_border_color; - - if (!rstate) { - return NULL; - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_screen *sscreen = sctx->screen; + struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state); + unsigned max_aniso = sscreen->force_aniso >= 0 ? 
sscreen->force_aniso : state->max_anisotropy; + unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso); + union pipe_color_union clamped_border_color; + + if (!rstate) { + return NULL; + } #ifndef NDEBUG - rstate->magic = SI_SAMPLER_STATE_MAGIC; + rstate->magic = SI_SAMPLER_STATE_MAGIC; #endif - rstate->val[0] = (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) | - S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) | - S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) | - S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) | - S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) | - S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) | - S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | - S_008F30_ANISO_BIAS(max_aniso_ratio) | - S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) | - S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9)); - rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) | - S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) | - S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0)); - rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) | - S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) | - S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) | - S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) | - S_008F38_MIP_POINT_PRECLAMP(0)); - rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, false); - - if (sscreen->info.chip_class >= GFX10) { - rstate->val[2] |= S_008F38_ANISO_OVERRIDE_GFX10(1); - } else { - rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) | - S_008F38_FILTER_PREC_FIX(1) | - S_008F38_ANISO_OVERRIDE_GFX6(sctx->chip_class >= GFX8); - } - - /* Create sampler resource for integer textures. */ - memcpy(rstate->integer_val, rstate->val, sizeof(rstate->val)); - rstate->integer_val[3] = si_translate_border_color(sctx, state, &state->border_color, true); - - /* Create sampler resource for upgraded depth textures. */ - memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val)); - - for (unsigned i = 0; i < 4; ++i) { - /* Use channel 0 on purpose, so that we can use OPAQUE_WHITE - * when the border color is 1.0. */ - clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1); - } - - if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0) { - if (sscreen->info.chip_class <= GFX9) - rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1); - } else { - rstate->upgraded_depth_val[3] = - si_translate_border_color(sctx, state, &clamped_border_color, false); - } - - return rstate; + rstate->val[0] = + (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) | S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) | + S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) | S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) | + S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) | + S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) | + S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | S_008F30_ANISO_BIAS(max_aniso_ratio) | + S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) | + S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9)); + rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) | + S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) | + S_008F34_PERF_MIP(max_aniso_ratio ? 
max_aniso_ratio + 6 : 0)); + rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) | + S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) | + S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) | + S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) | + S_008F38_MIP_POINT_PRECLAMP(0)); + rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, false); + + if (sscreen->info.chip_class >= GFX10) { + rstate->val[2] |= S_008F38_ANISO_OVERRIDE_GFX10(1); + } else { + rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) | + S_008F38_FILTER_PREC_FIX(1) | + S_008F38_ANISO_OVERRIDE_GFX6(sctx->chip_class >= GFX8); + } + + /* Create sampler resource for integer textures. */ + memcpy(rstate->integer_val, rstate->val, sizeof(rstate->val)); + rstate->integer_val[3] = si_translate_border_color(sctx, state, &state->border_color, true); + + /* Create sampler resource for upgraded depth textures. */ + memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val)); + + for (unsigned i = 0; i < 4; ++i) { + /* Use channel 0 on purpose, so that we can use OPAQUE_WHITE + * when the border color is 1.0. */ + clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1); + } + + if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0) { + if (sscreen->info.chip_class <= GFX9) + rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1); + } else { + rstate->upgraded_depth_val[3] = + si_translate_border_color(sctx, state, &clamped_border_color, false); + } + + return rstate; } static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (sctx->sample_mask == (uint16_t)sample_mask) - return; + if (sctx->sample_mask == (uint16_t)sample_mask) + return; - sctx->sample_mask = sample_mask; - si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask); + sctx->sample_mask = sample_mask; + si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask); } static void si_emit_sample_mask(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned mask = sctx->sample_mask; - - /* Needed for line and polygon smoothing as well as for the Polaris - * small primitive filter. We expect the state tracker to take care of - * this for us. - */ - assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 || - (mask & 1 && sctx->blitter->running)); - - radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); - radeon_emit(cs, mask | (mask << 16)); - radeon_emit(cs, mask | (mask << 16)); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned mask = sctx->sample_mask; + + /* Needed for line and polygon smoothing as well as for the Polaris + * small primitive filter. We expect the state tracker to take care of + * this for us. 
+ */ + assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 || + (mask & 1 && sctx->blitter->running)); + + radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); + radeon_emit(cs, mask | (mask << 16)); + radeon_emit(cs, mask | (mask << 16)); } static void si_delete_sampler_state(struct pipe_context *ctx, void *state) { #ifndef NDEBUG - struct si_sampler_state *s = state; + struct si_sampler_state *s = state; - assert(s->magic == SI_SAMPLER_STATE_MAGIC); - s->magic = 0; + assert(s->magic == SI_SAMPLER_STATE_MAGIC); + s->magic = 0; #endif - free(state); + free(state); } /* * Vertex elements & buffers */ -struct si_fast_udiv_info32 -si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits) +struct si_fast_udiv_info32 si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits) { - struct util_fast_udiv_info info = - util_compute_fast_udiv_info(D, num_bits, 32); - - struct si_fast_udiv_info32 result = { - info.multiplier, - info.pre_shift, - info.post_shift, - info.increment, - }; - return result; + struct util_fast_udiv_info info = util_compute_fast_udiv_info(D, num_bits, 32); + + struct si_fast_udiv_info32 result = { + info.multiplier, + info.pre_shift, + info.post_shift, + info.increment, + }; + return result; } -static void *si_create_vertex_elements(struct pipe_context *ctx, - unsigned count, - const struct pipe_vertex_element *elements) +static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, + const struct pipe_vertex_element *elements) { - struct si_screen *sscreen = (struct si_screen*)ctx->screen; - struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements); - bool used[SI_NUM_VERTEX_BUFFERS] = {}; - struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {}; - STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16); - STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4); - STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4); - STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4); - STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4); - int i; - - assert(count <= SI_MAX_ATTRIBS); - if (!v) - return NULL; - - v->count = count; - - unsigned alloc_count = count > sscreen->num_vbos_in_user_sgprs ? - count - sscreen->num_vbos_in_user_sgprs : 0; - v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT); - - for (i = 0; i < count; ++i) { - const struct util_format_description *desc; - const struct util_format_channel_description *channel; - int first_non_void; - unsigned vbo_index = elements[i].vertex_buffer_index; - - if (vbo_index >= SI_NUM_VERTEX_BUFFERS) { - FREE(v); - return NULL; - } - - unsigned instance_divisor = elements[i].instance_divisor; - if (instance_divisor) { - v->uses_instance_divisors = true; - - if (instance_divisor == 1) { - v->instance_divisor_is_one |= 1u << i; - } else { - v->instance_divisor_is_fetched |= 1u << i; - divisor_factors[i] = - si_compute_fast_udiv_info32(instance_divisor, 32); - } - } - - if (!used[vbo_index]) { - v->first_vb_use_mask |= 1 << i; - used[vbo_index] = true; - } - - desc = util_format_description(elements[i].src_format); - first_non_void = util_format_get_first_non_void_channel(elements[i].src_format); - channel = first_non_void >= 0 ? 
&desc->channel[first_non_void] : NULL; - - v->format_size[i] = desc->block.bits / 8; - v->src_offset[i] = elements[i].src_offset; - v->vertex_buffer_index[i] = vbo_index; - - bool always_fix = false; - union si_vs_fix_fetch fix_fetch; - unsigned log_hw_load_size; /* the load element size as seen by the hardware */ - - fix_fetch.bits = 0; - log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3); - - if (channel) { - switch (channel->type) { - case UTIL_FORMAT_TYPE_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break; - case UTIL_FORMAT_TYPE_FIXED: fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; break; - case UTIL_FORMAT_TYPE_SIGNED: { - if (channel->pure_integer) - fix_fetch.u.format = AC_FETCH_FORMAT_SINT; - else if (channel->normalized) - fix_fetch.u.format = AC_FETCH_FORMAT_SNORM; - else - fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED; - break; - } - case UTIL_FORMAT_TYPE_UNSIGNED: { - if (channel->pure_integer) - fix_fetch.u.format = AC_FETCH_FORMAT_UINT; - else if (channel->normalized) - fix_fetch.u.format = AC_FETCH_FORMAT_UNORM; - else - fix_fetch.u.format = AC_FETCH_FORMAT_USCALED; - break; - } - default: unreachable("bad format type"); - } - } else { - switch (elements[i].src_format) { - case PIPE_FORMAT_R11G11B10_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break; - default: unreachable("bad other format"); - } - } - - if (desc->channel[0].size == 10) { - fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */ - log_hw_load_size = 2; - - /* The hardware always treats the 2-bit alpha channel as - * unsigned, so a shader workaround is needed. The affected - * chips are GFX8 and older except Stoney (GFX8.1). - */ - always_fix = sscreen->info.chip_class <= GFX8 && - sscreen->info.family != CHIP_STONEY && - channel->type == UTIL_FORMAT_TYPE_SIGNED; - } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) { - fix_fetch.u.log_size = 3; /* special encoding */ - fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; - log_hw_load_size = 2; - } else { - fix_fetch.u.log_size = util_logbase2(channel->size) - 3; - fix_fetch.u.num_channels_m1 = desc->nr_channels - 1; - - /* Always fix up: - * - doubles (multiple loads + truncate to float) - * - 32-bit requiring a conversion - */ - always_fix = - (fix_fetch.u.log_size == 3) || - (fix_fetch.u.log_size == 2 && - fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT && - fix_fetch.u.format != AC_FETCH_FORMAT_UINT && - fix_fetch.u.format != AC_FETCH_FORMAT_SINT); - - /* Also fixup 8_8_8 and 16_16_16. */ - if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) { - always_fix = true; - log_hw_load_size = fix_fetch.u.log_size; - } - } - - if (desc->swizzle[0] != PIPE_SWIZZLE_X) { - assert(desc->swizzle[0] == PIPE_SWIZZLE_Z && - (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0)); - fix_fetch.u.reverse = 1; - } - - /* Force the workaround for unaligned access here already if the - * offset relative to the vertex buffer base is unaligned. - * - * There is a theoretical case in which this is too conservative: - * if the vertex buffer's offset is also unaligned in just the - * right way, we end up with an aligned address after all. - * However, this case should be extremely rare in practice (it - * won't happen in well-behaved applications), and taking it - * into account would complicate the fast path (where everything - * is nicely aligned). 
- */ - bool check_alignment = - log_hw_load_size >= 1 && - (sscreen->info.chip_class == GFX6 || sscreen->info.chip_class == GFX10); - bool opencode = sscreen->options.vs_fetch_always_opencode; - - if (check_alignment && - (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0) - opencode = true; - - if (always_fix || check_alignment || opencode) - v->fix_fetch[i] = fix_fetch.bits; - - if (opencode) - v->fix_fetch_opencode |= 1 << i; - if (opencode || always_fix) - v->fix_fetch_always |= 1 << i; - - if (check_alignment && !opencode) { - assert(log_hw_load_size == 1 || log_hw_load_size == 2); - - v->fix_fetch_unaligned |= 1 << i; - v->hw_load_is_dword |= (log_hw_load_size - 1) << i; - v->vb_alignment_check_mask |= 1 << vbo_index; - } - - v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | - S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | - S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | - S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); - - if (sscreen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = - &gfx10_format_table[elements[i].src_format]; - assert(fmt->img_format != 0 && fmt->img_format < 128); - v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - unsigned data_format, num_format; - data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); - num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); - v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | - S_008F0C_DATA_FORMAT(data_format); - } - } - - if (v->instance_divisor_is_fetched) { - unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched); - - v->instance_divisor_factor_buffer = - (struct si_resource*) - pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT, - num_divisors * sizeof(divisor_factors[0])); - if (!v->instance_divisor_factor_buffer) { - FREE(v); - return NULL; - } - void *map = sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf, - NULL, PIPE_TRANSFER_WRITE); - memcpy(map , divisor_factors, num_divisors * sizeof(divisor_factors[0])); - } - return v; + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements); + bool used[SI_NUM_VERTEX_BUFFERS] = {}; + struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {}; + STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16); + STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4); + STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4); + STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4); + STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4); + int i; + + assert(count <= SI_MAX_ATTRIBS); + if (!v) + return NULL; + + v->count = count; + + unsigned alloc_count = + count > sscreen->num_vbos_in_user_sgprs ? 
count - sscreen->num_vbos_in_user_sgprs : 0; + v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT); + + for (i = 0; i < count; ++i) { + const struct util_format_description *desc; + const struct util_format_channel_description *channel; + int first_non_void; + unsigned vbo_index = elements[i].vertex_buffer_index; + + if (vbo_index >= SI_NUM_VERTEX_BUFFERS) { + FREE(v); + return NULL; + } + + unsigned instance_divisor = elements[i].instance_divisor; + if (instance_divisor) { + v->uses_instance_divisors = true; + + if (instance_divisor == 1) { + v->instance_divisor_is_one |= 1u << i; + } else { + v->instance_divisor_is_fetched |= 1u << i; + divisor_factors[i] = si_compute_fast_udiv_info32(instance_divisor, 32); + } + } + + if (!used[vbo_index]) { + v->first_vb_use_mask |= 1 << i; + used[vbo_index] = true; + } + + desc = util_format_description(elements[i].src_format); + first_non_void = util_format_get_first_non_void_channel(elements[i].src_format); + channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL; + + v->format_size[i] = desc->block.bits / 8; + v->src_offset[i] = elements[i].src_offset; + v->vertex_buffer_index[i] = vbo_index; + + bool always_fix = false; + union si_vs_fix_fetch fix_fetch; + unsigned log_hw_load_size; /* the load element size as seen by the hardware */ + + fix_fetch.bits = 0; + log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3); + + if (channel) { + switch (channel->type) { + case UTIL_FORMAT_TYPE_FLOAT: + fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; + break; + case UTIL_FORMAT_TYPE_FIXED: + fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; + break; + case UTIL_FORMAT_TYPE_SIGNED: { + if (channel->pure_integer) + fix_fetch.u.format = AC_FETCH_FORMAT_SINT; + else if (channel->normalized) + fix_fetch.u.format = AC_FETCH_FORMAT_SNORM; + else + fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED; + break; + } + case UTIL_FORMAT_TYPE_UNSIGNED: { + if (channel->pure_integer) + fix_fetch.u.format = AC_FETCH_FORMAT_UINT; + else if (channel->normalized) + fix_fetch.u.format = AC_FETCH_FORMAT_UNORM; + else + fix_fetch.u.format = AC_FETCH_FORMAT_USCALED; + break; + } + default: + unreachable("bad format type"); + } + } else { + switch (elements[i].src_format) { + case PIPE_FORMAT_R11G11B10_FLOAT: + fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; + break; + default: + unreachable("bad other format"); + } + } + + if (desc->channel[0].size == 10) { + fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */ + log_hw_load_size = 2; + + /* The hardware always treats the 2-bit alpha channel as + * unsigned, so a shader workaround is needed. The affected + * chips are GFX8 and older except Stoney (GFX8.1). + */ + always_fix = sscreen->info.chip_class <= GFX8 && sscreen->info.family != CHIP_STONEY && + channel->type == UTIL_FORMAT_TYPE_SIGNED; + } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) { + fix_fetch.u.log_size = 3; /* special encoding */ + fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; + log_hw_load_size = 2; + } else { + fix_fetch.u.log_size = util_logbase2(channel->size) - 3; + fix_fetch.u.num_channels_m1 = desc->nr_channels - 1; + + /* Always fix up: + * - doubles (multiple loads + truncate to float) + * - 32-bit requiring a conversion + */ + always_fix = (fix_fetch.u.log_size == 3) || + (fix_fetch.u.log_size == 2 && fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT && + fix_fetch.u.format != AC_FETCH_FORMAT_UINT && + fix_fetch.u.format != AC_FETCH_FORMAT_SINT); + + /* Also fixup 8_8_8 and 16_16_16. 
*/ + if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) { + always_fix = true; + log_hw_load_size = fix_fetch.u.log_size; + } + } + + if (desc->swizzle[0] != PIPE_SWIZZLE_X) { + assert(desc->swizzle[0] == PIPE_SWIZZLE_Z && + (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0)); + fix_fetch.u.reverse = 1; + } + + /* Force the workaround for unaligned access here already if the + * offset relative to the vertex buffer base is unaligned. + * + * There is a theoretical case in which this is too conservative: + * if the vertex buffer's offset is also unaligned in just the + * right way, we end up with an aligned address after all. + * However, this case should be extremely rare in practice (it + * won't happen in well-behaved applications), and taking it + * into account would complicate the fast path (where everything + * is nicely aligned). + */ + bool check_alignment = log_hw_load_size >= 1 && (sscreen->info.chip_class == GFX6 || + sscreen->info.chip_class == GFX10); + bool opencode = sscreen->options.vs_fetch_always_opencode; + + if (check_alignment && (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0) + opencode = true; + + if (always_fix || check_alignment || opencode) + v->fix_fetch[i] = fix_fetch.bits; + + if (opencode) + v->fix_fetch_opencode |= 1 << i; + if (opencode || always_fix) + v->fix_fetch_always |= 1 << i; + + if (check_alignment && !opencode) { + assert(log_hw_load_size == 1 || log_hw_load_size == 2); + + v->fix_fetch_unaligned |= 1 << i; + v->hw_load_is_dword |= (log_hw_load_size - 1) << i; + v->vb_alignment_check_mask |= 1 << vbo_index; + } + + v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | + S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); + + if (sscreen->info.chip_class >= GFX10) { + const struct gfx10_format *fmt = &gfx10_format_table[elements[i].src_format]; + assert(fmt->img_format != 0 && fmt->img_format < 128); + v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | S_008F0C_RESOURCE_LEVEL(1); + } else { + unsigned data_format, num_format; + data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); + num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); + v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format); + } + } + + if (v->instance_divisor_is_fetched) { + unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched); + + v->instance_divisor_factor_buffer = (struct si_resource *)pipe_buffer_create( + &sscreen->b, 0, PIPE_USAGE_DEFAULT, num_divisors * sizeof(divisor_factors[0])); + if (!v->instance_divisor_factor_buffer) { + FREE(v); + return NULL; + } + void *map = + sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf, NULL, PIPE_TRANSFER_WRITE); + memcpy(map, divisor_factors, num_divisors * sizeof(divisor_factors[0])); + } + return v; } static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_vertex_elements *old = sctx->vertex_elements; - struct si_vertex_elements *v = (struct si_vertex_elements*)state; - - sctx->vertex_elements = v; - sctx->num_vertex_elements = v ? 
v->count : 0; - - if (sctx->num_vertex_elements) { - sctx->vertex_buffers_dirty = true; - } else { - sctx->vertex_buffer_pointer_dirty = false; - sctx->vertex_buffer_user_sgprs_dirty = false; - } - - if (v && - (!old || - old->count != v->count || - old->uses_instance_divisors != v->uses_instance_divisors || - /* we don't check which divisors changed */ - v->uses_instance_divisors || - (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned || - ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) && - memcmp(old->vertex_buffer_index, v->vertex_buffer_index, - sizeof(v->vertex_buffer_index[0]) * v->count)) || - /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are - * functions of fix_fetch and the src_offset alignment. - * If they change and fix_fetch doesn't, it must be due to different - * src_offset alignment, which is reflected in fix_fetch_opencode. */ - old->fix_fetch_opencode != v->fix_fetch_opencode || - memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count))) - sctx->do_update_shaders = true; - - if (v && v->instance_divisor_is_fetched) { - struct pipe_constant_buffer cb; - - cb.buffer = &v->instance_divisor_factor_buffer->b.b; - cb.user_buffer = NULL; - cb.buffer_offset = 0; - cb.buffer_size = 0xffffffff; - si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb); - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_vertex_elements *old = sctx->vertex_elements; + struct si_vertex_elements *v = (struct si_vertex_elements *)state; + + sctx->vertex_elements = v; + sctx->num_vertex_elements = v ? v->count : 0; + + if (sctx->num_vertex_elements) { + sctx->vertex_buffers_dirty = true; + } else { + sctx->vertex_buffer_pointer_dirty = false; + sctx->vertex_buffer_user_sgprs_dirty = false; + } + + if (v && (!old || old->count != v->count || + old->uses_instance_divisors != v->uses_instance_divisors || + /* we don't check which divisors changed */ + v->uses_instance_divisors || + (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & + sctx->vertex_buffer_unaligned || + ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) && + memcmp(old->vertex_buffer_index, v->vertex_buffer_index, + sizeof(v->vertex_buffer_index[0]) * v->count)) || + /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are + * functions of fix_fetch and the src_offset alignment. + * If they change and fix_fetch doesn't, it must be due to different + * src_offset alignment, which is reflected in fix_fetch_opencode. 
*/ + old->fix_fetch_opencode != v->fix_fetch_opencode || + memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count))) + sctx->do_update_shaders = true; + + if (v && v->instance_divisor_is_fetched) { + struct pipe_constant_buffer cb; + + cb.buffer = &v->instance_divisor_factor_buffer->b.b; + cb.user_buffer = NULL; + cb.buffer_offset = 0; + cb.buffer_size = 0xffffffff; + si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb); + } } static void si_delete_vertex_element(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_vertex_elements *v = (struct si_vertex_elements*)state; - - if (sctx->vertex_elements == state) { - sctx->vertex_elements = NULL; - sctx->num_vertex_elements = 0; - } - si_resource_reference(&v->instance_divisor_factor_buffer, NULL); - FREE(state); + struct si_context *sctx = (struct si_context *)ctx; + struct si_vertex_elements *v = (struct si_vertex_elements *)state; + + if (sctx->vertex_elements == state) { + sctx->vertex_elements = NULL; + sctx->num_vertex_elements = 0; + } + si_resource_reference(&v->instance_divisor_factor_buffer, NULL); + FREE(state); } -static void si_set_vertex_buffers(struct pipe_context *ctx, - unsigned start_slot, unsigned count, - const struct pipe_vertex_buffer *buffers) +static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, unsigned count, + const struct pipe_vertex_buffer *buffers) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot; - unsigned updated_mask = u_bit_consecutive(start_slot, count); - uint32_t orig_unaligned = sctx->vertex_buffer_unaligned; - uint32_t unaligned = 0; - int i; - - assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer)); - - if (buffers) { - for (i = 0; i < count; i++) { - const struct pipe_vertex_buffer *src = buffers + i; - struct pipe_vertex_buffer *dsti = dst + i; - struct pipe_resource *buf = src->buffer.resource; - unsigned slot_bit = 1 << (start_slot + i); - - pipe_resource_reference(&dsti->buffer.resource, buf); - dsti->buffer_offset = src->buffer_offset; - dsti->stride = src->stride; - - if (dsti->buffer_offset & 3 || dsti->stride & 3) - unaligned |= slot_bit; - - si_context_add_resource_size(sctx, buf); - if (buf) - si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER; - } - } else { - for (i = 0; i < count; i++) { - pipe_resource_reference(&dst[i].buffer.resource, NULL); - } - unaligned &= ~updated_mask; - } - sctx->vertex_buffers_dirty = true; - sctx->vertex_buffer_unaligned = (orig_unaligned & ~updated_mask) | unaligned; - - /* Check whether alignment may have changed in a way that requires - * shader changes. This check is conservative: a vertex buffer can only - * trigger a shader change if the misalignment amount changes (e.g. - * from byte-aligned to short-aligned), but we only keep track of - * whether buffers are at least dword-aligned, since that should always - * be the case in well-behaved applications anyway. 
- */ - if (sctx->vertex_elements && - (sctx->vertex_elements->vb_alignment_check_mask & - (unaligned | orig_unaligned) & updated_mask)) - sctx->do_update_shaders = true; + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot; + unsigned updated_mask = u_bit_consecutive(start_slot, count); + uint32_t orig_unaligned = sctx->vertex_buffer_unaligned; + uint32_t unaligned = 0; + int i; + + assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer)); + + if (buffers) { + for (i = 0; i < count; i++) { + const struct pipe_vertex_buffer *src = buffers + i; + struct pipe_vertex_buffer *dsti = dst + i; + struct pipe_resource *buf = src->buffer.resource; + unsigned slot_bit = 1 << (start_slot + i); + + pipe_resource_reference(&dsti->buffer.resource, buf); + dsti->buffer_offset = src->buffer_offset; + dsti->stride = src->stride; + + if (dsti->buffer_offset & 3 || dsti->stride & 3) + unaligned |= slot_bit; + + si_context_add_resource_size(sctx, buf); + if (buf) + si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER; + } + } else { + for (i = 0; i < count; i++) { + pipe_resource_reference(&dst[i].buffer.resource, NULL); + } + unaligned &= ~updated_mask; + } + sctx->vertex_buffers_dirty = true; + sctx->vertex_buffer_unaligned = (orig_unaligned & ~updated_mask) | unaligned; + + /* Check whether alignment may have changed in a way that requires + * shader changes. This check is conservative: a vertex buffer can only + * trigger a shader change if the misalignment amount changes (e.g. + * from byte-aligned to short-aligned), but we only keep track of + * whether buffers are at least dword-aligned, since that should always + * be the case in well-behaved applications anyway. + */ + if (sctx->vertex_elements && (sctx->vertex_elements->vb_alignment_check_mask & + (unaligned | orig_unaligned) & updated_mask)) + sctx->do_update_shaders = true; } /* * Misc */ -static void si_set_tess_state(struct pipe_context *ctx, - const float default_outer_level[4], - const float default_inner_level[2]) +static void si_set_tess_state(struct pipe_context *ctx, const float default_outer_level[4], + const float default_inner_level[2]) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_constant_buffer cb; - float array[8]; + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_constant_buffer cb; + float array[8]; - memcpy(array, default_outer_level, sizeof(float) * 4); - memcpy(array+4, default_inner_level, sizeof(float) * 2); + memcpy(array, default_outer_level, sizeof(float) * 4); + memcpy(array + 4, default_inner_level, sizeof(float) * 2); - cb.buffer = NULL; - cb.user_buffer = NULL; - cb.buffer_size = sizeof(array); + cb.buffer = NULL; + cb.user_buffer = NULL; + cb.buffer_size = sizeof(array); - si_upload_const_buffer(sctx, (struct si_resource**)&cb.buffer, - (void*)array, sizeof(array), - &cb.buffer_offset); + si_upload_const_buffer(sctx, (struct si_resource **)&cb.buffer, (void *)array, sizeof(array), + &cb.buffer_offset); - si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb); - pipe_resource_reference(&cb.buffer, NULL); + si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb); + pipe_resource_reference(&cb.buffer, NULL); } static void si_texture_barrier(struct pipe_context *ctx, unsigned flags) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - si_update_fb_dirtiness_after_rendering(sctx); + si_update_fb_dirtiness_after_rendering(sctx); - /* 
Multisample surfaces are flushed in si_decompress_textures. */ - if (sctx->framebuffer.uncompressed_cb_mask) { - si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, - sctx->framebuffer.CB_has_shader_readable_metadata, - sctx->framebuffer.all_DCC_pipe_aligned); - } + /* Multisample surfaces are flushed in si_decompress_textures. */ + if (sctx->framebuffer.uncompressed_cb_mask) { + si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, + sctx->framebuffer.CB_has_shader_readable_metadata, + sctx->framebuffer.all_DCC_pipe_aligned); + } } /* This only ensures coherency for shader image/buffer stores. */ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) { - struct si_context *sctx = (struct si_context *)ctx; - - if (!(flags & ~PIPE_BARRIER_UPDATE)) - return; - - /* Subsequent commands must wait for all shader invocations to - * complete. */ - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; - - if (flags & PIPE_BARRIER_CONSTANT_BUFFER) - sctx->flags |= SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE; - - if (flags & (PIPE_BARRIER_VERTEX_BUFFER | - PIPE_BARRIER_SHADER_BUFFER | - PIPE_BARRIER_TEXTURE | - PIPE_BARRIER_IMAGE | - PIPE_BARRIER_STREAMOUT_BUFFER | - PIPE_BARRIER_GLOBAL_BUFFER)) { - /* As far as I can tell, L1 contents are written back to L2 - * automatically at end of shader, but the contents of other - * L1 caches might still be stale. */ - sctx->flags |= SI_CONTEXT_INV_VCACHE; - } - - if (flags & PIPE_BARRIER_INDEX_BUFFER) { - /* Indices are read through TC L2 since GFX8. - * L1 isn't used. - */ - if (sctx->screen->info.chip_class <= GFX7) - sctx->flags |= SI_CONTEXT_WB_L2; - } - - /* MSAA color, any depth and any stencil are flushed in - * si_decompress_textures when needed. - */ - if (flags & PIPE_BARRIER_FRAMEBUFFER && - sctx->framebuffer.uncompressed_cb_mask) { - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; - - if (sctx->chip_class <= GFX8) - sctx->flags |= SI_CONTEXT_WB_L2; - } - - /* Indirect buffers use TC L2 on GFX9, but not older hw. */ - if (sctx->screen->info.chip_class <= GFX8 && - flags & PIPE_BARRIER_INDIRECT_BUFFER) - sctx->flags |= SI_CONTEXT_WB_L2; + struct si_context *sctx = (struct si_context *)ctx; + + if (!(flags & ~PIPE_BARRIER_UPDATE)) + return; + + /* Subsequent commands must wait for all shader invocations to + * complete. */ + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; + + if (flags & PIPE_BARRIER_CONSTANT_BUFFER) + sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE; + + if (flags & (PIPE_BARRIER_VERTEX_BUFFER | PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_TEXTURE | + PIPE_BARRIER_IMAGE | PIPE_BARRIER_STREAMOUT_BUFFER | PIPE_BARRIER_GLOBAL_BUFFER)) { + /* As far as I can tell, L1 contents are written back to L2 + * automatically at end of shader, but the contents of other + * L1 caches might still be stale. */ + sctx->flags |= SI_CONTEXT_INV_VCACHE; + } + + if (flags & PIPE_BARRIER_INDEX_BUFFER) { + /* Indices are read through TC L2 since GFX8. + * L1 isn't used. + */ + if (sctx->screen->info.chip_class <= GFX7) + sctx->flags |= SI_CONTEXT_WB_L2; + } + + /* MSAA color, any depth and any stencil are flushed in + * si_decompress_textures when needed. + */ + if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.uncompressed_cb_mask) { + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; + + if (sctx->chip_class <= GFX8) + sctx->flags |= SI_CONTEXT_WB_L2; + } + + /* Indirect buffers use TC L2 on GFX9, but not older hw. 
*/ + if (sctx->screen->info.chip_class <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER) + sctx->flags |= SI_CONTEXT_WB_L2; } static void *si_create_blend_custom(struct si_context *sctx, unsigned mode) { - struct pipe_blend_state blend; + struct pipe_blend_state blend; - memset(&blend, 0, sizeof(blend)); - blend.independent_blend_enable = true; - blend.rt[0].colormask = 0xf; - return si_create_blend_state_mode(&sctx->b, &blend, mode); + memset(&blend, 0, sizeof(blend)); + blend.independent_blend_enable = true; + blend.rt[0].colormask = 0xf; + return si_create_blend_state_mode(&sctx->b, &blend, mode); } static void si_init_config(struct si_context *sctx); void si_init_state_compute_functions(struct si_context *sctx) { - sctx->b.create_sampler_state = si_create_sampler_state; - sctx->b.delete_sampler_state = si_delete_sampler_state; - sctx->b.create_sampler_view = si_create_sampler_view; - sctx->b.sampler_view_destroy = si_sampler_view_destroy; - sctx->b.memory_barrier = si_memory_barrier; + sctx->b.create_sampler_state = si_create_sampler_state; + sctx->b.delete_sampler_state = si_delete_sampler_state; + sctx->b.create_sampler_view = si_create_sampler_view; + sctx->b.sampler_view_destroy = si_sampler_view_destroy; + sctx->b.memory_barrier = si_memory_barrier; } void si_init_state_functions(struct si_context *sctx) { - sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state; - sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs; - sctx->atoms.s.db_render_state.emit = si_emit_db_render_state; - sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state; - sctx->atoms.s.msaa_config.emit = si_emit_msaa_config; - sctx->atoms.s.sample_mask.emit = si_emit_sample_mask; - sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state; - sctx->atoms.s.blend_color.emit = si_emit_blend_color; - sctx->atoms.s.clip_regs.emit = si_emit_clip_regs; - sctx->atoms.s.clip_state.emit = si_emit_clip_state; - sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref; - - sctx->b.create_blend_state = si_create_blend_state; - sctx->b.bind_blend_state = si_bind_blend_state; - sctx->b.delete_blend_state = si_delete_blend_state; - sctx->b.set_blend_color = si_set_blend_color; - - sctx->b.create_rasterizer_state = si_create_rs_state; - sctx->b.bind_rasterizer_state = si_bind_rs_state; - sctx->b.delete_rasterizer_state = si_delete_rs_state; - - sctx->b.create_depth_stencil_alpha_state = si_create_dsa_state; - sctx->b.bind_depth_stencil_alpha_state = si_bind_dsa_state; - sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state; - - sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx); - sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE); - sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS); - sctx->custom_blend_eliminate_fastclear = si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR); - sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS); - - sctx->b.set_clip_state = si_set_clip_state; - sctx->b.set_stencil_ref = si_set_stencil_ref; - - sctx->b.set_framebuffer_state = si_set_framebuffer_state; - - sctx->b.set_sample_mask = si_set_sample_mask; - - sctx->b.create_vertex_elements_state = si_create_vertex_elements; - sctx->b.bind_vertex_elements_state = si_bind_vertex_elements; - sctx->b.delete_vertex_elements_state = si_delete_vertex_element; - sctx->b.set_vertex_buffers = si_set_vertex_buffers; - - sctx->b.texture_barrier = si_texture_barrier; - sctx->b.set_min_samples = si_set_min_samples; - 
sctx->b.set_tess_state = si_set_tess_state; - - sctx->b.set_active_query_state = si_set_active_query_state; - - si_init_config(sctx); + sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state; + sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs; + sctx->atoms.s.db_render_state.emit = si_emit_db_render_state; + sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state; + sctx->atoms.s.msaa_config.emit = si_emit_msaa_config; + sctx->atoms.s.sample_mask.emit = si_emit_sample_mask; + sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state; + sctx->atoms.s.blend_color.emit = si_emit_blend_color; + sctx->atoms.s.clip_regs.emit = si_emit_clip_regs; + sctx->atoms.s.clip_state.emit = si_emit_clip_state; + sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref; + + sctx->b.create_blend_state = si_create_blend_state; + sctx->b.bind_blend_state = si_bind_blend_state; + sctx->b.delete_blend_state = si_delete_blend_state; + sctx->b.set_blend_color = si_set_blend_color; + + sctx->b.create_rasterizer_state = si_create_rs_state; + sctx->b.bind_rasterizer_state = si_bind_rs_state; + sctx->b.delete_rasterizer_state = si_delete_rs_state; + + sctx->b.create_depth_stencil_alpha_state = si_create_dsa_state; + sctx->b.bind_depth_stencil_alpha_state = si_bind_dsa_state; + sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state; + + sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx); + sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE); + sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS); + sctx->custom_blend_eliminate_fastclear = + si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR); + sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS); + + sctx->b.set_clip_state = si_set_clip_state; + sctx->b.set_stencil_ref = si_set_stencil_ref; + + sctx->b.set_framebuffer_state = si_set_framebuffer_state; + + sctx->b.set_sample_mask = si_set_sample_mask; + + sctx->b.create_vertex_elements_state = si_create_vertex_elements; + sctx->b.bind_vertex_elements_state = si_bind_vertex_elements; + sctx->b.delete_vertex_elements_state = si_delete_vertex_element; + sctx->b.set_vertex_buffers = si_set_vertex_buffers; + + sctx->b.texture_barrier = si_texture_barrier; + sctx->b.set_min_samples = si_set_min_samples; + sctx->b.set_tess_state = si_set_tess_state; + + sctx->b.set_active_query_state = si_set_active_query_state; + + si_init_config(sctx); } void si_init_screen_state_functions(struct si_screen *sscreen) { - sscreen->b.is_format_supported = si_is_format_supported; + sscreen->b.is_format_supported = si_is_format_supported; - if (sscreen->info.chip_class >= GFX10) { - sscreen->make_texture_descriptor = gfx10_make_texture_descriptor; - } else { - sscreen->make_texture_descriptor = si_make_texture_descriptor; - } + if (sscreen->info.chip_class >= GFX10) { + sscreen->make_texture_descriptor = gfx10_make_texture_descriptor; + } else { + sscreen->make_texture_descriptor = si_make_texture_descriptor; + } } -static void si_set_grbm_gfx_index(struct si_context *sctx, - struct si_pm4_state *pm4, unsigned value) +static void si_set_grbm_gfx_index(struct si_context *sctx, struct si_pm4_state *pm4, unsigned value) { - unsigned reg = sctx->chip_class >= GFX7 ? R_030800_GRBM_GFX_INDEX : - R_00802C_GRBM_GFX_INDEX; - si_pm4_set_reg(pm4, reg, value); + unsigned reg = sctx->chip_class >= GFX7 ? 
R_030800_GRBM_GFX_INDEX : R_00802C_GRBM_GFX_INDEX; + si_pm4_set_reg(pm4, reg, value); } -static void si_set_grbm_gfx_index_se(struct si_context *sctx, - struct si_pm4_state *pm4, unsigned se) +static void si_set_grbm_gfx_index_se(struct si_context *sctx, struct si_pm4_state *pm4, unsigned se) { - assert(se == ~0 || se < sctx->screen->info.max_se); - si_set_grbm_gfx_index(sctx, pm4, - (se == ~0 ? S_030800_SE_BROADCAST_WRITES(1) : - S_030800_SE_INDEX(se)) | - S_030800_SH_BROADCAST_WRITES(1) | - S_030800_INSTANCE_BROADCAST_WRITES(1)); + assert(se == ~0 || se < sctx->screen->info.max_se); + si_set_grbm_gfx_index(sctx, pm4, + (se == ~0 ? S_030800_SE_BROADCAST_WRITES(1) : S_030800_SE_INDEX(se)) | + S_030800_SH_BROADCAST_WRITES(1) | + S_030800_INSTANCE_BROADCAST_WRITES(1)); } -static void -si_write_harvested_raster_configs(struct si_context *sctx, - struct si_pm4_state *pm4, - unsigned raster_config, - unsigned raster_config_1) +static void si_write_harvested_raster_configs(struct si_context *sctx, struct si_pm4_state *pm4, + unsigned raster_config, unsigned raster_config_1) { - unsigned num_se = MAX2(sctx->screen->info.max_se, 1); - unsigned raster_config_se[4]; - unsigned se; - - ac_get_harvested_configs(&sctx->screen->info, - raster_config, - &raster_config_1, - raster_config_se); - - for (se = 0; se < num_se; se++) { - si_set_grbm_gfx_index_se(sctx, pm4, se); - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]); - } - si_set_grbm_gfx_index(sctx, pm4, ~0); - - if (sctx->chip_class >= GFX7) { - si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); - } + unsigned num_se = MAX2(sctx->screen->info.max_se, 1); + unsigned raster_config_se[4]; + unsigned se; + + ac_get_harvested_configs(&sctx->screen->info, raster_config, &raster_config_1, raster_config_se); + + for (se = 0; se < num_se; se++) { + si_set_grbm_gfx_index_se(sctx, pm4, se); + si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]); + } + si_set_grbm_gfx_index(sctx, pm4, ~0); + + if (sctx->chip_class >= GFX7) { + si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); + } } static void si_set_raster_config(struct si_context *sctx, struct si_pm4_state *pm4) { - struct si_screen *sscreen = sctx->screen; - unsigned num_rb = MIN2(sscreen->info.num_render_backends, 16); - unsigned rb_mask = sscreen->info.enabled_rb_mask; - unsigned raster_config = sscreen->pa_sc_raster_config; - unsigned raster_config_1 = sscreen->pa_sc_raster_config_1; - - if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { - /* Always use the default config when all backends are enabled - * (or when we failed to determine the enabled backends). - */ - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, - raster_config); - if (sctx->chip_class >= GFX7) - si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, - raster_config_1); - } else { - si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1); - } + struct si_screen *sscreen = sctx->screen; + unsigned num_rb = MIN2(sscreen->info.num_render_backends, 16); + unsigned rb_mask = sscreen->info.enabled_rb_mask; + unsigned raster_config = sscreen->pa_sc_raster_config; + unsigned raster_config_1 = sscreen->pa_sc_raster_config_1; + + if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { + /* Always use the default config when all backends are enabled + * (or when we failed to determine the enabled backends). 
+ */ + si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config); + if (sctx->chip_class >= GFX7) + si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); + } else { + si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1); + } } static void si_init_config(struct si_context *sctx) { - struct si_screen *sscreen = sctx->screen; - uint64_t border_color_va = sctx->border_color_buffer->gpu_address; - bool has_clear_state = sscreen->info.has_clear_state; - struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); - - if (!pm4) - return; - - si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL); - si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1)); - si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1)); - si_pm4_cmd_end(pm4, false); - - if (has_clear_state) { - si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE); - si_pm4_cmd_add(pm4, 0); - si_pm4_cmd_end(pm4, false); - } - - if (sctx->chip_class <= GFX8) - si_set_raster_config(sctx, pm4); - - si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); - if (!has_clear_state) - si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); - - /* FIXME calculate these values somehow ??? */ - if (sctx->chip_class <= GFX8) { - si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES); - si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40); - } - - if (!has_clear_state) { - si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2); - si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); - si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); - } - - if (sscreen->info.chip_class <= GFX9) - si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); - if (!has_clear_state) - si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); - if (sctx->chip_class < GFX7) - si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) | - S_008A14_CLIP_VTX_REORDER_ENA(1)); - - /* CLEAR_STATE doesn't restore these correctly. */ - si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); - si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR, - S_028244_BR_X(16384) | S_028244_BR_Y(16384)); - - /* CLEAR_STATE doesn't clear these correctly on certain generations. - * I don't know why. Deduced by trial and error. 
- */ - if (sctx->chip_class <= GFX7 || !has_clear_state) { - si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); - si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); - si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0); - si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR, - S_028034_BR_X(16384) | S_028034_BR_Y(16384)); - } - - if (!has_clear_state) { - si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE, - S_028230_ER_TRI(0xA) | - S_028230_ER_POINT(0xA) | - S_028230_ER_RECT(0xA) | - /* Required by DX10_DIAMOND_TEST_ENA: */ - S_028230_ER_LINE_LR(0x1A) | - S_028230_ER_LINE_RL(0x26) | - S_028230_ER_LINE_TB(0xA) | - S_028230_ER_LINE_BT(0xA)); - si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0); - si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); - si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); - si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0); - si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0); - } - - if (sctx->chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0); - si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0); - si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0); - si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0); - si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0); - si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0); - } else if (sctx->chip_class == GFX9) { - si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0); - si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0); - si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0); - } else { - /* These registers, when written, also overwrite the CLEAR_STATE - * context, so we can't rely on CLEAR_STATE setting them. - * It would be an issue if there was another UMD changing them. - */ - si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0); - si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0); - si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); - } - - if (sctx->chip_class >= GFX7) { - if (sctx->chip_class >= GFX10) { - /* Logical CUs 16 - 31 */ - si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, - S_00B404_CU_EN(0xffff)); - si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, - S_00B104_CU_EN(0xffff)); - si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, - S_00B004_CU_EN(0xffff)); - } - - if (sctx->chip_class >= GFX9) { - si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, - S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F)); - } else { - si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, - S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, - S_00B41C_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, - S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F)); - - /* If this is 0, Bonaire can hang even if GS isn't being used. - * Other chips are unaffected. These are suboptimal values, - * but we don't use on-chip GS. - */ - si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL, - S_028A44_ES_VERTS_PER_SUBGRP(64) | - S_028A44_GS_PRIMS_PER_SUBGRP(4)); - } - - /* Compute LATE_ALLOC_VS.LIMIT. */ - unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; - unsigned late_alloc_wave64 = 0; /* The limit is per SH. */ - unsigned cu_mask_vs = 0xffff; - unsigned cu_mask_gs = 0xffff; - - if (sctx->chip_class >= GFX10) { - /* For Wave32, the hw will launch twice the number of late - * alloc waves, so 1 == 2x wave32. 
- */ - if (!sscreen->info.use_late_alloc) { - late_alloc_wave64 = 0; - } else if (num_cu_per_sh <= 6) { - late_alloc_wave64 = num_cu_per_sh - 2; - } else { - late_alloc_wave64 = (num_cu_per_sh - 2) * 4; - - /* CU2 & CU3 disabled because of the dual CU design */ - /* Late alloc is not used for NGG on Navi14 due to a hw bug. */ - cu_mask_vs = 0xfff3; - cu_mask_gs = sscreen->use_ngg && - sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff; - } - } else { - if (!sscreen->info.use_late_alloc) { - late_alloc_wave64 = 0; - } else if (num_cu_per_sh <= 4) { - /* Too few available compute units per SH. Disallowing - * VS to run on one CU could hurt us more than late VS - * allocation would help. - * - * 2 is the highest safe number that allows us to keep - * all CUs enabled. - */ - late_alloc_wave64 = 2; - } else { - /* This is a good initial value, allowing 1 late_alloc - * wave per SIMD on num_cu - 2. - */ - late_alloc_wave64 = (num_cu_per_sh - 2) * 4; - } - - if (late_alloc_wave64 > 2) - cu_mask_vs = 0xfffe; /* 1 CU disabled */ - } - - /* VS can't execute on one CU if the limit is > 2. */ - si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, - S_00B118_CU_EN(cu_mask_vs) | - S_00B118_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, - S_00B11C_LIMIT(late_alloc_wave64)); - - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F)); - - si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, - S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F)); - } - - if (sctx->chip_class >= GFX10) { - /* Break up a pixel wave if it contains deallocs for more than - * half the parameter cache. - * - * To avoid a deadlock where pixel waves aren't launched - * because they're waiting for more pixels while the frontend - * is stuck waiting for PC space, the maximum allowed value is - * the size of the PC minus the largest possible allocation for - * a single primitive shader subgroup. - */ - si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, - S_028C50_MAX_DEALLOCS_IN_WAVE(512)); - si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); - - if (!has_clear_state) { - si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, - sscreen->info.pa_sc_tile_steering_override); - } - - /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. 
*/ - unsigned meta_write_policy, meta_read_policy; - /* TODO: investigate whether LRU improves performance on other chips too */ - if (sscreen->info.num_render_backends <= 4) { - meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */ - meta_read_policy = V_02807C_CACHE_LRU_RD; /* cache reads */ - } else { - meta_write_policy = V_02807C_CACHE_STREAM_WR; /* write combine */ - meta_read_policy = V_02807C_CACHE_NOA_RD; /* don't cache reads */ - } - - si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL, - S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) | - S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) | - S_02807C_HTILE_WR_POLICY(meta_write_policy) | - S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) | - S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) | - S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) | - S_02807C_HTILE_RD_POLICY(meta_read_policy)); - - si_pm4_set_reg(pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL, - S_028410_CMASK_WR_POLICY(meta_write_policy) | - S_028410_FMASK_WR_POLICY(meta_write_policy) | - S_028410_DCC_WR_POLICY(meta_write_policy) | - S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) | - S_028410_CMASK_RD_POLICY(meta_read_policy) | - S_028410_FMASK_RD_POLICY(meta_read_policy) | - S_028410_DCC_RD_POLICY(meta_read_policy) | - S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD)); - si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0); - - si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, - S_00B0C0_SOFT_GROUPING_EN(1) | - S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); - si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0); - } - - if (sctx->chip_class >= GFX9) { - si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, - S_028B50_ACCUM_ISOLINE(40) | - S_028B50_ACCUM_TRI(30) | - S_028B50_ACCUM_QUAD(24) | - S_028B50_DONUT_SPLIT(24) | - S_028B50_TRAP_SPLIT(6)); - } else if (sctx->chip_class >= GFX8) { - unsigned vgt_tess_distribution; - - vgt_tess_distribution = - S_028B50_ACCUM_ISOLINE(32) | - S_028B50_ACCUM_TRI(11) | - S_028B50_ACCUM_QUAD(11) | - S_028B50_DONUT_SPLIT(16); - - /* Testing with Unigine Heaven extreme tesselation yielded best results - * with TRAP_SPLIT = 3. 
- */ - if (sctx->family == CHIP_FIJI || - sctx->family >= CHIP_POLARIS10) - vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3); - - si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution); - } else if (!has_clear_state) { - si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); - si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16); - } - - si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); - if (sctx->chip_class >= GFX7) { - si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, - S_028084_ADDRESS(border_color_va >> 40)); - } - si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ, - RADEON_PRIO_BORDER_COLORS); - - if (sctx->chip_class >= GFX9) { - si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1, - S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) | - S_028C48_MAX_PRIM_PER_BATCH(1023)); - si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, - S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1)); - si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0); - } - - si_pm4_upload_indirect_buffer(sctx, pm4); - sctx->init_config = pm4; + struct si_screen *sscreen = sctx->screen; + uint64_t border_color_va = sctx->border_color_buffer->gpu_address; + bool has_clear_state = sscreen->info.has_clear_state; + struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); + + if (!pm4) + return; + + si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL); + si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1)); + si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1)); + si_pm4_cmd_end(pm4, false); + + if (has_clear_state) { + si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE); + si_pm4_cmd_add(pm4, 0); + si_pm4_cmd_end(pm4, false); + } + + if (sctx->chip_class <= GFX8) + si_set_raster_config(sctx, pm4); + + si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); + if (!has_clear_state) + si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); + + /* FIXME calculate these values somehow ??? */ + if (sctx->chip_class <= GFX8) { + si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES); + si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40); + } + + if (!has_clear_state) { + si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2); + si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); + si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); + } + + if (sscreen->info.chip_class <= GFX9) + si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); + if (!has_clear_state) + si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); + if (sctx->chip_class < GFX7) + si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, + S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1)); + + /* CLEAR_STATE doesn't restore these correctly. */ + si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); + si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR, + S_028244_BR_X(16384) | S_028244_BR_Y(16384)); + + /* CLEAR_STATE doesn't clear these correctly on certain generations. + * I don't know why. Deduced by trial and error. 
+ */ + if (sctx->chip_class <= GFX7 || !has_clear_state) { + si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); + si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); + si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0); + si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR, + S_028034_BR_X(16384) | S_028034_BR_Y(16384)); + } + + if (!has_clear_state) { + si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE, + S_028230_ER_TRI(0xA) | S_028230_ER_POINT(0xA) | S_028230_ER_RECT(0xA) | + /* Required by DX10_DIAMOND_TEST_ENA: */ + S_028230_ER_LINE_LR(0x1A) | S_028230_ER_LINE_RL(0x26) | + S_028230_ER_LINE_TB(0xA) | S_028230_ER_LINE_BT(0xA)); + si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0); + si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); + si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); + si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0); + si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0); + } + + if (sctx->chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0); + si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0); + si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0); + si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0); + } else if (sctx->chip_class == GFX9) { + si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0); + } else { + /* These registers, when written, also overwrite the CLEAR_STATE + * context, so we can't rely on CLEAR_STATE setting them. + * It would be an issue if there was another UMD changing them. + */ + si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); + } + + if (sctx->chip_class >= GFX7) { + if (sctx->chip_class >= GFX10) { + /* Logical CUs 16 - 31 */ + si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff)); + si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff)); + si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(0xffff)); + } + + if (sctx->chip_class >= GFX9) { + si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, + S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F)); + } else { + si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, + S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, + S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F)); + + /* If this is 0, Bonaire can hang even if GS isn't being used. + * Other chips are unaffected. These are suboptimal values, + * but we don't use on-chip GS. + */ + si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL, + S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4)); + } + + /* Compute LATE_ALLOC_VS.LIMIT. */ + unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; + unsigned late_alloc_wave64 = 0; /* The limit is per SH. */ + unsigned cu_mask_vs = 0xffff; + unsigned cu_mask_gs = 0xffff; + + if (sctx->chip_class >= GFX10) { + /* For Wave32, the hw will launch twice the number of late + * alloc waves, so 1 == 2x wave32. 
+ */ + if (!sscreen->info.use_late_alloc) { + late_alloc_wave64 = 0; + } else if (num_cu_per_sh <= 6) { + late_alloc_wave64 = num_cu_per_sh - 2; + } else { + late_alloc_wave64 = (num_cu_per_sh - 2) * 4; + + /* CU2 & CU3 disabled because of the dual CU design */ + /* Late alloc is not used for NGG on Navi14 due to a hw bug. */ + cu_mask_vs = 0xfff3; + cu_mask_gs = sscreen->use_ngg && sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff; + } + } else { + if (!sscreen->info.use_late_alloc) { + late_alloc_wave64 = 0; + } else if (num_cu_per_sh <= 4) { + /* Too few available compute units per SH. Disallowing + * VS to run on one CU could hurt us more than late VS + * allocation would help. + * + * 2 is the highest safe number that allows us to keep + * all CUs enabled. + */ + late_alloc_wave64 = 2; + } else { + /* This is a good initial value, allowing 1 late_alloc + * wave per SIMD on num_cu - 2. + */ + late_alloc_wave64 = (num_cu_per_sh - 2) * 4; + } + + if (late_alloc_wave64 > 2) + cu_mask_vs = 0xfffe; /* 1 CU disabled */ + } + + /* VS can't execute on one CU if the limit is > 2. */ + si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, + S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64)); + + si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F)); + + si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, + S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F)); + } + + if (sctx->chip_class >= GFX10) { + /* Break up a pixel wave if it contains deallocs for more than + * half the parameter cache. + * + * To avoid a deadlock where pixel waves aren't launched + * because they're waiting for more pixels while the frontend + * is stuck waiting for PC space, the maximum allowed value is + * the size of the PC minus the largest possible allocation for + * a single primitive shader subgroup. + */ + si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512)); + si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + + if (!has_clear_state) { + si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, + sscreen->info.pa_sc_tile_steering_override); + } + + /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. 
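/* Illustrative sketch, not part of the generated diff: the LATE_ALLOC_VS
 * limit selection from the hunk above, factored into a standalone helper so
 * the GFX10 and pre-GFX10 paths are easier to compare.  The helper name and
 * parameters are made up for illustration; the formulas mirror the driver
 * code.  Worked example: a GFX10 part with 10 good CUs per SH gets
 * (10 - 2) * 4 = 32 wave64 slots, while a GFX7-GFX9 part with only 4 CUs
 * per SH is clamped to 2 so that VS waves are not starved.
 */
static unsigned late_alloc_wave64_sketch(bool gfx10_plus, bool use_late_alloc,
                                         unsigned num_cu_per_sh)
{
   if (!use_late_alloc)
      return 0;

   if (gfx10_plus)
      return num_cu_per_sh <= 6 ? num_cu_per_sh - 2 : (num_cu_per_sh - 2) * 4;

   /* GFX7-GFX9: 2 is the highest limit that keeps all CUs enabled. */
   return num_cu_per_sh <= 4 ? 2 : (num_cu_per_sh - 2) * 4;
}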
*/ + unsigned meta_write_policy, meta_read_policy; + /* TODO: investigate whether LRU improves performance on other chips too */ + if (sscreen->info.num_render_backends <= 4) { + meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */ + meta_read_policy = V_02807C_CACHE_LRU_RD; /* cache reads */ + } else { + meta_write_policy = V_02807C_CACHE_STREAM_WR; /* write combine */ + meta_read_policy = V_02807C_CACHE_NOA_RD; /* don't cache reads */ + } + + si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL, + S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_HTILE_WR_POLICY(meta_write_policy) | + S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) | + S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) | + S_02807C_HTILE_RD_POLICY(meta_read_policy)); + + si_pm4_set_reg( + pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL, + S_028410_CMASK_WR_POLICY(meta_write_policy) | S_028410_FMASK_WR_POLICY(meta_write_policy) | + S_028410_DCC_WR_POLICY(meta_write_policy) | + S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) | + S_028410_CMASK_RD_POLICY(meta_read_policy) | + S_028410_FMASK_RD_POLICY(meta_read_policy) | S_028410_DCC_RD_POLICY(meta_read_policy) | + S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD)); + si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0); + + si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, + S_00B0C0_SOFT_GROUPING_EN(1) | S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); + si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0); + } + + if (sctx->chip_class >= GFX9) { + si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, + S_028B50_ACCUM_ISOLINE(40) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) | + S_028B50_DONUT_SPLIT(24) | S_028B50_TRAP_SPLIT(6)); + } else if (sctx->chip_class >= GFX8) { + unsigned vgt_tess_distribution; + + vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) | S_028B50_ACCUM_TRI(11) | + S_028B50_ACCUM_QUAD(11) | S_028B50_DONUT_SPLIT(16); + + /* Testing with Unigine Heaven extreme tesselation yielded best results + * with TRAP_SPLIT = 3. 
+ */ + if (sctx->family == CHIP_FIJI || sctx->family >= CHIP_POLARIS10) + vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3); + + si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution); + } else if (!has_clear_state) { + si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16); + } + + si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); + if (sctx->chip_class >= GFX7) { + si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40)); + } + si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ, RADEON_PRIO_BORDER_COLORS); + + if (sctx->chip_class >= GFX9) { + si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1, + S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) | + S_028C48_MAX_PRIM_PER_BATCH(1023)); + si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, + S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1)); + si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0); + } + + si_pm4_upload_indirect_buffer(sctx, pm4); + sctx->init_config = pm4; } diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 824bf4fef41..aa024b72e43 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -25,20 +25,19 @@ #ifndef SI_STATE_H #define SI_STATE_H -#include "si_pm4.h" - #include "pipebuffer/pb_slab.h" +#include "si_pm4.h" #include "util/u_blitter.h" -#define SI_NUM_GRAPHICS_SHADERS (PIPE_SHADER_TESS_EVAL+1) -#define SI_NUM_SHADERS (PIPE_SHADER_COMPUTE+1) +#define SI_NUM_GRAPHICS_SHADERS (PIPE_SHADER_TESS_EVAL + 1) +#define SI_NUM_SHADERS (PIPE_SHADER_COMPUTE + 1) -#define SI_NUM_VERTEX_BUFFERS SI_MAX_ATTRIBS -#define SI_NUM_SAMPLERS 32 /* OpenGL textures units per shader */ -#define SI_NUM_CONST_BUFFERS 16 -#define SI_NUM_IMAGES 16 -#define SI_NUM_IMAGE_SLOTS (SI_NUM_IMAGES * 2) /* the second half are FMASK slots */ -#define SI_NUM_SHADER_BUFFERS 16 +#define SI_NUM_VERTEX_BUFFERS SI_MAX_ATTRIBS +#define SI_NUM_SAMPLERS 32 /* OpenGL textures units per shader */ +#define SI_NUM_CONST_BUFFERS 16 +#define SI_NUM_IMAGES 16 +#define SI_NUM_IMAGE_SLOTS (SI_NUM_IMAGES * 2) /* the second half are FMASK slots */ +#define SI_NUM_SHADER_BUFFERS 16 struct si_screen; struct si_shader; @@ -48,351 +47,335 @@ struct si_texture; struct si_qbo_state; struct si_state_blend { - struct si_pm4_state pm4; - uint32_t cb_target_mask; - /* Set 0xf or 0x0 (4 bits) per render target if the following is - * true. ANDed with spi_shader_col_format. - */ - unsigned cb_target_enabled_4bit; - unsigned blend_enable_4bit; - unsigned need_src_alpha_4bit; - unsigned commutative_4bit; - unsigned dcc_msaa_corruption_4bit; - bool alpha_to_coverage:1; - bool alpha_to_one:1; - bool dual_src_blend:1; - bool logicop_enable:1; + struct si_pm4_state pm4; + uint32_t cb_target_mask; + /* Set 0xf or 0x0 (4 bits) per render target if the following is + * true. ANDed with spi_shader_col_format. 
+ */ + unsigned cb_target_enabled_4bit; + unsigned blend_enable_4bit; + unsigned need_src_alpha_4bit; + unsigned commutative_4bit; + unsigned dcc_msaa_corruption_4bit; + bool alpha_to_coverage : 1; + bool alpha_to_one : 1; + bool dual_src_blend : 1; + bool logicop_enable : 1; }; struct si_state_rasterizer { - struct si_pm4_state pm4; - /* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */ - struct si_pm4_state *pm4_poly_offset; - unsigned pa_sc_line_stipple; - unsigned pa_cl_clip_cntl; - float line_width; - float max_point_size; - unsigned sprite_coord_enable:8; - unsigned clip_plane_enable:8; - unsigned half_pixel_center:1; - unsigned flatshade:1; - unsigned flatshade_first:1; - unsigned two_side:1; - unsigned multisample_enable:1; - unsigned force_persample_interp:1; - unsigned line_stipple_enable:1; - unsigned poly_stipple_enable:1; - unsigned line_smooth:1; - unsigned poly_smooth:1; - unsigned uses_poly_offset:1; - unsigned clamp_fragment_color:1; - unsigned clamp_vertex_color:1; - unsigned rasterizer_discard:1; - unsigned scissor_enable:1; - unsigned clip_halfz:1; - unsigned cull_front:1; - unsigned cull_back:1; - unsigned depth_clamp_any:1; - unsigned provoking_vertex_first:1; - unsigned polygon_mode_enabled:1; - unsigned polygon_mode_is_lines:1; + struct si_pm4_state pm4; + /* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */ + struct si_pm4_state *pm4_poly_offset; + unsigned pa_sc_line_stipple; + unsigned pa_cl_clip_cntl; + float line_width; + float max_point_size; + unsigned sprite_coord_enable : 8; + unsigned clip_plane_enable : 8; + unsigned half_pixel_center : 1; + unsigned flatshade : 1; + unsigned flatshade_first : 1; + unsigned two_side : 1; + unsigned multisample_enable : 1; + unsigned force_persample_interp : 1; + unsigned line_stipple_enable : 1; + unsigned poly_stipple_enable : 1; + unsigned line_smooth : 1; + unsigned poly_smooth : 1; + unsigned uses_poly_offset : 1; + unsigned clamp_fragment_color : 1; + unsigned clamp_vertex_color : 1; + unsigned rasterizer_discard : 1; + unsigned scissor_enable : 1; + unsigned clip_halfz : 1; + unsigned cull_front : 1; + unsigned cull_back : 1; + unsigned depth_clamp_any : 1; + unsigned provoking_vertex_first : 1; + unsigned polygon_mode_enabled : 1; + unsigned polygon_mode_is_lines : 1; }; struct si_dsa_stencil_ref_part { - uint8_t valuemask[2]; - uint8_t writemask[2]; + uint8_t valuemask[2]; + uint8_t writemask[2]; }; struct si_dsa_order_invariance { - /** Whether the final result in Z/S buffers is guaranteed to be - * invariant under changes to the order in which fragments arrive. */ - bool zs:1; - - /** Whether the set of fragments that pass the combined Z/S test is - * guaranteed to be invariant under changes to the order in which - * fragments arrive. */ - bool pass_set:1; - - /** Whether the last fragment that passes the combined Z/S test at each - * sample is guaranteed to be invariant under changes to the order in - * which fragments arrive. */ - bool pass_last:1; + /** Whether the final result in Z/S buffers is guaranteed to be + * invariant under changes to the order in which fragments arrive. */ + bool zs : 1; + + /** Whether the set of fragments that pass the combined Z/S test is + * guaranteed to be invariant under changes to the order in which + * fragments arrive. */ + bool pass_set : 1; + + /** Whether the last fragment that passes the combined Z/S test at each + * sample is guaranteed to be invariant under changes to the order in + * which fragments arrive. 
*/ + bool pass_last : 1; }; struct si_state_dsa { - struct si_pm4_state pm4; - struct si_dsa_stencil_ref_part stencil_ref; - - /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */ - struct si_dsa_order_invariance order_invariance[2]; - - ubyte alpha_func:3; - bool depth_enabled:1; - bool depth_write_enabled:1; - bool stencil_enabled:1; - bool stencil_write_enabled:1; - bool db_can_write:1; - + struct si_pm4_state pm4; + struct si_dsa_stencil_ref_part stencil_ref; + + /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */ + struct si_dsa_order_invariance order_invariance[2]; + + ubyte alpha_func : 3; + bool depth_enabled : 1; + bool depth_write_enabled : 1; + bool stencil_enabled : 1; + bool stencil_write_enabled : 1; + bool db_can_write : 1; }; struct si_stencil_ref { - struct pipe_stencil_ref state; - struct si_dsa_stencil_ref_part dsa_part; + struct pipe_stencil_ref state; + struct si_dsa_stencil_ref_part dsa_part; }; -struct si_vertex_elements -{ - struct si_resource *instance_divisor_factor_buffer; - uint32_t rsrc_word3[SI_MAX_ATTRIBS]; - uint16_t src_offset[SI_MAX_ATTRIBS]; - uint8_t fix_fetch[SI_MAX_ATTRIBS]; - uint8_t format_size[SI_MAX_ATTRIBS]; - uint8_t vertex_buffer_index[SI_MAX_ATTRIBS]; - - /* Bitmask of elements that always need a fixup to be applied. */ - uint16_t fix_fetch_always; - - /* Bitmask of elements whose fetch should always be opencoded. */ - uint16_t fix_fetch_opencode; - - /* Bitmask of elements which need to be opencoded if the vertex buffer - * is unaligned. */ - uint16_t fix_fetch_unaligned; - - /* For elements in fix_fetch_unaligned: whether the effective - * element load size as seen by the hardware is a dword (as opposed - * to a short). - */ - uint16_t hw_load_is_dword; - - /* Bitmask of vertex buffers requiring alignment check */ - uint16_t vb_alignment_check_mask; - - uint8_t count; - bool uses_instance_divisors; - - uint16_t first_vb_use_mask; - /* Vertex buffer descriptor list size aligned for optimal prefetch. */ - uint16_t vb_desc_list_alloc_size; - uint16_t instance_divisor_is_one; /* bitmask of inputs */ - uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ +struct si_vertex_elements { + struct si_resource *instance_divisor_factor_buffer; + uint32_t rsrc_word3[SI_MAX_ATTRIBS]; + uint16_t src_offset[SI_MAX_ATTRIBS]; + uint8_t fix_fetch[SI_MAX_ATTRIBS]; + uint8_t format_size[SI_MAX_ATTRIBS]; + uint8_t vertex_buffer_index[SI_MAX_ATTRIBS]; + + /* Bitmask of elements that always need a fixup to be applied. */ + uint16_t fix_fetch_always; + + /* Bitmask of elements whose fetch should always be opencoded. */ + uint16_t fix_fetch_opencode; + + /* Bitmask of elements which need to be opencoded if the vertex buffer + * is unaligned. */ + uint16_t fix_fetch_unaligned; + + /* For elements in fix_fetch_unaligned: whether the effective + * element load size as seen by the hardware is a dword (as opposed + * to a short). + */ + uint16_t hw_load_is_dword; + + /* Bitmask of vertex buffers requiring alignment check */ + uint16_t vb_alignment_check_mask; + + uint8_t count; + bool uses_instance_divisors; + + uint16_t first_vb_use_mask; + /* Vertex buffer descriptor list size aligned for optimal prefetch. 
*/ + uint16_t vb_desc_list_alloc_size; + uint16_t instance_divisor_is_one; /* bitmask of inputs */ + uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ }; union si_state { - struct { - struct si_state_blend *blend; - struct si_state_rasterizer *rasterizer; - struct si_state_dsa *dsa; - struct si_pm4_state *poly_offset; - struct si_pm4_state *ls; - struct si_pm4_state *hs; - struct si_pm4_state *es; - struct si_pm4_state *gs; - struct si_pm4_state *vgt_shader_config; - struct si_pm4_state *vs; - struct si_pm4_state *ps; - } named; - struct si_pm4_state *array[0]; + struct { + struct si_state_blend *blend; + struct si_state_rasterizer *rasterizer; + struct si_state_dsa *dsa; + struct si_pm4_state *poly_offset; + struct si_pm4_state *ls; + struct si_pm4_state *hs; + struct si_pm4_state *es; + struct si_pm4_state *gs; + struct si_pm4_state *vgt_shader_config; + struct si_pm4_state *vs; + struct si_pm4_state *ps; + } named; + struct si_pm4_state *array[0]; }; -#define SI_STATE_IDX(name) \ - (offsetof(union si_state, named.name) / sizeof(struct si_pm4_state *)) +#define SI_STATE_IDX(name) (offsetof(union si_state, named.name) / sizeof(struct si_pm4_state *)) #define SI_STATE_BIT(name) (1 << SI_STATE_IDX(name)) -#define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *)) +#define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *)) static inline unsigned si_states_that_always_roll_context(void) { - return (SI_STATE_BIT(blend) | - SI_STATE_BIT(rasterizer) | - SI_STATE_BIT(dsa) | - SI_STATE_BIT(poly_offset) | - SI_STATE_BIT(vgt_shader_config)); + return (SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(dsa) | + SI_STATE_BIT(poly_offset) | SI_STATE_BIT(vgt_shader_config)); } union si_state_atoms { - struct { - /* The order matters. */ - struct si_atom render_cond; - struct si_atom streamout_begin; - struct si_atom streamout_enable; /* must be after streamout_begin */ - struct si_atom framebuffer; - struct si_atom msaa_sample_locs; - struct si_atom db_render_state; - struct si_atom dpbb_state; - struct si_atom msaa_config; - struct si_atom sample_mask; - struct si_atom cb_render_state; - struct si_atom blend_color; - struct si_atom clip_regs; - struct si_atom clip_state; - struct si_atom shader_pointers; - struct si_atom guardband; - struct si_atom scissors; - struct si_atom viewports; - struct si_atom stencil_ref; - struct si_atom spi_map; - struct si_atom scratch_state; - struct si_atom window_rectangles; - struct si_atom shader_query; - } s; - struct si_atom array[0]; + struct { + /* The order matters. 
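/* Illustrative sketch, not part of the generated diff: how the si_state
 * union and SI_STATE_IDX/SI_STATE_BIT above fit together.  Because array[0]
 * overlays the named struct, every named pointer is also reachable by index;
 * with the member order above, SI_STATE_IDX(dsa) == 2 and
 * SI_STATE_BIT(dsa) == 0x4.  The walker below is a hypothetical consumer,
 * not driver code, and relies only on the definitions in this header.
 */
static unsigned count_queued_states_sketch(const union si_state *queued)
{
   unsigned count = 0;

   for (unsigned i = 0; i < SI_NUM_STATES; i++) {
      /* queued->array[i] aliases the i-th named member (blend, rasterizer,
       * dsa, ...), so generic code can walk every bound state uniformly. */
      if (queued->array[i])
         count++;
   }
   return count;
}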
*/ + struct si_atom render_cond; + struct si_atom streamout_begin; + struct si_atom streamout_enable; /* must be after streamout_begin */ + struct si_atom framebuffer; + struct si_atom msaa_sample_locs; + struct si_atom db_render_state; + struct si_atom dpbb_state; + struct si_atom msaa_config; + struct si_atom sample_mask; + struct si_atom cb_render_state; + struct si_atom blend_color; + struct si_atom clip_regs; + struct si_atom clip_state; + struct si_atom shader_pointers; + struct si_atom guardband; + struct si_atom scissors; + struct si_atom viewports; + struct si_atom stencil_ref; + struct si_atom spi_map; + struct si_atom scratch_state; + struct si_atom window_rectangles; + struct si_atom shader_query; + } s; + struct si_atom array[0]; }; -#define SI_ATOM_BIT(name) (1 << (offsetof(union si_state_atoms, s.name) / \ - sizeof(struct si_atom))) -#define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct si_atom*)) +#define SI_ATOM_BIT(name) (1 << (offsetof(union si_state_atoms, s.name) / sizeof(struct si_atom))) +#define SI_NUM_ATOMS (sizeof(union si_state_atoms) / sizeof(struct si_atom *)) static inline unsigned si_atoms_that_always_roll_context(void) { - return (SI_ATOM_BIT(streamout_begin) | - SI_ATOM_BIT(streamout_enable) | - SI_ATOM_BIT(framebuffer) | - SI_ATOM_BIT(msaa_sample_locs) | - SI_ATOM_BIT(sample_mask) | - SI_ATOM_BIT(blend_color) | - SI_ATOM_BIT(clip_state) | - SI_ATOM_BIT(scissors) | - SI_ATOM_BIT(viewports) | - SI_ATOM_BIT(stencil_ref) | - SI_ATOM_BIT(scratch_state) | - SI_ATOM_BIT(window_rectangles)); + return (SI_ATOM_BIT(streamout_begin) | SI_ATOM_BIT(streamout_enable) | SI_ATOM_BIT(framebuffer) | + SI_ATOM_BIT(msaa_sample_locs) | SI_ATOM_BIT(sample_mask) | SI_ATOM_BIT(blend_color) | + SI_ATOM_BIT(clip_state) | SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports) | + SI_ATOM_BIT(stencil_ref) | SI_ATOM_BIT(scratch_state) | SI_ATOM_BIT(window_rectangles)); } struct si_shader_data { - uint32_t sh_base[SI_NUM_SHADERS]; + uint32_t sh_base[SI_NUM_SHADERS]; }; -#define SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK \ - (S_02881C_USE_VTX_POINT_SIZE(1) | \ - S_02881C_USE_VTX_EDGE_FLAG(1) | \ - S_02881C_USE_VTX_RENDER_TARGET_INDX(1) | \ - S_02881C_USE_VTX_VIEWPORT_INDX(1) | \ - S_02881C_VS_OUT_MISC_VEC_ENA(1) | \ - S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1)) +#define SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK \ + (S_02881C_USE_VTX_POINT_SIZE(1) | S_02881C_USE_VTX_EDGE_FLAG(1) | \ + S_02881C_USE_VTX_RENDER_TARGET_INDX(1) | S_02881C_USE_VTX_VIEWPORT_INDX(1) | \ + S_02881C_VS_OUT_MISC_VEC_ENA(1) | S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1)) /* The list of registers whose emitted values are remembered by si_context. 
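/* Worked example, not part of the generated diff: SI_ATOM_BIT uses the same
 * offsetof trick as SI_STATE_BIT.  With the member order above,
 * SI_ATOM_BIT(render_cond) == 0x1 and SI_ATOM_BIT(framebuffer) == 0x8, and
 * si_atoms_that_always_roll_context() is simply an OR of such bits.  The
 * ordering is significant; streamout_enable must come after streamout_begin,
 * as noted in the struct itself.
 */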
*/ -enum si_tracked_reg { - SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */ - SI_TRACKED_DB_COUNT_CONTROL, +enum si_tracked_reg +{ + SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */ + SI_TRACKED_DB_COUNT_CONTROL, - SI_TRACKED_DB_RENDER_OVERRIDE2, - SI_TRACKED_DB_SHADER_CONTROL, + SI_TRACKED_DB_RENDER_OVERRIDE2, + SI_TRACKED_DB_SHADER_CONTROL, - SI_TRACKED_CB_TARGET_MASK, - SI_TRACKED_CB_DCC_CONTROL, + SI_TRACKED_CB_TARGET_MASK, + SI_TRACKED_CB_DCC_CONTROL, - SI_TRACKED_SX_PS_DOWNCONVERT, /* 3 consecutive registers */ - SI_TRACKED_SX_BLEND_OPT_EPSILON, - SI_TRACKED_SX_BLEND_OPT_CONTROL, + SI_TRACKED_SX_PS_DOWNCONVERT, /* 3 consecutive registers */ + SI_TRACKED_SX_BLEND_OPT_EPSILON, + SI_TRACKED_SX_BLEND_OPT_CONTROL, - SI_TRACKED_PA_SC_LINE_CNTL, /* 2 consecutive registers */ - SI_TRACKED_PA_SC_AA_CONFIG, + SI_TRACKED_PA_SC_LINE_CNTL, /* 2 consecutive registers */ + SI_TRACKED_PA_SC_AA_CONFIG, - SI_TRACKED_DB_EQAA, - SI_TRACKED_PA_SC_MODE_CNTL_1, + SI_TRACKED_DB_EQAA, + SI_TRACKED_PA_SC_MODE_CNTL_1, - SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, - SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, + SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, + SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, /* set with SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK*/ - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, /* set with ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */ - SI_TRACKED_PA_CL_CLIP_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, /* set with SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK*/ + SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, /* set with ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */ + SI_TRACKED_PA_CL_CLIP_CNTL, - SI_TRACKED_PA_SC_BINNER_CNTL_0, - SI_TRACKED_DB_DFSM_CONTROL, + SI_TRACKED_PA_SC_BINNER_CNTL_0, + SI_TRACKED_DB_DFSM_CONTROL, - SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, /* 4 consecutive registers */ - SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ, - SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ, - SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ, + SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, /* 4 consecutive registers */ + SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ, + SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ, + SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ, - SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET, - SI_TRACKED_PA_SU_VTX_CNTL, + SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET, + SI_TRACKED_PA_SU_VTX_CNTL, - SI_TRACKED_PA_SC_CLIPRECT_RULE, + SI_TRACKED_PA_SC_CLIPRECT_RULE, - SI_TRACKED_PA_SC_LINE_STIPPLE, + SI_TRACKED_PA_SC_LINE_STIPPLE, - SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, + SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, - SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 3 consecutive registers */ - SI_TRACKED_VGT_GSVS_RING_OFFSET_2, - SI_TRACKED_VGT_GSVS_RING_OFFSET_3, + SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 3 consecutive registers */ + SI_TRACKED_VGT_GSVS_RING_OFFSET_2, + SI_TRACKED_VGT_GSVS_RING_OFFSET_3, - SI_TRACKED_VGT_GSVS_RING_ITEMSIZE, - SI_TRACKED_VGT_GS_MAX_VERT_OUT, + SI_TRACKED_VGT_GSVS_RING_ITEMSIZE, + SI_TRACKED_VGT_GS_MAX_VERT_OUT, - SI_TRACKED_VGT_GS_VERT_ITEMSIZE, /* 4 consecutive registers */ - SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1, - SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2, - SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3, + SI_TRACKED_VGT_GS_VERT_ITEMSIZE, /* 4 consecutive registers */ + SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1, + SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2, + SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3, - SI_TRACKED_VGT_GS_INSTANCE_CNT, - SI_TRACKED_VGT_GS_ONCHIP_CNTL, - SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, - SI_TRACKED_VGT_GS_MODE, - SI_TRACKED_VGT_PRIMITIVEID_EN, - SI_TRACKED_VGT_REUSE_OFF, - SI_TRACKED_SPI_VS_OUT_CONFIG, - SI_TRACKED_PA_CL_VTE_CNTL, - SI_TRACKED_PA_CL_NGG_CNTL, - SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, - 
SI_TRACKED_GE_NGG_SUBGRP_CNTL, + SI_TRACKED_VGT_GS_INSTANCE_CNT, + SI_TRACKED_VGT_GS_ONCHIP_CNTL, + SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, + SI_TRACKED_VGT_GS_MODE, + SI_TRACKED_VGT_PRIMITIVEID_EN, + SI_TRACKED_VGT_REUSE_OFF, + SI_TRACKED_SPI_VS_OUT_CONFIG, + SI_TRACKED_PA_CL_VTE_CNTL, + SI_TRACKED_PA_CL_NGG_CNTL, + SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, + SI_TRACKED_GE_NGG_SUBGRP_CNTL, - SI_TRACKED_SPI_SHADER_IDX_FORMAT, /* 2 consecutive registers */ - SI_TRACKED_SPI_SHADER_POS_FORMAT, + SI_TRACKED_SPI_SHADER_IDX_FORMAT, /* 2 consecutive registers */ + SI_TRACKED_SPI_SHADER_POS_FORMAT, - SI_TRACKED_SPI_PS_INPUT_ENA, /* 2 consecutive registers */ - SI_TRACKED_SPI_PS_INPUT_ADDR, + SI_TRACKED_SPI_PS_INPUT_ENA, /* 2 consecutive registers */ + SI_TRACKED_SPI_PS_INPUT_ADDR, - SI_TRACKED_SPI_BARYC_CNTL, - SI_TRACKED_SPI_PS_IN_CONTROL, + SI_TRACKED_SPI_BARYC_CNTL, + SI_TRACKED_SPI_PS_IN_CONTROL, - SI_TRACKED_SPI_SHADER_Z_FORMAT, /* 2 consecutive registers */ - SI_TRACKED_SPI_SHADER_COL_FORMAT, + SI_TRACKED_SPI_SHADER_Z_FORMAT, /* 2 consecutive registers */ + SI_TRACKED_SPI_SHADER_COL_FORMAT, - SI_TRACKED_CB_SHADER_MASK, - SI_TRACKED_VGT_TF_PARAM, - SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + SI_TRACKED_CB_SHADER_MASK, + SI_TRACKED_VGT_TF_PARAM, + SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, - SI_TRACKED_GE_PC_ALLOC, + SI_TRACKED_GE_PC_ALLOC, - SI_NUM_TRACKED_REGS, + SI_NUM_TRACKED_REGS, }; struct si_tracked_regs { - uint64_t reg_saved; - uint32_t reg_value[SI_NUM_TRACKED_REGS]; - uint32_t spi_ps_input_cntl[32]; + uint64_t reg_saved; + uint32_t reg_value[SI_NUM_TRACKED_REGS]; + uint32_t spi_ps_input_cntl[32]; }; /* Private read-write buffer slots. */ -enum { - SI_ES_RING_ESGS, - SI_GS_RING_ESGS, +enum +{ + SI_ES_RING_ESGS, + SI_GS_RING_ESGS, - SI_RING_GSVS, + SI_RING_GSVS, - SI_VS_STREAMOUT_BUF0, - SI_VS_STREAMOUT_BUF1, - SI_VS_STREAMOUT_BUF2, - SI_VS_STREAMOUT_BUF3, + SI_VS_STREAMOUT_BUF0, + SI_VS_STREAMOUT_BUF1, + SI_VS_STREAMOUT_BUF2, + SI_VS_STREAMOUT_BUF3, - SI_HS_CONST_DEFAULT_TESS_LEVELS, - SI_VS_CONST_INSTANCE_DIVISORS, - SI_VS_CONST_CLIP_PLANES, - SI_PS_CONST_POLY_STIPPLE, - SI_PS_CONST_SAMPLE_POSITIONS, + SI_HS_CONST_DEFAULT_TESS_LEVELS, + SI_VS_CONST_INSTANCE_DIVISORS, + SI_VS_CONST_CLIP_PLANES, + SI_PS_CONST_POLY_STIPPLE, + SI_PS_CONST_SAMPLE_POSITIONS, - /* Image descriptor of color buffer 0 for KHR_blend_equation_advanced. */ - SI_PS_IMAGE_COLORBUF0, - SI_PS_IMAGE_COLORBUF0_HI, - SI_PS_IMAGE_COLORBUF0_FMASK, - SI_PS_IMAGE_COLORBUF0_FMASK_HI, + /* Image descriptor of color buffer 0 for KHR_blend_equation_advanced. 
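/* Illustrative sketch, not part of the generated diff: a hypothetical
 * consumer of si_tracked_regs showing what reg_saved and reg_value are for.
 * A bit in reg_saved marks "a value for this register has already been
 * emitted" and reg_value caches that value, so redundant register writes
 * can be skipped.  The helper name and exact flow are assumptions; the real
 * emit-side helpers live in the driver's command-stream code.
 */
static bool tracked_reg_changed_sketch(struct si_tracked_regs *t,
                                       enum si_tracked_reg reg, uint32_t value)
{
   if ((t->reg_saved & (1ull << reg)) && t->reg_value[reg] == value)
      return false; /* same value already emitted, the write can be skipped */

   t->reg_saved |= 1ull << reg;
   t->reg_value[reg] = value;
   return true; /* caller should emit the register */
}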
*/ + SI_PS_IMAGE_COLORBUF0, + SI_PS_IMAGE_COLORBUF0_HI, + SI_PS_IMAGE_COLORBUF0_FMASK, + SI_PS_IMAGE_COLORBUF0_FMASK_HI, - GFX10_GS_QUERY_BUF, + GFX10_GS_QUERY_BUF, - SI_NUM_RW_BUFFERS, + SI_NUM_RW_BUFFERS, }; /* Indices into sctx->descriptors, laid out so that gfx and compute pipelines @@ -406,122 +389,111 @@ enum { * 11 - compute const and shader buffers * 12 - compute samplers and images */ -enum { - SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, - SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, - SI_NUM_SHADER_DESCS, +enum +{ + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, + SI_NUM_SHADER_DESCS, }; -#define SI_DESCS_RW_BUFFERS 0 -#define SI_DESCS_FIRST_SHADER 1 -#define SI_DESCS_FIRST_COMPUTE (SI_DESCS_FIRST_SHADER + \ - PIPE_SHADER_COMPUTE * SI_NUM_SHADER_DESCS) -#define SI_NUM_DESCS (SI_DESCS_FIRST_SHADER + \ - SI_NUM_SHADERS * SI_NUM_SHADER_DESCS) +#define SI_DESCS_RW_BUFFERS 0 +#define SI_DESCS_FIRST_SHADER 1 +#define SI_DESCS_FIRST_COMPUTE (SI_DESCS_FIRST_SHADER + PIPE_SHADER_COMPUTE * SI_NUM_SHADER_DESCS) +#define SI_NUM_DESCS (SI_DESCS_FIRST_SHADER + SI_NUM_SHADERS * SI_NUM_SHADER_DESCS) -#define SI_DESCS_SHADER_MASK(name) \ - u_bit_consecutive(SI_DESCS_FIRST_SHADER + \ - PIPE_SHADER_##name * SI_NUM_SHADER_DESCS, \ - SI_NUM_SHADER_DESCS) +#define SI_DESCS_SHADER_MASK(name) \ + u_bit_consecutive(SI_DESCS_FIRST_SHADER + PIPE_SHADER_##name * SI_NUM_SHADER_DESCS, \ + SI_NUM_SHADER_DESCS) -static inline unsigned -si_const_and_shader_buffer_descriptors_idx(unsigned shader) +static inline unsigned si_const_and_shader_buffer_descriptors_idx(unsigned shader) { - return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS + - SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS; + return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS + + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS; } -static inline unsigned -si_sampler_and_image_descriptors_idx(unsigned shader) +static inline unsigned si_sampler_and_image_descriptors_idx(unsigned shader) { - return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS + - SI_SHADER_DESCS_SAMPLERS_AND_IMAGES; + return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS + + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES; } /* This represents descriptors in memory, such as buffer resources, * image resources, and sampler states. */ struct si_descriptors { - /* The list of descriptors in malloc'd memory. */ - uint32_t *list; - /* The list in mapped GPU memory. */ - uint32_t *gpu_list; - - /* The buffer where the descriptors have been uploaded. */ - struct si_resource *buffer; - uint64_t gpu_address; - - /* The maximum number of descriptors. */ - uint32_t num_elements; - - /* Slots that are used by currently-bound shaders. - * It determines which slots are uploaded. - */ - uint32_t first_active_slot; - uint32_t num_active_slots; - - /* The SH register offset relative to USER_DATA*_0 where the pointer - * to the descriptor array will be stored. */ - short shader_userdata_offset; - /* The size of one descriptor. */ - ubyte element_dw_size; - /* If there is only one slot enabled, bind it directly instead of - * uploading descriptors. -1 if disabled. */ - signed char slot_index_to_bind_directly; + /* The list of descriptors in malloc'd memory. */ + uint32_t *list; + /* The list in mapped GPU memory. */ + uint32_t *gpu_list; + + /* The buffer where the descriptors have been uploaded. */ + struct si_resource *buffer; + uint64_t gpu_address; + + /* The maximum number of descriptors. */ + uint32_t num_elements; + + /* Slots that are used by currently-bound shaders. 
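/* Worked example, not part of the generated diff: the descriptor-set
 * indexing above matches the layout comment ("11 - compute const and shader
 * buffers, 12 - compute samplers and images").  With
 * SI_DESCS_FIRST_SHADER == 1, SI_NUM_SHADER_DESCS == 2 and Gallium's
 * PIPE_SHADER_COMPUTE == 5:
 *
 *    si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_COMPUTE)
 *       == 1 + 5 * 2 + 0 == 11
 *    si_sampler_and_image_descriptors_idx(PIPE_SHADER_COMPUTE)
 *       == 1 + 5 * 2 + 1 == 12
 *
 * The first of these is exactly SI_DESCS_FIRST_COMPUTE, while index 0 stays
 * reserved for the internal read-write buffers (SI_DESCS_RW_BUFFERS).
 */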
+ * It determines which slots are uploaded. + */ + uint32_t first_active_slot; + uint32_t num_active_slots; + + /* The SH register offset relative to USER_DATA*_0 where the pointer + * to the descriptor array will be stored. */ + short shader_userdata_offset; + /* The size of one descriptor. */ + ubyte element_dw_size; + /* If there is only one slot enabled, bind it directly instead of + * uploading descriptors. -1 if disabled. */ + signed char slot_index_to_bind_directly; }; struct si_buffer_resources { - struct pipe_resource **buffers; /* this has num_buffers elements */ - unsigned *offsets; /* this has num_buffers elements */ + struct pipe_resource **buffers; /* this has num_buffers elements */ + unsigned *offsets; /* this has num_buffers elements */ - enum radeon_bo_priority priority:6; - enum radeon_bo_priority priority_constbuf:6; + enum radeon_bo_priority priority : 6; + enum radeon_bo_priority priority_constbuf : 6; - /* The i-th bit is set if that element is enabled (non-NULL resource). */ - unsigned enabled_mask; - unsigned writable_mask; + /* The i-th bit is set if that element is enabled (non-NULL resource). */ + unsigned enabled_mask; + unsigned writable_mask; }; -#define si_pm4_state_changed(sctx, member) \ - ((sctx)->queued.named.member != (sctx)->emitted.named.member) +#define si_pm4_state_changed(sctx, member) \ + ((sctx)->queued.named.member != (sctx)->emitted.named.member) -#define si_pm4_state_enabled_and_changed(sctx, member) \ - ((sctx)->queued.named.member && si_pm4_state_changed(sctx, member)) +#define si_pm4_state_enabled_and_changed(sctx, member) \ + ((sctx)->queued.named.member && si_pm4_state_changed(sctx, member)) -#define si_pm4_bind_state(sctx, member, value) \ - do { \ - (sctx)->queued.named.member = (value); \ - (sctx)->dirty_states |= SI_STATE_BIT(member); \ - } while(0) +#define si_pm4_bind_state(sctx, member, value) \ + do { \ + (sctx)->queued.named.member = (value); \ + (sctx)->dirty_states |= SI_STATE_BIT(member); \ + } while (0) -#define si_pm4_delete_state(sctx, member, value) \ - do { \ - if ((sctx)->queued.named.member == (value)) { \ - (sctx)->queued.named.member = NULL; \ - } \ - si_pm4_free_state(sctx, (struct si_pm4_state *)(value), \ - SI_STATE_IDX(member)); \ - } while(0) +#define si_pm4_delete_state(sctx, member, value) \ + do { \ + if ((sctx)->queued.named.member == (value)) { \ + (sctx)->queued.named.member = NULL; \ + } \ + si_pm4_free_state(sctx, (struct si_pm4_state *)(value), SI_STATE_IDX(member)); \ + } while (0) /* si_descriptors.c */ -void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, - struct si_texture *tex, - const struct legacy_surf_level *base_level_info, - unsigned base_level, unsigned first_level, - unsigned block_width, bool is_stencil, - uint32_t *state); +void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex, + const struct legacy_surf_level *base_level_info, + unsigned base_level, unsigned first_level, unsigned block_width, + bool is_stencil, uint32_t *state); void si_update_ps_colorbuf0_slot(struct si_context *sctx); -void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, - uint slot, struct pipe_constant_buffer *cbuf); -void si_get_shader_buffers(struct si_context *sctx, - enum pipe_shader_type shader, - uint start_slot, uint count, - struct pipe_shader_buffer *sbuf); -void si_set_ring_buffer(struct si_context *sctx, uint slot, - struct pipe_resource *buffer, - unsigned stride, unsigned num_records, - bool add_tid, bool swizzle, - unsigned element_size, 
unsigned index_stride, uint64_t offset); +void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot, + struct pipe_constant_buffer *cbuf); +void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot, + uint count, struct pipe_shader_buffer *sbuf); +void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource *buffer, + unsigned stride, unsigned num_records, bool add_tid, bool swizzle, + unsigned element_size, unsigned index_stride, uint64_t offset); void si_init_all_descriptors(struct si_context *sctx); bool si_upload_vertex_buffer_descriptors(struct si_context *sctx); bool si_upload_graphics_shader_descriptors(struct si_context *sctx); @@ -530,102 +502,84 @@ void si_release_all_descriptors(struct si_context *sctx); void si_gfx_resources_add_all_to_bo_list(struct si_context *sctx); void si_compute_resources_add_all_to_bo_list(struct si_context *sctx); void si_all_descriptors_begin_new_cs(struct si_context *sctx); -void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf, - const uint8_t *ptr, unsigned size, uint32_t *const_offset); +void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf, const uint8_t *ptr, + unsigned size, uint32_t *const_offset); void si_update_all_texture_descriptors(struct si_context *sctx); void si_shader_change_notify(struct si_context *sctx); void si_update_needs_color_decompress_masks(struct si_context *sctx); void si_emit_graphics_shader_pointers(struct si_context *sctx); void si_emit_compute_shader_pointers(struct si_context *sctx); -void si_set_rw_buffer(struct si_context *sctx, - uint slot, const struct pipe_constant_buffer *input); +void si_set_rw_buffer(struct si_context *sctx, uint slot, const struct pipe_constant_buffer *input); void si_set_rw_shader_buffer(struct si_context *sctx, uint slot, - const struct pipe_shader_buffer *sbuffer); + const struct pipe_shader_buffer *sbuffer); void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, - uint64_t new_active_mask); -void si_set_active_descriptors_for_shader(struct si_context *sctx, - struct si_shader_selector *sel); -bool si_bindless_descriptor_can_reclaim_slab(void *priv, - struct pb_slab_entry *entry); -struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap, - unsigned entry_size, - unsigned group_index); + uint64_t new_active_mask); +void si_set_active_descriptors_for_shader(struct si_context *sctx, struct si_shader_selector *sel); +bool si_bindless_descriptor_can_reclaim_slab(void *priv, struct pb_slab_entry *entry); +struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap, unsigned entry_size, + unsigned group_index); void si_bindless_descriptor_slab_free(void *priv, struct pb_slab *pslab); void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf); /* si_state.c */ void si_init_state_compute_functions(struct si_context *sctx); void si_init_state_functions(struct si_context *sctx); void si_init_screen_state_functions(struct si_screen *sscreen); -void -si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf, - enum pipe_format format, - unsigned offset, unsigned size, - uint32_t *state); -struct pipe_sampler_view * -si_create_sampler_view_custom(struct pipe_context *ctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *state, - unsigned width0, unsigned height0, - unsigned force_level); +void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf, + 
enum pipe_format format, unsigned offset, unsigned size, + uint32_t *state); +struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state, + unsigned width0, unsigned height0, + unsigned force_level); void si_update_fb_dirtiness_after_rendering(struct si_context *sctx); void si_update_ps_iter_samples(struct si_context *sctx); void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st); void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st); -void si_set_occlusion_query_state(struct si_context *sctx, - bool old_perfect_enable); +void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enable); struct si_fast_udiv_info32 { unsigned multiplier; /* the "magic number" multiplier */ - unsigned pre_shift; /* shift for the dividend before multiplying */ + unsigned pre_shift; /* shift for the dividend before multiplying */ unsigned post_shift; /* shift for the dividend after multiplying */ - int increment; /* 0 or 1; if set then increment the numerator, using one of - the two strategies */ + int increment; /* 0 or 1; if set then increment the numerator, using one of + the two strategies */ }; -struct si_fast_udiv_info32 -si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits); +struct si_fast_udiv_info32 si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits); /* si_state_binning.c */ void si_emit_dpbb_state(struct si_context *sctx); /* si_state_shaders.c */ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, - unsigned char ir_sha1_cache_key[20]); -bool si_shader_cache_load_shader(struct si_screen *sscreen, - unsigned char ir_sha1_cache_key[20], - struct si_shader *shader); -void si_shader_cache_insert_shader(struct si_screen *sscreen, - unsigned char ir_sha1_cache_key[20], - struct si_shader *shader, - bool insert_into_disk_cache); + unsigned char ir_sha1_cache_key[20]); +bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], + struct si_shader *shader); +void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], + struct si_shader *shader, bool insert_into_disk_cache); bool si_update_shaders(struct si_context *sctx); void si_init_screen_live_shader_cache(struct si_screen *sscreen); void si_init_shader_functions(struct si_context *sctx); bool si_init_shader_cache(struct si_screen *sscreen); void si_destroy_shader_cache(struct si_screen *sscreen); void si_schedule_initial_compile(struct si_context *sctx, unsigned processor, - struct util_queue_fence *ready_fence, - struct si_compiler_ctx_state *compiler_ctx_state, - void *job, util_queue_execute_func execute); -void si_get_active_slot_masks(const struct si_shader_info *info, - uint32_t *const_and_shader_buffers, - uint64_t *samplers_and_images); -int si_shader_select_with_key(struct si_screen *sscreen, - struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key, - int thread_index, - bool optimized_or_none); -void si_shader_selector_key_vs(struct si_context *sctx, - struct si_shader_selector *vs, - struct si_shader_key *key, - struct si_vs_prolog_bits *prolog_key); + struct util_queue_fence *ready_fence, + struct si_compiler_ctx_state *compiler_ctx_state, void *job, + util_queue_execute_func execute); +void si_get_active_slot_masks(const struct si_shader_info *info, uint32_t *const_and_shader_buffers, + uint64_t *samplers_and_images); +int 
si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, + struct si_compiler_ctx_state *compiler_state, + struct si_shader_key *key, int thread_index, bool optimized_or_none); +void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); unsigned si_get_input_prim(const struct si_shader_selector *gs); bool si_update_ngg(struct si_context *sctx); /* si_state_draw.c */ void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, - unsigned cp_coher_cntl); + unsigned cp_coher_cntl); void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx); void gfx10_emit_cache_flush(struct si_context *sctx); void si_emit_cache_flush(struct si_context *sctx); @@ -639,35 +593,33 @@ void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples); /* si_state_streamout.c */ void si_streamout_buffers_dirty(struct si_context *sctx); void si_emit_streamout_end(struct si_context *sctx); -void si_update_prims_generated_query_state(struct si_context *sctx, - unsigned type, int diff); +void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff); void si_init_streamout_functions(struct si_context *sctx); - static inline unsigned si_get_constbuf_slot(unsigned slot) { - /* Constant buffers are in slots [16..31], ascending */ - return SI_NUM_SHADER_BUFFERS + slot; + /* Constant buffers are in slots [16..31], ascending */ + return SI_NUM_SHADER_BUFFERS + slot; } static inline unsigned si_get_shaderbuf_slot(unsigned slot) { - /* shader buffers are in slots [15..0], descending */ - return SI_NUM_SHADER_BUFFERS - 1 - slot; + /* shader buffers are in slots [15..0], descending */ + return SI_NUM_SHADER_BUFFERS - 1 - slot; } static inline unsigned si_get_sampler_slot(unsigned slot) { - /* 32 samplers are in sampler slots [16..47], 16 dw per slot, ascending */ - /* those are equivalent to image slots [32..95], 8 dw per slot, ascending */ - return SI_NUM_IMAGE_SLOTS / 2 + slot; + /* 32 samplers are in sampler slots [16..47], 16 dw per slot, ascending */ + /* those are equivalent to image slots [32..95], 8 dw per slot, ascending */ + return SI_NUM_IMAGE_SLOTS / 2 + slot; } static inline unsigned si_get_image_slot(unsigned slot) { - /* image slots are in [31..0] (sampler slots [15..0]), descending */ - /* images are in slots [31..16], while FMASKs are in slots [15..0] */ - return SI_NUM_IMAGE_SLOTS - 1 - slot; + /* image slots are in [31..0] (sampler slots [15..0]), descending */ + /* images are in slots [31..16], while FMASKs are in slots [15..0] */ + return SI_NUM_IMAGE_SLOTS - 1 - slot; } #endif diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c index 1251b53785b..39bb94366f2 100644 --- a/src/gallium/drivers/radeonsi/si_state_binning.c +++ b/src/gallium/drivers/radeonsi/si_state_binning.c @@ -28,577 +28,548 @@ #include "sid.h" struct uvec2 { - unsigned x, y; + unsigned x, y; }; struct si_bin_size_map { - unsigned start; - unsigned bin_size_x; - unsigned bin_size_y; + unsigned start; + unsigned bin_size_x; + unsigned bin_size_y; }; typedef struct si_bin_size_map si_bin_size_subtable[3][10]; /* Find the bin size where sum is >= table[i].start and < table[i + 1].start. 
*/ -static struct uvec2 si_find_bin_size(struct si_screen *sscreen, - const si_bin_size_subtable table[], - unsigned sum) +static struct uvec2 si_find_bin_size(struct si_screen *sscreen, const si_bin_size_subtable table[], + unsigned sum) { - unsigned log_num_rb_per_se = - util_logbase2_ceil(sscreen->info.num_render_backends / - sscreen->info.max_se); - unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se); - unsigned i; - - /* Get the chip-specific subtable. */ - const struct si_bin_size_map *subtable = - &table[log_num_rb_per_se][log_num_se][0]; - - for (i = 0; subtable[i].bin_size_x != 0; i++) { - if (sum >= subtable[i].start && sum < subtable[i + 1].start) - break; - } - - struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y}; - return size; + unsigned log_num_rb_per_se = + util_logbase2_ceil(sscreen->info.num_render_backends / sscreen->info.max_se); + unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se); + unsigned i; + + /* Get the chip-specific subtable. */ + const struct si_bin_size_map *subtable = &table[log_num_rb_per_se][log_num_se][0]; + + for (i = 0; subtable[i].bin_size_x != 0; i++) { + if (sum >= subtable[i].start && sum < subtable[i + 1].start) + break; + } + + struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y}; + return size; } -static struct uvec2 si_get_color_bin_size(struct si_context *sctx, - unsigned cb_target_enabled_4bit) +static struct uvec2 si_get_color_bin_size(struct si_context *sctx, unsigned cb_target_enabled_4bit) { - unsigned num_fragments = sctx->framebuffer.nr_color_samples; - unsigned sum = 0; - - /* Compute the sum of all Bpp. */ - for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (!(cb_target_enabled_4bit & (0xf << (i * 4)))) - continue; - - struct si_texture *tex = - (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture; - sum += tex->surface.bpe; - } - - /* Multiply the sum by some function of the number of samples. 
*/ - if (num_fragments >= 2) { - if (si_get_ps_iter_samples(sctx) >= 2) - sum *= num_fragments; - else - sum *= 2; - } - - static const si_bin_size_subtable table[] = { - { - /* One RB / SE */ - { - /* One shader engine */ - { 0, 128, 128 }, - { 1, 64, 128 }, - { 2, 32, 128 }, - { 3, 16, 128 }, - { 17, 0, 0 }, - }, - { - /* Two shader engines */ - { 0, 128, 128 }, - { 2, 64, 128 }, - { 3, 32, 128 }, - { 5, 16, 128 }, - { 17, 0, 0 }, - }, - { - /* Four shader engines */ - { 0, 128, 128 }, - { 3, 64, 128 }, - { 5, 16, 128 }, - { 17, 0, 0 }, - }, - }, - { - /* Two RB / SE */ - { - /* One shader engine */ - { 0, 128, 128 }, - { 2, 64, 128 }, - { 3, 32, 128 }, - { 9, 16, 128 }, - { 33, 0, 0 }, - }, - { - /* Two shader engines */ - { 0, 128, 128 }, - { 3, 64, 128 }, - { 5, 32, 128 }, - { 9, 16, 128 }, - { 33, 0, 0 }, - }, - { - /* Four shader engines */ - { 0, 256, 256 }, - { 2, 128, 256 }, - { 3, 128, 128 }, - { 5, 64, 128 }, - { 9, 16, 128 }, - { 33, 0, 0 }, - }, - }, - { - /* Four RB / SE */ - { - /* One shader engine */ - { 0, 128, 256 }, - { 2, 128, 128 }, - { 3, 64, 128 }, - { 5, 32, 128 }, - { 9, 16, 128 }, - { 17, 0, 0 }, - }, - { - /* Two shader engines */ - { 0, 256, 256 }, - { 2, 128, 256 }, - { 3, 128, 128 }, - { 5, 64, 128 }, - { 9, 32, 128 }, - { 17, 16, 128 }, - { 33, 0, 0 }, - }, - { - /* Four shader engines */ - { 0, 256, 512 }, - { 2, 128, 512 }, - { 3, 64, 512 }, - { 5, 32, 512 }, - { 9, 32, 256 }, - { 17, 32, 128 }, - { 33, 0, 0 }, - }, - }, - }; - - return si_find_bin_size(sctx->screen, table, sum); + unsigned num_fragments = sctx->framebuffer.nr_color_samples; + unsigned sum = 0; + + /* Compute the sum of all Bpp. */ + for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + if (!(cb_target_enabled_4bit & (0xf << (i * 4)))) + continue; + + struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture; + sum += tex->surface.bpe; + } + + /* Multiply the sum by some function of the number of samples. 
*/ + if (num_fragments >= 2) { + if (si_get_ps_iter_samples(sctx) >= 2) + sum *= num_fragments; + else + sum *= 2; + } + + static const si_bin_size_subtable table[] = { + { + /* One RB / SE */ + { + /* One shader engine */ + {0, 128, 128}, + {1, 64, 128}, + {2, 32, 128}, + {3, 16, 128}, + {17, 0, 0}, + }, + { + /* Two shader engines */ + {0, 128, 128}, + {2, 64, 128}, + {3, 32, 128}, + {5, 16, 128}, + {17, 0, 0}, + }, + { + /* Four shader engines */ + {0, 128, 128}, + {3, 64, 128}, + {5, 16, 128}, + {17, 0, 0}, + }, + }, + { + /* Two RB / SE */ + { + /* One shader engine */ + {0, 128, 128}, + {2, 64, 128}, + {3, 32, 128}, + {9, 16, 128}, + {33, 0, 0}, + }, + { + /* Two shader engines */ + {0, 128, 128}, + {3, 64, 128}, + {5, 32, 128}, + {9, 16, 128}, + {33, 0, 0}, + }, + { + /* Four shader engines */ + {0, 256, 256}, + {2, 128, 256}, + {3, 128, 128}, + {5, 64, 128}, + {9, 16, 128}, + {33, 0, 0}, + }, + }, + { + /* Four RB / SE */ + { + /* One shader engine */ + {0, 128, 256}, + {2, 128, 128}, + {3, 64, 128}, + {5, 32, 128}, + {9, 16, 128}, + {17, 0, 0}, + }, + { + /* Two shader engines */ + {0, 256, 256}, + {2, 128, 256}, + {3, 128, 128}, + {5, 64, 128}, + {9, 32, 128}, + {17, 16, 128}, + {33, 0, 0}, + }, + { + /* Four shader engines */ + {0, 256, 512}, + {2, 128, 512}, + {3, 64, 512}, + {5, 32, 512}, + {9, 32, 256}, + {17, 32, 128}, + {33, 0, 0}, + }, + }, + }; + + return si_find_bin_size(sctx->screen, table, sum); } static struct uvec2 si_get_depth_bin_size(struct si_context *sctx) { - struct si_state_dsa *dsa = sctx->queued.named.dsa; - - if (!sctx->framebuffer.state.zsbuf || - (!dsa->depth_enabled && !dsa->stencil_enabled)) { - /* Return the max size. */ - struct uvec2 size = {512, 512}; - return size; - } - - struct si_texture *tex = - (struct si_texture*)sctx->framebuffer.state.zsbuf->texture; - unsigned depth_coeff = dsa->depth_enabled ? 5 : 0; - unsigned stencil_coeff = tex->surface.has_stencil && - dsa->stencil_enabled ? 
1 : 0; - unsigned sum = 4 * (depth_coeff + stencil_coeff) * - MAX2(tex->buffer.b.b.nr_samples, 1); - - static const si_bin_size_subtable table[] = { - { - // One RB / SE - { - // One shader engine - { 0, 64, 512 }, - { 2, 64, 256 }, - { 4, 64, 128 }, - { 7, 32, 128 }, - { 13, 16, 128 }, - { 49, 0, 0 }, - }, - { - // Two shader engines - { 0, 128, 512 }, - { 2, 64, 512 }, - { 4, 64, 256 }, - { 7, 64, 128 }, - { 13, 32, 128 }, - { 25, 16, 128 }, - { 49, 0, 0 }, - }, - { - // Four shader engines - { 0, 256, 512 }, - { 2, 128, 512 }, - { 4, 64, 512 }, - { 7, 64, 256 }, - { 13, 64, 128 }, - { 25, 16, 128 }, - { 49, 0, 0 }, - }, - }, - { - // Two RB / SE - { - // One shader engine - { 0, 128, 512 }, - { 2, 64, 512 }, - { 4, 64, 256 }, - { 7, 64, 128 }, - { 13, 32, 128 }, - { 25, 16, 128 }, - { 97, 0, 0 }, - }, - { - // Two shader engines - { 0, 256, 512 }, - { 2, 128, 512 }, - { 4, 64, 512 }, - { 7, 64, 256 }, - { 13, 64, 128 }, - { 25, 32, 128 }, - { 49, 16, 128 }, - { 97, 0, 0 }, - }, - { - // Four shader engines - { 0, 512, 512 }, - { 2, 256, 512 }, - { 4, 128, 512 }, - { 7, 64, 512 }, - { 13, 64, 256 }, - { 25, 64, 128 }, - { 49, 16, 128 }, - { 97, 0, 0 }, - }, - }, - { - // Four RB / SE - { - // One shader engine - { 0, 256, 512 }, - { 2, 128, 512 }, - { 4, 64, 512 }, - { 7, 64, 256 }, - { 13, 64, 128 }, - { 25, 32, 128 }, - { 49, 16, 128 }, - { 193, 0, 0 }, - }, - { - // Two shader engines - { 0, 512, 512 }, - { 2, 256, 512 }, - { 4, 128, 512 }, - { 7, 64, 512 }, - { 13, 64, 256 }, - { 25, 64, 128 }, - { 49, 32, 128 }, - { 97, 16, 128 }, - { 193, 0, 0 }, - }, - { - // Four shader engines - { 0, 512, 512 }, - { 4, 256, 512 }, - { 7, 128, 512 }, - { 13, 64, 512 }, - { 25, 32, 512 }, - { 49, 32, 256 }, - { 97, 16, 128 }, - { 193, 0, 0 }, - }, - }, - }; - - return si_find_bin_size(sctx->screen, table, sum); + struct si_state_dsa *dsa = sctx->queued.named.dsa; + + if (!sctx->framebuffer.state.zsbuf || (!dsa->depth_enabled && !dsa->stencil_enabled)) { + /* Return the max size. */ + struct uvec2 size = {512, 512}; + return size; + } + + struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture; + unsigned depth_coeff = dsa->depth_enabled ? 5 : 0; + unsigned stencil_coeff = tex->surface.has_stencil && dsa->stencil_enabled ? 
1 : 0; + unsigned sum = 4 * (depth_coeff + stencil_coeff) * MAX2(tex->buffer.b.b.nr_samples, 1); + + static const si_bin_size_subtable table[] = { + { + // One RB / SE + { + // One shader engine + {0, 64, 512}, + {2, 64, 256}, + {4, 64, 128}, + {7, 32, 128}, + {13, 16, 128}, + {49, 0, 0}, + }, + { + // Two shader engines + {0, 128, 512}, + {2, 64, 512}, + {4, 64, 256}, + {7, 64, 128}, + {13, 32, 128}, + {25, 16, 128}, + {49, 0, 0}, + }, + { + // Four shader engines + {0, 256, 512}, + {2, 128, 512}, + {4, 64, 512}, + {7, 64, 256}, + {13, 64, 128}, + {25, 16, 128}, + {49, 0, 0}, + }, + }, + { + // Two RB / SE + { + // One shader engine + {0, 128, 512}, + {2, 64, 512}, + {4, 64, 256}, + {7, 64, 128}, + {13, 32, 128}, + {25, 16, 128}, + {97, 0, 0}, + }, + { + // Two shader engines + {0, 256, 512}, + {2, 128, 512}, + {4, 64, 512}, + {7, 64, 256}, + {13, 64, 128}, + {25, 32, 128}, + {49, 16, 128}, + {97, 0, 0}, + }, + { + // Four shader engines + {0, 512, 512}, + {2, 256, 512}, + {4, 128, 512}, + {7, 64, 512}, + {13, 64, 256}, + {25, 64, 128}, + {49, 16, 128}, + {97, 0, 0}, + }, + }, + { + // Four RB / SE + { + // One shader engine + {0, 256, 512}, + {2, 128, 512}, + {4, 64, 512}, + {7, 64, 256}, + {13, 64, 128}, + {25, 32, 128}, + {49, 16, 128}, + {193, 0, 0}, + }, + { + // Two shader engines + {0, 512, 512}, + {2, 256, 512}, + {4, 128, 512}, + {7, 64, 512}, + {13, 64, 256}, + {25, 64, 128}, + {49, 32, 128}, + {97, 16, 128}, + {193, 0, 0}, + }, + { + // Four shader engines + {0, 512, 512}, + {4, 256, 512}, + {7, 128, 512}, + {13, 64, 512}, + {25, 32, 512}, + {49, 32, 256}, + {97, 16, 128}, + {193, 0, 0}, + }, + }, + }; + + return si_find_bin_size(sctx->screen, table, sum); } -static void gfx10_get_bin_sizes(struct si_context *sctx, - unsigned cb_target_enabled_4bit, - struct uvec2 *color_bin_size, - struct uvec2 *depth_bin_size) +static void gfx10_get_bin_sizes(struct si_context *sctx, unsigned cb_target_enabled_4bit, + struct uvec2 *color_bin_size, struct uvec2 *depth_bin_size) { - const unsigned ZsTagSize = 64; - const unsigned ZsNumTags = 312; - const unsigned CcTagSize = 1024; - const unsigned CcReadTags = 31; - const unsigned FcTagSize = 256; - const unsigned FcReadTags = 44; - - const unsigned num_rbs = sctx->screen->info.num_render_backends; - const unsigned num_pipes = MAX2(num_rbs, sctx->screen->info.num_sdp_interfaces); - - const unsigned depthBinSizeTagPart = ((ZsNumTags * num_rbs / num_pipes) * (ZsTagSize * num_pipes)); - const unsigned colorBinSizeTagPart = ((CcReadTags * num_rbs / num_pipes) * (CcTagSize * num_pipes)); - const unsigned fmaskBinSizeTagPart = ((FcReadTags * num_rbs / num_pipes) * (FcTagSize * num_pipes)); - - const unsigned minBinSizeX = 128; - const unsigned minBinSizeY = 64; - - const unsigned num_fragments = sctx->framebuffer.nr_color_samples; - const unsigned num_samples = sctx->framebuffer.nr_samples; - const bool ps_iter_sample = si_get_ps_iter_samples(sctx) >= 2; - - /* Calculate cColor and cFmask(if applicable) */ - unsigned cColor = 0; - unsigned cFmask = 0; - bool has_fmask = false; - - for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (!sctx->framebuffer.state.cbufs[i]) - continue; - - struct si_texture *tex = - (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture; - const unsigned mmrt = - num_fragments == 1 ? 1 : (ps_iter_sample ? 
num_fragments : 2); - - cColor += tex->surface.bpe * mmrt; - if (num_samples >= 2 /* if FMASK is bound */) { - const unsigned fragmentsLog2 = util_logbase2(num_fragments); - const unsigned samplesLog2 = util_logbase2(num_samples); - - static const unsigned cFmaskMrt[4 /* fragments */][5 /* samples */] = { - { 0, 1, 1, 1, 2 }, /* fragments = 1 */ - { 0, 1, 1, 2, 4 }, /* fragments = 2 */ - { 0, 1, 1, 4, 8 }, /* fragments = 4 */ - { 0, 1, 2, 4, 8 } /* fragments = 8 */ - }; - cFmask += cFmaskMrt[fragmentsLog2][samplesLog2]; - has_fmask = true; - } - } - cColor = MAX2(cColor, 1u); - - const unsigned colorLog2Pixels = util_logbase2(colorBinSizeTagPart / cColor); - const unsigned colorBinSizeX = 1 << ((colorLog2Pixels + 1) / 2); /* round up width */ - const unsigned colorBinSizeY = 1 << (colorLog2Pixels / 2); /* round down height */ - - unsigned binSizeX = colorBinSizeX; - unsigned binSizeY = colorBinSizeY; - - if (has_fmask) { - cFmask = MAX2(cFmask, 1u); - - const unsigned fmaskLog2Pixels = util_logbase2(fmaskBinSizeTagPart / cFmask); - const unsigned fmaskBinSizeX = 1 << ((fmaskLog2Pixels + 1) / 2); /* round up width */ - const unsigned fmaskBinSizeY = 1 << (fmaskLog2Pixels / 2); /* round down height */ - - /* use the smaller of the Color vs. Fmask bin sizes */ - if (fmaskLog2Pixels < colorLog2Pixels) { - binSizeX = fmaskBinSizeX; - binSizeY = fmaskBinSizeY; - } - } - - /* Return size adjusted for minimum bin size */ - color_bin_size->x = MAX2(binSizeX, minBinSizeX); - color_bin_size->y = MAX2(binSizeY, minBinSizeY); - - if (!sctx->framebuffer.state.zsbuf) { - /* Set to max sizes when no depth buffer is bound. */ - depth_bin_size->x = 512; - depth_bin_size->y = 512; - } else { - struct si_texture *zstex = (struct si_texture*)sctx->framebuffer.state.zsbuf->texture; - struct si_state_dsa *dsa = sctx->queued.named.dsa; - - const unsigned cPerDepthSample = dsa->depth_enabled ? 5 : 0; - const unsigned cPerStencilSample = dsa->stencil_enabled ? 
1 : 0; - const unsigned cDepth = (cPerDepthSample + cPerStencilSample) * - MAX2(zstex->buffer.b.b.nr_samples, 1); - - const unsigned depthLog2Pixels = util_logbase2(depthBinSizeTagPart / MAX2(cDepth, 1u)); - unsigned depthBinSizeX = 1 << ((depthLog2Pixels + 1) / 2); - unsigned depthBinSizeY = 1 << (depthLog2Pixels / 2); - - depth_bin_size->x = MAX2(depthBinSizeX, minBinSizeX); - depth_bin_size->y = MAX2(depthBinSizeY, minBinSizeY); - } + const unsigned ZsTagSize = 64; + const unsigned ZsNumTags = 312; + const unsigned CcTagSize = 1024; + const unsigned CcReadTags = 31; + const unsigned FcTagSize = 256; + const unsigned FcReadTags = 44; + + const unsigned num_rbs = sctx->screen->info.num_render_backends; + const unsigned num_pipes = MAX2(num_rbs, sctx->screen->info.num_sdp_interfaces); + + const unsigned depthBinSizeTagPart = + ((ZsNumTags * num_rbs / num_pipes) * (ZsTagSize * num_pipes)); + const unsigned colorBinSizeTagPart = + ((CcReadTags * num_rbs / num_pipes) * (CcTagSize * num_pipes)); + const unsigned fmaskBinSizeTagPart = + ((FcReadTags * num_rbs / num_pipes) * (FcTagSize * num_pipes)); + + const unsigned minBinSizeX = 128; + const unsigned minBinSizeY = 64; + + const unsigned num_fragments = sctx->framebuffer.nr_color_samples; + const unsigned num_samples = sctx->framebuffer.nr_samples; + const bool ps_iter_sample = si_get_ps_iter_samples(sctx) >= 2; + + /* Calculate cColor and cFmask(if applicable) */ + unsigned cColor = 0; + unsigned cFmask = 0; + bool has_fmask = false; + + for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + if (!sctx->framebuffer.state.cbufs[i]) + continue; + + struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture; + const unsigned mmrt = num_fragments == 1 ? 1 : (ps_iter_sample ? num_fragments : 2); + + cColor += tex->surface.bpe * mmrt; + if (num_samples >= 2 /* if FMASK is bound */) { + const unsigned fragmentsLog2 = util_logbase2(num_fragments); + const unsigned samplesLog2 = util_logbase2(num_samples); + + static const unsigned cFmaskMrt[4 /* fragments */][5 /* samples */] = { + {0, 1, 1, 1, 2}, /* fragments = 1 */ + {0, 1, 1, 2, 4}, /* fragments = 2 */ + {0, 1, 1, 4, 8}, /* fragments = 4 */ + {0, 1, 2, 4, 8} /* fragments = 8 */ + }; + cFmask += cFmaskMrt[fragmentsLog2][samplesLog2]; + has_fmask = true; + } + } + cColor = MAX2(cColor, 1u); + + const unsigned colorLog2Pixels = util_logbase2(colorBinSizeTagPart / cColor); + const unsigned colorBinSizeX = 1 << ((colorLog2Pixels + 1) / 2); /* round up width */ + const unsigned colorBinSizeY = 1 << (colorLog2Pixels / 2); /* round down height */ + + unsigned binSizeX = colorBinSizeX; + unsigned binSizeY = colorBinSizeY; + + if (has_fmask) { + cFmask = MAX2(cFmask, 1u); + + const unsigned fmaskLog2Pixels = util_logbase2(fmaskBinSizeTagPart / cFmask); + const unsigned fmaskBinSizeX = 1 << ((fmaskLog2Pixels + 1) / 2); /* round up width */ + const unsigned fmaskBinSizeY = 1 << (fmaskLog2Pixels / 2); /* round down height */ + + /* use the smaller of the Color vs. Fmask bin sizes */ + if (fmaskLog2Pixels < colorLog2Pixels) { + binSizeX = fmaskBinSizeX; + binSizeY = fmaskBinSizeY; + } + } + + /* Return size adjusted for minimum bin size */ + color_bin_size->x = MAX2(binSizeX, minBinSizeX); + color_bin_size->y = MAX2(binSizeY, minBinSizeY); + + if (!sctx->framebuffer.state.zsbuf) { + /* Set to max sizes when no depth buffer is bound. 
*/ + depth_bin_size->x = 512; + depth_bin_size->y = 512; + } else { + struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture; + struct si_state_dsa *dsa = sctx->queued.named.dsa; + + const unsigned cPerDepthSample = dsa->depth_enabled ? 5 : 0; + const unsigned cPerStencilSample = dsa->stencil_enabled ? 1 : 0; + const unsigned cDepth = + (cPerDepthSample + cPerStencilSample) * MAX2(zstex->buffer.b.b.nr_samples, 1); + + const unsigned depthLog2Pixels = util_logbase2(depthBinSizeTagPart / MAX2(cDepth, 1u)); + unsigned depthBinSizeX = 1 << ((depthLog2Pixels + 1) / 2); + unsigned depthBinSizeY = 1 << (depthLog2Pixels / 2); + + depth_bin_size->x = MAX2(depthBinSizeX, minBinSizeX); + depth_bin_size->y = MAX2(depthBinSizeY, minBinSizeY); + } } static void si_emit_dpbb_disable(struct si_context *sctx) { - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - - if (sctx->chip_class >= GFX10) { - struct uvec2 bin_size = {}; - struct uvec2 bin_size_extend = {}; - - bin_size.x = 128; - bin_size.y = sctx->framebuffer.min_bytes_per_pixel <= 4 ? 128 : 64; - - if (bin_size.x >= 32) - bin_size_extend.x = util_logbase2(bin_size.x) - 5; - if (bin_size.y >= 32) - bin_size_extend.y = util_logbase2(bin_size.y) - 5; - - radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0, - SI_TRACKED_PA_SC_BINNER_CNTL_0, - S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) | - S_028C44_BIN_SIZE_X(bin_size.x == 16) | - S_028C44_BIN_SIZE_Y(bin_size.y == 16) | - S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) | - S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) | - S_028C44_DISABLE_START_OF_PRIM(1) | - S_028C44_FLUSH_ON_BINNING_TRANSITION(sctx->last_binning_enabled != 0)); - } else { - radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0, - SI_TRACKED_PA_SC_BINNER_CNTL_0, - S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) | - S_028C44_DISABLE_START_OF_PRIM(1) | - S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 || - sctx->family == CHIP_VEGA20 || - sctx->family >= CHIP_RAVEN2) && - sctx->last_binning_enabled != 0)); - } - - unsigned db_dfsm_control = sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL - : R_028060_DB_DFSM_CONTROL; - radeon_opt_set_context_reg(sctx, db_dfsm_control, - SI_TRACKED_DB_DFSM_CONTROL, - S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | - S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; - - sctx->last_binning_enabled = false; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + + if (sctx->chip_class >= GFX10) { + struct uvec2 bin_size = {}; + struct uvec2 bin_size_extend = {}; + + bin_size.x = 128; + bin_size.y = sctx->framebuffer.min_bytes_per_pixel <= 4 ? 
128 : 64; + + if (bin_size.x >= 32) + bin_size_extend.x = util_logbase2(bin_size.x) - 5; + if (bin_size.y >= 32) + bin_size_extend.y = util_logbase2(bin_size.y) - 5; + + radeon_opt_set_context_reg( + sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0, + S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) | + S_028C44_BIN_SIZE_X(bin_size.x == 16) | S_028C44_BIN_SIZE_Y(bin_size.y == 16) | + S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) | + S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) | S_028C44_DISABLE_START_OF_PRIM(1) | + S_028C44_FLUSH_ON_BINNING_TRANSITION(sctx->last_binning_enabled != 0)); + } else { + radeon_opt_set_context_reg( + sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0, + S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) | + S_028C44_DISABLE_START_OF_PRIM(1) | + S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 || + sctx->family == CHIP_VEGA20 || + sctx->family >= CHIP_RAVEN2) && + sctx->last_binning_enabled != 0)); + } + + unsigned db_dfsm_control = + sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL : R_028060_DB_DFSM_CONTROL; + radeon_opt_set_context_reg( + sctx, db_dfsm_control, SI_TRACKED_DB_DFSM_CONTROL, + S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; + + sctx->last_binning_enabled = false; } void si_emit_dpbb_state(struct si_context *sctx) { - struct si_screen *sscreen = sctx->screen; - struct si_state_blend *blend = sctx->queued.named.blend; - struct si_state_dsa *dsa = sctx->queued.named.dsa; - unsigned db_shader_control = sctx->ps_db_shader_control; - - assert(sctx->chip_class >= GFX9); - - if (!sscreen->dpbb_allowed || sctx->dpbb_force_off) { - si_emit_dpbb_disable(sctx); - return; - } - - bool ps_can_kill = G_02880C_KILL_ENABLE(db_shader_control) || - G_02880C_MASK_EXPORT_ENABLE(db_shader_control) || - G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) || - blend->alpha_to_coverage; - - bool db_can_reject_z_trivially = - !G_02880C_Z_EXPORT_ENABLE(db_shader_control) || - G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) || - G_02880C_DEPTH_BEFORE_SHADER(db_shader_control); - - /* Disable DPBB when it's believed to be inefficient. */ - if (sscreen->info.num_render_backends > 4 && - ps_can_kill && - db_can_reject_z_trivially && - sctx->framebuffer.state.zsbuf && - dsa->db_can_write) { - si_emit_dpbb_disable(sctx); - return; - } - - /* Compute the bin size. */ - /* TODO: We could also look at enabled pixel shader outputs. */ - unsigned cb_target_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit & - blend->cb_target_enabled_4bit; - struct uvec2 color_bin_size, depth_bin_size; - - if (sctx->chip_class >= GFX10) { - gfx10_get_bin_sizes(sctx, cb_target_enabled_4bit, - &color_bin_size, &depth_bin_size); - } else { - color_bin_size = si_get_color_bin_size(sctx, cb_target_enabled_4bit); - depth_bin_size = si_get_depth_bin_size(sctx); - } - - unsigned color_area = color_bin_size.x * color_bin_size.y; - unsigned depth_area = depth_bin_size.x * depth_bin_size.y; - - struct uvec2 bin_size = color_area < depth_area ? color_bin_size - : depth_bin_size; - - if (!bin_size.x || !bin_size.y) { - si_emit_dpbb_disable(sctx); - return; - } - - /* Enable DFSM if it's preferred. 
*/ - unsigned punchout_mode = V_028060_FORCE_OFF; - bool disable_start_of_prim = true; - bool zs_eqaa_dfsm_bug = sctx->chip_class == GFX9 && - sctx->framebuffer.state.zsbuf && - sctx->framebuffer.nr_samples != - MAX2(1, sctx->framebuffer.state.zsbuf->texture->nr_samples); - - if (sscreen->dfsm_allowed && - !zs_eqaa_dfsm_bug && - cb_target_enabled_4bit && - !G_02880C_KILL_ENABLE(db_shader_control) && - /* These two also imply that DFSM is disabled when PS writes to memory. */ - !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control) && - !G_02880C_EXEC_ON_NOOP(db_shader_control) && - G_02880C_Z_ORDER(db_shader_control) == V_02880C_EARLY_Z_THEN_LATE_Z) { - punchout_mode = V_028060_AUTO; - disable_start_of_prim = (cb_target_enabled_4bit & - blend->blend_enable_4bit) != 0; - } - - /* Tunable parameters. Also test with DFSM enabled/disabled. */ - unsigned context_states_per_bin; /* allowed range: [1, 6] */ - unsigned persistent_states_per_bin; /* allowed range: [1, 32] */ - unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */ - - /* Tuned for Raven. Vega might need different values. */ - if (sscreen->info.has_dedicated_vram) { - if (sscreen->info.num_render_backends > 4) { - context_states_per_bin = 1; - persistent_states_per_bin = 1; - } else { - context_states_per_bin = 3; - persistent_states_per_bin = 8; - } - } else { - /* This is a workaround for: - * https://bugs.freedesktop.org/show_bug.cgi?id=110214 - * (an alternative is to insert manual BATCH_BREAK event when - * a context_roll is detected). */ - context_states_per_bin = sctx->screen->info.has_gfx9_scissor_bug ? 1 : 6; - /* Using 32 here can cause GPU hangs on RAVEN1 */ - persistent_states_per_bin = 16; - } - fpovs_per_batch = 63; - - /* Emit registers. */ - struct uvec2 bin_size_extend = {}; - if (bin_size.x >= 32) - bin_size_extend.x = util_logbase2(bin_size.x) - 5; - if (bin_size.y >= 32) - bin_size_extend.y = util_logbase2(bin_size.y) - 5; - - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - radeon_opt_set_context_reg( - sctx, R_028C44_PA_SC_BINNER_CNTL_0, - SI_TRACKED_PA_SC_BINNER_CNTL_0, - S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | - S_028C44_BIN_SIZE_X(bin_size.x == 16) | - S_028C44_BIN_SIZE_Y(bin_size.y == 16) | - S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) | - S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) | - S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin - 1) | - S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin - 1) | - S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) | - S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) | - S_028C44_OPTIMAL_BIN_SELECTION(1) | - S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 || - sctx->family == CHIP_VEGA20 || - sctx->family >= CHIP_RAVEN2) && - sctx->last_binning_enabled != 1)); - - unsigned db_dfsm_control = sctx->chip_class >= GFX10 ? 
R_028038_DB_DFSM_CONTROL - : R_028060_DB_DFSM_CONTROL; - radeon_opt_set_context_reg(sctx, db_dfsm_control, - SI_TRACKED_DB_DFSM_CONTROL, - S_028060_PUNCHOUT_MODE(punchout_mode) | - S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; - - sctx->last_binning_enabled = true; + struct si_screen *sscreen = sctx->screen; + struct si_state_blend *blend = sctx->queued.named.blend; + struct si_state_dsa *dsa = sctx->queued.named.dsa; + unsigned db_shader_control = sctx->ps_db_shader_control; + + assert(sctx->chip_class >= GFX9); + + if (!sscreen->dpbb_allowed || sctx->dpbb_force_off) { + si_emit_dpbb_disable(sctx); + return; + } + + bool ps_can_kill = + G_02880C_KILL_ENABLE(db_shader_control) || G_02880C_MASK_EXPORT_ENABLE(db_shader_control) || + G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) || blend->alpha_to_coverage; + + bool db_can_reject_z_trivially = !G_02880C_Z_EXPORT_ENABLE(db_shader_control) || + G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) || + G_02880C_DEPTH_BEFORE_SHADER(db_shader_control); + + /* Disable DPBB when it's believed to be inefficient. */ + if (sscreen->info.num_render_backends > 4 && ps_can_kill && db_can_reject_z_trivially && + sctx->framebuffer.state.zsbuf && dsa->db_can_write) { + si_emit_dpbb_disable(sctx); + return; + } + + /* Compute the bin size. */ + /* TODO: We could also look at enabled pixel shader outputs. */ + unsigned cb_target_enabled_4bit = + sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit; + struct uvec2 color_bin_size, depth_bin_size; + + if (sctx->chip_class >= GFX10) { + gfx10_get_bin_sizes(sctx, cb_target_enabled_4bit, &color_bin_size, &depth_bin_size); + } else { + color_bin_size = si_get_color_bin_size(sctx, cb_target_enabled_4bit); + depth_bin_size = si_get_depth_bin_size(sctx); + } + + unsigned color_area = color_bin_size.x * color_bin_size.y; + unsigned depth_area = depth_bin_size.x * depth_bin_size.y; + + struct uvec2 bin_size = color_area < depth_area ? color_bin_size : depth_bin_size; + + if (!bin_size.x || !bin_size.y) { + si_emit_dpbb_disable(sctx); + return; + } + + /* Enable DFSM if it's preferred. */ + unsigned punchout_mode = V_028060_FORCE_OFF; + bool disable_start_of_prim = true; + bool zs_eqaa_dfsm_bug = + sctx->chip_class == GFX9 && sctx->framebuffer.state.zsbuf && + sctx->framebuffer.nr_samples != MAX2(1, sctx->framebuffer.state.zsbuf->texture->nr_samples); + + if (sscreen->dfsm_allowed && !zs_eqaa_dfsm_bug && cb_target_enabled_4bit && + !G_02880C_KILL_ENABLE(db_shader_control) && + /* These two also imply that DFSM is disabled when PS writes to memory. */ + !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control) && + !G_02880C_EXEC_ON_NOOP(db_shader_control) && + G_02880C_Z_ORDER(db_shader_control) == V_02880C_EARLY_Z_THEN_LATE_Z) { + punchout_mode = V_028060_AUTO; + disable_start_of_prim = (cb_target_enabled_4bit & blend->blend_enable_4bit) != 0; + } + + /* Tunable parameters. Also test with DFSM enabled/disabled. */ + unsigned context_states_per_bin; /* allowed range: [1, 6] */ + unsigned persistent_states_per_bin; /* allowed range: [1, 32] */ + unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */ + + /* Tuned for Raven. Vega might need different values. 
*/ + if (sscreen->info.has_dedicated_vram) { + if (sscreen->info.num_render_backends > 4) { + context_states_per_bin = 1; + persistent_states_per_bin = 1; + } else { + context_states_per_bin = 3; + persistent_states_per_bin = 8; + } + } else { + /* This is a workaround for: + * https://bugs.freedesktop.org/show_bug.cgi?id=110214 + * (an alternative is to insert manual BATCH_BREAK event when + * a context_roll is detected). */ + context_states_per_bin = sctx->screen->info.has_gfx9_scissor_bug ? 1 : 6; + /* Using 32 here can cause GPU hangs on RAVEN1 */ + persistent_states_per_bin = 16; + } + fpovs_per_batch = 63; + + /* Emit registers. */ + struct uvec2 bin_size_extend = {}; + if (bin_size.x >= 32) + bin_size_extend.x = util_logbase2(bin_size.x) - 5; + if (bin_size.y >= 32) + bin_size_extend.y = util_logbase2(bin_size.y) - 5; + + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + radeon_opt_set_context_reg( + sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0, + S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | S_028C44_BIN_SIZE_X(bin_size.x == 16) | + S_028C44_BIN_SIZE_Y(bin_size.y == 16) | S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) | + S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) | + S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin - 1) | + S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin - 1) | + S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) | + S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) | S_028C44_OPTIMAL_BIN_SELECTION(1) | + S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 || + sctx->family == CHIP_VEGA20 || + sctx->family >= CHIP_RAVEN2) && + sctx->last_binning_enabled != 1)); + + unsigned db_dfsm_control = + sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL : R_028060_DB_DFSM_CONTROL; + radeon_opt_set_context_reg( + sctx, db_dfsm_control, SI_TRACKED_DB_DFSM_CONTROL, + S_028060_PUNCHOUT_MODE(punchout_mode) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; + + sctx->last_binning_enabled = true; } diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index dc6de604d21..7def05440e1 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -22,42 +22,39 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "ac_debug.h" #include "si_build_pm4.h" #include "sid.h" - #include "util/u_index_modify.h" #include "util/u_log.h" -#include "util/u_upload_mgr.h" #include "util/u_prim.h" #include "util/u_suballoc.h" - -#include "ac_debug.h" +#include "util/u_upload_mgr.h" /* special primitive types */ -#define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX +#define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX static unsigned si_conv_pipe_prim(unsigned mode) { - static const unsigned prim_conv[] = { - [PIPE_PRIM_POINTS] = V_008958_DI_PT_POINTLIST, - [PIPE_PRIM_LINES] = V_008958_DI_PT_LINELIST, - [PIPE_PRIM_LINE_LOOP] = V_008958_DI_PT_LINELOOP, - [PIPE_PRIM_LINE_STRIP] = V_008958_DI_PT_LINESTRIP, - [PIPE_PRIM_TRIANGLES] = V_008958_DI_PT_TRILIST, - [PIPE_PRIM_TRIANGLE_STRIP] = V_008958_DI_PT_TRISTRIP, - [PIPE_PRIM_TRIANGLE_FAN] = V_008958_DI_PT_TRIFAN, - [PIPE_PRIM_QUADS] = V_008958_DI_PT_QUADLIST, - [PIPE_PRIM_QUAD_STRIP] = V_008958_DI_PT_QUADSTRIP, - [PIPE_PRIM_POLYGON] = V_008958_DI_PT_POLYGON, - [PIPE_PRIM_LINES_ADJACENCY] = V_008958_DI_PT_LINELIST_ADJ, - [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_008958_DI_PT_LINESTRIP_ADJ, - [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_008958_DI_PT_TRILIST_ADJ, - [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_008958_DI_PT_TRISTRIP_ADJ, - [PIPE_PRIM_PATCHES] = V_008958_DI_PT_PATCH, - [SI_PRIM_RECTANGLE_LIST] = V_008958_DI_PT_RECTLIST - }; - assert(mode < ARRAY_SIZE(prim_conv)); - return prim_conv[mode]; + static const unsigned prim_conv[] = { + [PIPE_PRIM_POINTS] = V_008958_DI_PT_POINTLIST, + [PIPE_PRIM_LINES] = V_008958_DI_PT_LINELIST, + [PIPE_PRIM_LINE_LOOP] = V_008958_DI_PT_LINELOOP, + [PIPE_PRIM_LINE_STRIP] = V_008958_DI_PT_LINESTRIP, + [PIPE_PRIM_TRIANGLES] = V_008958_DI_PT_TRILIST, + [PIPE_PRIM_TRIANGLE_STRIP] = V_008958_DI_PT_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = V_008958_DI_PT_TRIFAN, + [PIPE_PRIM_QUADS] = V_008958_DI_PT_QUADLIST, + [PIPE_PRIM_QUAD_STRIP] = V_008958_DI_PT_QUADSTRIP, + [PIPE_PRIM_POLYGON] = V_008958_DI_PT_POLYGON, + [PIPE_PRIM_LINES_ADJACENCY] = V_008958_DI_PT_LINELIST_ADJ, + [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_008958_DI_PT_LINESTRIP_ADJ, + [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_008958_DI_PT_TRILIST_ADJ, + [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_008958_DI_PT_TRISTRIP_ADJ, + [PIPE_PRIM_PATCHES] = V_008958_DI_PT_PATCH, + [SI_PRIM_RECTANGLE_LIST] = V_008958_DI_PT_RECTLIST}; + assert(mode < ARRAY_SIZE(prim_conv)); + return prim_conv[mode]; } /** @@ -67,652 +64,597 @@ static unsigned si_conv_pipe_prim(unsigned mode) * The information about LDS and other non-compile-time parameters is then * written to userdata SGPRs. */ -static void si_emit_derived_tess_state(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned *num_patches) +static void si_emit_derived_tess_state(struct si_context *sctx, const struct pipe_draw_info *info, + unsigned *num_patches) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct si_shader *ls_current; - struct si_shader_selector *ls; - /* The TES pointer will only be used for sctx->last_tcs. - * It would be wrong to think that TCS = TES. */ - struct si_shader_selector *tcs = - sctx->tcs_shader.cso ? 
sctx->tcs_shader.cso : sctx->tes_shader.cso; - unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id; - bool has_primid_instancing_bug = sctx->chip_class == GFX6 && - sctx->screen->info.max_se == 1; - unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; - unsigned num_tcs_input_cp = info->vertices_per_patch; - unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs; - unsigned num_tcs_patch_outputs; - unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size; - unsigned input_patch_size, output_patch_size, output_patch0_offset; - unsigned perpatch_output_offset, lds_size; - unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets; - unsigned offchip_layout, hardware_lds_size, ls_hs_config; - - /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */ - if (sctx->chip_class >= GFX9) { - if (sctx->tcs_shader.cso) - ls_current = sctx->tcs_shader.current; - else - ls_current = sctx->fixed_func_tcs_shader.current; - - ls = ls_current->key.part.tcs.ls; - } else { - ls_current = sctx->vs_shader.current; - ls = sctx->vs_shader.cso; - } - - if (sctx->last_ls == ls_current && - sctx->last_tcs == tcs && - sctx->last_tes_sh_base == tes_sh_base && - sctx->last_num_tcs_input_cp == num_tcs_input_cp && - (!has_primid_instancing_bug || - (sctx->last_tess_uses_primid == tess_uses_primid))) { - *num_patches = sctx->last_num_patches; - return; - } - - sctx->last_ls = ls_current; - sctx->last_tcs = tcs; - sctx->last_tes_sh_base = tes_sh_base; - sctx->last_num_tcs_input_cp = num_tcs_input_cp; - sctx->last_tess_uses_primid = tess_uses_primid; - - /* This calculates how shader inputs and outputs among VS, TCS, and TES - * are laid out in LDS. */ - num_tcs_inputs = util_last_bit64(ls->outputs_written); - - if (sctx->tcs_shader.cso) { - num_tcs_outputs = util_last_bit64(tcs->outputs_written); - num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; - num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written); - } else { - /* No TCS. Route varyings from LS to TES. */ - num_tcs_outputs = num_tcs_inputs; - num_tcs_output_cp = num_tcs_input_cp; - num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */ - } - - input_vertex_size = ls->lshs_vertex_stride; - output_vertex_size = num_tcs_outputs * 16; - - input_patch_size = num_tcs_input_cp * input_vertex_size; - - pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; - output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; - - /* Ensure that we only need one wave per SIMD so we don't need to check - * resource usage. Also ensures that the number of tcs in and out - * vertices per threadgroup are at most 256. - */ - unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp); - *num_patches = 256 / max_verts_per_patch; - - /* Make sure that the data fits in LDS. This assumes the shaders only - * use LDS for the inputs and outputs. - * - * While GFX7 can use 64K per threadgroup, there is a hang on Stoney - * with 2 CUs if we use more than 32K. The closed Vulkan driver also - * uses 32K at most on all GCN chips. - */ - hardware_lds_size = 32768; - *num_patches = MIN2(*num_patches, hardware_lds_size / (input_patch_size + - output_patch_size)); - - /* Make sure the output data fits in the offchip buffer */ - *num_patches = MIN2(*num_patches, - (sctx->screen->tess_offchip_block_dw_size * 4) / - output_patch_size); - - /* Not necessary for correctness, but improves performance. 
- * The hardware can do more, but the radeonsi shader constant is - * limited to 6 bits. - */ - *num_patches = MIN2(*num_patches, 63); /* triangles: 3 full waves except 3 lanes */ - - /* When distributed tessellation is unsupported, switch between SEs - * at a higher frequency to compensate for it. - */ - if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1) - *num_patches = MIN2(*num_patches, 16); /* recommended */ - - /* Make sure that vector lanes are reasonably occupied. It probably - * doesn't matter much because this is LS-HS, and TES is likely to - * occupy significantly more CUs. - */ - unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch; - unsigned wave_size = sctx->screen->ge_wave_size; - - if (temp_verts_per_tg > wave_size && temp_verts_per_tg % wave_size < wave_size*3/4) - *num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch; - - if (sctx->chip_class == GFX6) { - /* GFX6 bug workaround, related to power management. Limit LS-HS - * threadgroups to only one wave. - */ - unsigned one_wave = wave_size / max_verts_per_patch; - *num_patches = MIN2(*num_patches, one_wave); - } - - /* The VGT HS block increments the patch ID unconditionally - * within a single threadgroup. This results in incorrect - * patch IDs when instanced draws are used. - * - * The intended solution is to restrict threadgroups to - * a single instance by setting SWITCH_ON_EOI, which - * should cause IA to split instances up. However, this - * doesn't work correctly on GFX6 when there is no other - * SE to switch to. - */ - if (has_primid_instancing_bug && tess_uses_primid) - *num_patches = 1; - - sctx->last_num_patches = *num_patches; - - output_patch0_offset = input_patch_size * *num_patches; - perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size; - - /* Compute userdata SGPRs. */ - assert(((input_vertex_size / 4) & ~0xff) == 0); - assert(((output_vertex_size / 4) & ~0xff) == 0); - assert(((input_patch_size / 4) & ~0x1fff) == 0); - assert(((output_patch_size / 4) & ~0x1fff) == 0); - assert(((output_patch0_offset / 16) & ~0xffff) == 0); - assert(((perpatch_output_offset / 16) & ~0xffff) == 0); - assert(num_tcs_input_cp <= 32); - assert(num_tcs_output_cp <= 32); - - uint64_t ring_va = si_resource(sctx->tess_rings)->gpu_address; - assert((ring_va & u_bit_consecutive(0, 19)) == 0); - - tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) | - S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4); - tcs_out_layout = (output_patch_size / 4) | - (num_tcs_input_cp << 13) | - ring_va; - tcs_out_offsets = (output_patch0_offset / 16) | - ((perpatch_output_offset / 16) << 16); - offchip_layout = *num_patches | - (num_tcs_output_cp << 6) | - (pervertex_output_patch_size * *num_patches << 12); - - /* Compute the LDS size. */ - lds_size = output_patch0_offset + output_patch_size * *num_patches; - - if (sctx->chip_class >= GFX7) { - assert(lds_size <= 65536); - lds_size = align(lds_size, 512) / 512; - } else { - assert(lds_size <= 32768); - lds_size = align(lds_size, 256) / 256; - } - - /* Set SI_SGPR_VS_STATE_BITS. */ - sctx->current_vs_state &= C_VS_STATE_LS_OUT_PATCH_SIZE & - C_VS_STATE_LS_OUT_VERTEX_SIZE; - sctx->current_vs_state |= tcs_in_layout; - - /* We should be able to support in-shader LDS use with LLVM >= 9 - * by just adding the lds_sizes together, but it has never - * been tested. 
*/ - assert(ls_current->config.lds_size == 0); - - if (sctx->chip_class >= GFX9) { - unsigned hs_rsrc2 = ls_current->config.rsrc2; - - if (sctx->chip_class >= GFX10) - hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size); - else - hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size); - - radeon_set_sh_reg(cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2); - - /* Set userdata SGPRs for merged LS-HS. */ - radeon_set_sh_reg_seq(cs, - R_00B430_SPI_SHADER_USER_DATA_LS_0 + - GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3); - radeon_emit(cs, offchip_layout); - radeon_emit(cs, tcs_out_offsets); - radeon_emit(cs, tcs_out_layout); - } else { - unsigned ls_rsrc2 = ls_current->config.rsrc2; - - si_multiwave_lds_size_workaround(sctx->screen, &lds_size); - ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size); - - /* Due to a hw bug, RSRC2_LS must be written twice with another - * LS register written in between. */ - if (sctx->chip_class == GFX7 && sctx->family != CHIP_HAWAII) - radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2); - radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); - radeon_emit(cs, ls_current->config.rsrc1); - radeon_emit(cs, ls_rsrc2); - - /* Set userdata SGPRs for TCS. */ - radeon_set_sh_reg_seq(cs, - R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4); - radeon_emit(cs, offchip_layout); - radeon_emit(cs, tcs_out_offsets); - radeon_emit(cs, tcs_out_layout); - radeon_emit(cs, tcs_in_layout); - } - - /* Set userdata SGPRs for TES. */ - radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2); - radeon_emit(cs, offchip_layout); - radeon_emit(cs, ring_va); - - ls_hs_config = S_028B58_NUM_PATCHES(*num_patches) | - S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) | - S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp); - - if (sctx->last_ls_hs_config != ls_hs_config) { - if (sctx->chip_class >= GFX7) { - radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2, - ls_hs_config); - } else { - radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, - ls_hs_config); - } - sctx->last_ls_hs_config = ls_hs_config; - sctx->context_roll = true; - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct si_shader *ls_current; + struct si_shader_selector *ls; + /* The TES pointer will only be used for sctx->last_tcs. + * It would be wrong to think that TCS = TES. */ + struct si_shader_selector *tcs = + sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso; + unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id; + bool has_primid_instancing_bug = sctx->chip_class == GFX6 && sctx->screen->info.max_se == 1; + unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; + unsigned num_tcs_input_cp = info->vertices_per_patch; + unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs; + unsigned num_tcs_patch_outputs; + unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size; + unsigned input_patch_size, output_patch_size, output_patch0_offset; + unsigned perpatch_output_offset, lds_size; + unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets; + unsigned offchip_layout, hardware_lds_size, ls_hs_config; + + /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. 
*/ + if (sctx->chip_class >= GFX9) { + if (sctx->tcs_shader.cso) + ls_current = sctx->tcs_shader.current; + else + ls_current = sctx->fixed_func_tcs_shader.current; + + ls = ls_current->key.part.tcs.ls; + } else { + ls_current = sctx->vs_shader.current; + ls = sctx->vs_shader.cso; + } + + if (sctx->last_ls == ls_current && sctx->last_tcs == tcs && + sctx->last_tes_sh_base == tes_sh_base && sctx->last_num_tcs_input_cp == num_tcs_input_cp && + (!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid))) { + *num_patches = sctx->last_num_patches; + return; + } + + sctx->last_ls = ls_current; + sctx->last_tcs = tcs; + sctx->last_tes_sh_base = tes_sh_base; + sctx->last_num_tcs_input_cp = num_tcs_input_cp; + sctx->last_tess_uses_primid = tess_uses_primid; + + /* This calculates how shader inputs and outputs among VS, TCS, and TES + * are laid out in LDS. */ + num_tcs_inputs = util_last_bit64(ls->outputs_written); + + if (sctx->tcs_shader.cso) { + num_tcs_outputs = util_last_bit64(tcs->outputs_written); + num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; + num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written); + } else { + /* No TCS. Route varyings from LS to TES. */ + num_tcs_outputs = num_tcs_inputs; + num_tcs_output_cp = num_tcs_input_cp; + num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */ + } + + input_vertex_size = ls->lshs_vertex_stride; + output_vertex_size = num_tcs_outputs * 16; + + input_patch_size = num_tcs_input_cp * input_vertex_size; + + pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; + output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; + + /* Ensure that we only need one wave per SIMD so we don't need to check + * resource usage. Also ensures that the number of tcs in and out + * vertices per threadgroup are at most 256. + */ + unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp); + *num_patches = 256 / max_verts_per_patch; + + /* Make sure that the data fits in LDS. This assumes the shaders only + * use LDS for the inputs and outputs. + * + * While GFX7 can use 64K per threadgroup, there is a hang on Stoney + * with 2 CUs if we use more than 32K. The closed Vulkan driver also + * uses 32K at most on all GCN chips. + */ + hardware_lds_size = 32768; + *num_patches = MIN2(*num_patches, hardware_lds_size / (input_patch_size + output_patch_size)); + + /* Make sure the output data fits in the offchip buffer */ + *num_patches = + MIN2(*num_patches, (sctx->screen->tess_offchip_block_dw_size * 4) / output_patch_size); + + /* Not necessary for correctness, but improves performance. + * The hardware can do more, but the radeonsi shader constant is + * limited to 6 bits. + */ + *num_patches = MIN2(*num_patches, 63); /* triangles: 3 full waves except 3 lanes */ + + /* When distributed tessellation is unsupported, switch between SEs + * at a higher frequency to compensate for it. + */ + if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1) + *num_patches = MIN2(*num_patches, 16); /* recommended */ + + /* Make sure that vector lanes are reasonably occupied. It probably + * doesn't matter much because this is LS-HS, and TES is likely to + * occupy significantly more CUs. 
+ */ + unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch; + unsigned wave_size = sctx->screen->ge_wave_size; + + if (temp_verts_per_tg > wave_size && temp_verts_per_tg % wave_size < wave_size * 3 / 4) + *num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch; + + if (sctx->chip_class == GFX6) { + /* GFX6 bug workaround, related to power management. Limit LS-HS + * threadgroups to only one wave. + */ + unsigned one_wave = wave_size / max_verts_per_patch; + *num_patches = MIN2(*num_patches, one_wave); + } + + /* The VGT HS block increments the patch ID unconditionally + * within a single threadgroup. This results in incorrect + * patch IDs when instanced draws are used. + * + * The intended solution is to restrict threadgroups to + * a single instance by setting SWITCH_ON_EOI, which + * should cause IA to split instances up. However, this + * doesn't work correctly on GFX6 when there is no other + * SE to switch to. + */ + if (has_primid_instancing_bug && tess_uses_primid) + *num_patches = 1; + + sctx->last_num_patches = *num_patches; + + output_patch0_offset = input_patch_size * *num_patches; + perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size; + + /* Compute userdata SGPRs. */ + assert(((input_vertex_size / 4) & ~0xff) == 0); + assert(((output_vertex_size / 4) & ~0xff) == 0); + assert(((input_patch_size / 4) & ~0x1fff) == 0); + assert(((output_patch_size / 4) & ~0x1fff) == 0); + assert(((output_patch0_offset / 16) & ~0xffff) == 0); + assert(((perpatch_output_offset / 16) & ~0xffff) == 0); + assert(num_tcs_input_cp <= 32); + assert(num_tcs_output_cp <= 32); + + uint64_t ring_va = si_resource(sctx->tess_rings)->gpu_address; + assert((ring_va & u_bit_consecutive(0, 19)) == 0); + + tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) | + S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4); + tcs_out_layout = (output_patch_size / 4) | (num_tcs_input_cp << 13) | ring_va; + tcs_out_offsets = (output_patch0_offset / 16) | ((perpatch_output_offset / 16) << 16); + offchip_layout = + *num_patches | (num_tcs_output_cp << 6) | (pervertex_output_patch_size * *num_patches << 12); + + /* Compute the LDS size. */ + lds_size = output_patch0_offset + output_patch_size * *num_patches; + + if (sctx->chip_class >= GFX7) { + assert(lds_size <= 65536); + lds_size = align(lds_size, 512) / 512; + } else { + assert(lds_size <= 32768); + lds_size = align(lds_size, 256) / 256; + } + + /* Set SI_SGPR_VS_STATE_BITS. */ + sctx->current_vs_state &= C_VS_STATE_LS_OUT_PATCH_SIZE & C_VS_STATE_LS_OUT_VERTEX_SIZE; + sctx->current_vs_state |= tcs_in_layout; + + /* We should be able to support in-shader LDS use with LLVM >= 9 + * by just adding the lds_sizes together, but it has never + * been tested. */ + assert(ls_current->config.lds_size == 0); + + if (sctx->chip_class >= GFX9) { + unsigned hs_rsrc2 = ls_current->config.rsrc2; + + if (sctx->chip_class >= GFX10) + hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size); + else + hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size); + + radeon_set_sh_reg(cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2); + + /* Set userdata SGPRs for merged LS-HS. 
*/ + radeon_set_sh_reg_seq( + cs, R_00B430_SPI_SHADER_USER_DATA_LS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3); + radeon_emit(cs, offchip_layout); + radeon_emit(cs, tcs_out_offsets); + radeon_emit(cs, tcs_out_layout); + } else { + unsigned ls_rsrc2 = ls_current->config.rsrc2; + + si_multiwave_lds_size_workaround(sctx->screen, &lds_size); + ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size); + + /* Due to a hw bug, RSRC2_LS must be written twice with another + * LS register written in between. */ + if (sctx->chip_class == GFX7 && sctx->family != CHIP_HAWAII) + radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2); + radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); + radeon_emit(cs, ls_current->config.rsrc1); + radeon_emit(cs, ls_rsrc2); + + /* Set userdata SGPRs for TCS. */ + radeon_set_sh_reg_seq( + cs, R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4); + radeon_emit(cs, offchip_layout); + radeon_emit(cs, tcs_out_offsets); + radeon_emit(cs, tcs_out_layout); + radeon_emit(cs, tcs_in_layout); + } + + /* Set userdata SGPRs for TES. */ + radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2); + radeon_emit(cs, offchip_layout); + radeon_emit(cs, ring_va); + + ls_hs_config = S_028B58_NUM_PATCHES(*num_patches) | S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) | + S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp); + + if (sctx->last_ls_hs_config != ls_hs_config) { + if (sctx->chip_class >= GFX7) { + radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config); + } else { + radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config); + } + sctx->last_ls_hs_config = ls_hs_config; + sctx->context_roll = true; + } } static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info, - enum pipe_prim_type prim) + enum pipe_prim_type prim) { - switch (prim) { - case PIPE_PRIM_PATCHES: - return info->count / info->vertices_per_patch; - case PIPE_PRIM_POLYGON: - return info->count >= 3; - case SI_PRIM_RECTANGLE_LIST: - return info->count / 3; - default: - return u_decomposed_prims_for_vertices(prim, info->count); - } + switch (prim) { + case PIPE_PRIM_PATCHES: + return info->count / info->vertices_per_patch; + case PIPE_PRIM_POLYGON: + return info->count >= 3; + case SI_PRIM_RECTANGLE_LIST: + return info->count / 3; + default: + return u_decomposed_prims_for_vertices(prim, info->count); + } } -static unsigned -si_get_init_multi_vgt_param(struct si_screen *sscreen, - union si_vgt_param_key *key) +static unsigned si_get_init_multi_vgt_param(struct si_screen *sscreen, union si_vgt_param_key *key) { - STATIC_ASSERT(sizeof(union si_vgt_param_key) == 4); - unsigned max_primgroup_in_wave = 2; - - /* SWITCH_ON_EOP(0) is always preferable. */ - bool wd_switch_on_eop = false; - bool ia_switch_on_eop = false; - bool ia_switch_on_eoi = false; - bool partial_vs_wave = false; - bool partial_es_wave = false; - - if (key->u.uses_tess) { - /* SWITCH_ON_EOI must be set if PrimID is used. */ - if (key->u.tess_uses_prim_id) - ia_switch_on_eoi = true; - - /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */ - if ((sscreen->info.family == CHIP_TAHITI || - sscreen->info.family == CHIP_PITCAIRN || - sscreen->info.family == CHIP_BONAIRE) && - key->u.uses_gs) - partial_vs_wave = true; - - /* Needed for 028B6C_DISTRIBUTION_MODE != 0. 
(implies >= GFX8) */ - if (sscreen->info.has_distributed_tess) { - if (key->u.uses_gs) { - if (sscreen->info.chip_class == GFX8) - partial_es_wave = true; - } else { - partial_vs_wave = true; - } - } - } - - /* This is a hardware requirement. */ - if (key->u.line_stipple_enabled || - (sscreen->debug_flags & DBG(SWITCH_ON_EOP))) { - ia_switch_on_eop = true; - wd_switch_on_eop = true; - } - - if (sscreen->info.chip_class >= GFX7) { - /* WD_SWITCH_ON_EOP has no effect on GPUs with less than - * 4 shader engines. Set 1 to pass the assertion below. - * The other cases are hardware requirements. - * - * Polaris supports primitive restart with WD_SWITCH_ON_EOP=0 - * for points, line strips, and tri strips. - */ - if (sscreen->info.max_se <= 2 || - key->u.prim == PIPE_PRIM_POLYGON || - key->u.prim == PIPE_PRIM_LINE_LOOP || - key->u.prim == PIPE_PRIM_TRIANGLE_FAN || - key->u.prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY || - (key->u.primitive_restart && - (sscreen->info.family < CHIP_POLARIS10 || - (key->u.prim != PIPE_PRIM_POINTS && - key->u.prim != PIPE_PRIM_LINE_STRIP && - key->u.prim != PIPE_PRIM_TRIANGLE_STRIP))) || - key->u.count_from_stream_output) - wd_switch_on_eop = true; - - /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0. - * We don't know that for indirect drawing, so treat it as - * always problematic. */ - if (sscreen->info.family == CHIP_HAWAII && - key->u.uses_instancing) - wd_switch_on_eop = true; - - /* Performance recommendation for 4 SE Gfx7-8 parts if - * instances are smaller than a primgroup. - * Assume indirect draws always use small instances. - * This is needed for good VS wave utilization. - */ - if (sscreen->info.chip_class <= GFX8 && - sscreen->info.max_se == 4 && - key->u.multi_instances_smaller_than_primgroup) - wd_switch_on_eop = true; - - /* Required on GFX7 and later. */ - if (sscreen->info.max_se == 4 && !wd_switch_on_eop) - ia_switch_on_eoi = true; - - /* HW engineers suggested that PARTIAL_VS_WAVE_ON should be set - * to work around a GS hang. - */ - if (key->u.uses_gs && - (sscreen->info.family == CHIP_TONGA || - sscreen->info.family == CHIP_FIJI || - sscreen->info.family == CHIP_POLARIS10 || - sscreen->info.family == CHIP_POLARIS11 || - sscreen->info.family == CHIP_POLARIS12 || - sscreen->info.family == CHIP_VEGAM)) - partial_vs_wave = true; - - /* Required by Hawaii and, for some special cases, by GFX8. */ - if (ia_switch_on_eoi && - (sscreen->info.family == CHIP_HAWAII || - (sscreen->info.chip_class == GFX8 && - (key->u.uses_gs || max_primgroup_in_wave != 2)))) - partial_vs_wave = true; - - /* Instancing bug on Bonaire. */ - if (sscreen->info.family == CHIP_BONAIRE && ia_switch_on_eoi && - key->u.uses_instancing) - partial_vs_wave = true; - - /* This only applies to Polaris10 and later 4 SE chips. - * wd_switch_on_eop is already true on all other chips. - */ - if (!wd_switch_on_eop && key->u.primitive_restart) - partial_vs_wave = true; - - /* If the WD switch is false, the IA switch must be false too. */ - assert(wd_switch_on_eop || !ia_switch_on_eop); - } - - /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */ - if (sscreen->info.chip_class <= GFX8 && ia_switch_on_eoi) - partial_es_wave = true; - - return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | - S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) | - S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) | - S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) | - S_028AA8_WD_SWITCH_ON_EOP(sscreen->info.chip_class >= GFX7 ? 
wd_switch_on_eop : 0) | - /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */ - S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->info.chip_class == GFX8 ? - max_primgroup_in_wave : 0) | - S_030960_EN_INST_OPT_BASIC(sscreen->info.chip_class >= GFX9) | - S_030960_EN_INST_OPT_ADV(sscreen->info.chip_class >= GFX9); + STATIC_ASSERT(sizeof(union si_vgt_param_key) == 4); + unsigned max_primgroup_in_wave = 2; + + /* SWITCH_ON_EOP(0) is always preferable. */ + bool wd_switch_on_eop = false; + bool ia_switch_on_eop = false; + bool ia_switch_on_eoi = false; + bool partial_vs_wave = false; + bool partial_es_wave = false; + + if (key->u.uses_tess) { + /* SWITCH_ON_EOI must be set if PrimID is used. */ + if (key->u.tess_uses_prim_id) + ia_switch_on_eoi = true; + + /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */ + if ((sscreen->info.family == CHIP_TAHITI || sscreen->info.family == CHIP_PITCAIRN || + sscreen->info.family == CHIP_BONAIRE) && + key->u.uses_gs) + partial_vs_wave = true; + + /* Needed for 028B6C_DISTRIBUTION_MODE != 0. (implies >= GFX8) */ + if (sscreen->info.has_distributed_tess) { + if (key->u.uses_gs) { + if (sscreen->info.chip_class == GFX8) + partial_es_wave = true; + } else { + partial_vs_wave = true; + } + } + } + + /* This is a hardware requirement. */ + if (key->u.line_stipple_enabled || (sscreen->debug_flags & DBG(SWITCH_ON_EOP))) { + ia_switch_on_eop = true; + wd_switch_on_eop = true; + } + + if (sscreen->info.chip_class >= GFX7) { + /* WD_SWITCH_ON_EOP has no effect on GPUs with less than + * 4 shader engines. Set 1 to pass the assertion below. + * The other cases are hardware requirements. + * + * Polaris supports primitive restart with WD_SWITCH_ON_EOP=0 + * for points, line strips, and tri strips. + */ + if (sscreen->info.max_se <= 2 || key->u.prim == PIPE_PRIM_POLYGON || + key->u.prim == PIPE_PRIM_LINE_LOOP || key->u.prim == PIPE_PRIM_TRIANGLE_FAN || + key->u.prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY || + (key->u.primitive_restart && + (sscreen->info.family < CHIP_POLARIS10 || + (key->u.prim != PIPE_PRIM_POINTS && key->u.prim != PIPE_PRIM_LINE_STRIP && + key->u.prim != PIPE_PRIM_TRIANGLE_STRIP))) || + key->u.count_from_stream_output) + wd_switch_on_eop = true; + + /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0. + * We don't know that for indirect drawing, so treat it as + * always problematic. */ + if (sscreen->info.family == CHIP_HAWAII && key->u.uses_instancing) + wd_switch_on_eop = true; + + /* Performance recommendation for 4 SE Gfx7-8 parts if + * instances are smaller than a primgroup. + * Assume indirect draws always use small instances. + * This is needed for good VS wave utilization. + */ + if (sscreen->info.chip_class <= GFX8 && sscreen->info.max_se == 4 && + key->u.multi_instances_smaller_than_primgroup) + wd_switch_on_eop = true; + + /* Required on GFX7 and later. */ + if (sscreen->info.max_se == 4 && !wd_switch_on_eop) + ia_switch_on_eoi = true; + + /* HW engineers suggested that PARTIAL_VS_WAVE_ON should be set + * to work around a GS hang. + */ + if (key->u.uses_gs && + (sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI || + sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 || + sscreen->info.family == CHIP_POLARIS12 || sscreen->info.family == CHIP_VEGAM)) + partial_vs_wave = true; + + /* Required by Hawaii and, for some special cases, by GFX8. 
*/ + if (ia_switch_on_eoi && + (sscreen->info.family == CHIP_HAWAII || + (sscreen->info.chip_class == GFX8 && (key->u.uses_gs || max_primgroup_in_wave != 2)))) + partial_vs_wave = true; + + /* Instancing bug on Bonaire. */ + if (sscreen->info.family == CHIP_BONAIRE && ia_switch_on_eoi && key->u.uses_instancing) + partial_vs_wave = true; + + /* This only applies to Polaris10 and later 4 SE chips. + * wd_switch_on_eop is already true on all other chips. + */ + if (!wd_switch_on_eop && key->u.primitive_restart) + partial_vs_wave = true; + + /* If the WD switch is false, the IA switch must be false too. */ + assert(wd_switch_on_eop || !ia_switch_on_eop); + } + + /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */ + if (sscreen->info.chip_class <= GFX8 && ia_switch_on_eoi) + partial_es_wave = true; + + return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) | + S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) | + S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) | + S_028AA8_WD_SWITCH_ON_EOP(sscreen->info.chip_class >= GFX7 ? wd_switch_on_eop : 0) | + /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */ + S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->info.chip_class == GFX8 ? max_primgroup_in_wave + : 0) | + S_030960_EN_INST_OPT_BASIC(sscreen->info.chip_class >= GFX9) | + S_030960_EN_INST_OPT_ADV(sscreen->info.chip_class >= GFX9); } static void si_init_ia_multi_vgt_param_table(struct si_context *sctx) { - for (int prim = 0; prim <= SI_PRIM_RECTANGLE_LIST; prim++) - for (int uses_instancing = 0; uses_instancing < 2; uses_instancing++) - for (int multi_instances = 0; multi_instances < 2; multi_instances++) - for (int primitive_restart = 0; primitive_restart < 2; primitive_restart++) - for (int count_from_so = 0; count_from_so < 2; count_from_so++) - for (int line_stipple = 0; line_stipple < 2; line_stipple++) - for (int uses_tess = 0; uses_tess < 2; uses_tess++) - for (int tess_uses_primid = 0; tess_uses_primid < 2; tess_uses_primid++) - for (int uses_gs = 0; uses_gs < 2; uses_gs++) { - union si_vgt_param_key key; - - key.index = 0; - key.u.prim = prim; - key.u.uses_instancing = uses_instancing; - key.u.multi_instances_smaller_than_primgroup = multi_instances; - key.u.primitive_restart = primitive_restart; - key.u.count_from_stream_output = count_from_so; - key.u.line_stipple_enabled = line_stipple; - key.u.uses_tess = uses_tess; - key.u.tess_uses_prim_id = tess_uses_primid; - key.u.uses_gs = uses_gs; - - sctx->ia_multi_vgt_param[key.index] = - si_get_init_multi_vgt_param(sctx->screen, &key); - } + for (int prim = 0; prim <= SI_PRIM_RECTANGLE_LIST; prim++) + for (int uses_instancing = 0; uses_instancing < 2; uses_instancing++) + for (int multi_instances = 0; multi_instances < 2; multi_instances++) + for (int primitive_restart = 0; primitive_restart < 2; primitive_restart++) + for (int count_from_so = 0; count_from_so < 2; count_from_so++) + for (int line_stipple = 0; line_stipple < 2; line_stipple++) + for (int uses_tess = 0; uses_tess < 2; uses_tess++) + for (int tess_uses_primid = 0; tess_uses_primid < 2; tess_uses_primid++) + for (int uses_gs = 0; uses_gs < 2; uses_gs++) { + union si_vgt_param_key key; + + key.index = 0; + key.u.prim = prim; + key.u.uses_instancing = uses_instancing; + key.u.multi_instances_smaller_than_primgroup = multi_instances; + key.u.primitive_restart = primitive_restart; + key.u.count_from_stream_output = count_from_so; + key.u.line_stipple_enabled = line_stipple; + key.u.uses_tess = uses_tess; + key.u.tess_uses_prim_id = 
tess_uses_primid; + key.u.uses_gs = uses_gs; + + sctx->ia_multi_vgt_param[key.index] = + si_get_init_multi_vgt_param(sctx->screen, &key); + } } static bool si_is_line_stipple_enabled(struct si_context *sctx) { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - return rs->line_stipple_enable && - sctx->current_rast_prim != PIPE_PRIM_POINTS && - (rs->polygon_mode_is_lines || - util_prim_is_lines(sctx->current_rast_prim)); + return rs->line_stipple_enable && sctx->current_rast_prim != PIPE_PRIM_POINTS && + (rs->polygon_mode_is_lines || util_prim_is_lines(sctx->current_rast_prim)); } static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, - const struct pipe_draw_info *info, - enum pipe_prim_type prim, - unsigned num_patches, - unsigned instance_count, - bool primitive_restart) + const struct pipe_draw_info *info, + enum pipe_prim_type prim, unsigned num_patches, + unsigned instance_count, bool primitive_restart) { - union si_vgt_param_key key = sctx->ia_multi_vgt_param_key; - unsigned primgroup_size; - unsigned ia_multi_vgt_param; - - if (sctx->tes_shader.cso) { - primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */ - } else if (sctx->gs_shader.cso) { - primgroup_size = 64; /* recommended with a GS */ - } else { - primgroup_size = 128; /* recommended without a GS and tess */ - } - - key.u.prim = prim; - key.u.uses_instancing = info->indirect || instance_count > 1; - key.u.multi_instances_smaller_than_primgroup = - info->indirect || - (instance_count > 1 && - (info->count_from_stream_output || - si_num_prims_for_vertices(info, prim) < primgroup_size)); - key.u.primitive_restart = primitive_restart; - key.u.count_from_stream_output = info->count_from_stream_output != NULL; - key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx); - - ia_multi_vgt_param = sctx->ia_multi_vgt_param[key.index] | - S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1); - - if (sctx->gs_shader.cso) { - /* GS requirement. */ - if (sctx->chip_class <= GFX8 && - SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3) - ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1); - - /* GS hw bug with single-primitive instances and SWITCH_ON_EOI. - * The hw doc says all multi-SE chips are affected, but Vulkan - * only applies it to Hawaii. Do what Vulkan does. 
- */ - if (sctx->family == CHIP_HAWAII && - G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) && - (info->indirect || - (instance_count > 1 && - (info->count_from_stream_output || - si_num_prims_for_vertices(info, prim) <= 1)))) - sctx->flags |= SI_CONTEXT_VGT_FLUSH; - } - - return ia_multi_vgt_param; + union si_vgt_param_key key = sctx->ia_multi_vgt_param_key; + unsigned primgroup_size; + unsigned ia_multi_vgt_param; + + if (sctx->tes_shader.cso) { + primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */ + } else if (sctx->gs_shader.cso) { + primgroup_size = 64; /* recommended with a GS */ + } else { + primgroup_size = 128; /* recommended without a GS and tess */ + } + + key.u.prim = prim; + key.u.uses_instancing = info->indirect || instance_count > 1; + key.u.multi_instances_smaller_than_primgroup = + info->indirect || + (instance_count > 1 && + (info->count_from_stream_output || si_num_prims_for_vertices(info, prim) < primgroup_size)); + key.u.primitive_restart = primitive_restart; + key.u.count_from_stream_output = info->count_from_stream_output != NULL; + key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx); + + ia_multi_vgt_param = + sctx->ia_multi_vgt_param[key.index] | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1); + + if (sctx->gs_shader.cso) { + /* GS requirement. */ + if (sctx->chip_class <= GFX8 && + SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3) + ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1); + + /* GS hw bug with single-primitive instances and SWITCH_ON_EOI. + * The hw doc says all multi-SE chips are affected, but Vulkan + * only applies it to Hawaii. Do what Vulkan does. + */ + if (sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) && + (info->indirect || (instance_count > 1 && (info->count_from_stream_output || + si_num_prims_for_vertices(info, prim) <= 1)))) + sctx->flags |= SI_CONTEXT_VGT_FLUSH; + } + + return ia_multi_vgt_param; } static unsigned si_conv_prim_to_gs_out(unsigned mode) { - static const int prim_conv[] = { - [PIPE_PRIM_POINTS] = V_028A6C_OUTPRIM_TYPE_POINTLIST, - [PIPE_PRIM_LINES] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, - [PIPE_PRIM_LINE_LOOP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, - [PIPE_PRIM_LINE_STRIP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, - [PIPE_PRIM_TRIANGLES] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_TRIANGLE_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_TRIANGLE_FAN] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_QUADS] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_QUAD_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_POLYGON] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_LINES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, - [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, - [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_PATCHES] = V_028A6C_OUTPRIM_TYPE_POINTLIST, - [SI_PRIM_RECTANGLE_LIST] = V_028A6C_VGT_OUT_RECT_V0, - }; - assert(mode < ARRAY_SIZE(prim_conv)); - - return prim_conv[mode]; + static const int prim_conv[] = { + [PIPE_PRIM_POINTS] = V_028A6C_OUTPRIM_TYPE_POINTLIST, + [PIPE_PRIM_LINES] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, + [PIPE_PRIM_LINE_LOOP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, + [PIPE_PRIM_LINE_STRIP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, + [PIPE_PRIM_TRIANGLES] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_TRIANGLE_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_QUADS] = 
V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_QUAD_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_POLYGON] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_LINES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, + [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, + [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_PATCHES] = V_028A6C_OUTPRIM_TYPE_POINTLIST, + [SI_PRIM_RECTANGLE_LIST] = V_028A6C_VGT_OUT_RECT_V0, + }; + assert(mode < ARRAY_SIZE(prim_conv)); + + return prim_conv[mode]; } /* rast_prim is the primitive type after GS. */ static void si_emit_rasterizer_prim_state(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - enum pipe_prim_type rast_prim = sctx->current_rast_prim; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned initial_cdw = cs->current.cdw; - - if (unlikely(si_is_line_stipple_enabled(sctx))) { - /* For lines, reset the stipple pattern at each primitive. Otherwise, - * reset the stipple pattern at each packet (line strips, line loops). - */ - unsigned value = rs->pa_sc_line_stipple | - S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : 2); - - radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE, - SI_TRACKED_PA_SC_LINE_STIPPLE, value); - } - - unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim); - if (unlikely(gs_out_prim != sctx->last_gs_out_prim && - (sctx->ngg || sctx->gs_shader.cso))) { - radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim); - sctx->last_gs_out_prim = gs_out_prim; - } - - if (initial_cdw != cs->current.cdw) - sctx->context_roll = true; - - if (sctx->ngg) { - unsigned vtx_index = rs->flatshade_first ? 0 : gs_out_prim; - - sctx->current_vs_state &= C_VS_STATE_OUTPRIM & - C_VS_STATE_PROVOKING_VTX_INDEX; - sctx->current_vs_state |= S_VS_STATE_OUTPRIM(gs_out_prim) | - S_VS_STATE_PROVOKING_VTX_INDEX(vtx_index); - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + enum pipe_prim_type rast_prim = sctx->current_rast_prim; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + unsigned initial_cdw = cs->current.cdw; + + if (unlikely(si_is_line_stipple_enabled(sctx))) { + /* For lines, reset the stipple pattern at each primitive. Otherwise, + * reset the stipple pattern at each packet (line strips, line loops). + */ + unsigned value = + rs->pa_sc_line_stipple | S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : 2); + + radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE, SI_TRACKED_PA_SC_LINE_STIPPLE, + value); + } + + unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim); + if (unlikely(gs_out_prim != sctx->last_gs_out_prim && (sctx->ngg || sctx->gs_shader.cso))) { + radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim); + sctx->last_gs_out_prim = gs_out_prim; + } + + if (initial_cdw != cs->current.cdw) + sctx->context_roll = true; + + if (sctx->ngg) { + unsigned vtx_index = rs->flatshade_first ? 
0 : gs_out_prim; + + sctx->current_vs_state &= C_VS_STATE_OUTPRIM & C_VS_STATE_PROVOKING_VTX_INDEX; + sctx->current_vs_state |= + S_VS_STATE_OUTPRIM(gs_out_prim) | S_VS_STATE_PROVOKING_VTX_INDEX(vtx_index); + } } -static void si_emit_vs_state(struct si_context *sctx, - const struct pipe_draw_info *info) +static void si_emit_vs_state(struct si_context *sctx, const struct pipe_draw_info *info) { - sctx->current_vs_state &= C_VS_STATE_INDEXED; - sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size); - - if (sctx->num_vs_blit_sgprs) { - /* Re-emit the state after we leave u_blitter. */ - sctx->last_vs_state = ~0; - return; - } - - if (sctx->current_vs_state != sctx->last_vs_state) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - /* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */ - radeon_set_sh_reg(cs, - sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] + - SI_SGPR_VS_STATE_BITS * 4, - sctx->current_vs_state); - - /* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage - * before the rasterizer. - * - * For TES or the GS copy shader without NGG: - */ - if (sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] != - R_00B130_SPI_SHADER_USER_DATA_VS_0) { - radeon_set_sh_reg(cs, - R_00B130_SPI_SHADER_USER_DATA_VS_0 + - SI_SGPR_VS_STATE_BITS * 4, - sctx->current_vs_state); - } - - /* For NGG: */ - if (sctx->screen->use_ngg && - sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] != - R_00B230_SPI_SHADER_USER_DATA_GS_0) { - radeon_set_sh_reg(cs, - R_00B230_SPI_SHADER_USER_DATA_GS_0 + - SI_SGPR_VS_STATE_BITS * 4, - sctx->current_vs_state); - } - - sctx->last_vs_state = sctx->current_vs_state; - } + sctx->current_vs_state &= C_VS_STATE_INDEXED; + sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size); + + if (sctx->num_vs_blit_sgprs) { + /* Re-emit the state after we leave u_blitter. */ + sctx->last_vs_state = ~0; + return; + } + + if (sctx->current_vs_state != sctx->last_vs_state) { + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + /* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */ + radeon_set_sh_reg( + cs, sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_STATE_BITS * 4, + sctx->current_vs_state); + + /* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage + * before the rasterizer. 
+ * + * For TES or the GS copy shader without NGG: + */ + if (sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] != R_00B130_SPI_SHADER_USER_DATA_VS_0) { + radeon_set_sh_reg(cs, R_00B130_SPI_SHADER_USER_DATA_VS_0 + SI_SGPR_VS_STATE_BITS * 4, + sctx->current_vs_state); + } + + /* For NGG: */ + if (sctx->screen->use_ngg && + sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] != R_00B230_SPI_SHADER_USER_DATA_GS_0) { + radeon_set_sh_reg(cs, R_00B230_SPI_SHADER_USER_DATA_GS_0 + SI_SGPR_VS_STATE_BITS * 4, + sctx->current_vs_state); + } + + sctx->last_vs_state = sctx->current_vs_state; + } } -static inline bool si_prim_restart_index_changed(struct si_context *sctx, - bool primitive_restart, - unsigned restart_index) +static inline bool si_prim_restart_index_changed(struct si_context *sctx, bool primitive_restart, + unsigned restart_index) { - return primitive_restart && - (restart_index != sctx->last_restart_index || - sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN); + return primitive_restart && (restart_index != sctx->last_restart_index || + sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN); } -static void si_emit_ia_multi_vgt_param(struct si_context *sctx, - const struct pipe_draw_info *info, - enum pipe_prim_type prim, - unsigned num_patches, - unsigned instance_count, - bool primitive_restart) +static void si_emit_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_info *info, + enum pipe_prim_type prim, unsigned num_patches, + unsigned instance_count, bool primitive_restart) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned ia_multi_vgt_param; - - ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, prim, num_patches, - instance_count, primitive_restart); - - /* Draw state. */ - if (ia_multi_vgt_param != sctx->last_multi_vgt_param) { - if (sctx->chip_class == GFX9) - radeon_set_uconfig_reg_idx(cs, sctx->screen, - R_030960_IA_MULTI_VGT_PARAM, 4, - ia_multi_vgt_param); - else if (sctx->chip_class >= GFX7) - radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); - else - radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); - - sctx->last_multi_vgt_param = ia_multi_vgt_param; - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned ia_multi_vgt_param; + + ia_multi_vgt_param = + si_get_ia_multi_vgt_param(sctx, info, prim, num_patches, instance_count, primitive_restart); + + /* Draw state. */ + if (ia_multi_vgt_param != sctx->last_multi_vgt_param) { + if (sctx->chip_class == GFX9) + radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030960_IA_MULTI_VGT_PARAM, 4, + ia_multi_vgt_param); + else if (sctx->chip_class >= GFX7) + radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); + else + radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); + + sctx->last_multi_vgt_param = ia_multi_vgt_param; + } } /* GFX10 removed IA_MULTI_VGT_PARAM in exchange for GE_CNTL. 
@@ -720,1601 +662,1460 @@ static void si_emit_ia_multi_vgt_param(struct si_context *sctx, */ static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches) { - union si_vgt_param_key key = sctx->ia_multi_vgt_param_key; - unsigned ge_cntl; - - if (sctx->ngg) { - if (sctx->tes_shader.cso) { - ge_cntl = S_03096C_PRIM_GRP_SIZE(num_patches) | - S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */ - S_03096C_BREAK_WAVE_AT_EOI(key.u.tess_uses_prim_id); - } else { - ge_cntl = si_get_vs_state(sctx)->ge_cntl; - } - } else { - unsigned primgroup_size; - unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */; - - if (sctx->tes_shader.cso) { - primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */ - } else if (sctx->gs_shader.cso) { - unsigned vgt_gs_onchip_cntl = sctx->gs_shader.current->ctx_reg.gs.vgt_gs_onchip_cntl; - primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl); - } else { - primgroup_size = 128; /* recommended without a GS and tess */ - } - - ge_cntl = S_03096C_PRIM_GRP_SIZE(primgroup_size) | - S_03096C_VERT_GRP_SIZE(vertgroup_size) | - S_03096C_BREAK_WAVE_AT_EOI(key.u.uses_tess && key.u.tess_uses_prim_id); - } - - ge_cntl |= S_03096C_PACKET_TO_ONE_PA(si_is_line_stipple_enabled(sctx)); - - if (ge_cntl != sctx->last_multi_vgt_param) { - radeon_set_uconfig_reg(sctx->gfx_cs, R_03096C_GE_CNTL, ge_cntl); - sctx->last_multi_vgt_param = ge_cntl; - } + union si_vgt_param_key key = sctx->ia_multi_vgt_param_key; + unsigned ge_cntl; + + if (sctx->ngg) { + if (sctx->tes_shader.cso) { + ge_cntl = S_03096C_PRIM_GRP_SIZE(num_patches) | + S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */ + S_03096C_BREAK_WAVE_AT_EOI(key.u.tess_uses_prim_id); + } else { + ge_cntl = si_get_vs_state(sctx)->ge_cntl; + } + } else { + unsigned primgroup_size; + unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */ + ; + + if (sctx->tes_shader.cso) { + primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */ + } else if (sctx->gs_shader.cso) { + unsigned vgt_gs_onchip_cntl = sctx->gs_shader.current->ctx_reg.gs.vgt_gs_onchip_cntl; + primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl); + } else { + primgroup_size = 128; /* recommended without a GS and tess */ + } + + ge_cntl = S_03096C_PRIM_GRP_SIZE(primgroup_size) | S_03096C_VERT_GRP_SIZE(vertgroup_size) | + S_03096C_BREAK_WAVE_AT_EOI(key.u.uses_tess && key.u.tess_uses_prim_id); + } + + ge_cntl |= S_03096C_PACKET_TO_ONE_PA(si_is_line_stipple_enabled(sctx)); + + if (ge_cntl != sctx->last_multi_vgt_param) { + radeon_set_uconfig_reg(sctx->gfx_cs, R_03096C_GE_CNTL, ge_cntl); + sctx->last_multi_vgt_param = ge_cntl; + } } -static void si_emit_draw_registers(struct si_context *sctx, - const struct pipe_draw_info *info, - enum pipe_prim_type prim, - unsigned num_patches, - unsigned instance_count, - bool primitive_restart) +static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_draw_info *info, + enum pipe_prim_type prim, unsigned num_patches, + unsigned instance_count, bool primitive_restart) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned vgt_prim = si_conv_pipe_prim(prim); - - if (sctx->chip_class >= GFX10) - gfx10_emit_ge_cntl(sctx, num_patches); - else - si_emit_ia_multi_vgt_param(sctx, info, prim, num_patches, - instance_count, primitive_restart); - - if (vgt_prim != sctx->last_prim) { - if (sctx->chip_class >= GFX10) - radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim); - else if (sctx->chip_class >= GFX7) - 
radeon_set_uconfig_reg_idx(cs, sctx->screen, - R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim); - else - radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim); - - sctx->last_prim = vgt_prim; - } - - /* Primitive restart. */ - if (primitive_restart != sctx->last_primitive_restart_en) { - if (sctx->chip_class >= GFX9) - radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, - primitive_restart); - else - radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, - primitive_restart); - - sctx->last_primitive_restart_en = primitive_restart; - - } - if (si_prim_restart_index_changed(sctx, primitive_restart, info->restart_index)) { - radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, - info->restart_index); - sctx->last_restart_index = info->restart_index; - sctx->context_roll = true; - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned vgt_prim = si_conv_pipe_prim(prim); + + if (sctx->chip_class >= GFX10) + gfx10_emit_ge_cntl(sctx, num_patches); + else + si_emit_ia_multi_vgt_param(sctx, info, prim, num_patches, instance_count, primitive_restart); + + if (vgt_prim != sctx->last_prim) { + if (sctx->chip_class >= GFX10) + radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim); + else if (sctx->chip_class >= GFX7) + radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim); + else + radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim); + + sctx->last_prim = vgt_prim; + } + + /* Primitive restart. */ + if (primitive_restart != sctx->last_primitive_restart_en) { + if (sctx->chip_class >= GFX9) + radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart); + else + radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart); + + sctx->last_primitive_restart_en = primitive_restart; + } + if (si_prim_restart_index_changed(sctx, primitive_restart, info->restart_index)) { + radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, info->restart_index); + sctx->last_restart_index = info->restart_index; + sctx->context_roll = true; + } } -static void si_emit_draw_packets(struct si_context *sctx, - const struct pipe_draw_info *info, - struct pipe_resource *indexbuf, - unsigned index_size, - unsigned index_offset, - unsigned instance_count, - bool dispatch_prim_discard_cs, - unsigned original_index_size) +static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info, + struct pipe_resource *indexbuf, unsigned index_size, + unsigned index_offset, unsigned instance_count, + bool dispatch_prim_discard_cs, unsigned original_index_size) { - struct pipe_draw_indirect_info *indirect = info->indirect; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX]; - bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off; - uint32_t index_max_size = 0; - uint64_t index_va = 0; - - if (info->count_from_stream_output) { - struct si_streamout_target *t = - (struct si_streamout_target*)info->count_from_stream_output; - - radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, - t->stride_in_dw); - si_cp_copy_data(sctx, sctx->gfx_cs, - COPY_DATA_REG, NULL, - R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, - COPY_DATA_SRC_MEM, t->buf_filled_size, - t->buf_filled_size_offset); - } - - /* draw packet */ - if (index_size) { - if (index_size != sctx->last_index_size) { - unsigned index_type; - - /* index type */ - switch (index_size) { - case 1: - 
index_type = V_028A7C_VGT_INDEX_8; - break; - case 2: - index_type = V_028A7C_VGT_INDEX_16 | - (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? - V_028A7C_VGT_DMA_SWAP_16_BIT : 0); - break; - case 4: - index_type = V_028A7C_VGT_INDEX_32 | - (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? - V_028A7C_VGT_DMA_SWAP_32_BIT : 0); - break; - default: - assert(!"unreachable"); - return; - } - - if (sctx->chip_class >= GFX9) { - radeon_set_uconfig_reg_idx(cs, sctx->screen, - R_03090C_VGT_INDEX_TYPE, 2, - index_type); - } else { - radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); - radeon_emit(cs, index_type); - } - - sctx->last_index_size = index_size; - } - - if (original_index_size) { - index_max_size = (indexbuf->width0 - index_offset) / - original_index_size; - /* Skip draw calls with 0-sized index buffers. - * They cause a hang on some chips, like Navi10-14. - */ - if (!index_max_size) - return; - - index_va = si_resource(indexbuf)->gpu_address + index_offset; - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(indexbuf), - RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); - } - } else { - /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, - * so the state must be re-emitted before the next indexed draw. - */ - if (sctx->chip_class >= GFX7) - sctx->last_index_size = -1; - } - - if (indirect) { - uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address; - - assert(indirect_va % 8 == 0); - - si_invalidate_draw_sh_constants(sctx); - - radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0)); - radeon_emit(cs, 1); - radeon_emit(cs, indirect_va); - radeon_emit(cs, indirect_va >> 32); - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(indirect->buffer), - RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); - - unsigned di_src_sel = index_size ? V_0287F0_DI_SRC_SEL_DMA - : V_0287F0_DI_SRC_SEL_AUTO_INDEX; - - assert(indirect->offset % 4 == 0); - - if (index_size) { - radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0)); - radeon_emit(cs, index_va); - radeon_emit(cs, index_va >> 32); - - radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0)); - radeon_emit(cs, index_max_size); - } - - if (!sctx->screen->has_draw_indirect_multi) { - radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT - : PKT3_DRAW_INDIRECT, - 3, render_cond_bit)); - radeon_emit(cs, indirect->offset); - radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, di_src_sel); - } else { - uint64_t count_va = 0; - - if (indirect->indirect_draw_count) { - struct si_resource *params_buf = - si_resource(indirect->indirect_draw_count); - - radeon_add_to_buffer_list( - sctx, sctx->gfx_cs, params_buf, - RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); - - count_va = params_buf->gpu_address + indirect->indirect_draw_count_offset; - } - - radeon_emit(cs, PKT3(index_size ? 
PKT3_DRAW_INDEX_INDIRECT_MULTI : - PKT3_DRAW_INDIRECT_MULTI, - 8, render_cond_bit)); - radeon_emit(cs, indirect->offset); - radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, ((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) | - S_2C3_DRAW_INDEX_ENABLE(1) | - S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count)); - radeon_emit(cs, indirect->draw_count); - radeon_emit(cs, count_va); - radeon_emit(cs, count_va >> 32); - radeon_emit(cs, indirect->stride); - radeon_emit(cs, di_src_sel); - } - } else { - int base_vertex; - - if (sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN || - sctx->last_instance_count != instance_count) { - radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0)); - radeon_emit(cs, instance_count); - sctx->last_instance_count = instance_count; - } - - /* Base vertex and start instance. */ - base_vertex = original_index_size ? info->index_bias : info->start; - - if (sctx->num_vs_blit_sgprs) { - /* Re-emit draw constants after we leave u_blitter. */ - si_invalidate_draw_sh_constants(sctx); - - /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */ - radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, - sctx->num_vs_blit_sgprs); - radeon_emit_array(cs, sctx->vs_blit_sh_data, - sctx->num_vs_blit_sgprs); - } else if (base_vertex != sctx->last_base_vertex || - sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN || - info->start_instance != sctx->last_start_instance || - info->drawid != sctx->last_drawid || - sh_base_reg != sctx->last_sh_base_reg) { - radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3); - radeon_emit(cs, base_vertex); - radeon_emit(cs, info->start_instance); - radeon_emit(cs, info->drawid); - - sctx->last_base_vertex = base_vertex; - sctx->last_start_instance = info->start_instance; - sctx->last_drawid = info->drawid; - sctx->last_sh_base_reg = sh_base_reg; - } - - if (index_size) { - if (dispatch_prim_discard_cs) { - index_va += info->start * original_index_size; - index_max_size = MIN2(index_max_size, info->count); - - si_dispatch_prim_discard_cs_and_draw(sctx, info, - original_index_size, - base_vertex, - index_va, index_max_size); - return; - } - - index_va += info->start * index_size; - - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); - radeon_emit(cs, index_max_size); - radeon_emit(cs, index_va); - radeon_emit(cs, index_va >> 32); - radeon_emit(cs, info->count); - radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA); - } else { - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); - radeon_emit(cs, info->count); - radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | - S_0287F0_USE_OPAQUE(!!info->count_from_stream_output)); - } - } + struct pipe_draw_indirect_info *indirect = info->indirect; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX]; + bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off; + uint32_t index_max_size = 0; + uint64_t index_va = 0; + + if (info->count_from_stream_output) { + struct si_streamout_target *t = (struct si_streamout_target *)info->count_from_stream_output; + + radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, t->stride_in_dw); + si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_REG, NULL, + R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, COPY_DATA_SRC_MEM, + t->buf_filled_size, t->buf_filled_size_offset); + } + 
+ /* draw packet */ + if (index_size) { + if (index_size != sctx->last_index_size) { + unsigned index_type; + + /* index type */ + switch (index_size) { + case 1: + index_type = V_028A7C_VGT_INDEX_8; + break; + case 2: + index_type = + V_028A7C_VGT_INDEX_16 | + (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? V_028A7C_VGT_DMA_SWAP_16_BIT : 0); + break; + case 4: + index_type = + V_028A7C_VGT_INDEX_32 | + (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? V_028A7C_VGT_DMA_SWAP_32_BIT : 0); + break; + default: + assert(!"unreachable"); + return; + } + + if (sctx->chip_class >= GFX9) { + radeon_set_uconfig_reg_idx(cs, sctx->screen, R_03090C_VGT_INDEX_TYPE, 2, index_type); + } else { + radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); + radeon_emit(cs, index_type); + } + + sctx->last_index_size = index_size; + } + + if (original_index_size) { + index_max_size = (indexbuf->width0 - index_offset) / original_index_size; + /* Skip draw calls with 0-sized index buffers. + * They cause a hang on some chips, like Navi10-14. + */ + if (!index_max_size) + return; + + index_va = si_resource(indexbuf)->gpu_address + index_offset; + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ, + RADEON_PRIO_INDEX_BUFFER); + } + } else { + /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, + * so the state must be re-emitted before the next indexed draw. + */ + if (sctx->chip_class >= GFX7) + sctx->last_index_size = -1; + } + + if (indirect) { + uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address; + + assert(indirect_va % 8 == 0); + + si_invalidate_draw_sh_constants(sctx); + + radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0)); + radeon_emit(cs, 1); + radeon_emit(cs, indirect_va); + radeon_emit(cs, indirect_va >> 32); + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(indirect->buffer), + RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); + + unsigned di_src_sel = index_size ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX; + + assert(indirect->offset % 4 == 0); + + if (index_size) { + radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0)); + radeon_emit(cs, index_va); + radeon_emit(cs, index_va >> 32); + + radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0)); + radeon_emit(cs, index_max_size); + } + + if (!sctx->screen->has_draw_indirect_multi) { + radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, + render_cond_bit)); + radeon_emit(cs, indirect->offset); + radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, di_src_sel); + } else { + uint64_t count_va = 0; + + if (indirect->indirect_draw_count) { + struct si_resource *params_buf = si_resource(indirect->indirect_draw_count); + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, params_buf, RADEON_USAGE_READ, + RADEON_PRIO_DRAW_INDIRECT); + + count_va = params_buf->gpu_address + indirect->indirect_draw_count_offset; + } + + radeon_emit(cs, + PKT3(index_size ? 
PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, + render_cond_bit)); + radeon_emit(cs, indirect->offset); + radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, ((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) | + S_2C3_DRAW_INDEX_ENABLE(1) | + S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count)); + radeon_emit(cs, indirect->draw_count); + radeon_emit(cs, count_va); + radeon_emit(cs, count_va >> 32); + radeon_emit(cs, indirect->stride); + radeon_emit(cs, di_src_sel); + } + } else { + int base_vertex; + + if (sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN || + sctx->last_instance_count != instance_count) { + radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0)); + radeon_emit(cs, instance_count); + sctx->last_instance_count = instance_count; + } + + /* Base vertex and start instance. */ + base_vertex = original_index_size ? info->index_bias : info->start; + + if (sctx->num_vs_blit_sgprs) { + /* Re-emit draw constants after we leave u_blitter. */ + si_invalidate_draw_sh_constants(sctx); + + /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */ + radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, sctx->num_vs_blit_sgprs); + radeon_emit_array(cs, sctx->vs_blit_sh_data, sctx->num_vs_blit_sgprs); + } else if (base_vertex != sctx->last_base_vertex || + sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN || + info->start_instance != sctx->last_start_instance || + info->drawid != sctx->last_drawid || sh_base_reg != sctx->last_sh_base_reg) { + radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3); + radeon_emit(cs, base_vertex); + radeon_emit(cs, info->start_instance); + radeon_emit(cs, info->drawid); + + sctx->last_base_vertex = base_vertex; + sctx->last_start_instance = info->start_instance; + sctx->last_drawid = info->drawid; + sctx->last_sh_base_reg = sh_base_reg; + } + + if (index_size) { + if (dispatch_prim_discard_cs) { + index_va += info->start * original_index_size; + index_max_size = MIN2(index_max_size, info->count); + + si_dispatch_prim_discard_cs_and_draw(sctx, info, original_index_size, base_vertex, + index_va, index_max_size); + return; + } + + index_va += info->start * index_size; + + radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); + radeon_emit(cs, index_max_size); + radeon_emit(cs, index_va); + radeon_emit(cs, index_va >> 32); + radeon_emit(cs, info->count); + radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA); + } else { + radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); + radeon_emit(cs, info->count); + radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | + S_0287F0_USE_OPAQUE(!!info->count_from_stream_output)); + } + } } -void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, - unsigned cp_coher_cntl) +void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl) { - bool compute_ib = !sctx->has_graphics || - cs == sctx->prim_discard_compute_cs; - - assert(sctx->chip_class <= GFX9); - - if (sctx->chip_class == GFX9 || compute_ib) { - /* Flush caches and wait for the caches to assert idle. 
*/ - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0)); - radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - } else { - /* ACQUIRE_MEM is only required on a compute ring. */ - radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0)); - radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - } - - /* ACQUIRE_MEM has an implicit context roll if the current context - * is busy. */ - if (!compute_ib) - sctx->context_roll = true; + bool compute_ib = !sctx->has_graphics || cs == sctx->prim_discard_compute_cs; + + assert(sctx->chip_class <= GFX9); + + if (sctx->chip_class == GFX9 || compute_ib) { + /* Flush caches and wait for the caches to assert idle. */ + radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0)); + radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ + radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ + radeon_emit(cs, 0); /* CP_COHER_BASE */ + radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ + radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + } else { + /* ACQUIRE_MEM is only required on a compute ring. */ + radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0)); + radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ + radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cs, 0); /* CP_COHER_BASE */ + radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + } + + /* ACQUIRE_MEM has an implicit context roll if the current context + * is busy. */ + if (!compute_ib) + sctx->context_roll = true; } void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx) { - if (!si_compute_prim_discard_enabled(sctx)) - return; - - if (!sctx->barrier_buf) { - u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4, - &sctx->barrier_buf_offset, - (struct pipe_resource**)&sctx->barrier_buf); - } - - /* Emit a placeholder to signal the next compute IB to start. - * See si_compute_prim_discard.c for explanation. - */ - uint32_t signal = 1; - si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, - 4, V_370_MEM, V_370_ME, &signal); - - sctx->last_pkt3_write_data = - &sctx->gfx_cs->current.buf[sctx->gfx_cs->current.cdw - 5]; - - /* Only the last occurence of WRITE_DATA will be executed. - * The packet will be enabled in si_flush_gfx_cs. - */ - *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0); + if (!si_compute_prim_discard_enabled(sctx)) + return; + + if (!sctx->barrier_buf) { + u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4, &sctx->barrier_buf_offset, + (struct pipe_resource **)&sctx->barrier_buf); + } + + /* Emit a placeholder to signal the next compute IB to start. + * See si_compute_prim_discard.c for explanation. + */ + uint32_t signal = 1; + si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, 4, V_370_MEM, V_370_ME, + &signal); + + sctx->last_pkt3_write_data = &sctx->gfx_cs->current.buf[sctx->gfx_cs->current.cdw - 5]; + + /* Only the last occurence of WRITE_DATA will be executed. + * The packet will be enabled in si_flush_gfx_cs. 
+ */ + *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0); } void gfx10_emit_cache_flush(struct si_context *ctx) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - uint32_t gcr_cntl = 0; - unsigned cb_db_event = 0; - unsigned flags = ctx->flags; - - if (!ctx->has_graphics) { - /* Only process compute flags. */ - flags &= SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_L2 | - SI_CONTEXT_WB_L2 | - SI_CONTEXT_INV_L2_METADATA | - SI_CONTEXT_CS_PARTIAL_FLUSH; - } - - /* We don't need these. */ - assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | - SI_CONTEXT_FLUSH_AND_INV_DB_META))); - - if (flags & SI_CONTEXT_VGT_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); - } - - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) - ctx->num_cb_cache_flushes++; - if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) - ctx->num_db_cache_flushes++; - - if (flags & SI_CONTEXT_INV_ICACHE) - gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL); - if (flags & SI_CONTEXT_INV_SCACHE) { - /* TODO: When writing to the SMEM L1 cache, we need to set SEQ - * to FORWARD when both L1 and L2 are written out (WB or INV). - */ - gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1); - } - if (flags & SI_CONTEXT_INV_VCACHE) - gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1); - - /* The L2 cache ops are: - * - INV: - invalidate lines that reflect memory (were loaded from memory) - * - don't touch lines that were overwritten (were stored by gfx clients) - * - WB: - don't touch lines that reflect memory - * - write back lines that were overwritten - * - WB | INV: - invalidate lines that reflect memory - * - write back lines that were overwritten - * - * GLM doesn't support WB alone. If WB is set, INV must be set too. - */ - if (flags & SI_CONTEXT_INV_L2) { - /* Writeback and invalidate everything in L2. */ - gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | - S_586_GLM_INV(1) | S_586_GLM_WB(1); - ctx->num_L2_invalidates++; - } else if (flags & SI_CONTEXT_WB_L2) { - gcr_cntl |= S_586_GL2_WB(1) | - S_586_GLM_WB(1) | S_586_GLM_INV(1); - } else if (flags & SI_CONTEXT_INV_L2_METADATA) { - gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1); - } - - if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { - /* Flush CMASK/FMASK/DCC. Will wait for idle later. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | - EVENT_INDEX(0)); - } - if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) { - /* Flush HTILE. Will wait for idle later. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | - EVENT_INDEX(0)); - } - - /* First flush CB/DB, then L1/L2. */ - gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD); - - if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) == - (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { - cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; - } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { - cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; - } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) { - cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; - } else { - assert(0); - } - } else { - /* Wait for graphics shaders to go idle if requested. 
*/ - if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - /* Only count explicit shader flushes, not implicit ones. */ - ctx->num_vs_flushes++; - ctx->num_ps_flushes++; - } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - ctx->num_vs_flushes++; - } - } - - if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4))); - ctx->num_cs_flushes++; - ctx->compute_is_busy = false; - } - - if (cb_db_event) { - /* CB/DB flush and invalidate (or possibly just a wait for a - * meta flush) via RELEASE_MEM. - * - * Combine this with other cache flushes when possible; this - * requires affected shaders to be idle, so do it after the - * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always - * implied). - */ - uint64_t va; - - /* Do the flush (enqueue the event and wait for it). */ - va = ctx->wait_mem_scratch->gpu_address; - ctx->wait_mem_number++; - - /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */ - unsigned glm_wb = G_586_GLM_WB(gcr_cntl); - unsigned glm_inv = G_586_GLM_INV(gcr_cntl); - unsigned glv_inv = G_586_GLV_INV(gcr_cntl); - unsigned gl1_inv = G_586_GL1_INV(gcr_cntl); - assert(G_586_GL2_US(gcr_cntl) == 0); - assert(G_586_GL2_RANGE(gcr_cntl) == 0); - assert(G_586_GL2_DISCARD(gcr_cntl) == 0); - unsigned gl2_inv = G_586_GL2_INV(gcr_cntl); - unsigned gl2_wb = G_586_GL2_WB(gcr_cntl); - unsigned gcr_seq = G_586_SEQ(gcr_cntl); - - gcr_cntl &= C_586_GLM_WB & - C_586_GLM_INV & - C_586_GLV_INV & - C_586_GL1_INV & - C_586_GL2_INV & - C_586_GL2_WB; /* keep SEQ */ - - si_cp_release_mem(ctx, cs, cb_db_event, - S_490_GLM_WB(glm_wb) | - S_490_GLM_INV(glm_inv) | - S_490_GLV_INV(glv_inv) | - S_490_GL1_INV(gl1_inv) | - S_490_GL2_INV(gl2_inv) | - S_490_GL2_WB(gl2_wb) | - S_490_SEQ(gcr_seq), - EOP_DST_SEL_MEM, - EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, - EOP_DATA_SEL_VALUE_32BIT, - ctx->wait_mem_scratch, va, - ctx->wait_mem_number, SI_NOT_QUERY); - si_cp_wait_mem(ctx, ctx->gfx_cs, va, ctx->wait_mem_number, 0xffffffff, - WAIT_REG_MEM_EQUAL); - } - - /* Ignore fields that only modify the behavior of other fields. */ - if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) { - /* Flush caches and wait for the caches to assert idle. - * The cache flush is executed in the ME, but the PFP waits - * for completion. - */ - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(cs, 0); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - radeon_emit(cs, gcr_cntl); /* GCR_CNTL */ - } else if (cb_db_event || - (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | - SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH))) { - /* We need to ensure that PFP waits as well. 
*/ - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); - } - - if (flags & SI_CONTEXT_START_PIPELINE_STATS) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | - EVENT_INDEX(0)); - } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | - EVENT_INDEX(0)); - } - - ctx->flags = 0; + struct radeon_cmdbuf *cs = ctx->gfx_cs; + uint32_t gcr_cntl = 0; + unsigned cb_db_event = 0; + unsigned flags = ctx->flags; + + if (!ctx->has_graphics) { + /* Only process compute flags. */ + flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA | + SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + /* We don't need these. */ + assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META))); + + if (flags & SI_CONTEXT_VGT_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + } + + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) + ctx->num_cb_cache_flushes++; + if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) + ctx->num_db_cache_flushes++; + + if (flags & SI_CONTEXT_INV_ICACHE) + gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL); + if (flags & SI_CONTEXT_INV_SCACHE) { + /* TODO: When writing to the SMEM L1 cache, we need to set SEQ + * to FORWARD when both L1 and L2 are written out (WB or INV). + */ + gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1); + } + if (flags & SI_CONTEXT_INV_VCACHE) + gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1); + + /* The L2 cache ops are: + * - INV: - invalidate lines that reflect memory (were loaded from memory) + * - don't touch lines that were overwritten (were stored by gfx clients) + * - WB: - don't touch lines that reflect memory + * - write back lines that were overwritten + * - WB | INV: - invalidate lines that reflect memory + * - write back lines that were overwritten + * + * GLM doesn't support WB alone. If WB is set, INV must be set too. + */ + if (flags & SI_CONTEXT_INV_L2) { + /* Writeback and invalidate everything in L2. */ + gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1); + ctx->num_L2_invalidates++; + } else if (flags & SI_CONTEXT_WB_L2) { + gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1); + } else if (flags & SI_CONTEXT_INV_L2_METADATA) { + gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1); + } + + if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { + /* Flush CMASK/FMASK/DCC. Will wait for idle later. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); + } + if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) { + /* Flush HTILE. Will wait for idle later. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); + } + + /* First flush CB/DB, then L1/L2. 
*/ + gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD); + + if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) == + (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { + cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; + } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { + cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; + } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) { + cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; + } else { + assert(0); + } + } else { + /* Wait for graphics shaders to go idle if requested. */ + if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + /* Only count explicit shader flushes, not implicit ones. */ + ctx->num_vs_flushes++; + ctx->num_ps_flushes++; + } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + ctx->num_vs_flushes++; + } + } + + if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4))); + ctx->num_cs_flushes++; + ctx->compute_is_busy = false; + } + + if (cb_db_event) { + /* CB/DB flush and invalidate (or possibly just a wait for a + * meta flush) via RELEASE_MEM. + * + * Combine this with other cache flushes when possible; this + * requires affected shaders to be idle, so do it after the + * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always + * implied). + */ + uint64_t va; + + /* Do the flush (enqueue the event and wait for it). */ + va = ctx->wait_mem_scratch->gpu_address; + ctx->wait_mem_number++; + + /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */ + unsigned glm_wb = G_586_GLM_WB(gcr_cntl); + unsigned glm_inv = G_586_GLM_INV(gcr_cntl); + unsigned glv_inv = G_586_GLV_INV(gcr_cntl); + unsigned gl1_inv = G_586_GL1_INV(gcr_cntl); + assert(G_586_GL2_US(gcr_cntl) == 0); + assert(G_586_GL2_RANGE(gcr_cntl) == 0); + assert(G_586_GL2_DISCARD(gcr_cntl) == 0); + unsigned gl2_inv = G_586_GL2_INV(gcr_cntl); + unsigned gl2_wb = G_586_GL2_WB(gcr_cntl); + unsigned gcr_seq = G_586_SEQ(gcr_cntl); + + gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV & + C_586_GL2_WB; /* keep SEQ */ + + si_cp_release_mem(ctx, cs, cb_db_event, + S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) | + S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) | + S_490_SEQ(gcr_seq), + EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, + EOP_DATA_SEL_VALUE_32BIT, ctx->wait_mem_scratch, va, ctx->wait_mem_number, + SI_NOT_QUERY); + si_cp_wait_mem(ctx, ctx->gfx_cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL); + } + + /* Ignore fields that only modify the behavior of other fields. */ + if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) { + /* Flush caches and wait for the caches to assert idle. + * The cache flush is executed in the ME, but the PFP waits + * for completion. 
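+ *
+ * Since the PFP itself stalls on this ACQUIRE_MEM, no separate
+ * PFP_SYNC_ME is needed on this path; the "else if" below only emits
+ * PFP_SYNC_ME when the ACQUIRE_MEM is skipped.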
+ */ + radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); + radeon_emit(cs, 0); /* CP_COHER_CNTL */ + radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ + radeon_emit(cs, 0); /* CP_COHER_BASE */ + radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ + radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + radeon_emit(cs, gcr_cntl); /* GCR_CNTL */ + } else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH))) { + /* We need to ensure that PFP waits as well. */ + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + } + + if (flags & SI_CONTEXT_START_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); + } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); + } + + ctx->flags = 0; } void si_emit_cache_flush(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - uint32_t flags = sctx->flags; - - if (!sctx->has_graphics) { - /* Only process compute flags. */ - flags &= SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_L2 | - SI_CONTEXT_WB_L2 | - SI_CONTEXT_INV_L2_METADATA | - SI_CONTEXT_CS_PARTIAL_FLUSH; - } - - uint32_t cp_coher_cntl = 0; - const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_FLUSH_AND_INV_DB); - const bool is_barrier = flush_cb_db || - /* INV_ICACHE == beginning of gfx IB. Checking - * INV_ICACHE fixes corruption for DeusExMD with - * compute-based culling, but I don't know why. - */ - flags & (SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_VS_PARTIAL_FLUSH) || - (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && - sctx->compute_is_busy); - - assert(sctx->chip_class <= GFX9); - - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) - sctx->num_cb_cache_flushes++; - if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) - sctx->num_db_cache_flushes++; - - /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either - * bit is set. An alternative way is to write SQC_CACHES, but that - * doesn't seem to work reliably. Since the bug doesn't affect - * correctness (it only does more work than necessary) and - * the performance impact is likely negligible, there is no plan - * to add a workaround for it. - */ - - if (flags & SI_CONTEXT_INV_ICACHE) - cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); - if (flags & SI_CONTEXT_INV_SCACHE) - cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); - - if (sctx->chip_class <= GFX8) { - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { - cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | - S_0085F0_CB0_DEST_BASE_ENA(1) | - S_0085F0_CB1_DEST_BASE_ENA(1) | - S_0085F0_CB2_DEST_BASE_ENA(1) | - S_0085F0_CB3_DEST_BASE_ENA(1) | - S_0085F0_CB4_DEST_BASE_ENA(1) | - S_0085F0_CB5_DEST_BASE_ENA(1) | - S_0085F0_CB6_DEST_BASE_ENA(1) | - S_0085F0_CB7_DEST_BASE_ENA(1); - - /* Necessary for DCC */ - if (sctx->chip_class == GFX8) - si_cp_release_mem(sctx, cs, - V_028A90_FLUSH_AND_INV_CB_DATA_TS, - 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_DISCARD, NULL, - 0, 0, SI_NOT_QUERY); - } - if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) - cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | - S_0085F0_DB_DEST_BASE_ENA(1); - } - - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { - /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. 
*/ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); - } - if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | - SI_CONTEXT_FLUSH_AND_INV_DB_META)) { - /* Flush HTILE. SURFACE_SYNC will wait for idle. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); - } - - /* Wait for shader engines to go idle. - * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait - * for everything including CB/DB cache flushes. - */ - if (!flush_cb_db) { - if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - /* Only count explicit shader flushes, not implicit ones - * done by SURFACE_SYNC. - */ - sctx->num_vs_flushes++; - sctx->num_ps_flushes++; - } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - sctx->num_vs_flushes++; - } - } - - if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && - sctx->compute_is_busy) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - sctx->num_cs_flushes++; - sctx->compute_is_busy = false; - } - - /* VGT state synchronization. */ - if (flags & SI_CONTEXT_VGT_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); - } - if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0)); - } - - /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't - * wait for idle on GFX9. We have to use a TS event. - */ - if (sctx->chip_class == GFX9 && flush_cb_db) { - uint64_t va; - unsigned tc_flags, cb_db_event; - - /* Set the CB/DB flush event. */ - switch (flush_cb_db) { - case SI_CONTEXT_FLUSH_AND_INV_CB: - cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; - break; - case SI_CONTEXT_FLUSH_AND_INV_DB: - cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; - break; - default: - /* both CB & DB */ - cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; - } - - /* These are the only allowed combinations. If you need to - * do multiple operations at once, do them separately. - * All operations that invalidate L2 also seem to invalidate - * metadata. Volatile (VOL) and WC flushes are not listed here. - * - * TC | TC_WB = writeback & invalidate L2 & L1 - * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC - * TC_WB | TC_NC = writeback L2 for MTYPE == NC - * TC | TC_NC = invalidate L2 for MTYPE == NC - * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.) - * TCL1 = invalidate L1 - */ - tc_flags = 0; - - if (flags & SI_CONTEXT_INV_L2_METADATA) { - tc_flags = EVENT_TC_ACTION_ENA | - EVENT_TC_MD_ACTION_ENA; - } - - /* Ideally flush TC together with CB/DB. */ - if (flags & SI_CONTEXT_INV_L2) { - /* Writeback and invalidate everything in L2 & L1. */ - tc_flags = EVENT_TC_ACTION_ENA | - EVENT_TC_WB_ACTION_ENA; - - /* Clear the flags. */ - flags &= ~(SI_CONTEXT_INV_L2 | - SI_CONTEXT_WB_L2 | - SI_CONTEXT_INV_VCACHE); - sctx->num_L2_invalidates++; - } - - /* Do the flush (enqueue the event and wait for it). 
*/ - va = sctx->wait_mem_scratch->gpu_address; - sctx->wait_mem_number++; - - si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, - EOP_DST_SEL_MEM, - EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, - EOP_DATA_SEL_VALUE_32BIT, - sctx->wait_mem_scratch, va, - sctx->wait_mem_number, SI_NOT_QUERY); - si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, - WAIT_REG_MEM_EQUAL); - } - - /* Make sure ME is idle (it executes most packets) before continuing. - * This prevents read-after-write hazards between PFP and ME. - */ - if (sctx->has_graphics && - (cp_coher_cntl || - (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_L2 | - SI_CONTEXT_WB_L2)))) { - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); - } - - /* GFX6-GFX8 only: - * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC - * waits for idle, so it should be last. SURFACE_SYNC is done in PFP. - * - * cp_coher_cntl should contain all necessary flags except TC flags - * at this point. - * - * GFX6-GFX7 don't support L2 write-back. - */ - if (flags & SI_CONTEXT_INV_L2 || - (sctx->chip_class <= GFX7 && - (flags & SI_CONTEXT_WB_L2))) { - /* Invalidate L1 & L2. (L1 is always invalidated on GFX6) - * WB must be set on GFX8+ when TC_ACTION is set. - */ - si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl | - S_0085F0_TC_ACTION_ENA(1) | - S_0085F0_TCL1_ACTION_ENA(1) | - S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8)); - cp_coher_cntl = 0; - sctx->num_L2_invalidates++; - } else { - /* L1 invalidation and L2 writeback must be done separately, - * because both operations can't be done together. - */ - if (flags & SI_CONTEXT_WB_L2) { - /* WB = write-back - * NC = apply to non-coherent MTYPEs - * (i.e. MTYPE <= 1, which is what we use everywhere) - * - * WB doesn't work without NC. - */ - si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl | - S_0301F0_TC_WB_ACTION_ENA(1) | - S_0301F0_TC_NC_ACTION_ENA(1)); - cp_coher_cntl = 0; - sctx->num_L2_writebacks++; - } - if (flags & SI_CONTEXT_INV_VCACHE) { - /* Invalidate per-CU VMEM L1. */ - si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl | - S_0085F0_TCL1_ACTION_ENA(1)); - cp_coher_cntl = 0; - } - } - - /* If TC flushes haven't cleared this... */ - if (cp_coher_cntl) - si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl); - - if (is_barrier) - si_prim_discard_signal_next_compute_ib_start(sctx); - - if (flags & SI_CONTEXT_START_PIPELINE_STATS) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | - EVENT_INDEX(0)); - } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | - EVENT_INDEX(0)); - } - - sctx->flags = 0; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + uint32_t flags = sctx->flags; + + if (!sctx->has_graphics) { + /* Only process compute flags. */ + flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA | + SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + uint32_t cp_coher_cntl = 0; + const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB); + const bool is_barrier = + flush_cb_db || + /* INV_ICACHE == beginning of gfx IB. Checking + * INV_ICACHE fixes corruption for DeusExMD with + * compute-based culling, but I don't know why. 
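+ * (Within this function, is_barrier only decides whether
+ * si_prim_discard_signal_next_compute_ib_start() is called once the
+ * flush has been emitted, see the end of this function.)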
+ */ + flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) || + (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy); + + assert(sctx->chip_class <= GFX9); + + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) + sctx->num_cb_cache_flushes++; + if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) + sctx->num_db_cache_flushes++; + + /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either + * bit is set. An alternative way is to write SQC_CACHES, but that + * doesn't seem to work reliably. Since the bug doesn't affect + * correctness (it only does more work than necessary) and + * the performance impact is likely negligible, there is no plan + * to add a workaround for it. + */ + + if (flags & SI_CONTEXT_INV_ICACHE) + cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); + if (flags & SI_CONTEXT_INV_SCACHE) + cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); + + if (sctx->chip_class <= GFX8) { + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { + cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) | + S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) | + S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) | + S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) | + S_0085F0_CB7_DEST_BASE_ENA(1); + + /* Necessary for DCC */ + if (sctx->chip_class == GFX8) + si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM, + EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY); + } + if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) + cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1); + } + + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { + /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); + } + if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) { + /* Flush HTILE. SURFACE_SYNC will wait for idle. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); + } + + /* Wait for shader engines to go idle. + * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait + * for everything including CB/DB cache flushes. + */ + if (!flush_cb_db) { + if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + /* Only count explicit shader flushes, not implicit ones + * done by SURFACE_SYNC. + */ + sctx->num_vs_flushes++; + sctx->num_ps_flushes++; + } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + sctx->num_vs_flushes++; + } + } + + if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + sctx->num_cs_flushes++; + sctx->compute_is_busy = false; + } + + /* VGT state synchronization. */ + if (flags & SI_CONTEXT_VGT_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + } + if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0)); + } + + /* GFX9: Wait for idle if we're flushing CB or DB. 
ACQUIRE_MEM doesn't + * wait for idle on GFX9. We have to use a TS event. + */ + if (sctx->chip_class == GFX9 && flush_cb_db) { + uint64_t va; + unsigned tc_flags, cb_db_event; + + /* Set the CB/DB flush event. */ + switch (flush_cb_db) { + case SI_CONTEXT_FLUSH_AND_INV_CB: + cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; + break; + case SI_CONTEXT_FLUSH_AND_INV_DB: + cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; + break; + default: + /* both CB & DB */ + cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; + } + + /* These are the only allowed combinations. If you need to + * do multiple operations at once, do them separately. + * All operations that invalidate L2 also seem to invalidate + * metadata. Volatile (VOL) and WC flushes are not listed here. + * + * TC | TC_WB = writeback & invalidate L2 & L1 + * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC + * TC_WB | TC_NC = writeback L2 for MTYPE == NC + * TC | TC_NC = invalidate L2 for MTYPE == NC + * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.) + * TCL1 = invalidate L1 + */ + tc_flags = 0; + + if (flags & SI_CONTEXT_INV_L2_METADATA) { + tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA; + } + + /* Ideally flush TC together with CB/DB. */ + if (flags & SI_CONTEXT_INV_L2) { + /* Writeback and invalidate everything in L2 & L1. */ + tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA; + + /* Clear the flags. */ + flags &= ~(SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_VCACHE); + sctx->num_L2_invalidates++; + } + + /* Do the flush (enqueue the event and wait for it). */ + va = sctx->wait_mem_scratch->gpu_address; + sctx->wait_mem_number++; + + si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM, + EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT, + sctx->wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY); + si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL); + } + + /* Make sure ME is idle (it executes most packets) before continuing. + * This prevents read-after-write hazards between PFP and ME. + */ + if (sctx->has_graphics && + (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) { + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + } + + /* GFX6-GFX8 only: + * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC + * waits for idle, so it should be last. SURFACE_SYNC is done in PFP. + * + * cp_coher_cntl should contain all necessary flags except TC flags + * at this point. + * + * GFX6-GFX7 don't support L2 write-back. + */ + if (flags & SI_CONTEXT_INV_L2 || (sctx->chip_class <= GFX7 && (flags & SI_CONTEXT_WB_L2))) { + /* Invalidate L1 & L2. (L1 is always invalidated on GFX6) + * WB must be set on GFX8+ when TC_ACTION is set. + */ + si_emit_surface_sync(sctx, sctx->gfx_cs, + cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | + S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8)); + cp_coher_cntl = 0; + sctx->num_L2_invalidates++; + } else { + /* L1 invalidation and L2 writeback must be done separately, + * because both operations can't be done together. + */ + if (flags & SI_CONTEXT_WB_L2) { + /* WB = write-back + * NC = apply to non-coherent MTYPEs + * (i.e. MTYPE <= 1, which is what we use everywhere) + * + * WB doesn't work without NC. 
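+ * That is why the SURFACE_SYNC below always sets TC_WB_ACTION_ENA and
+ * TC_NC_ACTION_ENA together instead of TC_WB_ACTION_ENA alone.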
+ */ + si_emit_surface_sync( + sctx, sctx->gfx_cs, + cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1)); + cp_coher_cntl = 0; + sctx->num_L2_writebacks++; + } + if (flags & SI_CONTEXT_INV_VCACHE) { + /* Invalidate per-CU VMEM L1. */ + si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1)); + cp_coher_cntl = 0; + } + } + + /* If TC flushes haven't cleared this... */ + if (cp_coher_cntl) + si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl); + + if (is_barrier) + si_prim_discard_signal_next_compute_ib_start(sctx); + + if (flags & SI_CONTEXT_START_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); + } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); + } + + sctx->flags = 0; } -static void si_get_draw_start_count(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned *start, unsigned *count) +static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_draw_info *info, + unsigned *start, unsigned *count) { - struct pipe_draw_indirect_info *indirect = info->indirect; - - if (indirect) { - unsigned indirect_count; - struct pipe_transfer *transfer; - unsigned begin, end; - unsigned map_size; - unsigned *data; - - if (indirect->indirect_draw_count) { - data = pipe_buffer_map_range(&sctx->b, - indirect->indirect_draw_count, - indirect->indirect_draw_count_offset, - sizeof(unsigned), - PIPE_TRANSFER_READ, &transfer); - - indirect_count = *data; - - pipe_buffer_unmap(&sctx->b, transfer); - } else { - indirect_count = indirect->draw_count; - } - - if (!indirect_count) { - *start = *count = 0; - return; - } - - map_size = (indirect_count - 1) * indirect->stride + 3 * sizeof(unsigned); - data = pipe_buffer_map_range(&sctx->b, indirect->buffer, - indirect->offset, map_size, - PIPE_TRANSFER_READ, &transfer); - - begin = UINT_MAX; - end = 0; - - for (unsigned i = 0; i < indirect_count; ++i) { - unsigned count = data[0]; - unsigned start = data[2]; - - if (count > 0) { - begin = MIN2(begin, start); - end = MAX2(end, start + count); - } - - data += indirect->stride / sizeof(unsigned); - } - - pipe_buffer_unmap(&sctx->b, transfer); - - if (begin < end) { - *start = begin; - *count = end - begin; - } else { - *start = *count = 0; - } - } else { - *start = info->start; - *count = info->count; - } + struct pipe_draw_indirect_info *indirect = info->indirect; + + if (indirect) { + unsigned indirect_count; + struct pipe_transfer *transfer; + unsigned begin, end; + unsigned map_size; + unsigned *data; + + if (indirect->indirect_draw_count) { + data = pipe_buffer_map_range(&sctx->b, indirect->indirect_draw_count, + indirect->indirect_draw_count_offset, sizeof(unsigned), + PIPE_TRANSFER_READ, &transfer); + + indirect_count = *data; + + pipe_buffer_unmap(&sctx->b, transfer); + } else { + indirect_count = indirect->draw_count; + } + + if (!indirect_count) { + *start = *count = 0; + return; + } + + map_size = (indirect_count - 1) * indirect->stride + 3 * sizeof(unsigned); + data = pipe_buffer_map_range(&sctx->b, indirect->buffer, indirect->offset, map_size, + PIPE_TRANSFER_READ, &transfer); + + begin = UINT_MAX; + end = 0; + + for (unsigned i = 0; i < indirect_count; ++i) { + unsigned count = data[0]; + unsigned start = data[2]; + + if (count > 0) { + begin = MIN2(begin, start); + end = MAX2(end, start + 
count); + } + + data += indirect->stride / sizeof(unsigned); + } + + pipe_buffer_unmap(&sctx->b, transfer); + + if (begin < end) { + *start = begin; + *count = end - begin; + } else { + *start = *count = 0; + } + } else { + *start = info->start; + *count = info->count; + } } static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info, - enum pipe_prim_type prim, unsigned instance_count, - bool primitive_restart, unsigned skip_atom_mask) + enum pipe_prim_type prim, unsigned instance_count, + bool primitive_restart, unsigned skip_atom_mask) { - unsigned num_patches = 0; + unsigned num_patches = 0; - si_emit_rasterizer_prim_state(sctx); - if (sctx->tes_shader.cso) - si_emit_derived_tess_state(sctx, info, &num_patches); + si_emit_rasterizer_prim_state(sctx); + if (sctx->tes_shader.cso) + si_emit_derived_tess_state(sctx, info, &num_patches); - /* Emit state atoms. */ - unsigned mask = sctx->dirty_atoms & ~skip_atom_mask; - while (mask) - sctx->atoms.array[u_bit_scan(&mask)].emit(sctx); + /* Emit state atoms. */ + unsigned mask = sctx->dirty_atoms & ~skip_atom_mask; + while (mask) + sctx->atoms.array[u_bit_scan(&mask)].emit(sctx); - sctx->dirty_atoms &= skip_atom_mask; + sctx->dirty_atoms &= skip_atom_mask; - /* Emit states. */ - mask = sctx->dirty_states; - while (mask) { - unsigned i = u_bit_scan(&mask); - struct si_pm4_state *state = sctx->queued.array[i]; + /* Emit states. */ + mask = sctx->dirty_states; + while (mask) { + unsigned i = u_bit_scan(&mask); + struct si_pm4_state *state = sctx->queued.array[i]; - if (!state || sctx->emitted.array[i] == state) - continue; + if (!state || sctx->emitted.array[i] == state) + continue; - si_pm4_emit(sctx, state); - sctx->emitted.array[i] = state; - } - sctx->dirty_states = 0; + si_pm4_emit(sctx, state); + sctx->emitted.array[i] = state; + } + sctx->dirty_states = 0; - /* Emit draw states. */ - si_emit_vs_state(sctx, info); - si_emit_draw_registers(sctx, info, prim, num_patches, instance_count, - primitive_restart); + /* Emit draw states. */ + si_emit_vs_state(sctx, info); + si_emit_draw_registers(sctx, info, prim, num_patches, instance_count, primitive_restart); } -static bool -si_all_vs_resources_read_only(struct si_context *sctx, - struct pipe_resource *indexbuf) +static bool si_all_vs_resources_read_only(struct si_context *sctx, struct pipe_resource *indexbuf) { - struct radeon_winsys *ws = sctx->ws; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - /* Index buffer. */ - if (indexbuf && - ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, - RADEON_USAGE_WRITE)) - goto has_write_reference; - - /* Vertex buffers. */ - struct si_vertex_elements *velems = sctx->vertex_elements; - unsigned num_velems = velems->count; - - for (unsigned i = 0; i < num_velems; i++) { - if (!((1 << i) & velems->first_vb_use_mask)) - continue; - - unsigned vb_index = velems->vertex_buffer_index[i]; - struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource; - if (!res) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, - RADEON_USAGE_WRITE)) - goto has_write_reference; - } - - /* Constant and shader buffers. 
*/ - struct si_descriptors *buffers = - &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)]; - for (unsigned i = 0; i < buffers->num_active_slots; i++) { - unsigned index = buffers->first_active_slot + i; - struct pipe_resource *res = - sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index]; - if (!res) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, - RADEON_USAGE_WRITE)) - goto has_write_reference; - } - - /* Samplers. */ - struct si_shader_selector *vs = sctx->vs_shader.cso; - if (vs->info.samplers_declared) { - unsigned num_samplers = util_last_bit(vs->info.samplers_declared); - - for (unsigned i = 0; i < num_samplers; i++) { - struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i]; - if (!view) - continue; - - if (ws->cs_is_buffer_referenced(cs, - si_resource(view->texture)->buf, - RADEON_USAGE_WRITE)) - goto has_write_reference; - } - } - - /* Images. */ - if (vs->info.images_declared) { - unsigned num_images = util_last_bit(vs->info.images_declared); - - for (unsigned i = 0; i < num_images; i++) { - struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource; - if (!res) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, - RADEON_USAGE_WRITE)) - goto has_write_reference; - } - } - - return true; + struct radeon_winsys *ws = sctx->ws; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + /* Index buffer. */ + if (indexbuf && ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, RADEON_USAGE_WRITE)) + goto has_write_reference; + + /* Vertex buffers. */ + struct si_vertex_elements *velems = sctx->vertex_elements; + unsigned num_velems = velems->count; + + for (unsigned i = 0; i < num_velems; i++) { + if (!((1 << i) & velems->first_vb_use_mask)) + continue; + + unsigned vb_index = velems->vertex_buffer_index[i]; + struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource; + if (!res) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE)) + goto has_write_reference; + } + + /* Constant and shader buffers. */ + struct si_descriptors *buffers = + &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)]; + for (unsigned i = 0; i < buffers->num_active_slots; i++) { + unsigned index = buffers->first_active_slot + i; + struct pipe_resource *res = sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index]; + if (!res) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE)) + goto has_write_reference; + } + + /* Samplers. */ + struct si_shader_selector *vs = sctx->vs_shader.cso; + if (vs->info.samplers_declared) { + unsigned num_samplers = util_last_bit(vs->info.samplers_declared); + + for (unsigned i = 0; i < num_samplers; i++) { + struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i]; + if (!view) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(view->texture)->buf, RADEON_USAGE_WRITE)) + goto has_write_reference; + } + } + + /* Images. 
*/ + if (vs->info.images_declared) { + unsigned num_images = util_last_bit(vs->info.images_declared); + + for (unsigned i = 0; i < num_images; i++) { + struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource; + if (!res) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE)) + goto has_write_reference; + } + } + + return true; has_write_reference: - /* If the current gfx IB has enough packets, flush it to remove write - * references to buffers. - */ - if (cs->prev_dw + cs->current.cdw > 2048) { - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - assert(si_all_vs_resources_read_only(sctx, indexbuf)); - return true; - } - return false; + /* If the current gfx IB has enough packets, flush it to remove write + * references to buffers. + */ + if (cs->prev_dw + cs->current.cdw > 2048) { + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + assert(si_all_vs_resources_read_only(sctx, indexbuf)); + return true; + } + return false; } static ALWAYS_INLINE bool pd_msg(const char *s) { - if (SI_PRIM_DISCARD_DEBUG) - printf("PD failed: %s\n", s); - return false; + if (SI_PRIM_DISCARD_DEBUG) + printf("PD failed: %s\n", s); + return false; } static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct pipe_resource *indexbuf = info->index.resource; - unsigned dirty_tex_counter, dirty_buf_counter; - enum pipe_prim_type rast_prim, prim = info->mode; - unsigned index_size = info->index_size; - unsigned index_offset = info->indirect ? info->start * index_size : 0; - unsigned instance_count = info->instance_count; - bool primitive_restart = info->primitive_restart && - (!sctx->screen->options.prim_restart_tri_strips_only || - (prim != PIPE_PRIM_TRIANGLE_STRIP && - prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)); - - if (likely(!info->indirect)) { - /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is - * no workaround for indirect draws, but we can at least skip - * direct draws. - */ - if (unlikely(!instance_count)) - return; - - /* Handle count == 0. */ - if (unlikely(!info->count && - (index_size || !info->count_from_stream_output))) - return; - } - - struct si_shader_selector *vs = sctx->vs_shader.cso; - if (unlikely(!vs || - sctx->num_vertex_elements < vs->num_vs_inputs || - (!sctx->ps_shader.cso && !rs->rasterizer_discard) || - (!!sctx->tes_shader.cso != (prim == PIPE_PRIM_PATCHES)))) { - assert(0); - return; - } - - /* Recompute and re-emit the texture resource states if needed. */ - dirty_tex_counter = p_atomic_read(&sctx->screen->dirty_tex_counter); - if (unlikely(dirty_tex_counter != sctx->last_dirty_tex_counter)) { - sctx->last_dirty_tex_counter = dirty_tex_counter; - sctx->framebuffer.dirty_cbufs |= - ((1 << sctx->framebuffer.state.nr_cbufs) - 1); - sctx->framebuffer.dirty_zsbuf = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); - si_update_all_texture_descriptors(sctx); - } - - dirty_buf_counter = p_atomic_read(&sctx->screen->dirty_buf_counter); - if (unlikely(dirty_buf_counter != sctx->last_dirty_buf_counter)) { - sctx->last_dirty_buf_counter = dirty_buf_counter; - /* Rebind all buffers unconditionally. */ - si_rebind_buffer(sctx, NULL); - } - - si_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)); - - /* Set the rasterization primitive type. 
- * - * This must be done after si_decompress_textures, which can call - * draw_vbo recursively, and before si_update_shaders, which uses - * current_rast_prim for this draw_vbo call. */ - if (sctx->gs_shader.cso) { - /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ - rast_prim = sctx->gs_shader.cso->rast_prim; - } else if (sctx->tes_shader.cso) { - /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ - rast_prim = sctx->tes_shader.cso->rast_prim; - } else if (util_rast_prim_is_triangles(prim)) { - rast_prim = PIPE_PRIM_TRIANGLES; - } else { - /* Only possibilities, POINTS, LINE*, RECTANGLES */ - rast_prim = prim; - } - - if (rast_prim != sctx->current_rast_prim) { - if (util_prim_is_points_or_lines(sctx->current_rast_prim) != - util_prim_is_points_or_lines(rast_prim)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); - - sctx->current_rast_prim = rast_prim; - sctx->do_update_shaders = true; - } - - if (sctx->tes_shader.cso && - sctx->screen->info.has_ls_vgpr_init_bug) { - /* Determine whether the LS VGPR fix should be applied. - * - * It is only required when num input CPs > num output CPs, - * which cannot happen with the fixed function TCS. We should - * also update this bit when switching from TCS to fixed - * function TCS. - */ - struct si_shader_selector *tcs = sctx->tcs_shader.cso; - bool ls_vgpr_fix = - tcs && - info->vertices_per_patch > - tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; - - if (ls_vgpr_fix != sctx->ls_vgpr_fix) { - sctx->ls_vgpr_fix = ls_vgpr_fix; - sctx->do_update_shaders = true; - } - } - - if (sctx->chip_class <= GFX9 && sctx->gs_shader.cso) { - /* Determine whether the GS triangle strip adjacency fix should - * be applied. Rotate every other triangle if - * - triangle strips with adjacency are fed to the GS and - * - primitive restart is disabled (the rotation doesn't help - * when the restart occurs after an odd number of triangles). - */ - bool gs_tri_strip_adj_fix = - !sctx->tes_shader.cso && - prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY && - !primitive_restart; - - if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) { - sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix; - sctx->do_update_shaders = true; - } - } - - if (index_size) { - /* Translate or upload, if needed. */ - /* 8-bit indices are supported on GFX8. */ - if (sctx->chip_class <= GFX7 && index_size == 1) { - unsigned start, count, start_offset, size, offset; - void *ptr; - - si_get_draw_start_count(sctx, info, &start, &count); - start_offset = start * 2; - size = count * 2; - - indexbuf = NULL; - u_upload_alloc(ctx->stream_uploader, start_offset, - size, - si_optimal_tcc_alignment(sctx, size), - &offset, &indexbuf, &ptr); - if (!indexbuf) - return; - - util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0, - index_offset + start, - count, ptr); - - /* info->start will be added by the drawing code */ - index_offset = offset - start_offset; - index_size = 2; - } else if (info->has_user_indices) { - unsigned start_offset; - - assert(!info->indirect); - start_offset = info->start * index_size; - - indexbuf = NULL; - u_upload_data(ctx->stream_uploader, start_offset, - info->count * index_size, - sctx->screen->info.tcc_cache_line_size, - (char*)info->index.user + start_offset, - &index_offset, &indexbuf); - if (!indexbuf) - return; - - /* info->start will be added by the drawing code */ - index_offset -= start_offset; - } else if (sctx->chip_class <= GFX7 && - si_resource(indexbuf)->TC_L2_dirty) { - /* GFX8 reads index buffers through TC L2, so it doesn't - * need this. 
*/ - sctx->flags |= SI_CONTEXT_WB_L2; - si_resource(indexbuf)->TC_L2_dirty = false; - } - } - - bool dispatch_prim_discard_cs = false; - bool prim_discard_cs_instancing = false; - unsigned original_index_size = index_size; - unsigned direct_count = 0; - - if (info->indirect) { - struct pipe_draw_indirect_info *indirect = info->indirect; - - /* Add the buffer size for memory checking in need_cs_space. */ - si_context_add_resource_size(sctx, indirect->buffer); - - /* Indirect buffers use TC L2 on GFX9, but not older hw. */ - if (sctx->chip_class <= GFX8) { - if (si_resource(indirect->buffer)->TC_L2_dirty) { - sctx->flags |= SI_CONTEXT_WB_L2; - si_resource(indirect->buffer)->TC_L2_dirty = false; - } - - if (indirect->indirect_draw_count && - si_resource(indirect->indirect_draw_count)->TC_L2_dirty) { - sctx->flags |= SI_CONTEXT_WB_L2; - si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false; - } - } - } else { - /* Multiply by 3 for strips and fans to get an approximate vertex - * count as triangles. */ - direct_count = info->count * instance_count * - (prim == PIPE_PRIM_TRIANGLES ? 1 : 3); - } - - /* Determine if we can use the primitive discard compute shader. */ - if (si_compute_prim_discard_enabled(sctx) && - (direct_count > sctx->prim_discard_vertex_count_threshold ? - (sctx->compute_num_verts_rejected += direct_count, true) : /* Add, then return true. */ - (sctx->compute_num_verts_ineligible += direct_count, false)) && /* Add, then return false. */ - (!info->count_from_stream_output || pd_msg("draw_opaque")) && - (primitive_restart ? - /* Supported prim types with primitive restart: */ - (prim == PIPE_PRIM_TRIANGLE_STRIP || pd_msg("bad prim type with primitive restart")) && - /* Disallow instancing with primitive restart: */ - (instance_count == 1 || pd_msg("instance_count > 1 with primitive restart")) : - /* Supported prim types without primitive restart + allow instancing: */ - (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | - (1 << PIPE_PRIM_TRIANGLE_STRIP) | - (1 << PIPE_PRIM_TRIANGLE_FAN)) && - /* Instancing is limited to 16-bit indices, because InstanceID is packed into VertexID. */ - /* TODO: DrawArraysInstanced doesn't sometimes work, so it's disabled. */ - (instance_count == 1 || - (instance_count <= USHRT_MAX && index_size && index_size <= 2) || - pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced"))) && - (info->drawid == 0 || !sctx->vs_shader.cso->info.uses_drawid || pd_msg("draw_id > 0")) && - (!sctx->render_cond || pd_msg("render condition")) && - /* Forced enablement ignores pipeline statistics queries. */ - (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) || - (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) || - pd_msg("pipestat or primgen query")) && - (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) && - (!sctx->tes_shader.cso || pd_msg("uses tess")) && - (!sctx->gs_shader.cso || pd_msg("uses GS")) && - (!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) && - !rs->polygon_mode_enabled && + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct pipe_resource *indexbuf = info->index.resource; + unsigned dirty_tex_counter, dirty_buf_counter; + enum pipe_prim_type rast_prim, prim = info->mode; + unsigned index_size = info->index_size; + unsigned index_offset = info->indirect ? 
info->start * index_size : 0; + unsigned instance_count = info->instance_count; + bool primitive_restart = + info->primitive_restart && + (!sctx->screen->options.prim_restart_tri_strips_only || + (prim != PIPE_PRIM_TRIANGLE_STRIP && prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)); + + if (likely(!info->indirect)) { + /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is + * no workaround for indirect draws, but we can at least skip + * direct draws. + */ + if (unlikely(!instance_count)) + return; + + /* Handle count == 0. */ + if (unlikely(!info->count && (index_size || !info->count_from_stream_output))) + return; + } + + struct si_shader_selector *vs = sctx->vs_shader.cso; + if (unlikely(!vs || sctx->num_vertex_elements < vs->num_vs_inputs || + (!sctx->ps_shader.cso && !rs->rasterizer_discard) || + (!!sctx->tes_shader.cso != (prim == PIPE_PRIM_PATCHES)))) { + assert(0); + return; + } + + /* Recompute and re-emit the texture resource states if needed. */ + dirty_tex_counter = p_atomic_read(&sctx->screen->dirty_tex_counter); + if (unlikely(dirty_tex_counter != sctx->last_dirty_tex_counter)) { + sctx->last_dirty_tex_counter = dirty_tex_counter; + sctx->framebuffer.dirty_cbufs |= ((1 << sctx->framebuffer.state.nr_cbufs) - 1); + sctx->framebuffer.dirty_zsbuf = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + si_update_all_texture_descriptors(sctx); + } + + dirty_buf_counter = p_atomic_read(&sctx->screen->dirty_buf_counter); + if (unlikely(dirty_buf_counter != sctx->last_dirty_buf_counter)) { + sctx->last_dirty_buf_counter = dirty_buf_counter; + /* Rebind all buffers unconditionally. */ + si_rebind_buffer(sctx, NULL); + } + + si_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)); + + /* Set the rasterization primitive type. + * + * This must be done after si_decompress_textures, which can call + * draw_vbo recursively, and before si_update_shaders, which uses + * current_rast_prim for this draw_vbo call. */ + if (sctx->gs_shader.cso) { + /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ + rast_prim = sctx->gs_shader.cso->rast_prim; + } else if (sctx->tes_shader.cso) { + /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ + rast_prim = sctx->tes_shader.cso->rast_prim; + } else if (util_rast_prim_is_triangles(prim)) { + rast_prim = PIPE_PRIM_TRIANGLES; + } else { + /* Only possibilities, POINTS, LINE*, RECTANGLES */ + rast_prim = prim; + } + + if (rast_prim != sctx->current_rast_prim) { + if (util_prim_is_points_or_lines(sctx->current_rast_prim) != + util_prim_is_points_or_lines(rast_prim)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); + + sctx->current_rast_prim = rast_prim; + sctx->do_update_shaders = true; + } + + if (sctx->tes_shader.cso && sctx->screen->info.has_ls_vgpr_init_bug) { + /* Determine whether the LS VGPR fix should be applied. + * + * It is only required when num input CPs > num output CPs, + * which cannot happen with the fixed function TCS. We should + * also update this bit when switching from TCS to fixed + * function TCS. + */ + struct si_shader_selector *tcs = sctx->tcs_shader.cso; + bool ls_vgpr_fix = + tcs && info->vertices_per_patch > tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; + + if (ls_vgpr_fix != sctx->ls_vgpr_fix) { + sctx->ls_vgpr_fix = ls_vgpr_fix; + sctx->do_update_shaders = true; + } + } + + if (sctx->chip_class <= GFX9 && sctx->gs_shader.cso) { + /* Determine whether the GS triangle strip adjacency fix should + * be applied. 
Rotate every other triangle if + * - triangle strips with adjacency are fed to the GS and + * - primitive restart is disabled (the rotation doesn't help + * when the restart occurs after an odd number of triangles). + */ + bool gs_tri_strip_adj_fix = + !sctx->tes_shader.cso && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY && !primitive_restart; + + if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) { + sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix; + sctx->do_update_shaders = true; + } + } + + if (index_size) { + /* Translate or upload, if needed. */ + /* 8-bit indices are supported on GFX8. */ + if (sctx->chip_class <= GFX7 && index_size == 1) { + unsigned start, count, start_offset, size, offset; + void *ptr; + + si_get_draw_start_count(sctx, info, &start, &count); + start_offset = start * 2; + size = count * 2; + + indexbuf = NULL; + u_upload_alloc(ctx->stream_uploader, start_offset, size, + si_optimal_tcc_alignment(sctx, size), &offset, &indexbuf, &ptr); + if (!indexbuf) + return; + + util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0, index_offset + start, count, ptr); + + /* info->start will be added by the drawing code */ + index_offset = offset - start_offset; + index_size = 2; + } else if (info->has_user_indices) { + unsigned start_offset; + + assert(!info->indirect); + start_offset = info->start * index_size; + + indexbuf = NULL; + u_upload_data(ctx->stream_uploader, start_offset, info->count * index_size, + sctx->screen->info.tcc_cache_line_size, + (char *)info->index.user + start_offset, &index_offset, &indexbuf); + if (!indexbuf) + return; + + /* info->start will be added by the drawing code */ + index_offset -= start_offset; + } else if (sctx->chip_class <= GFX7 && si_resource(indexbuf)->TC_L2_dirty) { + /* GFX8 reads index buffers through TC L2, so it doesn't + * need this. */ + sctx->flags |= SI_CONTEXT_WB_L2; + si_resource(indexbuf)->TC_L2_dirty = false; + } + } + + bool dispatch_prim_discard_cs = false; + bool prim_discard_cs_instancing = false; + unsigned original_index_size = index_size; + unsigned direct_count = 0; + + if (info->indirect) { + struct pipe_draw_indirect_info *indirect = info->indirect; + + /* Add the buffer size for memory checking in need_cs_space. */ + si_context_add_resource_size(sctx, indirect->buffer); + + /* Indirect buffers use TC L2 on GFX9, but not older hw. */ + if (sctx->chip_class <= GFX8) { + if (si_resource(indirect->buffer)->TC_L2_dirty) { + sctx->flags |= SI_CONTEXT_WB_L2; + si_resource(indirect->buffer)->TC_L2_dirty = false; + } + + if (indirect->indirect_draw_count && + si_resource(indirect->indirect_draw_count)->TC_L2_dirty) { + sctx->flags |= SI_CONTEXT_WB_L2; + si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false; + } + } + } else { + /* Multiply by 3 for strips and fans to get an approximate vertex + * count as triangles. */ + direct_count = info->count * instance_count * (prim == PIPE_PRIM_TRIANGLES ? 1 : 3); + } + + /* Determine if we can use the primitive discard compute shader. */ + if (si_compute_prim_discard_enabled(sctx) && + (direct_count > sctx->prim_discard_vertex_count_threshold + ? (sctx->compute_num_verts_rejected += direct_count, true) + : /* Add, then return true. */ + (sctx->compute_num_verts_ineligible += direct_count, + false)) && /* Add, then return false. */ + (!info->count_from_stream_output || pd_msg("draw_opaque")) && + (primitive_restart ? 
+ /* Supported prim types with primitive restart: */ + (prim == PIPE_PRIM_TRIANGLE_STRIP || pd_msg("bad prim type with primitive restart")) && + /* Disallow instancing with primitive restart: */ + (instance_count == 1 || pd_msg("instance_count > 1 with primitive restart")) + : + /* Supported prim types without primitive restart + allow instancing: */ + (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | + (1 << PIPE_PRIM_TRIANGLE_FAN)) && + /* Instancing is limited to 16-bit indices, because InstanceID is packed into + VertexID. */ + /* TODO: DrawArraysInstanced doesn't sometimes work, so it's disabled. */ + (instance_count == 1 || + (instance_count <= USHRT_MAX && index_size && index_size <= 2) || + pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced"))) && + (info->drawid == 0 || !sctx->vs_shader.cso->info.uses_drawid || pd_msg("draw_id > 0")) && + (!sctx->render_cond || pd_msg("render condition")) && + /* Forced enablement ignores pipeline statistics queries. */ + (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) || + (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) || + pd_msg("pipestat or primgen query")) && + (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) && + (!sctx->tes_shader.cso || pd_msg("uses tess")) && + (!sctx->gs_shader.cso || pd_msg("uses GS")) && + (!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) && + !rs->polygon_mode_enabled && #if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */ - (!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) && - (!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) && - (!sctx->vs_shader.cso->info.writes_memory || pd_msg("writes memory")) && - (!sctx->vs_shader.cso->info.writes_viewport_index || pd_msg("writes viewport index")) && - !sctx->vs_shader.cso->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && - !sctx->vs_shader.cso->so.num_outputs && + (!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) && + (!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) && + (!sctx->vs_shader.cso->info.writes_memory || pd_msg("writes memory")) && + (!sctx->vs_shader.cso->info.writes_viewport_index || pd_msg("writes viewport index")) && + !sctx->vs_shader.cso->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && + !sctx->vs_shader.cso->so.num_outputs && #else - (sctx->vs_shader.cso->prim_discard_cs_allowed || pd_msg("VS shader uses unsupported features")) && + (sctx->vs_shader.cso->prim_discard_cs_allowed || + pd_msg("VS shader uses unsupported features")) && #endif - /* Check that all buffers are used for read only, because compute - * dispatches can run ahead. */ - (si_all_vs_resources_read_only(sctx, index_size ? 
indexbuf : NULL) || pd_msg("write reference"))) { - switch (si_prepare_prim_discard_or_split_draw(sctx, info, primitive_restart)) { - case SI_PRIM_DISCARD_ENABLED: - original_index_size = index_size; - prim_discard_cs_instancing = instance_count > 1; - dispatch_prim_discard_cs = true; - - /* The compute shader changes/lowers the following: */ - prim = PIPE_PRIM_TRIANGLES; - index_size = 4; - instance_count = 1; - primitive_restart = false; - sctx->compute_num_verts_rejected -= direct_count; - sctx->compute_num_verts_accepted += direct_count; - break; - case SI_PRIM_DISCARD_DISABLED: - break; - case SI_PRIM_DISCARD_DRAW_SPLIT: - sctx->compute_num_verts_rejected -= direct_count; - goto return_cleanup; - } - } - - if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) { - sctx->prim_discard_cs_instancing = prim_discard_cs_instancing; - sctx->do_update_shaders = true; - } - - /* Update NGG culling settings. */ - if (sctx->ngg && - !dispatch_prim_discard_cs && - rast_prim == PIPE_PRIM_TRIANGLES && - (sctx->screen->always_use_ngg_culling || - /* At least 1024 non-indexed vertices (8 subgroups) are needed - * per draw call (no TES/GS) to enable NGG culling. - */ - (!index_size && direct_count >= 1024 && - (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) && - !sctx->tes_shader.cso && !sctx->gs_shader.cso)) && - si_get_vs(sctx)->cso->ngg_culling_allowed) { - unsigned ngg_culling = 0; - - if (rs->rasterizer_discard) { - ngg_culling |= SI_NGG_CULL_FRONT_FACE | - SI_NGG_CULL_BACK_FACE; - } else { - /* Polygon mode can't use view and small primitive culling, - * because it draws points or lines where the culling depends - * on the point or line width. - */ - if (!rs->polygon_mode_enabled) - ngg_culling |= SI_NGG_CULL_VIEW_SMALLPRIMS; - - if (sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front) - ngg_culling |= SI_NGG_CULL_FRONT_FACE; - if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back) - ngg_culling |= SI_NGG_CULL_BACK_FACE; - } - - /* Use NGG fast launch for certain non-indexed primitive types. - * A draw must have at least 1 full primitive. - */ - if (ngg_culling && !index_size && direct_count >= 3 && - !sctx->tes_shader.cso && !sctx->gs_shader.cso) { - if (prim == PIPE_PRIM_TRIANGLES) - ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST; - else if (prim == PIPE_PRIM_TRIANGLE_STRIP) - ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP; - } - - if (ngg_culling != sctx->ngg_culling) { - /* Insert a VGT_FLUSH when enabling fast launch changes to prevent hangs. - * See issues #2418, #2426, #2434 - */ - if (ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) - sctx->flags |= SI_CONTEXT_VGT_FLUSH; - sctx->ngg_culling = ngg_culling; - sctx->do_update_shaders = true; - } - } else if (sctx->ngg_culling) { - sctx->ngg_culling = false; - sctx->do_update_shaders = true; - } - - if (sctx->do_update_shaders && !si_update_shaders(sctx)) - goto return_cleanup; - - si_need_gfx_cs_space(sctx); - - if (sctx->bo_list_add_all_gfx_resources) - si_gfx_resources_add_all_to_bo_list(sctx); - - /* Since we've called si_context_add_resource_size for vertex buffers, - * this must be called after si_need_cs_space, because we must let - * need_cs_space flush before we add buffers to the buffer list. - */ - if (!si_upload_vertex_buffer_descriptors(sctx)) - goto return_cleanup; - - /* Vega10/Raven scissor bug workaround. When any context register is - * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR - * registers must be written too. 
- */ - unsigned masked_atoms = 0; - - if (sctx->screen->info.has_gfx9_scissor_bug) { - masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors); - - if (info->count_from_stream_output || - sctx->dirty_atoms & si_atoms_that_always_roll_context() || - sctx->dirty_states & si_states_that_always_roll_context()) - sctx->context_roll = true; - } - - /* Use optimal packet order based on whether we need to sync the pipeline. */ - if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_FLUSH_AND_INV_DB | - SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH))) { - /* If we have to wait for idle, set all states first, so that all - * SET packets are processed in parallel with previous draw calls. - * Then draw and prefetch at the end. This ensures that the time - * the CUs are idle is very short. - */ - if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND)) - masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond); - - if (!si_upload_graphics_shader_descriptors(sctx)) - goto return_cleanup; - - /* Emit all states except possibly render condition. */ - si_emit_all_states(sctx, info, prim, instance_count, - primitive_restart, masked_atoms); - sctx->emit_cache_flush(sctx); - /* <-- CUs are idle here. */ - - if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) - sctx->atoms.s.render_cond.emit(sctx); - - if (sctx->screen->info.has_gfx9_scissor_bug && - (sctx->context_roll || - si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) - sctx->atoms.s.scissors.emit(sctx); - - sctx->dirty_atoms = 0; - - si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, - instance_count, dispatch_prim_discard_cs, - original_index_size); - /* <-- CUs are busy here. */ - - /* Start prefetches after the draw has been started. Both will run - * in parallel, but starting the draw first is more important. - */ - if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask) - cik_emit_prefetch_L2(sctx, false); - } else { - /* If we don't wait for idle, start prefetches first, then set - * states, and draw at the end. - */ - if (sctx->flags) - sctx->emit_cache_flush(sctx); - - /* Only prefetch the API VS and VBO descriptors. */ - if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask) - cik_emit_prefetch_L2(sctx, true); - - if (!si_upload_graphics_shader_descriptors(sctx)) - goto return_cleanup; - - si_emit_all_states(sctx, info, prim, instance_count, - primitive_restart, masked_atoms); - - if (sctx->screen->info.has_gfx9_scissor_bug && - (sctx->context_roll || - si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) - sctx->atoms.s.scissors.emit(sctx); - - sctx->dirty_atoms = 0; - - si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, - instance_count, dispatch_prim_discard_cs, - original_index_size); - - /* Prefetch the remaining shaders after the draw has been - * started. */ - if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask) - cik_emit_prefetch_L2(sctx, false); - } - - /* Mark the displayable dcc buffer as dirty in order to update - * it on the next call to si_flush_resource. */ - if (sctx->screen->info.use_display_dcc_with_retile_blit) { - /* Don't use si_update_fb_dirtiness_after_rendering because it'll - * cause unnecessary texture decompressions on each draw. 
*/ - unsigned displayable_dcc_cb_mask = sctx->framebuffer.displayable_dcc_cb_mask; - while (displayable_dcc_cb_mask) { - unsigned i = u_bit_scan(&displayable_dcc_cb_mask); - struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i]; - struct si_texture *tex = (struct si_texture*) surf->texture; - tex->displayable_dcc_dirty = true; - } - } - - /* Clear the context roll flag after the draw call. */ - sctx->context_roll = false; - - if (unlikely(sctx->current_saved_cs)) { - si_trace_emit(sctx); - si_log_draw_state(sctx, sctx->log); - } - - /* Workaround for a VGT hang when streamout is enabled. - * It must be done after drawing. */ - if ((sctx->family == CHIP_HAWAII || - sctx->family == CHIP_TONGA || - sctx->family == CHIP_FIJI) && - si_get_strmout_en(sctx)) { - sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC; - } - - if (unlikely(sctx->decompression_enabled)) { - sctx->num_decompress_calls++; - } else { - sctx->num_draw_calls++; - if (sctx->framebuffer.state.nr_cbufs > 1) - sctx->num_mrt_draw_calls++; - if (primitive_restart) - sctx->num_prim_restart_calls++; - if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size)) - sctx->num_spill_draw_calls++; - } + /* Check that all buffers are used for read only, because compute + * dispatches can run ahead. */ + (si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) || + pd_msg("write reference"))) { + switch (si_prepare_prim_discard_or_split_draw(sctx, info, primitive_restart)) { + case SI_PRIM_DISCARD_ENABLED: + original_index_size = index_size; + prim_discard_cs_instancing = instance_count > 1; + dispatch_prim_discard_cs = true; + + /* The compute shader changes/lowers the following: */ + prim = PIPE_PRIM_TRIANGLES; + index_size = 4; + instance_count = 1; + primitive_restart = false; + sctx->compute_num_verts_rejected -= direct_count; + sctx->compute_num_verts_accepted += direct_count; + break; + case SI_PRIM_DISCARD_DISABLED: + break; + case SI_PRIM_DISCARD_DRAW_SPLIT: + sctx->compute_num_verts_rejected -= direct_count; + goto return_cleanup; + } + } + + if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) { + sctx->prim_discard_cs_instancing = prim_discard_cs_instancing; + sctx->do_update_shaders = true; + } + + /* Update NGG culling settings. */ + if (sctx->ngg && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES && + (sctx->screen->always_use_ngg_culling || + /* At least 1024 non-indexed vertices (8 subgroups) are needed + * per draw call (no TES/GS) to enable NGG culling. + */ + (!index_size && direct_count >= 1024 && + (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) && + !sctx->tes_shader.cso && !sctx->gs_shader.cso)) && + si_get_vs(sctx)->cso->ngg_culling_allowed) { + unsigned ngg_culling = 0; + + if (rs->rasterizer_discard) { + ngg_culling |= SI_NGG_CULL_FRONT_FACE | SI_NGG_CULL_BACK_FACE; + } else { + /* Polygon mode can't use view and small primitive culling, + * because it draws points or lines where the culling depends + * on the point or line width. + */ + if (!rs->polygon_mode_enabled) + ngg_culling |= SI_NGG_CULL_VIEW_SMALLPRIMS; + + if (sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front) + ngg_culling |= SI_NGG_CULL_FRONT_FACE; + if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back) + ngg_culling |= SI_NGG_CULL_BACK_FACE; + } + + /* Use NGG fast launch for certain non-indexed primitive types. + * A draw must have at least 1 full primitive. 
+ */ + if (ngg_culling && !index_size && direct_count >= 3 && !sctx->tes_shader.cso && + !sctx->gs_shader.cso) { + if (prim == PIPE_PRIM_TRIANGLES) + ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST; + else if (prim == PIPE_PRIM_TRIANGLE_STRIP) + ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP; + } + + if (ngg_culling != sctx->ngg_culling) { + /* Insert a VGT_FLUSH when enabling fast launch changes to prevent hangs. + * See issues #2418, #2426, #2434 + */ + if (ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) + sctx->flags |= SI_CONTEXT_VGT_FLUSH; + sctx->ngg_culling = ngg_culling; + sctx->do_update_shaders = true; + } + } else if (sctx->ngg_culling) { + sctx->ngg_culling = false; + sctx->do_update_shaders = true; + } + + if (sctx->do_update_shaders && !si_update_shaders(sctx)) + goto return_cleanup; + + si_need_gfx_cs_space(sctx); + + if (sctx->bo_list_add_all_gfx_resources) + si_gfx_resources_add_all_to_bo_list(sctx); + + /* Since we've called si_context_add_resource_size for vertex buffers, + * this must be called after si_need_cs_space, because we must let + * need_cs_space flush before we add buffers to the buffer list. + */ + if (!si_upload_vertex_buffer_descriptors(sctx)) + goto return_cleanup; + + /* Vega10/Raven scissor bug workaround. When any context register is + * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR + * registers must be written too. + */ + unsigned masked_atoms = 0; + + if (sctx->screen->info.has_gfx9_scissor_bug) { + masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors); + + if (info->count_from_stream_output || + sctx->dirty_atoms & si_atoms_that_always_roll_context() || + sctx->dirty_states & si_states_that_always_roll_context()) + sctx->context_roll = true; + } + + /* Use optimal packet order based on whether we need to sync the pipeline. */ + if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB | + SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH))) { + /* If we have to wait for idle, set all states first, so that all + * SET packets are processed in parallel with previous draw calls. + * Then draw and prefetch at the end. This ensures that the time + * the CUs are idle is very short. + */ + if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND)) + masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond); + + if (!si_upload_graphics_shader_descriptors(sctx)) + goto return_cleanup; + + /* Emit all states except possibly render condition. */ + si_emit_all_states(sctx, info, prim, instance_count, primitive_restart, masked_atoms); + sctx->emit_cache_flush(sctx); + /* <-- CUs are idle here. */ + + if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) + sctx->atoms.s.render_cond.emit(sctx); + + if (sctx->screen->info.has_gfx9_scissor_bug && + (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) + sctx->atoms.s.scissors.emit(sctx); + + sctx->dirty_atoms = 0; + + si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, instance_count, + dispatch_prim_discard_cs, original_index_size); + /* <-- CUs are busy here. */ + + /* Start prefetches after the draw has been started. Both will run + * in parallel, but starting the draw first is more important. + */ + if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx, false); + } else { + /* If we don't wait for idle, start prefetches first, then set + * states, and draw at the end. 
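As a side note on the packet-order optimization described in the comments above, the two orderings can be condensed as below. The helpers (emit_states, emit_cache_flush, emit_draw, prefetch_l2) are illustrative stand-ins for si_emit_all_states, sctx->emit_cache_flush, si_emit_draw_packets and cik_emit_prefetch_L2, not real driver entry points.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins only: they print what the real functions would emit. */
static void emit_states(void)         { puts("SET_* state packets"); }
static void emit_cache_flush(void)    { puts("cache flush / wait for idle"); }
static void emit_draw(void)           { puts("DRAW_* packet"); }
static void prefetch_l2(bool vs_only) { puts(vs_only ? "prefetch VS + VBO descriptors"
                                                     : "prefetch remaining shaders"); }

static void submit_draw(bool must_wait_for_idle)
{
   if (must_wait_for_idle) {
      /* States first so their SET packets overlap the previous draw,
       * then flush, draw, and prefetch: keeps the CU-idle window short. */
      emit_states();
      emit_cache_flush();  /* <-- CUs are idle here */
      emit_draw();         /* <-- CUs are busy again */
      prefetch_l2(false);
   } else {
      /* No sync needed: prefetch only the API VS and VBO descriptors first,
       * then set states, draw, and prefetch the rest after the draw started. */
      prefetch_l2(true);
      emit_states();
      emit_draw();
      prefetch_l2(false);
   }
}

int main(void)
{
   submit_draw(true);
   submit_draw(false);
   return 0;
}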
+ */ + if (sctx->flags) + sctx->emit_cache_flush(sctx); + + /* Only prefetch the API VS and VBO descriptors. */ + if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx, true); + + if (!si_upload_graphics_shader_descriptors(sctx)) + goto return_cleanup; + + si_emit_all_states(sctx, info, prim, instance_count, primitive_restart, masked_atoms); + + if (sctx->screen->info.has_gfx9_scissor_bug && + (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) + sctx->atoms.s.scissors.emit(sctx); + + sctx->dirty_atoms = 0; + + si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, instance_count, + dispatch_prim_discard_cs, original_index_size); + + /* Prefetch the remaining shaders after the draw has been + * started. */ + if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx, false); + } + + /* Mark the displayable dcc buffer as dirty in order to update + * it on the next call to si_flush_resource. */ + if (sctx->screen->info.use_display_dcc_with_retile_blit) { + /* Don't use si_update_fb_dirtiness_after_rendering because it'll + * cause unnecessary texture decompressions on each draw. */ + unsigned displayable_dcc_cb_mask = sctx->framebuffer.displayable_dcc_cb_mask; + while (displayable_dcc_cb_mask) { + unsigned i = u_bit_scan(&displayable_dcc_cb_mask); + struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i]; + struct si_texture *tex = (struct si_texture *)surf->texture; + tex->displayable_dcc_dirty = true; + } + } + + /* Clear the context roll flag after the draw call. */ + sctx->context_roll = false; + + if (unlikely(sctx->current_saved_cs)) { + si_trace_emit(sctx); + si_log_draw_state(sctx, sctx->log); + } + + /* Workaround for a VGT hang when streamout is enabled. + * It must be done after drawing. */ + if ((sctx->family == CHIP_HAWAII || sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI) && + si_get_strmout_en(sctx)) { + sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC; + } + + if (unlikely(sctx->decompression_enabled)) { + sctx->num_decompress_calls++; + } else { + sctx->num_draw_calls++; + if (sctx->framebuffer.state.nr_cbufs > 1) + sctx->num_mrt_draw_calls++; + if (primitive_restart) + sctx->num_prim_restart_calls++; + if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size)) + sctx->num_spill_draw_calls++; + } return_cleanup: - if (index_size && indexbuf != info->index.resource) - pipe_resource_reference(&indexbuf, NULL); + if (index_size && indexbuf != info->index.resource) + pipe_resource_reference(&indexbuf, NULL); } -static void -si_draw_rectangle(struct blitter_context *blitter, - void *vertex_elements_cso, - blitter_get_vs_func get_vs, - int x1, int y1, int x2, int y2, - float depth, unsigned num_instances, - enum blitter_attrib_type type, - const union blitter_attrib *attrib) +static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elements_cso, + blitter_get_vs_func get_vs, int x1, int y1, int x2, int y2, + float depth, unsigned num_instances, enum blitter_attrib_type type, + const union blitter_attrib *attrib) { - struct pipe_context *pipe = util_blitter_get_pipe(blitter); - struct si_context *sctx = (struct si_context*)pipe; - - /* Pack position coordinates as signed int16. 
*/ - sctx->vs_blit_sh_data[0] = (uint32_t)(x1 & 0xffff) | - ((uint32_t)(y1 & 0xffff) << 16); - sctx->vs_blit_sh_data[1] = (uint32_t)(x2 & 0xffff) | - ((uint32_t)(y2 & 0xffff) << 16); - sctx->vs_blit_sh_data[2] = fui(depth); - - switch (type) { - case UTIL_BLITTER_ATTRIB_COLOR: - memcpy(&sctx->vs_blit_sh_data[3], attrib->color, - sizeof(float)*4); - break; - case UTIL_BLITTER_ATTRIB_TEXCOORD_XY: - case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW: - memcpy(&sctx->vs_blit_sh_data[3], &attrib->texcoord, - sizeof(attrib->texcoord)); - break; - case UTIL_BLITTER_ATTRIB_NONE:; - } - - pipe->bind_vs_state(pipe, si_get_blitter_vs(sctx, type, num_instances)); - - struct pipe_draw_info info = {}; - info.mode = SI_PRIM_RECTANGLE_LIST; - info.count = 3; - info.instance_count = num_instances; - - /* Don't set per-stage shader pointers for VS. */ - sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX); - sctx->vertex_buffer_pointer_dirty = false; - sctx->vertex_buffer_user_sgprs_dirty = false; - - si_draw_vbo(pipe, &info); + struct pipe_context *pipe = util_blitter_get_pipe(blitter); + struct si_context *sctx = (struct si_context *)pipe; + + /* Pack position coordinates as signed int16. */ + sctx->vs_blit_sh_data[0] = (uint32_t)(x1 & 0xffff) | ((uint32_t)(y1 & 0xffff) << 16); + sctx->vs_blit_sh_data[1] = (uint32_t)(x2 & 0xffff) | ((uint32_t)(y2 & 0xffff) << 16); + sctx->vs_blit_sh_data[2] = fui(depth); + + switch (type) { + case UTIL_BLITTER_ATTRIB_COLOR: + memcpy(&sctx->vs_blit_sh_data[3], attrib->color, sizeof(float) * 4); + break; + case UTIL_BLITTER_ATTRIB_TEXCOORD_XY: + case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW: + memcpy(&sctx->vs_blit_sh_data[3], &attrib->texcoord, sizeof(attrib->texcoord)); + break; + case UTIL_BLITTER_ATTRIB_NONE:; + } + + pipe->bind_vs_state(pipe, si_get_blitter_vs(sctx, type, num_instances)); + + struct pipe_draw_info info = {}; + info.mode = SI_PRIM_RECTANGLE_LIST; + info.count = 3; + info.instance_count = num_instances; + + /* Don't set per-stage shader pointers for VS. 
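For reference, the coordinate packing used by si_draw_rectangle above fits each signed coordinate into 16 bits of a user-data dword (x in the low half, y in the high half). A minimal standalone check; the pack_xy helper and the test values are made up for illustration:

#include <assert.h>
#include <stdint.h>

/* Same packing as vs_blit_sh_data[0]/[1] above. */
static uint32_t pack_xy(int x, int y)
{
   return (uint32_t)(x & 0xffff) | ((uint32_t)(y & 0xffff) << 16);
}

int main(void)
{
   /* x = -1 becomes 0xffff in the low half, y = 100 becomes 0x0064 in the high half. */
   assert(pack_xy(-1, 100) == 0x0064ffffu);
   /* A corner like (2048, 2048) still fits comfortably in int16. */
   assert(pack_xy(2048, 2048) == 0x08000800u);
   return 0;
}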
*/ + sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX); + sctx->vertex_buffer_pointer_dirty = false; + sctx->vertex_buffer_user_sgprs_dirty = false; + + si_draw_vbo(pipe, &info); } void si_trace_emit(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - uint32_t trace_id = ++sctx->current_saved_cs->trace_id; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + uint32_t trace_id = ++sctx->current_saved_cs->trace_id; - si_cp_write_data(sctx, sctx->current_saved_cs->trace_buf, - 0, 4, V_370_MEM, V_370_ME, &trace_id); + si_cp_write_data(sctx, sctx->current_saved_cs->trace_buf, 0, 4, V_370_MEM, V_370_ME, &trace_id); - radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id)); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id)); - if (sctx->log) - u_log_flush(sctx->log); + if (sctx->log) + u_log_flush(sctx->log); } void si_init_draw_functions(struct si_context *sctx) { - sctx->b.draw_vbo = si_draw_vbo; + sctx->b.draw_vbo = si_draw_vbo; - sctx->blitter->draw_rectangle = si_draw_rectangle; + sctx->blitter->draw_rectangle = si_draw_rectangle; - si_init_ia_multi_vgt_param_table(sctx); + si_init_ia_multi_vgt_param_table(sctx); } diff --git a/src/gallium/drivers/radeonsi/si_state_msaa.c b/src/gallium/drivers/radeonsi/si_state_msaa.c index 0fa38918b20..9ebb1e5dcb4 100644 --- a/src/gallium/drivers/radeonsi/si_state_msaa.c +++ b/src/gallium/drivers/radeonsi/si_state_msaa.c @@ -25,17 +25,16 @@ #include "si_build_pm4.h" /* For MSAA sample positions. */ -#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \ - ((((unsigned)(s0x) & 0xf) << 0) | (((unsigned)(s0y) & 0xf) << 4) | \ - (((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) | \ - (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \ - (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28)) +#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \ + ((((unsigned)(s0x)&0xf) << 0) | (((unsigned)(s0y)&0xf) << 4) | (((unsigned)(s1x)&0xf) << 8) | \ + (((unsigned)(s1y)&0xf) << 12) | (((unsigned)(s2x)&0xf) << 16) | \ + (((unsigned)(s2y)&0xf) << 20) | (((unsigned)(s3x)&0xf) << 24) | (((unsigned)(s3y)&0xf) << 28)) /* For obtaining location coordinates from registers */ -#define SEXT4(x) ((int)((x) | ((x) & 0x8 ? 0xfffffff0 : 0))) -#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index) * 4)) & 0xf) -#define GET_SX(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2) -#define GET_SY(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1) +#define SEXT4(x) ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0))) +#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf) +#define GET_SX(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2) +#define GET_SY(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1) /* The following sample ordering is required by EQAA. 
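A small standalone check of the sample-location encoding above: the FILL_SREG/SEXT4/GET_SX/GET_SY macros are copied from this hunk, the 2x positions reused here are the ones from sample_locs_2x, and the (x + 8) / 16 mapping matches si_get_sample_position. The main() harness itself is only for illustration.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Each sample is a pair of signed 4-bit offsets in 1/16-pixel units,
 * packed four samples per dword. */
#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)                                   \
   ((((unsigned)(s0x)&0xf) << 0) | (((unsigned)(s0y)&0xf) << 4) | (((unsigned)(s1x)&0xf) << 8) | \
    (((unsigned)(s1y)&0xf) << 12) | (((unsigned)(s2x)&0xf) << 16) |                         \
    (((unsigned)(s2y)&0xf) << 20) | (((unsigned)(s3x)&0xf) << 24) | (((unsigned)(s3y)&0xf) << 28))
#define SEXT4(x)               ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0)))
#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf)
#define GET_SX(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
#define GET_SY(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)

int main(void)
{
   /* 2x MSAA positions: (-4,-4) and (4,4) in 1/16-pixel units. */
   const uint32_t locs[1] = {FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0)};

   assert(GET_SX(locs, 0) == -4 && GET_SY(locs, 0) == -4);
   assert(GET_SX(locs, 1) == 4 && GET_SY(locs, 1) == 4);

   /* The driver maps the offsets into [0,1) within the pixel:
    * sample 0 -> (0.25, 0.25), sample 1 -> (0.75, 0.75). */
   printf("sample 0: %.2f %.2f\n", (GET_SX(locs, 0) + 8) / 16.0f, (GET_SY(locs, 0) + 8) / 16.0f);
   printf("sample 1: %.2f %.2f\n", (GET_SX(locs, 1) + 8) / 16.0f, (GET_SY(locs, 1) + 8) / 16.0f);
   return 0;
}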
* @@ -88,132 +87,128 @@ /* 1x MSAA */ static const uint32_t sample_locs_1x = - FILL_SREG( 0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */ + FILL_SREG(0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */ static const uint64_t centroid_priority_1x = 0x0000000000000000ull; /* 2x MSAA (the positions are sorted for EQAA) */ static const uint32_t sample_locs_2x = - FILL_SREG(-4,-4, 4, 4, 0, 0, 0, 0); /* S2 & S3 fields are not used by 2x MSAA */ + FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0); /* S2 & S3 fields are not used by 2x MSAA */ static const uint64_t centroid_priority_2x = 0x1010101010101010ull; /* 4x MSAA (the positions are sorted for EQAA) */ -static const uint32_t sample_locs_4x = - FILL_SREG(-2,-6, 2, 6, -6, 2, 6,-2); +static const uint32_t sample_locs_4x = FILL_SREG(-2, -6, 2, 6, -6, 2, 6, -2); static const uint64_t centroid_priority_4x = 0x3210321032103210ull; /* 8x MSAA (the positions are sorted for EQAA) */ static const uint32_t sample_locs_8x[] = { - FILL_SREG(-3,-5, 5, 1, -1, 3, 7,-7), - FILL_SREG(-7,-1, 3, 7, -5, 5, 1,-3), - /* The following are unused by hardware, but we emit them to IBs - * instead of multiple SET_CONTEXT_REG packets. */ - 0, - 0, + FILL_SREG(-3, -5, 5, 1, -1, 3, 7, -7), + FILL_SREG(-7, -1, 3, 7, -5, 5, 1, -3), + /* The following are unused by hardware, but we emit them to IBs + * instead of multiple SET_CONTEXT_REG packets. */ + 0, + 0, }; static const uint64_t centroid_priority_8x = 0x3546012735460127ull; /* 16x MSAA (the positions are sorted for EQAA) */ static const uint32_t sample_locs_16x[] = { - FILL_SREG(-5,-2, 5, 3, -2, 6, 3,-5), - FILL_SREG(-4,-6, 1, 1, -6, 4, 7,-4), - FILL_SREG(-1,-3, 6, 7, -3, 2, 0,-7), - FILL_SREG(-7,-8, 2, 5, -8, 0, 4,-1), + FILL_SREG(-5, -2, 5, 3, -2, 6, 3, -5), + FILL_SREG(-4, -6, 1, 1, -6, 4, 7, -4), + FILL_SREG(-1, -3, 6, 7, -3, 2, 0, -7), + FILL_SREG(-7, -8, 2, 5, -8, 0, 4, -1), }; static const uint64_t centroid_priority_16x = 0xc97e64b231d0fa85ull; static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_count, - unsigned sample_index, float *out_value) + unsigned sample_index, float *out_value) { - const uint32_t *sample_locs; - - switch (sample_count) { - case 1: - default: - sample_locs = &sample_locs_1x; - break; - case 2: - sample_locs = &sample_locs_2x; - break; - case 4: - sample_locs = &sample_locs_4x; - break; - case 8: - sample_locs = sample_locs_8x; - break; - case 16: - sample_locs = sample_locs_16x; - break; - } - - out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f; - out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f; + const uint32_t *sample_locs; + + switch (sample_count) { + case 1: + default: + sample_locs = &sample_locs_1x; + break; + case 2: + sample_locs = &sample_locs_2x; + break; + case 4: + sample_locs = &sample_locs_4x; + break; + case 8: + sample_locs = sample_locs_8x; + break; + case 16: + sample_locs = sample_locs_16x; + break; + } + + out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f; + out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f; } -static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs, - uint64_t centroid_priority, - uint32_t sample_locs) +static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority, + uint32_t sample_locs) { - radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); - radeon_emit(cs, centroid_priority); - radeon_emit(cs, centroid_priority >> 32); - radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 
sample_locs); - radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs); - radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs); - radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs); + radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); + radeon_emit(cs, centroid_priority); + radeon_emit(cs, centroid_priority >> 32); + radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs); + radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs); + radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs); + radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs); } -static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs, - uint64_t centroid_priority, - const uint32_t *sample_locs, - unsigned num_samples) +static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority, + const uint32_t *sample_locs, unsigned num_samples) { - radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); - radeon_emit(cs, centroid_priority); - radeon_emit(cs, centroid_priority >> 32); - radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, - num_samples == 8 ? 14 : 16); - radeon_emit_array(cs, sample_locs, 4); - radeon_emit_array(cs, sample_locs, 4); - radeon_emit_array(cs, sample_locs, 4); - radeon_emit_array(cs, sample_locs, num_samples == 8 ? 2 : 4); + radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); + radeon_emit(cs, centroid_priority); + radeon_emit(cs, centroid_priority >> 32); + radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, + num_samples == 8 ? 14 : 16); + radeon_emit_array(cs, sample_locs, 4); + radeon_emit_array(cs, sample_locs, 4); + radeon_emit_array(cs, sample_locs, 4); + radeon_emit_array(cs, sample_locs, num_samples == 8 ? 
2 : 4); } void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples) { - switch (nr_samples) { - default: - case 1: - si_emit_max_4_sample_locs(cs, centroid_priority_1x, sample_locs_1x); - break; - case 2: - si_emit_max_4_sample_locs(cs, centroid_priority_2x, sample_locs_2x); - break; - case 4: - si_emit_max_4_sample_locs(cs, centroid_priority_4x, sample_locs_4x); - break; - case 8: - si_emit_max_16_sample_locs(cs, centroid_priority_8x, sample_locs_8x, 8); - break; - case 16: - si_emit_max_16_sample_locs(cs, centroid_priority_16x, sample_locs_16x, 16); - break; - } + switch (nr_samples) { + default: + case 1: + si_emit_max_4_sample_locs(cs, centroid_priority_1x, sample_locs_1x); + break; + case 2: + si_emit_max_4_sample_locs(cs, centroid_priority_2x, sample_locs_2x); + break; + case 4: + si_emit_max_4_sample_locs(cs, centroid_priority_4x, sample_locs_4x); + break; + case 8: + si_emit_max_16_sample_locs(cs, centroid_priority_8x, sample_locs_8x, 8); + break; + case 16: + si_emit_max_16_sample_locs(cs, centroid_priority_16x, sample_locs_16x, 16); + break; + } } void si_init_msaa_functions(struct si_context *sctx) { - int i; + int i; - sctx->b.get_sample_position = si_get_sample_position; + sctx->b.get_sample_position = si_get_sample_position; - si_get_sample_position(&sctx->b, 1, 0, sctx->sample_positions.x1[0]); + si_get_sample_position(&sctx->b, 1, 0, sctx->sample_positions.x1[0]); - for (i = 0; i < 2; i++) - si_get_sample_position(&sctx->b, 2, i, sctx->sample_positions.x2[i]); - for (i = 0; i < 4; i++) - si_get_sample_position(&sctx->b, 4, i, sctx->sample_positions.x4[i]); - for (i = 0; i < 8; i++) - si_get_sample_position(&sctx->b, 8, i, sctx->sample_positions.x8[i]); - for (i = 0; i < 16; i++) - si_get_sample_position(&sctx->b, 16, i, sctx->sample_positions.x16[i]); + for (i = 0; i < 2; i++) + si_get_sample_position(&sctx->b, 2, i, sctx->sample_positions.x2[i]); + for (i = 0; i < 4; i++) + si_get_sample_position(&sctx->b, 4, i, sctx->sample_positions.x4[i]); + for (i = 0; i < 8; i++) + si_get_sample_position(&sctx->b, 8, i, sctx->sample_positions.x8[i]); + for (i = 0; i < 16; i++) + si_get_sample_position(&sctx->b, 16, i, sctx->sample_positions.x16[i]); } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index be7cda1d332..d322cd1f341 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -22,96 +22,91 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_build_pm4.h" -#include "sid.h" - +#include "ac_exp_param.h" +#include "ac_shader_util.h" #include "compiler/nir/nir_serialize.h" #include "nir/tgsi_to_nir.h" -#include "util/hash_table.h" +#include "si_build_pm4.h" +#include "sid.h" #include "util/crc32.h" +#include "util/disk_cache.h" +#include "util/hash_table.h" +#include "util/mesa-sha1.h" #include "util/u_async_debug.h" #include "util/u_memory.h" #include "util/u_prim.h" -#include "util/disk_cache.h" -#include "util/mesa-sha1.h" -#include "ac_exp_param.h" -#include "ac_shader_util.h" - /* SHADER_CACHE */ /** * Return the IR key for the shader cache. 
*/ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, - unsigned char ir_sha1_cache_key[20]) -{ - struct blob blob = {}; - unsigned ir_size; - void *ir_binary; - - if (sel->nir_binary) { - ir_binary = sel->nir_binary; - ir_size = sel->nir_size; - } else { - assert(sel->nir); - - blob_init(&blob); - nir_serialize(&blob, sel->nir, true); - ir_binary = blob.data; - ir_size = blob.size; - } - - /* These settings affect the compilation, but they are not derived - * from the input shader IR. - */ - unsigned shader_variant_flags = 0; - - if (ngg) - shader_variant_flags |= 1 << 0; - if (sel->nir) - shader_variant_flags |= 1 << 1; - if (si_get_wave_size(sel->screen, sel->type, ngg, es, false) == 32) - shader_variant_flags |= 1 << 2; - if (sel->type == PIPE_SHADER_FRAGMENT && - sel->info.uses_derivatives && - sel->info.uses_kill && - sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) - shader_variant_flags |= 1 << 3; - - /* This varies depending on whether compute-based culling is enabled. */ - shader_variant_flags |= sel->screen->num_vbos_in_user_sgprs << 4; - - struct mesa_sha1 ctx; - _mesa_sha1_init(&ctx); - _mesa_sha1_update(&ctx, &shader_variant_flags, 4); - _mesa_sha1_update(&ctx, ir_binary, ir_size); - if (sel->type == PIPE_SHADER_VERTEX || - sel->type == PIPE_SHADER_TESS_EVAL || - sel->type == PIPE_SHADER_GEOMETRY) - _mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so)); - _mesa_sha1_final(&ctx, ir_sha1_cache_key); - - if (ir_binary == blob.data) - blob_finish(&blob); + unsigned char ir_sha1_cache_key[20]) +{ + struct blob blob = {}; + unsigned ir_size; + void *ir_binary; + + if (sel->nir_binary) { + ir_binary = sel->nir_binary; + ir_size = sel->nir_size; + } else { + assert(sel->nir); + + blob_init(&blob); + nir_serialize(&blob, sel->nir, true); + ir_binary = blob.data; + ir_size = blob.size; + } + + /* These settings affect the compilation, but they are not derived + * from the input shader IR. + */ + unsigned shader_variant_flags = 0; + + if (ngg) + shader_variant_flags |= 1 << 0; + if (sel->nir) + shader_variant_flags |= 1 << 1; + if (si_get_wave_size(sel->screen, sel->type, ngg, es, false) == 32) + shader_variant_flags |= 1 << 2; + if (sel->type == PIPE_SHADER_FRAGMENT && sel->info.uses_derivatives && sel->info.uses_kill && + sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) + shader_variant_flags |= 1 << 3; + + /* This varies depending on whether compute-based culling is enabled. */ + shader_variant_flags |= sel->screen->num_vbos_in_user_sgprs << 4; + + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + _mesa_sha1_update(&ctx, &shader_variant_flags, 4); + _mesa_sha1_update(&ctx, ir_binary, ir_size); + if (sel->type == PIPE_SHADER_VERTEX || sel->type == PIPE_SHADER_TESS_EVAL || + sel->type == PIPE_SHADER_GEOMETRY) + _mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so)); + _mesa_sha1_final(&ctx, ir_sha1_cache_key); + + if (ir_binary == blob.data) + blob_finish(&blob); } /** Copy "data" to "ptr" and return the next dword following copied data. */ static uint32_t *write_data(uint32_t *ptr, const void *data, unsigned size) { - /* data may be NULL if size == 0 */ - if (size) - memcpy(ptr, data, size); - ptr += DIV_ROUND_UP(size, 4); - return ptr; + /* data may be NULL if size == 0 */ + if (size) + memcpy(ptr, data, size); + ptr += DIV_ROUND_UP(size, 4); + return ptr; } /** Read data from "ptr". Return the next dword following the data. 
*/ static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size) { - memcpy(data, ptr, size); - ptr += DIV_ROUND_UP(size, 4); - return ptr; + memcpy(data, ptr, size); + ptr += DIV_ROUND_UP(size, 4); + return ptr; } /** @@ -120,8 +115,8 @@ static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size) */ static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size) { - *ptr++ = size; - return write_data(ptr, data, size); + *ptr++ = size; + return write_data(ptr, data, size); } /** @@ -130,12 +125,12 @@ static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size) */ static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size) { - *size = *ptr++; - assert(*data == NULL); - if (!*size) - return ptr; - *data = malloc(*size); - return read_data(ptr, *data, *size); + *size = *ptr++; + assert(*data == NULL); + if (!*size) + return ptr; + *data = malloc(*size); + return read_data(ptr, *data, *size); } /** @@ -144,258 +139,236 @@ static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size) */ static void *si_get_shader_binary(struct si_shader *shader) { - /* There is always a size of data followed by the data itself. */ - unsigned llvm_ir_size = shader->binary.llvm_ir_string ? - strlen(shader->binary.llvm_ir_string) + 1 : 0; - - /* Refuse to allocate overly large buffers and guard against integer - * overflow. */ - if (shader->binary.elf_size > UINT_MAX / 4 || - llvm_ir_size > UINT_MAX / 4) - return NULL; - - unsigned size = - 4 + /* total size */ - 4 + /* CRC32 of the data below */ - align(sizeof(shader->config), 4) + - align(sizeof(shader->info), 4) + - 4 + align(shader->binary.elf_size, 4) + - 4 + align(llvm_ir_size, 4); - void *buffer = CALLOC(1, size); - uint32_t *ptr = (uint32_t*)buffer; - - if (!buffer) - return NULL; - - *ptr++ = size; - ptr++; /* CRC32 is calculated at the end. */ - - ptr = write_data(ptr, &shader->config, sizeof(shader->config)); - ptr = write_data(ptr, &shader->info, sizeof(shader->info)); - ptr = write_chunk(ptr, shader->binary.elf_buffer, shader->binary.elf_size); - ptr = write_chunk(ptr, shader->binary.llvm_ir_string, llvm_ir_size); - assert((char *)ptr - (char *)buffer == size); - - /* Compute CRC32. */ - ptr = (uint32_t*)buffer; - ptr++; - *ptr = util_hash_crc32(ptr + 1, size - 8); - - return buffer; + /* There is always a size of data followed by the data itself. */ + unsigned llvm_ir_size = + shader->binary.llvm_ir_string ? strlen(shader->binary.llvm_ir_string) + 1 : 0; + + /* Refuse to allocate overly large buffers and guard against integer + * overflow. */ + if (shader->binary.elf_size > UINT_MAX / 4 || llvm_ir_size > UINT_MAX / 4) + return NULL; + + unsigned size = 4 + /* total size */ + 4 + /* CRC32 of the data below */ + align(sizeof(shader->config), 4) + align(sizeof(shader->info), 4) + 4 + + align(shader->binary.elf_size, 4) + 4 + align(llvm_ir_size, 4); + void *buffer = CALLOC(1, size); + uint32_t *ptr = (uint32_t *)buffer; + + if (!buffer) + return NULL; + + *ptr++ = size; + ptr++; /* CRC32 is calculated at the end. */ + + ptr = write_data(ptr, &shader->config, sizeof(shader->config)); + ptr = write_data(ptr, &shader->info, sizeof(shader->info)); + ptr = write_chunk(ptr, shader->binary.elf_buffer, shader->binary.elf_size); + ptr = write_chunk(ptr, shader->binary.llvm_ir_string, llvm_ir_size); + assert((char *)ptr - (char *)buffer == size); + + /* Compute CRC32. 
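To summarize the cache-blob layout built by si_get_shader_binary above: the field sizes in this sketch are hypothetical, only the dword offsets and the CRC-covered region mirror the code.

#include <assert.h>

/* Dword-level layout of the blob:
 *
 *   [0] total size in bytes
 *   [1] CRC32 of everything that follows (i.e. of size - 8 bytes)
 *   [.] shader->config             (align(sizeof(config), 4) bytes)
 *   [.] shader->info               (align(sizeof(info), 4) bytes)
 *   [.] ELF chunk: 4-byte size, then the ELF image padded to a dword
 *   [.] LLVM IR chunk: 4-byte size, then the string padded to a dword (may be empty)
 */
static unsigned align4(unsigned v)
{
   return (v + 3) & ~3u;
}

int main(void)
{
   const unsigned config_size = 40, info_size = 120; /* hypothetical sizes */
   const unsigned elf_size = 1234, llvm_ir_size = 0;

   unsigned total = 4 + 4 + align4(config_size) + align4(info_size) +
                    4 + align4(elf_size) + 4 + align4(llvm_ir_size);

   /* The CRC skips the size and CRC dwords themselves, which is why
    * si_load_shader_binary hashes size - 8 bytes. */
   assert(total - 8 == align4(config_size) + align4(info_size) +
                       4 + align4(elf_size) + 4 + align4(llvm_ir_size));
   return 0;
}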
*/ + ptr = (uint32_t *)buffer; + ptr++; + *ptr = util_hash_crc32(ptr + 1, size - 8); + + return buffer; } static bool si_load_shader_binary(struct si_shader *shader, void *binary) { - uint32_t *ptr = (uint32_t*)binary; - uint32_t size = *ptr++; - uint32_t crc32 = *ptr++; - unsigned chunk_size; - unsigned elf_size; - - if (util_hash_crc32(ptr, size - 8) != crc32) { - fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n"); - return false; - } - - ptr = read_data(ptr, &shader->config, sizeof(shader->config)); - ptr = read_data(ptr, &shader->info, sizeof(shader->info)); - ptr = read_chunk(ptr, (void**)&shader->binary.elf_buffer, - &elf_size); - shader->binary.elf_size = elf_size; - ptr = read_chunk(ptr, (void**)&shader->binary.llvm_ir_string, &chunk_size); - - return true; + uint32_t *ptr = (uint32_t *)binary; + uint32_t size = *ptr++; + uint32_t crc32 = *ptr++; + unsigned chunk_size; + unsigned elf_size; + + if (util_hash_crc32(ptr, size - 8) != crc32) { + fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n"); + return false; + } + + ptr = read_data(ptr, &shader->config, sizeof(shader->config)); + ptr = read_data(ptr, &shader->info, sizeof(shader->info)); + ptr = read_chunk(ptr, (void **)&shader->binary.elf_buffer, &elf_size); + shader->binary.elf_size = elf_size; + ptr = read_chunk(ptr, (void **)&shader->binary.llvm_ir_string, &chunk_size); + + return true; } /** * Insert a shader into the cache. It's assumed the shader is not in the cache. * Use si_shader_cache_load_shader before calling this. */ -void si_shader_cache_insert_shader(struct si_screen *sscreen, - unsigned char ir_sha1_cache_key[20], - struct si_shader *shader, - bool insert_into_disk_cache) -{ - void *hw_binary; - struct hash_entry *entry; - uint8_t key[CACHE_KEY_SIZE]; - - entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key); - if (entry) - return; /* already added */ - - hw_binary = si_get_shader_binary(shader); - if (!hw_binary) - return; - - if (_mesa_hash_table_insert(sscreen->shader_cache, - mem_dup(ir_sha1_cache_key, 20), - hw_binary) == NULL) { - FREE(hw_binary); - return; - } - - if (sscreen->disk_shader_cache && insert_into_disk_cache) { - disk_cache_compute_key(sscreen->disk_shader_cache, - ir_sha1_cache_key, 20, key); - disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, - *((uint32_t *) hw_binary), NULL); - } -} - -bool si_shader_cache_load_shader(struct si_screen *sscreen, - unsigned char ir_sha1_cache_key[20], - struct si_shader *shader) -{ - struct hash_entry *entry = - _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key); - - if (entry) { - if (si_load_shader_binary(shader, entry->data)) { - p_atomic_inc(&sscreen->num_memory_shader_cache_hits); - return true; - } - } - p_atomic_inc(&sscreen->num_memory_shader_cache_misses); - - if (!sscreen->disk_shader_cache) - return false; - - unsigned char sha1[CACHE_KEY_SIZE]; - disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, - 20, sha1); - - size_t binary_size; - uint8_t *buffer = disk_cache_get(sscreen->disk_shader_cache, sha1, - &binary_size); - if (buffer) { - if (binary_size >= sizeof(uint32_t) && - *((uint32_t*)buffer) == binary_size) { - if (si_load_shader_binary(shader, buffer)) { - free(buffer); - si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, - shader, false); - p_atomic_inc(&sscreen->num_disk_shader_cache_hits); - return true; - } - } else { - /* Something has gone wrong discard the item from the cache and - * rebuild/link from source. 
- */ - assert(!"Invalid radeonsi shader disk cache item!"); - disk_cache_remove(sscreen->disk_shader_cache, sha1); - } - } - - free(buffer); - p_atomic_inc(&sscreen->num_disk_shader_cache_misses); - return false; +void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], + struct si_shader *shader, bool insert_into_disk_cache) +{ + void *hw_binary; + struct hash_entry *entry; + uint8_t key[CACHE_KEY_SIZE]; + + entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key); + if (entry) + return; /* already added */ + + hw_binary = si_get_shader_binary(shader); + if (!hw_binary) + return; + + if (_mesa_hash_table_insert(sscreen->shader_cache, mem_dup(ir_sha1_cache_key, 20), hw_binary) == + NULL) { + FREE(hw_binary); + return; + } + + if (sscreen->disk_shader_cache && insert_into_disk_cache) { + disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, key); + disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, *((uint32_t *)hw_binary), NULL); + } +} + +bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], + struct si_shader *shader) +{ + struct hash_entry *entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key); + + if (entry) { + if (si_load_shader_binary(shader, entry->data)) { + p_atomic_inc(&sscreen->num_memory_shader_cache_hits); + return true; + } + } + p_atomic_inc(&sscreen->num_memory_shader_cache_misses); + + if (!sscreen->disk_shader_cache) + return false; + + unsigned char sha1[CACHE_KEY_SIZE]; + disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, sha1); + + size_t binary_size; + uint8_t *buffer = disk_cache_get(sscreen->disk_shader_cache, sha1, &binary_size); + if (buffer) { + if (binary_size >= sizeof(uint32_t) && *((uint32_t *)buffer) == binary_size) { + if (si_load_shader_binary(shader, buffer)) { + free(buffer); + si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, false); + p_atomic_inc(&sscreen->num_disk_shader_cache_hits); + return true; + } + } else { + /* Something has gone wrong discard the item from the cache and + * rebuild/link from source. + */ + assert(!"Invalid radeonsi shader disk cache item!"); + disk_cache_remove(sscreen->disk_shader_cache, sha1); + } + } + + free(buffer); + p_atomic_inc(&sscreen->num_disk_shader_cache_misses); + return false; } static uint32_t si_shader_cache_key_hash(const void *key) { - /* Take the first dword of SHA1. */ - return *(uint32_t*)key; + /* Take the first dword of SHA1. */ + return *(uint32_t *)key; } static bool si_shader_cache_key_equals(const void *a, const void *b) { - /* Compare SHA1s. */ - return memcmp(a, b, 20) == 0; + /* Compare SHA1s. 
*/ + return memcmp(a, b, 20) == 0; } static void si_destroy_shader_cache_entry(struct hash_entry *entry) { - FREE((void*)entry->key); - FREE(entry->data); + FREE((void *)entry->key); + FREE(entry->data); } bool si_init_shader_cache(struct si_screen *sscreen) { - (void) simple_mtx_init(&sscreen->shader_cache_mutex, mtx_plain); - sscreen->shader_cache = - _mesa_hash_table_create(NULL, - si_shader_cache_key_hash, - si_shader_cache_key_equals); + (void)simple_mtx_init(&sscreen->shader_cache_mutex, mtx_plain); + sscreen->shader_cache = + _mesa_hash_table_create(NULL, si_shader_cache_key_hash, si_shader_cache_key_equals); - return sscreen->shader_cache != NULL; + return sscreen->shader_cache != NULL; } void si_destroy_shader_cache(struct si_screen *sscreen) { - if (sscreen->shader_cache) - _mesa_hash_table_destroy(sscreen->shader_cache, - si_destroy_shader_cache_entry); - simple_mtx_destroy(&sscreen->shader_cache_mutex); + if (sscreen->shader_cache) + _mesa_hash_table_destroy(sscreen->shader_cache, si_destroy_shader_cache_entry); + simple_mtx_destroy(&sscreen->shader_cache_mutex); } /* SHADER STATES */ -static void si_set_tesseval_regs(struct si_screen *sscreen, - const struct si_shader_selector *tes, - struct si_pm4_state *pm4) -{ - const struct si_shader_info *info = &tes->info; - unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE]; - unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING]; - bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW]; - bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE]; - unsigned type, partitioning, topology, distribution_mode; - - switch (tes_prim_mode) { - case PIPE_PRIM_LINES: - type = V_028B6C_TESS_ISOLINE; - break; - case PIPE_PRIM_TRIANGLES: - type = V_028B6C_TESS_TRIANGLE; - break; - case PIPE_PRIM_QUADS: - type = V_028B6C_TESS_QUAD; - break; - default: - assert(0); - return; - } - - switch (tes_spacing) { - case PIPE_TESS_SPACING_FRACTIONAL_ODD: - partitioning = V_028B6C_PART_FRAC_ODD; - break; - case PIPE_TESS_SPACING_FRACTIONAL_EVEN: - partitioning = V_028B6C_PART_FRAC_EVEN; - break; - case PIPE_TESS_SPACING_EQUAL: - partitioning = V_028B6C_PART_INTEGER; - break; - default: - assert(0); - return; - } - - if (tes_point_mode) - topology = V_028B6C_OUTPUT_POINT; - else if (tes_prim_mode == PIPE_PRIM_LINES) - topology = V_028B6C_OUTPUT_LINE; - else if (tes_vertex_order_cw) - /* for some reason, this must be the other way around */ - topology = V_028B6C_OUTPUT_TRIANGLE_CCW; - else - topology = V_028B6C_OUTPUT_TRIANGLE_CW; - - if (sscreen->info.has_distributed_tess) { - if (sscreen->info.family == CHIP_FIJI || - sscreen->info.family >= CHIP_POLARIS10) - distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS; - else - distribution_mode = V_028B6C_DISTRIBUTION_MODE_DONUTS; - } else - distribution_mode = V_028B6C_DISTRIBUTION_MODE_NO_DIST; - - assert(pm4->shader); - pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) | - S_028B6C_PARTITIONING(partitioning) | - S_028B6C_TOPOLOGY(topology) | - S_028B6C_DISTRIBUTION_MODE(distribution_mode); +static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shader_selector *tes, + struct si_pm4_state *pm4) +{ + const struct si_shader_info *info = &tes->info; + unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE]; + unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING]; + bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW]; + bool tes_point_mode = 
info->properties[TGSI_PROPERTY_TES_POINT_MODE]; + unsigned type, partitioning, topology, distribution_mode; + + switch (tes_prim_mode) { + case PIPE_PRIM_LINES: + type = V_028B6C_TESS_ISOLINE; + break; + case PIPE_PRIM_TRIANGLES: + type = V_028B6C_TESS_TRIANGLE; + break; + case PIPE_PRIM_QUADS: + type = V_028B6C_TESS_QUAD; + break; + default: + assert(0); + return; + } + + switch (tes_spacing) { + case PIPE_TESS_SPACING_FRACTIONAL_ODD: + partitioning = V_028B6C_PART_FRAC_ODD; + break; + case PIPE_TESS_SPACING_FRACTIONAL_EVEN: + partitioning = V_028B6C_PART_FRAC_EVEN; + break; + case PIPE_TESS_SPACING_EQUAL: + partitioning = V_028B6C_PART_INTEGER; + break; + default: + assert(0); + return; + } + + if (tes_point_mode) + topology = V_028B6C_OUTPUT_POINT; + else if (tes_prim_mode == PIPE_PRIM_LINES) + topology = V_028B6C_OUTPUT_LINE; + else if (tes_vertex_order_cw) + /* for some reason, this must be the other way around */ + topology = V_028B6C_OUTPUT_TRIANGLE_CCW; + else + topology = V_028B6C_OUTPUT_TRIANGLE_CW; + + if (sscreen->info.has_distributed_tess) { + if (sscreen->info.family == CHIP_FIJI || sscreen->info.family >= CHIP_POLARIS10) + distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS; + else + distribution_mode = V_028B6C_DISTRIBUTION_MODE_DONUTS; + } else + distribution_mode = V_028B6C_DISTRIBUTION_MODE_NO_DIST; + + assert(pm4->shader); + pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | + S_028B6C_TOPOLOGY(topology) | + S_028B6C_DISTRIBUTION_MODE(distribution_mode); } /* Polaris needs different VTX_REUSE_DEPTH settings depending on @@ -412,722 +385,674 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, * * If "shader" is NULL, it's assumed it's not LS or GS copy shader. */ -static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, - struct si_shader_selector *sel, - struct si_shader *shader, - struct si_pm4_state *pm4) +static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_shader_selector *sel, + struct si_shader *shader, struct si_pm4_state *pm4) { - unsigned type = sel->type; - - if (sscreen->info.family < CHIP_POLARIS10 || - sscreen->info.chip_class >= GFX10) - return; - - /* VS as VS, or VS as ES: */ - if ((type == PIPE_SHADER_VERTEX && - (!shader || - (!shader->key.as_ls && !shader->is_gs_copy_shader))) || - /* TES as VS, or TES as ES: */ - type == PIPE_SHADER_TESS_EVAL) { - unsigned vtx_reuse_depth = 30; - - if (type == PIPE_SHADER_TESS_EVAL && - sel->info.properties[TGSI_PROPERTY_TES_SPACING] == - PIPE_TESS_SPACING_FRACTIONAL_ODD) - vtx_reuse_depth = 14; - - assert(pm4->shader); - pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth; - } + unsigned type = sel->type; + + if (sscreen->info.family < CHIP_POLARIS10 || sscreen->info.chip_class >= GFX10) + return; + + /* VS as VS, or VS as ES: */ + if ((type == PIPE_SHADER_VERTEX && + (!shader || (!shader->key.as_ls && !shader->is_gs_copy_shader))) || + /* TES as VS, or TES as ES: */ + type == PIPE_SHADER_TESS_EVAL) { + unsigned vtx_reuse_depth = 30; + + if (type == PIPE_SHADER_TESS_EVAL && + sel->info.properties[TGSI_PROPERTY_TES_SPACING] == PIPE_TESS_SPACING_FRACTIONAL_ODD) + vtx_reuse_depth = 14; + + assert(pm4->shader); + pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth; + } } static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader) { - if (shader->pm4) - si_pm4_clear_state(shader->pm4); - else - shader->pm4 = CALLOC_STRUCT(si_pm4_state); - - if (shader->pm4) { - shader->pm4->shader = shader; - return 
shader->pm4; - } else { - fprintf(stderr, "radeonsi: Failed to create pm4 state.\n"); - return NULL; - } + if (shader->pm4) + si_pm4_clear_state(shader->pm4); + else + shader->pm4 = CALLOC_STRUCT(si_pm4_state); + + if (shader->pm4) { + shader->pm4->shader = shader; + return shader->pm4; + } else { + fprintf(stderr, "radeonsi: Failed to create pm4 state.\n"); + return NULL; + } } static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader, - unsigned num_always_on_user_sgprs) + unsigned num_always_on_user_sgprs) { - struct si_shader_selector *vs = shader->previous_stage_sel ? - shader->previous_stage_sel : shader->selector; - unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs; + struct si_shader_selector *vs = + shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector; + unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs; - /* 1 SGPR is reserved for the vertex buffer pointer. */ - assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1); + /* 1 SGPR is reserved for the vertex buffer pointer. */ + assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1); - if (num_vbos_in_user_sgprs) - return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4; + if (num_vbos_in_user_sgprs) + return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4; - /* Add the pointer to VBO descriptors. */ - return num_always_on_user_sgprs + 1; + /* Add the pointer to VBO descriptors. */ + return num_always_on_user_sgprs + 1; } /* Return VGPR_COMP_CNT for the API vertex shader. This can be hw LS, LSHS, ES, ESGS, VS. */ -static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, - struct si_shader *shader, bool legacy_vs_prim_id) -{ - assert(shader->selector->type == PIPE_SHADER_VERTEX || - (shader->previous_stage_sel && - shader->previous_stage_sel->type == PIPE_SHADER_VERTEX)); - - /* GFX6-9 LS (VertexID, RelAutoindex, InstanceID / StepRate0(==1), ...). - * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0(==1), VSPrimID, ...) - * GFX10 LS (VertexID, RelAutoindex, UserVGPR1, InstanceID). - * GFX10 ES,VS (VertexID, UserVGPR0, UserVGPR1 or VSPrimID, UserVGPR2 or InstanceID) - */ - bool is_ls = shader->selector->type == PIPE_SHADER_TESS_CTRL || shader->key.as_ls; - - if (sscreen->info.chip_class >= GFX10 && shader->info.uses_instanceid) - return 3; - else if ((is_ls && shader->info.uses_instanceid) || legacy_vs_prim_id) - return 2; - else if (is_ls || shader->info.uses_instanceid) - return 1; - else - return 0; +static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_shader *shader, + bool legacy_vs_prim_id) +{ + assert(shader->selector->type == PIPE_SHADER_VERTEX || + (shader->previous_stage_sel && shader->previous_stage_sel->type == PIPE_SHADER_VERTEX)); + + /* GFX6-9 LS (VertexID, RelAutoindex, InstanceID / StepRate0(==1), ...). + * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0(==1), VSPrimID, ...) + * GFX10 LS (VertexID, RelAutoindex, UserVGPR1, InstanceID). 
+ * GFX10 ES,VS (VertexID, UserVGPR0, UserVGPR1 or VSPrimID, UserVGPR2 or + * InstanceID) + */ + bool is_ls = shader->selector->type == PIPE_SHADER_TESS_CTRL || shader->key.as_ls; + + if (sscreen->info.chip_class >= GFX10 && shader->info.uses_instanceid) + return 3; + else if ((is_ls && shader->info.uses_instanceid) || legacy_vs_prim_id) + return 2; + else if (is_ls || shader->info.uses_instanceid) + return 1; + else + return 0; } static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader) { - struct si_pm4_state *pm4; - uint64_t va; + struct si_pm4_state *pm4; + uint64_t va; - assert(sscreen->info.chip_class <= GFX8); + assert(sscreen->info.chip_class <= GFX8); - pm4 = si_get_shader_pm4_state(shader); - if (!pm4) - return; + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); - si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); + si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40)); - shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) | - S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) | - S_00B528_DX10_CLAMP(1) | - S_00B528_FLOAT_MODE(shader->config.float_mode); - shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) | - S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) | + S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) | + S_00B528_DX10_CLAMP(1) | S_00B528_FLOAT_MODE(shader->config.float_mode); + shader->config.rsrc2 = + S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) | + S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); } static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) { - struct si_pm4_state *pm4; - uint64_t va; - - pm4 = si_get_shader_pm4_state(shader); - if (!pm4) - return; - - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - - if (sscreen->info.chip_class >= GFX9) { - if (sscreen->info.chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); - si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40)); - } else { - si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8); - si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, S_00B414_MEM_BASE(va >> 40)); - } - - unsigned num_user_sgprs = - si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR); - - shader->config.rsrc2 = - S_00B42C_USER_SGPR(num_user_sgprs) | - S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); - - if (sscreen->info.chip_class >= GFX10) - shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); - else - shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); - } else { - si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8); - si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, S_00B424_MEM_BASE(va >> 40)); - - shader->config.rsrc2 = - 
S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) | - S_00B42C_OC_LDS_EN(1) | - S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); - } - - si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS, - S_00B428_VGPRS((shader->config.num_vgprs - 1) / - (sscreen->ge_wave_size == 32 ? 8 : 4)) | - (sscreen->info.chip_class <= GFX9 ? - S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8) : 0) | - S_00B428_DX10_CLAMP(1) | - S_00B428_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | - S_00B428_WGP_MODE(sscreen->info.chip_class >= GFX10) | - S_00B428_FLOAT_MODE(shader->config.float_mode) | - S_00B428_LS_VGPR_COMP_CNT(sscreen->info.chip_class >= GFX9 ? - si_get_vs_vgpr_comp_cnt(sscreen, shader, false) : 0)); - - if (sscreen->info.chip_class <= GFX8) { - si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, - shader->config.rsrc2); - } + struct si_pm4_state *pm4; + uint64_t va; + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + + if (sscreen->info.chip_class >= GFX9) { + if (sscreen->info.chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); + si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40)); + } else { + si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8); + si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, S_00B414_MEM_BASE(va >> 40)); + } + + unsigned num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR); + + shader->config.rsrc2 = S_00B42C_USER_SGPR(num_user_sgprs) | + S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + + if (sscreen->info.chip_class >= GFX10) + shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); + else + shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); + } else { + si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8); + si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, S_00B424_MEM_BASE(va >> 40)); + + shader->config.rsrc2 = S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) | S_00B42C_OC_LDS_EN(1) | + S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + } + + si_pm4_set_reg( + pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS, + S_00B428_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) | + (sscreen->info.chip_class <= GFX9 ? S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8) + : 0) | + S_00B428_DX10_CLAMP(1) | S_00B428_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | + S_00B428_WGP_MODE(sscreen->info.chip_class >= GFX10) | + S_00B428_FLOAT_MODE(shader->config.float_mode) | + S_00B428_LS_VGPR_COMP_CNT(sscreen->info.chip_class >= GFX9 + ? 
si_get_vs_vgpr_comp_cnt(sscreen, shader, false) + : 0)); + + if (sscreen->info.chip_class <= GFX8) { + si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, shader->config.rsrc2); + } } static void si_emit_shader_es(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.es->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; + struct si_shader *shader = sctx->queued.named.es->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; - if (!shader) - return; + if (!shader) + return; - radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, - SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, - shader->selector->esgs_itemsize / 4); + radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, + SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, + shader->selector->esgs_itemsize / 4); - if (shader->selector->type == PIPE_SHADER_TESS_EVAL) - radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, - SI_TRACKED_VGT_TF_PARAM, - shader->vgt_tf_param); + if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); - if (shader->vgt_vertex_reuse_block_cntl) - radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, - SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, - shader->vgt_vertex_reuse_block_cntl); + if (shader->vgt_vertex_reuse_block_cntl) + radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, + SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + shader->vgt_vertex_reuse_block_cntl); - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; } static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) { - struct si_pm4_state *pm4; - unsigned num_user_sgprs; - unsigned vgpr_comp_cnt; - uint64_t va; - unsigned oc_lds_en; - - assert(sscreen->info.chip_class <= GFX8); - - pm4 = si_get_shader_pm4_state(shader); - if (!pm4) - return; - - pm4->atom.emit = si_emit_shader_es; - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - - if (shader->selector->type == PIPE_SHADER_VERTEX) { - vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); - num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); - } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { - vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2; - num_user_sgprs = SI_TES_NUM_USER_SGPR; - } else - unreachable("invalid shader selector type"); - - oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 
1 : 0; - - si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); - si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES, - S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B328_SGPRS((shader->config.num_sgprs - 1) / 8) | - S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | - S_00B328_DX10_CLAMP(1) | - S_00B328_FLOAT_MODE(shader->config.float_mode)); - si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, - S_00B32C_USER_SGPR(num_user_sgprs) | - S_00B32C_OC_LDS_EN(oc_lds_en) | - S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); - - if (shader->selector->type == PIPE_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->selector, pm4); - - polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); -} - -void gfx9_get_gs_info(struct si_shader_selector *es, - struct si_shader_selector *gs, - struct gfx9_gs_info *out) -{ - unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1); - unsigned input_prim = gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; - bool uses_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY && - input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; - - /* All these are in dwords: */ - /* We can't allow using the whole LDS, because GS waves compete with - * other shader stages for LDS space. */ - const unsigned max_lds_size = 8 * 1024; - const unsigned esgs_itemsize = es->esgs_itemsize / 4; - unsigned esgs_lds_size; - - /* All these are per subgroup: */ - const unsigned max_out_prims = 32 * 1024; - const unsigned max_es_verts = 255; - const unsigned ideal_gs_prims = 64; - unsigned max_gs_prims, gs_prims; - unsigned min_es_verts, es_verts, worst_case_es_verts; - - if (uses_adjacency || gs_num_invocations > 1) - max_gs_prims = 127 / gs_num_invocations; - else - max_gs_prims = 255; - - /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations. - * Make sure we don't go over the maximum value. - */ - if (gs->gs_max_out_vertices > 0) { - max_gs_prims = MIN2(max_gs_prims, - max_out_prims / - (gs->gs_max_out_vertices * gs_num_invocations)); - } - assert(max_gs_prims > 0); - - /* If the primitive has adjacency, halve the number of vertices - * that will be reused in multiple primitives. - */ - min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : 1); - - gs_prims = MIN2(ideal_gs_prims, max_gs_prims); - worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); - - /* Compute ESGS LDS size based on the worst case number of ES vertices - * needed to create the target number of GS prims per subgroup. - */ - esgs_lds_size = esgs_itemsize * worst_case_es_verts; - - /* If total LDS usage is too big, refactor partitions based on ratio - * of ESGS item sizes. - */ - if (esgs_lds_size > max_lds_size) { - /* Our target GS Prims Per Subgroup was too large. Calculate - * the maximum number of GS Prims Per Subgroup that will fit - * into LDS, capped by the maximum that the hardware can support. - */ - gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), - max_gs_prims); - assert(gs_prims > 0); - worst_case_es_verts = MIN2(min_es_verts * gs_prims, - max_es_verts); - - esgs_lds_size = esgs_itemsize * worst_case_es_verts; - assert(esgs_lds_size <= max_lds_size); - } - - /* Now calculate remaining ESGS information. 
*/ - if (esgs_lds_size) - es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts); - else - es_verts = max_es_verts; - - /* Vertices for adjacency primitives are not always reused, so restore - * it for ES_VERTS_PER_SUBGRP. - */ - min_es_verts = gs->gs_input_verts_per_prim; - - /* For normal primitives, the VGT only checks if they are past the ES - * verts per subgroup after allocating a full GS primitive and if they - * are, kick off a new subgroup. But if those additional ES verts are - * unique (e.g. not reused) we need to make sure there is enough LDS - * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP. - */ - es_verts -= min_es_verts - 1; - - out->es_verts_per_subgroup = es_verts; - out->gs_prims_per_subgroup = gs_prims; - out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations; - out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * - gs->gs_max_out_vertices; - out->esgs_ring_size = 4 * esgs_lds_size; - - assert(out->max_prims_per_subgroup <= max_out_prims); + struct si_pm4_state *pm4; + unsigned num_user_sgprs; + unsigned vgpr_comp_cnt; + uint64_t va; + unsigned oc_lds_en; + + assert(sscreen->info.chip_class <= GFX8); + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + pm4->atom.emit = si_emit_shader_es; + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + + if (shader->selector->type == PIPE_SHADER_VERTEX) { + vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); + num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); + } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { + vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2; + num_user_sgprs = SI_TES_NUM_USER_SGPR; + } else + unreachable("invalid shader selector type"); + + oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0; + + si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES, + S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B328_SGPRS((shader->config.num_sgprs - 1) / 8) | + S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B328_DX10_CLAMP(1) | + S_00B328_FLOAT_MODE(shader->config.float_mode)); + si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, + S_00B32C_USER_SGPR(num_user_sgprs) | S_00B32C_OC_LDS_EN(oc_lds_en) | + S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); + + if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, shader->selector, pm4); + + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); +} + +void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, + struct gfx9_gs_info *out) +{ + unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1); + unsigned input_prim = gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; + bool uses_adjacency = + input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; + + /* All these are in dwords: */ + /* We can't allow using the whole LDS, because GS waves compete with + * other shader stages for LDS space. 
*/ + const unsigned max_lds_size = 8 * 1024; + const unsigned esgs_itemsize = es->esgs_itemsize / 4; + unsigned esgs_lds_size; + + /* All these are per subgroup: */ + const unsigned max_out_prims = 32 * 1024; + const unsigned max_es_verts = 255; + const unsigned ideal_gs_prims = 64; + unsigned max_gs_prims, gs_prims; + unsigned min_es_verts, es_verts, worst_case_es_verts; + + if (uses_adjacency || gs_num_invocations > 1) + max_gs_prims = 127 / gs_num_invocations; + else + max_gs_prims = 255; + + /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations. + * Make sure we don't go over the maximum value. + */ + if (gs->gs_max_out_vertices > 0) { + max_gs_prims = + MIN2(max_gs_prims, max_out_prims / (gs->gs_max_out_vertices * gs_num_invocations)); + } + assert(max_gs_prims > 0); + + /* If the primitive has adjacency, halve the number of vertices + * that will be reused in multiple primitives. + */ + min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : 1); + + gs_prims = MIN2(ideal_gs_prims, max_gs_prims); + worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); + + /* Compute ESGS LDS size based on the worst case number of ES vertices + * needed to create the target number of GS prims per subgroup. + */ + esgs_lds_size = esgs_itemsize * worst_case_es_verts; + + /* If total LDS usage is too big, refactor partitions based on ratio + * of ESGS item sizes. + */ + if (esgs_lds_size > max_lds_size) { + /* Our target GS Prims Per Subgroup was too large. Calculate + * the maximum number of GS Prims Per Subgroup that will fit + * into LDS, capped by the maximum that the hardware can support. + */ + gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims); + assert(gs_prims > 0); + worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); + + esgs_lds_size = esgs_itemsize * worst_case_es_verts; + assert(esgs_lds_size <= max_lds_size); + } + + /* Now calculate remaining ESGS information. */ + if (esgs_lds_size) + es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts); + else + es_verts = max_es_verts; + + /* Vertices for adjacency primitives are not always reused, so restore + * it for ES_VERTS_PER_SUBGRP. + */ + min_es_verts = gs->gs_input_verts_per_prim; + + /* For normal primitives, the VGT only checks if they are past the ES + * verts per subgroup after allocating a full GS primitive and if they + * are, kick off a new subgroup. But if those additional ES verts are + * unique (e.g. not reused) we need to make sure there is enough LDS + * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP. 
+ */ + es_verts -= min_es_verts - 1; + + out->es_verts_per_subgroup = es_verts; + out->gs_prims_per_subgroup = gs_prims; + out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations; + out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * gs->gs_max_out_vertices; + out->esgs_ring_size = 4 * esgs_lds_size; + + assert(out->max_prims_per_subgroup <= max_out_prims); } static void si_emit_shader_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - - if (!shader) - return; - - /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2 - * R_028A68_VGT_GSVS_RING_OFFSET_3 */ - radeon_opt_set_context_reg3(sctx, R_028A60_VGT_GSVS_RING_OFFSET_1, - SI_TRACKED_VGT_GSVS_RING_OFFSET_1, - shader->ctx_reg.gs.vgt_gsvs_ring_offset_1, - shader->ctx_reg.gs.vgt_gsvs_ring_offset_2, - shader->ctx_reg.gs.vgt_gsvs_ring_offset_3); - - /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */ - radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE, - SI_TRACKED_VGT_GSVS_RING_ITEMSIZE, - shader->ctx_reg.gs.vgt_gsvs_ring_itemsize); - - /* R_028B38_VGT_GS_MAX_VERT_OUT */ - radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, - SI_TRACKED_VGT_GS_MAX_VERT_OUT, - shader->ctx_reg.gs.vgt_gs_max_vert_out); - - /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1 - * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */ - radeon_opt_set_context_reg4(sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE, - SI_TRACKED_VGT_GS_VERT_ITEMSIZE, - shader->ctx_reg.gs.vgt_gs_vert_itemsize, - shader->ctx_reg.gs.vgt_gs_vert_itemsize_1, - shader->ctx_reg.gs.vgt_gs_vert_itemsize_2, - shader->ctx_reg.gs.vgt_gs_vert_itemsize_3); - - /* R_028B90_VGT_GS_INSTANCE_CNT */ - radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, - SI_TRACKED_VGT_GS_INSTANCE_CNT, - shader->ctx_reg.gs.vgt_gs_instance_cnt); - - if (sctx->chip_class >= GFX9) { - /* R_028A44_VGT_GS_ONCHIP_CNTL */ - radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, - SI_TRACKED_VGT_GS_ONCHIP_CNTL, - shader->ctx_reg.gs.vgt_gs_onchip_cntl); - /* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */ - radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, - SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, - shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup); - /* R_028AAC_VGT_ESGS_RING_ITEMSIZE */ - radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, - SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, - shader->ctx_reg.gs.vgt_esgs_ring_itemsize); - - if (shader->key.part.gs.es->type == PIPE_SHADER_TESS_EVAL) - radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, - SI_TRACKED_VGT_TF_PARAM, - shader->vgt_tf_param); - if (shader->vgt_vertex_reuse_block_cntl) - radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, - SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, - shader->vgt_vertex_reuse_block_cntl); - } - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + + if (!shader) + return; + + /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2 + * R_028A68_VGT_GSVS_RING_OFFSET_3 */ + radeon_opt_set_context_reg3( + sctx, R_028A60_VGT_GSVS_RING_OFFSET_1, SI_TRACKED_VGT_GSVS_RING_OFFSET_1, + shader->ctx_reg.gs.vgt_gsvs_ring_offset_1, shader->ctx_reg.gs.vgt_gsvs_ring_offset_2, + shader->ctx_reg.gs.vgt_gsvs_ring_offset_3); + + /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */ + radeon_opt_set_context_reg(sctx, 
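To make the dword arithmetic in gfx9_get_gs_info() easier to follow, here is a minimal standalone sketch of the same sizing pass with illustrative inputs (the itemsize and vertex counts below are assumptions, not values taken from a real shader):

#include <assert.h>
#include <stdio.h>

int main(void)
{
   /* All sizes in dwords, mirroring gfx9_get_gs_info(). */
   const unsigned max_lds_size = 8 * 1024;   /* LDS budget for ESGS */
   const unsigned esgs_itemsize = 16;        /* assumed ES output size */
   const unsigned max_es_verts = 255;
   unsigned min_es_verts = 3;                /* triangles, no adjacency */
   unsigned gs_prims = 64;                   /* ideal_gs_prims, assuming it fits */

   unsigned worst_case_es_verts = min_es_verts * gs_prims;  /* 192 */
   if (worst_case_es_verts > max_es_verts)
      worst_case_es_verts = max_es_verts;

   unsigned esgs_lds_size = esgs_itemsize * worst_case_es_verts; /* 3072, fits in 8192 */
   assert(esgs_lds_size <= max_lds_size);

   unsigned es_verts = esgs_lds_size / esgs_itemsize; /* 192 */
   es_verts -= min_es_verts - 1;                      /* 190: room for non-reused verts */

   printf("ES_VERTS_PER_SUBGRP=%u GS_PRIMS_PER_SUBGRP=%u esgs_ring_size=%u bytes\n",
          es_verts, gs_prims, 4 * esgs_lds_size);
   return 0;
}

With a larger itemsize the 8K-dword cap is hit and gs_prims gets recomputed as max_lds_size / (esgs_itemsize * min_es_verts), exactly as the refactoring branch above does.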
R_028AB0_VGT_GSVS_RING_ITEMSIZE, + SI_TRACKED_VGT_GSVS_RING_ITEMSIZE, + shader->ctx_reg.gs.vgt_gsvs_ring_itemsize); + + /* R_028B38_VGT_GS_MAX_VERT_OUT */ + radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT, + shader->ctx_reg.gs.vgt_gs_max_vert_out); + + /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1 + * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */ + radeon_opt_set_context_reg4( + sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE, SI_TRACKED_VGT_GS_VERT_ITEMSIZE, + shader->ctx_reg.gs.vgt_gs_vert_itemsize, shader->ctx_reg.gs.vgt_gs_vert_itemsize_1, + shader->ctx_reg.gs.vgt_gs_vert_itemsize_2, shader->ctx_reg.gs.vgt_gs_vert_itemsize_3); + + /* R_028B90_VGT_GS_INSTANCE_CNT */ + radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT, + shader->ctx_reg.gs.vgt_gs_instance_cnt); + + if (sctx->chip_class >= GFX9) { + /* R_028A44_VGT_GS_ONCHIP_CNTL */ + radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL, + shader->ctx_reg.gs.vgt_gs_onchip_cntl); + /* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */ + radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, + SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, + shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup); + /* R_028AAC_VGT_ESGS_RING_ITEMSIZE */ + radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, + SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, + shader->ctx_reg.gs.vgt_esgs_ring_itemsize); + + if (shader->key.part.gs.es->type == PIPE_SHADER_TESS_EVAL) + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); + if (shader->vgt_vertex_reuse_block_cntl) + radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, + SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + shader->vgt_vertex_reuse_block_cntl); + } + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; } static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) { - struct si_shader_selector *sel = shader->selector; - const ubyte *num_components = sel->info.num_stream_output_components; - unsigned gs_num_invocations = sel->gs_num_invocations; - struct si_pm4_state *pm4; - uint64_t va; - unsigned max_stream = sel->max_gs_stream; - unsigned offset; - - pm4 = si_get_shader_pm4_state(shader); - if (!pm4) - return; - - pm4->atom.emit = si_emit_shader_gs; - - offset = num_components[0] * sel->gs_max_out_vertices; - shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset; - - if (max_stream >= 1) - offset += num_components[1] * sel->gs_max_out_vertices; - shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset; - - if (max_stream >= 2) - offset += num_components[2] * sel->gs_max_out_vertices; - shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset; - - if (max_stream >= 3) - offset += num_components[3] * sel->gs_max_out_vertices; - shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset; - - /* The GSVS_RING_ITEMSIZE register takes 15 bits */ - assert(offset < (1 << 15)); - - shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->gs_max_out_vertices; - - shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0]; - shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 1) ? num_components[1] : 0; - shader->ctx_reg.gs.vgt_gs_vert_itemsize_2 = (max_stream >= 2) ? num_components[2] : 0; - shader->ctx_reg.gs.vgt_gs_vert_itemsize_3 = (max_stream >= 3) ? 
num_components[3] : 0; - - shader->ctx_reg.gs.vgt_gs_instance_cnt = S_028B90_CNT(MIN2(gs_num_invocations, 127)) | - S_028B90_ENABLE(gs_num_invocations > 0); - - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - - if (sscreen->info.chip_class >= GFX9) { - unsigned input_prim = sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; - unsigned es_type = shader->key.part.gs.es->type; - unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt; - - if (es_type == PIPE_SHADER_VERTEX) { - es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); - } else if (es_type == PIPE_SHADER_TESS_EVAL) - es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2; - else - unreachable("invalid shader selector type"); - - /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and - * VGPR[0:4] are always loaded. - */ - if (sel->info.uses_invocationid) - gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ - else if (sel->info.uses_primid) - gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ - else if (input_prim >= PIPE_PRIM_TRIANGLES) - gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ - else - gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ - - unsigned num_user_sgprs; - if (es_type == PIPE_SHADER_VERTEX) - num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); - else - num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; - - if (sscreen->info.chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); - } else { - si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40)); - } - - uint32_t rsrc1 = - S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B228_DX10_CLAMP(1) | - S_00B228_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | - S_00B228_WGP_MODE(sscreen->info.chip_class >= GFX10) | - S_00B228_FLOAT_MODE(shader->config.float_mode) | - S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt); - uint32_t rsrc2 = - S_00B22C_USER_SGPR(num_user_sgprs) | - S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | - S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) | - S_00B22C_LDS_SIZE(shader->config.lds_size) | - S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); - - if (sscreen->info.chip_class >= GFX10) { - rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); - } else { - rsrc1 |= S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8); - rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); - } - - si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1); - si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); - - if (sscreen->info.chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | - S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); - } - - shader->ctx_reg.gs.vgt_gs_onchip_cntl = - S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | - S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) | - S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup); - shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup = - S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup); - shader->ctx_reg.gs.vgt_esgs_ring_itemsize = - shader->key.part.gs.es->esgs_itemsize / 4; - - if (es_type == PIPE_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4); - - 
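The per-stream GSVS ring offsets programmed in si_shader_gs() are just a running sum of num_components[stream] * gs_max_out_vertices. A small sketch with assumed per-stream component counts:

#include <assert.h>
#include <stdio.h>

int main(void)
{
   /* Illustrative inputs: two streams with 4 and 2 components, 16 verts out. */
   const unsigned num_components[4] = {4, 2, 0, 0};
   const unsigned gs_max_out_vertices = 16;
   const unsigned max_stream = 1;
   unsigned ring_offset[3], offset;

   offset = num_components[0] * gs_max_out_vertices;      /* 64 */
   ring_offset[0] = offset;
   if (max_stream >= 1)
      offset += num_components[1] * gs_max_out_vertices;  /* 96 */
   ring_offset[1] = offset;
   if (max_stream >= 2)
      offset += num_components[2] * gs_max_out_vertices;
   ring_offset[2] = offset;
   if (max_stream >= 3)
      offset += num_components[3] * gs_max_out_vertices;

   /* The final sum becomes VGT_GSVS_RING_ITEMSIZE and must fit in 15 bits. */
   assert(offset < (1 << 15));
   printf("offsets = %u %u %u, itemsize = %u\n",
          ring_offset[0], ring_offset[1], ring_offset[2], offset);
   return 0;
}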
polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, - NULL, pm4); - } else { - si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); - si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40)); - - si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, - S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) | - S_00B228_DX10_CLAMP(1) | - S_00B228_FLOAT_MODE(shader->config.float_mode)); - si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, - S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) | - S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); - } + struct si_shader_selector *sel = shader->selector; + const ubyte *num_components = sel->info.num_stream_output_components; + unsigned gs_num_invocations = sel->gs_num_invocations; + struct si_pm4_state *pm4; + uint64_t va; + unsigned max_stream = sel->max_gs_stream; + unsigned offset; + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + pm4->atom.emit = si_emit_shader_gs; + + offset = num_components[0] * sel->gs_max_out_vertices; + shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset; + + if (max_stream >= 1) + offset += num_components[1] * sel->gs_max_out_vertices; + shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset; + + if (max_stream >= 2) + offset += num_components[2] * sel->gs_max_out_vertices; + shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset; + + if (max_stream >= 3) + offset += num_components[3] * sel->gs_max_out_vertices; + shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset; + + /* The GSVS_RING_ITEMSIZE register takes 15 bits */ + assert(offset < (1 << 15)); + + shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->gs_max_out_vertices; + + shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0]; + shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 1) ? num_components[1] : 0; + shader->ctx_reg.gs.vgt_gs_vert_itemsize_2 = (max_stream >= 2) ? num_components[2] : 0; + shader->ctx_reg.gs.vgt_gs_vert_itemsize_3 = (max_stream >= 3) ? num_components[3] : 0; + + shader->ctx_reg.gs.vgt_gs_instance_cnt = + S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0); + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + + if (sscreen->info.chip_class >= GFX9) { + unsigned input_prim = sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; + unsigned es_type = shader->key.part.gs.es->type; + unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt; + + if (es_type == PIPE_SHADER_VERTEX) { + es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); + } else if (es_type == PIPE_SHADER_TESS_EVAL) + es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2; + else + unreachable("invalid shader selector type"); + + /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and + * VGPR[0:4] are always loaded. + */ + if (sel->info.uses_invocationid) + gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ + else if (sel->info.uses_primid) + gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. 
*/ + else if (input_prim >= PIPE_PRIM_TRIANGLES) + gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ + else + gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ + + unsigned num_user_sgprs; + if (es_type == PIPE_SHADER_VERTEX) + num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); + else + num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; + + if (sscreen->info.chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); + } else { + si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8); + si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40)); + } + + uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) | + S_00B228_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | + S_00B228_WGP_MODE(sscreen->info.chip_class >= GFX10) | + S_00B228_FLOAT_MODE(shader->config.float_mode) | + S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt); + uint32_t rsrc2 = S_00B22C_USER_SGPR(num_user_sgprs) | + S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | + S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) | + S_00B22C_LDS_SIZE(shader->config.lds_size) | + S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + + if (sscreen->info.chip_class >= GFX10) { + rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); + } else { + rsrc1 |= S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8); + rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); + } + + si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1); + si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); + + if (sscreen->info.chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); + } + + shader->ctx_reg.gs.vgt_gs_onchip_cntl = + S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | + S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup); + shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup = + S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup); + shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.part.gs.es->esgs_itemsize / 4; + + if (es_type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4); + + polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4); + } else { + si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); + si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40)); + + si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, + S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) | + S_00B228_DX10_CLAMP(1) | S_00B228_FLOAT_MODE(shader->config.float_mode)); + si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, + S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) | + S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); + } } static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value) { - enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC; + enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC; - if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || - sctx->tracked_regs.reg_value[reg] != value) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || + sctx->tracked_regs.reg_value[reg] != value) { + 
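The GS_VGPR_COMP_CNT choice in the GFX9 merged-GS path is a strict priority ladder. A condensed sketch; the enum values and parameter names are local stand-ins for the selector info, not driver types:

/* Priority: InvocationID > PrimitiveID > offsets 2,3 > offsets 0,1. */
enum { PRIM_POINTS = 0, PRIM_LINES = 1, PRIM_TRIANGLES = 4 };

static unsigned gs_vgpr_comp_cnt(int uses_invocationid, int uses_primid, int input_prim)
{
   if (uses_invocationid)
      return 3; /* VGPR3 holds InvocationID */
   if (uses_primid)
      return 2; /* VGPR2 holds PrimitiveID */
   if (input_prim >= PRIM_TRIANGLES)
      return 1; /* VGPR1 holds ES vertex offsets 2, 3 */
   return 0;    /* VGPR0 holds ES vertex offsets 0, 1 */
}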
struct radeon_cmdbuf *cs = sctx->gfx_cs; - if (sctx->family == CHIP_NAVI10 || - sctx->family == CHIP_NAVI12 || - sctx->family == CHIP_NAVI14) { - /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0)); - } + if (sctx->family == CHIP_NAVI10 || sctx->family == CHIP_NAVI12 || + sctx->family == CHIP_NAVI14) { + /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0)); + } - radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value); + radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value); - sctx->tracked_regs.reg_saved |= 0x1ull << reg; - sctx->tracked_regs.reg_value[reg] = value; - } + sctx->tracked_regs.reg_saved |= 0x1ull << reg; + sctx->tracked_regs.reg_value[reg] = value; + } } /* Common tail code for NGG primitive shaders. */ -static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, - struct si_shader *shader, - unsigned initial_cdw) -{ - radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, - SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, - shader->ctx_reg.ngg.ge_max_output_per_subgroup); - radeon_opt_set_context_reg(sctx, R_028B4C_GE_NGG_SUBGRP_CNTL, - SI_TRACKED_GE_NGG_SUBGRP_CNTL, - shader->ctx_reg.ngg.ge_ngg_subgrp_cntl); - radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, - SI_TRACKED_VGT_PRIMITIVEID_EN, - shader->ctx_reg.ngg.vgt_primitiveid_en); - radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, - SI_TRACKED_VGT_GS_ONCHIP_CNTL, - shader->ctx_reg.ngg.vgt_gs_onchip_cntl); - radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, - SI_TRACKED_VGT_GS_INSTANCE_CNT, - shader->ctx_reg.ngg.vgt_gs_instance_cnt); - radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, - SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, - shader->ctx_reg.ngg.vgt_esgs_ring_itemsize); - radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, - SI_TRACKED_SPI_VS_OUT_CONFIG, - shader->ctx_reg.ngg.spi_vs_out_config); - radeon_opt_set_context_reg2(sctx, R_028708_SPI_SHADER_IDX_FORMAT, - SI_TRACKED_SPI_SHADER_IDX_FORMAT, - shader->ctx_reg.ngg.spi_shader_idx_format, - shader->ctx_reg.ngg.spi_shader_pos_format); - radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, - SI_TRACKED_PA_CL_VTE_CNTL, - shader->ctx_reg.ngg.pa_cl_vte_cntl); - radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, - SI_TRACKED_PA_CL_NGG_CNTL, - shader->ctx_reg.ngg.pa_cl_ngg_cntl); - - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, - shader->pa_cl_vs_out_cntl, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; - - /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. 
*/ - gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc); +static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader, + unsigned initial_cdw) +{ + radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, + SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, + shader->ctx_reg.ngg.ge_max_output_per_subgroup); + radeon_opt_set_context_reg(sctx, R_028B4C_GE_NGG_SUBGRP_CNTL, SI_TRACKED_GE_NGG_SUBGRP_CNTL, + shader->ctx_reg.ngg.ge_ngg_subgrp_cntl); + radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN, + shader->ctx_reg.ngg.vgt_primitiveid_en); + radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL, + shader->ctx_reg.ngg.vgt_gs_onchip_cntl); + radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT, + shader->ctx_reg.ngg.vgt_gs_instance_cnt); + radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, + SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize); + radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG, + shader->ctx_reg.ngg.spi_vs_out_config); + radeon_opt_set_context_reg2( + sctx, R_028708_SPI_SHADER_IDX_FORMAT, SI_TRACKED_SPI_SHADER_IDX_FORMAT, + shader->ctx_reg.ngg.spi_shader_idx_format, shader->ctx_reg.ngg.spi_shader_pos_format); + radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL, + shader->ctx_reg.ngg.pa_cl_vte_cntl); + radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL, + shader->ctx_reg.ngg.pa_cl_ngg_cntl); + + radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl, + SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; + + /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. 
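Both emit paths here rely on the same two pieces of bookkeeping: a context register is written only when the tracked shadow value differs, and a context roll is recorded only if the command stream actually grew. A distilled sketch of that pattern, with simplified stand-in types (the fake cdw counter is an assumption for illustration):

#include <stdbool.h>
#include <stdint.h>

struct tracked_regs {
   uint64_t saved_mask;     /* bit i set => reg_value[i] is known */
   uint32_t reg_value[64];
};

/* Returns true if something was emitted (here we only bump a fake cdw). */
static bool opt_set_reg(struct tracked_regs *regs, unsigned *cdw, unsigned reg, uint32_t value)
{
   if (((regs->saved_mask >> reg) & 1) && regs->reg_value[reg] == value)
      return false;                 /* value already programmed, skip */

   *cdw += 2;                       /* pretend: SET_CONTEXT_REG header + value */
   regs->saved_mask |= UINT64_C(1) << reg;
   regs->reg_value[reg] = value;
   return true;
}

/* Caller pattern, as in the emit functions here:
 *    unsigned initial_cdw = cdw;
 *    opt_set_reg(&regs, &cdw, REG_A, a);
 *    opt_set_reg(&regs, &cdw, REG_B, b);
 *    if (initial_cdw != cdw)
 *       context_roll = true;   (GE_PC_ALLOC is a uconfig reg, so it stays out of this check)
 */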
*/ + gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc); } static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; - if (!shader) - return; + if (!shader) + return; - gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); + gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); } static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; - if (!shader) - return; + if (!shader) + return; - radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, - SI_TRACKED_VGT_TF_PARAM, - shader->vgt_tf_param); + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); - gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); + gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); } static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; - if (!shader) - return; + if (!shader) + return; - radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, - SI_TRACKED_VGT_GS_MAX_VERT_OUT, - shader->ctx_reg.ngg.vgt_gs_max_vert_out); + radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT, + shader->ctx_reg.ngg.vgt_gs_max_vert_out); - gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); + gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); } static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; - if (!shader) - return; + if (!shader) + return; - radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, - SI_TRACKED_VGT_GS_MAX_VERT_OUT, - shader->ctx_reg.ngg.vgt_gs_max_vert_out); - radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, - SI_TRACKED_VGT_TF_PARAM, - shader->vgt_tf_param); + radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT, + shader->ctx_reg.ngg.vgt_gs_max_vert_out); + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); - gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); + gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); } unsigned si_get_input_prim(const struct si_shader_selector *gs) { - if (gs->type == PIPE_SHADER_GEOMETRY) - return gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; - - if (gs->type == PIPE_SHADER_TESS_EVAL) { - if (gs->info.properties[TGSI_PROPERTY_TES_POINT_MODE]) - return PIPE_PRIM_POINTS; - if (gs->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) - return PIPE_PRIM_LINES; - return PIPE_PRIM_TRIANGLES; - } - - /* TODO: Set this correctly if the primitive type is set in the shader key. 
*/ - return PIPE_PRIM_TRIANGLES; /* worst case for all callers */ + if (gs->type == PIPE_SHADER_GEOMETRY) + return gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; + + if (gs->type == PIPE_SHADER_TESS_EVAL) { + if (gs->info.properties[TGSI_PROPERTY_TES_POINT_MODE]) + return PIPE_PRIM_POINTS; + if (gs->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) + return PIPE_PRIM_LINES; + return PIPE_PRIM_TRIANGLES; + } + + /* TODO: Set this correctly if the primitive type is set in the shader key. */ + return PIPE_PRIM_TRIANGLES; /* worst case for all callers */ } static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, bool ngg) { - bool misc_vec_ena = - sel->info.writes_psize || (sel->info.writes_edgeflag && !ngg) || - sel->info.writes_layer || sel->info.writes_viewport_index; - return S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize) | - S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) | - S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) | - S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) | - S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) | - S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena); + bool misc_vec_ena = sel->info.writes_psize || (sel->info.writes_edgeflag && !ngg) || + sel->info.writes_layer || sel->info.writes_viewport_index; + return S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize) | + S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) | + S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) | + S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) | + S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) | + S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena); } /** @@ -1136,305 +1061,279 @@ static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, bool ng */ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader) { - const struct si_shader_selector *gs_sel = shader->selector; - const struct si_shader_info *gs_info = &gs_sel->info; - enum pipe_shader_type gs_type = shader->selector->type; - const struct si_shader_selector *es_sel = - shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector; - const struct si_shader_info *es_info = &es_sel->info; - enum pipe_shader_type es_type = es_sel->type; - unsigned num_user_sgprs; - unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt; - uint64_t va; - unsigned window_space = - gs_info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; - bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid; - unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1); - unsigned input_prim = si_get_input_prim(gs_sel); - bool break_wave_at_eoi = false; - struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader); - if (!pm4) - return; - - if (es_type == PIPE_SHADER_TESS_EVAL) { - pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_tess_gs - : gfx10_emit_shader_ngg_tess_nogs; - } else { - pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? 
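si_get_vs_out_cntl() enables the miscellaneous position vector only when at least one of its consumers is actually written. A condensed sketch; the bit positions below are placeholders, the real ones come from the S_02881C_* macros:

#include <stdbool.h>
#include <stdint.h>

static uint32_t vs_out_cntl_sketch(bool writes_psize, bool writes_edgeflag, bool writes_layer,
                                   bool writes_viewport_index, bool ngg)
{
   /* NGG exports edge flags through the primitive export instead. */
   bool use_edgeflag = writes_edgeflag && !ngg;
   bool misc_vec_ena = writes_psize || use_edgeflag || writes_layer || writes_viewport_index;

   return ((uint32_t)writes_psize << 0) |          /* USE_VTX_POINT_SIZE */
          ((uint32_t)use_edgeflag << 1) |          /* USE_VTX_EDGE_FLAG */
          ((uint32_t)writes_layer << 2) |          /* USE_VTX_RENDER_TARGET_INDX */
          ((uint32_t)writes_viewport_index << 3) | /* USE_VTX_VIEWPORT_INDX */
          ((uint32_t)misc_vec_ena << 4) |          /* VS_OUT_MISC_VEC_ENA */
          ((uint32_t)misc_vec_ena << 5);           /* VS_OUT_MISC_SIDE_BUS_ENA */
}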
gfx10_emit_shader_ngg_notess_gs - : gfx10_emit_shader_ngg_notess_nogs; - } - - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - - if (es_type == PIPE_SHADER_VERTEX) { - es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); - - if (es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { - num_user_sgprs = SI_SGPR_VS_BLIT_DATA + - es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; - } else { - num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); - } - } else { - assert(es_type == PIPE_SHADER_TESS_EVAL); - es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2; - num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; - - if (es_enable_prim_id || gs_info->uses_primid) - break_wave_at_eoi = true; - } - - /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and - * VGPR[0:4] are always loaded. - * - * Vertex shaders always need to load VGPR3, because they need to - * pass edge flags for decomposed primitives (such as quads) to the PA - * for the GL_LINE polygon mode to skip rendering lines on inner edges. - */ - if (gs_info->uses_invocationid || - (gs_type == PIPE_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader))) - gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */ - else if ((gs_type == PIPE_SHADER_GEOMETRY && gs_info->uses_primid) || - (gs_type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)) - gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ - else if (input_prim >= PIPE_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader)) - gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ - else - gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ - - si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40); - si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, - S_00B228_VGPRS((shader->config.num_vgprs - 1) / - (sscreen->ge_wave_size == 32 ? 8 : 4)) | - S_00B228_FLOAT_MODE(shader->config.float_mode) | - S_00B228_DX10_CLAMP(1) | - S_00B228_MEM_ORDERED(1) | - S_00B228_WGP_MODE(1) | - S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt)); - si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, - S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) | - S_00B22C_USER_SGPR(num_user_sgprs) | - S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | - S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) | - S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) | - S_00B22C_LDS_SIZE(shader->config.lds_size)); - - /* Determine LATE_ALLOC_GS. */ - unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; - unsigned late_alloc_wave64; /* The limit is per SH. */ - - /* For Wave32, the hw will launch twice the number of late - * alloc waves, so 1 == 2x wave32. - * - * Don't use late alloc for NGG on Navi14 due to a hw bug. - */ - if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc) - late_alloc_wave64 = 0; - else if (num_cu_per_sh <= 6) - late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */ - else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) - late_alloc_wave64 = (num_cu_per_sh - 2) * 6; - else - late_alloc_wave64 = (num_cu_per_sh - 2) * 4; - - /* Limit LATE_ALLOC_GS for prevent a hang (hw bug). 
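The VGPRS field written to SPI_SHADER_PGM_RSRC1_GS is an allocation granule count, not a raw register count: wave64 allocates in blocks of 4 VGPRs and wave32 in blocks of 8, hence the divisor. A tiny sketch with an assumed register count:

static unsigned vgprs_field(unsigned num_vgprs, unsigned wave_size)
{
   unsigned granule = (wave_size == 32) ? 8 : 4;
   return (num_vgprs - 1) / granule;  /* e.g. 36 VGPRs: wave32 -> 4, wave64 -> 8 */
}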
*/ - if (sscreen->info.family == CHIP_NAVI10 || - sscreen->info.family == CHIP_NAVI12 || - sscreen->info.family == CHIP_NAVI14) - late_alloc_wave64 = MIN2(late_alloc_wave64, 64); - - si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | - S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); - - nparams = MAX2(shader->info.nr_param_exports, 1); - shader->ctx_reg.ngg.spi_vs_out_config = - S_0286C4_VS_EXPORT_COUNT(nparams - 1) | - S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0); - - shader->ctx_reg.ngg.spi_shader_idx_format = - S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP); - shader->ctx_reg.ngg.spi_shader_pos_format = - S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | - S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? - V_02870C_SPI_SHADER_4COMP : - V_02870C_SPI_SHADER_NONE) | - S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? - V_02870C_SPI_SHADER_4COMP : - V_02870C_SPI_SHADER_NONE) | - S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? - V_02870C_SPI_SHADER_4COMP : - V_02870C_SPI_SHADER_NONE); - - shader->ctx_reg.ngg.vgt_primitiveid_en = - S_028A84_PRIMITIVEID_EN(es_enable_prim_id) | - S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.mono.u.vs_export_prim_id || - gs_sel->info.writes_primid); - - if (gs_type == PIPE_SHADER_GEOMETRY) { - shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4; - shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->gs_max_out_vertices; - } else { - shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1; - } - - if (es_type == PIPE_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, es_sel, pm4); - - shader->ctx_reg.ngg.vgt_gs_onchip_cntl = - S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) | - S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) | - S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations); - shader->ctx_reg.ngg.ge_max_output_per_subgroup = - S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts); - shader->ctx_reg.ngg.ge_ngg_subgrp_cntl = - S_028B4C_PRIM_AMP_FACTOR(shader->ngg.prim_amp_factor) | - S_028B4C_THDS_PER_SUBGRP(0); /* for fast launch */ - shader->ctx_reg.ngg.vgt_gs_instance_cnt = - S_028B90_CNT(gs_num_invocations) | - S_028B90_ENABLE(gs_num_invocations > 1) | - S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE( - shader->ngg.max_vert_out_per_gs_instance); - - /* Always output hw-generated edge flags and pass them via the prim - * export to prevent drawing lines on internal edges of decomposed - * primitives (such as quads) with polygon mode = lines. Only VS needs - * this. - */ - shader->ctx_reg.ngg.pa_cl_ngg_cntl = - S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX); - shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(gs_sel, true); - - /* Oversubscribe PC. This improves performance when there are too many varyings. */ - float oversub_pc_factor = 0.25; - - if (shader->key.opt.ngg_culling) { - /* Be more aggressive with NGG culling. 
*/ - if (shader->info.nr_param_exports > 4) - oversub_pc_factor = 1; - else if (shader->info.nr_param_exports > 2) - oversub_pc_factor = 0.75; - else - oversub_pc_factor = 0.5; - } - - unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor; - shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) | - S_030980_NUM_PC_LINES(oversub_pc_lines - 1); - - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { - shader->ge_cntl = - S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3); - } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { - shader->ge_cntl = - S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2); - } else { - shader->ge_cntl = - S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */ - S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); - - /* Bug workaround for a possible hang with non-tessellation cases. - * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0 - * - * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5 - */ - if ((sscreen->info.family == CHIP_NAVI10 || - sscreen->info.family == CHIP_NAVI12 || - sscreen->info.family == CHIP_NAVI14) && - (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */ - shader->ngg.hw_max_esverts != 256) { - shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; - - if (shader->ngg.hw_max_esverts > 5) { - shader->ge_cntl |= - S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); - } - } - } - - if (window_space) { - shader->ctx_reg.ngg.pa_cl_vte_cntl = - S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1); - } else { - shader->ctx_reg.ngg.pa_cl_vte_cntl = - S_028818_VTX_W0_FMT(1) | - S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | - S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | - S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); - } + const struct si_shader_selector *gs_sel = shader->selector; + const struct si_shader_info *gs_info = &gs_sel->info; + enum pipe_shader_type gs_type = shader->selector->type; + const struct si_shader_selector *es_sel = + shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector; + const struct si_shader_info *es_info = &es_sel->info; + enum pipe_shader_type es_type = es_sel->type; + unsigned num_user_sgprs; + unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt; + uint64_t va; + unsigned window_space = gs_info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid; + unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1); + unsigned input_prim = si_get_input_prim(gs_sel); + bool break_wave_at_eoi = false; + struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + if (es_type == PIPE_SHADER_TESS_EVAL) { + pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_tess_gs + : gfx10_emit_shader_ngg_tess_nogs; + } else { + pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? 
gfx10_emit_shader_ngg_notess_gs + : gfx10_emit_shader_ngg_notess_nogs; + } + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + + if (es_type == PIPE_SHADER_VERTEX) { + es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); + + if (es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { + num_user_sgprs = + SI_SGPR_VS_BLIT_DATA + es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; + } else { + num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); + } + } else { + assert(es_type == PIPE_SHADER_TESS_EVAL); + es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2; + num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; + + if (es_enable_prim_id || gs_info->uses_primid) + break_wave_at_eoi = true; + } + + /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and + * VGPR[0:4] are always loaded. + * + * Vertex shaders always need to load VGPR3, because they need to + * pass edge flags for decomposed primitives (such as quads) to the PA + * for the GL_LINE polygon mode to skip rendering lines on inner edges. + */ + if (gs_info->uses_invocationid || + (gs_type == PIPE_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader))) + gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */ + else if ((gs_type == PIPE_SHADER_GEOMETRY && gs_info->uses_primid) || + (gs_type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)) + gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ + else if (input_prim >= PIPE_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader)) + gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ + else + gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ + + si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40); + si_pm4_set_reg( + pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, + S_00B228_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) | + S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) | + S_00B228_MEM_ORDERED(1) | S_00B228_WGP_MODE(1) | + S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt)); + si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, + S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) | + S_00B22C_USER_SGPR(num_user_sgprs) | + S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | + S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) | + S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) | + S_00B22C_LDS_SIZE(shader->config.lds_size)); + + /* Determine LATE_ALLOC_GS. */ + unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; + unsigned late_alloc_wave64; /* The limit is per SH. */ + + /* For Wave32, the hw will launch twice the number of late + * alloc waves, so 1 == 2x wave32. + * + * Don't use late alloc for NGG on Navi14 due to a hw bug. + */ + if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc) + late_alloc_wave64 = 0; + else if (num_cu_per_sh <= 6) + late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */ + else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) + late_alloc_wave64 = (num_cu_per_sh - 2) * 6; + else + late_alloc_wave64 = (num_cu_per_sh - 2) * 4; + + /* Limit LATE_ALLOC_GS for prevent a hang (hw bug). 
*/ + if (sscreen->info.family == CHIP_NAVI10 || sscreen->info.family == CHIP_NAVI12 || + sscreen->info.family == CHIP_NAVI14) + late_alloc_wave64 = MIN2(late_alloc_wave64, 64); + + si_pm4_set_reg( + pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); + + nparams = MAX2(shader->info.nr_param_exports, 1); + shader->ctx_reg.ngg.spi_vs_out_config = + S_0286C4_VS_EXPORT_COUNT(nparams - 1) | + S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0); + + shader->ctx_reg.ngg.spi_shader_idx_format = + S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP); + shader->ctx_reg.ngg.spi_shader_pos_format = + S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | + S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE) | + S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE) | + S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE); + + shader->ctx_reg.ngg.vgt_primitiveid_en = + S_028A84_PRIMITIVEID_EN(es_enable_prim_id) | + S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.mono.u.vs_export_prim_id || + gs_sel->info.writes_primid); + + if (gs_type == PIPE_SHADER_GEOMETRY) { + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4; + shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->gs_max_out_vertices; + } else { + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1; + } + + if (es_type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, es_sel, pm4); + + shader->ctx_reg.ngg.vgt_gs_onchip_cntl = + S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) | + S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations); + shader->ctx_reg.ngg.ge_max_output_per_subgroup = + S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts); + shader->ctx_reg.ngg.ge_ngg_subgrp_cntl = S_028B4C_PRIM_AMP_FACTOR(shader->ngg.prim_amp_factor) | + S_028B4C_THDS_PER_SUBGRP(0); /* for fast launch */ + shader->ctx_reg.ngg.vgt_gs_instance_cnt = + S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) | + S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.max_vert_out_per_gs_instance); + + /* Always output hw-generated edge flags and pass them via the prim + * export to prevent drawing lines on internal edges of decomposed + * primitives (such as quads) with polygon mode = lines. Only VS needs + * this. + */ + shader->ctx_reg.ngg.pa_cl_ngg_cntl = + S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX); + shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(gs_sel, true); + + /* Oversubscribe PC. This improves performance when there are too many varyings. */ + float oversub_pc_factor = 0.25; + + if (shader->key.opt.ngg_culling) { + /* Be more aggressive with NGG culling. 
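A worked example of the LATE_ALLOC_GS limit computed here, using an assumed 10 CUs per SH on a Navi1x-class part (both numbers are illustrative):

#include <stdio.h>

int main(void)
{
   unsigned num_cu_per_sh = 10;   /* assumed */
   int is_navi14 = 0, use_late_alloc = 1, fast_launch_all = 0, is_navi1x = 1;
   unsigned late_alloc_wave64;

   if (is_navi14 || !use_late_alloc)
      late_alloc_wave64 = 0;                        /* disabled / hw bug */
   else if (num_cu_per_sh <= 6)
      late_alloc_wave64 = num_cu_per_sh - 2;        /* all CUs enabled */
   else if (fast_launch_all)
      late_alloc_wave64 = (num_cu_per_sh - 2) * 6;  /* 48 */
   else
      late_alloc_wave64 = (num_cu_per_sh - 2) * 4;  /* 32 */

   if (is_navi1x && late_alloc_wave64 > 64)
      late_alloc_wave64 = 64;                       /* cap to avoid a hang */

   printf("LATE_ALLOC_GS = %u wave64 slots per SH\n", late_alloc_wave64); /* 32 */
   return 0;
}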
*/ + if (shader->info.nr_param_exports > 4) + oversub_pc_factor = 1; + else if (shader->info.nr_param_exports > 2) + oversub_pc_factor = 0.75; + else + oversub_pc_factor = 0.5; + } + + unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor; + shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) | + S_030980_NUM_PC_LINES(oversub_pc_lines - 1); + + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { + shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3); + } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { + shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2); + } else { + shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */ + S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); + + /* Bug workaround for a possible hang with non-tessellation cases. + * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0 + * + * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5 + */ + if ((sscreen->info.family == CHIP_NAVI10 || sscreen->info.family == CHIP_NAVI12 || + sscreen->info.family == CHIP_NAVI14) && + (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */ + shader->ngg.hw_max_esverts != 256) { + shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; + + if (shader->ngg.hw_max_esverts > 5) { + shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); + } + } + } + + if (window_space) { + shader->ctx_reg.ngg.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1); + } else { + shader->ctx_reg.ngg.pa_cl_vte_cntl = + S_028818_VTX_W0_FMT(1) | S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | + S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | + S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); + } } static void si_emit_shader_vs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.vs->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - - if (!shader) - return; - - radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE, - SI_TRACKED_VGT_GS_MODE, - shader->ctx_reg.vs.vgt_gs_mode); - radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, - SI_TRACKED_VGT_PRIMITIVEID_EN, - shader->ctx_reg.vs.vgt_primitiveid_en); - - if (sctx->chip_class <= GFX8) { - radeon_opt_set_context_reg(sctx, R_028AB4_VGT_REUSE_OFF, - SI_TRACKED_VGT_REUSE_OFF, - shader->ctx_reg.vs.vgt_reuse_off); - } - - radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, - SI_TRACKED_SPI_VS_OUT_CONFIG, - shader->ctx_reg.vs.spi_vs_out_config); - - radeon_opt_set_context_reg(sctx, R_02870C_SPI_SHADER_POS_FORMAT, - SI_TRACKED_SPI_SHADER_POS_FORMAT, - shader->ctx_reg.vs.spi_shader_pos_format); - - radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, - SI_TRACKED_PA_CL_VTE_CNTL, - shader->ctx_reg.vs.pa_cl_vte_cntl); - - if (shader->selector->type == PIPE_SHADER_TESS_EVAL) - radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, - SI_TRACKED_VGT_TF_PARAM, - shader->vgt_tf_param); - - if (shader->vgt_vertex_reuse_block_cntl) - radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, - SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, - shader->vgt_vertex_reuse_block_cntl); - - /* Required programming for tessellation. 
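The parameter-cache oversubscription scales pc_lines by a factor that grows with the number of param exports when NGG culling is enabled. A worked sketch; pc_lines and the export count are assumed values:

#include <stdio.h>

int main(void)
{
   unsigned pc_lines = 1024;        /* assumed screen capability */
   unsigned nr_param_exports = 3;   /* assumed shader output count */
   int ngg_culling = 1;
   float factor = 0.25f;            /* default oversubscription */

   if (ngg_culling) {
      if (nr_param_exports > 4)
         factor = 1.0f;
      else if (nr_param_exports > 2)
         factor = 0.75f;
      else
         factor = 0.5f;
   }

   unsigned oversub_pc_lines = pc_lines * factor;                   /* 768 */
   printf("GE_PC_ALLOC.NUM_PC_LINES = %u\n", oversub_pc_lines - 1); /* 767 */
   return 0;
}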
(legacy pipeline only) */ - if (sctx->chip_class == GFX10 && - shader->selector->type == PIPE_SHADER_TESS_EVAL) { - radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, - SI_TRACKED_VGT_GS_ONCHIP_CNTL, - S_028A44_ES_VERTS_PER_SUBGRP(250) | - S_028A44_GS_PRIMS_PER_SUBGRP(126) | - S_028A44_GS_INST_PRIMS_IN_SUBGRP(126)); - } - - if (sctx->chip_class >= GFX10) { - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, - shader->pa_cl_vs_out_cntl, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); - } - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; - - /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ - if (sctx->chip_class >= GFX10) - gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc); + struct si_shader *shader = sctx->queued.named.vs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + + if (!shader) + return; + + radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE, SI_TRACKED_VGT_GS_MODE, + shader->ctx_reg.vs.vgt_gs_mode); + radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN, + shader->ctx_reg.vs.vgt_primitiveid_en); + + if (sctx->chip_class <= GFX8) { + radeon_opt_set_context_reg(sctx, R_028AB4_VGT_REUSE_OFF, SI_TRACKED_VGT_REUSE_OFF, + shader->ctx_reg.vs.vgt_reuse_off); + } + + radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG, + shader->ctx_reg.vs.spi_vs_out_config); + + radeon_opt_set_context_reg(sctx, R_02870C_SPI_SHADER_POS_FORMAT, + SI_TRACKED_SPI_SHADER_POS_FORMAT, + shader->ctx_reg.vs.spi_shader_pos_format); + + radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL, + shader->ctx_reg.vs.pa_cl_vte_cntl); + + if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); + + if (shader->vgt_vertex_reuse_block_cntl) + radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, + SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + shader->vgt_vertex_reuse_block_cntl); + + /* Required programming for tessellation. (legacy pipeline only) */ + if (sctx->chip_class == GFX10 && shader->selector->type == PIPE_SHADER_TESS_EVAL) { + radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL, + S_028A44_ES_VERTS_PER_SUBGRP(250) | + S_028A44_GS_PRIMS_PER_SUBGRP(126) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(126)); + } + + if (sctx->chip_class >= GFX10) { + radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl, + SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); + } + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; + + /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. 
*/ + if (sctx->chip_class >= GFX10) + gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc); } /** @@ -1447,827 +1346,757 @@ static void si_emit_shader_vs(struct si_context *sctx) static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, struct si_shader_selector *gs) { - const struct si_shader_info *info = &shader->selector->info; - struct si_pm4_state *pm4; - unsigned num_user_sgprs, vgpr_comp_cnt; - uint64_t va; - unsigned nparams, oc_lds_en; - unsigned window_space = - info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; - bool enable_prim_id = shader->key.mono.u.vs_export_prim_id || info->uses_primid; - - pm4 = si_get_shader_pm4_state(shader); - if (!pm4) - return; - - pm4->atom.emit = si_emit_shader_vs; - - /* We always write VGT_GS_MODE in the VS state, because every switch - * between different shader pipelines involving a different GS or no - * GS at all involves a switch of the VS (different GS use different - * copy shaders). On the other hand, when the API switches from a GS to - * no GS and then back to the same GS used originally, the GS state is - * not sent again. - */ - if (!gs) { - unsigned mode = V_028A40_GS_OFF; - - /* PrimID needs GS scenario A. */ - if (enable_prim_id) - mode = V_028A40_GS_SCENARIO_A; - - shader->ctx_reg.vs.vgt_gs_mode = S_028A40_MODE(mode); - shader->ctx_reg.vs.vgt_primitiveid_en = enable_prim_id; - } else { - shader->ctx_reg.vs.vgt_gs_mode = ac_vgt_gs_mode(gs->gs_max_out_vertices, - sscreen->info.chip_class); - shader->ctx_reg.vs.vgt_primitiveid_en = 0; - } - - if (sscreen->info.chip_class <= GFX8) { - /* Reuse needs to be set off if we write oViewport. */ - shader->ctx_reg.vs.vgt_reuse_off = - S_028AB4_REUSE_OFF(info->writes_viewport_index); - } - - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - - if (gs) { - vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */ - num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR; - } else if (shader->selector->type == PIPE_SHADER_VERTEX) { - vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, enable_prim_id); - - if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { - num_user_sgprs = SI_SGPR_VS_BLIT_DATA + - info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; - } else { - num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); - } - } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { - vgpr_comp_cnt = enable_prim_id ? 3 : 2; - num_user_sgprs = SI_TES_NUM_USER_SGPR; - } else - unreachable("invalid shader selector type"); - - /* VS is required to export at least one param. */ - nparams = MAX2(shader->info.nr_param_exports, 1); - shader->ctx_reg.vs.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1); - - if (sscreen->info.chip_class >= GFX10) { - shader->ctx_reg.vs.spi_vs_out_config |= - S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0); - } - - shader->ctx_reg.vs.spi_shader_pos_format = - S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | - S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? - V_02870C_SPI_SHADER_4COMP : - V_02870C_SPI_SHADER_NONE) | - S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? - V_02870C_SPI_SHADER_4COMP : - V_02870C_SPI_SHADER_NONE) | - S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? 
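The SPI_SHADER_POS_FORMAT programming in si_shader_vs() exports POS0 unconditionally and enables POS1..POS3 only when the shader actually produces that many position exports. A compact sketch of the selection; the format codes below are stand-ins for the V_02870C_* values:

enum { POS_FMT_NONE = 0, POS_FMT_4COMP = 4 };

static void pos_formats(unsigned nr_pos_exports, unsigned fmt[4])
{
   fmt[0] = POS_FMT_4COMP;  /* the position itself is always exported */
   for (unsigned i = 1; i < 4; i++)
      fmt[i] = nr_pos_exports > i ? POS_FMT_4COMP : POS_FMT_NONE;
}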
- V_02870C_SPI_SHADER_4COMP : - V_02870C_SPI_SHADER_NONE); - shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) | - S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1); - shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, false); - - oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0; - - si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8); - si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, S_00B124_MEM_BASE(va >> 40)); - - uint32_t rsrc1 = S_00B128_VGPRS((shader->config.num_vgprs - 1) / - (sscreen->ge_wave_size == 32 ? 8 : 4)) | - S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) | - S_00B128_DX10_CLAMP(1) | - S_00B128_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | - S_00B128_FLOAT_MODE(shader->config.float_mode); - uint32_t rsrc2 = S_00B12C_USER_SGPR(num_user_sgprs) | - S_00B12C_OC_LDS_EN(oc_lds_en) | - S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); - - if (sscreen->info.chip_class >= GFX10) - rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); - else if (sscreen->info.chip_class == GFX9) - rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); - - if (sscreen->info.chip_class <= GFX9) - rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8); - - if (!sscreen->use_ngg_streamout) { - rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) | - S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) | - S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) | - S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) | - S_00B12C_SO_EN(!!shader->selector->so.num_outputs); - } - - si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1); - si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2); - - if (window_space) - shader->ctx_reg.vs.pa_cl_vte_cntl = - S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1); - else - shader->ctx_reg.vs.pa_cl_vte_cntl = - S_028818_VTX_W0_FMT(1) | - S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | - S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | - S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); - - if (shader->selector->type == PIPE_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->selector, pm4); - - polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); + const struct si_shader_info *info = &shader->selector->info; + struct si_pm4_state *pm4; + unsigned num_user_sgprs, vgpr_comp_cnt; + uint64_t va; + unsigned nparams, oc_lds_en; + unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + bool enable_prim_id = shader->key.mono.u.vs_export_prim_id || info->uses_primid; + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + pm4->atom.emit = si_emit_shader_vs; + + /* We always write VGT_GS_MODE in the VS state, because every switch + * between different shader pipelines involving a different GS or no + * GS at all involves a switch of the VS (different GS use different + * copy shaders). On the other hand, when the API switches from a GS to + * no GS and then back to the same GS used originally, the GS state is + * not sent again. + */ + if (!gs) { + unsigned mode = V_028A40_GS_OFF; + + /* PrimID needs GS scenario A. 
*/ + if (enable_prim_id) + mode = V_028A40_GS_SCENARIO_A; + + shader->ctx_reg.vs.vgt_gs_mode = S_028A40_MODE(mode); + shader->ctx_reg.vs.vgt_primitiveid_en = enable_prim_id; + } else { + shader->ctx_reg.vs.vgt_gs_mode = + ac_vgt_gs_mode(gs->gs_max_out_vertices, sscreen->info.chip_class); + shader->ctx_reg.vs.vgt_primitiveid_en = 0; + } + + if (sscreen->info.chip_class <= GFX8) { + /* Reuse needs to be set off if we write oViewport. */ + shader->ctx_reg.vs.vgt_reuse_off = S_028AB4_REUSE_OFF(info->writes_viewport_index); + } + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + + if (gs) { + vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */ + num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR; + } else if (shader->selector->type == PIPE_SHADER_VERTEX) { + vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, enable_prim_id); + + if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { + num_user_sgprs = SI_SGPR_VS_BLIT_DATA + info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; + } else { + num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); + } + } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { + vgpr_comp_cnt = enable_prim_id ? 3 : 2; + num_user_sgprs = SI_TES_NUM_USER_SGPR; + } else + unreachable("invalid shader selector type"); + + /* VS is required to export at least one param. */ + nparams = MAX2(shader->info.nr_param_exports, 1); + shader->ctx_reg.vs.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1); + + if (sscreen->info.chip_class >= GFX10) { + shader->ctx_reg.vs.spi_vs_out_config |= + S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0); + } + + shader->ctx_reg.vs.spi_shader_pos_format = + S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | + S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE) | + S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE) | + S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE); + shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) | + S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1); + shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, false); + + oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0; + + si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8); + si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, S_00B124_MEM_BASE(va >> 40)); + + uint32_t rsrc1 = + S_00B128_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 
8 : 4)) | + S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B128_DX10_CLAMP(1) | + S_00B128_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | + S_00B128_FLOAT_MODE(shader->config.float_mode); + uint32_t rsrc2 = S_00B12C_USER_SGPR(num_user_sgprs) | S_00B12C_OC_LDS_EN(oc_lds_en) | + S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + + if (sscreen->info.chip_class >= GFX10) + rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); + else if (sscreen->info.chip_class == GFX9) + rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); + + if (sscreen->info.chip_class <= GFX9) + rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8); + + if (!sscreen->use_ngg_streamout) { + rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) | + S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) | + S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) | + S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) | + S_00B12C_SO_EN(!!shader->selector->so.num_outputs); + } + + si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1); + si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2); + + if (window_space) + shader->ctx_reg.vs.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1); + else + shader->ctx_reg.vs.pa_cl_vte_cntl = + S_028818_VTX_W0_FMT(1) | S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | + S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | + S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); + + if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, shader->selector, pm4); + + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); } static unsigned si_get_ps_num_interp(struct si_shader *ps) { - struct si_shader_info *info = &ps->selector->info; - unsigned num_colors = !!(info->colors_read & 0x0f) + - !!(info->colors_read & 0xf0); - unsigned num_interp = ps->selector->info.num_inputs + - (ps->key.part.ps.prolog.color_two_side ? num_colors : 0); - - assert(num_interp <= 32); - return MIN2(num_interp, 32); + struct si_shader_info *info = &ps->selector->info; + unsigned num_colors = !!(info->colors_read & 0x0f) + !!(info->colors_read & 0xf0); + unsigned num_interp = + ps->selector->info.num_inputs + (ps->key.part.ps.prolog.color_two_side ? num_colors : 0); + + assert(num_interp <= 32); + return MIN2(num_interp, 32); } static unsigned si_get_spi_shader_col_format(struct si_shader *shader) { - unsigned value = shader->key.part.ps.epilog.spi_shader_col_format; - unsigned i, num_targets = (util_last_bit(value) + 3) / 4; + unsigned value = shader->key.part.ps.epilog.spi_shader_col_format; + unsigned i, num_targets = (util_last_bit(value) + 3) / 4; - /* If the i-th target format is set, all previous target formats must - * be non-zero to avoid hangs. - */ - for (i = 0; i < num_targets; i++) - if (!(value & (0xf << (i * 4)))) - value |= V_028714_SPI_SHADER_32_R << (i * 4); + /* If the i-th target format is set, all previous target formats must + * be non-zero to avoid hangs. 
+ */ + for (i = 0; i < num_targets; i++) + if (!(value & (0xf << (i * 4)))) + value |= V_028714_SPI_SHADER_32_R << (i * 4); - return value; + return value; } static void si_emit_shader_ps(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.ps->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - - if (!shader) - return; - - /* R_0286CC_SPI_PS_INPUT_ENA, R_0286D0_SPI_PS_INPUT_ADDR*/ - radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA, - SI_TRACKED_SPI_PS_INPUT_ENA, - shader->ctx_reg.ps.spi_ps_input_ena, - shader->ctx_reg.ps.spi_ps_input_addr); - - radeon_opt_set_context_reg(sctx, R_0286E0_SPI_BARYC_CNTL, - SI_TRACKED_SPI_BARYC_CNTL, - shader->ctx_reg.ps.spi_baryc_cntl); - radeon_opt_set_context_reg(sctx, R_0286D8_SPI_PS_IN_CONTROL, - SI_TRACKED_SPI_PS_IN_CONTROL, - shader->ctx_reg.ps.spi_ps_in_control); - - /* R_028710_SPI_SHADER_Z_FORMAT, R_028714_SPI_SHADER_COL_FORMAT */ - radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT, - SI_TRACKED_SPI_SHADER_Z_FORMAT, - shader->ctx_reg.ps.spi_shader_z_format, - shader->ctx_reg.ps.spi_shader_col_format); - - radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK, - SI_TRACKED_CB_SHADER_MASK, - shader->ctx_reg.ps.cb_shader_mask); - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; -} + struct si_shader *shader = sctx->queued.named.ps->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; -static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) -{ - struct si_shader_info *info = &shader->selector->info; - struct si_pm4_state *pm4; - unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask; - unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1); - uint64_t va; - unsigned input_ena = shader->config.spi_ps_input_ena; - - /* we need to enable at least one of them, otherwise we hang the GPU */ - assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) || - G_0286CC_PERSP_CENTER_ENA(input_ena) || - G_0286CC_PERSP_CENTROID_ENA(input_ena) || - G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) || - G_0286CC_LINEAR_SAMPLE_ENA(input_ena) || - G_0286CC_LINEAR_CENTER_ENA(input_ena) || - G_0286CC_LINEAR_CENTROID_ENA(input_ena) || - G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena)); - /* POS_W_FLOAT_ENA requires one of the perspective weights. */ - assert(!G_0286CC_POS_W_FLOAT_ENA(input_ena) || - G_0286CC_PERSP_SAMPLE_ENA(input_ena) || - G_0286CC_PERSP_CENTER_ENA(input_ena) || - G_0286CC_PERSP_CENTROID_ENA(input_ena) || - G_0286CC_PERSP_PULL_MODEL_ENA(input_ena)); - - /* Validate interpolation optimization flags (read as implications). 
*/ - assert(!shader->key.part.ps.prolog.bc_optimize_for_persp || - (G_0286CC_PERSP_CENTER_ENA(input_ena) && - G_0286CC_PERSP_CENTROID_ENA(input_ena))); - assert(!shader->key.part.ps.prolog.bc_optimize_for_linear || - (G_0286CC_LINEAR_CENTER_ENA(input_ena) && - G_0286CC_LINEAR_CENTROID_ENA(input_ena))); - assert(!shader->key.part.ps.prolog.force_persp_center_interp || - (!G_0286CC_PERSP_SAMPLE_ENA(input_ena) && - !G_0286CC_PERSP_CENTROID_ENA(input_ena))); - assert(!shader->key.part.ps.prolog.force_linear_center_interp || - (!G_0286CC_LINEAR_SAMPLE_ENA(input_ena) && - !G_0286CC_LINEAR_CENTROID_ENA(input_ena))); - assert(!shader->key.part.ps.prolog.force_persp_sample_interp || - (!G_0286CC_PERSP_CENTER_ENA(input_ena) && - !G_0286CC_PERSP_CENTROID_ENA(input_ena))); - assert(!shader->key.part.ps.prolog.force_linear_sample_interp || - (!G_0286CC_LINEAR_CENTER_ENA(input_ena) && - !G_0286CC_LINEAR_CENTROID_ENA(input_ena))); - - /* Validate cases when the optimizations are off (read as implications). */ - assert(shader->key.part.ps.prolog.bc_optimize_for_persp || - !G_0286CC_PERSP_CENTER_ENA(input_ena) || - !G_0286CC_PERSP_CENTROID_ENA(input_ena)); - assert(shader->key.part.ps.prolog.bc_optimize_for_linear || - !G_0286CC_LINEAR_CENTER_ENA(input_ena) || - !G_0286CC_LINEAR_CENTROID_ENA(input_ena)); - - pm4 = si_get_shader_pm4_state(shader); - if (!pm4) - return; - - pm4->atom.emit = si_emit_shader_ps; - - /* SPI_BARYC_CNTL.POS_FLOAT_LOCATION - * Possible vaules: - * 0 -> Position = pixel center - * 1 -> Position = pixel centroid - * 2 -> Position = at sample position - * - * From GLSL 4.5 specification, section 7.1: - * "The variable gl_FragCoord is available as an input variable from - * within fragment shaders and it holds the window relative coordinates - * (x, y, z, 1/w) values for the fragment. If multi-sampling, this - * value can be for any location within the pixel, or one of the - * fragment samples. The use of centroid does not further restrict - * this value to be inside the current primitive." - * - * Meaning that centroid has no effect and we can return anything within - * the pixel. Thus, return the value at sample position, because that's - * the most accurate one shaders can get. - */ - spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2); - - if (info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] == - TGSI_FS_COORD_PIXEL_CENTER_INTEGER) - spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1); - - spi_shader_col_format = si_get_spi_shader_col_format(shader); - cb_shader_mask = ac_get_cb_shader_mask(spi_shader_col_format); - - /* Ensure that some export memory is always allocated, for two reasons: - * - * 1) Correctness: The hardware ignores the EXEC mask if no export - * memory is allocated, so KILL and alpha test do not work correctly - * without this. - * 2) Performance: Every shader needs at least a NULL export, even when - * it writes no color/depth output. The NULL export instruction - * stalls without this setting. - * - * Don't add this to CB_SHADER_MASK. - * - * GFX10 supports pixel shaders without exports by setting both - * the color and Z formats to SPI_SHADER_ZERO. The hw will skip export - * instructions if any are present. 
- */ - if ((sscreen->info.chip_class <= GFX9 || - info->uses_kill || - shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) && - !spi_shader_col_format && - !info->writes_z && !info->writes_stencil && !info->writes_samplemask) - spi_shader_col_format = V_028714_SPI_SHADER_32_R; - - shader->ctx_reg.ps.spi_ps_input_ena = input_ena; - shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr; - - /* Set interpolation controls. */ - spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) | - S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32); - - shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl; - shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control; - shader->ctx_reg.ps.spi_shader_z_format = - ac_get_spi_shader_z_format(info->writes_z, - info->writes_stencil, - info->writes_samplemask); - shader->ctx_reg.ps.spi_shader_col_format = spi_shader_col_format; - shader->ctx_reg.ps.cb_shader_mask = cb_shader_mask; - - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8); - si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(va >> 40)); - - uint32_t rsrc1 = - S_00B028_VGPRS((shader->config.num_vgprs - 1) / - (sscreen->ps_wave_size == 32 ? 8 : 4)) | - S_00B028_DX10_CLAMP(1) | - S_00B028_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | - S_00B028_FLOAT_MODE(shader->config.float_mode); - - if (sscreen->info.chip_class < GFX10) { - rsrc1 |= S_00B028_SGPRS((shader->config.num_sgprs - 1) / 8); - } - - si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS, rsrc1); - si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS, - S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) | - S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) | - S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); + if (!shader) + return; + + /* R_0286CC_SPI_PS_INPUT_ENA, R_0286D0_SPI_PS_INPUT_ADDR*/ + radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA, SI_TRACKED_SPI_PS_INPUT_ENA, + shader->ctx_reg.ps.spi_ps_input_ena, + shader->ctx_reg.ps.spi_ps_input_addr); + + radeon_opt_set_context_reg(sctx, R_0286E0_SPI_BARYC_CNTL, SI_TRACKED_SPI_BARYC_CNTL, + shader->ctx_reg.ps.spi_baryc_cntl); + radeon_opt_set_context_reg(sctx, R_0286D8_SPI_PS_IN_CONTROL, SI_TRACKED_SPI_PS_IN_CONTROL, + shader->ctx_reg.ps.spi_ps_in_control); + + /* R_028710_SPI_SHADER_Z_FORMAT, R_028714_SPI_SHADER_COL_FORMAT */ + radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT, SI_TRACKED_SPI_SHADER_Z_FORMAT, + shader->ctx_reg.ps.spi_shader_z_format, + shader->ctx_reg.ps.spi_shader_col_format); + + radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK, SI_TRACKED_CB_SHADER_MASK, + shader->ctx_reg.ps.cb_shader_mask); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; } -static void si_shader_init_pm4_state(struct si_screen *sscreen, - struct si_shader *shader) +static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) { - switch (shader->selector->type) { - case PIPE_SHADER_VERTEX: - if (shader->key.as_ls) - si_shader_ls(sscreen, shader); - else if (shader->key.as_es) - si_shader_es(sscreen, shader); - else if (shader->key.as_ngg) - gfx10_shader_ngg(sscreen, shader); - else - si_shader_vs(sscreen, shader, NULL); - break; - case PIPE_SHADER_TESS_CTRL: - si_shader_hs(sscreen, shader); - break; - case PIPE_SHADER_TESS_EVAL: - if (shader->key.as_es) - si_shader_es(sscreen, shader); - else if (shader->key.as_ngg) - gfx10_shader_ngg(sscreen, 
shader); - else - si_shader_vs(sscreen, shader, NULL); - break; - case PIPE_SHADER_GEOMETRY: - if (shader->key.as_ngg) - gfx10_shader_ngg(sscreen, shader); - else - si_shader_gs(sscreen, shader); - break; - case PIPE_SHADER_FRAGMENT: - si_shader_ps(sscreen, shader); - break; - default: - assert(0); - } + struct si_shader_info *info = &shader->selector->info; + struct si_pm4_state *pm4; + unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask; + unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1); + uint64_t va; + unsigned input_ena = shader->config.spi_ps_input_ena; + + /* we need to enable at least one of them, otherwise we hang the GPU */ + assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) || G_0286CC_PERSP_CENTER_ENA(input_ena) || + G_0286CC_PERSP_CENTROID_ENA(input_ena) || G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) || + G_0286CC_LINEAR_SAMPLE_ENA(input_ena) || G_0286CC_LINEAR_CENTER_ENA(input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(input_ena) || G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena)); + /* POS_W_FLOAT_ENA requires one of the perspective weights. */ + assert(!G_0286CC_POS_W_FLOAT_ENA(input_ena) || G_0286CC_PERSP_SAMPLE_ENA(input_ena) || + G_0286CC_PERSP_CENTER_ENA(input_ena) || G_0286CC_PERSP_CENTROID_ENA(input_ena) || + G_0286CC_PERSP_PULL_MODEL_ENA(input_ena)); + + /* Validate interpolation optimization flags (read as implications). */ + assert(!shader->key.part.ps.prolog.bc_optimize_for_persp || + (G_0286CC_PERSP_CENTER_ENA(input_ena) && G_0286CC_PERSP_CENTROID_ENA(input_ena))); + assert(!shader->key.part.ps.prolog.bc_optimize_for_linear || + (G_0286CC_LINEAR_CENTER_ENA(input_ena) && G_0286CC_LINEAR_CENTROID_ENA(input_ena))); + assert(!shader->key.part.ps.prolog.force_persp_center_interp || + (!G_0286CC_PERSP_SAMPLE_ENA(input_ena) && !G_0286CC_PERSP_CENTROID_ENA(input_ena))); + assert(!shader->key.part.ps.prolog.force_linear_center_interp || + (!G_0286CC_LINEAR_SAMPLE_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena))); + assert(!shader->key.part.ps.prolog.force_persp_sample_interp || + (!G_0286CC_PERSP_CENTER_ENA(input_ena) && !G_0286CC_PERSP_CENTROID_ENA(input_ena))); + assert(!shader->key.part.ps.prolog.force_linear_sample_interp || + (!G_0286CC_LINEAR_CENTER_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena))); + + /* Validate cases when the optimizations are off (read as implications). */ + assert(shader->key.part.ps.prolog.bc_optimize_for_persp || + !G_0286CC_PERSP_CENTER_ENA(input_ena) || !G_0286CC_PERSP_CENTROID_ENA(input_ena)); + assert(shader->key.part.ps.prolog.bc_optimize_for_linear || + !G_0286CC_LINEAR_CENTER_ENA(input_ena) || !G_0286CC_LINEAR_CENTROID_ENA(input_ena)); + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + pm4->atom.emit = si_emit_shader_ps; + + /* SPI_BARYC_CNTL.POS_FLOAT_LOCATION + * Possible vaules: + * 0 -> Position = pixel center + * 1 -> Position = pixel centroid + * 2 -> Position = at sample position + * + * From GLSL 4.5 specification, section 7.1: + * "The variable gl_FragCoord is available as an input variable from + * within fragment shaders and it holds the window relative coordinates + * (x, y, z, 1/w) values for the fragment. If multi-sampling, this + * value can be for any location within the pixel, or one of the + * fragment samples. The use of centroid does not further restrict + * this value to be inside the current primitive." + * + * Meaning that centroid has no effect and we can return anything within + * the pixel. 
Thus, return the value at sample position, because that's + * the most accurate one shaders can get. + */ + spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2); + + if (info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] == TGSI_FS_COORD_PIXEL_CENTER_INTEGER) + spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1); + + spi_shader_col_format = si_get_spi_shader_col_format(shader); + cb_shader_mask = ac_get_cb_shader_mask(spi_shader_col_format); + + /* Ensure that some export memory is always allocated, for two reasons: + * + * 1) Correctness: The hardware ignores the EXEC mask if no export + * memory is allocated, so KILL and alpha test do not work correctly + * without this. + * 2) Performance: Every shader needs at least a NULL export, even when + * it writes no color/depth output. The NULL export instruction + * stalls without this setting. + * + * Don't add this to CB_SHADER_MASK. + * + * GFX10 supports pixel shaders without exports by setting both + * the color and Z formats to SPI_SHADER_ZERO. The hw will skip export + * instructions if any are present. + */ + if ((sscreen->info.chip_class <= GFX9 || info->uses_kill || + shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) && + !spi_shader_col_format && !info->writes_z && !info->writes_stencil && + !info->writes_samplemask) + spi_shader_col_format = V_028714_SPI_SHADER_32_R; + + shader->ctx_reg.ps.spi_ps_input_ena = input_ena; + shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr; + + /* Set interpolation controls. */ + spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) | + S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32); + + shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl; + shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control; + shader->ctx_reg.ps.spi_shader_z_format = + ac_get_spi_shader_z_format(info->writes_z, info->writes_stencil, info->writes_samplemask); + shader->ctx_reg.ps.spi_shader_col_format = spi_shader_col_format; + shader->ctx_reg.ps.cb_shader_mask = cb_shader_mask; + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8); + si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(va >> 40)); + + uint32_t rsrc1 = + S_00B028_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ps_wave_size == 32 ? 
8 : 4)) | + S_00B028_DX10_CLAMP(1) | S_00B028_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | + S_00B028_FLOAT_MODE(shader->config.float_mode); + + if (sscreen->info.chip_class < GFX10) { + rsrc1 |= S_00B028_SGPRS((shader->config.num_sgprs - 1) / 8); + } + + si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS, rsrc1); + si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS, + S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) | + S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) | + S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); +} + +static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader *shader) +{ + switch (shader->selector->type) { + case PIPE_SHADER_VERTEX: + if (shader->key.as_ls) + si_shader_ls(sscreen, shader); + else if (shader->key.as_es) + si_shader_es(sscreen, shader); + else if (shader->key.as_ngg) + gfx10_shader_ngg(sscreen, shader); + else + si_shader_vs(sscreen, shader, NULL); + break; + case PIPE_SHADER_TESS_CTRL: + si_shader_hs(sscreen, shader); + break; + case PIPE_SHADER_TESS_EVAL: + if (shader->key.as_es) + si_shader_es(sscreen, shader); + else if (shader->key.as_ngg) + gfx10_shader_ngg(sscreen, shader); + else + si_shader_vs(sscreen, shader, NULL); + break; + case PIPE_SHADER_GEOMETRY: + if (shader->key.as_ngg) + gfx10_shader_ngg(sscreen, shader); + else + si_shader_gs(sscreen, shader); + break; + case PIPE_SHADER_FRAGMENT: + si_shader_ps(sscreen, shader); + break; + default: + assert(0); + } } static unsigned si_get_alpha_test_func(struct si_context *sctx) { - /* Alpha-test should be disabled if colorbuffer 0 is integer. */ - return sctx->queued.named.dsa->alpha_func; + /* Alpha-test should be disabled if colorbuffer 0 is integer. */ + return sctx->queued.named.dsa->alpha_func; } -void si_shader_selector_key_vs(struct si_context *sctx, - struct si_shader_selector *vs, - struct si_shader_key *key, - struct si_vs_prolog_bits *prolog_key) +void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key) { - if (!sctx->vertex_elements || - vs->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) - return; - - struct si_vertex_elements *elts = sctx->vertex_elements; - - prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one; - prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched; - prolog_key->unpack_instance_id_from_vertex_id = - sctx->prim_discard_cs_instancing; - - /* Prefer a monolithic shader to allow scheduling divisions around - * VBO loads. 
*/ - if (prolog_key->instance_divisor_is_fetched) - key->opt.prefer_mono = 1; - - unsigned count = MIN2(vs->info.num_inputs, elts->count); - unsigned count_mask = (1 << count) - 1; - unsigned fix = elts->fix_fetch_always & count_mask; - unsigned opencode = elts->fix_fetch_opencode & count_mask; - - if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) { - uint32_t mask = elts->fix_fetch_unaligned & count_mask; - while (mask) { - unsigned i = u_bit_scan(&mask); - unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1); - unsigned vbidx = elts->vertex_buffer_index[i]; - struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx]; - unsigned align_mask = (1 << log_hw_load_size) - 1; - if (vb->buffer_offset & align_mask || - vb->stride & align_mask) { - fix |= 1 << i; - opencode |= 1 << i; - } - } - } - - while (fix) { - unsigned i = u_bit_scan(&fix); - key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i]; - } - key->mono.vs_fetch_opencode = opencode; -} + if (!sctx->vertex_elements || vs->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) + return; -static void si_shader_selector_key_hw_vs(struct si_context *sctx, - struct si_shader_selector *vs, - struct si_shader_key *key) -{ - struct si_shader_selector *ps = sctx->ps_shader.cso; - - key->opt.clip_disable = - sctx->queued.named.rasterizer->clip_plane_enable == 0 && - (vs->info.clipdist_writemask || - vs->info.writes_clipvertex) && - !vs->info.culldist_writemask; - - /* Find out if PS is disabled. */ - bool ps_disabled = true; - if (ps) { - bool ps_modifies_zs = ps->info.uses_kill || - ps->info.writes_z || - ps->info.writes_stencil || - ps->info.writes_samplemask || - sctx->queued.named.blend->alpha_to_coverage || - si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS; - unsigned ps_colormask = si_get_total_colormask(sctx); - - ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard || - (!ps_colormask && - !ps_modifies_zs && - !ps->info.writes_memory); - } - - /* Find out which VS outputs aren't used by the PS. */ - uint64_t outputs_written = vs->outputs_written_before_ps; - uint64_t inputs_read = 0; - - /* Ignore outputs that are not passed from VS to PS. */ - outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0, true)) | - (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0, true)) | - (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_CLIPVERTEX, 0, true))); - - if (!ps_disabled) { - inputs_read = ps->inputs_read; - } - - uint64_t linked = outputs_written & inputs_read; - - key->opt.kill_outputs = ~linked & outputs_written; - key->opt.ngg_culling = sctx->ngg_culling; + struct si_vertex_elements *elts = sctx->vertex_elements; + + prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one; + prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched; + prolog_key->unpack_instance_id_from_vertex_id = sctx->prim_discard_cs_instancing; + + /* Prefer a monolithic shader to allow scheduling divisions around + * VBO loads. 
*/ + if (prolog_key->instance_divisor_is_fetched) + key->opt.prefer_mono = 1; + + unsigned count = MIN2(vs->info.num_inputs, elts->count); + unsigned count_mask = (1 << count) - 1; + unsigned fix = elts->fix_fetch_always & count_mask; + unsigned opencode = elts->fix_fetch_opencode & count_mask; + + if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) { + uint32_t mask = elts->fix_fetch_unaligned & count_mask; + while (mask) { + unsigned i = u_bit_scan(&mask); + unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1); + unsigned vbidx = elts->vertex_buffer_index[i]; + struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx]; + unsigned align_mask = (1 << log_hw_load_size) - 1; + if (vb->buffer_offset & align_mask || vb->stride & align_mask) { + fix |= 1 << i; + opencode |= 1 << i; + } + } + } + + while (fix) { + unsigned i = u_bit_scan(&fix); + key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i]; + } + key->mono.vs_fetch_opencode = opencode; } -/* Compute the key for the hw shader variant */ -static inline void si_shader_selector_key(struct pipe_context *ctx, - struct si_shader_selector *sel, - union si_vgt_stages_key stages_key, - struct si_shader_key *key) +static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key) { - struct si_context *sctx = (struct si_context *)ctx; - - memset(key, 0, sizeof(*key)); - - switch (sel->type) { - case PIPE_SHADER_VERTEX: - si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog); - - if (sctx->tes_shader.cso) - key->as_ls = 1; - else if (sctx->gs_shader.cso) { - key->as_es = 1; - key->as_ngg = stages_key.u.ngg; - } else { - key->as_ngg = stages_key.u.ngg; - si_shader_selector_key_hw_vs(sctx, sel, key); - - if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) - key->mono.u.vs_export_prim_id = 1; - } - break; - case PIPE_SHADER_TESS_CTRL: - if (sctx->chip_class >= GFX9) { - si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, - key, &key->part.tcs.ls_prolog); - key->part.tcs.ls = sctx->vs_shader.cso; - - /* When the LS VGPR fix is needed, monolithic shaders - * can: - * - avoid initializing EXEC in both the LS prolog - * and the LS main part when !vs_needs_prolog - * - remove the fixup for unused input VGPRs - */ - key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix; - - /* The LS output / HS input layout can be communicated - * directly instead of via user SGPRs for merged LS-HS. - * The LS VGPR fix prefers this too. 
- */ - key->opt.prefer_mono = 1; - } - - key->part.tcs.epilog.prim_mode = - sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; - key->part.tcs.epilog.invoc0_tess_factors_are_def = - sel->info.tessfactors_are_def_in_all_invocs; - key->part.tcs.epilog.tes_reads_tess_factors = - sctx->tes_shader.cso->info.reads_tess_factors; - - if (sel == sctx->fixed_func_tcs_shader.cso) - key->mono.u.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written; - break; - case PIPE_SHADER_TESS_EVAL: - key->as_ngg = stages_key.u.ngg; - - if (sctx->gs_shader.cso) - key->as_es = 1; - else { - si_shader_selector_key_hw_vs(sctx, sel, key); - - if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) - key->mono.u.vs_export_prim_id = 1; - } - break; - case PIPE_SHADER_GEOMETRY: - if (sctx->chip_class >= GFX9) { - if (sctx->tes_shader.cso) { - key->part.gs.es = sctx->tes_shader.cso; - } else { - si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, - key, &key->part.gs.vs_prolog); - key->part.gs.es = sctx->vs_shader.cso; - key->part.gs.prolog.gfx9_prev_is_vs = 1; - } - - key->as_ngg = stages_key.u.ngg; - - /* Merged ES-GS can have unbalanced wave usage. - * - * ES threads are per-vertex, while GS threads are - * per-primitive. So without any amplification, there - * are fewer GS threads than ES threads, which can result - * in empty (no-op) GS waves. With too much amplification, - * there are more GS threads than ES threads, which - * can result in empty (no-op) ES waves. - * - * Non-monolithic shaders are implemented by setting EXEC - * at the beginning of shader parts, and don't jump to - * the end if EXEC is 0. - * - * Monolithic shaders use conditional blocks, so they can - * jump and skip empty waves of ES or GS. So set this to - * always use optimized variants, which are monolithic. - */ - key->opt.prefer_mono = 1; - } - key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; - break; - case PIPE_SHADER_FRAGMENT: { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_state_blend *blend = sctx->queued.named.blend; - - if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && - sel->info.colors_written == 0x1) - key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; - - /* Select the shader color format based on whether - * blending or alpha are needed. - */ - key->part.ps.epilog.spi_shader_col_format = - (blend->blend_enable_4bit & blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_blend_alpha) | - (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_blend) | - (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_alpha) | - (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format); - key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; - - /* The output for dual source blending should have - * the same format as the first output. - */ - if (blend->dual_src_blend) { - key->part.ps.epilog.spi_shader_col_format |= - (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4; - } - - /* If alpha-to-coverage is enabled, we have to export alpha - * even if there is no color buffer. 
- */ - if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && - blend->alpha_to_coverage) - key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; - - /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs - * to the range supported by the type if a channel has less - * than 16 bits and the export format is 16_ABGR. - */ - if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) { - key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; - key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10; - } - - /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ - if (!key->part.ps.epilog.last_cbuf) { - key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit; - key->part.ps.epilog.color_is_int8 &= sel->info.colors_written; - key->part.ps.epilog.color_is_int10 &= sel->info.colors_written; - } - - bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); - bool is_line = util_prim_is_lines(sctx->current_rast_prim); - - key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; - key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read; - - key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && - rs->multisample_enable; - - key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; - key->part.ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) || - (is_line && rs->line_smooth)) && - sctx->framebuffer.nr_samples <= 1; - key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; - - if (sctx->ps_iter_samples > 1 && - sel->info.reads_samplemask) { - key->part.ps.prolog.samplemask_log_ps_iter = - util_logbase2(sctx->ps_iter_samples); - } - - if (rs->force_persample_interp && - rs->multisample_enable && - sctx->framebuffer.nr_samples > 1 && - sctx->ps_iter_samples > 1) { - key->part.ps.prolog.force_persp_sample_interp = - sel->info.uses_persp_center || - sel->info.uses_persp_centroid; - - key->part.ps.prolog.force_linear_sample_interp = - sel->info.uses_linear_center || - sel->info.uses_linear_centroid; - } else if (rs->multisample_enable && - sctx->framebuffer.nr_samples > 1) { - key->part.ps.prolog.bc_optimize_for_persp = - sel->info.uses_persp_center && - sel->info.uses_persp_centroid; - key->part.ps.prolog.bc_optimize_for_linear = - sel->info.uses_linear_center && - sel->info.uses_linear_centroid; - } else { - /* Make sure SPI doesn't compute more than 1 pair - * of (i,j), which is the optimization here. */ - key->part.ps.prolog.force_persp_center_interp = - sel->info.uses_persp_center + - sel->info.uses_persp_centroid + - sel->info.uses_persp_sample > 1; - - key->part.ps.prolog.force_linear_center_interp = - sel->info.uses_linear_center + - sel->info.uses_linear_centroid + - sel->info.uses_linear_sample > 1; - - if (sel->info.uses_persp_opcode_interp_sample || - sel->info.uses_linear_opcode_interp_sample) - key->mono.u.ps.interpolate_at_sample_force_center = 1; - } - - key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx); - - /* ps_uses_fbfetch is true only if the color buffer is bound. */ - if (sctx->ps_uses_fbfetch && !sctx->blitter->running) { - struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; - struct pipe_resource *tex = cb0->texture; - - /* 1D textures are allocated and used as 2D on GFX9. 
*/ - key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1; - key->mono.u.ps.fbfetch_is_1D = sctx->chip_class != GFX9 && - (tex->target == PIPE_TEXTURE_1D || - tex->target == PIPE_TEXTURE_1D_ARRAY); - key->mono.u.ps.fbfetch_layered = tex->target == PIPE_TEXTURE_1D_ARRAY || - tex->target == PIPE_TEXTURE_2D_ARRAY || - tex->target == PIPE_TEXTURE_CUBE || - tex->target == PIPE_TEXTURE_CUBE_ARRAY || - tex->target == PIPE_TEXTURE_3D; - } - break; - } - default: - assert(0); - } - - if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT))) - memset(&key->opt, 0, sizeof(key->opt)); + struct si_shader_selector *ps = sctx->ps_shader.cso; + + key->opt.clip_disable = sctx->queued.named.rasterizer->clip_plane_enable == 0 && + (vs->info.clipdist_writemask || vs->info.writes_clipvertex) && + !vs->info.culldist_writemask; + + /* Find out if PS is disabled. */ + bool ps_disabled = true; + if (ps) { + bool ps_modifies_zs = ps->info.uses_kill || ps->info.writes_z || ps->info.writes_stencil || + ps->info.writes_samplemask || + sctx->queued.named.blend->alpha_to_coverage || + si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS; + unsigned ps_colormask = si_get_total_colormask(sctx); + + ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard || + (!ps_colormask && !ps_modifies_zs && !ps->info.writes_memory); + } + + /* Find out which VS outputs aren't used by the PS. */ + uint64_t outputs_written = vs->outputs_written_before_ps; + uint64_t inputs_read = 0; + + /* Ignore outputs that are not passed from VS to PS. */ + outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0, true)) | + (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0, true)) | + (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_CLIPVERTEX, 0, true))); + + if (!ps_disabled) { + inputs_read = ps->inputs_read; + } + + uint64_t linked = outputs_written & inputs_read; + + key->opt.kill_outputs = ~linked & outputs_written; + key->opt.ngg_culling = sctx->ngg_culling; } -static void si_build_shader_variant(struct si_shader *shader, - int thread_index, - bool low_priority) -{ - struct si_shader_selector *sel = shader->selector; - struct si_screen *sscreen = sel->screen; - struct ac_llvm_compiler *compiler; - struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug; - - if (thread_index >= 0) { - if (low_priority) { - assert(thread_index < ARRAY_SIZE(sscreen->compiler_lowp)); - compiler = &sscreen->compiler_lowp[thread_index]; - } else { - assert(thread_index < ARRAY_SIZE(sscreen->compiler)); - compiler = &sscreen->compiler[thread_index]; - } - if (!debug->async) - debug = NULL; - } else { - assert(!low_priority); - compiler = shader->compiler_ctx_state.compiler; - } - - if (!compiler->passes) - si_init_compiler(sscreen, compiler); - - if (unlikely(!si_create_shader_variant(sscreen, compiler, shader, debug))) { - PRINT_ERR("Failed to build shader variant (type=%u)\n", - sel->type); - shader->compilation_failed = true; - return; - } - - if (shader->compiler_ctx_state.is_debug_context) { - FILE *f = open_memstream(&shader->shader_log, - &shader->shader_log_size); - if (f) { - si_shader_dump(sscreen, shader, NULL, f, false); - fclose(f); - } - } - - si_shader_init_pm4_state(sscreen, shader); +/* Compute the key for the hw shader variant */ +static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel, + union si_vgt_stages_key stages_key, + struct si_shader_key *key) +{ + struct si_context *sctx = (struct si_context *)ctx; + + memset(key, 0, 
sizeof(*key)); + + switch (sel->type) { + case PIPE_SHADER_VERTEX: + si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog); + + if (sctx->tes_shader.cso) + key->as_ls = 1; + else if (sctx->gs_shader.cso) { + key->as_es = 1; + key->as_ngg = stages_key.u.ngg; + } else { + key->as_ngg = stages_key.u.ngg; + si_shader_selector_key_hw_vs(sctx, sel, key); + + if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) + key->mono.u.vs_export_prim_id = 1; + } + break; + case PIPE_SHADER_TESS_CTRL: + if (sctx->chip_class >= GFX9) { + si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, key, &key->part.tcs.ls_prolog); + key->part.tcs.ls = sctx->vs_shader.cso; + + /* When the LS VGPR fix is needed, monolithic shaders + * can: + * - avoid initializing EXEC in both the LS prolog + * and the LS main part when !vs_needs_prolog + * - remove the fixup for unused input VGPRs + */ + key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix; + + /* The LS output / HS input layout can be communicated + * directly instead of via user SGPRs for merged LS-HS. + * The LS VGPR fix prefers this too. + */ + key->opt.prefer_mono = 1; + } + + key->part.tcs.epilog.prim_mode = + sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; + key->part.tcs.epilog.invoc0_tess_factors_are_def = + sel->info.tessfactors_are_def_in_all_invocs; + key->part.tcs.epilog.tes_reads_tess_factors = sctx->tes_shader.cso->info.reads_tess_factors; + + if (sel == sctx->fixed_func_tcs_shader.cso) + key->mono.u.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written; + break; + case PIPE_SHADER_TESS_EVAL: + key->as_ngg = stages_key.u.ngg; + + if (sctx->gs_shader.cso) + key->as_es = 1; + else { + si_shader_selector_key_hw_vs(sctx, sel, key); + + if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) + key->mono.u.vs_export_prim_id = 1; + } + break; + case PIPE_SHADER_GEOMETRY: + if (sctx->chip_class >= GFX9) { + if (sctx->tes_shader.cso) { + key->part.gs.es = sctx->tes_shader.cso; + } else { + si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, key, &key->part.gs.vs_prolog); + key->part.gs.es = sctx->vs_shader.cso; + key->part.gs.prolog.gfx9_prev_is_vs = 1; + } + + key->as_ngg = stages_key.u.ngg; + + /* Merged ES-GS can have unbalanced wave usage. + * + * ES threads are per-vertex, while GS threads are + * per-primitive. So without any amplification, there + * are fewer GS threads than ES threads, which can result + * in empty (no-op) GS waves. With too much amplification, + * there are more GS threads than ES threads, which + * can result in empty (no-op) ES waves. + * + * Non-monolithic shaders are implemented by setting EXEC + * at the beginning of shader parts, and don't jump to + * the end if EXEC is 0. + * + * Monolithic shaders use conditional blocks, so they can + * jump and skip empty waves of ES or GS. So set this to + * always use optimized variants, which are monolithic. + */ + key->opt.prefer_mono = 1; + } + key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; + break; + case PIPE_SHADER_FRAGMENT: { + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_state_blend *blend = sctx->queued.named.blend; + + if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && + sel->info.colors_written == 0x1) + key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; + + /* Select the shader color format based on whether + * blending or alpha are needed. 
+ */ + key->part.ps.epilog.spi_shader_col_format = + (blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend_alpha) | + (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend) | + (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_alpha) | + (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format); + key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; + + /* The output for dual source blending should have + * the same format as the first output. + */ + if (blend->dual_src_blend) { + key->part.ps.epilog.spi_shader_col_format |= + (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4; + } + + /* If alpha-to-coverage is enabled, we have to export alpha + * even if there is no color buffer. + */ + if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage) + key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; + + /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs + * to the range supported by the type if a channel has less + * than 16 bits and the export format is 16_ABGR. + */ + if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) { + key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; + key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10; + } + + /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ + if (!key->part.ps.epilog.last_cbuf) { + key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit; + key->part.ps.epilog.color_is_int8 &= sel->info.colors_written; + key->part.ps.epilog.color_is_int10 &= sel->info.colors_written; + } + + bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); + bool is_line = util_prim_is_lines(sctx->current_rast_prim); + + key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; + key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read; + + key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; + + key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; + key->part.ps.epilog.poly_line_smoothing = + ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && + sctx->framebuffer.nr_samples <= 1; + key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; + + if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) { + key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples); + } + + if (rs->force_persample_interp && rs->multisample_enable && + sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) { + key->part.ps.prolog.force_persp_sample_interp = + sel->info.uses_persp_center || sel->info.uses_persp_centroid; + + key->part.ps.prolog.force_linear_sample_interp = + sel->info.uses_linear_center || sel->info.uses_linear_centroid; + } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) { + key->part.ps.prolog.bc_optimize_for_persp = + sel->info.uses_persp_center && sel->info.uses_persp_centroid; + key->part.ps.prolog.bc_optimize_for_linear = + sel->info.uses_linear_center && sel->info.uses_linear_centroid; + } else { + /* Make sure SPI doesn't compute more than 1 pair + * of (i,j), which is the optimization here. 
*/ + key->part.ps.prolog.force_persp_center_interp = sel->info.uses_persp_center + + sel->info.uses_persp_centroid + + sel->info.uses_persp_sample > + 1; + + key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center + + sel->info.uses_linear_centroid + + sel->info.uses_linear_sample > + 1; + + if (sel->info.uses_persp_opcode_interp_sample || + sel->info.uses_linear_opcode_interp_sample) + key->mono.u.ps.interpolate_at_sample_force_center = 1; + } + + key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx); + + /* ps_uses_fbfetch is true only if the color buffer is bound. */ + if (sctx->ps_uses_fbfetch && !sctx->blitter->running) { + struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; + struct pipe_resource *tex = cb0->texture; + + /* 1D textures are allocated and used as 2D on GFX9. */ + key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1; + key->mono.u.ps.fbfetch_is_1D = + sctx->chip_class != GFX9 && + (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY); + key->mono.u.ps.fbfetch_layered = + tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY || + tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY || + tex->target == PIPE_TEXTURE_3D; + } + break; + } + default: + assert(0); + } + + if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT))) + memset(&key->opt, 0, sizeof(key->opt)); +} + +static void si_build_shader_variant(struct si_shader *shader, int thread_index, bool low_priority) +{ + struct si_shader_selector *sel = shader->selector; + struct si_screen *sscreen = sel->screen; + struct ac_llvm_compiler *compiler; + struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug; + + if (thread_index >= 0) { + if (low_priority) { + assert(thread_index < ARRAY_SIZE(sscreen->compiler_lowp)); + compiler = &sscreen->compiler_lowp[thread_index]; + } else { + assert(thread_index < ARRAY_SIZE(sscreen->compiler)); + compiler = &sscreen->compiler[thread_index]; + } + if (!debug->async) + debug = NULL; + } else { + assert(!low_priority); + compiler = shader->compiler_ctx_state.compiler; + } + + if (!compiler->passes) + si_init_compiler(sscreen, compiler); + + if (unlikely(!si_create_shader_variant(sscreen, compiler, shader, debug))) { + PRINT_ERR("Failed to build shader variant (type=%u)\n", sel->type); + shader->compilation_failed = true; + return; + } + + if (shader->compiler_ctx_state.is_debug_context) { + FILE *f = open_memstream(&shader->shader_log, &shader->shader_log_size); + if (f) { + si_shader_dump(sscreen, shader, NULL, f, false); + fclose(f); + } + } + + si_shader_init_pm4_state(sscreen, shader); } static void si_build_shader_variant_low_priority(void *job, int thread_index) { - struct si_shader *shader = (struct si_shader *)job; + struct si_shader *shader = (struct si_shader *)job; - assert(thread_index >= 0); + assert(thread_index >= 0); - si_build_shader_variant(shader, thread_index, true); + si_build_shader_variant(shader, thread_index, true); } static const struct si_shader_key zeroed; -static bool si_check_missing_main_part(struct si_screen *sscreen, - struct si_shader_selector *sel, - struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key) +static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_selector *sel, + struct si_compiler_ctx_state *compiler_state, + struct si_shader_key *key) { - struct si_shader **mainp = si_get_main_shader_part(sel, key); - - if (!*mainp) { - struct si_shader *main_part = 
CALLOC_STRUCT(si_shader); - - if (!main_part) - return false; - - /* We can leave the fence as permanently signaled because the - * main part becomes visible globally only after it has been - * compiled. */ - util_queue_fence_init(&main_part->ready); - - main_part->selector = sel; - main_part->key.as_es = key->as_es; - main_part->key.as_ls = key->as_ls; - main_part->key.as_ngg = key->as_ngg; - main_part->is_monolithic = false; - - if (!si_compile_shader(sscreen, compiler_state->compiler, - main_part, &compiler_state->debug)) { - FREE(main_part); - return false; - } - *mainp = main_part; - } - return true; + struct si_shader **mainp = si_get_main_shader_part(sel, key); + + if (!*mainp) { + struct si_shader *main_part = CALLOC_STRUCT(si_shader); + + if (!main_part) + return false; + + /* We can leave the fence as permanently signaled because the + * main part becomes visible globally only after it has been + * compiled. */ + util_queue_fence_init(&main_part->ready); + + main_part->selector = sel; + main_part->key.as_es = key->as_es; + main_part->key.as_ls = key->as_ls; + main_part->key.as_ngg = key->as_ngg; + main_part->is_monolithic = false; + + if (!si_compile_shader(sscreen, compiler_state->compiler, main_part, + &compiler_state->debug)) { + FREE(main_part); + return false; + } + *mainp = main_part; + } + return true; } /** @@ -2277,283 +2106,264 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, * the compilation isn't finished, don't select any * shader and return an error. */ -int si_shader_select_with_key(struct si_screen *sscreen, - struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key, - int thread_index, - bool optimized_or_none) +int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, + struct si_compiler_ctx_state *compiler_state, + struct si_shader_key *key, int thread_index, bool optimized_or_none) { - struct si_shader_selector *sel = state->cso; - struct si_shader_selector *previous_stage_sel = NULL; - struct si_shader *current = state->current; - struct si_shader *iter, *shader = NULL; + struct si_shader_selector *sel = state->cso; + struct si_shader_selector *previous_stage_sel = NULL; + struct si_shader *current = state->current; + struct si_shader *iter, *shader = NULL; again: - /* Check if we don't need to change anything. - * This path is also used for most shaders that don't need multiple - * variants, it will cost just a computation of the key and this - * test. */ - if (likely(current && - memcmp(¤t->key, key, sizeof(*key)) == 0)) { - if (unlikely(!util_queue_fence_is_signalled(¤t->ready))) { - if (current->is_optimized) { - if (optimized_or_none) - return -1; - - memset(&key->opt, 0, sizeof(key->opt)); - goto current_not_ready; - } - - util_queue_fence_wait(¤t->ready); - } - - return current->compilation_failed ? -1 : 0; - } + /* Check if we don't need to change anything. + * This path is also used for most shaders that don't need multiple + * variants, it will cost just a computation of the key and this + * test. */ + if (likely(current && memcmp(¤t->key, key, sizeof(*key)) == 0)) { + if (unlikely(!util_queue_fence_is_signalled(¤t->ready))) { + if (current->is_optimized) { + if (optimized_or_none) + return -1; + + memset(&key->opt, 0, sizeof(key->opt)); + goto current_not_ready; + } + + util_queue_fence_wait(¤t->ready); + } + + return current->compilation_failed ? 
-1 : 0; + } current_not_ready: - /* This must be done before the mutex is locked, because async GS - * compilation calls this function too, and therefore must enter - * the mutex first. - * - * Only wait if we are in a draw call. Don't wait if we are - * in a compiler thread. - */ - if (thread_index < 0) - util_queue_fence_wait(&sel->ready); - - simple_mtx_lock(&sel->mutex); - - /* Find the shader variant. */ - for (iter = sel->first_variant; iter; iter = iter->next_variant) { - /* Don't check the "current" shader. We checked it above. */ - if (current != iter && - memcmp(&iter->key, key, sizeof(*key)) == 0) { - simple_mtx_unlock(&sel->mutex); - - if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) { - /* If it's an optimized shader and its compilation has - * been started but isn't done, use the unoptimized - * shader so as not to cause a stall due to compilation. - */ - if (iter->is_optimized) { - if (optimized_or_none) - return -1; - memset(&key->opt, 0, sizeof(key->opt)); - goto again; - } - - util_queue_fence_wait(&iter->ready); - } - - if (iter->compilation_failed) { - return -1; /* skip the draw call */ - } - - state->current = iter; - return 0; - } - } - - /* Build a new shader. */ - shader = CALLOC_STRUCT(si_shader); - if (!shader) { - simple_mtx_unlock(&sel->mutex); - return -ENOMEM; - } - - util_queue_fence_init(&shader->ready); - - shader->selector = sel; - shader->key = *key; - shader->compiler_ctx_state = *compiler_state; - - /* If this is a merged shader, get the first shader's selector. */ - if (sscreen->info.chip_class >= GFX9) { - if (sel->type == PIPE_SHADER_TESS_CTRL) - previous_stage_sel = key->part.tcs.ls; - else if (sel->type == PIPE_SHADER_GEOMETRY) - previous_stage_sel = key->part.gs.es; - - /* We need to wait for the previous shader. */ - if (previous_stage_sel && thread_index < 0) - util_queue_fence_wait(&previous_stage_sel->ready); - } - - bool is_pure_monolithic = - sscreen->use_monolithic_shaders || - memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0; - - /* Compile the main shader part if it doesn't exist. This can happen - * if the initial guess was wrong. - * - * The prim discard CS doesn't need the main shader part. - */ - if (!is_pure_monolithic && - !key->opt.vs_as_prim_discard_cs) { - bool ok = true; - - /* Make sure the main shader part is present. This is needed - * for shaders that can be compiled as VS, LS, or ES, and only - * one of them is compiled at creation. - * - * It is also needed for GS, which can be compiled as non-NGG - * and NGG. - * - * For merged shaders, check that the starting shader's main - * part is present. - */ - if (previous_stage_sel) { - struct si_shader_key shader1_key = zeroed; - - if (sel->type == PIPE_SHADER_TESS_CTRL) { - shader1_key.as_ls = 1; - } else if (sel->type == PIPE_SHADER_GEOMETRY) { - shader1_key.as_es = 1; - shader1_key.as_ngg = key->as_ngg; /* for Wave32 vs Wave64 */ - } else { - assert(0); - } - - simple_mtx_lock(&previous_stage_sel->mutex); - ok = si_check_missing_main_part(sscreen, - previous_stage_sel, - compiler_state, &shader1_key); - simple_mtx_unlock(&previous_stage_sel->mutex); - } - - if (ok) { - ok = si_check_missing_main_part(sscreen, sel, - compiler_state, key); - } - - if (!ok) { - FREE(shader); - simple_mtx_unlock(&sel->mutex); - return -ENOMEM; /* skip the draw call */ - } - } - - /* Keep the reference to the 1st shader of merged shaders, so that - * Gallium can't destroy it before we destroy the 2nd shader. 
- * - * Set sctx = NULL, because it's unused if we're not releasing - * the shader, and we don't have any sctx here. - */ - si_shader_selector_reference(NULL, &shader->previous_stage_sel, - previous_stage_sel); - - /* Monolithic-only shaders don't make a distinction between optimized - * and unoptimized. */ - shader->is_monolithic = - is_pure_monolithic || - memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; - - /* The prim discard CS is always optimized. */ - shader->is_optimized = - (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) && - memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; - - /* If it's an optimized shader, compile it asynchronously. */ - if (shader->is_optimized && thread_index < 0) { - /* Compile it asynchronously. */ - util_queue_add_job(&sscreen->shader_compiler_queue_low_priority, - shader, &shader->ready, - si_build_shader_variant_low_priority, NULL, - 0); - - /* Add only after the ready fence was reset, to guard against a - * race with si_bind_XX_shader. */ - if (!sel->last_variant) { - sel->first_variant = shader; - sel->last_variant = shader; - } else { - sel->last_variant->next_variant = shader; - sel->last_variant = shader; - } - - /* Use the default (unoptimized) shader for now. */ - memset(&key->opt, 0, sizeof(key->opt)); - simple_mtx_unlock(&sel->mutex); - - if (sscreen->options.sync_compile) - util_queue_fence_wait(&shader->ready); - - if (optimized_or_none) - return -1; - goto again; - } - - /* Reset the fence before adding to the variant list. */ - util_queue_fence_reset(&shader->ready); - - if (!sel->last_variant) { - sel->first_variant = shader; - sel->last_variant = shader; - } else { - sel->last_variant->next_variant = shader; - sel->last_variant = shader; - } - - simple_mtx_unlock(&sel->mutex); - - assert(!shader->is_optimized); - si_build_shader_variant(shader, thread_index, false); - - util_queue_fence_signal(&shader->ready); - - if (!shader->compilation_failed) - state->current = shader; - - return shader->compilation_failed ? -1 : 0; -} - -static int si_shader_select(struct pipe_context *ctx, - struct si_shader_ctx_state *state, - union si_vgt_stages_key stages_key, - struct si_compiler_ctx_state *compiler_state) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_key key; - - si_shader_selector_key(ctx, state->cso, stages_key, &key); - return si_shader_select_with_key(sctx->screen, state, compiler_state, - &key, -1, false); -} - -static void si_parse_next_shader_property(const struct si_shader_info *info, - bool streamout, - struct si_shader_key *key) -{ - unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER]; - - switch (info->processor) { - case PIPE_SHADER_VERTEX: - switch (next_shader) { - case PIPE_SHADER_GEOMETRY: - key->as_es = 1; - break; - case PIPE_SHADER_TESS_CTRL: - case PIPE_SHADER_TESS_EVAL: - key->as_ls = 1; - break; - default: - /* If POSITION isn't written, it can only be a HW VS - * if streamout is used. If streamout isn't used, - * assume that it's a HW LS. (the next shader is TCS) - * This heuristic is needed for separate shader objects. - */ - if (!info->writes_position && !streamout) - key->as_ls = 1; - } - break; - - case PIPE_SHADER_TESS_EVAL: - if (next_shader == PIPE_SHADER_GEOMETRY || - !info->writes_position) - key->as_es = 1; - break; - } + /* This must be done before the mutex is locked, because async GS + * compilation calls this function too, and therefore must enter + * the mutex first. + * + * Only wait if we are in a draw call. 
Don't wait if we are + * in a compiler thread. + */ + if (thread_index < 0) + util_queue_fence_wait(&sel->ready); + + simple_mtx_lock(&sel->mutex); + + /* Find the shader variant. */ + for (iter = sel->first_variant; iter; iter = iter->next_variant) { + /* Don't check the "current" shader. We checked it above. */ + if (current != iter && memcmp(&iter->key, key, sizeof(*key)) == 0) { + simple_mtx_unlock(&sel->mutex); + + if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) { + /* If it's an optimized shader and its compilation has + * been started but isn't done, use the unoptimized + * shader so as not to cause a stall due to compilation. + */ + if (iter->is_optimized) { + if (optimized_or_none) + return -1; + memset(&key->opt, 0, sizeof(key->opt)); + goto again; + } + + util_queue_fence_wait(&iter->ready); + } + + if (iter->compilation_failed) { + return -1; /* skip the draw call */ + } + + state->current = iter; + return 0; + } + } + + /* Build a new shader. */ + shader = CALLOC_STRUCT(si_shader); + if (!shader) { + simple_mtx_unlock(&sel->mutex); + return -ENOMEM; + } + + util_queue_fence_init(&shader->ready); + + shader->selector = sel; + shader->key = *key; + shader->compiler_ctx_state = *compiler_state; + + /* If this is a merged shader, get the first shader's selector. */ + if (sscreen->info.chip_class >= GFX9) { + if (sel->type == PIPE_SHADER_TESS_CTRL) + previous_stage_sel = key->part.tcs.ls; + else if (sel->type == PIPE_SHADER_GEOMETRY) + previous_stage_sel = key->part.gs.es; + + /* We need to wait for the previous shader. */ + if (previous_stage_sel && thread_index < 0) + util_queue_fence_wait(&previous_stage_sel->ready); + } + + bool is_pure_monolithic = + sscreen->use_monolithic_shaders || memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0; + + /* Compile the main shader part if it doesn't exist. This can happen + * if the initial guess was wrong. + * + * The prim discard CS doesn't need the main shader part. + */ + if (!is_pure_monolithic && !key->opt.vs_as_prim_discard_cs) { + bool ok = true; + + /* Make sure the main shader part is present. This is needed + * for shaders that can be compiled as VS, LS, or ES, and only + * one of them is compiled at creation. + * + * It is also needed for GS, which can be compiled as non-NGG + * and NGG. + * + * For merged shaders, check that the starting shader's main + * part is present. + */ + if (previous_stage_sel) { + struct si_shader_key shader1_key = zeroed; + + if (sel->type == PIPE_SHADER_TESS_CTRL) { + shader1_key.as_ls = 1; + } else if (sel->type == PIPE_SHADER_GEOMETRY) { + shader1_key.as_es = 1; + shader1_key.as_ngg = key->as_ngg; /* for Wave32 vs Wave64 */ + } else { + assert(0); + } + + simple_mtx_lock(&previous_stage_sel->mutex); + ok = si_check_missing_main_part(sscreen, previous_stage_sel, compiler_state, &shader1_key); + simple_mtx_unlock(&previous_stage_sel->mutex); + } + + if (ok) { + ok = si_check_missing_main_part(sscreen, sel, compiler_state, key); + } + + if (!ok) { + FREE(shader); + simple_mtx_unlock(&sel->mutex); + return -ENOMEM; /* skip the draw call */ + } + } + + /* Keep the reference to the 1st shader of merged shaders, so that + * Gallium can't destroy it before we destroy the 2nd shader. + * + * Set sctx = NULL, because it's unused if we're not releasing + * the shader, and we don't have any sctx here. + */ + si_shader_selector_reference(NULL, &shader->previous_stage_sel, previous_stage_sel); + + /* Monolithic-only shaders don't make a distinction between optimized + * and unoptimized. 
*/ + shader->is_monolithic = + is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; + + /* The prim discard CS is always optimized. */ + shader->is_optimized = (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) && + memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; + + /* If it's an optimized shader, compile it asynchronously. */ + if (shader->is_optimized && thread_index < 0) { + /* Compile it asynchronously. */ + util_queue_add_job(&sscreen->shader_compiler_queue_low_priority, shader, &shader->ready, + si_build_shader_variant_low_priority, NULL, 0); + + /* Add only after the ready fence was reset, to guard against a + * race with si_bind_XX_shader. */ + if (!sel->last_variant) { + sel->first_variant = shader; + sel->last_variant = shader; + } else { + sel->last_variant->next_variant = shader; + sel->last_variant = shader; + } + + /* Use the default (unoptimized) shader for now. */ + memset(&key->opt, 0, sizeof(key->opt)); + simple_mtx_unlock(&sel->mutex); + + if (sscreen->options.sync_compile) + util_queue_fence_wait(&shader->ready); + + if (optimized_or_none) + return -1; + goto again; + } + + /* Reset the fence before adding to the variant list. */ + util_queue_fence_reset(&shader->ready); + + if (!sel->last_variant) { + sel->first_variant = shader; + sel->last_variant = shader; + } else { + sel->last_variant->next_variant = shader; + sel->last_variant = shader; + } + + simple_mtx_unlock(&sel->mutex); + + assert(!shader->is_optimized); + si_build_shader_variant(shader, thread_index, false); + + util_queue_fence_signal(&shader->ready); + + if (!shader->compilation_failed) + state->current = shader; + + return shader->compilation_failed ? -1 : 0; +} + +static int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state, + union si_vgt_stages_key stages_key, + struct si_compiler_ctx_state *compiler_state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_key key; + + si_shader_selector_key(ctx, state->cso, stages_key, &key); + return si_shader_select_with_key(sctx->screen, state, compiler_state, &key, -1, false); +} + +static void si_parse_next_shader_property(const struct si_shader_info *info, bool streamout, + struct si_shader_key *key) +{ + unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER]; + + switch (info->processor) { + case PIPE_SHADER_VERTEX: + switch (next_shader) { + case PIPE_SHADER_GEOMETRY: + key->as_es = 1; + break; + case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: + key->as_ls = 1; + break; + default: + /* If POSITION isn't written, it can only be a HW VS + * if streamout is used. If streamout isn't used, + * assume that it's a HW LS. (the next shader is TCS) + * This heuristic is needed for separate shader objects. 
+ */ + if (!info->writes_position && !streamout) + key->as_ls = 1; + } + break; + + case PIPE_SHADER_TESS_EVAL: + if (next_shader == PIPE_SHADER_GEOMETRY || !info->writes_position) + key->as_es = 1; + break; + } } /** @@ -2563,971 +2373,904 @@ static void si_parse_next_shader_property(const struct si_shader_info *info, */ static void si_init_shader_selector_async(void *job, int thread_index) { - struct si_shader_selector *sel = (struct si_shader_selector *)job; - struct si_screen *sscreen = sel->screen; - struct ac_llvm_compiler *compiler; - struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug; - - assert(!debug->debug_message || debug->async); - assert(thread_index >= 0); - assert(thread_index < ARRAY_SIZE(sscreen->compiler)); - compiler = &sscreen->compiler[thread_index]; - - if (!compiler->passes) - si_init_compiler(sscreen, compiler); - - /* Serialize NIR to save memory. Monolithic shader variants - * have to deserialize NIR before compilation. - */ - if (sel->nir) { - struct blob blob; - size_t size; - - blob_init(&blob); - /* true = remove optional debugging data to increase - * the likehood of getting more shader cache hits. - * It also drops variable names, so we'll save more memory. - */ - nir_serialize(&blob, sel->nir, true); - blob_finish_get_buffer(&blob, &sel->nir_binary, &size); - sel->nir_size = size; - } - - /* Compile the main shader part for use with a prolog and/or epilog. - * If this fails, the driver will try to compile a monolithic shader - * on demand. - */ - if (!sscreen->use_monolithic_shaders) { - struct si_shader *shader = CALLOC_STRUCT(si_shader); - unsigned char ir_sha1_cache_key[20]; - - if (!shader) { - fprintf(stderr, "radeonsi: can't allocate a main shader part\n"); - return; - } - - /* We can leave the fence signaled because use of the default - * main part is guarded by the selector's ready fence. */ - util_queue_fence_init(&shader->ready); - - shader->selector = sel; - shader->is_monolithic = false; - si_parse_next_shader_property(&sel->info, - sel->so.num_outputs != 0, - &shader->key); - - if (sscreen->use_ngg && - (!sel->so.num_outputs || sscreen->use_ngg_streamout) && - ((sel->type == PIPE_SHADER_VERTEX && !shader->key.as_ls) || - sel->type == PIPE_SHADER_TESS_EVAL || - sel->type == PIPE_SHADER_GEOMETRY)) - shader->key.as_ngg = 1; - - if (sel->nir) { - si_get_ir_cache_key(sel, shader->key.as_ngg, - shader->key.as_es, ir_sha1_cache_key); - } - - /* Try to load the shader from the shader cache. */ - simple_mtx_lock(&sscreen->shader_cache_mutex); - - if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) { - simple_mtx_unlock(&sscreen->shader_cache_mutex); - si_shader_dump_stats_for_shader_db(sscreen, shader, debug); - } else { - simple_mtx_unlock(&sscreen->shader_cache_mutex); - - /* Compile the shader if it hasn't been loaded from the cache. */ - if (!si_compile_shader(sscreen, compiler, shader, debug)) { - FREE(shader); - fprintf(stderr, "radeonsi: can't compile a main shader part\n"); - return; - } - - simple_mtx_lock(&sscreen->shader_cache_mutex); - si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, - shader, true); - simple_mtx_unlock(&sscreen->shader_cache_mutex); - } - - *si_get_main_shader_part(sel, &shader->key) = shader; - - /* Unset "outputs_written" flags for outputs converted to - * DEFAULT_VAL, so that later inter-shader optimizations don't - * try to eliminate outputs that don't exist in the final - * shader. - * - * This is only done if non-monolithic shaders are enabled. 
- */ - if ((sel->type == PIPE_SHADER_VERTEX || - sel->type == PIPE_SHADER_TESS_EVAL) && - !shader->key.as_ls && - !shader->key.as_es) { - unsigned i; - - for (i = 0; i < sel->info.num_outputs; i++) { - unsigned offset = shader->info.vs_output_param_offset[i]; - - if (offset <= AC_EXP_PARAM_OFFSET_31) - continue; - - unsigned name = sel->info.output_semantic_name[i]; - unsigned index = sel->info.output_semantic_index[i]; - unsigned id; - - switch (name) { - case TGSI_SEMANTIC_GENERIC: - /* don't process indices the function can't handle */ - if (index >= SI_MAX_IO_GENERIC) - break; - /* fall through */ - default: - id = si_shader_io_get_unique_index(name, index, true); - sel->outputs_written_before_ps &= ~(1ull << id); - break; - case TGSI_SEMANTIC_POSITION: /* ignore these */ - case TGSI_SEMANTIC_PSIZE: - case TGSI_SEMANTIC_CLIPVERTEX: - case TGSI_SEMANTIC_EDGEFLAG: - break; - } - } - } - } - - /* The GS copy shader is always pre-compiled. */ - if (sel->type == PIPE_SHADER_GEOMETRY && - (!sscreen->use_ngg || - !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ - sel->tess_turns_off_ngg)) { - sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); - if (!sel->gs_copy_shader) { - fprintf(stderr, "radeonsi: can't create GS copy shader\n"); - return; - } - - si_shader_vs(sscreen, sel->gs_copy_shader, sel); - } - - /* Free NIR. We only keep serialized NIR after this point. */ - if (sel->nir) { - ralloc_free(sel->nir); - sel->nir = NULL; - } + struct si_shader_selector *sel = (struct si_shader_selector *)job; + struct si_screen *sscreen = sel->screen; + struct ac_llvm_compiler *compiler; + struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug; + + assert(!debug->debug_message || debug->async); + assert(thread_index >= 0); + assert(thread_index < ARRAY_SIZE(sscreen->compiler)); + compiler = &sscreen->compiler[thread_index]; + + if (!compiler->passes) + si_init_compiler(sscreen, compiler); + + /* Serialize NIR to save memory. Monolithic shader variants + * have to deserialize NIR before compilation. + */ + if (sel->nir) { + struct blob blob; + size_t size; + + blob_init(&blob); + /* true = remove optional debugging data to increase + * the likehood of getting more shader cache hits. + * It also drops variable names, so we'll save more memory. + */ + nir_serialize(&blob, sel->nir, true); + blob_finish_get_buffer(&blob, &sel->nir_binary, &size); + sel->nir_size = size; + } + + /* Compile the main shader part for use with a prolog and/or epilog. + * If this fails, the driver will try to compile a monolithic shader + * on demand. + */ + if (!sscreen->use_monolithic_shaders) { + struct si_shader *shader = CALLOC_STRUCT(si_shader); + unsigned char ir_sha1_cache_key[20]; + + if (!shader) { + fprintf(stderr, "radeonsi: can't allocate a main shader part\n"); + return; + } + + /* We can leave the fence signaled because use of the default + * main part is guarded by the selector's ready fence. 
*/ + util_queue_fence_init(&shader->ready); + + shader->selector = sel; + shader->is_monolithic = false; + si_parse_next_shader_property(&sel->info, sel->so.num_outputs != 0, &shader->key); + + if (sscreen->use_ngg && (!sel->so.num_outputs || sscreen->use_ngg_streamout) && + ((sel->type == PIPE_SHADER_VERTEX && !shader->key.as_ls) || + sel->type == PIPE_SHADER_TESS_EVAL || sel->type == PIPE_SHADER_GEOMETRY)) + shader->key.as_ngg = 1; + + if (sel->nir) { + si_get_ir_cache_key(sel, shader->key.as_ngg, shader->key.as_es, ir_sha1_cache_key); + } + + /* Try to load the shader from the shader cache. */ + simple_mtx_lock(&sscreen->shader_cache_mutex); + + if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) { + simple_mtx_unlock(&sscreen->shader_cache_mutex); + si_shader_dump_stats_for_shader_db(sscreen, shader, debug); + } else { + simple_mtx_unlock(&sscreen->shader_cache_mutex); + + /* Compile the shader if it hasn't been loaded from the cache. */ + if (!si_compile_shader(sscreen, compiler, shader, debug)) { + FREE(shader); + fprintf(stderr, "radeonsi: can't compile a main shader part\n"); + return; + } + + simple_mtx_lock(&sscreen->shader_cache_mutex); + si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, true); + simple_mtx_unlock(&sscreen->shader_cache_mutex); + } + + *si_get_main_shader_part(sel, &shader->key) = shader; + + /* Unset "outputs_written" flags for outputs converted to + * DEFAULT_VAL, so that later inter-shader optimizations don't + * try to eliminate outputs that don't exist in the final + * shader. + * + * This is only done if non-monolithic shaders are enabled. + */ + if ((sel->type == PIPE_SHADER_VERTEX || sel->type == PIPE_SHADER_TESS_EVAL) && + !shader->key.as_ls && !shader->key.as_es) { + unsigned i; + + for (i = 0; i < sel->info.num_outputs; i++) { + unsigned offset = shader->info.vs_output_param_offset[i]; + + if (offset <= AC_EXP_PARAM_OFFSET_31) + continue; + + unsigned name = sel->info.output_semantic_name[i]; + unsigned index = sel->info.output_semantic_index[i]; + unsigned id; + + switch (name) { + case TGSI_SEMANTIC_GENERIC: + /* don't process indices the function can't handle */ + if (index >= SI_MAX_IO_GENERIC) + break; + /* fall through */ + default: + id = si_shader_io_get_unique_index(name, index, true); + sel->outputs_written_before_ps &= ~(1ull << id); + break; + case TGSI_SEMANTIC_POSITION: /* ignore these */ + case TGSI_SEMANTIC_PSIZE: + case TGSI_SEMANTIC_CLIPVERTEX: + case TGSI_SEMANTIC_EDGEFLAG: + break; + } + } + } + } + + /* The GS copy shader is always pre-compiled. */ + if (sel->type == PIPE_SHADER_GEOMETRY && + (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ + sel->tess_turns_off_ngg)) { + sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); + if (!sel->gs_copy_shader) { + fprintf(stderr, "radeonsi: can't create GS copy shader\n"); + return; + } + + si_shader_vs(sscreen, sel->gs_copy_shader, sel); + } + + /* Free NIR. We only keep serialized NIR after this point. 
*/ + if (sel->nir) { + ralloc_free(sel->nir); + sel->nir = NULL; + } } void si_schedule_initial_compile(struct si_context *sctx, unsigned processor, - struct util_queue_fence *ready_fence, - struct si_compiler_ctx_state *compiler_ctx_state, - void *job, util_queue_execute_func execute) + struct util_queue_fence *ready_fence, + struct si_compiler_ctx_state *compiler_ctx_state, void *job, + util_queue_execute_func execute) { - util_queue_fence_init(ready_fence); - - struct util_async_debug_callback async_debug; - bool debug = - (sctx->debug.debug_message && !sctx->debug.async) || - sctx->is_debug || - si_can_dump_shader(sctx->screen, processor); - - if (debug) { - u_async_debug_init(&async_debug); - compiler_ctx_state->debug = async_debug.base; - } - - util_queue_add_job(&sctx->screen->shader_compiler_queue, job, - ready_fence, execute, NULL, 0); - - if (debug) { - util_queue_fence_wait(ready_fence); - u_async_debug_drain(&async_debug, &sctx->debug); - u_async_debug_cleanup(&async_debug); - } - - if (sctx->screen->options.sync_compile) - util_queue_fence_wait(ready_fence); + util_queue_fence_init(ready_fence); + + struct util_async_debug_callback async_debug; + bool debug = (sctx->debug.debug_message && !sctx->debug.async) || sctx->is_debug || + si_can_dump_shader(sctx->screen, processor); + + if (debug) { + u_async_debug_init(&async_debug); + compiler_ctx_state->debug = async_debug.base; + } + + util_queue_add_job(&sctx->screen->shader_compiler_queue, job, ready_fence, execute, NULL, 0); + + if (debug) { + util_queue_fence_wait(ready_fence); + u_async_debug_drain(&async_debug, &sctx->debug); + u_async_debug_cleanup(&async_debug); + } + + if (sctx->screen->options.sync_compile) + util_queue_fence_wait(ready_fence); } /* Return descriptor slot usage masks from the given shader info. */ -void si_get_active_slot_masks(const struct si_shader_info *info, - uint32_t *const_and_shader_buffers, - uint64_t *samplers_and_images) -{ - unsigned start, num_shaderbufs, num_constbufs, num_images, num_msaa_images, num_samplers; - - num_shaderbufs = util_last_bit(info->shader_buffers_declared); - num_constbufs = util_last_bit(info->const_buffers_declared); - /* two 8-byte images share one 16-byte slot */ - num_images = align(util_last_bit(info->images_declared), 2); - num_msaa_images = align(util_last_bit(info->msaa_images_declared), 2); - num_samplers = util_last_bit(info->samplers_declared); - - /* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */ - start = si_get_shaderbuf_slot(num_shaderbufs - 1); - *const_and_shader_buffers = - u_bit_consecutive(start, num_shaderbufs + num_constbufs); - - /* The layout is: - * - fmask[last] ... fmask[0] go to [15-last .. 15] - * - image[last] ... image[0] go to [31-last .. 31] - * - sampler[0] ... sampler[last] go to [32 .. 32+last*2] - * - * FMASKs for images are placed separately, because MSAA images are rare, - * and so we can benefit from a better cache hit rate if we keep image - * descriptors together. 
- */ - if (num_msaa_images) - num_images = SI_NUM_IMAGES + num_msaa_images; /* add FMASK descriptors */ - - start = si_get_image_slot(num_images - 1) / 2; - *samplers_and_images = - u_bit_consecutive64(start, num_images / 2 + num_samplers); +void si_get_active_slot_masks(const struct si_shader_info *info, uint32_t *const_and_shader_buffers, + uint64_t *samplers_and_images) +{ + unsigned start, num_shaderbufs, num_constbufs, num_images, num_msaa_images, num_samplers; + + num_shaderbufs = util_last_bit(info->shader_buffers_declared); + num_constbufs = util_last_bit(info->const_buffers_declared); + /* two 8-byte images share one 16-byte slot */ + num_images = align(util_last_bit(info->images_declared), 2); + num_msaa_images = align(util_last_bit(info->msaa_images_declared), 2); + num_samplers = util_last_bit(info->samplers_declared); + + /* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */ + start = si_get_shaderbuf_slot(num_shaderbufs - 1); + *const_and_shader_buffers = u_bit_consecutive(start, num_shaderbufs + num_constbufs); + + /* The layout is: + * - fmask[last] ... fmask[0] go to [15-last .. 15] + * - image[last] ... image[0] go to [31-last .. 31] + * - sampler[0] ... sampler[last] go to [32 .. 32+last*2] + * + * FMASKs for images are placed separately, because MSAA images are rare, + * and so we can benefit from a better cache hit rate if we keep image + * descriptors together. + */ + if (num_msaa_images) + num_images = SI_NUM_IMAGES + num_msaa_images; /* add FMASK descriptors */ + + start = si_get_image_slot(num_images - 1) / 2; + *samplers_and_images = u_bit_consecutive64(start, num_images / 2 + num_samplers); } static void *si_create_shader_selector(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - struct si_screen *sscreen = (struct si_screen *)ctx->screen; - struct si_context *sctx = (struct si_context*)ctx; - struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector); - int i; - - if (!sel) - return NULL; - - sel->screen = sscreen; - sel->compiler_ctx_state.debug = sctx->debug; - sel->compiler_ctx_state.is_debug_context = sctx->is_debug; - - sel->so = state->stream_output; - - if (state->type == PIPE_SHADER_IR_TGSI) { - sel->nir = tgsi_to_nir(state->tokens, ctx->screen); - } else { - assert(state->type == PIPE_SHADER_IR_NIR); - sel->nir = state->ir.nir; - } - - si_nir_scan_shader(sel->nir, &sel->info); - si_nir_adjust_driver_locations(sel->nir); - - sel->type = sel->info.processor; - p_atomic_inc(&sscreen->num_shaders_created); - si_get_active_slot_masks(&sel->info, - &sel->active_const_and_shader_buffers, - &sel->active_samplers_and_images); - - /* Record which streamout buffers are enabled. */ - for (i = 0; i < sel->so.num_outputs; i++) { - sel->enabled_streamout_buffer_mask |= - (1 << sel->so.output[i].output_buffer) << - (sel->so.output[i].stream * 4); - } - - sel->num_vs_inputs = sel->type == PIPE_SHADER_VERTEX && - !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] ? - sel->info.num_inputs : 0; - sel->num_vbos_in_user_sgprs = - MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs); - - /* The prolog is a no-op if there are no inputs. 
*/ - sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX && - sel->info.num_inputs && - !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; - - sel->prim_discard_cs_allowed = - sel->type == PIPE_SHADER_VERTEX && - !sel->info.uses_bindless_images && - !sel->info.uses_bindless_samplers && - !sel->info.writes_memory && - !sel->info.writes_viewport_index && - !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && - !sel->so.num_outputs; - - switch (sel->type) { - case PIPE_SHADER_GEOMETRY: - sel->gs_output_prim = - sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]; - - /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ - sel->rast_prim = sel->gs_output_prim; - if (util_rast_prim_is_triangles(sel->rast_prim)) - sel->rast_prim = PIPE_PRIM_TRIANGLES; - - sel->gs_max_out_vertices = - sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES]; - sel->gs_num_invocations = - sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS]; - sel->gsvs_vertex_size = sel->info.num_outputs * 16; - sel->max_gsvs_emit_size = sel->gsvs_vertex_size * - sel->gs_max_out_vertices; - - sel->max_gs_stream = 0; - for (i = 0; i < sel->so.num_outputs; i++) - sel->max_gs_stream = MAX2(sel->max_gs_stream, - sel->so.output[i].stream); - - sel->gs_input_verts_per_prim = - u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]); - - /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tesselation. */ - sel->tess_turns_off_ngg = - sscreen->info.chip_class == GFX10 && - sel->gs_num_invocations * sel->gs_max_out_vertices > 256; - break; - - case PIPE_SHADER_TESS_CTRL: - /* Always reserve space for these. */ - sel->patch_outputs_written |= - (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0)) | - (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0)); - /* fall through */ - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_TESS_EVAL: - for (i = 0; i < sel->info.num_outputs; i++) { - unsigned name = sel->info.output_semantic_name[i]; - unsigned index = sel->info.output_semantic_index[i]; - - switch (name) { - case TGSI_SEMANTIC_TESSINNER: - case TGSI_SEMANTIC_TESSOUTER: - case TGSI_SEMANTIC_PATCH: - sel->patch_outputs_written |= - 1ull << si_shader_io_get_unique_index_patch(name, index); - break; - - case TGSI_SEMANTIC_GENERIC: - /* don't process indices the function can't handle */ - if (index >= SI_MAX_IO_GENERIC) - break; - /* fall through */ - default: - sel->outputs_written |= - 1ull << si_shader_io_get_unique_index(name, index, false); - sel->outputs_written_before_ps |= - 1ull << si_shader_io_get_unique_index(name, index, true); - break; - case TGSI_SEMANTIC_EDGEFLAG: - break; - } - } - sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16; - sel->lshs_vertex_stride = sel->esgs_itemsize; - - /* Add 1 dword to reduce LDS bank conflicts, so that each vertex - * will start on a different bank. (except for the maximum 32*16). - */ - if (sel->lshs_vertex_stride < 32*16) - sel->lshs_vertex_stride += 4; - - /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank - * conflicts, i.e. each vertex will start at a different bank. 
- */ - if (sctx->chip_class >= GFX9) - sel->esgs_itemsize += 4; - - assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0); - - /* Only for TES: */ - if (sel->info.properties[TGSI_PROPERTY_TES_POINT_MODE]) - sel->rast_prim = PIPE_PRIM_POINTS; - else if (sel->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) - sel->rast_prim = PIPE_PRIM_LINE_STRIP; - else - sel->rast_prim = PIPE_PRIM_TRIANGLES; - break; - - case PIPE_SHADER_FRAGMENT: - for (i = 0; i < sel->info.num_inputs; i++) { - unsigned name = sel->info.input_semantic_name[i]; - unsigned index = sel->info.input_semantic_index[i]; - - switch (name) { - case TGSI_SEMANTIC_GENERIC: - /* don't process indices the function can't handle */ - if (index >= SI_MAX_IO_GENERIC) - break; - /* fall through */ - default: - sel->inputs_read |= - 1ull << si_shader_io_get_unique_index(name, index, true); - break; - case TGSI_SEMANTIC_PCOORD: /* ignore this */ - break; - } - } - - for (i = 0; i < 8; i++) - if (sel->info.colors_written & (1 << i)) - sel->colors_written_4bit |= 0xf << (4 * i); - - for (i = 0; i < sel->info.num_inputs; i++) { - if (sel->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR) { - int index = sel->info.input_semantic_index[i]; - sel->color_attr_index[index] = i; - } - } - break; - default:; - } - - sel->ngg_culling_allowed = - sscreen->info.chip_class == GFX10 && - sscreen->info.has_dedicated_vram && - sscreen->use_ngg_culling && - /* Disallow TES by default, because TessMark results are mixed. */ - (sel->type == PIPE_SHADER_VERTEX || - (sscreen->always_use_ngg_culling && sel->type == PIPE_SHADER_TESS_EVAL)) && - sel->info.writes_position && - !sel->info.writes_viewport_index && /* cull only against viewport 0 */ - !sel->info.writes_memory && - !sel->so.num_outputs && - !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] && - !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; - - /* PA_CL_VS_OUT_CNTL */ - if (sctx->chip_class <= GFX9) - sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, false); - - sel->clipdist_mask = sel->info.writes_clipvertex ? - SIX_BITS : sel->info.clipdist_writemask; - sel->culldist_mask = sel->info.culldist_writemask << - sel->info.num_written_clipdistance; - - /* DB_SHADER_CONTROL */ - sel->db_shader_control = - S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) | - S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) | - S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) | - S_02880C_KILL_ENABLE(sel->info.uses_kill); - - switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) { - case TGSI_FS_DEPTH_LAYOUT_GREATER: - sel->db_shader_control |= - S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z); - break; - case TGSI_FS_DEPTH_LAYOUT_LESS: - sel->db_shader_control |= - S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z); - break; - } - - /* Z_ORDER, EXEC_ON_HIER_FAIL and EXEC_ON_NOOP should be set as following: - * - * | early Z/S | writes_mem | allow_ReZ? | Z_ORDER | EXEC_ON_HIER_FAIL | EXEC_ON_NOOP - * --|-----------|------------|------------|--------------------|-------------------|------------- - * 1a| false | false | true | EarlyZ_Then_ReZ | 0 | 0 - * 1b| false | false | false | EarlyZ_Then_LateZ | 0 | 0 - * 2 | false | true | n/a | LateZ | 1 | 0 - * 3 | true | false | n/a | EarlyZ_Then_LateZ | 0 | 0 - * 4 | true | true | n/a | EarlyZ_Then_LateZ | 0 | 1 - * - * In cases 3 and 4, HW will force Z_ORDER to EarlyZ regardless of what's set in the register. - * In case 2, NOOP_CULL is a don't care field. 
In case 2, 3 and 4, ReZ doesn't make sense. - * - * Don't use ReZ without profiling !!! - * - * ReZ decreases performance by 15% in DiRT: Showdown on Ultra settings, which has pretty complex - * shaders. - */ - if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) { - /* Cases 3, 4. */ - sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1) | - S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) | - S_02880C_EXEC_ON_NOOP(sel->info.writes_memory); - } else if (sel->info.writes_memory) { - /* Case 2. */ - sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z) | - S_02880C_EXEC_ON_HIER_FAIL(1); - } else { - /* Case 1. */ - sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z); - } - - if (sel->info.properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE]) - sel->db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1); - - (void) simple_mtx_init(&sel->mutex, mtx_plain); - - si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready, - &sel->compiler_ctx_state, sel, - si_init_shader_selector_async); - return sel; -} - -static void *si_create_shader(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - struct si_screen *sscreen = (struct si_screen *)ctx->screen; - - return util_live_shader_cache_get(ctx, &sscreen->live_shader_cache, state); + const struct pipe_shader_state *state) +{ + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector); + int i; + + if (!sel) + return NULL; + + sel->screen = sscreen; + sel->compiler_ctx_state.debug = sctx->debug; + sel->compiler_ctx_state.is_debug_context = sctx->is_debug; + + sel->so = state->stream_output; + + if (state->type == PIPE_SHADER_IR_TGSI) { + sel->nir = tgsi_to_nir(state->tokens, ctx->screen); + } else { + assert(state->type == PIPE_SHADER_IR_NIR); + sel->nir = state->ir.nir; + } + + si_nir_scan_shader(sel->nir, &sel->info); + si_nir_adjust_driver_locations(sel->nir); + + sel->type = sel->info.processor; + p_atomic_inc(&sscreen->num_shaders_created); + si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers, + &sel->active_samplers_and_images); + + /* Record which streamout buffers are enabled. */ + for (i = 0; i < sel->so.num_outputs; i++) { + sel->enabled_streamout_buffer_mask |= (1 << sel->so.output[i].output_buffer) + << (sel->so.output[i].stream * 4); + } + + sel->num_vs_inputs = + sel->type == PIPE_SHADER_VERTEX && !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] + ? sel->info.num_inputs + : 0; + sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs); + + /* The prolog is a no-op if there are no inputs. 
*/ + sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX && sel->info.num_inputs && + !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; + + sel->prim_discard_cs_allowed = + sel->type == PIPE_SHADER_VERTEX && !sel->info.uses_bindless_images && + !sel->info.uses_bindless_samplers && !sel->info.writes_memory && + !sel->info.writes_viewport_index && + !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && !sel->so.num_outputs; + + switch (sel->type) { + case PIPE_SHADER_GEOMETRY: + sel->gs_output_prim = sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]; + + /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ + sel->rast_prim = sel->gs_output_prim; + if (util_rast_prim_is_triangles(sel->rast_prim)) + sel->rast_prim = PIPE_PRIM_TRIANGLES; + + sel->gs_max_out_vertices = sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES]; + sel->gs_num_invocations = sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS]; + sel->gsvs_vertex_size = sel->info.num_outputs * 16; + sel->max_gsvs_emit_size = sel->gsvs_vertex_size * sel->gs_max_out_vertices; + + sel->max_gs_stream = 0; + for (i = 0; i < sel->so.num_outputs; i++) + sel->max_gs_stream = MAX2(sel->max_gs_stream, sel->so.output[i].stream); + + sel->gs_input_verts_per_prim = + u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]); + + /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tesselation. */ + sel->tess_turns_off_ngg = sscreen->info.chip_class == GFX10 && + sel->gs_num_invocations * sel->gs_max_out_vertices > 256; + break; + + case PIPE_SHADER_TESS_CTRL: + /* Always reserve space for these. */ + sel->patch_outputs_written |= + (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0)) | + (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0)); + /* fall through */ + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_TESS_EVAL: + for (i = 0; i < sel->info.num_outputs; i++) { + unsigned name = sel->info.output_semantic_name[i]; + unsigned index = sel->info.output_semantic_index[i]; + + switch (name) { + case TGSI_SEMANTIC_TESSINNER: + case TGSI_SEMANTIC_TESSOUTER: + case TGSI_SEMANTIC_PATCH: + sel->patch_outputs_written |= 1ull << si_shader_io_get_unique_index_patch(name, index); + break; + + case TGSI_SEMANTIC_GENERIC: + /* don't process indices the function can't handle */ + if (index >= SI_MAX_IO_GENERIC) + break; + /* fall through */ + default: + sel->outputs_written |= 1ull << si_shader_io_get_unique_index(name, index, false); + sel->outputs_written_before_ps |= 1ull + << si_shader_io_get_unique_index(name, index, true); + break; + case TGSI_SEMANTIC_EDGEFLAG: + break; + } + } + sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16; + sel->lshs_vertex_stride = sel->esgs_itemsize; + + /* Add 1 dword to reduce LDS bank conflicts, so that each vertex + * will start on a different bank. (except for the maximum 32*16). + */ + if (sel->lshs_vertex_stride < 32 * 16) + sel->lshs_vertex_stride += 4; + + /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank + * conflicts, i.e. each vertex will start at a different bank. 
+ */ + if (sctx->chip_class >= GFX9) + sel->esgs_itemsize += 4; + + assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0); + + /* Only for TES: */ + if (sel->info.properties[TGSI_PROPERTY_TES_POINT_MODE]) + sel->rast_prim = PIPE_PRIM_POINTS; + else if (sel->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) + sel->rast_prim = PIPE_PRIM_LINE_STRIP; + else + sel->rast_prim = PIPE_PRIM_TRIANGLES; + break; + + case PIPE_SHADER_FRAGMENT: + for (i = 0; i < sel->info.num_inputs; i++) { + unsigned name = sel->info.input_semantic_name[i]; + unsigned index = sel->info.input_semantic_index[i]; + + switch (name) { + case TGSI_SEMANTIC_GENERIC: + /* don't process indices the function can't handle */ + if (index >= SI_MAX_IO_GENERIC) + break; + /* fall through */ + default: + sel->inputs_read |= 1ull << si_shader_io_get_unique_index(name, index, true); + break; + case TGSI_SEMANTIC_PCOORD: /* ignore this */ + break; + } + } + + for (i = 0; i < 8; i++) + if (sel->info.colors_written & (1 << i)) + sel->colors_written_4bit |= 0xf << (4 * i); + + for (i = 0; i < sel->info.num_inputs; i++) { + if (sel->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR) { + int index = sel->info.input_semantic_index[i]; + sel->color_attr_index[index] = i; + } + } + break; + default:; + } + + sel->ngg_culling_allowed = + sscreen->info.chip_class == GFX10 && sscreen->info.has_dedicated_vram && + sscreen->use_ngg_culling && + /* Disallow TES by default, because TessMark results are mixed. */ + (sel->type == PIPE_SHADER_VERTEX || + (sscreen->always_use_ngg_culling && sel->type == PIPE_SHADER_TESS_EVAL)) && + sel->info.writes_position && + !sel->info.writes_viewport_index && /* cull only against viewport 0 */ + !sel->info.writes_memory && !sel->so.num_outputs && + !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] && + !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + + /* PA_CL_VS_OUT_CNTL */ + if (sctx->chip_class <= GFX9) + sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, false); + + sel->clipdist_mask = sel->info.writes_clipvertex ? SIX_BITS : sel->info.clipdist_writemask; + sel->culldist_mask = sel->info.culldist_writemask << sel->info.num_written_clipdistance; + + /* DB_SHADER_CONTROL */ + sel->db_shader_control = S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) | + S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) | + S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) | + S_02880C_KILL_ENABLE(sel->info.uses_kill); + + switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) { + case TGSI_FS_DEPTH_LAYOUT_GREATER: + sel->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z); + break; + case TGSI_FS_DEPTH_LAYOUT_LESS: + sel->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z); + break; + } + + /* Z_ORDER, EXEC_ON_HIER_FAIL and EXEC_ON_NOOP should be set as following: + * + * | early Z/S | writes_mem | allow_ReZ? | Z_ORDER | EXEC_ON_HIER_FAIL | EXEC_ON_NOOP + * --|-----------|------------|------------|--------------------|-------------------|------------- + * 1a| false | false | true | EarlyZ_Then_ReZ | 0 | 0 + * 1b| false | false | false | EarlyZ_Then_LateZ | 0 | 0 + * 2 | false | true | n/a | LateZ | 1 | 0 + * 3 | true | false | n/a | EarlyZ_Then_LateZ | 0 | 0 + * 4 | true | true | n/a | EarlyZ_Then_LateZ | 0 | 1 + * + * In cases 3 and 4, HW will force Z_ORDER to EarlyZ regardless of what's set in the register. + * In case 2, NOOP_CULL is a don't care field. In case 2, 3 and 4, ReZ doesn't make sense. 
+ * + * Don't use ReZ without profiling !!! + * + * ReZ decreases performance by 15% in DiRT: Showdown on Ultra settings, which has pretty complex + * shaders. + */ + if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) { + /* Cases 3, 4. */ + sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1) | + S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) | + S_02880C_EXEC_ON_NOOP(sel->info.writes_memory); + } else if (sel->info.writes_memory) { + /* Case 2. */ + sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z) | S_02880C_EXEC_ON_HIER_FAIL(1); + } else { + /* Case 1. */ + sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z); + } + + if (sel->info.properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE]) + sel->db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1); + + (void)simple_mtx_init(&sel->mutex, mtx_plain); + + si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready, &sel->compiler_ctx_state, + sel, si_init_shader_selector_async); + return sel; +} + +static void *si_create_shader(struct pipe_context *ctx, const struct pipe_shader_state *state) +{ + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + + return util_live_shader_cache_get(ctx, &sscreen->live_shader_cache, state); } static void si_update_streamout_state(struct si_context *sctx) { - struct si_shader_selector *shader_with_so = si_get_vs(sctx)->cso; + struct si_shader_selector *shader_with_so = si_get_vs(sctx)->cso; - if (!shader_with_so) - return; + if (!shader_with_so) + return; - sctx->streamout.enabled_stream_buffers_mask = - shader_with_so->enabled_streamout_buffer_mask; - sctx->streamout.stride_in_dw = shader_with_so->so.stride; + sctx->streamout.enabled_stream_buffers_mask = shader_with_so->enabled_streamout_buffer_mask; + sctx->streamout.stride_in_dw = shader_with_so->so.stride; } -static void si_update_clip_regs(struct si_context *sctx, - struct si_shader_selector *old_hw_vs, - struct si_shader *old_hw_vs_variant, - struct si_shader_selector *next_hw_vs, - struct si_shader *next_hw_vs_variant) +static void si_update_clip_regs(struct si_context *sctx, struct si_shader_selector *old_hw_vs, + struct si_shader *old_hw_vs_variant, + struct si_shader_selector *next_hw_vs, + struct si_shader *next_hw_vs_variant) { - if (next_hw_vs && - (!old_hw_vs || - old_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] != - next_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] || - old_hw_vs->pa_cl_vs_out_cntl != next_hw_vs->pa_cl_vs_out_cntl || - old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask || - old_hw_vs->culldist_mask != next_hw_vs->culldist_mask || - !old_hw_vs_variant || - !next_hw_vs_variant || - old_hw_vs_variant->key.opt.clip_disable != - next_hw_vs_variant->key.opt.clip_disable)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + if (next_hw_vs && + (!old_hw_vs || + old_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] != + next_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] || + old_hw_vs->pa_cl_vs_out_cntl != next_hw_vs->pa_cl_vs_out_cntl || + old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask || + old_hw_vs->culldist_mask != next_hw_vs->culldist_mask || !old_hw_vs_variant || + !next_hw_vs_variant || + old_hw_vs_variant->key.opt.clip_disable != next_hw_vs_variant->key.opt.clip_disable)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); } static void si_update_common_shader_state(struct si_context *sctx) { - sctx->uses_bindless_samplers = - 
si_shader_uses_bindless_samplers(sctx->vs_shader.cso) || - si_shader_uses_bindless_samplers(sctx->gs_shader.cso) || - si_shader_uses_bindless_samplers(sctx->ps_shader.cso) || - si_shader_uses_bindless_samplers(sctx->tcs_shader.cso) || - si_shader_uses_bindless_samplers(sctx->tes_shader.cso); - sctx->uses_bindless_images = - si_shader_uses_bindless_images(sctx->vs_shader.cso) || - si_shader_uses_bindless_images(sctx->gs_shader.cso) || - si_shader_uses_bindless_images(sctx->ps_shader.cso) || - si_shader_uses_bindless_images(sctx->tcs_shader.cso) || - si_shader_uses_bindless_images(sctx->tes_shader.cso); - sctx->do_update_shaders = true; + sctx->uses_bindless_samplers = si_shader_uses_bindless_samplers(sctx->vs_shader.cso) || + si_shader_uses_bindless_samplers(sctx->gs_shader.cso) || + si_shader_uses_bindless_samplers(sctx->ps_shader.cso) || + si_shader_uses_bindless_samplers(sctx->tcs_shader.cso) || + si_shader_uses_bindless_samplers(sctx->tes_shader.cso); + sctx->uses_bindless_images = si_shader_uses_bindless_images(sctx->vs_shader.cso) || + si_shader_uses_bindless_images(sctx->gs_shader.cso) || + si_shader_uses_bindless_images(sctx->ps_shader.cso) || + si_shader_uses_bindless_images(sctx->tcs_shader.cso) || + si_shader_uses_bindless_images(sctx->tes_shader.cso); + sctx->do_update_shaders = true; } static void si_bind_vs_shader(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; - struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); - struct si_shader_selector *sel = state; - - if (sctx->vs_shader.cso == sel) - return; - - sctx->vs_shader.cso = sel; - sctx->vs_shader.current = sel ? sel->first_variant : NULL; - sctx->num_vs_blit_sgprs = sel ? sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] : 0; - - if (si_update_ngg(sctx)) - si_shader_change_notify(sctx); - - si_update_common_shader_state(sctx); - si_update_vs_viewport_state(sctx); - si_set_active_descriptors_for_shader(sctx, sel); - si_update_streamout_state(sctx); - si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, - si_get_vs(sctx)->cso, si_get_vs_state(sctx)); + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; + struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); + struct si_shader_selector *sel = state; + + if (sctx->vs_shader.cso == sel) + return; + + sctx->vs_shader.cso = sel; + sctx->vs_shader.current = sel ? sel->first_variant : NULL; + sctx->num_vs_blit_sgprs = sel ? 
sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] : 0; + + if (si_update_ngg(sctx)) + si_shader_change_notify(sctx); + + si_update_common_shader_state(sctx); + si_update_vs_viewport_state(sctx); + si_set_active_descriptors_for_shader(sctx, sel); + si_update_streamout_state(sctx); + si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, + si_get_vs_state(sctx)); } static void si_update_tess_uses_prim_id(struct si_context *sctx) { - sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id = - (sctx->tes_shader.cso && - sctx->tes_shader.cso->info.uses_primid) || - (sctx->tcs_shader.cso && - sctx->tcs_shader.cso->info.uses_primid) || - (sctx->gs_shader.cso && - sctx->gs_shader.cso->info.uses_primid) || - (sctx->ps_shader.cso && !sctx->gs_shader.cso && - sctx->ps_shader.cso->info.uses_primid); + sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id = + (sctx->tes_shader.cso && sctx->tes_shader.cso->info.uses_primid) || + (sctx->tcs_shader.cso && sctx->tcs_shader.cso->info.uses_primid) || + (sctx->gs_shader.cso && sctx->gs_shader.cso->info.uses_primid) || + (sctx->ps_shader.cso && !sctx->gs_shader.cso && sctx->ps_shader.cso->info.uses_primid); } bool si_update_ngg(struct si_context *sctx) { - if (!sctx->screen->use_ngg) { - assert(!sctx->ngg); - return false; - } - - bool new_ngg = true; - - if (sctx->gs_shader.cso && sctx->tes_shader.cso && - sctx->gs_shader.cso->tess_turns_off_ngg) { - new_ngg = false; - } else if (!sctx->screen->use_ngg_streamout) { - struct si_shader_selector *last = si_get_vs(sctx)->cso; - - if ((last && last->so.num_outputs) || - sctx->streamout.prims_gen_query_enabled) - new_ngg = false; - } - - if (new_ngg != sctx->ngg) { - /* Transitioning from NGG to legacy GS requires VGT_FLUSH on Navi10-14. - * VGT_FLUSH is also emitted at the beginning of IBs when legacy GS ring - * pointers are set. - */ - if ((sctx->family == CHIP_NAVI10 || - sctx->family == CHIP_NAVI12 || - sctx->family == CHIP_NAVI14) && - !new_ngg) - sctx->flags |= SI_CONTEXT_VGT_FLUSH; - - sctx->ngg = new_ngg; - sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ - return true; - } - return false; + if (!sctx->screen->use_ngg) { + assert(!sctx->ngg); + return false; + } + + bool new_ngg = true; + + if (sctx->gs_shader.cso && sctx->tes_shader.cso && sctx->gs_shader.cso->tess_turns_off_ngg) { + new_ngg = false; + } else if (!sctx->screen->use_ngg_streamout) { + struct si_shader_selector *last = si_get_vs(sctx)->cso; + + if ((last && last->so.num_outputs) || sctx->streamout.prims_gen_query_enabled) + new_ngg = false; + } + + if (new_ngg != sctx->ngg) { + /* Transitioning from NGG to legacy GS requires VGT_FLUSH on Navi10-14. + * VGT_FLUSH is also emitted at the beginning of IBs when legacy GS ring + * pointers are set. + */ + if ((sctx->family == CHIP_NAVI10 || sctx->family == CHIP_NAVI12 || + sctx->family == CHIP_NAVI14) && + !new_ngg) + sctx->flags |= SI_CONTEXT_VGT_FLUSH; + + sctx->ngg = new_ngg; + sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ + return true; + } + return false; } static void si_bind_gs_shader(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; - struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); - struct si_shader_selector *sel = state; - bool enable_changed = !!sctx->gs_shader.cso != !!sel; - bool ngg_changed; - - if (sctx->gs_shader.cso == sel) - return; - - sctx->gs_shader.cso = sel; - sctx->gs_shader.current = sel ? 
sel->first_variant : NULL; - sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL; - - si_update_common_shader_state(sctx); - sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ - - ngg_changed = si_update_ngg(sctx); - if (ngg_changed || enable_changed) - si_shader_change_notify(sctx); - if (enable_changed) { - if (sctx->ia_multi_vgt_param_key.u.uses_tess) - si_update_tess_uses_prim_id(sctx); - } - si_update_vs_viewport_state(sctx); - si_set_active_descriptors_for_shader(sctx, sel); - si_update_streamout_state(sctx); - si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, - si_get_vs(sctx)->cso, si_get_vs_state(sctx)); + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; + struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); + struct si_shader_selector *sel = state; + bool enable_changed = !!sctx->gs_shader.cso != !!sel; + bool ngg_changed; + + if (sctx->gs_shader.cso == sel) + return; + + sctx->gs_shader.cso = sel; + sctx->gs_shader.current = sel ? sel->first_variant : NULL; + sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL; + + si_update_common_shader_state(sctx); + sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ + + ngg_changed = si_update_ngg(sctx); + if (ngg_changed || enable_changed) + si_shader_change_notify(sctx); + if (enable_changed) { + if (sctx->ia_multi_vgt_param_key.u.uses_tess) + si_update_tess_uses_prim_id(sctx); + } + si_update_vs_viewport_state(sctx); + si_set_active_descriptors_for_shader(sctx, sel); + si_update_streamout_state(sctx); + si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, + si_get_vs_state(sctx)); } static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = state; - bool enable_changed = !!sctx->tcs_shader.cso != !!sel; + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = state; + bool enable_changed = !!sctx->tcs_shader.cso != !!sel; - if (sctx->tcs_shader.cso == sel) - return; + if (sctx->tcs_shader.cso == sel) + return; - sctx->tcs_shader.cso = sel; - sctx->tcs_shader.current = sel ? sel->first_variant : NULL; - si_update_tess_uses_prim_id(sctx); + sctx->tcs_shader.cso = sel; + sctx->tcs_shader.current = sel ? sel->first_variant : NULL; + si_update_tess_uses_prim_id(sctx); - si_update_common_shader_state(sctx); + si_update_common_shader_state(sctx); - if (enable_changed) - sctx->last_tcs = NULL; /* invalidate derived tess state */ + if (enable_changed) + sctx->last_tcs = NULL; /* invalidate derived tess state */ - si_set_active_descriptors_for_shader(sctx, sel); + si_set_active_descriptors_for_shader(sctx, sel); } static void si_bind_tes_shader(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; - struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); - struct si_shader_selector *sel = state; - bool enable_changed = !!sctx->tes_shader.cso != !!sel; - - if (sctx->tes_shader.cso == sel) - return; - - sctx->tes_shader.cso = sel; - sctx->tes_shader.current = sel ? 
sel->first_variant : NULL; - sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL; - si_update_tess_uses_prim_id(sctx); - - si_update_common_shader_state(sctx); - sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ - - bool ngg_changed = si_update_ngg(sctx); - if (ngg_changed || enable_changed) - si_shader_change_notify(sctx); - if (enable_changed) - sctx->last_tes_sh_base = -1; /* invalidate derived tess state */ - si_update_vs_viewport_state(sctx); - si_set_active_descriptors_for_shader(sctx, sel); - si_update_streamout_state(sctx); - si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, - si_get_vs(sctx)->cso, si_get_vs_state(sctx)); + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; + struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); + struct si_shader_selector *sel = state; + bool enable_changed = !!sctx->tes_shader.cso != !!sel; + + if (sctx->tes_shader.cso == sel) + return; + + sctx->tes_shader.cso = sel; + sctx->tes_shader.current = sel ? sel->first_variant : NULL; + sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL; + si_update_tess_uses_prim_id(sctx); + + si_update_common_shader_state(sctx); + sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ + + bool ngg_changed = si_update_ngg(sctx); + if (ngg_changed || enable_changed) + si_shader_change_notify(sctx); + if (enable_changed) + sctx->last_tes_sh_base = -1; /* invalidate derived tess state */ + si_update_vs_viewport_state(sctx); + si_set_active_descriptors_for_shader(sctx, sel); + si_update_streamout_state(sctx); + si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, + si_get_vs_state(sctx)); } static void si_bind_ps_shader(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *old_sel = sctx->ps_shader.cso; - struct si_shader_selector *sel = state; - - /* skip if supplied shader is one already in use */ - if (old_sel == sel) - return; - - sctx->ps_shader.cso = sel; - sctx->ps_shader.current = sel ? sel->first_variant : NULL; - - si_update_common_shader_state(sctx); - if (sel) { - if (sctx->ia_multi_vgt_param_key.u.uses_tess) - si_update_tess_uses_prim_id(sctx); - - if (!old_sel || - old_sel->info.colors_written != sel->info.colors_written) - si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - - if (sctx->screen->has_out_of_order_rast && - (!old_sel || - old_sel->info.writes_memory != sel->info.writes_memory || - old_sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] != - sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - } - si_set_active_descriptors_for_shader(sctx, sel); - si_update_ps_colorbuf0_slot(sctx); + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_sel = sctx->ps_shader.cso; + struct si_shader_selector *sel = state; + + /* skip if supplied shader is one already in use */ + if (old_sel == sel) + return; + + sctx->ps_shader.cso = sel; + sctx->ps_shader.current = sel ? 
sel->first_variant : NULL; + + si_update_common_shader_state(sctx); + if (sel) { + if (sctx->ia_multi_vgt_param_key.u.uses_tess) + si_update_tess_uses_prim_id(sctx); + + if (!old_sel || old_sel->info.colors_written != sel->info.colors_written) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + + if (sctx->screen->has_out_of_order_rast && + (!old_sel || old_sel->info.writes_memory != sel->info.writes_memory || + old_sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] != + sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + } + si_set_active_descriptors_for_shader(sctx, sel); + si_update_ps_colorbuf0_slot(sctx); } static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) { - if (shader->is_optimized) { - util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority, - &shader->ready); - } - - util_queue_fence_destroy(&shader->ready); - - if (shader->pm4) { - /* If destroyed shaders were not unbound, the next compiled - * shader variant could get the same pointer address and so - * binding it to the same shader stage would be considered - * a no-op, causing random behavior. - */ - switch (shader->selector->type) { - case PIPE_SHADER_VERTEX: - if (shader->key.as_ls) { - assert(sctx->chip_class <= GFX8); - si_pm4_delete_state(sctx, ls, shader->pm4); - } else if (shader->key.as_es) { - assert(sctx->chip_class <= GFX8); - si_pm4_delete_state(sctx, es, shader->pm4); - } else if (shader->key.as_ngg) { - si_pm4_delete_state(sctx, gs, shader->pm4); - } else { - si_pm4_delete_state(sctx, vs, shader->pm4); - } - break; - case PIPE_SHADER_TESS_CTRL: - si_pm4_delete_state(sctx, hs, shader->pm4); - break; - case PIPE_SHADER_TESS_EVAL: - if (shader->key.as_es) { - assert(sctx->chip_class <= GFX8); - si_pm4_delete_state(sctx, es, shader->pm4); - } else if (shader->key.as_ngg) { - si_pm4_delete_state(sctx, gs, shader->pm4); - } else { - si_pm4_delete_state(sctx, vs, shader->pm4); - } - break; - case PIPE_SHADER_GEOMETRY: - if (shader->is_gs_copy_shader) - si_pm4_delete_state(sctx, vs, shader->pm4); - else - si_pm4_delete_state(sctx, gs, shader->pm4); - break; - case PIPE_SHADER_FRAGMENT: - si_pm4_delete_state(sctx, ps, shader->pm4); - break; - default:; - } - } - - si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL); - si_shader_destroy(shader); - free(shader); + if (shader->is_optimized) { + util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority, &shader->ready); + } + + util_queue_fence_destroy(&shader->ready); + + if (shader->pm4) { + /* If destroyed shaders were not unbound, the next compiled + * shader variant could get the same pointer address and so + * binding it to the same shader stage would be considered + * a no-op, causing random behavior. 
+ */ + switch (shader->selector->type) { + case PIPE_SHADER_VERTEX: + if (shader->key.as_ls) { + assert(sctx->chip_class <= GFX8); + si_pm4_delete_state(sctx, ls, shader->pm4); + } else if (shader->key.as_es) { + assert(sctx->chip_class <= GFX8); + si_pm4_delete_state(sctx, es, shader->pm4); + } else if (shader->key.as_ngg) { + si_pm4_delete_state(sctx, gs, shader->pm4); + } else { + si_pm4_delete_state(sctx, vs, shader->pm4); + } + break; + case PIPE_SHADER_TESS_CTRL: + si_pm4_delete_state(sctx, hs, shader->pm4); + break; + case PIPE_SHADER_TESS_EVAL: + if (shader->key.as_es) { + assert(sctx->chip_class <= GFX8); + si_pm4_delete_state(sctx, es, shader->pm4); + } else if (shader->key.as_ngg) { + si_pm4_delete_state(sctx, gs, shader->pm4); + } else { + si_pm4_delete_state(sctx, vs, shader->pm4); + } + break; + case PIPE_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + si_pm4_delete_state(sctx, vs, shader->pm4); + else + si_pm4_delete_state(sctx, gs, shader->pm4); + break; + case PIPE_SHADER_FRAGMENT: + si_pm4_delete_state(sctx, ps, shader->pm4); + break; + default:; + } + } + + si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL); + si_shader_destroy(shader); + free(shader); } static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)cso; - struct si_shader *p = sel->first_variant, *c; - struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = { - [PIPE_SHADER_VERTEX] = &sctx->vs_shader, - [PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader, - [PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader, - [PIPE_SHADER_GEOMETRY] = &sctx->gs_shader, - [PIPE_SHADER_FRAGMENT] = &sctx->ps_shader, - }; - - util_queue_drop_job(&sctx->screen->shader_compiler_queue, &sel->ready); - - if (current_shader[sel->type]->cso == sel) { - current_shader[sel->type]->cso = NULL; - current_shader[sel->type]->current = NULL; - } - - while (p) { - c = p->next_variant; - si_delete_shader(sctx, p); - p = c; - } - - if (sel->main_shader_part) - si_delete_shader(sctx, sel->main_shader_part); - if (sel->main_shader_part_ls) - si_delete_shader(sctx, sel->main_shader_part_ls); - if (sel->main_shader_part_es) - si_delete_shader(sctx, sel->main_shader_part_es); - if (sel->main_shader_part_ngg) - si_delete_shader(sctx, sel->main_shader_part_ngg); - if (sel->gs_copy_shader) - si_delete_shader(sctx, sel->gs_copy_shader); - - util_queue_fence_destroy(&sel->ready); - simple_mtx_destroy(&sel->mutex); - ralloc_free(sel->nir); - free(sel->nir_binary); - free(sel); + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = (struct si_shader_selector *)cso; + struct si_shader *p = sel->first_variant, *c; + struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = { + [PIPE_SHADER_VERTEX] = &sctx->vs_shader, [PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader, + [PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader, [PIPE_SHADER_GEOMETRY] = &sctx->gs_shader, + [PIPE_SHADER_FRAGMENT] = &sctx->ps_shader, + }; + + util_queue_drop_job(&sctx->screen->shader_compiler_queue, &sel->ready); + + if (current_shader[sel->type]->cso == sel) { + current_shader[sel->type]->cso = NULL; + current_shader[sel->type]->current = NULL; + } + + while (p) { + c = p->next_variant; + si_delete_shader(sctx, p); + p = c; + } + + if (sel->main_shader_part) + si_delete_shader(sctx, sel->main_shader_part); + if (sel->main_shader_part_ls) + si_delete_shader(sctx, sel->main_shader_part_ls); + if 
(sel->main_shader_part_es) + si_delete_shader(sctx, sel->main_shader_part_es); + if (sel->main_shader_part_ngg) + si_delete_shader(sctx, sel->main_shader_part_ngg); + if (sel->gs_copy_shader) + si_delete_shader(sctx, sel->gs_copy_shader); + + util_queue_fence_destroy(&sel->ready); + simple_mtx_destroy(&sel->mutex); + ralloc_free(sel->nir); + free(sel->nir_binary); + free(sel); } static void si_delete_shader_selector(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)state; - - si_shader_selector_reference(sctx, &sel, NULL); -} - -static unsigned si_get_ps_input_cntl(struct si_context *sctx, - struct si_shader *vs, unsigned name, - unsigned index, unsigned interpolate) -{ - struct si_shader_info *vsinfo = &vs->selector->info; - unsigned j, offset, ps_input_cntl = 0; - - if (interpolate == TGSI_INTERPOLATE_CONSTANT || - (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade) || - name == TGSI_SEMANTIC_PRIMID) - ps_input_cntl |= S_028644_FLAT_SHADE(1); - - if (name == TGSI_SEMANTIC_PCOORD || - (name == TGSI_SEMANTIC_TEXCOORD && - sctx->sprite_coord_enable & (1 << index))) { - ps_input_cntl |= S_028644_PT_SPRITE_TEX(1); - } - - for (j = 0; j < vsinfo->num_outputs; j++) { - if (name == vsinfo->output_semantic_name[j] && - index == vsinfo->output_semantic_index[j]) { - offset = vs->info.vs_output_param_offset[j]; - - if (offset <= AC_EXP_PARAM_OFFSET_31) { - /* The input is loaded from parameter memory. */ - ps_input_cntl |= S_028644_OFFSET(offset); - } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - if (offset == AC_EXP_PARAM_UNDEFINED) { - /* This can happen with depth-only rendering. */ - offset = 0; - } else { - /* The input is a DEFAULT_VAL constant. */ - assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && - offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); - offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; - } - - ps_input_cntl = S_028644_OFFSET(0x20) | - S_028644_DEFAULT_VAL(offset); - } - break; - } - } - - if (j == vsinfo->num_outputs && name == TGSI_SEMANTIC_PRIMID) - /* PrimID is written after the last output when HW VS is used. */ - ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]); - else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - /* No corresponding output found, load defaults into input. - * Don't set any other bits. - * (FLAT_SHADE=1 completely changes behavior) */ - ps_input_cntl = S_028644_OFFSET(0x20); - /* D3D 9 behaviour. 
GL is undefined */ - if (name == TGSI_SEMANTIC_COLOR && index == 0) - ps_input_cntl |= S_028644_DEFAULT_VAL(3); - } - return ps_input_cntl; + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = (struct si_shader_selector *)state; + + si_shader_selector_reference(sctx, &sel, NULL); +} + +static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader *vs, unsigned name, + unsigned index, unsigned interpolate) +{ + struct si_shader_info *vsinfo = &vs->selector->info; + unsigned j, offset, ps_input_cntl = 0; + + if (interpolate == TGSI_INTERPOLATE_CONSTANT || + (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade) || name == TGSI_SEMANTIC_PRIMID) + ps_input_cntl |= S_028644_FLAT_SHADE(1); + + if (name == TGSI_SEMANTIC_PCOORD || + (name == TGSI_SEMANTIC_TEXCOORD && sctx->sprite_coord_enable & (1 << index))) { + ps_input_cntl |= S_028644_PT_SPRITE_TEX(1); + } + + for (j = 0; j < vsinfo->num_outputs; j++) { + if (name == vsinfo->output_semantic_name[j] && index == vsinfo->output_semantic_index[j]) { + offset = vs->info.vs_output_param_offset[j]; + + if (offset <= AC_EXP_PARAM_OFFSET_31) { + /* The input is loaded from parameter memory. */ + ps_input_cntl |= S_028644_OFFSET(offset); + } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { + if (offset == AC_EXP_PARAM_UNDEFINED) { + /* This can happen with depth-only rendering. */ + offset = 0; + } else { + /* The input is a DEFAULT_VAL constant. */ + assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && + offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); + offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; + } + + ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset); + } + break; + } + } + + if (j == vsinfo->num_outputs && name == TGSI_SEMANTIC_PRIMID) + /* PrimID is written after the last output when HW VS is used. */ + ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]); + else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) { + /* No corresponding output found, load defaults into input. + * Don't set any other bits. + * (FLAT_SHADE=1 completely changes behavior) */ + ps_input_cntl = S_028644_OFFSET(0x20); + /* D3D 9 behaviour. GL is undefined */ + if (name == TGSI_SEMANTIC_COLOR && index == 0) + ps_input_cntl |= S_028644_DEFAULT_VAL(3); + } + return ps_input_cntl; } static void si_emit_spi_map(struct si_context *sctx) { - struct si_shader *ps = sctx->ps_shader.current; - struct si_shader *vs = si_get_vs_state(sctx); - struct si_shader_info *psinfo = ps ? 
&ps->selector->info : NULL; - unsigned i, num_interp, num_written = 0, bcol_interp[2]; - unsigned spi_ps_input_cntl[32]; - - if (!ps || !ps->selector->info.num_inputs) - return; - - num_interp = si_get_ps_num_interp(ps); - assert(num_interp > 0); - - for (i = 0; i < psinfo->num_inputs; i++) { - unsigned name = psinfo->input_semantic_name[i]; - unsigned index = psinfo->input_semantic_index[i]; - unsigned interpolate = psinfo->input_interpolate[i]; - - spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, name, - index, interpolate); - - if (name == TGSI_SEMANTIC_COLOR) { - assert(index < ARRAY_SIZE(bcol_interp)); - bcol_interp[index] = interpolate; - } - } - - if (ps->key.part.ps.prolog.color_two_side) { - unsigned bcol = TGSI_SEMANTIC_BCOLOR; - - for (i = 0; i < 2; i++) { - if (!(psinfo->colors_read & (0xf << (i * 4)))) - continue; - - spi_ps_input_cntl[num_written++] = - si_get_ps_input_cntl(sctx, vs, bcol, i, bcol_interp[i]); - - } - } - assert(num_interp == num_written); - - /* R_028644_SPI_PS_INPUT_CNTL_0 */ - /* Dota 2: Only ~16% of SPI map updates set different values. */ - /* Talos: Only ~9% of SPI map updates set different values. */ - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, - spi_ps_input_cntl, - sctx->tracked_regs.spi_ps_input_cntl, num_interp); - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; + struct si_shader *ps = sctx->ps_shader.current; + struct si_shader *vs = si_get_vs_state(sctx); + struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL; + unsigned i, num_interp, num_written = 0, bcol_interp[2]; + unsigned spi_ps_input_cntl[32]; + + if (!ps || !ps->selector->info.num_inputs) + return; + + num_interp = si_get_ps_num_interp(ps); + assert(num_interp > 0); + + for (i = 0; i < psinfo->num_inputs; i++) { + unsigned name = psinfo->input_semantic_name[i]; + unsigned index = psinfo->input_semantic_index[i]; + unsigned interpolate = psinfo->input_interpolate[i]; + + spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, name, index, interpolate); + + if (name == TGSI_SEMANTIC_COLOR) { + assert(index < ARRAY_SIZE(bcol_interp)); + bcol_interp[index] = interpolate; + } + } + + if (ps->key.part.ps.prolog.color_two_side) { + unsigned bcol = TGSI_SEMANTIC_BCOLOR; + + for (i = 0; i < 2; i++) { + if (!(psinfo->colors_read & (0xf << (i * 4)))) + continue; + + spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, bcol, i, bcol_interp[i]); + } + } + assert(num_interp == num_written); + + /* R_028644_SPI_PS_INPUT_CNTL_0 */ + /* Dota 2: Only ~16% of SPI map updates set different values. */ + /* Talos: Only ~9% of SPI map updates set different values. */ + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, + sctx->tracked_regs.spi_ps_input_cntl, num_interp); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; } /** @@ -3535,169 +3278,150 @@ static void si_emit_spi_map(struct si_context *sctx) */ static void si_init_config_add_vgt_flush(struct si_context *sctx) { - if (sctx->init_config_has_vgt_flush) - return; - - /* Done by Vulkan before VGT_FLUSH. */ - si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE); - si_pm4_cmd_add(sctx->init_config, - EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - si_pm4_cmd_end(sctx->init_config, false); - - /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. 
*/ - si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE); - si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); - si_pm4_cmd_end(sctx->init_config, false); - sctx->init_config_has_vgt_flush = true; + if (sctx->init_config_has_vgt_flush) + return; + + /* Done by Vulkan before VGT_FLUSH. */ + si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE); + si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + si_pm4_cmd_end(sctx->init_config, false); + + /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */ + si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE); + si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + si_pm4_cmd_end(sctx->init_config, false); + sctx->init_config_has_vgt_flush = true; } /* Initialize state related to ESGS / GSVS ring buffers */ static bool si_update_gs_ring_buffers(struct si_context *sctx) { - struct si_shader_selector *es = - sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso; - struct si_shader_selector *gs = sctx->gs_shader.cso; - struct si_pm4_state *pm4; - - /* Chip constants. */ - unsigned num_se = sctx->screen->info.max_se; - unsigned wave_size = 64; - unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */ - /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16. - * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2). - */ - unsigned gs_vertex_reuse = (sctx->chip_class >= GFX8 ? 32 : 16) * num_se; - unsigned alignment = 256 * num_se; - /* The maximum size is 63.999 MB per SE. */ - unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se; - - /* Calculate the minimum size. */ - unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse * - wave_size, alignment); - - /* These are recommended sizes, not minimum sizes. */ - unsigned esgs_ring_size = max_gs_waves * 2 * wave_size * - es->esgs_itemsize * gs->gs_input_verts_per_prim; - unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * - gs->max_gsvs_emit_size; - - min_esgs_ring_size = align(min_esgs_ring_size, alignment); - esgs_ring_size = align(esgs_ring_size, alignment); - gsvs_ring_size = align(gsvs_ring_size, alignment); - - esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size); - gsvs_ring_size = MIN2(gsvs_ring_size, max_size); - - /* Some rings don't have to be allocated if shaders don't use them. - * (e.g. no varyings between ES and GS or GS and VS) - * - * GFX9 doesn't have the ESGS ring. - */ - bool update_esgs = sctx->chip_class <= GFX8 && - esgs_ring_size && - (!sctx->esgs_ring || - sctx->esgs_ring->width0 < esgs_ring_size); - bool update_gsvs = gsvs_ring_size && - (!sctx->gsvs_ring || - sctx->gsvs_ring->width0 < gsvs_ring_size); - - if (!update_esgs && !update_gsvs) - return true; - - if (update_esgs) { - pipe_resource_reference(&sctx->esgs_ring, NULL); - sctx->esgs_ring = - pipe_aligned_buffer_create(sctx->b.screen, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - esgs_ring_size, - sctx->screen->info.pte_fragment_size); - if (!sctx->esgs_ring) - return false; - } - - if (update_gsvs) { - pipe_resource_reference(&sctx->gsvs_ring, NULL); - sctx->gsvs_ring = - pipe_aligned_buffer_create(sctx->b.screen, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - gsvs_ring_size, - sctx->screen->info.pte_fragment_size); - if (!sctx->gsvs_ring) - return false; - } - - /* Create the "init_config_gs_rings" state. 
*/ - pm4 = CALLOC_STRUCT(si_pm4_state); - if (!pm4) - return false; - - if (sctx->chip_class >= GFX7) { - if (sctx->esgs_ring) { - assert(sctx->chip_class <= GFX8); - si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE, - sctx->esgs_ring->width0 / 256); - } - if (sctx->gsvs_ring) - si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE, - sctx->gsvs_ring->width0 / 256); - } else { - if (sctx->esgs_ring) - si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE, - sctx->esgs_ring->width0 / 256); - if (sctx->gsvs_ring) - si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE, - sctx->gsvs_ring->width0 / 256); - } - - /* Set the state. */ - if (sctx->init_config_gs_rings) - si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0); - sctx->init_config_gs_rings = pm4; - - if (!sctx->init_config_has_vgt_flush) { - si_init_config_add_vgt_flush(sctx); - si_pm4_upload_indirect_buffer(sctx, sctx->init_config); - } - - /* Flush the context to re-emit both init_config states. */ - sctx->initial_gfx_cs_size = 0; /* force flush */ - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - - /* Set ring bindings. */ - if (sctx->esgs_ring) { - assert(sctx->chip_class <= GFX8); - si_set_ring_buffer(sctx, SI_ES_RING_ESGS, - sctx->esgs_ring, 0, sctx->esgs_ring->width0, - true, true, 4, 64, 0); - si_set_ring_buffer(sctx, SI_GS_RING_ESGS, - sctx->esgs_ring, 0, sctx->esgs_ring->width0, - false, false, 0, 0, 0); - } - if (sctx->gsvs_ring) { - si_set_ring_buffer(sctx, SI_RING_GSVS, - sctx->gsvs_ring, 0, sctx->gsvs_ring->width0, - false, false, 0, 0, 0); - } - - return true; + struct si_shader_selector *es = + sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso; + struct si_shader_selector *gs = sctx->gs_shader.cso; + struct si_pm4_state *pm4; + + /* Chip constants. */ + unsigned num_se = sctx->screen->info.max_se; + unsigned wave_size = 64; + unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */ + /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16. + * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2). + */ + unsigned gs_vertex_reuse = (sctx->chip_class >= GFX8 ? 32 : 16) * num_se; + unsigned alignment = 256 * num_se; + /* The maximum size is 63.999 MB per SE. */ + unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se; + + /* Calculate the minimum size. */ + unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse * wave_size, alignment); + + /* These are recommended sizes, not minimum sizes. */ + unsigned esgs_ring_size = + max_gs_waves * 2 * wave_size * es->esgs_itemsize * gs->gs_input_verts_per_prim; + unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs->max_gsvs_emit_size; + + min_esgs_ring_size = align(min_esgs_ring_size, alignment); + esgs_ring_size = align(esgs_ring_size, alignment); + gsvs_ring_size = align(gsvs_ring_size, alignment); + + esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size); + gsvs_ring_size = MIN2(gsvs_ring_size, max_size); + + /* Some rings don't have to be allocated if shaders don't use them. + * (e.g. no varyings between ES and GS or GS and VS) + * + * GFX9 doesn't have the ESGS ring. 
+ */ + bool update_esgs = sctx->chip_class <= GFX8 && esgs_ring_size && + (!sctx->esgs_ring || sctx->esgs_ring->width0 < esgs_ring_size); + bool update_gsvs = + gsvs_ring_size && (!sctx->gsvs_ring || sctx->gsvs_ring->width0 < gsvs_ring_size); + + if (!update_esgs && !update_gsvs) + return true; + + if (update_esgs) { + pipe_resource_reference(&sctx->esgs_ring, NULL); + sctx->esgs_ring = + pipe_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, + esgs_ring_size, sctx->screen->info.pte_fragment_size); + if (!sctx->esgs_ring) + return false; + } + + if (update_gsvs) { + pipe_resource_reference(&sctx->gsvs_ring, NULL); + sctx->gsvs_ring = + pipe_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, + gsvs_ring_size, sctx->screen->info.pte_fragment_size); + if (!sctx->gsvs_ring) + return false; + } + + /* Create the "init_config_gs_rings" state. */ + pm4 = CALLOC_STRUCT(si_pm4_state); + if (!pm4) + return false; + + if (sctx->chip_class >= GFX7) { + if (sctx->esgs_ring) { + assert(sctx->chip_class <= GFX8); + si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256); + } + if (sctx->gsvs_ring) + si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256); + } else { + if (sctx->esgs_ring) + si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256); + if (sctx->gsvs_ring) + si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256); + } + + /* Set the state. */ + if (sctx->init_config_gs_rings) + si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0); + sctx->init_config_gs_rings = pm4; + + if (!sctx->init_config_has_vgt_flush) { + si_init_config_add_vgt_flush(sctx); + si_pm4_upload_indirect_buffer(sctx, sctx->init_config); + } + + /* Flush the context to re-emit both init_config states. */ + sctx->initial_gfx_cs_size = 0; /* force flush */ + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + + /* Set ring bindings. 
*/ + if (sctx->esgs_ring) { + assert(sctx->chip_class <= GFX8); + si_set_ring_buffer(sctx, SI_ES_RING_ESGS, sctx->esgs_ring, 0, sctx->esgs_ring->width0, true, + true, 4, 64, 0); + si_set_ring_buffer(sctx, SI_GS_RING_ESGS, sctx->esgs_ring, 0, sctx->esgs_ring->width0, false, + false, 0, 0, 0); + } + if (sctx->gsvs_ring) { + si_set_ring_buffer(sctx, SI_RING_GSVS, sctx->gsvs_ring, 0, sctx->gsvs_ring->width0, false, + false, 0, 0, 0); + } + + return true; } static void si_shader_lock(struct si_shader *shader) { - simple_mtx_lock(&shader->selector->mutex); - if (shader->previous_stage_sel) { - assert(shader->previous_stage_sel != shader->selector); - simple_mtx_lock(&shader->previous_stage_sel->mutex); - } + simple_mtx_lock(&shader->selector->mutex); + if (shader->previous_stage_sel) { + assert(shader->previous_stage_sel != shader->selector); + simple_mtx_lock(&shader->previous_stage_sel->mutex); + } } static void si_shader_unlock(struct si_shader *shader) { - if (shader->previous_stage_sel) - simple_mtx_unlock(&shader->previous_stage_sel->mutex); - simple_mtx_unlock(&shader->selector->mutex); + if (shader->previous_stage_sel) + simple_mtx_unlock(&shader->previous_stage_sel->mutex); + simple_mtx_unlock(&shader->selector->mutex); } /** @@ -3705,578 +3429,545 @@ static void si_shader_unlock(struct si_shader *shader) * 0 if not * < 0 if there was a failure */ -static int si_update_scratch_buffer(struct si_context *sctx, - struct si_shader *shader) +static int si_update_scratch_buffer(struct si_context *sctx, struct si_shader *shader) { - uint64_t scratch_va = sctx->scratch_buffer->gpu_address; + uint64_t scratch_va = sctx->scratch_buffer->gpu_address; - if (!shader) - return 0; + if (!shader) + return 0; - /* This shader doesn't need a scratch buffer */ - if (shader->config.scratch_bytes_per_wave == 0) - return 0; + /* This shader doesn't need a scratch buffer */ + if (shader->config.scratch_bytes_per_wave == 0) + return 0; - /* Prevent race conditions when updating: - * - si_shader::scratch_bo - * - si_shader::binary::code - * - si_shader::previous_stage::binary::code. - */ - si_shader_lock(shader); + /* Prevent race conditions when updating: + * - si_shader::scratch_bo + * - si_shader::binary::code + * - si_shader::previous_stage::binary::code. + */ + si_shader_lock(shader); - /* This shader is already configured to use the current - * scratch buffer. */ - if (shader->scratch_bo == sctx->scratch_buffer) { - si_shader_unlock(shader); - return 0; - } + /* This shader is already configured to use the current + * scratch buffer. */ + if (shader->scratch_bo == sctx->scratch_buffer) { + si_shader_unlock(shader); + return 0; + } - assert(sctx->scratch_buffer); + assert(sctx->scratch_buffer); - /* Replace the shader bo with a new bo that has the relocs applied. */ - if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) { - si_shader_unlock(shader); - return -1; - } + /* Replace the shader bo with a new bo that has the relocs applied. */ + if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) { + si_shader_unlock(shader); + return -1; + } - /* Update the shader state to use the new shader bo. */ - si_shader_init_pm4_state(sctx->screen, shader); + /* Update the shader state to use the new shader bo. 
*/ + si_shader_init_pm4_state(sctx->screen, shader); - si_resource_reference(&shader->scratch_bo, sctx->scratch_buffer); + si_resource_reference(&shader->scratch_bo, sctx->scratch_buffer); - si_shader_unlock(shader); - return 1; + si_shader_unlock(shader); + return 1; } static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader) { - return shader ? shader->config.scratch_bytes_per_wave : 0; + return shader ? shader->config.scratch_bytes_per_wave : 0; } static struct si_shader *si_get_tcs_current(struct si_context *sctx) { - if (!sctx->tes_shader.cso) - return NULL; /* tessellation disabled */ + if (!sctx->tes_shader.cso) + return NULL; /* tessellation disabled */ - return sctx->tcs_shader.cso ? sctx->tcs_shader.current : - sctx->fixed_func_tcs_shader.current; + return sctx->tcs_shader.cso ? sctx->tcs_shader.current : sctx->fixed_func_tcs_shader.current; } static bool si_update_scratch_relocs(struct si_context *sctx) { - struct si_shader *tcs = si_get_tcs_current(sctx); - int r; - - /* Update the shaders, so that they are using the latest scratch. - * The scratch buffer may have been changed since these shaders were - * last used, so we still need to try to update them, even if they - * require scratch buffers smaller than the current size. - */ - r = si_update_scratch_buffer(sctx, sctx->ps_shader.current); - if (r < 0) - return false; - if (r == 1) - si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4); - - r = si_update_scratch_buffer(sctx, sctx->gs_shader.current); - if (r < 0) - return false; - if (r == 1) - si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4); - - r = si_update_scratch_buffer(sctx, tcs); - if (r < 0) - return false; - if (r == 1) - si_pm4_bind_state(sctx, hs, tcs->pm4); - - /* VS can be bound as LS, ES, or VS. */ - r = si_update_scratch_buffer(sctx, sctx->vs_shader.current); - if (r < 0) - return false; - if (r == 1) { - if (sctx->vs_shader.current->key.as_ls) - si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4); - else if (sctx->vs_shader.current->key.as_es) - si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4); - else if (sctx->vs_shader.current->key.as_ngg) - si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4); - else - si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4); - } - - /* TES can be bound as ES or VS. */ - r = si_update_scratch_buffer(sctx, sctx->tes_shader.current); - if (r < 0) - return false; - if (r == 1) { - if (sctx->tes_shader.current->key.as_es) - si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4); - else if (sctx->tes_shader.current->key.as_ngg) - si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4); - else - si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4); - } - - return true; + struct si_shader *tcs = si_get_tcs_current(sctx); + int r; + + /* Update the shaders, so that they are using the latest scratch. + * The scratch buffer may have been changed since these shaders were + * last used, so we still need to try to update them, even if they + * require scratch buffers smaller than the current size. 
+ */ + r = si_update_scratch_buffer(sctx, sctx->ps_shader.current); + if (r < 0) + return false; + if (r == 1) + si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4); + + r = si_update_scratch_buffer(sctx, sctx->gs_shader.current); + if (r < 0) + return false; + if (r == 1) + si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4); + + r = si_update_scratch_buffer(sctx, tcs); + if (r < 0) + return false; + if (r == 1) + si_pm4_bind_state(sctx, hs, tcs->pm4); + + /* VS can be bound as LS, ES, or VS. */ + r = si_update_scratch_buffer(sctx, sctx->vs_shader.current); + if (r < 0) + return false; + if (r == 1) { + if (sctx->vs_shader.current->key.as_ls) + si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4); + else if (sctx->vs_shader.current->key.as_es) + si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4); + else if (sctx->vs_shader.current->key.as_ngg) + si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4); + else + si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4); + } + + /* TES can be bound as ES or VS. */ + r = si_update_scratch_buffer(sctx, sctx->tes_shader.current); + if (r < 0) + return false; + if (r == 1) { + if (sctx->tes_shader.current->key.as_es) + si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4); + else if (sctx->tes_shader.current->key.as_ngg) + si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4); + else + si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4); + } + + return true; } static bool si_update_spi_tmpring_size(struct si_context *sctx) { - /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer. - * There are 2 cases to handle: - * - * - If the current needed size is less than the maximum seen size, - * use the maximum seen size, so that WAVESIZE remains the same. - * - * - If the current needed size is greater than the maximum seen size, - * the scratch buffer is reallocated, so we can increase WAVESIZE. - * - * Shaders that set SCRATCH_EN=0 don't allocate scratch space. - * Otherwise, the number of waves that can use scratch is - * SPI_TMPRING_SIZE.WAVES. - */ - unsigned bytes = 0; - - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current)); - - if (sctx->tes_shader.cso) { - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx))); - } - - sctx->max_seen_scratch_bytes_per_wave = - MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes); - - unsigned scratch_needed_size = - sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves; - unsigned spi_tmpring_size; - - if (scratch_needed_size > 0) { - if (!sctx->scratch_buffer || - scratch_needed_size > sctx->scratch_buffer->b.b.width0) { - /* Create a bigger scratch buffer */ - si_resource_reference(&sctx->scratch_buffer, NULL); - - sctx->scratch_buffer = - si_aligned_buffer_create(&sctx->screen->b, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - scratch_needed_size, - sctx->screen->info.pte_fragment_size); - if (!sctx->scratch_buffer) - return false; - - si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); - si_context_add_resource_size(sctx, - &sctx->scratch_buffer->b.b); - } - - if (!si_update_scratch_relocs(sctx)) - return false; - } - - /* The LLVM shader backend should be reporting aligned scratch_sizes. 
*/ - assert((scratch_needed_size & ~0x3FF) == scratch_needed_size && - "scratch size should already be aligned correctly."); - - spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) | - S_0286E8_WAVESIZE(sctx->max_seen_scratch_bytes_per_wave >> 10); - if (spi_tmpring_size != sctx->spi_tmpring_size) { - sctx->spi_tmpring_size = spi_tmpring_size; - si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); - } - return true; + /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer. + * There are 2 cases to handle: + * + * - If the current needed size is less than the maximum seen size, + * use the maximum seen size, so that WAVESIZE remains the same. + * + * - If the current needed size is greater than the maximum seen size, + * the scratch buffer is reallocated, so we can increase WAVESIZE. + * + * Shaders that set SCRATCH_EN=0 don't allocate scratch space. + * Otherwise, the number of waves that can use scratch is + * SPI_TMPRING_SIZE.WAVES. + */ + unsigned bytes = 0; + + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current)); + + if (sctx->tes_shader.cso) { + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx))); + } + + sctx->max_seen_scratch_bytes_per_wave = MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes); + + unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves; + unsigned spi_tmpring_size; + + if (scratch_needed_size > 0) { + if (!sctx->scratch_buffer || scratch_needed_size > sctx->scratch_buffer->b.b.width0) { + /* Create a bigger scratch buffer */ + si_resource_reference(&sctx->scratch_buffer, NULL); + + sctx->scratch_buffer = si_aligned_buffer_create( + &sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, scratch_needed_size, + sctx->screen->info.pte_fragment_size); + if (!sctx->scratch_buffer) + return false; + + si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); + si_context_add_resource_size(sctx, &sctx->scratch_buffer->b.b); + } + + if (!si_update_scratch_relocs(sctx)) + return false; + } + + /* The LLVM shader backend should be reporting aligned scratch_sizes. */ + assert((scratch_needed_size & ~0x3FF) == scratch_needed_size && + "scratch size should already be aligned correctly."); + + spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) | + S_0286E8_WAVESIZE(sctx->max_seen_scratch_bytes_per_wave >> 10); + if (spi_tmpring_size != sctx->spi_tmpring_size) { + sctx->spi_tmpring_size = spi_tmpring_size; + si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); + } + return true; } static void si_init_tess_factor_ring(struct si_context *sctx) { - assert(!sctx->tess_rings); - assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0); - - /* The address must be aligned to 2^19, because the shader only - * receives the high 13 bits. 
- */ - sctx->tess_rings = pipe_aligned_buffer_create(sctx->b.screen, - SI_RESOURCE_FLAG_32BIT, - PIPE_USAGE_DEFAULT, - sctx->screen->tess_offchip_ring_size + - sctx->screen->tess_factor_ring_size, - 1 << 19); - if (!sctx->tess_rings) - return; - - si_init_config_add_vgt_flush(sctx); - - si_pm4_add_bo(sctx->init_config, si_resource(sctx->tess_rings), - RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS); - - uint64_t factor_va = si_resource(sctx->tess_rings)->gpu_address + - sctx->screen->tess_offchip_ring_size; - - /* Append these registers to the init config state. */ - if (sctx->chip_class >= GFX7) { - si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE, - S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4)); - si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE, - factor_va >> 8); - if (sctx->chip_class >= GFX10) - si_pm4_set_reg(sctx->init_config, R_030984_VGT_TF_MEMORY_BASE_HI_UMD, - S_030984_BASE_HI(factor_va >> 40)); - else if (sctx->chip_class == GFX9) - si_pm4_set_reg(sctx->init_config, R_030944_VGT_TF_MEMORY_BASE_HI, - S_030944_BASE_HI(factor_va >> 40)); - si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM, - sctx->screen->vgt_hs_offchip_param); - } else { - si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE, - S_008988_SIZE(sctx->screen->tess_factor_ring_size / 4)); - si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE, - factor_va >> 8); - si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM, - sctx->screen->vgt_hs_offchip_param); - } - - /* Flush the context to re-emit the init_config state. - * This is done only once in a lifetime of a context. - */ - si_pm4_upload_indirect_buffer(sctx, sctx->init_config); - sctx->initial_gfx_cs_size = 0; /* force flush */ - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + assert(!sctx->tess_rings); + assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0); + + /* The address must be aligned to 2^19, because the shader only + * receives the high 13 bits. + */ + sctx->tess_rings = pipe_aligned_buffer_create( + sctx->b.screen, SI_RESOURCE_FLAG_32BIT, PIPE_USAGE_DEFAULT, + sctx->screen->tess_offchip_ring_size + sctx->screen->tess_factor_ring_size, 1 << 19); + if (!sctx->tess_rings) + return; + + si_init_config_add_vgt_flush(sctx); + + si_pm4_add_bo(sctx->init_config, si_resource(sctx->tess_rings), RADEON_USAGE_READWRITE, + RADEON_PRIO_SHADER_RINGS); + + uint64_t factor_va = + si_resource(sctx->tess_rings)->gpu_address + sctx->screen->tess_offchip_ring_size; + + /* Append these registers to the init config state. 
*/ + if (sctx->chip_class >= GFX7) { + si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE, + S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4)); + si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8); + if (sctx->chip_class >= GFX10) + si_pm4_set_reg(sctx->init_config, R_030984_VGT_TF_MEMORY_BASE_HI_UMD, + S_030984_BASE_HI(factor_va >> 40)); + else if (sctx->chip_class == GFX9) + si_pm4_set_reg(sctx->init_config, R_030944_VGT_TF_MEMORY_BASE_HI, + S_030944_BASE_HI(factor_va >> 40)); + si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM, + sctx->screen->vgt_hs_offchip_param); + } else { + si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE, + S_008988_SIZE(sctx->screen->tess_factor_ring_size / 4)); + si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE, factor_va >> 8); + si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM, + sctx->screen->vgt_hs_offchip_param); + } + + /* Flush the context to re-emit the init_config state. + * This is done only once in a lifetime of a context. + */ + si_pm4_upload_indirect_buffer(sctx, sctx->init_config); + sctx->initial_gfx_cs_size = 0; /* force flush */ + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); } static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, - union si_vgt_stages_key key) + union si_vgt_stages_key key) { - struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); - uint32_t stages = 0; - - if (key.u.tess) { - stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | - S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1); - - if (key.u.gs) - stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | - S_028B54_GS_EN(1); - else if (key.u.ngg) - stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS); - else - stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS); - } else if (key.u.gs) { - stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | - S_028B54_GS_EN(1); - } else if (key.u.ngg) { - stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL); - } - - if (key.u.ngg) { - stages |= S_028B54_PRIMGEN_EN(1) | - S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) | - S_028B54_NGG_WAVE_ID_EN(key.u.streamout) | - S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough); - } else if (key.u.gs) - stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); - - if (screen->info.chip_class >= GFX9) - stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2); - - if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) { - stages |= S_028B54_HS_W32_EN(1) | - S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */ - S_028B54_VS_W32_EN(1); - } - - si_pm4_set_reg(pm4, R_028B54_VGT_SHADER_STAGES_EN, stages); - return pm4; + struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); + uint32_t stages = 0; + + if (key.u.tess) { + stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1); + + if (key.u.gs) + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | S_028B54_GS_EN(1); + else if (key.u.ngg) + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS); + else + stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS); + } else if (key.u.gs) { + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | S_028B54_GS_EN(1); + } else if (key.u.ngg) { + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL); + } + + if (key.u.ngg) { + stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) | + S_028B54_NGG_WAVE_ID_EN(key.u.streamout) | + S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough); + } else if (key.u.gs) + stages |= 
S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); + + if (screen->info.chip_class >= GFX9) + stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2); + + if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) { + stages |= S_028B54_HS_W32_EN(1) | + S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */ + S_028B54_VS_W32_EN(1); + } + + si_pm4_set_reg(pm4, R_028B54_VGT_SHADER_STAGES_EN, stages); + return pm4; } -static void si_update_vgt_shader_config(struct si_context *sctx, - union si_vgt_stages_key key) +static void si_update_vgt_shader_config(struct si_context *sctx, union si_vgt_stages_key key) { - struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index]; + struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index]; - if (unlikely(!*pm4)) - *pm4 = si_build_vgt_shader_config(sctx->screen, key); - si_pm4_bind_state(sctx, vgt_shader_config, *pm4); + if (unlikely(!*pm4)) + *pm4 = si_build_vgt_shader_config(sctx->screen, key); + si_pm4_bind_state(sctx, vgt_shader_config, *pm4); } bool si_update_shaders(struct si_context *sctx) { - struct pipe_context *ctx = (struct pipe_context*)sctx; - struct si_compiler_ctx_state compiler_state; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_shader *old_vs = si_get_vs_state(sctx); - bool old_clip_disable = old_vs ? old_vs->key.opt.clip_disable : false; - struct si_shader *old_ps = sctx->ps_shader.current; - union si_vgt_stages_key key; - unsigned old_spi_shader_col_format = - old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0; - int r; - - if (!sctx->compiler.passes) - si_init_compiler(sctx->screen, &sctx->compiler); - - compiler_state.compiler = &sctx->compiler; - compiler_state.debug = sctx->debug; - compiler_state.is_debug_context = sctx->is_debug; - - key.index = 0; - - if (sctx->tes_shader.cso) - key.u.tess = 1; - if (sctx->gs_shader.cso) - key.u.gs = 1; - - if (sctx->ngg) { - key.u.ngg = 1; - key.u.streamout = !!si_get_vs(sctx)->cso->so.num_outputs; - } - - /* Update TCS and TES. */ - if (sctx->tes_shader.cso) { - if (!sctx->tess_rings) { - si_init_tess_factor_ring(sctx); - if (!sctx->tess_rings) - return false; - } - - if (sctx->tcs_shader.cso) { - r = si_shader_select(ctx, &sctx->tcs_shader, key, - &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4); - } else { - if (!sctx->fixed_func_tcs_shader.cso) { - sctx->fixed_func_tcs_shader.cso = - si_create_fixed_func_tcs(sctx); - if (!sctx->fixed_func_tcs_shader.cso) - return false; - } - - r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader, - key, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, hs, - sctx->fixed_func_tcs_shader.current->pm4); - } - - if (!sctx->gs_shader.cso || sctx->chip_class <= GFX8) { - r = si_shader_select(ctx, &sctx->tes_shader, key, &compiler_state); - if (r) - return false; - - if (sctx->gs_shader.cso) { - /* TES as ES */ - assert(sctx->chip_class <= GFX8); - si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4); - } else if (key.u.ngg) { - si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4); - } else { - si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4); - } - } - } else { - if (sctx->chip_class <= GFX8) - si_pm4_bind_state(sctx, ls, NULL); - si_pm4_bind_state(sctx, hs, NULL); - } - - /* Update GS. 
*/ - if (sctx->gs_shader.cso) { - r = si_shader_select(ctx, &sctx->gs_shader, key, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4); - if (!key.u.ngg) { - si_pm4_bind_state(sctx, vs, sctx->gs_shader.cso->gs_copy_shader->pm4); - - if (!si_update_gs_ring_buffers(sctx)) - return false; - } else { - si_pm4_bind_state(sctx, vs, NULL); - } - } else { - if (!key.u.ngg) { - si_pm4_bind_state(sctx, gs, NULL); - if (sctx->chip_class <= GFX8) - si_pm4_bind_state(sctx, es, NULL); - } - } - - /* Update VS. */ - if ((!key.u.tess && !key.u.gs) || sctx->chip_class <= GFX8) { - r = si_shader_select(ctx, &sctx->vs_shader, key, &compiler_state); - if (r) - return false; - - if (!key.u.tess && !key.u.gs) { - if (key.u.ngg) { - si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4); - si_pm4_bind_state(sctx, vs, NULL); - } else { - si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4); - } - } else if (sctx->tes_shader.cso) { - si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4); - } else { - assert(sctx->gs_shader.cso); - si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4); - } - } - - /* This must be done after the shader variant is selected. */ - if (sctx->ngg) { - struct si_shader *vs = si_get_vs(sctx)->current; - - key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs); - key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & - SI_NGG_CULL_GS_FAST_LAUNCH_ALL); - } - - si_update_vgt_shader_config(sctx, key); - - if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable) - si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); - - if (sctx->ps_shader.cso) { - unsigned db_shader_control; - - r = si_shader_select(ctx, &sctx->ps_shader, key, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4); - - db_shader_control = - sctx->ps_shader.cso->db_shader_control | - S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS); - - if (si_pm4_state_changed(sctx, ps) || - si_pm4_state_changed(sctx, vs) || - (key.u.ngg && si_pm4_state_changed(sctx, gs)) || - sctx->sprite_coord_enable != rs->sprite_coord_enable || - sctx->flatshade != rs->flatshade) { - sctx->sprite_coord_enable = rs->sprite_coord_enable; - sctx->flatshade = rs->flatshade; - si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); - } - - if (sctx->screen->info.rbplus_allowed && - si_pm4_state_changed(sctx, ps) && - (!old_ps || - old_spi_shader_col_format != - sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - - if (sctx->ps_db_shader_control != db_shader_control) { - sctx->ps_db_shader_control = db_shader_control; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - if (sctx->screen->dpbb_allowed) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - } - - if (sctx->smoothing_enabled != sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing) { - sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing; - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - - if (sctx->chip_class == GFX6) - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - - if (sctx->framebuffer.nr_samples <= 1) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); - } - } - - if (si_pm4_state_enabled_and_changed(sctx, ls) || - si_pm4_state_enabled_and_changed(sctx, hs) || - si_pm4_state_enabled_and_changed(sctx, es) || - si_pm4_state_enabled_and_changed(sctx, gs) || - 
si_pm4_state_enabled_and_changed(sctx, vs) || - si_pm4_state_enabled_and_changed(sctx, ps)) { - if (!si_update_spi_tmpring_size(sctx)) - return false; - } - - if (sctx->chip_class >= GFX7) { - if (si_pm4_state_enabled_and_changed(sctx, ls)) - sctx->prefetch_L2_mask |= SI_PREFETCH_LS; - else if (!sctx->queued.named.ls) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS; - - if (si_pm4_state_enabled_and_changed(sctx, hs)) - sctx->prefetch_L2_mask |= SI_PREFETCH_HS; - else if (!sctx->queued.named.hs) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS; - - if (si_pm4_state_enabled_and_changed(sctx, es)) - sctx->prefetch_L2_mask |= SI_PREFETCH_ES; - else if (!sctx->queued.named.es) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES; - - if (si_pm4_state_enabled_and_changed(sctx, gs)) - sctx->prefetch_L2_mask |= SI_PREFETCH_GS; - else if (!sctx->queued.named.gs) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS; - - if (si_pm4_state_enabled_and_changed(sctx, vs)) - sctx->prefetch_L2_mask |= SI_PREFETCH_VS; - else if (!sctx->queued.named.vs) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; - - if (si_pm4_state_enabled_and_changed(sctx, ps)) - sctx->prefetch_L2_mask |= SI_PREFETCH_PS; - else if (!sctx->queued.named.ps) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS; - } - - sctx->do_update_shaders = false; - return true; + struct pipe_context *ctx = (struct pipe_context *)sctx; + struct si_compiler_ctx_state compiler_state; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader *old_vs = si_get_vs_state(sctx); + bool old_clip_disable = old_vs ? old_vs->key.opt.clip_disable : false; + struct si_shader *old_ps = sctx->ps_shader.current; + union si_vgt_stages_key key; + unsigned old_spi_shader_col_format = + old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0; + int r; + + if (!sctx->compiler.passes) + si_init_compiler(sctx->screen, &sctx->compiler); + + compiler_state.compiler = &sctx->compiler; + compiler_state.debug = sctx->debug; + compiler_state.is_debug_context = sctx->is_debug; + + key.index = 0; + + if (sctx->tes_shader.cso) + key.u.tess = 1; + if (sctx->gs_shader.cso) + key.u.gs = 1; + + if (sctx->ngg) { + key.u.ngg = 1; + key.u.streamout = !!si_get_vs(sctx)->cso->so.num_outputs; + } + + /* Update TCS and TES. 
*/ + if (sctx->tes_shader.cso) { + if (!sctx->tess_rings) { + si_init_tess_factor_ring(sctx); + if (!sctx->tess_rings) + return false; + } + + if (sctx->tcs_shader.cso) { + r = si_shader_select(ctx, &sctx->tcs_shader, key, &compiler_state); + if (r) + return false; + si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4); + } else { + if (!sctx->fixed_func_tcs_shader.cso) { + sctx->fixed_func_tcs_shader.cso = si_create_fixed_func_tcs(sctx); + if (!sctx->fixed_func_tcs_shader.cso) + return false; + } + + r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader, key, &compiler_state); + if (r) + return false; + si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current->pm4); + } + + if (!sctx->gs_shader.cso || sctx->chip_class <= GFX8) { + r = si_shader_select(ctx, &sctx->tes_shader, key, &compiler_state); + if (r) + return false; + + if (sctx->gs_shader.cso) { + /* TES as ES */ + assert(sctx->chip_class <= GFX8); + si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4); + } else if (key.u.ngg) { + si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4); + } else { + si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4); + } + } + } else { + if (sctx->chip_class <= GFX8) + si_pm4_bind_state(sctx, ls, NULL); + si_pm4_bind_state(sctx, hs, NULL); + } + + /* Update GS. */ + if (sctx->gs_shader.cso) { + r = si_shader_select(ctx, &sctx->gs_shader, key, &compiler_state); + if (r) + return false; + si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4); + if (!key.u.ngg) { + si_pm4_bind_state(sctx, vs, sctx->gs_shader.cso->gs_copy_shader->pm4); + + if (!si_update_gs_ring_buffers(sctx)) + return false; + } else { + si_pm4_bind_state(sctx, vs, NULL); + } + } else { + if (!key.u.ngg) { + si_pm4_bind_state(sctx, gs, NULL); + if (sctx->chip_class <= GFX8) + si_pm4_bind_state(sctx, es, NULL); + } + } + + /* Update VS. */ + if ((!key.u.tess && !key.u.gs) || sctx->chip_class <= GFX8) { + r = si_shader_select(ctx, &sctx->vs_shader, key, &compiler_state); + if (r) + return false; + + if (!key.u.tess && !key.u.gs) { + if (key.u.ngg) { + si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4); + si_pm4_bind_state(sctx, vs, NULL); + } else { + si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4); + } + } else if (sctx->tes_shader.cso) { + si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4); + } else { + assert(sctx->gs_shader.cso); + si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4); + } + } + + /* This must be done after the shader variant is selected. 
*/ + if (sctx->ngg) { + struct si_shader *vs = si_get_vs(sctx)->current; + + key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs); + key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); + } + + si_update_vgt_shader_config(sctx, key); + + if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable) + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + + if (sctx->ps_shader.cso) { + unsigned db_shader_control; + + r = si_shader_select(ctx, &sctx->ps_shader, key, &compiler_state); + if (r) + return false; + si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4); + + db_shader_control = sctx->ps_shader.cso->db_shader_control | + S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS); + + if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) || + (key.u.ngg && si_pm4_state_changed(sctx, gs)) || + sctx->sprite_coord_enable != rs->sprite_coord_enable || + sctx->flatshade != rs->flatshade) { + sctx->sprite_coord_enable = rs->sprite_coord_enable; + sctx->flatshade = rs->flatshade; + si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); + } + + if (sctx->screen->info.rbplus_allowed && si_pm4_state_changed(sctx, ps) && + (!old_ps || old_spi_shader_col_format != + sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + + if (sctx->ps_db_shader_control != db_shader_control) { + sctx->ps_db_shader_control = db_shader_control; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + } + + if (sctx->smoothing_enabled != + sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing) { + sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing; + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + + if (sctx->chip_class == GFX6) + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + + if (sctx->framebuffer.nr_samples <= 1) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + } + } + + if (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, hs) || + si_pm4_state_enabled_and_changed(sctx, es) || si_pm4_state_enabled_and_changed(sctx, gs) || + si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) { + if (!si_update_spi_tmpring_size(sctx)) + return false; + } + + if (sctx->chip_class >= GFX7) { + if (si_pm4_state_enabled_and_changed(sctx, ls)) + sctx->prefetch_L2_mask |= SI_PREFETCH_LS; + else if (!sctx->queued.named.ls) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS; + + if (si_pm4_state_enabled_and_changed(sctx, hs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_HS; + else if (!sctx->queued.named.hs) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS; + + if (si_pm4_state_enabled_and_changed(sctx, es)) + sctx->prefetch_L2_mask |= SI_PREFETCH_ES; + else if (!sctx->queued.named.es) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES; + + if (si_pm4_state_enabled_and_changed(sctx, gs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_GS; + else if (!sctx->queued.named.gs) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS; + + if (si_pm4_state_enabled_and_changed(sctx, vs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_VS; + else if (!sctx->queued.named.vs) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; + + if (si_pm4_state_enabled_and_changed(sctx, ps)) + sctx->prefetch_L2_mask |= SI_PREFETCH_PS; + else if (!sctx->queued.named.ps) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS; + } + + 
sctx->do_update_shaders = false; + return true; } static void si_emit_scratch_state(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, - sctx->spi_tmpring_size); + radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size); - if (sctx->scratch_buffer) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - sctx->scratch_buffer, RADEON_USAGE_READWRITE, - RADEON_PRIO_SCRATCH_BUFFER); - } + if (sctx->scratch_buffer) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->scratch_buffer, RADEON_USAGE_READWRITE, + RADEON_PRIO_SCRATCH_BUFFER); + } } void si_init_screen_live_shader_cache(struct si_screen *sscreen) { - util_live_shader_cache_init(&sscreen->live_shader_cache, - si_create_shader_selector, - si_destroy_shader_selector); + util_live_shader_cache_init(&sscreen->live_shader_cache, si_create_shader_selector, + si_destroy_shader_selector); } void si_init_shader_functions(struct si_context *sctx) { - sctx->atoms.s.spi_map.emit = si_emit_spi_map; - sctx->atoms.s.scratch_state.emit = si_emit_scratch_state; - - sctx->b.create_vs_state = si_create_shader; - sctx->b.create_tcs_state = si_create_shader; - sctx->b.create_tes_state = si_create_shader; - sctx->b.create_gs_state = si_create_shader; - sctx->b.create_fs_state = si_create_shader; - - sctx->b.bind_vs_state = si_bind_vs_shader; - sctx->b.bind_tcs_state = si_bind_tcs_shader; - sctx->b.bind_tes_state = si_bind_tes_shader; - sctx->b.bind_gs_state = si_bind_gs_shader; - sctx->b.bind_fs_state = si_bind_ps_shader; - - sctx->b.delete_vs_state = si_delete_shader_selector; - sctx->b.delete_tcs_state = si_delete_shader_selector; - sctx->b.delete_tes_state = si_delete_shader_selector; - sctx->b.delete_gs_state = si_delete_shader_selector; - sctx->b.delete_fs_state = si_delete_shader_selector; + sctx->atoms.s.spi_map.emit = si_emit_spi_map; + sctx->atoms.s.scratch_state.emit = si_emit_scratch_state; + + sctx->b.create_vs_state = si_create_shader; + sctx->b.create_tcs_state = si_create_shader; + sctx->b.create_tes_state = si_create_shader; + sctx->b.create_gs_state = si_create_shader; + sctx->b.create_fs_state = si_create_shader; + + sctx->b.bind_vs_state = si_bind_vs_shader; + sctx->b.bind_tcs_state = si_bind_tcs_shader; + sctx->b.bind_tes_state = si_bind_tes_shader; + sctx->b.bind_gs_state = si_bind_gs_shader; + sctx->b.bind_fs_state = si_bind_ps_shader; + + sctx->b.delete_vs_state = si_delete_shader_selector; + sctx->b.delete_tcs_state = si_delete_shader_selector; + sctx->b.delete_tes_state = si_delete_shader_selector; + sctx->b.delete_gs_state = si_delete_shader_selector; + sctx->b.delete_fs_state = si_delete_shader_selector; } diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index 85ac4a119c5..2ce8de0ccde 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -23,395 +23,372 @@ */ #include "si_build_pm4.h" - #include "util/u_memory.h" #include "util/u_suballoc.h" static void si_set_streamout_enable(struct si_context *sctx, bool enable); static inline void si_so_target_reference(struct si_streamout_target **dst, - struct pipe_stream_output_target *src) + struct pipe_stream_output_target *src) { - pipe_so_target_reference((struct pipe_stream_output_target**)dst, src); + pipe_so_target_reference((struct pipe_stream_output_target **)dst, src); } -static struct pipe_stream_output_target * 
-si_create_so_target(struct pipe_context *ctx, - struct pipe_resource *buffer, - unsigned buffer_offset, - unsigned buffer_size) +static struct pipe_stream_output_target *si_create_so_target(struct pipe_context *ctx, + struct pipe_resource *buffer, + unsigned buffer_offset, + unsigned buffer_size) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_streamout_target *t; - struct si_resource *buf = si_resource(buffer); - - t = CALLOC_STRUCT(si_streamout_target); - if (!t) { - return NULL; - } - - unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4; - u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4, - &t->buf_filled_size_offset, - (struct pipe_resource**)&t->buf_filled_size); - if (!t->buf_filled_size) { - FREE(t); - return NULL; - } - - t->b.reference.count = 1; - t->b.context = ctx; - pipe_resource_reference(&t->b.buffer, buffer); - t->b.buffer_offset = buffer_offset; - t->b.buffer_size = buffer_size; - - util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, - buffer_offset + buffer_size); - return &t->b; + struct si_context *sctx = (struct si_context *)ctx; + struct si_streamout_target *t; + struct si_resource *buf = si_resource(buffer); + + t = CALLOC_STRUCT(si_streamout_target); + if (!t) { + return NULL; + } + + unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4; + u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4, + &t->buf_filled_size_offset, (struct pipe_resource **)&t->buf_filled_size); + if (!t->buf_filled_size) { + FREE(t); + return NULL; + } + + t->b.reference.count = 1; + t->b.context = ctx; + pipe_resource_reference(&t->b.buffer, buffer); + t->b.buffer_offset = buffer_offset; + t->b.buffer_size = buffer_size; + + util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, buffer_offset + buffer_size); + return &t->b; } -static void si_so_target_destroy(struct pipe_context *ctx, - struct pipe_stream_output_target *target) +static void si_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target) { - struct si_streamout_target *t = (struct si_streamout_target*)target; - pipe_resource_reference(&t->b.buffer, NULL); - si_resource_reference(&t->buf_filled_size, NULL); - FREE(t); + struct si_streamout_target *t = (struct si_streamout_target *)target; + pipe_resource_reference(&t->b.buffer, NULL); + si_resource_reference(&t->buf_filled_size, NULL); + FREE(t); } void si_streamout_buffers_dirty(struct si_context *sctx) { - if (!sctx->streamout.enabled_mask) - return; + if (!sctx->streamout.enabled_mask) + return; - si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin); - si_set_streamout_enable(sctx, true); + si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin); + si_set_streamout_enable(sctx, true); } -static void si_set_streamout_targets(struct pipe_context *ctx, - unsigned num_targets, - struct pipe_stream_output_target **targets, - const unsigned *offsets) +static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targets, + struct pipe_stream_output_target **targets, + const unsigned *offsets) { - struct si_context *sctx = (struct si_context *)ctx; - unsigned old_num_targets = sctx->streamout.num_targets; - unsigned i; - bool wait_now = false; - - /* We are going to unbind the buffers. Mark which caches need to be flushed. 
*/ - if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) { - /* Since streamout uses vector writes which go through TC L2 - * and most other clients can use TC L2 as well, we don't need - * to flush it. - * - * The only cases which requires flushing it is VGT DMA index - * fetching (on <= GFX7) and indirect draw data, which are rare - * cases. Thus, flag the TC L2 dirtiness in the resource and - * handle it at draw call time. - */ - for (i = 0; i < sctx->streamout.num_targets; i++) - if (sctx->streamout.targets[i]) - si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true; - - /* Invalidate the scalar cache in case a streamout buffer is - * going to be used as a constant buffer. - * - * Invalidate vL1, because streamout bypasses it (done by - * setting GLC=1 in the store instruction), but vL1 in other - * CUs can contain outdated data of streamout buffers. - * - * VS_PARTIAL_FLUSH is required if the buffers are going to be - * used as an input immediately. - */ - sctx->flags |= SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE; - - /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */ - if (sctx->screen->use_ngg_streamout) { - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; - - /* Wait now. This is needed to make sure that GDS is not - * busy at the end of IBs. - * - * Also, the next streamout operation will overwrite GDS, - * so we need to make sure that it's idle. - */ - wait_now = true; - } else { - sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH; - } - } - - /* All readers of the streamout targets need to be finished before we can - * start writing to the targets. - */ - if (num_targets) { - if (sctx->screen->use_ngg_streamout) - si_allocate_gds(sctx); - - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; - } - - /* Streamout buffers must be bound in 2 places: - * 1) in VGT by setting the VGT_STRMOUT registers - * 2) as shader resources - */ - - /* Stop streamout. */ - if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) - si_emit_streamout_end(sctx); - - /* Set the new targets. */ - unsigned enabled_mask = 0, append_bitmask = 0; - for (i = 0; i < num_targets; i++) { - si_so_target_reference(&sctx->streamout.targets[i], targets[i]); - if (!targets[i]) - continue; - - si_context_add_resource_size(sctx, targets[i]->buffer); - enabled_mask |= 1 << i; - - if (offsets[i] == ((unsigned)-1)) - append_bitmask |= 1 << i; - } - - for (; i < sctx->streamout.num_targets; i++) - si_so_target_reference(&sctx->streamout.targets[i], NULL); - - sctx->streamout.enabled_mask = enabled_mask; - sctx->streamout.num_targets = num_targets; - sctx->streamout.append_bitmask = append_bitmask; - - /* Update dirty state bits. 
*/ - if (num_targets) { - si_streamout_buffers_dirty(sctx); - } else { - si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false); - si_set_streamout_enable(sctx, false); - } - - /* Set the shader resources.*/ - for (i = 0; i < num_targets; i++) { - if (targets[i]) { - struct pipe_shader_buffer sbuf; - sbuf.buffer = targets[i]->buffer; - - if (sctx->screen->use_ngg_streamout) { - sbuf.buffer_offset = targets[i]->buffer_offset; - sbuf.buffer_size = targets[i]->buffer_size; - } else { - sbuf.buffer_offset = 0; - sbuf.buffer_size = targets[i]->buffer_offset + - targets[i]->buffer_size; - } - - si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf); - si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT; - } else { - si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL); - } - } - for (; i < old_num_targets; i++) - si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL); - - if (wait_now) - sctx->emit_cache_flush(sctx); + struct si_context *sctx = (struct si_context *)ctx; + unsigned old_num_targets = sctx->streamout.num_targets; + unsigned i; + bool wait_now = false; + + /* We are going to unbind the buffers. Mark which caches need to be flushed. */ + if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) { + /* Since streamout uses vector writes which go through TC L2 + * and most other clients can use TC L2 as well, we don't need + * to flush it. + * + * The only cases which requires flushing it is VGT DMA index + * fetching (on <= GFX7) and indirect draw data, which are rare + * cases. Thus, flag the TC L2 dirtiness in the resource and + * handle it at draw call time. + */ + for (i = 0; i < sctx->streamout.num_targets; i++) + if (sctx->streamout.targets[i]) + si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true; + + /* Invalidate the scalar cache in case a streamout buffer is + * going to be used as a constant buffer. + * + * Invalidate vL1, because streamout bypasses it (done by + * setting GLC=1 in the store instruction), but vL1 in other + * CUs can contain outdated data of streamout buffers. + * + * VS_PARTIAL_FLUSH is required if the buffers are going to be + * used as an input immediately. + */ + sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE; + + /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */ + if (sctx->screen->use_ngg_streamout) { + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; + + /* Wait now. This is needed to make sure that GDS is not + * busy at the end of IBs. + * + * Also, the next streamout operation will overwrite GDS, + * so we need to make sure that it's idle. + */ + wait_now = true; + } else { + sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH; + } + } + + /* All readers of the streamout targets need to be finished before we can + * start writing to the targets. + */ + if (num_targets) { + if (sctx->screen->use_ngg_streamout) + si_allocate_gds(sctx); + + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + /* Streamout buffers must be bound in 2 places: + * 1) in VGT by setting the VGT_STRMOUT registers + * 2) as shader resources + */ + + /* Stop streamout. */ + if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) + si_emit_streamout_end(sctx); + + /* Set the new targets. 
*/ + unsigned enabled_mask = 0, append_bitmask = 0; + for (i = 0; i < num_targets; i++) { + si_so_target_reference(&sctx->streamout.targets[i], targets[i]); + if (!targets[i]) + continue; + + si_context_add_resource_size(sctx, targets[i]->buffer); + enabled_mask |= 1 << i; + + if (offsets[i] == ((unsigned)-1)) + append_bitmask |= 1 << i; + } + + for (; i < sctx->streamout.num_targets; i++) + si_so_target_reference(&sctx->streamout.targets[i], NULL); + + sctx->streamout.enabled_mask = enabled_mask; + sctx->streamout.num_targets = num_targets; + sctx->streamout.append_bitmask = append_bitmask; + + /* Update dirty state bits. */ + if (num_targets) { + si_streamout_buffers_dirty(sctx); + } else { + si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false); + si_set_streamout_enable(sctx, false); + } + + /* Set the shader resources.*/ + for (i = 0; i < num_targets; i++) { + if (targets[i]) { + struct pipe_shader_buffer sbuf; + sbuf.buffer = targets[i]->buffer; + + if (sctx->screen->use_ngg_streamout) { + sbuf.buffer_offset = targets[i]->buffer_offset; + sbuf.buffer_size = targets[i]->buffer_size; + } else { + sbuf.buffer_offset = 0; + sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size; + } + + si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf); + si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT; + } else { + si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL); + } + } + for (; i < old_num_targets; i++) + si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL); + + if (wait_now) + sctx->emit_cache_flush(sctx); } static void gfx10_emit_streamout_begin(struct si_context *sctx) { - struct si_streamout_target **t = sctx->streamout.targets; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned last_target = 0; - - for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { - if (t[i]) - last_target = i; - } - - for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { - if (!t[i]) - continue; - - t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i]; - - bool append = sctx->streamout.append_bitmask & (1 << i); - uint64_t va = 0; - - if (append) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - t[i]->buf_filled_size, - RADEON_USAGE_READ, - RADEON_PRIO_SO_FILLED_SIZE); - - va = t[i]->buf_filled_size->gpu_address + - t[i]->buf_filled_size_offset; - } - - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, S_411_SRC_SEL(append ? 
V_411_SRC_ADDR_TC_L2 : V_411_DATA) | - S_411_DST_SEL(V_411_GDS) | - S_411_CP_SYNC(i == last_target)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, 4 * i); /* destination in GDS */ - radeon_emit(cs, 0); - radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) | - S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target)); - } - - sctx->streamout.begin_emitted = true; + struct si_streamout_target **t = sctx->streamout.targets; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned last_target = 0; + + for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { + if (t[i]) + last_target = i; + } + + for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { + if (!t[i]) + continue; + + t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i]; + + bool append = sctx->streamout.append_bitmask & (1 << i); + uint64_t va = 0; + + if (append) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ, + RADEON_PRIO_SO_FILLED_SIZE); + + va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; + } + + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) | + S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, 4 * i); /* destination in GDS */ + radeon_emit(cs, 0); + radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) | S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target)); + } + + sctx->streamout.begin_emitted = true; } static void gfx10_emit_streamout_end(struct si_context *sctx) { - struct si_streamout_target **t = sctx->streamout.targets; + struct si_streamout_target **t = sctx->streamout.targets; - for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { - if (!t[i]) - continue; + for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { + if (!t[i]) + continue; - uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; + uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; - si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0, - EOP_DST_SEL_TC_L2, - EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, - EOP_DATA_SEL_GDS, - t[i]->buf_filled_size, va, - EOP_DATA_GDS(i, 1), 0); + si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2, + EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS, + t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0); - t[i]->buf_filled_size_valid = true; - } + t[i]->buf_filled_size_valid = true; + } - sctx->streamout.begin_emitted = false; + sctx->streamout.begin_emitted = false; } static void si_flush_vgt_streamout(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned reg_strmout_cntl; - - /* The register is at different places on different ASICs. 
*/ - if (sctx->chip_class >= GFX7) { - reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; - radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0); - } else { - reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL; - radeon_set_config_reg(cs, reg_strmout_cntl, 0); - } - - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); - - radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ - radeon_emit(cs, reg_strmout_cntl >> 2); /* register */ - radeon_emit(cs, 0); - radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ - radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ - radeon_emit(cs, 4); /* poll interval */ + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned reg_strmout_cntl; + + /* The register is at different places on different ASICs. */ + if (sctx->chip_class >= GFX7) { + reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; + radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0); + } else { + reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL; + radeon_set_config_reg(cs, reg_strmout_cntl, 0); + } + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); + + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(cs, + WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ + radeon_emit(cs, reg_strmout_cntl >> 2); /* register */ + radeon_emit(cs, 0); + radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ + radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ + radeon_emit(cs, 4); /* poll interval */ } static void si_emit_streamout_begin(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct si_streamout_target **t = sctx->streamout.targets; - uint16_t *stride_in_dw = sctx->streamout.stride_in_dw; - unsigned i; - - si_flush_vgt_streamout(sctx); - - for (i = 0; i < sctx->streamout.num_targets; i++) { - if (!t[i]) - continue; - - t[i]->stride_in_dw = stride_in_dw[i]; - - /* AMD GCN binds streamout buffers as shader resources. - * VGT only counts primitives and tells the shader - * through SGPRs what to do. */ - radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2); - radeon_emit(cs, (t[i]->b.buffer_offset + - t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ - radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ - - if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) { - uint64_t va = t[i]->buf_filled_size->gpu_address + - t[i]->buf_filled_size_offset; - - /* Append. */ - radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); - radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | - STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, va); /* src address lo */ - radeon_emit(cs, va >> 32); /* src address hi */ - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - t[i]->buf_filled_size, - RADEON_USAGE_READ, - RADEON_PRIO_SO_FILLED_SIZE); - } else { - /* Start from the beginning. 
*/ - radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); - radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | - STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */ - radeon_emit(cs, 0); /* unused */ - } - } - - sctx->streamout.begin_emitted = true; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct si_streamout_target **t = sctx->streamout.targets; + uint16_t *stride_in_dw = sctx->streamout.stride_in_dw; + unsigned i; + + si_flush_vgt_streamout(sctx); + + for (i = 0; i < sctx->streamout.num_targets; i++) { + if (!t[i]) + continue; + + t[i]->stride_in_dw = stride_in_dw[i]; + + /* AMD GCN binds streamout buffers as shader resources. + * VGT only counts primitives and tells the shader + * through SGPRs what to do. */ + radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2); + radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ + radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ + + if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) { + uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; + + /* Append. */ + radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, va); /* src address lo */ + radeon_emit(cs, va >> 32); /* src address hi */ + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ, + RADEON_PRIO_SO_FILLED_SIZE); + } else { + /* Start from the beginning. */ + radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */ + radeon_emit(cs, 0); /* unused */ + } + } + + sctx->streamout.begin_emitted = true; } void si_emit_streamout_end(struct si_context *sctx) { - if (sctx->screen->use_ngg_streamout) { - gfx10_emit_streamout_end(sctx); - return; - } - - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct si_streamout_target **t = sctx->streamout.targets; - unsigned i; - uint64_t va; - - si_flush_vgt_streamout(sctx); - - for (i = 0; i < sctx->streamout.num_targets; i++) { - if (!t[i]) - continue; - - va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; - radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); - radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | - STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | - STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */ - radeon_emit(cs, va); /* dst address lo */ - radeon_emit(cs, va >> 32); /* dst address hi */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, 0); /* unused */ - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - t[i]->buf_filled_size, - RADEON_USAGE_WRITE, - RADEON_PRIO_SO_FILLED_SIZE); - - /* Zero the buffer size. The counters (primitives generated, - * primitives emitted) may be enabled even if there is not - * buffer bound. This ensures that the primitives-emitted query - * won't increment. 
*/ - radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0); - sctx->context_roll = true; - - t[i]->buf_filled_size_valid = true; - } - - sctx->streamout.begin_emitted = false; + if (sctx->screen->use_ngg_streamout) { + gfx10_emit_streamout_end(sctx); + return; + } + + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct si_streamout_target **t = sctx->streamout.targets; + unsigned i; + uint64_t va; + + si_flush_vgt_streamout(sctx); + + for (i = 0; i < sctx->streamout.num_targets; i++) { + if (!t[i]) + continue; + + va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; + radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | + STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */ + radeon_emit(cs, va); /* dst address lo */ + radeon_emit(cs, va >> 32); /* dst address hi */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, 0); /* unused */ + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_WRITE, + RADEON_PRIO_SO_FILLED_SIZE); + + /* Zero the buffer size. The counters (primitives generated, + * primitives emitted) may be enabled even if there is not + * buffer bound. This ensures that the primitives-emitted query + * won't increment. */ + radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0); + sctx->context_roll = true; + + t[i]->buf_filled_size_valid = true; + } + + sctx->streamout.begin_emitted = false; } /* STREAMOUT CONFIG DERIVED STATE @@ -423,71 +400,65 @@ void si_emit_streamout_end(struct si_context *sctx) static void si_emit_streamout_enable(struct si_context *sctx) { - assert(!sctx->screen->use_ngg_streamout); - - radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2); - radeon_emit(sctx->gfx_cs, - S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) | - S_028B94_RAST_STREAM(0) | - S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) | - S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) | - S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx))); - radeon_emit(sctx->gfx_cs, - sctx->streamout.hw_enabled_mask & - sctx->streamout.enabled_stream_buffers_mask); + assert(!sctx->screen->use_ngg_streamout); + + radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2); + radeon_emit(sctx->gfx_cs, S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) | + S_028B94_RAST_STREAM(0) | + S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) | + S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) | + S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx))); + radeon_emit(sctx->gfx_cs, + sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask); } static void si_set_streamout_enable(struct si_context *sctx, bool enable) { - bool old_strmout_en = si_get_strmout_en(sctx); - unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask; + bool old_strmout_en = si_get_strmout_en(sctx); + unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask; - sctx->streamout.streamout_enabled = enable; + sctx->streamout.streamout_enabled = enable; - sctx->streamout.hw_enabled_mask = sctx->streamout.enabled_mask | - (sctx->streamout.enabled_mask << 4) | - (sctx->streamout.enabled_mask << 8) | - (sctx->streamout.enabled_mask << 12); + sctx->streamout.hw_enabled_mask = + sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) | + (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12); - if (!sctx->screen->use_ngg_streamout && - ((old_strmout_en != si_get_strmout_en(sctx)) || - 
(old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable); + if (!sctx->screen->use_ngg_streamout && + ((old_strmout_en != si_get_strmout_en(sctx)) || + (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable); } -void si_update_prims_generated_query_state(struct si_context *sctx, - unsigned type, int diff) +void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff) { - if (!sctx->screen->use_ngg_streamout && - type == PIPE_QUERY_PRIMITIVES_GENERATED) { - bool old_strmout_en = si_get_strmout_en(sctx); + if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) { + bool old_strmout_en = si_get_strmout_en(sctx); - sctx->streamout.num_prims_gen_queries += diff; - assert(sctx->streamout.num_prims_gen_queries >= 0); + sctx->streamout.num_prims_gen_queries += diff; + assert(sctx->streamout.num_prims_gen_queries >= 0); - sctx->streamout.prims_gen_query_enabled = - sctx->streamout.num_prims_gen_queries != 0; + sctx->streamout.prims_gen_query_enabled = sctx->streamout.num_prims_gen_queries != 0; - if (old_strmout_en != si_get_strmout_en(sctx)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable); + if (old_strmout_en != si_get_strmout_en(sctx)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable); - if (si_update_ngg(sctx)) { - si_shader_change_notify(sctx); - sctx->do_update_shaders = true; - } - } + if (si_update_ngg(sctx)) { + si_shader_change_notify(sctx); + sctx->do_update_shaders = true; + } + } } void si_init_streamout_functions(struct si_context *sctx) { - sctx->b.create_stream_output_target = si_create_so_target; - sctx->b.stream_output_target_destroy = si_so_target_destroy; - sctx->b.set_stream_output_targets = si_set_streamout_targets; - - if (sctx->screen->use_ngg_streamout) { - sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin; - } else { - sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin; - sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable; - } + sctx->b.create_stream_output_target = si_create_so_target; + sctx->b.stream_output_target_destroy = si_so_target_destroy; + sctx->b.set_stream_output_targets = si_set_streamout_targets; + + if (sctx->screen->use_ngg_streamout) { + sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin; + } else { + sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin; + sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable; + } } diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index 682f00d44a8..5149ee1c643 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -30,541 +30,512 @@ void si_update_ngg_small_prim_precision(struct si_context *ctx) { - if (!ctx->screen->use_ngg_culling) - return; - - /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. 
*/ - unsigned num_samples = ctx->framebuffer.nr_samples; - unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode; - float precision; - - if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) - precision = num_samples / 4096.0; - else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) - precision = num_samples / 1024.0; - else - precision = num_samples / 256.0; - - ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION; - ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23); + if (!ctx->screen->use_ngg_culling) + return; + + /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. */ + unsigned num_samples = ctx->framebuffer.nr_samples; + unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode; + float precision; + + if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) + precision = num_samples / 4096.0; + else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) + precision = num_samples / 1024.0; + else + precision = num_samples / 256.0; + + ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION; + ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23); } -void si_get_small_prim_cull_info(struct si_context *sctx, - struct si_small_prim_cull_info *out) +void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out) { - /* This is needed by the small primitive culling, because it's done - * in screen space. - */ - struct si_small_prim_cull_info info; - unsigned num_samples = sctx->framebuffer.nr_samples; - assert(num_samples >= 1); - - info.scale[0] = sctx->viewports.states[0].scale[0]; - info.scale[1] = sctx->viewports.states[0].scale[1]; - info.translate[0] = sctx->viewports.states[0].translate[0]; - info.translate[1] = sctx->viewports.states[0].translate[1]; - - /* The viewport shouldn't flip the X axis for the small prim culling to work. */ - assert(-info.scale[0] + info.translate[0] <= info.scale[0] + info.translate[0]); - - /* If the Y axis is inverted (OpenGL default framebuffer), reverse it. - * This is because the viewport transformation inverts the clip space - * bounding box, so min becomes max, which breaks small primitive - * culling. - */ - if (sctx->viewports.y_inverted) { - info.scale[1] = -info.scale[1]; - info.translate[1] = -info.translate[1]; - } - - /* Scale the framebuffer up, so that samples become pixels and small - * primitive culling is the same for all sample counts. - * This only works with the standard DX sample positions, because - * the samples are evenly spaced on both X and Y axes. - */ - for (unsigned i = 0; i < 2; i++) { - info.scale[i] *= num_samples; - info.translate[i] *= num_samples; - } - *out = info; + /* This is needed by the small primitive culling, because it's done + * in screen space. + */ + struct si_small_prim_cull_info info; + unsigned num_samples = sctx->framebuffer.nr_samples; + assert(num_samples >= 1); + + info.scale[0] = sctx->viewports.states[0].scale[0]; + info.scale[1] = sctx->viewports.states[0].scale[1]; + info.translate[0] = sctx->viewports.states[0].translate[0]; + info.translate[1] = sctx->viewports.states[0].translate[1]; + + /* The viewport shouldn't flip the X axis for the small prim culling to work. */ + assert(-info.scale[0] + info.translate[0] <= info.scale[0] + info.translate[0]); + + /* If the Y axis is inverted (OpenGL default framebuffer), reverse it. 
+ * This is because the viewport transformation inverts the clip space + * bounding box, so min becomes max, which breaks small primitive + * culling. + */ + if (sctx->viewports.y_inverted) { + info.scale[1] = -info.scale[1]; + info.translate[1] = -info.translate[1]; + } + + /* Scale the framebuffer up, so that samples become pixels and small + * primitive culling is the same for all sample counts. + * This only works with the standard DX sample positions, because + * the samples are evenly spaced on both X and Y axes. + */ + for (unsigned i = 0; i < 2; i++) { + info.scale[i] *= num_samples; + info.translate[i] *= num_samples; + } + *out = info; } -static void si_set_scissor_states(struct pipe_context *pctx, - unsigned start_slot, - unsigned num_scissors, - const struct pipe_scissor_state *state) +static void si_set_scissor_states(struct pipe_context *pctx, unsigned start_slot, + unsigned num_scissors, const struct pipe_scissor_state *state) { - struct si_context *ctx = (struct si_context *)pctx; - int i; + struct si_context *ctx = (struct si_context *)pctx; + int i; - for (i = 0; i < num_scissors; i++) - ctx->scissors[start_slot + i] = state[i]; + for (i = 0; i < num_scissors; i++) + ctx->scissors[start_slot + i] = state[i]; - if (!ctx->queued.named.rasterizer->scissor_enable) - return; + if (!ctx->queued.named.rasterizer->scissor_enable) + return; - si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); + si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); } /* Since the guard band disables clipping, we have to clip per-pixel * using a scissor. */ static void si_get_scissor_from_viewport(struct si_context *ctx, - const struct pipe_viewport_state *vp, - struct si_signed_scissor *scissor) + const struct pipe_viewport_state *vp, + struct si_signed_scissor *scissor) { - float tmp, minx, miny, maxx, maxy; - - /* Convert (-1, -1) and (1, 1) from clip space into window space. */ - minx = -vp->scale[0] + vp->translate[0]; - miny = -vp->scale[1] + vp->translate[1]; - maxx = vp->scale[0] + vp->translate[0]; - maxy = vp->scale[1] + vp->translate[1]; - - /* Handle inverted viewports. */ - if (minx > maxx) { - tmp = minx; - minx = maxx; - maxx = tmp; - } - if (miny > maxy) { - tmp = miny; - miny = maxy; - maxy = tmp; - } - - /* Convert to integer and round up the max bounds. */ - scissor->minx = minx; - scissor->miny = miny; - scissor->maxx = ceilf(maxx); - scissor->maxy = ceilf(maxy); + float tmp, minx, miny, maxx, maxy; + + /* Convert (-1, -1) and (1, 1) from clip space into window space. */ + minx = -vp->scale[0] + vp->translate[0]; + miny = -vp->scale[1] + vp->translate[1]; + maxx = vp->scale[0] + vp->translate[0]; + maxy = vp->scale[1] + vp->translate[1]; + + /* Handle inverted viewports. */ + if (minx > maxx) { + tmp = minx; + minx = maxx; + maxx = tmp; + } + if (miny > maxy) { + tmp = miny; + miny = maxy; + maxy = tmp; + } + + /* Convert to integer and round up the max bounds. 
*/ + scissor->minx = minx; + scissor->miny = miny; + scissor->maxx = ceilf(maxx); + scissor->maxy = ceilf(maxy); } -static void si_clamp_scissor(struct si_context *ctx, - struct pipe_scissor_state *out, - struct si_signed_scissor *scissor) +static void si_clamp_scissor(struct si_context *ctx, struct pipe_scissor_state *out, + struct si_signed_scissor *scissor) { - out->minx = CLAMP(scissor->minx, 0, SI_MAX_SCISSOR); - out->miny = CLAMP(scissor->miny, 0, SI_MAX_SCISSOR); - out->maxx = CLAMP(scissor->maxx, 0, SI_MAX_SCISSOR); - out->maxy = CLAMP(scissor->maxy, 0, SI_MAX_SCISSOR); + out->minx = CLAMP(scissor->minx, 0, SI_MAX_SCISSOR); + out->miny = CLAMP(scissor->miny, 0, SI_MAX_SCISSOR); + out->maxx = CLAMP(scissor->maxx, 0, SI_MAX_SCISSOR); + out->maxy = CLAMP(scissor->maxy, 0, SI_MAX_SCISSOR); } -static void si_clip_scissor(struct pipe_scissor_state *out, - struct pipe_scissor_state *clip) +static void si_clip_scissor(struct pipe_scissor_state *out, struct pipe_scissor_state *clip) { - out->minx = MAX2(out->minx, clip->minx); - out->miny = MAX2(out->miny, clip->miny); - out->maxx = MIN2(out->maxx, clip->maxx); - out->maxy = MIN2(out->maxy, clip->maxy); + out->minx = MAX2(out->minx, clip->minx); + out->miny = MAX2(out->miny, clip->miny); + out->maxx = MIN2(out->maxx, clip->maxx); + out->maxy = MIN2(out->maxy, clip->maxy); } -static void si_scissor_make_union(struct si_signed_scissor *out, - struct si_signed_scissor *in) +static void si_scissor_make_union(struct si_signed_scissor *out, struct si_signed_scissor *in) { - out->minx = MIN2(out->minx, in->minx); - out->miny = MIN2(out->miny, in->miny); - out->maxx = MAX2(out->maxx, in->maxx); - out->maxy = MAX2(out->maxy, in->maxy); - out->quant_mode = MIN2(out->quant_mode, in->quant_mode); + out->minx = MIN2(out->minx, in->minx); + out->miny = MIN2(out->miny, in->miny); + out->maxx = MAX2(out->maxx, in->maxx); + out->maxy = MAX2(out->maxy, in->maxy); + out->quant_mode = MIN2(out->quant_mode, in->quant_mode); } -static void si_emit_one_scissor(struct si_context *ctx, - struct radeon_cmdbuf *cs, - struct si_signed_scissor *vp_scissor, - struct pipe_scissor_state *scissor) +static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs, + struct si_signed_scissor *vp_scissor, + struct pipe_scissor_state *scissor) { - struct pipe_scissor_state final; - - if (ctx->vs_disables_clipping_viewport) { - final.minx = final.miny = 0; - final.maxx = final.maxy = SI_MAX_SCISSOR; - } else { - si_clamp_scissor(ctx, &final, vp_scissor); - } - - if (scissor) - si_clip_scissor(&final, scissor); - - /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_- - * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0. - */ - if (ctx->chip_class == GFX6 && (final.maxx == 0 || final.maxy == 0)) { - radeon_emit(cs, S_028250_TL_X(1) | - S_028250_TL_Y(1) | - S_028250_WINDOW_OFFSET_DISABLE(1)); - radeon_emit(cs, S_028254_BR_X(1) | - S_028254_BR_Y(1)); - return; - } - - radeon_emit(cs, S_028250_TL_X(final.minx) | - S_028250_TL_Y(final.miny) | - S_028250_WINDOW_OFFSET_DISABLE(1)); - radeon_emit(cs, S_028254_BR_X(final.maxx) | - S_028254_BR_Y(final.maxy)); + struct pipe_scissor_state final; + + if (ctx->vs_disables_clipping_viewport) { + final.minx = final.miny = 0; + final.maxx = final.maxy = SI_MAX_SCISSOR; + } else { + si_clamp_scissor(ctx, &final, vp_scissor); + } + + if (scissor) + si_clip_scissor(&final, scissor); + + /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_- + * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0. 
+ */ + if (ctx->chip_class == GFX6 && (final.maxx == 0 || final.maxy == 0)) { + radeon_emit(cs, S_028250_TL_X(1) | S_028250_TL_Y(1) | S_028250_WINDOW_OFFSET_DISABLE(1)); + radeon_emit(cs, S_028254_BR_X(1) | S_028254_BR_Y(1)); + return; + } + + radeon_emit(cs, S_028250_TL_X(final.minx) | S_028250_TL_Y(final.miny) | + S_028250_WINDOW_OFFSET_DISABLE(1)); + radeon_emit(cs, S_028254_BR_X(final.maxx) | S_028254_BR_Y(final.maxy)); } #define MAX_PA_SU_HARDWARE_SCREEN_OFFSET 8176 static void si_emit_guardband(struct si_context *ctx) { - const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer; - struct si_signed_scissor vp_as_scissor; - struct pipe_viewport_state vp; - float left, top, right, bottom, max_range, guardband_x, guardband_y; - float discard_x, discard_y; - - if (ctx->vs_writes_viewport_index) { - /* Shaders can draw to any viewport. Make a union of all - * viewports. */ - vp_as_scissor = ctx->viewports.as_scissor[0]; - for (unsigned i = 1; i < SI_MAX_VIEWPORTS; i++) { - si_scissor_make_union(&vp_as_scissor, - &ctx->viewports.as_scissor[i]); - } - } else { - vp_as_scissor = ctx->viewports.as_scissor[0]; - } - - /* Blits don't set the viewport state. The vertex shader determines - * the viewport size by scaling the coordinates, so we don't know - * how large the viewport is. Assume the worst case. - */ - if (ctx->vs_disables_clipping_viewport) - vp_as_scissor.quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; - - /* Determine the optimal hardware screen offset to center the viewport - * within the viewport range in order to maximize the guardband size. - */ - int hw_screen_offset_x = (vp_as_scissor.maxx + vp_as_scissor.minx) / 2; - int hw_screen_offset_y = (vp_as_scissor.maxy + vp_as_scissor.miny) / 2; - - /* GFX6-GFX7 need to align the offset to an ubertile consisting of all SEs. */ - const unsigned hw_screen_offset_alignment = - ctx->chip_class >= GFX8 ? 16 : MAX2(ctx->screen->se_tile_repeat, 16); - - /* Indexed by quantization modes */ - static int max_viewport_size[] = {65535, 16383, 4095}; - - /* Ensure that the whole viewport stays representable in - * absolute coordinates. - * See comment in si_set_viewport_states. - */ - assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] && - vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]); - - hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); - hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); - - /* Align the screen offset by dropping the low bits. */ - hw_screen_offset_x &= ~(hw_screen_offset_alignment - 1); - hw_screen_offset_y &= ~(hw_screen_offset_alignment - 1); - - /* Apply the offset to center the viewport and maximize the guardband. */ - vp_as_scissor.minx -= hw_screen_offset_x; - vp_as_scissor.maxx -= hw_screen_offset_x; - vp_as_scissor.miny -= hw_screen_offset_y; - vp_as_scissor.maxy -= hw_screen_offset_y; - - /* Reconstruct the viewport transformation from the scissor. */ - vp.translate[0] = (vp_as_scissor.minx + vp_as_scissor.maxx) / 2.0; - vp.translate[1] = (vp_as_scissor.miny + vp_as_scissor.maxy) / 2.0; - vp.scale[0] = vp_as_scissor.maxx - vp.translate[0]; - vp.scale[1] = vp_as_scissor.maxy - vp.translate[1]; - - /* Treat a 0x0 viewport as 1x1 to prevent division by zero. */ - if (vp_as_scissor.minx == vp_as_scissor.maxx) - vp.scale[0] = 0.5; - if (vp_as_scissor.miny == vp_as_scissor.maxy) - vp.scale[1] = 0.5; - - /* Find the biggest guard band that is inside the supported viewport - * range. 
The guard band is specified as a horizontal and vertical - * distance from (0,0) in clip space. - * - * This is done by applying the inverse viewport transformation - * on the viewport limits to get those limits in clip space. - * - * The viewport range is [-max_viewport_size/2, max_viewport_size/2]. - */ - assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size)); - max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2; - left = (-max_range - vp.translate[0]) / vp.scale[0]; - right = ( max_range - vp.translate[0]) / vp.scale[0]; - top = (-max_range - vp.translate[1]) / vp.scale[1]; - bottom = ( max_range - vp.translate[1]) / vp.scale[1]; - - assert(left <= -1 && top <= -1 && right >= 1 && bottom >= 1); - - guardband_x = MIN2(-left, right); - guardband_y = MIN2(-top, bottom); - - discard_x = 1.0; - discard_y = 1.0; - - if (unlikely(util_prim_is_points_or_lines(ctx->current_rast_prim))) { - /* When rendering wide points or lines, we need to be more - * conservative about when to discard them entirely. */ - float pixels; - - if (ctx->current_rast_prim == PIPE_PRIM_POINTS) - pixels = rs->max_point_size; - else - pixels = rs->line_width; - - /* Add half the point size / line width */ - discard_x += pixels / (2.0 * vp.scale[0]); - discard_y += pixels / (2.0 * vp.scale[1]); - - /* Discard primitives that would lie entirely outside the clip - * region. */ - discard_x = MIN2(discard_x, guardband_x); - discard_y = MIN2(discard_y, guardband_y); - } - - /* If any of the GB registers is updated, all of them must be updated. - * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ - * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ - */ - unsigned initial_cdw = ctx->gfx_cs->current.cdw; - radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, - SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, - fui(guardband_y), fui(discard_y), - fui(guardband_x), fui(discard_x)); - radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, - SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET, - S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) | - S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4)); - radeon_opt_set_context_reg(ctx, R_028BE4_PA_SU_VTX_CNTL, - SI_TRACKED_PA_SU_VTX_CNTL, - S_028BE4_PIX_CENTER(rs->half_pixel_center) | - S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + - vp_as_scissor.quant_mode)); - if (initial_cdw != ctx->gfx_cs->current.cdw) - ctx->context_roll = true; - - si_update_ngg_small_prim_precision(ctx); + const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer; + struct si_signed_scissor vp_as_scissor; + struct pipe_viewport_state vp; + float left, top, right, bottom, max_range, guardband_x, guardband_y; + float discard_x, discard_y; + + if (ctx->vs_writes_viewport_index) { + /* Shaders can draw to any viewport. Make a union of all + * viewports. */ + vp_as_scissor = ctx->viewports.as_scissor[0]; + for (unsigned i = 1; i < SI_MAX_VIEWPORTS; i++) { + si_scissor_make_union(&vp_as_scissor, &ctx->viewports.as_scissor[i]); + } + } else { + vp_as_scissor = ctx->viewports.as_scissor[0]; + } + + /* Blits don't set the viewport state. The vertex shader determines + * the viewport size by scaling the coordinates, so we don't know + * how large the viewport is. Assume the worst case. 
+ */ + if (ctx->vs_disables_clipping_viewport) + vp_as_scissor.quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; + + /* Determine the optimal hardware screen offset to center the viewport + * within the viewport range in order to maximize the guardband size. + */ + int hw_screen_offset_x = (vp_as_scissor.maxx + vp_as_scissor.minx) / 2; + int hw_screen_offset_y = (vp_as_scissor.maxy + vp_as_scissor.miny) / 2; + + /* GFX6-GFX7 need to align the offset to an ubertile consisting of all SEs. */ + const unsigned hw_screen_offset_alignment = + ctx->chip_class >= GFX8 ? 16 : MAX2(ctx->screen->se_tile_repeat, 16); + + /* Indexed by quantization modes */ + static int max_viewport_size[] = {65535, 16383, 4095}; + + /* Ensure that the whole viewport stays representable in + * absolute coordinates. + * See comment in si_set_viewport_states. + */ + assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] && + vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]); + + hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); + hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); + + /* Align the screen offset by dropping the low bits. */ + hw_screen_offset_x &= ~(hw_screen_offset_alignment - 1); + hw_screen_offset_y &= ~(hw_screen_offset_alignment - 1); + + /* Apply the offset to center the viewport and maximize the guardband. */ + vp_as_scissor.minx -= hw_screen_offset_x; + vp_as_scissor.maxx -= hw_screen_offset_x; + vp_as_scissor.miny -= hw_screen_offset_y; + vp_as_scissor.maxy -= hw_screen_offset_y; + + /* Reconstruct the viewport transformation from the scissor. */ + vp.translate[0] = (vp_as_scissor.minx + vp_as_scissor.maxx) / 2.0; + vp.translate[1] = (vp_as_scissor.miny + vp_as_scissor.maxy) / 2.0; + vp.scale[0] = vp_as_scissor.maxx - vp.translate[0]; + vp.scale[1] = vp_as_scissor.maxy - vp.translate[1]; + + /* Treat a 0x0 viewport as 1x1 to prevent division by zero. */ + if (vp_as_scissor.minx == vp_as_scissor.maxx) + vp.scale[0] = 0.5; + if (vp_as_scissor.miny == vp_as_scissor.maxy) + vp.scale[1] = 0.5; + + /* Find the biggest guard band that is inside the supported viewport + * range. The guard band is specified as a horizontal and vertical + * distance from (0,0) in clip space. + * + * This is done by applying the inverse viewport transformation + * on the viewport limits to get those limits in clip space. + * + * The viewport range is [-max_viewport_size/2, max_viewport_size/2]. + */ + assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size)); + max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2; + left = (-max_range - vp.translate[0]) / vp.scale[0]; + right = (max_range - vp.translate[0]) / vp.scale[0]; + top = (-max_range - vp.translate[1]) / vp.scale[1]; + bottom = (max_range - vp.translate[1]) / vp.scale[1]; + + assert(left <= -1 && top <= -1 && right >= 1 && bottom >= 1); + + guardband_x = MIN2(-left, right); + guardband_y = MIN2(-top, bottom); + + discard_x = 1.0; + discard_y = 1.0; + + if (unlikely(util_prim_is_points_or_lines(ctx->current_rast_prim))) { + /* When rendering wide points or lines, we need to be more + * conservative about when to discard them entirely. 
*/ + float pixels; + + if (ctx->current_rast_prim == PIPE_PRIM_POINTS) + pixels = rs->max_point_size; + else + pixels = rs->line_width; + + /* Add half the point size / line width */ + discard_x += pixels / (2.0 * vp.scale[0]); + discard_y += pixels / (2.0 * vp.scale[1]); + + /* Discard primitives that would lie entirely outside the clip + * region. */ + discard_x = MIN2(discard_x, guardband_x); + discard_y = MIN2(discard_y, guardband_y); + } + + /* If any of the GB registers is updated, all of them must be updated. + * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ + * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ + */ + unsigned initial_cdw = ctx->gfx_cs->current.cdw; + radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, + SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, fui(guardband_y), fui(discard_y), + fui(guardband_x), fui(discard_x)); + radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, + SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET, + S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) | + S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4)); + radeon_opt_set_context_reg( + ctx, R_028BE4_PA_SU_VTX_CNTL, SI_TRACKED_PA_SU_VTX_CNTL, + S_028BE4_PIX_CENTER(rs->half_pixel_center) | + S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode)); + if (initial_cdw != ctx->gfx_cs->current.cdw) + ctx->context_roll = true; + + si_update_ngg_small_prim_precision(ctx); } static void si_emit_scissors(struct si_context *ctx) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - struct pipe_scissor_state *states = ctx->scissors; - bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable; - - /* The simple case: Only 1 viewport is active. */ - if (!ctx->vs_writes_viewport_index) { - struct si_signed_scissor *vp = &ctx->viewports.as_scissor[0]; - - radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2); - si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL); - return; - } - - /* All registers in the array need to be updated if any of them is changed. - * This is a hardware requirement. - */ - radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, - SI_MAX_VIEWPORTS * 2); - for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) { - si_emit_one_scissor(ctx, cs, &ctx->viewports.as_scissor[i], - scissor_enabled ? &states[i] : NULL); - } + struct radeon_cmdbuf *cs = ctx->gfx_cs; + struct pipe_scissor_state *states = ctx->scissors; + bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable; + + /* The simple case: Only 1 viewport is active. */ + if (!ctx->vs_writes_viewport_index) { + struct si_signed_scissor *vp = &ctx->viewports.as_scissor[0]; + + radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2); + si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL); + return; + } + + /* All registers in the array need to be updated if any of them is changed. + * This is a hardware requirement. + */ + radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, SI_MAX_VIEWPORTS * 2); + for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) { + si_emit_one_scissor(ctx, cs, &ctx->viewports.as_scissor[i], + scissor_enabled ? 
&states[i] : NULL); + } } -static void si_set_viewport_states(struct pipe_context *pctx, - unsigned start_slot, - unsigned num_viewports, - const struct pipe_viewport_state *state) +static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slot, + unsigned num_viewports, const struct pipe_viewport_state *state) { - struct si_context *ctx = (struct si_context *)pctx; - int i; - - for (i = 0; i < num_viewports; i++) { - unsigned index = start_slot + i; - struct si_signed_scissor *scissor = &ctx->viewports.as_scissor[index]; - - ctx->viewports.states[index] = state[i]; - - si_get_scissor_from_viewport(ctx, &state[i], scissor); - - unsigned w = scissor->maxx - scissor->minx; - unsigned h = scissor->maxy - scissor->miny; - unsigned max_extent = MAX2(w, h); - - int max_corner = MAX2(scissor->maxx, scissor->maxy); - - unsigned center_x = (scissor->maxx + scissor->minx) / 2; - unsigned center_y = (scissor->maxy + scissor->miny) / 2; - unsigned max_center = MAX2(center_x, center_y); - - /* PA_SU_HARDWARE_SCREEN_OFFSET can't center viewports whose - * center start farther than MAX_PA_SU_HARDWARE_SCREEN_OFFSET. - * (for example, a 1x1 viewport in the lower right corner of - * 16Kx16K) Such viewports need a greater guardband, so they - * have to use a worse quantization mode. - */ - unsigned distance_off_center = - MAX2(0, (int)max_center - MAX_PA_SU_HARDWARE_SCREEN_OFFSET); - max_extent += distance_off_center; - - /* Determine the best quantization mode (subpixel precision), - * but also leave enough space for the guardband. - * - * Note that primitive binning requires QUANT_MODE == 16_8 on Vega10 - * and Raven1 for line and rectangle primitive types to work correctly. - * Always use 16_8 if primitive binning is possible to occur. - */ - if ((ctx->family == CHIP_VEGA10 || ctx->family == CHIP_RAVEN) && - ctx->screen->dpbb_allowed) - max_extent = 16384; /* Use QUANT_MODE == 16_8. */ - - /* Another constraint is that all coordinates in the viewport - * are representable in fixed point with respect to the - * surface origin. - * - * It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given - * an offset that would make the upper corner of the viewport - * greater than the maximum representable number post - * quantization, ie 2^quant_bits. - * - * This does not matter for 14.10 and 16.8 formats since the - * offset is already limited at 8k, but it means we can't use - * 12.12 if we are drawing to some pixels outside the lower - * 4k x 4k of the render target. 
- */ - - if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */ - scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH; - else if (max_extent <= 4096) /* 16K scanline area for guardband */ - scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH; - else /* 64K scanline area for guardband */ - scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; - } - - if (start_slot == 0) { - ctx->viewports.y_inverted = - -state->scale[1] + state->translate[1] > - state->scale[1] + state->translate[1]; - } - - si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); - si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); - si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); + struct si_context *ctx = (struct si_context *)pctx; + int i; + + for (i = 0; i < num_viewports; i++) { + unsigned index = start_slot + i; + struct si_signed_scissor *scissor = &ctx->viewports.as_scissor[index]; + + ctx->viewports.states[index] = state[i]; + + si_get_scissor_from_viewport(ctx, &state[i], scissor); + + unsigned w = scissor->maxx - scissor->minx; + unsigned h = scissor->maxy - scissor->miny; + unsigned max_extent = MAX2(w, h); + + int max_corner = MAX2(scissor->maxx, scissor->maxy); + + unsigned center_x = (scissor->maxx + scissor->minx) / 2; + unsigned center_y = (scissor->maxy + scissor->miny) / 2; + unsigned max_center = MAX2(center_x, center_y); + + /* PA_SU_HARDWARE_SCREEN_OFFSET can't center viewports whose + * center start farther than MAX_PA_SU_HARDWARE_SCREEN_OFFSET. + * (for example, a 1x1 viewport in the lower right corner of + * 16Kx16K) Such viewports need a greater guardband, so they + * have to use a worse quantization mode. + */ + unsigned distance_off_center = MAX2(0, (int)max_center - MAX_PA_SU_HARDWARE_SCREEN_OFFSET); + max_extent += distance_off_center; + + /* Determine the best quantization mode (subpixel precision), + * but also leave enough space for the guardband. + * + * Note that primitive binning requires QUANT_MODE == 16_8 on Vega10 + * and Raven1 for line and rectangle primitive types to work correctly. + * Always use 16_8 if primitive binning is possible to occur. + */ + if ((ctx->family == CHIP_VEGA10 || ctx->family == CHIP_RAVEN) && ctx->screen->dpbb_allowed) + max_extent = 16384; /* Use QUANT_MODE == 16_8. */ + + /* Another constraint is that all coordinates in the viewport + * are representable in fixed point with respect to the + * surface origin. + * + * It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given + * an offset that would make the upper corner of the viewport + * greater than the maximum representable number post + * quantization, ie 2^quant_bits. + * + * This does not matter for 14.10 and 16.8 formats since the + * offset is already limited at 8k, but it means we can't use + * 12.12 if we are drawing to some pixels outside the lower + * 4k x 4k of the render target. 
+ */ + + if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */ + scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH; + else if (max_extent <= 4096) /* 16K scanline area for guardband */ + scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH; + else /* 64K scanline area for guardband */ + scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; + } + + if (start_slot == 0) { + ctx->viewports.y_inverted = + -state->scale[1] + state->translate[1] > state->scale[1] + state->translate[1]; + } + + si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); + si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); + si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); } -static void si_emit_one_viewport(struct si_context *ctx, - struct pipe_viewport_state *state) +static void si_emit_one_viewport(struct si_context *ctx, struct pipe_viewport_state *state) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - - radeon_emit(cs, fui(state->scale[0])); - radeon_emit(cs, fui(state->translate[0])); - radeon_emit(cs, fui(state->scale[1])); - radeon_emit(cs, fui(state->translate[1])); - radeon_emit(cs, fui(state->scale[2])); - radeon_emit(cs, fui(state->translate[2])); + struct radeon_cmdbuf *cs = ctx->gfx_cs; + + radeon_emit(cs, fui(state->scale[0])); + radeon_emit(cs, fui(state->translate[0])); + radeon_emit(cs, fui(state->scale[1])); + radeon_emit(cs, fui(state->translate[1])); + radeon_emit(cs, fui(state->scale[2])); + radeon_emit(cs, fui(state->translate[2])); } static void si_emit_viewports(struct si_context *ctx) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - struct pipe_viewport_state *states = ctx->viewports.states; - - if (ctx->screen->use_ngg_culling) { - /* Set the viewport info for small primitive culling. */ - struct si_small_prim_cull_info info; - si_get_small_prim_cull_info(ctx, &info); - - if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) { - unsigned offset = 0; - - /* Align to 256, because the address is shifted by 8 bits. */ - u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256, - &info, &offset, - (struct pipe_resource**)&ctx->small_prim_cull_info_buf); - - ctx->small_prim_cull_info_address = - ctx->small_prim_cull_info_buf->gpu_address + offset; - ctx->last_small_prim_cull_info = info; - ctx->small_prim_cull_info_dirty = true; - } - - if (ctx->small_prim_cull_info_dirty) { - /* This will end up in SGPR6 as (value << 8), shifted by the hw. */ - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->small_prim_cull_info_buf, - RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); - radeon_set_sh_reg(ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS, - ctx->small_prim_cull_info_address >> 8); - ctx->small_prim_cull_info_dirty = false; - } - } - - /* The simple case: Only 1 viewport is active. */ - if (!ctx->vs_writes_viewport_index) { - radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6); - si_emit_one_viewport(ctx, &states[0]); - return; - } - - /* All registers in the array need to be updated if any of them is changed. - * This is a hardware requirement. - */ - radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + - 0, SI_MAX_VIEWPORTS * 6); - for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) - si_emit_one_viewport(ctx, &states[i]); + struct radeon_cmdbuf *cs = ctx->gfx_cs; + struct pipe_viewport_state *states = ctx->viewports.states; + + if (ctx->screen->use_ngg_culling) { + /* Set the viewport info for small primitive culling. 
*/ + struct si_small_prim_cull_info info; + si_get_small_prim_cull_info(ctx, &info); + + if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) { + unsigned offset = 0; + + /* Align to 256, because the address is shifted by 8 bits. */ + u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset, + (struct pipe_resource **)&ctx->small_prim_cull_info_buf); + + ctx->small_prim_cull_info_address = ctx->small_prim_cull_info_buf->gpu_address + offset; + ctx->last_small_prim_cull_info = info; + ctx->small_prim_cull_info_dirty = true; + } + + if (ctx->small_prim_cull_info_dirty) { + /* This will end up in SGPR6 as (value << 8), shifted by the hw. */ + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->small_prim_cull_info_buf, + RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); + radeon_set_sh_reg(ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS, + ctx->small_prim_cull_info_address >> 8); + ctx->small_prim_cull_info_dirty = false; + } + } + + /* The simple case: Only 1 viewport is active. */ + if (!ctx->vs_writes_viewport_index) { + radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6); + si_emit_one_viewport(ctx, &states[0]); + return; + } + + /* All registers in the array need to be updated if any of them is changed. + * This is a hardware requirement. + */ + radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + 0, SI_MAX_VIEWPORTS * 6); + for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) + si_emit_one_viewport(ctx, &states[i]); } -static inline void -si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz, - bool window_space_position, float *zmin, float *zmax) +static inline void si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz, + bool window_space_position, float *zmin, float *zmax) { - if (window_space_position) { - *zmin = 0; - *zmax = 1; - return; - } - util_viewport_zmin_zmax(vp, halfz, zmin, zmax); + if (window_space_position) { + *zmin = 0; + *zmax = 1; + return; + } + util_viewport_zmin_zmax(vp, halfz, zmin, zmax); } static void si_emit_depth_ranges(struct si_context *ctx) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - struct pipe_viewport_state *states = ctx->viewports.states; - bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz; - bool window_space = ctx->vs_disables_clipping_viewport; - float zmin, zmax; - - /* The simple case: Only 1 viewport is active. */ - if (!ctx->vs_writes_viewport_index) { - si_viewport_zmin_zmax(&states[0], clip_halfz, window_space, - &zmin, &zmax); - - radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2); - radeon_emit(cs, fui(zmin)); - radeon_emit(cs, fui(zmax)); - return; - } - - /* All registers in the array need to be updated if any of them is changed. - * This is a hardware requirement. - */ - radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, - SI_MAX_VIEWPORTS * 2); - for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) { - si_viewport_zmin_zmax(&states[i], clip_halfz, window_space, - &zmin, &zmax); - radeon_emit(cs, fui(zmin)); - radeon_emit(cs, fui(zmax)); - } + struct radeon_cmdbuf *cs = ctx->gfx_cs; + struct pipe_viewport_state *states = ctx->viewports.states; + bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz; + bool window_space = ctx->vs_disables_clipping_viewport; + float zmin, zmax; + + /* The simple case: Only 1 viewport is active. 
*/ + if (!ctx->vs_writes_viewport_index) { + si_viewport_zmin_zmax(&states[0], clip_halfz, window_space, &zmin, &zmax); + + radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2); + radeon_emit(cs, fui(zmin)); + radeon_emit(cs, fui(zmax)); + return; + } + + /* All registers in the array need to be updated if any of them is changed. + * This is a hardware requirement. + */ + radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, SI_MAX_VIEWPORTS * 2); + for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) { + si_viewport_zmin_zmax(&states[i], clip_halfz, window_space, &zmin, &zmax); + radeon_emit(cs, fui(zmin)); + radeon_emit(cs, fui(zmax)); + } } static void si_emit_viewport_states(struct si_context *ctx) { - si_emit_viewports(ctx); - si_emit_depth_ranges(ctx); + si_emit_viewports(ctx); + si_emit_depth_ranges(ctx); } /** @@ -579,128 +550,112 @@ static void si_emit_viewport_states(struct si_context *ctx) */ void si_update_vs_viewport_state(struct si_context *ctx) { - struct si_shader_info *info = si_get_vs_info(ctx); - bool vs_window_space; - - if (!info) - return; - - /* When the VS disables clipping and viewport transformation. */ - vs_window_space = - info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; - - if (ctx->vs_disables_clipping_viewport != vs_window_space) { - ctx->vs_disables_clipping_viewport = vs_window_space; - si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); - si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); - } - - /* Viewport index handling. */ - if (ctx->vs_writes_viewport_index == info->writes_viewport_index) - return; - - /* This changes how the guardband is computed. */ - ctx->vs_writes_viewport_index = info->writes_viewport_index; - si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); - - /* Emit scissors and viewports that were enabled by having - * the ViewportIndex output. - */ - if (info->writes_viewport_index) { - si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); - si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); - } + struct si_shader_info *info = si_get_vs_info(ctx); + bool vs_window_space; + + if (!info) + return; + + /* When the VS disables clipping and viewport transformation. */ + vs_window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + + if (ctx->vs_disables_clipping_viewport != vs_window_space) { + ctx->vs_disables_clipping_viewport = vs_window_space; + si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); + si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); + } + + /* Viewport index handling. */ + if (ctx->vs_writes_viewport_index == info->writes_viewport_index) + return; + + /* This changes how the guardband is computed. */ + ctx->vs_writes_viewport_index = info->writes_viewport_index; + si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); + + /* Emit scissors and viewports that were enabled by having + * the ViewportIndex output. + */ + if (info->writes_viewport_index) { + si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); + si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); + } } static void si_emit_window_rectangles(struct si_context *sctx) { - /* There are four clipping rectangles. Their corner coordinates are inclusive. - * Every pixel is assigned a number from 0 and 15 by setting bits 0-3 depending - * on whether the pixel is inside cliprects 0-3, respectively. For example, - * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned - * the number 3 (binary 0011). - * - * If CLIPRECT_RULE & (1 << number), the pixel is rasterized. 
- */ - struct radeon_cmdbuf *cs = sctx->gfx_cs; - static const unsigned outside[4] = { - /* outside rectangle 0 */ - V_02820C_OUT | - V_02820C_IN_1 | - V_02820C_IN_2 | - V_02820C_IN_21 | - V_02820C_IN_3 | - V_02820C_IN_31 | - V_02820C_IN_32 | - V_02820C_IN_321, - /* outside rectangles 0, 1 */ - V_02820C_OUT | - V_02820C_IN_2 | - V_02820C_IN_3 | - V_02820C_IN_32, - /* outside rectangles 0, 1, 2 */ - V_02820C_OUT | - V_02820C_IN_3, - /* outside rectangles 0, 1, 2, 3 */ - V_02820C_OUT, - }; - const unsigned disabled = 0xffff; /* all inside and outside cases */ - unsigned num_rectangles = sctx->num_window_rectangles; - struct pipe_scissor_state *rects = sctx->window_rectangles; - unsigned rule; - - assert(num_rectangles <= 4); - - if (num_rectangles == 0) - rule = disabled; - else if (sctx->window_rectangles_include) - rule = ~outside[num_rectangles - 1]; - else - rule = outside[num_rectangles - 1]; - - radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE, - SI_TRACKED_PA_SC_CLIPRECT_RULE, rule); - if (num_rectangles == 0) - return; - - radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL, - num_rectangles * 2); - for (unsigned i = 0; i < num_rectangles; i++) { - radeon_emit(cs, S_028210_TL_X(rects[i].minx) | - S_028210_TL_Y(rects[i].miny)); - radeon_emit(cs, S_028214_BR_X(rects[i].maxx) | - S_028214_BR_Y(rects[i].maxy)); - } + /* There are four clipping rectangles. Their corner coordinates are inclusive. + * Every pixel is assigned a number from 0 and 15 by setting bits 0-3 depending + * on whether the pixel is inside cliprects 0-3, respectively. For example, + * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned + * the number 3 (binary 0011). + * + * If CLIPRECT_RULE & (1 << number), the pixel is rasterized. + */ + struct radeon_cmdbuf *cs = sctx->gfx_cs; + static const unsigned outside[4] = { + /* outside rectangle 0 */ + V_02820C_OUT | V_02820C_IN_1 | V_02820C_IN_2 | V_02820C_IN_21 | V_02820C_IN_3 | + V_02820C_IN_31 | V_02820C_IN_32 | V_02820C_IN_321, + /* outside rectangles 0, 1 */ + V_02820C_OUT | V_02820C_IN_2 | V_02820C_IN_3 | V_02820C_IN_32, + /* outside rectangles 0, 1, 2 */ + V_02820C_OUT | V_02820C_IN_3, + /* outside rectangles 0, 1, 2, 3 */ + V_02820C_OUT, + }; + const unsigned disabled = 0xffff; /* all inside and outside cases */ + unsigned num_rectangles = sctx->num_window_rectangles; + struct pipe_scissor_state *rects = sctx->window_rectangles; + unsigned rule; + + assert(num_rectangles <= 4); + + if (num_rectangles == 0) + rule = disabled; + else if (sctx->window_rectangles_include) + rule = ~outside[num_rectangles - 1]; + else + rule = outside[num_rectangles - 1]; + + radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE, SI_TRACKED_PA_SC_CLIPRECT_RULE, + rule); + if (num_rectangles == 0) + return; + + radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL, num_rectangles * 2); + for (unsigned i = 0; i < num_rectangles; i++) { + radeon_emit(cs, S_028210_TL_X(rects[i].minx) | S_028210_TL_Y(rects[i].miny)); + radeon_emit(cs, S_028214_BR_X(rects[i].maxx) | S_028214_BR_Y(rects[i].maxy)); + } } -static void si_set_window_rectangles(struct pipe_context *ctx, - bool include, - unsigned num_rectangles, - const struct pipe_scissor_state *rects) +static void si_set_window_rectangles(struct pipe_context *ctx, bool include, + unsigned num_rectangles, + const struct pipe_scissor_state *rects) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - sctx->num_window_rectangles = 
num_rectangles; - sctx->window_rectangles_include = include; - if (num_rectangles) { - memcpy(sctx->window_rectangles, rects, - sizeof(*rects) * num_rectangles); - } + sctx->num_window_rectangles = num_rectangles; + sctx->window_rectangles_include = include; + if (num_rectangles) { + memcpy(sctx->window_rectangles, rects, sizeof(*rects) * num_rectangles); + } - si_mark_atom_dirty(sctx, &sctx->atoms.s.window_rectangles); + si_mark_atom_dirty(sctx, &sctx->atoms.s.window_rectangles); } void si_init_viewport_functions(struct si_context *ctx) { - ctx->atoms.s.guardband.emit = si_emit_guardband; - ctx->atoms.s.scissors.emit = si_emit_scissors; - ctx->atoms.s.viewports.emit = si_emit_viewport_states; - ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles; + ctx->atoms.s.guardband.emit = si_emit_guardband; + ctx->atoms.s.scissors.emit = si_emit_scissors; + ctx->atoms.s.viewports.emit = si_emit_viewport_states; + ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles; - ctx->b.set_scissor_states = si_set_scissor_states; - ctx->b.set_viewport_states = si_set_viewport_states; - ctx->b.set_window_rectangles = si_set_window_rectangles; + ctx->b.set_scissor_states = si_set_scissor_states; + ctx->b.set_viewport_states = si_set_viewport_states; + ctx->b.set_window_rectangles = si_set_window_rectangles; - for (unsigned i = 0; i < 16; i++) - ctx->viewports.as_scissor[i].quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; + for (unsigned i = 0; i < 16; i++) + ctx->viewports.as_scissor[i].quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; } diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c index f803448cfc6..7b4ecedbcba 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma.c +++ b/src/gallium/drivers/radeonsi/si_test_dma.c @@ -26,8 +26,8 @@ /* This file implements randomized SDMA texture blit tests. */ #include "si_pipe.h" -#include "util/u_surface.h" #include "util/rand_xor.h" +#include "util/u_surface.h" static uint64_t seed_xorshift128plus[2]; @@ -36,382 +36,356 @@ static uint64_t seed_xorshift128plus[2]; /* The GPU blits are emulated on the CPU using these CPU textures. 
*/ struct cpu_texture { - uint8_t *ptr; - uint64_t size; - uint64_t layer_stride; - unsigned stride; + uint8_t *ptr; + uint64_t size; + uint64_t layer_stride; + unsigned stride; }; -static void alloc_cpu_texture(struct cpu_texture *tex, - struct pipe_resource *templ) +static void alloc_cpu_texture(struct cpu_texture *tex, struct pipe_resource *templ) { - tex->stride = align(util_format_get_stride(templ->format, templ->width0), - RAND_NUM_SIZE); - tex->layer_stride = (uint64_t)tex->stride * templ->height0; - tex->size = tex->layer_stride * templ->array_size; - tex->ptr = malloc(tex->size); - assert(tex->ptr); + tex->stride = align(util_format_get_stride(templ->format, templ->width0), RAND_NUM_SIZE); + tex->layer_stride = (uint64_t)tex->stride * templ->height0; + tex->size = tex->layer_stride * templ->array_size; + tex->ptr = malloc(tex->size); + assert(tex->ptr); } -static void set_random_pixels(struct pipe_context *ctx, - struct pipe_resource *tex, - struct cpu_texture *cpu) +static void set_random_pixels(struct pipe_context *ctx, struct pipe_resource *tex, + struct cpu_texture *cpu) { - struct pipe_transfer *t; - uint8_t *map; - int x,y,z; - - map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE, - 0, 0, 0, tex->width0, tex->height0, - tex->array_size, &t); - assert(map); - - for (z = 0; z < tex->array_size; z++) { - for (y = 0; y < tex->height0; y++) { - uint64_t *ptr = (uint64_t*) - (map + t->layer_stride*z + t->stride*y); - uint64_t *ptr_cpu = (uint64_t*) - (cpu->ptr + cpu->layer_stride*z + cpu->stride*y); - unsigned size = cpu->stride / RAND_NUM_SIZE; - - assert(t->stride % RAND_NUM_SIZE == 0); - assert(cpu->stride % RAND_NUM_SIZE == 0); - - for (x = 0; x < size; x++) { - *ptr++ = *ptr_cpu++ = - rand_xorshift128plus(seed_xorshift128plus); - } - } - } - - pipe_transfer_unmap(ctx, t); + struct pipe_transfer *t; + uint8_t *map; + int x, y, z; + + map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE, 0, 0, 0, tex->width0, tex->height0, + tex->array_size, &t); + assert(map); + + for (z = 0; z < tex->array_size; z++) { + for (y = 0; y < tex->height0; y++) { + uint64_t *ptr = (uint64_t *)(map + t->layer_stride * z + t->stride * y); + uint64_t *ptr_cpu = (uint64_t *)(cpu->ptr + cpu->layer_stride * z + cpu->stride * y); + unsigned size = cpu->stride / RAND_NUM_SIZE; + + assert(t->stride % RAND_NUM_SIZE == 0); + assert(cpu->stride % RAND_NUM_SIZE == 0); + + for (x = 0; x < size; x++) { + *ptr++ = *ptr_cpu++ = rand_xorshift128plus(seed_xorshift128plus); + } + } + } + + pipe_transfer_unmap(ctx, t); } -static bool compare_textures(struct pipe_context *ctx, - struct pipe_resource *tex, - struct cpu_texture *cpu) +static bool compare_textures(struct pipe_context *ctx, struct pipe_resource *tex, + struct cpu_texture *cpu) { - struct pipe_transfer *t; - uint8_t *map; - int y,z; - bool pass = true; - unsigned stride = util_format_get_stride(tex->format, tex->width0); - - map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_READ, - 0, 0, 0, tex->width0, tex->height0, - tex->array_size, &t); - assert(map); - - for (z = 0; z < tex->array_size; z++) { - for (y = 0; y < tex->height0; y++) { - uint8_t *ptr = map + t->layer_stride*z + t->stride*y; - uint8_t *cpu_ptr = cpu->ptr + - cpu->layer_stride*z + cpu->stride*y; - - if (memcmp(ptr, cpu_ptr, stride)) { - pass = false; - goto done; - } - } - } + struct pipe_transfer *t; + uint8_t *map; + int y, z; + bool pass = true; + unsigned stride = util_format_get_stride(tex->format, tex->width0); + + map = pipe_transfer_map_3d(ctx, tex, 0, 
PIPE_TRANSFER_READ, 0, 0, 0, tex->width0, tex->height0, + tex->array_size, &t); + assert(map); + + for (z = 0; z < tex->array_size; z++) { + for (y = 0; y < tex->height0; y++) { + uint8_t *ptr = map + t->layer_stride * z + t->stride * y; + uint8_t *cpu_ptr = cpu->ptr + cpu->layer_stride * z + cpu->stride * y; + + if (memcmp(ptr, cpu_ptr, stride)) { + pass = false; + goto done; + } + } + } done: - pipe_transfer_unmap(ctx, t); - return pass; + pipe_transfer_unmap(ctx, t); + return pass; } static enum pipe_format choose_format() { - enum pipe_format formats[] = { - PIPE_FORMAT_R8_UINT, - PIPE_FORMAT_R16_UINT, - PIPE_FORMAT_R32_UINT, - PIPE_FORMAT_R32G32_UINT, - PIPE_FORMAT_R32G32B32A32_UINT, - PIPE_FORMAT_G8R8_B8R8_UNORM, - }; - return formats[rand() % ARRAY_SIZE(formats)]; + enum pipe_format formats[] = { + PIPE_FORMAT_R8_UINT, PIPE_FORMAT_R16_UINT, PIPE_FORMAT_R32_UINT, + PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32B32A32_UINT, PIPE_FORMAT_G8R8_B8R8_UNORM, + }; + return formats[rand() % ARRAY_SIZE(formats)]; } -static const char *array_mode_to_string(struct si_screen *sscreen, - struct radeon_surf *surf) +static const char *array_mode_to_string(struct si_screen *sscreen, struct radeon_surf *surf) { - if (sscreen->info.chip_class >= GFX9) { - switch (surf->u.gfx9.surf.swizzle_mode) { - case 0: - return " LINEAR"; - case 21: - return " 4KB_S_X"; - case 22: - return " 4KB_D_X"; - case 25: - return "64KB_S_X"; - case 26: - return "64KB_D_X"; - default: - printf("Unhandled swizzle mode = %u\n", - surf->u.gfx9.surf.swizzle_mode); - return " UNKNOWN"; - } - } else { - switch (surf->u.legacy.level[0].mode) { - case RADEON_SURF_MODE_LINEAR_ALIGNED: - return "LINEAR_ALIGNED"; - case RADEON_SURF_MODE_1D: - return "1D_TILED_THIN1"; - case RADEON_SURF_MODE_2D: - return "2D_TILED_THIN1"; - default: - assert(0); - return " UNKNOWN"; - } - } + if (sscreen->info.chip_class >= GFX9) { + switch (surf->u.gfx9.surf.swizzle_mode) { + case 0: + return " LINEAR"; + case 21: + return " 4KB_S_X"; + case 22: + return " 4KB_D_X"; + case 25: + return "64KB_S_X"; + case 26: + return "64KB_D_X"; + default: + printf("Unhandled swizzle mode = %u\n", surf->u.gfx9.surf.swizzle_mode); + return " UNKNOWN"; + } + } else { + switch (surf->u.legacy.level[0].mode) { + case RADEON_SURF_MODE_LINEAR_ALIGNED: + return "LINEAR_ALIGNED"; + case RADEON_SURF_MODE_1D: + return "1D_TILED_THIN1"; + case RADEON_SURF_MODE_2D: + return "2D_TILED_THIN1"; + default: + assert(0); + return " UNKNOWN"; + } + } } static unsigned generate_max_tex_side(unsigned max_tex_side) { - switch (rand() % 4) { - case 0: - /* Try to hit large sizes in 1/4 of the cases. */ - return max_tex_side; - case 1: - /* Try to hit 1D tiling in 1/4 of the cases. */ - return 128; - default: - /* Try to hit common sizes in 2/4 of the cases. */ - return 2048; - } + switch (rand() % 4) { + case 0: + /* Try to hit large sizes in 1/4 of the cases. */ + return max_tex_side; + case 1: + /* Try to hit 1D tiling in 1/4 of the cases. */ + return 128; + default: + /* Try to hit common sizes in 2/4 of the cases. */ + return 2048; + } } void si_test_dma(struct si_screen *sscreen) { - struct pipe_screen *screen = &sscreen->b; - struct pipe_context *ctx = screen->context_create(screen, NULL, 0); - struct si_context *sctx = (struct si_context*)ctx; - uint64_t max_alloc_size; - unsigned i, iterations, num_partial_copies, max_tex_side; - unsigned num_pass = 0, num_fail = 0; - - max_tex_side = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_SIZE); - - /* Max 128 MB allowed for both textures. 
*/ - max_alloc_size = 128 * 1024 * 1024; - - /* the seed for random test parameters */ - srand(0x9b47d95b); - /* the seed for random pixel data */ - s_rand_xorshift128plus(seed_xorshift128plus, false); - - iterations = 1000000000; /* just kill it when you are bored */ - num_partial_copies = 30; - - /* These parameters are randomly generated per test: - * - whether to do one whole-surface copy or N partial copies per test - * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D) - * - which texture dimensions to use - * - whether to use VRAM (all tiling modes) and GTT (staging, linear - * only) allocations - * - random initial pixels in src - * - generate random subrectangle copies for partial blits - */ - for (i = 0; i < iterations; i++) { - struct pipe_resource tsrc = {}, tdst = {}, *src, *dst; - struct si_texture *sdst; - struct si_texture *ssrc; - struct cpu_texture src_cpu, dst_cpu; - unsigned max_width, max_height, max_depth, j, num; - unsigned gfx_blits = 0, dma_blits = 0, cs_blits = 0, max_tex_side_gen; - unsigned max_tex_layers; - bool pass; - bool do_partial_copies = rand() & 1; - - /* generate a random test case */ - tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY; - tsrc.depth0 = tdst.depth0 = 1; - - tsrc.format = tdst.format = choose_format(); - - max_tex_side_gen = generate_max_tex_side(max_tex_side); - max_tex_layers = rand() % 4 ? 1 : 5; - - tsrc.width0 = (rand() % max_tex_side_gen) + 1; - tsrc.height0 = (rand() % max_tex_side_gen) + 1; - tsrc.array_size = (rand() % max_tex_layers) + 1; - - if (tsrc.format == PIPE_FORMAT_G8R8_B8R8_UNORM) - tsrc.width0 = align(tsrc.width0, 2); - - /* Have a 1/4 chance of getting power-of-two dimensions. */ - if (rand() % 4 == 0) { - tsrc.width0 = util_next_power_of_two(tsrc.width0); - tsrc.height0 = util_next_power_of_two(tsrc.height0); - } - - if (!do_partial_copies) { - /* whole-surface copies only, same dimensions */ - tdst = tsrc; - } else { - max_tex_side_gen = generate_max_tex_side(max_tex_side); - max_tex_layers = rand() % 4 ? 1 : 5; - - /* many partial copies, dimensions can be different */ - tdst.width0 = (rand() % max_tex_side_gen) + 1; - tdst.height0 = (rand() % max_tex_side_gen) + 1; - tdst.array_size = (rand() % max_tex_layers) + 1; - - /* Have a 1/4 chance of getting power-of-two dimensions. */ - if (rand() % 4 == 0) { - tdst.width0 = util_next_power_of_two(tdst.width0); - tdst.height0 = util_next_power_of_two(tdst.height0); - } - } - - /* check texture sizes */ - if ((uint64_t) util_format_get_nblocks(tsrc.format, tsrc.width0, tsrc.height0) - * tsrc.array_size * util_format_get_blocksize(tsrc.format) + - (uint64_t) util_format_get_nblocks(tdst.format, tdst.width0, tdst.height0) - * tdst.array_size * util_format_get_blocksize(tdst.format) > - max_alloc_size) { - /* too large, try again */ - i--; - continue; - } - - /* VRAM + the tiling mode depends on dimensions (3/4 of cases), - * or GTT + linear only (1/4 of cases) - */ - tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING; - tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING; - - /* Allocate textures (both the GPU and CPU copies). - * The CPU will emulate what the GPU should be doing. 
- */ - src = screen->resource_create(screen, &tsrc); - dst = screen->resource_create(screen, &tdst); - assert(src); - assert(dst); - sdst = (struct si_texture*)dst; - ssrc = (struct si_texture*)src; - alloc_cpu_texture(&src_cpu, &tsrc); - alloc_cpu_texture(&dst_cpu, &tdst); - - printf("%4u: dst = (%5u x %5u x %u, %s), " - " src = (%5u x %5u x %u, %s), format = %s, ", - i, tdst.width0, tdst.height0, tdst.array_size, - array_mode_to_string(sscreen, &sdst->surface), - tsrc.width0, tsrc.height0, tsrc.array_size, - array_mode_to_string(sscreen, &ssrc->surface), - util_format_description(tsrc.format)->name); - fflush(stdout); - - /* set src pixels */ - set_random_pixels(ctx, src, &src_cpu); - - /* clear dst pixels */ - uint32_t zero = 0; - si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4, - SI_COHERENCY_SHADER, false); - memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size); - - /* preparation */ - max_width = MIN2(tsrc.width0, tdst.width0); - max_height = MIN2(tsrc.height0, tdst.height0); - max_depth = MIN2(tsrc.array_size, tdst.array_size); - - num = do_partial_copies ? num_partial_copies : 1; - for (j = 0; j < num; j++) { - int width, height, depth; - int srcx, srcy, srcz, dstx, dsty, dstz; - struct pipe_box box; - unsigned old_num_draw_calls = sctx->num_draw_calls; - unsigned old_num_dma_calls = sctx->num_dma_calls; - unsigned old_num_cs_calls = sctx->num_compute_calls; - - if (!do_partial_copies) { - /* copy whole src to dst */ - width = max_width; - height = max_height; - depth = max_depth; - - srcx = srcy = srcz = dstx = dsty = dstz = 0; - } else { - /* random sub-rectangle copies from src to dst */ - depth = (rand() % max_depth) + 1; - srcz = rand() % (tsrc.array_size - depth + 1); - dstz = rand() % (tdst.array_size - depth + 1); - - /* special code path to hit the tiled partial copies */ - if (!ssrc->surface.is_linear && - !sdst->surface.is_linear && - rand() & 1) { - if (max_width < 8 || max_height < 8) - continue; - width = ((rand() % (max_width / 8)) + 1) * 8; - height = ((rand() % (max_height / 8)) + 1) * 8; - - srcx = rand() % (tsrc.width0 - width + 1) & ~0x7; - srcy = rand() % (tsrc.height0 - height + 1) & ~0x7; - - dstx = rand() % (tdst.width0 - width + 1) & ~0x7; - dsty = rand() % (tdst.height0 - height + 1) & ~0x7; - } else { - /* just make sure that it doesn't divide by zero */ - assert(max_width > 0 && max_height > 0); - - width = (rand() % max_width) + 1; - height = (rand() % max_height) + 1; - - srcx = rand() % (tsrc.width0 - width + 1); - srcy = rand() % (tsrc.height0 - height + 1); - - dstx = rand() % (tdst.width0 - width + 1); - dsty = rand() % (tdst.height0 - height + 1); - } - - /* special code path to hit out-of-bounds reads in L2T */ - if (ssrc->surface.is_linear && - !sdst->surface.is_linear && - rand() % 4 == 0) { - srcx = 0; - srcy = 0; - srcz = 0; - } - } - - /* GPU copy */ - u_box_3d(srcx, srcy, srcz, width, height, depth, &box); - sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box); - - /* See which engine was used. 
*/ - gfx_blits += sctx->num_draw_calls > old_num_draw_calls; - dma_blits += sctx->num_dma_calls > old_num_dma_calls; - cs_blits += sctx->num_compute_calls > old_num_cs_calls; - - /* CPU copy */ - util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride, - dst_cpu.layer_stride, - dstx, dsty, dstz, width, height, depth, - src_cpu.ptr, src_cpu.stride, - src_cpu.layer_stride, - srcx, srcy, srcz); - } - - pass = compare_textures(ctx, dst, &dst_cpu); - if (pass) - num_pass++; - else - num_fail++; - - printf("BLITs: GFX = %2u, DMA = %2u, CS = %2u, %s [%u/%u]\n", - gfx_blits, dma_blits, cs_blits, pass ? "pass" : "fail", - num_pass, num_pass+num_fail); - - /* cleanup */ - pipe_resource_reference(&src, NULL); - pipe_resource_reference(&dst, NULL); - free(src_cpu.ptr); - free(dst_cpu.ptr); - } - - ctx->destroy(ctx); - exit(0); + struct pipe_screen *screen = &sscreen->b; + struct pipe_context *ctx = screen->context_create(screen, NULL, 0); + struct si_context *sctx = (struct si_context *)ctx; + uint64_t max_alloc_size; + unsigned i, iterations, num_partial_copies, max_tex_side; + unsigned num_pass = 0, num_fail = 0; + + max_tex_side = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_SIZE); + + /* Max 128 MB allowed for both textures. */ + max_alloc_size = 128 * 1024 * 1024; + + /* the seed for random test parameters */ + srand(0x9b47d95b); + /* the seed for random pixel data */ + s_rand_xorshift128plus(seed_xorshift128plus, false); + + iterations = 1000000000; /* just kill it when you are bored */ + num_partial_copies = 30; + + /* These parameters are randomly generated per test: + * - whether to do one whole-surface copy or N partial copies per test + * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D) + * - which texture dimensions to use + * - whether to use VRAM (all tiling modes) and GTT (staging, linear + * only) allocations + * - random initial pixels in src + * - generate random subrectangle copies for partial blits + */ + for (i = 0; i < iterations; i++) { + struct pipe_resource tsrc = {}, tdst = {}, *src, *dst; + struct si_texture *sdst; + struct si_texture *ssrc; + struct cpu_texture src_cpu, dst_cpu; + unsigned max_width, max_height, max_depth, j, num; + unsigned gfx_blits = 0, dma_blits = 0, cs_blits = 0, max_tex_side_gen; + unsigned max_tex_layers; + bool pass; + bool do_partial_copies = rand() & 1; + + /* generate a random test case */ + tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY; + tsrc.depth0 = tdst.depth0 = 1; + + tsrc.format = tdst.format = choose_format(); + + max_tex_side_gen = generate_max_tex_side(max_tex_side); + max_tex_layers = rand() % 4 ? 1 : 5; + + tsrc.width0 = (rand() % max_tex_side_gen) + 1; + tsrc.height0 = (rand() % max_tex_side_gen) + 1; + tsrc.array_size = (rand() % max_tex_layers) + 1; + + if (tsrc.format == PIPE_FORMAT_G8R8_B8R8_UNORM) + tsrc.width0 = align(tsrc.width0, 2); + + /* Have a 1/4 chance of getting power-of-two dimensions. */ + if (rand() % 4 == 0) { + tsrc.width0 = util_next_power_of_two(tsrc.width0); + tsrc.height0 = util_next_power_of_two(tsrc.height0); + } + + if (!do_partial_copies) { + /* whole-surface copies only, same dimensions */ + tdst = tsrc; + } else { + max_tex_side_gen = generate_max_tex_side(max_tex_side); + max_tex_layers = rand() % 4 ? 1 : 5; + + /* many partial copies, dimensions can be different */ + tdst.width0 = (rand() % max_tex_side_gen) + 1; + tdst.height0 = (rand() % max_tex_side_gen) + 1; + tdst.array_size = (rand() % max_tex_layers) + 1; + + /* Have a 1/4 chance of getting power-of-two dimensions. 
*/ + if (rand() % 4 == 0) { + tdst.width0 = util_next_power_of_two(tdst.width0); + tdst.height0 = util_next_power_of_two(tdst.height0); + } + } + + /* check texture sizes */ + if ((uint64_t)util_format_get_nblocks(tsrc.format, tsrc.width0, tsrc.height0) * + tsrc.array_size * util_format_get_blocksize(tsrc.format) + + (uint64_t)util_format_get_nblocks(tdst.format, tdst.width0, tdst.height0) * + tdst.array_size * util_format_get_blocksize(tdst.format) > + max_alloc_size) { + /* too large, try again */ + i--; + continue; + } + + /* VRAM + the tiling mode depends on dimensions (3/4 of cases), + * or GTT + linear only (1/4 of cases) + */ + tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING; + tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING; + + /* Allocate textures (both the GPU and CPU copies). + * The CPU will emulate what the GPU should be doing. + */ + src = screen->resource_create(screen, &tsrc); + dst = screen->resource_create(screen, &tdst); + assert(src); + assert(dst); + sdst = (struct si_texture *)dst; + ssrc = (struct si_texture *)src; + alloc_cpu_texture(&src_cpu, &tsrc); + alloc_cpu_texture(&dst_cpu, &tdst); + + printf("%4u: dst = (%5u x %5u x %u, %s), " + " src = (%5u x %5u x %u, %s), format = %s, ", + i, tdst.width0, tdst.height0, tdst.array_size, + array_mode_to_string(sscreen, &sdst->surface), tsrc.width0, tsrc.height0, + tsrc.array_size, array_mode_to_string(sscreen, &ssrc->surface), + util_format_description(tsrc.format)->name); + fflush(stdout); + + /* set src pixels */ + set_random_pixels(ctx, src, &src_cpu); + + /* clear dst pixels */ + uint32_t zero = 0; + si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4, SI_COHERENCY_SHADER, false); + memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size); + + /* preparation */ + max_width = MIN2(tsrc.width0, tdst.width0); + max_height = MIN2(tsrc.height0, tdst.height0); + max_depth = MIN2(tsrc.array_size, tdst.array_size); + + num = do_partial_copies ? 
num_partial_copies : 1; + for (j = 0; j < num; j++) { + int width, height, depth; + int srcx, srcy, srcz, dstx, dsty, dstz; + struct pipe_box box; + unsigned old_num_draw_calls = sctx->num_draw_calls; + unsigned old_num_dma_calls = sctx->num_dma_calls; + unsigned old_num_cs_calls = sctx->num_compute_calls; + + if (!do_partial_copies) { + /* copy whole src to dst */ + width = max_width; + height = max_height; + depth = max_depth; + + srcx = srcy = srcz = dstx = dsty = dstz = 0; + } else { + /* random sub-rectangle copies from src to dst */ + depth = (rand() % max_depth) + 1; + srcz = rand() % (tsrc.array_size - depth + 1); + dstz = rand() % (tdst.array_size - depth + 1); + + /* special code path to hit the tiled partial copies */ + if (!ssrc->surface.is_linear && !sdst->surface.is_linear && rand() & 1) { + if (max_width < 8 || max_height < 8) + continue; + width = ((rand() % (max_width / 8)) + 1) * 8; + height = ((rand() % (max_height / 8)) + 1) * 8; + + srcx = rand() % (tsrc.width0 - width + 1) & ~0x7; + srcy = rand() % (tsrc.height0 - height + 1) & ~0x7; + + dstx = rand() % (tdst.width0 - width + 1) & ~0x7; + dsty = rand() % (tdst.height0 - height + 1) & ~0x7; + } else { + /* just make sure that it doesn't divide by zero */ + assert(max_width > 0 && max_height > 0); + + width = (rand() % max_width) + 1; + height = (rand() % max_height) + 1; + + srcx = rand() % (tsrc.width0 - width + 1); + srcy = rand() % (tsrc.height0 - height + 1); + + dstx = rand() % (tdst.width0 - width + 1); + dsty = rand() % (tdst.height0 - height + 1); + } + + /* special code path to hit out-of-bounds reads in L2T */ + if (ssrc->surface.is_linear && !sdst->surface.is_linear && rand() % 4 == 0) { + srcx = 0; + srcy = 0; + srcz = 0; + } + } + + /* GPU copy */ + u_box_3d(srcx, srcy, srcz, width, height, depth, &box); + sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box); + + /* See which engine was used. */ + gfx_blits += sctx->num_draw_calls > old_num_draw_calls; + dma_blits += sctx->num_dma_calls > old_num_dma_calls; + cs_blits += sctx->num_compute_calls > old_num_cs_calls; + + /* CPU copy */ + util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride, dst_cpu.layer_stride, dstx, dsty, + dstz, width, height, depth, src_cpu.ptr, src_cpu.stride, + src_cpu.layer_stride, srcx, srcy, srcz); + } + + pass = compare_textures(ctx, dst, &dst_cpu); + if (pass) + num_pass++; + else + num_fail++; + + printf("BLITs: GFX = %2u, DMA = %2u, CS = %2u, %s [%u/%u]\n", gfx_blits, dma_blits, cs_blits, + pass ? 
"pass" : "fail", num_pass, num_pass + num_fail); + + /* cleanup */ + pipe_resource_reference(&src, NULL); + pipe_resource_reference(&dst, NULL); + free(src_cpu.ptr); + free(dst_cpu.ptr); + } + + ctx->destroy(ctx); + exit(0); } diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c index 4eec3d12459..116bfe69069 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -28,451 +28,444 @@ #include "si_pipe.h" #include "si_query.h" -#define MIN_SIZE 512 -#define MAX_SIZE (128 * 1024 * 1024) -#define SIZE_SHIFT 1 -#define NUM_RUNS 128 +#define MIN_SIZE 512 +#define MAX_SIZE (128 * 1024 * 1024) +#define SIZE_SHIFT 1 +#define NUM_RUNS 128 static double get_MBps_rate(unsigned num_bytes, unsigned ns) { - return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0); + return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0); } void si_test_dma_perf(struct si_screen *sscreen) { - struct pipe_screen *screen = &sscreen->b; - struct pipe_context *ctx = screen->context_create(screen, NULL, 0); - struct si_context *sctx = (struct si_context*)ctx; - const uint32_t clear_value = 0x12345678; - static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1}; - static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0}; + struct pipe_screen *screen = &sscreen->b; + struct pipe_context *ctx = screen->context_create(screen, NULL, 0); + struct si_context *sctx = (struct si_context *)ctx; + const uint32_t clear_value = 0x12345678; + static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1}; + static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0}; #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list) -#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list)) - - static const char *method_str[] = { - "CP MC ", - "CP L2 ", - "CP L2 ", - "SDMA ", - }; - static const char *placement_str[] = { - /* Clear */ - "fill->VRAM", - "fill->GTT ", - /* Copy */ - "VRAM->VRAM", - "VRAM->GTT ", - "GTT ->VRAM", - }; - - printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n"); - printf("Heap ,Method ,L2p,Wa,"); - for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { - if (size >= 1024) - printf("%6uKB,", size / 1024); - else - printf(" %6uB,", size); - } - printf("\n"); - - /* results[log2(size)][placement][method][] */ - struct si_result { - bool is_valid; - bool is_cp; - bool is_sdma; - bool is_cs; - unsigned cache_policy; - unsigned dwords_per_thread; - unsigned waves_per_sh; - unsigned score; - unsigned index; /* index in results[x][y][index] */ - } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {}; - - /* Run benchmarks. */ - for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { - bool is_copy = placement >= 2; - - printf("-----------,--------,---,--,"); - for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) - printf("--------,"); - printf("\n"); - - for (unsigned method = 0; method < NUM_METHODS; method++) { - bool test_cp = method <= 2; - bool test_sdma = method == 3; - bool test_cs = method >= 4; - unsigned cs_method = method - 4; - STATIC_ASSERT(L2_STREAM + 1 == L2_LRU); - unsigned cs_waves_per_sh = - test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0; - cs_method %= 2*NUM_SHADERS; - unsigned cache_policy = test_cp ? method % 3 : - test_cs ? 
L2_STREAM + (cs_method / NUM_SHADERS) : 0; - unsigned cs_dwords_per_thread = - test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0; - - if (test_sdma && !sctx->sdma_cs) - continue; - - if (sctx->chip_class == GFX6) { - /* GFX6 doesn't support CP DMA operations through L2. */ - if (test_cp && cache_policy != L2_BYPASS) - continue; - /* WAVES_PER_SH is in multiples of 16 on GFX6. */ - if (test_cs && cs_waves_per_sh % 16 != 0) - continue; - } - - printf("%s ,", placement_str[placement]); - if (test_cs) { - printf("CS x%-4u,%3s,", cs_dwords_per_thread, - cache_policy == L2_LRU ? "LRU" : - cache_policy == L2_STREAM ? "Str" : ""); - } else { - printf("%s,%3s,", method_str[method], - method == L2_LRU ? "LRU" : - method == L2_STREAM ? "Str" : ""); - } - if (test_cs && cs_waves_per_sh) - printf("%2u,", cs_waves_per_sh); - else - printf(" ,"); - - double score = 0; - for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { - /* Don't test bigger sizes if it's too slow. Print 0. */ - if (size >= 512*1024 && - score < 400 * (size / (4*1024*1024))) { - printf("%7.0f ,", 0.0); - continue; - } - - enum pipe_resource_usage dst_usage, src_usage; - struct pipe_resource *dst, *src; - struct pipe_query *q[NUM_RUNS]; - unsigned query_type = PIPE_QUERY_TIME_ELAPSED; - - if (test_sdma) { - if (sctx->chip_class == GFX6) - query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI; - else - query_type = SI_QUERY_TIME_ELAPSED_SDMA; - } - - if (placement == 0 || placement == 2 || placement == 4) - dst_usage = PIPE_USAGE_DEFAULT; - else - dst_usage = PIPE_USAGE_STREAM; - - if (placement == 2 || placement == 3) - src_usage = PIPE_USAGE_DEFAULT; - else - src_usage = PIPE_USAGE_STREAM; - - dst = pipe_buffer_create(screen, 0, dst_usage, size); - src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL; - - /* Run tests. */ - for (unsigned iter = 0; iter < NUM_RUNS; iter++) { - q[iter] = ctx->create_query(ctx, query_type, 0); - ctx->begin_query(ctx, q[iter]); - - if (test_cp) { - /* CP DMA */ - if (is_copy) { - si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, - SI_COHERENCY_NONE, cache_policy); - } else { - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, - clear_value, 0, - SI_COHERENCY_NONE, cache_policy); - } - } else if (test_sdma) { - /* SDMA */ - if (is_copy) { - si_sdma_copy_buffer(sctx, dst, src, 0, 0, size); - } else { - si_sdma_clear_buffer(sctx, dst, 0, size, clear_value); - } - } else { - /* Compute */ - /* The memory accesses are coalesced, meaning that the 1st instruction writes - * the 1st contiguous block of data for the whole wave, the 2nd instruction - * writes the 2nd contiguous block of data, etc. 
- */ - unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4); - unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread; - unsigned dwords_per_wave = cs_dwords_per_thread * 64; - - unsigned num_dwords = size / 4; - unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); - - void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread, - cache_policy == L2_STREAM, is_copy); - - struct pipe_grid_info info = {}; - info.block[0] = MIN2(64, num_instructions); - info.block[1] = 1; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); - info.grid[1] = 1; - info.grid[2] = 1; - - struct pipe_shader_buffer sb[2] = {}; - sb[0].buffer = dst; - sb[0].buffer_size = size; - - if (is_copy) { - sb[1].buffer = src; - sb[1].buffer_size = size; - } else { - for (unsigned i = 0; i < 4; i++) - sctx->cs_user_data[i] = clear_value; - } - - sctx->flags |= SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_SCACHE; - - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, - is_copy ? 2 : 1, sb, 0x1); - ctx->bind_compute_state(ctx, cs); - sctx->cs_max_waves_per_sh = cs_waves_per_sh; - - ctx->launch_grid(ctx, &info); - - ctx->bind_compute_state(ctx, NULL); - ctx->delete_compute_state(ctx, cs); - sctx->cs_max_waves_per_sh = 0; /* disable the limit */ - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; - } - - /* Flush L2, so that we don't just test L2 cache performance. */ - if (!test_sdma) { - sctx->flags |= SI_CONTEXT_WB_L2; - sctx->emit_cache_flush(sctx); - } - - ctx->end_query(ctx, q[iter]); - ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC); - } - pipe_resource_reference(&dst, NULL); - pipe_resource_reference(&src, NULL); - - /* Get results. */ - uint64_t min = ~0ull, max = 0, total = 0; - - for (unsigned iter = 0; iter < NUM_RUNS; iter++) { - union pipe_query_result result; - - ctx->get_query_result(ctx, q[iter], true, &result); - ctx->destroy_query(ctx, q[iter]); - - min = MIN2(min, result.u64); - max = MAX2(max, result.u64); - total += result.u64; - } - - score = get_MBps_rate(size, total / (double)NUM_RUNS); - printf("%7.0f ,", score); - fflush(stdout); - - struct si_result *r = &results[util_logbase2(size)][placement][method]; - r->is_valid = true; - r->is_cp = test_cp; - r->is_sdma = test_sdma; - r->is_cs = test_cs; - r->cache_policy = cache_policy; - r->dwords_per_thread = cs_dwords_per_thread; - r->waves_per_sh = cs_waves_per_sh; - r->score = score; - r->index = method; - } - puts(""); - } - } - - puts(""); - puts("static struct si_method"); - printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n", - sctx->screen->info.name); - puts("{"); - puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); - - /* Analyze results and find the best methods. 
*/ - for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { - if (placement == 0) - puts(" if (dst == RADEON_DOMAIN_VRAM) {"); - else if (placement == 1) - puts(" } else { /* GTT */"); - else if (placement == 2) { - puts("}"); - puts(""); - puts("static struct si_method"); - printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n", - sctx->screen->info.name); - printf(" uint64_t size64, bool async, bool cached)\n"); - puts("{"); - puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); - puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {"); - } else if (placement == 3) - puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {"); - else - puts(" } else { /* GTT -> VRAM */"); - - for (unsigned mode = 0; mode < 3; mode++) { - bool async = mode == 0; - bool cached = mode == 1; - - if (async) - puts(" if (async) { /* SDMA or async compute */"); - else if (cached) - puts(" if (cached) { /* gfx ring */"); - else - puts(" } else { /* gfx ring - uncached */"); - - /* The list of best chosen methods. */ - struct si_result *methods[32]; - unsigned method_max_size[32]; - unsigned num_methods = 0; - - for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { - /* Find the best method. */ - struct si_result *best = NULL; - - for (unsigned i = 0; i < NUM_METHODS; i++) { - struct si_result *r = &results[util_logbase2(size)][placement][i]; - - if (!r->is_valid) - continue; - - /* Ban CP DMA clears via MC on <= GFX8. They are super slow - * on GTT, which we can get due to BO evictions. - */ - if (sctx->chip_class <= GFX8 && placement == 1 && - r->is_cp && r->cache_policy == L2_BYPASS) - continue; - - if (async) { - /* The following constraints for compute IBs try to limit - * resource usage so as not to decrease the performance - * of gfx IBs too much. - */ - - /* Don't use CP DMA on asynchronous rings, because - * the engine is shared with gfx IBs. - */ - if (r->is_cp) - continue; - - /* Don't use L2 caching on asynchronous rings to minimize - * L2 usage. - */ - if (r->cache_policy == L2_LRU) - continue; - - /* Asynchronous compute recommends waves_per_sh != 0 - * to limit CU usage. */ - if (r->is_cs && r->waves_per_sh == 0) - continue; - } else { - /* SDMA is always asynchronous */ - if (r->is_sdma) - continue; - - if (cached && r->cache_policy == L2_BYPASS) - continue; - if (!cached && r->cache_policy == L2_LRU) - continue; - } - - if (!best) { - best = r; - continue; - } - - /* Assume some measurement error. Earlier methods occupy fewer - * resources, so the next method is always more greedy, and we - * don't want to select it due to a measurement error. - */ - double min_improvement = 1.03; - - if (best->score * min_improvement < r->score) - best = r; - } - - if (num_methods > 0) { - unsigned prev_index = num_methods - 1; - struct si_result *prev = methods[prev_index]; - struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index]; - - /* If the best one is also the best for the previous size, - * just bump the size for the previous one. - * - * If there is no best, it means all methods were too slow - * for this size and were not tested. Use the best one for - * the previous size. 
- */ - if (!best || - /* If it's the same method as for the previous size: */ - (prev->is_cp == best->is_cp && - prev->is_sdma == best->is_sdma && - prev->is_cs == best->is_cs && - prev->cache_policy == best->cache_policy && - prev->dwords_per_thread == best->dwords_per_thread && - prev->waves_per_sh == best->waves_per_sh) || - /* If the method for the previous size is also the best - * for this size: */ - (prev_this_size->is_valid && - prev_this_size->score * 1.03 > best->score)) { - method_max_size[prev_index] = size; - continue; - } - } - - /* Add it to the list. */ - assert(num_methods < ARRAY_SIZE(methods)); - methods[num_methods] = best; - method_max_size[num_methods] = size; - num_methods++; - } - - for (unsigned i = 0; i < num_methods; i++) { - struct si_result *best = methods[i]; - unsigned size = method_max_size[i]; - - /* The size threshold is between the current benchmarked - * size and the next benchmarked size. */ - if (i < num_methods - 1) - printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2); - else if (i > 0) - printf(" else "); - else - printf(" "); - printf("return "); - - assert(best); - if (best->is_cp) { - printf("CP_DMA(%s);\n", - best->cache_policy == L2_BYPASS ? "L2_BYPASS" : - best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM"); - } - if (best->is_sdma) - printf("SDMA;\n"); - if (best->is_cs) { - printf("COMPUTE(%s, %u, %u);\n", - best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM", - best->dwords_per_thread, - best->waves_per_sh); - } - } - } - puts(" }"); - } - puts(" }"); - puts("}"); - - ctx->destroy(ctx); - exit(0); +#define NUM_METHODS (4 + 2 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list)) + + static const char *method_str[] = { + "CP MC ", + "CP L2 ", + "CP L2 ", + "SDMA ", + }; + static const char *placement_str[] = { + /* Clear */ + "fill->VRAM", + "fill->GTT ", + /* Copy */ + "VRAM->VRAM", + "VRAM->GTT ", + "GTT ->VRAM", + }; + + printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n"); + printf("Heap ,Method ,L2p,Wa,"); + for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { + if (size >= 1024) + printf("%6uKB,", size / 1024); + else + printf(" %6uB,", size); + } + printf("\n"); + + /* results[log2(size)][placement][method][] */ + struct si_result { + bool is_valid; + bool is_cp; + bool is_sdma; + bool is_cs; + unsigned cache_policy; + unsigned dwords_per_thread; + unsigned waves_per_sh; + unsigned score; + unsigned index; /* index in results[x][y][index] */ + } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {}; + + /* Run benchmarks. */ + for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { + bool is_copy = placement >= 2; + + printf("-----------,--------,---,--,"); + for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) + printf("--------,"); + printf("\n"); + + for (unsigned method = 0; method < NUM_METHODS; method++) { + bool test_cp = method <= 2; + bool test_sdma = method == 3; + bool test_cs = method >= 4; + unsigned cs_method = method - 4; + STATIC_ASSERT(L2_STREAM + 1 == L2_LRU); + unsigned cs_waves_per_sh = + test_cs ? cs_waves_per_sh_list[cs_method / (2 * NUM_SHADERS)] : 0; + cs_method %= 2 * NUM_SHADERS; + unsigned cache_policy = + test_cp ? method % 3 : test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0; + unsigned cs_dwords_per_thread = + test_cs ? 
cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0; + + if (test_sdma && !sctx->sdma_cs) + continue; + + if (sctx->chip_class == GFX6) { + /* GFX6 doesn't support CP DMA operations through L2. */ + if (test_cp && cache_policy != L2_BYPASS) + continue; + /* WAVES_PER_SH is in multiples of 16 on GFX6. */ + if (test_cs && cs_waves_per_sh % 16 != 0) + continue; + } + + printf("%s ,", placement_str[placement]); + if (test_cs) { + printf("CS x%-4u,%3s,", cs_dwords_per_thread, + cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : ""); + } else { + printf("%s,%3s,", method_str[method], + method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : ""); + } + if (test_cs && cs_waves_per_sh) + printf("%2u,", cs_waves_per_sh); + else + printf(" ,"); + + double score = 0; + for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { + /* Don't test bigger sizes if it's too slow. Print 0. */ + if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) { + printf("%7.0f ,", 0.0); + continue; + } + + enum pipe_resource_usage dst_usage, src_usage; + struct pipe_resource *dst, *src; + struct pipe_query *q[NUM_RUNS]; + unsigned query_type = PIPE_QUERY_TIME_ELAPSED; + + if (test_sdma) { + if (sctx->chip_class == GFX6) + query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI; + else + query_type = SI_QUERY_TIME_ELAPSED_SDMA; + } + + if (placement == 0 || placement == 2 || placement == 4) + dst_usage = PIPE_USAGE_DEFAULT; + else + dst_usage = PIPE_USAGE_STREAM; + + if (placement == 2 || placement == 3) + src_usage = PIPE_USAGE_DEFAULT; + else + src_usage = PIPE_USAGE_STREAM; + + dst = pipe_buffer_create(screen, 0, dst_usage, size); + src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL; + + /* Run tests. */ + for (unsigned iter = 0; iter < NUM_RUNS; iter++) { + q[iter] = ctx->create_query(ctx, query_type, 0); + ctx->begin_query(ctx, q[iter]); + + if (test_cp) { + /* CP DMA */ + if (is_copy) { + si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, SI_COHERENCY_NONE, + cache_policy); + } else { + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, clear_value, 0, + SI_COHERENCY_NONE, cache_policy); + } + } else if (test_sdma) { + /* SDMA */ + if (is_copy) { + si_sdma_copy_buffer(sctx, dst, src, 0, 0, size); + } else { + si_sdma_clear_buffer(sctx, dst, 0, size, clear_value); + } + } else { + /* Compute */ + /* The memory accesses are coalesced, meaning that the 1st instruction writes + * the 1st contiguous block of data for the whole wave, the 2nd instruction + * writes the 2nd contiguous block of data, etc. 
+ */ + unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4); + unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread; + unsigned dwords_per_wave = cs_dwords_per_thread * 64; + + unsigned num_dwords = size / 4; + unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); + + void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread, + cache_policy == L2_STREAM, is_copy); + + struct pipe_grid_info info = {}; + info.block[0] = MIN2(64, num_instructions); + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); + info.grid[1] = 1; + info.grid[2] = 1; + + struct pipe_shader_buffer sb[2] = {}; + sb[0].buffer = dst; + sb[0].buffer_size = size; + + if (is_copy) { + sb[1].buffer = src; + sb[1].buffer_size = size; + } else { + for (unsigned i = 0; i < 4; i++) + sctx->cs_user_data[i] = clear_value; + } + + sctx->flags |= SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_SCACHE; + + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1); + ctx->bind_compute_state(ctx, cs); + sctx->cs_max_waves_per_sh = cs_waves_per_sh; + + ctx->launch_grid(ctx, &info); + + ctx->bind_compute_state(ctx, NULL); + ctx->delete_compute_state(ctx, cs); + sctx->cs_max_waves_per_sh = 0; /* disable the limit */ + + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + /* Flush L2, so that we don't just test L2 cache performance. */ + if (!test_sdma) { + sctx->flags |= SI_CONTEXT_WB_L2; + sctx->emit_cache_flush(sctx); + } + + ctx->end_query(ctx, q[iter]); + ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC); + } + pipe_resource_reference(&dst, NULL); + pipe_resource_reference(&src, NULL); + + /* Get results. */ + uint64_t min = ~0ull, max = 0, total = 0; + + for (unsigned iter = 0; iter < NUM_RUNS; iter++) { + union pipe_query_result result; + + ctx->get_query_result(ctx, q[iter], true, &result); + ctx->destroy_query(ctx, q[iter]); + + min = MIN2(min, result.u64); + max = MAX2(max, result.u64); + total += result.u64; + } + + score = get_MBps_rate(size, total / (double)NUM_RUNS); + printf("%7.0f ,", score); + fflush(stdout); + + struct si_result *r = &results[util_logbase2(size)][placement][method]; + r->is_valid = true; + r->is_cp = test_cp; + r->is_sdma = test_sdma; + r->is_cs = test_cs; + r->cache_policy = cache_policy; + r->dwords_per_thread = cs_dwords_per_thread; + r->waves_per_sh = cs_waves_per_sh; + r->score = score; + r->index = method; + } + puts(""); + } + } + + puts(""); + puts("static struct si_method"); + printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool " + "cached)\n", + sctx->screen->info.name); + puts("{"); + puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); + + /* Analyze results and find the best methods. 
*/ + for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { + if (placement == 0) + puts(" if (dst == RADEON_DOMAIN_VRAM) {"); + else if (placement == 1) + puts(" } else { /* GTT */"); + else if (placement == 2) { + puts("}"); + puts(""); + puts("static struct si_method"); + printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n", + sctx->screen->info.name); + printf(" uint64_t size64, bool async, bool cached)\n"); + puts("{"); + puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); + puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {"); + } else if (placement == 3) + puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {"); + else + puts(" } else { /* GTT -> VRAM */"); + + for (unsigned mode = 0; mode < 3; mode++) { + bool async = mode == 0; + bool cached = mode == 1; + + if (async) + puts(" if (async) { /* SDMA or async compute */"); + else if (cached) + puts(" if (cached) { /* gfx ring */"); + else + puts(" } else { /* gfx ring - uncached */"); + + /* The list of best chosen methods. */ + struct si_result *methods[32]; + unsigned method_max_size[32]; + unsigned num_methods = 0; + + for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { + /* Find the best method. */ + struct si_result *best = NULL; + + for (unsigned i = 0; i < NUM_METHODS; i++) { + struct si_result *r = &results[util_logbase2(size)][placement][i]; + + if (!r->is_valid) + continue; + + /* Ban CP DMA clears via MC on <= GFX8. They are super slow + * on GTT, which we can get due to BO evictions. + */ + if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp && + r->cache_policy == L2_BYPASS) + continue; + + if (async) { + /* The following constraints for compute IBs try to limit + * resource usage so as not to decrease the performance + * of gfx IBs too much. + */ + + /* Don't use CP DMA on asynchronous rings, because + * the engine is shared with gfx IBs. + */ + if (r->is_cp) + continue; + + /* Don't use L2 caching on asynchronous rings to minimize + * L2 usage. + */ + if (r->cache_policy == L2_LRU) + continue; + + /* Asynchronous compute recommends waves_per_sh != 0 + * to limit CU usage. */ + if (r->is_cs && r->waves_per_sh == 0) + continue; + } else { + /* SDMA is always asynchronous */ + if (r->is_sdma) + continue; + + if (cached && r->cache_policy == L2_BYPASS) + continue; + if (!cached && r->cache_policy == L2_LRU) + continue; + } + + if (!best) { + best = r; + continue; + } + + /* Assume some measurement error. Earlier methods occupy fewer + * resources, so the next method is always more greedy, and we + * don't want to select it due to a measurement error. + */ + double min_improvement = 1.03; + + if (best->score * min_improvement < r->score) + best = r; + } + + if (num_methods > 0) { + unsigned prev_index = num_methods - 1; + struct si_result *prev = methods[prev_index]; + struct si_result *prev_this_size = + &results[util_logbase2(size)][placement][prev->index]; + + /* If the best one is also the best for the previous size, + * just bump the size for the previous one. + * + * If there is no best, it means all methods were too slow + * for this size and were not tested. Use the best one for + * the previous size. 
+ */ + if (!best || + /* If it's the same method as for the previous size: */ + (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma && + prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy && + prev->dwords_per_thread == best->dwords_per_thread && + prev->waves_per_sh == best->waves_per_sh) || + /* If the method for the previous size is also the best + * for this size: */ + (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) { + method_max_size[prev_index] = size; + continue; + } + } + + /* Add it to the list. */ + assert(num_methods < ARRAY_SIZE(methods)); + methods[num_methods] = best; + method_max_size[num_methods] = size; + num_methods++; + } + + for (unsigned i = 0; i < num_methods; i++) { + struct si_result *best = methods[i]; + unsigned size = method_max_size[i]; + + /* The size threshold is between the current benchmarked + * size and the next benchmarked size. */ + if (i < num_methods - 1) + printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2); + else if (i > 0) + printf(" else "); + else + printf(" "); + printf("return "); + + assert(best); + if (best->is_cp) { + printf("CP_DMA(%s);\n", + best->cache_policy == L2_BYPASS + ? "L2_BYPASS" + : best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM"); + } + if (best->is_sdma) + printf("SDMA;\n"); + if (best->is_cs) { + printf("COMPUTE(%s, %u, %u);\n", + best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM", + best->dwords_per_thread, best->waves_per_sh); + } + } + } + puts(" }"); + } + puts(" }"); + puts("}"); + + ctx->destroy(ctx); + exit(0); } diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c index bcf9187082b..4f7744a887d 100644 --- a/src/gallium/drivers/radeonsi/si_texture.c +++ b/src/gallium/drivers/radeonsi/si_texture.c @@ -23,462 +23,419 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "drm-uapi/drm_fourcc.h" #include "si_pipe.h" #include "si_query.h" +#include "sid.h" +#include "state_tracker/drm_driver.h" #include "util/format/u_format.h" +#include "util/os_time.h" #include "util/u_log.h" #include "util/u_memory.h" #include "util/u_pack_color.h" #include "util/u_resource.h" #include "util/u_surface.h" #include "util/u_transfer.h" -#include "util/os_time.h" + #include #include -#include "state_tracker/drm_driver.h" -#include "sid.h" -#include "amd/addrlib/inc/addrinterface.h" -#include "drm-uapi/drm_fourcc.h" -static enum radeon_surf_mode -si_choose_tiling(struct si_screen *sscreen, - const struct pipe_resource *templ, bool tc_compatible_htile); +#include "amd/addrlib/inc/addrinterface.h" +static enum radeon_surf_mode si_choose_tiling(struct si_screen *sscreen, + const struct pipe_resource *templ, + bool tc_compatible_htile); -bool si_prepare_for_dma_blit(struct si_context *sctx, - struct si_texture *dst, - unsigned dst_level, unsigned dstx, - unsigned dsty, unsigned dstz, - struct si_texture *src, - unsigned src_level, - const struct pipe_box *src_box) +bool si_prepare_for_dma_blit(struct si_context *sctx, struct si_texture *dst, unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, struct si_texture *src, + unsigned src_level, const struct pipe_box *src_box) { - if (!sctx->sdma_cs) - return false; - - if (dst->surface.bpe != src->surface.bpe) - return false; - - /* MSAA: Blits don't exist in the real world. */ - if (src->buffer.b.b.nr_samples > 1 || - dst->buffer.b.b.nr_samples > 1) - return false; - - /* Depth-stencil surfaces: - * When dst is linear, the DB->CB copy preserves HTILE. 
- * When dst is tiled, the 3D path must be used to update HTILE. - */ - if (src->is_depth || dst->is_depth) - return false; - - /* DCC as: - * src: Use the 3D path. DCC decompression is expensive. - * dst: Use the 3D path to compress the pixels with DCC. - */ - if (vi_dcc_enabled(src, src_level) || - vi_dcc_enabled(dst, dst_level)) - return false; - - /* CMASK as: - * src: Both texture and SDMA paths need decompression. Use SDMA. - * dst: If overwriting the whole texture, discard CMASK and use - * SDMA. Otherwise, use the 3D path. - */ - if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) { - /* The CMASK clear is only enabled for the first level. */ - assert(dst_level == 0); - if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level, - dstx, dsty, dstz, src_box->width, - src_box->height, src_box->depth)) - return false; - - si_texture_discard_cmask(sctx->screen, dst); - } - - /* All requirements are met. Prepare textures for SDMA. */ - if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level)) - sctx->b.flush_resource(&sctx->b, &src->buffer.b.b); - - assert(!(src->dirty_level_mask & (1 << src_level))); - assert(!(dst->dirty_level_mask & (1 << dst_level))); - - return true; + if (!sctx->sdma_cs) + return false; + + if (dst->surface.bpe != src->surface.bpe) + return false; + + /* MSAA: Blits don't exist in the real world. */ + if (src->buffer.b.b.nr_samples > 1 || dst->buffer.b.b.nr_samples > 1) + return false; + + /* Depth-stencil surfaces: + * When dst is linear, the DB->CB copy preserves HTILE. + * When dst is tiled, the 3D path must be used to update HTILE. + */ + if (src->is_depth || dst->is_depth) + return false; + + /* DCC as: + * src: Use the 3D path. DCC decompression is expensive. + * dst: Use the 3D path to compress the pixels with DCC. + */ + if (vi_dcc_enabled(src, src_level) || vi_dcc_enabled(dst, dst_level)) + return false; + + /* CMASK as: + * src: Both texture and SDMA paths need decompression. Use SDMA. + * dst: If overwriting the whole texture, discard CMASK and use + * SDMA. Otherwise, use the 3D path. + */ + if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) { + /* The CMASK clear is only enabled for the first level. */ + assert(dst_level == 0); + if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level, dstx, dsty, dstz, + src_box->width, src_box->height, src_box->depth)) + return false; + + si_texture_discard_cmask(sctx->screen, dst); + } + + /* All requirements are met. Prepare textures for SDMA. */ + if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level)) + sctx->b.flush_resource(&sctx->b, &src->buffer.b.b); + + assert(!(src->dirty_level_mask & (1 << src_level))); + assert(!(dst->dirty_level_mask & (1 << dst_level))); + + return true; } /* Same as resource_copy_region, except that both upsampling and downsampling are allowed. 
*/ -static void si_copy_region_with_blit(struct pipe_context *pipe, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) +static void si_copy_region_with_blit(struct pipe_context *pipe, struct pipe_resource *dst, + unsigned dst_level, unsigned dstx, unsigned dsty, + unsigned dstz, struct pipe_resource *src, unsigned src_level, + const struct pipe_box *src_box) { - struct pipe_blit_info blit; - - memset(&blit, 0, sizeof(blit)); - blit.src.resource = src; - blit.src.format = src->format; - blit.src.level = src_level; - blit.src.box = *src_box; - blit.dst.resource = dst; - blit.dst.format = dst->format; - blit.dst.level = dst_level; - blit.dst.box.x = dstx; - blit.dst.box.y = dsty; - blit.dst.box.z = dstz; - blit.dst.box.width = src_box->width; - blit.dst.box.height = src_box->height; - blit.dst.box.depth = src_box->depth; - blit.mask = util_format_get_mask(dst->format); - blit.filter = PIPE_TEX_FILTER_NEAREST; - - if (blit.mask) { - pipe->blit(pipe, &blit); - } + struct pipe_blit_info blit; + + memset(&blit, 0, sizeof(blit)); + blit.src.resource = src; + blit.src.format = src->format; + blit.src.level = src_level; + blit.src.box = *src_box; + blit.dst.resource = dst; + blit.dst.format = dst->format; + blit.dst.level = dst_level; + blit.dst.box.x = dstx; + blit.dst.box.y = dsty; + blit.dst.box.z = dstz; + blit.dst.box.width = src_box->width; + blit.dst.box.height = src_box->height; + blit.dst.box.depth = src_box->depth; + blit.mask = util_format_get_mask(dst->format); + blit.filter = PIPE_TEX_FILTER_NEAREST; + + if (blit.mask) { + pipe->blit(pipe, &blit); + } } /* Copy from a full GPU texture to a transfer's staging one. */ static void si_copy_to_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer; - struct pipe_resource *dst = &stransfer->staging->b.b; - struct pipe_resource *src = transfer->resource; - - if (src->nr_samples > 1 || ((struct si_texture*)src)->is_depth) { - si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0, - src, transfer->level, &transfer->box); - return; - } - - sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level, - &transfer->box); + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_transfer *transfer = (struct pipe_transfer *)stransfer; + struct pipe_resource *dst = &stransfer->staging->b.b; + struct pipe_resource *src = transfer->resource; + + if (src->nr_samples > 1 || ((struct si_texture *)src)->is_depth) { + si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0, src, transfer->level, &transfer->box); + return; + } + + sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level, &transfer->box); } /* Copy from a transfer's staging texture to a full GPU one. 
*/ static void si_copy_from_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer; - struct pipe_resource *dst = transfer->resource; - struct pipe_resource *src = &stransfer->staging->b.b; - struct pipe_box sbox; - - u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox); - - if (dst->nr_samples > 1 || ((struct si_texture*)dst)->is_depth) { - si_copy_region_with_blit(ctx, dst, transfer->level, - transfer->box.x, transfer->box.y, transfer->box.z, - src, 0, &sbox); - return; - } - - if (util_format_is_compressed(dst->format)) { - sbox.width = util_format_get_nblocksx(dst->format, sbox.width); - sbox.height = util_format_get_nblocksx(dst->format, sbox.height); - } - - sctx->dma_copy(ctx, dst, transfer->level, - transfer->box.x, transfer->box.y, transfer->box.z, - src, 0, &sbox); + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_transfer *transfer = (struct pipe_transfer *)stransfer; + struct pipe_resource *dst = transfer->resource; + struct pipe_resource *src = &stransfer->staging->b.b; + struct pipe_box sbox; + + u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox); + + if (dst->nr_samples > 1 || ((struct si_texture *)dst)->is_depth) { + si_copy_region_with_blit(ctx, dst, transfer->level, transfer->box.x, transfer->box.y, + transfer->box.z, src, 0, &sbox); + return; + } + + if (util_format_is_compressed(dst->format)) { + sbox.width = util_format_get_nblocksx(dst->format, sbox.width); + sbox.height = util_format_get_nblocksx(dst->format, sbox.height); + } + + sctx->dma_copy(ctx, dst, transfer->level, transfer->box.x, transfer->box.y, transfer->box.z, src, + 0, &sbox); } -static unsigned si_texture_get_offset(struct si_screen *sscreen, - struct si_texture *tex, unsigned level, - const struct pipe_box *box, - unsigned *stride, - unsigned *layer_stride) +static unsigned si_texture_get_offset(struct si_screen *sscreen, struct si_texture *tex, + unsigned level, const struct pipe_box *box, unsigned *stride, + unsigned *layer_stride) { - if (sscreen->info.chip_class >= GFX9) { - *stride = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe; - *layer_stride = tex->surface.u.gfx9.surf_slice_size; - - if (!box) - return 0; - - /* Each texture is an array of slices. Each slice is an array - * of mipmap levels. */ - return tex->surface.u.gfx9.surf_offset + - box->z * tex->surface.u.gfx9.surf_slice_size + - tex->surface.u.gfx9.offset[level] + - (box->y / tex->surface.blk_h * - tex->surface.u.gfx9.surf_pitch + - box->x / tex->surface.blk_w) * tex->surface.bpe; - } else { - *stride = tex->surface.u.legacy.level[level].nblk_x * - tex->surface.bpe; - assert((uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX); - *layer_stride = (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4; - - if (!box) - return tex->surface.u.legacy.level[level].offset; - - /* Each texture is an array of mipmap levels. Each level is - * an array of slices. 
*/ - return tex->surface.u.legacy.level[level].offset + - box->z * (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 + - (box->y / tex->surface.blk_h * - tex->surface.u.legacy.level[level].nblk_x + - box->x / tex->surface.blk_w) * tex->surface.bpe; - } + if (sscreen->info.chip_class >= GFX9) { + *stride = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe; + *layer_stride = tex->surface.u.gfx9.surf_slice_size; + + if (!box) + return 0; + + /* Each texture is an array of slices. Each slice is an array + * of mipmap levels. */ + return tex->surface.u.gfx9.surf_offset + box->z * tex->surface.u.gfx9.surf_slice_size + + tex->surface.u.gfx9.offset[level] + + (box->y / tex->surface.blk_h * tex->surface.u.gfx9.surf_pitch + + box->x / tex->surface.blk_w) * + tex->surface.bpe; + } else { + *stride = tex->surface.u.legacy.level[level].nblk_x * tex->surface.bpe; + assert((uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX); + *layer_stride = (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4; + + if (!box) + return tex->surface.u.legacy.level[level].offset; + + /* Each texture is an array of mipmap levels. Each level is + * an array of slices. */ + return tex->surface.u.legacy.level[level].offset + + box->z * (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 + + (box->y / tex->surface.blk_h * tex->surface.u.legacy.level[level].nblk_x + + box->x / tex->surface.blk_w) * + tex->surface.bpe; + } } -static int si_init_surface(struct si_screen *sscreen, - struct radeon_surf *surface, - const struct pipe_resource *ptex, - enum radeon_surf_mode array_mode, - unsigned pitch_in_bytes_override, - bool is_imported, - bool is_scanout, - bool is_flushed_depth, - bool tc_compatible_htile) +static int si_init_surface(struct si_screen *sscreen, struct radeon_surf *surface, + const struct pipe_resource *ptex, enum radeon_surf_mode array_mode, + unsigned pitch_in_bytes_override, bool is_imported, bool is_scanout, + bool is_flushed_depth, bool tc_compatible_htile) { - const struct util_format_description *desc = - util_format_description(ptex->format); - bool is_depth, is_stencil; - int r; - unsigned bpe, flags = 0; - - is_depth = util_format_has_depth(desc); - is_stencil = util_format_has_stencil(desc); - - if (!is_flushed_depth && - ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { - bpe = 4; /* stencil is allocated separately */ - } else { - bpe = util_format_get_blocksize(ptex->format); - assert(util_is_power_of_two_or_zero(bpe)); - } - - if (!is_flushed_depth && is_depth) { - flags |= RADEON_SURF_ZBUFFER; - - if (sscreen->debug_flags & DBG(NO_HYPERZ)) { - flags |= RADEON_SURF_NO_HTILE; - } else if (tc_compatible_htile && - (sscreen->info.chip_class >= GFX9 || - array_mode == RADEON_SURF_MODE_2D)) { - /* TC-compatible HTILE only supports Z32_FLOAT. - * GFX9 also supports Z16_UNORM. - * On GFX8, promote Z16 to Z32. DB->CB copies will convert - * the format for transfers. - */ - if (sscreen->info.chip_class == GFX8) - bpe = 4; - - flags |= RADEON_SURF_TC_COMPATIBLE_HTILE; - } - - if (is_stencil) - flags |= RADEON_SURF_SBUFFER; - } - - if (sscreen->info.chip_class >= GFX8 && - (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC || - ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT || - (ptex->nr_samples >= 2 && !sscreen->dcc_msaa_allowed))) - flags |= RADEON_SURF_DISABLE_DCC; - - /* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. 
*/ - if (sscreen->info.family == CHIP_STONEY && - bpe == 16 && ptex->nr_samples >= 2) - flags |= RADEON_SURF_DISABLE_DCC; - - /* GFX8: DCC clear for 4x and 8x MSAA array textures unimplemented. */ - if (sscreen->info.chip_class == GFX8 && - ptex->nr_storage_samples >= 4 && - ptex->array_size > 1) - flags |= RADEON_SURF_DISABLE_DCC; - - /* GFX9: DCC clear for 4x and 8x MSAA textures unimplemented. */ - if (sscreen->info.chip_class == GFX9 && - (ptex->nr_storage_samples >= 4 || - (sscreen->info.family == CHIP_RAVEN && - ptex->nr_storage_samples >= 2 && bpe < 4))) - flags |= RADEON_SURF_DISABLE_DCC; - - /* TODO: GFX10: DCC causes corruption with MSAA. */ - if (sscreen->info.chip_class >= GFX10 && - ptex->nr_storage_samples >= 2) - flags |= RADEON_SURF_DISABLE_DCC; - - /* Shared textures must always set up DCC. - * If it's not present, it will be disabled by - * si_get_opaque_metadata later. - */ - if (!is_imported && (sscreen->debug_flags & DBG(NO_DCC))) - flags |= RADEON_SURF_DISABLE_DCC; - - if (is_scanout) { - /* This should catch bugs in gallium users setting incorrect flags. */ - assert(ptex->nr_samples <= 1 && - ptex->array_size == 1 && - ptex->depth0 == 1 && - ptex->last_level == 0 && - !(flags & RADEON_SURF_Z_OR_SBUFFER)); - - flags |= RADEON_SURF_SCANOUT; - } - - if (ptex->bind & PIPE_BIND_SHARED) - flags |= RADEON_SURF_SHAREABLE; - if (is_imported) - flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE; - if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) - flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE; - if (sscreen->debug_flags & DBG(NO_FMASK)) - flags |= RADEON_SURF_NO_FMASK; - - if (sscreen->info.chip_class == GFX9 && - (ptex->flags & SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE)) { - flags |= RADEON_SURF_FORCE_MICRO_TILE_MODE; - surface->micro_tile_mode = SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(ptex->flags); - } - - if (sscreen->info.chip_class >= GFX10 && - (ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) { - flags |= RADEON_SURF_FORCE_SWIZZLE_MODE; - surface->u.gfx9.surf.swizzle_mode = ADDR_SW_64KB_R_X; - } - - r = sscreen->ws->surface_init(sscreen->ws, ptex, flags, bpe, - array_mode, surface); - if (r) { - return r; - } - - unsigned pitch = pitch_in_bytes_override / bpe; - - if (sscreen->info.chip_class >= GFX9) { - if (pitch) { - surface->u.gfx9.surf_pitch = pitch; - if (ptex->last_level == 0) - surface->u.gfx9.surf.epitch = pitch - 1; - surface->u.gfx9.surf_slice_size = - (uint64_t)pitch * surface->u.gfx9.surf_height * bpe; - } - } else { - if (pitch) { - surface->u.legacy.level[0].nblk_x = pitch; - surface->u.legacy.level[0].slice_size_dw = - ((uint64_t)pitch * surface->u.legacy.level[0].nblk_y * bpe) / 4; - } - } - return 0; + const struct util_format_description *desc = util_format_description(ptex->format); + bool is_depth, is_stencil; + int r; + unsigned bpe, flags = 0; + + is_depth = util_format_has_depth(desc); + is_stencil = util_format_has_stencil(desc); + + if (!is_flushed_depth && ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { + bpe = 4; /* stencil is allocated separately */ + } else { + bpe = util_format_get_blocksize(ptex->format); + assert(util_is_power_of_two_or_zero(bpe)); + } + + if (!is_flushed_depth && is_depth) { + flags |= RADEON_SURF_ZBUFFER; + + if (sscreen->debug_flags & DBG(NO_HYPERZ)) { + flags |= RADEON_SURF_NO_HTILE; + } else if (tc_compatible_htile && + (sscreen->info.chip_class >= GFX9 || array_mode == RADEON_SURF_MODE_2D)) { + /* TC-compatible HTILE only supports Z32_FLOAT. + * GFX9 also supports Z16_UNORM. + * On GFX8, promote Z16 to Z32. 
DB->CB copies will convert + * the format for transfers. + */ + if (sscreen->info.chip_class == GFX8) + bpe = 4; + + flags |= RADEON_SURF_TC_COMPATIBLE_HTILE; + } + + if (is_stencil) + flags |= RADEON_SURF_SBUFFER; + } + + if (sscreen->info.chip_class >= GFX8 && + (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC || ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT || + (ptex->nr_samples >= 2 && !sscreen->dcc_msaa_allowed))) + flags |= RADEON_SURF_DISABLE_DCC; + + /* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */ + if (sscreen->info.family == CHIP_STONEY && bpe == 16 && ptex->nr_samples >= 2) + flags |= RADEON_SURF_DISABLE_DCC; + + /* GFX8: DCC clear for 4x and 8x MSAA array textures unimplemented. */ + if (sscreen->info.chip_class == GFX8 && ptex->nr_storage_samples >= 4 && ptex->array_size > 1) + flags |= RADEON_SURF_DISABLE_DCC; + + /* GFX9: DCC clear for 4x and 8x MSAA textures unimplemented. */ + if (sscreen->info.chip_class == GFX9 && + (ptex->nr_storage_samples >= 4 || + (sscreen->info.family == CHIP_RAVEN && ptex->nr_storage_samples >= 2 && bpe < 4))) + flags |= RADEON_SURF_DISABLE_DCC; + + /* TODO: GFX10: DCC causes corruption with MSAA. */ + if (sscreen->info.chip_class >= GFX10 && ptex->nr_storage_samples >= 2) + flags |= RADEON_SURF_DISABLE_DCC; + + /* Shared textures must always set up DCC. + * If it's not present, it will be disabled by + * si_get_opaque_metadata later. + */ + if (!is_imported && (sscreen->debug_flags & DBG(NO_DCC))) + flags |= RADEON_SURF_DISABLE_DCC; + + if (is_scanout) { + /* This should catch bugs in gallium users setting incorrect flags. */ + assert(ptex->nr_samples <= 1 && ptex->array_size == 1 && ptex->depth0 == 1 && + ptex->last_level == 0 && !(flags & RADEON_SURF_Z_OR_SBUFFER)); + + flags |= RADEON_SURF_SCANOUT; + } + + if (ptex->bind & PIPE_BIND_SHARED) + flags |= RADEON_SURF_SHAREABLE; + if (is_imported) + flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE; + if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) + flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE; + if (sscreen->debug_flags & DBG(NO_FMASK)) + flags |= RADEON_SURF_NO_FMASK; + + if (sscreen->info.chip_class == GFX9 && (ptex->flags & SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE)) { + flags |= RADEON_SURF_FORCE_MICRO_TILE_MODE; + surface->micro_tile_mode = SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(ptex->flags); + } + + if (sscreen->info.chip_class >= GFX10 && (ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) { + flags |= RADEON_SURF_FORCE_SWIZZLE_MODE; + surface->u.gfx9.surf.swizzle_mode = ADDR_SW_64KB_R_X; + } + + r = sscreen->ws->surface_init(sscreen->ws, ptex, flags, bpe, array_mode, surface); + if (r) { + return r; + } + + unsigned pitch = pitch_in_bytes_override / bpe; + + if (sscreen->info.chip_class >= GFX9) { + if (pitch) { + surface->u.gfx9.surf_pitch = pitch; + if (ptex->last_level == 0) + surface->u.gfx9.surf.epitch = pitch - 1; + surface->u.gfx9.surf_slice_size = (uint64_t)pitch * surface->u.gfx9.surf_height * bpe; + } + } else { + if (pitch) { + surface->u.legacy.level[0].nblk_x = pitch; + surface->u.legacy.level[0].slice_size_dw = + ((uint64_t)pitch * surface->u.legacy.level[0].nblk_y * bpe) / 4; + } + } + return 0; } -static void si_get_display_metadata(struct si_screen *sscreen, - struct radeon_surf *surf, - struct radeon_bo_metadata *metadata, - enum radeon_surf_mode *array_mode, - bool *is_scanout) +static void si_get_display_metadata(struct si_screen *sscreen, struct radeon_surf *surf, + struct radeon_bo_metadata *metadata, + enum radeon_surf_mode *array_mode, bool 
*is_scanout) { - if (sscreen->info.chip_class >= GFX9) { - if (metadata->u.gfx9.swizzle_mode > 0) - *array_mode = RADEON_SURF_MODE_2D; - else - *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; - - surf->u.gfx9.surf.swizzle_mode = metadata->u.gfx9.swizzle_mode; - *is_scanout = metadata->u.gfx9.scanout; - - if (metadata->u.gfx9.dcc_offset_256B) { - surf->u.gfx9.display_dcc_pitch_max = metadata->u.gfx9.dcc_pitch_max; - assert(metadata->u.gfx9.dcc_independent_64B == 1); - } - } else { - surf->u.legacy.pipe_config = metadata->u.legacy.pipe_config; - surf->u.legacy.bankw = metadata->u.legacy.bankw; - surf->u.legacy.bankh = metadata->u.legacy.bankh; - surf->u.legacy.tile_split = metadata->u.legacy.tile_split; - surf->u.legacy.mtilea = metadata->u.legacy.mtilea; - surf->u.legacy.num_banks = metadata->u.legacy.num_banks; - - if (metadata->u.legacy.macrotile == RADEON_LAYOUT_TILED) - *array_mode = RADEON_SURF_MODE_2D; - else if (metadata->u.legacy.microtile == RADEON_LAYOUT_TILED) - *array_mode = RADEON_SURF_MODE_1D; - else - *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; - - *is_scanout = metadata->u.legacy.scanout; - } + if (sscreen->info.chip_class >= GFX9) { + if (metadata->u.gfx9.swizzle_mode > 0) + *array_mode = RADEON_SURF_MODE_2D; + else + *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + + surf->u.gfx9.surf.swizzle_mode = metadata->u.gfx9.swizzle_mode; + *is_scanout = metadata->u.gfx9.scanout; + + if (metadata->u.gfx9.dcc_offset_256B) { + surf->u.gfx9.display_dcc_pitch_max = metadata->u.gfx9.dcc_pitch_max; + assert(metadata->u.gfx9.dcc_independent_64B == 1); + } + } else { + surf->u.legacy.pipe_config = metadata->u.legacy.pipe_config; + surf->u.legacy.bankw = metadata->u.legacy.bankw; + surf->u.legacy.bankh = metadata->u.legacy.bankh; + surf->u.legacy.tile_split = metadata->u.legacy.tile_split; + surf->u.legacy.mtilea = metadata->u.legacy.mtilea; + surf->u.legacy.num_banks = metadata->u.legacy.num_banks; + + if (metadata->u.legacy.macrotile == RADEON_LAYOUT_TILED) + *array_mode = RADEON_SURF_MODE_2D; + else if (metadata->u.legacy.microtile == RADEON_LAYOUT_TILED) + *array_mode = RADEON_SURF_MODE_1D; + else + *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + + *is_scanout = metadata->u.legacy.scanout; + } } -void si_eliminate_fast_color_clear(struct si_context *sctx, - struct si_texture *tex) +void si_eliminate_fast_color_clear(struct si_context *sctx, struct si_texture *tex) { - struct si_screen *sscreen = sctx->screen; - struct pipe_context *ctx = &sctx->b; + struct si_screen *sscreen = sctx->screen; + struct pipe_context *ctx = &sctx->b; - if (ctx == sscreen->aux_context) - simple_mtx_lock(&sscreen->aux_context_lock); + if (ctx == sscreen->aux_context) + simple_mtx_lock(&sscreen->aux_context_lock); - unsigned n = sctx->num_decompress_calls; - ctx->flush_resource(ctx, &tex->buffer.b.b); + unsigned n = sctx->num_decompress_calls; + ctx->flush_resource(ctx, &tex->buffer.b.b); - /* Flush only if any fast clear elimination took place. */ - if (n != sctx->num_decompress_calls) - ctx->flush(ctx, NULL, 0); + /* Flush only if any fast clear elimination took place. 
*/ + if (n != sctx->num_decompress_calls) + ctx->flush(ctx, NULL, 0); - if (ctx == sscreen->aux_context) - simple_mtx_unlock(&sscreen->aux_context_lock); + if (ctx == sscreen->aux_context) + simple_mtx_unlock(&sscreen->aux_context_lock); } -void si_texture_discard_cmask(struct si_screen *sscreen, - struct si_texture *tex) +void si_texture_discard_cmask(struct si_screen *sscreen, struct si_texture *tex) { - if (!tex->cmask_buffer) - return; + if (!tex->cmask_buffer) + return; - assert(tex->buffer.b.b.nr_samples <= 1); + assert(tex->buffer.b.b.nr_samples <= 1); - /* Disable CMASK. */ - tex->cmask_base_address_reg = tex->buffer.gpu_address >> 8; - tex->dirty_level_mask = 0; + /* Disable CMASK. */ + tex->cmask_base_address_reg = tex->buffer.gpu_address >> 8; + tex->dirty_level_mask = 0; - tex->cb_color_info &= ~S_028C70_FAST_CLEAR(1); + tex->cb_color_info &= ~S_028C70_FAST_CLEAR(1); - if (tex->cmask_buffer != &tex->buffer) - si_resource_reference(&tex->cmask_buffer, NULL); + if (tex->cmask_buffer != &tex->buffer) + si_resource_reference(&tex->cmask_buffer, NULL); - tex->cmask_buffer = NULL; + tex->cmask_buffer = NULL; - /* Notify all contexts about the change. */ - p_atomic_inc(&sscreen->dirty_tex_counter); - p_atomic_inc(&sscreen->compressed_colortex_counter); + /* Notify all contexts about the change. */ + p_atomic_inc(&sscreen->dirty_tex_counter); + p_atomic_inc(&sscreen->compressed_colortex_counter); } static bool si_can_disable_dcc(struct si_texture *tex) { - /* We can't disable DCC if it can be written by another process. */ - return tex->surface.dcc_offset && - (!tex->buffer.b.is_shared || - !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE)); + /* We can't disable DCC if it can be written by another process. */ + return tex->surface.dcc_offset && + (!tex->buffer.b.is_shared || + !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE)); } static void si_texture_zero_dcc_fields(struct si_texture *tex) { - tex->surface.dcc_offset = 0; - tex->surface.display_dcc_offset = 0; - tex->surface.dcc_retile_map_offset = 0; + tex->surface.dcc_offset = 0; + tex->surface.display_dcc_offset = 0; + tex->surface.dcc_retile_map_offset = 0; } -static bool si_texture_discard_dcc(struct si_screen *sscreen, - struct si_texture *tex) +static bool si_texture_discard_dcc(struct si_screen *sscreen, struct si_texture *tex) { - if (!si_can_disable_dcc(tex)) - return false; + if (!si_can_disable_dcc(tex)) + return false; - assert(tex->dcc_separate_buffer == NULL); + assert(tex->dcc_separate_buffer == NULL); - /* Disable DCC. */ - si_texture_zero_dcc_fields(tex); + /* Disable DCC. */ + si_texture_zero_dcc_fields(tex); - /* Notify all contexts about the change. */ - p_atomic_inc(&sscreen->dirty_tex_counter); - return true; + /* Notify all contexts about the change. */ + p_atomic_inc(&sscreen->dirty_tex_counter); + return true; } /** @@ -502,783 +459,726 @@ static bool si_texture_discard_dcc(struct si_screen *sscreen, * \param sctx the current context if you have one, or sscreen->aux_context * if you don't. 
*/ -bool si_texture_disable_dcc(struct si_context *sctx, - struct si_texture *tex) +bool si_texture_disable_dcc(struct si_context *sctx, struct si_texture *tex) { - struct si_screen *sscreen = sctx->screen; + struct si_screen *sscreen = sctx->screen; - if (!sctx->has_graphics) - return si_texture_discard_dcc(sscreen, tex); + if (!sctx->has_graphics) + return si_texture_discard_dcc(sscreen, tex); - if (!si_can_disable_dcc(tex)) - return false; + if (!si_can_disable_dcc(tex)) + return false; - if (&sctx->b == sscreen->aux_context) - simple_mtx_lock(&sscreen->aux_context_lock); + if (&sctx->b == sscreen->aux_context) + simple_mtx_lock(&sscreen->aux_context_lock); - /* Decompress DCC. */ - si_decompress_dcc(sctx, tex); - sctx->b.flush(&sctx->b, NULL, 0); + /* Decompress DCC. */ + si_decompress_dcc(sctx, tex); + sctx->b.flush(&sctx->b, NULL, 0); - if (&sctx->b == sscreen->aux_context) - simple_mtx_unlock(&sscreen->aux_context_lock); + if (&sctx->b == sscreen->aux_context) + simple_mtx_unlock(&sscreen->aux_context_lock); - return si_texture_discard_dcc(sscreen, tex); + return si_texture_discard_dcc(sscreen, tex); } -static void si_reallocate_texture_inplace(struct si_context *sctx, - struct si_texture *tex, - unsigned new_bind_flag, - bool invalidate_storage) +static void si_reallocate_texture_inplace(struct si_context *sctx, struct si_texture *tex, + unsigned new_bind_flag, bool invalidate_storage) { - struct pipe_screen *screen = sctx->b.screen; - struct si_texture *new_tex; - struct pipe_resource templ = tex->buffer.b.b; - unsigned i; - - templ.bind |= new_bind_flag; - - if (tex->buffer.b.is_shared || tex->num_planes > 1) - return; - - if (new_bind_flag == PIPE_BIND_LINEAR) { - if (tex->surface.is_linear) - return; - - /* This fails with MSAA, depth, and compressed textures. */ - if (si_choose_tiling(sctx->screen, &templ, false) != - RADEON_SURF_MODE_LINEAR_ALIGNED) - return; - } - - new_tex = (struct si_texture*)screen->resource_create(screen, &templ); - if (!new_tex) - return; - - /* Copy the pixels to the new texture. */ - if (!invalidate_storage) { - for (i = 0; i <= templ.last_level; i++) { - struct pipe_box box; - - u_box_3d(0, 0, 0, - u_minify(templ.width0, i), u_minify(templ.height0, i), - util_num_layers(&templ, i), &box); - - sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0, - &tex->buffer.b.b, i, &box); - } - } - - if (new_bind_flag == PIPE_BIND_LINEAR) { - si_texture_discard_cmask(sctx->screen, tex); - si_texture_discard_dcc(sctx->screen, tex); - } - - /* Replace the structure fields of tex. 
*/ - tex->buffer.b.b.bind = templ.bind; - pb_reference(&tex->buffer.buf, new_tex->buffer.buf); - tex->buffer.gpu_address = new_tex->buffer.gpu_address; - tex->buffer.vram_usage = new_tex->buffer.vram_usage; - tex->buffer.gart_usage = new_tex->buffer.gart_usage; - tex->buffer.bo_size = new_tex->buffer.bo_size; - tex->buffer.bo_alignment = new_tex->buffer.bo_alignment; - tex->buffer.domains = new_tex->buffer.domains; - tex->buffer.flags = new_tex->buffer.flags; - - tex->surface = new_tex->surface; - si_texture_reference(&tex->flushed_depth_texture, - new_tex->flushed_depth_texture); - - tex->surface.fmask_offset = new_tex->surface.fmask_offset; - tex->surface.cmask_offset = new_tex->surface.cmask_offset; - tex->cmask_base_address_reg = new_tex->cmask_base_address_reg; - - if (tex->cmask_buffer == &tex->buffer) - tex->cmask_buffer = NULL; - else - si_resource_reference(&tex->cmask_buffer, NULL); - - if (new_tex->cmask_buffer == &new_tex->buffer) - tex->cmask_buffer = &tex->buffer; - else - si_resource_reference(&tex->cmask_buffer, new_tex->cmask_buffer); - - tex->surface.dcc_offset = new_tex->surface.dcc_offset; - tex->cb_color_info = new_tex->cb_color_info; - memcpy(tex->color_clear_value, new_tex->color_clear_value, - sizeof(tex->color_clear_value)); - tex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode; - - tex->surface.htile_offset = new_tex->surface.htile_offset; - tex->depth_clear_value = new_tex->depth_clear_value; - tex->dirty_level_mask = new_tex->dirty_level_mask; - tex->stencil_dirty_level_mask = new_tex->stencil_dirty_level_mask; - tex->db_render_format = new_tex->db_render_format; - tex->stencil_clear_value = new_tex->stencil_clear_value; - tex->tc_compatible_htile = new_tex->tc_compatible_htile; - tex->depth_cleared = new_tex->depth_cleared; - tex->stencil_cleared = new_tex->stencil_cleared; - tex->upgraded_depth = new_tex->upgraded_depth; - tex->db_compatible = new_tex->db_compatible; - tex->can_sample_z = new_tex->can_sample_z; - tex->can_sample_s = new_tex->can_sample_s; - - tex->separate_dcc_dirty = new_tex->separate_dcc_dirty; - tex->displayable_dcc_dirty = new_tex->displayable_dcc_dirty; - tex->dcc_gather_statistics = new_tex->dcc_gather_statistics; - si_resource_reference(&tex->dcc_separate_buffer, - new_tex->dcc_separate_buffer); - si_resource_reference(&tex->last_dcc_separate_buffer, - new_tex->last_dcc_separate_buffer); - - if (new_bind_flag == PIPE_BIND_LINEAR) { - assert(!tex->surface.htile_offset); - assert(!tex->cmask_buffer); - assert(!tex->surface.fmask_size); - assert(!tex->surface.dcc_offset); - assert(!tex->is_depth); - } - - si_texture_reference(&new_tex, NULL); - - p_atomic_inc(&sctx->screen->dirty_tex_counter); + struct pipe_screen *screen = sctx->b.screen; + struct si_texture *new_tex; + struct pipe_resource templ = tex->buffer.b.b; + unsigned i; + + templ.bind |= new_bind_flag; + + if (tex->buffer.b.is_shared || tex->num_planes > 1) + return; + + if (new_bind_flag == PIPE_BIND_LINEAR) { + if (tex->surface.is_linear) + return; + + /* This fails with MSAA, depth, and compressed textures. */ + if (si_choose_tiling(sctx->screen, &templ, false) != RADEON_SURF_MODE_LINEAR_ALIGNED) + return; + } + + new_tex = (struct si_texture *)screen->resource_create(screen, &templ); + if (!new_tex) + return; + + /* Copy the pixels to the new texture. 
*/ + if (!invalidate_storage) { + for (i = 0; i <= templ.last_level; i++) { + struct pipe_box box; + + u_box_3d(0, 0, 0, u_minify(templ.width0, i), u_minify(templ.height0, i), + util_num_layers(&templ, i), &box); + + sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0, &tex->buffer.b.b, i, &box); + } + } + + if (new_bind_flag == PIPE_BIND_LINEAR) { + si_texture_discard_cmask(sctx->screen, tex); + si_texture_discard_dcc(sctx->screen, tex); + } + + /* Replace the structure fields of tex. */ + tex->buffer.b.b.bind = templ.bind; + pb_reference(&tex->buffer.buf, new_tex->buffer.buf); + tex->buffer.gpu_address = new_tex->buffer.gpu_address; + tex->buffer.vram_usage = new_tex->buffer.vram_usage; + tex->buffer.gart_usage = new_tex->buffer.gart_usage; + tex->buffer.bo_size = new_tex->buffer.bo_size; + tex->buffer.bo_alignment = new_tex->buffer.bo_alignment; + tex->buffer.domains = new_tex->buffer.domains; + tex->buffer.flags = new_tex->buffer.flags; + + tex->surface = new_tex->surface; + si_texture_reference(&tex->flushed_depth_texture, new_tex->flushed_depth_texture); + + tex->surface.fmask_offset = new_tex->surface.fmask_offset; + tex->surface.cmask_offset = new_tex->surface.cmask_offset; + tex->cmask_base_address_reg = new_tex->cmask_base_address_reg; + + if (tex->cmask_buffer == &tex->buffer) + tex->cmask_buffer = NULL; + else + si_resource_reference(&tex->cmask_buffer, NULL); + + if (new_tex->cmask_buffer == &new_tex->buffer) + tex->cmask_buffer = &tex->buffer; + else + si_resource_reference(&tex->cmask_buffer, new_tex->cmask_buffer); + + tex->surface.dcc_offset = new_tex->surface.dcc_offset; + tex->cb_color_info = new_tex->cb_color_info; + memcpy(tex->color_clear_value, new_tex->color_clear_value, sizeof(tex->color_clear_value)); + tex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode; + + tex->surface.htile_offset = new_tex->surface.htile_offset; + tex->depth_clear_value = new_tex->depth_clear_value; + tex->dirty_level_mask = new_tex->dirty_level_mask; + tex->stencil_dirty_level_mask = new_tex->stencil_dirty_level_mask; + tex->db_render_format = new_tex->db_render_format; + tex->stencil_clear_value = new_tex->stencil_clear_value; + tex->tc_compatible_htile = new_tex->tc_compatible_htile; + tex->depth_cleared = new_tex->depth_cleared; + tex->stencil_cleared = new_tex->stencil_cleared; + tex->upgraded_depth = new_tex->upgraded_depth; + tex->db_compatible = new_tex->db_compatible; + tex->can_sample_z = new_tex->can_sample_z; + tex->can_sample_s = new_tex->can_sample_s; + + tex->separate_dcc_dirty = new_tex->separate_dcc_dirty; + tex->displayable_dcc_dirty = new_tex->displayable_dcc_dirty; + tex->dcc_gather_statistics = new_tex->dcc_gather_statistics; + si_resource_reference(&tex->dcc_separate_buffer, new_tex->dcc_separate_buffer); + si_resource_reference(&tex->last_dcc_separate_buffer, new_tex->last_dcc_separate_buffer); + + if (new_bind_flag == PIPE_BIND_LINEAR) { + assert(!tex->surface.htile_offset); + assert(!tex->cmask_buffer); + assert(!tex->surface.fmask_size); + assert(!tex->surface.dcc_offset); + assert(!tex->is_depth); + } + + si_texture_reference(&new_tex, NULL); + + p_atomic_inc(&sctx->screen->dirty_tex_counter); } static uint32_t si_get_bo_metadata_word1(struct si_screen *sscreen) { - return (ATI_VENDOR_ID << 16) | sscreen->info.pci_id; + return (ATI_VENDOR_ID << 16) | sscreen->info.pci_id; } -static void si_set_tex_bo_metadata(struct si_screen *sscreen, - struct si_texture *tex) +static void si_set_tex_bo_metadata(struct si_screen *sscreen, 
struct si_texture *tex) { - struct radeon_surf *surface = &tex->surface; - struct pipe_resource *res = &tex->buffer.b.b; - struct radeon_bo_metadata md; - - memset(&md, 0, sizeof(md)); - - if (sscreen->info.chip_class >= GFX9) { - md.u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode; - md.u.gfx9.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; - - if (tex->surface.dcc_offset && !tex->dcc_separate_buffer) { - uint64_t dcc_offset = - tex->surface.display_dcc_offset ? tex->surface.display_dcc_offset - : tex->surface.dcc_offset; - - assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24)); - md.u.gfx9.dcc_offset_256B = dcc_offset >> 8; - md.u.gfx9.dcc_pitch_max = tex->surface.u.gfx9.display_dcc_pitch_max; - md.u.gfx9.dcc_independent_64B = 1; - } - } else { - md.u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; - md.u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; - md.u.legacy.pipe_config = surface->u.legacy.pipe_config; - md.u.legacy.bankw = surface->u.legacy.bankw; - md.u.legacy.bankh = surface->u.legacy.bankh; - md.u.legacy.tile_split = surface->u.legacy.tile_split; - md.u.legacy.mtilea = surface->u.legacy.mtilea; - md.u.legacy.num_banks = surface->u.legacy.num_banks; - md.u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe; - md.u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; - } - - assert(tex->dcc_separate_buffer == NULL); - assert(tex->surface.fmask_size == 0); - - /* Metadata image format format version 1: - * [0] = 1 (metadata format identifier) - * [1] = (VENDOR_ID << 16) | PCI_ID - * [2:9] = image descriptor for the whole resource - * [2] is always 0, because the base address is cleared - * [9] is the DCC offset bits [39:8] from the beginning of - * the buffer - * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level - */ - - md.metadata[0] = 1; /* metadata image format version 1 */ - - /* TILE_MODE_INDEX is ambiguous without a PCI ID. */ - md.metadata[1] = si_get_bo_metadata_word1(sscreen); - - static const unsigned char swizzle[] = { - PIPE_SWIZZLE_X, - PIPE_SWIZZLE_Y, - PIPE_SWIZZLE_Z, - PIPE_SWIZZLE_W - }; - bool is_array = util_texture_is_array(res->target); - uint32_t desc[8]; - - sscreen->make_texture_descriptor(sscreen, tex, true, - res->target, res->format, - swizzle, 0, res->last_level, 0, - is_array ? res->array_size - 1 : 0, - res->width0, res->height0, res->depth0, - desc, NULL); - - si_set_mutable_tex_desc_fields(sscreen, tex, &tex->surface.u.legacy.level[0], - 0, 0, tex->surface.blk_w, false, desc); - - /* Clear the base address and set the relative DCC offset. */ - desc[0] = 0; - desc[1] &= C_008F14_BASE_ADDRESS_HI; - - switch (sscreen->info.chip_class) { - case GFX6: - case GFX7: - break; - case GFX8: - desc[7] = tex->surface.dcc_offset >> 8; - break; - case GFX9: - desc[7] = tex->surface.dcc_offset >> 8; - desc[5] &= C_008F24_META_DATA_ADDRESS; - desc[5] |= S_008F24_META_DATA_ADDRESS(tex->surface.dcc_offset >> 40); - break; - case GFX10: - desc[6] &= C_00A018_META_DATA_ADDRESS_LO; - desc[6] |= S_00A018_META_DATA_ADDRESS_LO(tex->surface.dcc_offset >> 8); - desc[7] = tex->surface.dcc_offset >> 16; - break; - default: - assert(0); - } - - - /* Dwords [2:9] contain the image descriptor. */ - memcpy(&md.metadata[2], desc, sizeof(desc)); - md.size_metadata = 10 * 4; - - /* Dwords [10:..] contain the mipmap level offsets. 
*/ - if (sscreen->info.chip_class <= GFX8) { - for (unsigned i = 0; i <= res->last_level; i++) - md.metadata[10+i] = tex->surface.u.legacy.level[i].offset >> 8; - - md.size_metadata += (1 + res->last_level) * 4; - } - - sscreen->ws->buffer_set_metadata(tex->buffer.buf, &md); + struct radeon_surf *surface = &tex->surface; + struct pipe_resource *res = &tex->buffer.b.b; + struct radeon_bo_metadata md; + + memset(&md, 0, sizeof(md)); + + if (sscreen->info.chip_class >= GFX9) { + md.u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode; + md.u.gfx9.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; + + if (tex->surface.dcc_offset && !tex->dcc_separate_buffer) { + uint64_t dcc_offset = tex->surface.display_dcc_offset ? tex->surface.display_dcc_offset + : tex->surface.dcc_offset; + + assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24)); + md.u.gfx9.dcc_offset_256B = dcc_offset >> 8; + md.u.gfx9.dcc_pitch_max = tex->surface.u.gfx9.display_dcc_pitch_max; + md.u.gfx9.dcc_independent_64B = 1; + } + } else { + md.u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D + ? RADEON_LAYOUT_TILED + : RADEON_LAYOUT_LINEAR; + md.u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D + ? RADEON_LAYOUT_TILED + : RADEON_LAYOUT_LINEAR; + md.u.legacy.pipe_config = surface->u.legacy.pipe_config; + md.u.legacy.bankw = surface->u.legacy.bankw; + md.u.legacy.bankh = surface->u.legacy.bankh; + md.u.legacy.tile_split = surface->u.legacy.tile_split; + md.u.legacy.mtilea = surface->u.legacy.mtilea; + md.u.legacy.num_banks = surface->u.legacy.num_banks; + md.u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe; + md.u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; + } + + assert(tex->dcc_separate_buffer == NULL); + assert(tex->surface.fmask_size == 0); + + /* Metadata image format format version 1: + * [0] = 1 (metadata format identifier) + * [1] = (VENDOR_ID << 16) | PCI_ID + * [2:9] = image descriptor for the whole resource + * [2] is always 0, because the base address is cleared + * [9] is the DCC offset bits [39:8] from the beginning of + * the buffer + * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level + */ + + md.metadata[0] = 1; /* metadata image format version 1 */ + + /* TILE_MODE_INDEX is ambiguous without a PCI ID. */ + md.metadata[1] = si_get_bo_metadata_word1(sscreen); + + static const unsigned char swizzle[] = {PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, + PIPE_SWIZZLE_W}; + bool is_array = util_texture_is_array(res->target); + uint32_t desc[8]; + + sscreen->make_texture_descriptor(sscreen, tex, true, res->target, res->format, swizzle, 0, + res->last_level, 0, is_array ? res->array_size - 1 : 0, + res->width0, res->height0, res->depth0, desc, NULL); + + si_set_mutable_tex_desc_fields(sscreen, tex, &tex->surface.u.legacy.level[0], 0, 0, + tex->surface.blk_w, false, desc); + + /* Clear the base address and set the relative DCC offset. 
*/ + desc[0] = 0; + desc[1] &= C_008F14_BASE_ADDRESS_HI; + + switch (sscreen->info.chip_class) { + case GFX6: + case GFX7: + break; + case GFX8: + desc[7] = tex->surface.dcc_offset >> 8; + break; + case GFX9: + desc[7] = tex->surface.dcc_offset >> 8; + desc[5] &= C_008F24_META_DATA_ADDRESS; + desc[5] |= S_008F24_META_DATA_ADDRESS(tex->surface.dcc_offset >> 40); + break; + case GFX10: + desc[6] &= C_00A018_META_DATA_ADDRESS_LO; + desc[6] |= S_00A018_META_DATA_ADDRESS_LO(tex->surface.dcc_offset >> 8); + desc[7] = tex->surface.dcc_offset >> 16; + break; + default: + assert(0); + } + + /* Dwords [2:9] contain the image descriptor. */ + memcpy(&md.metadata[2], desc, sizeof(desc)); + md.size_metadata = 10 * 4; + + /* Dwords [10:..] contain the mipmap level offsets. */ + if (sscreen->info.chip_class <= GFX8) { + for (unsigned i = 0; i <= res->last_level; i++) + md.metadata[10 + i] = tex->surface.u.legacy.level[i].offset >> 8; + + md.size_metadata += (1 + res->last_level) * 4; + } + + sscreen->ws->buffer_set_metadata(tex->buffer.buf, &md); } -static bool si_read_tex_bo_metadata(struct si_screen *sscreen, - struct si_texture *tex, - uint64_t offset, - struct radeon_bo_metadata *md) +static bool si_read_tex_bo_metadata(struct si_screen *sscreen, struct si_texture *tex, + uint64_t offset, struct radeon_bo_metadata *md) { - uint32_t *desc = &md->metadata[2]; - - if (offset || /* Non-zero planes ignore metadata. */ - md->size_metadata < 10 * 4 || /* at least 2(header) + 8(desc) dwords */ - md->metadata[0] == 0 || /* invalid version number */ - md->metadata[1] != si_get_bo_metadata_word1(sscreen)) /* invalid PCI ID */ { - /* Disable DCC because it might not be enabled. */ - si_texture_zero_dcc_fields(tex); - - /* Don't report an error if the texture comes from an incompatible driver, - * but this might not work. - */ - return true; - } - - /* Validate that sample counts and the number of mipmap levels match. */ - unsigned last_level = G_008F1C_LAST_LEVEL(desc[3]); - unsigned type = G_008F1C_TYPE(desc[3]); - - if (type == V_008F1C_SQ_RSRC_IMG_2D_MSAA || - type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { - unsigned log_samples = - util_logbase2(MAX2(1, tex->buffer.b.b.nr_storage_samples)); - - if (last_level != log_samples) { - fprintf(stderr, "radeonsi: invalid MSAA texture import, " - "metadata has log2(samples) = %u, the caller set %u\n", - last_level, log_samples); - return false; - } - } else { - if (last_level != tex->buffer.b.b.last_level) { - fprintf(stderr, "radeonsi: invalid mipmapped texture import, " - "metadata has last_level = %u, the caller set %u\n", - last_level, tex->buffer.b.b.last_level); - return false; - } - } - - if (sscreen->info.chip_class >= GFX8 && - G_008F28_COMPRESSION_EN(desc[6])) { - /* Read DCC information. */ - switch (sscreen->info.chip_class) { - case GFX8: - tex->surface.dcc_offset = (uint64_t)desc[7] << 8; - break; - - case GFX9: - tex->surface.dcc_offset = - ((uint64_t)desc[7] << 8) | - ((uint64_t)G_008F24_META_DATA_ADDRESS(desc[5]) << 40); - tex->surface.u.gfx9.dcc.pipe_aligned = - G_008F24_META_PIPE_ALIGNED(desc[5]); - tex->surface.u.gfx9.dcc.rb_aligned = - G_008F24_META_RB_ALIGNED(desc[5]); - - /* If DCC is unaligned, this can only be a displayable image. 
*/ - if (!tex->surface.u.gfx9.dcc.pipe_aligned && - !tex->surface.u.gfx9.dcc.rb_aligned) - assert(tex->surface.is_displayable); - break; - - case GFX10: - tex->surface.dcc_offset = - ((uint64_t)G_00A018_META_DATA_ADDRESS_LO(desc[6]) << 8) | - ((uint64_t)desc[7] << 16); - tex->surface.u.gfx9.dcc.pipe_aligned = - G_00A018_META_PIPE_ALIGNED(desc[6]); - break; - - default: - assert(0); - return false; - } - } else { - /* Disable DCC. dcc_offset is always set by texture_from_handle - * and must be cleared here. - */ - si_texture_zero_dcc_fields(tex); - } - - return true; + uint32_t *desc = &md->metadata[2]; + + if (offset || /* Non-zero planes ignore metadata. */ + md->size_metadata < 10 * 4 || /* at least 2(header) + 8(desc) dwords */ + md->metadata[0] == 0 || /* invalid version number */ + md->metadata[1] != si_get_bo_metadata_word1(sscreen)) /* invalid PCI ID */ { + /* Disable DCC because it might not be enabled. */ + si_texture_zero_dcc_fields(tex); + + /* Don't report an error if the texture comes from an incompatible driver, + * but this might not work. + */ + return true; + } + + /* Validate that sample counts and the number of mipmap levels match. */ + unsigned last_level = G_008F1C_LAST_LEVEL(desc[3]); + unsigned type = G_008F1C_TYPE(desc[3]); + + if (type == V_008F1C_SQ_RSRC_IMG_2D_MSAA || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + unsigned log_samples = util_logbase2(MAX2(1, tex->buffer.b.b.nr_storage_samples)); + + if (last_level != log_samples) { + fprintf(stderr, + "radeonsi: invalid MSAA texture import, " + "metadata has log2(samples) = %u, the caller set %u\n", + last_level, log_samples); + return false; + } + } else { + if (last_level != tex->buffer.b.b.last_level) { + fprintf(stderr, + "radeonsi: invalid mipmapped texture import, " + "metadata has last_level = %u, the caller set %u\n", + last_level, tex->buffer.b.b.last_level); + return false; + } + } + + if (sscreen->info.chip_class >= GFX8 && G_008F28_COMPRESSION_EN(desc[6])) { + /* Read DCC information. */ + switch (sscreen->info.chip_class) { + case GFX8: + tex->surface.dcc_offset = (uint64_t)desc[7] << 8; + break; + + case GFX9: + tex->surface.dcc_offset = + ((uint64_t)desc[7] << 8) | ((uint64_t)G_008F24_META_DATA_ADDRESS(desc[5]) << 40); + tex->surface.u.gfx9.dcc.pipe_aligned = G_008F24_META_PIPE_ALIGNED(desc[5]); + tex->surface.u.gfx9.dcc.rb_aligned = G_008F24_META_RB_ALIGNED(desc[5]); + + /* If DCC is unaligned, this can only be a displayable image. */ + if (!tex->surface.u.gfx9.dcc.pipe_aligned && !tex->surface.u.gfx9.dcc.rb_aligned) + assert(tex->surface.is_displayable); + break; + + case GFX10: + tex->surface.dcc_offset = + ((uint64_t)G_00A018_META_DATA_ADDRESS_LO(desc[6]) << 8) | ((uint64_t)desc[7] << 16); + tex->surface.u.gfx9.dcc.pipe_aligned = G_00A018_META_PIPE_ALIGNED(desc[6]); + break; + + default: + assert(0); + return false; + } + } else { + /* Disable DCC. dcc_offset is always set by texture_from_handle + * and must be cleared here. + */ + si_texture_zero_dcc_fields(tex); + } + + return true; } static bool si_has_displayable_dcc(struct si_texture *tex) { - struct si_screen *sscreen = (struct si_screen*)tex->buffer.b.b.screen; - - if (sscreen->info.chip_class <= GFX8) - return false; - - /* This needs a cache flush before scanout. 
- * (it can't be scanned out and rendered to simultaneously) - */ - if (sscreen->info.use_display_dcc_unaligned && - tex->surface.dcc_offset && - !tex->surface.u.gfx9.dcc.pipe_aligned && - !tex->surface.u.gfx9.dcc.rb_aligned) - return true; - - /* This needs an explicit flush (flush_resource). */ - if (sscreen->info.use_display_dcc_with_retile_blit && - tex->surface.display_dcc_offset) - return true; - - return false; + struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen; + + if (sscreen->info.chip_class <= GFX8) + return false; + + /* This needs a cache flush before scanout. + * (it can't be scanned out and rendered to simultaneously) + */ + if (sscreen->info.use_display_dcc_unaligned && tex->surface.dcc_offset && + !tex->surface.u.gfx9.dcc.pipe_aligned && !tex->surface.u.gfx9.dcc.rb_aligned) + return true; + + /* This needs an explicit flush (flush_resource). */ + if (sscreen->info.use_display_dcc_with_retile_blit && tex->surface.display_dcc_offset) + return true; + + return false; } -static bool si_resource_get_param(struct pipe_screen *screen, - struct pipe_context *context, - struct pipe_resource *resource, - unsigned plane, - unsigned layer, - enum pipe_resource_param param, - unsigned handle_usage, - uint64_t *value) +static bool si_resource_get_param(struct pipe_screen *screen, struct pipe_context *context, + struct pipe_resource *resource, unsigned plane, unsigned layer, + enum pipe_resource_param param, unsigned handle_usage, + uint64_t *value) { - for (unsigned i = 0; i < plane; i++) - resource = resource->next; - - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_texture *tex = (struct si_texture*)resource; - struct winsys_handle whandle; - - switch (param) { - case PIPE_RESOURCE_PARAM_NPLANES: - *value = resource->target == PIPE_BUFFER ? 
1 : tex->num_planes; - return true; - - case PIPE_RESOURCE_PARAM_STRIDE: - if (resource->target == PIPE_BUFFER) - *value = 0; - else if (sscreen->info.chip_class >= GFX9) - *value = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe; - else - *value = tex->surface.u.legacy.level[0].nblk_x * tex->surface.bpe; - return true; - - case PIPE_RESOURCE_PARAM_OFFSET: - if (resource->target == PIPE_BUFFER) - *value = 0; - else if (sscreen->info.chip_class >= GFX9) - *value = tex->surface.u.gfx9.surf_offset + - layer * tex->surface.u.gfx9.surf_slice_size; - else - *value = tex->surface.u.legacy.level[0].offset + - layer * (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4; - return true; - - case PIPE_RESOURCE_PARAM_MODIFIER: - *value = DRM_FORMAT_MOD_INVALID; - return true; - - case PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED: - case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS: - case PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD: - memset(&whandle, 0, sizeof(whandle)); - - if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED) - whandle.type = WINSYS_HANDLE_TYPE_SHARED; - else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS) - whandle.type = WINSYS_HANDLE_TYPE_KMS; - else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD) - whandle.type = WINSYS_HANDLE_TYPE_FD; - - if (!screen->resource_get_handle(screen, context, resource, - &whandle, handle_usage)) - return false; - - *value = whandle.handle; - return true; - } - return false; + for (unsigned i = 0; i < plane; i++) + resource = resource->next; + + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_texture *tex = (struct si_texture *)resource; + struct winsys_handle whandle; + + switch (param) { + case PIPE_RESOURCE_PARAM_NPLANES: + *value = resource->target == PIPE_BUFFER ? 1 : tex->num_planes; + return true; + + case PIPE_RESOURCE_PARAM_STRIDE: + if (resource->target == PIPE_BUFFER) + *value = 0; + else if (sscreen->info.chip_class >= GFX9) + *value = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe; + else + *value = tex->surface.u.legacy.level[0].nblk_x * tex->surface.bpe; + return true; + + case PIPE_RESOURCE_PARAM_OFFSET: + if (resource->target == PIPE_BUFFER) + *value = 0; + else if (sscreen->info.chip_class >= GFX9) + *value = tex->surface.u.gfx9.surf_offset + layer * tex->surface.u.gfx9.surf_slice_size; + else + *value = tex->surface.u.legacy.level[0].offset + + layer * (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4; + return true; + + case PIPE_RESOURCE_PARAM_MODIFIER: + *value = DRM_FORMAT_MOD_INVALID; + return true; + + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED: + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS: + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD: + memset(&whandle, 0, sizeof(whandle)); + + if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED) + whandle.type = WINSYS_HANDLE_TYPE_SHARED; + else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS) + whandle.type = WINSYS_HANDLE_TYPE_KMS; + else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD) + whandle.type = WINSYS_HANDLE_TYPE_FD; + + if (!screen->resource_get_handle(screen, context, resource, &whandle, handle_usage)) + return false; + + *value = whandle.handle; + return true; + } + return false; } -static void si_texture_get_info(struct pipe_screen* screen, - struct pipe_resource *resource, - unsigned *pstride, - unsigned *poffset) +static void si_texture_get_info(struct pipe_screen *screen, struct pipe_resource *resource, + unsigned *pstride, unsigned *poffset) { - uint64_t value; - - if (pstride) { - si_resource_get_param(screen, NULL, resource, 0, 0, - 
PIPE_RESOURCE_PARAM_STRIDE, 0, &value); - *pstride = value; - } - - if (poffset) { - si_resource_get_param(screen, NULL, resource, 0, 0, - PIPE_RESOURCE_PARAM_OFFSET, 0, &value); - *poffset = value; - } + uint64_t value; + + if (pstride) { + si_resource_get_param(screen, NULL, resource, 0, 0, PIPE_RESOURCE_PARAM_STRIDE, 0, &value); + *pstride = value; + } + + if (poffset) { + si_resource_get_param(screen, NULL, resource, 0, 0, PIPE_RESOURCE_PARAM_OFFSET, 0, &value); + *poffset = value; + } } -static bool si_texture_get_handle(struct pipe_screen* screen, - struct pipe_context *ctx, - struct pipe_resource *resource, - struct winsys_handle *whandle, - unsigned usage) +static bool si_texture_get_handle(struct pipe_screen *screen, struct pipe_context *ctx, + struct pipe_resource *resource, struct winsys_handle *whandle, + unsigned usage) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_context *sctx; - struct si_resource *res = si_resource(resource); - struct si_texture *tex = (struct si_texture*)resource; - bool update_metadata = false; - unsigned stride, offset, slice_size; - bool flush = false; - - ctx = threaded_context_unwrap_sync(ctx); - sctx = (struct si_context*)(ctx ? ctx : sscreen->aux_context); - - if (resource->target != PIPE_BUFFER) { - /* Individual planes are chained pipe_resource instances. */ - for (unsigned i = 0; i < whandle->plane; i++) { - resource = resource->next; - res = si_resource(resource); - tex = (struct si_texture*)resource; - } - - /* This is not supported now, but it might be required for OpenCL - * interop in the future. - */ - if (resource->nr_samples > 1 || tex->is_depth) - return false; - - /* Move a suballocated texture into a non-suballocated allocation. */ - if (sscreen->ws->buffer_is_suballocated(res->buf) || - tex->surface.tile_swizzle || - (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && - sscreen->info.has_local_buffers)) { - assert(!res->b.is_shared); - si_reallocate_texture_inplace(sctx, tex, - PIPE_BIND_SHARED, false); - flush = true; - assert(res->b.b.bind & PIPE_BIND_SHARED); - assert(res->flags & RADEON_FLAG_NO_SUBALLOC); - assert(!(res->flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)); - assert(tex->surface.tile_swizzle == 0); - } - - /* Since shader image stores don't support DCC on GFX8, - * disable it for external clients that want write - * access. - */ - if ((usage & PIPE_HANDLE_USAGE_SHADER_WRITE && tex->surface.dcc_offset) || - /* Displayable DCC requires an explicit flush. */ - (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && - si_has_displayable_dcc(tex))) { - if (si_texture_disable_dcc(sctx, tex)) { - update_metadata = true; - /* si_texture_disable_dcc flushes the context */ - flush = false; - } - } - - if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && - (tex->cmask_buffer || tex->surface.dcc_offset)) { - /* Eliminate fast clear (both CMASK and DCC) */ - si_eliminate_fast_color_clear(sctx, tex); - /* eliminate_fast_color_clear flushes the context */ - flush = false; - - /* Disable CMASK if flush_resource isn't going - * to be called. - */ - if (tex->cmask_buffer) - si_texture_discard_cmask(sscreen, tex); - } - - /* Set metadata. */ - if ((!res->b.is_shared || update_metadata) && whandle->offset == 0) - si_set_tex_bo_metadata(sscreen, tex); - - if (sscreen->info.chip_class >= GFX9) { - slice_size = tex->surface.u.gfx9.surf_slice_size; - } else { - slice_size = (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4; - } - } else { - /* Buffer exports are for the OpenCL interop. 
*/ - /* Move a suballocated buffer into a non-suballocated allocation. */ - if (sscreen->ws->buffer_is_suballocated(res->buf) || - /* A DMABUF export always fails if the BO is local. */ - (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && - sscreen->info.has_local_buffers)) { - assert(!res->b.is_shared); - - /* Allocate a new buffer with PIPE_BIND_SHARED. */ - struct pipe_resource templ = res->b.b; - templ.bind |= PIPE_BIND_SHARED; - - struct pipe_resource *newb = - screen->resource_create(screen, &templ); - if (!newb) - return false; - - /* Copy the old buffer contents to the new one. */ - struct pipe_box box; - u_box_1d(0, newb->width0, &box); - sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0, - &res->b.b, 0, &box); - flush = true; - /* Move the new buffer storage to the old pipe_resource. */ - si_replace_buffer_storage(&sctx->b, &res->b.b, newb); - pipe_resource_reference(&newb, NULL); - - assert(res->b.b.bind & PIPE_BIND_SHARED); - assert(res->flags & RADEON_FLAG_NO_SUBALLOC); - } - - /* Buffers */ - slice_size = 0; - } - - si_texture_get_info(screen, resource, &stride, &offset); - - if (flush) - sctx->b.flush(&sctx->b, NULL, 0); - - if (res->b.is_shared) { - /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user - * doesn't set it. - */ - res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; - if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) - res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; - } else { - res->b.is_shared = true; - res->external_usage = usage; - } - - whandle->stride = stride; - whandle->offset = offset + slice_size * whandle->layer; - - return sscreen->ws->buffer_get_handle(sscreen->ws, res->buf, whandle); + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_context *sctx; + struct si_resource *res = si_resource(resource); + struct si_texture *tex = (struct si_texture *)resource; + bool update_metadata = false; + unsigned stride, offset, slice_size; + bool flush = false; + + ctx = threaded_context_unwrap_sync(ctx); + sctx = (struct si_context *)(ctx ? ctx : sscreen->aux_context); + + if (resource->target != PIPE_BUFFER) { + /* Individual planes are chained pipe_resource instances. */ + for (unsigned i = 0; i < whandle->plane; i++) { + resource = resource->next; + res = si_resource(resource); + tex = (struct si_texture *)resource; + } + + /* This is not supported now, but it might be required for OpenCL + * interop in the future. + */ + if (resource->nr_samples > 1 || tex->is_depth) + return false; + + /* Move a suballocated texture into a non-suballocated allocation. */ + if (sscreen->ws->buffer_is_suballocated(res->buf) || tex->surface.tile_swizzle || + (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && + sscreen->info.has_local_buffers)) { + assert(!res->b.is_shared); + si_reallocate_texture_inplace(sctx, tex, PIPE_BIND_SHARED, false); + flush = true; + assert(res->b.b.bind & PIPE_BIND_SHARED); + assert(res->flags & RADEON_FLAG_NO_SUBALLOC); + assert(!(res->flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)); + assert(tex->surface.tile_swizzle == 0); + } + + /* Since shader image stores don't support DCC on GFX8, + * disable it for external clients that want write + * access. + */ + if ((usage & PIPE_HANDLE_USAGE_SHADER_WRITE && tex->surface.dcc_offset) || + /* Displayable DCC requires an explicit flush. 
*/ + (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && si_has_displayable_dcc(tex))) { + if (si_texture_disable_dcc(sctx, tex)) { + update_metadata = true; + /* si_texture_disable_dcc flushes the context */ + flush = false; + } + } + + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && + (tex->cmask_buffer || tex->surface.dcc_offset)) { + /* Eliminate fast clear (both CMASK and DCC) */ + si_eliminate_fast_color_clear(sctx, tex); + /* eliminate_fast_color_clear flushes the context */ + flush = false; + + /* Disable CMASK if flush_resource isn't going + * to be called. + */ + if (tex->cmask_buffer) + si_texture_discard_cmask(sscreen, tex); + } + + /* Set metadata. */ + if ((!res->b.is_shared || update_metadata) && whandle->offset == 0) + si_set_tex_bo_metadata(sscreen, tex); + + if (sscreen->info.chip_class >= GFX9) { + slice_size = tex->surface.u.gfx9.surf_slice_size; + } else { + slice_size = (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4; + } + } else { + /* Buffer exports are for the OpenCL interop. */ + /* Move a suballocated buffer into a non-suballocated allocation. */ + if (sscreen->ws->buffer_is_suballocated(res->buf) || + /* A DMABUF export always fails if the BO is local. */ + (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && + sscreen->info.has_local_buffers)) { + assert(!res->b.is_shared); + + /* Allocate a new buffer with PIPE_BIND_SHARED. */ + struct pipe_resource templ = res->b.b; + templ.bind |= PIPE_BIND_SHARED; + + struct pipe_resource *newb = screen->resource_create(screen, &templ); + if (!newb) + return false; + + /* Copy the old buffer contents to the new one. */ + struct pipe_box box; + u_box_1d(0, newb->width0, &box); + sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0, &res->b.b, 0, &box); + flush = true; + /* Move the new buffer storage to the old pipe_resource. */ + si_replace_buffer_storage(&sctx->b, &res->b.b, newb); + pipe_resource_reference(&newb, NULL); + + assert(res->b.b.bind & PIPE_BIND_SHARED); + assert(res->flags & RADEON_FLAG_NO_SUBALLOC); + } + + /* Buffers */ + slice_size = 0; + } + + si_texture_get_info(screen, resource, &stride, &offset); + + if (flush) + sctx->b.flush(&sctx->b, NULL, 0); + + if (res->b.is_shared) { + /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user + * doesn't set it. 
+ */ + res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) + res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; + } else { + res->b.is_shared = true; + res->external_usage = usage; + } + + whandle->stride = stride; + whandle->offset = offset + slice_size * whandle->layer; + + return sscreen->ws->buffer_get_handle(sscreen->ws, res->buf, whandle); } -static void si_texture_destroy(struct pipe_screen *screen, - struct pipe_resource *ptex) +static void si_texture_destroy(struct pipe_screen *screen, struct pipe_resource *ptex) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_texture *tex = (struct si_texture*)ptex; - struct si_resource *resource = &tex->buffer; - - if (sscreen->info.chip_class >= GFX9) - free(tex->surface.u.gfx9.dcc_retile_map); - - si_texture_reference(&tex->flushed_depth_texture, NULL); - - if (tex->cmask_buffer != &tex->buffer) { - si_resource_reference(&tex->cmask_buffer, NULL); - } - pb_reference(&resource->buf, NULL); - si_resource_reference(&tex->dcc_separate_buffer, NULL); - si_resource_reference(&tex->last_dcc_separate_buffer, NULL); - FREE(tex); + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_texture *tex = (struct si_texture *)ptex; + struct si_resource *resource = &tex->buffer; + + if (sscreen->info.chip_class >= GFX9) + free(tex->surface.u.gfx9.dcc_retile_map); + + si_texture_reference(&tex->flushed_depth_texture, NULL); + + if (tex->cmask_buffer != &tex->buffer) { + si_resource_reference(&tex->cmask_buffer, NULL); + } + pb_reference(&resource->buf, NULL); + si_resource_reference(&tex->dcc_separate_buffer, NULL); + si_resource_reference(&tex->last_dcc_separate_buffer, NULL); + FREE(tex); } static const struct u_resource_vtbl si_texture_vtbl; -void si_print_texture_info(struct si_screen *sscreen, - struct si_texture *tex, struct u_log_context *log) +void si_print_texture_info(struct si_screen *sscreen, struct si_texture *tex, + struct u_log_context *log) { - int i; - - /* Common parameters. 
*/ - u_log_printf(log, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, " - "blk_h=%u, array_size=%u, last_level=%u, " - "bpe=%u, nsamples=%u, flags=0x%x, %s\n", - tex->buffer.b.b.width0, tex->buffer.b.b.height0, - tex->buffer.b.b.depth0, tex->surface.blk_w, - tex->surface.blk_h, - tex->buffer.b.b.array_size, tex->buffer.b.b.last_level, - tex->surface.bpe, tex->buffer.b.b.nr_samples, - tex->surface.flags, util_format_short_name(tex->buffer.b.b.format)); - - if (sscreen->info.chip_class >= GFX9) { - u_log_printf(log, " Surf: size=%"PRIu64", slice_size=%"PRIu64", " - "alignment=%u, swmode=%u, epitch=%u, pitch=%u\n", - tex->surface.surf_size, - tex->surface.u.gfx9.surf_slice_size, - tex->surface.surf_alignment, - tex->surface.u.gfx9.surf.swizzle_mode, - tex->surface.u.gfx9.surf.epitch, - tex->surface.u.gfx9.surf_pitch); - - if (tex->surface.fmask_offset) { - u_log_printf(log, " FMASK: offset=%"PRIu64", size=%"PRIu64", " - "alignment=%u, swmode=%u, epitch=%u\n", - tex->surface.fmask_offset, - tex->surface.fmask_size, - tex->surface.fmask_alignment, - tex->surface.u.gfx9.fmask.swizzle_mode, - tex->surface.u.gfx9.fmask.epitch); - } - - if (tex->cmask_buffer) { - u_log_printf(log, " CMask: offset=%"PRIu64", size=%u, " - "alignment=%u, rb_aligned=%u, pipe_aligned=%u\n", - tex->surface.cmask_offset, - tex->surface.cmask_size, - tex->surface.cmask_alignment, - tex->surface.u.gfx9.cmask.rb_aligned, - tex->surface.u.gfx9.cmask.pipe_aligned); - } - - if (tex->surface.htile_offset) { - u_log_printf(log, " HTile: offset=%"PRIu64", size=%u, alignment=%u, " - "rb_aligned=%u, pipe_aligned=%u\n", - tex->surface.htile_offset, - tex->surface.htile_size, - tex->surface.htile_alignment, - tex->surface.u.gfx9.htile.rb_aligned, - tex->surface.u.gfx9.htile.pipe_aligned); - } - - if (tex->surface.dcc_offset) { - u_log_printf(log, " DCC: offset=%"PRIu64", size=%u, " - "alignment=%u, pitch_max=%u, num_dcc_levels=%u\n", - tex->surface.dcc_offset, tex->surface.dcc_size, - tex->surface.dcc_alignment, - tex->surface.u.gfx9.display_dcc_pitch_max, - tex->surface.num_dcc_levels); - } - - if (tex->surface.u.gfx9.stencil_offset) { - u_log_printf(log, " Stencil: offset=%"PRIu64", swmode=%u, epitch=%u\n", - tex->surface.u.gfx9.stencil_offset, - tex->surface.u.gfx9.stencil.swizzle_mode, - tex->surface.u.gfx9.stencil.epitch); - } - return; - } - - u_log_printf(log, " Layout: size=%"PRIu64", alignment=%u, bankw=%u, " - "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n", - tex->surface.surf_size, tex->surface.surf_alignment, tex->surface.u.legacy.bankw, - tex->surface.u.legacy.bankh, tex->surface.u.legacy.num_banks, tex->surface.u.legacy.mtilea, - tex->surface.u.legacy.tile_split, tex->surface.u.legacy.pipe_config, - (tex->surface.flags & RADEON_SURF_SCANOUT) != 0); - - if (tex->surface.fmask_offset) - u_log_printf(log, " FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, " - "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n", - tex->surface.fmask_offset, tex->surface.fmask_size, tex->surface.fmask_alignment, - tex->surface.u.legacy.fmask.pitch_in_pixels, - tex->surface.u.legacy.fmask.bankh, - tex->surface.u.legacy.fmask.slice_tile_max, - tex->surface.u.legacy.fmask.tiling_index); - - if (tex->cmask_buffer) - u_log_printf(log, " CMask: offset=%"PRIu64", size=%u, alignment=%u, " - "slice_tile_max=%u\n", - tex->surface.cmask_offset, tex->surface.cmask_size, tex->surface.cmask_alignment, - tex->surface.u.legacy.cmask_slice_tile_max); - - if (tex->surface.htile_offset) - 
u_log_printf(log, " HTile: offset=%"PRIu64", size=%u, " - "alignment=%u, TC_compatible = %u\n", - tex->surface.htile_offset, tex->surface.htile_size, - tex->surface.htile_alignment, - tex->tc_compatible_htile); - - if (tex->surface.dcc_offset) { - u_log_printf(log, " DCC: offset=%"PRIu64", size=%u, alignment=%u\n", - tex->surface.dcc_offset, tex->surface.dcc_size, - tex->surface.dcc_alignment); - for (i = 0; i <= tex->buffer.b.b.last_level; i++) - u_log_printf(log, " DCCLevel[%i]: enabled=%u, offset=%u, " - "fast_clear_size=%u\n", - i, i < tex->surface.num_dcc_levels, - tex->surface.u.legacy.level[i].dcc_offset, - tex->surface.u.legacy.level[i].dcc_fast_clear_size); - } - - for (i = 0; i <= tex->buffer.b.b.last_level; i++) - u_log_printf(log, " Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", " - "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " - "mode=%u, tiling_index = %u\n", - i, tex->surface.u.legacy.level[i].offset, - (uint64_t)tex->surface.u.legacy.level[i].slice_size_dw * 4, - u_minify(tex->buffer.b.b.width0, i), - u_minify(tex->buffer.b.b.height0, i), - u_minify(tex->buffer.b.b.depth0, i), - tex->surface.u.legacy.level[i].nblk_x, - tex->surface.u.legacy.level[i].nblk_y, - tex->surface.u.legacy.level[i].mode, - tex->surface.u.legacy.tiling_index[i]); - - if (tex->surface.has_stencil) { - u_log_printf(log, " StencilLayout: tilesplit=%u\n", - tex->surface.u.legacy.stencil_tile_split); - for (i = 0; i <= tex->buffer.b.b.last_level; i++) { - u_log_printf(log, " StencilLevel[%i]: offset=%"PRIu64", " - "slice_size=%"PRIu64", npix_x=%u, " - "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " - "mode=%u, tiling_index = %u\n", - i, tex->surface.u.legacy.stencil_level[i].offset, - (uint64_t)tex->surface.u.legacy.stencil_level[i].slice_size_dw * 4, - u_minify(tex->buffer.b.b.width0, i), - u_minify(tex->buffer.b.b.height0, i), - u_minify(tex->buffer.b.b.depth0, i), - tex->surface.u.legacy.stencil_level[i].nblk_x, - tex->surface.u.legacy.stencil_level[i].nblk_y, - tex->surface.u.legacy.stencil_level[i].mode, - tex->surface.u.legacy.stencil_tiling_index[i]); - } - } + int i; + + /* Common parameters. 
*/ + u_log_printf(log, + " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, " + "blk_h=%u, array_size=%u, last_level=%u, " + "bpe=%u, nsamples=%u, flags=0x%x, %s\n", + tex->buffer.b.b.width0, tex->buffer.b.b.height0, tex->buffer.b.b.depth0, + tex->surface.blk_w, tex->surface.blk_h, tex->buffer.b.b.array_size, + tex->buffer.b.b.last_level, tex->surface.bpe, tex->buffer.b.b.nr_samples, + tex->surface.flags, util_format_short_name(tex->buffer.b.b.format)); + + if (sscreen->info.chip_class >= GFX9) { + u_log_printf(log, + " Surf: size=%" PRIu64 ", slice_size=%" PRIu64 ", " + "alignment=%u, swmode=%u, epitch=%u, pitch=%u\n", + tex->surface.surf_size, tex->surface.u.gfx9.surf_slice_size, + tex->surface.surf_alignment, tex->surface.u.gfx9.surf.swizzle_mode, + tex->surface.u.gfx9.surf.epitch, tex->surface.u.gfx9.surf_pitch); + + if (tex->surface.fmask_offset) { + u_log_printf(log, + " FMASK: offset=%" PRIu64 ", size=%" PRIu64 ", " + "alignment=%u, swmode=%u, epitch=%u\n", + tex->surface.fmask_offset, tex->surface.fmask_size, + tex->surface.fmask_alignment, tex->surface.u.gfx9.fmask.swizzle_mode, + tex->surface.u.gfx9.fmask.epitch); + } + + if (tex->cmask_buffer) { + u_log_printf(log, + " CMask: offset=%" PRIu64 ", size=%u, " + "alignment=%u, rb_aligned=%u, pipe_aligned=%u\n", + tex->surface.cmask_offset, tex->surface.cmask_size, + tex->surface.cmask_alignment, tex->surface.u.gfx9.cmask.rb_aligned, + tex->surface.u.gfx9.cmask.pipe_aligned); + } + + if (tex->surface.htile_offset) { + u_log_printf(log, + " HTile: offset=%" PRIu64 ", size=%u, alignment=%u, " + "rb_aligned=%u, pipe_aligned=%u\n", + tex->surface.htile_offset, tex->surface.htile_size, + tex->surface.htile_alignment, tex->surface.u.gfx9.htile.rb_aligned, + tex->surface.u.gfx9.htile.pipe_aligned); + } + + if (tex->surface.dcc_offset) { + u_log_printf(log, + " DCC: offset=%" PRIu64 ", size=%u, " + "alignment=%u, pitch_max=%u, num_dcc_levels=%u\n", + tex->surface.dcc_offset, tex->surface.dcc_size, tex->surface.dcc_alignment, + tex->surface.u.gfx9.display_dcc_pitch_max, tex->surface.num_dcc_levels); + } + + if (tex->surface.u.gfx9.stencil_offset) { + u_log_printf(log, " Stencil: offset=%" PRIu64 ", swmode=%u, epitch=%u\n", + tex->surface.u.gfx9.stencil_offset, tex->surface.u.gfx9.stencil.swizzle_mode, + tex->surface.u.gfx9.stencil.epitch); + } + return; + } + + u_log_printf(log, + " Layout: size=%" PRIu64 ", alignment=%u, bankw=%u, " + "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n", + tex->surface.surf_size, tex->surface.surf_alignment, tex->surface.u.legacy.bankw, + tex->surface.u.legacy.bankh, tex->surface.u.legacy.num_banks, + tex->surface.u.legacy.mtilea, tex->surface.u.legacy.tile_split, + tex->surface.u.legacy.pipe_config, (tex->surface.flags & RADEON_SURF_SCANOUT) != 0); + + if (tex->surface.fmask_offset) + u_log_printf( + log, + " FMask: offset=%" PRIu64 ", size=%" PRIu64 ", alignment=%u, pitch_in_pixels=%u, " + "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n", + tex->surface.fmask_offset, tex->surface.fmask_size, tex->surface.fmask_alignment, + tex->surface.u.legacy.fmask.pitch_in_pixels, tex->surface.u.legacy.fmask.bankh, + tex->surface.u.legacy.fmask.slice_tile_max, tex->surface.u.legacy.fmask.tiling_index); + + if (tex->cmask_buffer) + u_log_printf(log, + " CMask: offset=%" PRIu64 ", size=%u, alignment=%u, " + "slice_tile_max=%u\n", + tex->surface.cmask_offset, tex->surface.cmask_size, tex->surface.cmask_alignment, + tex->surface.u.legacy.cmask_slice_tile_max); + + if (tex->surface.htile_offset) + 
u_log_printf(log, + " HTile: offset=%" PRIu64 ", size=%u, " + "alignment=%u, TC_compatible = %u\n", + tex->surface.htile_offset, tex->surface.htile_size, tex->surface.htile_alignment, + tex->tc_compatible_htile); + + if (tex->surface.dcc_offset) { + u_log_printf(log, " DCC: offset=%" PRIu64 ", size=%u, alignment=%u\n", + tex->surface.dcc_offset, tex->surface.dcc_size, tex->surface.dcc_alignment); + for (i = 0; i <= tex->buffer.b.b.last_level; i++) + u_log_printf(log, + " DCCLevel[%i]: enabled=%u, offset=%u, " + "fast_clear_size=%u\n", + i, i < tex->surface.num_dcc_levels, tex->surface.u.legacy.level[i].dcc_offset, + tex->surface.u.legacy.level[i].dcc_fast_clear_size); + } + + for (i = 0; i <= tex->buffer.b.b.last_level; i++) + u_log_printf(log, + " Level[%i]: offset=%" PRIu64 ", slice_size=%" PRIu64 ", " + "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " + "mode=%u, tiling_index = %u\n", + i, tex->surface.u.legacy.level[i].offset, + (uint64_t)tex->surface.u.legacy.level[i].slice_size_dw * 4, + u_minify(tex->buffer.b.b.width0, i), u_minify(tex->buffer.b.b.height0, i), + u_minify(tex->buffer.b.b.depth0, i), tex->surface.u.legacy.level[i].nblk_x, + tex->surface.u.legacy.level[i].nblk_y, tex->surface.u.legacy.level[i].mode, + tex->surface.u.legacy.tiling_index[i]); + + if (tex->surface.has_stencil) { + u_log_printf(log, " StencilLayout: tilesplit=%u\n", + tex->surface.u.legacy.stencil_tile_split); + for (i = 0; i <= tex->buffer.b.b.last_level; i++) { + u_log_printf(log, + " StencilLevel[%i]: offset=%" PRIu64 ", " + "slice_size=%" PRIu64 ", npix_x=%u, " + "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " + "mode=%u, tiling_index = %u\n", + i, tex->surface.u.legacy.stencil_level[i].offset, + (uint64_t)tex->surface.u.legacy.stencil_level[i].slice_size_dw * 4, + u_minify(tex->buffer.b.b.width0, i), u_minify(tex->buffer.b.b.height0, i), + u_minify(tex->buffer.b.b.depth0, i), + tex->surface.u.legacy.stencil_level[i].nblk_x, + tex->surface.u.legacy.stencil_level[i].nblk_y, + tex->surface.u.legacy.stencil_level[i].mode, + tex->surface.u.legacy.stencil_tiling_index[i]); + } + } } /** @@ -1293,611 +1193,569 @@ void si_print_texture_info(struct si_screen *sscreen, * \param alloc_size the size to allocate if plane0 != NULL * \param alignment alignment for the allocation */ -static struct si_texture * -si_texture_create_object(struct pipe_screen *screen, - const struct pipe_resource *base, - const struct radeon_surf *surface, - const struct si_texture *plane0, - struct pb_buffer *imported_buf, - uint64_t offset, - uint64_t alloc_size, - unsigned alignment) +static struct si_texture *si_texture_create_object(struct pipe_screen *screen, + const struct pipe_resource *base, + const struct radeon_surf *surface, + const struct si_texture *plane0, + struct pb_buffer *imported_buf, uint64_t offset, + uint64_t alloc_size, unsigned alignment) { - struct si_texture *tex; - struct si_resource *resource; - struct si_screen *sscreen = (struct si_screen*)screen; - - tex = CALLOC_STRUCT(si_texture); - if (!tex) - goto error; - - resource = &tex->buffer; - resource->b.b = *base; - resource->b.b.next = NULL; - resource->b.vtbl = &si_texture_vtbl; - pipe_reference_init(&resource->b.b.reference, 1); - resource->b.b.screen = screen; - - /* don't include stencil-only formats which we don't support for rendering */ - tex->is_depth = util_format_has_depth(util_format_description(tex->buffer.b.b.format)); - tex->surface = *surface; - tex->tc_compatible_htile = tex->surface.htile_size != 0 && - (tex->surface.flags & - 
RADEON_SURF_TC_COMPATIBLE_HTILE); - - /* TC-compatible HTILE: - * - GFX8 only supports Z32_FLOAT. - * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */ - if (tex->tc_compatible_htile) { - if (sscreen->info.chip_class >= GFX9 && - base->format == PIPE_FORMAT_Z16_UNORM) - tex->db_render_format = base->format; - else { - tex->db_render_format = PIPE_FORMAT_Z32_FLOAT; - tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT && - base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT; - } - } else { - tex->db_render_format = base->format; - } - - /* Applies to GCN. */ - tex->last_msaa_resolve_target_micro_mode = tex->surface.micro_tile_mode; - - /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers - * between frames, so the only thing that can enable separate DCC - * with DRI2 is multiple slow clears within a frame. - */ - tex->ps_draw_ratio = 0; - - if (sscreen->info.chip_class >= GFX9) { - tex->surface.u.gfx9.surf_offset = offset; - } else { - for (unsigned i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i) - tex->surface.u.legacy.level[i].offset += offset; - } - - if (tex->is_depth) { - if (sscreen->info.chip_class >= GFX9) { - tex->can_sample_z = true; - tex->can_sample_s = true; - - /* Stencil texturing with HTILE doesn't work - * with mipmapping on Navi10-14. */ - if ((sscreen->info.family == CHIP_NAVI10 || - sscreen->info.family == CHIP_NAVI12 || - sscreen->info.family == CHIP_NAVI14) && - base->last_level > 0) - tex->htile_stencil_disabled = true; - } else { - tex->can_sample_z = !tex->surface.u.legacy.depth_adjusted; - tex->can_sample_s = !tex->surface.u.legacy.stencil_adjusted; - } - - tex->db_compatible = surface->flags & RADEON_SURF_ZBUFFER; - } else { - if (tex->surface.cmask_offset) { - tex->cb_color_info |= S_028C70_FAST_CLEAR(1); - tex->cmask_buffer = &tex->buffer; - } - } - - if (plane0) { - /* The buffer is shared with the first plane. */ - resource->bo_size = plane0->buffer.bo_size; - resource->bo_alignment = plane0->buffer.bo_alignment; - resource->flags = plane0->buffer.flags; - resource->domains = plane0->buffer.domains; - resource->vram_usage = plane0->buffer.vram_usage; - resource->gart_usage = plane0->buffer.gart_usage; - - pb_reference(&resource->buf, plane0->buffer.buf); - resource->gpu_address = plane0->buffer.gpu_address; - } else if (!(surface->flags & RADEON_SURF_IMPORTED)) { - /* Create the backing buffer. */ - si_init_resource_fields(sscreen, resource, alloc_size, alignment); - - if (!si_alloc_resource(sscreen, resource)) - goto error; - } else { - resource->buf = imported_buf; - resource->gpu_address = sscreen->ws->buffer_get_virtual_address(resource->buf); - resource->bo_size = imported_buf->size; - resource->bo_alignment = imported_buf->alignment; - resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf); - if (resource->domains & RADEON_DOMAIN_VRAM) - resource->vram_usage = resource->bo_size; - else if (resource->domains & RADEON_DOMAIN_GTT) - resource->gart_usage = resource->bo_size; - } - - if (tex->cmask_buffer) { - /* Initialize the cmask to 0xCC (= compressed state). 
*/ - si_screen_clear_buffer(sscreen, &tex->cmask_buffer->b.b, - tex->surface.cmask_offset, tex->surface.cmask_size, - 0xCCCCCCCC); - } - if (tex->surface.htile_offset) { - uint32_t clear_value = 0; - - if (sscreen->info.chip_class >= GFX9 || tex->tc_compatible_htile) - clear_value = 0x0000030F; - - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.htile_offset, - tex->surface.htile_size, - clear_value); - } - - /* Initialize DCC only if the texture is not being imported. */ - if (!(surface->flags & RADEON_SURF_IMPORTED) && tex->surface.dcc_offset) { - /* Clear DCC to black for all tiles with DCC enabled. - * - * This fixes corruption in 3DMark Slingshot Extreme, which - * uses uninitialized textures, causing corruption. - */ - if (tex->surface.num_dcc_levels == tex->buffer.b.b.last_level + 1 && - tex->buffer.b.b.nr_samples <= 2) { - /* Simple case - all tiles have DCC enabled. */ - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.dcc_offset, - tex->surface.dcc_size, - DCC_CLEAR_COLOR_0000); - } else if (sscreen->info.chip_class >= GFX9) { - /* Clear to uncompressed. Clearing this to black is complicated. */ - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.dcc_offset, - tex->surface.dcc_size, - DCC_UNCOMPRESSED); - } else { - /* GFX8: Initialize mipmap levels and multisamples separately. */ - if (tex->buffer.b.b.nr_samples >= 2) { - /* Clearing this to black is complicated. */ - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.dcc_offset, - tex->surface.dcc_size, - DCC_UNCOMPRESSED); - } else { - /* Clear the enabled mipmap levels to black. */ - unsigned size = 0; - - for (unsigned i = 0; i < tex->surface.num_dcc_levels; i++) { - if (!tex->surface.u.legacy.level[i].dcc_fast_clear_size) - break; - - size = tex->surface.u.legacy.level[i].dcc_offset + - tex->surface.u.legacy.level[i].dcc_fast_clear_size; - } - - /* Mipmap levels with DCC. */ - if (size) { - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.dcc_offset, size, - DCC_CLEAR_COLOR_0000); - } - /* Mipmap levels without DCC. */ - if (size != tex->surface.dcc_size) { - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.dcc_offset + size, - tex->surface.dcc_size - size, - DCC_UNCOMPRESSED); - } - } - } - - /* Initialize displayable DCC that requires the retile blit. */ - if (tex->surface.dcc_retile_map_offset) { - /* Uninitialized DCC can hang the display hw. - * Clear to white to indicate that. */ - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.display_dcc_offset, - tex->surface.u.gfx9.display_dcc_size, - DCC_CLEAR_COLOR_1111); - - /* Upload the DCC retile map. - * Use a staging buffer for the upload, because - * the buffer backing the texture is unmappable. - */ - bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16; - unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements; - struct si_resource *buf = - si_aligned_buffer_create(screen, 0, PIPE_USAGE_STREAM, - num_elements * (use_uint16 ? 2 : 4), - sscreen->info.tcc_cache_line_size); - uint32_t *ui = (uint32_t*)sscreen->ws->buffer_map(buf->buf, NULL, - PIPE_TRANSFER_WRITE); - uint16_t *us = (uint16_t*)ui; - - /* Upload the retile map into a staging buffer. */ - if (use_uint16) { - for (unsigned i = 0; i < num_elements; i++) - us[i] = tex->surface.u.gfx9.dcc_retile_map[i]; - } else { - for (unsigned i = 0; i < num_elements; i++) - ui[i] = tex->surface.u.gfx9.dcc_retile_map[i]; - } - - /* Copy the staging buffer to the buffer backing the texture. 
*/ - struct si_context *sctx = (struct si_context*)sscreen->aux_context; - - assert(tex->surface.dcc_retile_map_offset <= UINT_MAX); - simple_mtx_lock(&sscreen->aux_context_lock); - si_sdma_copy_buffer(sctx, &tex->buffer.b.b, &buf->b.b, - tex->surface.dcc_retile_map_offset, - 0, buf->b.b.width0); - sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); - simple_mtx_unlock(&sscreen->aux_context_lock); - - si_resource_reference(&buf, NULL); - } - } - - /* Initialize the CMASK base register value. */ - tex->cmask_base_address_reg = - (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8; - - if (sscreen->debug_flags & DBG(VM)) { - fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Texture %ix%ix%i, %i levels, %i samples, %s\n", - tex->buffer.gpu_address, - tex->buffer.gpu_address + tex->buffer.buf->size, - base->width0, base->height0, util_num_layers(base, 0), base->last_level+1, - base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format)); - } - - if (sscreen->debug_flags & DBG(TEX)) { - puts("Texture:"); - struct u_log_context log; - u_log_context_init(&log); - si_print_texture_info(sscreen, tex, &log); - u_log_new_page_print(&log, stdout); - fflush(stdout); - u_log_context_destroy(&log); - } - - return tex; + struct si_texture *tex; + struct si_resource *resource; + struct si_screen *sscreen = (struct si_screen *)screen; + + tex = CALLOC_STRUCT(si_texture); + if (!tex) + goto error; + + resource = &tex->buffer; + resource->b.b = *base; + resource->b.b.next = NULL; + resource->b.vtbl = &si_texture_vtbl; + pipe_reference_init(&resource->b.b.reference, 1); + resource->b.b.screen = screen; + + /* don't include stencil-only formats which we don't support for rendering */ + tex->is_depth = util_format_has_depth(util_format_description(tex->buffer.b.b.format)); + tex->surface = *surface; + tex->tc_compatible_htile = + tex->surface.htile_size != 0 && (tex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE); + + /* TC-compatible HTILE: + * - GFX8 only supports Z32_FLOAT. + * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */ + if (tex->tc_compatible_htile) { + if (sscreen->info.chip_class >= GFX9 && base->format == PIPE_FORMAT_Z16_UNORM) + tex->db_render_format = base->format; + else { + tex->db_render_format = PIPE_FORMAT_Z32_FLOAT; + tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT && + base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT; + } + } else { + tex->db_render_format = base->format; + } + + /* Applies to GCN. */ + tex->last_msaa_resolve_target_micro_mode = tex->surface.micro_tile_mode; + + /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers + * between frames, so the only thing that can enable separate DCC + * with DRI2 is multiple slow clears within a frame. + */ + tex->ps_draw_ratio = 0; + + if (sscreen->info.chip_class >= GFX9) { + tex->surface.u.gfx9.surf_offset = offset; + } else { + for (unsigned i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i) + tex->surface.u.legacy.level[i].offset += offset; + } + + if (tex->is_depth) { + if (sscreen->info.chip_class >= GFX9) { + tex->can_sample_z = true; + tex->can_sample_s = true; + + /* Stencil texturing with HTILE doesn't work + * with mipmapping on Navi10-14. 
*/ + if ((sscreen->info.family == CHIP_NAVI10 || sscreen->info.family == CHIP_NAVI12 || + sscreen->info.family == CHIP_NAVI14) && + base->last_level > 0) + tex->htile_stencil_disabled = true; + } else { + tex->can_sample_z = !tex->surface.u.legacy.depth_adjusted; + tex->can_sample_s = !tex->surface.u.legacy.stencil_adjusted; + } + + tex->db_compatible = surface->flags & RADEON_SURF_ZBUFFER; + } else { + if (tex->surface.cmask_offset) { + tex->cb_color_info |= S_028C70_FAST_CLEAR(1); + tex->cmask_buffer = &tex->buffer; + } + } + + if (plane0) { + /* The buffer is shared with the first plane. */ + resource->bo_size = plane0->buffer.bo_size; + resource->bo_alignment = plane0->buffer.bo_alignment; + resource->flags = plane0->buffer.flags; + resource->domains = plane0->buffer.domains; + resource->vram_usage = plane0->buffer.vram_usage; + resource->gart_usage = plane0->buffer.gart_usage; + + pb_reference(&resource->buf, plane0->buffer.buf); + resource->gpu_address = plane0->buffer.gpu_address; + } else if (!(surface->flags & RADEON_SURF_IMPORTED)) { + /* Create the backing buffer. */ + si_init_resource_fields(sscreen, resource, alloc_size, alignment); + + if (!si_alloc_resource(sscreen, resource)) + goto error; + } else { + resource->buf = imported_buf; + resource->gpu_address = sscreen->ws->buffer_get_virtual_address(resource->buf); + resource->bo_size = imported_buf->size; + resource->bo_alignment = imported_buf->alignment; + resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf); + if (resource->domains & RADEON_DOMAIN_VRAM) + resource->vram_usage = resource->bo_size; + else if (resource->domains & RADEON_DOMAIN_GTT) + resource->gart_usage = resource->bo_size; + } + + if (tex->cmask_buffer) { + /* Initialize the cmask to 0xCC (= compressed state). */ + si_screen_clear_buffer(sscreen, &tex->cmask_buffer->b.b, tex->surface.cmask_offset, + tex->surface.cmask_size, 0xCCCCCCCC); + } + if (tex->surface.htile_offset) { + uint32_t clear_value = 0; + + if (sscreen->info.chip_class >= GFX9 || tex->tc_compatible_htile) + clear_value = 0x0000030F; + + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.htile_offset, + tex->surface.htile_size, clear_value); + } + + /* Initialize DCC only if the texture is not being imported. */ + if (!(surface->flags & RADEON_SURF_IMPORTED) && tex->surface.dcc_offset) { + /* Clear DCC to black for all tiles with DCC enabled. + * + * This fixes corruption in 3DMark Slingshot Extreme, which + * uses uninitialized textures, causing corruption. + */ + if (tex->surface.num_dcc_levels == tex->buffer.b.b.last_level + 1 && + tex->buffer.b.b.nr_samples <= 2) { + /* Simple case - all tiles have DCC enabled. */ + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset, + tex->surface.dcc_size, DCC_CLEAR_COLOR_0000); + } else if (sscreen->info.chip_class >= GFX9) { + /* Clear to uncompressed. Clearing this to black is complicated. */ + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset, + tex->surface.dcc_size, DCC_UNCOMPRESSED); + } else { + /* GFX8: Initialize mipmap levels and multisamples separately. */ + if (tex->buffer.b.b.nr_samples >= 2) { + /* Clearing this to black is complicated. */ + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset, + tex->surface.dcc_size, DCC_UNCOMPRESSED); + } else { + /* Clear the enabled mipmap levels to black. 
*/ + unsigned size = 0; + + for (unsigned i = 0; i < tex->surface.num_dcc_levels; i++) { + if (!tex->surface.u.legacy.level[i].dcc_fast_clear_size) + break; + + size = tex->surface.u.legacy.level[i].dcc_offset + + tex->surface.u.legacy.level[i].dcc_fast_clear_size; + } + + /* Mipmap levels with DCC. */ + if (size) { + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset, size, + DCC_CLEAR_COLOR_0000); + } + /* Mipmap levels without DCC. */ + if (size != tex->surface.dcc_size) { + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset + size, + tex->surface.dcc_size - size, DCC_UNCOMPRESSED); + } + } + } + + /* Initialize displayable DCC that requires the retile blit. */ + if (tex->surface.dcc_retile_map_offset) { + /* Uninitialized DCC can hang the display hw. + * Clear to white to indicate that. */ + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.display_dcc_offset, + tex->surface.u.gfx9.display_dcc_size, DCC_CLEAR_COLOR_1111); + + /* Upload the DCC retile map. + * Use a staging buffer for the upload, because + * the buffer backing the texture is unmappable. + */ + bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16; + unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements; + struct si_resource *buf = si_aligned_buffer_create(screen, 0, PIPE_USAGE_STREAM, + num_elements * (use_uint16 ? 2 : 4), + sscreen->info.tcc_cache_line_size); + uint32_t *ui = (uint32_t *)sscreen->ws->buffer_map(buf->buf, NULL, PIPE_TRANSFER_WRITE); + uint16_t *us = (uint16_t *)ui; + + /* Upload the retile map into a staging buffer. */ + if (use_uint16) { + for (unsigned i = 0; i < num_elements; i++) + us[i] = tex->surface.u.gfx9.dcc_retile_map[i]; + } else { + for (unsigned i = 0; i < num_elements; i++) + ui[i] = tex->surface.u.gfx9.dcc_retile_map[i]; + } + + /* Copy the staging buffer to the buffer backing the texture. */ + struct si_context *sctx = (struct si_context *)sscreen->aux_context; + + assert(tex->surface.dcc_retile_map_offset <= UINT_MAX); + simple_mtx_lock(&sscreen->aux_context_lock); + si_sdma_copy_buffer(sctx, &tex->buffer.b.b, &buf->b.b, tex->surface.dcc_retile_map_offset, + 0, buf->b.b.width0); + sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); + simple_mtx_unlock(&sscreen->aux_context_lock); + + si_resource_reference(&buf, NULL); + } + } + + /* Initialize the CMASK base register value. */ + tex->cmask_base_address_reg = (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8; + + if (sscreen->debug_flags & DBG(VM)) { + fprintf(stderr, + "VM start=0x%" PRIX64 " end=0x%" PRIX64 + " | Texture %ix%ix%i, %i levels, %i samples, %s\n", + tex->buffer.gpu_address, tex->buffer.gpu_address + tex->buffer.buf->size, + base->width0, base->height0, util_num_layers(base, 0), base->last_level + 1, + base->nr_samples ? 
base->nr_samples : 1, util_format_short_name(base->format)); + } + + if (sscreen->debug_flags & DBG(TEX)) { + puts("Texture:"); + struct u_log_context log; + u_log_context_init(&log); + si_print_texture_info(sscreen, tex, &log); + u_log_new_page_print(&log, stdout); + fflush(stdout); + u_log_context_destroy(&log); + } + + return tex; error: - FREE(tex); - if (sscreen->info.chip_class >= GFX9) - free(surface->u.gfx9.dcc_retile_map); - return NULL; + FREE(tex); + if (sscreen->info.chip_class >= GFX9) + free(surface->u.gfx9.dcc_retile_map); + return NULL; } -static enum radeon_surf_mode -si_choose_tiling(struct si_screen *sscreen, - const struct pipe_resource *templ, bool tc_compatible_htile) +static enum radeon_surf_mode si_choose_tiling(struct si_screen *sscreen, + const struct pipe_resource *templ, + bool tc_compatible_htile) { - const struct util_format_description *desc = util_format_description(templ->format); - bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING; - bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) && - !(templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH); - - /* MSAA resources must be 2D tiled. */ - if (templ->nr_samples > 1) - return RADEON_SURF_MODE_2D; - - /* Transfer resources should be linear. */ - if (templ->flags & SI_RESOURCE_FLAG_TRANSFER) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Avoid Z/S decompress blits by forcing TC-compatible HTILE on GFX8, - * which requires 2D tiling. - */ - if (sscreen->info.chip_class == GFX8 && tc_compatible_htile) - return RADEON_SURF_MODE_2D; - - /* Handle common candidates for the linear mode. - * Compressed textures and DB surfaces must always be tiled. - */ - if (!force_tiling && - !is_depth_stencil && - !util_format_is_compressed(templ->format)) { - if (sscreen->debug_flags & DBG(NO_TILING)) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Tiling doesn't work with the 422 (SUBSAMPLED) formats. */ - if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Cursors are linear on AMD GCN. - * (XXX double-check, maybe also use RADEON_SURF_SCANOUT) */ - if (templ->bind & PIPE_BIND_CURSOR) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - if (templ->bind & PIPE_BIND_LINEAR) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Textures with a very small height are recommended to be linear. */ - if (templ->target == PIPE_TEXTURE_1D || - templ->target == PIPE_TEXTURE_1D_ARRAY || - /* Only very thin and long 2D textures should benefit from - * linear_aligned. */ - (templ->width0 > 8 && templ->height0 <= 2)) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Textures likely to be mapped often. */ - if (templ->usage == PIPE_USAGE_STAGING || - templ->usage == PIPE_USAGE_STREAM) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - } - - /* Make small textures 1D tiled. */ - if (templ->width0 <= 16 || templ->height0 <= 16 || - (sscreen->debug_flags & DBG(NO_2D_TILING))) - return RADEON_SURF_MODE_1D; - - /* The allocator will switch to 1D if needed. */ - return RADEON_SURF_MODE_2D; + const struct util_format_description *desc = util_format_description(templ->format); + bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING; + bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) && + !(templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH); + + /* MSAA resources must be 2D tiled. */ + if (templ->nr_samples > 1) + return RADEON_SURF_MODE_2D; + + /* Transfer resources should be linear. 
*/ + if (templ->flags & SI_RESOURCE_FLAG_TRANSFER) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Avoid Z/S decompress blits by forcing TC-compatible HTILE on GFX8, + * which requires 2D tiling. + */ + if (sscreen->info.chip_class == GFX8 && tc_compatible_htile) + return RADEON_SURF_MODE_2D; + + /* Handle common candidates for the linear mode. + * Compressed textures and DB surfaces must always be tiled. + */ + if (!force_tiling && !is_depth_stencil && !util_format_is_compressed(templ->format)) { + if (sscreen->debug_flags & DBG(NO_TILING)) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Tiling doesn't work with the 422 (SUBSAMPLED) formats. */ + if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Cursors are linear on AMD GCN. + * (XXX double-check, maybe also use RADEON_SURF_SCANOUT) */ + if (templ->bind & PIPE_BIND_CURSOR) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + if (templ->bind & PIPE_BIND_LINEAR) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Textures with a very small height are recommended to be linear. */ + if (templ->target == PIPE_TEXTURE_1D || templ->target == PIPE_TEXTURE_1D_ARRAY || + /* Only very thin and long 2D textures should benefit from + * linear_aligned. */ + (templ->width0 > 8 && templ->height0 <= 2)) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Textures likely to be mapped often. */ + if (templ->usage == PIPE_USAGE_STAGING || templ->usage == PIPE_USAGE_STREAM) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + } + + /* Make small textures 1D tiled. */ + if (templ->width0 <= 16 || templ->height0 <= 16 || (sscreen->debug_flags & DBG(NO_2D_TILING))) + return RADEON_SURF_MODE_1D; + + /* The allocator will switch to 1D if needed. */ + return RADEON_SURF_MODE_2D; } struct pipe_resource *si_texture_create(struct pipe_screen *screen, - const struct pipe_resource *templ) + const struct pipe_resource *templ) { - struct si_screen *sscreen = (struct si_screen*)screen; - bool is_zs = util_format_is_depth_or_stencil(templ->format); - - if (templ->nr_samples >= 2) { - /* This is hackish (overwriting the const pipe_resource template), - * but should be harmless and state trackers can also see - * the overriden number of samples in the created pipe_resource. - */ - if (is_zs && sscreen->eqaa_force_z_samples) { - ((struct pipe_resource*)templ)->nr_samples = - ((struct pipe_resource*)templ)->nr_storage_samples = - sscreen->eqaa_force_z_samples; - } else if (!is_zs && sscreen->eqaa_force_color_samples) { - ((struct pipe_resource*)templ)->nr_samples = - sscreen->eqaa_force_coverage_samples; - ((struct pipe_resource*)templ)->nr_storage_samples = - sscreen->eqaa_force_color_samples; - } - } - - bool is_flushed_depth = templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH || - templ->flags & SI_RESOURCE_FLAG_TRANSFER; - bool tc_compatible_htile = - sscreen->info.chip_class >= GFX8 && - /* There are issues with TC-compatible HTILE on Tonga (and - * Iceland is the same design), and documented bug workarounds - * don't help. 
For example, this fails: - * piglit/bin/tex-miplevel-selection 'texture()' 2DShadow -auto - */ - sscreen->info.family != CHIP_TONGA && - sscreen->info.family != CHIP_ICELAND && - (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) && - !(sscreen->debug_flags & DBG(NO_HYPERZ)) && - !is_flushed_depth && - templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */ - is_zs; - enum radeon_surf_mode tile_mode = si_choose_tiling(sscreen, templ, - tc_compatible_htile); - - /* This allocates textures with multiple planes like NV12 in 1 buffer. */ - enum { SI_TEXTURE_MAX_PLANES = 3 }; - struct radeon_surf surface[SI_TEXTURE_MAX_PLANES] = {}; - struct pipe_resource plane_templ[SI_TEXTURE_MAX_PLANES]; - uint64_t plane_offset[SI_TEXTURE_MAX_PLANES] = {}; - uint64_t total_size = 0; - unsigned max_alignment = 0; - unsigned num_planes = util_format_get_num_planes(templ->format); - assert(num_planes <= SI_TEXTURE_MAX_PLANES); - - /* Compute texture or plane layouts and offsets. */ - for (unsigned i = 0; i < num_planes; i++) { - plane_templ[i] = *templ; - plane_templ[i].format = util_format_get_plane_format(templ->format, i); - plane_templ[i].width0 = util_format_get_plane_width(templ->format, i, templ->width0); - plane_templ[i].height0 = util_format_get_plane_height(templ->format, i, templ->height0); - - /* Multi-plane allocations need PIPE_BIND_SHARED, because we can't - * reallocate the storage to add PIPE_BIND_SHARED, because it's - * shared by 3 pipe_resources. - */ - if (num_planes > 1) - plane_templ[i].bind |= PIPE_BIND_SHARED; - - if (si_init_surface(sscreen, &surface[i], &plane_templ[i], - tile_mode, 0, false, - plane_templ[i].bind & PIPE_BIND_SCANOUT, - is_flushed_depth, tc_compatible_htile)) - return NULL; - - plane_offset[i] = align64(total_size, surface[i].surf_alignment); - total_size = plane_offset[i] + surface[i].total_size; - max_alignment = MAX2(max_alignment, surface[i].surf_alignment); - } - - struct si_texture *plane0 = NULL, *last_plane = NULL; - - for (unsigned i = 0; i < num_planes; i++) { - struct si_texture *tex = - si_texture_create_object(screen, &plane_templ[i], &surface[i], - plane0, NULL, plane_offset[i], - total_size, max_alignment); - if (!tex) { - si_texture_reference(&plane0, NULL); - return NULL; - } - - tex->plane_index = i; - tex->num_planes = num_planes; - - if (!plane0) { - plane0 = last_plane = tex; - } else { - last_plane->buffer.b.b.next = &tex->buffer.b.b; - last_plane = tex; - } - } - - return (struct pipe_resource *)plane0; + struct si_screen *sscreen = (struct si_screen *)screen; + bool is_zs = util_format_is_depth_or_stencil(templ->format); + + if (templ->nr_samples >= 2) { + /* This is hackish (overwriting the const pipe_resource template), + * but should be harmless and state trackers can also see + * the overriden number of samples in the created pipe_resource. 
+ */ + if (is_zs && sscreen->eqaa_force_z_samples) { + ((struct pipe_resource *)templ)->nr_samples = + ((struct pipe_resource *)templ)->nr_storage_samples = sscreen->eqaa_force_z_samples; + } else if (!is_zs && sscreen->eqaa_force_color_samples) { + ((struct pipe_resource *)templ)->nr_samples = sscreen->eqaa_force_coverage_samples; + ((struct pipe_resource *)templ)->nr_storage_samples = sscreen->eqaa_force_color_samples; + } + } + + bool is_flushed_depth = + templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH || templ->flags & SI_RESOURCE_FLAG_TRANSFER; + bool tc_compatible_htile = + sscreen->info.chip_class >= GFX8 && + /* There are issues with TC-compatible HTILE on Tonga (and + * Iceland is the same design), and documented bug workarounds + * don't help. For example, this fails: + * piglit/bin/tex-miplevel-selection 'texture()' 2DShadow -auto + */ + sscreen->info.family != CHIP_TONGA && sscreen->info.family != CHIP_ICELAND && + (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) && + !(sscreen->debug_flags & DBG(NO_HYPERZ)) && !is_flushed_depth && + templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */ + is_zs; + enum radeon_surf_mode tile_mode = si_choose_tiling(sscreen, templ, tc_compatible_htile); + + /* This allocates textures with multiple planes like NV12 in 1 buffer. */ + enum + { + SI_TEXTURE_MAX_PLANES = 3 + }; + struct radeon_surf surface[SI_TEXTURE_MAX_PLANES] = {}; + struct pipe_resource plane_templ[SI_TEXTURE_MAX_PLANES]; + uint64_t plane_offset[SI_TEXTURE_MAX_PLANES] = {}; + uint64_t total_size = 0; + unsigned max_alignment = 0; + unsigned num_planes = util_format_get_num_planes(templ->format); + assert(num_planes <= SI_TEXTURE_MAX_PLANES); + + /* Compute texture or plane layouts and offsets. */ + for (unsigned i = 0; i < num_planes; i++) { + plane_templ[i] = *templ; + plane_templ[i].format = util_format_get_plane_format(templ->format, i); + plane_templ[i].width0 = util_format_get_plane_width(templ->format, i, templ->width0); + plane_templ[i].height0 = util_format_get_plane_height(templ->format, i, templ->height0); + + /* Multi-plane allocations need PIPE_BIND_SHARED, because we can't + * reallocate the storage to add PIPE_BIND_SHARED, because it's + * shared by 3 pipe_resources. 
+ */ + if (num_planes > 1) + plane_templ[i].bind |= PIPE_BIND_SHARED; + + if (si_init_surface(sscreen, &surface[i], &plane_templ[i], tile_mode, 0, false, + plane_templ[i].bind & PIPE_BIND_SCANOUT, is_flushed_depth, + tc_compatible_htile)) + return NULL; + + plane_offset[i] = align64(total_size, surface[i].surf_alignment); + total_size = plane_offset[i] + surface[i].total_size; + max_alignment = MAX2(max_alignment, surface[i].surf_alignment); + } + + struct si_texture *plane0 = NULL, *last_plane = NULL; + + for (unsigned i = 0; i < num_planes; i++) { + struct si_texture *tex = + si_texture_create_object(screen, &plane_templ[i], &surface[i], plane0, NULL, + plane_offset[i], total_size, max_alignment); + if (!tex) { + si_texture_reference(&plane0, NULL); + return NULL; + } + + tex->plane_index = i; + tex->num_planes = num_planes; + + if (!plane0) { + plane0 = last_plane = tex; + } else { + last_plane->buffer.b.b.next = &tex->buffer.b.b; + last_plane = tex; + } + } + + return (struct pipe_resource *)plane0; } static struct pipe_resource *si_texture_from_winsys_buffer(struct si_screen *sscreen, - const struct pipe_resource *templ, - struct pb_buffer *buf, - unsigned stride, - unsigned offset, - unsigned usage, - bool dedicated) + const struct pipe_resource *templ, + struct pb_buffer *buf, unsigned stride, + unsigned offset, unsigned usage, + bool dedicated) { - enum radeon_surf_mode array_mode; - struct radeon_surf surface = {}; - struct radeon_bo_metadata metadata = {}; - struct si_texture *tex; - bool is_scanout; - int r; - - /* Ignore metadata for non-zero planes. */ - if (offset != 0) - dedicated = false; - - if (dedicated) { - sscreen->ws->buffer_get_metadata(buf, &metadata); - si_get_display_metadata(sscreen, &surface, &metadata, - &array_mode, &is_scanout); - } else { - /** - * The bo metadata is unset for un-dedicated images. So we fall - * back to linear. See answer to question 5 of the - * VK_KHX_external_memory spec for some details. - * - * It is possible that this case isn't going to work if the - * surface pitch isn't correctly aligned by default. - * - * In order to support it correctly we require multi-image - * metadata to be syncrhonized between radv and radeonsi. The - * semantics of associating multiple image metadata to a memory - * object on the vulkan export side are not concretely defined - * either. - * - * All the use cases we are aware of at the moment for memory - * objects use dedicated allocations. So lets keep the initial - * implementation simple. - * - * A possible alternative is to attempt to reconstruct the - * tiling information when the TexParameter TEXTURE_TILING_EXT - * is set. - */ - array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; - is_scanout = false; - } - - r = si_init_surface(sscreen, &surface, templ, - array_mode, stride, true, is_scanout, - false, false); - if (r) - return NULL; - - tex = si_texture_create_object(&sscreen->b, templ, &surface, NULL, buf, - offset, 0, 0); - if (!tex) - return NULL; - - tex->buffer.b.is_shared = true; - tex->buffer.external_usage = usage; - tex->num_planes = 1; - - if (!si_read_tex_bo_metadata(sscreen, tex, offset, &metadata)) { - si_texture_reference(&tex, NULL); - return NULL; - } - - /* Displayable DCC requires an explicit flush. */ - if (dedicated && offset == 0 && - !(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && - si_has_displayable_dcc(tex)) { - /* TODO: do we need to decompress DCC? */ - if (si_texture_discard_dcc(sscreen, tex)) { - /* Update BO metadata after disabling DCC. 
*/ - si_set_tex_bo_metadata(sscreen, tex); - } - } - - assert(tex->surface.tile_swizzle == 0); - return &tex->buffer.b.b; + enum radeon_surf_mode array_mode; + struct radeon_surf surface = {}; + struct radeon_bo_metadata metadata = {}; + struct si_texture *tex; + bool is_scanout; + int r; + + /* Ignore metadata for non-zero planes. */ + if (offset != 0) + dedicated = false; + + if (dedicated) { + sscreen->ws->buffer_get_metadata(buf, &metadata); + si_get_display_metadata(sscreen, &surface, &metadata, &array_mode, &is_scanout); + } else { + /** + * The bo metadata is unset for un-dedicated images. So we fall + * back to linear. See answer to question 5 of the + * VK_KHX_external_memory spec for some details. + * + * It is possible that this case isn't going to work if the + * surface pitch isn't correctly aligned by default. + * + * In order to support it correctly we require multi-image + * metadata to be syncrhonized between radv and radeonsi. The + * semantics of associating multiple image metadata to a memory + * object on the vulkan export side are not concretely defined + * either. + * + * All the use cases we are aware of at the moment for memory + * objects use dedicated allocations. So lets keep the initial + * implementation simple. + * + * A possible alternative is to attempt to reconstruct the + * tiling information when the TexParameter TEXTURE_TILING_EXT + * is set. + */ + array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + is_scanout = false; + } + + r = + si_init_surface(sscreen, &surface, templ, array_mode, stride, true, is_scanout, false, false); + if (r) + return NULL; + + tex = si_texture_create_object(&sscreen->b, templ, &surface, NULL, buf, offset, 0, 0); + if (!tex) + return NULL; + + tex->buffer.b.is_shared = true; + tex->buffer.external_usage = usage; + tex->num_planes = 1; + + if (!si_read_tex_bo_metadata(sscreen, tex, offset, &metadata)) { + si_texture_reference(&tex, NULL); + return NULL; + } + + /* Displayable DCC requires an explicit flush. */ + if (dedicated && offset == 0 && !(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && + si_has_displayable_dcc(tex)) { + /* TODO: do we need to decompress DCC? */ + if (si_texture_discard_dcc(sscreen, tex)) { + /* Update BO metadata after disabling DCC. 
*/ + si_set_tex_bo_metadata(sscreen, tex); + } + } + + assert(tex->surface.tile_swizzle == 0); + return &tex->buffer.b.b; } static struct pipe_resource *si_texture_from_handle(struct pipe_screen *screen, - const struct pipe_resource *templ, - struct winsys_handle *whandle, - unsigned usage) + const struct pipe_resource *templ, + struct winsys_handle *whandle, unsigned usage) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct pb_buffer *buf = NULL; - - /* Support only 2D textures without mipmaps */ - if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT && - templ->target != PIPE_TEXTURE_2D_ARRAY) || - templ->last_level != 0) - return NULL; - - buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, - sscreen->info.max_alignment); - if (!buf) - return NULL; - - return si_texture_from_winsys_buffer(sscreen, templ, buf, - whandle->stride, whandle->offset, - usage, true); + struct si_screen *sscreen = (struct si_screen *)screen; + struct pb_buffer *buf = NULL; + + /* Support only 2D textures without mipmaps */ + if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT && + templ->target != PIPE_TEXTURE_2D_ARRAY) || + templ->last_level != 0) + return NULL; + + buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, sscreen->info.max_alignment); + if (!buf) + return NULL; + + return si_texture_from_winsys_buffer(sscreen, templ, buf, whandle->stride, whandle->offset, + usage, true); } -bool si_init_flushed_depth_texture(struct pipe_context *ctx, - struct pipe_resource *texture) +bool si_init_flushed_depth_texture(struct pipe_context *ctx, struct pipe_resource *texture) { - struct si_texture *tex = (struct si_texture*)texture; - struct pipe_resource resource; - enum pipe_format pipe_format = texture->format; - - assert(!tex->flushed_depth_texture); - - if (!tex->can_sample_z && tex->can_sample_s) { - switch (pipe_format) { - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - /* Save memory by not allocating the S plane. */ - pipe_format = PIPE_FORMAT_Z32_FLOAT; - break; - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - /* Save memory bandwidth by not copying the - * stencil part during flush. - * - * This potentially increases memory bandwidth - * if an application uses both Z and S texturing - * simultaneously (a flushed Z24S8 texture - * would be stored compactly), but how often - * does that really happen? - */ - pipe_format = PIPE_FORMAT_Z24X8_UNORM; - break; - default:; - } - } else if (!tex->can_sample_s && tex->can_sample_z) { - assert(util_format_has_stencil(util_format_description(pipe_format))); - - /* DB->CB copies to an 8bpp surface don't work. 
*/ - pipe_format = PIPE_FORMAT_X24S8_UINT; - } - - memset(&resource, 0, sizeof(resource)); - resource.target = texture->target; - resource.format = pipe_format; - resource.width0 = texture->width0; - resource.height0 = texture->height0; - resource.depth0 = texture->depth0; - resource.array_size = texture->array_size; - resource.last_level = texture->last_level; - resource.nr_samples = texture->nr_samples; - resource.usage = PIPE_USAGE_DEFAULT; - resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL; - resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH; - - tex->flushed_depth_texture = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource); - if (!tex->flushed_depth_texture) { - PRINT_ERR("failed to create temporary texture to hold flushed depth\n"); - return false; - } - return true; + struct si_texture *tex = (struct si_texture *)texture; + struct pipe_resource resource; + enum pipe_format pipe_format = texture->format; + + assert(!tex->flushed_depth_texture); + + if (!tex->can_sample_z && tex->can_sample_s) { + switch (pipe_format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + /* Save memory by not allocating the S plane. */ + pipe_format = PIPE_FORMAT_Z32_FLOAT; + break; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + /* Save memory bandwidth by not copying the + * stencil part during flush. + * + * This potentially increases memory bandwidth + * if an application uses both Z and S texturing + * simultaneously (a flushed Z24S8 texture + * would be stored compactly), but how often + * does that really happen? + */ + pipe_format = PIPE_FORMAT_Z24X8_UNORM; + break; + default:; + } + } else if (!tex->can_sample_s && tex->can_sample_z) { + assert(util_format_has_stencil(util_format_description(pipe_format))); + + /* DB->CB copies to an 8bpp surface don't work. */ + pipe_format = PIPE_FORMAT_X24S8_UINT; + } + + memset(&resource, 0, sizeof(resource)); + resource.target = texture->target; + resource.format = pipe_format; + resource.width0 = texture->width0; + resource.height0 = texture->height0; + resource.depth0 = texture->depth0; + resource.array_size = texture->array_size; + resource.last_level = texture->last_level; + resource.nr_samples = texture->nr_samples; + resource.usage = PIPE_USAGE_DEFAULT; + resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL; + resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH; + + tex->flushed_depth_texture = + (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource); + if (!tex->flushed_depth_texture) { + PRINT_ERR("failed to create temporary texture to hold flushed depth\n"); + return false; + } + return true; } /** @@ -1905,836 +1763,759 @@ bool si_init_flushed_depth_texture(struct pipe_context *ctx, * which is supposed to hold a subregion of the texture "orig" at the given * mipmap level. */ -static void si_init_temp_resource_from_box(struct pipe_resource *res, - struct pipe_resource *orig, - const struct pipe_box *box, - unsigned level, unsigned flags) +static void si_init_temp_resource_from_box(struct pipe_resource *res, struct pipe_resource *orig, + const struct pipe_box *box, unsigned level, + unsigned flags) { - memset(res, 0, sizeof(*res)); - res->format = orig->format; - res->width0 = box->width; - res->height0 = box->height; - res->depth0 = 1; - res->array_size = 1; - res->usage = flags & SI_RESOURCE_FLAG_TRANSFER ? 
PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT; - res->flags = flags; - - if (flags & SI_RESOURCE_FLAG_TRANSFER && - util_format_is_compressed(orig->format)) { - /* Transfer resources are allocated with linear tiling, which is - * not supported for compressed formats. - */ - unsigned blocksize = - util_format_get_blocksize(orig->format); - - if (blocksize == 8) { - res->format = PIPE_FORMAT_R16G16B16A16_UINT; - } else { - assert(blocksize == 16); - res->format = PIPE_FORMAT_R32G32B32A32_UINT; - } - - res->width0 = util_format_get_nblocksx(orig->format, box->width); - res->height0 = util_format_get_nblocksy(orig->format, box->height); - } - - /* We must set the correct texture target and dimensions for a 3D box. */ - if (box->depth > 1 && util_max_layer(orig, level) > 0) { - res->target = PIPE_TEXTURE_2D_ARRAY; - res->array_size = box->depth; - } else { - res->target = PIPE_TEXTURE_2D; - } + memset(res, 0, sizeof(*res)); + res->format = orig->format; + res->width0 = box->width; + res->height0 = box->height; + res->depth0 = 1; + res->array_size = 1; + res->usage = flags & SI_RESOURCE_FLAG_TRANSFER ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT; + res->flags = flags; + + if (flags & SI_RESOURCE_FLAG_TRANSFER && util_format_is_compressed(orig->format)) { + /* Transfer resources are allocated with linear tiling, which is + * not supported for compressed formats. + */ + unsigned blocksize = util_format_get_blocksize(orig->format); + + if (blocksize == 8) { + res->format = PIPE_FORMAT_R16G16B16A16_UINT; + } else { + assert(blocksize == 16); + res->format = PIPE_FORMAT_R32G32B32A32_UINT; + } + + res->width0 = util_format_get_nblocksx(orig->format, box->width); + res->height0 = util_format_get_nblocksy(orig->format, box->height); + } + + /* We must set the correct texture target and dimensions for a 3D box. */ + if (box->depth > 1 && util_max_layer(orig, level) > 0) { + res->target = PIPE_TEXTURE_2D_ARRAY; + res->array_size = box->depth; + } else { + res->target = PIPE_TEXTURE_2D; + } } -static bool si_can_invalidate_texture(struct si_screen *sscreen, - struct si_texture *tex, - unsigned transfer_usage, - const struct pipe_box *box) +static bool si_can_invalidate_texture(struct si_screen *sscreen, struct si_texture *tex, + unsigned transfer_usage, const struct pipe_box *box) { - return !tex->buffer.b.is_shared && - !(tex->surface.flags & RADEON_SURF_IMPORTED) && - !(transfer_usage & PIPE_TRANSFER_READ) && - tex->buffer.b.b.last_level == 0 && - util_texrange_covers_whole_level(&tex->buffer.b.b, 0, - box->x, box->y, box->z, - box->width, box->height, - box->depth); + return !tex->buffer.b.is_shared && !(tex->surface.flags & RADEON_SURF_IMPORTED) && + !(transfer_usage & PIPE_TRANSFER_READ) && tex->buffer.b.b.last_level == 0 && + util_texrange_covers_whole_level(&tex->buffer.b.b, 0, box->x, box->y, box->z, box->width, + box->height, box->depth); } -static void si_texture_invalidate_storage(struct si_context *sctx, - struct si_texture *tex) +static void si_texture_invalidate_storage(struct si_context *sctx, struct si_texture *tex) { - struct si_screen *sscreen = sctx->screen; + struct si_screen *sscreen = sctx->screen; - /* There is no point in discarding depth and tiled buffers. */ - assert(!tex->is_depth); - assert(tex->surface.is_linear); + /* There is no point in discarding depth and tiled buffers. */ + assert(!tex->is_depth); + assert(tex->surface.is_linear); - /* Reallocate the buffer in the same pipe_resource. 
*/ - si_alloc_resource(sscreen, &tex->buffer); + /* Reallocate the buffer in the same pipe_resource. */ + si_alloc_resource(sscreen, &tex->buffer); - /* Initialize the CMASK base address (needed even without CMASK). */ - tex->cmask_base_address_reg = - (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8; + /* Initialize the CMASK base address (needed even without CMASK). */ + tex->cmask_base_address_reg = (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8; - p_atomic_inc(&sscreen->dirty_tex_counter); + p_atomic_inc(&sscreen->dirty_tex_counter); - sctx->num_alloc_tex_transfer_bytes += tex->surface.total_size; + sctx->num_alloc_tex_transfer_bytes += tex->surface.total_size; } -static void *si_texture_transfer_map(struct pipe_context *ctx, - struct pipe_resource *texture, - unsigned level, - unsigned usage, - const struct pipe_box *box, - struct pipe_transfer **ptransfer) +static void *si_texture_transfer_map(struct pipe_context *ctx, struct pipe_resource *texture, + unsigned level, unsigned usage, const struct pipe_box *box, + struct pipe_transfer **ptransfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_texture *tex = (struct si_texture*)texture; - struct si_transfer *trans; - struct si_resource *buf; - unsigned offset = 0; - char *map; - bool use_staging_texture = false; - - assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER)); - assert(box->width && box->height && box->depth); - - if (tex->is_depth) { - /* Depth textures use staging unconditionally. */ - use_staging_texture = true; - } else { - /* Degrade the tile mode if we get too many transfers on APUs. - * On dGPUs, the staging texture is always faster. - * Only count uploads that are at least 4x4 pixels large. - */ - if (!sctx->screen->info.has_dedicated_vram && - level == 0 && - box->width >= 4 && box->height >= 4 && - p_atomic_inc_return(&tex->num_level0_transfers) == 10) { - bool can_invalidate = - si_can_invalidate_texture(sctx->screen, tex, - usage, box); - - si_reallocate_texture_inplace(sctx, tex, - PIPE_BIND_LINEAR, - can_invalidate); - } - - /* Tiled textures need to be converted into a linear texture for CPU - * access. The staging texture is always linear and is placed in GART. - * - * Reading from VRAM or GTT WC is slow, always use the staging - * texture in this case. - * - * Use the staging texture for uploads if the underlying BO - * is busy. - */ - if (!tex->surface.is_linear) - use_staging_texture = true; - else if (usage & PIPE_TRANSFER_READ) - use_staging_texture = - tex->buffer.domains & RADEON_DOMAIN_VRAM || - tex->buffer.flags & RADEON_FLAG_GTT_WC; - /* Write & linear only: */ - else if (si_rings_is_buffer_referenced(sctx, tex->buffer.buf, - RADEON_USAGE_READWRITE) || - !sctx->ws->buffer_wait(tex->buffer.buf, 0, - RADEON_USAGE_READWRITE)) { - /* It's busy. */ - if (si_can_invalidate_texture(sctx->screen, tex, - usage, box)) - si_texture_invalidate_storage(sctx, tex); - else - use_staging_texture = true; - } - } - - trans = CALLOC_STRUCT(si_transfer); - if (!trans) - return NULL; - pipe_resource_reference(&trans->b.b.resource, texture); - trans->b.b.level = level; - trans->b.b.usage = usage; - trans->b.b.box = *box; - - if (use_staging_texture) { - struct pipe_resource resource; - struct si_texture *staging; - - si_init_temp_resource_from_box(&resource, texture, box, level, - SI_RESOURCE_FLAG_TRANSFER); - resource.usage = (usage & PIPE_TRANSFER_READ) ? 
- PIPE_USAGE_STAGING : PIPE_USAGE_STREAM; - - /* Since depth-stencil textures don't support linear tiling, - * blit from ZS to color and vice versa. u_blitter will do - * the packing for these formats. - */ - if (tex->is_depth) - resource.format = util_blitter_get_color_format_for_zs(resource.format); - - /* Create the temporary texture. */ - staging = (struct si_texture*)ctx->screen->resource_create(ctx->screen, &resource); - if (!staging) { - PRINT_ERR("failed to create temporary texture to hold untiled copy\n"); - goto fail_trans; - } - trans->staging = &staging->buffer; - - /* Just get the strides. */ - si_texture_get_offset(sctx->screen, staging, 0, NULL, - &trans->b.b.stride, - &trans->b.b.layer_stride); - - if (usage & PIPE_TRANSFER_READ) - si_copy_to_staging_texture(ctx, trans); - else - usage |= PIPE_TRANSFER_UNSYNCHRONIZED; - - buf = trans->staging; - } else { - /* the resource is mapped directly */ - offset = si_texture_get_offset(sctx->screen, tex, level, box, - &trans->b.b.stride, - &trans->b.b.layer_stride); - buf = &tex->buffer; - } - - /* Always unmap texture CPU mappings on 32-bit architectures, so that - * we don't run out of the CPU address space. - */ - if (sizeof(void*) == 4) - usage |= RADEON_TRANSFER_TEMPORARY; - - if (!(map = si_buffer_map_sync_with_rings(sctx, buf, usage))) - goto fail_trans; - - *ptransfer = &trans->b.b; - return map + offset; + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *tex = (struct si_texture *)texture; + struct si_transfer *trans; + struct si_resource *buf; + unsigned offset = 0; + char *map; + bool use_staging_texture = false; + + assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER)); + assert(box->width && box->height && box->depth); + + if (tex->is_depth) { + /* Depth textures use staging unconditionally. */ + use_staging_texture = true; + } else { + /* Degrade the tile mode if we get too many transfers on APUs. + * On dGPUs, the staging texture is always faster. + * Only count uploads that are at least 4x4 pixels large. + */ + if (!sctx->screen->info.has_dedicated_vram && level == 0 && box->width >= 4 && + box->height >= 4 && p_atomic_inc_return(&tex->num_level0_transfers) == 10) { + bool can_invalidate = si_can_invalidate_texture(sctx->screen, tex, usage, box); + + si_reallocate_texture_inplace(sctx, tex, PIPE_BIND_LINEAR, can_invalidate); + } + + /* Tiled textures need to be converted into a linear texture for CPU + * access. The staging texture is always linear and is placed in GART. + * + * Reading from VRAM or GTT WC is slow, always use the staging + * texture in this case. + * + * Use the staging texture for uploads if the underlying BO + * is busy. + */ + if (!tex->surface.is_linear) + use_staging_texture = true; + else if (usage & PIPE_TRANSFER_READ) + use_staging_texture = + tex->buffer.domains & RADEON_DOMAIN_VRAM || tex->buffer.flags & RADEON_FLAG_GTT_WC; + /* Write & linear only: */ + else if (si_rings_is_buffer_referenced(sctx, tex->buffer.buf, RADEON_USAGE_READWRITE) || + !sctx->ws->buffer_wait(tex->buffer.buf, 0, RADEON_USAGE_READWRITE)) { + /* It's busy. 
*/ + if (si_can_invalidate_texture(sctx->screen, tex, usage, box)) + si_texture_invalidate_storage(sctx, tex); + else + use_staging_texture = true; + } + } + + trans = CALLOC_STRUCT(si_transfer); + if (!trans) + return NULL; + pipe_resource_reference(&trans->b.b.resource, texture); + trans->b.b.level = level; + trans->b.b.usage = usage; + trans->b.b.box = *box; + + if (use_staging_texture) { + struct pipe_resource resource; + struct si_texture *staging; + + si_init_temp_resource_from_box(&resource, texture, box, level, SI_RESOURCE_FLAG_TRANSFER); + resource.usage = (usage & PIPE_TRANSFER_READ) ? PIPE_USAGE_STAGING : PIPE_USAGE_STREAM; + + /* Since depth-stencil textures don't support linear tiling, + * blit from ZS to color and vice versa. u_blitter will do + * the packing for these formats. + */ + if (tex->is_depth) + resource.format = util_blitter_get_color_format_for_zs(resource.format); + + /* Create the temporary texture. */ + staging = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource); + if (!staging) { + PRINT_ERR("failed to create temporary texture to hold untiled copy\n"); + goto fail_trans; + } + trans->staging = &staging->buffer; + + /* Just get the strides. */ + si_texture_get_offset(sctx->screen, staging, 0, NULL, &trans->b.b.stride, + &trans->b.b.layer_stride); + + if (usage & PIPE_TRANSFER_READ) + si_copy_to_staging_texture(ctx, trans); + else + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + + buf = trans->staging; + } else { + /* the resource is mapped directly */ + offset = si_texture_get_offset(sctx->screen, tex, level, box, &trans->b.b.stride, + &trans->b.b.layer_stride); + buf = &tex->buffer; + } + + /* Always unmap texture CPU mappings on 32-bit architectures, so that + * we don't run out of the CPU address space. + */ + if (sizeof(void *) == 4) + usage |= RADEON_TRANSFER_TEMPORARY; + + if (!(map = si_buffer_map_sync_with_rings(sctx, buf, usage))) + goto fail_trans; + + *ptransfer = &trans->b.b; + return map + offset; fail_trans: - si_resource_reference(&trans->staging, NULL); - pipe_resource_reference(&trans->b.b.resource, NULL); - FREE(trans); - return NULL; + si_resource_reference(&trans->staging, NULL); + pipe_resource_reference(&trans->b.b.resource, NULL); + FREE(trans); + return NULL; } -static void si_texture_transfer_unmap(struct pipe_context *ctx, - struct pipe_transfer* transfer) +static void si_texture_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer *transfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_transfer *stransfer = (struct si_transfer*)transfer; - struct pipe_resource *texture = transfer->resource; - struct si_texture *tex = (struct si_texture*)texture; - - /* Always unmap texture CPU mappings on 32-bit architectures, so that - * we don't run out of the CPU address space. - */ - if (sizeof(void*) == 4) { - struct si_resource *buf = - stransfer->staging ? stransfer->staging : &tex->buffer; - - sctx->ws->buffer_unmap(buf->buf); - } - - if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging) - si_copy_from_staging_texture(ctx, stransfer); - - if (stransfer->staging) { - sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size; - si_resource_reference(&stransfer->staging, NULL); - } - - /* Heuristic for {upload, draw, upload, draw, ..}: - * - * Flush the gfx IB if we've allocated too much texture storage. 
- * - * The idea is that we don't want to build IBs that use too much - * memory and put pressure on the kernel memory manager and we also - * want to make temporary and invalidated buffers go idle ASAP to - * decrease the total memory usage or make them reusable. The memory - * usage will be slightly higher than given here because of the buffer - * cache in the winsys. - * - * The result is that the kernel memory manager is never a bottleneck. - */ - if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) { - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - sctx->num_alloc_tex_transfer_bytes = 0; - } - - pipe_resource_reference(&transfer->resource, NULL); - FREE(transfer); + struct si_context *sctx = (struct si_context *)ctx; + struct si_transfer *stransfer = (struct si_transfer *)transfer; + struct pipe_resource *texture = transfer->resource; + struct si_texture *tex = (struct si_texture *)texture; + + /* Always unmap texture CPU mappings on 32-bit architectures, so that + * we don't run out of the CPU address space. + */ + if (sizeof(void *) == 4) { + struct si_resource *buf = stransfer->staging ? stransfer->staging : &tex->buffer; + + sctx->ws->buffer_unmap(buf->buf); + } + + if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging) + si_copy_from_staging_texture(ctx, stransfer); + + if (stransfer->staging) { + sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size; + si_resource_reference(&stransfer->staging, NULL); + } + + /* Heuristic for {upload, draw, upload, draw, ..}: + * + * Flush the gfx IB if we've allocated too much texture storage. + * + * The idea is that we don't want to build IBs that use too much + * memory and put pressure on the kernel memory manager and we also + * want to make temporary and invalidated buffers go idle ASAP to + * decrease the total memory usage or make them reusable. The memory + * usage will be slightly higher than given here because of the buffer + * cache in the winsys. + * + * The result is that the kernel memory manager is never a bottleneck. + */ + if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) { + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + sctx->num_alloc_tex_transfer_bytes = 0; + } + + pipe_resource_reference(&transfer->resource, NULL); + FREE(transfer); } -static const struct u_resource_vtbl si_texture_vtbl = -{ - NULL, /* get_handle */ - si_texture_destroy, /* resource_destroy */ - si_texture_transfer_map, /* transfer_map */ - u_default_transfer_flush_region, /* transfer_flush_region */ - si_texture_transfer_unmap, /* transfer_unmap */ +static const struct u_resource_vtbl si_texture_vtbl = { + NULL, /* get_handle */ + si_texture_destroy, /* resource_destroy */ + si_texture_transfer_map, /* transfer_map */ + u_default_transfer_flush_region, /* transfer_flush_region */ + si_texture_transfer_unmap, /* transfer_unmap */ }; /* Return if it's allowed to reinterpret one format as another with DCC enabled. */ -bool vi_dcc_formats_compatible(struct si_screen *sscreen, - enum pipe_format format1, - enum pipe_format format2) +bool vi_dcc_formats_compatible(struct si_screen *sscreen, enum pipe_format format1, + enum pipe_format format2) { - const struct util_format_description *desc1, *desc2; - - /* No format change - exit early. */ - if (format1 == format2) - return true; - - format1 = si_simplify_cb_format(format1); - format2 = si_simplify_cb_format(format2); - - /* Check again after format adjustments. 
*/ - if (format1 == format2) - return true; - - desc1 = util_format_description(format1); - desc2 = util_format_description(format2); - - if (desc1->layout != UTIL_FORMAT_LAYOUT_PLAIN || - desc2->layout != UTIL_FORMAT_LAYOUT_PLAIN) - return false; - - /* Float and non-float are totally incompatible. */ - if ((desc1->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) != - (desc2->channel[0].type == UTIL_FORMAT_TYPE_FLOAT)) - return false; - - /* Channel sizes must match across DCC formats. - * Comparing just the first 2 channels should be enough. - */ - if (desc1->channel[0].size != desc2->channel[0].size || - (desc1->nr_channels >= 2 && - desc1->channel[1].size != desc2->channel[1].size)) - return false; - - /* Everything below is not needed if the driver never uses the DCC - * clear code with the value of 1. - */ - - /* If the clear values are all 1 or all 0, this constraint can be - * ignored. */ - if (vi_alpha_is_on_msb(sscreen, format1) != vi_alpha_is_on_msb(sscreen, format2)) - return false; - - /* Channel types must match if the clear value of 1 is used. - * The type categories are only float, signed, unsigned. - * NORM and INT are always compatible. - */ - if (desc1->channel[0].type != desc2->channel[0].type || - (desc1->nr_channels >= 2 && - desc1->channel[1].type != desc2->channel[1].type)) - return false; - - return true; + const struct util_format_description *desc1, *desc2; + + /* No format change - exit early. */ + if (format1 == format2) + return true; + + format1 = si_simplify_cb_format(format1); + format2 = si_simplify_cb_format(format2); + + /* Check again after format adjustments. */ + if (format1 == format2) + return true; + + desc1 = util_format_description(format1); + desc2 = util_format_description(format2); + + if (desc1->layout != UTIL_FORMAT_LAYOUT_PLAIN || desc2->layout != UTIL_FORMAT_LAYOUT_PLAIN) + return false; + + /* Float and non-float are totally incompatible. */ + if ((desc1->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) != + (desc2->channel[0].type == UTIL_FORMAT_TYPE_FLOAT)) + return false; + + /* Channel sizes must match across DCC formats. + * Comparing just the first 2 channels should be enough. + */ + if (desc1->channel[0].size != desc2->channel[0].size || + (desc1->nr_channels >= 2 && desc1->channel[1].size != desc2->channel[1].size)) + return false; + + /* Everything below is not needed if the driver never uses the DCC + * clear code with the value of 1. + */ + + /* If the clear values are all 1 or all 0, this constraint can be + * ignored. */ + if (vi_alpha_is_on_msb(sscreen, format1) != vi_alpha_is_on_msb(sscreen, format2)) + return false; + + /* Channel types must match if the clear value of 1 is used. + * The type categories are only float, signed, unsigned. + * NORM and INT are always compatible. 
+ */ + if (desc1->channel[0].type != desc2->channel[0].type || + (desc1->nr_channels >= 2 && desc1->channel[1].type != desc2->channel[1].type)) + return false; + + return true; } -bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, - unsigned level, - enum pipe_format view_format) +bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, unsigned level, + enum pipe_format view_format) { - struct si_texture *stex = (struct si_texture *)tex; + struct si_texture *stex = (struct si_texture *)tex; - return vi_dcc_enabled(stex, level) && - !vi_dcc_formats_compatible((struct si_screen*)tex->screen, - tex->format, view_format); + return vi_dcc_enabled(stex, level) && + !vi_dcc_formats_compatible((struct si_screen *)tex->screen, tex->format, view_format); } /* This can't be merged with the above function, because * vi_dcc_formats_compatible should be called only when DCC is enabled. */ -void vi_disable_dcc_if_incompatible_format(struct si_context *sctx, - struct pipe_resource *tex, - unsigned level, - enum pipe_format view_format) +void vi_disable_dcc_if_incompatible_format(struct si_context *sctx, struct pipe_resource *tex, + unsigned level, enum pipe_format view_format) { - struct si_texture *stex = (struct si_texture *)tex; + struct si_texture *stex = (struct si_texture *)tex; - if (vi_dcc_formats_are_incompatible(tex, level, view_format)) - if (!si_texture_disable_dcc(sctx, stex)) - si_decompress_dcc(sctx, stex); + if (vi_dcc_formats_are_incompatible(tex, level, view_format)) + if (!si_texture_disable_dcc(sctx, stex)) + si_decompress_dcc(sctx, stex); } struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe, - struct pipe_resource *texture, - const struct pipe_surface *templ, - unsigned width0, unsigned height0, - unsigned width, unsigned height) + struct pipe_resource *texture, + const struct pipe_surface *templ, unsigned width0, + unsigned height0, unsigned width, unsigned height) { - struct si_surface *surface = CALLOC_STRUCT(si_surface); - - if (!surface) - return NULL; - - assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level)); - assert(templ->u.tex.last_layer <= util_max_layer(texture, templ->u.tex.level)); - - pipe_reference_init(&surface->base.reference, 1); - pipe_resource_reference(&surface->base.texture, texture); - surface->base.context = pipe; - surface->base.format = templ->format; - surface->base.width = width; - surface->base.height = height; - surface->base.u = templ->u; - - surface->width0 = width0; - surface->height0 = height0; - - surface->dcc_incompatible = - texture->target != PIPE_BUFFER && - vi_dcc_formats_are_incompatible(texture, templ->u.tex.level, - templ->format); - return &surface->base; + struct si_surface *surface = CALLOC_STRUCT(si_surface); + + if (!surface) + return NULL; + + assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level)); + assert(templ->u.tex.last_layer <= util_max_layer(texture, templ->u.tex.level)); + + pipe_reference_init(&surface->base.reference, 1); + pipe_resource_reference(&surface->base.texture, texture); + surface->base.context = pipe; + surface->base.format = templ->format; + surface->base.width = width; + surface->base.height = height; + surface->base.u = templ->u; + + surface->width0 = width0; + surface->height0 = height0; + + surface->dcc_incompatible = + texture->target != PIPE_BUFFER && + vi_dcc_formats_are_incompatible(texture, templ->u.tex.level, templ->format); + return &surface->base; } -static struct pipe_surface *si_create_surface(struct 
pipe_context *pipe, - struct pipe_resource *tex, - const struct pipe_surface *templ) +static struct pipe_surface *si_create_surface(struct pipe_context *pipe, struct pipe_resource *tex, + const struct pipe_surface *templ) { - unsigned level = templ->u.tex.level; - unsigned width = u_minify(tex->width0, level); - unsigned height = u_minify(tex->height0, level); - unsigned width0 = tex->width0; - unsigned height0 = tex->height0; - - if (tex->target != PIPE_BUFFER && templ->format != tex->format) { - const struct util_format_description *tex_desc - = util_format_description(tex->format); - const struct util_format_description *templ_desc - = util_format_description(templ->format); - - assert(tex_desc->block.bits == templ_desc->block.bits); - - /* Adjust size of surface if and only if the block width or - * height is changed. */ - if (tex_desc->block.width != templ_desc->block.width || - tex_desc->block.height != templ_desc->block.height) { - unsigned nblks_x = util_format_get_nblocksx(tex->format, width); - unsigned nblks_y = util_format_get_nblocksy(tex->format, height); - - width = nblks_x * templ_desc->block.width; - height = nblks_y * templ_desc->block.height; - - width0 = util_format_get_nblocksx(tex->format, width0); - height0 = util_format_get_nblocksy(tex->format, height0); - } - } - - return si_create_surface_custom(pipe, tex, templ, - width0, height0, - width, height); + unsigned level = templ->u.tex.level; + unsigned width = u_minify(tex->width0, level); + unsigned height = u_minify(tex->height0, level); + unsigned width0 = tex->width0; + unsigned height0 = tex->height0; + + if (tex->target != PIPE_BUFFER && templ->format != tex->format) { + const struct util_format_description *tex_desc = util_format_description(tex->format); + const struct util_format_description *templ_desc = util_format_description(templ->format); + + assert(tex_desc->block.bits == templ_desc->block.bits); + + /* Adjust size of surface if and only if the block width or + * height is changed. 
*/ + if (tex_desc->block.width != templ_desc->block.width || + tex_desc->block.height != templ_desc->block.height) { + unsigned nblks_x = util_format_get_nblocksx(tex->format, width); + unsigned nblks_y = util_format_get_nblocksy(tex->format, height); + + width = nblks_x * templ_desc->block.width; + height = nblks_y * templ_desc->block.height; + + width0 = util_format_get_nblocksx(tex->format, width0); + height0 = util_format_get_nblocksy(tex->format, height0); + } + } + + return si_create_surface_custom(pipe, tex, templ, width0, height0, width, height); } -static void si_surface_destroy(struct pipe_context *pipe, - struct pipe_surface *surface) +static void si_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surface) { - pipe_resource_reference(&surface->texture, NULL); - FREE(surface); + pipe_resource_reference(&surface->texture, NULL); + FREE(surface); } unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap) { - const struct util_format_description *desc = util_format_description(format); - -#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz) - - if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ - return V_028C70_SWAP_STD; - - if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) - return ~0U; - - switch (desc->nr_channels) { - case 1: - if (HAS_SWIZZLE(0,X)) - return V_028C70_SWAP_STD; /* X___ */ - else if (HAS_SWIZZLE(3,X)) - return V_028C70_SWAP_ALT_REV; /* ___X */ - break; - case 2: - if ((HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,Y)) || - (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,NONE)) || - (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,Y))) - return V_028C70_SWAP_STD; /* XY__ */ - else if ((HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,X)) || - (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,NONE)) || - (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,X))) - /* YX__ */ - return (do_endian_swap ? V_028C70_SWAP_STD : V_028C70_SWAP_STD_REV); - else if (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(3,Y)) - return V_028C70_SWAP_ALT; /* X__Y */ - else if (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(3,X)) - return V_028C70_SWAP_ALT_REV; /* Y__X */ - break; - case 3: - if (HAS_SWIZZLE(0,X)) - return (do_endian_swap ? V_028C70_SWAP_STD_REV : V_028C70_SWAP_STD); - else if (HAS_SWIZZLE(0,Z)) - return V_028C70_SWAP_STD_REV; /* ZYX */ - break; - case 4: - /* check the middle channels, the 1st and 4th channel can be NONE */ - if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) { - return V_028C70_SWAP_STD; /* XYZW */ - } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) { - return V_028C70_SWAP_STD_REV; /* WZYX */ - } else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) { - return V_028C70_SWAP_ALT; /* ZYXW */ - } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,W)) { - /* YZWX */ - if (desc->is_array) - return V_028C70_SWAP_ALT_REV; - else - return (do_endian_swap ? 
V_028C70_SWAP_ALT : V_028C70_SWAP_ALT_REV); - } - break; - } - return ~0U; + const struct util_format_description *desc = util_format_description(format); + +#define HAS_SWIZZLE(chan, swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz) + + if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ + return V_028C70_SWAP_STD; + + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) + return ~0U; + + switch (desc->nr_channels) { + case 1: + if (HAS_SWIZZLE(0, X)) + return V_028C70_SWAP_STD; /* X___ */ + else if (HAS_SWIZZLE(3, X)) + return V_028C70_SWAP_ALT_REV; /* ___X */ + break; + case 2: + if ((HAS_SWIZZLE(0, X) && HAS_SWIZZLE(1, Y)) || (HAS_SWIZZLE(0, X) && HAS_SWIZZLE(1, NONE)) || + (HAS_SWIZZLE(0, NONE) && HAS_SWIZZLE(1, Y))) + return V_028C70_SWAP_STD; /* XY__ */ + else if ((HAS_SWIZZLE(0, Y) && HAS_SWIZZLE(1, X)) || + (HAS_SWIZZLE(0, Y) && HAS_SWIZZLE(1, NONE)) || + (HAS_SWIZZLE(0, NONE) && HAS_SWIZZLE(1, X))) + /* YX__ */ + return (do_endian_swap ? V_028C70_SWAP_STD : V_028C70_SWAP_STD_REV); + else if (HAS_SWIZZLE(0, X) && HAS_SWIZZLE(3, Y)) + return V_028C70_SWAP_ALT; /* X__Y */ + else if (HAS_SWIZZLE(0, Y) && HAS_SWIZZLE(3, X)) + return V_028C70_SWAP_ALT_REV; /* Y__X */ + break; + case 3: + if (HAS_SWIZZLE(0, X)) + return (do_endian_swap ? V_028C70_SWAP_STD_REV : V_028C70_SWAP_STD); + else if (HAS_SWIZZLE(0, Z)) + return V_028C70_SWAP_STD_REV; /* ZYX */ + break; + case 4: + /* check the middle channels, the 1st and 4th channel can be NONE */ + if (HAS_SWIZZLE(1, Y) && HAS_SWIZZLE(2, Z)) { + return V_028C70_SWAP_STD; /* XYZW */ + } else if (HAS_SWIZZLE(1, Z) && HAS_SWIZZLE(2, Y)) { + return V_028C70_SWAP_STD_REV; /* WZYX */ + } else if (HAS_SWIZZLE(1, Y) && HAS_SWIZZLE(2, X)) { + return V_028C70_SWAP_ALT; /* ZYXW */ + } else if (HAS_SWIZZLE(1, Z) && HAS_SWIZZLE(2, W)) { + /* YZWX */ + if (desc->is_array) + return V_028C70_SWAP_ALT_REV; + else + return (do_endian_swap ? V_028C70_SWAP_ALT : V_028C70_SWAP_ALT_REV); + } + break; + } + return ~0U; } /* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */ -static void vi_dcc_clean_up_context_slot(struct si_context *sctx, - int slot) +static void vi_dcc_clean_up_context_slot(struct si_context *sctx, int slot) { - int i; + int i; - if (sctx->dcc_stats[slot].query_active) - vi_separate_dcc_stop_query(sctx, - sctx->dcc_stats[slot].tex); + if (sctx->dcc_stats[slot].query_active) + vi_separate_dcc_stop_query(sctx, sctx->dcc_stats[slot].tex); - for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats[slot].ps_stats); i++) - if (sctx->dcc_stats[slot].ps_stats[i]) { - sctx->b.destroy_query(&sctx->b, - sctx->dcc_stats[slot].ps_stats[i]); - sctx->dcc_stats[slot].ps_stats[i] = NULL; - } + for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats[slot].ps_stats); i++) + if (sctx->dcc_stats[slot].ps_stats[i]) { + sctx->b.destroy_query(&sctx->b, sctx->dcc_stats[slot].ps_stats[i]); + sctx->dcc_stats[slot].ps_stats[i] = NULL; + } - si_texture_reference(&sctx->dcc_stats[slot].tex, NULL); + si_texture_reference(&sctx->dcc_stats[slot].tex, NULL); } /** * Return the per-context slot where DCC statistics queries for the texture live. */ -static unsigned vi_get_context_dcc_stats_index(struct si_context *sctx, - struct si_texture *tex) +static unsigned vi_get_context_dcc_stats_index(struct si_context *sctx, struct si_texture *tex) { - int i, empty_slot = -1; - - /* Remove zombie textures (textures kept alive by this array only). 
*/ - for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) - if (sctx->dcc_stats[i].tex && - sctx->dcc_stats[i].tex->buffer.b.b.reference.count == 1) - vi_dcc_clean_up_context_slot(sctx, i); - - /* Find the texture. */ - for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) { - /* Return if found. */ - if (sctx->dcc_stats[i].tex == tex) { - sctx->dcc_stats[i].last_use_timestamp = os_time_get(); - return i; - } - - /* Record the first seen empty slot. */ - if (empty_slot == -1 && !sctx->dcc_stats[i].tex) - empty_slot = i; - } - - /* Not found. Remove the oldest member to make space in the array. */ - if (empty_slot == -1) { - int oldest_slot = 0; - - /* Find the oldest slot. */ - for (i = 1; i < ARRAY_SIZE(sctx->dcc_stats); i++) - if (sctx->dcc_stats[oldest_slot].last_use_timestamp > - sctx->dcc_stats[i].last_use_timestamp) - oldest_slot = i; - - /* Clean up the oldest slot. */ - vi_dcc_clean_up_context_slot(sctx, oldest_slot); - empty_slot = oldest_slot; - } - - /* Add the texture to the new slot. */ - si_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex); - sctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get(); - return empty_slot; + int i, empty_slot = -1; + + /* Remove zombie textures (textures kept alive by this array only). */ + for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) + if (sctx->dcc_stats[i].tex && sctx->dcc_stats[i].tex->buffer.b.b.reference.count == 1) + vi_dcc_clean_up_context_slot(sctx, i); + + /* Find the texture. */ + for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) { + /* Return if found. */ + if (sctx->dcc_stats[i].tex == tex) { + sctx->dcc_stats[i].last_use_timestamp = os_time_get(); + return i; + } + + /* Record the first seen empty slot. */ + if (empty_slot == -1 && !sctx->dcc_stats[i].tex) + empty_slot = i; + } + + /* Not found. Remove the oldest member to make space in the array. */ + if (empty_slot == -1) { + int oldest_slot = 0; + + /* Find the oldest slot. */ + for (i = 1; i < ARRAY_SIZE(sctx->dcc_stats); i++) + if (sctx->dcc_stats[oldest_slot].last_use_timestamp > + sctx->dcc_stats[i].last_use_timestamp) + oldest_slot = i; + + /* Clean up the oldest slot. */ + vi_dcc_clean_up_context_slot(sctx, oldest_slot); + empty_slot = oldest_slot; + } + + /* Add the texture to the new slot. */ + si_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex); + sctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get(); + return empty_slot; } -static struct pipe_query * -vi_create_resuming_pipestats_query(struct si_context *sctx) +static struct pipe_query *vi_create_resuming_pipestats_query(struct si_context *sctx) { - struct si_query_hw *query = (struct si_query_hw*) - sctx->b.create_query(&sctx->b, PIPE_QUERY_PIPELINE_STATISTICS, 0); + struct si_query_hw *query = + (struct si_query_hw *)sctx->b.create_query(&sctx->b, PIPE_QUERY_PIPELINE_STATISTICS, 0); - query->flags |= SI_QUERY_HW_FLAG_BEGIN_RESUMES; - return (struct pipe_query*)query; + query->flags |= SI_QUERY_HW_FLAG_BEGIN_RESUMES; + return (struct pipe_query *)query; } /** * Called when binding a color buffer. 
*/ -void vi_separate_dcc_start_query(struct si_context *sctx, - struct si_texture *tex) +void vi_separate_dcc_start_query(struct si_context *sctx, struct si_texture *tex) { - unsigned i = vi_get_context_dcc_stats_index(sctx, tex); + unsigned i = vi_get_context_dcc_stats_index(sctx, tex); - assert(!sctx->dcc_stats[i].query_active); + assert(!sctx->dcc_stats[i].query_active); - if (!sctx->dcc_stats[i].ps_stats[0]) - sctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(sctx); + if (!sctx->dcc_stats[i].ps_stats[0]) + sctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(sctx); - /* begin or resume the query */ - sctx->b.begin_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]); - sctx->dcc_stats[i].query_active = true; + /* begin or resume the query */ + sctx->b.begin_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]); + sctx->dcc_stats[i].query_active = true; } /** * Called when unbinding a color buffer. */ -void vi_separate_dcc_stop_query(struct si_context *sctx, - struct si_texture *tex) +void vi_separate_dcc_stop_query(struct si_context *sctx, struct si_texture *tex) { - unsigned i = vi_get_context_dcc_stats_index(sctx, tex); + unsigned i = vi_get_context_dcc_stats_index(sctx, tex); - assert(sctx->dcc_stats[i].query_active); - assert(sctx->dcc_stats[i].ps_stats[0]); + assert(sctx->dcc_stats[i].query_active); + assert(sctx->dcc_stats[i].ps_stats[0]); - /* pause or end the query */ - sctx->b.end_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]); - sctx->dcc_stats[i].query_active = false; + /* pause or end the query */ + sctx->b.end_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]); + sctx->dcc_stats[i].query_active = false; } static bool vi_should_enable_separate_dcc(struct si_texture *tex) { - /* The minimum number of fullscreen draws per frame that is required - * to enable DCC. */ - return tex->ps_draw_ratio + tex->num_slow_clears >= 5; + /* The minimum number of fullscreen draws per frame that is required + * to enable DCC. */ + return tex->ps_draw_ratio + tex->num_slow_clears >= 5; } /* Called by fast clear. */ -void vi_separate_dcc_try_enable(struct si_context *sctx, - struct si_texture *tex) +void vi_separate_dcc_try_enable(struct si_context *sctx, struct si_texture *tex) { - /* The intent is to use this with shared displayable back buffers, - * but it's not strictly limited only to them. - */ - if (!tex->buffer.b.is_shared || - !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) || - tex->buffer.b.b.target != PIPE_TEXTURE_2D || - tex->buffer.b.b.last_level > 0 || - !tex->surface.dcc_size || - sctx->screen->debug_flags & DBG(NO_DCC) || - sctx->screen->debug_flags & DBG(NO_DCC_FB)) - return; - - assert(sctx->chip_class >= GFX8); - - if (tex->surface.dcc_offset) - return; /* already enabled */ - - /* Enable the DCC stat gathering. */ - if (!tex->dcc_gather_statistics) { - tex->dcc_gather_statistics = true; - vi_separate_dcc_start_query(sctx, tex); - } - - if (!vi_should_enable_separate_dcc(tex)) - return; /* stats show that DCC decompression is too expensive */ - - assert(tex->surface.num_dcc_levels); - assert(!tex->dcc_separate_buffer); - - si_texture_discard_cmask(sctx->screen, tex); - - /* Get a DCC buffer. 
*/ - if (tex->last_dcc_separate_buffer) { - assert(tex->dcc_gather_statistics); - assert(!tex->dcc_separate_buffer); - tex->dcc_separate_buffer = tex->last_dcc_separate_buffer; - tex->last_dcc_separate_buffer = NULL; - } else { - tex->dcc_separate_buffer = - si_aligned_buffer_create(sctx->b.screen, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - tex->surface.dcc_size, - tex->surface.dcc_alignment); - if (!tex->dcc_separate_buffer) - return; - } - - /* dcc_offset is the absolute GPUVM address. */ - tex->surface.dcc_offset = tex->dcc_separate_buffer->gpu_address; - - /* no need to flag anything since this is called by fast clear that - * flags framebuffer state - */ + /* The intent is to use this with shared displayable back buffers, + * but it's not strictly limited only to them. + */ + if (!tex->buffer.b.is_shared || + !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) || + tex->buffer.b.b.target != PIPE_TEXTURE_2D || tex->buffer.b.b.last_level > 0 || + !tex->surface.dcc_size || sctx->screen->debug_flags & DBG(NO_DCC) || + sctx->screen->debug_flags & DBG(NO_DCC_FB)) + return; + + assert(sctx->chip_class >= GFX8); + + if (tex->surface.dcc_offset) + return; /* already enabled */ + + /* Enable the DCC stat gathering. */ + if (!tex->dcc_gather_statistics) { + tex->dcc_gather_statistics = true; + vi_separate_dcc_start_query(sctx, tex); + } + + if (!vi_should_enable_separate_dcc(tex)) + return; /* stats show that DCC decompression is too expensive */ + + assert(tex->surface.num_dcc_levels); + assert(!tex->dcc_separate_buffer); + + si_texture_discard_cmask(sctx->screen, tex); + + /* Get a DCC buffer. */ + if (tex->last_dcc_separate_buffer) { + assert(tex->dcc_gather_statistics); + assert(!tex->dcc_separate_buffer); + tex->dcc_separate_buffer = tex->last_dcc_separate_buffer; + tex->last_dcc_separate_buffer = NULL; + } else { + tex->dcc_separate_buffer = + si_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, + tex->surface.dcc_size, tex->surface.dcc_alignment); + if (!tex->dcc_separate_buffer) + return; + } + + /* dcc_offset is the absolute GPUVM address. */ + tex->surface.dcc_offset = tex->dcc_separate_buffer->gpu_address; + + /* no need to flag anything since this is called by fast clear that + * flags framebuffer state + */ } /** * Called by pipe_context::flush_resource, the place where DCC decompression * takes place. */ -void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, - struct si_texture *tex) +void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, struct si_texture *tex) { - struct si_context *sctx = (struct si_context*)ctx; - struct pipe_query *tmp; - unsigned i = vi_get_context_dcc_stats_index(sctx, tex); - bool query_active = sctx->dcc_stats[i].query_active; - bool disable = false; - - if (sctx->dcc_stats[i].ps_stats[2]) { - union pipe_query_result result; - - /* Read the results. */ - struct pipe_query *query = sctx->dcc_stats[i].ps_stats[2]; - ctx->get_query_result(ctx, query, - true, &result); - si_query_buffer_reset(sctx, &((struct si_query_hw*)query)->buffer); - - /* Compute the approximate number of fullscreen draws. 
*/ - tex->ps_draw_ratio = - result.pipeline_statistics.ps_invocations / - (tex->buffer.b.b.width0 * tex->buffer.b.b.height0); - sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio; - - disable = tex->dcc_separate_buffer && - !vi_should_enable_separate_dcc(tex); - } - - tex->num_slow_clears = 0; - - /* stop the statistics query for ps_stats[0] */ - if (query_active) - vi_separate_dcc_stop_query(sctx, tex); - - /* Move the queries in the queue by one. */ - tmp = sctx->dcc_stats[i].ps_stats[2]; - sctx->dcc_stats[i].ps_stats[2] = sctx->dcc_stats[i].ps_stats[1]; - sctx->dcc_stats[i].ps_stats[1] = sctx->dcc_stats[i].ps_stats[0]; - sctx->dcc_stats[i].ps_stats[0] = tmp; - - /* create and start a new query as ps_stats[0] */ - if (query_active) - vi_separate_dcc_start_query(sctx, tex); - - if (disable) { - assert(!tex->last_dcc_separate_buffer); - tex->last_dcc_separate_buffer = tex->dcc_separate_buffer; - tex->dcc_separate_buffer = NULL; - tex->surface.dcc_offset = 0; - /* no need to flag anything since this is called after - * decompression that re-sets framebuffer state - */ - } + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_query *tmp; + unsigned i = vi_get_context_dcc_stats_index(sctx, tex); + bool query_active = sctx->dcc_stats[i].query_active; + bool disable = false; + + if (sctx->dcc_stats[i].ps_stats[2]) { + union pipe_query_result result; + + /* Read the results. */ + struct pipe_query *query = sctx->dcc_stats[i].ps_stats[2]; + ctx->get_query_result(ctx, query, true, &result); + si_query_buffer_reset(sctx, &((struct si_query_hw *)query)->buffer); + + /* Compute the approximate number of fullscreen draws. */ + tex->ps_draw_ratio = result.pipeline_statistics.ps_invocations / + (tex->buffer.b.b.width0 * tex->buffer.b.b.height0); + sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio; + + disable = tex->dcc_separate_buffer && !vi_should_enable_separate_dcc(tex); + } + + tex->num_slow_clears = 0; + + /* stop the statistics query for ps_stats[0] */ + if (query_active) + vi_separate_dcc_stop_query(sctx, tex); + + /* Move the queries in the queue by one. 
*/ + tmp = sctx->dcc_stats[i].ps_stats[2]; + sctx->dcc_stats[i].ps_stats[2] = sctx->dcc_stats[i].ps_stats[1]; + sctx->dcc_stats[i].ps_stats[1] = sctx->dcc_stats[i].ps_stats[0]; + sctx->dcc_stats[i].ps_stats[0] = tmp; + + /* create and start a new query as ps_stats[0] */ + if (query_active) + vi_separate_dcc_start_query(sctx, tex); + + if (disable) { + assert(!tex->last_dcc_separate_buffer); + tex->last_dcc_separate_buffer = tex->dcc_separate_buffer; + tex->dcc_separate_buffer = NULL; + tex->surface.dcc_offset = 0; + /* no need to flag anything since this is called after + * decompression that re-sets framebuffer state + */ + } } static struct pipe_memory_object * -si_memobj_from_handle(struct pipe_screen *screen, - struct winsys_handle *whandle, - bool dedicated) +si_memobj_from_handle(struct pipe_screen *screen, struct winsys_handle *whandle, bool dedicated) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_memory_object *memobj = CALLOC_STRUCT(si_memory_object); - struct pb_buffer *buf = NULL; - - if (!memobj) - return NULL; + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_memory_object *memobj = CALLOC_STRUCT(si_memory_object); + struct pb_buffer *buf = NULL; - buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, - sscreen->info.max_alignment); - if (!buf) { - free(memobj); - return NULL; - } + if (!memobj) + return NULL; - memobj->b.dedicated = dedicated; - memobj->buf = buf; - memobj->stride = whandle->stride; + buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, sscreen->info.max_alignment); + if (!buf) { + free(memobj); + return NULL; + } - return (struct pipe_memory_object *)memobj; + memobj->b.dedicated = dedicated; + memobj->buf = buf; + memobj->stride = whandle->stride; + return (struct pipe_memory_object *)memobj; } -static void -si_memobj_destroy(struct pipe_screen *screen, - struct pipe_memory_object *_memobj) +static void si_memobj_destroy(struct pipe_screen *screen, struct pipe_memory_object *_memobj) { - struct si_memory_object *memobj = (struct si_memory_object *)_memobj; + struct si_memory_object *memobj = (struct si_memory_object *)_memobj; - pb_reference(&memobj->buf, NULL); - free(memobj); + pb_reference(&memobj->buf, NULL); + free(memobj); } -static struct pipe_resource * -si_texture_from_memobj(struct pipe_screen *screen, - const struct pipe_resource *templ, - struct pipe_memory_object *_memobj, - uint64_t offset) +static struct pipe_resource *si_texture_from_memobj(struct pipe_screen *screen, + const struct pipe_resource *templ, + struct pipe_memory_object *_memobj, + uint64_t offset) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_memory_object *memobj = (struct si_memory_object *)_memobj; - struct pipe_resource *tex = - si_texture_from_winsys_buffer(sscreen, templ, memobj->buf, - memobj->stride, offset, - PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE | - PIPE_HANDLE_USAGE_SHADER_WRITE, - memobj->b.dedicated); - if (!tex) - return NULL; - - /* si_texture_from_winsys_buffer doesn't increment refcount of - * memobj->buf, so increment it here. 
- */ - struct pb_buffer *buf = NULL; - pb_reference(&buf, memobj->buf); - return tex; + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_memory_object *memobj = (struct si_memory_object *)_memobj; + struct pipe_resource *tex = si_texture_from_winsys_buffer( + sscreen, templ, memobj->buf, memobj->stride, offset, + PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE | PIPE_HANDLE_USAGE_SHADER_WRITE, memobj->b.dedicated); + if (!tex) + return NULL; + + /* si_texture_from_winsys_buffer doesn't increment refcount of + * memobj->buf, so increment it here. + */ + struct pb_buffer *buf = NULL; + pb_reference(&buf, memobj->buf); + return tex; } -static bool si_check_resource_capability(struct pipe_screen *screen, - struct pipe_resource *resource, - unsigned bind) +static bool si_check_resource_capability(struct pipe_screen *screen, struct pipe_resource *resource, + unsigned bind) { - struct si_texture *tex = (struct si_texture*)resource; + struct si_texture *tex = (struct si_texture *)resource; - /* Buffers only support the linear flag. */ - if (resource->target == PIPE_BUFFER) - return (bind & ~PIPE_BIND_LINEAR) == 0; + /* Buffers only support the linear flag. */ + if (resource->target == PIPE_BUFFER) + return (bind & ~PIPE_BIND_LINEAR) == 0; - if (bind & PIPE_BIND_LINEAR && !tex->surface.is_linear) - return false; + if (bind & PIPE_BIND_LINEAR && !tex->surface.is_linear) + return false; - if (bind & PIPE_BIND_SCANOUT && !tex->surface.is_displayable) - return false; + if (bind & PIPE_BIND_SCANOUT && !tex->surface.is_displayable) + return false; - /* TODO: PIPE_BIND_CURSOR - do we care? */ - return true; + /* TODO: PIPE_BIND_CURSOR - do we care? */ + return true; } void si_init_screen_texture_functions(struct si_screen *sscreen) { - sscreen->b.resource_from_handle = si_texture_from_handle; - sscreen->b.resource_get_handle = si_texture_get_handle; - sscreen->b.resource_get_param = si_resource_get_param; - sscreen->b.resource_get_info = si_texture_get_info; - sscreen->b.resource_from_memobj = si_texture_from_memobj; - sscreen->b.memobj_create_from_handle = si_memobj_from_handle; - sscreen->b.memobj_destroy = si_memobj_destroy; - sscreen->b.check_resource_capability = si_check_resource_capability; + sscreen->b.resource_from_handle = si_texture_from_handle; + sscreen->b.resource_get_handle = si_texture_get_handle; + sscreen->b.resource_get_param = si_resource_get_param; + sscreen->b.resource_get_info = si_texture_get_info; + sscreen->b.resource_from_memobj = si_texture_from_memobj; + sscreen->b.memobj_create_from_handle = si_memobj_from_handle; + sscreen->b.memobj_destroy = si_memobj_destroy; + sscreen->b.check_resource_capability = si_check_resource_capability; } void si_init_context_texture_functions(struct si_context *sctx) { - sctx->b.create_surface = si_create_surface; - sctx->b.surface_destroy = si_surface_destroy; + sctx->b.create_surface = si_create_surface; + sctx->b.surface_destroy = si_surface_destroy; } diff --git a/src/gallium/drivers/radeonsi/si_uvd.c b/src/gallium/drivers/radeonsi/si_uvd.c index 5511c2d7ad2..0f38cce0f96 100644 --- a/src/gallium/drivers/radeonsi/si_uvd.c +++ b/src/gallium/drivers/radeonsi/si_uvd.c @@ -25,79 +25,77 @@ * **************************************************************************/ -#include "si_pipe.h" -#include "radeon/radeon_video.h" #include "radeon/radeon_uvd.h" +#include "radeon/radeon_uvd_enc.h" #include "radeon/radeon_vce.h" #include "radeon/radeon_vcn_dec.h" #include "radeon/radeon_vcn_enc.h" -#include "radeon/radeon_uvd_enc.h" +#include 
"radeon/radeon_video.h" +#include "si_pipe.h" #include "util/u_video.h" /** * creates an video buffer with an UVD compatible memory layout */ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe, - const struct pipe_video_buffer *tmpl) + const struct pipe_video_buffer *tmpl) { - struct pipe_video_buffer vidbuf = *tmpl; - /* TODO: get tiling working */ - vidbuf.bind |= PIPE_BIND_LINEAR; + struct pipe_video_buffer vidbuf = *tmpl; + /* TODO: get tiling working */ + vidbuf.bind |= PIPE_BIND_LINEAR; - return vl_video_buffer_create_as_resource(pipe, &vidbuf); + return vl_video_buffer_create_as_resource(pipe, &vidbuf); } /* set the decoding target buffer offsets */ -static struct pb_buffer* si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf) +static struct pb_buffer *si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf) { - struct si_screen *sscreen = (struct si_screen*)buf->base.context->screen; - struct si_texture *luma = (struct si_texture *)buf->resources[0]; - struct si_texture *chroma = (struct si_texture *)buf->resources[1]; - enum ruvd_surface_type type = (sscreen->info.chip_class >= GFX9) ? - RUVD_SURFACE_TYPE_GFX9 : - RUVD_SURFACE_TYPE_LEGACY; + struct si_screen *sscreen = (struct si_screen *)buf->base.context->screen; + struct si_texture *luma = (struct si_texture *)buf->resources[0]; + struct si_texture *chroma = (struct si_texture *)buf->resources[1]; + enum ruvd_surface_type type = + (sscreen->info.chip_class >= GFX9) ? RUVD_SURFACE_TYPE_GFX9 : RUVD_SURFACE_TYPE_LEGACY; - msg->body.decode.dt_field_mode = buf->base.interlaced; + msg->body.decode.dt_field_mode = buf->base.interlaced; - si_uvd_set_dt_surfaces(msg, &luma->surface, (chroma) ? &chroma->surface : NULL, type); + si_uvd_set_dt_surfaces(msg, &luma->surface, (chroma) ? 
&chroma->surface : NULL, type); - return luma->buffer.buf; + return luma->buffer.buf; } /* get the radeon resources for VCE */ -static void si_vce_get_buffer(struct pipe_resource *resource, - struct pb_buffer **handle, - struct radeon_surf **surface) +static void si_vce_get_buffer(struct pipe_resource *resource, struct pb_buffer **handle, + struct radeon_surf **surface) { - struct si_texture *res = (struct si_texture *)resource; + struct si_texture *res = (struct si_texture *)resource; - if (handle) - *handle = res->buffer.buf; + if (handle) + *handle = res->buffer.buf; - if (surface) - *surface = &res->surface; + if (surface) + *surface = &res->surface; } /** * creates an UVD compatible decoder */ struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context, - const struct pipe_video_codec *templ) + const struct pipe_video_codec *templ) { - struct si_context *ctx = (struct si_context *)context; - bool vcn = ctx->family >= CHIP_RAVEN; + struct si_context *ctx = (struct si_context *)context; + bool vcn = ctx->family >= CHIP_RAVEN; - if (templ->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) { - if (vcn) { - return radeon_create_encoder(context, templ, ctx->ws, si_vce_get_buffer); - } else { - if (u_reduce_video_profile(templ->profile) == PIPE_VIDEO_FORMAT_HEVC) - return radeon_uvd_create_encoder(context, templ, ctx->ws, si_vce_get_buffer); - else - return si_vce_create_encoder(context, templ, ctx->ws, si_vce_get_buffer); - } - } + if (templ->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) { + if (vcn) { + return radeon_create_encoder(context, templ, ctx->ws, si_vce_get_buffer); + } else { + if (u_reduce_video_profile(templ->profile) == PIPE_VIDEO_FORMAT_HEVC) + return radeon_uvd_create_encoder(context, templ, ctx->ws, si_vce_get_buffer); + else + return si_vce_create_encoder(context, templ, ctx->ws, si_vce_get_buffer); + } + } - return (vcn) ? radeon_create_decoder(context, templ) : - si_common_uvd_create_decoder(context, templ, si_uvd_set_dtb); + return (vcn) ? radeon_create_decoder(context, templ) + : si_common_uvd_create_decoder(context, templ, si_uvd_set_dtb); } -- 2.30.2
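Aside (not part of the patch): the last hunk above only reflows si_uvd_create_decoder, but its codec-selection logic is easy to miss inside the reindentation noise. Below is a minimal, self-contained sketch of that selection logic under stand-in assumptions: the chip-family, entrypoint and profile enums and the returned strings are hypothetical placeholders for the real gallium types and for the radeon_create_encoder / radeon_uvd_create_encoder / si_vce_create_encoder / radeon_create_decoder / si_common_uvd_create_decoder constructors. It is indented with the 3-space style used throughout the diff.

/* Standalone sketch, not driver code: mirrors the VCN-vs-UVD/VCE dispatch
 * shown in si_uvd_create_decoder in the hunk above.  All enums and return
 * strings are stand-ins so this file compiles on its own. */

#include <stdbool.h>
#include <stdio.h>

enum chip_family { CHIP_POLARIS10, CHIP_VEGA10, CHIP_RAVEN, CHIP_NAVI10 };
enum entrypoint { ENTRYPOINT_DECODE, ENTRYPOINT_ENCODE };
enum video_format { FORMAT_MPEG4_AVC, FORMAT_HEVC };

static const char *select_video_codec(enum chip_family family, enum entrypoint entrypoint,
                                      enum video_format format)
{
   /* VCN replaces the separate UVD/VCE blocks starting with Raven. */
   bool vcn = family >= CHIP_RAVEN;

   if (entrypoint == ENTRYPOINT_ENCODE) {
      if (vcn)
         return "radeon_create_encoder (VCN)";
      /* Pre-VCN parts encode HEVC on UVD and everything else on VCE. */
      return format == FORMAT_HEVC ? "radeon_uvd_create_encoder (UVD)"
                                   : "si_vce_create_encoder (VCE)";
   }

   /* Decode side: VCN if present, otherwise the common UVD decoder path. */
   return vcn ? "radeon_create_decoder (VCN)" : "si_common_uvd_create_decoder (UVD)";
}

int main(void)
{
   printf("%s\n", select_video_codec(CHIP_RAVEN, ENTRYPOINT_ENCODE, FORMAT_HEVC));
   printf("%s\n", select_video_codec(CHIP_VEGA10, ENTRYPOINT_ENCODE, FORMAT_HEVC));
   printf("%s\n", select_video_codec(CHIP_POLARIS10, ENTRYPOINT_DECODE, FORMAT_MPEG4_AVC));
   return 0;
}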