+++ /dev/null
-[*.{c,h}]
-indent_style = tab
-indent_size = tab
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "sid.h"
#include "si_pipe.h"
+#include "sid.h"
static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w)
{
- width = u_minify(width, level);
- return DIV_ROUND_UP(width, blk_w);
+ width = u_minify(width, level);
+ return DIV_ROUND_UP(width, blk_w);
}
-static unsigned encode_tile_info(struct si_context *sctx,
- struct si_texture *tex, unsigned level,
- bool set_bpp)
+static unsigned encode_tile_info(struct si_context *sctx, struct si_texture *tex, unsigned level,
+ bool set_bpp)
{
- struct radeon_info *info = &sctx->screen->info;
- unsigned tile_index = tex->surface.u.legacy.tiling_index[level];
- unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index;
- unsigned tile_mode = info->si_tile_mode_array[tile_index];
- unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index];
-
- return (set_bpp ? util_logbase2(tex->surface.bpe) : 0) |
- (G_009910_ARRAY_MODE(tile_mode) << 3) |
- (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) |
- /* Non-depth modes don't have TILE_SPLIT set. */
- ((util_logbase2(tex->surface.u.legacy.tile_split >> 6)) << 11) |
- (G_009990_BANK_WIDTH(macro_tile_mode) << 15) |
- (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) |
- (G_009990_NUM_BANKS(macro_tile_mode) << 21) |
- (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) |
- (G_009910_PIPE_CONFIG(tile_mode) << 26);
+ struct radeon_info *info = &sctx->screen->info;
+ unsigned tile_index = tex->surface.u.legacy.tiling_index[level];
+ unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index;
+ unsigned tile_mode = info->si_tile_mode_array[tile_index];
+ unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index];
+
+ return (set_bpp ? util_logbase2(tex->surface.bpe) : 0) | (G_009910_ARRAY_MODE(tile_mode) << 3) |
+ (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) |
+ /* Non-depth modes don't have TILE_SPLIT set. */
+ ((util_logbase2(tex->surface.u.legacy.tile_split >> 6)) << 11) |
+ (G_009990_BANK_WIDTH(macro_tile_mode) << 15) |
+ (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) |
+ (G_009990_NUM_BANKS(macro_tile_mode) << 21) |
+ (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) |
+ (G_009910_PIPE_CONFIG(tile_mode) << 26);
}
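/* For reference, a minimal stand-alone sketch that unpacks the dword built by
 * encode_tile_info() above. The field boundaries are inferred only from the
 * shift positions in the return expression (not from the register headers),
 * so treat the widths as assumptions; the helper names are made up. */

#include <stdint.h>
#include <stdio.h>

/* Extract bits [lo, hi) of a 32-bit word. */
static unsigned bits(uint32_t w, unsigned lo, unsigned hi)
{
   return (w >> lo) & ((1u << (hi - lo)) - 1u);
}

static void dump_tile_info(uint32_t w)
{
   printf("log2(bpp)           = %u\n", bits(w, 0, 3));
   printf("array mode          = %u\n", bits(w, 3, 8));
   printf("micro tile mode     = %u\n", bits(w, 8, 11));
   printf("log2(tile_split/64) = %u\n", bits(w, 11, 15));
   printf("bank width          = %u\n", bits(w, 15, 18));
   printf("bank height         = %u\n", bits(w, 18, 21));
   printf("num banks           = %u\n", bits(w, 21, 24));
   printf("macro tile aspect   = %u\n", bits(w, 24, 26));
   printf("pipe config         = %u\n", bits(w, 26, 32));
}

int main(void)
{
   dump_tile_info(0x04000852u); /* arbitrary example value */
   return 0;
}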
-
-static bool si_sdma_v4_copy_texture(struct si_context *sctx,
- struct pipe_resource *dst,
- unsigned dst_level,
- unsigned dstx, unsigned dsty, unsigned dstz,
- struct pipe_resource *src,
- unsigned src_level,
- const struct pipe_box *src_box)
+static bool si_sdma_v4_copy_texture(struct si_context *sctx, struct pipe_resource *dst,
+ unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *src, unsigned src_level,
+ const struct pipe_box *src_box)
{
- struct si_texture *ssrc = (struct si_texture*)src;
- struct si_texture *sdst = (struct si_texture*)dst;
-
- unsigned bpp = sdst->surface.bpe;
- uint64_t dst_address = sdst->buffer.gpu_address +
- sdst->surface.u.gfx9.surf_offset;
- uint64_t src_address = ssrc->buffer.gpu_address +
- ssrc->surface.u.gfx9.surf_offset;
- unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch;
- unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch;
- uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.gfx9.surf_slice_size) / bpp;
- uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.gfx9.surf_slice_size) / bpp;
- unsigned srcx = src_box->x / ssrc->surface.blk_w;
- unsigned srcy = src_box->y / ssrc->surface.blk_h;
- unsigned srcz = src_box->z;
- unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w);
- unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h);
- unsigned copy_depth = src_box->depth;
- unsigned xalign = MAX2(1, 4 / bpp);
-
- assert(src_level <= src->last_level);
- assert(dst_level <= dst->last_level);
- assert(sdst->surface.u.gfx9.surf_offset +
- dst_slice_pitch * bpp * (dstz + src_box->depth) <=
- sdst->buffer.buf->size);
- assert(ssrc->surface.u.gfx9.surf_offset +
- src_slice_pitch * bpp * (srcz + src_box->depth) <=
- ssrc->buffer.buf->size);
-
- if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty,
- dstz, ssrc, src_level, src_box))
- return false;
-
- dstx /= sdst->surface.blk_w;
- dsty /= sdst->surface.blk_h;
-
- if (srcx >= (1 << 14) ||
- srcy >= (1 << 14) ||
- srcz >= (1 << 11) ||
- dstx >= (1 << 14) ||
- dsty >= (1 << 14) ||
- dstz >= (1 << 11))
- return false;
-
- /* Linear -> linear sub-window copy. */
- if (ssrc->surface.is_linear &&
- sdst->surface.is_linear) {
- struct radeon_cmdbuf *cs = sctx->sdma_cs;
-
- /* Check if everything fits into the bitfields */
- if (!(src_pitch <= (1 << 19) &&
- dst_pitch <= (1 << 19) &&
- src_slice_pitch <= (1 << 28) &&
- dst_slice_pitch <= (1 << 28) &&
- copy_width <= (1 << 14) &&
- copy_height <= (1 << 14) &&
- copy_depth <= (1 << 11)))
- return false;
-
- si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer);
-
- src_address += ssrc->surface.u.gfx9.offset[src_level];
- dst_address += sdst->surface.u.gfx9.offset[dst_level];
-
- /* Check alignments */
- if ((src_address % 4) != 0 ||
- (dst_address % 4) != 0 ||
- (src_pitch % xalign) != 0)
- return false;
-
- radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
- CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
- (util_logbase2(bpp) << 29));
- radeon_emit(cs, src_address);
- radeon_emit(cs, src_address >> 32);
- radeon_emit(cs, srcx | (srcy << 16));
- radeon_emit(cs, srcz | ((src_pitch - 1) << 13));
- radeon_emit(cs, src_slice_pitch - 1);
- radeon_emit(cs, dst_address);
- radeon_emit(cs, dst_address >> 32);
- radeon_emit(cs, dstx | (dsty << 16));
- radeon_emit(cs, dstz | ((dst_pitch - 1) << 13));
- radeon_emit(cs, dst_slice_pitch - 1);
- radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
- radeon_emit(cs, (copy_depth - 1));
- return true;
- }
-
- /* Linear <-> Tiled sub-window copy */
- if (ssrc->surface.is_linear != sdst->surface.is_linear) {
- struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc;
- struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
- unsigned tiled_level = tiled == ssrc ? src_level : dst_level;
- unsigned linear_level = linear == ssrc ? src_level : dst_level;
- unsigned tiled_x = tiled == ssrc ? srcx : dstx;
- unsigned linear_x = linear == ssrc ? srcx : dstx;
- unsigned tiled_y = tiled == ssrc ? srcy : dsty;
- unsigned linear_y = linear == ssrc ? srcy : dsty;
- unsigned tiled_z = tiled == ssrc ? srcz : dstz;
- unsigned linear_z = linear == ssrc ? srcz : dstz;
- unsigned tiled_width = tiled == ssrc ?
- DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w) :
- DIV_ROUND_UP(sdst->buffer.b.b.width0, sdst->surface.blk_w);
- unsigned tiled_height = tiled == ssrc ?
- DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h) :
- DIV_ROUND_UP(sdst->buffer.b.b.height0, sdst->surface.blk_h);
- unsigned tiled_depth = tiled == ssrc ?
- ssrc->buffer.b.b.depth0 :
- sdst->buffer.b.b.depth0;
- unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
- unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
- uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
- uint64_t linear_address = linear == ssrc ? src_address : dst_address;
- struct radeon_cmdbuf *cs = sctx->sdma_cs;
-
- linear_address += linear->surface.u.gfx9.offset[linear_level];
-
- /* Check if everything fits into the bitfields */
- if (!(tiled_x <= (1 << 14) &&
- tiled_y <= (1 << 14) &&
- tiled_z <= (1 << 11) &&
- tiled_width <= (1 << 14) &&
- tiled_height <= (1 << 14) &&
- tiled_depth <= (1 << 11) &&
- tiled->surface.u.gfx9.surf.epitch <= (1 << 16) &&
- linear_x <= (1 << 14) &&
- linear_y <= (1 << 14) &&
- linear_z <= (1 << 11) &&
- linear_pitch <= (1 << 14) &&
- linear_slice_pitch <= (1 << 28) &&
- copy_width <= (1 << 14) &&
- copy_height <= (1 << 14) &&
- copy_depth <= (1 << 11)))
- return false;
-
- /* Check alignments */
- if ((tiled_address % 256 != 0) ||
- (linear_address % 4 != 0) ||
- (linear_pitch % xalign != 0) ||
- (linear_slice_pitch % xalign != 0))
- return false;
-
- si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer);
-
- radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
- CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
- tiled->buffer.b.b.last_level << 20 |
- tiled_level << 24 |
- (linear == sdst ? 1u : 0) << 31);
- radeon_emit(cs, (uint32_t) tiled_address);
- radeon_emit(cs, (uint32_t) (tiled_address >> 32));
- radeon_emit(cs, tiled_x | (tiled_y << 16));
- radeon_emit(cs, tiled_z | ((tiled_width - 1) << 16));
- radeon_emit(cs, (tiled_height - 1) | (tiled_depth - 1) << 16);
- radeon_emit(cs, util_logbase2(bpp) |
- tiled->surface.u.gfx9.surf.swizzle_mode << 3 |
- tiled->surface.u.gfx9.resource_type << 9 |
- tiled->surface.u.gfx9.surf.epitch << 16);
- radeon_emit(cs, (uint32_t) linear_address);
- radeon_emit(cs, (uint32_t) (linear_address >> 32));
- radeon_emit(cs, linear_x | (linear_y << 16));
- radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
- radeon_emit(cs, linear_slice_pitch - 1);
- radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
- radeon_emit(cs, (copy_depth - 1));
- return true;
- }
-
- return false;
+ struct si_texture *ssrc = (struct si_texture *)src;
+ struct si_texture *sdst = (struct si_texture *)dst;
+
+ unsigned bpp = sdst->surface.bpe;
+ uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset;
+ uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.gfx9.surf_offset;
+ unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch;
+ unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch;
+ uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.gfx9.surf_slice_size) / bpp;
+ uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.gfx9.surf_slice_size) / bpp;
+ unsigned srcx = src_box->x / ssrc->surface.blk_w;
+ unsigned srcy = src_box->y / ssrc->surface.blk_h;
+ unsigned srcz = src_box->z;
+ unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w);
+ unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h);
+ unsigned copy_depth = src_box->depth;
+ unsigned xalign = MAX2(1, 4 / bpp);
+
+ assert(src_level <= src->last_level);
+ assert(dst_level <= dst->last_level);
+ assert(sdst->surface.u.gfx9.surf_offset + dst_slice_pitch * bpp * (dstz + src_box->depth) <=
+ sdst->buffer.buf->size);
+ assert(ssrc->surface.u.gfx9.surf_offset + src_slice_pitch * bpp * (srcz + src_box->depth) <=
+ ssrc->buffer.buf->size);
+
+ if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, dstz, ssrc, src_level, src_box))
+ return false;
+
+ dstx /= sdst->surface.blk_w;
+ dsty /= sdst->surface.blk_h;
+
+ if (srcx >= (1 << 14) || srcy >= (1 << 14) || srcz >= (1 << 11) || dstx >= (1 << 14) ||
+ dsty >= (1 << 14) || dstz >= (1 << 11))
+ return false;
+
+ /* Linear -> linear sub-window copy. */
+ if (ssrc->surface.is_linear && sdst->surface.is_linear) {
+ struct radeon_cmdbuf *cs = sctx->sdma_cs;
+
+ /* Check if everything fits into the bitfields */
+ if (!(src_pitch <= (1 << 19) && dst_pitch <= (1 << 19) && src_slice_pitch <= (1 << 28) &&
+ dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) &&
+ copy_depth <= (1 << 11)))
+ return false;
+
+ si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer);
+
+ src_address += ssrc->surface.u.gfx9.offset[src_level];
+ dst_address += sdst->surface.u.gfx9.offset[dst_level];
+
+ /* Check alignments */
+ if ((src_address % 4) != 0 || (dst_address % 4) != 0 || (src_pitch % xalign) != 0)
+ return false;
+
+ radeon_emit(
+ cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
+ (util_logbase2(bpp) << 29));
+ radeon_emit(cs, src_address);
+ radeon_emit(cs, src_address >> 32);
+ radeon_emit(cs, srcx | (srcy << 16));
+ radeon_emit(cs, srcz | ((src_pitch - 1) << 13));
+ radeon_emit(cs, src_slice_pitch - 1);
+ radeon_emit(cs, dst_address);
+ radeon_emit(cs, dst_address >> 32);
+ radeon_emit(cs, dstx | (dsty << 16));
+ radeon_emit(cs, dstz | ((dst_pitch - 1) << 13));
+ radeon_emit(cs, dst_slice_pitch - 1);
+ radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
+ radeon_emit(cs, (copy_depth - 1));
+ return true;
+ }
+
+ /* Linear <-> Tiled sub-window copy */
+ if (ssrc->surface.is_linear != sdst->surface.is_linear) {
+ struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc;
+ struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
+ unsigned tiled_level = tiled == ssrc ? src_level : dst_level;
+ unsigned linear_level = linear == ssrc ? src_level : dst_level;
+ unsigned tiled_x = tiled == ssrc ? srcx : dstx;
+ unsigned linear_x = linear == ssrc ? srcx : dstx;
+ unsigned tiled_y = tiled == ssrc ? srcy : dsty;
+ unsigned linear_y = linear == ssrc ? srcy : dsty;
+ unsigned tiled_z = tiled == ssrc ? srcz : dstz;
+ unsigned linear_z = linear == ssrc ? srcz : dstz;
+ unsigned tiled_width = tiled == ssrc
+ ? DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w)
+ : DIV_ROUND_UP(sdst->buffer.b.b.width0, sdst->surface.blk_w);
+ unsigned tiled_height = tiled == ssrc
+ ? DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h)
+ : DIV_ROUND_UP(sdst->buffer.b.b.height0, sdst->surface.blk_h);
+ unsigned tiled_depth = tiled == ssrc ? ssrc->buffer.b.b.depth0 : sdst->buffer.b.b.depth0;
+ unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
+ unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
+ uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
+ uint64_t linear_address = linear == ssrc ? src_address : dst_address;
+ struct radeon_cmdbuf *cs = sctx->sdma_cs;
+
+ linear_address += linear->surface.u.gfx9.offset[linear_level];
+
+ /* Check if everything fits into the bitfields */
+ if (!(tiled_x <= (1 << 14) && tiled_y <= (1 << 14) && tiled_z <= (1 << 11) &&
+ tiled_width <= (1 << 14) && tiled_height <= (1 << 14) && tiled_depth <= (1 << 11) &&
+ tiled->surface.u.gfx9.surf.epitch <= (1 << 16) && linear_x <= (1 << 14) &&
+ linear_y <= (1 << 14) && linear_z <= (1 << 11) && linear_pitch <= (1 << 14) &&
+ linear_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) &&
+ copy_height <= (1 << 14) && copy_depth <= (1 << 11)))
+ return false;
+
+ /* Check alignments */
+ if ((tiled_address % 256 != 0) || (linear_address % 4 != 0) || (linear_pitch % xalign != 0) ||
+ (linear_slice_pitch % xalign != 0))
+ return false;
+
+ si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer);
+
+ radeon_emit(
+ cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
+ tiled->buffer.b.b.last_level << 20 | tiled_level << 24 |
+ (linear == sdst ? 1u : 0) << 31);
+ radeon_emit(cs, (uint32_t)tiled_address);
+ radeon_emit(cs, (uint32_t)(tiled_address >> 32));
+ radeon_emit(cs, tiled_x | (tiled_y << 16));
+ radeon_emit(cs, tiled_z | ((tiled_width - 1) << 16));
+ radeon_emit(cs, (tiled_height - 1) | (tiled_depth - 1) << 16);
+ radeon_emit(cs, util_logbase2(bpp) | tiled->surface.u.gfx9.surf.swizzle_mode << 3 |
+ tiled->surface.u.gfx9.resource_type << 9 |
+ tiled->surface.u.gfx9.surf.epitch << 16);
+ radeon_emit(cs, (uint32_t)linear_address);
+ radeon_emit(cs, (uint32_t)(linear_address >> 32));
+ radeon_emit(cs, linear_x | (linear_y << 16));
+ radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
+ radeon_emit(cs, linear_slice_pitch - 1);
+ radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
+ radeon_emit(cs, (copy_depth - 1));
+ return true;
+ }
+
+ return false;
}
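/* The linear -> linear branch above reserves 13 dwords and emits them in a
 * fixed order; the same layout written out as a plain struct, one member per
 * emitted dword. The struct and its name are illustrative only; they simply
 * mirror the radeon_emit() calls above and are not a header used by the driver. */

#include <stdint.h>

struct sdma_gfx9_linear_sub_window_copy {
   uint32_t header;          /* COPY opcode, LINEAR_SUB_WINDOW sub-op, util_logbase2(bpp) << 29 */
   uint32_t src_addr_lo;     /* low 32 bits of src_address */
   uint32_t src_addr_hi;     /* src_address >> 32 */
   uint32_t src_xy;          /* srcx | (srcy << 16) */
   uint32_t src_z_pitch;     /* srcz | ((src_pitch - 1) << 13) */
   uint32_t src_slice_pitch; /* src_slice_pitch - 1 */
   uint32_t dst_addr_lo;     /* low 32 bits of dst_address */
   uint32_t dst_addr_hi;     /* dst_address >> 32 */
   uint32_t dst_xy;          /* dstx | (dsty << 16) */
   uint32_t dst_z_pitch;     /* dstz | ((dst_pitch - 1) << 13) */
   uint32_t dst_slice_pitch; /* dst_slice_pitch - 1 */
   uint32_t copy_wh;         /* (copy_width - 1) | ((copy_height - 1) << 16) */
   uint32_t copy_d;          /* copy_depth - 1 */
};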
-static bool cik_sdma_copy_texture(struct si_context *sctx,
- struct pipe_resource *dst,
- unsigned dst_level,
- unsigned dstx, unsigned dsty, unsigned dstz,
- struct pipe_resource *src,
- unsigned src_level,
- const struct pipe_box *src_box)
+static bool cik_sdma_copy_texture(struct si_context *sctx, struct pipe_resource *dst,
+ unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *src, unsigned src_level,
+ const struct pipe_box *src_box)
{
- struct radeon_info *info = &sctx->screen->info;
- struct si_texture *ssrc = (struct si_texture*)src;
- struct si_texture *sdst = (struct si_texture*)dst;
- unsigned bpp = sdst->surface.bpe;
- uint64_t dst_address = sdst->buffer.gpu_address +
- sdst->surface.u.legacy.level[dst_level].offset;
- uint64_t src_address = ssrc->buffer.gpu_address +
- ssrc->surface.u.legacy.level[src_level].offset;
- unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode;
- unsigned src_mode = ssrc->surface.u.legacy.level[src_level].mode;
- unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[dst_level];
- unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[src_level];
- unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index];
- unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index];
- unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode);
- unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode);
- unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ?
- sdst->surface.tile_swizzle : 0;
- unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ?
- ssrc->surface.tile_swizzle : 0;
- unsigned dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x;
- unsigned src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x;
- uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4) / bpp;
- uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4) / bpp;
- unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0,
- dst_level, sdst->surface.blk_w);
- unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0,
- src_level, ssrc->surface.blk_w);
- unsigned dst_height = minify_as_blocks(sdst->buffer.b.b.height0,
- dst_level, sdst->surface.blk_h);
- unsigned src_height = minify_as_blocks(ssrc->buffer.b.b.height0,
- src_level, ssrc->surface.blk_h);
- unsigned srcx = src_box->x / ssrc->surface.blk_w;
- unsigned srcy = src_box->y / ssrc->surface.blk_h;
- unsigned srcz = src_box->z;
- unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w);
- unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h);
- unsigned copy_depth = src_box->depth;
-
- assert(src_level <= src->last_level);
- assert(dst_level <= dst->last_level);
- assert(sdst->surface.u.legacy.level[dst_level].offset +
- dst_slice_pitch * bpp * (dstz + src_box->depth) <=
- sdst->buffer.buf->size);
- assert(ssrc->surface.u.legacy.level[src_level].offset +
- src_slice_pitch * bpp * (srcz + src_box->depth) <=
- ssrc->buffer.buf->size);
-
- if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty,
- dstz, ssrc, src_level, src_box))
- return false;
-
- dstx /= sdst->surface.blk_w;
- dsty /= sdst->surface.blk_h;
-
- if (srcx >= (1 << 14) ||
- srcy >= (1 << 14) ||
- srcz >= (1 << 11) ||
- dstx >= (1 << 14) ||
- dsty >= (1 << 14) ||
- dstz >= (1 << 11))
- return false;
-
- dst_address |= dst_tile_swizzle << 8;
- src_address |= src_tile_swizzle << 8;
-
- /* Linear -> linear sub-window copy. */
- if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED &&
- src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED &&
- /* check if everything fits into the bitfields */
- src_pitch <= (1 << 14) &&
- dst_pitch <= (1 << 14) &&
- src_slice_pitch <= (1 << 28) &&
- dst_slice_pitch <= (1 << 28) &&
- copy_width <= (1 << 14) &&
- copy_height <= (1 << 14) &&
- copy_depth <= (1 << 11) &&
- /* HW limitation - GFX7: */
- (sctx->chip_class != GFX7 ||
- (copy_width < (1 << 14) &&
- copy_height < (1 << 14) &&
- copy_depth < (1 << 11))) &&
- /* HW limitation - some GFX7 parts: */
- ((sctx->family != CHIP_BONAIRE &&
- sctx->family != CHIP_KAVERI) ||
- (srcx + copy_width != (1 << 14) &&
- srcy + copy_height != (1 << 14)))) {
- struct radeon_cmdbuf *cs = sctx->sdma_cs;
-
- si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer);
-
- radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
- CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
- (util_logbase2(bpp) << 29));
- radeon_emit(cs, src_address);
- radeon_emit(cs, src_address >> 32);
- radeon_emit(cs, srcx | (srcy << 16));
- radeon_emit(cs, srcz | ((src_pitch - 1) << 16));
- radeon_emit(cs, src_slice_pitch - 1);
- radeon_emit(cs, dst_address);
- radeon_emit(cs, dst_address >> 32);
- radeon_emit(cs, dstx | (dsty << 16));
- radeon_emit(cs, dstz | ((dst_pitch - 1) << 16));
- radeon_emit(cs, dst_slice_pitch - 1);
- if (sctx->chip_class == GFX7) {
- radeon_emit(cs, copy_width | (copy_height << 16));
- radeon_emit(cs, copy_depth);
- } else {
- radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
- radeon_emit(cs, (copy_depth - 1));
- }
- return true;
- }
-
- /* Tiled <-> linear sub-window copy. */
- if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) {
- struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst;
- struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
- unsigned tiled_level = tiled == ssrc ? src_level : dst_level;
- unsigned linear_level = linear == ssrc ? src_level : dst_level;
- unsigned tiled_x = tiled == ssrc ? srcx : dstx;
- unsigned linear_x = linear == ssrc ? srcx : dstx;
- unsigned tiled_y = tiled == ssrc ? srcy : dsty;
- unsigned linear_y = linear == ssrc ? srcy : dsty;
- unsigned tiled_z = tiled == ssrc ? srcz : dstz;
- unsigned linear_z = linear == ssrc ? srcz : dstz;
- unsigned tiled_width = tiled == ssrc ? src_width : dst_width;
- unsigned linear_width = linear == ssrc ? src_width : dst_width;
- unsigned tiled_pitch = tiled == ssrc ? src_pitch : dst_pitch;
- unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
- unsigned tiled_slice_pitch = tiled == ssrc ? src_slice_pitch : dst_slice_pitch;
- unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
- uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
- uint64_t linear_address = linear == ssrc ? src_address : dst_address;
- unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode;
-
- assert(tiled_pitch % 8 == 0);
- assert(tiled_slice_pitch % 64 == 0);
- unsigned pitch_tile_max = tiled_pitch / 8 - 1;
- unsigned slice_tile_max = tiled_slice_pitch / 64 - 1;
- unsigned xalign = MAX2(1, 4 / bpp);
- unsigned copy_width_aligned = copy_width;
-
- /* If the region ends at the last pixel and is unaligned, we
- * can copy the remainder of the line that is not visible to
- * make it aligned.
- */
- if (copy_width % xalign != 0 &&
- linear_x + copy_width == linear_width &&
- tiled_x + copy_width == tiled_width &&
- linear_x + align(copy_width, xalign) <= linear_pitch &&
- tiled_x + align(copy_width, xalign) <= tiled_pitch)
- copy_width_aligned = align(copy_width, xalign);
-
- /* HW limitations. */
- if ((sctx->family == CHIP_BONAIRE ||
- sctx->family == CHIP_KAVERI) &&
- linear_pitch - 1 == 0x3fff &&
- bpp == 16)
- return false;
-
- if (sctx->chip_class == GFX7 &&
- (copy_width_aligned == (1 << 14) ||
- copy_height == (1 << 14) ||
- copy_depth == (1 << 11)))
- return false;
-
- if ((sctx->family == CHIP_BONAIRE ||
- sctx->family == CHIP_KAVERI ||
- sctx->family == CHIP_KABINI) &&
- (tiled_x + copy_width == (1 << 14) ||
- tiled_y + copy_height == (1 << 14)))
- return false;
-
- /* The hw can read outside of the given linear buffer bounds,
- * or access those pages but not touch the memory in case
- * of writes. (it still causes a VM fault)
- *
- * Out-of-bounds memory access or page directory access must
- * be prevented.
- */
- int64_t start_linear_address, end_linear_address;
- unsigned granularity;
-
- /* Deduce the size of reads from the linear surface. */
- switch (tiled_micro_mode) {
- case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING:
- granularity = bpp == 1 ? 64 / (8*bpp) :
- 128 / (8*bpp);
- break;
- case V_009910_ADDR_SURF_THIN_MICRO_TILING:
- case V_009910_ADDR_SURF_DEPTH_MICRO_TILING:
- if (0 /* TODO: THICK microtiling */)
- granularity = bpp == 1 ? 32 / (8*bpp) :
- bpp == 2 ? 64 / (8*bpp) :
- bpp <= 8 ? 128 / (8*bpp) :
- 256 / (8*bpp);
- else
- granularity = bpp <= 2 ? 64 / (8*bpp) :
- bpp <= 8 ? 128 / (8*bpp) :
- 256 / (8*bpp);
- break;
- default:
- return false;
- }
-
- /* The linear reads start at tiled_x & ~(granularity - 1).
- * If linear_x == 0 && tiled_x % granularity != 0, the hw
- * starts reading from an address preceding linear_address!!!
- */
- start_linear_address =
- linear->surface.u.legacy.level[linear_level].offset +
- bpp * (linear_z * linear_slice_pitch +
- linear_y * linear_pitch +
- linear_x);
- start_linear_address -= (int)(bpp * (tiled_x % granularity));
-
- end_linear_address =
- linear->surface.u.legacy.level[linear_level].offset +
- bpp * ((linear_z + copy_depth - 1) * linear_slice_pitch +
- (linear_y + copy_height - 1) * linear_pitch +
- (linear_x + copy_width));
-
- if ((tiled_x + copy_width) % granularity)
- end_linear_address += granularity -
- (tiled_x + copy_width) % granularity;
-
- if (start_linear_address < 0 ||
- end_linear_address > linear->surface.surf_size)
- return false;
-
- /* Check requirements. */
- if (tiled_address % 256 == 0 &&
- linear_address % 4 == 0 &&
- linear_pitch % xalign == 0 &&
- linear_x % xalign == 0 &&
- tiled_x % xalign == 0 &&
- copy_width_aligned % xalign == 0 &&
- tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING &&
- /* check if everything fits into the bitfields */
- tiled->surface.u.legacy.tile_split <= 4096 &&
- pitch_tile_max < (1 << 11) &&
- slice_tile_max < (1 << 22) &&
- linear_pitch <= (1 << 14) &&
- linear_slice_pitch <= (1 << 28) &&
- copy_width_aligned <= (1 << 14) &&
- copy_height <= (1 << 14) &&
- copy_depth <= (1 << 11)) {
- struct radeon_cmdbuf *cs = sctx->sdma_cs;
- uint32_t direction = linear == sdst ? 1u << 31 : 0;
-
- si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer);
-
- radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
- CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
- direction);
- radeon_emit(cs, tiled_address);
- radeon_emit(cs, tiled_address >> 32);
- radeon_emit(cs, tiled_x | (tiled_y << 16));
- radeon_emit(cs, tiled_z | (pitch_tile_max << 16));
- radeon_emit(cs, slice_tile_max);
- radeon_emit(cs, encode_tile_info(sctx, tiled, tiled_level, true));
- radeon_emit(cs, linear_address);
- radeon_emit(cs, linear_address >> 32);
- radeon_emit(cs, linear_x | (linear_y << 16));
- radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
- radeon_emit(cs, linear_slice_pitch - 1);
- if (sctx->chip_class == GFX7) {
- radeon_emit(cs, copy_width_aligned | (copy_height << 16));
- radeon_emit(cs, copy_depth);
- } else {
- radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16));
- radeon_emit(cs, (copy_depth - 1));
- }
- return true;
- }
- }
-
- /* Tiled -> Tiled sub-window copy. */
- if (dst_mode >= RADEON_SURF_MODE_1D &&
- src_mode >= RADEON_SURF_MODE_1D &&
- /* check if these fit into the bitfields */
- src_address % 256 == 0 &&
- dst_address % 256 == 0 &&
- ssrc->surface.u.legacy.tile_split <= 4096 &&
- sdst->surface.u.legacy.tile_split <= 4096 &&
- dstx % 8 == 0 &&
- dsty % 8 == 0 &&
- srcx % 8 == 0 &&
- srcy % 8 == 0 &&
- /* this can either be equal, or display->rotated (GFX8+ only) */
- (src_micro_mode == dst_micro_mode ||
- (sctx->chip_class >= GFX8 &&
- src_micro_mode == V_009910_ADDR_SURF_DISPLAY_MICRO_TILING &&
- dst_micro_mode == V_009910_ADDR_SURF_ROTATED_MICRO_TILING))) {
- assert(src_pitch % 8 == 0);
- assert(dst_pitch % 8 == 0);
- assert(src_slice_pitch % 64 == 0);
- assert(dst_slice_pitch % 64 == 0);
- unsigned src_pitch_tile_max = src_pitch / 8 - 1;
- unsigned dst_pitch_tile_max = dst_pitch / 8 - 1;
- unsigned src_slice_tile_max = src_slice_pitch / 64 - 1;
- unsigned dst_slice_tile_max = dst_slice_pitch / 64 - 1;
- unsigned copy_width_aligned = copy_width;
- unsigned copy_height_aligned = copy_height;
-
- /* If the region ends at the last pixel and is unaligned, we
- * can copy the remainder of the tile that is not visible to
- * make it aligned.
- */
- if (copy_width % 8 != 0 &&
- srcx + copy_width == src_width &&
- dstx + copy_width == dst_width)
- copy_width_aligned = align(copy_width, 8);
-
- if (copy_height % 8 != 0 &&
- srcy + copy_height == src_height &&
- dsty + copy_height == dst_height)
- copy_height_aligned = align(copy_height, 8);
-
- /* check if these fit into the bitfields */
- if (src_pitch_tile_max < (1 << 11) &&
- dst_pitch_tile_max < (1 << 11) &&
- src_slice_tile_max < (1 << 22) &&
- dst_slice_tile_max < (1 << 22) &&
- copy_width_aligned <= (1 << 14) &&
- copy_height_aligned <= (1 << 14) &&
- copy_depth <= (1 << 11) &&
- copy_width_aligned % 8 == 0 &&
- copy_height_aligned % 8 == 0 &&
- /* HW limitation - GFX7: */
- (sctx->chip_class != GFX7 ||
- (copy_width_aligned < (1 << 14) &&
- copy_height_aligned < (1 << 14) &&
- copy_depth < (1 << 11))) &&
- /* HW limitation - some GFX7 parts: */
- ((sctx->family != CHIP_BONAIRE &&
- sctx->family != CHIP_KAVERI &&
- sctx->family != CHIP_KABINI) ||
- (srcx + copy_width_aligned != (1 << 14) &&
- srcy + copy_height_aligned != (1 << 14) &&
- dstx + copy_width != (1 << 14)))) {
- struct radeon_cmdbuf *cs = sctx->sdma_cs;
-
- si_need_dma_space(sctx, 15, &sdst->buffer, &ssrc->buffer);
-
- radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
- CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0));
- radeon_emit(cs, src_address);
- radeon_emit(cs, src_address >> 32);
- radeon_emit(cs, srcx | (srcy << 16));
- radeon_emit(cs, srcz | (src_pitch_tile_max << 16));
- radeon_emit(cs, src_slice_tile_max);
- radeon_emit(cs, encode_tile_info(sctx, ssrc, src_level, true));
- radeon_emit(cs, dst_address);
- radeon_emit(cs, dst_address >> 32);
- radeon_emit(cs, dstx | (dsty << 16));
- radeon_emit(cs, dstz | (dst_pitch_tile_max << 16));
- radeon_emit(cs, dst_slice_tile_max);
- radeon_emit(cs, encode_tile_info(sctx, sdst, dst_level, false));
- if (sctx->chip_class == GFX7) {
- radeon_emit(cs, copy_width_aligned |
- (copy_height_aligned << 16));
- radeon_emit(cs, copy_depth);
- } else {
- radeon_emit(cs, (copy_width_aligned - 8) |
- ((copy_height_aligned - 8) << 16));
- radeon_emit(cs, (copy_depth - 1));
- }
- return true;
- }
- }
-
- return false;
+ struct radeon_info *info = &sctx->screen->info;
+ struct si_texture *ssrc = (struct si_texture *)src;
+ struct si_texture *sdst = (struct si_texture *)dst;
+ unsigned bpp = sdst->surface.bpe;
+ uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.legacy.level[dst_level].offset;
+ uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.legacy.level[src_level].offset;
+ unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode;
+ unsigned src_mode = ssrc->surface.u.legacy.level[src_level].mode;
+ unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[dst_level];
+ unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[src_level];
+ unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index];
+ unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index];
+ unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode);
+ unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode);
+ unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ? sdst->surface.tile_swizzle : 0;
+ unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ? ssrc->surface.tile_swizzle : 0;
+ unsigned dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x;
+ unsigned src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x;
+ uint64_t dst_slice_pitch =
+ ((uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4) / bpp;
+ uint64_t src_slice_pitch =
+ ((uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4) / bpp;
+ unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0, dst_level, sdst->surface.blk_w);
+ unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0, src_level, ssrc->surface.blk_w);
+ unsigned dst_height = minify_as_blocks(sdst->buffer.b.b.height0, dst_level, sdst->surface.blk_h);
+ unsigned src_height = minify_as_blocks(ssrc->buffer.b.b.height0, src_level, ssrc->surface.blk_h);
+ unsigned srcx = src_box->x / ssrc->surface.blk_w;
+ unsigned srcy = src_box->y / ssrc->surface.blk_h;
+ unsigned srcz = src_box->z;
+ unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w);
+ unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h);
+ unsigned copy_depth = src_box->depth;
+
+ assert(src_level <= src->last_level);
+ assert(dst_level <= dst->last_level);
+ assert(sdst->surface.u.legacy.level[dst_level].offset +
+ dst_slice_pitch * bpp * (dstz + src_box->depth) <=
+ sdst->buffer.buf->size);
+ assert(ssrc->surface.u.legacy.level[src_level].offset +
+ src_slice_pitch * bpp * (srcz + src_box->depth) <=
+ ssrc->buffer.buf->size);
+
+ if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, dstz, ssrc, src_level, src_box))
+ return false;
+
+ dstx /= sdst->surface.blk_w;
+ dsty /= sdst->surface.blk_h;
+
+ if (srcx >= (1 << 14) || srcy >= (1 << 14) || srcz >= (1 << 11) || dstx >= (1 << 14) ||
+ dsty >= (1 << 14) || dstz >= (1 << 11))
+ return false;
+
+ dst_address |= dst_tile_swizzle << 8;
+ src_address |= src_tile_swizzle << 8;
+
+ /* Linear -> linear sub-window copy. */
+ if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED &&
+ /* check if everything fits into the bitfields */
+ src_pitch <= (1 << 14) && dst_pitch <= (1 << 14) && src_slice_pitch <= (1 << 28) &&
+ dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) &&
+ copy_depth <= (1 << 11) &&
+ /* HW limitation - GFX7: */
+ (sctx->chip_class != GFX7 ||
+ (copy_width < (1 << 14) && copy_height < (1 << 14) && copy_depth < (1 << 11))) &&
+ /* HW limitation - some GFX7 parts: */
+ ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI) ||
+ (srcx + copy_width != (1 << 14) && srcy + copy_height != (1 << 14)))) {
+ struct radeon_cmdbuf *cs = sctx->sdma_cs;
+
+ si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer);
+
+ radeon_emit(
+ cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
+ (util_logbase2(bpp) << 29));
+ radeon_emit(cs, src_address);
+ radeon_emit(cs, src_address >> 32);
+ radeon_emit(cs, srcx | (srcy << 16));
+ radeon_emit(cs, srcz | ((src_pitch - 1) << 16));
+ radeon_emit(cs, src_slice_pitch - 1);
+ radeon_emit(cs, dst_address);
+ radeon_emit(cs, dst_address >> 32);
+ radeon_emit(cs, dstx | (dsty << 16));
+ radeon_emit(cs, dstz | ((dst_pitch - 1) << 16));
+ radeon_emit(cs, dst_slice_pitch - 1);
+ if (sctx->chip_class == GFX7) {
+ radeon_emit(cs, copy_width | (copy_height << 16));
+ radeon_emit(cs, copy_depth);
+ } else {
+ radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
+ radeon_emit(cs, (copy_depth - 1));
+ }
+ return true;
+ }
+
+ /* Tiled <-> linear sub-window copy. */
+ if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) {
+ struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst;
+ struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
+ unsigned tiled_level = tiled == ssrc ? src_level : dst_level;
+ unsigned linear_level = linear == ssrc ? src_level : dst_level;
+ unsigned tiled_x = tiled == ssrc ? srcx : dstx;
+ unsigned linear_x = linear == ssrc ? srcx : dstx;
+ unsigned tiled_y = tiled == ssrc ? srcy : dsty;
+ unsigned linear_y = linear == ssrc ? srcy : dsty;
+ unsigned tiled_z = tiled == ssrc ? srcz : dstz;
+ unsigned linear_z = linear == ssrc ? srcz : dstz;
+ unsigned tiled_width = tiled == ssrc ? src_width : dst_width;
+ unsigned linear_width = linear == ssrc ? src_width : dst_width;
+ unsigned tiled_pitch = tiled == ssrc ? src_pitch : dst_pitch;
+ unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
+ unsigned tiled_slice_pitch = tiled == ssrc ? src_slice_pitch : dst_slice_pitch;
+ unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
+ uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
+ uint64_t linear_address = linear == ssrc ? src_address : dst_address;
+ unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode;
+
+ assert(tiled_pitch % 8 == 0);
+ assert(tiled_slice_pitch % 64 == 0);
+ unsigned pitch_tile_max = tiled_pitch / 8 - 1;
+ unsigned slice_tile_max = tiled_slice_pitch / 64 - 1;
+ unsigned xalign = MAX2(1, 4 / bpp);
+ unsigned copy_width_aligned = copy_width;
+
+ /* If the region ends at the last pixel and is unaligned, we
+ * can copy the remainder of the line that is not visible to
+ * make it aligned.
+ */
+ if (copy_width % xalign != 0 && linear_x + copy_width == linear_width &&
+ tiled_x + copy_width == tiled_width &&
+ linear_x + align(copy_width, xalign) <= linear_pitch &&
+ tiled_x + align(copy_width, xalign) <= tiled_pitch)
+ copy_width_aligned = align(copy_width, xalign);
+
+ /* HW limitations. */
+ if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI) &&
+ linear_pitch - 1 == 0x3fff && bpp == 16)
+ return false;
+
+ if (sctx->chip_class == GFX7 &&
+ (copy_width_aligned == (1 << 14) || copy_height == (1 << 14) || copy_depth == (1 << 11)))
+ return false;
+
+ if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI ||
+ sctx->family == CHIP_KABINI) &&
+ (tiled_x + copy_width == (1 << 14) || tiled_y + copy_height == (1 << 14)))
+ return false;
+
+ /* The hw can read outside of the given linear buffer bounds,
+ * or access those pages but not touch the memory in case
+ * of writes. (it still causes a VM fault)
+ *
+ * Out-of-bounds memory access or page directory access must
+ * be prevented.
+ */
+ int64_t start_linear_address, end_linear_address;
+ unsigned granularity;
+
+ /* Deduce the size of reads from the linear surface. */
+ switch (tiled_micro_mode) {
+ case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING:
+ granularity = bpp == 1 ? 64 / (8 * bpp) : 128 / (8 * bpp);
+ break;
+ case V_009910_ADDR_SURF_THIN_MICRO_TILING:
+ case V_009910_ADDR_SURF_DEPTH_MICRO_TILING:
+ if (0 /* TODO: THICK microtiling */)
+ granularity =
+ bpp == 1 ? 32 / (8 * bpp)
+ : bpp == 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
+ else
+ granularity = bpp <= 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
+ break;
+ default:
+ return false;
+ }
+
+ /* The linear reads start at tiled_x & ~(granularity - 1).
+ * If linear_x == 0 && tiled_x % granularity != 0, the hw
+ * starts reading from an address preceding linear_address!!!
+ */
+ start_linear_address =
+ linear->surface.u.legacy.level[linear_level].offset +
+ bpp * (linear_z * linear_slice_pitch + linear_y * linear_pitch + linear_x);
+ start_linear_address -= (int)(bpp * (tiled_x % granularity));
+
+ end_linear_address =
+ linear->surface.u.legacy.level[linear_level].offset +
+ bpp * ((linear_z + copy_depth - 1) * linear_slice_pitch +
+ (linear_y + copy_height - 1) * linear_pitch + (linear_x + copy_width));
+
+ if ((tiled_x + copy_width) % granularity)
+ end_linear_address += granularity - (tiled_x + copy_width) % granularity;
+
+ if (start_linear_address < 0 || end_linear_address > linear->surface.surf_size)
+ return false;
+
+ /* Check requirements. */
+ if (tiled_address % 256 == 0 && linear_address % 4 == 0 && linear_pitch % xalign == 0 &&
+ linear_x % xalign == 0 && tiled_x % xalign == 0 && copy_width_aligned % xalign == 0 &&
+ tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING &&
+ /* check if everything fits into the bitfields */
+ tiled->surface.u.legacy.tile_split <= 4096 && pitch_tile_max < (1 << 11) &&
+ slice_tile_max < (1 << 22) && linear_pitch <= (1 << 14) &&
+ linear_slice_pitch <= (1 << 28) && copy_width_aligned <= (1 << 14) &&
+ copy_height <= (1 << 14) && copy_depth <= (1 << 11)) {
+ struct radeon_cmdbuf *cs = sctx->sdma_cs;
+ uint32_t direction = linear == sdst ? 1u << 31 : 0;
+
+ si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer);
+
+ radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
+ CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
+ direction);
+ radeon_emit(cs, tiled_address);
+ radeon_emit(cs, tiled_address >> 32);
+ radeon_emit(cs, tiled_x | (tiled_y << 16));
+ radeon_emit(cs, tiled_z | (pitch_tile_max << 16));
+ radeon_emit(cs, slice_tile_max);
+ radeon_emit(cs, encode_tile_info(sctx, tiled, tiled_level, true));
+ radeon_emit(cs, linear_address);
+ radeon_emit(cs, linear_address >> 32);
+ radeon_emit(cs, linear_x | (linear_y << 16));
+ radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
+ radeon_emit(cs, linear_slice_pitch - 1);
+ if (sctx->chip_class == GFX7) {
+ radeon_emit(cs, copy_width_aligned | (copy_height << 16));
+ radeon_emit(cs, copy_depth);
+ } else {
+ radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16));
+ radeon_emit(cs, (copy_depth - 1));
+ }
+ return true;
+ }
+ }
+
+ /* Tiled -> Tiled sub-window copy. */
+ if (dst_mode >= RADEON_SURF_MODE_1D && src_mode >= RADEON_SURF_MODE_1D &&
+ /* check if these fit into the bitfields */
+ src_address % 256 == 0 && dst_address % 256 == 0 &&
+ ssrc->surface.u.legacy.tile_split <= 4096 && sdst->surface.u.legacy.tile_split <= 4096 &&
+ dstx % 8 == 0 && dsty % 8 == 0 && srcx % 8 == 0 && srcy % 8 == 0 &&
+ /* this can either be equal, or display->rotated (GFX8+ only) */
+ (src_micro_mode == dst_micro_mode ||
+ (sctx->chip_class >= GFX8 && src_micro_mode == V_009910_ADDR_SURF_DISPLAY_MICRO_TILING &&
+ dst_micro_mode == V_009910_ADDR_SURF_ROTATED_MICRO_TILING))) {
+ assert(src_pitch % 8 == 0);
+ assert(dst_pitch % 8 == 0);
+ assert(src_slice_pitch % 64 == 0);
+ assert(dst_slice_pitch % 64 == 0);
+ unsigned src_pitch_tile_max = src_pitch / 8 - 1;
+ unsigned dst_pitch_tile_max = dst_pitch / 8 - 1;
+ unsigned src_slice_tile_max = src_slice_pitch / 64 - 1;
+ unsigned dst_slice_tile_max = dst_slice_pitch / 64 - 1;
+ unsigned copy_width_aligned = copy_width;
+ unsigned copy_height_aligned = copy_height;
+
+ /* If the region ends at the last pixel and is unaligned, we
+ * can copy the remainder of the tile that is not visible to
+ * make it aligned.
+ */
+ if (copy_width % 8 != 0 && srcx + copy_width == src_width && dstx + copy_width == dst_width)
+ copy_width_aligned = align(copy_width, 8);
+
+ if (copy_height % 8 != 0 && srcy + copy_height == src_height &&
+ dsty + copy_height == dst_height)
+ copy_height_aligned = align(copy_height, 8);
+
+ /* check if these fit into the bitfields */
+ if (src_pitch_tile_max < (1 << 11) && dst_pitch_tile_max < (1 << 11) &&
+ src_slice_tile_max < (1 << 22) && dst_slice_tile_max < (1 << 22) &&
+ copy_width_aligned <= (1 << 14) && copy_height_aligned <= (1 << 14) &&
+ copy_depth <= (1 << 11) && copy_width_aligned % 8 == 0 && copy_height_aligned % 8 == 0 &&
+ /* HW limitation - GFX7: */
+ (sctx->chip_class != GFX7 ||
+ (copy_width_aligned < (1 << 14) && copy_height_aligned < (1 << 14) &&
+ copy_depth < (1 << 11))) &&
+ /* HW limitation - some GFX7 parts: */
+ ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI &&
+ sctx->family != CHIP_KABINI) ||
+ (srcx + copy_width_aligned != (1 << 14) && srcy + copy_height_aligned != (1 << 14) &&
+ dstx + copy_width != (1 << 14)))) {
+ struct radeon_cmdbuf *cs = sctx->sdma_cs;
+
+ si_need_dma_space(sctx, 15, &sdst->buffer, &ssrc->buffer);
+
+ radeon_emit(
+ cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0));
+ radeon_emit(cs, src_address);
+ radeon_emit(cs, src_address >> 32);
+ radeon_emit(cs, srcx | (srcy << 16));
+ radeon_emit(cs, srcz | (src_pitch_tile_max << 16));
+ radeon_emit(cs, src_slice_tile_max);
+ radeon_emit(cs, encode_tile_info(sctx, ssrc, src_level, true));
+ radeon_emit(cs, dst_address);
+ radeon_emit(cs, dst_address >> 32);
+ radeon_emit(cs, dstx | (dsty << 16));
+ radeon_emit(cs, dstz | (dst_pitch_tile_max << 16));
+ radeon_emit(cs, dst_slice_tile_max);
+ radeon_emit(cs, encode_tile_info(sctx, sdst, dst_level, false));
+ if (sctx->chip_class == GFX7) {
+ radeon_emit(cs, copy_width_aligned | (copy_height_aligned << 16));
+ radeon_emit(cs, copy_depth);
+ } else {
+ radeon_emit(cs, (copy_width_aligned - 8) | ((copy_height_aligned - 8) << 16));
+ radeon_emit(cs, (copy_depth - 1));
+ }
+ return true;
+ }
+ }
+
+ return false;
}
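/* The read-granularity switch in the tiled <-> linear path above decides how
 * far outside the requested window the DMA engine may read from the linear
 * surface. A stand-alone sketch of just that computation (non-THICK case,
 * mirroring the switch above; the function name is made up for illustration): */

#include <stdio.h>

static unsigned linear_read_granularity(unsigned bpp, int display_micro_tiling)
{
   /* The engine reads 64/128/256 bits at a time; convert to blocks. */
   if (display_micro_tiling)
      return bpp == 1 ? 64 / (8 * bpp) : 128 / (8 * bpp);
   return bpp <= 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
}

int main(void)
{
   static const unsigned bpps[] = {1, 2, 4, 8, 16};

   for (unsigned i = 0; i < sizeof(bpps) / sizeof(bpps[0]); i++)
      printf("bpp=%2u: display=%u, thin/depth=%u\n", bpps[i],
             linear_read_granularity(bpps[i], 1), linear_read_granularity(bpps[i], 0));
   return 0;
}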
-static void cik_sdma_copy(struct pipe_context *ctx,
- struct pipe_resource *dst,
- unsigned dst_level,
- unsigned dstx, unsigned dsty, unsigned dstz,
- struct pipe_resource *src,
- unsigned src_level,
- const struct pipe_box *src_box)
+static void cik_sdma_copy(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz, struct pipe_resource *src,
+ unsigned src_level, const struct pipe_box *src_box)
{
- struct si_context *sctx = (struct si_context *)ctx;
-
- assert(src->target != PIPE_BUFFER);
-
- if (!sctx->sdma_cs ||
- src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
- dst->flags & PIPE_RESOURCE_FLAG_SPARSE)
- goto fallback;
-
- /* SDMA causes corruption. See:
- * https://bugs.freedesktop.org/show_bug.cgi?id=110575
- * https://bugs.freedesktop.org/show_bug.cgi?id=110635
- *
- * Keep SDMA enabled on APUs.
- */
- if (sctx->screen->debug_flags & DBG(FORCE_SDMA) ||
- (!sctx->screen->info.has_dedicated_vram &&
- !(sctx->screen->debug_flags & DBG(NO_SDMA_COPY_IMAGE)))) {
- if ((sctx->chip_class == GFX7 || sctx->chip_class == GFX8) &&
- cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz,
- src, src_level, src_box))
- return;
- else if (sctx->chip_class == GFX9 &&
- si_sdma_v4_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz,
- src, src_level, src_box))
- return;
- }
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ assert(src->target != PIPE_BUFFER);
+
+ if (!sctx->sdma_cs || src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
+ dst->flags & PIPE_RESOURCE_FLAG_SPARSE)
+ goto fallback;
+
+ /* SDMA causes corruption. See:
+ * https://bugs.freedesktop.org/show_bug.cgi?id=110575
+ * https://bugs.freedesktop.org/show_bug.cgi?id=110635
+ *
+ * Keep SDMA enabled on APUs.
+ */
+ if (sctx->screen->debug_flags & DBG(FORCE_SDMA) ||
+ (!sctx->screen->info.has_dedicated_vram &&
+ !(sctx->screen->debug_flags & DBG(NO_SDMA_COPY_IMAGE)))) {
+ if ((sctx->chip_class == GFX7 || sctx->chip_class == GFX8) &&
+ cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box))
+ return;
+ else if (sctx->chip_class == GFX9 && si_sdma_v4_copy_texture(sctx, dst, dst_level, dstx, dsty,
+ dstz, src, src_level, src_box))
+ return;
+ }
fallback:
- si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz,
- src, src_level, src_box);
+ si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box);
}
void cik_init_sdma_functions(struct si_context *sctx)
{
- sctx->dma_copy = cik_sdma_copy;
+ sctx->dma_copy = cik_sdma_copy;
}
// DriConf options specific to radeonsi
DRI_CONF_SECTION_PERFORMANCE
- DRI_CONF_ADAPTIVE_SYNC("true")
- DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false")
- DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false")
- DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false")
+DRI_CONF_ADAPTIVE_SYNC("true")
+DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false")
+DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false")
+DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false")
DRI_CONF_SECTION_END
DRI_CONF_SECTION_DEBUG
//= BEGIN VERBATIM
-#define OPT_BOOL(name, dflt, description) \
- DRI_CONF_OPT_BEGIN_B(radeonsi_##name, #dflt) \
- DRI_CONF_DESC(en, description) \
- DRI_CONF_OPT_END
+#define OPT_BOOL(name, dflt, description) \
+ DRI_CONF_OPT_BEGIN_B(radeonsi_##name, #dflt) \
+ DRI_CONF_DESC(en, description) \
+ DRI_CONF_OPT_END
#include "radeonsi/si_debug_options.h"
//= END VERBATIM
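// si_debug_options.h consists of OPT_BOOL() entries; a sketch of what the
// macro above expands such an entry into (the option name here is
// hypothetical, not one of radeonsi's actual options):
//
//    OPT_BOOL(example_option, false, "Describe the example option here")
//
// expands to:
//
//    DRI_CONF_OPT_BEGIN_B(radeonsi_example_option, "false")
//       DRI_CONF_DESC(en, "Describe the example option here")
//    DRI_CONF_OPT_END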
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include <stddef.h>
-
#include "si_pipe.h"
#include "si_query.h"
+#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"
-#include "sid.h"
+
+#include <stddef.h>
/**
* The query buffer is written to by ESGS NGG shaders with statistics about
* without additional GPU cost.
*/
struct gfx10_sh_query_buffer {
- struct list_head list;
- struct si_resource *buf;
- unsigned refcount;
+ struct list_head list;
+ struct si_resource *buf;
+ unsigned refcount;
- /* Offset into the buffer in bytes; points at the first un-emitted entry. */
- unsigned head;
+ /* Offset into the buffer in bytes; points at the first un-emitted entry. */
+ unsigned head;
};
/* Memory layout of the query buffer. Must be kept in sync with shaders
* of all those values unconditionally.
*/
struct gfx10_sh_query_buffer_mem {
- struct {
- uint64_t generated_primitives_start_dummy;
- uint64_t emitted_primitives_start_dummy;
- uint64_t generated_primitives;
- uint64_t emitted_primitives;
- } stream[4];
- uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
- uint32_t pad[31];
+ struct {
+ uint64_t generated_primitives_start_dummy;
+ uint64_t emitted_primitives_start_dummy;
+ uint64_t generated_primitives;
+ uint64_t emitted_primitives;
+ } stream[4];
+ uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
+ uint32_t pad[31];
};
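/* One slot of the layout above spans exactly 32 uint64_t, which is the stride
 * the initialization loop in gfx10_alloc_query_buffer() below relies on
 * (results[32 * i + ...]). A compile-time sketch of that invariant, relying on
 * the struct definition above (not a check present in the driver): */

#include <assert.h>
#include <stdint.h>

static_assert(sizeof(struct gfx10_sh_query_buffer_mem) == 32 * sizeof(uint64_t),
              "query slot stride must match the 32 * i indexing used when clearing the buffer");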
/* Shader-based queries. */
struct gfx10_sh_query {
- struct si_query b;
+ struct si_query b;
- struct gfx10_sh_query_buffer *first;
- struct gfx10_sh_query_buffer *last;
- unsigned first_begin;
- unsigned last_end;
+ struct gfx10_sh_query_buffer *first;
+ struct gfx10_sh_query_buffer *last;
+ unsigned first_begin;
+ unsigned last_end;
- unsigned stream;
+ unsigned stream;
};
static void emit_shader_query(struct si_context *sctx)
{
- assert(!list_is_empty(&sctx->shader_query_buffers));
+ assert(!list_is_empty(&sctx->shader_query_buffers));
- struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers,
- struct gfx10_sh_query_buffer, list);
- qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
+ struct gfx10_sh_query_buffer *qbuf =
+ list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+ qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}
static void gfx10_release_query_buffers(struct si_context *sctx,
- struct gfx10_sh_query_buffer *first,
- struct gfx10_sh_query_buffer *last)
+ struct gfx10_sh_query_buffer *first,
+ struct gfx10_sh_query_buffer *last)
{
- while (first) {
- struct gfx10_sh_query_buffer *qbuf = first;
- if (first != last)
- first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
- else
- first = NULL;
-
- qbuf->refcount--;
- if (qbuf->refcount)
- continue;
-
- if (qbuf->list.next == &sctx->shader_query_buffers)
- continue; /* keep the most recent buffer; it may not be full yet */
- if (qbuf->list.prev == &sctx->shader_query_buffers)
- continue; /* keep the oldest buffer for recycling */
-
- list_del(&qbuf->list);
- si_resource_reference(&qbuf->buf, NULL);
- FREE(qbuf);
- }
+ while (first) {
+ struct gfx10_sh_query_buffer *qbuf = first;
+ if (first != last)
+ first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+ else
+ first = NULL;
+
+ qbuf->refcount--;
+ if (qbuf->refcount)
+ continue;
+
+ if (qbuf->list.next == &sctx->shader_query_buffers)
+ continue; /* keep the most recent buffer; it may not be full yet */
+ if (qbuf->list.prev == &sctx->shader_query_buffers)
+ continue; /* keep the oldest buffer for recycling */
+
+ list_del(&qbuf->list);
+ si_resource_reference(&qbuf->buf, NULL);
+ FREE(qbuf);
+ }
}
static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
- if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
- return true;
-
- struct gfx10_sh_query_buffer *qbuf = NULL;
-
- if (!list_is_empty(&sctx->shader_query_buffers)) {
- qbuf = list_last_entry(&sctx->shader_query_buffers,
- struct gfx10_sh_query_buffer, list);
- if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
- goto success;
-
- qbuf = list_first_entry(&sctx->shader_query_buffers,
- struct gfx10_sh_query_buffer, list);
- if (!qbuf->refcount &&
- !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
- sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
- /* Can immediately re-use the oldest buffer */
- list_del(&qbuf->list);
- } else {
- qbuf = NULL;
- }
- }
-
- if (!qbuf) {
- qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
- if (unlikely(!qbuf))
- return false;
-
- struct si_screen *screen = sctx->screen;
- unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem),
- screen->info.min_alloc_size);
- qbuf->buf = si_resource(
- pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
- if (unlikely(!qbuf->buf)) {
- FREE(qbuf);
- return false;
- }
- }
-
- /* The buffer is currently unused by the GPU. Initialize it.
- *
- * We need to set the high bit of all the primitive counters for
- * compatibility with the SET_PREDICATION packet.
- */
- uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
- PIPE_TRANSFER_WRITE |
- PIPE_TRANSFER_UNSYNCHRONIZED);
- assert(results);
-
- for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem);
- i < e; ++i) {
- for (unsigned j = 0; j < 16; ++j)
- results[32 * i + j] = (uint64_t)1 << 63;
- results[32 * i + 16] = 0;
- }
-
- list_addtail(&qbuf->list, &sctx->shader_query_buffers);
- qbuf->head = 0;
- qbuf->refcount = sctx->num_active_shader_queries;
+ if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
+ return true;
+
+ struct gfx10_sh_query_buffer *qbuf = NULL;
+
+ if (!list_is_empty(&sctx->shader_query_buffers)) {
+ qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+ if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
+ goto success;
+
+ qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+ if (!qbuf->refcount &&
+ !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
+ sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
+ /* Can immediately re-use the oldest buffer */
+ list_del(&qbuf->list);
+ } else {
+ qbuf = NULL;
+ }
+ }
+
+ if (!qbuf) {
+ qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
+ if (unlikely(!qbuf))
+ return false;
+
+ struct si_screen *screen = sctx->screen;
+ unsigned buf_size =
+ MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
+ qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
+ if (unlikely(!qbuf->buf)) {
+ FREE(qbuf);
+ return false;
+ }
+ }
+
+ /* The buffer is currently unused by the GPU. Initialize it.
+ *
+ * We need to set the high bit of all the primitive counters for
+ * compatibility with the SET_PREDICATION packet.
+ */
+ uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
+ PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED);
+ assert(results);
+
+ for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e;
+ ++i) {
+ for (unsigned j = 0; j < 16; ++j)
+ results[32 * i + j] = (uint64_t)1 << 63;
+ results[32 * i + 16] = 0;
+ }
+
+ list_addtail(&qbuf->list, &sctx->shader_query_buffers);
+ qbuf->head = 0;
+ qbuf->refcount = sctx->num_active_shader_queries;
success:;
- struct pipe_shader_buffer sbuf;
- sbuf.buffer = &qbuf->buf->b.b;
- sbuf.buffer_offset = qbuf->head;
- sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
- si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
- sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);
-
- si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
- return true;
+ struct pipe_shader_buffer sbuf;
+ sbuf.buffer = &qbuf->buf->b.b;
+ sbuf.buffer_offset = qbuf->head;
+ sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
+ si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
+ sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);
+
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
+ return true;
}
static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
- struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
- gfx10_release_query_buffers(sctx, query->first, query->last);
- FREE(query);
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+ gfx10_release_query_buffers(sctx, query->first, query->last);
+ FREE(query);
}
static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
- struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
- gfx10_release_query_buffers(sctx, query->first, query->last);
- query->first = query->last = NULL;
+ gfx10_release_query_buffers(sctx, query->first, query->last);
+ query->first = query->last = NULL;
- if (unlikely(!gfx10_alloc_query_buffer(sctx)))
- return false;
+ if (unlikely(!gfx10_alloc_query_buffer(sctx)))
+ return false;
- query->first = list_last_entry(&sctx->shader_query_buffers,
- struct gfx10_sh_query_buffer, list);
- query->first_begin = query->first->head;
+ query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+ query->first_begin = query->first->head;
- sctx->num_active_shader_queries++;
- query->first->refcount++;
+ sctx->num_active_shader_queries++;
+ query->first->refcount++;
- return true;
+ return true;
}
static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
- struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
-
- if (unlikely(!query->first))
- return false; /* earlier out of memory error */
-
- query->last = list_last_entry(&sctx->shader_query_buffers,
- struct gfx10_sh_query_buffer, list);
- query->last_end = query->last->head;
-
- /* Signal the fence of the previous chunk */
- if (query->last_end != 0) {
- uint64_t fence_va = query->last->buf->gpu_address;
- fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
- fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
- si_cp_release_mem(sctx, sctx->gfx_cs,
- V_028A90_BOTTOM_OF_PIPE_TS, 0,
- EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
- EOP_DATA_SEL_VALUE_32BIT,
- query->last->buf, fence_va, 0xffffffff,
- PIPE_QUERY_GPU_FINISHED);
- }
-
- sctx->num_active_shader_queries--;
-
- if (sctx->num_active_shader_queries > 0) {
- gfx10_alloc_query_buffer(sctx);
- } else {
- si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
- sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;
-
- /* If a query_begin is followed by a query_end without a draw
- * in-between, we need to clear the atom to ensure that the
- * next query_begin will re-initialize the shader buffer. */
- si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
- }
-
- return true;
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+ if (unlikely(!query->first))
+ return false; /* earlier out of memory error */
+
+ query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+ query->last_end = query->last->head;
+
+ /* Signal the fence of the previous chunk */
+ if (query->last_end != 0) {
+ uint64_t fence_va = query->last->buf->gpu_address;
+ fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
+ fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+ si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
+ EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
+ 0xffffffff, PIPE_QUERY_GPU_FINISHED);
+ }
+
+ sctx->num_active_shader_queries--;
+
+ if (sctx->num_active_shader_queries > 0) {
+ gfx10_alloc_query_buffer(sctx);
+ } else {
+ si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
+ sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;
+
+ /* If a query_begin is followed by a query_end without a draw
+ * in-between, we need to clear the atom to ensure that the
+ * next query_begin will re-initialize the shader buffer. */
+ si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
+ }
+
+ return true;
}
static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
- struct gfx10_sh_query_buffer_mem *qmem,
- union pipe_query_result *result)
+ struct gfx10_sh_query_buffer_mem *qmem,
+ union pipe_query_result *result)
{
- static const uint64_t mask = ((uint64_t)1 << 63) - 1;
-
- switch (query->b.type) {
- case PIPE_QUERY_PRIMITIVES_EMITTED:
- result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
- break;
- case PIPE_QUERY_PRIMITIVES_GENERATED:
- result->u64 += qmem->stream[query->stream].generated_primitives & mask;
- break;
- case PIPE_QUERY_SO_STATISTICS:
- result->so_statistics.num_primitives_written +=
- qmem->stream[query->stream].emitted_primitives & mask;
- result->so_statistics.primitives_storage_needed +=
- qmem->stream[query->stream].generated_primitives & mask;
- break;
- case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- result->b |= qmem->stream[query->stream].emitted_primitives !=
- qmem->stream[query->stream].generated_primitives;
- break;
- case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
- for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
- result->b |= qmem->stream[query->stream].emitted_primitives !=
- qmem->stream[query->stream].generated_primitives;
- }
- break;
- default:
- assert(0);
- }
+ static const uint64_t mask = ((uint64_t)1 << 63) - 1;
+
+ switch (query->b.type) {
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
+ break;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ result->u64 += qmem->stream[query->stream].generated_primitives & mask;
+ break;
+ case PIPE_QUERY_SO_STATISTICS:
+ result->so_statistics.num_primitives_written +=
+ qmem->stream[query->stream].emitted_primitives & mask;
+ result->so_statistics.primitives_storage_needed +=
+ qmem->stream[query->stream].generated_primitives & mask;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ result->b |= qmem->stream[query->stream].emitted_primitives !=
+ qmem->stream[query->stream].generated_primitives;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+ result->b |= qmem->stream[stream].emitted_primitives !=
+ qmem->stream[stream].generated_primitives;
+ }
+ break;
+ default:
+ assert(0);
+ }
}
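
The mask above strips the availability bit that the buffer initialization pre-sets in every primitive counter (bit 63, kept for SET_PREDICATION compatibility). A minimal stand-alone sketch of that read-back convention; sh_query_counter is an illustrative name, not a driver function:

#include <stdint.h>

/* Bit 63 is pre-set when a query chunk is initialized; read-back masks it off
 * before accumulating the counter into the query result. */
static inline uint64_t sh_query_counter(uint64_t raw)
{
   return raw & (((uint64_t)1 << 63) - 1);
}
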
-static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
- bool wait, union pipe_query_result *result)
+static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
+ union pipe_query_result *result)
{
- struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
- util_query_clear_result(result, query->b.type);
+ util_query_clear_result(result, query->b.type);
- if (unlikely(!query->first))
- return false; /* earlier out of memory error */
- assert(query->last);
+ if (unlikely(!query->first))
+ return false; /* earlier out of memory error */
+ assert(query->last);
- for (struct gfx10_sh_query_buffer *qbuf = query->last;;
- qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
- unsigned usage = PIPE_TRANSFER_READ |
- (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
- void *map;
+ for (struct gfx10_sh_query_buffer *qbuf = query->last;;
+ qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
+ unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+ void *map;
- if (rquery->b.flushed)
- map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
- else
- map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+ if (rquery->b.flushed)
+ map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+ else
+ map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
- if (!map)
- return false;
+ if (!map)
+ return false;
- unsigned results_begin = 0;
- unsigned results_end = qbuf->head;
- if (qbuf == query->first)
- results_begin = query->first_begin;
- if (qbuf == query->last)
- results_end = query->last_end;
+ unsigned results_begin = 0;
+ unsigned results_end = qbuf->head;
+ if (qbuf == query->first)
+ results_begin = query->first_begin;
+ if (qbuf == query->last)
+ results_end = query->last_end;
- while (results_begin != results_end) {
- struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
- results_begin += sizeof(*qmem);
+ while (results_begin != results_end) {
+ struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
+ results_begin += sizeof(*qmem);
- gfx10_sh_query_add_result(query, qmem, result);
- }
+ gfx10_sh_query_add_result(query, qmem, result);
+ }
- if (qbuf == query->first)
- break;
- }
+ if (qbuf == query->first)
+ break;
+ }
- return true;
+ return true;
}
-static void gfx10_sh_query_get_result_resource(struct si_context *sctx,
- struct si_query *rquery,
- bool wait,
- enum pipe_query_value_type result_type,
- int index,
- struct pipe_resource *resource,
- unsigned offset)
+static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
+ bool wait, enum pipe_query_value_type result_type,
+ int index, struct pipe_resource *resource,
+ unsigned offset)
{
- struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
- struct si_qbo_state saved_state = {};
- struct pipe_resource *tmp_buffer = NULL;
- unsigned tmp_buffer_offset = 0;
-
- if (!sctx->sh_query_result_shader) {
- sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
- if (!sctx->sh_query_result_shader)
- return;
- }
-
- if (query->first != query->last) {
- u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
- &tmp_buffer_offset, &tmp_buffer);
- if (!tmp_buffer)
- return;
- }
-
- si_save_qbo_state(sctx, &saved_state);
-
- /* Pre-fill the constants configuring the shader behavior. */
- struct {
- uint32_t config;
- uint32_t offset;
- uint32_t chain;
- uint32_t result_count;
- } consts;
- struct pipe_constant_buffer constant_buffer = {};
-
- if (index >= 0) {
- switch (query->b.type) {
- case PIPE_QUERY_PRIMITIVES_GENERATED:
- consts.offset = sizeof(uint32_t) * query->stream;
- consts.config = 0;
- break;
- case PIPE_QUERY_PRIMITIVES_EMITTED:
- consts.offset = sizeof(uint32_t) * (4 + query->stream);
- consts.config = 0;
- break;
- case PIPE_QUERY_SO_STATISTICS:
- consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
- consts.config = 0;
- break;
- case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- consts.offset = sizeof(uint32_t) * query->stream;
- consts.config = 2;
- break;
- case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
- consts.offset = 0;
- consts.config = 3;
- break;
- default: unreachable("bad query type");
- }
- } else {
- /* Check result availability. */
- consts.offset = 0;
- consts.config = 1;
- }
-
- if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
- consts.config |= 8;
-
- constant_buffer.buffer_size = sizeof(consts);
- constant_buffer.user_buffer = &consts;
-
- /* Pre-fill the SSBOs and grid. */
- struct pipe_shader_buffer ssbo[3];
- struct pipe_grid_info grid = {};
-
- ssbo[1].buffer = tmp_buffer;
- ssbo[1].buffer_offset = tmp_buffer_offset;
- ssbo[1].buffer_size = 16;
-
- ssbo[2] = ssbo[1];
-
- sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);
-
- grid.block[0] = 1;
- grid.block[1] = 1;
- grid.block[2] = 1;
- grid.grid[0] = 1;
- grid.grid[1] = 1;
- grid.grid[2] = 1;
-
- struct gfx10_sh_query_buffer *qbuf = query->first;
- for (;;) {
- unsigned begin = qbuf == query->first ? query->first_begin : 0;
- unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
- if (!end)
- continue;
-
- ssbo[0].buffer = &qbuf->buf->b.b;
- ssbo[0].buffer_offset = begin;
- ssbo[0].buffer_size = end - begin;
-
- consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
- consts.chain = 0;
- if (qbuf != query->first)
- consts.chain |= 1;
- if (qbuf != query->last)
- consts.chain |= 2;
-
- if (qbuf == query->last) {
- ssbo[2].buffer = resource;
- ssbo[2].buffer_offset = offset;
- ssbo[2].buffer_size = 8;
- }
-
- sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
- sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);
-
- if (wait) {
- uint64_t va;
-
- /* Wait for result availability. Wait only for readiness
- * of the last entry, since the fence writes should be
- * serialized in the CP.
- */
- va = qbuf->buf->gpu_address;
- va += end - sizeof(struct gfx10_sh_query_buffer_mem);
- va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
-
- si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
- }
-
- sctx->b.launch_grid(&sctx->b, &grid);
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
-
- if (qbuf == query->last)
- break;
- qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
- }
-
- si_restore_qbo_state(sctx, &saved_state);
- pipe_resource_reference(&tmp_buffer, NULL);
+ struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+ struct si_qbo_state saved_state = {};
+ struct pipe_resource *tmp_buffer = NULL;
+ unsigned tmp_buffer_offset = 0;
+
+ if (!sctx->sh_query_result_shader) {
+ sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
+ if (!sctx->sh_query_result_shader)
+ return;
+ }
+
+ if (query->first != query->last) {
+ u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
+ if (!tmp_buffer)
+ return;
+ }
+
+ si_save_qbo_state(sctx, &saved_state);
+
+ /* Pre-fill the constants configuring the shader behavior. */
+ struct {
+ uint32_t config;
+ uint32_t offset;
+ uint32_t chain;
+ uint32_t result_count;
+ } consts;
+ struct pipe_constant_buffer constant_buffer = {};
+
+ if (index >= 0) {
+ switch (query->b.type) {
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ consts.offset = sizeof(uint32_t) * query->stream;
+ consts.config = 0;
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ consts.offset = sizeof(uint32_t) * (4 + query->stream);
+ consts.config = 0;
+ break;
+ case PIPE_QUERY_SO_STATISTICS:
+ consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
+ consts.config = 0;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ consts.offset = sizeof(uint32_t) * query->stream;
+ consts.config = 2;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ consts.offset = 0;
+ consts.config = 3;
+ break;
+ default:
+ unreachable("bad query type");
+ }
+ } else {
+ /* Check result availability. */
+ consts.offset = 0;
+ consts.config = 1;
+ }
+
+ if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
+ consts.config |= 8;
+
+ constant_buffer.buffer_size = sizeof(consts);
+ constant_buffer.user_buffer = &consts;
+
+ /* Pre-fill the SSBOs and grid. */
+ struct pipe_shader_buffer ssbo[3];
+ struct pipe_grid_info grid = {};
+
+ ssbo[1].buffer = tmp_buffer;
+ ssbo[1].buffer_offset = tmp_buffer_offset;
+ ssbo[1].buffer_size = 16;
+
+ ssbo[2] = ssbo[1];
+
+ sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);
+
+ grid.block[0] = 1;
+ grid.block[1] = 1;
+ grid.block[2] = 1;
+ grid.grid[0] = 1;
+ grid.grid[1] = 1;
+ grid.grid[2] = 1;
+
+ struct gfx10_sh_query_buffer *qbuf = query->first;
+ for (;;) {
+ unsigned begin = qbuf == query->first ? query->first_begin : 0;
+ unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
+ if (!end)
+ continue;
+
+ ssbo[0].buffer = &qbuf->buf->b.b;
+ ssbo[0].buffer_offset = begin;
+ ssbo[0].buffer_size = end - begin;
+
+ consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
+ consts.chain = 0;
+ if (qbuf != query->first)
+ consts.chain |= 1;
+ if (qbuf != query->last)
+ consts.chain |= 2;
+
+ if (qbuf == query->last) {
+ ssbo[2].buffer = resource;
+ ssbo[2].buffer_offset = offset;
+ ssbo[2].buffer_size = 8;
+ }
+
+ sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+ sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);
+
+ if (wait) {
+ uint64_t va;
+
+ /* Wait for result availability. Wait only for readiness
+ * of the last entry, since the fence writes should be
+ * serialized in the CP.
+ */
+ va = qbuf->buf->gpu_address;
+ va += end - sizeof(struct gfx10_sh_query_buffer_mem);
+ va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+
+ si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
+ }
+
+ sctx->b.launch_grid(&sctx->b, &grid);
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+ if (qbuf == query->last)
+ break;
+ qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+ }
+
+ si_restore_qbo_state(sctx, &saved_state);
+ pipe_resource_reference(&tmp_buffer, NULL);
}
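
As a reading aid for the compute path above, the `config` word is built from plain literals; the names below are invented here only to summarize what the switch encodes:

/* Inferred from the switch in gfx10_sh_query_get_result_resource; illustrative names only. */
enum sh_query_cs_config {
   SH_QUERY_CS_ACCUMULATE      = 0, /* sum the counter selected by `offset` */
   SH_QUERY_CS_AVAILABILITY    = 1, /* index < 0: report result availability */
   SH_QUERY_CS_OVERFLOW_STREAM = 2, /* SO_OVERFLOW_PREDICATE for a single stream */
   SH_QUERY_CS_OVERFLOW_ANY    = 3, /* SO_OVERFLOW_ANY_PREDICATE across all streams */
   SH_QUERY_CS_RESULT_IS_64BIT = 8, /* OR'd in for I64/U64 result types */
};
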
static const struct si_query_ops gfx10_sh_query_ops = {
- .destroy = gfx10_sh_query_destroy,
- .begin = gfx10_sh_query_begin,
- .end = gfx10_sh_query_end,
- .get_result = gfx10_sh_query_get_result,
- .get_result_resource = gfx10_sh_query_get_result_resource,
+ .destroy = gfx10_sh_query_destroy,
+ .begin = gfx10_sh_query_begin,
+ .end = gfx10_sh_query_end,
+ .get_result = gfx10_sh_query_get_result,
+ .get_result_resource = gfx10_sh_query_get_result_resource,
};
-struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
- enum pipe_query_type query_type,
- unsigned index)
+struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
+ unsigned index)
{
- struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
- if (unlikely(!query))
- return NULL;
+ struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
+ if (unlikely(!query))
+ return NULL;
- query->b.ops = &gfx10_sh_query_ops;
- query->b.type = query_type;
- query->stream = index;
+ query->b.ops = &gfx10_sh_query_ops;
+ query->b.type = query_type;
+ query->stream = index;
- return (struct pipe_query *)query;
+ return (struct pipe_query *)query;
}
void gfx10_init_query(struct si_context *sctx)
{
- list_inithead(&sctx->shader_query_buffers);
- sctx->atoms.s.shader_query.emit = emit_shader_query;
+ list_inithead(&sctx->shader_query_buffers);
+ sctx->atoms.s.shader_query.emit = emit_shader_query;
}
void gfx10_destroy_query(struct si_context *sctx)
{
- while (!list_is_empty(&sctx->shader_query_buffers)) {
- struct gfx10_sh_query_buffer *qbuf =
- list_first_entry(&sctx->shader_query_buffers,
- struct gfx10_sh_query_buffer, list);
- list_del(&qbuf->list);
-
- assert(!qbuf->refcount);
- si_resource_reference(&qbuf->buf, NULL);
- FREE(qbuf);
- }
+ while (!list_is_empty(&sctx->shader_query_buffers)) {
+ struct gfx10_sh_query_buffer *qbuf =
+ list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
+ list_del(&qbuf->list);
+
+ assert(!qbuf->refcount);
+ si_resource_reference(&qbuf->buf, NULL);
+ FREE(qbuf);
+ }
}
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
+#include "ac_llvm_cull.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
-
#include "sid.h"
-
#include "util/u_memory.h"
#include "util/u_prim.h"
-#include "ac_llvm_cull.h"
static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx)
{
- return si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
+ return si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
}
static LLVMValueRef get_tgsize(struct si_shader_context *ctx)
{
- return si_unpack_param(ctx, ctx->merged_wave_info, 28, 4);
+ return si_unpack_param(ctx, ctx->merged_wave_info, 28, 4);
}
static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx)
{
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef tmp;
- tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
- LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
- return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef tmp;
+ tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
+ LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
+ return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
}
static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx)
{
- return si_unpack_param(ctx, ctx->gs_tg_info, 12, 9);
+ return si_unpack_param(ctx, ctx->gs_tg_info, 12, 9);
}
static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
{
- return si_unpack_param(ctx, ctx->gs_tg_info, 22, 9);
+ return si_unpack_param(ctx, ctx->gs_tg_info, 22, 9);
}
static LLVMValueRef ngg_get_ordered_id(struct si_shader_context *ctx)
{
- return si_unpack_param(ctx, ctx->gs_tg_info, 0, 12);
+ return si_unpack_param(ctx, ctx->gs_tg_info, 0, 12);
}
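
The helpers above decode SGPR bitfields; the field positions come directly from the si_unpack_param arguments. A host-side sketch of the same extraction (unpack_bits is an illustrative helper, not part of the driver):

#include <stdint.h>

/* si_unpack_param(ctx, arg, shift, width) effectively computes
 * (arg >> shift) & ((1 << width) - 1) in the generated IR. */
static inline uint32_t unpack_bits(uint32_t value, unsigned shift, unsigned width)
{
   return (value >> shift) & ((1u << width) - 1u);
}

/* Field map implied by the calls above:
 *   merged_wave_info: wave id in TG at [24..27], wave count in TG at [28..31]
 *   gs_tg_info: ordered id at [0..11], vertex count at [12..20], primitive count at [22..30]
 */
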
static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
{
- LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+ LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
- return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
- LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false));
+ return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
+ LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false));
}
static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index)
{
- if (ctx->type == PIPE_SHADER_VERTEX) {
- LLVMValueRef tmp;
- tmp = LLVMBuildLShr(ctx->ac.builder,
- ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id),
- LLVMConstInt(ctx->ac.i32, 8 + index, false), "");
- return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, "");
- }
- return ctx->ac.i1false;
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ LLVMValueRef tmp;
+ tmp = LLVMBuildLShr(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id),
+ LLVMConstInt(ctx->ac.i32, 8 + index, false), "");
+ return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, "");
+ }
+ return ctx->ac.i1false;
}
/**
* Return the number of vertices as a constant in \p num_vertices,
* and return a more precise value as LLVMValueRef from the function.
*/
-static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx,
- unsigned *num_vertices)
+static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, unsigned *num_vertices)
{
- const struct si_shader_info *info = &ctx->shader->selector->info;
-
- if (ctx->type == PIPE_SHADER_VERTEX) {
- if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
- /* Blits always use axis-aligned rectangles with 3 vertices. */
- *num_vertices = 3;
- return LLVMConstInt(ctx->ac.i32, 3, 0);
- } else {
- /* We always build up all three indices for the prim export
- * independent of the primitive type. The additional garbage
- * data shouldn't hurt. This number doesn't matter with
- * NGG passthrough.
- */
- *num_vertices = 3;
-
- /* Extract OUTPRIM field. */
- LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2);
- return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, "");
- }
- } else {
- assert(ctx->type == PIPE_SHADER_TESS_EVAL);
-
- if (info->properties[TGSI_PROPERTY_TES_POINT_MODE])
- *num_vertices = 1;
- else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
- *num_vertices = 2;
- else
- *num_vertices = 3;
-
- return LLVMConstInt(ctx->ac.i32, *num_vertices, false);
- }
+ const struct si_shader_info *info = &ctx->shader->selector->info;
+
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
+ /* Blits always use axis-aligned rectangles with 3 vertices. */
+ *num_vertices = 3;
+ return LLVMConstInt(ctx->ac.i32, 3, 0);
+ } else {
+ /* We always build up all three indices for the prim export
+ * independent of the primitive type. The additional garbage
+ * data shouldn't hurt. This number doesn't matter with
+ * NGG passthrough.
+ */
+ *num_vertices = 3;
+
+ /* Extract OUTPRIM field. */
+ LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2);
+ return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, "");
+ }
+ } else {
+ assert(ctx->type == PIPE_SHADER_TESS_EVAL);
+
+ if (info->properties[TGSI_PROPERTY_TES_POINT_MODE])
+ *num_vertices = 1;
+ else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
+ *num_vertices = 2;
+ else
+ *num_vertices = 3;
+
+ return LLVMConstInt(ctx->ac.i32, *num_vertices, false);
+ }
}
bool gfx10_ngg_export_prim_early(struct si_shader *shader)
{
- struct si_shader_selector *sel = shader->selector;
+ struct si_shader_selector *sel = shader->selector;
- assert(shader->key.as_ngg && !shader->key.as_es);
+ assert(shader->key.as_ngg && !shader->key.as_es);
- return sel->type != PIPE_SHADER_GEOMETRY &&
- !sel->info.writes_edgeflag;
+ return sel->type != PIPE_SHADER_GEOMETRY && !sel->info.writes_edgeflag;
}
void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx)
{
- ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
- ngg_get_vtx_cnt(ctx),
- ngg_get_prim_cnt(ctx));
+ ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx),
+ ngg_get_prim_cnt(ctx));
}
-void gfx10_ngg_build_export_prim(struct si_shader_context *ctx,
- LLVMValueRef user_edgeflags[3],
- LLVMValueRef prim_passthrough)
+void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
+ LLVMValueRef prim_passthrough)
{
- LLVMBuilderRef builder = ctx->ac.builder;
-
- if (gfx10_is_ngg_passthrough(ctx->shader) ||
- ctx->shader->key.opt.ngg_culling) {
- ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
- {
- struct ac_ngg_prim prim = {};
-
- if (prim_passthrough)
- prim.passthrough = prim_passthrough;
- else
- prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
-
- /* This is only used with NGG culling, which returns the NGG
- * passthrough prim export encoding.
- */
- if (ctx->shader->selector->info.writes_edgeflag) {
- unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS;
- LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0);
-
- unsigned num_vertices;
- ngg_get_vertices_per_prim(ctx, &num_vertices);
-
- for (unsigned i = 0; i < num_vertices; i++) {
- unsigned shift = 9 + i*10;
- LLVMValueRef edge;
-
- edge = LLVMBuildLoad(builder, user_edgeflags[i], "");
- edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, "");
- edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), "");
- edgeflags = LLVMBuildOr(builder, edgeflags, edge, "");
- }
- prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, "");
- }
-
- ac_build_export_prim(&ctx->ac, &prim);
- }
- ac_build_endif(&ctx->ac, 6001);
- return;
- }
-
- ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
- {
- struct ac_ngg_prim prim = {};
-
- ngg_get_vertices_per_prim(ctx, &prim.num_vertices);
-
- prim.isnull = ctx->ac.i1false;
- prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
- prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
- prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
-
- for (unsigned i = 0; i < prim.num_vertices; ++i) {
- prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i);
-
- if (ctx->shader->selector->info.writes_edgeflag) {
- LLVMValueRef edge;
-
- edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], "");
- edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, "");
- prim.edgeflag[i] = edge;
- }
- }
-
- ac_build_export_prim(&ctx->ac, &prim);
- }
- ac_build_endif(&ctx->ac, 6001);
+ LLVMBuilderRef builder = ctx->ac.builder;
+
+ if (gfx10_is_ngg_passthrough(ctx->shader) || ctx->shader->key.opt.ngg_culling) {
+ ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
+ {
+ struct ac_ngg_prim prim = {};
+
+ if (prim_passthrough)
+ prim.passthrough = prim_passthrough;
+ else
+ prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
+
+ /* This is only used with NGG culling, which returns the NGG
+ * passthrough prim export encoding.
+ */
+ if (ctx->shader->selector->info.writes_edgeflag) {
+ unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS;
+ LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0);
+
+ unsigned num_vertices;
+ ngg_get_vertices_per_prim(ctx, &num_vertices);
+
+ for (unsigned i = 0; i < num_vertices; i++) {
+ unsigned shift = 9 + i * 10;
+ LLVMValueRef edge;
+
+ edge = LLVMBuildLoad(builder, user_edgeflags[i], "");
+ edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, "");
+ edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), "");
+ edgeflags = LLVMBuildOr(builder, edgeflags, edge, "");
+ }
+ prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, "");
+ }
+
+ ac_build_export_prim(&ctx->ac, &prim);
+ }
+ ac_build_endif(&ctx->ac, 6001);
+ return;
+ }
+
+ ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
+ {
+ struct ac_ngg_prim prim = {};
+
+ ngg_get_vertices_per_prim(ctx, &prim.num_vertices);
+
+ prim.isnull = ctx->ac.i1false;
+ prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
+ prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
+ prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
+
+ for (unsigned i = 0; i < prim.num_vertices; ++i) {
+ prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i);
+
+ if (ctx->shader->selector->info.writes_edgeflag) {
+ LLVMValueRef edge;
+
+ edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], "");
+ edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, "");
+ prim.edgeflag[i] = edge;
+ }
+ }
+
+ ac_build_export_prim(&ctx->ac, &prim);
+ }
+ ac_build_endif(&ctx->ac, 6001);
}
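
The edge-flag shifts of 9 + i*10 above follow the NGG primitive export layout: each of the three vertices occupies a 10-bit group, with the vertex index in the low bits and the edge flag at bit 9 of the group. A stand-alone sketch of that packing (pack_ngg_prim is illustrative; the null-primitive bit is ignored here):

#include <stdbool.h>
#include <stdint.h>

/* Pack three vertex indices and edge flags into one primitive export dword:
 * vertex i uses bits [10*i .. 10*i+8] for the index and bit 10*i+9 for the edge flag. */
static inline uint32_t pack_ngg_prim(const uint16_t index[3], const bool edgeflag[3])
{
   uint32_t prim = 0;
   for (unsigned i = 0; i < 3; ++i)
      prim |= (((uint32_t)index[i] & 0x1ff) | ((uint32_t)edgeflag[i] << 9)) << (10 * i);
   return prim;
}
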
-static void build_streamout_vertex(struct si_shader_context *ctx,
- LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw,
- unsigned stream, LLVMValueRef offset_vtx,
- LLVMValueRef vertexptr)
+static void build_streamout_vertex(struct si_shader_context *ctx, LLVMValueRef *so_buffer,
+ LLVMValueRef *wg_offset_dw, unsigned stream,
+ LLVMValueRef offset_vtx, LLVMValueRef vertexptr)
{
- struct si_shader_info *info = &ctx->shader->selector->info;
- struct pipe_stream_output_info *so = &ctx->shader->selector->so;
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef offset[4] = {};
- LLVMValueRef tmp;
-
- for (unsigned buffer = 0; buffer < 4; ++buffer) {
- if (!wg_offset_dw[buffer])
- continue;
-
- tmp = LLVMBuildMul(builder, offset_vtx,
- LLVMConstInt(ctx->ac.i32, so->stride[buffer], false), "");
- tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
- offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
- }
-
- for (unsigned i = 0; i < so->num_outputs; ++i) {
- if (so->output[i].stream != stream)
- continue;
-
- unsigned reg = so->output[i].register_index;
- struct si_shader_output_values out;
- out.semantic_name = info->output_semantic_name[reg];
- out.semantic_index = info->output_semantic_index[reg];
-
- for (unsigned comp = 0; comp < 4; comp++) {
- tmp = ac_build_gep0(&ctx->ac, vertexptr,
- LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false));
- out.values[comp] = LLVMBuildLoad(builder, tmp, "");
- out.vertex_stream[comp] =
- (info->output_streams[reg] >> (2 * comp)) & 3;
- }
-
- si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out);
- }
+ struct si_shader_info *info = &ctx->shader->selector->info;
+ struct pipe_stream_output_info *so = &ctx->shader->selector->so;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef offset[4] = {};
+ LLVMValueRef tmp;
+
+ for (unsigned buffer = 0; buffer < 4; ++buffer) {
+ if (!wg_offset_dw[buffer])
+ continue;
+
+ tmp = LLVMBuildMul(builder, offset_vtx, LLVMConstInt(ctx->ac.i32, so->stride[buffer], false),
+ "");
+ tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
+ offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
+ }
+
+ for (unsigned i = 0; i < so->num_outputs; ++i) {
+ if (so->output[i].stream != stream)
+ continue;
+
+ unsigned reg = so->output[i].register_index;
+ struct si_shader_output_values out;
+ out.semantic_name = info->output_semantic_name[reg];
+ out.semantic_index = info->output_semantic_index[reg];
+
+ for (unsigned comp = 0; comp < 4; comp++) {
+ tmp = ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false));
+ out.values[comp] = LLVMBuildLoad(builder, tmp, "");
+ out.vertex_stream[comp] = (info->output_streams[reg] >> (2 * comp)) & 3;
+ }
+
+ si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out);
+ }
}
struct ngg_streamout {
- LLVMValueRef num_vertices;
+ LLVMValueRef num_vertices;
- /* per-thread data */
- LLVMValueRef prim_enable[4]; /* i1 per stream */
- LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */
+ /* per-thread data */
+ LLVMValueRef prim_enable[4]; /* i1 per stream */
+ LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */
- /* Output */
- LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
+ /* Output */
+ LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
};
/**
*
* Clobbers gs_ngg_scratch[8:].
*/
-static void build_streamout(struct si_shader_context *ctx,
- struct ngg_streamout *nggso)
+static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout *nggso)
{
- struct si_shader_info *info = &ctx->shader->selector->info;
- struct pipe_stream_output_info *so = &ctx->shader->selector->so;
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
- LLVMValueRef tid = get_thread_id_in_tg(ctx);
- LLVMValueRef tmp, tmp2;
- LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
- LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
- LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
- LLVMValueRef so_buffer[4] = {};
- unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) +
- (nggso->vertices[2] ? 1 : 0);
- LLVMValueRef prim_stride_dw[4] = {};
- LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
- int stream_for_buffer[4] = { -1, -1, -1, -1 };
- unsigned bufmask_for_stream[4] = {};
- bool isgs = ctx->type == PIPE_SHADER_GEOMETRY;
- unsigned scratch_emit_base = isgs ? 4 : 0;
- LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
- unsigned scratch_offset_base = isgs ? 8 : 4;
- LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;
-
- ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);
-
- /* Determine the mapping of streamout buffers to vertex streams. */
- for (unsigned i = 0; i < so->num_outputs; ++i) {
- unsigned buf = so->output[i].output_buffer;
- unsigned stream = so->output[i].stream;
- assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
- stream_for_buffer[buf] = stream;
- bufmask_for_stream[stream] |= 1 << buf;
- }
-
- for (unsigned buffer = 0; buffer < 4; ++buffer) {
- if (stream_for_buffer[buffer] == -1)
- continue;
-
- assert(so->stride[buffer]);
-
- tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false);
- prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, "");
- prim_stride_dw_vgpr = ac_build_writelane(
- &ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
- LLVMConstInt(ctx->ac.i32, buffer, false));
-
- so_buffer[buffer] = ac_build_load_to_sgpr(
- &ctx->ac, buf_ptr,
- LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false));
- }
-
- tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
- ac_build_ifcc(&ctx->ac, tmp, 5200);
- {
- LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
- LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
-
- /* Advance the streamout offsets in GDS. */
- LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
- LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-
- tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
- ac_build_ifcc(&ctx->ac, tmp, 5210);
- {
- if (isgs) {
- tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
- tmp = LLVMBuildLoad(builder, tmp, "");
- } else {
- tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0,
- ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
- }
- LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);
-
- unsigned swizzle[4];
- int unused_stream = -1;
- for (unsigned stream = 0; stream < 4; ++stream) {
- if (!info->num_stream_output_components[stream]) {
- unused_stream = stream;
- break;
- }
- }
- for (unsigned buffer = 0; buffer < 4; ++buffer) {
- if (stream_for_buffer[buffer] >= 0) {
- swizzle[buffer] = stream_for_buffer[buffer];
- } else {
- assert(unused_stream >= 0);
- swizzle[buffer] = unused_stream;
- }
- }
-
- tmp = ac_build_quad_swizzle(&ctx->ac, tmp,
- swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
- tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
-
- LLVMValueRef args[] = {
- LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
- tmp,
- ctx->ac.i32_0, // ordering
- ctx->ac.i32_0, // scope
- ctx->ac.i1false, // isVolatile
- LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
- ctx->ac.i1true, // wave release
- ctx->ac.i1true, // wave done
- };
- tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add",
- ctx->ac.i32, args, ARRAY_SIZE(args), 0);
-
- /* Keep offsets in a VGPR for quick retrieval via readlane by
- * the first wave for bounds checking, and also store in LDS
- * for retrieval by all waves later. */
- LLVMBuildStore(builder, tmp, offsets_vgpr);
-
- tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac),
- scratch_offset_basev, "");
- tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
- LLVMBuildStore(builder, tmp, tmp2);
- }
- ac_build_endif(&ctx->ac, 5210);
-
- /* Determine the max emit per buffer. This is done via the SALU, in part
- * because LLVM can't generate divide-by-multiply if we try to do this
- * via VALU with one lane per buffer.
- */
- LLVMValueRef max_emit[4] = {};
- for (unsigned buffer = 0; buffer < 4; ++buffer) {
- if (stream_for_buffer[buffer] == -1)
- continue;
-
- LLVMValueRef bufsize_dw =
- LLVMBuildLShr(builder,
- LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""),
- i32_2, "");
-
- tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
- LLVMValueRef offset_dw =
- ac_build_readlane(&ctx->ac, tmp,
- LLVMConstInt(ctx->ac.i32, buffer, false));
-
- tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
- tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], "");
-
- tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
- max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, "");
- }
-
- /* Determine the number of emitted primitives per stream and fixup the
- * GDS counter if necessary.
- *
- * This is complicated by the fact that a single stream can emit to
- * multiple buffers (but luckily not vice versa).
- */
- LLVMValueRef emit_vgpr = ctx->ac.i32_0;
-
- for (unsigned stream = 0; stream < 4; ++stream) {
- if (!info->num_stream_output_components[stream])
- continue;
-
- tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
- LLVMValueRef generated =
- ac_build_readlane(&ctx->ac, tmp,
- LLVMConstInt(ctx->ac.i32, stream, false));
-
- LLVMValueRef emit = generated;
- for (unsigned buffer = 0; buffer < 4; ++buffer) {
- if (stream_for_buffer[buffer] == stream)
- emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
- }
-
- emit_vgpr = ac_build_writelane(&ctx->ac, emit_vgpr, emit,
- LLVMConstInt(ctx->ac.i32, stream, false));
-
- /* Fixup the offset using a plain GDS atomic if we overflowed. */
- tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
- ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */
- tmp = LLVMBuildLShr(builder,
- LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
- ac_get_thread_id(&ctx->ac), "");
- tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
- ac_build_ifcc(&ctx->ac, tmp, 5222);
- {
- tmp = LLVMBuildSub(builder, generated, emit, "");
- tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
- tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
- LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
- LLVMAtomicOrderingMonotonic, false);
- }
- ac_build_endif(&ctx->ac, 5222);
- ac_build_endif(&ctx->ac, 5221);
- }
-
- tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
- ac_build_ifcc(&ctx->ac, tmp, 5225);
- {
- tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac),
- scratch_emit_basev, "");
- tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
- LLVMBuildStore(builder, emit_vgpr, tmp);
- }
- ac_build_endif(&ctx->ac, 5225);
- }
- ac_build_endif(&ctx->ac, 5200);
-
- /* Determine the workgroup-relative per-thread / primitive offset into
- * the streamout buffers */
- struct ac_wg_scan primemit_scan[4] = {};
-
- if (isgs) {
- for (unsigned stream = 0; stream < 4; ++stream) {
- if (!info->num_stream_output_components[stream])
- continue;
-
- primemit_scan[stream].enable_exclusive = true;
- primemit_scan[stream].op = nir_op_iadd;
- primemit_scan[stream].src = nggso->prim_enable[stream];
- primemit_scan[stream].scratch =
- ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
- LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
- primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
- primemit_scan[stream].numwaves = get_tgsize(ctx);
- primemit_scan[stream].maxwaves = 8;
- ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
- }
- }
-
- ac_build_s_barrier(&ctx->ac);
-
- /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
- LLVMValueRef wgoffset_dw[4] = {};
-
- {
- LLVMValueRef scratch_vgpr;
-
- tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
- scratch_vgpr = LLVMBuildLoad(builder, tmp, "");
-
- for (unsigned buffer = 0; buffer < 4; ++buffer) {
- if (stream_for_buffer[buffer] >= 0) {
- wgoffset_dw[buffer] = ac_build_readlane(
- &ctx->ac, scratch_vgpr,
- LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
- }
- }
-
- for (unsigned stream = 0; stream < 4; ++stream) {
- if (info->num_stream_output_components[stream]) {
- nggso->emit[stream] = ac_build_readlane(
- &ctx->ac, scratch_vgpr,
- LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
- }
- }
- }
-
- /* Write out primitive data */
- for (unsigned stream = 0; stream < 4; ++stream) {
- if (!info->num_stream_output_components[stream])
- continue;
-
- if (isgs) {
- ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
- } else {
- primemit_scan[stream].result_exclusive = tid;
- }
-
- tmp = LLVMBuildICmp(builder, LLVMIntULT,
- primemit_scan[stream].result_exclusive,
- nggso->emit[stream], "");
- tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], "");
- ac_build_ifcc(&ctx->ac, tmp, 5240);
- {
- LLVMValueRef offset_vtx =
- LLVMBuildMul(builder, primemit_scan[stream].result_exclusive,
- nggso->num_vertices, "");
-
- for (unsigned i = 0; i < max_num_vertices; ++i) {
- tmp = LLVMBuildICmp(builder, LLVMIntULT,
- LLVMConstInt(ctx->ac.i32, i, false),
- nggso->num_vertices, "");
- ac_build_ifcc(&ctx->ac, tmp, 5241);
- build_streamout_vertex(ctx, so_buffer, wgoffset_dw,
- stream, offset_vtx, nggso->vertices[i]);
- ac_build_endif(&ctx->ac, 5241);
- offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
- }
- }
- ac_build_endif(&ctx->ac, 5240);
- }
+ struct si_shader_info *info = &ctx->shader->selector->info;
+ struct pipe_stream_output_info *so = &ctx->shader->selector->so;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+ LLVMValueRef tid = get_thread_id_in_tg(ctx);
+ LLVMValueRef tmp, tmp2;
+ LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
+ LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
+ LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
+ LLVMValueRef so_buffer[4] = {};
+ unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0);
+ LLVMValueRef prim_stride_dw[4] = {};
+ LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
+ int stream_for_buffer[4] = {-1, -1, -1, -1};
+ unsigned bufmask_for_stream[4] = {};
+ bool isgs = ctx->type == PIPE_SHADER_GEOMETRY;
+ unsigned scratch_emit_base = isgs ? 4 : 0;
+ LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
+ unsigned scratch_offset_base = isgs ? 8 : 4;
+ LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;
+
+ ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);
+
+ /* Determine the mapping of streamout buffers to vertex streams. */
+ for (unsigned i = 0; i < so->num_outputs; ++i) {
+ unsigned buf = so->output[i].output_buffer;
+ unsigned stream = so->output[i].stream;
+ assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
+ stream_for_buffer[buf] = stream;
+ bufmask_for_stream[stream] |= 1 << buf;
+ }
+
+ for (unsigned buffer = 0; buffer < 4; ++buffer) {
+ if (stream_for_buffer[buffer] == -1)
+ continue;
+
+ assert(so->stride[buffer]);
+
+ tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false);
+ prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, "");
+ prim_stride_dw_vgpr =
+ ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
+ LLVMConstInt(ctx->ac.i32, buffer, false));
+
+ so_buffer[buffer] = ac_build_load_to_sgpr(
+ &ctx->ac, buf_ptr, LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false));
+ }
+
+ tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5200);
+ {
+ LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
+ LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
+
+ /* Advance the streamout offsets in GDS. */
+ LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+ LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5210);
+ {
+ if (isgs) {
+ tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
+ tmp = LLVMBuildLoad(builder, tmp, "");
+ } else {
+ tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
+ }
+ LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);
+
+ unsigned swizzle[4];
+ int unused_stream = -1;
+ for (unsigned stream = 0; stream < 4; ++stream) {
+ if (!info->num_stream_output_components[stream]) {
+ unused_stream = stream;
+ break;
+ }
+ }
+ for (unsigned buffer = 0; buffer < 4; ++buffer) {
+ if (stream_for_buffer[buffer] >= 0) {
+ swizzle[buffer] = stream_for_buffer[buffer];
+ } else {
+ assert(unused_stream >= 0);
+ swizzle[buffer] = unused_stream;
+ }
+ }
+
+ tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
+ tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
+
+ LLVMValueRef args[] = {
+ LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
+ tmp,
+ ctx->ac.i32_0, // ordering
+ ctx->ac.i32_0, // scope
+ ctx->ac.i1false, // isVolatile
+ LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
+ ctx->ac.i1true, // wave release
+ ctx->ac.i1true, // wave done
+ };
+ tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args,
+ ARRAY_SIZE(args), 0);
+
+ /* Keep offsets in a VGPR for quick retrieval via readlane by
+ * the first wave for bounds checking, and also store in LDS
+ * for retrieval by all waves later. */
+ LLVMBuildStore(builder, tmp, offsets_vgpr);
+
+ tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, "");
+ tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
+ LLVMBuildStore(builder, tmp, tmp2);
+ }
+ ac_build_endif(&ctx->ac, 5210);
+
+ /* Determine the max emit per buffer. This is done via the SALU, in part
+ * because LLVM can't generate divide-by-multiply if we try to do this
+ * via VALU with one lane per buffer.
+ */
+ LLVMValueRef max_emit[4] = {};
+ for (unsigned buffer = 0; buffer < 4; ++buffer) {
+ if (stream_for_buffer[buffer] == -1)
+ continue;
+
+ LLVMValueRef bufsize_dw = LLVMBuildLShr(
+ builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, "");
+
+ tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
+ LLVMValueRef offset_dw =
+ ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false));
+
+ tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
+ tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], "");
+
+ tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
+ max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, "");
+ }
+
+ /* Determine the number of emitted primitives per stream and fixup the
+ * GDS counter if necessary.
+ *
+ * This is complicated by the fact that a single stream can emit to
+ * multiple buffers (but luckily not vice versa).
+ */
+ LLVMValueRef emit_vgpr = ctx->ac.i32_0;
+
+ for (unsigned stream = 0; stream < 4; ++stream) {
+ if (!info->num_stream_output_components[stream])
+ continue;
+
+ tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
+ LLVMValueRef generated =
+ ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false));
+
+ LLVMValueRef emit = generated;
+ for (unsigned buffer = 0; buffer < 4; ++buffer) {
+ if (stream_for_buffer[buffer] == stream)
+ emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
+ }
+
+ emit_vgpr =
+ ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false));
+
+ /* Fixup the offset using a plain GDS atomic if we overflowed. */
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */
+ tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
+ ac_get_thread_id(&ctx->ac), "");
+ tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5222);
+ {
+ tmp = LLVMBuildSub(builder, generated, emit, "");
+ tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
+ tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
+ LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
+ LLVMAtomicOrderingMonotonic, false);
+ }
+ ac_build_endif(&ctx->ac, 5222);
+ ac_build_endif(&ctx->ac, 5221);
+ }
+
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5225);
+ {
+ tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, "");
+ tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
+ LLVMBuildStore(builder, emit_vgpr, tmp);
+ }
+ ac_build_endif(&ctx->ac, 5225);
+ }
+ ac_build_endif(&ctx->ac, 5200);
+
+ /* Determine the workgroup-relative per-thread / primitive offset into
+ * the streamout buffers */
+ struct ac_wg_scan primemit_scan[4] = {};
+
+ if (isgs) {
+ for (unsigned stream = 0; stream < 4; ++stream) {
+ if (!info->num_stream_output_components[stream])
+ continue;
+
+ primemit_scan[stream].enable_exclusive = true;
+ primemit_scan[stream].op = nir_op_iadd;
+ primemit_scan[stream].src = nggso->prim_enable[stream];
+ primemit_scan[stream].scratch = ac_build_gep0(
+ &ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
+ primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
+ primemit_scan[stream].numwaves = get_tgsize(ctx);
+ primemit_scan[stream].maxwaves = 8;
+ ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
+ }
+ }
+
+ ac_build_s_barrier(&ctx->ac);
+
+ /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
+ LLVMValueRef wgoffset_dw[4] = {};
+
+ {
+ LLVMValueRef scratch_vgpr;
+
+ tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
+ scratch_vgpr = LLVMBuildLoad(builder, tmp, "");
+
+ for (unsigned buffer = 0; buffer < 4; ++buffer) {
+ if (stream_for_buffer[buffer] >= 0) {
+ wgoffset_dw[buffer] =
+ ac_build_readlane(&ctx->ac, scratch_vgpr,
+ LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
+ }
+ }
+
+ for (unsigned stream = 0; stream < 4; ++stream) {
+ if (info->num_stream_output_components[stream]) {
+ nggso->emit[stream] =
+ ac_build_readlane(&ctx->ac, scratch_vgpr,
+ LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
+ }
+ }
+ }
+
+ /* Write out primitive data */
+ for (unsigned stream = 0; stream < 4; ++stream) {
+ if (!info->num_stream_output_components[stream])
+ continue;
+
+ if (isgs) {
+ ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
+ } else {
+ primemit_scan[stream].result_exclusive = tid;
+ }
+
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive,
+ nggso->emit[stream], "");
+ tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], "");
+ ac_build_ifcc(&ctx->ac, tmp, 5240);
+ {
+ LLVMValueRef offset_vtx =
+ LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, "");
+
+ for (unsigned i = 0; i < max_num_vertices; ++i) {
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false),
+ nggso->num_vertices, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5241);
+ build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx,
+ nggso->vertices[i]);
+ ac_build_endif(&ctx->ac, 5241);
+ offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
+ }
+ }
+ ac_build_endif(&ctx->ac, 5240);
+ }
}
/* LDS layout of ES vertex data for NGG culling. */
-enum {
- /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old
- * ES thread ID. After vertex compaction, compacted ES threads
- * store the old thread ID here to copy input VGPRs from uncompacted
- * ES threads.
- * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value.
- * Byte 2: TES rel patch ID
- * Byte 3: Unused
- */
- lds_byte0_accept_flag = 0,
- lds_byte0_old_thread_id = 0,
- lds_byte1_new_thread_id,
- lds_byte2_tes_rel_patch_id,
- lds_byte3_unused,
-
- lds_packed_data = 0, /* lds_byteN_... */
-
- lds_pos_x,
- lds_pos_y,
- lds_pos_z,
- lds_pos_w,
- lds_pos_x_div_w,
- lds_pos_y_div_w,
- /* If VS: */
- lds_vertex_id,
- lds_instance_id, /* optional */
- /* If TES: */
- lds_tes_u = lds_vertex_id,
- lds_tes_v = lds_instance_id,
- lds_tes_patch_id, /* optional */
+enum
+{
+ /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old
+ * ES thread ID. After vertex compaction, compacted ES threads
+ * store the old thread ID here to copy input VGPRs from uncompacted
+ * ES threads.
+ * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value.
+ * Byte 2: TES rel patch ID
+ * Byte 3: Unused
+ */
+ lds_byte0_accept_flag = 0,
+ lds_byte0_old_thread_id = 0,
+ lds_byte1_new_thread_id,
+ lds_byte2_tes_rel_patch_id,
+ lds_byte3_unused,
+
+ lds_packed_data = 0, /* lds_byteN_... */
+
+ lds_pos_x,
+ lds_pos_y,
+ lds_pos_z,
+ lds_pos_w,
+ lds_pos_x_div_w,
+ lds_pos_y_div_w,
+ /* If VS: */
+ lds_vertex_id,
+ lds_instance_id, /* optional */
+ /* If TES: */
+ lds_tes_u = lds_vertex_id,
+ lds_tes_v = lds_instance_id,
+ lds_tes_patch_id, /* optional */
};
-static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx,
- LLVMValueRef ptr, unsigned byte_index)
+static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx, LLVMValueRef ptr,
+ unsigned byte_index)
{
- assert(byte_index < 4);
- LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
- LLVMValueRef index = LLVMConstInt(ctx->ac.i32, byte_index, 0);
+ assert(byte_index < 4);
+ LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
+ LLVMValueRef index = LLVMConstInt(ctx->ac.i32, byte_index, 0);
- return LLVMBuildGEP(ctx->ac.builder,
- LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""),
- &index, 1, "");
+ return LLVMBuildGEP(ctx->ac.builder, LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""), &index,
+ 1, "");
}
static unsigned ngg_nogs_vertex_size(struct si_shader *shader)
{
- unsigned lds_vertex_size = 0;
-
- /* The edgeflag is always stored in the last element that's also
- * used for padding to reduce LDS bank conflicts. */
- if (shader->selector->so.num_outputs)
- lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
- if (shader->selector->info.writes_edgeflag)
- lds_vertex_size = MAX2(lds_vertex_size, 1);
-
- /* LDS size for passing data from GS to ES.
- * GS stores Primitive IDs into LDS at the address corresponding
- * to the ES thread of the provoking vertex. All ES threads
- * load and export PrimitiveID for their thread.
- */
- if (shader->selector->type == PIPE_SHADER_VERTEX &&
- shader->key.mono.u.vs_export_prim_id)
- lds_vertex_size = MAX2(lds_vertex_size, 1);
-
- if (shader->key.opt.ngg_culling) {
- if (shader->selector->type == PIPE_SHADER_VERTEX) {
- STATIC_ASSERT(lds_instance_id + 1 == 9);
- lds_vertex_size = MAX2(lds_vertex_size, 9);
- } else {
- assert(shader->selector->type == PIPE_SHADER_TESS_EVAL);
-
- if (shader->selector->info.uses_primid ||
- shader->key.mono.u.vs_export_prim_id) {
- STATIC_ASSERT(lds_tes_patch_id + 2 == 11);
- lds_vertex_size = MAX2(lds_vertex_size, 11);
- } else {
- STATIC_ASSERT(lds_tes_v + 1 == 9);
- lds_vertex_size = MAX2(lds_vertex_size, 9);
- }
- }
- }
-
- return lds_vertex_size;
+ unsigned lds_vertex_size = 0;
+
+ /* The edgeflag is always stored in the last element that's also
+ * used for padding to reduce LDS bank conflicts. */
+ if (shader->selector->so.num_outputs)
+ lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
+ if (shader->selector->info.writes_edgeflag)
+ lds_vertex_size = MAX2(lds_vertex_size, 1);
+
+ /* LDS size for passing data from GS to ES.
+ * GS stores Primitive IDs into LDS at the address corresponding
+ * to the ES thread of the provoking vertex. All ES threads
+ * load and export PrimitiveID for their thread.
+ */
+ if (shader->selector->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)
+ lds_vertex_size = MAX2(lds_vertex_size, 1);
+
+ if (shader->key.opt.ngg_culling) {
+ if (shader->selector->type == PIPE_SHADER_VERTEX) {
+ STATIC_ASSERT(lds_instance_id + 1 == 9);
+ lds_vertex_size = MAX2(lds_vertex_size, 9);
+ } else {
+ assert(shader->selector->type == PIPE_SHADER_TESS_EVAL);
+
+ if (shader->selector->info.uses_primid || shader->key.mono.u.vs_export_prim_id) {
+ STATIC_ASSERT(lds_tes_patch_id + 2 == 11);
+ lds_vertex_size = MAX2(lds_vertex_size, 11);
+ } else {
+ STATIC_ASSERT(lds_tes_v + 1 == 9);
+ lds_vertex_size = MAX2(lds_vertex_size, 9);
+ }
+ }
+ }
+
+ return lds_vertex_size;
}
/**
* Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage
* for the vertex outputs.
*/
-static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx,
- LLVMValueRef vtxid)
+static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vtxid)
{
- /* The extra dword is used to avoid LDS bank conflicts. */
- unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader);
- LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);
- LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
- LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
- return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
+ /* The extra dword is used to avoid LDS bank conflicts. */
+ unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader);
+ LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);
+ LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
+ LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
+ return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
}
-static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx,
- LLVMValueRef ret, struct ac_arg param,
- unsigned return_index)
+static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx, LLVMValueRef ret,
+ struct ac_arg param, unsigned return_index)
{
- LLVMValueRef v = ac_get_arg(&ctx->ac, param);
-
- for (unsigned i = 0; i < 4; i++) {
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
- ac_llvm_extract_elem(&ctx->ac, v, i),
- return_index + i, "");
- }
- return ret;
+ LLVMValueRef v = ac_get_arg(&ctx->ac, param);
+
+ for (unsigned i = 0; i < 4; i++) {
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, ac_llvm_extract_elem(&ctx->ac, v, i),
+ return_index + i, "");
+ }
+ return ret;
}
-static void load_bitmasks_2x64(struct si_shader_context *ctx,
- LLVMValueRef lds_ptr, unsigned dw_offset,
- LLVMValueRef mask[2], LLVMValueRef *total_bitcount)
+static void load_bitmasks_2x64(struct si_shader_context *ctx, LLVMValueRef lds_ptr,
+ unsigned dw_offset, LLVMValueRef mask[2],
+ LLVMValueRef *total_bitcount)
{
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef ptr64 = LLVMBuildPointerCast(builder, lds_ptr,
- LLVMPointerType(LLVMArrayType(ctx->ac.i64, 2),
- AC_ADDR_SPACE_LDS), "");
- for (unsigned i = 0; i < 2; i++) {
- LLVMValueRef index = LLVMConstInt(ctx->ac.i32, dw_offset / 2 + i, 0);
- mask[i] = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ptr64, index), "");
- }
-
- /* We get better code if we don't use the 128-bit bitcount. */
- *total_bitcount = LLVMBuildAdd(builder, ac_build_bit_count(&ctx->ac, mask[0]),
- ac_build_bit_count(&ctx->ac, mask[1]), "");
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef ptr64 = LLVMBuildPointerCast(
+ builder, lds_ptr, LLVMPointerType(LLVMArrayType(ctx->ac.i64, 2), AC_ADDR_SPACE_LDS), "");
+ for (unsigned i = 0; i < 2; i++) {
+ LLVMValueRef index = LLVMConstInt(ctx->ac.i32, dw_offset / 2 + i, 0);
+ mask[i] = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ptr64, index), "");
+ }
+
+ /* We get better code if we don't use the 128-bit bitcount. */
+ *total_bitcount = LLVMBuildAdd(builder, ac_build_bit_count(&ctx->ac, mask[0]),
+ ac_build_bit_count(&ctx->ac, mask[1]), "");
}
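
The two 64-bit masks loaded here feed the vertex compaction further down: the total popcount becomes the new ES thread count, and a prefix popcount below a thread's old ID becomes its compacted ID (what ac_prefix_bitcount_2x64 computes in the shader). A minimal CPU-side sketch of that arithmetic, assuming GCC/Clang __builtin_popcountll and hypothetical helper names; it is an illustration, not driver code:

#include <stdint.h>

/* Total number of surviving threads across both 64-bit masks. */
static unsigned total_bitcount_2x64(const uint64_t mask[2])
{
   return __builtin_popcountll(mask[0]) + __builtin_popcountll(mask[1]);
}

/* Number of surviving threads whose old ID is strictly below old_id (old_id < 128);
 * this is the compacted thread ID of the surviving thread with that old ID. */
static unsigned prefix_bitcount_2x64(const uint64_t mask[2], unsigned old_id)
{
   unsigned lo_bits = old_id < 64 ? old_id : 64;
   unsigned hi_bits = old_id > 64 ? old_id - 64 : 0;
   uint64_t lo = lo_bits ? mask[0] & (lo_bits == 64 ? ~0ull : (1ull << lo_bits) - 1) : 0;
   uint64_t hi = hi_bits ? mask[1] & (hi_bits == 64 ? ~0ull : (1ull << hi_bits) - 1) : 0;
   return __builtin_popcountll(lo) + __builtin_popcountll(hi);
}
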
/**
* \param wave_info_num_bits the bit size of thread count field in merged_wave_info
* \param wave_info_shift the bit offset of the thread count field in merged_wave_info
*/
-static void update_thread_counts(struct si_shader_context *ctx,
- LLVMValueRef *new_num_threads,
- LLVMValueRef *tg_info,
- unsigned tg_info_num_bits,
- unsigned tg_info_shift,
- LLVMValueRef *wave_info,
- unsigned wave_info_num_bits,
- unsigned wave_info_shift)
+static void update_thread_counts(struct si_shader_context *ctx, LLVMValueRef *new_num_threads,
+ LLVMValueRef *tg_info, unsigned tg_info_num_bits,
+ unsigned tg_info_shift, LLVMValueRef *wave_info,
+ unsigned wave_info_num_bits, unsigned wave_info_shift)
{
- LLVMBuilderRef builder = ctx->ac.builder;
-
- /* Update the total thread count. */
- unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift);
- *tg_info = LLVMBuildAnd(builder, *tg_info,
- LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), "");
- *tg_info = LLVMBuildOr(builder, *tg_info,
- LLVMBuildShl(builder, *new_num_threads,
- LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), "");
-
- /* Update the per-wave thread count. */
- LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
- LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), "");
- *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, "");
- *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0);
- *new_num_threads = ac_build_imin(&ctx->ac, *new_num_threads,
- LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0));
- unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift);
- *wave_info = LLVMBuildAnd(builder, *wave_info,
- LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), "");
- *wave_info = LLVMBuildOr(builder, *wave_info,
- LLVMBuildShl(builder, *new_num_threads,
- LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""), "");
+ LLVMBuilderRef builder = ctx->ac.builder;
+
+ /* Update the total thread count. */
+ unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift);
+ *tg_info = LLVMBuildAnd(builder, *tg_info, LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), "");
+ *tg_info = LLVMBuildOr(
+ builder, *tg_info,
+ LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), "");
+
+ /* Update the per-wave thread count. */
+ LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
+ LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), "");
+ *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, "");
+ *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0);
+ *new_num_threads =
+ ac_build_imin(&ctx->ac, *new_num_threads, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0));
+ unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift);
+ *wave_info = LLVMBuildAnd(builder, *wave_info, LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), "");
+ *wave_info = LLVMBuildOr(
+ builder, *wave_info,
+ LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""),
+ "");
}
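
update_thread_counts() is bitfield surgery on two SGPR values: clear the old thread-count field, then OR in the new count at the field's offset (9 bits at bit 12 in gs_tg_info and 8 bits at bit 0 in merged_wave_info at the call site in gfx10_emit_ngg_culling_epilogue_4x_wave32). A plain C sketch of the same packing, with a made-up helper name, shown only as an illustration:

#include <stdint.h>

/* Replace a num_bits-wide field at bit position shift with new_count. */
static uint32_t set_count_field(uint32_t sgpr, uint32_t new_count,
                                unsigned num_bits, unsigned shift)
{
   uint32_t mask = ((1u << num_bits) - 1u) << shift;
   return (sgpr & ~mask) | ((new_count << shift) & mask);
}
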
/**
* Also return the position, which is passed to the shader as an input,
* so that we don't compute it twice.
*/
-void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
- unsigned max_outputs,
- LLVMValueRef *addrs)
+void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs,
+ LLVMValueRef *addrs)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader *shader = ctx->shader;
- struct si_shader_selector *sel = shader->selector;
- struct si_shader_info *info = &sel->info;
- LLVMBuilderRef builder = ctx->ac.builder;
-
- assert(shader->key.opt.ngg_culling);
- assert(shader->key.as_ngg);
- assert(sel->type == PIPE_SHADER_VERTEX ||
- (sel->type == PIPE_SHADER_TESS_EVAL && !shader->key.as_es));
-
- LLVMValueRef position[4] = {};
- for (unsigned i = 0; i < info->num_outputs; i++) {
- switch (info->output_semantic_name[i]) {
- case TGSI_SEMANTIC_POSITION:
- for (unsigned j = 0; j < 4; j++) {
- position[j] = LLVMBuildLoad(ctx->ac.builder,
- addrs[4 * i + j], "");
- }
- break;
- }
- }
- assert(position[0]);
-
- /* Store Position.XYZW into LDS. */
- LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
- for (unsigned chan = 0; chan < 4; chan++) {
- LLVMBuildStore(builder, ac_to_integer(&ctx->ac, position[chan]),
- ac_build_gep0(&ctx->ac, es_vtxptr,
- LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
- }
- /* Store Position.XY / W into LDS. */
- for (unsigned chan = 0; chan < 2; chan++) {
- LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
- LLVMBuildStore(builder, ac_to_integer(&ctx->ac, val),
- ac_build_gep0(&ctx->ac, es_vtxptr,
- LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0)));
- }
-
- /* Store VertexID and InstanceID. ES threads will have to load them
- * from LDS after vertex compaction and use them instead of their own
- * system values.
- */
- bool uses_instance_id = false;
- bool uses_tes_prim_id = false;
- LLVMValueRef packed_data = ctx->ac.i32_0;
-
- if (ctx->type == PIPE_SHADER_VERTEX) {
- uses_instance_id = sel->info.uses_instanceid ||
- shader->key.part.vs.prolog.instance_divisor_is_one ||
- shader->key.part.vs.prolog.instance_divisor_is_fetched;
-
- LLVMBuildStore(builder, ctx->abi.vertex_id,
- ac_build_gep0(&ctx->ac, es_vtxptr,
- LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)));
- if (uses_instance_id) {
- LLVMBuildStore(builder, ctx->abi.instance_id,
- ac_build_gep0(&ctx->ac, es_vtxptr,
- LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)));
- }
- } else {
- uses_tes_prim_id = sel->info.uses_primid ||
- shader->key.mono.u.vs_export_prim_id;
-
- assert(ctx->type == PIPE_SHADER_TESS_EVAL);
- LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)),
- ac_build_gep0(&ctx->ac, es_vtxptr,
- LLVMConstInt(ctx->ac.i32, lds_tes_u, 0)));
- LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)),
- ac_build_gep0(&ctx->ac, es_vtxptr,
- LLVMConstInt(ctx->ac.i32, lds_tes_v, 0)));
- packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id),
- LLVMConstInt(ctx->ac.i32, lds_byte2_tes_rel_patch_id * 8, 0), "");
- if (uses_tes_prim_id) {
- LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id),
- ac_build_gep0(&ctx->ac, es_vtxptr,
- LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)));
- }
- }
- /* Initialize the packed data. */
- LLVMBuildStore(builder, packed_data,
- ac_build_gep0(&ctx->ac, es_vtxptr,
- LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
- ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
-
- LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
-
- /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have less
- * than 4 waves, but we always read all 4 values. This is where the thread
- * bitmasks of unculled threads will be stored.
- *
- * gs_ngg_scratch layout: esmask[0..3]
- */
- ac_build_ifcc(&ctx->ac,
- LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx),
- LLVMConstInt(ctx->ac.i32, 3, 0), ""), 16101);
- {
- LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, "");
- LLVMBuildStore(builder, ctx->ac.i32_0,
- ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index));
- }
- ac_build_endif(&ctx->ac, 16101);
- ac_build_s_barrier(&ctx->ac);
-
- /* The hardware requires that there are no holes between unculled vertices,
- * which means we have to pack ES threads, i.e. reduce the ES thread count
- * and move ES input VGPRs to lower threads. The upside is that varyings
- * are only fetched and computed for unculled vertices.
- *
- * Vertex compaction in GS threads:
- *
- * Part 1: Compute the surviving vertex mask in GS threads:
- * - Compute 4 32-bit surviving vertex masks in LDS. (max 4 waves)
- * - In GS, notify ES threads whether the vertex survived.
- * - Barrier
- * - ES threads will create the mask and store it in LDS.
- * - Barrier
- * - Each GS thread loads the vertex masks from LDS.
- *
- * Part 2: Compact ES threads in GS threads:
- * - Compute the prefix sum for all 3 vertices from the masks. These are the new
- * thread IDs for each vertex within the primitive.
- * - Write the value of the old thread ID into the LDS address of the new thread ID.
- * The ES thread will load the old thread ID and use it to load the position, VertexID,
- * and InstanceID.
- * - Update vertex indices and null flag in the GS input VGPRs.
- * - Barrier
- *
- * Part 3: Update inputs GPRs
- * - For all waves, update per-wave thread counts in input SGPRs.
- * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
- */
-
- LLVMValueRef vtxindex[3];
- if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
- /* For the GS fast launch, the VS prologs simply puts the Vertex IDs
- * into these VGPRs.
- */
- vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
- vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset);
- vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset);
- } else {
- vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
- vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
- vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
- };
- LLVMValueRef gs_vtxptr[] = {
- ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
- ngg_nogs_vertex_ptr(ctx, vtxindex[1]),
- ngg_nogs_vertex_ptr(ctx, vtxindex[2]),
- };
- es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
-
- LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
-
- /* Do culling in GS threads. */
- ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002);
- {
- /* Load positions. */
- LLVMValueRef pos[3][4] = {};
- for (unsigned vtx = 0; vtx < 3; vtx++) {
- for (unsigned chan = 0; chan < 4; chan++) {
- unsigned index;
- if (chan == 0 || chan == 1)
- index = lds_pos_x_div_w + chan;
- else if (chan == 3)
- index = lds_pos_w;
- else
- continue;
-
- LLVMValueRef addr = ac_build_gep0(&ctx->ac, gs_vtxptr[vtx],
- LLVMConstInt(ctx->ac.i32, index, 0));
- pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
- pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
- }
- }
-
- /* Load the viewport state for small prim culling. */
- LLVMValueRef vp = ac_build_load_invariant(&ctx->ac,
- ac_get_arg(&ctx->ac, ctx->small_prim_cull_info),
- ctx->ac.i32_0);
- vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
- LLVMValueRef vp_scale[2], vp_translate[2];
- vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
- vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
- vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
- vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
-
- /* Get the small prim filter precision. */
- LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4);
- small_prim_precision = LLVMBuildOr(builder, small_prim_precision,
- LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
- small_prim_precision = LLVMBuildShl(builder, small_prim_precision,
- LLVMConstInt(ctx->ac.i32, 23, 0), "");
- small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");
-
- /* Execute culling code. */
- struct ac_cull_options options = {};
- options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
- options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
- options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS;
- options.cull_small_prims = options.cull_view_xy;
- options.cull_zero_area = options.cull_front || options.cull_back;
- options.cull_w = true;
-
- /* Tell ES threads whether their vertex survived. */
- ac_build_ifcc(&ctx->ac, ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true,
- vp_scale, vp_translate,
- small_prim_precision, &options), 16003);
- {
- LLVMBuildStore(builder, ctx->ac.i32_1, gs_accepted);
- for (unsigned vtx = 0; vtx < 3; vtx++) {
- LLVMBuildStore(builder, ctx->ac.i8_1,
- si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag));
- }
- }
- ac_build_endif(&ctx->ac, 16003);
- }
- ac_build_endif(&ctx->ac, 16002);
- ac_build_s_barrier(&ctx->ac);
-
- gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");
-
- LLVMValueRef es_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, "");
-
- /* Convert the per-vertex flag to a thread bitmask in ES threads and store it in LDS. */
- ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007);
- {
- LLVMValueRef es_accepted_flag =
- LLVMBuildLoad(builder,
- si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), "");
-
- LLVMValueRef es_accepted_bool = LLVMBuildICmp(builder, LLVMIntNE,
- es_accepted_flag, ctx->ac.i8_0, "");
- LLVMValueRef es_mask = ac_get_i1_sgpr_mask(&ctx->ac, es_accepted_bool);
-
- LLVMBuildStore(builder, es_accepted_bool, es_accepted);
-
- ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ,
- tid, ctx->ac.i32_0, ""), 16008);
- {
- LLVMBuildStore(builder, es_mask,
- ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
- get_wave_id_in_tg(ctx)));
- }
- ac_build_endif(&ctx->ac, 16008);
- }
- ac_build_endif(&ctx->ac, 16007);
- ac_build_s_barrier(&ctx->ac);
-
- /* Load the vertex masks and compute the new ES thread count. */
- LLVMValueRef es_mask[2], new_num_es_threads, kill_wave;
- load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads);
- new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL);
-
- /* ES threads compute their prefix sum, which is the new ES thread ID.
- * Then they write the value of the old thread ID into the LDS address
- * of the new thread ID. It will be used it to load input VGPRs from
- * the old thread's LDS location.
- */
- ac_build_ifcc(&ctx->ac, LLVMBuildLoad(builder, es_accepted, ""), 16009);
- {
- LLVMValueRef old_id = get_thread_id_in_tg(ctx);
- LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id);
-
- LLVMBuildStore(builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""),
- si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id),
- lds_byte0_old_thread_id));
- LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""),
- si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));
- }
- ac_build_endif(&ctx->ac, 16009);
-
- /* Kill waves that have inactive threads. */
- kill_wave = LLVMBuildICmp(builder, LLVMIntULE,
- ac_build_imax(&ctx->ac, new_num_es_threads, ngg_get_prim_cnt(ctx)),
- LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
- LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""), "");
- ac_build_ifcc(&ctx->ac, kill_wave, 19202);
- {
- /* If we are killing wave 0, send that there are no primitives
- * in this threadgroup.
- */
- ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
- ctx->ac.i32_0, ctx->ac.i32_0);
- ac_build_s_endpgm(&ctx->ac);
- }
- ac_build_endif(&ctx->ac, 19202);
- ac_build_s_barrier(&ctx->ac);
-
- /* Send the final vertex and primitive counts. */
- ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
- new_num_es_threads, ngg_get_prim_cnt(ctx));
-
- /* Update thread counts in SGPRs. */
- LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->gs_tg_info);
- LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->merged_wave_info);
-
- /* This also converts the thread count from the total count to the per-wave count. */
- update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12,
- &new_merged_wave_info, 8, 0);
-
- /* Update vertex indices in VGPR0 (same format as NGG passthrough). */
- LLVMValueRef new_vgpr0 = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-
- /* Set the null flag at the beginning (culled), and then
- * overwrite it for accepted primitives.
- */
- LLVMBuildStore(builder, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), new_vgpr0);
-
- /* Get vertex indices after vertex compaction. */
- ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011);
- {
- struct ac_ngg_prim prim = {};
- prim.num_vertices = 3;
- prim.isnull = ctx->ac.i1false;
-
- for (unsigned vtx = 0; vtx < 3; vtx++) {
- prim.index[vtx] =
- LLVMBuildLoad(builder,
- si_build_gep_i8(ctx, gs_vtxptr[vtx],
- lds_byte1_new_thread_id), "");
- prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, "");
- prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx);
- }
-
- /* Set the new GS input VGPR. */
- LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0);
- }
- ac_build_endif(&ctx->ac, 16011);
-
- if (gfx10_ngg_export_prim_early(shader))
- gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, ""));
-
- /* Set the new ES input VGPRs. */
- LLVMValueRef es_data[4];
- LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-
- for (unsigned i = 0; i < 4; i++)
- es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-
- ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid,
- new_num_es_threads, ""), 16012);
- {
- LLVMValueRef old_id, old_es_vtxptr, tmp;
-
- /* Load ES input VGPRs from the ES thread before compaction. */
- old_id = LLVMBuildLoad(builder,
- si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), "");
- old_id = LLVMBuildZExt(builder, old_id, ctx->ac.i32, "");
-
- LLVMBuildStore(builder, old_id, old_thread_id);
- old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id);
-
- for (unsigned i = 0; i < 2; i++) {
- tmp = LLVMBuildLoad(builder,
- ac_build_gep0(&ctx->ac, old_es_vtxptr,
- LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)), "");
- LLVMBuildStore(builder, tmp, es_data[i]);
- }
-
- if (ctx->type == PIPE_SHADER_TESS_EVAL) {
- tmp = LLVMBuildLoad(builder,
- si_build_gep_i8(ctx, old_es_vtxptr,
- lds_byte2_tes_rel_patch_id), "");
- tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
- LLVMBuildStore(builder, tmp, es_data[2]);
-
- if (uses_tes_prim_id) {
- tmp = LLVMBuildLoad(builder,
- ac_build_gep0(&ctx->ac, old_es_vtxptr,
- LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)), "");
- LLVMBuildStore(builder, tmp, es_data[3]);
- }
- }
- }
- ac_build_endif(&ctx->ac, 16012);
-
- /* Return values for the main function. */
- LLVMValueRef ret = ctx->return_value;
- LLVMValueRef val;
-
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, "");
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, "");
- if (ctx->type == PIPE_SHADER_TESS_EVAL)
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 4);
-
- ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
- 8 + SI_SGPR_RW_BUFFERS);
- ret = si_insert_input_ptr(ctx, ret,
- ctx->bindless_samplers_and_images,
- 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
- ret = si_insert_input_ptr(ctx, ret,
- ctx->const_and_shader_buffers,
- 8 + SI_SGPR_CONST_AND_SHADER_BUFFERS);
- ret = si_insert_input_ptr(ctx, ret,
- ctx->samplers_and_images,
- 8 + SI_SGPR_SAMPLERS_AND_IMAGES);
- ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits,
- 8 + SI_SGPR_VS_STATE_BITS);
-
- if (ctx->type == PIPE_SHADER_VERTEX) {
- ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex,
- 8 + SI_SGPR_BASE_VERTEX);
- ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance,
- 8 + SI_SGPR_START_INSTANCE);
- ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id,
- 8 + SI_SGPR_DRAWID);
- ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers,
- 8 + SI_VS_NUM_USER_SGPR);
-
- for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) {
- ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
- 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
- }
- } else {
- assert(ctx->type == PIPE_SHADER_TESS_EVAL);
- ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout,
- 8 + SI_SGPR_TES_OFFCHIP_LAYOUT);
- ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr,
- 8 + SI_SGPR_TES_OFFCHIP_ADDR);
- }
-
- unsigned vgpr;
- if (ctx->type == PIPE_SHADER_VERTEX) {
- if (shader->selector->num_vbos_in_user_sgprs) {
- vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
- shader->selector->num_vbos_in_user_sgprs * 4;
- } else {
- vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
- }
- } else {
- vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
- }
-
- val = LLVMBuildLoad(builder, new_vgpr0, "");
- ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
- vgpr++, "");
- vgpr++; /* gs_vtx23_offset */
-
- ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
- ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
- vgpr++; /* gs_vtx45_offset */
-
- if (ctx->type == PIPE_SHADER_VERTEX) {
- val = LLVMBuildLoad(builder, es_data[0], "");
- ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
- vgpr++, ""); /* VGPR5 - VertexID */
- vgpr += 2;
- if (uses_instance_id) {
- val = LLVMBuildLoad(builder, es_data[1], "");
- ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
- vgpr++, ""); /* VGPR8 - InstanceID */
- } else {
- vgpr++;
- }
- } else {
- assert(ctx->type == PIPE_SHADER_TESS_EVAL);
- unsigned num_vgprs = uses_tes_prim_id ? 4 : 3;
- for (unsigned i = 0; i < num_vgprs; i++) {
- val = LLVMBuildLoad(builder, es_data[i], "");
- ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val),
- vgpr++, "");
- }
- if (num_vgprs == 3)
- vgpr++;
- }
- /* Return the old thread ID. */
- val = LLVMBuildLoad(builder, old_thread_id, "");
- ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
-
- /* These two also use LDS. */
- if (sel->info.writes_edgeflag ||
- (ctx->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
- ac_build_s_barrier(&ctx->ac);
-
- ctx->return_value = ret;
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader *shader = ctx->shader;
+ struct si_shader_selector *sel = shader->selector;
+ struct si_shader_info *info = &sel->info;
+ LLVMBuilderRef builder = ctx->ac.builder;
+
+ assert(shader->key.opt.ngg_culling);
+ assert(shader->key.as_ngg);
+ assert(sel->type == PIPE_SHADER_VERTEX ||
+ (sel->type == PIPE_SHADER_TESS_EVAL && !shader->key.as_es));
+
+ LLVMValueRef position[4] = {};
+ for (unsigned i = 0; i < info->num_outputs; i++) {
+ switch (info->output_semantic_name[i]) {
+ case TGSI_SEMANTIC_POSITION:
+ for (unsigned j = 0; j < 4; j++) {
+ position[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
+ }
+ break;
+ }
+ }
+ assert(position[0]);
+
+ /* Store Position.XYZW into LDS. */
+ LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
+ for (unsigned chan = 0; chan < 4; chan++) {
+ LLVMBuildStore(
+ builder, ac_to_integer(&ctx->ac, position[chan]),
+ ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
+ }
+ /* Store Position.XY / W into LDS. */
+ for (unsigned chan = 0; chan < 2; chan++) {
+ LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
+ LLVMBuildStore(
+ builder, ac_to_integer(&ctx->ac, val),
+ ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0)));
+ }
+
+ /* Store VertexID and InstanceID. ES threads will have to load them
+ * from LDS after vertex compaction and use them instead of their own
+ * system values.
+ */
+ bool uses_instance_id = false;
+ bool uses_tes_prim_id = false;
+ LLVMValueRef packed_data = ctx->ac.i32_0;
+
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ uses_instance_id = sel->info.uses_instanceid ||
+ shader->key.part.vs.prolog.instance_divisor_is_one ||
+ shader->key.part.vs.prolog.instance_divisor_is_fetched;
+
+ LLVMBuildStore(
+ builder, ctx->abi.vertex_id,
+ ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)));
+ if (uses_instance_id) {
+ LLVMBuildStore(
+ builder, ctx->abi.instance_id,
+ ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)));
+ }
+ } else {
+ uses_tes_prim_id = sel->info.uses_primid || shader->key.mono.u.vs_export_prim_id;
+
+ assert(ctx->type == PIPE_SHADER_TESS_EVAL);
+ LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)),
+ ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_u, 0)));
+ LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)),
+ ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_v, 0)));
+ packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id),
+ LLVMConstInt(ctx->ac.i32, lds_byte2_tes_rel_patch_id * 8, 0), "");
+ if (uses_tes_prim_id) {
+ LLVMBuildStore(
+ builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id),
+ ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)));
+ }
+ }
+ /* Initialize the packed data. */
+ LLVMBuildStore(
+ builder, packed_data,
+ ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
+ ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+
+ LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
+
+ /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have fewer
+ * than 4 waves, but we always read all 4 values. This is where the thread
+ * bitmasks of unculled threads will be stored.
+ *
+ * gs_ngg_scratch layout: esmask[0..3]
+ */
+ ac_build_ifcc(&ctx->ac,
+ LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx),
+ LLVMConstInt(ctx->ac.i32, 3, 0), ""),
+ 16101);
+ {
+ LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, "");
+ LLVMBuildStore(builder, ctx->ac.i32_0, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index));
+ }
+ ac_build_endif(&ctx->ac, 16101);
+ ac_build_s_barrier(&ctx->ac);
+
+ /* The hardware requires that there are no holes between unculled vertices,
+ * which means we have to pack ES threads, i.e. reduce the ES thread count
+ * and move ES input VGPRs to lower threads. The upside is that varyings
+ * are only fetched and computed for unculled vertices.
+ *
+ * Vertex compaction in GS threads:
+ *
+ * Part 1: Compute the surviving vertex mask in GS threads:
+ * - Compute 4 32-bit surviving vertex masks in LDS. (max 4 waves)
+ * - In GS, notify ES threads whether the vertex survived.
+ * - Barrier
+ * - ES threads will create the mask and store it in LDS.
+ * - Barrier
+ * - Each GS thread loads the vertex masks from LDS.
+ *
+ * Part 2: Compact ES threads in GS threads:
+ * - Compute the prefix sum for all 3 vertices from the masks. These are the new
+ * thread IDs for each vertex within the primitive.
+ * - Write the value of the old thread ID into the LDS address of the new thread ID.
+ * The ES thread will load the old thread ID and use it to load the position, VertexID,
+ * and InstanceID.
+ * - Update vertex indices and null flag in the GS input VGPRs.
+ * - Barrier
+ *
+ * Part 3: Update input GPRs
+ * - For all waves, update per-wave thread counts in input SGPRs.
+ * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
+ */
+
+ LLVMValueRef vtxindex[3];
+ if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
+ /* For the GS fast launch, the VS prolog simply puts the Vertex IDs
+ * into these VGPRs.
+ */
+ vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
+ vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset);
+ vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset);
+ } else {
+ vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
+ vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
+ vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
+ }
+ LLVMValueRef gs_vtxptr[] = {
+ ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
+ ngg_nogs_vertex_ptr(ctx, vtxindex[1]),
+ ngg_nogs_vertex_ptr(ctx, vtxindex[2]),
+ };
+ es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
+
+ LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
+
+ /* Do culling in GS threads. */
+ ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002);
+ {
+ /* Load positions. */
+ LLVMValueRef pos[3][4] = {};
+ for (unsigned vtx = 0; vtx < 3; vtx++) {
+ for (unsigned chan = 0; chan < 4; chan++) {
+ unsigned index;
+ if (chan == 0 || chan == 1)
+ index = lds_pos_x_div_w + chan;
+ else if (chan == 3)
+ index = lds_pos_w;
+ else
+ continue;
+
+ LLVMValueRef addr =
+ ac_build_gep0(&ctx->ac, gs_vtxptr[vtx], LLVMConstInt(ctx->ac.i32, index, 0));
+ pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
+ pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
+ }
+ }
+
+ /* Load the viewport state for small prim culling. */
+ LLVMValueRef vp = ac_build_load_invariant(
+ &ctx->ac, ac_get_arg(&ctx->ac, ctx->small_prim_cull_info), ctx->ac.i32_0);
+ vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
+ LLVMValueRef vp_scale[2], vp_translate[2];
+ vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
+ vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
+ vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
+ vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
+
+ /* Get the small prim filter precision. */
+ LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4);
+ small_prim_precision =
+ LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
+ small_prim_precision =
+ LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), "");
+ small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");
+
+ /* Execute culling code. */
+ struct ac_cull_options options = {};
+ options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
+ options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
+ options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS;
+ options.cull_small_prims = options.cull_view_xy;
+ options.cull_zero_area = options.cull_front || options.cull_back;
+ options.cull_w = true;
+
+ /* Tell ES threads whether their vertex survived. */
+ ac_build_ifcc(&ctx->ac,
+ ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true, vp_scale, vp_translate,
+ small_prim_precision, &options),
+ 16003);
+ {
+ LLVMBuildStore(builder, ctx->ac.i32_1, gs_accepted);
+ for (unsigned vtx = 0; vtx < 3; vtx++) {
+ LLVMBuildStore(builder, ctx->ac.i8_1,
+ si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag));
+ }
+ }
+ ac_build_endif(&ctx->ac, 16003);
+ }
+ ac_build_endif(&ctx->ac, 16002);
+ ac_build_s_barrier(&ctx->ac);
+
+ gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");
+
+ LLVMValueRef es_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, "");
+
+ /* Convert the per-vertex flag to a thread bitmask in ES threads and store it in LDS. */
+ ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007);
+ {
+ LLVMValueRef es_accepted_flag =
+ LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), "");
+
+ LLVMValueRef es_accepted_bool =
+ LLVMBuildICmp(builder, LLVMIntNE, es_accepted_flag, ctx->ac.i8_0, "");
+ LLVMValueRef es_mask = ac_get_i1_sgpr_mask(&ctx->ac, es_accepted_bool);
+
+ LLVMBuildStore(builder, es_accepted_bool, es_accepted);
+
+ ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, ctx->ac.i32_0, ""), 16008);
+ {
+ LLVMBuildStore(builder, es_mask,
+ ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, get_wave_id_in_tg(ctx)));
+ }
+ ac_build_endif(&ctx->ac, 16008);
+ }
+ ac_build_endif(&ctx->ac, 16007);
+ ac_build_s_barrier(&ctx->ac);
+
+ /* Load the vertex masks and compute the new ES thread count. */
+ LLVMValueRef es_mask[2], new_num_es_threads, kill_wave;
+ load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads);
+ new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL);
+
+ /* ES threads compute their prefix sum, which is the new ES thread ID.
+ * Then they write the value of the old thread ID into the LDS address
+ * of the new thread ID. It will be used to load input VGPRs from
+ * the old thread's LDS location.
+ */
+ ac_build_ifcc(&ctx->ac, LLVMBuildLoad(builder, es_accepted, ""), 16009);
+ {
+ LLVMValueRef old_id = get_thread_id_in_tg(ctx);
+ LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id);
+
+ LLVMBuildStore(
+ builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""),
+ si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id), lds_byte0_old_thread_id));
+ LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""),
+ si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));
+ }
+ ac_build_endif(&ctx->ac, 16009);
+
+ /* Kill waves that have inactive threads. */
+ kill_wave = LLVMBuildICmp(builder, LLVMIntULE,
+ ac_build_imax(&ctx->ac, new_num_es_threads, ngg_get_prim_cnt(ctx)),
+ LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
+ LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""),
+ "");
+ ac_build_ifcc(&ctx->ac, kill_wave, 19202);
+ {
+ /* If we are killing wave 0, send the message that there are no primitives
+ * in this threadgroup.
+ */
+ ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ctx->ac.i32_0);
+ ac_build_s_endpgm(&ctx->ac);
+ }
+ ac_build_endif(&ctx->ac, 19202);
+ ac_build_s_barrier(&ctx->ac);
+
+ /* Send the final vertex and primitive counts. */
+ ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), new_num_es_threads,
+ ngg_get_prim_cnt(ctx));
+
+ /* Update thread counts in SGPRs. */
+ LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->gs_tg_info);
+ LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->merged_wave_info);
+
+ /* This also converts the thread count from the total count to the per-wave count. */
+ update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12, &new_merged_wave_info, 8,
+ 0);
+
+ /* Update vertex indices in VGPR0 (same format as NGG passthrough). */
+ LLVMValueRef new_vgpr0 = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+
+ /* Set the null flag at the beginning (culled), and then
+ * overwrite it for accepted primitives.
+ */
+ LLVMBuildStore(builder, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), new_vgpr0);
+
+ /* Get vertex indices after vertex compaction. */
+ ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011);
+ {
+ struct ac_ngg_prim prim = {};
+ prim.num_vertices = 3;
+ prim.isnull = ctx->ac.i1false;
+
+ for (unsigned vtx = 0; vtx < 3; vtx++) {
+ prim.index[vtx] = LLVMBuildLoad(
+ builder, si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte1_new_thread_id), "");
+ prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, "");
+ prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx);
+ }
+
+ /* Set the new GS input VGPR. */
+ LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0);
+ }
+ ac_build_endif(&ctx->ac, 16011);
+
+ if (gfx10_ngg_export_prim_early(shader))
+ gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, ""));
+
+ /* Set the new ES input VGPRs. */
+ LLVMValueRef es_data[4];
+ LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+
+ for (unsigned i = 0; i < 4; i++)
+ es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+
+ ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, new_num_es_threads, ""),
+ 16012);
+ {
+ LLVMValueRef old_id, old_es_vtxptr, tmp;
+
+ /* Load ES input VGPRs from the ES thread before compaction. */
+ old_id = LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), "");
+ old_id = LLVMBuildZExt(builder, old_id, ctx->ac.i32, "");
+
+ LLVMBuildStore(builder, old_id, old_thread_id);
+ old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id);
+
+ for (unsigned i = 0; i < 2; i++) {
+ tmp = LLVMBuildLoad(
+ builder,
+ ac_build_gep0(&ctx->ac, old_es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)),
+ "");
+ LLVMBuildStore(builder, tmp, es_data[i]);
+ }
+
+ if (ctx->type == PIPE_SHADER_TESS_EVAL) {
+ tmp = LLVMBuildLoad(builder,
+ si_build_gep_i8(ctx, old_es_vtxptr, lds_byte2_tes_rel_patch_id), "");
+ tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
+ LLVMBuildStore(builder, tmp, es_data[2]);
+
+ if (uses_tes_prim_id) {
+ tmp = LLVMBuildLoad(builder,
+ ac_build_gep0(&ctx->ac, old_es_vtxptr,
+ LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)),
+ "");
+ LLVMBuildStore(builder, tmp, es_data[3]);
+ }
+ }
+ }
+ ac_build_endif(&ctx->ac, 16012);
+
+ /* Return values for the main function. */
+ LLVMValueRef ret = ctx->return_value;
+ LLVMValueRef val;
+
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, "");
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, "");
+ if (ctx->type == PIPE_SHADER_TESS_EVAL)
+ ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 4);
+
+ ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
+ ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
+ 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
+ ret = si_insert_input_ptr(ctx, ret, ctx->const_and_shader_buffers,
+ 8 + SI_SGPR_CONST_AND_SHADER_BUFFERS);
+ ret = si_insert_input_ptr(ctx, ret, ctx->samplers_and_images, 8 + SI_SGPR_SAMPLERS_AND_IMAGES);
+ ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
+
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, 8 + SI_SGPR_BASE_VERTEX);
+ ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance, 8 + SI_SGPR_START_INSTANCE);
+ ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id, 8 + SI_SGPR_DRAWID);
+ ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers, 8 + SI_VS_NUM_USER_SGPR);
+
+ for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) {
+ ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
+ 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
+ }
+ } else {
+ assert(ctx->type == PIPE_SHADER_TESS_EVAL);
+ ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout, 8 + SI_SGPR_TES_OFFCHIP_LAYOUT);
+ ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr, 8 + SI_SGPR_TES_OFFCHIP_ADDR);
+ }
+
+ unsigned vgpr;
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ if (shader->selector->num_vbos_in_user_sgprs) {
+ vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
+ } else {
+ vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
+ }
+ } else {
+ vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
+ }
+
+ val = LLVMBuildLoad(builder, new_vgpr0, "");
+ ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
+ vgpr++; /* gs_vtx23_offset */
+
+ ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
+ ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
+ vgpr++; /* gs_vtx45_offset */
+
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ val = LLVMBuildLoad(builder, es_data[0], "");
+ ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
+ ""); /* VGPR5 - VertexID */
+ vgpr += 2;
+ if (uses_instance_id) {
+ val = LLVMBuildLoad(builder, es_data[1], "");
+ ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
+ ""); /* VGPR8 - InstanceID */
+ } else {
+ vgpr++;
+ }
+ } else {
+ assert(ctx->type == PIPE_SHADER_TESS_EVAL);
+ unsigned num_vgprs = uses_tes_prim_id ? 4 : 3;
+ for (unsigned i = 0; i < num_vgprs; i++) {
+ val = LLVMBuildLoad(builder, es_data[i], "");
+ ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
+ }
+ if (num_vgprs == 3)
+ vgpr++;
+ }
+ /* Return the old thread ID. */
+ val = LLVMBuildLoad(builder, old_thread_id, "");
+ ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
+
+ /* These two also use LDS. */
+ if (sel->info.writes_edgeflag ||
+ (ctx->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
+ ac_build_s_barrier(&ctx->ac);
+
+ ctx->return_value = ret;
}
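
One non-obvious detail in the culling path above is how the small-prim filter precision becomes a float: the 4-bit value unpacked from vs_state_bits is OR'd with 0x70, shifted into the exponent field and bitcast, which yields an exact power of two in [2^-15, 1] without any int-to-float conversion. A stand-alone sketch of that construction (illustration only, helper name is made up):

#include <stdint.h>
#include <string.h>

static float small_prim_precision_from_bits(uint32_t four_bits)
{
   uint32_t exponent = (four_bits & 0xF) | 0x70; /* biased exponent 112..127 */
   uint32_t ieee = exponent << 23;               /* sign = 0, mantissa = 0 */
   float f;
   memcpy(&f, &ieee, sizeof(f));                 /* the "bitcast" to f32 */
   return f;                                     /* 2^(exponent - 127), i.e. 2^-15 .. 2^0 */
}
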
/**
* Emit the epilogue of an API VS or TES shader compiled as ESGS shader.
*/
-void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
- unsigned max_outputs,
- LLVMValueRef *addrs)
+void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader_selector *sel = ctx->shader->selector;
- struct si_shader_info *info = &sel->info;
- struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef tmp, tmp2;
-
- assert(!ctx->shader->is_gs_copy_shader);
- assert(info->num_outputs <= max_outputs);
-
- LLVMValueRef vertex_ptr = NULL;
-
- if (sel->so.num_outputs || sel->info.writes_edgeflag)
- vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
-
- for (unsigned i = 0; i < info->num_outputs; i++) {
- outputs[i].semantic_name = info->output_semantic_name[i];
- outputs[i].semantic_index = info->output_semantic_index[i];
-
- for (unsigned j = 0; j < 4; j++) {
- outputs[i].vertex_stream[j] =
- (info->output_streams[i] >> (2 * j)) & 3;
-
- /* TODO: we may store more outputs than streamout needs,
- * but streamout performance isn't that important.
- */
- if (sel->so.num_outputs) {
- tmp = ac_build_gep0(&ctx->ac, vertex_ptr,
- LLVMConstInt(ctx->ac.i32, 4 * i + j, false));
- tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], "");
- tmp2 = ac_to_integer(&ctx->ac, tmp2);
- LLVMBuildStore(builder, tmp2, tmp);
- }
- }
-
- /* Store the edgeflag at the end (if streamout is enabled) */
- if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG &&
- sel->info.writes_edgeflag) {
- LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], "");
- /* The output is a float, but the hw expects a 1-bit integer. */
- edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, "");
- edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1);
-
- tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
- tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
- LLVMBuildStore(builder, edgeflag, tmp);
- }
- }
-
- bool unterminated_es_if_block =
- !sel->so.num_outputs &&
- !sel->info.writes_edgeflag &&
- !ctx->screen->use_ngg_streamout && /* no query buffer */
- (ctx->type != PIPE_SHADER_VERTEX ||
- !ctx->shader->key.mono.u.vs_export_prim_id);
-
- if (!unterminated_es_if_block)
- ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
-
- LLVMValueRef is_gs_thread = si_is_gs_thread(ctx);
- LLVMValueRef is_es_thread = si_is_es_thread(ctx);
- LLVMValueRef vtxindex[3];
-
- if (ctx->shader->key.opt.ngg_culling) {
- vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9);
- vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9);
- vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9);
- } else {
- vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
- vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
- vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
- }
-
- /* Determine the number of vertices per primitive. */
- unsigned num_vertices;
- LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices);
-
- /* Streamout */
- LLVMValueRef emitted_prims = NULL;
-
- if (sel->so.num_outputs) {
- assert(!unterminated_es_if_block);
-
- struct ngg_streamout nggso = {};
- nggso.num_vertices = num_vertices_val;
- nggso.prim_enable[0] = is_gs_thread;
-
- for (unsigned i = 0; i < num_vertices; ++i)
- nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
-
- build_streamout(ctx, &nggso);
- emitted_prims = nggso.emit[0];
- }
-
- LLVMValueRef user_edgeflags[3] = {};
-
- if (sel->info.writes_edgeflag) {
- assert(!unterminated_es_if_block);
-
- /* Streamout already inserted the barrier, so don't insert it again. */
- if (!sel->so.num_outputs)
- ac_build_s_barrier(&ctx->ac);
-
- ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
- /* Load edge flags from ES threads and store them into VGPRs in GS threads. */
- for (unsigned i = 0; i < num_vertices; i++) {
- tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
- tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
- tmp = ac_build_gep0(&ctx->ac, tmp, tmp2);
- tmp = LLVMBuildLoad(builder, tmp, "");
- tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
-
- user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, "");
- LLVMBuildStore(builder, tmp, user_edgeflags[i]);
- }
- ac_build_endif(&ctx->ac, 5400);
- }
-
- /* Copy Primitive IDs from GS threads to the LDS address corresponding
- * to the ES thread of the provoking vertex.
- */
- if (ctx->type == PIPE_SHADER_VERTEX &&
- ctx->shader->key.mono.u.vs_export_prim_id) {
- assert(!unterminated_es_if_block);
-
- /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
- if (sel->so.num_outputs || sel->info.writes_edgeflag)
- ac_build_s_barrier(&ctx->ac);
-
- ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
- /* Extract the PROVOKING_VTX_INDEX field. */
- LLVMValueRef provoking_vtx_in_prim =
- si_unpack_param(ctx, ctx->vs_state_bits, 4, 2);
-
- /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
- LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
- LLVMValueRef provoking_vtx_index =
- LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
- LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index);
-
- LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id),
- ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0));
- ac_build_endif(&ctx->ac, 5400);
- }
-
- /* Update query buffer */
- if (ctx->screen->use_ngg_streamout &&
- !info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
- assert(!unterminated_es_if_block);
-
- tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
- tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
- ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */
- tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
- ac_build_ifcc(&ctx->ac, tmp, 5030);
- tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac),
- sel->so.num_outputs ? ctx->ac.i32_1 : ctx->ac.i32_0, "");
- ac_build_ifcc(&ctx->ac, tmp, 5031);
- {
- LLVMValueRef args[] = {
- ngg_get_prim_cnt(ctx),
- ngg_get_query_buf(ctx),
- LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */
- ctx->ac.i32_0, /* soffset */
- ctx->ac.i32_0, /* cachepolicy */
- };
-
- if (sel->so.num_outputs) {
- args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1);
- args[2] = ac_build_writelane(&ctx->ac, args[2],
- LLVMConstInt(ctx->ac.i32, 24, false), ctx->ac.i32_1);
- }
-
- /* TODO: should this be 64-bit atomics? */
- ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
- ctx->ac.i32, args, 5, 0);
- }
- ac_build_endif(&ctx->ac, 5031);
- ac_build_endif(&ctx->ac, 5030);
- ac_build_endif(&ctx->ac, 5029);
- }
-
- /* Build the primitive export. */
- if (!gfx10_ngg_export_prim_early(ctx->shader)) {
- assert(!unterminated_es_if_block);
- gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL);
- }
-
- /* Export per-vertex data (positions and parameters). */
- if (!unterminated_es_if_block)
- ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
- {
- unsigned i;
-
- /* Unconditionally (re-)load the values for proper SSA form. */
- for (i = 0; i < info->num_outputs; i++) {
- /* If the NGG cull shader part computed the position, don't
- * use the position from the current shader part. Instead,
- * load it from LDS.
- */
- if (info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
- ctx->shader->key.opt.ngg_culling) {
- vertex_ptr = ngg_nogs_vertex_ptr(ctx,
- ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id));
-
- for (unsigned j = 0; j < 4; j++) {
- tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0);
- tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
- tmp = LLVMBuildLoad(builder, tmp, "");
- outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
- }
- } else {
- for (unsigned j = 0; j < 4; j++) {
- outputs[i].values[j] =
- LLVMBuildLoad(builder,
- addrs[4 * i + j], "");
- }
- }
- }
-
- if (ctx->shader->key.mono.u.vs_export_prim_id) {
- outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
- outputs[i].semantic_index = 0;
-
- if (ctx->type == PIPE_SHADER_VERTEX) {
- /* Wait for GS stores to finish. */
- ac_build_s_barrier(&ctx->ac);
-
- tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
- tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
- outputs[i].values[0] = LLVMBuildLoad(builder, tmp, "");
- } else {
- assert(ctx->type == PIPE_SHADER_TESS_EVAL);
- outputs[i].values[0] = si_get_primitive_id(ctx, 0);
- }
-
- outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]);
- for (unsigned j = 1; j < 4; j++)
- outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32);
-
- memset(outputs[i].vertex_stream, 0,
- sizeof(outputs[i].vertex_stream));
- i++;
- }
-
- si_llvm_build_vs_exports(ctx, outputs, i);
- }
- ac_build_endif(&ctx->ac, 6002);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader_selector *sel = ctx->shader->selector;
+ struct si_shader_info *info = &sel->info;
+ struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef tmp, tmp2;
+
+ assert(!ctx->shader->is_gs_copy_shader);
+ assert(info->num_outputs <= max_outputs);
+
+ LLVMValueRef vertex_ptr = NULL;
+
+ if (sel->so.num_outputs || sel->info.writes_edgeflag)
+ vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
+
+ for (unsigned i = 0; i < info->num_outputs; i++) {
+ outputs[i].semantic_name = info->output_semantic_name[i];
+ outputs[i].semantic_index = info->output_semantic_index[i];
+
+ for (unsigned j = 0; j < 4; j++) {
+ outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
+
+ /* TODO: we may store more outputs than streamout needs,
+ * but streamout performance isn't that important.
+ */
+ if (sel->so.num_outputs) {
+ tmp = ac_build_gep0(&ctx->ac, vertex_ptr, LLVMConstInt(ctx->ac.i32, 4 * i + j, false));
+ tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], "");
+ tmp2 = ac_to_integer(&ctx->ac, tmp2);
+ LLVMBuildStore(builder, tmp2, tmp);
+ }
+ }
+
+ /* Store the edgeflag at the end (if streamout is enabled) */
+ if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG && sel->info.writes_edgeflag) {
+ LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], "");
+ /* The output is a float, but the hw expects a 1-bit integer. */
+ edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, "");
+ edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1);
+
+ tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
+ tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
+ LLVMBuildStore(builder, edgeflag, tmp);
+ }
+ }
+
+ bool unterminated_es_if_block =
+ !sel->so.num_outputs && !sel->info.writes_edgeflag &&
+ !ctx->screen->use_ngg_streamout && /* no query buffer */
+ (ctx->type != PIPE_SHADER_VERTEX || !ctx->shader->key.mono.u.vs_export_prim_id);
+
+ if (!unterminated_es_if_block)
+ ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+
+ LLVMValueRef is_gs_thread = si_is_gs_thread(ctx);
+ LLVMValueRef is_es_thread = si_is_es_thread(ctx);
+ LLVMValueRef vtxindex[3];
+
+ if (ctx->shader->key.opt.ngg_culling) {
+ vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9);
+ vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9);
+ vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9);
+ } else {
+ vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
+ vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
+ vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
+ }
+
+ /* Determine the number of vertices per primitive. */
+ unsigned num_vertices;
+ LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices);
+
+ /* Streamout */
+ LLVMValueRef emitted_prims = NULL;
+
+ if (sel->so.num_outputs) {
+ assert(!unterminated_es_if_block);
+
+ struct ngg_streamout nggso = {};
+ nggso.num_vertices = num_vertices_val;
+ nggso.prim_enable[0] = is_gs_thread;
+
+ for (unsigned i = 0; i < num_vertices; ++i)
+ nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
+
+ build_streamout(ctx, &nggso);
+ emitted_prims = nggso.emit[0];
+ }
+
+ LLVMValueRef user_edgeflags[3] = {};
+
+ if (sel->info.writes_edgeflag) {
+ assert(!unterminated_es_if_block);
+
+ /* Streamout already inserted the barrier, so don't insert it again. */
+ if (!sel->so.num_outputs)
+ ac_build_s_barrier(&ctx->ac);
+
+ ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
+ /* Load edge flags from ES threads and store them into VGPRs in GS threads. */
+ for (unsigned i = 0; i < num_vertices; i++) {
+ tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
+ tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
+ tmp = ac_build_gep0(&ctx->ac, tmp, tmp2);
+ tmp = LLVMBuildLoad(builder, tmp, "");
+ tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+
+ user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, "");
+ LLVMBuildStore(builder, tmp, user_edgeflags[i]);
+ }
+ ac_build_endif(&ctx->ac, 5400);
+ }
+
+ /* Copy Primitive IDs from GS threads to the LDS address corresponding
+ * to the ES thread of the provoking vertex.
+ */
+ if (ctx->type == PIPE_SHADER_VERTEX && ctx->shader->key.mono.u.vs_export_prim_id) {
+ assert(!unterminated_es_if_block);
+
+ /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
+ if (sel->so.num_outputs || sel->info.writes_edgeflag)
+ ac_build_s_barrier(&ctx->ac);
+
+ ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
+ /* Extract the PROVOKING_VTX_INDEX field. */
+ LLVMValueRef provoking_vtx_in_prim = si_unpack_param(ctx, ctx->vs_state_bits, 4, 2);
+
+ /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
+ LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
+ LLVMValueRef provoking_vtx_index =
+ LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
+ LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index);
+
+ LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id),
+ ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0));
+ ac_build_endif(&ctx->ac, 5400);
+ }
+
+ /* Update query buffer */
+ if (ctx->screen->use_ngg_streamout && !info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
+ assert(!unterminated_es_if_block);
+
+ tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
+ tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */
+ tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5030);
+ tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac),
+ sel->so.num_outputs ? ctx->ac.i32_1 : ctx->ac.i32_0, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5031);
+ {
+ LLVMValueRef args[] = {
+ ngg_get_prim_cnt(ctx),
+ ngg_get_query_buf(ctx),
+ LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */
+ ctx->ac.i32_0, /* soffset */
+ ctx->ac.i32_0, /* cachepolicy */
+ };
+
+ if (sel->so.num_outputs) {
+ args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1);
+ args[2] = ac_build_writelane(&ctx->ac, args[2], LLVMConstInt(ctx->ac.i32, 24, false),
+ ctx->ac.i32_1);
+ }
+
+ /* TODO: should this be 64-bit atomics? */
+ ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
+ 0);
+ }
+ ac_build_endif(&ctx->ac, 5031);
+ ac_build_endif(&ctx->ac, 5030);
+ ac_build_endif(&ctx->ac, 5029);
+ }
+
+ /* Build the primitive export. */
+ if (!gfx10_ngg_export_prim_early(ctx->shader)) {
+ assert(!unterminated_es_if_block);
+ gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL);
+ }
+
+ /* Export per-vertex data (positions and parameters). */
+ if (!unterminated_es_if_block)
+ ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
+ {
+ unsigned i;
+
+ /* Unconditionally (re-)load the values for proper SSA form. */
+ for (i = 0; i < info->num_outputs; i++) {
+ /* If the NGG cull shader part computed the position, don't
+ * use the position from the current shader part. Instead,
+ * load it from LDS.
+ */
+ if (info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
+ ctx->shader->key.opt.ngg_culling) {
+ vertex_ptr = ngg_nogs_vertex_ptr(ctx, ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id));
+
+ for (unsigned j = 0; j < 4; j++) {
+ tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0);
+ tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
+ tmp = LLVMBuildLoad(builder, tmp, "");
+ outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
+ }
+ } else {
+ for (unsigned j = 0; j < 4; j++) {
+ outputs[i].values[j] = LLVMBuildLoad(builder, addrs[4 * i + j], "");
+ }
+ }
+ }
+
+ if (ctx->shader->key.mono.u.vs_export_prim_id) {
+ outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
+ outputs[i].semantic_index = 0;
+
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ /* Wait for GS stores to finish. */
+ ac_build_s_barrier(&ctx->ac);
+
+ tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
+ tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
+ outputs[i].values[0] = LLVMBuildLoad(builder, tmp, "");
+ } else {
+ assert(ctx->type == PIPE_SHADER_TESS_EVAL);
+ outputs[i].values[0] = si_get_primitive_id(ctx, 0);
+ }
+
+ outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]);
+ for (unsigned j = 1; j < 4; j++)
+ outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32);
+
+ memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream));
+ i++;
+ }
+
+ si_llvm_build_vs_exports(ctx, outputs, i);
+ }
+ ac_build_endif(&ctx->ac, 6002);
}
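Editor's note: purely as a reading aid (not part of this patch), here is a scalar sketch of how the three vertex indices are unpacked above. The bit positions mirror the si_unpack_param calls in the hunk; the helper name and the standalone program are ours.

#include <stdint.h>
#include <stdio.h>

/* Mirror of the si_unpack_param calls: with NGG culling the three indices are
 * 9-bit fields at bits 0/10/20 of one register, otherwise 16-bit fields split
 * across gs_vtx01_offset and gs_vtx23_offset. */
static void unpack_vtx_indices(uint32_t gs_vtx01, uint32_t gs_vtx23, int ngg_culling,
                               uint32_t vtx[3])
{
   if (ngg_culling) {
      vtx[0] = (gs_vtx01 >> 0) & 0x1ff;
      vtx[1] = (gs_vtx01 >> 10) & 0x1ff;
      vtx[2] = (gs_vtx01 >> 20) & 0x1ff;
   } else {
      vtx[0] = gs_vtx01 & 0xffff;
      vtx[1] = gs_vtx01 >> 16;
      vtx[2] = gs_vtx23 & 0xffff;
   }
}

int main(void)
{
   uint32_t vtx[3];
   unpack_vtx_indices(0x00020001, 0x00000003, 0, vtx); /* no culling: prints 1 2 3 */
   printf("%u %u %u\n", vtx[0], vtx[1], vtx[2]);
   return 0;
}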
-static LLVMValueRef
-ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
+static LLVMValueRef ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
{
- const struct si_shader_selector *sel = ctx->shader->selector;
- const struct si_shader_info *info = &sel->info;
-
- LLVMTypeRef elements[2] = {
- LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs),
- LLVMArrayType(ctx->ac.i8, 4),
- };
- LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
- type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
- return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
+ const struct si_shader_selector *sel = ctx->shader->selector;
+ const struct si_shader_info *info = &sel->info;
+
+ LLVMTypeRef elements[2] = {
+ LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs),
+ LLVMArrayType(ctx->ac.i8, 4),
+ };
+ LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
+ type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
+ return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
}
/**
 * Return a pointer to the LDS storage reserved for the output vertex with
 * index \p vertexidx (in emit order; indices are swizzled in groups of 32).
 *
 * \return an LDS pointer to type {[N x i32], [4 x i8]}
 */
-static LLVMValueRef
-ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx)
+static LLVMValueRef ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx)
{
- struct si_shader_selector *sel = ctx->shader->selector;
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
-
- /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
- unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1;
- if (write_stride_2exp) {
- LLVMValueRef row =
- LLVMBuildLShr(builder, vertexidx,
- LLVMConstInt(ctx->ac.i32, 5, false), "");
- LLVMValueRef swizzle =
- LLVMBuildAnd(builder, row,
- LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1,
- false), "");
- vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
- }
-
- return ac_build_gep0(&ctx->ac, storage, vertexidx);
+ struct si_shader_selector *sel = ctx->shader->selector;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
+
+ /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
+ unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1;
+ if (write_stride_2exp) {
+ LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), "");
+ LLVMValueRef swizzle = LLVMBuildAnd(
+ builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), "");
+ vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
+ }
+
+ return ac_build_gep0(&ctx->ac, storage, vertexidx);
}
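Editor's note: the XOR-based index swizzle above is easier to see in scalar form. The sketch below is not part of this patch; trailing_zeros stands in for ffs(x) - 1, and the bank-conflict motivation is our reading. Indices stay untouched within the first group of 32 and are staggered by the low bits of the group number after that.

#include <stdio.h>

static unsigned trailing_zeros(unsigned x)
{
   unsigned n = 0;
   while (x && !(x & 1)) {
      x >>= 1;
      n++;
   }
   return n;
}

/* Same arithmetic as ngg_gs_vertex_ptr: XOR the vertex index with the low
 * write_stride_2exp bits of its group-of-32 number. */
static unsigned swizzle_vertexidx(unsigned vertexidx, unsigned gs_max_out_vertices)
{
   unsigned write_stride_2exp = trailing_zeros(gs_max_out_vertices);
   if (write_stride_2exp) {
      unsigned row = vertexidx >> 5; /* which group of 32 */
      unsigned swz = row & ((1u << write_stride_2exp) - 1);
      vertexidx ^= swz;
   }
   return vertexidx;
}

int main(void)
{
   /* gs_max_out_vertices = 4, so the stride is 2^2: the first vertices of
    * threads 0, 8, 16, 24 land at 0, 33, 66, 99 instead of 0, 32, 64, 96. */
   for (unsigned tid = 0; tid < 32; tid += 8)
      printf("thread %2u: %3u -> %3u\n", tid, tid * 4, swizzle_vertexidx(tid * 4, 4));
   return 0;
}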
-static LLVMValueRef
-ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread,
- LLVMValueRef emitidx)
+static LLVMValueRef ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread,
+ LLVMValueRef emitidx)
{
- struct si_shader_selector *sel = ctx->shader->selector;
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef tmp;
-
- tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false);
- tmp = LLVMBuildMul(builder, tmp, gsthread, "");
- const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
- return ngg_gs_vertex_ptr(ctx, vertexidx);
+ struct si_shader_selector *sel = ctx->shader->selector;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef tmp;
+
+ tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false);
+ tmp = LLVMBuildMul(builder, tmp, gsthread, "");
+ const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
+ return ngg_gs_vertex_ptr(ctx, vertexidx);
}
-static LLVMValueRef
-ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr,
- unsigned out_idx)
+static LLVMValueRef ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx,
+ LLVMValueRef vertexptr, unsigned out_idx)
{
- LLVMValueRef gep_idx[3] = {
- ctx->ac.i32_0, /* implied C-style array */
- ctx->ac.i32_0, /* first struct entry */
- LLVMConstInt(ctx->ac.i32, out_idx, false),
- };
- return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
+ LLVMValueRef gep_idx[3] = {
+ ctx->ac.i32_0, /* implied C-style array */
+ ctx->ac.i32_0, /* first struct entry */
+ LLVMConstInt(ctx->ac.i32, out_idx, false),
+ };
+ return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
}
-static LLVMValueRef
-ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr,
- unsigned stream)
+static LLVMValueRef ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx,
+ LLVMValueRef vertexptr, unsigned stream)
{
- LLVMValueRef gep_idx[3] = {
- ctx->ac.i32_0, /* implied C-style array */
- ctx->ac.i32_1, /* second struct entry */
- LLVMConstInt(ctx->ac.i32, stream, false),
- };
- return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
+ LLVMValueRef gep_idx[3] = {
+ ctx->ac.i32_0, /* implied C-style array */
+ ctx->ac.i32_1, /* second struct entry */
+ LLVMConstInt(ctx->ac.i32, stream, false),
+ };
+ return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
}
-void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
- unsigned stream,
- LLVMValueRef *addrs)
+void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs)
{
- const struct si_shader_selector *sel = ctx->shader->selector;
- const struct si_shader_info *info = &sel->info;
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef tmp;
- const LLVMValueRef vertexidx =
- LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
-
- /* If this thread has already emitted the declared maximum number of
- * vertices, skip the write: excessive vertex emissions are not
- * supposed to have any effect.
- */
- const LLVMValueRef can_emit =
- LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
- LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
-
- tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
- tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
- LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
-
- ac_build_ifcc(&ctx->ac, can_emit, 9001);
-
- const LLVMValueRef vertexptr =
- ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
- unsigned out_idx = 0;
- for (unsigned i = 0; i < info->num_outputs; i++) {
- for (unsigned chan = 0; chan < 4; chan++, out_idx++) {
- if (!(info->output_usagemask[i] & (1 << chan)) ||
- ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
- continue;
-
- LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], "");
- out_val = ac_to_integer(&ctx->ac, out_val);
- LLVMBuildStore(builder, out_val,
- ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
- }
- }
- assert(out_idx * 4 == sel->gsvs_vertex_size);
-
- /* Determine and store whether this vertex completed a primitive. */
- const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
-
- tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false);
- const LLVMValueRef iscompleteprim =
- LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
-
- /* Since the geometry shader emits triangle strips, we need to
- * track which primitive is odd and swap vertex indices to get
- * the correct vertex order.
- */
- LLVMValueRef is_odd = ctx->ac.i1false;
- if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) {
- tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
- is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
- }
-
- tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
- LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
-
- /* The per-vertex primitive flag encoding:
- * bit 0: whether this vertex finishes a primitive
- * bit 1: whether the primitive is odd (if we are emitting triangle strips)
- */
- tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
- tmp = LLVMBuildOr(builder, tmp,
- LLVMBuildShl(builder,
- LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""),
- ctx->ac.i8_1, ""), "");
- LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
-
- tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
- tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
- LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
-
- ac_build_endif(&ctx->ac, 9001);
+ const struct si_shader_selector *sel = ctx->shader->selector;
+ const struct si_shader_info *info = &sel->info;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef tmp;
+ const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
+
+ /* If this thread has already emitted the declared maximum number of
+ * vertices, skip the write: excessive vertex emissions are not
+ * supposed to have any effect.
+ */
+ const LLVMValueRef can_emit =
+ LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
+ LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
+
+ tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
+ tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
+ LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
+
+ ac_build_ifcc(&ctx->ac, can_emit, 9001);
+
+ const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
+ unsigned out_idx = 0;
+ for (unsigned i = 0; i < info->num_outputs; i++) {
+ for (unsigned chan = 0; chan < 4; chan++, out_idx++) {
+ if (!(info->output_usagemask[i] & (1 << chan)) ||
+ ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
+ continue;
+
+ LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], "");
+ out_val = ac_to_integer(&ctx->ac, out_val);
+ LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
+ }
+ }
+ assert(out_idx * 4 == sel->gsvs_vertex_size);
+
+ /* Determine and store whether this vertex completed a primitive. */
+ const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
+
+ tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false);
+ const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
+
+ /* Since the geometry shader emits triangle strips, we need to
+ * track which primitive is odd and swap vertex indices to get
+ * the correct vertex order.
+ */
+ LLVMValueRef is_odd = ctx->ac.i1false;
+ if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) {
+ tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
+ is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
+ }
+
+ tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
+ LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
+
+ /* The per-vertex primitive flag encoding:
+ * bit 0: whether this vertex finishes a primitive
+ * bit 1: whether the primitive is odd (if we are emitting triangle strips)
+ */
+ tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
+ tmp = LLVMBuildOr(
+ builder, tmp,
+ LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), "");
+ LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
+
+ tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
+ tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
+ LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
+
+ ac_build_endif(&ctx->ac, 9001);
}
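Editor's note: a minimal standalone model (not part of this patch; the helper name is ours) of the primitive-flag byte assembled above. Bit 0 is consumed later as the "primitive is real" flag and bit 1 as the strip parity used to fix triangle winding in the epilogue.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same packing as the ZExt/Shl/Or sequence above. */
static uint8_t ngg_prim_flag(bool completes_prim, bool is_odd)
{
   return (uint8_t)((completes_prim ? 1u : 0u) | ((is_odd ? 1u : 0u) << 1));
}

int main(void)
{
   /* E.g. the fourth emitted vertex of a triangle strip completes the second,
    * odd triangle: flags = 3, not null, odd. */
   uint8_t flags = ngg_prim_flag(true, true);
   printf("flags=%u isnull=%d is_odd=%d\n", flags, !(flags & 1), (flags >> 1) & 1);
   return 0;
}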
void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
{
- /* Zero out the part of LDS scratch that is used to accumulate the
- * per-stream generated primitive count.
- */
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
- LLVMValueRef tid = get_thread_id_in_tg(ctx);
- LLVMValueRef tmp;
-
- tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
- ac_build_ifcc(&ctx->ac, tmp, 5090);
- {
- LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
- LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
- }
- ac_build_endif(&ctx->ac, 5090);
-
- ac_build_s_barrier(&ctx->ac);
+ /* Zero out the part of LDS scratch that is used to accumulate the
+ * per-stream generated primitive count.
+ */
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
+ LLVMValueRef tid = get_thread_id_in_tg(ctx);
+ LLVMValueRef tmp;
+
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
+ ac_build_ifcc(&ctx->ac, tmp, 5090);
+ {
+ LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
+ LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
+ }
+ ac_build_endif(&ctx->ac, 5090);
+
+ ac_build_s_barrier(&ctx->ac);
}
void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
{
- const struct si_shader_selector *sel = ctx->shader->selector;
- const struct si_shader_info *info = &sel->info;
- const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim);
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
- LLVMValueRef tmp, tmp2;
-
- /* Zero out remaining (non-emitted) primitive flags.
- *
- * Note: Alternatively, we could pass the relevant gs_next_vertex to
- * the emit threads via LDS. This is likely worse in the expected
- * typical case where each GS thread emits the full set of
- * vertices.
- */
- for (unsigned stream = 0; stream < 4; ++stream) {
- if (!info->num_stream_output_components[stream])
- continue;
-
- const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
-
- ac_build_bgnloop(&ctx->ac, 5100);
-
- const LLVMValueRef vertexidx =
- LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
- tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
- LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
- ac_build_ifcc(&ctx->ac, tmp, 5101);
- ac_build_break(&ctx->ac);
- ac_build_endif(&ctx->ac, 5101);
-
- tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
- LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
-
- tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
- LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));
-
- ac_build_endloop(&ctx->ac, 5100);
- }
-
- /* Accumulate generated primitives counts across the entire threadgroup. */
- for (unsigned stream = 0; stream < 4; ++stream) {
- if (!info->num_stream_output_components[stream])
- continue;
-
- LLVMValueRef numprims =
- LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
- numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
-
- tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
- ac_build_ifcc(&ctx->ac, tmp, 5105);
- {
- LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
- ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
- LLVMConstInt(ctx->ac.i32, stream, false)),
- numprims, LLVMAtomicOrderingMonotonic, false);
- }
- ac_build_endif(&ctx->ac, 5105);
- }
-
- ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
-
- ac_build_s_barrier(&ctx->ac);
-
- const LLVMValueRef tid = get_thread_id_in_tg(ctx);
- LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
-
- /* Streamout */
- if (sel->so.num_outputs) {
- struct ngg_streamout nggso = {};
-
- nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);
-
- LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
- for (unsigned stream = 0; stream < 4; ++stream) {
- if (!info->num_stream_output_components[stream])
- continue;
-
- tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
- tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
- tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
- nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
- }
-
- for (unsigned i = 0; i < verts_per_prim; ++i) {
- tmp = LLVMBuildSub(builder, tid,
- LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
- tmp = ngg_gs_vertex_ptr(ctx, tmp);
- nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
- }
-
- build_streamout(ctx, &nggso);
- }
-
- /* Write shader query data. */
- if (ctx->screen->use_ngg_streamout) {
- tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
- tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
- ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */
- unsigned num_query_comps = sel->so.num_outputs ? 8 : 4;
- tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
- LLVMConstInt(ctx->ac.i32, num_query_comps, false), "");
- ac_build_ifcc(&ctx->ac, tmp, 5110);
- {
- LLVMValueRef offset;
- tmp = tid;
- if (sel->so.num_outputs)
- tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), "");
- offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), "");
- if (sel->so.num_outputs) {
- tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), "");
- tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), "");
- offset = LLVMBuildAdd(builder, offset, tmp, "");
- }
-
- tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
- LLVMValueRef args[] = {
- tmp,
- ngg_get_query_buf(ctx),
- offset,
- LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */
- ctx->ac.i32_0, /* cachepolicy */
- };
- ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
- ctx->ac.i32, args, 5, 0);
- }
- ac_build_endif(&ctx->ac, 5110);
- ac_build_endif(&ctx->ac, 5109);
- }
-
- /* Determine vertex liveness. */
- LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
-
- tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
- ac_build_ifcc(&ctx->ac, tmp, 5120);
- {
- for (unsigned i = 0; i < verts_per_prim; ++i) {
- const LLVMValueRef primidx =
- LLVMBuildAdd(builder, tid,
- LLVMConstInt(ctx->ac.i32, i, false), "");
-
- if (i > 0) {
- tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
- ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
- }
-
- /* Load primitive liveness */
- tmp = ngg_gs_vertex_ptr(ctx, primidx);
- tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
- const LLVMValueRef primlive =
- LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
-
- tmp = LLVMBuildLoad(builder, vertliveptr, "");
- tmp = LLVMBuildOr(builder, tmp, primlive, ""),
- LLVMBuildStore(builder, tmp, vertliveptr);
-
- if (i > 0)
- ac_build_endif(&ctx->ac, 5121 + i);
- }
- }
- ac_build_endif(&ctx->ac, 5120);
-
- /* Inclusive scan addition across the current wave. */
- LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
- struct ac_wg_scan vertlive_scan = {};
- vertlive_scan.op = nir_op_iadd;
- vertlive_scan.enable_reduce = true;
- vertlive_scan.enable_exclusive = true;
- vertlive_scan.src = vertlive;
- vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
- vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
- vertlive_scan.numwaves = get_tgsize(ctx);
- vertlive_scan.maxwaves = 8;
-
- ac_build_wg_scan(&ctx->ac, &vertlive_scan);
-
- /* Skip all exports (including index exports) when possible. At least on
- * early gfx10 revisions this is also to avoid hangs.
- */
- LLVMValueRef have_exports =
- LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
- num_emit_threads =
- LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
-
- /* Allocate export space. Send this message as early as possible, to
- * hide the latency of the SQ <-> SPI roundtrip.
- *
- * Note: We could consider compacting primitives for export as well.
- * PA processes 1 non-null prim / clock, but it fetches 4 DW of
- * prim data per clock and skips null primitives at no additional
- * cost. So compacting primitives can only be beneficial when
- * there are 4 or more contiguous null primitives in the export
- * (in the common case of single-dword prim exports).
- */
- ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx),
- vertlive_scan.result_reduce, num_emit_threads);
-
- /* Setup the reverse vertex compaction permutation. We re-use stream 1
- * of the primitive liveness flags, relying on the fact that each
- * threadgroup can have at most 256 threads. */
- ac_build_ifcc(&ctx->ac, vertlive, 5130);
- {
- tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
- tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
- LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
- }
- ac_build_endif(&ctx->ac, 5130);
-
- ac_build_s_barrier(&ctx->ac);
-
- /* Export primitive data */
- tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
- ac_build_ifcc(&ctx->ac, tmp, 5140);
- {
- LLVMValueRef flags;
- struct ac_ngg_prim prim = {};
- prim.num_vertices = verts_per_prim;
-
- tmp = ngg_gs_vertex_ptr(ctx, tid);
- flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
- prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");
-
- for (unsigned i = 0; i < verts_per_prim; ++i) {
- prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
- LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
- prim.edgeflag[i] = ctx->ac.i1false;
- }
-
- /* Geometry shaders output triangle strips, but NGG expects triangles. */
- if (verts_per_prim == 3) {
- LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
- is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
- LLVMValueRef flatshade_first =
- LLVMBuildICmp(builder, LLVMIntEQ,
- si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
- ctx->ac.i32_0, "");
-
- ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
- flatshade_first,
- prim.index);
- }
-
- ac_build_export_prim(&ctx->ac, &prim);
- }
- ac_build_endif(&ctx->ac, 5140);
-
- /* Export position and parameter data */
- tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
- ac_build_ifcc(&ctx->ac, tmp, 5145);
- {
- struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
-
- tmp = ngg_gs_vertex_ptr(ctx, tid);
- tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
- tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
- const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
-
- unsigned out_idx = 0;
- for (unsigned i = 0; i < info->num_outputs; i++) {
- outputs[i].semantic_name = info->output_semantic_name[i];
- outputs[i].semantic_index = info->output_semantic_index[i];
-
- for (unsigned j = 0; j < 4; j++, out_idx++) {
- tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
- tmp = LLVMBuildLoad(builder, tmp, "");
- outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
- outputs[i].vertex_stream[j] =
- (info->output_streams[i] >> (2 * j)) & 3;
- }
- }
-
- si_llvm_build_vs_exports(ctx, outputs, info->num_outputs);
- }
- ac_build_endif(&ctx->ac, 5145);
+ const struct si_shader_selector *sel = ctx->shader->selector;
+ const struct si_shader_info *info = &sel->info;
+ const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim);
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
+ LLVMValueRef tmp, tmp2;
+
+ /* Zero out remaining (non-emitted) primitive flags.
+ *
+ * Note: Alternatively, we could pass the relevant gs_next_vertex to
+ * the emit threads via LDS. This is likely worse in the expected
+ * typical case where each GS thread emits the full set of
+ * vertices.
+ */
+ for (unsigned stream = 0; stream < 4; ++stream) {
+ if (!info->num_stream_output_components[stream])
+ continue;
+
+ const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
+
+ ac_build_bgnloop(&ctx->ac, 5100);
+
+ const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
+ tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
+ LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
+ ac_build_ifcc(&ctx->ac, tmp, 5101);
+ ac_build_break(&ctx->ac);
+ ac_build_endif(&ctx->ac, 5101);
+
+ tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
+ LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
+
+ tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
+ LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));
+
+ ac_build_endloop(&ctx->ac, 5100);
+ }
+
+ /* Accumulate generated primitives counts across the entire threadgroup. */
+ for (unsigned stream = 0; stream < 4; ++stream) {
+ if (!info->num_stream_output_components[stream])
+ continue;
+
+ LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
+ numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
+
+ tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5105);
+ {
+ LLVMBuildAtomicRMW(
+ builder, LLVMAtomicRMWBinOpAdd,
+ ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)),
+ numprims, LLVMAtomicOrderingMonotonic, false);
+ }
+ ac_build_endif(&ctx->ac, 5105);
+ }
+
+ ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+
+ ac_build_s_barrier(&ctx->ac);
+
+ const LLVMValueRef tid = get_thread_id_in_tg(ctx);
+ LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
+
+ /* Streamout */
+ if (sel->so.num_outputs) {
+ struct ngg_streamout nggso = {};
+
+ nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);
+
+ LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
+ for (unsigned stream = 0; stream < 4; ++stream) {
+ if (!info->num_stream_output_components[stream])
+ continue;
+
+ tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
+ tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+ tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
+ nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
+ }
+
+ for (unsigned i = 0; i < verts_per_prim; ++i) {
+ tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false),
+ "");
+ tmp = ngg_gs_vertex_ptr(ctx, tmp);
+ nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
+ }
+
+ build_streamout(ctx, &nggso);
+ }
+
+ /* Write shader query data. */
+ if (ctx->screen->use_ngg_streamout) {
+ tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
+ tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */
+ unsigned num_query_comps = sel->so.num_outputs ? 8 : 4;
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
+ LLVMConstInt(ctx->ac.i32, num_query_comps, false), "");
+ ac_build_ifcc(&ctx->ac, tmp, 5110);
+ {
+ LLVMValueRef offset;
+ tmp = tid;
+ if (sel->so.num_outputs)
+ tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), "");
+ offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), "");
+ if (sel->so.num_outputs) {
+ tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), "");
+ tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), "");
+ offset = LLVMBuildAdd(builder, offset, tmp, "");
+ }
+
+ tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
+ LLVMValueRef args[] = {
+ tmp, ngg_get_query_buf(ctx),
+ offset, LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */
+ ctx->ac.i32_0, /* cachepolicy */
+ };
+ ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
+ 0);
+ }
+ ac_build_endif(&ctx->ac, 5110);
+ ac_build_endif(&ctx->ac, 5109);
+ }
+
+ /* Determine vertex liveness. */
+ LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
+
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5120);
+ {
+ for (unsigned i = 0; i < verts_per_prim; ++i) {
+ const LLVMValueRef primidx =
+ LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), "");
+
+ if (i > 0) {
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
+ }
+
+ /* Load primitive liveness */
+ tmp = ngg_gs_vertex_ptr(ctx, primidx);
+ tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
+ const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+
+ tmp = LLVMBuildLoad(builder, vertliveptr, "");
+ tmp = LLVMBuildOr(builder, tmp, primlive, ""), LLVMBuildStore(builder, tmp, vertliveptr);
+
+ if (i > 0)
+ ac_build_endif(&ctx->ac, 5121 + i);
+ }
+ }
+ ac_build_endif(&ctx->ac, 5120);
+
+ /* Inclusive scan addition across the current wave. */
+ LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
+ struct ac_wg_scan vertlive_scan = {};
+ vertlive_scan.op = nir_op_iadd;
+ vertlive_scan.enable_reduce = true;
+ vertlive_scan.enable_exclusive = true;
+ vertlive_scan.src = vertlive;
+ vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
+ vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
+ vertlive_scan.numwaves = get_tgsize(ctx);
+ vertlive_scan.maxwaves = 8;
+
+ ac_build_wg_scan(&ctx->ac, &vertlive_scan);
+
+ /* Skip all exports (including index exports) when possible. At least on
+ * early gfx10 revisions this is also to avoid hangs.
+ */
+ LLVMValueRef have_exports =
+ LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
+ num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
+
+ /* Allocate export space. Send this message as early as possible, to
+ * hide the latency of the SQ <-> SPI roundtrip.
+ *
+ * Note: We could consider compacting primitives for export as well.
+ * PA processes 1 non-null prim / clock, but it fetches 4 DW of
+ * prim data per clock and skips null primitives at no additional
+ * cost. So compacting primitives can only be beneficial when
+ * there are 4 or more contiguous null primitives in the export
+ * (in the common case of single-dword prim exports).
+ */
+ ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce,
+ num_emit_threads);
+
+ /* Set up the reverse vertex compaction permutation. We reuse stream 1
+ * of the primitive liveness flags, relying on the fact that each
+ * threadgroup can have at most 256 threads. */
+ ac_build_ifcc(&ctx->ac, vertlive, 5130);
+ {
+ tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
+ tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
+ LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
+ }
+ ac_build_endif(&ctx->ac, 5130);
+
+ ac_build_s_barrier(&ctx->ac);
+
+ /* Export primitive data */
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5140);
+ {
+ LLVMValueRef flags;
+ struct ac_ngg_prim prim = {};
+ prim.num_vertices = verts_per_prim;
+
+ tmp = ngg_gs_vertex_ptr(ctx, tid);
+ flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
+ prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");
+
+ for (unsigned i = 0; i < verts_per_prim; ++i) {
+ prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
+ LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
+ prim.edgeflag[i] = ctx->ac.i1false;
+ }
+
+ /* Geometry shaders output triangle strips, but NGG expects triangles. */
+ if (verts_per_prim == 3) {
+ LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
+ is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
+ LLVMValueRef flatshade_first = LLVMBuildICmp(
+ builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, "");
+
+ ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index);
+ }
+
+ ac_build_export_prim(&ctx->ac, &prim);
+ }
+ ac_build_endif(&ctx->ac, 5140);
+
+ /* Export position and parameter data */
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
+ ac_build_ifcc(&ctx->ac, tmp, 5145);
+ {
+ struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
+
+ tmp = ngg_gs_vertex_ptr(ctx, tid);
+ tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
+ tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
+ const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
+
+ unsigned out_idx = 0;
+ for (unsigned i = 0; i < info->num_outputs; i++) {
+ outputs[i].semantic_name = info->output_semantic_name[i];
+ outputs[i].semantic_index = info->output_semantic_index[i];
+
+ for (unsigned j = 0; j < 4; j++, out_idx++) {
+ tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
+ tmp = LLVMBuildLoad(builder, tmp, "");
+ outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
+ outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
+ }
+ }
+
+ si_llvm_build_vs_exports(ctx, outputs, info->num_outputs);
+ }
+ ac_build_endif(&ctx->ac, 5145);
}
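Editor's note: the vertex compaction above is driven by an exclusive prefix sum over the per-thread liveness flags; the exclusive result is the export slot, and the originating thread id (which fits in 8 bits because a threadgroup has at most 256 threads) is parked in the stream-1 primflag byte so export threads can find their source vertex. A scalar sketch of that scheme, not part of this patch:

#include <stdint.h>
#include <stdio.h>

#define NUM_THREADS 8

int main(void)
{
   /* 1 = this thread's vertex is referenced by at least one live primitive. */
   const int vertlive[NUM_THREADS] = {1, 0, 1, 1, 0, 0, 1, 0};
   uint8_t reverse[NUM_THREADS] = {0}; /* models the stream-1 primflag bytes */

   unsigned exclusive = 0; /* running exclusive prefix sum */
   for (unsigned tid = 0; tid < NUM_THREADS; tid++) {
      if (vertlive[tid])
         reverse[exclusive] = (uint8_t)tid; /* export slot -> source thread */
      exclusive += vertlive[tid];
   }

   printf("%u live vertices\n", exclusive);
   for (unsigned slot = 0; slot < exclusive; slot++)
      printf("export slot %u reads the vertex emitted by thread %u\n", slot, reverse[slot]);
   return 0;
}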
static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
- unsigned min_verts_per_prim, bool use_adjacency)
+ unsigned min_verts_per_prim, bool use_adjacency)
{
- unsigned max_reuse = max_esverts - min_verts_per_prim;
- if (use_adjacency)
- max_reuse /= 2;
- *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
+ unsigned max_reuse = max_esverts - min_verts_per_prim;
+ if (use_adjacency)
+ max_reuse /= 2;
+ *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
}
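Editor's note: one way to read the clamp above is that the first primitive consumes min_verts_per_prim vertices and, with maximal reuse, every further primitive needs at least one new vertex (two with adjacency). A small self-check along those lines, not part of this patch:

#include <assert.h>

static unsigned clamp_gsprims(unsigned max_gsprims, unsigned max_esverts,
                              unsigned min_verts_per_prim, int use_adjacency)
{
   unsigned max_reuse = max_esverts - min_verts_per_prim;
   if (use_adjacency)
      max_reuse /= 2;
   return max_gsprims < 1 + max_reuse ? max_gsprims : 1 + max_reuse;
}

int main(void)
{
   /* 128 ES vertices, triangles: 1 + (128 - 3) = 126 primitives at most. */
   assert(clamp_gsprims(1000, 128, 3, 0) == 126);
   /* With adjacency each further primitive needs two new vertices. */
   assert(clamp_gsprims(1000, 128, 3, 1) == 63);
   return 0;
}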
/**
 * Determine the per-subgroup NGG limits (maximum ES vertices, GS primitives
 * and output vertices, plus the LDS sizes) for \p shader.
 */
void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
{
- const struct si_shader_selector *gs_sel = shader->selector;
- const struct si_shader_selector *es_sel =
- shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
- const enum pipe_shader_type gs_type = gs_sel->type;
- const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
- const unsigned input_prim = si_get_input_prim(gs_sel);
- const bool use_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY &&
- input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
- const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim);
- const unsigned min_verts_per_prim =
- gs_type == PIPE_SHADER_GEOMETRY ? max_verts_per_prim : 1;
-
- /* All these are in dwords: */
- /* We can't allow using the whole LDS, because GS waves compete with
- * other shader stages for LDS space.
- *
- * TODO: We should really take the shader's internal LDS use into
- * account. The linker will fail if the size is greater than
- * 8K dwords.
- */
- const unsigned max_lds_size = 8 * 1024 - 768;
- const unsigned target_lds_size = max_lds_size;
- unsigned esvert_lds_size = 0;
- unsigned gsprim_lds_size = 0;
-
- /* All these are per subgroup: */
- bool max_vert_out_per_gs_instance = false;
- unsigned max_gsprims_base = 128; /* default prim group size clamp */
- unsigned max_esverts_base = 128;
-
- if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
- max_gsprims_base = 128 / 3;
- max_esverts_base = max_gsprims_base * 3;
- } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
- max_gsprims_base = 126;
- max_esverts_base = 128;
- }
-
- /* Hardware has the following non-natural restrictions on the value
- * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of
- * the draw:
- * - at most 252 for any line input primitive type
- * - at most 251 for any quad input primitive type
- * - at most 251 for triangle strips with adjacency (this happens to
- * be the natural limit for triangle *lists* with adjacency)
- */
- max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
-
- if (gs_type == PIPE_SHADER_GEOMETRY) {
- unsigned max_out_verts_per_gsprim =
- gs_sel->gs_max_out_vertices * gs_num_invocations;
-
- if (max_out_verts_per_gsprim <= 256) {
- if (max_out_verts_per_gsprim) {
- max_gsprims_base = MIN2(max_gsprims_base,
- 256 / max_out_verts_per_gsprim);
- }
- } else {
- /* Use special multi-cycling mode in which each GS
- * instance gets its own subgroup. Does not work with
- * tessellation. */
- max_vert_out_per_gs_instance = true;
- max_gsprims_base = 1;
- max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices;
- }
-
- esvert_lds_size = es_sel->esgs_itemsize / 4;
- gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
- } else {
- /* VS and TES. */
- /* LDS size for passing data from ES to GS. */
- esvert_lds_size = ngg_nogs_vertex_size(shader);
- }
-
- unsigned max_gsprims = max_gsprims_base;
- unsigned max_esverts = max_esverts_base;
-
- if (esvert_lds_size)
- max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
- if (gsprim_lds_size)
- max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
-
- max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
- clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
- assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
-
- if (esvert_lds_size || gsprim_lds_size) {
- /* Now that we have a rough proportionality between esverts
- * and gsprims based on the primitive type, scale both of them
- * down simultaneously based on required LDS space.
- *
- * We could be smarter about this if we knew how much vertex
- * reuse to expect.
- */
- unsigned lds_total = max_esverts * esvert_lds_size +
- max_gsprims * gsprim_lds_size;
- if (lds_total > target_lds_size) {
- max_esverts = max_esverts * target_lds_size / lds_total;
- max_gsprims = max_gsprims * target_lds_size / lds_total;
-
- max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
- clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
- min_verts_per_prim, use_adjacency);
- assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
- }
- }
-
- /* Round up towards full wave sizes for better ALU utilization. */
- if (!max_vert_out_per_gs_instance) {
- const unsigned wavesize = gs_sel->screen->ge_wave_size;
- unsigned orig_max_esverts;
- unsigned orig_max_gsprims;
- do {
- orig_max_esverts = max_esverts;
- orig_max_gsprims = max_gsprims;
-
- max_esverts = align(max_esverts, wavesize);
- max_esverts = MIN2(max_esverts, max_esverts_base);
- if (esvert_lds_size)
- max_esverts = MIN2(max_esverts,
- (max_lds_size - max_gsprims * gsprim_lds_size) /
- esvert_lds_size);
- max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
-
- max_gsprims = align(max_gsprims, wavesize);
- max_gsprims = MIN2(max_gsprims, max_gsprims_base);
- if (gsprim_lds_size)
- max_gsprims = MIN2(max_gsprims,
- (max_lds_size - max_esverts * esvert_lds_size) /
- gsprim_lds_size);
- clamp_gsprims_to_esverts(&max_gsprims, max_esverts,
- min_verts_per_prim, use_adjacency);
- assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
- } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
- }
-
- /* Hardware restriction: minimum value of max_esverts */
- max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);
-
- unsigned max_out_vertices =
- max_vert_out_per_gs_instance ? gs_sel->gs_max_out_vertices :
- gs_type == PIPE_SHADER_GEOMETRY ?
- max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices :
- max_esverts;
- assert(max_out_vertices <= 256);
-
- unsigned prim_amp_factor = 1;
- if (gs_type == PIPE_SHADER_GEOMETRY) {
- /* Number of output primitives per GS input primitive after
- * GS instancing. */
- prim_amp_factor = gs_sel->gs_max_out_vertices;
- }
-
- /* The GE only checks against the maximum number of ES verts after
- * allocating a full GS primitive. So we need to ensure that whenever
- * this check passes, there is enough space for a full primitive without
- * vertex reuse.
- */
- shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
- shader->ngg.max_gsprims = max_gsprims;
- shader->ngg.max_out_verts = max_out_vertices;
- shader->ngg.prim_amp_factor = prim_amp_factor;
- shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
-
- shader->gs_info.esgs_ring_size = 4 * max_esverts * esvert_lds_size;
- shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
-
- assert(shader->ngg.hw_max_esverts >= 24); /* HW limitation */
+ const struct si_shader_selector *gs_sel = shader->selector;
+ const struct si_shader_selector *es_sel =
+ shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
+ const enum pipe_shader_type gs_type = gs_sel->type;
+ const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
+ const unsigned input_prim = si_get_input_prim(gs_sel);
+ const bool use_adjacency =
+ input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
+ const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim);
+ const unsigned min_verts_per_prim = gs_type == PIPE_SHADER_GEOMETRY ? max_verts_per_prim : 1;
+
+ /* All these are in dwords: */
+ /* We can't allow using the whole LDS, because GS waves compete with
+ * other shader stages for LDS space.
+ *
+ * TODO: We should really take the shader's internal LDS use into
+ * account. The linker will fail if the size is greater than
+ * 8K dwords.
+ */
+ const unsigned max_lds_size = 8 * 1024 - 768;
+ const unsigned target_lds_size = max_lds_size;
+ unsigned esvert_lds_size = 0;
+ unsigned gsprim_lds_size = 0;
+
+ /* All these are per subgroup: */
+ bool max_vert_out_per_gs_instance = false;
+ unsigned max_gsprims_base = 128; /* default prim group size clamp */
+ unsigned max_esverts_base = 128;
+
+ if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
+ max_gsprims_base = 128 / 3;
+ max_esverts_base = max_gsprims_base * 3;
+ } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
+ max_gsprims_base = 126;
+ max_esverts_base = 128;
+ }
+
+ /* Hardware has the following non-natural restrictions on the value
+ * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
+ * the draw:
+ * - at most 252 for any line input primitive type
+ * - at most 251 for any quad input primitive type
+ * - at most 251 for triangle strips with adjacency (this happens to
+ * be the natural limit for triangle *lists* with adjacency)
+ */
+ max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
+
+ if (gs_type == PIPE_SHADER_GEOMETRY) {
+ unsigned max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices * gs_num_invocations;
+
+ if (max_out_verts_per_gsprim <= 256) {
+ if (max_out_verts_per_gsprim) {
+ max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
+ }
+ } else {
+ /* Use special multi-cycling mode in which each GS
+ * instance gets its own subgroup. Does not work with
+ * tessellation. */
+ max_vert_out_per_gs_instance = true;
+ max_gsprims_base = 1;
+ max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices;
+ }
+
+ esvert_lds_size = es_sel->esgs_itemsize / 4;
+ gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
+ } else {
+ /* VS and TES. */
+ /* LDS size for passing data from ES to GS. */
+ esvert_lds_size = ngg_nogs_vertex_size(shader);
+ }
+
+ unsigned max_gsprims = max_gsprims_base;
+ unsigned max_esverts = max_esverts_base;
+
+ if (esvert_lds_size)
+ max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
+ if (gsprim_lds_size)
+ max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
+
+ max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+ clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
+ assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+
+ if (esvert_lds_size || gsprim_lds_size) {
+ /* Now that we have a rough proportionality between esverts
+ * and gsprims based on the primitive type, scale both of them
+ * down simultaneously based on required LDS space.
+ *
+ * We could be smarter about this if we knew how much vertex
+ * reuse to expect.
+ */
+ unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
+ if (lds_total > target_lds_size) {
+ max_esverts = max_esverts * target_lds_size / lds_total;
+ max_gsprims = max_gsprims * target_lds_size / lds_total;
+
+ max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+ clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
+ assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+ }
+ }
+
+ /* Round up towards full wave sizes for better ALU utilization. */
+ if (!max_vert_out_per_gs_instance) {
+ const unsigned wavesize = gs_sel->screen->ge_wave_size;
+ unsigned orig_max_esverts;
+ unsigned orig_max_gsprims;
+ do {
+ orig_max_esverts = max_esverts;
+ orig_max_gsprims = max_gsprims;
+
+ max_esverts = align(max_esverts, wavesize);
+ max_esverts = MIN2(max_esverts, max_esverts_base);
+ if (esvert_lds_size)
+ max_esverts =
+ MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
+ max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+
+ max_gsprims = align(max_gsprims, wavesize);
+ max_gsprims = MIN2(max_gsprims, max_gsprims_base);
+ if (gsprim_lds_size)
+ max_gsprims =
+ MIN2(max_gsprims, (max_lds_size - max_esverts * esvert_lds_size) / gsprim_lds_size);
+ clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
+ assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+ } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
+ }
+
+ /* Hardware restriction: minimum value of max_esverts */
+ max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);
+
+ unsigned max_out_vertices =
+ max_vert_out_per_gs_instance
+ ? gs_sel->gs_max_out_vertices
+ : gs_type == PIPE_SHADER_GEOMETRY
+ ? max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices
+ : max_esverts;
+ assert(max_out_vertices <= 256);
+
+ unsigned prim_amp_factor = 1;
+ if (gs_type == PIPE_SHADER_GEOMETRY) {
+ /* Number of output primitives per GS input primitive after
+ * GS instancing. */
+ prim_amp_factor = gs_sel->gs_max_out_vertices;
+ }
+
+ /* The GE only checks against the maximum number of ES verts after
+ * allocating a full GS primitive. So we need to ensure that whenever
+ * this check passes, there is enough space for a full primitive without
+ * vertex reuse.
+ */
+ shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
+ shader->ngg.max_gsprims = max_gsprims;
+ shader->ngg.max_out_verts = max_out_vertices;
+ shader->ngg.prim_amp_factor = prim_amp_factor;
+ shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
+
+ shader->gs_info.esgs_ring_size = 4 * max_esverts * esvert_lds_size;
+ shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
+
+ assert(shader->ngg.hw_max_esverts >= 24); /* HW limitation */
}
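Editor's note: the proportional LDS scale-down in the middle of the function is the least obvious step, so the standalone sketch below reruns it with made-up item sizes (40 and 50 dwords are hypothetical; only the 7424-dword budget comes from the code above). Not part of this patch:

#include <stdio.h>

int main(void)
{
   const unsigned target_lds_size = 8 * 1024 - 768; /* 7424 dwords, as above */
   const unsigned esvert_lds_size = 40, gsprim_lds_size = 50; /* hypothetical */
   unsigned max_esverts = 128, max_gsprims = 128;

   unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
   if (lds_total > target_lds_size) {
      /* Scale both limits by the same ratio so neither one starves the other. */
      max_esverts = max_esverts * target_lds_size / lds_total;
      max_gsprims = max_gsprims * target_lds_size / lds_total;
   }

   /* Prints: lds_total=11520 -> max_esverts=82 max_gsprims=82 (82*90 = 7380 <= 7424) */
   printf("lds_total=%u -> max_esverts=%u max_gsprims=%u\n", lds_total, max_esverts, max_gsprims);
   return 0;
}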
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_pipe.h"
#include "si_compute.h"
+#include "si_pipe.h"
#include "util/format/u_format.h"
#include "util/u_log.h"
#include "util/u_surface.h"
-enum {
- SI_COPY = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
- SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND,
+enum
+{
+ SI_COPY =
+ SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND,
- SI_BLIT = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
- SI_SAVE_FRAGMENT_STATE,
+ SI_BLIT = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | SI_SAVE_FRAGMENT_STATE,
- SI_DECOMPRESS = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE |
- SI_DISABLE_RENDER_COND,
+ SI_DECOMPRESS = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND,
- SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE
+ SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE
};
void si_blitter_begin(struct si_context *sctx, enum si_blitter_op op)
{
- util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
- util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso);
- util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso);
- util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
- util_blitter_save_so_targets(sctx->blitter, sctx->streamout.num_targets,
- (struct pipe_stream_output_target**)sctx->streamout.targets);
- util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
-
- if (op & SI_SAVE_FRAGMENT_STATE) {
- util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
- util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
- util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
- util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
- util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask);
- util_blitter_save_scissor(sctx->blitter, &sctx->scissors[0]);
- util_blitter_save_window_rectangles(sctx->blitter,
- sctx->window_rectangles_include,
- sctx->num_window_rectangles,
- sctx->window_rectangles);
- }
-
- if (op & SI_SAVE_FRAMEBUFFER)
- util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state);
-
- if (op & SI_SAVE_TEXTURES) {
- util_blitter_save_fragment_sampler_states(
- sctx->blitter, 2,
- (void**)sctx->samplers[PIPE_SHADER_FRAGMENT].sampler_states);
-
- util_blitter_save_fragment_sampler_views(sctx->blitter, 2,
- sctx->samplers[PIPE_SHADER_FRAGMENT].views);
- }
-
- if (op & SI_DISABLE_RENDER_COND)
- sctx->render_cond_force_off = true;
-
- if (sctx->screen->dpbb_allowed) {
- sctx->dpbb_force_off = true;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
- }
+ util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
+ util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso);
+ util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso);
+ util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
+ util_blitter_save_so_targets(sctx->blitter, sctx->streamout.num_targets,
+ (struct pipe_stream_output_target **)sctx->streamout.targets);
+ util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
+
+ if (op & SI_SAVE_FRAGMENT_STATE) {
+ util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
+ util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
+ util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
+ util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
+ util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask);
+ util_blitter_save_scissor(sctx->blitter, &sctx->scissors[0]);
+ util_blitter_save_window_rectangles(sctx->blitter, sctx->window_rectangles_include,
+ sctx->num_window_rectangles, sctx->window_rectangles);
+ }
+
+ if (op & SI_SAVE_FRAMEBUFFER)
+ util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state);
+
+ if (op & SI_SAVE_TEXTURES) {
+ util_blitter_save_fragment_sampler_states(
+ sctx->blitter, 2, (void **)sctx->samplers[PIPE_SHADER_FRAGMENT].sampler_states);
+
+ util_blitter_save_fragment_sampler_views(sctx->blitter, 2,
+ sctx->samplers[PIPE_SHADER_FRAGMENT].views);
+ }
+
+ if (op & SI_DISABLE_RENDER_COND)
+ sctx->render_cond_force_off = true;
+
+ if (sctx->screen->dpbb_allowed) {
+ sctx->dpbb_force_off = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+ }
}
void si_blitter_end(struct si_context *sctx)
{
- if (sctx->screen->dpbb_allowed) {
- sctx->dpbb_force_off = false;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
- }
-
- sctx->render_cond_force_off = false;
-
- /* Restore shader pointers because the VS blit shader changed all
- * non-global VS user SGPRs. */
- sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX);
- sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
- sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+ if (sctx->screen->dpbb_allowed) {
+ sctx->dpbb_force_off = false;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+ }
+
+ sctx->render_cond_force_off = false;
+
+ /* Restore shader pointers because the VS blit shader changed all
+ * non-global VS user SGPRs. */
+ sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX);
+ sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
+ sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
}
static unsigned u_max_sample(struct pipe_resource *r)
{
- return r->nr_samples ? r->nr_samples - 1 : 0;
+ return r->nr_samples ? r->nr_samples - 1 : 0;
}
-static unsigned
-si_blit_dbcb_copy(struct si_context *sctx,
- struct si_texture *src,
- struct si_texture *dst,
- unsigned planes, unsigned level_mask,
- unsigned first_layer, unsigned last_layer,
- unsigned first_sample, unsigned last_sample)
+static unsigned si_blit_dbcb_copy(struct si_context *sctx, struct si_texture *src,
+ struct si_texture *dst, unsigned planes, unsigned level_mask,
+ unsigned first_layer, unsigned last_layer, unsigned first_sample,
+ unsigned last_sample)
{
- struct pipe_surface surf_tmpl = {{0}};
- unsigned layer, sample, checked_last_layer, max_layer;
- unsigned fully_copied_levels = 0;
+ struct pipe_surface surf_tmpl = {{0}};
+ unsigned layer, sample, checked_last_layer, max_layer;
+ unsigned fully_copied_levels = 0;
- if (planes & PIPE_MASK_Z)
- sctx->dbcb_depth_copy_enabled = true;
- if (planes & PIPE_MASK_S)
- sctx->dbcb_stencil_copy_enabled = true;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ if (planes & PIPE_MASK_Z)
+ sctx->dbcb_depth_copy_enabled = true;
+ if (planes & PIPE_MASK_S)
+ sctx->dbcb_stencil_copy_enabled = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
- assert(sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled);
+ assert(sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled);
- sctx->decompression_enabled = true;
+ sctx->decompression_enabled = true;
- while (level_mask) {
- unsigned level = u_bit_scan(&level_mask);
+ while (level_mask) {
+ unsigned level = u_bit_scan(&level_mask);
- /* The smaller the mipmap level, the less layers there are
- * as far as 3D textures are concerned. */
- max_layer = util_max_layer(&src->buffer.b.b, level);
- checked_last_layer = MIN2(last_layer, max_layer);
+ /* The smaller the mipmap level, the fewer layers there are
+ * as far as 3D textures are concerned. */
+ max_layer = util_max_layer(&src->buffer.b.b, level);
+ checked_last_layer = MIN2(last_layer, max_layer);
- surf_tmpl.u.tex.level = level;
+ surf_tmpl.u.tex.level = level;
- for (layer = first_layer; layer <= checked_last_layer; layer++) {
- struct pipe_surface *zsurf, *cbsurf;
+ for (layer = first_layer; layer <= checked_last_layer; layer++) {
+ struct pipe_surface *zsurf, *cbsurf;
- surf_tmpl.format = src->buffer.b.b.format;
- surf_tmpl.u.tex.first_layer = layer;
- surf_tmpl.u.tex.last_layer = layer;
+ surf_tmpl.format = src->buffer.b.b.format;
+ surf_tmpl.u.tex.first_layer = layer;
+ surf_tmpl.u.tex.last_layer = layer;
- zsurf = sctx->b.create_surface(&sctx->b, &src->buffer.b.b, &surf_tmpl);
+ zsurf = sctx->b.create_surface(&sctx->b, &src->buffer.b.b, &surf_tmpl);
- surf_tmpl.format = dst->buffer.b.b.format;
- cbsurf = sctx->b.create_surface(&sctx->b, &dst->buffer.b.b, &surf_tmpl);
+ surf_tmpl.format = dst->buffer.b.b.format;
+ cbsurf = sctx->b.create_surface(&sctx->b, &dst->buffer.b.b, &surf_tmpl);
- for (sample = first_sample; sample <= last_sample; sample++) {
- if (sample != sctx->dbcb_copy_sample) {
- sctx->dbcb_copy_sample = sample;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
- }
+ for (sample = first_sample; sample <= last_sample; sample++) {
+ if (sample != sctx->dbcb_copy_sample) {
+ sctx->dbcb_copy_sample = sample;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ }
- si_blitter_begin(sctx, SI_DECOMPRESS);
- util_blitter_custom_depth_stencil(sctx->blitter, zsurf, cbsurf, 1 << sample,
- sctx->custom_dsa_flush, 1.0f);
- si_blitter_end(sctx);
- }
+ si_blitter_begin(sctx, SI_DECOMPRESS);
+ util_blitter_custom_depth_stencil(sctx->blitter, zsurf, cbsurf, 1 << sample,
+ sctx->custom_dsa_flush, 1.0f);
+ si_blitter_end(sctx);
+ }
- pipe_surface_reference(&zsurf, NULL);
- pipe_surface_reference(&cbsurf, NULL);
- }
+ pipe_surface_reference(&zsurf, NULL);
+ pipe_surface_reference(&cbsurf, NULL);
+ }
- if (first_layer == 0 && last_layer >= max_layer &&
- first_sample == 0 && last_sample >= u_max_sample(&src->buffer.b.b))
- fully_copied_levels |= 1u << level;
- }
+ if (first_layer == 0 && last_layer >= max_layer && first_sample == 0 &&
+ last_sample >= u_max_sample(&src->buffer.b.b))
+ fully_copied_levels |= 1u << level;
+ }
- sctx->decompression_enabled = false;
- sctx->dbcb_depth_copy_enabled = false;
- sctx->dbcb_stencil_copy_enabled = false;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ sctx->decompression_enabled = false;
+ sctx->dbcb_depth_copy_enabled = false;
+ sctx->dbcb_stencil_copy_enabled = false;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
- return fully_copied_levels;
+ return fully_copied_levels;
}
/* Helper function for si_blit_decompress_zs_in_place.
*/
-static void
-si_blit_decompress_zs_planes_in_place(struct si_context *sctx,
- struct si_texture *texture,
- unsigned planes, unsigned level_mask,
- unsigned first_layer, unsigned last_layer)
+static void si_blit_decompress_zs_planes_in_place(struct si_context *sctx,
+ struct si_texture *texture, unsigned planes,
+ unsigned level_mask, unsigned first_layer,
+ unsigned last_layer)
{
- struct pipe_surface *zsurf, surf_tmpl = {{0}};
- unsigned layer, max_layer, checked_last_layer;
- unsigned fully_decompressed_mask = 0;
+ struct pipe_surface *zsurf, surf_tmpl = {{0}};
+ unsigned layer, max_layer, checked_last_layer;
+ unsigned fully_decompressed_mask = 0;
- if (!level_mask)
- return;
+ if (!level_mask)
+ return;
- if (planes & PIPE_MASK_S)
- sctx->db_flush_stencil_inplace = true;
- if (planes & PIPE_MASK_Z)
- sctx->db_flush_depth_inplace = true;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ if (planes & PIPE_MASK_S)
+ sctx->db_flush_stencil_inplace = true;
+ if (planes & PIPE_MASK_Z)
+ sctx->db_flush_depth_inplace = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
- surf_tmpl.format = texture->buffer.b.b.format;
+ surf_tmpl.format = texture->buffer.b.b.format;
- sctx->decompression_enabled = true;
+ sctx->decompression_enabled = true;
- while (level_mask) {
- unsigned level = u_bit_scan(&level_mask);
+ while (level_mask) {
+ unsigned level = u_bit_scan(&level_mask);
- surf_tmpl.u.tex.level = level;
+ surf_tmpl.u.tex.level = level;
- /* The smaller the mipmap level, the less layers there are
- * as far as 3D textures are concerned. */
- max_layer = util_max_layer(&texture->buffer.b.b, level);
- checked_last_layer = MIN2(last_layer, max_layer);
+ /* The smaller the mipmap level, the fewer layers there are
+ * as far as 3D textures are concerned. */
+ max_layer = util_max_layer(&texture->buffer.b.b, level);
+ checked_last_layer = MIN2(last_layer, max_layer);
- for (layer = first_layer; layer <= checked_last_layer; layer++) {
- surf_tmpl.u.tex.first_layer = layer;
- surf_tmpl.u.tex.last_layer = layer;
+ for (layer = first_layer; layer <= checked_last_layer; layer++) {
+ surf_tmpl.u.tex.first_layer = layer;
+ surf_tmpl.u.tex.last_layer = layer;
- zsurf = sctx->b.create_surface(&sctx->b, &texture->buffer.b.b, &surf_tmpl);
+ zsurf = sctx->b.create_surface(&sctx->b, &texture->buffer.b.b, &surf_tmpl);
- si_blitter_begin(sctx, SI_DECOMPRESS);
- util_blitter_custom_depth_stencil(sctx->blitter, zsurf, NULL, ~0,
- sctx->custom_dsa_flush,
- 1.0f);
- si_blitter_end(sctx);
+ si_blitter_begin(sctx, SI_DECOMPRESS);
+ util_blitter_custom_depth_stencil(sctx->blitter, zsurf, NULL, ~0, sctx->custom_dsa_flush,
+ 1.0f);
+ si_blitter_end(sctx);
- pipe_surface_reference(&zsurf, NULL);
- }
+ pipe_surface_reference(&zsurf, NULL);
+ }
- /* The texture will always be dirty if some layers aren't flushed.
- * I don't think this case occurs often though. */
- if (first_layer == 0 && last_layer >= max_layer) {
- fully_decompressed_mask |= 1u << level;
- }
- }
+ /* The texture will always be dirty if some layers aren't flushed.
+ * I don't think this case occurs often, though. */
+ if (first_layer == 0 && last_layer >= max_layer) {
+ fully_decompressed_mask |= 1u << level;
+ }
+ }
- if (planes & PIPE_MASK_Z)
- texture->dirty_level_mask &= ~fully_decompressed_mask;
- if (planes & PIPE_MASK_S)
- texture->stencil_dirty_level_mask &= ~fully_decompressed_mask;
+ if (planes & PIPE_MASK_Z)
+ texture->dirty_level_mask &= ~fully_decompressed_mask;
+ if (planes & PIPE_MASK_S)
+ texture->stencil_dirty_level_mask &= ~fully_decompressed_mask;
- sctx->decompression_enabled = false;
- sctx->db_flush_depth_inplace = false;
- sctx->db_flush_stencil_inplace = false;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ sctx->decompression_enabled = false;
+ sctx->db_flush_depth_inplace = false;
+ sctx->db_flush_stencil_inplace = false;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
}
/* Helper function of si_flush_depth_texture: decompress the given levels
* of Z and/or S planes in place.
*/
-static void
-si_blit_decompress_zs_in_place(struct si_context *sctx,
- struct si_texture *texture,
- unsigned levels_z, unsigned levels_s,
- unsigned first_layer, unsigned last_layer)
+static void si_blit_decompress_zs_in_place(struct si_context *sctx, struct si_texture *texture,
+ unsigned levels_z, unsigned levels_s,
+ unsigned first_layer, unsigned last_layer)
{
- unsigned both = levels_z & levels_s;
-
- /* First, do combined Z & S decompresses for levels that need it. */
- if (both) {
- si_blit_decompress_zs_planes_in_place(
- sctx, texture, PIPE_MASK_Z | PIPE_MASK_S,
- both,
- first_layer, last_layer);
- levels_z &= ~both;
- levels_s &= ~both;
- }
-
- /* Now do separate Z and S decompresses. */
- if (levels_z) {
- si_blit_decompress_zs_planes_in_place(
- sctx, texture, PIPE_MASK_Z,
- levels_z,
- first_layer, last_layer);
- }
-
- if (levels_s) {
- si_blit_decompress_zs_planes_in_place(
- sctx, texture, PIPE_MASK_S,
- levels_s,
- first_layer, last_layer);
- }
+ unsigned both = levels_z & levels_s;
+
+ /* First, do combined Z & S decompresses for levels that need it. */
+ if (both) {
+ si_blit_decompress_zs_planes_in_place(sctx, texture, PIPE_MASK_Z | PIPE_MASK_S, both,
+ first_layer, last_layer);
+ levels_z &= ~both;
+ levels_s &= ~both;
+ }
+
+ /* Now do separate Z and S decompresses. */
+ if (levels_z) {
+ si_blit_decompress_zs_planes_in_place(sctx, texture, PIPE_MASK_Z, levels_z, first_layer,
+ last_layer);
+ }
+
+ if (levels_s) {
+ si_blit_decompress_zs_planes_in_place(sctx, texture, PIPE_MASK_S, levels_s, first_layer,
+ last_layer);
+ }
}
-static void
-si_decompress_depth(struct si_context *sctx,
- struct si_texture *tex,
- unsigned required_planes,
- unsigned first_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer)
+static void si_decompress_depth(struct si_context *sctx, struct si_texture *tex,
+ unsigned required_planes, unsigned first_level, unsigned last_level,
+ unsigned first_layer, unsigned last_layer)
{
- unsigned inplace_planes = 0;
- unsigned copy_planes = 0;
- unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1);
- unsigned levels_z = 0;
- unsigned levels_s = 0;
-
- if (required_planes & PIPE_MASK_Z) {
- levels_z = level_mask & tex->dirty_level_mask;
-
- if (levels_z) {
- if (si_can_sample_zs(tex, false))
- inplace_planes |= PIPE_MASK_Z;
- else
- copy_planes |= PIPE_MASK_Z;
- }
- }
- if (required_planes & PIPE_MASK_S) {
- levels_s = level_mask & tex->stencil_dirty_level_mask;
-
- if (levels_s) {
- if (si_can_sample_zs(tex, true))
- inplace_planes |= PIPE_MASK_S;
- else
- copy_planes |= PIPE_MASK_S;
- }
- }
-
- if (unlikely(sctx->log))
- u_log_printf(sctx->log,
- "\n------------------------------------------------\n"
- "Decompress Depth (levels %u - %u, levels Z: 0x%x S: 0x%x)\n\n",
- first_level, last_level, levels_z, levels_s);
-
- /* We may have to allocate the flushed texture here when called from
- * si_decompress_subresource.
- */
- if (copy_planes &&
- (tex->flushed_depth_texture ||
- si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b))) {
- struct si_texture *dst = tex->flushed_depth_texture;
- unsigned fully_copied_levels;
- unsigned levels = 0;
-
- assert(tex->flushed_depth_texture);
-
- if (util_format_is_depth_and_stencil(dst->buffer.b.b.format))
- copy_planes = PIPE_MASK_Z | PIPE_MASK_S;
-
- if (copy_planes & PIPE_MASK_Z) {
- levels |= levels_z;
- levels_z = 0;
- }
- if (copy_planes & PIPE_MASK_S) {
- levels |= levels_s;
- levels_s = 0;
- }
-
- fully_copied_levels = si_blit_dbcb_copy(
- sctx, tex, dst, copy_planes, levels,
- first_layer, last_layer,
- 0, u_max_sample(&tex->buffer.b.b));
-
- if (copy_planes & PIPE_MASK_Z)
- tex->dirty_level_mask &= ~fully_copied_levels;
- if (copy_planes & PIPE_MASK_S)
- tex->stencil_dirty_level_mask &= ~fully_copied_levels;
- }
-
- if (inplace_planes) {
- bool has_htile = si_htile_enabled(tex, first_level, inplace_planes);
- bool tc_compat_htile = vi_tc_compat_htile_enabled(tex, first_level,
- inplace_planes);
-
- /* Don't decompress if there is no HTILE or when HTILE is
- * TC-compatible. */
- if (has_htile && !tc_compat_htile) {
- si_blit_decompress_zs_in_place(
- sctx, tex,
- levels_z, levels_s,
- first_layer, last_layer);
- } else {
- /* This is only a cache flush.
- *
- * Only clear the mask that we are flushing, because
- * si_make_DB_shader_coherent() treats different levels
- * and depth and stencil differently.
- */
- if (inplace_planes & PIPE_MASK_Z)
- tex->dirty_level_mask &= ~levels_z;
- if (inplace_planes & PIPE_MASK_S)
- tex->stencil_dirty_level_mask &= ~levels_s;
- }
-
- /* Only in-place decompression needs to flush DB caches, or
- * when we don't decompress but TC-compatible planes are dirty.
- */
- si_make_DB_shader_coherent(sctx, tex->buffer.b.b.nr_samples,
- inplace_planes & PIPE_MASK_S,
- tc_compat_htile);
- }
- /* set_framebuffer_state takes care of coherency for single-sample.
- * The DB->CB copy uses CB for the final writes.
- */
- if (copy_planes && tex->buffer.b.b.nr_samples > 1)
- si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples,
- false, true /* no DCC */);
+ unsigned inplace_planes = 0;
+ unsigned copy_planes = 0;
+ unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1);
+ unsigned levels_z = 0;
+ unsigned levels_s = 0;
+
+ if (required_planes & PIPE_MASK_Z) {
+ levels_z = level_mask & tex->dirty_level_mask;
+
+ if (levels_z) {
+ if (si_can_sample_zs(tex, false))
+ inplace_planes |= PIPE_MASK_Z;
+ else
+ copy_planes |= PIPE_MASK_Z;
+ }
+ }
+ if (required_planes & PIPE_MASK_S) {
+ levels_s = level_mask & tex->stencil_dirty_level_mask;
+
+ if (levels_s) {
+ if (si_can_sample_zs(tex, true))
+ inplace_planes |= PIPE_MASK_S;
+ else
+ copy_planes |= PIPE_MASK_S;
+ }
+ }
+
+ if (unlikely(sctx->log))
+ u_log_printf(sctx->log,
+ "\n------------------------------------------------\n"
+ "Decompress Depth (levels %u - %u, levels Z: 0x%x S: 0x%x)\n\n",
+ first_level, last_level, levels_z, levels_s);
+
+ /* We may have to allocate the flushed texture here when called from
+ * si_decompress_subresource.
+ */
+ if (copy_planes &&
+ (tex->flushed_depth_texture || si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b))) {
+ struct si_texture *dst = tex->flushed_depth_texture;
+ unsigned fully_copied_levels;
+ unsigned levels = 0;
+
+ assert(tex->flushed_depth_texture);
+
+ if (util_format_is_depth_and_stencil(dst->buffer.b.b.format))
+ copy_planes = PIPE_MASK_Z | PIPE_MASK_S;
+
+ if (copy_planes & PIPE_MASK_Z) {
+ levels |= levels_z;
+ levels_z = 0;
+ }
+ if (copy_planes & PIPE_MASK_S) {
+ levels |= levels_s;
+ levels_s = 0;
+ }
+
+ fully_copied_levels = si_blit_dbcb_copy(sctx, tex, dst, copy_planes, levels, first_layer,
+ last_layer, 0, u_max_sample(&tex->buffer.b.b));
+
+ if (copy_planes & PIPE_MASK_Z)
+ tex->dirty_level_mask &= ~fully_copied_levels;
+ if (copy_planes & PIPE_MASK_S)
+ tex->stencil_dirty_level_mask &= ~fully_copied_levels;
+ }
+
+ if (inplace_planes) {
+ bool has_htile = si_htile_enabled(tex, first_level, inplace_planes);
+ bool tc_compat_htile = vi_tc_compat_htile_enabled(tex, first_level, inplace_planes);
+
+ /* Don't decompress if there is no HTILE or when HTILE is
+ * TC-compatible. */
+ if (has_htile && !tc_compat_htile) {
+ si_blit_decompress_zs_in_place(sctx, tex, levels_z, levels_s, first_layer, last_layer);
+ } else {
+ /* This is only a cache flush.
+ *
+ * Only clear the mask that we are flushing, because
+ * si_make_DB_shader_coherent() handles each level differently,
+ * and depth differently from stencil.
+ */
+ if (inplace_planes & PIPE_MASK_Z)
+ tex->dirty_level_mask &= ~levels_z;
+ if (inplace_planes & PIPE_MASK_S)
+ tex->stencil_dirty_level_mask &= ~levels_s;
+ }
+
+ /* DB caches only need to be flushed for in-place decompression, or
+ * when we don't decompress but TC-compatible planes are dirty.
+ */
+ si_make_DB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, inplace_planes & PIPE_MASK_S,
+ tc_compat_htile);
+ }
+ /* set_framebuffer_state takes care of coherency for single-sample.
+ * The DB->CB copy uses CB for the final writes.
+ */
+ if (copy_planes && tex->buffer.b.b.nr_samples > 1)
+ si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, false, true /* no DCC */);
}
-static void
-si_decompress_sampler_depth_textures(struct si_context *sctx,
- struct si_samplers *textures)
+static void si_decompress_sampler_depth_textures(struct si_context *sctx,
+ struct si_samplers *textures)
{
- unsigned i;
- unsigned mask = textures->needs_depth_decompress_mask;
+ unsigned i;
+ unsigned mask = textures->needs_depth_decompress_mask;
- while (mask) {
- struct pipe_sampler_view *view;
- struct si_sampler_view *sview;
- struct si_texture *tex;
+ while (mask) {
+ struct pipe_sampler_view *view;
+ struct si_sampler_view *sview;
+ struct si_texture *tex;
- i = u_bit_scan(&mask);
+ i = u_bit_scan(&mask);
- view = textures->views[i];
- assert(view);
- sview = (struct si_sampler_view*)view;
+ view = textures->views[i];
+ assert(view);
+ sview = (struct si_sampler_view *)view;
- tex = (struct si_texture *)view->texture;
- assert(tex->db_compatible);
+ tex = (struct si_texture *)view->texture;
+ assert(tex->db_compatible);
- si_decompress_depth(sctx, tex,
- sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
- view->u.tex.first_level, view->u.tex.last_level,
- 0, util_max_layer(&tex->buffer.b.b, view->u.tex.first_level));
- }
+ si_decompress_depth(sctx, tex, sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
+ view->u.tex.first_level, view->u.tex.last_level, 0,
+ util_max_layer(&tex->buffer.b.b, view->u.tex.first_level));
+ }
}
-static void si_blit_decompress_color(struct si_context *sctx,
- struct si_texture *tex,
- unsigned first_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer,
- bool need_dcc_decompress,
- bool need_fmask_expand)
+static void si_blit_decompress_color(struct si_context *sctx, struct si_texture *tex,
+ unsigned first_level, unsigned last_level,
+ unsigned first_layer, unsigned last_layer,
+ bool need_dcc_decompress, bool need_fmask_expand)
{
- void* custom_blend;
- unsigned layer, checked_last_layer, max_layer;
- unsigned level_mask =
- u_bit_consecutive(first_level, last_level - first_level + 1);
-
- if (!need_dcc_decompress)
- level_mask &= tex->dirty_level_mask;
- if (!level_mask)
- goto expand_fmask;
-
- if (unlikely(sctx->log))
- u_log_printf(sctx->log,
- "\n------------------------------------------------\n"
- "Decompress Color (levels %u - %u, mask 0x%x)\n\n",
- first_level, last_level, level_mask);
-
- if (need_dcc_decompress) {
- custom_blend = sctx->custom_blend_dcc_decompress;
-
- assert(tex->surface.dcc_offset);
-
- /* disable levels without DCC */
- for (int i = first_level; i <= last_level; i++) {
- if (!vi_dcc_enabled(tex, i))
- level_mask &= ~(1 << i);
- }
- } else if (tex->surface.fmask_size) {
- custom_blend = sctx->custom_blend_fmask_decompress;
- } else {
- custom_blend = sctx->custom_blend_eliminate_fastclear;
- }
-
- sctx->decompression_enabled = true;
-
- while (level_mask) {
- unsigned level = u_bit_scan(&level_mask);
-
- /* The smaller the mipmap level, the less layers there are
- * as far as 3D textures are concerned. */
- max_layer = util_max_layer(&tex->buffer.b.b, level);
- checked_last_layer = MIN2(last_layer, max_layer);
-
- for (layer = first_layer; layer <= checked_last_layer; layer++) {
- struct pipe_surface *cbsurf, surf_tmpl;
-
- surf_tmpl.format = tex->buffer.b.b.format;
- surf_tmpl.u.tex.level = level;
- surf_tmpl.u.tex.first_layer = layer;
- surf_tmpl.u.tex.last_layer = layer;
- cbsurf = sctx->b.create_surface(&sctx->b, &tex->buffer.b.b, &surf_tmpl);
-
- /* Required before and after FMASK and DCC_DECOMPRESS. */
- if (custom_blend == sctx->custom_blend_fmask_decompress ||
- custom_blend == sctx->custom_blend_dcc_decompress)
- sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
-
- si_blitter_begin(sctx, SI_DECOMPRESS);
- util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend);
- si_blitter_end(sctx);
-
- if (custom_blend == sctx->custom_blend_fmask_decompress ||
- custom_blend == sctx->custom_blend_dcc_decompress)
- sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
-
- pipe_surface_reference(&cbsurf, NULL);
- }
-
- /* The texture will always be dirty if some layers aren't flushed.
- * I don't think this case occurs often though. */
- if (first_layer == 0 && last_layer >= max_layer) {
- tex->dirty_level_mask &= ~(1 << level);
- }
- }
-
- sctx->decompression_enabled = false;
- si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples,
- vi_dcc_enabled(tex, first_level),
- tex->surface.u.gfx9.dcc.pipe_aligned);
+ void *custom_blend;
+ unsigned layer, checked_last_layer, max_layer;
+ unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1);
+
+ if (!need_dcc_decompress)
+ level_mask &= tex->dirty_level_mask;
+ if (!level_mask)
+ goto expand_fmask;
+
+ if (unlikely(sctx->log))
+ u_log_printf(sctx->log,
+ "\n------------------------------------------------\n"
+ "Decompress Color (levels %u - %u, mask 0x%x)\n\n",
+ first_level, last_level, level_mask);
+
+ if (need_dcc_decompress) {
+ custom_blend = sctx->custom_blend_dcc_decompress;
+
+ assert(tex->surface.dcc_offset);
+
+ /* disable levels without DCC */
+ for (int i = first_level; i <= last_level; i++) {
+ if (!vi_dcc_enabled(tex, i))
+ level_mask &= ~(1 << i);
+ }
+ } else if (tex->surface.fmask_size) {
+ custom_blend = sctx->custom_blend_fmask_decompress;
+ } else {
+ custom_blend = sctx->custom_blend_eliminate_fastclear;
+ }
+
+ sctx->decompression_enabled = true;
+
+ while (level_mask) {
+ unsigned level = u_bit_scan(&level_mask);
+
+ /* The smaller the mipmap level, the fewer layers there are
+ * as far as 3D textures are concerned. */
+ max_layer = util_max_layer(&tex->buffer.b.b, level);
+ checked_last_layer = MIN2(last_layer, max_layer);
+
+ for (layer = first_layer; layer <= checked_last_layer; layer++) {
+ struct pipe_surface *cbsurf, surf_tmpl;
+
+ surf_tmpl.format = tex->buffer.b.b.format;
+ surf_tmpl.u.tex.level = level;
+ surf_tmpl.u.tex.first_layer = layer;
+ surf_tmpl.u.tex.last_layer = layer;
+ cbsurf = sctx->b.create_surface(&sctx->b, &tex->buffer.b.b, &surf_tmpl);
+
+ /* Required before and after FMASK and DCC_DECOMPRESS. */
+ if (custom_blend == sctx->custom_blend_fmask_decompress ||
+ custom_blend == sctx->custom_blend_dcc_decompress)
+ sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+
+ si_blitter_begin(sctx, SI_DECOMPRESS);
+ util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend);
+ si_blitter_end(sctx);
+
+ if (custom_blend == sctx->custom_blend_fmask_decompress ||
+ custom_blend == sctx->custom_blend_dcc_decompress)
+ sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+
+ pipe_surface_reference(&cbsurf, NULL);
+ }
+
+ /* The texture will always be dirty if some layers aren't flushed.
+ * I don't think this case occurs often, though. */
+ if (first_layer == 0 && last_layer >= max_layer) {
+ tex->dirty_level_mask &= ~(1 << level);
+ }
+ }
+
+ sctx->decompression_enabled = false;
+ si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, vi_dcc_enabled(tex, first_level),
+ tex->surface.u.gfx9.dcc.pipe_aligned);
expand_fmask:
- if (need_fmask_expand && tex->surface.fmask_offset && !tex->fmask_is_identity) {
- si_compute_expand_fmask(&sctx->b, &tex->buffer.b.b);
- tex->fmask_is_identity = true;
- }
+ if (need_fmask_expand && tex->surface.fmask_offset && !tex->fmask_is_identity) {
+ si_compute_expand_fmask(&sctx->b, &tex->buffer.b.b);
+ tex->fmask_is_identity = true;
+ }
}
-static void
-si_decompress_color_texture(struct si_context *sctx, struct si_texture *tex,
- unsigned first_level, unsigned last_level,
- bool need_fmask_expand)
+static void si_decompress_color_texture(struct si_context *sctx, struct si_texture *tex,
+ unsigned first_level, unsigned last_level,
+ bool need_fmask_expand)
{
- /* CMASK or DCC can be discarded and we can still end up here. */
- if (!tex->cmask_buffer && !tex->surface.fmask_size && !tex->surface.dcc_offset)
- return;
+ /* CMASK or DCC can be discarded and we can still end up here. */
+ if (!tex->cmask_buffer && !tex->surface.fmask_size && !tex->surface.dcc_offset)
+ return;
- si_blit_decompress_color(sctx, tex, first_level, last_level, 0,
- util_max_layer(&tex->buffer.b.b, first_level),
- false, need_fmask_expand);
+ si_blit_decompress_color(sctx, tex, first_level, last_level, 0,
+ util_max_layer(&tex->buffer.b.b, first_level), false,
+ need_fmask_expand);
}
-static void
-si_decompress_sampler_color_textures(struct si_context *sctx,
- struct si_samplers *textures)
+static void si_decompress_sampler_color_textures(struct si_context *sctx,
+ struct si_samplers *textures)
{
- unsigned i;
- unsigned mask = textures->needs_color_decompress_mask;
+ unsigned i;
+ unsigned mask = textures->needs_color_decompress_mask;
- while (mask) {
- struct pipe_sampler_view *view;
- struct si_texture *tex;
+ while (mask) {
+ struct pipe_sampler_view *view;
+ struct si_texture *tex;
- i = u_bit_scan(&mask);
+ i = u_bit_scan(&mask);
- view = textures->views[i];
- assert(view);
+ view = textures->views[i];
+ assert(view);
- tex = (struct si_texture *)view->texture;
+ tex = (struct si_texture *)view->texture;
- si_decompress_color_texture(sctx, tex, view->u.tex.first_level,
- view->u.tex.last_level, false);
- }
+ si_decompress_color_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level,
+ false);
+ }
}
-static void
-si_decompress_image_color_textures(struct si_context *sctx,
- struct si_images *images)
+static void si_decompress_image_color_textures(struct si_context *sctx, struct si_images *images)
{
- unsigned i;
- unsigned mask = images->needs_color_decompress_mask;
+ unsigned i;
+ unsigned mask = images->needs_color_decompress_mask;
- while (mask) {
- const struct pipe_image_view *view;
- struct si_texture *tex;
+ while (mask) {
+ const struct pipe_image_view *view;
+ struct si_texture *tex;
- i = u_bit_scan(&mask);
+ i = u_bit_scan(&mask);
- view = &images->views[i];
- assert(view->resource->target != PIPE_BUFFER);
+ view = &images->views[i];
+ assert(view->resource->target != PIPE_BUFFER);
- tex = (struct si_texture *)view->resource;
+ tex = (struct si_texture *)view->resource;
- si_decompress_color_texture(sctx, tex, view->u.tex.level,
- view->u.tex.level,
- view->access & PIPE_IMAGE_ACCESS_WRITE);
- }
+ si_decompress_color_texture(sctx, tex, view->u.tex.level, view->u.tex.level,
+ view->access & PIPE_IMAGE_ACCESS_WRITE);
+ }
}
-static void si_check_render_feedback_texture(struct si_context *sctx,
- struct si_texture *tex,
- unsigned first_level,
- unsigned last_level,
- unsigned first_layer,
- unsigned last_layer)
+static void si_check_render_feedback_texture(struct si_context *sctx, struct si_texture *tex,
+ unsigned first_level, unsigned last_level,
+ unsigned first_layer, unsigned last_layer)
{
- bool render_feedback = false;
+ bool render_feedback = false;
- if (!tex->surface.dcc_offset)
- return;
+ if (!tex->surface.dcc_offset)
+ return;
- for (unsigned j = 0; j < sctx->framebuffer.state.nr_cbufs; ++j) {
- struct si_surface * surf;
+ for (unsigned j = 0; j < sctx->framebuffer.state.nr_cbufs; ++j) {
+ struct si_surface *surf;
- if (!sctx->framebuffer.state.cbufs[j])
- continue;
+ if (!sctx->framebuffer.state.cbufs[j])
+ continue;
- surf = (struct si_surface*)sctx->framebuffer.state.cbufs[j];
+ surf = (struct si_surface *)sctx->framebuffer.state.cbufs[j];
- if (tex == (struct si_texture *)surf->base.texture &&
- surf->base.u.tex.level >= first_level &&
- surf->base.u.tex.level <= last_level &&
- surf->base.u.tex.first_layer <= last_layer &&
- surf->base.u.tex.last_layer >= first_layer) {
- render_feedback = true;
- break;
- }
- }
+ if (tex == (struct si_texture *)surf->base.texture && surf->base.u.tex.level >= first_level &&
+ surf->base.u.tex.level <= last_level && surf->base.u.tex.first_layer <= last_layer &&
+ surf->base.u.tex.last_layer >= first_layer) {
+ render_feedback = true;
+ break;
+ }
+ }
- if (render_feedback)
- si_texture_disable_dcc(sctx, tex);
+ if (render_feedback)
+ si_texture_disable_dcc(sctx, tex);
}
-static void si_check_render_feedback_textures(struct si_context *sctx,
- struct si_samplers *textures)
+static void si_check_render_feedback_textures(struct si_context *sctx, struct si_samplers *textures)
{
- uint32_t mask = textures->enabled_mask;
+ uint32_t mask = textures->enabled_mask;
- while (mask) {
- const struct pipe_sampler_view *view;
- struct si_texture *tex;
+ while (mask) {
+ const struct pipe_sampler_view *view;
+ struct si_texture *tex;
- unsigned i = u_bit_scan(&mask);
+ unsigned i = u_bit_scan(&mask);
- view = textures->views[i];
- if(view->texture->target == PIPE_BUFFER)
- continue;
+ view = textures->views[i];
+ if (view->texture->target == PIPE_BUFFER)
+ continue;
- tex = (struct si_texture *)view->texture;
+ tex = (struct si_texture *)view->texture;
- si_check_render_feedback_texture(sctx, tex,
- view->u.tex.first_level,
- view->u.tex.last_level,
- view->u.tex.first_layer,
- view->u.tex.last_layer);
- }
+ si_check_render_feedback_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level,
+ view->u.tex.first_layer, view->u.tex.last_layer);
+ }
}
-static void si_check_render_feedback_images(struct si_context *sctx,
- struct si_images *images)
+static void si_check_render_feedback_images(struct si_context *sctx, struct si_images *images)
{
- uint32_t mask = images->enabled_mask;
+ uint32_t mask = images->enabled_mask;
- while (mask) {
- const struct pipe_image_view *view;
- struct si_texture *tex;
+ while (mask) {
+ const struct pipe_image_view *view;
+ struct si_texture *tex;
- unsigned i = u_bit_scan(&mask);
+ unsigned i = u_bit_scan(&mask);
- view = &images->views[i];
- if (view->resource->target == PIPE_BUFFER)
- continue;
+ view = &images->views[i];
+ if (view->resource->target == PIPE_BUFFER)
+ continue;
- tex = (struct si_texture *)view->resource;
+ tex = (struct si_texture *)view->resource;
- si_check_render_feedback_texture(sctx, tex,
- view->u.tex.level,
- view->u.tex.level,
- view->u.tex.first_layer,
- view->u.tex.last_layer);
- }
+ si_check_render_feedback_texture(sctx, tex, view->u.tex.level, view->u.tex.level,
+ view->u.tex.first_layer, view->u.tex.last_layer);
+ }
}
static void si_check_render_feedback_resident_textures(struct si_context *sctx)
{
- util_dynarray_foreach(&sctx->resident_tex_handles,
- struct si_texture_handle *, tex_handle) {
- struct pipe_sampler_view *view;
- struct si_texture *tex;
-
- view = (*tex_handle)->view;
- if (view->texture->target == PIPE_BUFFER)
- continue;
-
- tex = (struct si_texture *)view->texture;
-
- si_check_render_feedback_texture(sctx, tex,
- view->u.tex.first_level,
- view->u.tex.last_level,
- view->u.tex.first_layer,
- view->u.tex.last_layer);
- }
+ util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
+ struct pipe_sampler_view *view;
+ struct si_texture *tex;
+
+ view = (*tex_handle)->view;
+ if (view->texture->target == PIPE_BUFFER)
+ continue;
+
+ tex = (struct si_texture *)view->texture;
+
+ si_check_render_feedback_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level,
+ view->u.tex.first_layer, view->u.tex.last_layer);
+ }
}
static void si_check_render_feedback_resident_images(struct si_context *sctx)
{
- util_dynarray_foreach(&sctx->resident_img_handles,
- struct si_image_handle *, img_handle) {
- struct pipe_image_view *view;
- struct si_texture *tex;
-
- view = &(*img_handle)->view;
- if (view->resource->target == PIPE_BUFFER)
- continue;
-
- tex = (struct si_texture *)view->resource;
-
- si_check_render_feedback_texture(sctx, tex,
- view->u.tex.level,
- view->u.tex.level,
- view->u.tex.first_layer,
- view->u.tex.last_layer);
- }
+ util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
+ struct pipe_image_view *view;
+ struct si_texture *tex;
+
+ view = &(*img_handle)->view;
+ if (view->resource->target == PIPE_BUFFER)
+ continue;
+
+ tex = (struct si_texture *)view->resource;
+
+ si_check_render_feedback_texture(sctx, tex, view->u.tex.level, view->u.tex.level,
+ view->u.tex.first_layer, view->u.tex.last_layer);
+ }
}
static void si_check_render_feedback(struct si_context *sctx)
{
- if (!sctx->need_check_render_feedback)
- return;
+ if (!sctx->need_check_render_feedback)
+ return;
- /* There is no render feedback if color writes are disabled.
- * (e.g. a pixel shader with image stores)
- */
- if (!si_get_total_colormask(sctx))
- return;
+ /* There is no render feedback if color writes are disabled.
+ * (e.g. a pixel shader with image stores)
+ */
+ if (!si_get_total_colormask(sctx))
+ return;
- for (int i = 0; i < SI_NUM_SHADERS; ++i) {
- si_check_render_feedback_images(sctx, &sctx->images[i]);
- si_check_render_feedback_textures(sctx, &sctx->samplers[i]);
- }
+ for (int i = 0; i < SI_NUM_SHADERS; ++i) {
+ si_check_render_feedback_images(sctx, &sctx->images[i]);
+ si_check_render_feedback_textures(sctx, &sctx->samplers[i]);
+ }
- si_check_render_feedback_resident_images(sctx);
- si_check_render_feedback_resident_textures(sctx);
+ si_check_render_feedback_resident_images(sctx);
+ si_check_render_feedback_resident_textures(sctx);
- sctx->need_check_render_feedback = false;
+ sctx->need_check_render_feedback = false;
}
static void si_decompress_resident_textures(struct si_context *sctx)
{
- util_dynarray_foreach(&sctx->resident_tex_needs_color_decompress,
- struct si_texture_handle *, tex_handle) {
- struct pipe_sampler_view *view = (*tex_handle)->view;
- struct si_texture *tex = (struct si_texture *)view->texture;
-
- si_decompress_color_texture(sctx, tex, view->u.tex.first_level,
- view->u.tex.last_level, false);
- }
-
- util_dynarray_foreach(&sctx->resident_tex_needs_depth_decompress,
- struct si_texture_handle *, tex_handle) {
- struct pipe_sampler_view *view = (*tex_handle)->view;
- struct si_sampler_view *sview = (struct si_sampler_view *)view;
- struct si_texture *tex = (struct si_texture *)view->texture;
-
- si_decompress_depth(sctx, tex,
- sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
- view->u.tex.first_level, view->u.tex.last_level,
- 0, util_max_layer(&tex->buffer.b.b, view->u.tex.first_level));
- }
+ util_dynarray_foreach (&sctx->resident_tex_needs_color_decompress, struct si_texture_handle *,
+ tex_handle) {
+ struct pipe_sampler_view *view = (*tex_handle)->view;
+ struct si_texture *tex = (struct si_texture *)view->texture;
+
+ si_decompress_color_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level,
+ false);
+ }
+
+ util_dynarray_foreach (&sctx->resident_tex_needs_depth_decompress, struct si_texture_handle *,
+ tex_handle) {
+ struct pipe_sampler_view *view = (*tex_handle)->view;
+ struct si_sampler_view *sview = (struct si_sampler_view *)view;
+ struct si_texture *tex = (struct si_texture *)view->texture;
+
+ si_decompress_depth(sctx, tex, sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
+ view->u.tex.first_level, view->u.tex.last_level, 0,
+ util_max_layer(&tex->buffer.b.b, view->u.tex.first_level));
+ }
}
static void si_decompress_resident_images(struct si_context *sctx)
{
- util_dynarray_foreach(&sctx->resident_img_needs_color_decompress,
- struct si_image_handle *, img_handle) {
- struct pipe_image_view *view = &(*img_handle)->view;
- struct si_texture *tex = (struct si_texture *)view->resource;
-
- si_decompress_color_texture(sctx, tex, view->u.tex.level,
- view->u.tex.level,
- view->access & PIPE_IMAGE_ACCESS_WRITE);
- }
+ util_dynarray_foreach (&sctx->resident_img_needs_color_decompress, struct si_image_handle *,
+ img_handle) {
+ struct pipe_image_view *view = &(*img_handle)->view;
+ struct si_texture *tex = (struct si_texture *)view->resource;
+
+ si_decompress_color_texture(sctx, tex, view->u.tex.level, view->u.tex.level,
+ view->access & PIPE_IMAGE_ACCESS_WRITE);
+ }
}
void si_decompress_textures(struct si_context *sctx, unsigned shader_mask)
{
- unsigned compressed_colortex_counter, mask;
-
- if (sctx->blitter->running)
- return;
-
- /* Update the compressed_colortex_mask if necessary. */
- compressed_colortex_counter = p_atomic_read(&sctx->screen->compressed_colortex_counter);
- if (compressed_colortex_counter != sctx->last_compressed_colortex_counter) {
- sctx->last_compressed_colortex_counter = compressed_colortex_counter;
- si_update_needs_color_decompress_masks(sctx);
- }
-
- /* Decompress color & depth textures if needed. */
- mask = sctx->shader_needs_decompress_mask & shader_mask;
- while (mask) {
- unsigned i = u_bit_scan(&mask);
-
- if (sctx->samplers[i].needs_depth_decompress_mask) {
- si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]);
- }
- if (sctx->samplers[i].needs_color_decompress_mask) {
- si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]);
- }
- if (sctx->images[i].needs_color_decompress_mask) {
- si_decompress_image_color_textures(sctx, &sctx->images[i]);
- }
- }
-
- if (shader_mask & u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)) {
- if (sctx->uses_bindless_samplers)
- si_decompress_resident_textures(sctx);
- if (sctx->uses_bindless_images)
- si_decompress_resident_images(sctx);
-
- if (sctx->ps_uses_fbfetch) {
- struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
- si_decompress_color_texture(sctx,
- (struct si_texture*)cb0->texture,
- cb0->u.tex.first_layer,
- cb0->u.tex.last_layer, false);
- }
-
- si_check_render_feedback(sctx);
- } else if (shader_mask & (1 << PIPE_SHADER_COMPUTE)) {
- if (sctx->cs_shader_state.program->sel.info.uses_bindless_samplers)
- si_decompress_resident_textures(sctx);
- if (sctx->cs_shader_state.program->sel.info.uses_bindless_images)
- si_decompress_resident_images(sctx);
- }
+ unsigned compressed_colortex_counter, mask;
+
+ if (sctx->blitter->running)
+ return;
+
+ /* Update the compressed_colortex_mask if necessary. */
+ compressed_colortex_counter = p_atomic_read(&sctx->screen->compressed_colortex_counter);
+ if (compressed_colortex_counter != sctx->last_compressed_colortex_counter) {
+ sctx->last_compressed_colortex_counter = compressed_colortex_counter;
+ si_update_needs_color_decompress_masks(sctx);
+ }
+
+ /* Decompress color & depth textures if needed. */
+ mask = sctx->shader_needs_decompress_mask & shader_mask;
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+
+ if (sctx->samplers[i].needs_depth_decompress_mask) {
+ si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]);
+ }
+ if (sctx->samplers[i].needs_color_decompress_mask) {
+ si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]);
+ }
+ if (sctx->images[i].needs_color_decompress_mask) {
+ si_decompress_image_color_textures(sctx, &sctx->images[i]);
+ }
+ }
+
+ if (shader_mask & u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)) {
+ if (sctx->uses_bindless_samplers)
+ si_decompress_resident_textures(sctx);
+ if (sctx->uses_bindless_images)
+ si_decompress_resident_images(sctx);
+
+ if (sctx->ps_uses_fbfetch) {
+ struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
+ si_decompress_color_texture(sctx, (struct si_texture *)cb0->texture,
+ cb0->u.tex.first_layer, cb0->u.tex.last_layer, false);
+ }
+
+ si_check_render_feedback(sctx);
+ } else if (shader_mask & (1 << PIPE_SHADER_COMPUTE)) {
+ if (sctx->cs_shader_state.program->sel.info.uses_bindless_samplers)
+ si_decompress_resident_textures(sctx);
+ if (sctx->cs_shader_state.program->sel.info.uses_bindless_images)
+ si_decompress_resident_images(sctx);
+ }
}
/* Helper for decompressing a portion of a color or depth resource before
* blitting if any decompression is needed.
* The driver doesn't decompress resources automatically while u_blitter is
* rendering. */
-void si_decompress_subresource(struct pipe_context *ctx,
- struct pipe_resource *tex,
- unsigned planes, unsigned level,
- unsigned first_layer, unsigned last_layer)
+void si_decompress_subresource(struct pipe_context *ctx, struct pipe_resource *tex, unsigned planes,
+ unsigned level, unsigned first_layer, unsigned last_layer)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_texture *stex = (struct si_texture*)tex;
-
- if (stex->db_compatible) {
- planes &= PIPE_MASK_Z | PIPE_MASK_S;
-
- if (!stex->surface.has_stencil)
- planes &= ~PIPE_MASK_S;
-
- /* If we've rendered into the framebuffer and it's a blitting
- * source, make sure the decompression pass is invoked
- * by dirtying the framebuffer.
- */
- if (sctx->framebuffer.state.zsbuf &&
- sctx->framebuffer.state.zsbuf->u.tex.level == level &&
- sctx->framebuffer.state.zsbuf->texture == tex)
- si_update_fb_dirtiness_after_rendering(sctx);
-
- si_decompress_depth(sctx, stex, planes,
- level, level,
- first_layer, last_layer);
- } else if (stex->surface.fmask_size || stex->cmask_buffer || stex->surface.dcc_offset) {
- /* If we've rendered into the framebuffer and it's a blitting
- * source, make sure the decompression pass is invoked
- * by dirtying the framebuffer.
- */
- for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
- if (sctx->framebuffer.state.cbufs[i] &&
- sctx->framebuffer.state.cbufs[i]->u.tex.level == level &&
- sctx->framebuffer.state.cbufs[i]->texture == tex) {
- si_update_fb_dirtiness_after_rendering(sctx);
- break;
- }
- }
-
- si_blit_decompress_color(sctx, stex, level, level,
- first_layer, last_layer, false, false);
- }
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_texture *stex = (struct si_texture *)tex;
+
+ if (stex->db_compatible) {
+ planes &= PIPE_MASK_Z | PIPE_MASK_S;
+
+ if (!stex->surface.has_stencil)
+ planes &= ~PIPE_MASK_S;
+
+ /* If we've rendered into the framebuffer and it's a blitting
+ * source, make sure the decompression pass is invoked
+ * by dirtying the framebuffer.
+ */
+ if (sctx->framebuffer.state.zsbuf && sctx->framebuffer.state.zsbuf->u.tex.level == level &&
+ sctx->framebuffer.state.zsbuf->texture == tex)
+ si_update_fb_dirtiness_after_rendering(sctx);
+
+ si_decompress_depth(sctx, stex, planes, level, level, first_layer, last_layer);
+ } else if (stex->surface.fmask_size || stex->cmask_buffer || stex->surface.dcc_offset) {
+ /* If we've rendered into the framebuffer and it's a blitting
+ * source, make sure the decompression pass is invoked
+ * by dirtying the framebuffer.
+ */
+ for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+ if (sctx->framebuffer.state.cbufs[i] &&
+ sctx->framebuffer.state.cbufs[i]->u.tex.level == level &&
+ sctx->framebuffer.state.cbufs[i]->texture == tex) {
+ si_update_fb_dirtiness_after_rendering(sctx);
+ break;
+ }
+ }
+
+ si_blit_decompress_color(sctx, stex, level, level, first_layer, last_layer, false, false);
+ }
}
struct texture_orig_info {
- unsigned format;
- unsigned width0;
- unsigned height0;
- unsigned npix_x;
- unsigned npix_y;
- unsigned npix0_x;
- unsigned npix0_y;
+ unsigned format;
+ unsigned width0;
+ unsigned height0;
+ unsigned npix_x;
+ unsigned npix_y;
+ unsigned npix0_x;
+ unsigned npix0_y;
};
-void si_resource_copy_region(struct pipe_context *ctx,
- struct pipe_resource *dst,
- unsigned dst_level,
- unsigned dstx, unsigned dsty, unsigned dstz,
- struct pipe_resource *src,
- unsigned src_level,
- const struct pipe_box *src_box)
+void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst,
+ unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *src, unsigned src_level,
+ const struct pipe_box *src_box)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_texture *ssrc = (struct si_texture*)src;
- struct si_texture *sdst = (struct si_texture*)dst;
- struct pipe_surface *dst_view, dst_templ;
- struct pipe_sampler_view src_templ, *src_view;
- unsigned dst_width, dst_height, src_width0, src_height0;
- unsigned dst_width0, dst_height0, src_force_level = 0;
- struct pipe_box sbox, dstbox;
-
- /* Handle buffers first. */
- if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
- si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width);
- return;
- }
-
- if (!util_format_is_compressed(src->format) &&
- !util_format_is_compressed(dst->format) &&
- !util_format_is_depth_or_stencil(src->format) &&
- src->nr_samples <= 1 &&
- !sdst->surface.dcc_offset &&
- !(dst->target != src->target &&
- (src->target == PIPE_TEXTURE_1D_ARRAY || dst->target == PIPE_TEXTURE_1D_ARRAY))) {
- si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, src_box);
- return;
- }
-
- assert(u_max_sample(dst) == u_max_sample(src));
-
- /* The driver doesn't decompress resources automatically while
- * u_blitter is rendering. */
- si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level,
- src_box->z, src_box->z + src_box->depth - 1);
-
- dst_width = u_minify(dst->width0, dst_level);
- dst_height = u_minify(dst->height0, dst_level);
- dst_width0 = dst->width0;
- dst_height0 = dst->height0;
- src_width0 = src->width0;
- src_height0 = src->height0;
-
- util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz);
- util_blitter_default_src_texture(sctx->blitter, &src_templ, src, src_level);
-
- if (util_format_is_compressed(src->format) ||
- util_format_is_compressed(dst->format)) {
- unsigned blocksize = ssrc->surface.bpe;
-
- if (blocksize == 8)
- src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */
- else
- src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */
- dst_templ.format = src_templ.format;
-
- dst_width = util_format_get_nblocksx(dst->format, dst_width);
- dst_height = util_format_get_nblocksy(dst->format, dst_height);
- dst_width0 = util_format_get_nblocksx(dst->format, dst_width0);
- dst_height0 = util_format_get_nblocksy(dst->format, dst_height0);
- src_width0 = util_format_get_nblocksx(src->format, src_width0);
- src_height0 = util_format_get_nblocksy(src->format, src_height0);
-
- dstx = util_format_get_nblocksx(dst->format, dstx);
- dsty = util_format_get_nblocksy(dst->format, dsty);
-
- sbox.x = util_format_get_nblocksx(src->format, src_box->x);
- sbox.y = util_format_get_nblocksy(src->format, src_box->y);
- sbox.z = src_box->z;
- sbox.width = util_format_get_nblocksx(src->format, src_box->width);
- sbox.height = util_format_get_nblocksy(src->format, src_box->height);
- sbox.depth = src_box->depth;
- src_box = &sbox;
-
- src_force_level = src_level;
- } else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src)) {
- if (util_format_is_subsampled_422(src->format)) {
- src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
- dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
-
- dst_width = util_format_get_nblocksx(dst->format, dst_width);
- dst_width0 = util_format_get_nblocksx(dst->format, dst_width0);
- src_width0 = util_format_get_nblocksx(src->format, src_width0);
-
- dstx = util_format_get_nblocksx(dst->format, dstx);
-
- sbox = *src_box;
- sbox.x = util_format_get_nblocksx(src->format, src_box->x);
- sbox.width = util_format_get_nblocksx(src->format, src_box->width);
- src_box = &sbox;
- } else {
- unsigned blocksize = ssrc->surface.bpe;
-
- switch (blocksize) {
- case 1:
- dst_templ.format = PIPE_FORMAT_R8_UNORM;
- src_templ.format = PIPE_FORMAT_R8_UNORM;
- break;
- case 2:
- dst_templ.format = PIPE_FORMAT_R8G8_UNORM;
- src_templ.format = PIPE_FORMAT_R8G8_UNORM;
- break;
- case 4:
- dst_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM;
- src_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM;
- break;
- case 8:
- dst_templ.format = PIPE_FORMAT_R16G16B16A16_UINT;
- src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT;
- break;
- case 16:
- dst_templ.format = PIPE_FORMAT_R32G32B32A32_UINT;
- src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT;
- break;
- default:
- fprintf(stderr, "Unhandled format %s with blocksize %u\n",
- util_format_short_name(src->format), blocksize);
- assert(0);
- }
- }
- }
-
- /* SNORM8 blitting has precision issues on some chips. Use the SINT
- * equivalent instead, which doesn't force DCC decompression.
- * Note that some chips avoid this issue by using SDMA.
- */
- if (util_format_is_snorm8(dst_templ.format)) {
- dst_templ.format = src_templ.format =
- util_format_snorm8_to_sint8(dst_templ.format);
- }
-
- vi_disable_dcc_if_incompatible_format(sctx, dst, dst_level,
- dst_templ.format);
- vi_disable_dcc_if_incompatible_format(sctx, src, src_level,
- src_templ.format);
-
- /* Initialize the surface. */
- dst_view = si_create_surface_custom(ctx, dst, &dst_templ,
- dst_width0, dst_height0,
- dst_width, dst_height);
-
- /* Initialize the sampler view. */
- src_view = si_create_sampler_view_custom(ctx, src, &src_templ,
- src_width0, src_height0,
- src_force_level);
-
- u_box_3d(dstx, dsty, dstz, abs(src_box->width), abs(src_box->height),
- abs(src_box->depth), &dstbox);
-
- /* Copy. */
- si_blitter_begin(sctx, SI_COPY);
- util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox,
- src_view, src_box, src_width0, src_height0,
- PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
- false);
- si_blitter_end(sctx);
-
- pipe_surface_reference(&dst_view, NULL);
- pipe_sampler_view_reference(&src_view, NULL);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_texture *ssrc = (struct si_texture *)src;
+ struct si_texture *sdst = (struct si_texture *)dst;
+ struct pipe_surface *dst_view, dst_templ;
+ struct pipe_sampler_view src_templ, *src_view;
+ unsigned dst_width, dst_height, src_width0, src_height0;
+ unsigned dst_width0, dst_height0, src_force_level = 0;
+ struct pipe_box sbox, dstbox;
+
+ /* Handle buffers first. */
+ if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
+ si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width);
+ return;
+ }
+
+ if (!util_format_is_compressed(src->format) && !util_format_is_compressed(dst->format) &&
+ !util_format_is_depth_or_stencil(src->format) && src->nr_samples <= 1 &&
+ !sdst->surface.dcc_offset &&
+ !(dst->target != src->target &&
+ (src->target == PIPE_TEXTURE_1D_ARRAY || dst->target == PIPE_TEXTURE_1D_ARRAY))) {
+ si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, src_box);
+ return;
+ }
+
+ assert(u_max_sample(dst) == u_max_sample(src));
+
+ /* The driver doesn't decompress resources automatically while
+ * u_blitter is rendering. */
+ si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z,
+ src_box->z + src_box->depth - 1);
+
+ dst_width = u_minify(dst->width0, dst_level);
+ dst_height = u_minify(dst->height0, dst_level);
+ dst_width0 = dst->width0;
+ dst_height0 = dst->height0;
+ src_width0 = src->width0;
+ src_height0 = src->height0;
+
+ util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz);
+ util_blitter_default_src_texture(sctx->blitter, &src_templ, src, src_level);
+
+ if (util_format_is_compressed(src->format) || util_format_is_compressed(dst->format)) {
+ unsigned blocksize = ssrc->surface.bpe;
+
+ if (blocksize == 8)
+ src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */
+ else
+ src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */
+ dst_templ.format = src_templ.format;
+
+ dst_width = util_format_get_nblocksx(dst->format, dst_width);
+ dst_height = util_format_get_nblocksy(dst->format, dst_height);
+ dst_width0 = util_format_get_nblocksx(dst->format, dst_width0);
+ dst_height0 = util_format_get_nblocksy(dst->format, dst_height0);
+ src_width0 = util_format_get_nblocksx(src->format, src_width0);
+ src_height0 = util_format_get_nblocksy(src->format, src_height0);
+
+ dstx = util_format_get_nblocksx(dst->format, dstx);
+ dsty = util_format_get_nblocksy(dst->format, dsty);
+
+ sbox.x = util_format_get_nblocksx(src->format, src_box->x);
+ sbox.y = util_format_get_nblocksy(src->format, src_box->y);
+ sbox.z = src_box->z;
+ sbox.width = util_format_get_nblocksx(src->format, src_box->width);
+ sbox.height = util_format_get_nblocksy(src->format, src_box->height);
+ sbox.depth = src_box->depth;
+ src_box = &sbox;
+
+ src_force_level = src_level;
+ } else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src)) {
+ if (util_format_is_subsampled_422(src->format)) {
+ src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
+ dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
+
+ dst_width = util_format_get_nblocksx(dst->format, dst_width);
+ dst_width0 = util_format_get_nblocksx(dst->format, dst_width0);
+ src_width0 = util_format_get_nblocksx(src->format, src_width0);
+
+ dstx = util_format_get_nblocksx(dst->format, dstx);
+
+ sbox = *src_box;
+ sbox.x = util_format_get_nblocksx(src->format, src_box->x);
+ sbox.width = util_format_get_nblocksx(src->format, src_box->width);
+ src_box = &sbox;
+ } else {
+ unsigned blocksize = ssrc->surface.bpe;
+
+ switch (blocksize) {
+ case 1:
+ dst_templ.format = PIPE_FORMAT_R8_UNORM;
+ src_templ.format = PIPE_FORMAT_R8_UNORM;
+ break;
+ case 2:
+ dst_templ.format = PIPE_FORMAT_R8G8_UNORM;
+ src_templ.format = PIPE_FORMAT_R8G8_UNORM;
+ break;
+ case 4:
+ dst_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM;
+ src_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM;
+ break;
+ case 8:
+ dst_templ.format = PIPE_FORMAT_R16G16B16A16_UINT;
+ src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT;
+ break;
+ case 16:
+ dst_templ.format = PIPE_FORMAT_R32G32B32A32_UINT;
+ src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT;
+ break;
+ default:
+ fprintf(stderr, "Unhandled format %s with blocksize %u\n",
+ util_format_short_name(src->format), blocksize);
+ assert(0);
+ }
+ }
+ }
+
+ /* SNORM8 blitting has precision issues on some chips. Use the SINT
+ * equivalent instead, which doesn't force DCC decompression.
+ * Note that some chips avoid this issue by using SDMA.
+ */
+ if (util_format_is_snorm8(dst_templ.format)) {
+ dst_templ.format = src_templ.format = util_format_snorm8_to_sint8(dst_templ.format);
+ }
+
+ vi_disable_dcc_if_incompatible_format(sctx, dst, dst_level, dst_templ.format);
+ vi_disable_dcc_if_incompatible_format(sctx, src, src_level, src_templ.format);
+
+ /* Initialize the surface. */
+ dst_view = si_create_surface_custom(ctx, dst, &dst_templ, dst_width0, dst_height0, dst_width,
+ dst_height);
+
+ /* Initialize the sampler view. */
+ src_view =
+ si_create_sampler_view_custom(ctx, src, &src_templ, src_width0, src_height0, src_force_level);
+
+ u_box_3d(dstx, dsty, dstz, abs(src_box->width), abs(src_box->height), abs(src_box->depth),
+ &dstbox);
+
+ /* Copy. */
+ si_blitter_begin(sctx, SI_COPY);
+ util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox, src_view, src_box, src_width0,
+ src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, false);
+ si_blitter_end(sctx);
+
+ pipe_surface_reference(&dst_view, NULL);
+ pipe_sampler_view_reference(&src_view, NULL);
}
-static void si_do_CB_resolve(struct si_context *sctx,
- const struct pipe_blit_info *info,
- struct pipe_resource *dst,
- unsigned dst_level, unsigned dst_z,
- enum pipe_format format)
+static void si_do_CB_resolve(struct si_context *sctx, const struct pipe_blit_info *info,
+ struct pipe_resource *dst, unsigned dst_level, unsigned dst_z,
+ enum pipe_format format)
{
- /* Required before and after CB_RESOLVE. */
- sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
-
- si_blitter_begin(sctx, SI_COLOR_RESOLVE |
- (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
- util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z,
- info->src.resource, info->src.box.z,
- ~0, sctx->custom_blend_resolve,
- format);
- si_blitter_end(sctx);
-
- /* Flush caches for possible texturing. */
- si_make_CB_shader_coherent(sctx, 1, false, true /* no DCC */);
+ /* Required before and after CB_RESOLVE. */
+ sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+
+ si_blitter_begin(
+ sctx, SI_COLOR_RESOLVE | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
+ util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z, info->src.resource,
+ info->src.box.z, ~0, sctx->custom_blend_resolve, format);
+ si_blitter_end(sctx);
+
+ /* Flush caches for possible texturing. */
+ si_make_CB_shader_coherent(sctx, 1, false, true /* no DCC */);
}
-static bool do_hardware_msaa_resolve(struct pipe_context *ctx,
- const struct pipe_blit_info *info)
+static bool do_hardware_msaa_resolve(struct pipe_context *ctx, const struct pipe_blit_info *info)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_texture *src = (struct si_texture*)info->src.resource;
- struct si_texture *dst = (struct si_texture*)info->dst.resource;
- ASSERTED struct si_texture *stmp;
- unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level);
- unsigned dst_height = u_minify(info->dst.resource->height0, info->dst.level);
- enum pipe_format format = info->src.format;
- struct pipe_resource *tmp, templ;
- struct pipe_blit_info blit;
-
- /* Check basic requirements for hw resolve. */
- if (!(info->src.resource->nr_samples > 1 &&
- info->dst.resource->nr_samples <= 1 &&
- !util_format_is_pure_integer(format) &&
- !util_format_is_depth_or_stencil(format) &&
- util_max_layer(info->src.resource, 0) == 0))
- return false;
-
- /* Hardware MSAA resolve doesn't work if SPI format = NORM16_ABGR and
- * the format is R16G16. Use R16A16, which does work.
- */
- if (format == PIPE_FORMAT_R16G16_UNORM)
- format = PIPE_FORMAT_R16A16_UNORM;
- if (format == PIPE_FORMAT_R16G16_SNORM)
- format = PIPE_FORMAT_R16A16_SNORM;
-
- /* Check the remaining requirements for hw resolve. */
- if (util_max_layer(info->dst.resource, info->dst.level) == 0 &&
- !info->scissor_enable &&
- (info->mask & PIPE_MASK_RGBA) == PIPE_MASK_RGBA &&
- util_is_format_compatible(util_format_description(info->src.format),
- util_format_description(info->dst.format)) &&
- dst_width == info->src.resource->width0 &&
- dst_height == info->src.resource->height0 &&
- info->dst.box.x == 0 &&
- info->dst.box.y == 0 &&
- info->dst.box.width == dst_width &&
- info->dst.box.height == dst_height &&
- info->dst.box.depth == 1 &&
- info->src.box.x == 0 &&
- info->src.box.y == 0 &&
- info->src.box.width == dst_width &&
- info->src.box.height == dst_height &&
- info->src.box.depth == 1 &&
- !dst->surface.is_linear &&
- (!dst->cmask_buffer || !dst->dirty_level_mask)) { /* dst cannot be fast-cleared */
- /* Check the last constraint. */
- if (src->surface.micro_tile_mode != dst->surface.micro_tile_mode) {
- /* The next fast clear will switch to this mode to
- * get direct hw resolve next time if the mode is
- * different now.
- *
- * TODO-GFX10: This does not work in GFX10 because MSAA
- * is restricted to 64KB_R_X and 64KB_Z_X swizzle modes.
- * In some cases we could change the swizzle of the
- * destination texture instead, but the more general
- * solution is to implement compute shader resolve.
- */
- src->last_msaa_resolve_target_micro_mode =
- dst->surface.micro_tile_mode;
- goto resolve_to_temp;
- }
-
- /* Resolving into a surface with DCC is unsupported. Since
- * it's being overwritten anyway, clear it to uncompressed.
- * This is still the fastest codepath even with this clear.
- */
- if (vi_dcc_enabled(dst, info->dst.level)) {
- if (!vi_dcc_clear_level(sctx, dst, info->dst.level,
- DCC_UNCOMPRESSED))
- goto resolve_to_temp;
-
- dst->dirty_level_mask &= ~(1 << info->dst.level);
- }
-
- /* Resolve directly from src to dst. */
- si_do_CB_resolve(sctx, info, info->dst.resource,
- info->dst.level, info->dst.box.z, format);
- return true;
- }
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_texture *src = (struct si_texture *)info->src.resource;
+ struct si_texture *dst = (struct si_texture *)info->dst.resource;
+ ASSERTED struct si_texture *stmp;
+ unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level);
+ unsigned dst_height = u_minify(info->dst.resource->height0, info->dst.level);
+ enum pipe_format format = info->src.format;
+ struct pipe_resource *tmp, templ;
+ struct pipe_blit_info blit;
+
+ /* Check basic requirements for hw resolve. */
+ if (!(info->src.resource->nr_samples > 1 && info->dst.resource->nr_samples <= 1 &&
+ !util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format) &&
+ util_max_layer(info->src.resource, 0) == 0))
+ return false;
+
+ /* Hardware MSAA resolve doesn't work if SPI format = NORM16_ABGR and
+ * the format is R16G16. Use R16A16, which does work.
+ */
+ if (format == PIPE_FORMAT_R16G16_UNORM)
+ format = PIPE_FORMAT_R16A16_UNORM;
+ if (format == PIPE_FORMAT_R16G16_SNORM)
+ format = PIPE_FORMAT_R16A16_SNORM;
+
+ /* Check the remaining requirements for hw resolve. */
+ if (util_max_layer(info->dst.resource, info->dst.level) == 0 && !info->scissor_enable &&
+ (info->mask & PIPE_MASK_RGBA) == PIPE_MASK_RGBA &&
+ util_is_format_compatible(util_format_description(info->src.format),
+ util_format_description(info->dst.format)) &&
+ dst_width == info->src.resource->width0 && dst_height == info->src.resource->height0 &&
+ info->dst.box.x == 0 && info->dst.box.y == 0 && info->dst.box.width == dst_width &&
+ info->dst.box.height == dst_height && info->dst.box.depth == 1 && info->src.box.x == 0 &&
+ info->src.box.y == 0 && info->src.box.width == dst_width &&
+ info->src.box.height == dst_height && info->src.box.depth == 1 && !dst->surface.is_linear &&
+ (!dst->cmask_buffer || !dst->dirty_level_mask)) { /* dst cannot be fast-cleared */
+ /* Check the last constraint. */
+ if (src->surface.micro_tile_mode != dst->surface.micro_tile_mode) {
+ /* The next fast clear will switch to this mode to
+ * get direct hw resolve next time if the mode is
+ * different now.
+ *
+ * TODO-GFX10: This does not work in GFX10 because MSAA
+ * is restricted to 64KB_R_X and 64KB_Z_X swizzle modes.
+ * In some cases we could change the swizzle of the
+ * destination texture instead, but the more general
+ * solution is to implement compute shader resolve.
+ */
+ src->last_msaa_resolve_target_micro_mode = dst->surface.micro_tile_mode;
+ goto resolve_to_temp;
+ }
+
+ /* Resolving into a surface with DCC is unsupported. Since
+ * it's being overwritten anyway, clear it to uncompressed.
+ * This is still the fastest codepath even with this clear.
+ */
+ if (vi_dcc_enabled(dst, info->dst.level)) {
+ if (!vi_dcc_clear_level(sctx, dst, info->dst.level, DCC_UNCOMPRESSED))
+ goto resolve_to_temp;
+
+ dst->dirty_level_mask &= ~(1 << info->dst.level);
+ }
+
+ /* Resolve directly from src to dst. */
+ si_do_CB_resolve(sctx, info, info->dst.resource, info->dst.level, info->dst.box.z, format);
+ return true;
+ }
resolve_to_temp:
- /* Shader-based resolve is VERY SLOW. Instead, resolve into
- * a temporary texture and blit.
- */
- memset(&templ, 0, sizeof(templ));
- templ.target = PIPE_TEXTURE_2D;
- templ.format = info->src.resource->format;
- templ.width0 = info->src.resource->width0;
- templ.height0 = info->src.resource->height0;
- templ.depth0 = 1;
- templ.array_size = 1;
- templ.usage = PIPE_USAGE_DEFAULT;
- templ.flags = SI_RESOURCE_FLAG_FORCE_MSAA_TILING |
- SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE |
- SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(src->surface.micro_tile_mode) |
- SI_RESOURCE_FLAG_DISABLE_DCC;
-
- /* The src and dst microtile modes must be the same. */
- if (sctx->chip_class <= GFX8 &&
- src->surface.micro_tile_mode == RADEON_MICRO_MODE_DISPLAY)
- templ.bind = PIPE_BIND_SCANOUT;
- else
- templ.bind = 0;
-
- tmp = ctx->screen->resource_create(ctx->screen, &templ);
- if (!tmp)
- return false;
- stmp = (struct si_texture*)tmp;
-
- assert(!stmp->surface.is_linear);
- assert(src->surface.micro_tile_mode == stmp->surface.micro_tile_mode);
-
- /* resolve */
- si_do_CB_resolve(sctx, info, tmp, 0, 0, format);
-
- /* blit */
- blit = *info;
- blit.src.resource = tmp;
- blit.src.box.z = 0;
-
- si_blitter_begin(sctx, SI_BLIT |
- (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
- util_blitter_blit(sctx->blitter, &blit);
- si_blitter_end(sctx);
-
- pipe_resource_reference(&tmp, NULL);
- return true;
+ /* Shader-based resolve is VERY SLOW. Instead, resolve into
+ * a temporary texture and blit.
+ */
+ memset(&templ, 0, sizeof(templ));
+ templ.target = PIPE_TEXTURE_2D;
+ templ.format = info->src.resource->format;
+ templ.width0 = info->src.resource->width0;
+ templ.height0 = info->src.resource->height0;
+ templ.depth0 = 1;
+ templ.array_size = 1;
+ templ.usage = PIPE_USAGE_DEFAULT;
+ templ.flags = SI_RESOURCE_FLAG_FORCE_MSAA_TILING | SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE |
+ SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(src->surface.micro_tile_mode) |
+ SI_RESOURCE_FLAG_DISABLE_DCC;
+
+ /* The src and dst microtile modes must be the same. */
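+ /* On GFX8 and older, the display micro-tile mode is tied to scanout
+ * surfaces, so PIPE_BIND_SCANOUT is what makes the temporary texture pick
+ * the same micro-tile mode as a displayable source. */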
+ if (sctx->chip_class <= GFX8 && src->surface.micro_tile_mode == RADEON_MICRO_MODE_DISPLAY)
+ templ.bind = PIPE_BIND_SCANOUT;
+ else
+ templ.bind = 0;
+
+ tmp = ctx->screen->resource_create(ctx->screen, &templ);
+ if (!tmp)
+ return false;
+ stmp = (struct si_texture *)tmp;
+
+ assert(!stmp->surface.is_linear);
+ assert(src->surface.micro_tile_mode == stmp->surface.micro_tile_mode);
+
+ /* resolve */
+ si_do_CB_resolve(sctx, info, tmp, 0, 0, format);
+
+ /* blit */
+ blit = *info;
+ blit.src.resource = tmp;
+ blit.src.box.z = 0;
+
+ si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
+ util_blitter_blit(sctx->blitter, &blit);
+ si_blitter_end(sctx);
+
+ pipe_resource_reference(&tmp, NULL);
+ return true;
}
-static void si_blit(struct pipe_context *ctx,
- const struct pipe_blit_info *info)
+static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_texture *dst = (struct si_texture *)info->dst.resource;
-
- if (do_hardware_msaa_resolve(ctx, info)) {
- return;
- }
-
- /* Using SDMA for copying to a linear texture in GTT is much faster.
- * This improves DRI PRIME performance.
- *
- * resource_copy_region can't do this yet, because dma_copy calls it
- * on failure (recursion).
- */
- if (dst->surface.is_linear &&
- util_can_blit_via_copy_region(info, false)) {
- sctx->dma_copy(ctx, info->dst.resource, info->dst.level,
- info->dst.box.x, info->dst.box.y,
- info->dst.box.z,
- info->src.resource, info->src.level,
- &info->src.box);
- return;
- }
-
- assert(util_blitter_is_blit_supported(sctx->blitter, info));
-
- /* The driver doesn't decompress resources automatically while
- * u_blitter is rendering. */
- vi_disable_dcc_if_incompatible_format(sctx, info->src.resource,
- info->src.level,
- info->src.format);
- vi_disable_dcc_if_incompatible_format(sctx, info->dst.resource,
- info->dst.level,
- info->dst.format);
- si_decompress_subresource(ctx, info->src.resource, PIPE_MASK_RGBAZS,
- info->src.level,
- info->src.box.z,
- info->src.box.z + info->src.box.depth - 1);
-
- if (sctx->screen->debug_flags & DBG(FORCE_SDMA) &&
- util_try_blit_via_copy_region(ctx, info))
- return;
-
- si_blitter_begin(sctx, SI_BLIT |
- (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
- util_blitter_blit(sctx->blitter, info);
- si_blitter_end(sctx);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_texture *dst = (struct si_texture *)info->dst.resource;
+
+ if (do_hardware_msaa_resolve(ctx, info)) {
+ return;
+ }
+
+ /* Using SDMA for copying to a linear texture in GTT is much faster.
+ * This improves DRI PRIME performance.
+ *
+ * resource_copy_region can't do this yet, because dma_copy calls it
+ * on failure (recursion).
+ */
+ if (dst->surface.is_linear && util_can_blit_via_copy_region(info, false)) {
+ sctx->dma_copy(ctx, info->dst.resource, info->dst.level, info->dst.box.x, info->dst.box.y,
+ info->dst.box.z, info->src.resource, info->src.level, &info->src.box);
+ return;
+ }
+
+ assert(util_blitter_is_blit_supported(sctx->blitter, info));
+
+ /* The driver doesn't decompress resources automatically while
+ * u_blitter is rendering. */
+ vi_disable_dcc_if_incompatible_format(sctx, info->src.resource, info->src.level,
+ info->src.format);
+ vi_disable_dcc_if_incompatible_format(sctx, info->dst.resource, info->dst.level,
+ info->dst.format);
+ si_decompress_subresource(ctx, info->src.resource, PIPE_MASK_RGBAZS, info->src.level,
+ info->src.box.z, info->src.box.z + info->src.box.depth - 1);
+
+ if (sctx->screen->debug_flags & DBG(FORCE_SDMA) && util_try_blit_via_copy_region(ctx, info))
+ return;
+
+ si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
+ util_blitter_blit(sctx->blitter, info);
+ si_blitter_end(sctx);
}
-static bool si_generate_mipmap(struct pipe_context *ctx,
- struct pipe_resource *tex,
- enum pipe_format format,
- unsigned base_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer)
+static bool si_generate_mipmap(struct pipe_context *ctx, struct pipe_resource *tex,
+ enum pipe_format format, unsigned base_level, unsigned last_level,
+ unsigned first_layer, unsigned last_layer)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_texture *stex = (struct si_texture *)tex;
-
- if (!util_blitter_is_copy_supported(sctx->blitter, tex, tex))
- return false;
-
- /* The driver doesn't decompress resources automatically while
- * u_blitter is rendering. */
- vi_disable_dcc_if_incompatible_format(sctx, tex, base_level,
- format);
- si_decompress_subresource(ctx, tex, PIPE_MASK_RGBAZS,
- base_level, first_layer, last_layer);
-
- /* Clear dirty_level_mask for the levels that will be overwritten. */
- assert(base_level < last_level);
- stex->dirty_level_mask &= ~u_bit_consecutive(base_level + 1,
- last_level - base_level);
-
- sctx->generate_mipmap_for_depth = stex->is_depth;
-
- si_blitter_begin(sctx, SI_BLIT | SI_DISABLE_RENDER_COND);
- util_blitter_generate_mipmap(sctx->blitter, tex, format,
- base_level, last_level,
- first_layer, last_layer);
- si_blitter_end(sctx);
-
- sctx->generate_mipmap_for_depth = false;
- return true;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_texture *stex = (struct si_texture *)tex;
+
+ if (!util_blitter_is_copy_supported(sctx->blitter, tex, tex))
+ return false;
+
+ /* The driver doesn't decompress resources automatically while
+ * u_blitter is rendering. */
+ vi_disable_dcc_if_incompatible_format(sctx, tex, base_level, format);
+ si_decompress_subresource(ctx, tex, PIPE_MASK_RGBAZS, base_level, first_layer, last_layer);
+
+ /* Clear dirty_level_mask for the levels that will be overwritten. */
+ assert(base_level < last_level);
+ stex->dirty_level_mask &= ~u_bit_consecutive(base_level + 1, last_level - base_level);
+
+ sctx->generate_mipmap_for_depth = stex->is_depth;
+
+ si_blitter_begin(sctx, SI_BLIT | SI_DISABLE_RENDER_COND);
+ util_blitter_generate_mipmap(sctx->blitter, tex, format, base_level, last_level, first_layer,
+ last_layer);
+ si_blitter_end(sctx);
+
+ sctx->generate_mipmap_for_depth = false;
+ return true;
}
-static void si_flush_resource(struct pipe_context *ctx,
- struct pipe_resource *res)
+static void si_flush_resource(struct pipe_context *ctx, struct pipe_resource *res)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_texture *tex = (struct si_texture*)res;
-
- assert(res->target != PIPE_BUFFER);
- assert(!tex->dcc_separate_buffer || tex->dcc_gather_statistics);
-
- /* st/dri calls flush twice per frame (not a bug), this prevents double
- * decompression. */
- if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty)
- return;
-
- if (!tex->is_depth && (tex->cmask_buffer || tex->surface.dcc_offset)) {
- si_blit_decompress_color(sctx, tex, 0, res->last_level,
- 0, util_max_layer(res, 0),
- tex->dcc_separate_buffer != NULL, false);
-
- if (tex->surface.display_dcc_offset && tex->displayable_dcc_dirty) {
- si_retile_dcc(sctx, tex);
- tex->displayable_dcc_dirty = false;
- }
- }
-
- /* Always do the analysis even if DCC is disabled at the moment. */
- if (tex->dcc_gather_statistics) {
- bool separate_dcc_dirty = tex->separate_dcc_dirty;
-
- /* If the color buffer hasn't been unbound and fast clear hasn't
- * been used, separate_dcc_dirty is false, but there may have been
- * new rendering. Check if the color buffer is bound and assume
- * it's dirty.
- *
- * Note that DRI2 never unbinds window colorbuffers, which means
- * the DCC pipeline statistics query would never be re-set and would
- * keep adding new results until all free memory is exhausted if we
- * didn't do this.
- */
- if (!separate_dcc_dirty) {
- for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
- if (sctx->framebuffer.state.cbufs[i] &&
- sctx->framebuffer.state.cbufs[i]->texture == res) {
- separate_dcc_dirty = true;
- break;
- }
- }
- }
-
- if (separate_dcc_dirty) {
- tex->separate_dcc_dirty = false;
- vi_separate_dcc_process_and_reset_stats(ctx, tex);
- }
- }
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_texture *tex = (struct si_texture *)res;
+
+ assert(res->target != PIPE_BUFFER);
+ assert(!tex->dcc_separate_buffer || tex->dcc_gather_statistics);
+
+ /* st/dri calls flush twice per frame (not a bug); this check prevents double
+ * decompression. */
+ if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty)
+ return;
+
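+ /* Decompress CMASK/DCC and eliminate fast clears so the texture can be
+ * consumed outside of this context, e.g. by the window system or another
+ * process. */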
+ if (!tex->is_depth && (tex->cmask_buffer || tex->surface.dcc_offset)) {
+ si_blit_decompress_color(sctx, tex, 0, res->last_level, 0, util_max_layer(res, 0),
+ tex->dcc_separate_buffer != NULL, false);
+
+ if (tex->surface.display_dcc_offset && tex->displayable_dcc_dirty) {
+ si_retile_dcc(sctx, tex);
+ tex->displayable_dcc_dirty = false;
+ }
+ }
+
+ /* Always do the analysis even if DCC is disabled at the moment. */
+ if (tex->dcc_gather_statistics) {
+ bool separate_dcc_dirty = tex->separate_dcc_dirty;
+
+ /* If the color buffer hasn't been unbound and fast clear hasn't
+ * been used, separate_dcc_dirty is false, but there may have been
+ * new rendering. Check if the color buffer is bound and assume
+ * it's dirty.
+ *
+ * Note that DRI2 never unbinds window colorbuffers, which means
+ * the DCC pipeline statistics query would never be re-set and would
+ * keep adding new results until all free memory is exhausted if we
+ * didn't do this.
+ */
+ if (!separate_dcc_dirty) {
+ for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+ if (sctx->framebuffer.state.cbufs[i] &&
+ sctx->framebuffer.state.cbufs[i]->texture == res) {
+ separate_dcc_dirty = true;
+ break;
+ }
+ }
+ }
+
+ if (separate_dcc_dirty) {
+ tex->separate_dcc_dirty = false;
+ vi_separate_dcc_process_and_reset_stats(ctx, tex);
+ }
+ }
}
void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex)
{
- /* If graphics is disabled, we can't decompress DCC, but it shouldn't
- * be compressed either. The caller should simply discard it.
- */
- if (!tex->surface.dcc_offset || !sctx->has_graphics)
- return;
-
- si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level,
- 0, util_max_layer(&tex->buffer.b.b, 0),
- true, false);
+ /* If graphics is disabled, we can't decompress DCC, but it shouldn't
+ * be compressed either. The caller should simply discard it.
+ */
+ if (!tex->surface.dcc_offset || !sctx->has_graphics)
+ return;
+
+ si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level, 0,
+ util_max_layer(&tex->buffer.b.b, 0), true, false);
}
void si_init_blit_functions(struct si_context *sctx)
{
- sctx->b.resource_copy_region = si_resource_copy_region;
+ sctx->b.resource_copy_region = si_resource_copy_region;
- if (sctx->has_graphics) {
- sctx->b.blit = si_blit;
- sctx->b.flush_resource = si_flush_resource;
- sctx->b.generate_mipmap = si_generate_mipmap;
- }
+ if (sctx->has_graphics) {
+ sctx->b.blit = si_blit;
+ sctx->b.flush_resource = si_flush_resource;
+ sctx->b.generate_mipmap = si_generate_mipmap;
+ }
}
#include "radeonsi/si_pipe.h"
#include "util/u_memory.h"
-#include "util/u_upload_mgr.h"
#include "util/u_transfer.h"
+#include "util/u_upload_mgr.h"
+
#include <inttypes.h>
#include <stdio.h>
-bool si_rings_is_buffer_referenced(struct si_context *sctx,
- struct pb_buffer *buf,
- enum radeon_bo_usage usage)
+bool si_rings_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf,
+ enum radeon_bo_usage usage)
{
- if (sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, buf, usage)) {
- return true;
- }
- if (radeon_emitted(sctx->sdma_cs, 0) &&
- sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, buf, usage)) {
- return true;
- }
- return false;
+ if (sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, buf, usage)) {
+ return true;
+ }
+ if (radeon_emitted(sctx->sdma_cs, 0) &&
+ sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, buf, usage)) {
+ return true;
+ }
+ return false;
}
-void *si_buffer_map_sync_with_rings(struct si_context *sctx,
- struct si_resource *resource,
- unsigned usage)
+void *si_buffer_map_sync_with_rings(struct si_context *sctx, struct si_resource *resource,
+ unsigned usage)
{
- enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE;
- bool busy = false;
-
- assert(!(resource->flags & RADEON_FLAG_SPARSE));
-
- if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
- return sctx->ws->buffer_map(resource->buf, NULL, usage);
- }
-
- if (!(usage & PIPE_TRANSFER_WRITE)) {
- /* have to wait for the last write */
- rusage = RADEON_USAGE_WRITE;
- }
-
- if (radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) &&
- sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs,
- resource->buf, rusage)) {
- if (usage & PIPE_TRANSFER_DONTBLOCK) {
- si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
- return NULL;
- } else {
- si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
- busy = true;
- }
- }
- if (radeon_emitted(sctx->sdma_cs, 0) &&
- sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs,
- resource->buf, rusage)) {
- if (usage & PIPE_TRANSFER_DONTBLOCK) {
- si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
- return NULL;
- } else {
- si_flush_dma_cs(sctx, 0, NULL);
- busy = true;
- }
- }
-
- if (busy || !sctx->ws->buffer_wait(resource->buf, 0, rusage)) {
- if (usage & PIPE_TRANSFER_DONTBLOCK) {
- return NULL;
- } else {
- /* We will be wait for the GPU. Wait for any offloaded
- * CS flush to complete to avoid busy-waiting in the winsys. */
- sctx->ws->cs_sync_flush(sctx->gfx_cs);
- if (sctx->sdma_cs)
- sctx->ws->cs_sync_flush(sctx->sdma_cs);
- }
- }
-
- /* Setting the CS to NULL will prevent doing checks we have done already. */
- return sctx->ws->buffer_map(resource->buf, NULL, usage);
+ enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE;
+ bool busy = false;
+
+ assert(!(resource->flags & RADEON_FLAG_SPARSE));
+
+ if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
+ return sctx->ws->buffer_map(resource->buf, NULL, usage);
+ }
+
+ if (!(usage & PIPE_TRANSFER_WRITE)) {
+ /* have to wait for the last write */
+ rusage = RADEON_USAGE_WRITE;
+ }
+
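+ /* If a command stream references the buffer with a conflicting usage,
+ * submit it first; with DONTBLOCK, kick off the flush asynchronously and
+ * fail the map instead of waiting below. */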
+ if (radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) &&
+ sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, resource->buf, rusage)) {
+ if (usage & PIPE_TRANSFER_DONTBLOCK) {
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ return NULL;
+ } else {
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ busy = true;
+ }
+ }
+ if (radeon_emitted(sctx->sdma_cs, 0) &&
+ sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, resource->buf, rusage)) {
+ if (usage & PIPE_TRANSFER_DONTBLOCK) {
+ si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+ return NULL;
+ } else {
+ si_flush_dma_cs(sctx, 0, NULL);
+ busy = true;
+ }
+ }
+
+ if (busy || !sctx->ws->buffer_wait(resource->buf, 0, rusage)) {
+ if (usage & PIPE_TRANSFER_DONTBLOCK) {
+ return NULL;
+ } else {
+ /* We will be waiting for the GPU. Wait for any offloaded
+ * CS flush to complete to avoid busy-waiting in the winsys. */
+ sctx->ws->cs_sync_flush(sctx->gfx_cs);
+ if (sctx->sdma_cs)
+ sctx->ws->cs_sync_flush(sctx->sdma_cs);
+ }
+ }
+
+ /* Setting the CS to NULL will prevent doing checks we have done already. */
+ return sctx->ws->buffer_map(resource->buf, NULL, usage);
}
-void si_init_resource_fields(struct si_screen *sscreen,
- struct si_resource *res,
- uint64_t size, unsigned alignment)
+void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res, uint64_t size,
+ unsigned alignment)
{
- struct si_texture *tex = (struct si_texture*)res;
-
- res->bo_size = size;
- res->bo_alignment = alignment;
- res->flags = 0;
- res->texture_handle_allocated = false;
- res->image_handle_allocated = false;
-
- switch (res->b.b.usage) {
- case PIPE_USAGE_STREAM:
- res->flags = RADEON_FLAG_GTT_WC;
- /* fall through */
- case PIPE_USAGE_STAGING:
- /* Transfers are likely to occur more often with these
- * resources. */
- res->domains = RADEON_DOMAIN_GTT;
- break;
- case PIPE_USAGE_DYNAMIC:
- /* Older kernels didn't always flush the HDP cache before
- * CS execution
- */
- if (!sscreen->info.kernel_flushes_hdp_before_ib) {
- res->domains = RADEON_DOMAIN_GTT;
- res->flags |= RADEON_FLAG_GTT_WC;
- break;
- }
- /* fall through */
- case PIPE_USAGE_DEFAULT:
- case PIPE_USAGE_IMMUTABLE:
- default:
- /* Not listing GTT here improves performance in some
- * apps. */
- res->domains = RADEON_DOMAIN_VRAM;
- res->flags |= RADEON_FLAG_GTT_WC;
- break;
- }
-
- if (res->b.b.target == PIPE_BUFFER &&
- res->b.b.flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) {
- /* Use GTT for all persistent mappings with older
- * kernels, because they didn't always flush the HDP
- * cache before CS execution.
- *
- * Write-combined CPU mappings are fine, the kernel
- * ensures all CPU writes finish before the GPU
- * executes a command stream.
- *
- * radeon doesn't have good BO move throttling, so put all
- * persistent buffers into GTT to prevent VRAM CPU page faults.
- */
- if (!sscreen->info.kernel_flushes_hdp_before_ib ||
- !sscreen->info.is_amdgpu)
- res->domains = RADEON_DOMAIN_GTT;
- }
-
- /* Tiled textures are unmappable. Always put them in VRAM. */
- if ((res->b.b.target != PIPE_BUFFER && !tex->surface.is_linear) ||
- res->b.b.flags & SI_RESOURCE_FLAG_UNMAPPABLE) {
- res->domains = RADEON_DOMAIN_VRAM;
- res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
- RADEON_FLAG_GTT_WC;
- }
-
- /* Displayable and shareable surfaces are not suballocated. */
- if (res->b.b.bind & (PIPE_BIND_SHARED | PIPE_BIND_SCANOUT))
- res->flags |= RADEON_FLAG_NO_SUBALLOC; /* shareable */
- else
- res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
-
- if (sscreen->debug_flags & DBG(NO_WC))
- res->flags &= ~RADEON_FLAG_GTT_WC;
-
- if (res->b.b.flags & SI_RESOURCE_FLAG_READ_ONLY)
- res->flags |= RADEON_FLAG_READ_ONLY;
-
- if (res->b.b.flags & SI_RESOURCE_FLAG_32BIT)
- res->flags |= RADEON_FLAG_32BIT;
-
- /* Set expected VRAM and GART usage for the buffer. */
- res->vram_usage = 0;
- res->gart_usage = 0;
- res->max_forced_staging_uploads = 0;
- res->b.max_forced_staging_uploads = 0;
-
- if (res->domains & RADEON_DOMAIN_VRAM) {
- res->vram_usage = size;
-
- res->max_forced_staging_uploads =
- res->b.max_forced_staging_uploads =
- sscreen->info.has_dedicated_vram &&
- size >= sscreen->info.vram_vis_size / 4 ? 1 : 0;
- } else if (res->domains & RADEON_DOMAIN_GTT) {
- res->gart_usage = size;
- }
+ struct si_texture *tex = (struct si_texture *)res;
+
+ res->bo_size = size;
+ res->bo_alignment = alignment;
+ res->flags = 0;
+ res->texture_handle_allocated = false;
+ res->image_handle_allocated = false;
+
+ switch (res->b.b.usage) {
+ case PIPE_USAGE_STREAM:
+ res->flags = RADEON_FLAG_GTT_WC;
+ /* fall through */
+ case PIPE_USAGE_STAGING:
+ /* Transfers are likely to occur more often with these
+ * resources. */
+ res->domains = RADEON_DOMAIN_GTT;
+ break;
+ case PIPE_USAGE_DYNAMIC:
+ /* Older kernels didn't always flush the HDP cache before
+ * CS execution
+ */
+ if (!sscreen->info.kernel_flushes_hdp_before_ib) {
+ res->domains = RADEON_DOMAIN_GTT;
+ res->flags |= RADEON_FLAG_GTT_WC;
+ break;
+ }
+ /* fall through */
+ case PIPE_USAGE_DEFAULT:
+ case PIPE_USAGE_IMMUTABLE:
+ default:
+ /* Not listing GTT here improves performance in some
+ * apps. */
+ res->domains = RADEON_DOMAIN_VRAM;
+ res->flags |= RADEON_FLAG_GTT_WC;
+ break;
+ }
+
+ if (res->b.b.target == PIPE_BUFFER && res->b.b.flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) {
+ /* Use GTT for all persistent mappings with older
+ * kernels, because they didn't always flush the HDP
+ * cache before CS execution.
+ *
+ * Write-combined CPU mappings are fine, the kernel
+ * ensures all CPU writes finish before the GPU
+ * executes a command stream.
+ *
+ * radeon doesn't have good BO move throttling, so put all
+ * persistent buffers into GTT to prevent VRAM CPU page faults.
+ */
+ if (!sscreen->info.kernel_flushes_hdp_before_ib || !sscreen->info.is_amdgpu)
+ res->domains = RADEON_DOMAIN_GTT;
+ }
+
+ /* Tiled textures are unmappable. Always put them in VRAM. */
+ if ((res->b.b.target != PIPE_BUFFER && !tex->surface.is_linear) ||
+ res->b.b.flags & SI_RESOURCE_FLAG_UNMAPPABLE) {
+ res->domains = RADEON_DOMAIN_VRAM;
+ res->flags |= RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_GTT_WC;
+ }
+
+ /* Displayable and shareable surfaces are not suballocated. */
+ if (res->b.b.bind & (PIPE_BIND_SHARED | PIPE_BIND_SCANOUT))
+ res->flags |= RADEON_FLAG_NO_SUBALLOC; /* shareable */
+ else
+ res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
+
+ if (sscreen->debug_flags & DBG(NO_WC))
+ res->flags &= ~RADEON_FLAG_GTT_WC;
+
+ if (res->b.b.flags & SI_RESOURCE_FLAG_READ_ONLY)
+ res->flags |= RADEON_FLAG_READ_ONLY;
+
+ if (res->b.b.flags & SI_RESOURCE_FLAG_32BIT)
+ res->flags |= RADEON_FLAG_32BIT;
+
+ /* Set expected VRAM and GART usage for the buffer. */
+ res->vram_usage = 0;
+ res->gart_usage = 0;
+ res->max_forced_staging_uploads = 0;
+ res->b.max_forced_staging_uploads = 0;
+
+ if (res->domains & RADEON_DOMAIN_VRAM) {
+ res->vram_usage = size;
+
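+ /* On dedicated-VRAM GPUs, buffers taking at least 1/4 of the CPU-visible
+ * VRAM get one forced staging upload, so that mapping them doesn't pull
+ * them out of VRAM (see si_buffer_transfer_map). */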
+ res->max_forced_staging_uploads = res->b.max_forced_staging_uploads =
+ sscreen->info.has_dedicated_vram && size >= sscreen->info.vram_vis_size / 4 ? 1 : 0;
+ } else if (res->domains & RADEON_DOMAIN_GTT) {
+ res->gart_usage = size;
+ }
}
-bool si_alloc_resource(struct si_screen *sscreen,
- struct si_resource *res)
+bool si_alloc_resource(struct si_screen *sscreen, struct si_resource *res)
{
- struct pb_buffer *old_buf, *new_buf;
-
- /* Allocate a new resource. */
- new_buf = sscreen->ws->buffer_create(sscreen->ws, res->bo_size,
- res->bo_alignment,
- res->domains, res->flags);
- if (!new_buf) {
- return false;
- }
-
- /* Replace the pointer such that if res->buf wasn't NULL, it won't be
- * NULL. This should prevent crashes with multiple contexts using
- * the same buffer where one of the contexts invalidates it while
- * the others are using it. */
- old_buf = res->buf;
- res->buf = new_buf; /* should be atomic */
- res->gpu_address = sscreen->ws->buffer_get_virtual_address(res->buf);
-
- if (res->flags & RADEON_FLAG_32BIT) {
- uint64_t start = res->gpu_address;
- uint64_t last = start + res->bo_size - 1;
- (void)start;
- (void)last;
-
- assert((start >> 32) == sscreen->info.address32_hi);
- assert((last >> 32) == sscreen->info.address32_hi);
- }
-
- pb_reference(&old_buf, NULL);
-
- util_range_set_empty(&res->valid_buffer_range);
- res->TC_L2_dirty = false;
-
- /* Print debug information. */
- if (sscreen->debug_flags & DBG(VM) && res->b.b.target == PIPE_BUFFER) {
- fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n",
- res->gpu_address, res->gpu_address + res->buf->size,
- res->buf->size);
- }
-
- if (res->b.b.flags & SI_RESOURCE_FLAG_CLEAR)
- si_screen_clear_buffer(sscreen, &res->b.b, 0, res->bo_size, 0);
-
- return true;
+ struct pb_buffer *old_buf, *new_buf;
+
+ /* Allocate a new resource. */
+ new_buf = sscreen->ws->buffer_create(sscreen->ws, res->bo_size, res->bo_alignment, res->domains,
+ res->flags);
+ if (!new_buf) {
+ return false;
+ }
+
+ /* Replace the pointer such that if res->buf wasn't NULL, it won't be
+ * NULL. This should prevent crashes with multiple contexts using
+ * the same buffer where one of the contexts invalidates it while
+ * the others are using it. */
+ old_buf = res->buf;
+ res->buf = new_buf; /* should be atomic */
+ res->gpu_address = sscreen->ws->buffer_get_virtual_address(res->buf);
+
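+ /* 32-bit allocations must stay inside the 4 GB window whose upper address
+ * bits are address32_hi. */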
+ if (res->flags & RADEON_FLAG_32BIT) {
+ uint64_t start = res->gpu_address;
+ uint64_t last = start + res->bo_size - 1;
+ (void)start;
+ (void)last;
+
+ assert((start >> 32) == sscreen->info.address32_hi);
+ assert((last >> 32) == sscreen->info.address32_hi);
+ }
+
+ pb_reference(&old_buf, NULL);
+
+ util_range_set_empty(&res->valid_buffer_range);
+ res->TC_L2_dirty = false;
+
+ /* Print debug information. */
+ if (sscreen->debug_flags & DBG(VM) && res->b.b.target == PIPE_BUFFER) {
+ fprintf(stderr, "VM start=0x%" PRIX64 " end=0x%" PRIX64 " | Buffer %" PRIu64 " bytes\n",
+ res->gpu_address, res->gpu_address + res->buf->size, res->buf->size);
+ }
+
+ if (res->b.b.flags & SI_RESOURCE_FLAG_CLEAR)
+ si_screen_clear_buffer(sscreen, &res->b.b, 0, res->bo_size, 0);
+
+ return true;
}
-static void si_buffer_destroy(struct pipe_screen *screen,
- struct pipe_resource *buf)
+static void si_buffer_destroy(struct pipe_screen *screen, struct pipe_resource *buf)
{
- struct si_resource *buffer = si_resource(buf);
+ struct si_resource *buffer = si_resource(buf);
- threaded_resource_deinit(buf);
- util_range_destroy(&buffer->valid_buffer_range);
- pb_reference(&buffer->buf, NULL);
- FREE(buffer);
+ threaded_resource_deinit(buf);
+ util_range_destroy(&buffer->valid_buffer_range);
+ pb_reference(&buffer->buf, NULL);
+ FREE(buffer);
}
/* Reallocate the buffer and update all resource bindings where the buffer is
 * bound. This is used to avoid CPU-GPU synchronizations, because it makes the
 * buffer idle by discarding its contents.
*/
-static bool
-si_invalidate_buffer(struct si_context *sctx,
- struct si_resource *buf)
+static bool si_invalidate_buffer(struct si_context *sctx, struct si_resource *buf)
{
- /* Shared buffers can't be reallocated. */
- if (buf->b.is_shared)
- return false;
-
- /* Sparse buffers can't be reallocated. */
- if (buf->flags & RADEON_FLAG_SPARSE)
- return false;
-
- /* In AMD_pinned_memory, the user pointer association only gets
- * broken when the buffer is explicitly re-allocated.
- */
- if (buf->b.is_user_ptr)
- return false;
-
- /* Check if mapping this buffer would cause waiting for the GPU. */
- if (si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
- !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
- /* Reallocate the buffer in the same pipe_resource. */
- si_alloc_resource(sctx->screen, buf);
- si_rebind_buffer(sctx, &buf->b.b);
- } else {
- util_range_set_empty(&buf->valid_buffer_range);
- }
-
- return true;
+ /* Shared buffers can't be reallocated. */
+ if (buf->b.is_shared)
+ return false;
+
+ /* Sparse buffers can't be reallocated. */
+ if (buf->flags & RADEON_FLAG_SPARSE)
+ return false;
+
+ /* In AMD_pinned_memory, the user pointer association only gets
+ * broken when the buffer is explicitly re-allocated.
+ */
+ if (buf->b.is_user_ptr)
+ return false;
+
+ /* Check if mapping this buffer would cause waiting for the GPU. */
+ if (si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
+ !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
+ /* Reallocate the buffer in the same pipe_resource. */
+ si_alloc_resource(sctx->screen, buf);
+ si_rebind_buffer(sctx, &buf->b.b);
+ } else {
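+ /* The buffer is idle, so skip the reallocation and just forget which
+ * range was valid. */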
+ util_range_set_empty(&buf->valid_buffer_range);
+ }
+
+ return true;
}
/* Replace the storage of dst with src. */
-void si_replace_buffer_storage(struct pipe_context *ctx,
- struct pipe_resource *dst,
- struct pipe_resource *src)
+void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst,
+ struct pipe_resource *src)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_resource *sdst = si_resource(dst);
- struct si_resource *ssrc = si_resource(src);
-
- pb_reference(&sdst->buf, ssrc->buf);
- sdst->gpu_address = ssrc->gpu_address;
- sdst->b.b.bind = ssrc->b.b.bind;
- sdst->b.max_forced_staging_uploads = ssrc->b.max_forced_staging_uploads;
- sdst->max_forced_staging_uploads = ssrc->max_forced_staging_uploads;
- sdst->flags = ssrc->flags;
-
- assert(sdst->vram_usage == ssrc->vram_usage);
- assert(sdst->gart_usage == ssrc->gart_usage);
- assert(sdst->bo_size == ssrc->bo_size);
- assert(sdst->bo_alignment == ssrc->bo_alignment);
- assert(sdst->domains == ssrc->domains);
-
- si_rebind_buffer(sctx, dst);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_resource *sdst = si_resource(dst);
+ struct si_resource *ssrc = si_resource(src);
+
+ pb_reference(&sdst->buf, ssrc->buf);
+ sdst->gpu_address = ssrc->gpu_address;
+ sdst->b.b.bind = ssrc->b.b.bind;
+ sdst->b.max_forced_staging_uploads = ssrc->b.max_forced_staging_uploads;
+ sdst->max_forced_staging_uploads = ssrc->max_forced_staging_uploads;
+ sdst->flags = ssrc->flags;
+
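+ /* Only the backing storage changes; the allocation parameters must match. */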
+ assert(sdst->vram_usage == ssrc->vram_usage);
+ assert(sdst->gart_usage == ssrc->gart_usage);
+ assert(sdst->bo_size == ssrc->bo_size);
+ assert(sdst->bo_alignment == ssrc->bo_alignment);
+ assert(sdst->domains == ssrc->domains);
+
+ si_rebind_buffer(sctx, dst);
}
-static void si_invalidate_resource(struct pipe_context *ctx,
- struct pipe_resource *resource)
+static void si_invalidate_resource(struct pipe_context *ctx, struct pipe_resource *resource)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_resource *buf = si_resource(resource);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_resource *buf = si_resource(resource);
- /* We currently only do anyting here for buffers */
- if (resource->target == PIPE_BUFFER)
- (void)si_invalidate_buffer(sctx, buf);
+ /* We currently only do anything here for buffers */
+ if (resource->target == PIPE_BUFFER)
+ (void)si_invalidate_buffer(sctx, buf);
}
-static void *si_buffer_get_transfer(struct pipe_context *ctx,
- struct pipe_resource *resource,
- unsigned usage,
- const struct pipe_box *box,
- struct pipe_transfer **ptransfer,
- void *data, struct si_resource *staging,
- unsigned offset)
+static void *si_buffer_get_transfer(struct pipe_context *ctx, struct pipe_resource *resource,
+ unsigned usage, const struct pipe_box *box,
+ struct pipe_transfer **ptransfer, void *data,
+ struct si_resource *staging, unsigned offset)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_transfer *transfer;
-
- if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
- transfer = slab_alloc(&sctx->pool_transfers_unsync);
- else
- transfer = slab_alloc(&sctx->pool_transfers);
-
- transfer->b.b.resource = NULL;
- pipe_resource_reference(&transfer->b.b.resource, resource);
- transfer->b.b.level = 0;
- transfer->b.b.usage = usage;
- transfer->b.b.box = *box;
- transfer->b.b.stride = 0;
- transfer->b.b.layer_stride = 0;
- transfer->b.staging = NULL;
- transfer->offset = offset;
- transfer->staging = staging;
- *ptransfer = &transfer->b.b;
- return data;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_transfer *transfer;
+
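+ /* Transfers created from the application thread (u_threaded_context's
+ * unsynchronized path) use a separate slab pool from those created in the
+ * driver thread. */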
+ if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
+ transfer = slab_alloc(&sctx->pool_transfers_unsync);
+ else
+ transfer = slab_alloc(&sctx->pool_transfers);
+
+ transfer->b.b.resource = NULL;
+ pipe_resource_reference(&transfer->b.b.resource, resource);
+ transfer->b.b.level = 0;
+ transfer->b.b.usage = usage;
+ transfer->b.b.box = *box;
+ transfer->b.b.stride = 0;
+ transfer->b.b.layer_stride = 0;
+ transfer->b.staging = NULL;
+ transfer->offset = offset;
+ transfer->staging = staging;
+ *ptransfer = &transfer->b.b;
+ return data;
}
-static void *si_buffer_transfer_map(struct pipe_context *ctx,
- struct pipe_resource *resource,
- unsigned level,
- unsigned usage,
- const struct pipe_box *box,
- struct pipe_transfer **ptransfer)
+static void *si_buffer_transfer_map(struct pipe_context *ctx, struct pipe_resource *resource,
+ unsigned level, unsigned usage, const struct pipe_box *box,
+ struct pipe_transfer **ptransfer)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_resource *buf = si_resource(resource);
- uint8_t *data;
-
- assert(box->x + box->width <= resource->width0);
-
- /* From GL_AMD_pinned_memory issues:
- *
- * 4) Is glMapBuffer on a shared buffer guaranteed to return the
- * same system address which was specified at creation time?
- *
- * RESOLVED: NO. The GL implementation might return a different
- * virtual mapping of that memory, although the same physical
- * page will be used.
- *
- * So don't ever use staging buffers.
- */
- if (buf->b.is_user_ptr)
- usage |= PIPE_TRANSFER_PERSISTENT;
-
- /* See if the buffer range being mapped has never been initialized,
- * in which case it can be mapped unsynchronized. */
- if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
- TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) &&
- usage & PIPE_TRANSFER_WRITE &&
- !buf->b.is_shared &&
- !util_ranges_intersect(&buf->valid_buffer_range, box->x, box->x + box->width)) {
- usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
- }
-
- /* If discarding the entire range, discard the whole resource instead. */
- if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
- box->x == 0 && box->width == resource->width0) {
- usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
- }
-
- /* If a buffer in VRAM is too large and the range is discarded, don't
- * map it directly. This makes sure that the buffer stays in VRAM.
- */
- bool force_discard_range = false;
- if (usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
- PIPE_TRANSFER_DISCARD_RANGE) &&
- !(usage & PIPE_TRANSFER_PERSISTENT) &&
- /* Try not to decrement the counter if it's not positive. Still racy,
- * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
- buf->max_forced_staging_uploads > 0 &&
- p_atomic_dec_return(&buf->max_forced_staging_uploads) >= 0) {
- usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
- PIPE_TRANSFER_UNSYNCHRONIZED);
- usage |= PIPE_TRANSFER_DISCARD_RANGE;
- force_discard_range = true;
- }
-
- if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
- !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
- TC_TRANSFER_MAP_NO_INVALIDATE))) {
- assert(usage & PIPE_TRANSFER_WRITE);
-
- if (si_invalidate_buffer(sctx, buf)) {
- /* At this point, the buffer is always idle. */
- usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
- } else {
- /* Fall back to a temporary buffer. */
- usage |= PIPE_TRANSFER_DISCARD_RANGE;
- }
- }
-
- if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT &&
- buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
- usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED |
- PIPE_TRANSFER_PERSISTENT);
- usage |= PIPE_TRANSFER_DISCARD_RANGE;
- force_discard_range = true;
- }
-
- if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
- ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
- PIPE_TRANSFER_PERSISTENT))) ||
- (buf->flags & RADEON_FLAG_SPARSE))) {
- assert(usage & PIPE_TRANSFER_WRITE);
-
- /* Check if mapping this buffer would cause waiting for the GPU.
- */
- if (buf->flags & RADEON_FLAG_SPARSE ||
- force_discard_range ||
- si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
- !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
- /* Do a wait-free write-only transfer using a temporary buffer. */
- struct u_upload_mgr *uploader;
- struct si_resource *staging = NULL;
- unsigned offset;
-
- /* If we are not called from the driver thread, we have
- * to use the uploader from u_threaded_context, which is
- * local to the calling thread.
- */
- if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
- uploader = sctx->tc->base.stream_uploader;
- else
- uploader = sctx->b.stream_uploader;
-
- u_upload_alloc(uploader, 0,
- box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT),
- sctx->screen->info.tcc_cache_line_size,
- &offset, (struct pipe_resource**)&staging,
- (void**)&data);
-
- if (staging) {
- data += box->x % SI_MAP_BUFFER_ALIGNMENT;
- return si_buffer_get_transfer(ctx, resource, usage, box,
- ptransfer, data, staging, offset);
- } else if (buf->flags & RADEON_FLAG_SPARSE) {
- return NULL;
- }
- } else {
- /* At this point, the buffer is always idle (we checked it above). */
- usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
- }
- }
- /* Use a staging buffer in cached GTT for reads. */
- else if (((usage & PIPE_TRANSFER_READ) &&
- !(usage & PIPE_TRANSFER_PERSISTENT) &&
- (buf->domains & RADEON_DOMAIN_VRAM ||
- buf->flags & RADEON_FLAG_GTT_WC)) ||
- (buf->flags & RADEON_FLAG_SPARSE)) {
- struct si_resource *staging;
-
- assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC));
- staging = si_resource(pipe_buffer_create(
- ctx->screen, 0, PIPE_USAGE_STAGING,
- box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT)));
- if (staging) {
- /* Copy the VRAM buffer to the staging buffer. */
- si_sdma_copy_buffer(sctx, &staging->b.b, resource,
- box->x % SI_MAP_BUFFER_ALIGNMENT,
- box->x, box->width);
-
- data = si_buffer_map_sync_with_rings(sctx, staging,
- usage & ~PIPE_TRANSFER_UNSYNCHRONIZED);
- if (!data) {
- si_resource_reference(&staging, NULL);
- return NULL;
- }
- data += box->x % SI_MAP_BUFFER_ALIGNMENT;
-
- return si_buffer_get_transfer(ctx, resource, usage, box,
- ptransfer, data, staging, 0);
- } else if (buf->flags & RADEON_FLAG_SPARSE) {
- return NULL;
- }
- }
-
- data = si_buffer_map_sync_with_rings(sctx, buf, usage);
- if (!data) {
- return NULL;
- }
- data += box->x;
-
- return si_buffer_get_transfer(ctx, resource, usage, box,
- ptransfer, data, NULL, 0);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_resource *buf = si_resource(resource);
+ uint8_t *data;
+
+ assert(box->x + box->width <= resource->width0);
+
+ /* From GL_AMD_pinned_memory issues:
+ *
+ * 4) Is glMapBuffer on a shared buffer guaranteed to return the
+ * same system address which was specified at creation time?
+ *
+ * RESOLVED: NO. The GL implementation might return a different
+ * virtual mapping of that memory, although the same physical
+ * page will be used.
+ *
+ * So don't ever use staging buffers.
+ */
+ if (buf->b.is_user_ptr)
+ usage |= PIPE_TRANSFER_PERSISTENT;
+
+ /* See if the buffer range being mapped has never been initialized,
+ * in which case it can be mapped unsynchronized. */
+ if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) &&
+ usage & PIPE_TRANSFER_WRITE && !buf->b.is_shared &&
+ !util_ranges_intersect(&buf->valid_buffer_range, box->x, box->x + box->width)) {
+ usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+ }
+
+ /* If discarding the entire range, discard the whole resource instead. */
+ if (usage & PIPE_TRANSFER_DISCARD_RANGE && box->x == 0 && box->width == resource->width0) {
+ usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
+ }
+
+ /* If a buffer in VRAM is too large and the range is discarded, don't
+ * map it directly. This makes sure that the buffer stays in VRAM.
+ */
+ bool force_discard_range = false;
+ if (usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE | PIPE_TRANSFER_DISCARD_RANGE) &&
+ !(usage & PIPE_TRANSFER_PERSISTENT) &&
+ /* Try not to decrement the counter if it's not positive. Still racy,
+ * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
+ buf->max_forced_staging_uploads > 0 &&
+ p_atomic_dec_return(&buf->max_forced_staging_uploads) >= 0) {
+ usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE | PIPE_TRANSFER_UNSYNCHRONIZED);
+ usage |= PIPE_TRANSFER_DISCARD_RANGE;
+ force_discard_range = true;
+ }
+
+ if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
+ !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | TC_TRANSFER_MAP_NO_INVALIDATE))) {
+ assert(usage & PIPE_TRANSFER_WRITE);
+
+ if (si_invalidate_buffer(sctx, buf)) {
+ /* At this point, the buffer is always idle. */
+ usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+ } else {
+ /* Fall back to a temporary buffer. */
+ usage |= PIPE_TRANSFER_DISCARD_RANGE;
+ }
+ }
+
+ if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT &&
+ buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+ usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_PERSISTENT);
+ usage |= PIPE_TRANSFER_DISCARD_RANGE;
+ force_discard_range = true;
+ }
+
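+ /* For DISCARD_RANGE on a buffer that is busy, sparse or forced to stage,
+ * write into a temporary upload buffer instead of stalling; if the buffer
+ * turns out to be idle, map it directly and unsynchronized. */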
+ if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
+ ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_PERSISTENT))) ||
+ (buf->flags & RADEON_FLAG_SPARSE))) {
+ assert(usage & PIPE_TRANSFER_WRITE);
+
+ /* Check if mapping this buffer would cause waiting for the GPU.
+ */
+ if (buf->flags & RADEON_FLAG_SPARSE || force_discard_range ||
+ si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
+ !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
+ /* Do a wait-free write-only transfer using a temporary buffer. */
+ struct u_upload_mgr *uploader;
+ struct si_resource *staging = NULL;
+ unsigned offset;
+
+ /* If we are not called from the driver thread, we have
+ * to use the uploader from u_threaded_context, which is
+ * local to the calling thread.
+ */
+ if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
+ uploader = sctx->tc->base.stream_uploader;
+ else
+ uploader = sctx->b.stream_uploader;
+
+ u_upload_alloc(uploader, 0, box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT),
+ sctx->screen->info.tcc_cache_line_size, &offset,
+ (struct pipe_resource **)&staging, (void **)&data);
+
+ if (staging) {
+ data += box->x % SI_MAP_BUFFER_ALIGNMENT;
+ return si_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, staging,
+ offset);
+ } else if (buf->flags & RADEON_FLAG_SPARSE) {
+ return NULL;
+ }
+ } else {
+ /* At this point, the buffer is always idle (we checked it above). */
+ usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+ }
+ }
+ /* Use a staging buffer in cached GTT for reads. */
+ else if (((usage & PIPE_TRANSFER_READ) && !(usage & PIPE_TRANSFER_PERSISTENT) &&
+ (buf->domains & RADEON_DOMAIN_VRAM || buf->flags & RADEON_FLAG_GTT_WC)) ||
+ (buf->flags & RADEON_FLAG_SPARSE)) {
+ struct si_resource *staging;
+
+ assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC));
+ staging = si_resource(pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_STAGING,
+ box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT)));
+ if (staging) {
+ /* Copy the VRAM buffer to the staging buffer. */
+ si_sdma_copy_buffer(sctx, &staging->b.b, resource, box->x % SI_MAP_BUFFER_ALIGNMENT,
+ box->x, box->width);
+
+ data = si_buffer_map_sync_with_rings(sctx, staging, usage & ~PIPE_TRANSFER_UNSYNCHRONIZED);
+ if (!data) {
+ si_resource_reference(&staging, NULL);
+ return NULL;
+ }
+ data += box->x % SI_MAP_BUFFER_ALIGNMENT;
+
+ return si_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, staging, 0);
+ } else if (buf->flags & RADEON_FLAG_SPARSE) {
+ return NULL;
+ }
+ }
+
+ data = si_buffer_map_sync_with_rings(sctx, buf, usage);
+ if (!data) {
+ return NULL;
+ }
+ data += box->x;
+
+ return si_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, NULL, 0);
}
-static void si_buffer_do_flush_region(struct pipe_context *ctx,
- struct pipe_transfer *transfer,
- const struct pipe_box *box)
+static void si_buffer_do_flush_region(struct pipe_context *ctx, struct pipe_transfer *transfer,
+ const struct pipe_box *box)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_transfer *stransfer = (struct si_transfer*)transfer;
- struct si_resource *buf = si_resource(transfer->resource);
-
- if (stransfer->staging) {
- unsigned src_offset = stransfer->offset +
- transfer->box.x % SI_MAP_BUFFER_ALIGNMENT +
- (box->x - transfer->box.x);
-
- if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
- /* This should be true for all uploaders. */
- assert(transfer->box.x == 0);
-
- /* Find a previous upload and extend its range. The last
- * upload is likely to be at the end of the list.
- */
- for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) {
- struct si_sdma_upload *up = &sctx->sdma_uploads[i];
-
- if (up->dst != buf)
- continue;
-
- assert(up->src == stransfer->staging);
- assert(box->x > up->dst_offset);
- up->size = box->x + box->width - up->dst_offset;
- return;
- }
-
- /* Enlarge the array if it's full. */
- if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) {
- unsigned size;
-
- sctx->max_sdma_uploads += 4;
- size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]);
- sctx->sdma_uploads = realloc(sctx->sdma_uploads, size);
- }
-
- /* Add a new upload. */
- struct si_sdma_upload *up =
- &sctx->sdma_uploads[sctx->num_sdma_uploads++];
- up->dst = up->src = NULL;
- si_resource_reference(&up->dst, buf);
- si_resource_reference(&up->src, stransfer->staging);
- up->dst_offset = box->x;
- up->src_offset = src_offset;
- up->size = box->width;
- return;
- }
-
- /* Copy the staging buffer into the original one. */
- si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b,
- box->x, src_offset, box->width);
- }
-
- util_range_add(&buf->b.b, &buf->valid_buffer_range, box->x,
- box->x + box->width);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_transfer *stransfer = (struct si_transfer *)transfer;
+ struct si_resource *buf = si_resource(transfer->resource);
+
+ if (stransfer->staging) {
+ unsigned src_offset =
+ stransfer->offset + transfer->box.x % SI_MAP_BUFFER_ALIGNMENT + (box->x - transfer->box.x);
+
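+ /* For SDMA upload buffers, don't copy now: record (or extend) the range in
+ * sctx->sdma_uploads so the copy can be done later over SDMA. */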
+ if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+ /* This should be true for all uploaders. */
+ assert(transfer->box.x == 0);
+
+ /* Find a previous upload and extend its range. The last
+ * upload is likely to be at the end of the list.
+ */
+ for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) {
+ struct si_sdma_upload *up = &sctx->sdma_uploads[i];
+
+ if (up->dst != buf)
+ continue;
+
+ assert(up->src == stransfer->staging);
+ assert(box->x > up->dst_offset);
+ up->size = box->x + box->width - up->dst_offset;
+ return;
+ }
+
+ /* Enlarge the array if it's full. */
+ if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) {
+ unsigned size;
+
+ sctx->max_sdma_uploads += 4;
+ size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]);
+ sctx->sdma_uploads = realloc(sctx->sdma_uploads, size);
+ }
+
+ /* Add a new upload. */
+ struct si_sdma_upload *up = &sctx->sdma_uploads[sctx->num_sdma_uploads++];
+ up->dst = up->src = NULL;
+ si_resource_reference(&up->dst, buf);
+ si_resource_reference(&up->src, stransfer->staging);
+ up->dst_offset = box->x;
+ up->src_offset = src_offset;
+ up->size = box->width;
+ return;
+ }
+
+ /* Copy the staging buffer into the original one. */
+ si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b, box->x, src_offset,
+ box->width);
+ }
+
+ util_range_add(&buf->b.b, &buf->valid_buffer_range, box->x, box->x + box->width);
}
-static void si_buffer_flush_region(struct pipe_context *ctx,
- struct pipe_transfer *transfer,
- const struct pipe_box *rel_box)
+static void si_buffer_flush_region(struct pipe_context *ctx, struct pipe_transfer *transfer,
+ const struct pipe_box *rel_box)
{
- unsigned required_usage = PIPE_TRANSFER_WRITE |
- PIPE_TRANSFER_FLUSH_EXPLICIT;
+ unsigned required_usage = PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT;
- if ((transfer->usage & required_usage) == required_usage) {
- struct pipe_box box;
+ if ((transfer->usage & required_usage) == required_usage) {
+ struct pipe_box box;
- u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box);
- si_buffer_do_flush_region(ctx, transfer, &box);
- }
+ u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box);
+ si_buffer_do_flush_region(ctx, transfer, &box);
+ }
}
-static void si_buffer_transfer_unmap(struct pipe_context *ctx,
- struct pipe_transfer *transfer)
+static void si_buffer_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer *transfer)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_transfer *stransfer = (struct si_transfer*)transfer;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_transfer *stransfer = (struct si_transfer *)transfer;
- if (transfer->usage & PIPE_TRANSFER_WRITE &&
- !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT))
- si_buffer_do_flush_region(ctx, transfer, &transfer->box);
+ if (transfer->usage & PIPE_TRANSFER_WRITE && !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT))
+ si_buffer_do_flush_region(ctx, transfer, &transfer->box);
- si_resource_reference(&stransfer->staging, NULL);
- assert(stransfer->b.staging == NULL); /* for threaded context only */
- pipe_resource_reference(&transfer->resource, NULL);
+ si_resource_reference(&stransfer->staging, NULL);
+ assert(stransfer->b.staging == NULL); /* for threaded context only */
+ pipe_resource_reference(&transfer->resource, NULL);
- /* Don't use pool_transfers_unsync. We are always in the driver
- * thread. */
- slab_free(&sctx->pool_transfers, transfer);
+ /* Don't use pool_transfers_unsync. We are always in the driver
+ * thread. */
+ slab_free(&sctx->pool_transfers, transfer);
}
-static void si_buffer_subdata(struct pipe_context *ctx,
- struct pipe_resource *buffer,
- unsigned usage, unsigned offset,
- unsigned size, const void *data)
+static void si_buffer_subdata(struct pipe_context *ctx, struct pipe_resource *buffer,
+ unsigned usage, unsigned offset, unsigned size, const void *data)
{
- struct pipe_transfer *transfer = NULL;
- struct pipe_box box;
- uint8_t *map = NULL;
+ struct pipe_transfer *transfer = NULL;
+ struct pipe_box box;
+ uint8_t *map = NULL;
- usage |= PIPE_TRANSFER_WRITE;
+ usage |= PIPE_TRANSFER_WRITE;
- if (!(usage & PIPE_TRANSFER_MAP_DIRECTLY))
- usage |= PIPE_TRANSFER_DISCARD_RANGE;
+ if (!(usage & PIPE_TRANSFER_MAP_DIRECTLY))
+ usage |= PIPE_TRANSFER_DISCARD_RANGE;
- u_box_1d(offset, size, &box);
- map = si_buffer_transfer_map(ctx, buffer, 0, usage, &box, &transfer);
- if (!map)
- return;
+ u_box_1d(offset, size, &box);
+ map = si_buffer_transfer_map(ctx, buffer, 0, usage, &box, &transfer);
+ if (!map)
+ return;
- memcpy(map, data, size);
- si_buffer_transfer_unmap(ctx, transfer);
+ memcpy(map, data, size);
+ si_buffer_transfer_unmap(ctx, transfer);
}
-static const struct u_resource_vtbl si_buffer_vtbl =
-{
- NULL, /* get_handle */
- si_buffer_destroy, /* resource_destroy */
- si_buffer_transfer_map, /* transfer_map */
- si_buffer_flush_region, /* transfer_flush_region */
- si_buffer_transfer_unmap, /* transfer_unmap */
+static const struct u_resource_vtbl si_buffer_vtbl = {
+ NULL, /* get_handle */
+ si_buffer_destroy, /* resource_destroy */
+ si_buffer_transfer_map, /* transfer_map */
+ si_buffer_flush_region, /* transfer_flush_region */
+ si_buffer_transfer_unmap, /* transfer_unmap */
};
-static struct si_resource *
-si_alloc_buffer_struct(struct pipe_screen *screen,
- const struct pipe_resource *templ)
+static struct si_resource *si_alloc_buffer_struct(struct pipe_screen *screen,
+ const struct pipe_resource *templ)
{
- struct si_resource *buf;
+ struct si_resource *buf;
- buf = MALLOC_STRUCT(si_resource);
+ buf = MALLOC_STRUCT(si_resource);
- buf->b.b = *templ;
- buf->b.b.next = NULL;
- pipe_reference_init(&buf->b.b.reference, 1);
- buf->b.b.screen = screen;
+ buf->b.b = *templ;
+ buf->b.b.next = NULL;
+ pipe_reference_init(&buf->b.b.reference, 1);
+ buf->b.b.screen = screen;
- buf->b.vtbl = &si_buffer_vtbl;
- threaded_resource_init(&buf->b.b);
+ buf->b.vtbl = &si_buffer_vtbl;
+ threaded_resource_init(&buf->b.b);
- buf->buf = NULL;
- buf->bind_history = 0;
- buf->TC_L2_dirty = false;
- util_range_init(&buf->valid_buffer_range);
- return buf;
+ buf->buf = NULL;
+ buf->bind_history = 0;
+ buf->TC_L2_dirty = false;
+ util_range_init(&buf->valid_buffer_range);
+ return buf;
}
static struct pipe_resource *si_buffer_create(struct pipe_screen *screen,
- const struct pipe_resource *templ,
- unsigned alignment)
+ const struct pipe_resource *templ, unsigned alignment)
{
- struct si_screen *sscreen = (struct si_screen*)screen;
- struct si_resource *buf = si_alloc_buffer_struct(screen, templ);
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct si_resource *buf = si_alloc_buffer_struct(screen, templ);
- if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
- buf->b.b.flags |= SI_RESOURCE_FLAG_UNMAPPABLE;
+ if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
+ buf->b.b.flags |= SI_RESOURCE_FLAG_UNMAPPABLE;
- si_init_resource_fields(sscreen, buf, templ->width0, alignment);
+ si_init_resource_fields(sscreen, buf, templ->width0, alignment);
- if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
- buf->flags |= RADEON_FLAG_SPARSE;
+ if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
+ buf->flags |= RADEON_FLAG_SPARSE;
- if (!si_alloc_resource(sscreen, buf)) {
- FREE(buf);
- return NULL;
- }
- return &buf->b.b;
+ if (!si_alloc_resource(sscreen, buf)) {
+ FREE(buf);
+ return NULL;
+ }
+ return &buf->b.b;
}
-struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen,
- unsigned flags, unsigned usage,
- unsigned size, unsigned alignment)
+struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, unsigned flags,
+ unsigned usage, unsigned size, unsigned alignment)
{
- struct pipe_resource buffer;
-
- memset(&buffer, 0, sizeof buffer);
- buffer.target = PIPE_BUFFER;
- buffer.format = PIPE_FORMAT_R8_UNORM;
- buffer.bind = 0;
- buffer.usage = usage;
- buffer.flags = flags;
- buffer.width0 = size;
- buffer.height0 = 1;
- buffer.depth0 = 1;
- buffer.array_size = 1;
- return si_buffer_create(screen, &buffer, alignment);
+ struct pipe_resource buffer;
+
+ memset(&buffer, 0, sizeof buffer);
+ buffer.target = PIPE_BUFFER;
+ buffer.format = PIPE_FORMAT_R8_UNORM;
+ buffer.bind = 0;
+ buffer.usage = usage;
+ buffer.flags = flags;
+ buffer.width0 = size;
+ buffer.height0 = 1;
+ buffer.depth0 = 1;
+ buffer.array_size = 1;
+ return si_buffer_create(screen, &buffer, alignment);
}
-struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen,
- unsigned flags, unsigned usage,
- unsigned size, unsigned alignment)
+struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags,
+ unsigned usage, unsigned size, unsigned alignment)
{
- return si_resource(pipe_aligned_buffer_create(screen, flags, usage,
- size, alignment));
+ return si_resource(pipe_aligned_buffer_create(screen, flags, usage, size, alignment));
}
-static struct pipe_resource *
-si_buffer_from_user_memory(struct pipe_screen *screen,
- const struct pipe_resource *templ,
- void *user_memory)
+static struct pipe_resource *si_buffer_from_user_memory(struct pipe_screen *screen,
+ const struct pipe_resource *templ,
+ void *user_memory)
{
- struct si_screen *sscreen = (struct si_screen*)screen;
- struct radeon_winsys *ws = sscreen->ws;
- struct si_resource *buf = si_alloc_buffer_struct(screen, templ);
-
- buf->domains = RADEON_DOMAIN_GTT;
- buf->flags = 0;
- buf->b.is_user_ptr = true;
- util_range_add(&buf->b.b, &buf->valid_buffer_range, 0, templ->width0);
- util_range_add(&buf->b.b, &buf->b.valid_buffer_range, 0, templ->width0);
-
- /* Convert a user pointer to a buffer. */
- buf->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0);
- if (!buf->buf) {
- FREE(buf);
- return NULL;
- }
-
- buf->gpu_address = ws->buffer_get_virtual_address(buf->buf);
- buf->vram_usage = 0;
- buf->gart_usage = templ->width0;
-
- return &buf->b.b;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct radeon_winsys *ws = sscreen->ws;
+ struct si_resource *buf = si_alloc_buffer_struct(screen, templ);
+
+ buf->domains = RADEON_DOMAIN_GTT;
+ buf->flags = 0;
+ buf->b.is_user_ptr = true;
+ util_range_add(&buf->b.b, &buf->valid_buffer_range, 0, templ->width0);
+ util_range_add(&buf->b.b, &buf->b.valid_buffer_range, 0, templ->width0);
+
+ /* Convert a user pointer to a buffer. */
+ buf->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0);
+ if (!buf->buf) {
+ FREE(buf);
+ return NULL;
+ }
+
+ buf->gpu_address = ws->buffer_get_virtual_address(buf->buf);
+ buf->vram_usage = 0;
+ buf->gart_usage = templ->width0;
+
+ return &buf->b.b;
}
static struct pipe_resource *si_resource_create(struct pipe_screen *screen,
- const struct pipe_resource *templ)
+ const struct pipe_resource *templ)
{
- if (templ->target == PIPE_BUFFER) {
- return si_buffer_create(screen, templ, 256);
- } else {
- return si_texture_create(screen, templ);
- }
+ if (templ->target == PIPE_BUFFER) {
+ return si_buffer_create(screen, templ, 256);
+ } else {
+ return si_texture_create(screen, templ);
+ }
}
-static bool si_resource_commit(struct pipe_context *pctx,
- struct pipe_resource *resource,
- unsigned level, struct pipe_box *box,
- bool commit)
+static bool si_resource_commit(struct pipe_context *pctx, struct pipe_resource *resource,
+ unsigned level, struct pipe_box *box, bool commit)
{
- struct si_context *ctx = (struct si_context *)pctx;
- struct si_resource *res = si_resource(resource);
-
- /*
- * Since buffer commitment changes cannot be pipelined, we need to
- * (a) flush any pending commands that refer to the buffer we're about
- * to change, and
- * (b) wait for threaded submit to finish, including those that were
- * triggered by some other, earlier operation.
- */
- if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
- ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs,
- res->buf, RADEON_USAGE_READWRITE)) {
- si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
- }
- if (radeon_emitted(ctx->sdma_cs, 0) &&
- ctx->ws->cs_is_buffer_referenced(ctx->sdma_cs,
- res->buf, RADEON_USAGE_READWRITE)) {
- si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
- }
-
- if (ctx->sdma_cs)
- ctx->ws->cs_sync_flush(ctx->sdma_cs);
- ctx->ws->cs_sync_flush(ctx->gfx_cs);
-
- assert(resource->target == PIPE_BUFFER);
-
- return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
+ struct si_context *ctx = (struct si_context *)pctx;
+ struct si_resource *res = si_resource(resource);
+
+ /*
+ * Since buffer commitment changes cannot be pipelined, we need to
+ * (a) flush any pending commands that refer to the buffer we're about
+ * to change, and
+ * (b) wait for threaded submit to finish, including those that were
+ * triggered by some other, earlier operation.
+ */
+ if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
+ ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, res->buf, RADEON_USAGE_READWRITE)) {
+ si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ }
+ if (radeon_emitted(ctx->sdma_cs, 0) &&
+ ctx->ws->cs_is_buffer_referenced(ctx->sdma_cs, res->buf, RADEON_USAGE_READWRITE)) {
+ si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+ }
+
+ if (ctx->sdma_cs)
+ ctx->ws->cs_sync_flush(ctx->sdma_cs);
+ ctx->ws->cs_sync_flush(ctx->gfx_cs);
+
+ assert(resource->target == PIPE_BUFFER);
+
+ return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
}
void si_init_screen_buffer_functions(struct si_screen *sscreen)
{
- sscreen->b.resource_create = si_resource_create;
- sscreen->b.resource_destroy = u_resource_destroy_vtbl;
- sscreen->b.resource_from_user_memory = si_buffer_from_user_memory;
+ sscreen->b.resource_create = si_resource_create;
+ sscreen->b.resource_destroy = u_resource_destroy_vtbl;
+ sscreen->b.resource_from_user_memory = si_buffer_from_user_memory;
}
void si_init_buffer_functions(struct si_context *sctx)
{
- sctx->b.invalidate_resource = si_invalidate_resource;
- sctx->b.transfer_map = u_transfer_map_vtbl;
- sctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
- sctx->b.transfer_unmap = u_transfer_unmap_vtbl;
- sctx->b.texture_subdata = u_default_texture_subdata;
- sctx->b.buffer_subdata = si_buffer_subdata;
- sctx->b.resource_commit = si_resource_commit;
+ sctx->b.invalidate_resource = si_invalidate_resource;
+ sctx->b.transfer_map = u_transfer_map_vtbl;
+ sctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
+ sctx->b.transfer_unmap = u_transfer_unmap_vtbl;
+ sctx->b.texture_subdata = u_default_texture_subdata;
+ sctx->b.buffer_subdata = si_buffer_subdata;
+ sctx->b.resource_commit = si_resource_commit;
}
static inline void radeon_set_config_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
{
- assert(reg < SI_CONTEXT_REG_OFFSET);
- assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
- radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0));
- radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2);
+ assert(reg < SI_CONTEXT_REG_OFFSET);
+ assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
+ radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0));
+ radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2);
}
static inline void radeon_set_config_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
- radeon_set_config_reg_seq(cs, reg, 1);
- radeon_emit(cs, value);
+ radeon_set_config_reg_seq(cs, reg, 1);
+ radeon_emit(cs, value);
}
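/* For reference (a sketch derived from the helpers in this file): each
 * SET_*_REG packet built here has the layout
 *
 *    dw0:  PKT3(opcode, num_values, 0)
 *    dw1:  (reg - block_base_offset) >> 2
 *    dw2+: num_values register values
 *
 * so the *_seq() helpers emit dw0/dw1 and callers follow with one
 * radeon_emit() per value.
 */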
static inline void radeon_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
{
- assert(reg >= SI_CONTEXT_REG_OFFSET);
- assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
- radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0));
- radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
+ assert(reg >= SI_CONTEXT_REG_OFFSET);
+ assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
+ radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0));
+ radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
}
static inline void radeon_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
- radeon_set_context_reg_seq(cs, reg, 1);
- radeon_emit(cs, value);
+ radeon_set_context_reg_seq(cs, reg, 1);
+ radeon_emit(cs, value);
}
-static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs,
- unsigned reg, unsigned idx,
- unsigned value)
+static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs, unsigned reg, unsigned idx,
+ unsigned value)
{
- assert(reg >= SI_CONTEXT_REG_OFFSET);
- assert(cs->current.cdw + 3 <= cs->current.max_dw);
- radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0));
- radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28));
- radeon_emit(cs, value);
+ assert(reg >= SI_CONTEXT_REG_OFFSET);
+ assert(cs->current.cdw + 3 <= cs->current.max_dw);
+ radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0));
+ radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28));
+ radeon_emit(cs, value);
}
static inline void radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
{
- assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
- assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
- radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0));
- radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
+ assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
+ assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
+ radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0));
+ radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
}
static inline void radeon_set_sh_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
- radeon_set_sh_reg_seq(cs, reg, 1);
- radeon_emit(cs, value);
+ radeon_set_sh_reg_seq(cs, reg, 1);
+ radeon_emit(cs, value);
}
static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
{
- assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
- assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
- radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0));
- radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
+ assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
+ assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
+ radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0));
+ radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
}
static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
- radeon_set_uconfig_reg_seq(cs, reg, 1);
- radeon_emit(cs, value);
+ radeon_set_uconfig_reg_seq(cs, reg, 1);
+ radeon_emit(cs, value);
}
-static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs,
- struct si_screen *screen,
- unsigned reg, unsigned idx,
- unsigned value)
+static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs, struct si_screen *screen,
+ unsigned reg, unsigned idx, unsigned value)
{
- assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
- assert(cs->current.cdw + 3 <= cs->current.max_dw);
- assert(idx != 0);
- unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
- if (screen->info.chip_class < GFX9 ||
- (screen->info.chip_class == GFX9 && screen->info.me_fw_version < 26))
- opcode = PKT3_SET_UCONFIG_REG;
- radeon_emit(cs, PKT3(opcode, 1, 0));
- radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28));
- radeon_emit(cs, value);
+ assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
+ assert(cs->current.cdw + 3 <= cs->current.max_dw);
+ assert(idx != 0);
+ unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
+ if (screen->info.chip_class < GFX9 ||
+ (screen->info.chip_class == GFX9 && screen->info.me_fw_version < 26))
+ opcode = PKT3_SET_UCONFIG_REG;
+ radeon_emit(cs, PKT3(opcode, 1, 0));
+ radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28));
+ radeon_emit(cs, value);
}
static inline void radeon_set_context_reg_rmw(struct radeon_cmdbuf *cs, unsigned reg,
- unsigned value, unsigned mask)
+ unsigned value, unsigned mask)
{
- assert(reg >= SI_CONTEXT_REG_OFFSET);
- assert(cs->current.cdw + 4 <= cs->current.max_dw);
- radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0));
- radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
- radeon_emit(cs, mask);
- radeon_emit(cs, value);
+ assert(reg >= SI_CONTEXT_REG_OFFSET);
+ assert(cs->current.cdw + 4 <= cs->current.max_dw);
+ radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0));
+ radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
+ radeon_emit(cs, mask);
+ radeon_emit(cs, value);
}
/* Emit PKT3_CONTEXT_REG_RMW if the register value is different. */
static inline void radeon_opt_set_context_reg_rmw(struct si_context *sctx, unsigned offset,
- enum si_tracked_reg reg, unsigned value,
- unsigned mask)
+ enum si_tracked_reg reg, unsigned value,
+ unsigned mask)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
- assert((value & ~mask) == 0);
- value &= mask;
+ assert((value & ~mask) == 0);
+ value &= mask;
- if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
- sctx->tracked_regs.reg_value[reg] != value) {
- radeon_set_context_reg_rmw(cs, offset, value, mask);
+ if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
+ sctx->tracked_regs.reg_value[reg] != value) {
+ radeon_set_context_reg_rmw(cs, offset, value, mask);
- sctx->tracked_regs.reg_saved |= 0x1ull << reg;
- sctx->tracked_regs.reg_value[reg] = value;
- }
+ sctx->tracked_regs.reg_saved |= 0x1ull << reg;
+ sctx->tracked_regs.reg_value[reg] = value;
+ }
}
/* Emit PKT3_SET_CONTEXT_REG if the register value is different. */
static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset,
- enum si_tracked_reg reg, unsigned value)
+ enum si_tracked_reg reg, unsigned value)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
- if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
- sctx->tracked_regs.reg_value[reg] != value) {
- radeon_set_context_reg(cs, offset, value);
+ if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
+ sctx->tracked_regs.reg_value[reg] != value) {
+ radeon_set_context_reg(cs, offset, value);
- sctx->tracked_regs.reg_saved |= 0x1ull << reg;
- sctx->tracked_regs.reg_value[reg] = value;
- }
+ sctx->tracked_regs.reg_saved |= 0x1ull << reg;
+ sctx->tracked_regs.reg_value[reg] = value;
+ }
}
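/* Usage sketch (hypothetical enum/offset names): a state-emit path calls
 *
 *    radeon_opt_set_context_reg(sctx, reg_offset, SI_TRACKED_FOO, value);
 *
 * The first call emits SET_CONTEXT_REG and shadows the value in tracked_regs;
 * later calls with an identical value are skipped because reg_saved/reg_value
 * already match.
 */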
/**
 * @param value2 is written to the second register
*/
static inline void radeon_opt_set_context_reg2(struct si_context *sctx, unsigned offset,
- enum si_tracked_reg reg, unsigned value1,
- unsigned value2)
+ enum si_tracked_reg reg, unsigned value1,
+ unsigned value2)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
- if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 ||
- sctx->tracked_regs.reg_value[reg] != value1 ||
- sctx->tracked_regs.reg_value[reg+1] != value2) {
- radeon_set_context_reg_seq(cs, offset, 2);
- radeon_emit(cs, value1);
- radeon_emit(cs, value2);
-
- sctx->tracked_regs.reg_value[reg] = value1;
- sctx->tracked_regs.reg_value[reg+1] = value2;
- sctx->tracked_regs.reg_saved |= 0x3ull << reg;
- }
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 ||
+ sctx->tracked_regs.reg_value[reg] != value1 ||
+ sctx->tracked_regs.reg_value[reg + 1] != value2) {
+ radeon_set_context_reg_seq(cs, offset, 2);
+ radeon_emit(cs, value1);
+ radeon_emit(cs, value2);
+
+ sctx->tracked_regs.reg_value[reg] = value1;
+ sctx->tracked_regs.reg_value[reg + 1] = value2;
+ sctx->tracked_regs.reg_saved |= 0x3ull << reg;
+ }
}
/**
 * Set 3 consecutive registers if any register's value is different.
*/
static inline void radeon_opt_set_context_reg3(struct si_context *sctx, unsigned offset,
- enum si_tracked_reg reg, unsigned value1,
- unsigned value2, unsigned value3)
+ enum si_tracked_reg reg, unsigned value1,
+ unsigned value2, unsigned value3)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
- if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 ||
- sctx->tracked_regs.reg_value[reg] != value1 ||
- sctx->tracked_regs.reg_value[reg+1] != value2 ||
- sctx->tracked_regs.reg_value[reg+2] != value3) {
- radeon_set_context_reg_seq(cs, offset, 3);
- radeon_emit(cs, value1);
- radeon_emit(cs, value2);
- radeon_emit(cs, value3);
-
- sctx->tracked_regs.reg_value[reg] = value1;
- sctx->tracked_regs.reg_value[reg+1] = value2;
- sctx->tracked_regs.reg_value[reg+2] = value3;
- sctx->tracked_regs.reg_saved |= 0x7ull << reg;
- }
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 ||
+ sctx->tracked_regs.reg_value[reg] != value1 ||
+ sctx->tracked_regs.reg_value[reg + 1] != value2 ||
+ sctx->tracked_regs.reg_value[reg + 2] != value3) {
+ radeon_set_context_reg_seq(cs, offset, 3);
+ radeon_emit(cs, value1);
+ radeon_emit(cs, value2);
+ radeon_emit(cs, value3);
+
+ sctx->tracked_regs.reg_value[reg] = value1;
+ sctx->tracked_regs.reg_value[reg + 1] = value2;
+ sctx->tracked_regs.reg_value[reg + 2] = value3;
+ sctx->tracked_regs.reg_saved |= 0x7ull << reg;
+ }
}
/**
 * Set 4 consecutive registers if any register's value is different.
*/
static inline void radeon_opt_set_context_reg4(struct si_context *sctx, unsigned offset,
- enum si_tracked_reg reg, unsigned value1,
- unsigned value2, unsigned value3,
- unsigned value4)
+ enum si_tracked_reg reg, unsigned value1,
+ unsigned value2, unsigned value3, unsigned value4)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
- if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf ||
- sctx->tracked_regs.reg_value[reg] != value1 ||
- sctx->tracked_regs.reg_value[reg+1] != value2 ||
- sctx->tracked_regs.reg_value[reg+2] != value3 ||
- sctx->tracked_regs.reg_value[reg+3] != value4) {
- radeon_set_context_reg_seq(cs, offset, 4);
- radeon_emit(cs, value1);
- radeon_emit(cs, value2);
- radeon_emit(cs, value3);
- radeon_emit(cs, value4);
-
- sctx->tracked_regs.reg_value[reg] = value1;
- sctx->tracked_regs.reg_value[reg+1] = value2;
- sctx->tracked_regs.reg_value[reg+2] = value3;
- sctx->tracked_regs.reg_value[reg+3] = value4;
- sctx->tracked_regs.reg_saved |= 0xfull << reg;
- }
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf ||
+ sctx->tracked_regs.reg_value[reg] != value1 ||
+ sctx->tracked_regs.reg_value[reg + 1] != value2 ||
+ sctx->tracked_regs.reg_value[reg + 2] != value3 ||
+ sctx->tracked_regs.reg_value[reg + 3] != value4) {
+ radeon_set_context_reg_seq(cs, offset, 4);
+ radeon_emit(cs, value1);
+ radeon_emit(cs, value2);
+ radeon_emit(cs, value3);
+ radeon_emit(cs, value4);
+
+ sctx->tracked_regs.reg_value[reg] = value1;
+ sctx->tracked_regs.reg_value[reg + 1] = value2;
+ sctx->tracked_regs.reg_value[reg + 2] = value3;
+ sctx->tracked_regs.reg_value[reg + 3] = value4;
+ sctx->tracked_regs.reg_saved |= 0xfull << reg;
+ }
}
/**
 * Set consecutive registers if any register's value is different.
*/
static inline void radeon_opt_set_context_regn(struct si_context *sctx, unsigned offset,
- unsigned *value, unsigned *saved_val,
- unsigned num)
+ unsigned *value, unsigned *saved_val, unsigned num)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- int i, j;
-
- for (i = 0; i < num; i++) {
- if (saved_val[i] != value[i]) {
- radeon_set_context_reg_seq(cs, offset, num);
- for (j = 0; j < num; j++)
- radeon_emit(cs, value[j]);
-
- memcpy(saved_val, value, sizeof(uint32_t) * num);
- break;
- }
- }
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ int i, j;
+
+ for (i = 0; i < num; i++) {
+ if (saved_val[i] != value[i]) {
+ radeon_set_context_reg_seq(cs, offset, num);
+ for (j = 0; j < num; j++)
+ radeon_emit(cs, value[j]);
+
+ memcpy(saved_val, value, sizeof(uint32_t) * num);
+ break;
+ }
+ }
}
#endif
#include "si_pipe.h"
#include "sid.h"
-
#include "util/format/u_format.h"
#include "util/u_pack_color.h"
#include "util/u_surface.h"
-enum {
- SI_CLEAR = SI_SAVE_FRAGMENT_STATE,
- SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE,
+enum
+{
+ SI_CLEAR = SI_SAVE_FRAGMENT_STATE,
+ SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE,
};
-static void si_alloc_separate_cmask(struct si_screen *sscreen,
- struct si_texture *tex)
+static void si_alloc_separate_cmask(struct si_screen *sscreen, struct si_texture *tex)
{
- /* CMASK for MSAA is allocated in advance or always disabled
- * by "nofmask" option.
- */
- if (tex->cmask_buffer || !tex->surface.cmask_size ||
- tex->buffer.b.b.nr_samples >= 2)
- return;
-
- tex->cmask_buffer =
- si_aligned_buffer_create(&sscreen->b,
- SI_RESOURCE_FLAG_UNMAPPABLE,
- PIPE_USAGE_DEFAULT,
- tex->surface.cmask_size,
- tex->surface.cmask_alignment);
- if (tex->cmask_buffer == NULL)
- return;
-
- tex->cmask_base_address_reg = tex->cmask_buffer->gpu_address >> 8;
- tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
-
- p_atomic_inc(&sscreen->compressed_colortex_counter);
+ /* CMASK for MSAA is allocated in advance or always disabled
+ * by "nofmask" option.
+ */
+ if (tex->cmask_buffer || !tex->surface.cmask_size || tex->buffer.b.b.nr_samples >= 2)
+ return;
+
+ tex->cmask_buffer =
+ si_aligned_buffer_create(&sscreen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
+ tex->surface.cmask_size, tex->surface.cmask_alignment);
+ if (tex->cmask_buffer == NULL)
+ return;
+
+ tex->cmask_base_address_reg = tex->cmask_buffer->gpu_address >> 8;
+ tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
+
+ p_atomic_inc(&sscreen->compressed_colortex_counter);
}
-static bool si_set_clear_color(struct si_texture *tex,
- enum pipe_format surface_format,
- const union pipe_color_union *color)
+static bool si_set_clear_color(struct si_texture *tex, enum pipe_format surface_format,
+ const union pipe_color_union *color)
{
- union util_color uc;
-
- memset(&uc, 0, sizeof(uc));
-
- if (tex->surface.bpe == 16) {
- /* DCC fast clear only:
- * CLEAR_WORD0 = R = G = B
- * CLEAR_WORD1 = A
- */
- assert(color->ui[0] == color->ui[1] &&
- color->ui[0] == color->ui[2]);
- uc.ui[0] = color->ui[0];
- uc.ui[1] = color->ui[3];
- } else {
- util_pack_color_union(surface_format, &uc, color);
- }
-
- if (memcmp(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)) == 0)
- return false;
-
- memcpy(tex->color_clear_value, &uc, 2 * sizeof(uint32_t));
- return true;
+ union util_color uc;
+
+ memset(&uc, 0, sizeof(uc));
+
+ if (tex->surface.bpe == 16) {
+ /* DCC fast clear only:
+ * CLEAR_WORD0 = R = G = B
+ * CLEAR_WORD1 = A
+ */
+ assert(color->ui[0] == color->ui[1] && color->ui[0] == color->ui[2]);
+ uc.ui[0] = color->ui[0];
+ uc.ui[1] = color->ui[3];
+ } else {
+ util_pack_color_union(surface_format, &uc, color);
+ }
+
+ if (memcmp(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)) == 0)
+ return false;
+
+ memcpy(tex->color_clear_value, &uc, 2 * sizeof(uint32_t));
+ return true;
}
/** Linearize and convert luminance/intensity to red. */
enum pipe_format si_simplify_cb_format(enum pipe_format format)
{
- format = util_format_linear(format);
- format = util_format_luminance_to_red(format);
- return util_format_intensity_to_red(format);
+ format = util_format_linear(format);
+ format = util_format_luminance_to_red(format);
+ return util_format_intensity_to_red(format);
}
bool vi_alpha_is_on_msb(struct si_screen *sscreen, enum pipe_format format)
{
- format = si_simplify_cb_format(format);
- const struct util_format_description *desc = util_format_description(format);
+ format = si_simplify_cb_format(format);
+ const struct util_format_description *desc = util_format_description(format);
- /* Formats with 3 channels can't have alpha. */
- if (desc->nr_channels == 3)
- return true; /* same as xxxA; is any value OK here? */
+ /* Formats with 3 channels can't have alpha. */
+ if (desc->nr_channels == 3)
+ return true; /* same as xxxA; is any value OK here? */
- if (sscreen->info.chip_class >= GFX10 && desc->nr_channels == 1)
- return desc->swizzle[3] == PIPE_SWIZZLE_X;
+ if (sscreen->info.chip_class >= GFX10 && desc->nr_channels == 1)
+ return desc->swizzle[3] == PIPE_SWIZZLE_X;
- return si_translate_colorswap(format, false) <= 1;
+ return si_translate_colorswap(format, false) <= 1;
}
-static bool vi_get_fast_clear_parameters(struct si_screen *sscreen,
- enum pipe_format base_format,
- enum pipe_format surface_format,
- const union pipe_color_union *color,
- uint32_t* clear_value,
- bool *eliminate_needed)
+static bool vi_get_fast_clear_parameters(struct si_screen *sscreen, enum pipe_format base_format,
+ enum pipe_format surface_format,
+ const union pipe_color_union *color, uint32_t *clear_value,
+ bool *eliminate_needed)
{
- /* If we want to clear without needing a fast clear eliminate step, we
- * can set color and alpha independently to 0 or 1 (or 0/max for integer
- * formats).
- */
- bool values[4] = {}; /* whether to clear to 0 or 1 */
- bool color_value = false; /* clear color to 0 or 1 */
- bool alpha_value = false; /* clear alpha to 0 or 1 */
- int alpha_channel; /* index of the alpha component */
- bool has_color = false;
- bool has_alpha = false;
-
- const struct util_format_description *desc =
- util_format_description(si_simplify_cb_format(surface_format));
-
- /* 128-bit fast clear with different R,G,B values is unsupported. */
- if (desc->block.bits == 128 &&
- (color->ui[0] != color->ui[1] ||
- color->ui[0] != color->ui[2]))
- return false;
-
- *eliminate_needed = true;
- *clear_value = DCC_CLEAR_COLOR_REG;
-
- if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
- return true; /* need ELIMINATE_FAST_CLEAR */
-
- bool base_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, base_format);
- bool surf_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, surface_format);
-
- /* Formats with 3 channels can't have alpha. */
- if (desc->nr_channels == 3)
- alpha_channel = -1;
- else if (surf_alpha_is_on_msb)
- alpha_channel = desc->nr_channels - 1;
- else
- alpha_channel = 0;
-
- for (int i = 0; i < 4; ++i) {
- if (desc->swizzle[i] >= PIPE_SWIZZLE_0)
- continue;
-
- if (desc->channel[i].pure_integer &&
- desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
- /* Use the maximum value for clamping the clear color. */
- int max = u_bit_consecutive(0, desc->channel[i].size - 1);
-
- values[i] = color->i[i] != 0;
- if (color->i[i] != 0 && MIN2(color->i[i], max) != max)
- return true; /* need ELIMINATE_FAST_CLEAR */
- } else if (desc->channel[i].pure_integer &&
- desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
- /* Use the maximum value for clamping the clear color. */
- unsigned max = u_bit_consecutive(0, desc->channel[i].size);
-
- values[i] = color->ui[i] != 0U;
- if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max)
- return true; /* need ELIMINATE_FAST_CLEAR */
- } else {
- values[i] = color->f[i] != 0.0F;
- if (color->f[i] != 0.0F && color->f[i] != 1.0F)
- return true; /* need ELIMINATE_FAST_CLEAR */
- }
-
- if (desc->swizzle[i] == alpha_channel) {
- alpha_value = values[i];
- has_alpha = true;
- } else {
- color_value = values[i];
- has_color = true;
- }
- }
-
- /* If alpha isn't present, make it the same as color, and vice versa. */
- if (!has_alpha)
- alpha_value = color_value;
- else if (!has_color)
- color_value = alpha_value;
-
- if (color_value != alpha_value &&
- base_alpha_is_on_msb != surf_alpha_is_on_msb)
- return true; /* require ELIMINATE_FAST_CLEAR */
-
- /* Check if all color values are equal if they are present. */
- for (int i = 0; i < 4; ++i) {
- if (desc->swizzle[i] <= PIPE_SWIZZLE_W &&
- desc->swizzle[i] != alpha_channel &&
- values[i] != color_value)
- return true; /* require ELIMINATE_FAST_CLEAR */
- }
-
- /* This doesn't need ELIMINATE_FAST_CLEAR.
- * On chips predating Raven2, the DCC clear codes and the CB clear
- * color registers must match.
- */
- *eliminate_needed = false;
-
- if (color_value) {
- if (alpha_value)
- *clear_value = DCC_CLEAR_COLOR_1111;
- else
- *clear_value = DCC_CLEAR_COLOR_1110;
- } else {
- if (alpha_value)
- *clear_value = DCC_CLEAR_COLOR_0001;
- else
- *clear_value = DCC_CLEAR_COLOR_0000;
- }
- return true;
+ /* If we want to clear without needing a fast clear eliminate step, we
+ * can set color and alpha independently to 0 or 1 (or 0/max for integer
+ * formats).
+ */
+ bool values[4] = {}; /* whether to clear to 0 or 1 */
+ bool color_value = false; /* clear color to 0 or 1 */
+ bool alpha_value = false; /* clear alpha to 0 or 1 */
+ int alpha_channel; /* index of the alpha component */
+ bool has_color = false;
+ bool has_alpha = false;
+
+ const struct util_format_description *desc =
+ util_format_description(si_simplify_cb_format(surface_format));
+
+ /* 128-bit fast clear with different R,G,B values is unsupported. */
+ if (desc->block.bits == 128 && (color->ui[0] != color->ui[1] || color->ui[0] != color->ui[2]))
+ return false;
+
+ *eliminate_needed = true;
+ *clear_value = DCC_CLEAR_COLOR_REG;
+
+ if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+ return true; /* need ELIMINATE_FAST_CLEAR */
+
+ bool base_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, base_format);
+ bool surf_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, surface_format);
+
+ /* Formats with 3 channels can't have alpha. */
+ if (desc->nr_channels == 3)
+ alpha_channel = -1;
+ else if (surf_alpha_is_on_msb)
+ alpha_channel = desc->nr_channels - 1;
+ else
+ alpha_channel = 0;
+
+ for (int i = 0; i < 4; ++i) {
+ if (desc->swizzle[i] >= PIPE_SWIZZLE_0)
+ continue;
+
+ if (desc->channel[i].pure_integer && desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
+ /* Use the maximum value for clamping the clear color. */
+ int max = u_bit_consecutive(0, desc->channel[i].size - 1);
+
+ values[i] = color->i[i] != 0;
+ if (color->i[i] != 0 && MIN2(color->i[i], max) != max)
+ return true; /* need ELIMINATE_FAST_CLEAR */
+ } else if (desc->channel[i].pure_integer &&
+ desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+ /* Use the maximum value for clamping the clear color. */
+ unsigned max = u_bit_consecutive(0, desc->channel[i].size);
+
+ values[i] = color->ui[i] != 0U;
+ if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max)
+ return true; /* need ELIMINATE_FAST_CLEAR */
+ } else {
+ values[i] = color->f[i] != 0.0F;
+ if (color->f[i] != 0.0F && color->f[i] != 1.0F)
+ return true; /* need ELIMINATE_FAST_CLEAR */
+ }
+
+ if (desc->swizzle[i] == alpha_channel) {
+ alpha_value = values[i];
+ has_alpha = true;
+ } else {
+ color_value = values[i];
+ has_color = true;
+ }
+ }
+
+ /* If alpha isn't present, make it the same as color, and vice versa. */
+ if (!has_alpha)
+ alpha_value = color_value;
+ else if (!has_color)
+ color_value = alpha_value;
+
+ if (color_value != alpha_value && base_alpha_is_on_msb != surf_alpha_is_on_msb)
+ return true; /* require ELIMINATE_FAST_CLEAR */
+
+ /* Check if all color values are equal if they are present. */
+ for (int i = 0; i < 4; ++i) {
+ if (desc->swizzle[i] <= PIPE_SWIZZLE_W && desc->swizzle[i] != alpha_channel &&
+ values[i] != color_value)
+ return true; /* require ELIMINATE_FAST_CLEAR */
+ }
+
+ /* This doesn't need ELIMINATE_FAST_CLEAR.
+ * On chips predating Raven2, the DCC clear codes and the CB clear
+ * color registers must match.
+ */
+ *eliminate_needed = false;
+
+ if (color_value) {
+ if (alpha_value)
+ *clear_value = DCC_CLEAR_COLOR_1111;
+ else
+ *clear_value = DCC_CLEAR_COLOR_1110;
+ } else {
+ if (alpha_value)
+ *clear_value = DCC_CLEAR_COLOR_0001;
+ else
+ *clear_value = DCC_CLEAR_COLOR_0000;
+ }
+ return true;
}
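/* Summary of the no-eliminate clear codes chosen above (derived directly from
 * the code, for reference):
 *
 *    color=0, alpha=0  -> DCC_CLEAR_COLOR_0000
 *    color=0, alpha=1  -> DCC_CLEAR_COLOR_0001
 *    color=1, alpha=0  -> DCC_CLEAR_COLOR_1110
 *    color=1, alpha=1  -> DCC_CLEAR_COLOR_1111
 *
 * Any other clear color keeps DCC_CLEAR_COLOR_REG and requires
 * ELIMINATE_FAST_CLEAR.
 */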
-bool vi_dcc_clear_level(struct si_context *sctx,
- struct si_texture *tex,
- unsigned level, unsigned clear_value)
+bool vi_dcc_clear_level(struct si_context *sctx, struct si_texture *tex, unsigned level,
+ unsigned clear_value)
{
- struct pipe_resource *dcc_buffer;
- uint64_t dcc_offset, clear_size;
-
- assert(vi_dcc_enabled(tex, level));
-
- if (tex->dcc_separate_buffer) {
- dcc_buffer = &tex->dcc_separate_buffer->b.b;
- dcc_offset = 0;
- } else {
- dcc_buffer = &tex->buffer.b.b;
- dcc_offset = tex->surface.dcc_offset;
- }
-
- if (sctx->chip_class >= GFX9) {
- /* Mipmap level clears aren't implemented. */
- if (tex->buffer.b.b.last_level > 0)
- return false;
-
- /* 4x and 8x MSAA needs a sophisticated compute shader for
- * the clear. See AMDVLK. */
- if (tex->buffer.b.b.nr_storage_samples >= 4)
- return false;
-
- clear_size = tex->surface.dcc_size;
- } else {
- unsigned num_layers = util_num_layers(&tex->buffer.b.b, level);
-
- /* If this is 0, fast clear isn't possible. (can occur with MSAA) */
- if (!tex->surface.u.legacy.level[level].dcc_fast_clear_size)
- return false;
-
- /* Layered 4x and 8x MSAA DCC fast clears need to clear
- * dcc_fast_clear_size bytes for each layer. A compute shader
- * would be more efficient than separate per-layer clear operations.
- */
- if (tex->buffer.b.b.nr_storage_samples >= 4 && num_layers > 1)
- return false;
-
- dcc_offset += tex->surface.u.legacy.level[level].dcc_offset;
- clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size *
- num_layers;
- }
-
- si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
- &clear_value, 4, SI_COHERENCY_CB_META, false);
- return true;
+ struct pipe_resource *dcc_buffer;
+ uint64_t dcc_offset, clear_size;
+
+ assert(vi_dcc_enabled(tex, level));
+
+ if (tex->dcc_separate_buffer) {
+ dcc_buffer = &tex->dcc_separate_buffer->b.b;
+ dcc_offset = 0;
+ } else {
+ dcc_buffer = &tex->buffer.b.b;
+ dcc_offset = tex->surface.dcc_offset;
+ }
+
+ if (sctx->chip_class >= GFX9) {
+ /* Mipmap level clears aren't implemented. */
+ if (tex->buffer.b.b.last_level > 0)
+ return false;
+
+ /* 4x and 8x MSAA needs a sophisticated compute shader for
+ * the clear. See AMDVLK. */
+ if (tex->buffer.b.b.nr_storage_samples >= 4)
+ return false;
+
+ clear_size = tex->surface.dcc_size;
+ } else {
+ unsigned num_layers = util_num_layers(&tex->buffer.b.b, level);
+
+ /* If this is 0, fast clear isn't possible. (can occur with MSAA) */
+ if (!tex->surface.u.legacy.level[level].dcc_fast_clear_size)
+ return false;
+
+ /* Layered 4x and 8x MSAA DCC fast clears need to clear
+ * dcc_fast_clear_size bytes for each layer. A compute shader
+ * would be more efficient than separate per-layer clear operations.
+ */
+ if (tex->buffer.b.b.nr_storage_samples >= 4 && num_layers > 1)
+ return false;
+
+ dcc_offset += tex->surface.u.legacy.level[level].dcc_offset;
+ clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size * num_layers;
+ }
+
+ si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size, &clear_value, 4, SI_COHERENCY_CB_META,
+ false);
+ return true;
}
/* Set the same micro tile mode as the destination of the last MSAA resolve.
* This allows hitting the MSAA resolve fast path, which requires that both
* src and dst micro tile modes match.
*/
-static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen,
- struct si_texture *tex)
+static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen, struct si_texture *tex)
{
- if (sscreen->info.chip_class >= GFX10 ||
- tex->buffer.b.is_shared ||
- tex->buffer.b.b.nr_samples <= 1 ||
- tex->surface.micro_tile_mode == tex->last_msaa_resolve_target_micro_mode)
- return;
-
- assert(sscreen->info.chip_class >= GFX9 ||
- tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
- assert(tex->buffer.b.b.last_level == 0);
-
- if (sscreen->info.chip_class >= GFX9) {
- /* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */
- assert(tex->surface.u.gfx9.surf.swizzle_mode >= 4);
-
- /* If you do swizzle_mode % 4, you'll get:
- * 0 = Depth
- * 1 = Standard,
- * 2 = Displayable
- * 3 = Rotated
- *
- * Depth-sample order isn't allowed:
- */
- assert(tex->surface.u.gfx9.surf.swizzle_mode % 4 != 0);
-
- switch (tex->last_msaa_resolve_target_micro_mode) {
- case RADEON_MICRO_MODE_DISPLAY:
- tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
- tex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */
- break;
- case RADEON_MICRO_MODE_THIN:
- tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
- tex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */
- break;
- case RADEON_MICRO_MODE_ROTATED:
- tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
- tex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */
- break;
- default: /* depth */
- assert(!"unexpected micro mode");
- return;
- }
- } else if (sscreen->info.chip_class >= GFX7) {
- /* These magic numbers were copied from addrlib. It doesn't use
- * any definitions for them either. They are all 2D_TILED_THIN1
- * modes with different bpp and micro tile mode.
- */
- switch (tex->last_msaa_resolve_target_micro_mode) {
- case RADEON_MICRO_MODE_DISPLAY:
- tex->surface.u.legacy.tiling_index[0] = 10;
- break;
- case RADEON_MICRO_MODE_THIN:
- tex->surface.u.legacy.tiling_index[0] = 14;
- break;
- case RADEON_MICRO_MODE_ROTATED:
- tex->surface.u.legacy.tiling_index[0] = 28;
- break;
- default: /* depth, thick */
- assert(!"unexpected micro mode");
- return;
- }
- } else { /* GFX6 */
- switch (tex->last_msaa_resolve_target_micro_mode) {
- case RADEON_MICRO_MODE_DISPLAY:
- switch (tex->surface.bpe) {
- case 1:
- tex->surface.u.legacy.tiling_index[0] = 10;
- break;
- case 2:
- tex->surface.u.legacy.tiling_index[0] = 11;
- break;
- default: /* 4, 8 */
- tex->surface.u.legacy.tiling_index[0] = 12;
- break;
- }
- break;
- case RADEON_MICRO_MODE_THIN:
- switch (tex->surface.bpe) {
- case 1:
- tex->surface.u.legacy.tiling_index[0] = 14;
- break;
- case 2:
- tex->surface.u.legacy.tiling_index[0] = 15;
- break;
- case 4:
- tex->surface.u.legacy.tiling_index[0] = 16;
- break;
- default: /* 8, 16 */
- tex->surface.u.legacy.tiling_index[0] = 17;
- break;
- }
- break;
- default: /* depth, thick */
- assert(!"unexpected micro mode");
- return;
- }
- }
-
- tex->surface.micro_tile_mode = tex->last_msaa_resolve_target_micro_mode;
-
- p_atomic_inc(&sscreen->dirty_tex_counter);
+ if (sscreen->info.chip_class >= GFX10 || tex->buffer.b.is_shared ||
+ tex->buffer.b.b.nr_samples <= 1 ||
+ tex->surface.micro_tile_mode == tex->last_msaa_resolve_target_micro_mode)
+ return;
+
+ assert(sscreen->info.chip_class >= GFX9 ||
+ tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
+ assert(tex->buffer.b.b.last_level == 0);
+
+ if (sscreen->info.chip_class >= GFX9) {
+ /* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */
+ assert(tex->surface.u.gfx9.surf.swizzle_mode >= 4);
+
+ /* If you do swizzle_mode % 4, you'll get:
+ * 0 = Depth
+ * 1 = Standard,
+ * 2 = Displayable
+ * 3 = Rotated
+ *
+ * Depth-sample order isn't allowed:
+ */
+ assert(tex->surface.u.gfx9.surf.swizzle_mode % 4 != 0);
+
+ switch (tex->last_msaa_resolve_target_micro_mode) {
+ case RADEON_MICRO_MODE_DISPLAY:
+ tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+ tex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */
+ break;
+ case RADEON_MICRO_MODE_THIN:
+ tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+ tex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */
+ break;
+ case RADEON_MICRO_MODE_ROTATED:
+ tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+ tex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */
+ break;
+ default: /* depth */
+ assert(!"unexpected micro mode");
+ return;
+ }
+ } else if (sscreen->info.chip_class >= GFX7) {
+ /* These magic numbers were copied from addrlib. It doesn't use
+ * any definitions for them either. They are all 2D_TILED_THIN1
+ * modes with different bpp and micro tile mode.
+ */
+ switch (tex->last_msaa_resolve_target_micro_mode) {
+ case RADEON_MICRO_MODE_DISPLAY:
+ tex->surface.u.legacy.tiling_index[0] = 10;
+ break;
+ case RADEON_MICRO_MODE_THIN:
+ tex->surface.u.legacy.tiling_index[0] = 14;
+ break;
+ case RADEON_MICRO_MODE_ROTATED:
+ tex->surface.u.legacy.tiling_index[0] = 28;
+ break;
+ default: /* depth, thick */
+ assert(!"unexpected micro mode");
+ return;
+ }
+ } else { /* GFX6 */
+ switch (tex->last_msaa_resolve_target_micro_mode) {
+ case RADEON_MICRO_MODE_DISPLAY:
+ switch (tex->surface.bpe) {
+ case 1:
+ tex->surface.u.legacy.tiling_index[0] = 10;
+ break;
+ case 2:
+ tex->surface.u.legacy.tiling_index[0] = 11;
+ break;
+ default: /* 4, 8 */
+ tex->surface.u.legacy.tiling_index[0] = 12;
+ break;
+ }
+ break;
+ case RADEON_MICRO_MODE_THIN:
+ switch (tex->surface.bpe) {
+ case 1:
+ tex->surface.u.legacy.tiling_index[0] = 14;
+ break;
+ case 2:
+ tex->surface.u.legacy.tiling_index[0] = 15;
+ break;
+ case 4:
+ tex->surface.u.legacy.tiling_index[0] = 16;
+ break;
+ default: /* 8, 16 */
+ tex->surface.u.legacy.tiling_index[0] = 17;
+ break;
+ }
+ break;
+ default: /* depth, thick */
+ assert(!"unexpected micro mode");
+ return;
+ }
+ }
+
+ tex->surface.micro_tile_mode = tex->last_msaa_resolve_target_micro_mode;
+
+ p_atomic_inc(&sscreen->dirty_tex_counter);
}
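/* Worked example for the GFX9 branch above (sketch): a surface using a
 * standard ("S") swizzle mode is retargeted to the displayable ("D") variant
 * of the same tile size by
 *
 *    swizzle_mode = (swizzle_mode & ~0x3) + 2;
 *
 * which is exactly the RADEON_MICRO_MODE_DISPLAY case handled above.
 */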
-static void si_do_fast_color_clear(struct si_context *sctx,
- unsigned *buffers,
- const union pipe_color_union *color)
+static void si_do_fast_color_clear(struct si_context *sctx, unsigned *buffers,
+ const union pipe_color_union *color)
{
- struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
- int i;
+ struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
+ int i;
- /* This function is broken in BE, so just disable this path for now */
+ /* This function is broken in BE, so just disable this path for now */
#if UTIL_ARCH_BIG_ENDIAN
- return;
+ return;
#endif
- if (sctx->render_cond)
- return;
-
- for (i = 0; i < fb->nr_cbufs; i++) {
- struct si_texture *tex;
- unsigned clear_bit = PIPE_CLEAR_COLOR0 << i;
-
- if (!fb->cbufs[i])
- continue;
-
- /* if this colorbuffer is not being cleared */
- if (!(*buffers & clear_bit))
- continue;
-
- unsigned level = fb->cbufs[i]->u.tex.level;
- if (level > 0)
- continue;
-
- tex = (struct si_texture *)fb->cbufs[i]->texture;
-
- /* TODO: GFX9: Implement DCC fast clear for level 0 of
- * mipmapped textures. Mipmapped DCC has to clear a rectangular
- * area of DCC for level 0 (because the whole miptree is
- * organized in a 2D plane).
- */
- if (sctx->chip_class >= GFX9 &&
- tex->buffer.b.b.last_level > 0)
- continue;
-
- /* the clear is allowed if all layers are bound */
- if (fb->cbufs[i]->u.tex.first_layer != 0 ||
- fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->buffer.b.b, 0)) {
- continue;
- }
-
- /* only supported on tiled surfaces */
- if (tex->surface.is_linear) {
- continue;
- }
-
- /* shared textures can't use fast clear without an explicit flush,
- * because there is no way to communicate the clear color among
- * all clients
- */
- if (tex->buffer.b.is_shared &&
- !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
- continue;
-
- if (sctx->chip_class <= GFX8 &&
- tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D &&
- !sctx->screen->info.htile_cmask_support_1d_tiling)
- continue;
-
- /* Use a slow clear for small surfaces where the cost of
- * the eliminate pass can be higher than the benefit of fast
- * clear. The closed driver does this, but the numbers may differ.
- *
- * This helps on both dGPUs and APUs, even small APUs like Mullins.
- */
- bool too_small = tex->buffer.b.b.nr_samples <= 1 &&
- tex->buffer.b.b.width0 *
- tex->buffer.b.b.height0 <= 512 * 512;
- bool eliminate_needed = false;
- bool fmask_decompress_needed = false;
-
- /* Fast clear is the most appropriate place to enable DCC for
- * displayable surfaces.
- */
- if (sctx->family == CHIP_STONEY && !too_small) {
- vi_separate_dcc_try_enable(sctx, tex);
-
- /* RB+ isn't supported with a CMASK-only clear on Stoney,
- * so all clears are considered to be hypothetically slow
- * clears, which is weighed when determining whether to
- * enable separate DCC.
- */
- if (tex->dcc_gather_statistics) /* only for Stoney */
- tex->num_slow_clears++;
- }
-
- /* Try to clear DCC first, otherwise try CMASK. */
- if (vi_dcc_enabled(tex, 0)) {
- uint32_t reset_value;
-
- if (sctx->screen->debug_flags & DBG(NO_DCC_CLEAR))
- continue;
-
- if (!vi_get_fast_clear_parameters(sctx->screen,
- tex->buffer.b.b.format,
- fb->cbufs[i]->format,
- color, &reset_value,
- &eliminate_needed))
- continue;
-
- if (eliminate_needed && too_small)
- continue;
-
- /* TODO: This DCC+CMASK clear doesn't work with MSAA. */
- if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer &&
- eliminate_needed)
- continue;
-
- if (!vi_dcc_clear_level(sctx, tex, 0, reset_value))
- continue;
-
- tex->separate_dcc_dirty = true;
- tex->displayable_dcc_dirty = true;
-
- /* DCC fast clear with MSAA should clear CMASK to 0xC. */
- if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
- uint32_t clear_value = 0xCCCCCCCC;
- si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
- tex->surface.cmask_offset, tex->surface.cmask_size,
- &clear_value, 4, SI_COHERENCY_CB_META, false);
- fmask_decompress_needed = true;
- }
- } else {
- if (too_small)
- continue;
-
- /* 128-bit formats are unsupported */
- if (tex->surface.bpe > 8) {
- continue;
- }
-
- /* RB+ doesn't work with CMASK fast clear on Stoney. */
- if (sctx->family == CHIP_STONEY)
- continue;
-
- /* ensure CMASK is enabled */
- si_alloc_separate_cmask(sctx->screen, tex);
- if (!tex->cmask_buffer)
- continue;
-
- /* Do the fast clear. */
- uint32_t clear_value = 0;
- si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
- tex->surface.cmask_offset, tex->surface.cmask_size,
- &clear_value, 4, SI_COHERENCY_CB_META, false);
- eliminate_needed = true;
- }
-
- if ((eliminate_needed || fmask_decompress_needed) &&
- !(tex->dirty_level_mask & (1 << level))) {
- tex->dirty_level_mask |= 1 << level;
- p_atomic_inc(&sctx->screen->compressed_colortex_counter);
- }
-
- /* We can change the micro tile mode before a full clear. */
- si_set_optimal_micro_tile_mode(sctx->screen, tex);
-
- *buffers &= ~clear_bit;
-
- /* Chips with DCC constant encoding don't need to set the clear
- * color registers for DCC clear values 0 and 1.
- */
- if (sctx->screen->info.has_dcc_constant_encode && !eliminate_needed)
- continue;
-
- if (si_set_clear_color(tex, fb->cbufs[i]->format, color)) {
- sctx->framebuffer.dirty_cbufs |= 1 << i;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
- }
- }
+ if (sctx->render_cond)
+ return;
+
+ for (i = 0; i < fb->nr_cbufs; i++) {
+ struct si_texture *tex;
+ unsigned clear_bit = PIPE_CLEAR_COLOR0 << i;
+
+ if (!fb->cbufs[i])
+ continue;
+
+ /* if this colorbuffer is not being cleared */
+ if (!(*buffers & clear_bit))
+ continue;
+
+ unsigned level = fb->cbufs[i]->u.tex.level;
+ if (level > 0)
+ continue;
+
+ tex = (struct si_texture *)fb->cbufs[i]->texture;
+
+ /* TODO: GFX9: Implement DCC fast clear for level 0 of
+ * mipmapped textures. Mipmapped DCC has to clear a rectangular
+ * area of DCC for level 0 (because the whole miptree is
+ * organized in a 2D plane).
+ */
+ if (sctx->chip_class >= GFX9 && tex->buffer.b.b.last_level > 0)
+ continue;
+
+ /* the clear is allowed if all layers are bound */
+ if (fb->cbufs[i]->u.tex.first_layer != 0 ||
+ fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->buffer.b.b, 0)) {
+ continue;
+ }
+
+ /* only supported on tiled surfaces */
+ if (tex->surface.is_linear) {
+ continue;
+ }
+
+ /* shared textures can't use fast clear without an explicit flush,
+ * because there is no way to communicate the clear color among
+ * all clients
+ */
+ if (tex->buffer.b.is_shared &&
+ !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+ continue;
+
+ if (sctx->chip_class <= GFX8 && tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D &&
+ !sctx->screen->info.htile_cmask_support_1d_tiling)
+ continue;
+
+ /* Use a slow clear for small surfaces where the cost of
+ * the eliminate pass can be higher than the benefit of fast
+ * clear. The closed driver does this, but the numbers may differ.
+ *
+ * This helps on both dGPUs and APUs, even small APUs like Mullins.
+ */
+ bool too_small = tex->buffer.b.b.nr_samples <= 1 &&
+ tex->buffer.b.b.width0 * tex->buffer.b.b.height0 <= 512 * 512;
+ bool eliminate_needed = false;
+ bool fmask_decompress_needed = false;
+
+ /* Fast clear is the most appropriate place to enable DCC for
+ * displayable surfaces.
+ */
+ if (sctx->family == CHIP_STONEY && !too_small) {
+ vi_separate_dcc_try_enable(sctx, tex);
+
+ /* RB+ isn't supported with a CMASK-only clear on Stoney,
+ * so all clears are considered to be hypothetically slow
+ * clears, which is weighed when determining whether to
+ * enable separate DCC.
+ */
+ if (tex->dcc_gather_statistics) /* only for Stoney */
+ tex->num_slow_clears++;
+ }
+
+ /* Try to clear DCC first, otherwise try CMASK. */
+ if (vi_dcc_enabled(tex, 0)) {
+ uint32_t reset_value;
+
+ if (sctx->screen->debug_flags & DBG(NO_DCC_CLEAR))
+ continue;
+
+ if (!vi_get_fast_clear_parameters(sctx->screen, tex->buffer.b.b.format,
+ fb->cbufs[i]->format, color, &reset_value,
+ &eliminate_needed))
+ continue;
+
+ if (eliminate_needed && too_small)
+ continue;
+
+ /* TODO: This DCC+CMASK clear doesn't work with MSAA. */
+ if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer && eliminate_needed)
+ continue;
+
+ if (!vi_dcc_clear_level(sctx, tex, 0, reset_value))
+ continue;
+
+ tex->separate_dcc_dirty = true;
+ tex->displayable_dcc_dirty = true;
+
+ /* DCC fast clear with MSAA should clear CMASK to 0xC. */
+ if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
+ uint32_t clear_value = 0xCCCCCCCC;
+ si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->surface.cmask_offset,
+ tex->surface.cmask_size, &clear_value, 4, SI_COHERENCY_CB_META, false);
+ fmask_decompress_needed = true;
+ }
+ } else {
+ if (too_small)
+ continue;
+
+ /* 128-bit formats are unsupported */
+ if (tex->surface.bpe > 8) {
+ continue;
+ }
+
+ /* RB+ doesn't work with CMASK fast clear on Stoney. */
+ if (sctx->family == CHIP_STONEY)
+ continue;
+
+ /* ensure CMASK is enabled */
+ si_alloc_separate_cmask(sctx->screen, tex);
+ if (!tex->cmask_buffer)
+ continue;
+
+ /* Do the fast clear. */
+ uint32_t clear_value = 0;
+ si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->surface.cmask_offset,
+ tex->surface.cmask_size, &clear_value, 4, SI_COHERENCY_CB_META, false);
+ eliminate_needed = true;
+ }
+
+ if ((eliminate_needed || fmask_decompress_needed) &&
+ !(tex->dirty_level_mask & (1 << level))) {
+ tex->dirty_level_mask |= 1 << level;
+ p_atomic_inc(&sctx->screen->compressed_colortex_counter);
+ }
+
+ /* We can change the micro tile mode before a full clear. */
+ si_set_optimal_micro_tile_mode(sctx->screen, tex);
+
+ *buffers &= ~clear_bit;
+
+ /* Chips with DCC constant encoding don't need to set the clear
+ * color registers for DCC clear values 0 and 1.
+ */
+ if (sctx->screen->info.has_dcc_constant_encode && !eliminate_needed)
+ continue;
+
+ if (si_set_clear_color(tex, fb->cbufs[i]->format, color)) {
+ sctx->framebuffer.dirty_cbufs |= 1 << i;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+ }
+ }
}
static void si_clear(struct pipe_context *ctx, unsigned buffers,
- const union pipe_color_union *color,
- double depth, unsigned stencil)
+ const union pipe_color_union *color, double depth, unsigned stencil)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
- struct pipe_surface *zsbuf = fb->zsbuf;
- struct si_texture *zstex =
- zsbuf ? (struct si_texture*)zsbuf->texture : NULL;
- bool needs_db_flush = false;
-
- if (buffers & PIPE_CLEAR_COLOR) {
- si_do_fast_color_clear(sctx, &buffers, color);
- if (!buffers)
- return; /* all buffers have been fast cleared */
-
- /* These buffers cannot use fast clear, make sure to disable expansion. */
- for (unsigned i = 0; i < fb->nr_cbufs; i++) {
- struct si_texture *tex;
-
- /* If not clearing this buffer, skip. */
- if (!(buffers & (PIPE_CLEAR_COLOR0 << i)) || !fb->cbufs[i])
- continue;
-
- tex = (struct si_texture *)fb->cbufs[i]->texture;
- if (tex->surface.fmask_size == 0)
- tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level);
- }
- }
-
- if (zstex &&
- zsbuf->u.tex.first_layer == 0 &&
- zsbuf->u.tex.last_layer == util_max_layer(&zstex->buffer.b.b, 0)) {
- /* TC-compatible HTILE only supports depth clears to 0 or 1. */
- if (buffers & PIPE_CLEAR_DEPTH &&
- si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_Z) &&
- (!zstex->tc_compatible_htile ||
- depth == 0 || depth == 1)) {
- /* Need to disable EXPCLEAR temporarily if clearing
- * to a new value. */
- if (!zstex->depth_cleared || zstex->depth_clear_value != depth) {
- sctx->db_depth_disable_expclear = true;
- }
-
- if (zstex->depth_clear_value != (float)depth) {
- if ((zstex->depth_clear_value != 0) != (depth != 0)) {
- /* ZRANGE_PRECISION register of a bound surface will change so we
- * must flush the DB caches. */
- needs_db_flush = true;
- }
- /* Update DB_DEPTH_CLEAR. */
- zstex->depth_clear_value = depth;
- sctx->framebuffer.dirty_zsbuf = true;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
- }
- sctx->db_depth_clear = true;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
- }
-
- /* TC-compatible HTILE only supports stencil clears to 0. */
- if (buffers & PIPE_CLEAR_STENCIL &&
- si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_S) &&
- (!zstex->tc_compatible_htile || stencil == 0)) {
- stencil &= 0xff;
-
- /* Need to disable EXPCLEAR temporarily if clearing
- * to a new value. */
- if (!zstex->stencil_cleared || zstex->stencil_clear_value != stencil) {
- sctx->db_stencil_disable_expclear = true;
- }
-
- if (zstex->stencil_clear_value != (uint8_t)stencil) {
- /* Update DB_STENCIL_CLEAR. */
- zstex->stencil_clear_value = stencil;
- sctx->framebuffer.dirty_zsbuf = true;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
- }
- sctx->db_stencil_clear = true;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
- }
-
- if (needs_db_flush)
- sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
- }
-
- si_blitter_begin(sctx, SI_CLEAR);
- util_blitter_clear(sctx->blitter, fb->width, fb->height,
- util_framebuffer_get_num_layers(fb),
- buffers, color, depth, stencil,
- sctx->framebuffer.nr_samples > 1);
- si_blitter_end(sctx);
-
- if (sctx->db_depth_clear) {
- sctx->db_depth_clear = false;
- sctx->db_depth_disable_expclear = false;
- zstex->depth_cleared = true;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
- }
-
- if (sctx->db_stencil_clear) {
- sctx->db_stencil_clear = false;
- sctx->db_stencil_disable_expclear = false;
- zstex->stencil_cleared = true;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
- }
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
+ struct pipe_surface *zsbuf = fb->zsbuf;
+ struct si_texture *zstex = zsbuf ? (struct si_texture *)zsbuf->texture : NULL;
+ bool needs_db_flush = false;
+
+ if (buffers & PIPE_CLEAR_COLOR) {
+ si_do_fast_color_clear(sctx, &buffers, color);
+ if (!buffers)
+ return; /* all buffers have been fast cleared */
+
+ /* These buffers cannot use fast clear, make sure to disable expansion. */
+ for (unsigned i = 0; i < fb->nr_cbufs; i++) {
+ struct si_texture *tex;
+
+ /* If not clearing this buffer, skip. */
+ if (!(buffers & (PIPE_CLEAR_COLOR0 << i)) || !fb->cbufs[i])
+ continue;
+
+ tex = (struct si_texture *)fb->cbufs[i]->texture;
+ if (tex->surface.fmask_size == 0)
+ tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level);
+ }
+ }
+
+ if (zstex && zsbuf->u.tex.first_layer == 0 &&
+ zsbuf->u.tex.last_layer == util_max_layer(&zstex->buffer.b.b, 0)) {
+ /* TC-compatible HTILE only supports depth clears to 0 or 1. */
+ if (buffers & PIPE_CLEAR_DEPTH && si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_Z) &&
+ (!zstex->tc_compatible_htile || depth == 0 || depth == 1)) {
+ /* Need to disable EXPCLEAR temporarily if clearing
+ * to a new value. */
+ if (!zstex->depth_cleared || zstex->depth_clear_value != depth) {
+ sctx->db_depth_disable_expclear = true;
+ }
+
+ if (zstex->depth_clear_value != (float)depth) {
+ if ((zstex->depth_clear_value != 0) != (depth != 0)) {
+ /* ZRANGE_PRECISION register of a bound surface will change so we
+ * must flush the DB caches. */
+ needs_db_flush = true;
+ }
+ /* Update DB_DEPTH_CLEAR. */
+ zstex->depth_clear_value = depth;
+ sctx->framebuffer.dirty_zsbuf = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+ }
+ sctx->db_depth_clear = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ }
+
+ /* TC-compatible HTILE only supports stencil clears to 0. */
+ if (buffers & PIPE_CLEAR_STENCIL &&
+ si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_S) &&
+ (!zstex->tc_compatible_htile || stencil == 0)) {
+ stencil &= 0xff;
+
+ /* Need to disable EXPCLEAR temporarily if clearing
+ * to a new value. */
+ if (!zstex->stencil_cleared || zstex->stencil_clear_value != stencil) {
+ sctx->db_stencil_disable_expclear = true;
+ }
+
+ if (zstex->stencil_clear_value != (uint8_t)stencil) {
+ /* Update DB_STENCIL_CLEAR. */
+ zstex->stencil_clear_value = stencil;
+ sctx->framebuffer.dirty_zsbuf = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+ }
+ sctx->db_stencil_clear = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ }
+
+ if (needs_db_flush)
+ sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
+ }
+
+ si_blitter_begin(sctx, SI_CLEAR);
+ util_blitter_clear(sctx->blitter, fb->width, fb->height, util_framebuffer_get_num_layers(fb),
+ buffers, color, depth, stencil, sctx->framebuffer.nr_samples > 1);
+ si_blitter_end(sctx);
+
+ if (sctx->db_depth_clear) {
+ sctx->db_depth_clear = false;
+ sctx->db_depth_disable_expclear = false;
+ zstex->depth_cleared = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ }
+
+ if (sctx->db_stencil_clear) {
+ sctx->db_stencil_clear = false;
+ sctx->db_stencil_disable_expclear = false;
+ zstex->stencil_cleared = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ }
}
-static void si_clear_render_target(struct pipe_context *ctx,
- struct pipe_surface *dst,
- const union pipe_color_union *color,
- unsigned dstx, unsigned dsty,
- unsigned width, unsigned height,
- bool render_condition_enabled)
+static void si_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dst,
+ const union pipe_color_union *color, unsigned dstx,
+ unsigned dsty, unsigned width, unsigned height,
+ bool render_condition_enabled)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_texture *sdst = (struct si_texture*)dst->texture;
-
- if (dst->texture->nr_samples <= 1 && !sdst->surface.dcc_offset) {
- si_compute_clear_render_target(ctx, dst, color, dstx, dsty, width,
- height, render_condition_enabled);
- return;
- }
-
- si_blitter_begin(sctx, SI_CLEAR_SURFACE |
- (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
- util_blitter_clear_render_target(sctx->blitter, dst, color,
- dstx, dsty, width, height);
- si_blitter_end(sctx);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_texture *sdst = (struct si_texture *)dst->texture;
+
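+   /* Single-sample surfaces without DCC are cleared with a compute shader;
+    * everything else goes through the blitter below. */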
+ if (dst->texture->nr_samples <= 1 && !sdst->surface.dcc_offset) {
+ si_compute_clear_render_target(ctx, dst, color, dstx, dsty, width, height,
+ render_condition_enabled);
+ return;
+ }
+
+ si_blitter_begin(sctx,
+ SI_CLEAR_SURFACE | (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
+ util_blitter_clear_render_target(sctx->blitter, dst, color, dstx, dsty, width, height);
+ si_blitter_end(sctx);
}
-static void si_clear_depth_stencil(struct pipe_context *ctx,
- struct pipe_surface *dst,
- unsigned clear_flags,
- double depth,
- unsigned stencil,
- unsigned dstx, unsigned dsty,
- unsigned width, unsigned height,
- bool render_condition_enabled)
+static void si_clear_depth_stencil(struct pipe_context *ctx, struct pipe_surface *dst,
+ unsigned clear_flags, double depth, unsigned stencil,
+ unsigned dstx, unsigned dsty, unsigned width, unsigned height,
+ bool render_condition_enabled)
{
- struct si_context *sctx = (struct si_context *)ctx;
+ struct si_context *sctx = (struct si_context *)ctx;
- si_blitter_begin(sctx, SI_CLEAR_SURFACE |
- (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
- util_blitter_clear_depth_stencil(sctx->blitter, dst, clear_flags, depth, stencil,
- dstx, dsty, width, height);
- si_blitter_end(sctx);
+ si_blitter_begin(sctx,
+ SI_CLEAR_SURFACE | (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
+ util_blitter_clear_depth_stencil(sctx->blitter, dst, clear_flags, depth, stencil, dstx, dsty,
+ width, height);
+ si_blitter_end(sctx);
}
-static void si_clear_texture(struct pipe_context *pipe,
- struct pipe_resource *tex,
- unsigned level,
- const struct pipe_box *box,
- const void *data)
+static void si_clear_texture(struct pipe_context *pipe, struct pipe_resource *tex, unsigned level,
+ const struct pipe_box *box, const void *data)
{
- struct pipe_screen *screen = pipe->screen;
- struct si_texture *stex = (struct si_texture*)tex;
- struct pipe_surface tmpl = {{0}};
- struct pipe_surface *sf;
-
- tmpl.format = tex->format;
- tmpl.u.tex.first_layer = box->z;
- tmpl.u.tex.last_layer = box->z + box->depth - 1;
- tmpl.u.tex.level = level;
- sf = pipe->create_surface(pipe, tex, &tmpl);
- if (!sf)
- return;
-
- if (stex->is_depth) {
- unsigned clear;
- float depth;
- uint8_t stencil = 0;
-
- /* Depth is always present. */
- clear = PIPE_CLEAR_DEPTH;
- util_format_unpack_z_float(tex->format, &depth, data, 1);
-
- if (stex->surface.has_stencil) {
- clear |= PIPE_CLEAR_STENCIL;
- util_format_unpack_s_8uint(tex->format,
- &stencil, data, 1);
- }
-
- si_clear_depth_stencil(pipe, sf, clear, depth, stencil,
- box->x, box->y,
- box->width, box->height, false);
- } else {
- union pipe_color_union color;
-
- util_format_unpack_rgba(tex->format, color.ui, data, 1);
-
- if (screen->is_format_supported(screen, tex->format,
- tex->target, 0, 0,
- PIPE_BIND_RENDER_TARGET)) {
- si_clear_render_target(pipe, sf, &color,
- box->x, box->y,
- box->width, box->height, false);
- } else {
- /* Software fallback - just for R9G9B9E5_FLOAT */
- util_clear_render_target(pipe, sf, &color,
- box->x, box->y,
- box->width, box->height);
- }
- }
- pipe_surface_reference(&sf, NULL);
+ struct pipe_screen *screen = pipe->screen;
+ struct si_texture *stex = (struct si_texture *)tex;
+ struct pipe_surface tmpl = {{0}};
+ struct pipe_surface *sf;
+
+ tmpl.format = tex->format;
+ tmpl.u.tex.first_layer = box->z;
+ tmpl.u.tex.last_layer = box->z + box->depth - 1;
+ tmpl.u.tex.level = level;
+ sf = pipe->create_surface(pipe, tex, &tmpl);
+ if (!sf)
+ return;
+
+ if (stex->is_depth) {
+ unsigned clear;
+ float depth;
+ uint8_t stencil = 0;
+
+ /* Depth is always present. */
+ clear = PIPE_CLEAR_DEPTH;
+ util_format_unpack_z_float(tex->format, &depth, data, 1);
+
+ if (stex->surface.has_stencil) {
+ clear |= PIPE_CLEAR_STENCIL;
+ util_format_unpack_s_8uint(tex->format, &stencil, data, 1);
+ }
+
+ si_clear_depth_stencil(pipe, sf, clear, depth, stencil, box->x, box->y, box->width,
+ box->height, false);
+ } else {
+ union pipe_color_union color;
+
+ util_format_unpack_rgba(tex->format, color.ui, data, 1);
+
+ if (screen->is_format_supported(screen, tex->format, tex->target, 0, 0,
+ PIPE_BIND_RENDER_TARGET)) {
+ si_clear_render_target(pipe, sf, &color, box->x, box->y, box->width, box->height, false);
+ } else {
+ /* Software fallback - just for R9G9B9E5_FLOAT */
+ util_clear_render_target(pipe, sf, &color, box->x, box->y, box->width, box->height);
+ }
+ }
+ pipe_surface_reference(&sf, NULL);
}
void si_init_clear_functions(struct si_context *sctx)
{
- sctx->b.clear_render_target = si_clear_render_target;
- sctx->b.clear_texture = si_clear_texture;
+ sctx->b.clear_render_target = si_clear_render_target;
+ sctx->b.clear_texture = si_clear_texture;
- if (sctx->has_graphics) {
- sctx->b.clear = si_clear;
- sctx->b.clear_depth_stencil = si_clear_depth_stencil;
- }
+ if (sctx->has_graphics) {
+ sctx->b.clear = si_clear;
+ sctx->b.clear_depth_stencil = si_clear_depth_stencil;
+ }
}
*
*/
-#include "nir/tgsi_to_nir.h"
-#include "util/u_async_debug.h"
-#include "util/u_memory.h"
-#include "util/u_upload_mgr.h"
+#include "si_compute.h"
#include "ac_rtld.h"
#include "amd_kernel_code_t.h"
+#include "nir/tgsi_to_nir.h"
#include "si_build_pm4.h"
-#include "si_compute.h"
+#include "util/u_async_debug.h"
+#include "util/u_memory.h"
+#include "util/u_upload_mgr.h"
-#define COMPUTE_DBG(sscreen, fmt, args...) \
- do { \
- if ((sscreen->debug_flags & DBG(COMPUTE))) fprintf(stderr, fmt, ##args); \
- } while (0);
+#define COMPUTE_DBG(sscreen, fmt, args...) \
+ do { \
+ if ((sscreen->debug_flags & DBG(COMPUTE))) \
+ fprintf(stderr, fmt, ##args); \
+ } while (0);
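+/* Follows the HSA kernel dispatch packet layout. */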
struct dispatch_packet {
- uint16_t header;
- uint16_t setup;
- uint16_t workgroup_size_x;
- uint16_t workgroup_size_y;
- uint16_t workgroup_size_z;
- uint16_t reserved0;
- uint32_t grid_size_x;
- uint32_t grid_size_y;
- uint32_t grid_size_z;
- uint32_t private_segment_size;
- uint32_t group_segment_size;
- uint64_t kernel_object;
- uint64_t kernarg_address;
- uint64_t reserved2;
+ uint16_t header;
+ uint16_t setup;
+ uint16_t workgroup_size_x;
+ uint16_t workgroup_size_y;
+ uint16_t workgroup_size_z;
+ uint16_t reserved0;
+ uint32_t grid_size_x;
+ uint32_t grid_size_y;
+ uint32_t grid_size_z;
+ uint32_t private_segment_size;
+ uint32_t group_segment_size;
+ uint64_t kernel_object;
+ uint64_t kernarg_address;
+ uint64_t reserved2;
};
-static const amd_kernel_code_t *si_compute_get_code_object(
- const struct si_compute *program,
- uint64_t symbol_offset)
+static const amd_kernel_code_t *si_compute_get_code_object(const struct si_compute *program,
+ uint64_t symbol_offset)
{
- const struct si_shader_selector *sel = &program->sel;
+ const struct si_shader_selector *sel = &program->sel;
- if (program->ir_type != PIPE_SHADER_IR_NATIVE)
- return NULL;
+ if (program->ir_type != PIPE_SHADER_IR_NATIVE)
+ return NULL;
- struct ac_rtld_binary rtld;
- if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
- .info = &sel->screen->info,
- .shader_type = MESA_SHADER_COMPUTE,
- .wave_size = sel->screen->compute_wave_size,
- .num_parts = 1,
- .elf_ptrs = &program->shader.binary.elf_buffer,
- .elf_sizes = &program->shader.binary.elf_size }))
- return NULL;
+ struct ac_rtld_binary rtld;
+ if (!ac_rtld_open(&rtld,
+ (struct ac_rtld_open_info){.info = &sel->screen->info,
+ .shader_type = MESA_SHADER_COMPUTE,
+ .wave_size = sel->screen->compute_wave_size,
+ .num_parts = 1,
+ .elf_ptrs = &program->shader.binary.elf_buffer,
+ .elf_sizes = &program->shader.binary.elf_size}))
+ return NULL;
- const amd_kernel_code_t *result = NULL;
- const char *text;
- size_t size;
- if (!ac_rtld_get_section_by_name(&rtld, ".text", &text, &size))
- goto out;
+ const amd_kernel_code_t *result = NULL;
+ const char *text;
+ size_t size;
+ if (!ac_rtld_get_section_by_name(&rtld, ".text", &text, &size))
+ goto out;
- if (symbol_offset + sizeof(amd_kernel_code_t) > size)
- goto out;
+ if (symbol_offset + sizeof(amd_kernel_code_t) > size)
+ goto out;
- result = (const amd_kernel_code_t*)(text + symbol_offset);
+ result = (const amd_kernel_code_t *)(text + symbol_offset);
out:
- ac_rtld_close(&rtld);
- return result;
+ ac_rtld_close(&rtld);
+ return result;
}
static void code_object_to_config(const amd_kernel_code_t *code_object,
- struct ac_shader_config *out_config) {
-
- uint32_t rsrc1 = code_object->compute_pgm_resource_registers;
- uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32;
- out_config->num_sgprs = code_object->wavefront_sgpr_count;
- out_config->num_vgprs = code_object->workitem_vgpr_count;
- out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1);
- out_config->rsrc1 = rsrc1;
- out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2));
- out_config->rsrc2 = rsrc2;
- out_config->scratch_bytes_per_wave =
- align(code_object->workitem_private_segment_byte_size * 64, 1024);
+ struct ac_shader_config *out_config)
+{
+
+ uint32_t rsrc1 = code_object->compute_pgm_resource_registers;
+ uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32;
+ out_config->num_sgprs = code_object->wavefront_sgpr_count;
+ out_config->num_vgprs = code_object->workitem_vgpr_count;
+ out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1);
+ out_config->rsrc1 = rsrc1;
+ out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2));
+ out_config->rsrc2 = rsrc2;
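+   /* Per-wave scratch: the per-work-item private segment size times 64 lanes,
+    * aligned to 1 KiB. */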
+ out_config->scratch_bytes_per_wave =
+ align(code_object->workitem_private_segment_byte_size * 64, 1024);
}
/* Asynchronous compute shader compilation. */
static void si_create_compute_state_async(void *job, int thread_index)
{
- struct si_compute *program = (struct si_compute *)job;
- struct si_shader_selector *sel = &program->sel;
- struct si_shader *shader = &program->shader;
- struct ac_llvm_compiler *compiler;
- struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug;
- struct si_screen *sscreen = sel->screen;
-
- assert(!debug->debug_message || debug->async);
- assert(thread_index >= 0);
- assert(thread_index < ARRAY_SIZE(sscreen->compiler));
- compiler = &sscreen->compiler[thread_index];
-
- if (!compiler->passes)
- si_init_compiler(sscreen, compiler);
-
- assert(program->ir_type == PIPE_SHADER_IR_NIR);
- si_nir_scan_shader(sel->nir, &sel->info);
-
- /* Store the declared LDS size into si_shader_info for the shader
- * cache to include it.
- */
- sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE] = program->local_size;
-
- si_get_active_slot_masks(&sel->info,
- &sel->active_const_and_shader_buffers,
- &sel->active_samplers_and_images);
-
- program->shader.is_monolithic = true;
- program->reads_variable_block_size =
- sel->info.uses_block_size &&
- sel->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
- program->num_cs_user_data_dwords =
- sel->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
-
- unsigned char ir_sha1_cache_key[20];
- si_get_ir_cache_key(sel, false, false, ir_sha1_cache_key);
-
- /* Try to load the shader from the shader cache. */
- simple_mtx_lock(&sscreen->shader_cache_mutex);
-
- if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) {
- simple_mtx_unlock(&sscreen->shader_cache_mutex);
-
- si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
- si_shader_dump(sscreen, shader, debug, stderr, true);
-
- if (!si_shader_binary_upload(sscreen, shader, 0))
- program->shader.compilation_failed = true;
- } else {
- simple_mtx_unlock(&sscreen->shader_cache_mutex);
-
- if (!si_create_shader_variant(sscreen, compiler, &program->shader, debug)) {
- program->shader.compilation_failed = true;
- return;
- }
-
- bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0;
- unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS +
- (sel->info.uses_grid_size ? 3 : 0) +
- (program->reads_variable_block_size ? 3 : 0) +
- program->num_cs_user_data_dwords;
-
- shader->config.rsrc1 =
- S_00B848_VGPRS((shader->config.num_vgprs - 1) /
- (sscreen->compute_wave_size == 32 ? 8 : 4)) |
- S_00B848_DX10_CLAMP(1) |
- S_00B848_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
- S_00B848_WGP_MODE(sscreen->info.chip_class >= GFX10) |
- S_00B848_FLOAT_MODE(shader->config.float_mode);
-
- if (sscreen->info.chip_class < GFX10) {
- shader->config.rsrc1 |=
- S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8);
- }
-
- shader->config.rsrc2 =
- S_00B84C_USER_SGPR(user_sgprs) |
- S_00B84C_SCRATCH_EN(scratch_enabled) |
- S_00B84C_TGID_X_EN(sel->info.uses_block_id[0]) |
- S_00B84C_TGID_Y_EN(sel->info.uses_block_id[1]) |
- S_00B84C_TGID_Z_EN(sel->info.uses_block_id[2]) |
- S_00B84C_TG_SIZE_EN(sel->info.uses_subgroup_info) |
- S_00B84C_TIDIG_COMP_CNT(sel->info.uses_thread_id[2] ? 2 :
- sel->info.uses_thread_id[1] ? 1 : 0) |
- S_00B84C_LDS_SIZE(shader->config.lds_size);
-
- simple_mtx_lock(&sscreen->shader_cache_mutex);
- si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key,
- shader, true);
- simple_mtx_unlock(&sscreen->shader_cache_mutex);
- }
-
- ralloc_free(sel->nir);
- sel->nir = NULL;
+ struct si_compute *program = (struct si_compute *)job;
+ struct si_shader_selector *sel = &program->sel;
+ struct si_shader *shader = &program->shader;
+ struct ac_llvm_compiler *compiler;
+ struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug;
+ struct si_screen *sscreen = sel->screen;
+
+ assert(!debug->debug_message || debug->async);
+ assert(thread_index >= 0);
+ assert(thread_index < ARRAY_SIZE(sscreen->compiler));
+ compiler = &sscreen->compiler[thread_index];
+
+ if (!compiler->passes)
+ si_init_compiler(sscreen, compiler);
+
+ assert(program->ir_type == PIPE_SHADER_IR_NIR);
+ si_nir_scan_shader(sel->nir, &sel->info);
+
+ /* Store the declared LDS size into si_shader_info for the shader
+ * cache to include it.
+ */
+ sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE] = program->local_size;
+
+ si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers,
+ &sel->active_samplers_and_images);
+
+ program->shader.is_monolithic = true;
+ program->reads_variable_block_size =
+ sel->info.uses_block_size && sel->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
+ program->num_cs_user_data_dwords =
+ sel->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
+
+ unsigned char ir_sha1_cache_key[20];
+ si_get_ir_cache_key(sel, false, false, ir_sha1_cache_key);
+
+ /* Try to load the shader from the shader cache. */
+ simple_mtx_lock(&sscreen->shader_cache_mutex);
+
+ if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) {
+ simple_mtx_unlock(&sscreen->shader_cache_mutex);
+
+ si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
+ si_shader_dump(sscreen, shader, debug, stderr, true);
+
+ if (!si_shader_binary_upload(sscreen, shader, 0))
+ program->shader.compilation_failed = true;
+ } else {
+ simple_mtx_unlock(&sscreen->shader_cache_mutex);
+
+ if (!si_create_shader_variant(sscreen, compiler, &program->shader, debug)) {
+ program->shader.compilation_failed = true;
+ return;
+ }
+
+ bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0;
+ unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + (sel->info.uses_grid_size ? 3 : 0) +
+ (program->reads_variable_block_size ? 3 : 0) +
+ program->num_cs_user_data_dwords;
+
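+      /* VGPRs are allocated in granules of 4 for wave64 and 8 for wave32;
+       * SGPRs in granules of 8 (only programmed before gfx10). */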
+ shader->config.rsrc1 = S_00B848_VGPRS((shader->config.num_vgprs - 1) /
+ (sscreen->compute_wave_size == 32 ? 8 : 4)) |
+ S_00B848_DX10_CLAMP(1) |
+ S_00B848_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
+ S_00B848_WGP_MODE(sscreen->info.chip_class >= GFX10) |
+ S_00B848_FLOAT_MODE(shader->config.float_mode);
+
+ if (sscreen->info.chip_class < GFX10) {
+ shader->config.rsrc1 |= S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8);
+ }
+
+ shader->config.rsrc2 = S_00B84C_USER_SGPR(user_sgprs) | S_00B84C_SCRATCH_EN(scratch_enabled) |
+ S_00B84C_TGID_X_EN(sel->info.uses_block_id[0]) |
+ S_00B84C_TGID_Y_EN(sel->info.uses_block_id[1]) |
+ S_00B84C_TGID_Z_EN(sel->info.uses_block_id[2]) |
+ S_00B84C_TG_SIZE_EN(sel->info.uses_subgroup_info) |
+ S_00B84C_TIDIG_COMP_CNT(sel->info.uses_thread_id[2]
+ ? 2
+ : sel->info.uses_thread_id[1] ? 1 : 0) |
+ S_00B84C_LDS_SIZE(shader->config.lds_size);
+
+ simple_mtx_lock(&sscreen->shader_cache_mutex);
+ si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, true);
+ simple_mtx_unlock(&sscreen->shader_cache_mutex);
+ }
+
+ ralloc_free(sel->nir);
+ sel->nir = NULL;
}
-static void *si_create_compute_state(
- struct pipe_context *ctx,
- const struct pipe_compute_state *cso)
+static void *si_create_compute_state(struct pipe_context *ctx, const struct pipe_compute_state *cso)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_screen *sscreen = (struct si_screen *)ctx->screen;
- struct si_compute *program = CALLOC_STRUCT(si_compute);
- struct si_shader_selector *sel = &program->sel;
-
- pipe_reference_init(&sel->base.reference, 1);
- sel->type = PIPE_SHADER_COMPUTE;
- sel->screen = sscreen;
- program->shader.selector = &program->sel;
- program->ir_type = cso->ir_type;
- program->local_size = cso->req_local_mem;
- program->private_size = cso->req_private_mem;
- program->input_size = cso->req_input_mem;
-
- if (cso->ir_type != PIPE_SHADER_IR_NATIVE) {
- if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
- program->ir_type = PIPE_SHADER_IR_NIR;
- sel->nir = tgsi_to_nir(cso->prog, ctx->screen);
- } else {
- assert(cso->ir_type == PIPE_SHADER_IR_NIR);
- sel->nir = (struct nir_shader *) cso->prog;
- }
-
- sel->compiler_ctx_state.debug = sctx->debug;
- sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
- p_atomic_inc(&sscreen->num_shaders_created);
-
- si_schedule_initial_compile(sctx, PIPE_SHADER_COMPUTE,
- &sel->ready,
- &sel->compiler_ctx_state,
- program, si_create_compute_state_async);
- } else {
- const struct pipe_binary_program_header *header;
- header = cso->prog;
-
- program->shader.binary.elf_size = header->num_bytes;
- program->shader.binary.elf_buffer = malloc(header->num_bytes);
- if (!program->shader.binary.elf_buffer) {
- FREE(program);
- return NULL;
- }
- memcpy((void *)program->shader.binary.elf_buffer, header->blob, header->num_bytes);
-
- const amd_kernel_code_t *code_object =
- si_compute_get_code_object(program, 0);
- code_object_to_config(code_object, &program->shader.config);
-
- si_shader_dump(sctx->screen, &program->shader, &sctx->debug, stderr, true);
- if (!si_shader_binary_upload(sctx->screen, &program->shader, 0)) {
- fprintf(stderr, "LLVM failed to upload shader\n");
- free((void *)program->shader.binary.elf_buffer);
- FREE(program);
- return NULL;
- }
- }
-
- return program;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+ struct si_compute *program = CALLOC_STRUCT(si_compute);
+ struct si_shader_selector *sel = &program->sel;
+
+ pipe_reference_init(&sel->base.reference, 1);
+ sel->type = PIPE_SHADER_COMPUTE;
+ sel->screen = sscreen;
+ program->shader.selector = &program->sel;
+ program->ir_type = cso->ir_type;
+ program->local_size = cso->req_local_mem;
+ program->private_size = cso->req_private_mem;
+ program->input_size = cso->req_input_mem;
+
+ if (cso->ir_type != PIPE_SHADER_IR_NATIVE) {
+ if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
+ program->ir_type = PIPE_SHADER_IR_NIR;
+ sel->nir = tgsi_to_nir(cso->prog, ctx->screen);
+ } else {
+ assert(cso->ir_type == PIPE_SHADER_IR_NIR);
+ sel->nir = (struct nir_shader *)cso->prog;
+ }
+
+ sel->compiler_ctx_state.debug = sctx->debug;
+ sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
+ p_atomic_inc(&sscreen->num_shaders_created);
+
+ si_schedule_initial_compile(sctx, PIPE_SHADER_COMPUTE, &sel->ready, &sel->compiler_ctx_state,
+ program, si_create_compute_state_async);
+ } else {
+ const struct pipe_binary_program_header *header;
+ header = cso->prog;
+
+ program->shader.binary.elf_size = header->num_bytes;
+ program->shader.binary.elf_buffer = malloc(header->num_bytes);
+ if (!program->shader.binary.elf_buffer) {
+ FREE(program);
+ return NULL;
+ }
+ memcpy((void *)program->shader.binary.elf_buffer, header->blob, header->num_bytes);
+
+ const amd_kernel_code_t *code_object = si_compute_get_code_object(program, 0);
+ code_object_to_config(code_object, &program->shader.config);
+
+ si_shader_dump(sctx->screen, &program->shader, &sctx->debug, stderr, true);
+ if (!si_shader_binary_upload(sctx->screen, &program->shader, 0)) {
+ fprintf(stderr, "LLVM failed to upload shader\n");
+ free((void *)program->shader.binary.elf_buffer);
+ FREE(program);
+ return NULL;
+ }
+ }
+
+ return program;
}
static void si_bind_compute_state(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_compute *program = (struct si_compute*)state;
- struct si_shader_selector *sel = &program->sel;
-
- sctx->cs_shader_state.program = program;
- if (!program)
- return;
-
- /* Wait because we need active slot usage masks. */
- if (program->ir_type != PIPE_SHADER_IR_NATIVE)
- util_queue_fence_wait(&sel->ready);
-
- si_set_active_descriptors(sctx,
- SI_DESCS_FIRST_COMPUTE +
- SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
- sel->active_const_and_shader_buffers);
- si_set_active_descriptors(sctx,
- SI_DESCS_FIRST_COMPUTE +
- SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
- sel->active_samplers_and_images);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_compute *program = (struct si_compute *)state;
+ struct si_shader_selector *sel = &program->sel;
+
+ sctx->cs_shader_state.program = program;
+ if (!program)
+ return;
+
+ /* Wait because we need active slot usage masks. */
+ if (program->ir_type != PIPE_SHADER_IR_NATIVE)
+ util_queue_fence_wait(&sel->ready);
+
+ si_set_active_descriptors(sctx,
+ SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+ sel->active_const_and_shader_buffers);
+ si_set_active_descriptors(sctx, SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+ sel->active_samplers_and_images);
}
-static void si_set_global_binding(
- struct pipe_context *ctx, unsigned first, unsigned n,
- struct pipe_resource **resources,
- uint32_t **handles)
+static void si_set_global_binding(struct pipe_context *ctx, unsigned first, unsigned n,
+ struct pipe_resource **resources, uint32_t **handles)
{
- unsigned i;
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_compute *program = sctx->cs_shader_state.program;
-
- if (first + n > program->max_global_buffers) {
- unsigned old_max = program->max_global_buffers;
- program->max_global_buffers = first + n;
- program->global_buffers =
- realloc(program->global_buffers,
- program->max_global_buffers *
- sizeof(program->global_buffers[0]));
- if (!program->global_buffers) {
- fprintf(stderr, "radeonsi: failed to allocate compute global_buffers\n");
- return;
- }
-
- memset(&program->global_buffers[old_max], 0,
- (program->max_global_buffers - old_max) *
- sizeof(program->global_buffers[0]));
- }
-
- if (!resources) {
- for (i = 0; i < n; i++) {
- pipe_resource_reference(&program->global_buffers[first + i], NULL);
- }
- return;
- }
-
- for (i = 0; i < n; i++) {
- uint64_t va;
- uint32_t offset;
- pipe_resource_reference(&program->global_buffers[first + i], resources[i]);
- va = si_resource(resources[i])->gpu_address;
- offset = util_le32_to_cpu(*handles[i]);
- va += offset;
- va = util_cpu_to_le64(va);
- memcpy(handles[i], &va, sizeof(va));
- }
+ unsigned i;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_compute *program = sctx->cs_shader_state.program;
+
+ if (first + n > program->max_global_buffers) {
+ unsigned old_max = program->max_global_buffers;
+ program->max_global_buffers = first + n;
+ program->global_buffers = realloc(
+ program->global_buffers, program->max_global_buffers * sizeof(program->global_buffers[0]));
+ if (!program->global_buffers) {
+ fprintf(stderr, "radeonsi: failed to allocate compute global_buffers\n");
+ return;
+ }
+
+ memset(&program->global_buffers[old_max], 0,
+ (program->max_global_buffers - old_max) * sizeof(program->global_buffers[0]));
+ }
+
+ if (!resources) {
+ for (i = 0; i < n; i++) {
+ pipe_resource_reference(&program->global_buffers[first + i], NULL);
+ }
+ return;
+ }
+
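+   /* Patch each handle in place: the stored 32-bit buffer offset becomes
+    * an absolute 64-bit little-endian GPU address. */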
+ for (i = 0; i < n; i++) {
+ uint64_t va;
+ uint32_t offset;
+ pipe_resource_reference(&program->global_buffers[first + i], resources[i]);
+ va = si_resource(resources[i])->gpu_address;
+ offset = util_le32_to_cpu(*handles[i]);
+ va += offset;
+ va = util_cpu_to_le64(va);
+ memcpy(handles[i], &va, sizeof(va));
+ }
}
void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
- uint64_t bc_va;
-
- radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
- /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1,
- * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */
- radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
- radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
-
- if (sctx->chip_class >= GFX7) {
- /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
- radeon_set_sh_reg_seq(cs,
- R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
- radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) |
- S_00B858_SH1_CU_EN(0xffff));
- radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) |
- S_00B858_SH1_CU_EN(0xffff));
- }
-
- if (sctx->chip_class >= GFX10)
- radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
-
- /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
- * and is now per pipe, so it should be handled in the
- * kernel if we want to use something other than the default value,
- * which is now 0x22f.
- */
- if (sctx->chip_class <= GFX6) {
- /* XXX: This should be:
- * (number of compute units) * 4 * (waves per simd) - 1 */
-
- radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID,
- 0x190 /* Default value */);
- }
-
- /* Set the pointer to border colors. */
- bc_va = sctx->border_color_buffer->gpu_address;
-
- if (sctx->chip_class >= GFX7) {
- radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2);
- radeon_emit(cs, bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */
- radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
- } else {
- if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) {
- radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR,
- bc_va >> 8);
- }
- }
+ uint64_t bc_va;
+
+ radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
+ /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1,
+ * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */
+ radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
+ radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
+
+ if (sctx->chip_class >= GFX7) {
+ /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
+ radeon_set_sh_reg_seq(cs, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
+ radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
+ radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
+ }
+
+ if (sctx->chip_class >= GFX10)
+ radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
+
+ /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
+ * and is now per pipe, so it should be handled in the
+ * kernel if we want to use something other than the default value,
+ * which is now 0x22f.
+ */
+ if (sctx->chip_class <= GFX6) {
+ /* XXX: This should be:
+ * (number of compute units) * 4 * (waves per simd) - 1 */
+
+ radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */);
+ }
+
+ /* Set the pointer to border colors. */
+ bc_va = sctx->border_color_buffer->gpu_address;
+
+ if (sctx->chip_class >= GFX7) {
+ radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2);
+ radeon_emit(cs, bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */
+ radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
+ } else {
+ if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) {
+ radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8);
+ }
+ }
}
-static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
- struct si_shader *shader,
+static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader,
struct ac_shader_config *config)
{
- uint64_t scratch_bo_size, scratch_needed;
- scratch_bo_size = 0;
- scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves;
- if (sctx->compute_scratch_buffer)
- scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;
+ uint64_t scratch_bo_size, scratch_needed;
+ scratch_bo_size = 0;
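+   /* The scratch buffer must cover the maximum number of in-flight waves. */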
+ scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves;
+ if (sctx->compute_scratch_buffer)
+ scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;
- if (scratch_bo_size < scratch_needed) {
- si_resource_reference(&sctx->compute_scratch_buffer, NULL);
+ if (scratch_bo_size < scratch_needed) {
+ si_resource_reference(&sctx->compute_scratch_buffer, NULL);
- sctx->compute_scratch_buffer =
- si_aligned_buffer_create(&sctx->screen->b,
- SI_RESOURCE_FLAG_UNMAPPABLE,
- PIPE_USAGE_DEFAULT,
- scratch_needed,
- sctx->screen->info.pte_fragment_size);
+ sctx->compute_scratch_buffer =
+ si_aligned_buffer_create(&sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
+ scratch_needed, sctx->screen->info.pte_fragment_size);
- if (!sctx->compute_scratch_buffer)
- return false;
- }
+ if (!sctx->compute_scratch_buffer)
+ return false;
+ }
- if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) {
- uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
+ if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) {
+ uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
- if (!si_shader_binary_upload(sctx->screen, shader, scratch_va))
- return false;
+ if (!si_shader_binary_upload(sctx->screen, shader, scratch_va))
+ return false;
- si_resource_reference(&shader->scratch_bo,
- sctx->compute_scratch_buffer);
- }
+ si_resource_reference(&shader->scratch_bo, sctx->compute_scratch_buffer);
+ }
- return true;
+ return true;
}
-static bool si_switch_compute_shader(struct si_context *sctx,
- struct si_compute *program,
- struct si_shader *shader,
- const amd_kernel_code_t *code_object,
- unsigned offset)
+static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute *program,
+ struct si_shader *shader, const amd_kernel_code_t *code_object,
+ unsigned offset)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- struct ac_shader_config inline_config = {0};
- struct ac_shader_config *config;
- uint64_t shader_va;
-
- if (sctx->cs_shader_state.emitted_program == program &&
- sctx->cs_shader_state.offset == offset)
- return true;
-
- if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
- config = &shader->config;
- } else {
- unsigned lds_blocks;
-
- config = &inline_config;
- code_object_to_config(code_object, config);
-
- lds_blocks = config->lds_size;
- /* XXX: We are over allocating LDS. For GFX6, the shader reports
- * LDS in blocks of 256 bytes, so if there are 4 bytes lds
- * allocated in the shader and 4 bytes allocated by the state
- * tracker, then we will set LDS_SIZE to 512 bytes rather than 256.
- */
- if (sctx->chip_class <= GFX6) {
- lds_blocks += align(program->local_size, 256) >> 8;
- } else {
- lds_blocks += align(program->local_size, 512) >> 9;
- }
-
- /* TODO: use si_multiwave_lds_size_workaround */
- assert(lds_blocks <= 0xFF);
-
- config->rsrc2 &= C_00B84C_LDS_SIZE;
- config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
- }
-
- if (!si_setup_compute_scratch_buffer(sctx, shader, config))
- return false;
-
- if (shader->scratch_bo) {
- COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; "
- "Total Scratch: %u bytes\n", sctx->scratch_waves,
- config->scratch_bytes_per_wave,
- config->scratch_bytes_per_wave *
- sctx->scratch_waves);
-
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- shader->scratch_bo, RADEON_USAGE_READWRITE,
- RADEON_PRIO_SCRATCH_BUFFER);
- }
-
- /* Prefetch the compute shader to TC L2.
- *
- * We should also prefetch graphics shaders if a compute dispatch was
- * the last command, and the compute shader if a draw call was the last
- * command. However, that would add more complexity and we're likely
- * to get a shader state change in that case anyway.
- */
- if (sctx->chip_class >= GFX7) {
- cik_prefetch_TC_L2_async(sctx, &program->shader.bo->b.b,
- 0, program->shader.bo->b.b.width0);
- }
-
- shader_va = shader->bo->gpu_address + offset;
- if (program->ir_type == PIPE_SHADER_IR_NATIVE) {
- /* Shader code is placed after the amd_kernel_code_t
- * struct. */
- shader_va += sizeof(amd_kernel_code_t);
- }
-
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, shader->bo,
- RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-
- radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
- radeon_emit(cs, shader_va >> 8);
- radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
-
- radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
- radeon_emit(cs, config->rsrc1);
- radeon_emit(cs, config->rsrc2);
-
- COMPUTE_DBG(sctx->screen, "COMPUTE_PGM_RSRC1: 0x%08x "
- "COMPUTE_PGM_RSRC2: 0x%08x\n", config->rsrc1, config->rsrc2);
-
- sctx->max_seen_compute_scratch_bytes_per_wave =
- MAX2(sctx->max_seen_compute_scratch_bytes_per_wave,
- config->scratch_bytes_per_wave);
-
- radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
- S_00B860_WAVES(sctx->scratch_waves)
- | S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10));
-
- sctx->cs_shader_state.emitted_program = program;
- sctx->cs_shader_state.offset = offset;
- sctx->cs_shader_state.uses_scratch =
- config->scratch_bytes_per_wave != 0;
-
- return true;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct ac_shader_config inline_config = {0};
+ struct ac_shader_config *config;
+ uint64_t shader_va;
+
+ if (sctx->cs_shader_state.emitted_program == program && sctx->cs_shader_state.offset == offset)
+ return true;
+
+ if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
+ config = &shader->config;
+ } else {
+ unsigned lds_blocks;
+
+ config = &inline_config;
+ code_object_to_config(code_object, config);
+
+ lds_blocks = config->lds_size;
+      /* XXX: We are over-allocating LDS. On GFX6, the shader reports
+       * LDS in blocks of 256 bytes, so if 4 bytes of LDS are allocated
+       * in the shader and 4 bytes are allocated by the state tracker,
+       * LDS_SIZE ends up as 512 bytes rather than 256.
+       */
+ if (sctx->chip_class <= GFX6) {
+ lds_blocks += align(program->local_size, 256) >> 8;
+ } else {
+ lds_blocks += align(program->local_size, 512) >> 9;
+ }
+
+ /* TODO: use si_multiwave_lds_size_workaround */
+ assert(lds_blocks <= 0xFF);
+
+ config->rsrc2 &= C_00B84C_LDS_SIZE;
+ config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks);
+ }
+
+ if (!si_setup_compute_scratch_buffer(sctx, shader, config))
+ return false;
+
+ if (shader->scratch_bo) {
+ COMPUTE_DBG(sctx->screen,
+ "Waves: %u; Scratch per wave: %u bytes; "
+ "Total Scratch: %u bytes\n",
+ sctx->scratch_waves, config->scratch_bytes_per_wave,
+ config->scratch_bytes_per_wave * sctx->scratch_waves);
+
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, shader->scratch_bo, RADEON_USAGE_READWRITE,
+ RADEON_PRIO_SCRATCH_BUFFER);
+ }
+
+ /* Prefetch the compute shader to TC L2.
+ *
+ * We should also prefetch graphics shaders if a compute dispatch was
+ * the last command, and the compute shader if a draw call was the last
+ * command. However, that would add more complexity and we're likely
+ * to get a shader state change in that case anyway.
+ */
+ if (sctx->chip_class >= GFX7) {
+ cik_prefetch_TC_L2_async(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0);
+ }
+
+ shader_va = shader->bo->gpu_address + offset;
+ if (program->ir_type == PIPE_SHADER_IR_NATIVE) {
+ /* Shader code is placed after the amd_kernel_code_t
+ * struct. */
+ shader_va += sizeof(amd_kernel_code_t);
+ }
+
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, shader->bo, RADEON_USAGE_READ,
+ RADEON_PRIO_SHADER_BINARY);
+
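+   /* The shader address is 256-byte aligned: PGM_LO takes bits [39:8],
+    * PGM_HI the bits above. */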
+ radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
+ radeon_emit(cs, shader_va >> 8);
+ radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
+
+ radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
+ radeon_emit(cs, config->rsrc1);
+ radeon_emit(cs, config->rsrc2);
+
+ COMPUTE_DBG(sctx->screen,
+ "COMPUTE_PGM_RSRC1: 0x%08x "
+ "COMPUTE_PGM_RSRC2: 0x%08x\n",
+ config->rsrc1, config->rsrc2);
+
+ sctx->max_seen_compute_scratch_bytes_per_wave =
+ MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, config->scratch_bytes_per_wave);
+
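+   /* TMPRING WAVESIZE is in 1 KiB (256-dword) units, hence the >> 10. */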
+ radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
+ S_00B860_WAVES(sctx->scratch_waves) |
+ S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10));
+
+ sctx->cs_shader_state.emitted_program = program;
+ sctx->cs_shader_state.offset = offset;
+ sctx->cs_shader_state.uses_scratch = config->scratch_bytes_per_wave != 0;
+
+ return true;
}
static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
- const amd_kernel_code_t *code_object,
- unsigned user_sgpr)
+ const amd_kernel_code_t *code_object, unsigned user_sgpr)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
-
- unsigned max_private_element_size = AMD_HSA_BITS_GET(
- code_object->code_properties,
- AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);
-
- uint32_t scratch_dword0 = scratch_va & 0xffffffff;
- uint32_t scratch_dword1 =
- S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
- S_008F04_SWIZZLE_ENABLE(1);
-
- /* Disable address clamping */
- uint32_t scratch_dword2 = 0xffffffff;
- uint32_t scratch_dword3 =
- S_008F0C_INDEX_STRIDE(3) |
- S_008F0C_ADD_TID_ENABLE(1);
-
- if (sctx->chip_class >= GFX9) {
- assert(max_private_element_size == 1); /* always 4 bytes on GFX9 */
- } else {
- scratch_dword3 |= S_008F0C_ELEMENT_SIZE(max_private_element_size);
-
- if (sctx->chip_class < GFX8) {
- /* BUF_DATA_FORMAT is ignored, but it cannot be
- * BUF_DATA_FORMAT_INVALID. */
- scratch_dword3 |=
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);
- }
- }
-
- radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
- (user_sgpr * 4), 4);
- radeon_emit(cs, scratch_dword0);
- radeon_emit(cs, scratch_dword1);
- radeon_emit(cs, scratch_dword2);
- radeon_emit(cs, scratch_dword3);
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
+
+ unsigned max_private_element_size =
+ AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);
+
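+   /* Build the four dwords of the scratch buffer resource descriptor (V#)
+    * that are written to the user SGPRs below. */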
+ uint32_t scratch_dword0 = scratch_va & 0xffffffff;
+ uint32_t scratch_dword1 =
+ S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1);
+
+ /* Disable address clamping */
+ uint32_t scratch_dword2 = 0xffffffff;
+ uint32_t scratch_dword3 = S_008F0C_INDEX_STRIDE(3) | S_008F0C_ADD_TID_ENABLE(1);
+
+ if (sctx->chip_class >= GFX9) {
+ assert(max_private_element_size == 1); /* always 4 bytes on GFX9 */
+ } else {
+ scratch_dword3 |= S_008F0C_ELEMENT_SIZE(max_private_element_size);
+
+ if (sctx->chip_class < GFX8) {
+ /* BUF_DATA_FORMAT is ignored, but it cannot be
+ * BUF_DATA_FORMAT_INVALID. */
+ scratch_dword3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);
+ }
+ }
+
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4);
+ radeon_emit(cs, scratch_dword0);
+ radeon_emit(cs, scratch_dword1);
+ radeon_emit(cs, scratch_dword2);
+ radeon_emit(cs, scratch_dword3);
}
-static void si_setup_user_sgprs_co_v2(struct si_context *sctx,
- const amd_kernel_code_t *code_object,
- const struct pipe_grid_info *info,
- uint64_t kernel_args_va)
+static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_code_t *code_object,
+ const struct pipe_grid_info *info, uint64_t kernel_args_va)
{
- struct si_compute *program = sctx->cs_shader_state.program;
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
- static const enum amd_code_property_mask_t workgroup_count_masks [] = {
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y,
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z
- };
-
- unsigned i, user_sgpr = 0;
- if (AMD_HSA_BITS_GET(code_object->code_properties,
- AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
- if (code_object->workitem_private_segment_byte_size > 0) {
- setup_scratch_rsrc_user_sgprs(sctx, code_object,
- user_sgpr);
- }
- user_sgpr += 4;
- }
-
- if (AMD_HSA_BITS_GET(code_object->code_properties,
- AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
- struct dispatch_packet dispatch;
- unsigned dispatch_offset;
- struct si_resource *dispatch_buf = NULL;
- uint64_t dispatch_va;
-
- /* Upload dispatch ptr */
- memset(&dispatch, 0, sizeof(dispatch));
-
- dispatch.workgroup_size_x = util_cpu_to_le16(info->block[0]);
- dispatch.workgroup_size_y = util_cpu_to_le16(info->block[1]);
- dispatch.workgroup_size_z = util_cpu_to_le16(info->block[2]);
-
- dispatch.grid_size_x = util_cpu_to_le32(info->grid[0] * info->block[0]);
- dispatch.grid_size_y = util_cpu_to_le32(info->grid[1] * info->block[1]);
- dispatch.grid_size_z = util_cpu_to_le32(info->grid[2] * info->block[2]);
-
- dispatch.private_segment_size = util_cpu_to_le32(program->private_size);
- dispatch.group_segment_size = util_cpu_to_le32(program->local_size);
-
- dispatch.kernarg_address = util_cpu_to_le64(kernel_args_va);
-
- u_upload_data(sctx->b.const_uploader, 0, sizeof(dispatch),
- 256, &dispatch, &dispatch_offset,
- (struct pipe_resource**)&dispatch_buf);
-
- if (!dispatch_buf) {
- fprintf(stderr, "Error: Failed to allocate dispatch "
- "packet.");
- }
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dispatch_buf,
- RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
-
- dispatch_va = dispatch_buf->gpu_address + dispatch_offset;
-
- radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
- (user_sgpr * 4), 2);
- radeon_emit(cs, dispatch_va);
- radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) |
- S_008F04_STRIDE(0));
-
- si_resource_reference(&dispatch_buf, NULL);
- user_sgpr += 2;
- }
-
- if (AMD_HSA_BITS_GET(code_object->code_properties,
- AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
- radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
- (user_sgpr * 4), 2);
- radeon_emit(cs, kernel_args_va);
- radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) |
- S_008F04_STRIDE(0));
- user_sgpr += 2;
- }
-
- for (i = 0; i < 3 && user_sgpr < 16; i++) {
- if (code_object->code_properties & workgroup_count_masks[i]) {
- radeon_set_sh_reg_seq(cs,
- R_00B900_COMPUTE_USER_DATA_0 +
- (user_sgpr * 4), 1);
- radeon_emit(cs, info->grid[i]);
- user_sgpr += 1;
- }
- }
+ struct si_compute *program = sctx->cs_shader_state.program;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ static const enum amd_code_property_mask_t workgroup_count_masks[] = {
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z};
+
+ unsigned i, user_sgpr = 0;
+ if (AMD_HSA_BITS_GET(code_object->code_properties,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
+ if (code_object->workitem_private_segment_byte_size > 0) {
+ setup_scratch_rsrc_user_sgprs(sctx, code_object, user_sgpr);
+ }
+ user_sgpr += 4;
+ }
+
+ if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
+ struct dispatch_packet dispatch;
+ unsigned dispatch_offset;
+ struct si_resource *dispatch_buf = NULL;
+ uint64_t dispatch_va;
+
+ /* Upload dispatch ptr */
+ memset(&dispatch, 0, sizeof(dispatch));
+
+ dispatch.workgroup_size_x = util_cpu_to_le16(info->block[0]);
+ dispatch.workgroup_size_y = util_cpu_to_le16(info->block[1]);
+ dispatch.workgroup_size_z = util_cpu_to_le16(info->block[2]);
+
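+      /* The HSA packet's grid size is in work-items, while Gallium's grid is in blocks. */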
+ dispatch.grid_size_x = util_cpu_to_le32(info->grid[0] * info->block[0]);
+ dispatch.grid_size_y = util_cpu_to_le32(info->grid[1] * info->block[1]);
+ dispatch.grid_size_z = util_cpu_to_le32(info->grid[2] * info->block[2]);
+
+ dispatch.private_segment_size = util_cpu_to_le32(program->private_size);
+ dispatch.group_segment_size = util_cpu_to_le32(program->local_size);
+
+ dispatch.kernarg_address = util_cpu_to_le64(kernel_args_va);
+
+ u_upload_data(sctx->b.const_uploader, 0, sizeof(dispatch), 256, &dispatch, &dispatch_offset,
+ (struct pipe_resource **)&dispatch_buf);
+
+ if (!dispatch_buf) {
+      fprintf(stderr, "Error: Failed to allocate dispatch "
+                      "packet.\n");
+ }
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dispatch_buf, RADEON_USAGE_READ,
+ RADEON_PRIO_CONST_BUFFER);
+
+ dispatch_va = dispatch_buf->gpu_address + dispatch_offset;
+
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2);
+ radeon_emit(cs, dispatch_va);
+ radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0));
+
+ si_resource_reference(&dispatch_buf, NULL);
+ user_sgpr += 2;
+ }
+
+ if (AMD_HSA_BITS_GET(code_object->code_properties,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2);
+ radeon_emit(cs, kernel_args_va);
+ radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0));
+ user_sgpr += 2;
+ }
+
+ for (i = 0; i < 3 && user_sgpr < 16; i++) {
+ if (code_object->code_properties & workgroup_count_masks[i]) {
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1);
+ radeon_emit(cs, info->grid[i]);
+ user_sgpr += 1;
+ }
+ }
}
-static bool si_upload_compute_input(struct si_context *sctx,
- const amd_kernel_code_t *code_object,
- const struct pipe_grid_info *info)
+static bool si_upload_compute_input(struct si_context *sctx, const amd_kernel_code_t *code_object,
+ const struct pipe_grid_info *info)
{
- struct si_compute *program = sctx->cs_shader_state.program;
- struct si_resource *input_buffer = NULL;
- uint32_t kernel_args_offset = 0;
- uint32_t *kernel_args;
- void *kernel_args_ptr;
- uint64_t kernel_args_va;
+ struct si_compute *program = sctx->cs_shader_state.program;
+ struct si_resource *input_buffer = NULL;
+ uint32_t kernel_args_offset = 0;
+ uint32_t *kernel_args;
+ void *kernel_args_ptr;
+ uint64_t kernel_args_va;
- u_upload_alloc(sctx->b.const_uploader, 0, program->input_size,
- sctx->screen->info.tcc_cache_line_size,
- &kernel_args_offset,
- (struct pipe_resource**)&input_buffer, &kernel_args_ptr);
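+   /* Upload the kernel arguments, aligned to an L2 (TCC) cache line. */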
+ u_upload_alloc(sctx->b.const_uploader, 0, program->input_size,
+ sctx->screen->info.tcc_cache_line_size, &kernel_args_offset,
+ (struct pipe_resource **)&input_buffer, &kernel_args_ptr);
- if (unlikely(!kernel_args_ptr))
- return false;
+ if (unlikely(!kernel_args_ptr))
+ return false;
- kernel_args = (uint32_t*)kernel_args_ptr;
- kernel_args_va = input_buffer->gpu_address + kernel_args_offset;
+ kernel_args = (uint32_t *)kernel_args_ptr;
+ kernel_args_va = input_buffer->gpu_address + kernel_args_offset;
- memcpy(kernel_args, info->input, program->input_size);
+ memcpy(kernel_args, info->input, program->input_size);
- for (unsigned i = 0; i < program->input_size / 4; i++) {
- COMPUTE_DBG(sctx->screen, "input %u : %u\n", i,
- kernel_args[i]);
- }
+ for (unsigned i = 0; i < program->input_size / 4; i++) {
+ COMPUTE_DBG(sctx->screen, "input %u : %u\n", i, kernel_args[i]);
+ }
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, input_buffer,
- RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, input_buffer, RADEON_USAGE_READ,
+ RADEON_PRIO_CONST_BUFFER);
- si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va);
- si_resource_reference(&input_buffer, NULL);
- return true;
+ si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va);
+ si_resource_reference(&input_buffer, NULL);
+ return true;
}
-static void si_setup_nir_user_data(struct si_context *sctx,
- const struct pipe_grid_info *info)
+static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_grid_info *info)
{
- struct si_compute *program = sctx->cs_shader_state.program;
- struct si_shader_selector *sel = &program->sel;
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 +
- 4 * SI_NUM_RESOURCE_SGPRS;
- unsigned block_size_reg = grid_size_reg +
- /* 12 bytes = 3 dwords. */
- 12 * sel->info.uses_grid_size;
- unsigned cs_user_data_reg = block_size_reg +
- 12 * program->reads_variable_block_size;
-
- if (info->indirect) {
- if (sel->info.uses_grid_size) {
- for (unsigned i = 0; i < 3; ++i) {
- si_cp_copy_data(sctx, sctx->gfx_cs,
- COPY_DATA_REG, NULL, (grid_size_reg >> 2) + i,
- COPY_DATA_SRC_MEM, si_resource(info->indirect),
- info->indirect_offset + 4 * i);
- }
- }
- } else {
- if (sel->info.uses_grid_size) {
- radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
- radeon_emit(cs, info->grid[0]);
- radeon_emit(cs, info->grid[1]);
- radeon_emit(cs, info->grid[2]);
- }
- if (program->reads_variable_block_size) {
- radeon_set_sh_reg_seq(cs, block_size_reg, 3);
- radeon_emit(cs, info->block[0]);
- radeon_emit(cs, info->block[1]);
- radeon_emit(cs, info->block[2]);
- }
- }
-
- if (program->num_cs_user_data_dwords) {
- radeon_set_sh_reg_seq(cs, cs_user_data_reg, program->num_cs_user_data_dwords);
- radeon_emit_array(cs, sctx->cs_user_data, program->num_cs_user_data_dwords);
- }
+ struct si_compute *program = sctx->cs_shader_state.program;
+ struct si_shader_selector *sel = &program->sel;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
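+   /* User-data SGPRs after the resource SGPRs: grid size (3 dwords, if used),
+    * then the variable block size (3 dwords, if read), then extra user data. */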
+ unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 + 4 * SI_NUM_RESOURCE_SGPRS;
+ unsigned block_size_reg = grid_size_reg +
+ /* 12 bytes = 3 dwords. */
+ 12 * sel->info.uses_grid_size;
+ unsigned cs_user_data_reg = block_size_reg + 12 * program->reads_variable_block_size;
+
+ if (info->indirect) {
+ if (sel->info.uses_grid_size) {
+ for (unsigned i = 0; i < 3; ++i) {
+ si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_REG, NULL, (grid_size_reg >> 2) + i,
+ COPY_DATA_SRC_MEM, si_resource(info->indirect),
+ info->indirect_offset + 4 * i);
+ }
+ }
+ } else {
+ if (sel->info.uses_grid_size) {
+ radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
+ radeon_emit(cs, info->grid[0]);
+ radeon_emit(cs, info->grid[1]);
+ radeon_emit(cs, info->grid[2]);
+ }
+ if (program->reads_variable_block_size) {
+ radeon_set_sh_reg_seq(cs, block_size_reg, 3);
+ radeon_emit(cs, info->block[0]);
+ radeon_emit(cs, info->block[1]);
+ radeon_emit(cs, info->block[2]);
+ }
+ }
+
+ if (program->num_cs_user_data_dwords) {
+ radeon_set_sh_reg_seq(cs, cs_user_data_reg, program->num_cs_user_data_dwords);
+ radeon_emit_array(cs, sctx->cs_user_data, program->num_cs_user_data_dwords);
+ }
}
-static void si_emit_dispatch_packets(struct si_context *sctx,
- const struct pipe_grid_info *info)
+static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_grid_info *info)
{
- struct si_screen *sscreen = sctx->screen;
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off;
- unsigned threads_per_threadgroup =
- info->block[0] * info->block[1] * info->block[2];
- unsigned waves_per_threadgroup =
- DIV_ROUND_UP(threads_per_threadgroup, sscreen->compute_wave_size);
- unsigned threadgroups_per_cu = 1;
-
- if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1)
- threadgroups_per_cu = 2;
-
- radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
- ac_get_compute_resource_limits(&sscreen->info,
- waves_per_threadgroup,
- sctx->cs_max_waves_per_sh,
- threadgroups_per_cu));
-
- unsigned dispatch_initiator =
- S_00B800_COMPUTE_SHADER_EN(1) |
- S_00B800_FORCE_START_AT_000(1) |
- /* If the KMD allows it (there is a KMD hw register for it),
- * allow launching waves out-of-order. (same as Vulkan) */
- S_00B800_ORDER_MODE(sctx->chip_class >= GFX7) |
- S_00B800_CS_W32_EN(sscreen->compute_wave_size == 32);
-
- const uint *last_block = info->last_block;
- bool partial_block_en = last_block[0] || last_block[1] || last_block[2];
-
- radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
-
- if (partial_block_en) {
- unsigned partial[3];
-
- /* If no partial_block, these should be an entire block size, not 0. */
- partial[0] = last_block[0] ? last_block[0] : info->block[0];
- partial[1] = last_block[1] ? last_block[1] : info->block[1];
- partial[2] = last_block[2] ? last_block[2] : info->block[2];
-
- radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) |
- S_00B81C_NUM_THREAD_PARTIAL(partial[0]));
- radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]) |
- S_00B820_NUM_THREAD_PARTIAL(partial[1]));
- radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]) |
- S_00B824_NUM_THREAD_PARTIAL(partial[2]));
-
- dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
- } else {
- radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
- radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
- radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
- }
-
- if (info->indirect) {
- uint64_t base_va = si_resource(info->indirect)->gpu_address;
-
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- si_resource(info->indirect),
- RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
-
- radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
- PKT3_SHADER_TYPE_S(1));
- radeon_emit(cs, 1);
- radeon_emit(cs, base_va);
- radeon_emit(cs, base_va >> 32);
-
- radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) |
- PKT3_SHADER_TYPE_S(1));
- radeon_emit(cs, info->indirect_offset);
- radeon_emit(cs, dispatch_initiator);
- } else {
- radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) |
- PKT3_SHADER_TYPE_S(1));
- radeon_emit(cs, info->grid[0]);
- radeon_emit(cs, info->grid[1]);
- radeon_emit(cs, info->grid[2]);
- radeon_emit(cs, dispatch_initiator);
- }
+ struct si_screen *sscreen = sctx->screen;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off;
+ unsigned threads_per_threadgroup = info->block[0] * info->block[1] * info->block[2];
+ unsigned waves_per_threadgroup =
+ DIV_ROUND_UP(threads_per_threadgroup, sscreen->compute_wave_size);
+ unsigned threadgroups_per_cu = 1;
+
+ if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1)
+ threadgroups_per_cu = 2;
+
+ radeon_set_sh_reg(
+ cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
+ ac_get_compute_resource_limits(&sscreen->info, waves_per_threadgroup,
+ sctx->cs_max_waves_per_sh, threadgroups_per_cu));
+
+ unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_FORCE_START_AT_000(1) |
+ /* If the KMD allows it (there is a KMD hw register for it),
+ * allow launching waves out-of-order (same as Vulkan). */
+ S_00B800_ORDER_MODE(sctx->chip_class >= GFX7) |
+ S_00B800_CS_W32_EN(sscreen->compute_wave_size == 32);
+
+ const uint *last_block = info->last_block;
+ bool partial_block_en = last_block[0] || last_block[1] || last_block[2];
+
+ radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+
+ if (partial_block_en) {
+ unsigned partial[3];
+
+ /* If a dimension has no partial block, program the full block size there, not 0. */
+ partial[0] = last_block[0] ? last_block[0] : info->block[0];
+ partial[1] = last_block[1] ? last_block[1] : info->block[1];
+ partial[2] = last_block[2] ? last_block[2] : info->block[2];
+
+ radeon_emit(
+ cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) | S_00B81C_NUM_THREAD_PARTIAL(partial[0]));
+ radeon_emit(
+ cs, S_00B820_NUM_THREAD_FULL(info->block[1]) | S_00B820_NUM_THREAD_PARTIAL(partial[1]));
+ radeon_emit(
+ cs, S_00B824_NUM_THREAD_FULL(info->block[2]) | S_00B824_NUM_THREAD_PARTIAL(partial[2]));
+
+ dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
+ } else {
+ radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
+ radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
+ radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
+ }
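+ /* Worked example (illustrative, not in the original code): a caller launching
+ * 100 threads along X with block[0] = 64 passes grid[0] = 2 and
+ * last_block[0] = 36; the registers above then hold NUM_THREAD_FULL = 64 and
+ * NUM_THREAD_PARTIAL = 36, and PARTIAL_TG_EN makes the final threadgroup
+ * along X run with only 36 threads. */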
+
+ if (info->indirect) {
+ uint64_t base_va = si_resource(info->indirect)->gpu_address;
+
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(info->indirect), RADEON_USAGE_READ,
+ RADEON_PRIO_DRAW_INDIRECT);
+
+ radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, 1);
+ radeon_emit(cs, base_va);
+ radeon_emit(cs, base_va >> 32);
+
+ radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, info->indirect_offset);
+ radeon_emit(cs, dispatch_initiator);
+ } else {
+ radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, info->grid[0]);
+ radeon_emit(cs, info->grid[1]);
+ radeon_emit(cs, info->grid[2]);
+ radeon_emit(cs, dispatch_initiator);
+ }
}
-
-static void si_launch_grid(
- struct pipe_context *ctx, const struct pipe_grid_info *info)
+static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_compute *program = sctx->cs_shader_state.program;
- const amd_kernel_code_t *code_object =
- si_compute_get_code_object(program, info->pc);
- int i;
- /* HW bug workaround when CS threadgroups > 256 threads and async
- * compute isn't used, i.e. only one compute job can run at a time.
- * If async compute is possible, the threadgroup size must be limited
- * to 256 threads on all queues to avoid the bug.
- * Only GFX6 and certain GFX7 chips are affected.
- */
- bool cs_regalloc_hang =
- (sctx->chip_class == GFX6 ||
- sctx->family == CHIP_BONAIRE ||
- sctx->family == CHIP_KABINI) &&
- info->block[0] * info->block[1] * info->block[2] > 256;
-
- if (cs_regalloc_hang)
- sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH;
-
- if (program->ir_type != PIPE_SHADER_IR_NATIVE &&
- program->shader.compilation_failed)
- return;
-
- if (sctx->has_graphics) {
- if (sctx->last_num_draw_calls != sctx->num_draw_calls) {
- si_update_fb_dirtiness_after_rendering(sctx);
- sctx->last_num_draw_calls = sctx->num_draw_calls;
- }
-
- si_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE);
- }
-
- /* Add buffer sizes for memory checking in need_cs_space. */
- si_context_add_resource_size(sctx, &program->shader.bo->b.b);
- /* TODO: add the scratch buffer */
-
- if (info->indirect) {
- si_context_add_resource_size(sctx, info->indirect);
-
- /* Indirect buffers use TC L2 on GFX9, but not older hw. */
- if (sctx->chip_class <= GFX8 &&
- si_resource(info->indirect)->TC_L2_dirty) {
- sctx->flags |= SI_CONTEXT_WB_L2;
- si_resource(info->indirect)->TC_L2_dirty = false;
- }
- }
-
- si_need_gfx_cs_space(sctx);
-
- if (sctx->bo_list_add_all_compute_resources)
- si_compute_resources_add_all_to_bo_list(sctx);
-
- if (!sctx->cs_shader_state.initialized) {
- si_emit_initial_compute_regs(sctx, sctx->gfx_cs);
-
- sctx->cs_shader_state.emitted_program = NULL;
- sctx->cs_shader_state.initialized = true;
- }
-
- if (sctx->flags)
- sctx->emit_cache_flush(sctx);
-
- if (!si_switch_compute_shader(sctx, program, &program->shader,
- code_object, info->pc))
- return;
-
- si_upload_compute_shader_descriptors(sctx);
- si_emit_compute_shader_pointers(sctx);
-
- if (sctx->has_graphics &&
- si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
- sctx->atoms.s.render_cond.emit(sctx);
- si_set_atom_dirty(sctx, &sctx->atoms.s.render_cond, false);
- }
-
- if (program->ir_type == PIPE_SHADER_IR_NATIVE &&
- unlikely(!si_upload_compute_input(sctx, code_object, info)))
- return;
-
- /* Global buffers */
- for (i = 0; i < program->max_global_buffers; i++) {
- struct si_resource *buffer =
- si_resource(program->global_buffers[i]);
- if (!buffer) {
- continue;
- }
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer,
- RADEON_USAGE_READWRITE,
- RADEON_PRIO_COMPUTE_GLOBAL);
- }
-
- if (program->ir_type != PIPE_SHADER_IR_NATIVE)
- si_setup_nir_user_data(sctx, info);
-
- si_emit_dispatch_packets(sctx, info);
-
- if (unlikely(sctx->current_saved_cs)) {
- si_trace_emit(sctx);
- si_log_compute_state(sctx, sctx->log);
- }
-
- sctx->compute_is_busy = true;
- sctx->num_compute_calls++;
- if (sctx->cs_shader_state.uses_scratch)
- sctx->num_spill_compute_calls++;
-
- if (cs_regalloc_hang)
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_compute *program = sctx->cs_shader_state.program;
+ const amd_kernel_code_t *code_object = si_compute_get_code_object(program, info->pc);
+ int i;
+ /* HW bug workaround when CS threadgroups > 256 threads and async
+ * compute isn't used, i.e. only one compute job can run at a time.
+ * If async compute is possible, the threadgroup size must be limited
+ * to 256 threads on all queues to avoid the bug.
+ * Only GFX6 and certain GFX7 chips are affected.
+ */
+ bool cs_regalloc_hang =
+ (sctx->chip_class == GFX6 || sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KABINI) &&
+ info->block[0] * info->block[1] * info->block[2] > 256;
+
+ if (cs_regalloc_hang)
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+ if (program->ir_type != PIPE_SHADER_IR_NATIVE && program->shader.compilation_failed)
+ return;
+
+ if (sctx->has_graphics) {
+ if (sctx->last_num_draw_calls != sctx->num_draw_calls) {
+ si_update_fb_dirtiness_after_rendering(sctx);
+ sctx->last_num_draw_calls = sctx->num_draw_calls;
+ }
+
+ si_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE);
+ }
+
+ /* Add buffer sizes for memory checking in need_cs_space. */
+ si_context_add_resource_size(sctx, &program->shader.bo->b.b);
+ /* TODO: add the scratch buffer */
+
+ if (info->indirect) {
+ si_context_add_resource_size(sctx, info->indirect);
+
+ /* Indirect buffers use TC L2 on GFX9, but not older hw. */
+ if (sctx->chip_class <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) {
+ sctx->flags |= SI_CONTEXT_WB_L2;
+ si_resource(info->indirect)->TC_L2_dirty = false;
+ }
+ }
+
+ si_need_gfx_cs_space(sctx);
+
+ if (sctx->bo_list_add_all_compute_resources)
+ si_compute_resources_add_all_to_bo_list(sctx);
+
+ if (!sctx->cs_shader_state.initialized) {
+ si_emit_initial_compute_regs(sctx, sctx->gfx_cs);
+
+ sctx->cs_shader_state.emitted_program = NULL;
+ sctx->cs_shader_state.initialized = true;
+ }
+
+ if (sctx->flags)
+ sctx->emit_cache_flush(sctx);
+
+ if (!si_switch_compute_shader(sctx, program, &program->shader, code_object, info->pc))
+ return;
+
+ si_upload_compute_shader_descriptors(sctx);
+ si_emit_compute_shader_pointers(sctx);
+
+ if (sctx->has_graphics && si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
+ sctx->atoms.s.render_cond.emit(sctx);
+ si_set_atom_dirty(sctx, &sctx->atoms.s.render_cond, false);
+ }
+
+ if (program->ir_type == PIPE_SHADER_IR_NATIVE &&
+ unlikely(!si_upload_compute_input(sctx, code_object, info)))
+ return;
+
+ /* Global buffers */
+ for (i = 0; i < program->max_global_buffers; i++) {
+ struct si_resource *buffer = si_resource(program->global_buffers[i]);
+ if (!buffer) {
+ continue;
+ }
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer, RADEON_USAGE_READWRITE,
+ RADEON_PRIO_COMPUTE_GLOBAL);
+ }
+
+ if (program->ir_type != PIPE_SHADER_IR_NATIVE)
+ si_setup_nir_user_data(sctx, info);
+
+ si_emit_dispatch_packets(sctx, info);
+
+ if (unlikely(sctx->current_saved_cs)) {
+ si_trace_emit(sctx);
+ si_log_compute_state(sctx, sctx->log);
+ }
+
+ sctx->compute_is_busy = true;
+ sctx->num_compute_calls++;
+ if (sctx->cs_shader_state.uses_scratch)
+ sctx->num_spill_compute_calls++;
+
+ if (cs_regalloc_hang)
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
}
void si_destroy_compute(struct si_compute *program)
{
- struct si_shader_selector *sel = &program->sel;
+ struct si_shader_selector *sel = &program->sel;
- if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
- util_queue_drop_job(&sel->screen->shader_compiler_queue,
- &sel->ready);
- util_queue_fence_destroy(&sel->ready);
- }
+ if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
+ util_queue_drop_job(&sel->screen->shader_compiler_queue, &sel->ready);
+ util_queue_fence_destroy(&sel->ready);
+ }
- for (unsigned i = 0; i < program->max_global_buffers; i++)
- pipe_resource_reference(&program->global_buffers[i], NULL);
- FREE(program->global_buffers);
+ for (unsigned i = 0; i < program->max_global_buffers; i++)
+ pipe_resource_reference(&program->global_buffers[i], NULL);
+ FREE(program->global_buffers);
- si_shader_destroy(&program->shader);
- ralloc_free(program->sel.nir);
- FREE(program);
+ si_shader_destroy(&program->shader);
+ ralloc_free(program->sel.nir);
+ FREE(program);
}
-static void si_delete_compute_state(struct pipe_context *ctx, void* state){
- struct si_compute *program = (struct si_compute *)state;
- struct si_context *sctx = (struct si_context*)ctx;
+static void si_delete_compute_state(struct pipe_context *ctx, void *state)
+{
+ struct si_compute *program = (struct si_compute *)state;
+ struct si_context *sctx = (struct si_context *)ctx;
- if (!state)
- return;
+ if (!state)
+ return;
- if (program == sctx->cs_shader_state.program)
- sctx->cs_shader_state.program = NULL;
+ if (program == sctx->cs_shader_state.program)
+ sctx->cs_shader_state.program = NULL;
- if (program == sctx->cs_shader_state.emitted_program)
- sctx->cs_shader_state.emitted_program = NULL;
+ if (program == sctx->cs_shader_state.emitted_program)
+ sctx->cs_shader_state.emitted_program = NULL;
- si_compute_reference(&program, NULL);
+ si_compute_reference(&program, NULL);
}
-static void si_set_compute_resources(struct pipe_context * ctx_,
- unsigned start, unsigned count,
- struct pipe_surface ** surfaces) { }
+static void si_set_compute_resources(struct pipe_context *ctx_, unsigned start, unsigned count,
+ struct pipe_surface **surfaces)
+{
+}
void si_init_compute_functions(struct si_context *sctx)
{
- sctx->b.create_compute_state = si_create_compute_state;
- sctx->b.delete_compute_state = si_delete_compute_state;
- sctx->b.bind_compute_state = si_bind_compute_state;
- sctx->b.set_compute_resources = si_set_compute_resources;
- sctx->b.set_global_binding = si_set_global_binding;
- sctx->b.launch_grid = si_launch_grid;
+ sctx->b.create_compute_state = si_create_compute_state;
+ sctx->b.delete_compute_state = si_delete_compute_state;
+ sctx->b.bind_compute_state = si_bind_compute_state;
+ sctx->b.set_compute_resources = si_set_compute_resources;
+ sctx->b.set_global_binding = si_set_global_binding;
+ sctx->b.launch_grid = si_launch_grid;
}
#ifndef SI_COMPUTE_H
#define SI_COMPUTE_H
-#include "util/u_inlines.h"
-
#include "si_shader.h"
+#include "util/u_inlines.h"
struct si_compute {
- struct si_shader_selector sel;
- struct si_shader shader;
+ struct si_shader_selector sel;
+ struct si_shader shader;
- unsigned ir_type;
- unsigned local_size;
- unsigned private_size;
- unsigned input_size;
+ unsigned ir_type;
+ unsigned local_size;
+ unsigned private_size;
+ unsigned input_size;
- int max_global_buffers;
- struct pipe_resource **global_buffers;
+ int max_global_buffers;
+ struct pipe_resource **global_buffers;
- bool reads_variable_block_size;
- unsigned num_cs_user_data_dwords;
+ bool reads_variable_block_size;
+ unsigned num_cs_user_data_dwords;
};
void si_destroy_compute(struct si_compute *program);
-static inline void
-si_compute_reference(struct si_compute **dst, struct si_compute *src)
+static inline void si_compute_reference(struct si_compute **dst, struct si_compute *src)
{
- if (pipe_reference(&(*dst)->sel.base.reference, &src->sel.base.reference))
- si_destroy_compute(*dst);
+ if (pipe_reference(&(*dst)->sel.base.reference, &src->sel.base.reference))
+ si_destroy_compute(*dst);
- *dst = src;
+ *dst = src;
}
#endif /* SI_COMPUTE_H */
/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
* and L2_STREAM for src.
*/
-static enum si_cache_policy get_cache_policy(struct si_context *sctx,
- enum si_coherency coher,
- uint64_t size)
+static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher,
+ uint64_t size)
{
- if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
- coher == SI_COHERENCY_CP)) ||
- (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
- return size <= 256 * 1024 ? L2_LRU : L2_STREAM;
+ if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META || coher == SI_COHERENCY_CP)) ||
+ (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
+ return size <= 256 * 1024 ? L2_LRU : L2_STREAM;
- return L2_BYPASS;
+ return L2_BYPASS;
}
unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
- enum si_cache_policy cache_policy)
+ enum si_cache_policy cache_policy)
{
- switch (coher) {
- default:
- case SI_COHERENCY_NONE:
- case SI_COHERENCY_CP:
- return 0;
- case SI_COHERENCY_SHADER:
- return SI_CONTEXT_INV_SCACHE |
- SI_CONTEXT_INV_VCACHE |
- (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0);
- case SI_COHERENCY_CB_META:
- return SI_CONTEXT_FLUSH_AND_INV_CB;
- }
+ switch (coher) {
+ default:
+ case SI_COHERENCY_NONE:
+ case SI_COHERENCY_CP:
+ return 0;
+ case SI_COHERENCY_SHADER:
+ return SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
+ (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0);
+ case SI_COHERENCY_CB_META:
+ return SI_CONTEXT_FLUSH_AND_INV_CB;
+ }
}
-static void si_launch_grid_internal(struct si_context *sctx,
- struct pipe_grid_info *info)
+static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info)
{
- /* Set settings for driver-internal compute dispatches. */
- sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
- sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
- sctx->render_cond_force_off = true;
- /* Skip decompression to prevent infinite recursion. */
- sctx->blitter->running = true;
-
- /* Dispatch compute. */
- sctx->b.launch_grid(&sctx->b, info);
-
- /* Restore default settings. */
- sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
- sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
- sctx->render_cond_force_off = false;
- sctx->blitter->running = false;
+ /* Set settings for driver-internal compute dispatches. */
+ sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
+ sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+ sctx->render_cond_force_off = true;
+ /* Skip decompression to prevent infinite recursion. */
+ sctx->blitter->running = true;
+
+ /* Dispatch compute. */
+ sctx->b.launch_grid(&sctx->b, info);
+
+ /* Restore default settings. */
+ sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
+ sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+ sctx->render_cond_force_off = false;
+ sctx->blitter->running = false;
}
-static void si_compute_clear_12bytes_buffer(struct si_context *sctx,
- struct pipe_resource *dst,
- unsigned dst_offset,
- unsigned size,
- const uint32_t *clear_value,
- enum si_coherency coher)
+static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst,
+ unsigned dst_offset, unsigned size,
+ const uint32_t *clear_value, enum si_coherency coher)
{
- struct pipe_context *ctx = &sctx->b;
+ struct pipe_context *ctx = &sctx->b;
- assert(dst_offset % 4 == 0);
- assert(size % 4 == 0);
- unsigned size_12 = DIV_ROUND_UP(size, 12);
+ assert(dst_offset % 4 == 0);
+ assert(size % 4 == 0);
+ unsigned size_12 = DIV_ROUND_UP(size, 12);
- unsigned data[4] = {0};
- memcpy(data, clear_value, 12);
+ unsigned data[4] = {0};
+ memcpy(data, clear_value, 12);
- sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH |
- si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+ si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
- struct pipe_shader_buffer saved_sb = {0};
- si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
+ struct pipe_shader_buffer saved_sb = {0};
+ si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
- unsigned saved_writable_mask = 0;
- if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
- (1u << si_get_shaderbuf_slot(0)))
- saved_writable_mask = 1;
+ unsigned saved_writable_mask = 0;
+ if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+ (1u << si_get_shaderbuf_slot(0)))
+ saved_writable_mask = 1;
- struct pipe_constant_buffer saved_cb = {};
- si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+ struct pipe_constant_buffer saved_cb = {};
+ si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
- void *saved_cs = sctx->cs_shader_state.program;
+ void *saved_cs = sctx->cs_shader_state.program;
- struct pipe_constant_buffer cb = {};
- cb.buffer_size = sizeof(data);
- cb.user_buffer = data;
- ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
+ struct pipe_constant_buffer cb = {};
+ cb.buffer_size = sizeof(data);
+ cb.user_buffer = data;
+ ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
- struct pipe_shader_buffer sb = {0};
- sb.buffer = dst;
- sb.buffer_offset = dst_offset;
- sb.buffer_size = size;
+ struct pipe_shader_buffer sb = {0};
+ sb.buffer = dst;
+ sb.buffer_offset = dst_offset;
+ sb.buffer_size = size;
- ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);
- struct pipe_grid_info info = {0};
+ struct pipe_grid_info info = {0};
- if (!sctx->cs_clear_12bytes_buffer)
- sctx->cs_clear_12bytes_buffer =
- si_clear_12bytes_buffer_shader(ctx);
- ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer);
- info.block[0] = 64;
- info.last_block[0] = size_12 % 64;
- info.block[1] = 1;
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(size_12, 64);
- info.grid[1] = 1;
- info.grid[2] = 1;
+ if (!sctx->cs_clear_12bytes_buffer)
+ sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(ctx);
+ ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer);
+ info.block[0] = 64;
+ info.last_block[0] = size_12 % 64;
+ info.block[1] = 1;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(size_12, 64);
+ info.grid[1] = 1;
+ info.grid[2] = 1;
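+ /* Worked example (illustrative): clearing 1200 bytes gives size_12 = 100,
+ * i.e. one thread per 12-byte element, so this launches grid[0] = 2
+ * threadgroups of 64 threads with last_block[0] = 36. */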
- si_launch_grid_internal(sctx, &info);
+ si_launch_grid_internal(sctx, &info);
- ctx->bind_compute_state(ctx, saved_cs);
- ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
- ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+ ctx->bind_compute_state(ctx, saved_cs);
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
+ ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
- pipe_resource_reference(&saved_sb.buffer, NULL);
- pipe_resource_reference(&saved_cb.buffer, NULL);
+ pipe_resource_reference(&saved_sb.buffer, NULL);
+ pipe_resource_reference(&saved_cb.buffer, NULL);
}
-static void si_compute_do_clear_or_copy(struct si_context *sctx,
- struct pipe_resource *dst,
- unsigned dst_offset,
- struct pipe_resource *src,
- unsigned src_offset,
- unsigned size,
- const uint32_t *clear_value,
- unsigned clear_value_size,
- enum si_coherency coher)
+static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst,
+ unsigned dst_offset, struct pipe_resource *src,
+ unsigned src_offset, unsigned size,
+ const uint32_t *clear_value, unsigned clear_value_size,
+ enum si_coherency coher)
{
- struct pipe_context *ctx = &sctx->b;
-
- assert(src_offset % 4 == 0);
- assert(dst_offset % 4 == 0);
- assert(size % 4 == 0);
-
- assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
- assert(!src || src_offset + size <= src->width0);
-
- sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH |
- si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
-
- /* Save states. */
- void *saved_cs = sctx->cs_shader_state.program;
- struct pipe_shader_buffer saved_sb[2] = {};
- si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
-
- unsigned saved_writable_mask = 0;
- for (unsigned i = 0; i < (src ? 2 : 1); i++) {
- if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
- (1u << si_get_shaderbuf_slot(i)))
- saved_writable_mask |= 1 << i;
- }
-
- /* The memory accesses are coalesced, meaning that the 1st instruction writes
- * the 1st contiguous block of data for the whole wave, the 2nd instruction
- * writes the 2nd contiguous block of data, etc.
- */
- unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
- SI_COMPUTE_CLEAR_DW_PER_THREAD;
- unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
- unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
- unsigned wave_size = sctx->screen->compute_wave_size;
- unsigned dwords_per_wave = dwords_per_thread * wave_size;
-
- unsigned num_dwords = size / 4;
- unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
-
- struct pipe_grid_info info = {};
- info.block[0] = MIN2(wave_size, num_instructions);
- info.block[1] = 1;
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
- info.grid[1] = 1;
- info.grid[2] = 1;
-
- struct pipe_shader_buffer sb[2] = {};
- sb[0].buffer = dst;
- sb[0].buffer_offset = dst_offset;
- sb[0].buffer_size = size;
-
- bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;
-
- if (src) {
- sb[1].buffer = src;
- sb[1].buffer_offset = src_offset;
- sb[1].buffer_size = size;
-
- ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1);
-
- if (!sctx->cs_copy_buffer) {
- sctx->cs_copy_buffer = si_create_dma_compute_shader(&sctx->b,
- SI_COMPUTE_COPY_DW_PER_THREAD,
- shader_dst_stream_policy, true);
- }
- ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
- } else {
- assert(clear_value_size >= 4 &&
- clear_value_size <= 16 &&
- util_is_power_of_two_or_zero(clear_value_size));
-
- for (unsigned i = 0; i < 4; i++)
- sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
-
- ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1);
-
- if (!sctx->cs_clear_buffer) {
- sctx->cs_clear_buffer = si_create_dma_compute_shader(&sctx->b,
- SI_COMPUTE_CLEAR_DW_PER_THREAD,
- shader_dst_stream_policy, false);
- }
- ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
- }
-
- si_launch_grid_internal(sctx, &info);
-
- enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
- (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0);
-
- if (cache_policy != L2_BYPASS)
- si_resource(dst)->TC_L2_dirty = true;
-
- /* Restore states. */
- ctx->bind_compute_state(ctx, saved_cs);
- ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb,
- saved_writable_mask);
- for (int i = 0; i < 2; i++)
- pipe_resource_reference(&saved_sb[i].buffer, NULL);
+ struct pipe_context *ctx = &sctx->b;
+
+ assert(src_offset % 4 == 0);
+ assert(dst_offset % 4 == 0);
+ assert(size % 4 == 0);
+
+ assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
+ assert(!src || src_offset + size <= src->width0);
+
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+ si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+
+ /* Save states. */
+ void *saved_cs = sctx->cs_shader_state.program;
+ struct pipe_shader_buffer saved_sb[2] = {};
+ si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+
+ unsigned saved_writable_mask = 0;
+ for (unsigned i = 0; i < (src ? 2 : 1); i++) {
+ if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+ (1u << si_get_shaderbuf_slot(i)))
+ saved_writable_mask |= 1 << i;
+ }
+
+ /* The memory accesses are coalesced, meaning that the 1st instruction writes
+ * the 1st contiguous block of data for the whole wave, the 2nd instruction
+ * writes the 2nd contiguous block of data, etc.
+ */
+ unsigned dwords_per_thread =
+ src ? SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD;
+ unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
+ unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
+ unsigned wave_size = sctx->screen->compute_wave_size;
+ unsigned dwords_per_wave = dwords_per_thread * wave_size;
+
+ unsigned num_dwords = size / 4;
+ unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+ struct pipe_grid_info info = {};
+ info.block[0] = MIN2(wave_size, num_instructions);
+ info.block[1] = 1;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+ info.grid[1] = 1;
+ info.grid[2] = 1;
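+ /* Worked example (illustrative, assuming dwords_per_thread = 4 and wave64):
+ * dwords_per_wave = 256, so a 64 KiB clear has num_dwords = 16384,
+ * num_instructions = 4096, block[0] = 64 and
+ * grid[0] = DIV_ROUND_UP(16384, 256) = 64 workgroups, with each thread
+ * writing 4 dwords. */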
+
+ struct pipe_shader_buffer sb[2] = {};
+ sb[0].buffer = dst;
+ sb[0].buffer_offset = dst_offset;
+ sb[0].buffer_size = size;
+
+ bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;
+
+ if (src) {
+ sb[1].buffer = src;
+ sb[1].buffer_offset = src_offset;
+ sb[1].buffer_size = size;
+
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1);
+
+ if (!sctx->cs_copy_buffer) {
+ sctx->cs_copy_buffer = si_create_dma_compute_shader(
+ &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
+ }
+ ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
+ } else {
+ assert(clear_value_size >= 4 && clear_value_size <= 16 &&
+ util_is_power_of_two_or_zero(clear_value_size));
+
+ for (unsigned i = 0; i < 4; i++)
+ sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
+
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1);
+
+ if (!sctx->cs_clear_buffer) {
+ sctx->cs_clear_buffer = si_create_dma_compute_shader(
+ &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
+ }
+ ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
+ }
+
+ si_launch_grid_internal(sctx, &info);
+
+ enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0);
+
+ if (cache_policy != L2_BYPASS)
+ si_resource(dst)->TC_L2_dirty = true;
+
+ /* Restore states. */
+ ctx->bind_compute_state(ctx, saved_cs);
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, saved_writable_mask);
+ for (int i = 0; i < 2; i++)
+ pipe_resource_reference(&saved_sb[i].buffer, NULL);
}
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
- uint64_t offset, uint64_t size, uint32_t *clear_value,
- uint32_t clear_value_size, enum si_coherency coher,
- bool force_cpdma)
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
+ uint64_t size, uint32_t *clear_value, uint32_t clear_value_size,
+ enum si_coherency coher, bool force_cpdma)
{
- if (!size)
- return;
-
- ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4);
-
- assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
- assert(offset % clear_alignment == 0);
- assert(size % clear_alignment == 0);
- assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
-
- /* Reduce a large clear value size if possible. */
- if (clear_value_size > 4) {
- bool clear_dword_duplicated = true;
-
- /* See if we can lower large fills to dword fills. */
- for (unsigned i = 1; i < clear_value_size / 4; i++) {
- if (clear_value[0] != clear_value[i]) {
- clear_dword_duplicated = false;
- break;
- }
- }
- if (clear_dword_duplicated)
- clear_value_size = 4;
- }
-
- /* Expand a small clear value size. */
- uint32_t tmp_clear_value;
- if (clear_value_size <= 2) {
- if (clear_value_size == 1) {
- tmp_clear_value = *(uint8_t*)clear_value;
- tmp_clear_value |= (tmp_clear_value << 8) |
- (tmp_clear_value << 16) |
- (tmp_clear_value << 24);
- } else {
- tmp_clear_value = *(uint16_t*)clear_value;
- tmp_clear_value |= tmp_clear_value << 16;
- }
- clear_value = &tmp_clear_value;
- clear_value_size = 4;
- }
-
- if (clear_value_size == 12) {
- si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher);
- return;
- }
-
- uint64_t aligned_size = size & ~3ull;
- if (aligned_size >= 4) {
- /* Before GFX9, CP DMA was very slow when clearing GTT, so never
- * use CP DMA clears on those chips, because we can't be certain
- * about buffer placements.
- */
- if (clear_value_size > 4 ||
- (!force_cpdma &&
- clear_value_size == 4 &&
- offset % 4 == 0 &&
- (size > 32*1024 || sctx->chip_class <= GFX9))) {
- si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
- aligned_size, clear_value,
- clear_value_size, coher);
- } else {
- assert(clear_value_size == 4);
- si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
- aligned_size, *clear_value, 0, coher,
- get_cache_policy(sctx, coher, size));
- }
-
- offset += aligned_size;
- size -= aligned_size;
- }
-
- /* Handle non-dword alignment. */
- if (size) {
- assert(dst);
- assert(dst->target == PIPE_BUFFER);
- assert(size < 4);
-
- pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
- }
+ if (!size)
+ return;
+
+ ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4);
+
+ assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
+ assert(offset % clear_alignment == 0);
+ assert(size % clear_alignment == 0);
+ assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
+
+ /* Reduce a large clear value size if possible. */
+ if (clear_value_size > 4) {
+ bool clear_dword_duplicated = true;
+
+ /* See if we can lower large fills to dword fills. */
+ for (unsigned i = 1; i < clear_value_size / 4; i++) {
+ if (clear_value[0] != clear_value[i]) {
+ clear_dword_duplicated = false;
+ break;
+ }
+ }
+ if (clear_dword_duplicated)
+ clear_value_size = 4;
+ }
+
+ /* Expand a small clear value size. */
+ uint32_t tmp_clear_value;
+ if (clear_value_size <= 2) {
+ if (clear_value_size == 1) {
+ tmp_clear_value = *(uint8_t *)clear_value;
+ tmp_clear_value |=
+ (tmp_clear_value << 8) | (tmp_clear_value << 16) | (tmp_clear_value << 24);
+ } else {
+ tmp_clear_value = *(uint16_t *)clear_value;
+ tmp_clear_value |= tmp_clear_value << 16;
+ }
+ clear_value = &tmp_clear_value;
+ clear_value_size = 4;
+ }
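+ /* Illustration: a 16-byte clear value made of four identical dwords is
+ * reduced to a single dword above, while a 2-byte value 0x1234 is expanded
+ * to 0x12341234 (and a 1-byte 0xAB to 0xABABABAB). */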
+
+ if (clear_value_size == 12) {
+ si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher);
+ return;
+ }
+
+ uint64_t aligned_size = size & ~3ull;
+ if (aligned_size >= 4) {
+ /* Before GFX9, CP DMA was very slow when clearing GTT, so never
+ * use CP DMA clears on those chips, because we can't be certain
+ * about buffer placements.
+ */
+ if (clear_value_size > 4 || (!force_cpdma && clear_value_size == 4 && offset % 4 == 0 &&
+ (size > 32 * 1024 || sctx->chip_class <= GFX9))) {
+ si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value,
+ clear_value_size, coher);
+ } else {
+ assert(clear_value_size == 4);
+ si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset, aligned_size, *clear_value, 0,
+ coher, get_cache_policy(sctx, coher, size));
+ }
+
+ offset += aligned_size;
+ size -= aligned_size;
+ }
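+ /* Illustration: clearing 10 bytes with a 1-byte value takes the dword path
+ * above for the first 8 bytes (aligned_size = 10 & ~3ull = 8) and falls
+ * through to pipe_buffer_write below for the remaining 2 bytes. */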
+
+ /* Handle non-dword alignment. */
+ if (size) {
+ assert(dst);
+ assert(dst->target == PIPE_BUFFER);
+ assert(size < 4);
+
+ pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
+ }
}
-static void si_pipe_clear_buffer(struct pipe_context *ctx,
- struct pipe_resource *dst,
- unsigned offset, unsigned size,
- const void *clear_value,
- int clear_value_size)
+static void si_pipe_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
+ unsigned offset, unsigned size, const void *clear_value,
+ int clear_value_size)
{
- si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
- clear_value_size, SI_COHERENCY_SHADER, false);
+ si_clear_buffer((struct si_context *)ctx, dst, offset, size, (uint32_t *)clear_value,
+ clear_value_size, SI_COHERENCY_SHADER, false);
}
-void si_copy_buffer(struct si_context *sctx,
- struct pipe_resource *dst, struct pipe_resource *src,
- uint64_t dst_offset, uint64_t src_offset, unsigned size)
+void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
+ uint64_t dst_offset, uint64_t src_offset, unsigned size)
{
- if (!size)
- return;
-
- enum si_coherency coher = SI_COHERENCY_SHADER;
- enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
-
- /* Only use compute for VRAM copies on dGPUs. */
- if (sctx->screen->info.has_dedicated_vram &&
- si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
- si_resource(src)->domains & RADEON_DOMAIN_VRAM &&
- size > 32 * 1024 &&
- dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
- si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
- size, NULL, 0, coher);
- } else {
- si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
- 0, coher, cache_policy);
- }
+ if (!size)
+ return;
+
+ enum si_coherency coher = SI_COHERENCY_SHADER;
+ enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+
+ /* Only use compute for VRAM copies on dGPUs. */
+ if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
+ si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > 32 * 1024 && dst_offset % 4 == 0 &&
+ src_offset % 4 == 0 && size % 4 == 0) {
+ si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, coher);
+ } else {
+ si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, 0, coher, cache_policy);
+ }
}
-void si_compute_copy_image(struct si_context *sctx,
- struct pipe_resource *dst,
- unsigned dst_level,
- struct pipe_resource *src,
- unsigned src_level,
- unsigned dstx, unsigned dsty, unsigned dstz,
- const struct pipe_box *src_box)
+void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
+ struct pipe_resource *src, unsigned src_level, unsigned dstx,
+ unsigned dsty, unsigned dstz, const struct pipe_box *src_box)
{
- struct pipe_context *ctx = &sctx->b;
- unsigned width = src_box->width;
- unsigned height = src_box->height;
- unsigned depth = src_box->depth;
- enum pipe_format src_format = util_format_linear(src->format);
- enum pipe_format dst_format = util_format_linear(dst->format);
-
- assert(util_format_is_subsampled_422(src_format) ==
- util_format_is_subsampled_422(dst_format));
-
- if (util_format_is_subsampled_422(src_format)) {
- src_format = dst_format = PIPE_FORMAT_R32_UINT;
- /* Interpreting 422 subsampled format (16 bpp) as 32 bpp
- * should force us to divide src_box->x, dstx and width by 2.
- * But given that ac_surface allocates this format as 32 bpp
- * and that surf_size is then modified to pack the values
- * we must keep the original values to get the correct results.
- */
- }
- unsigned data[] = {src_box->x, src_box->y, src_box->z, 0,
- dstx, dsty, dstz, 0};
-
- if (width == 0 || height == 0)
- return;
-
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
- si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
-
- /* The driver doesn't decompress resources automatically here. */
- si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level,
- dstz, dstz + src_box->depth - 1);
- si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level,
- src_box->z, src_box->z + src_box->depth - 1);
-
- /* src and dst have the same number of samples. */
- si_make_CB_shader_coherent(sctx, src->nr_samples, true,
- /* Only src can have DCC.*/
- ((struct si_texture*)src)->surface.u.gfx9.dcc.pipe_aligned);
-
- struct pipe_constant_buffer saved_cb = {};
- si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
-
- struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
- struct pipe_image_view saved_image[2] = {0};
- util_copy_image_view(&saved_image[0], &images->views[0]);
- util_copy_image_view(&saved_image[1], &images->views[1]);
-
- void *saved_cs = sctx->cs_shader_state.program;
-
- struct pipe_constant_buffer cb = {};
- cb.buffer_size = sizeof(data);
- cb.user_buffer = data;
- ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
-
- struct pipe_image_view image[2] = {0};
- image[0].resource = src;
- image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
- image[0].format = src_format;
- image[0].u.tex.level = src_level;
- image[0].u.tex.first_layer = 0;
- image[0].u.tex.last_layer =
- src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
- : (unsigned)(src->array_size - 1);
- image[1].resource = dst;
- image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
- image[1].format = dst_format;
- image[1].u.tex.level = dst_level;
- image[1].u.tex.first_layer = 0;
- image[1].u.tex.last_layer =
- dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
- : (unsigned)(dst->array_size - 1);
-
- if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
- image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;
-
- /* SNORM8 blitting has precision issues on some chips. Use the SINT
- * equivalent instead, which doesn't force DCC decompression.
- * Note that some chips avoid this issue by using SDMA.
- */
- if (util_format_is_snorm8(dst->format)) {
- image[0].format = image[1].format =
- util_format_snorm8_to_sint8(dst->format);
- }
-
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);
-
- struct pipe_grid_info info = {0};
-
- if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
- if (!sctx->cs_copy_image_1d_array)
- sctx->cs_copy_image_1d_array =
- si_create_copy_image_compute_shader_1d_array(ctx);
- ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
- info.block[0] = 64;
- info.last_block[0] = width % 64;
- info.block[1] = 1;
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(width, 64);
- info.grid[1] = depth;
- info.grid[2] = 1;
- } else {
- if (!sctx->cs_copy_image)
- sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
- ctx->bind_compute_state(ctx, sctx->cs_copy_image);
- info.block[0] = 8;
- info.last_block[0] = width % 8;
- info.block[1] = 8;
- info.last_block[1] = height % 8;
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(width, 8);
- info.grid[1] = DIV_ROUND_UP(height, 8);
- info.grid[2] = depth;
- }
-
- si_launch_grid_internal(sctx, &info);
-
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
- (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
- si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
- ctx->bind_compute_state(ctx, saved_cs);
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
- ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
- for (int i = 0; i < 2; i++)
- pipe_resource_reference(&saved_image[i].resource, NULL);
- pipe_resource_reference(&saved_cb.buffer, NULL);
+ struct pipe_context *ctx = &sctx->b;
+ unsigned width = src_box->width;
+ unsigned height = src_box->height;
+ unsigned depth = src_box->depth;
+ enum pipe_format src_format = util_format_linear(src->format);
+ enum pipe_format dst_format = util_format_linear(dst->format);
+
+ assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format));
+
+ if (util_format_is_subsampled_422(src_format)) {
+ src_format = dst_format = PIPE_FORMAT_R32_UINT;
+ /* Interpreting a 422 subsampled format (16 bpp) as 32 bpp
+ * would normally require dividing src_box->x, dstx and width by 2.
+ * However, since ac_surface allocates this format as 32 bpp
+ * and surf_size is then modified to pack the values,
+ * we must keep the original values to get correct results.
+ */
+ }
+ unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};
+
+ if (width == 0 || height == 0)
+ return;
+
+ sctx->flags |=
+ SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+
+ /* The driver doesn't decompress resources automatically here. */
+ si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz,
+ dstz + src_box->depth - 1);
+ si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z,
+ src_box->z + src_box->depth - 1);
+
+ /* src and dst have the same number of samples. */
+ si_make_CB_shader_coherent(sctx, src->nr_samples, true,
+ /* Only src can have DCC. */
+ ((struct si_texture *)src)->surface.u.gfx9.dcc.pipe_aligned);
+
+ struct pipe_constant_buffer saved_cb = {};
+ si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+ struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
+ struct pipe_image_view saved_image[2] = {0};
+ util_copy_image_view(&saved_image[0], &images->views[0]);
+ util_copy_image_view(&saved_image[1], &images->views[1]);
+
+ void *saved_cs = sctx->cs_shader_state.program;
+
+ struct pipe_constant_buffer cb = {};
+ cb.buffer_size = sizeof(data);
+ cb.user_buffer = data;
+ ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
+
+ struct pipe_image_view image[2] = {0};
+ image[0].resource = src;
+ image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
+ image[0].format = src_format;
+ image[0].u.tex.level = src_level;
+ image[0].u.tex.first_layer = 0;
+ image[0].u.tex.last_layer = src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
+ : (unsigned)(src->array_size - 1);
+ image[1].resource = dst;
+ image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
+ image[1].format = dst_format;
+ image[1].u.tex.level = dst_level;
+ image[1].u.tex.first_layer = 0;
+ image[1].u.tex.last_layer = dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
+ : (unsigned)(dst->array_size - 1);
+
+ if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
+ image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;
+
+ /* SNORM8 blitting has precision issues on some chips. Use the SINT
+ * equivalent instead, which doesn't force DCC decompression.
+ * Note that some chips avoid this issue by using SDMA.
+ */
+ if (util_format_is_snorm8(dst->format)) {
+ image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format);
+ }
+
+ ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);
+
+ struct pipe_grid_info info = {0};
+
+ if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
+ if (!sctx->cs_copy_image_1d_array)
+ sctx->cs_copy_image_1d_array = si_create_copy_image_compute_shader_1d_array(ctx);
+ ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
+ info.block[0] = 64;
+ info.last_block[0] = width % 64;
+ info.block[1] = 1;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(width, 64);
+ info.grid[1] = depth;
+ info.grid[2] = 1;
+ } else {
+ if (!sctx->cs_copy_image)
+ sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
+ ctx->bind_compute_state(ctx, sctx->cs_copy_image);
+ info.block[0] = 8;
+ info.last_block[0] = width % 8;
+ info.block[1] = 8;
+ info.last_block[1] = height % 8;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(width, 8);
+ info.grid[1] = DIV_ROUND_UP(height, 8);
+ info.grid[2] = depth;
+ }
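+ /* Worked example (illustrative): a 100x50 2D copy uses 8x8 blocks, so
+ * grid = {13, 7, depth} with last_block = {100 % 8, 50 % 8} = {4, 2}
+ * covering the right and bottom edges. */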
+
+ si_launch_grid_internal(sctx, &info);
+
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
+ si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+ ctx->bind_compute_state(ctx, saved_cs);
+ ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
+ ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+ for (int i = 0; i < 2; i++)
+ pipe_resource_reference(&saved_image[i].resource, NULL);
+ pipe_resource_reference(&saved_cb.buffer, NULL);
}
void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
{
- struct pipe_context *ctx = &sctx->b;
-
- sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH |
- si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) |
- si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU);
- sctx->emit_cache_flush(sctx);
-
- /* Save states. */
- void *saved_cs = sctx->cs_shader_state.program;
- struct pipe_image_view saved_img[3] = {};
-
- for (unsigned i = 0; i < 3; i++) {
- util_copy_image_view(&saved_img[i],
- &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
- }
-
- /* Set images. */
- bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
- unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
- struct pipe_image_view img[3];
-
- assert(tex->surface.dcc_retile_map_offset && tex->surface.dcc_retile_map_offset <= UINT_MAX);
- assert(tex->surface.dcc_offset && tex->surface.dcc_offset <= UINT_MAX);
- assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX);
-
- for (unsigned i = 0; i < 3; i++) {
- img[i].resource = &tex->buffer.b.b;
- img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ;
- img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER;
- }
-
- img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT :
- PIPE_FORMAT_R32G32B32A32_UINT;
- img[0].u.buf.offset = tex->surface.dcc_retile_map_offset;
- img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4);
-
- img[1].format = PIPE_FORMAT_R8_UINT;
- img[1].u.buf.offset = tex->surface.dcc_offset;
- img[1].u.buf.size = tex->surface.dcc_size;
-
- img[2].format = PIPE_FORMAT_R8_UINT;
- img[2].u.buf.offset = tex->surface.display_dcc_offset;
- img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size;
-
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img);
-
- /* Bind the compute shader. */
- if (!sctx->cs_dcc_retile)
- sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx);
- ctx->bind_compute_state(ctx, sctx->cs_dcc_retile);
-
- /* Dispatch compute. */
- /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
- unsigned num_threads = num_elements / 4;
-
- struct pipe_grid_info info = {};
- info.block[0] = 64;
- info.block[1] = 1;
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */
- info.grid[1] = 1;
- info.grid[2] = 1;
- info.last_block[0] = num_threads % 64;
-
- si_launch_grid_internal(sctx, &info);
-
- /* Don't flush caches or wait. The driver will wait at the end of this IB,
- * and L2 will be flushed by the kernel fence.
- */
-
- /* Restore states. */
- ctx->bind_compute_state(ctx, saved_cs);
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);
-
- for (unsigned i = 0; i < 3; i++) {
- pipe_resource_reference(&saved_img[i].resource, NULL);
- }
+ struct pipe_context *ctx = &sctx->b;
+
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+ si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) |
+ si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU);
+ sctx->emit_cache_flush(sctx);
+
+ /* Save states. */
+ void *saved_cs = sctx->cs_shader_state.program;
+ struct pipe_image_view saved_img[3] = {};
+
+ for (unsigned i = 0; i < 3; i++) {
+ util_copy_image_view(&saved_img[i], &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
+ }
+
+ /* Set images. */
+ bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
+ unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
+ struct pipe_image_view img[3];
+
+ assert(tex->surface.dcc_retile_map_offset && tex->surface.dcc_retile_map_offset <= UINT_MAX);
+ assert(tex->surface.dcc_offset && tex->surface.dcc_offset <= UINT_MAX);
+ assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX);
+
+ for (unsigned i = 0; i < 3; i++) {
+ img[i].resource = &tex->buffer.b.b;
+ img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ;
+ img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER;
+ }
+
+ img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT : PIPE_FORMAT_R32G32B32A32_UINT;
+ img[0].u.buf.offset = tex->surface.dcc_retile_map_offset;
+ img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4);
+
+ img[1].format = PIPE_FORMAT_R8_UINT;
+ img[1].u.buf.offset = tex->surface.dcc_offset;
+ img[1].u.buf.size = tex->surface.dcc_size;
+
+ img[2].format = PIPE_FORMAT_R8_UINT;
+ img[2].u.buf.offset = tex->surface.display_dcc_offset;
+ img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size;
+
+ ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img);
+
+ /* Bind the compute shader. */
+ if (!sctx->cs_dcc_retile)
+ sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx);
+ ctx->bind_compute_state(ctx, sctx->cs_dcc_retile);
+
+ /* Dispatch compute. */
+ /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
+ unsigned num_threads = num_elements / 4;
+
+ struct pipe_grid_info info = {};
+ info.block[0] = 64;
+ info.block[1] = 1;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */
+ info.grid[1] = 1;
+ info.grid[2] = 1;
+ info.last_block[0] = num_threads % 64;
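+ /* Worked example (illustrative): with num_elements = 1024, each thread
+ * reads one 4-channel texel of the retile map, so num_threads = 256 and
+ * grid[0] = 4 full 64-thread groups with last_block[0] = 0. */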
+
+ si_launch_grid_internal(sctx, &info);
+
+ /* Don't flush caches or wait. The driver will wait at the end of this IB,
+ * and L2 will be flushed by the kernel fence.
+ */
+
+ /* Restore states. */
+ ctx->bind_compute_state(ctx, saved_cs);
+ ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);
+
+ for (unsigned i = 0; i < 3; i++) {
+ pipe_resource_reference(&saved_img[i].resource, NULL);
+ }
}
/* Expand FMASK to make it identity, so that image stores can ignore it. */
void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex)
{
- struct si_context *sctx = (struct si_context *)ctx;
- bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY;
- unsigned log_fragments = util_logbase2(tex->nr_storage_samples);
- unsigned log_samples = util_logbase2(tex->nr_samples);
- assert(tex->nr_samples >= 2);
-
- /* EQAA FMASK expansion is unimplemented. */
- if (tex->nr_samples != tex->nr_storage_samples)
- return;
-
- /* Flush caches and sync engines. */
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
- si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
- si_make_CB_shader_coherent(sctx, tex->nr_samples, true,
- true /* DCC is not possible with image stores */);
-
- /* Save states. */
- void *saved_cs = sctx->cs_shader_state.program;
- struct pipe_image_view saved_image = {0};
- util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]);
-
- /* Bind the image. */
- struct pipe_image_view image = {0};
- image.resource = tex;
- /* Don't set WRITE so as not to trigger FMASK expansion, causing
- * an infinite loop. */
- image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ;
- image.format = util_format_linear(tex->format);
- if (is_array)
- image.u.tex.last_layer = tex->array_size - 1;
-
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);
-
- /* Bind the shader. */
- void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array];
- if (!*shader)
- *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array);
- ctx->bind_compute_state(ctx, *shader);
-
- /* Dispatch compute. */
- struct pipe_grid_info info = {0};
- info.block[0] = 8;
- info.last_block[0] = tex->width0 % 8;
- info.block[1] = 8;
- info.last_block[1] = tex->height0 % 8;
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(tex->width0, 8);
- info.grid[1] = DIV_ROUND_UP(tex->height0, 8);
- info.grid[2] = is_array ? tex->array_size : 1;
-
- si_launch_grid_internal(sctx, &info);
-
- /* Flush caches and sync engines. */
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
- (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
- si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
-
- /* Restore previous states. */
- ctx->bind_compute_state(ctx, saved_cs);
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
- pipe_resource_reference(&saved_image.resource, NULL);
-
- /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */
+ struct si_context *sctx = (struct si_context *)ctx;
+ bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY;
+ unsigned log_fragments = util_logbase2(tex->nr_storage_samples);
+ unsigned log_samples = util_logbase2(tex->nr_samples);
+ assert(tex->nr_samples >= 2);
+
+ /* EQAA FMASK expansion is unimplemented. */
+ if (tex->nr_samples != tex->nr_storage_samples)
+ return;
+
+ /* Flush caches and sync engines. */
+ sctx->flags |=
+ SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+ si_make_CB_shader_coherent(sctx, tex->nr_samples, true,
+ true /* DCC is not possible with image stores */);
+
+ /* Save states. */
+ void *saved_cs = sctx->cs_shader_state.program;
+ struct pipe_image_view saved_image = {0};
+ util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]);
+
+ /* Bind the image. */
+ struct pipe_image_view image = {0};
+ image.resource = tex;
+ /* Don't set WRITE, so that FMASK expansion isn't triggered again,
+ * which would cause an infinite loop. */
+ image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ;
+ image.format = util_format_linear(tex->format);
+ if (is_array)
+ image.u.tex.last_layer = tex->array_size - 1;
+
+ ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);
+
+ /* Bind the shader. */
+ void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array];
+ if (!*shader)
+ *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array);
+ ctx->bind_compute_state(ctx, *shader);
+
+ /* Dispatch compute. */
+ struct pipe_grid_info info = {0};
+ info.block[0] = 8;
+ info.last_block[0] = tex->width0 % 8;
+ info.block[1] = 8;
+ info.last_block[1] = tex->height0 % 8;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(tex->width0, 8);
+ info.grid[1] = DIV_ROUND_UP(tex->height0, 8);
+ info.grid[2] = is_array ? tex->array_size : 1;
+
+ si_launch_grid_internal(sctx, &info);
+
+ /* Flush caches and sync engines. */
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
+ si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+
+ /* Restore previous states. */
+ ctx->bind_compute_state(ctx, saved_cs);
+ ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
+ pipe_resource_reference(&saved_image.resource, NULL);
+
+ /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */
#define INVALID 0 /* never used */
- static const uint64_t fmask_expand_values[][4] = {
- /* samples */
- /* 2 (8 bpp) 4 (8 bpp) 8 (8-32bpp) 16 (16-64bpp) fragments */
- {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE}, /* 1 */
- {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4}, /* 2 */
- {INVALID, 0xE4E4E4E4, 0x44443210, 0x4444444444443210}, /* 4 */
- {INVALID, INVALID, 0x76543210, 0x8888888876543210}, /* 8 */
- };
-
- /* Clear FMASK to identity. */
- struct si_texture *stex = (struct si_texture*)tex;
- si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size,
- (uint32_t*)&fmask_expand_values[log_fragments][log_samples - 1],
- 4, SI_COHERENCY_SHADER, false);
+ static const uint64_t fmask_expand_values[][4] = {
+ /* samples */
+ /* 2 (8 bpp) 4 (8 bpp) 8 (8-32bpp) 16 (16-64bpp) fragments */
+ {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE}, /* 1 */
+ {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4}, /* 2 */
+ {INVALID, 0xE4E4E4E4, 0x44443210, 0x4444444444443210}, /* 4 */
+ {INVALID, INVALID, 0x76543210, 0x8888888876543210}, /* 8 */
+ };
+
+ /* Clear FMASK to identity. */
+ struct si_texture *stex = (struct si_texture *)tex;
+ si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size,
+ (uint32_t *)&fmask_expand_values[log_fragments][log_samples - 1], 4,
+ SI_COHERENCY_SHADER, false);
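+ /* Illustration: a 4-sample, 4-fragment 2D texture has
+ * log_fragments = log_samples = 2, so FMASK is cleared to
+ * fmask_expand_values[2][1] = 0xE4E4E4E4; each byte 0xE4 = 0b11100100
+ * encodes the identity mapping sample i -> fragment i. */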
}
void si_init_compute_blit_functions(struct si_context *sctx)
{
- sctx->b.clear_buffer = si_pipe_clear_buffer;
+ sctx->b.clear_buffer = si_pipe_clear_buffer;
}
/* Clear a region of a color surface to a constant value. */
-void si_compute_clear_render_target(struct pipe_context *ctx,
- struct pipe_surface *dstsurf,
- const union pipe_color_union *color,
- unsigned dstx, unsigned dsty,
- unsigned width, unsigned height,
- bool render_condition_enabled)
+void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf,
+ const union pipe_color_union *color, unsigned dstx,
+ unsigned dsty, unsigned width, unsigned height,
+ bool render_condition_enabled)
{
- struct si_context *sctx = (struct si_context *)ctx;
- unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1;
- unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0};
-
- if (width == 0 || height == 0)
- return;
-
- /* The driver doesn't decompress resources automatically here. */
- si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA,
- dstsurf->u.tex.level, dstsurf->u.tex.first_layer,
- dstsurf->u.tex.last_layer);
-
- if (util_format_is_srgb(dstsurf->format)) {
- union pipe_color_union color_srgb;
- for (int i = 0; i < 3; i++)
- color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
- color_srgb.f[3] = color->f[3];
- memcpy(data + 4, color_srgb.ui, sizeof(color->ui));
- } else {
- memcpy(data + 4, color->ui, sizeof(color->ui));
- }
-
- sctx->render_cond_force_off = !render_condition_enabled;
-
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
- si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
- si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true,
- true /* DCC is not possible with image stores */);
-
- struct pipe_constant_buffer saved_cb = {};
- si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
-
- struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
- struct pipe_image_view saved_image = {0};
- util_copy_image_view(&saved_image, &images->views[0]);
-
- void *saved_cs = sctx->cs_shader_state.program;
-
- struct pipe_constant_buffer cb = {};
- cb.buffer_size = sizeof(data);
- cb.user_buffer = data;
- ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
-
- struct pipe_image_view image = {0};
- image.resource = dstsurf->texture;
- image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE;
- image.format = util_format_linear(dstsurf->format);
- image.u.tex.level = dstsurf->u.tex.level;
- image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */
- image.u.tex.last_layer = dstsurf->u.tex.last_layer;
-
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);
-
- struct pipe_grid_info info = {0};
-
- if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) {
- if (!sctx->cs_clear_render_target)
- sctx->cs_clear_render_target = si_clear_render_target_shader(ctx);
- ctx->bind_compute_state(ctx, sctx->cs_clear_render_target);
- info.block[0] = 8;
- info.last_block[0] = width % 8;
- info.block[1] = 8;
- info.last_block[1] = height % 8;
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(width, 8);
- info.grid[1] = DIV_ROUND_UP(height, 8);
- info.grid[2] = num_layers;
- } else {
- if (!sctx->cs_clear_render_target_1d_array)
- sctx->cs_clear_render_target_1d_array =
- si_clear_render_target_shader_1d_array(ctx);
- ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array);
- info.block[0] = 64;
- info.last_block[0] = width % 64;
- info.block[1] = 1;
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(width, 64);
- info.grid[1] = num_layers;
- info.grid[2] = 1;
- }
-
- si_launch_grid_internal(sctx, &info);
-
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
- (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
- si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
- ctx->bind_compute_state(ctx, saved_cs);
- ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
- ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
- pipe_resource_reference(&saved_image.resource, NULL);
- pipe_resource_reference(&saved_cb.buffer, NULL);
+ struct si_context *sctx = (struct si_context *)ctx;
+ unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1;
+ unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0};
+
+ if (width == 0 || height == 0)
+ return;
+
+ /* The driver doesn't decompress resources automatically here. */
+ si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, dstsurf->u.tex.level,
+ dstsurf->u.tex.first_layer, dstsurf->u.tex.last_layer);
+
+ if (util_format_is_srgb(dstsurf->format)) {
+ union pipe_color_union color_srgb;
+ for (int i = 0; i < 3; i++)
+ color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
+ color_srgb.f[3] = color->f[3];
+ memcpy(data + 4, color_srgb.ui, sizeof(color->ui));
+ } else {
+ memcpy(data + 4, color->ui, sizeof(color->ui));
+ }
+
+ sctx->render_cond_force_off = !render_condition_enabled;
+
+ sctx->flags |=
+ SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+ si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true,
+ true /* DCC is not possible with image stores */);
+
+ struct pipe_constant_buffer saved_cb = {};
+ si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+ struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
+ struct pipe_image_view saved_image = {0};
+ util_copy_image_view(&saved_image, &images->views[0]);
+
+ void *saved_cs = sctx->cs_shader_state.program;
+
+ struct pipe_constant_buffer cb = {};
+ cb.buffer_size = sizeof(data);
+ cb.user_buffer = data;
+ ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
+
+ struct pipe_image_view image = {0};
+ image.resource = dstsurf->texture;
+ image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE;
+ image.format = util_format_linear(dstsurf->format);
+ image.u.tex.level = dstsurf->u.tex.level;
+ image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */
+ image.u.tex.last_layer = dstsurf->u.tex.last_layer;
+
+ ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image);
+
+ struct pipe_grid_info info = {0};
+
+ if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) {
+ if (!sctx->cs_clear_render_target)
+ sctx->cs_clear_render_target = si_clear_render_target_shader(ctx);
+ ctx->bind_compute_state(ctx, sctx->cs_clear_render_target);
+ info.block[0] = 8;
+ info.last_block[0] = width % 8;
+ info.block[1] = 8;
+ info.last_block[1] = height % 8;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(width, 8);
+ info.grid[1] = DIV_ROUND_UP(height, 8);
+ info.grid[2] = num_layers;
+ } else {
+ if (!sctx->cs_clear_render_target_1d_array)
+ sctx->cs_clear_render_target_1d_array = si_clear_render_target_shader_1d_array(ctx);
+ ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array);
+ info.block[0] = 64;
+ info.last_block[0] = width % 64;
+ info.block[1] = 1;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(width, 64);
+ info.grid[1] = num_layers;
+ info.grid[2] = 1;
+ }
+
+ si_launch_grid_internal(sctx, &info);
+
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
+ si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+ ctx->bind_compute_state(ctx, saved_cs);
+ ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
+ ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+ pipe_resource_reference(&saved_image.resource, NULL);
+ pipe_resource_reference(&saved_cb.buffer, NULL);
}
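A standalone illustration, not part of the patch itself: si_compute_clear_render_target converts the RGB channels of the clear color from linear to sRGB on the CPU and leaves alpha untouched, because the image is bound with the linear variant of the format. The sketch below uses the standard sRGB transfer function, which is what util_format_linear_to_srgb_float is expected to compute (the helper's exact rounding may differ); the clear color is a made-up example.

#include <math.h>
#include <stdio.h>

/* Standard linear-to-sRGB encoding for one channel in [0, 1]. */
static float linear_to_srgb(float c)
{
   return c <= 0.0031308f ? 12.92f * c : 1.055f * powf(c, 1.0f / 2.4f) - 0.055f;
}

int main(void)
{
   float linear[4] = {0.5f, 0.25f, 0.0f, 0.75f}; /* hypothetical clear color */
   float srgb[4];

   for (int i = 0; i < 3; i++)
      srgb[i] = linear_to_srgb(linear[i]); /* only RGB is encoded */
   srgb[3] = linear[3];                    /* alpha stays linear */

   printf("%.4f %.4f %.4f %.4f\n", srgb[0], srgb[1], srgb[2], srgb[3]);
   return 0; /* build with -lm */
}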
*
*/
+#include "ac_llvm_cull.h"
+#include "si_build_pm4.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
-#include "si_build_pm4.h"
-#include "ac_llvm_cull.h"
-
+#include "util/fast_idiv_by_const.h"
#include "util/u_prim.h"
#include "util/u_suballoc.h"
#include "util/u_upload_mgr.h"
-#include "util/fast_idiv_by_const.h"
/* Based on:
* https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
/* At least 256 is needed for the fastest wave launch rate from compute queues
* due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */
-#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */
-#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */
-#define MAX_WAVES_PER_SH 0 /* no limit */
-#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */
+#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */
+#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */
+#define MAX_WAVES_PER_SH 0 /* no limit */
+#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */
/* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */
-#define CULL_Z 0
+#define CULL_Z 0
/* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */
-#define VERTEX_COUNTER_GDS_MODE 2
-#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */
+#define VERTEX_COUNTER_GDS_MODE 2
+#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */
/* Grouping compute dispatches for small draw calls: How many primitives from multiple
* draw calls to process by compute before signaling the gfx IB. This reduces the number
* of EOP events + REWIND packets, because they decrease performance. */
-#define PRIMS_PER_BATCH (512 * 1024)
+#define PRIMS_PER_BATCH (512 * 1024)
/* Draw call splitting at the packet level. This allows signaling the gfx IB
* for big draw calls sooner, but doesn't allow context flushes between packets.
* Primitive restart is supported. Only implemented for ordered append. */
-#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH
+#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH
/* If there is not enough ring buffer space for the current IB, split draw calls into
* this number of primitives, so that we can flush the context and get free ring space. */
-#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH
+#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH
/* Derived values. */
-#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64)
-#define SPLIT_PRIMS_PACKET_LEVEL (VERTEX_COUNTER_GDS_MODE == 2 ? \
- SPLIT_PRIMS_PACKET_LEVEL_VALUE : \
- UINT_MAX & ~(THREADGROUP_SIZE - 1))
+#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64)
+#define SPLIT_PRIMS_PACKET_LEVEL \
+ (VERTEX_COUNTER_GDS_MODE == 2 ? SPLIT_PRIMS_PACKET_LEVEL_VALUE \
+ : UINT_MAX & ~(THREADGROUP_SIZE - 1))
-#define REWIND_SIGNAL_BIT 0x80000000
+#define REWIND_SIGNAL_BIT 0x80000000
/* For emulating the rewind packet on CI. */
-#define FORCE_REWIND_EMULATION 0
+#define FORCE_REWIND_EMULATION 0
-void si_initialize_prim_discard_tunables(struct si_screen *sscreen,
- bool is_aux_context,
- unsigned *prim_discard_vertex_count_threshold,
- unsigned *index_ring_size_per_ib)
+void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
+ unsigned *prim_discard_vertex_count_threshold,
+ unsigned *index_ring_size_per_ib)
{
- *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
-
- if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */
- !sscreen->info.has_gds_ordered_append ||
- sscreen->debug_flags & DBG(NO_PD) ||
- is_aux_context)
- return;
-
- /* TODO: enable this after the GDS kernel memory management is fixed */
- bool enable_on_pro_graphics_by_default = false;
-
- if (sscreen->debug_flags & DBG(ALWAYS_PD) ||
- sscreen->debug_flags & DBG(PD) ||
- (enable_on_pro_graphics_by_default &&
- sscreen->info.is_pro_graphics &&
- (sscreen->info.family == CHIP_BONAIRE ||
- sscreen->info.family == CHIP_HAWAII ||
- sscreen->info.family == CHIP_TONGA ||
- sscreen->info.family == CHIP_FIJI ||
- sscreen->info.family == CHIP_POLARIS10 ||
- sscreen->info.family == CHIP_POLARIS11 ||
- sscreen->info.family == CHIP_VEGA10 ||
- sscreen->info.family == CHIP_VEGA20))) {
- *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
-
- if (sscreen->debug_flags & DBG(ALWAYS_PD))
- *prim_discard_vertex_count_threshold = 0; /* always enable */
-
- const uint32_t MB = 1024 * 1024;
- const uint64_t GB = 1024 * 1024 * 1024;
-
- /* The total size is double this per context.
- * Greater numbers allow bigger gfx IBs.
- */
- if (sscreen->info.vram_size <= 2 * GB)
- *index_ring_size_per_ib = 64 * MB;
- else if (sscreen->info.vram_size <= 4 * GB)
- *index_ring_size_per_ib = 128 * MB;
- else
- *index_ring_size_per_ib = 256 * MB;
- }
+ *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
+
+ if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */
+ !sscreen->info.has_gds_ordered_append || sscreen->debug_flags & DBG(NO_PD) || is_aux_context)
+ return;
+
+ /* TODO: enable this after the GDS kernel memory management is fixed */
+ bool enable_on_pro_graphics_by_default = false;
+
+ if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) ||
+ (enable_on_pro_graphics_by_default && sscreen->info.is_pro_graphics &&
+ (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_HAWAII ||
+ sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI ||
+ sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 ||
+ sscreen->info.family == CHIP_VEGA10 || sscreen->info.family == CHIP_VEGA20))) {
+ *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */
+
+ if (sscreen->debug_flags & DBG(ALWAYS_PD))
+ *prim_discard_vertex_count_threshold = 0; /* always enable */
+
+ const uint32_t MB = 1024 * 1024;
+ const uint64_t GB = 1024 * 1024 * 1024;
+
+ /* The total size is double this per context.
+ * Greater numbers allow bigger gfx IBs.
+ */
+ if (sscreen->info.vram_size <= 2 * GB)
+ *index_ring_size_per_ib = 64 * MB;
+ else if (sscreen->info.vram_size <= 4 * GB)
+ *index_ring_size_per_ib = 128 * MB;
+ else
+ *index_ring_size_per_ib = 256 * MB;
+ }
}
/* Opcode can be "add" or "swap". */
-static LLVMValueRef
-si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode,
- LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index,
- bool release, bool done)
+static LLVMValueRef si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode,
+ LLVMValueRef m0, LLVMValueRef value,
+ unsigned ordered_count_index, bool release, bool done)
{
- if (ctx->screen->info.chip_class >= GFX10)
- ordered_count_index |= 1 << 24; /* number of dwords == 1 */
-
- LLVMValueRef args[] = {
- LLVMBuildIntToPtr(ctx->ac.builder, m0,
- LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""),
- value,
- LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */
- ctx->ac.i32_0, /* scope */
- ctx->ac.i1false, /* volatile */
- LLVMConstInt(ctx->ac.i32, ordered_count_index, 0),
- LLVMConstInt(ctx->ac.i1, release, 0),
- LLVMConstInt(ctx->ac.i1, done, 0),
- };
-
- char intrinsic[64];
- snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode);
- return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0);
+ if (ctx->screen->info.chip_class >= GFX10)
+ ordered_count_index |= 1 << 24; /* number of dwords == 1 */
+
+ LLVMValueRef args[] = {
+ LLVMBuildIntToPtr(ctx->ac.builder, m0, LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""),
+ value,
+ LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */
+ ctx->ac.i32_0, /* scope */
+ ctx->ac.i1false, /* volatile */
+ LLVMConstInt(ctx->ac.i32, ordered_count_index, 0),
+ LLVMConstInt(ctx->ac.i1, release, 0),
+ LLVMConstInt(ctx->ac.i1, done, 0),
+ };
+
+ char intrinsic[64];
+ snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode);
+ return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0);
}
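A standalone illustration, not part of the patch itself: si_build_ds_ordered_op wraps the llvm.amdgcn.ds.ordered.add/swap intrinsic that the shader later uses so each wave can reserve a contiguous range of the output index buffer. Ignoring the launch-order guarantee that GDS ordered append provides, the underlying pattern is a fetch-and-add allocator: each participant atomically adds its element count to a shared counter and receives the start of a private range. The CPU-side sketch below shows only that reservation pattern (pthreads and C11 atomics); it does not model the GPU intrinsic or its ordering.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_WORKERS 4
#define ITEMS_PER_WORKER 8

static atomic_uint counter; /* plays the role of the shared vertex/primitive counter */
static unsigned output[NUM_WORKERS * ITEMS_PER_WORKER];

static void *worker(void *arg)
{
   unsigned id = (unsigned)(uintptr_t)arg;

   /* Atomically reserve a contiguous range; "start" is unique per worker. */
   unsigned start = atomic_fetch_add(&counter, ITEMS_PER_WORKER);

   for (unsigned i = 0; i < ITEMS_PER_WORKER; i++)
      output[start + i] = id; /* fill only the reserved slots */
   return NULL;
}

int main(void)
{
   pthread_t threads[NUM_WORKERS];

   atomic_init(&counter, 0);

   for (unsigned i = 0; i < NUM_WORKERS; i++)
      pthread_create(&threads[i], NULL, worker, (void *)(uintptr_t)i);
   for (unsigned i = 0; i < NUM_WORKERS; i++)
      pthread_join(threads[i], NULL);

   /* Every slot was written exactly once even though ranges were claimed concurrently. */
   for (unsigned i = 0; i < NUM_WORKERS * ITEMS_PER_WORKER; i++)
      printf("%u%c", output[i], (i + 1) % ITEMS_PER_WORKER ? ' ' : '\n');
   return 0; /* build with -pthread */
}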
static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr)
{
- uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32;
- ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, "");
- ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), "");
- return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
- LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), "");
+ uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32;
+ ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, "");
+ ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), "");
+ return LLVMBuildIntToPtr(ctx->ac.builder, ptr,
+ LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), "");
}
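A standalone illustration, not part of the patch itself: si_expand_32bit_pointer rebuilds a 64-bit GPU virtual address by placing the fixed address32_hi value in the upper half and the 32-bit shader argument in the lower half. The same operation in plain C, with made-up example values:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   uint32_t address32_hi = 0x0000ffffu; /* hypothetical upper 32 bits reported by the kernel */
   uint32_t gpu_va_lo = 0x12345678u;    /* 32-bit address carried in the shader argument */

   uint64_t gpu_va = ((uint64_t)address32_hi << 32) | gpu_va_lo;

   printf("0x%016" PRIx64 "\n", gpu_va); /* prints 0x0000ffff12345678 */
   return 0;
}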
struct si_thread0_section {
- struct si_shader_context *ctx;
- LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
- LLVMValueRef saved_exec;
+ struct si_shader_context *ctx;
+ LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */
+ LLVMValueRef saved_exec;
};
/* Enter a section that only executes on thread 0. */
static void si_enter_thread0_section(struct si_shader_context *ctx,
- struct si_thread0_section *section,
- LLVMValueRef thread_id)
+ struct si_thread0_section *section, LLVMValueRef thread_id)
{
- section->ctx = ctx;
- section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0");
-
- /* This IF has 4 instructions:
- * v_and_b32_e32 v, 63, v ; get the thread ID
- * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0
- * s_and_saveexec_b64 s, vcc
- * s_cbranch_execz BB0_4
- *
- * It could just be s_and_saveexec_b64 s, 1.
- */
- ac_build_ifcc(&ctx->ac,
- LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id,
- ctx->ac.i32_0, ""), 12601);
+ section->ctx = ctx;
+ section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0");
+
+ /* This IF has 4 instructions:
+ * v_and_b32_e32 v, 63, v ; get the thread ID
+ * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0
+ * s_and_saveexec_b64 s, vcc
+ * s_cbranch_execz BB0_4
+ *
+ * It could just be s_and_saveexec_b64 s, 1.
+ */
+ ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, ""),
+ 12601);
}
/* Exit a section that only executes on thread 0 and broadcast the result
* to all threads. */
-static void si_exit_thread0_section(struct si_thread0_section *section,
- LLVMValueRef *result)
+static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result)
{
- struct si_shader_context *ctx = section->ctx;
+ struct si_shader_context *ctx = section->ctx;
- LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);
+ LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result);
- ac_build_endif(&ctx->ac, 12601);
+ ac_build_endif(&ctx->ac, 12601);
- /* Broadcast the result from thread 0 to all threads. */
- *result = ac_build_readlane(&ctx->ac,
- LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
+ /* Broadcast the result from thread 0 to all threads. */
+ *result =
+ ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL);
}
void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
{
- struct si_shader_key *key = &ctx->shader->key;
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef vs = ctx->main_fn;
-
- /* Always inline the VS function. */
- ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
- LLVMSetLinkage(vs, LLVMPrivateLinkage);
-
- enum ac_arg_type const_desc_type;
- if (ctx->shader->selector->info.const_buffers_declared == 1 &&
- ctx->shader->selector->info.shader_buffers_declared == 0)
- const_desc_type = AC_ARG_CONST_FLOAT_PTR;
- else
- const_desc_type = AC_ARG_CONST_DESC_PTR;
-
- memset(&ctx->args, 0, sizeof(ctx->args));
-
- struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
- struct ac_arg param_vb_desc, param_const_desc;
- struct ac_arg param_base_vertex, param_start_instance;
- struct ac_arg param_block_id, param_local_id, param_ordered_wave_id;
- struct ac_arg param_restart_index, param_smallprim_precision;
- struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
- struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr;
-
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
- &param_index_buffers_and_constants);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_counter);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_last_wave_prim_id);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_count_addr);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
- &param_vb_desc);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type,
- &param_const_desc);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
- &param_sampler_desc);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_instance);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_restart_index);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &param_smallprim_precision);
-
- /* Block ID and thread ID inputs. */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
- if (VERTEX_COUNTER_GDS_MODE == 2)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_ordered_wave_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &param_local_id);
-
- /* Create the compute shader function. */
- unsigned old_type = ctx->type;
- ctx->type = PIPE_SHADER_COMPUTE;
- si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
- ctx->type = old_type;
-
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- ac_llvm_add_target_dep_function_attr(ctx->main_fn,
- "amdgpu-gds-size", 256);
- } else if (VERTEX_COUNTER_GDS_MODE == 1) {
- ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size",
- GDS_SIZE_UNORDERED);
- }
-
- /* Assemble parameters for VS. */
- LLVMValueRef vs_params[16];
- unsigned num_vs_params = 0;
- unsigned param_vertex_id, param_instance_id;
-
- vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
- vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
- vs_params[num_vs_params++] = LLVMConstInt(ctx->ac.i32,
- S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
- vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */
- vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);
-
- vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
- vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
- vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */
- vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */
-
- assert(num_vs_params <= ARRAY_SIZE(vs_params));
- assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
-
- /* Load descriptors. (load 8 dwords at once) */
- LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
-
- LLVMValueRef index_buffers_and_constants = ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
- tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
- ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
- tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0);
-
- for (unsigned i = 0; i < 8; i++)
- desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
-
- input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
- output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
-
- /* Compute PrimID and InstanceID. */
- LLVMValueRef global_thread_id =
- ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
- LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0),
- ac_get_arg(&ctx->ac, param_local_id));
- LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
- LLVMValueRef instance_id = ctx->ac.i32_0;
-
- if (key->opt.cs_instancing) {
- LLVMValueRef num_prims_udiv_terms =
- ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
- LLVMValueRef num_prims_udiv_multiplier =
- ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
- /* Unpack num_prims_udiv_terms. */
- LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms,
- LLVMConstInt(ctx->ac.i32, 0x1f, 0), "");
- LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms,
- LLVMConstInt(ctx->ac.i32, 5, 0), "");
- /* Divide the total prim_id by the number of prims per instance. */
- instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id,
- num_prims_udiv_multiplier,
- post_shift);
- /* Compute the remainder. */
- prim_id = LLVMBuildSub(builder, prim_id,
- LLVMBuildMul(builder, instance_id,
- prims_per_instance, ""), "");
- }
-
- /* Generate indices (like a non-indexed draw call). */
- LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)};
- unsigned vertices_per_prim = 3;
-
- switch (key->opt.cs_prim_type) {
- case PIPE_PRIM_TRIANGLES:
- for (unsigned i = 0; i < 3; i++) {
- index[i] = ac_build_imad(&ctx->ac, prim_id,
- LLVMConstInt(ctx->ac.i32, 3, 0),
- LLVMConstInt(ctx->ac.i32, i, 0));
- }
- break;
- case PIPE_PRIM_TRIANGLE_STRIP:
- for (unsigned i = 0; i < 3; i++) {
- index[i] = LLVMBuildAdd(builder, prim_id,
- LLVMConstInt(ctx->ac.i32, i, 0), "");
- }
- break;
- case PIPE_PRIM_TRIANGLE_FAN:
- /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
- * and rasterizer as a normal triangle, so we need to put the provoking
- * vertex into the correct index variable and preserve orientation at the same time.
- * gl_VertexID is preserved, because it's equal to the index.
- */
- if (key->opt.cs_provoking_vertex_first) {
- index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
- index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
- index[2] = ctx->ac.i32_0;
- } else {
- index[0] = ctx->ac.i32_0;
- index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
- index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
- }
- break;
- default:
- unreachable("unexpected primitive type");
- }
-
- /* Fetch indices. */
- if (key->opt.cs_indexed) {
- for (unsigned i = 0; i < 3; i++) {
- index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf,
- index[i], ctx->ac.i32_0, 1,
- 0, true);
- index[i] = ac_to_integer(&ctx->ac, index[i]);
- }
- }
-
- LLVMValueRef ordered_wave_id = NULL;
-
- /* Extract the ordered wave ID. */
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id);
- ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id,
- LLVMConstInt(ctx->ac.i32, 6, 0), "");
- ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id,
- LLVMConstInt(ctx->ac.i32, 0xfff, 0), "");
- }
- LLVMValueRef thread_id =
- LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
- LLVMConstInt(ctx->ac.i32, 63, 0), "");
-
- /* Every other triangle in a strip has a reversed vertex order, so we
- * need to swap vertices of odd primitives to get the correct primitive
- * orientation when converting triangle strips to triangles. Primitive
- * restart complicates it, because a strip can start anywhere.
- */
- LLVMValueRef prim_restart_accepted = ctx->ac.i1true;
- LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);
-
- if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
- /* Without primitive restart, odd primitives have reversed orientation.
- * Only primitive restart can flip it with respect to the first vertex
- * of the draw call.
- */
- LLVMValueRef first_is_odd = ctx->ac.i1false;
-
- /* Handle primitive restart. */
- if (key->opt.cs_primitive_restart) {
- /* Get the GDS primitive restart continue flag and clear
- * the flag in vertex_counter. This flag is used when the draw
- * call was split and we need to load the primitive orientation
- * flag from GDS for the first wave too.
- */
- LLVMValueRef gds_prim_restart_continue =
- LLVMBuildLShr(builder, vertex_counter,
- LLVMConstInt(ctx->ac.i32, 31, 0), "");
- gds_prim_restart_continue =
- LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, "");
- vertex_counter = LLVMBuildAnd(builder, vertex_counter,
- LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), "");
-
- LLVMValueRef index0_is_reset;
-
- for (unsigned i = 0; i < 3; i++) {
- LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
- ac_get_arg(&ctx->ac, param_restart_index),
- "");
- if (i == 0)
- index0_is_reset = LLVMBuildNot(builder, not_reset, "");
- prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted,
- not_reset, "");
- }
-
- /* If the previous waves flip the primitive orientation
- * of the current triangle strip, it will be stored in GDS.
- *
- * Sometimes the correct orientation is not needed, in which case
- * we don't need to execute this.
- */
- if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
- /* If there are reset indices in this wave, get the thread index
- * where the most recent strip starts relative to each thread.
- */
- LLVMValueRef preceding_threads_mask =
- LLVMBuildSub(builder,
- LLVMBuildShl(builder, ctx->ac.i64_1,
- LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""),
- ctx->ac.i64_1, "");
-
- LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
- LLVMValueRef preceding_reset_threadmask =
- LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
- LLVMValueRef strip_start =
- ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
- strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, "");
-
- /* This flips the orientatino based on reset indices within this wave only. */
- first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, "");
-
- LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
- LLVMValueRef is_first_wave, current_wave_resets_index;
-
- /* Get the thread index where the last strip starts in this wave.
- *
- * If the last strip doesn't start in this wave, the thread index
- * will be 0.
- *
- * If the last strip starts in the next wave, the thread index will
- * be 64.
- */
- last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
- last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, "");
-
- struct si_thread0_section section;
- si_enter_thread0_section(ctx, &section, thread_id);
-
- /* This must be done in the thread 0 section, because
- * we expect PrimID to be 0 for the whole first wave
- * in this expression.
- *
- * NOTE: This will need to be different if we wanna support
- * instancing with primitive restart.
- */
- is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, "");
- is_first_wave = LLVMBuildAnd(builder, is_first_wave,
- LLVMBuildNot(builder,
- gds_prim_restart_continue, ""), "");
- current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE,
- last_strip_start, ctx->ac.i32_0, "");
-
- ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state");
-
- /* Save the last strip start primitive index in GDS and read
- * the value that previous waves stored.
- *
- * if (is_first_wave || current_wave_resets_strip)
- * // Read the value that previous waves stored and store a new one.
- * first_is_odd = ds.ordered.swap(last_strip_start);
- * else
- * // Just read the value that previous waves stored.
- * first_is_odd = ds.ordered.add(0);
- */
- ac_build_ifcc(&ctx->ac,
- LLVMBuildOr(builder, is_first_wave,
- current_wave_resets_index, ""), 12602);
- {
- /* The GDS address is always 0 with ordered append. */
- tmp = si_build_ds_ordered_op(ctx, "swap",
- ordered_wave_id, last_strip_start,
- 1, true, false);
- LLVMBuildStore(builder, tmp, ret);
- }
- ac_build_else(&ctx->ac, 12603);
- {
- /* Just read the value from GDS. */
- tmp = si_build_ds_ordered_op(ctx, "add",
- ordered_wave_id, ctx->ac.i32_0,
- 1, true, false);
- LLVMBuildStore(builder, tmp, ret);
- }
- ac_build_endif(&ctx->ac, 12602);
-
- prev_wave_state = LLVMBuildLoad(builder, ret, "");
- /* Ignore the return value if this is the first wave. */
- prev_wave_state = LLVMBuildSelect(builder, is_first_wave,
- ctx->ac.i32_0, prev_wave_state, "");
- si_exit_thread0_section(&section, &prev_wave_state);
- prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, "");
-
- /* If the strip start appears to be on thread 0 for the current primitive
- * (meaning the reset index is not present in this wave and might have
- * appeared in previous waves), use the value from GDS to determine
- * primitive orientation.
- *
- * If the strip start is in this wave for the current primitive, use
- * the value from the current wave to determine primitive orientation.
- */
- LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ,
- strip_start, ctx->ac.i32_0, "");
- first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state,
- first_is_odd, "");
- }
- }
- /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
- LLVMValueRef prim_is_odd =
- LLVMBuildXor(builder, first_is_odd,
- LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), "");
-
- /* Convert triangle strip indices to triangle indices. */
- ac_build_triangle_strip_indices_to_triangle(&ctx->ac, prim_is_odd,
- LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0),
- index);
- }
-
- /* Execute the vertex shader for each vertex to get vertex positions. */
- LLVMValueRef pos[3][4];
- for (unsigned i = 0; i < vertices_per_prim; i++) {
- vs_params[param_vertex_id] = index[i];
- vs_params[param_instance_id] = instance_id;
-
- LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
- for (unsigned chan = 0; chan < 4; chan++)
- pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
- }
-
- /* Divide XYZ by W. */
- for (unsigned i = 0; i < vertices_per_prim; i++) {
- for (unsigned chan = 0; chan < 3; chan++)
- pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
- }
-
- /* Load the viewport state. */
- LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
- LLVMConstInt(ctx->ac.i32, 2, 0));
- vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
- LLVMValueRef vp_scale[2], vp_translate[2];
- vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
- vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
- vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
- vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
-
- /* Do culling. */
- struct ac_cull_options options = {};
- options.cull_front = key->opt.cs_cull_front;
- options.cull_back = key->opt.cs_cull_back;
- options.cull_view_xy = true;
- options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
- options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
- options.cull_small_prims = true;
- options.cull_zero_area = true;
- options.cull_w = true;
- options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;
-
- LLVMValueRef accepted =
- ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted,
- vp_scale, vp_translate,
- ac_get_arg(&ctx->ac, param_smallprim_precision),
- &options);
-
- ac_build_optimization_barrier(&ctx->ac, &accepted);
- LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
-
- /* Count the number of active threads by doing bitcount(accepted). */
- LLVMValueRef num_prims_accepted =
- ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->ac.i64,
- &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
- num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, "");
-
- LLVMValueRef start;
-
- /* Execute atomic_add on the vertex count. */
- struct si_thread0_section section;
- si_enter_thread0_section(ctx, &section, thread_id);
- {
- if (VERTEX_COUNTER_GDS_MODE == 0) {
- LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
- LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
- vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
- start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
- vertex_counter, num_indices,
- LLVMAtomicOrderingMonotonic, false);
- } else if (VERTEX_COUNTER_GDS_MODE == 1) {
- LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted,
- LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
- vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
- LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), "");
- start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
- vertex_counter, num_indices,
- LLVMAtomicOrderingMonotonic, false);
- } else if (VERTEX_COUNTER_GDS_MODE == 2) {
- LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-
- /* If the draw call was split into multiple subdraws, each using
- * a separate draw packet, we need to start counting from 0 for
- * the first compute wave of the subdraw.
- *
- * vertex_counter contains the primitive ID of the first thread
- * in the first wave.
- *
- * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
- */
- LLVMValueRef is_first_wave =
- LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
- vertex_counter, "");
-
- /* Store the primitive count for ordered append, not vertex count.
- * The idea is to avoid GDS initialization via CP DMA. The shader
- * effectively stores the first count using "swap".
- *
- * if (first_wave) {
- * ds.ordered.swap(num_prims_accepted); // store the first primitive count
- * previous = 0;
- * } else {
- * previous = ds.ordered.add(num_prims_accepted) // add the primitive count
- * }
- */
- ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
- {
- /* The GDS address is always 0 with ordered append. */
- si_build_ds_ordered_op(ctx, "swap", ordered_wave_id,
- num_prims_accepted, 0, true, true);
- LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store);
- }
- ac_build_else(&ctx->ac, 12605);
- {
- LLVMBuildStore(builder,
- si_build_ds_ordered_op(ctx, "add", ordered_wave_id,
- num_prims_accepted, 0,
- true, true),
- tmp_store);
- }
- ac_build_endif(&ctx->ac, 12604);
-
- start = LLVMBuildLoad(builder, tmp_store, "");
- }
- }
- si_exit_thread0_section(&section, &start);
-
- /* Write the final vertex count to memory. An EOS/EOP event could do this,
- * but those events are super slow and should be avoided if performance
- * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
- * event like this.
- */
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- ac_build_ifcc(&ctx->ac,
- LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
- ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""),
- 12606);
- LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
- count = LLVMBuildMul(builder, count,
- LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
-
- /* GFX8 needs to disable caching, so that the CP can see the stored value.
- * MTYPE=3 bypasses TC L2.
- */
- if (ctx->screen->info.chip_class <= GFX8) {
- LLVMValueRef desc[] = {
- ac_get_arg(&ctx->ac, param_vertex_count_addr),
- LLVMConstInt(ctx->ac.i32,
- S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
- LLVMConstInt(ctx->ac.i32, 4, 0),
- LLVMConstInt(ctx->ac.i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
- S_008F0C_MTYPE(3 /* uncached */), 0),
- };
- LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
- ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0,
- ctx->ac.i32_0, 0, ac_glc | ac_slc);
- } else {
- LLVMBuildStore(builder, count,
- si_expand_32bit_pointer(ctx,
- ac_get_arg(&ctx->ac,
- param_vertex_count_addr)));
- }
- ac_build_endif(&ctx->ac, 12606);
- } else {
- /* For unordered modes that increment a vertex count instead of
- * primitive count, convert it into the primitive index.
- */
- start = LLVMBuildUDiv(builder, start,
- LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
- }
-
- /* Now we need to store the indices of accepted primitives into
- * the output index buffer.
- */
- ac_build_ifcc(&ctx->ac, accepted, 16607);
- {
- /* Get the number of bits set before the index of this thread. */
- LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
-
- /* We have lowered instancing. Pack the instance ID into vertex ID. */
- if (key->opt.cs_instancing) {
- instance_id = LLVMBuildShl(builder, instance_id,
- LLVMConstInt(ctx->ac.i32, 16, 0), "");
-
- for (unsigned i = 0; i < vertices_per_prim; i++)
- index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
- }
-
- if (VERTEX_COUNTER_GDS_MODE == 2) {
- /* vertex_counter contains the first primitive ID
- * for this dispatch. If the draw call was split into
- * multiple subdraws, the first primitive ID is > 0
- * for subsequent subdraws. Each subdraw uses a different
- * portion of the output index buffer. Offset the store
- * vindex by the first primitive ID to get the correct
- * store address for the subdraw.
- */
- start = LLVMBuildAdd(builder, start, vertex_counter, "");
- }
-
- /* Write indices for accepted primitives. */
- LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
- LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
-
- if (!ac_has_vec3_support(ctx->ac.chip_class, true))
- vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
-
- ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata,
- vindex, ctx->ac.i32_0, 3,
- ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
- }
- ac_build_endif(&ctx->ac, 16607);
-
- LLVMBuildRetVoid(builder);
+ struct si_shader_key *key = &ctx->shader->key;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef vs = ctx->main_fn;
+
+ /* Always inline the VS function. */
+ ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE);
+ LLVMSetLinkage(vs, LLVMPrivateLinkage);
+
+ enum ac_arg_type const_desc_type;
+ if (ctx->shader->selector->info.const_buffers_declared == 1 &&
+ ctx->shader->selector->info.shader_buffers_declared == 0)
+ const_desc_type = AC_ARG_CONST_FLOAT_PTR;
+ else
+ const_desc_type = AC_ARG_CONST_DESC_PTR;
+
+ memset(&ctx->args, 0, sizeof(ctx->args));
+
+ struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
+ struct ac_arg param_vb_desc, param_const_desc;
+ struct ac_arg param_base_vertex, param_start_instance;
+ struct ac_arg param_block_id, param_local_id, param_ordered_wave_id;
+ struct ac_arg param_restart_index, param_smallprim_precision;
+ struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
+ struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr;
+
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
+              &param_index_buffers_and_constants);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_counter);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_last_wave_prim_id);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_vertex_count_addr);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &param_vb_desc);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, &param_const_desc);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, &param_sampler_desc);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_base_vertex);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_start_instance);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_restart_index);
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &param_smallprim_precision);
+
+ /* Block ID and thread ID inputs. */
+   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
+ if (VERTEX_COUNTER_GDS_MODE == 2)
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_ordered_wave_id);
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &param_local_id);
+
+ /* Create the compute shader function. */
+ unsigned old_type = ctx->type;
+ ctx->type = PIPE_SHADER_COMPUTE;
+ si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE);
+ ctx->type = old_type;
+
+ if (VERTEX_COUNTER_GDS_MODE == 2) {
+ ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);
+ } else if (VERTEX_COUNTER_GDS_MODE == 1) {
+ ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", GDS_SIZE_UNORDERED);
+ }
+
+ /* Assemble parameters for VS. */
+ LLVMValueRef vs_params[16];
+ unsigned num_vs_params = 0;
+ unsigned param_vertex_id, param_instance_id;
+
+ vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */
+ vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */
+ vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc);
+ vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc);
+ vs_params[num_vs_params++] =
+ LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0);
+ vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex);
+ vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance);
+ vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */
+ vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc);
+
+ vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */
+ vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */
+ vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */
+ vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */
+
+ assert(num_vs_params <= ARRAY_SIZE(vs_params));
+ assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs))));
+
+ /* Load descriptors. (load 8 dwords at once) */
+ LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8];
+
+ LLVMValueRef index_buffers_and_constants =
+ ac_get_arg(&ctx->ac, param_index_buffers_and_constants);
+ tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants,
+ ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
+ tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0);
+
+ for (unsigned i = 0; i < 8; i++)
+ desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i);
+
+ input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4);
+ output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4);
+
+ /* Compute PrimID and InstanceID. */
+ LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id),
+ LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0),
+ ac_get_arg(&ctx->ac, param_local_id));
+ LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */
+ LLVMValueRef instance_id = ctx->ac.i32_0;
+
+ if (key->opt.cs_instancing) {
+ LLVMValueRef num_prims_udiv_terms = ac_get_arg(&ctx->ac, param_num_prims_udiv_terms);
+ LLVMValueRef num_prims_udiv_multiplier =
+ ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier);
+ /* Unpack num_prims_udiv_terms. */
+ LLVMValueRef post_shift =
+ LLVMBuildAnd(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 0x1f, 0), "");
+ LLVMValueRef prims_per_instance =
+ LLVMBuildLShr(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 5, 0), "");
+ /* Divide the total prim_id by the number of prims per instance. */
+ instance_id =
+ ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, post_shift);
+ /* Compute the remainder. */
+ prim_id = LLVMBuildSub(builder, prim_id,
+ LLVMBuildMul(builder, instance_id, prims_per_instance, ""), "");
+ }
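A standalone illustration, not part of the patch itself: the cs_instancing path above avoids an integer-divide instruction by using a multiplier and post-shift precomputed on the CPU (util/fast_idiv_by_const), computing the quotient as a multiply-high plus shift and the remainder by multiply-and-subtract. The exact parameter encoding expected by ac_build_fast_udiv_u31_d_not_one is not reproduced here; the sketch below only demonstrates the multiply-high idea with the well-known constant for dividing by 3.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* n / 3 for any 32-bit n, using multiply-high + shift instead of a divide.
 * 0xAAAAAAAB is the classic reciprocal constant for 3 (ceil(2^33 / 3)). */
static uint32_t div3(uint32_t n)
{
   return (uint32_t)(((uint64_t)n * 0xAAAAAAABu) >> 33);
}

int main(void)
{
   uint32_t tests[] = {0, 1, 2, 3, 7, 100, 3000000001u, UINT32_MAX};

   for (unsigned i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
      uint32_t n = tests[i];
      uint32_t q = div3(n);
      uint32_t r = n - q * 3; /* remainder by multiply-and-subtract, like prim_id above */
      printf("%" PRIu32 " = 3 * %" PRIu32 " + %" PRIu32 " (%s)\n", n, q, r,
             q == n / 3 ? "ok" : "MISMATCH");
   }
   return 0;
}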
+
+ /* Generate indices (like a non-indexed draw call). */
+ LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)};
+ unsigned vertices_per_prim = 3;
+
+ switch (key->opt.cs_prim_type) {
+ case PIPE_PRIM_TRIANGLES:
+ for (unsigned i = 0; i < 3; i++) {
+ index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0),
+ LLVMConstInt(ctx->ac.i32, i, 0));
+ }
+ break;
+ case PIPE_PRIM_TRIANGLE_STRIP:
+ for (unsigned i = 0; i < 3; i++) {
+ index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), "");
+ }
+ break;
+ case PIPE_PRIM_TRIANGLE_FAN:
+ /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper
+ * and rasterizer as a normal triangle, so we need to put the provoking
+ * vertex into the correct index variable and preserve orientation at the same time.
+ * gl_VertexID is preserved, because it's equal to the index.
+ */
+ if (key->opt.cs_provoking_vertex_first) {
+ index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
+ index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
+ index[2] = ctx->ac.i32_0;
+ } else {
+ index[0] = ctx->ac.i32_0;
+ index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), "");
+ index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), "");
+ }
+ break;
+ default:
+ unreachable("unexpected primitive type");
+ }
+
+ /* Fetch indices. */
+ if (key->opt.cs_indexed) {
+ for (unsigned i = 0; i < 3; i++) {
+ index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0,
+ 1, 0, true);
+ index[i] = ac_to_integer(&ctx->ac, index[i]);
+ }
+ }
+
+ LLVMValueRef ordered_wave_id = NULL;
+
+ /* Extract the ordered wave ID. */
+ if (VERTEX_COUNTER_GDS_MODE == 2) {
+ ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id);
+ ordered_wave_id =
+ LLVMBuildLShr(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 6, 0), "");
+ ordered_wave_id =
+ LLVMBuildAnd(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 0xfff, 0), "");
+ }
+ LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id),
+ LLVMConstInt(ctx->ac.i32, 63, 0), "");
+
+ /* Every other triangle in a strip has a reversed vertex order, so we
+ * need to swap vertices of odd primitives to get the correct primitive
+ * orientation when converting triangle strips to triangles. Primitive
+ * restart complicates it, because a strip can start anywhere.
+ */
+ LLVMValueRef prim_restart_accepted = ctx->ac.i1true;
+ LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter);
+
+ if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) {
+ /* Without primitive restart, odd primitives have reversed orientation.
+ * Only primitive restart can flip it with respect to the first vertex
+ * of the draw call.
+ */
+ LLVMValueRef first_is_odd = ctx->ac.i1false;
+
+ /* Handle primitive restart. */
+ if (key->opt.cs_primitive_restart) {
+ /* Get the GDS primitive restart continue flag and clear
+ * the flag in vertex_counter. This flag is used when the draw
+ * call was split and we need to load the primitive orientation
+ * flag from GDS for the first wave too.
+ */
+ LLVMValueRef gds_prim_restart_continue =
+ LLVMBuildLShr(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 31, 0), "");
+ gds_prim_restart_continue =
+ LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, "");
+ vertex_counter =
+ LLVMBuildAnd(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), "");
+
+ LLVMValueRef index0_is_reset;
+
+ for (unsigned i = 0; i < 3; i++) {
+ LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i],
+ ac_get_arg(&ctx->ac, param_restart_index), "");
+ if (i == 0)
+ index0_is_reset = LLVMBuildNot(builder, not_reset, "");
+ prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, not_reset, "");
+ }
+
+ /* If the previous waves flip the primitive orientation
+ * of the current triangle strip, it will be stored in GDS.
+ *
+ * Sometimes the correct orientation is not needed, in which case
+ * we don't need to execute this.
+ */
+ if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) {
+ /* If there are reset indices in this wave, get the thread index
+ * where the most recent strip starts relative to each thread.
+ */
+ LLVMValueRef preceding_threads_mask =
+ LLVMBuildSub(builder,
+ LLVMBuildShl(builder, ctx->ac.i64_1,
+ LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""),
+ ctx->ac.i64_1, "");
+
+ LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset);
+ LLVMValueRef preceding_reset_threadmask =
+ LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, "");
+ LLVMValueRef strip_start = ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL);
+ strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, "");
+
+            /* This flips the orientation based on reset indices within this wave only. */
+ first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, "");
+
+ LLVMValueRef last_strip_start, prev_wave_state, ret, tmp;
+ LLVMValueRef is_first_wave, current_wave_resets_index;
+
+ /* Get the thread index where the last strip starts in this wave.
+ *
+ * If the last strip doesn't start in this wave, the thread index
+ * will be 0.
+ *
+ * If the last strip starts in the next wave, the thread index will
+ * be 64.
+ */
+ last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL);
+ last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, "");
+
+ struct si_thread0_section section;
+            si_enter_thread0_section(ctx, &section, thread_id);
+
+ /* This must be done in the thread 0 section, because
+ * we expect PrimID to be 0 for the whole first wave
+ * in this expression.
+ *
+ * NOTE: This will need to be different if we wanna support
+ * instancing with primitive restart.
+ */
+ is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, "");
+ is_first_wave = LLVMBuildAnd(builder, is_first_wave,
+ LLVMBuildNot(builder, gds_prim_restart_continue, ""), "");
+ current_wave_resets_index =
+ LLVMBuildICmp(builder, LLVMIntNE, last_strip_start, ctx->ac.i32_0, "");
+
+ ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state");
+
+ /* Save the last strip start primitive index in GDS and read
+ * the value that previous waves stored.
+ *
+ * if (is_first_wave || current_wave_resets_strip)
+ * // Read the value that previous waves stored and store a new one.
+ * first_is_odd = ds.ordered.swap(last_strip_start);
+ * else
+ * // Just read the value that previous waves stored.
+ * first_is_odd = ds.ordered.add(0);
+ */
+ ac_build_ifcc(
+ &ctx->ac, LLVMBuildOr(builder, is_first_wave, current_wave_resets_index, ""), 12602);
+ {
+ /* The GDS address is always 0 with ordered append. */
+ tmp = si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, last_strip_start, 1, true,
+ false);
+ LLVMBuildStore(builder, tmp, ret);
+ }
+ ac_build_else(&ctx->ac, 12603);
+ {
+ /* Just read the value from GDS. */
+ tmp = si_build_ds_ordered_op(ctx, "add", ordered_wave_id, ctx->ac.i32_0, 1, true,
+ false);
+ LLVMBuildStore(builder, tmp, ret);
+ }
+ ac_build_endif(&ctx->ac, 12602);
+
+ prev_wave_state = LLVMBuildLoad(builder, ret, "");
+ /* Ignore the return value if this is the first wave. */
+ prev_wave_state =
+ LLVMBuildSelect(builder, is_first_wave, ctx->ac.i32_0, prev_wave_state, "");
+            si_exit_thread0_section(&section, &prev_wave_state);
+ prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, "");
+
+ /* If the strip start appears to be on thread 0 for the current primitive
+ * (meaning the reset index is not present in this wave and might have
+ * appeared in previous waves), use the value from GDS to determine
+ * primitive orientation.
+ *
+ * If the strip start is in this wave for the current primitive, use
+ * the value from the current wave to determine primitive orientation.
+ */
+ LLVMValueRef strip_start_is0 =
+ LLVMBuildICmp(builder, LLVMIntEQ, strip_start, ctx->ac.i32_0, "");
+ first_is_odd =
+ LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, first_is_odd, "");
+ }
+ }
+ /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */
+ LLVMValueRef prim_is_odd = LLVMBuildXor(
+ builder, first_is_odd, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), "");
+
+ /* Convert triangle strip indices to triangle indices. */
+ ac_build_triangle_strip_indices_to_triangle(
+ &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0),
+ index);
+ }
+
+ /* Execute the vertex shader for each vertex to get vertex positions. */
+ LLVMValueRef pos[3][4];
+ for (unsigned i = 0; i < vertices_per_prim; i++) {
+ vs_params[param_vertex_id] = index[i];
+ vs_params[param_instance_id] = instance_id;
+
+ LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);
+ for (unsigned chan = 0; chan < 4; chan++)
+ pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, "");
+ }
+
+ /* Divide XYZ by W. */
+ for (unsigned i = 0; i < vertices_per_prim; i++) {
+ for (unsigned chan = 0; chan < 3; chan++)
+ pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]);
+ }
+
+ /* Load the viewport state. */
+ LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants,
+ LLVMConstInt(ctx->ac.i32, 2, 0));
+ vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
+ LLVMValueRef vp_scale[2], vp_translate[2];
+ vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
+ vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
+ vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
+ vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
+
+ /* Do culling. */
+ struct ac_cull_options options = {};
+ options.cull_front = key->opt.cs_cull_front;
+ options.cull_back = key->opt.cs_cull_back;
+ options.cull_view_xy = true;
+ options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z;
+ options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z;
+ options.cull_small_prims = true;
+ options.cull_zero_area = true;
+ options.cull_w = true;
+ options.use_halfz_clip_space = key->opt.cs_halfz_clip_space;
+
+ LLVMValueRef accepted =
+ ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate,
+ ac_get_arg(&ctx->ac, param_smallprim_precision), &options);
+
+ ac_build_optimization_barrier(&ctx->ac, &accepted);
+ LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted);
+
+ /* Count the number of active threads by doing bitcount(accepted). */
+ LLVMValueRef num_prims_accepted = ac_build_intrinsic(
+ &ctx->ac, "llvm.ctpop.i64", ctx->ac.i64, &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE);
+ num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, "");
+
+ LLVMValueRef start;
+
+ /* Execute atomic_add on the vertex count. */
+ struct si_thread0_section section;
+ si_enter_thread0_section(ctx, &section, thread_id);
+ {
+ if (VERTEX_COUNTER_GDS_MODE == 0) {
+ LLVMValueRef num_indices = LLVMBuildMul(
+ builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
+ vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter);
+ start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
+ LLVMAtomicOrderingMonotonic, false);
+ } else if (VERTEX_COUNTER_GDS_MODE == 1) {
+ LLVMValueRef num_indices = LLVMBuildMul(
+ builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
+ vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter,
+ LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), "");
+ start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices,
+ LLVMAtomicOrderingMonotonic, false);
+ } else if (VERTEX_COUNTER_GDS_MODE == 2) {
+ LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+
+ /* If the draw call was split into multiple subdraws, each using
+ * a separate draw packet, we need to start counting from 0 for
+ * the first compute wave of the subdraw.
+ *
+ * vertex_counter contains the primitive ID of the first thread
+ * in the first wave.
+ *
+ * This is only correct with VERTEX_COUNTER_GDS_MODE == 2:
+ */
+ LLVMValueRef is_first_wave =
+ LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, vertex_counter, "");
+
+ /* Store the primitive count for ordered append, not vertex count.
+ * The idea is to avoid GDS initialization via CP DMA. The shader
+ * effectively stores the first count using "swap".
+ *
+ * if (first_wave) {
+ * ds.ordered.swap(num_prims_accepted); // store the first primitive count
+ * previous = 0;
+ * } else {
+ * previous = ds.ordered.add(num_prims_accepted) // add the primitive count
+ * }
+ */
+ ac_build_ifcc(&ctx->ac, is_first_wave, 12604);
+ {
+ /* The GDS address is always 0 with ordered append. */
+ si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, num_prims_accepted, 0, true, true);
+ LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store);
+ }
+ ac_build_else(&ctx->ac, 12605);
+ {
+ LLVMBuildStore(builder,
+ si_build_ds_ordered_op(ctx, "add", ordered_wave_id, num_prims_accepted,
+ 0, true, true),
+ tmp_store);
+ }
+ ac_build_endif(&ctx->ac, 12604);
+
+ start = LLVMBuildLoad(builder, tmp_store, "");
+ }
+ }
+ si_exit_thread0_section(&section, &start);
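/* Illustrative sketch, not part of this patch: the "thread0 section" pattern
 * used above, simulated on the CPU for one wave. A single lane performs the
 * atomic add of the whole wave's contribution, and the old counter value is
 * then handed to every lane, which is what si_exit_thread0_section achieves
 * with a readfirstlane-style broadcast on the GPU.
 */
#include <stdatomic.h>

static unsigned wave_atomic_add(atomic_uint *counter, unsigned wave_total)
{
   /* Only "lane 0" touches memory; all lanes use the returned base. */
   unsigned base = atomic_fetch_add(counter, wave_total);
   return base; /* broadcast to the whole wave */
}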
+
+ /* Write the final vertex count to memory. An EOS/EOP event could do this,
+ * but those events are super slow and should be avoided if performance
+ * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE
+ * event like this.
+ */
+ if (VERTEX_COUNTER_GDS_MODE == 2) {
+ ac_build_ifcc(&ctx->ac,
+ LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id,
+ ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""),
+ 12606);
+ LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, "");
+ count = LLVMBuildMul(builder, count, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
+
+ /* GFX8 needs to disable caching, so that the CP can see the stored value.
+ * MTYPE=3 bypasses TC L2.
+ */
+ if (ctx->screen->info.chip_class <= GFX8) {
+ LLVMValueRef desc[] = {
+ ac_get_arg(&ctx->ac, param_vertex_count_addr),
+ LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0),
+ LLVMConstInt(ctx->ac.i32, 4, 0),
+ LLVMConstInt(
+ ctx->ac.i32,
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | S_008F0C_MTYPE(3 /* uncached */),
+ 0),
+ };
+ LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4);
+ ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0, ctx->ac.i32_0, 0,
+ ac_glc | ac_slc);
+ } else {
+ LLVMBuildStore(
+ builder, count,
+ si_expand_32bit_pointer(ctx, ac_get_arg(&ctx->ac, param_vertex_count_addr)));
+ }
+ ac_build_endif(&ctx->ac, 12606);
+ } else {
+ /* For unordered modes that increment a vertex count instead of
+ * primitive count, convert it into the primitive index.
+ */
+ start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), "");
+ }
+
+ /* Now we need to store the indices of accepted primitives into
+ * the output index buffer.
+ */
+ ac_build_ifcc(&ctx->ac, accepted, 16607);
+ {
+ /* Get the number of bits set before the index of this thread. */
+ LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask);
+
+ /* We have lowered instancing. Pack the instance ID into vertex ID. */
+ if (key->opt.cs_instancing) {
+ instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");
+
+ for (unsigned i = 0; i < vertices_per_prim; i++)
+ index[i] = LLVMBuildOr(builder, index[i], instance_id, "");
+ }
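/* Illustrative sketch, not part of this patch: the lowered-instancing packing
 * used above. The instance ID goes into the upper 16 bits of each index so a
 * fetch shader can unpack it again; this assumes both values fit in 16 bits,
 * which is what the packing above relies on.
 */
static unsigned pack_instance_and_vertex(unsigned instance_id, unsigned vertex_id)
{
   return (instance_id << 16) | (vertex_id & 0xffff);
}

static void unpack_instance_and_vertex(unsigned packed, unsigned *instance_id,
                                       unsigned *vertex_id)
{
   *instance_id = packed >> 16;
   *vertex_id = packed & 0xffff;
}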
+
+ if (VERTEX_COUNTER_GDS_MODE == 2) {
+ /* vertex_counter contains the first primitive ID
+ * for this dispatch. If the draw call was split into
+ * multiple subdraws, the first primitive ID is > 0
+ * for subsequent subdraws. Each subdraw uses a different
+ * portion of the output index buffer. Offset the store
+ * vindex by the first primitive ID to get the correct
+ * store address for the subdraw.
+ */
+ start = LLVMBuildAdd(builder, start, vertex_counter, "");
+ }
+
+ /* Write indices for accepted primitives. */
+ LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
+ LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
+
+ if (!ac_has_vec3_support(ctx->ac.chip_class, true))
+ vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3);
+
+ ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0, 3,
+ ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0));
+ }
+ ac_build_endif(&ctx->ac, 16607);
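/* Illustrative sketch, not part of this patch: the stream-compaction idea
 * behind the accepted-primitive stores. The total number of accepted
 * primitives is popcount(mask), and each accepted lane's output slot is the
 * number of accepted lanes below it (what ac_build_mbcnt computes on the
 * GPU). Simulated here on the CPU for a 64-lane mask, with lane < 64.
 */
static unsigned accepted_total(unsigned long long mask)
{
   return (unsigned)__builtin_popcountll(mask);
}

static unsigned accepted_slot(unsigned long long mask, unsigned lane)
{
   /* Count only the accepted lanes strictly below this one. */
   return (unsigned)__builtin_popcountll(mask & ((1ull << lane) - 1));
}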
+
+ LLVMBuildRetVoid(builder);
}
/* Return false if the shader isn't ready. */
static bool si_shader_select_prim_discard_cs(struct si_context *sctx,
- const struct pipe_draw_info *info,
- bool primitive_restart)
+ const struct pipe_draw_info *info,
+ bool primitive_restart)
{
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- struct si_shader_key key;
-
- /* Primitive restart needs ordered counters. */
- assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2);
- assert(!primitive_restart || info->instance_count == 1);
-
- memset(&key, 0, sizeof(key));
- si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog);
- assert(!key.part.vs.prolog.instance_divisor_is_fetched);
-
- key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
- key.opt.vs_as_prim_discard_cs = 1;
- key.opt.cs_prim_type = info->mode;
- key.opt.cs_indexed = info->index_size != 0;
- key.opt.cs_instancing = info->instance_count > 1;
- key.opt.cs_primitive_restart = primitive_restart;
- key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;
-
- /* Primitive restart with triangle strips needs to preserve primitive
- * orientation for cases where front and back primitive orientation matters.
- */
- if (primitive_restart) {
- struct si_shader_selector *ps = sctx->ps_shader.cso;
-
- key.opt.cs_need_correct_orientation =
- rs->cull_front != rs->cull_back ||
- ps->info.uses_frontface ||
- (rs->two_side && ps->info.colors_read);
- }
-
- if (rs->rasterizer_discard) {
- /* Just for performance testing and analysis of trivial bottlenecks.
- * This should result in a very short compute shader. */
- key.opt.cs_cull_front = 1;
- key.opt.cs_cull_back = 1;
- } else {
- key.opt.cs_cull_front =
- sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front;
- key.opt.cs_cull_back =
- sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back;
- }
-
- if (!rs->depth_clamp_any && CULL_Z) {
- key.opt.cs_cull_z = 1;
- key.opt.cs_halfz_clip_space = rs->clip_halfz;
- }
-
- sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso;
- sctx->cs_prim_discard_state.current = NULL;
-
- if (!sctx->compiler.passes)
- si_init_compiler(sctx->screen, &sctx->compiler);
-
- struct si_compiler_ctx_state compiler_state;
- compiler_state.compiler = &sctx->compiler;
- compiler_state.debug = sctx->debug;
- compiler_state.is_debug_context = sctx->is_debug;
-
- return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state,
- &compiler_state, &key, -1, true) == 0 &&
- /* Disallow compute shaders using the scratch buffer. */
- sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+ struct si_shader_key key;
+
+ /* Primitive restart needs ordered counters. */
+ assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2);
+ assert(!primitive_restart || info->instance_count == 1);
+
+ memset(&key, 0, sizeof(key));
+ si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog);
+ assert(!key.part.vs.prolog.instance_divisor_is_fetched);
+
+ key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0;
+ key.opt.vs_as_prim_discard_cs = 1;
+ key.opt.cs_prim_type = info->mode;
+ key.opt.cs_indexed = info->index_size != 0;
+ key.opt.cs_instancing = info->instance_count > 1;
+ key.opt.cs_primitive_restart = primitive_restart;
+ key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first;
+
+ /* Primitive restart with triangle strips needs to preserve primitive
+ * orientation for cases where front and back primitive orientation matters.
+ */
+ if (primitive_restart) {
+ struct si_shader_selector *ps = sctx->ps_shader.cso;
+
+ key.opt.cs_need_correct_orientation = rs->cull_front != rs->cull_back ||
+ ps->info.uses_frontface ||
+ (rs->two_side && ps->info.colors_read);
+ }
+
+ if (rs->rasterizer_discard) {
+ /* Just for performance testing and analysis of trivial bottlenecks.
+ * This should result in a very short compute shader. */
+ key.opt.cs_cull_front = 1;
+ key.opt.cs_cull_back = 1;
+ } else {
+ key.opt.cs_cull_front = sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front;
+ key.opt.cs_cull_back = sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back;
+ }
+
+ if (!rs->depth_clamp_any && CULL_Z) {
+ key.opt.cs_cull_z = 1;
+ key.opt.cs_halfz_clip_space = rs->clip_halfz;
+ }
+
+ sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso;
+ sctx->cs_prim_discard_state.current = NULL;
+
+ if (!sctx->compiler.passes)
+ si_init_compiler(sctx->screen, &sctx->compiler);
+
+ struct si_compiler_ctx_state compiler_state;
+ compiler_state.compiler = &sctx->compiler;
+ compiler_state.debug = sctx->debug;
+ compiler_state.is_debug_context = sctx->is_debug;
+
+ return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state,
+ &key, -1, true) == 0 &&
+ /* Disallow compute shaders using the scratch buffer. */
+ sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0;
}
static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx)
{
- if (sctx->index_ring)
- return true;
-
- if (!sctx->prim_discard_compute_cs) {
- struct radeon_winsys *ws = sctx->ws;
- unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED :
- VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0;
- unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0;
-
- if (gds_size) {
- sctx->gds = ws->buffer_create(ws, gds_size, 4,
- RADEON_DOMAIN_GDS, 0);
- if (!sctx->gds)
- return false;
-
- ws->cs_add_buffer(sctx->gfx_cs, sctx->gds,
- RADEON_USAGE_READWRITE, 0, 0);
- }
- if (num_oa_counters) {
- assert(gds_size);
- sctx->gds_oa = ws->buffer_create(ws, num_oa_counters,
- 1, RADEON_DOMAIN_OA, 0);
- if (!sctx->gds_oa)
- return false;
-
- ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa,
- RADEON_USAGE_READWRITE, 0, 0);
- }
-
- sctx->prim_discard_compute_cs =
- ws->cs_add_parallel_compute_ib(sctx->gfx_cs,
- num_oa_counters > 0);
- if (!sctx->prim_discard_compute_cs)
- return false;
- }
-
- if (!sctx->index_ring) {
- sctx->index_ring =
- si_aligned_buffer_create(sctx->b.screen,
- SI_RESOURCE_FLAG_UNMAPPABLE,
- PIPE_USAGE_DEFAULT,
- sctx->index_ring_size_per_ib * 2,
- sctx->screen->info.pte_fragment_size);
- if (!sctx->index_ring)
- return false;
- }
- return true;
+ if (sctx->index_ring)
+ return true;
+
+ if (!sctx->prim_discard_compute_cs) {
+ struct radeon_winsys *ws = sctx->ws;
+ unsigned gds_size =
+ VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0;
+ unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0;
+
+ if (gds_size) {
+ sctx->gds = ws->buffer_create(ws, gds_size, 4, RADEON_DOMAIN_GDS, 0);
+ if (!sctx->gds)
+ return false;
+
+ ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0);
+ }
+ if (num_oa_counters) {
+ assert(gds_size);
+ sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, 1, RADEON_DOMAIN_OA, 0);
+ if (!sctx->gds_oa)
+ return false;
+
+ ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0);
+ }
+
+ sctx->prim_discard_compute_cs =
+ ws->cs_add_parallel_compute_ib(sctx->gfx_cs, num_oa_counters > 0);
+ if (!sctx->prim_discard_compute_cs)
+ return false;
+ }
+
+ if (!sctx->index_ring) {
+ sctx->index_ring = si_aligned_buffer_create(
+ sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
+ sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size);
+ if (!sctx->index_ring)
+ return false;
+ }
+ return true;
}
static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size)
{
- return sctx->index_ring_offset +
- align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
- sctx->index_ring_size_per_ib;
+ return sctx->index_ring_offset +
+ align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <=
+ sctx->index_ring_size_per_ib;
}
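/* Illustrative sketch, not part of this patch: what the ring-space check above
 * computes. align() here stands for the usual round-up helper, assuming
 * tcc_cache_line_size is a power of two.
 */
#include <stdbool.h>

static unsigned align_up_pot(unsigned value, unsigned pot_alignment)
{
   return (value + pot_alignment - 1) & ~(pot_alignment - 1);
}

static bool ring_has_space(unsigned ring_offset, unsigned ring_size_per_ib,
                           unsigned out_indexbuf_size, unsigned cache_line)
{
   return ring_offset + align_up_pot(out_indexbuf_size, cache_line) <= ring_size_per_ib;
}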
enum si_prim_discard_outcome
-si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
- const struct pipe_draw_info *info,
- bool primitive_restart)
+si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
+ bool primitive_restart)
{
- /* If the compute shader compilation isn't finished, this returns false. */
- if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
- return SI_PRIM_DISCARD_DISABLED;
-
- if (!si_initialize_prim_discard_cmdbuf(sctx))
- return SI_PRIM_DISCARD_DISABLED;
-
- struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
- unsigned prim = info->mode;
- unsigned count = info->count;
- unsigned instance_count = info->instance_count;
- unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
- unsigned num_prims = num_prims_per_instance * instance_count;
- unsigned out_indexbuf_size = num_prims * 12;
- bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
- const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;
-
- /* Split draws at the draw call level if the ring is full. This makes
- * better use of the ring space.
- */
- if (ring_full &&
- num_prims > split_prims_draw_level &&
- instance_count == 1 && /* TODO: support splitting instanced draws */
- (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
- (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
- /* Split draws. */
- struct pipe_draw_info split_draw = *info;
- split_draw.primitive_restart = primitive_restart;
-
- unsigned base_start = split_draw.start;
-
- if (prim == PIPE_PRIM_TRIANGLES) {
- unsigned vert_count_per_subdraw = split_prims_draw_level * 3;
- assert(vert_count_per_subdraw < count);
-
- for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
- split_draw.start = base_start + start;
- split_draw.count = MIN2(count - start, vert_count_per_subdraw);
-
- sctx->b.draw_vbo(&sctx->b, &split_draw);
- }
- } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
- /* No primitive pair can be split, because strips reverse orientation
- * for odd primitives. */
- STATIC_ASSERT(split_prims_draw_level % 2 == 0);
-
- unsigned vert_count_per_subdraw = split_prims_draw_level;
-
- for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
- split_draw.start = base_start + start;
- split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2);
-
- sctx->b.draw_vbo(&sctx->b, &split_draw);
-
- if (start == 0 &&
- primitive_restart &&
- sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
- sctx->preserve_prim_restart_gds_at_flush = true;
- }
- sctx->preserve_prim_restart_gds_at_flush = false;
- } else {
- assert(0);
- }
-
- return SI_PRIM_DISCARD_DRAW_SPLIT;
- }
-
- /* Just quit if the draw call doesn't fit into the ring and can't be split. */
- if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
- if (SI_PRIM_DISCARD_DEBUG)
- puts("PD failed: draw call too big, can't be split");
- return SI_PRIM_DISCARD_DISABLED;
- }
-
- unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL);
- unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
- 24 * (num_subdraws - 1) + /* subdraws */
- 20; /* leave some space at the end */
- unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx);
-
- if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
- need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
- else
- need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
-
- if (ring_full ||
- (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
- !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
- /* If the current IB is empty but the size is too small, add a NOP
- * packet to force a flush and get a bigger IB.
- */
- if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
- gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
- radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
- radeon_emit(gfx_cs, 0);
- }
-
- si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
- }
-
- /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
- struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
- ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
- assert(compute_has_space);
- assert(si_check_ring_space(sctx, out_indexbuf_size));
- return SI_PRIM_DISCARD_ENABLED;
+ /* If the compute shader compilation isn't finished, this returns false. */
+ if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
+ return SI_PRIM_DISCARD_DISABLED;
+
+ if (!si_initialize_prim_discard_cmdbuf(sctx))
+ return SI_PRIM_DISCARD_DISABLED;
+
+ struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
+ unsigned prim = info->mode;
+ unsigned count = info->count;
+ unsigned instance_count = info->instance_count;
+ unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
+ unsigned num_prims = num_prims_per_instance * instance_count;
+ unsigned out_indexbuf_size = num_prims * 12;
+ bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size);
+ const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL;
+
+ /* Split draws at the draw call level if the ring is full. This makes
+ * better use of the ring space.
+ */
+ if (ring_full && num_prims > split_prims_draw_level &&
+ instance_count == 1 && /* TODO: support splitting instanced draws */
+ (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
+ /* Split draws. */
+ struct pipe_draw_info split_draw = *info;
+ split_draw.primitive_restart = primitive_restart;
+
+ unsigned base_start = split_draw.start;
+
+ if (prim == PIPE_PRIM_TRIANGLES) {
+ unsigned vert_count_per_subdraw = split_prims_draw_level * 3;
+ assert(vert_count_per_subdraw < count);
+
+ for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
+ split_draw.start = base_start + start;
+ split_draw.count = MIN2(count - start, vert_count_per_subdraw);
+
+ sctx->b.draw_vbo(&sctx->b, &split_draw);
+ }
+ } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
+ /* No primitive pair can be split, because strips reverse orientation
+ * for odd primitives. */
+ STATIC_ASSERT(split_prims_draw_level % 2 == 0);
+
+ unsigned vert_count_per_subdraw = split_prims_draw_level;
+
+ for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
+ split_draw.start = base_start + start;
+ split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2);
+
+ sctx->b.draw_vbo(&sctx->b, &split_draw);
+
+ if (start == 0 && primitive_restart &&
+ sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
+ sctx->preserve_prim_restart_gds_at_flush = true;
+ }
+ sctx->preserve_prim_restart_gds_at_flush = false;
+ } else {
+ assert(0);
+ }
+
+ return SI_PRIM_DISCARD_DRAW_SPLIT;
+ }
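/* Illustrative sketch, not part of this patch: how the split loops above carve
 * a big draw into subdraws. Triangle lists advance by 3 vertices per primitive
 * with no overlap; strips advance by 1 vertex per primitive but each subdraw
 * needs 2 extra vertices of overlap, and an even split level keeps strip
 * parity intact across subdraws. Assumes count >= 3 for the strip case.
 */
#include <stdio.h>
#include <stdbool.h>

#define SUBDRAW_MIN2(a, b) ((a) < (b) ? (a) : (b))

static void enumerate_subdraws(bool is_strip, unsigned base_start, unsigned count,
                               unsigned split_prims_draw_level)
{
   if (!is_strip) {
      unsigned step = split_prims_draw_level * 3;
      for (unsigned start = 0; start < count; start += step)
         printf("subdraw: start=%u count=%u\n", base_start + start,
                SUBDRAW_MIN2(count - start, step));
   } else {
      unsigned step = split_prims_draw_level;
      for (unsigned start = 0; start < count - 2; start += step)
         printf("subdraw: start=%u count=%u\n", base_start + start,
                SUBDRAW_MIN2(count - start, step + 2));
   }
}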
+
+ /* Just quit if the draw call doesn't fit into the ring and can't be split. */
+ if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
+ if (SI_PRIM_DISCARD_DEBUG)
+ puts("PD failed: draw call too big, can't be split");
+ return SI_PRIM_DISCARD_DISABLED;
+ }
+
+ unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL);
+ unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
+ 24 * (num_subdraws - 1) + /* subdraws */
+ 20; /* leave some space at the end */
+ unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx);
+
+ if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION)
+ need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
+ else
+ need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
+
+ if (ring_full ||
+ (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
+ !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
+ /* If the current IB is empty but the size is too small, add a NOP
+ * packet to force a flush and get a bigger IB.
+ */
+ if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
+ gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
+ radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
+ radeon_emit(gfx_cs, 0);
+ }
+
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ }
+
+ /* The compute IB is always chained, but we need to call cs_check_space to add more space. */
+ struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
+ ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
+ assert(compute_has_space);
+ assert(si_check_ring_space(sctx, out_indexbuf_size));
+ return SI_PRIM_DISCARD_ENABLED;
}
void si_compute_signal_gfx(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
- unsigned writeback_L2_flags = 0;
-
- /* The writeback L2 flags vary with each chip generation. */
- /* CI needs to flush vertex indices to memory. */
- if (sctx->chip_class <= GFX7)
- writeback_L2_flags = EVENT_TC_WB_ACTION_ENA;
- else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
- writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
-
- if (!sctx->compute_num_prims_in_batch)
- return;
-
- assert(sctx->compute_rewind_va);
-
- /* After the queued dispatches are done and vertex counts are written to
- * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
- * the dispatches to finish, it only adds the CS_DONE event into the event
- * queue.
- */
- si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
- sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
- writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM :
- EOP_INT_SEL_NONE,
- EOP_DATA_SEL_VALUE_32BIT,
- NULL,
- sctx->compute_rewind_va |
- ((uint64_t)sctx->screen->info.address32_hi << 32),
- REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
- SI_NOT_QUERY);
-
- sctx->compute_rewind_va = 0;
- sctx->compute_num_prims_in_batch = 0;
+ struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
+ unsigned writeback_L2_flags = 0;
+
+ /* The writeback L2 flags vary with each chip generation. */
+ /* CI needs to flush vertex indices to memory. */
+ if (sctx->chip_class <= GFX7)
+ writeback_L2_flags = EVENT_TC_WB_ACTION_ENA;
+ else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0)
+ writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA;
+
+ if (!sctx->compute_num_prims_in_batch)
+ return;
+
+ assert(sctx->compute_rewind_va);
+
+ /* After the queued dispatches are done and vertex counts are written to
+ * the gfx IB, signal the gfx IB to continue. CP doesn't wait for
+ * the dispatches to finish, it only adds the CS_DONE event into the event
+ * queue.
+ */
+ si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags,
+ sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
+ writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE,
+ EOP_DATA_SEL_VALUE_32BIT, NULL,
+ sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
+ REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */
+ SI_NOT_QUERY);
+
+ sctx->compute_rewind_va = 0;
+ sctx->compute_num_prims_in_batch = 0;
}
/* Dispatch a primitive discard compute shader. */
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
- const struct pipe_draw_info *info,
- unsigned index_size,
- unsigned base_vertex,
- uint64_t input_indexbuf_va,
- unsigned input_indexbuf_num_elements)
+ const struct pipe_draw_info *info, unsigned index_size,
+ unsigned base_vertex, uint64_t input_indexbuf_va,
+ unsigned input_indexbuf_num_elements)
{
- struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
- struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
- unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count);
- if (!num_prims_per_instance)
- return;
-
- unsigned num_prims = num_prims_per_instance * info->instance_count;
- unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format;
-
- switch (info->mode) {
- case PIPE_PRIM_TRIANGLES:
- case PIPE_PRIM_TRIANGLE_STRIP:
- case PIPE_PRIM_TRIANGLE_FAN:
- vertices_per_prim = 3;
- output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
- gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT;
- break;
- default:
- unreachable("unsupported primitive type");
- return;
- }
-
- unsigned out_indexbuf_offset;
- uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
- bool first_dispatch = !sctx->prim_discard_compute_ib_initialized;
-
- /* Initialize the compute IB if it's empty. */
- if (!sctx->prim_discard_compute_ib_initialized) {
- /* 1) State initialization. */
- sctx->compute_gds_offset = 0;
- sctx->compute_ib_last_shader = NULL;
-
- if (sctx->last_ib_barrier_fence) {
- assert(!sctx->last_ib_barrier_buf);
- sctx->ws->cs_add_fence_dependency(gfx_cs,
- sctx->last_ib_barrier_fence,
- RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
- }
-
- /* 2) IB initialization. */
-
- /* This needs to be done at the beginning of IBs due to possible
- * TTM buffer moves in the kernel.
- */
- if (sctx->chip_class >= GFX10) {
- radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
- radeon_emit(cs, 0); /* CP_COHER_CNTL */
- radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
- radeon_emit(cs, 0); /* CP_COHER_BASE */
- radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
- radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
- radeon_emit(cs, /* GCR_CNTL */
- S_586_GLI_INV(V_586_GLI_ALL) |
- S_586_GLK_INV(1) | S_586_GLV_INV(1) |
- S_586_GL1_INV(1) |
- S_586_GL2_INV(1) | S_586_GL2_WB(1) |
- S_586_GLM_INV(1) | S_586_GLM_WB(1) |
- S_586_SEQ(V_586_SEQ_FORWARD));
- } else {
- si_emit_surface_sync(sctx, cs,
- S_0085F0_TC_ACTION_ENA(1) |
- S_0085F0_TCL1_ACTION_ENA(1) |
- S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
- S_0085F0_SH_ICACHE_ACTION_ENA(1) |
- S_0085F0_SH_KCACHE_ACTION_ENA(1));
- }
-
- /* Restore the GDS prim restart counter if needed. */
- if (sctx->preserve_prim_restart_gds_at_flush) {
- si_cp_copy_data(sctx, cs,
- COPY_DATA_GDS, NULL, 4,
- COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4);
- }
-
- si_emit_initial_compute_regs(sctx, cs);
-
- radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
- S_00B860_WAVES(sctx->scratch_waves) |
- S_00B860_WAVESIZE(0)); /* no scratch */
-
- /* Only 1D grids are launched. */
- radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
- radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) |
- S_00B820_NUM_THREAD_PARTIAL(1));
- radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) |
- S_00B824_NUM_THREAD_PARTIAL(1));
-
- radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
- radeon_emit(cs, 0);
- radeon_emit(cs, 0);
-
- /* Disable ordered alloc for OA resources. */
- for (unsigned i = 0; i < 2; i++) {
- radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3);
- radeon_emit(cs, S_031074_INDEX(i));
- radeon_emit(cs, 0);
- radeon_emit(cs, S_03107C_ENABLE(0));
- }
-
- if (sctx->last_ib_barrier_buf) {
- assert(!sctx->last_ib_barrier_fence);
- radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf,
- RADEON_USAGE_READ, RADEON_PRIO_FENCE);
- si_cp_wait_mem(sctx, cs,
- sctx->last_ib_barrier_buf->gpu_address +
- sctx->last_ib_barrier_buf_offset, 1, 1,
- WAIT_REG_MEM_EQUAL);
- }
-
- sctx->prim_discard_compute_ib_initialized = true;
- }
-
- /* Allocate the output index buffer. */
- output_indexbuf_size = align(output_indexbuf_size,
- sctx->screen->info.tcc_cache_line_size);
- assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
- out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
- sctx->index_ring_offset += output_indexbuf_size;
-
- radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
- RADEON_PRIO_SHADER_RW_BUFFER);
- uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;
-
- /* Prepare index buffer descriptors. */
- struct si_resource *indexbuf_desc = NULL;
- unsigned indexbuf_desc_offset;
- unsigned desc_size = 12 * 4;
- uint32_t *desc;
-
- u_upload_alloc(sctx->b.const_uploader, 0, desc_size,
- si_optimal_tcc_alignment(sctx, desc_size),
- &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc,
- (void**)&desc);
- radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
- RADEON_PRIO_DESCRIPTORS);
-
- /* Input index buffer. */
- desc[0] = input_indexbuf_va;
- desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) |
- S_008F04_STRIDE(index_size);
- desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
-
- if (sctx->chip_class >= GFX10) {
- desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT :
- index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT :
- V_008F0C_IMG_FORMAT_32_UINT) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
- S_008F0C_RESOURCE_LEVEL(1);
- } else {
- desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
- S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 :
- index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 :
- V_008F0C_BUF_DATA_FORMAT_32);
- }
-
- /* Output index buffer. */
- desc[4] = out_indexbuf_va;
- desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) |
- S_008F04_STRIDE(vertices_per_prim * 4);
- desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
-
- if (sctx->chip_class >= GFX10) {
- desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
- S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
- S_008F0C_FORMAT(gfx10_output_indexbuf_format) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
- S_008F0C_RESOURCE_LEVEL(1);
- } else {
- desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
- S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
- S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
- S_008F0C_DATA_FORMAT(output_indexbuf_format);
- }
-
- /* Viewport state. */
- struct si_small_prim_cull_info cull_info;
- si_get_small_prim_cull_info(sctx, &cull_info);
-
- desc[8] = fui(cull_info.scale[0]);
- desc[9] = fui(cull_info.scale[1]);
- desc[10] = fui(cull_info.translate[0]);
- desc[11] = fui(cull_info.translate[1]);
-
- /* Better subpixel precision increases the efficiency of small
- * primitive culling. */
- unsigned num_samples = sctx->framebuffer.nr_samples;
- unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
- float small_prim_cull_precision;
-
- if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
- small_prim_cull_precision = num_samples / 4096.0;
- else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
- small_prim_cull_precision = num_samples / 1024.0;
- else
- small_prim_cull_precision = num_samples / 256.0;
-
- /* Set user data SGPRs. */
- /* This can't be greater than 14 if we want the fastest launch rate. */
- unsigned user_sgprs = 13;
-
- uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
- unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
- unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
- uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
- uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
- uint64_t vb_desc_va = sctx->vb_descriptors_buffer ?
- sctx->vb_descriptors_buffer->gpu_address +
- sctx->vb_descriptors_offset : 0;
- unsigned gds_offset, gds_size;
- struct si_fast_udiv_info32 num_prims_udiv = {};
-
- if (info->instance_count > 1)
- num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
-
- /* Limitations on how these two are packed in the user SGPR. */
- assert(num_prims_udiv.post_shift < 32);
- assert(num_prims_per_instance < 1 << 27);
-
- si_resource_reference(&indexbuf_desc, NULL);
-
- bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart;
-
- if (VERTEX_COUNTER_GDS_MODE == 1) {
- gds_offset = sctx->compute_gds_offset;
- gds_size = primitive_restart ? 8 : 4;
- sctx->compute_gds_offset += gds_size;
-
- /* Reset the counters in GDS for the first dispatch using WRITE_DATA.
- * The remainder of the GDS will be cleared after the dispatch packet
- * in parallel with compute shaders.
- */
- if (first_dispatch) {
- radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0));
- radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
- radeon_emit(cs, gds_offset);
- radeon_emit(cs, 0);
- radeon_emit(cs, 0); /* value to write */
- if (gds_size == 8)
- radeon_emit(cs, 0);
- }
- }
-
- /* Set shader registers. */
- struct si_shader *shader = sctx->cs_prim_discard_state.current;
-
- if (shader != sctx->compute_ib_last_shader) {
- radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
- RADEON_PRIO_SHADER_BINARY);
- uint64_t shader_va = shader->bo->gpu_address;
-
- assert(shader->config.scratch_bytes_per_wave == 0);
- assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
-
- radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
- radeon_emit(cs, shader_va >> 8);
- radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
-
- radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
- radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
- S_00B848_SGPRS(sctx->chip_class <= GFX9 ?
- (shader->config.num_sgprs - 1) / 8 : 0) |
- S_00B848_FLOAT_MODE(shader->config.float_mode) |
- S_00B848_DX10_CLAMP(1) |
- S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) |
- S_00B848_WGP_MODE(sctx->chip_class >= GFX10));
- radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) |
- S_00B84C_USER_SGPR(user_sgprs) |
- S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
- S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) |
- S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
- S_00B84C_LDS_SIZE(shader->config.lds_size));
-
- radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
- ac_get_compute_resource_limits(&sctx->screen->info,
- WAVES_PER_TG,
- MAX_WAVES_PER_SH,
- THREADGROUPS_PER_CU));
- sctx->compute_ib_last_shader = shader;
- }
-
- STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);
-
- /* Big draw calls are split into smaller dispatches and draw packets. */
- for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
- unsigned num_subdraw_prims;
-
- if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
- num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
- else
- num_subdraw_prims = num_prims - start_prim;
-
- /* Small dispatches are executed back to back until a specific primitive
- * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
- * to start drawing the batch. This batching adds latency to the gfx IB,
- * but CS_DONE and REWIND are too slow.
- */
- if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
- si_compute_signal_gfx(sctx);
-
- if (sctx->compute_num_prims_in_batch == 0) {
- assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
- sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
-
- if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
- radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
- radeon_emit(gfx_cs, 0);
-
- si_cp_wait_mem(sctx, gfx_cs,
- sctx->compute_rewind_va |
- (uint64_t)sctx->screen->info.address32_hi << 32,
- REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT,
- WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);
-
- /* Use INDIRECT_BUFFER to chain to a different buffer
- * to discard the CP prefetch cache.
- */
- sctx->ws->cs_check_space(gfx_cs, 0, true);
- } else {
- radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
- radeon_emit(gfx_cs, 0);
- }
- }
-
- sctx->compute_num_prims_in_batch += num_subdraw_prims;
-
- uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
- uint64_t index_va = out_indexbuf_va + start_prim * 12;
-
- /* Emit the draw packet into the gfx IB. */
- radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
- radeon_emit(gfx_cs, num_prims * vertices_per_prim);
- radeon_emit(gfx_cs, index_va);
- radeon_emit(gfx_cs, index_va >> 32);
- radeon_emit(gfx_cs, 0);
- radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
-
- /* Continue with the compute IB. */
- if (start_prim == 0) {
- uint32_t gds_prim_restart_continue_bit = 0;
-
- if (sctx->preserve_prim_restart_gds_at_flush) {
- assert(primitive_restart &&
- info->mode == PIPE_PRIM_TRIANGLE_STRIP);
- assert(start_prim < 1 << 31);
- gds_prim_restart_continue_bit = 1 << 31;
- }
-
- radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
- radeon_emit(cs, index_buffers_va);
- radeon_emit(cs,
- VERTEX_COUNTER_GDS_MODE == 0 ? count_va :
- VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset :
- start_prim |
- gds_prim_restart_continue_bit);
- radeon_emit(cs, start_prim + num_subdraw_prims - 1);
- radeon_emit(cs, count_va);
- radeon_emit(cs, vb_desc_va);
- radeon_emit(cs, vs_const_desc_va);
- radeon_emit(cs, vs_sampler_desc_va);
- radeon_emit(cs, base_vertex);
- radeon_emit(cs, info->start_instance);
- radeon_emit(cs, num_prims_udiv.multiplier);
- radeon_emit(cs, num_prims_udiv.post_shift |
- (num_prims_per_instance << 5));
- radeon_emit(cs, info->restart_index);
- /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
- radeon_emit(cs, fui(small_prim_cull_precision));
- } else {
- assert(VERTEX_COUNTER_GDS_MODE == 2);
- /* Only update the SGPRs that changed. */
- radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3);
- radeon_emit(cs, start_prim);
- radeon_emit(cs, start_prim + num_subdraw_prims - 1);
- radeon_emit(cs, count_va);
- }
-
- /* Set grid dimensions. */
- unsigned start_block = start_prim / THREADGROUP_SIZE;
- unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
- unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
-
- radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
- radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
- S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
- S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
-
- radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
- PKT3_SHADER_TYPE_S(1));
- radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
- radeon_emit(cs, 1);
- radeon_emit(cs, 1);
- radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) |
- S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
- S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
- S_00B800_ORDER_MODE(0 /* launch in order */));
-
- /* This is only for unordered append. Ordered append writes this from
- * the shader.
- *
- * Note that EOP and EOS events are super slow, so emulating the event
- * in a shader is an important optimization.
- */
- if (VERTEX_COUNTER_GDS_MODE == 1) {
- si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0,
- sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
- EOP_INT_SEL_NONE,
- EOP_DATA_SEL_GDS,
- NULL,
- count_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
- EOP_DATA_GDS(gds_offset / 4, 1),
- SI_NOT_QUERY);
-
- /* Now that compute shaders are running, clear the remainder of GDS. */
- if (first_dispatch) {
- unsigned offset = gds_offset + gds_size;
- si_cp_dma_clear_buffer(sctx, cs, NULL, offset,
- GDS_SIZE_UNORDERED - offset,
- 0,
- SI_CPDMA_SKIP_CHECK_CS_SPACE |
- SI_CPDMA_SKIP_GFX_SYNC |
- SI_CPDMA_SKIP_SYNC_BEFORE,
- SI_COHERENCY_NONE, L2_BYPASS);
- }
- }
- first_dispatch = false;
-
- assert(cs->current.cdw <= cs->current.max_dw);
- assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
- }
+ struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
+ struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
+ unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count);
+ if (!num_prims_per_instance)
+ return;
+
+ unsigned num_prims = num_prims_per_instance * info->instance_count;
+ unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format;
+
+ switch (info->mode) {
+ case PIPE_PRIM_TRIANGLES:
+ case PIPE_PRIM_TRIANGLE_STRIP:
+ case PIPE_PRIM_TRIANGLE_FAN:
+ vertices_per_prim = 3;
+ output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
+ gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT;
+ break;
+ default:
+ unreachable("unsupported primitive type");
+ return;
+ }
+
+ unsigned out_indexbuf_offset;
+ uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
+ bool first_dispatch = !sctx->prim_discard_compute_ib_initialized;
+
+ /* Initialize the compute IB if it's empty. */
+ if (!sctx->prim_discard_compute_ib_initialized) {
+ /* 1) State initialization. */
+ sctx->compute_gds_offset = 0;
+ sctx->compute_ib_last_shader = NULL;
+
+ if (sctx->last_ib_barrier_fence) {
+ assert(!sctx->last_ib_barrier_buf);
+ sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence,
+ RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY);
+ }
+
+ /* 2) IB initialization. */
+
+ /* This needs to be done at the beginning of IBs due to possible
+ * TTM buffer moves in the kernel.
+ */
+ if (sctx->chip_class >= GFX10) {
+ radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
+ radeon_emit(cs, 0); /* CP_COHER_CNTL */
+ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+ radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
+ radeon_emit(cs, 0); /* CP_COHER_BASE */
+ radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
+ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+ radeon_emit(cs, /* GCR_CNTL */
+ S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) |
+ S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) |
+ S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD));
+ } else {
+ si_emit_surface_sync(sctx, cs,
+ S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
+ S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) |
+ S_0085F0_SH_ICACHE_ACTION_ENA(1) |
+ S_0085F0_SH_KCACHE_ACTION_ENA(1));
+ }
+
+ /* Restore the GDS prim restart counter if needed. */
+ if (sctx->preserve_prim_restart_gds_at_flush) {
+ si_cp_copy_data(sctx, cs, COPY_DATA_GDS, NULL, 4, COPY_DATA_SRC_MEM,
+ sctx->wait_mem_scratch, 4);
+ }
+
+ si_emit_initial_compute_regs(sctx, cs);
+
+ radeon_set_sh_reg(
+ cs, R_00B860_COMPUTE_TMPRING_SIZE,
+ S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */
+
+ /* Only 1D grids are launched. */
+ radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2);
+ radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1));
+ radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1));
+
+ radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2);
+ radeon_emit(cs, 0);
+ radeon_emit(cs, 0);
+
+ /* Disable ordered alloc for OA resources. */
+ for (unsigned i = 0; i < 2; i++) {
+ radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3);
+ radeon_emit(cs, S_031074_INDEX(i));
+ radeon_emit(cs, 0);
+ radeon_emit(cs, S_03107C_ENABLE(0));
+ }
+
+ if (sctx->last_ib_barrier_buf) {
+ assert(!sctx->last_ib_barrier_fence);
+ radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ,
+ RADEON_PRIO_FENCE);
+ si_cp_wait_mem(sctx, cs,
+ sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset,
+ 1, 1, WAIT_REG_MEM_EQUAL);
+ }
+
+ sctx->prim_discard_compute_ib_initialized = true;
+ }
+
+ /* Allocate the output index buffer. */
+ output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size);
+ assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib);
+ out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset;
+ sctx->index_ring_offset += output_indexbuf_size;
+
+ radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE,
+ RADEON_PRIO_SHADER_RW_BUFFER);
+ uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset;
+
+ /* Prepare index buffer descriptors. */
+ struct si_resource *indexbuf_desc = NULL;
+ unsigned indexbuf_desc_offset;
+ unsigned desc_size = 12 * 4;
+ uint32_t *desc;
+
+ u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size),
+ &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc);
+ radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ,
+ RADEON_PRIO_DESCRIPTORS);
+
+ /* Input index buffer. */
+ desc[0] = input_indexbuf_va;
+ desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size);
+ desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
+
+ if (sctx->chip_class >= GFX10) {
+ desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+ S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT
+ : index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT
+ : V_008F0C_IMG_FORMAT_32_UINT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
+ S_008F0C_RESOURCE_LEVEL(1);
+ } else {
+ desc[3] =
+ S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+ S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8
+ : index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16
+ : V_008F0C_BUF_DATA_FORMAT_32);
+ }
+
+ /* Output index buffer. */
+ desc[4] = out_indexbuf_va;
+ desc[5] =
+ S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4);
+ desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
+
+ if (sctx->chip_class >= GFX10) {
+ desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
+ S_008F0C_FORMAT(gfx10_output_indexbuf_format) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
+ S_008F0C_RESOURCE_LEVEL(1);
+ } else {
+ desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) |
+ S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+ S_008F0C_DATA_FORMAT(output_indexbuf_format);
+ }
+
+ /* Viewport state. */
+ struct si_small_prim_cull_info cull_info;
+ si_get_small_prim_cull_info(sctx, &cull_info);
+
+ desc[8] = fui(cull_info.scale[0]);
+ desc[9] = fui(cull_info.scale[1]);
+ desc[10] = fui(cull_info.translate[0]);
+ desc[11] = fui(cull_info.translate[1]);
+
+ /* Better subpixel precision increases the efficiency of small
+ * primitive culling. */
+ unsigned num_samples = sctx->framebuffer.nr_samples;
+ unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode;
+ float small_prim_cull_precision;
+
+ if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
+ small_prim_cull_precision = num_samples / 4096.0;
+ else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
+ small_prim_cull_precision = num_samples / 1024.0;
+ else
+ small_prim_cull_precision = num_samples / 256.0;
+
+ /* Set user data SGPRs. */
+ /* This can't be greater than 14 if we want the fastest launch rate. */
+ unsigned user_sgprs = 13;
+
+ uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
+ unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
+ unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX);
+ uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address;
+ uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address;
+ uint64_t vb_desc_va = sctx->vb_descriptors_buffer
+ ? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset
+ : 0;
+ unsigned gds_offset, gds_size;
+ struct si_fast_udiv_info32 num_prims_udiv = {};
+
+ if (info->instance_count > 1)
+ num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
+
+ /* Limitations on how these two are packed in the user SGPR. */
+ assert(num_prims_udiv.post_shift < 32);
+ assert(num_prims_per_instance < 1 << 27);
+
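/* Illustrative sketch, not part of this patch: the general shape of dividing
 * by a runtime constant with a precomputed reciprocal, plus the way the two
 * values asserted above share one user SGPR (5 bits of post_shift, 27 bits of
 * num_prims_per_instance). The exact reciprocal encoding produced by
 * si_compute_fast_udiv_info32 may differ from this generic form.
 */
#include <stdint.h>

static uint32_t udiv_by_const_generic(uint32_t n, uint32_t multiplier, uint32_t post_shift)
{
   /* q = (n * multiplier) >> (32 + post_shift) is one common encoding. */
   return (uint32_t)(((uint64_t)n * multiplier) >> 32) >> post_shift;
}

static uint32_t pack_udiv_sgpr(uint32_t post_shift, uint32_t prims_per_instance)
{
   /* Matches the asserts above: post_shift < 32, prims_per_instance < 1 << 27. */
   return post_shift | (prims_per_instance << 5);
}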
+ si_resource_reference(&indexbuf_desc, NULL);
+
+ bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart;
+
+ if (VERTEX_COUNTER_GDS_MODE == 1) {
+ gds_offset = sctx->compute_gds_offset;
+ gds_size = primitive_restart ? 8 : 4;
+ sctx->compute_gds_offset += gds_size;
+
+ /* Reset the counters in GDS for the first dispatch using WRITE_DATA.
+ * The remainder of the GDS will be cleared after the dispatch packet
+ * in parallel with compute shaders.
+ */
+ if (first_dispatch) {
+ radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size / 4, 0));
+ radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
+ radeon_emit(cs, gds_offset);
+ radeon_emit(cs, 0);
+ radeon_emit(cs, 0); /* value to write */
+ if (gds_size == 8)
+ radeon_emit(cs, 0);
+ }
+ }
+
+ /* Set shader registers. */
+ struct si_shader *shader = sctx->cs_prim_discard_state.current;
+
+ if (shader != sctx->compute_ib_last_shader) {
+ radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ,
+ RADEON_PRIO_SHADER_BINARY);
+ uint64_t shader_va = shader->bo->gpu_address;
+
+ assert(shader->config.scratch_bytes_per_wave == 0);
+ assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
+
+ radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
+ radeon_emit(cs, shader_va >> 8);
+ radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
+
+ radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
+ radeon_emit(
+ cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) |
+ S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) |
+ S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) |
+ S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) |
+ S_00B848_WGP_MODE(sctx->chip_class >= GFX10));
+ radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) |
+ S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) |
+ S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) |
+ S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) |
+ S_00B84C_LDS_SIZE(shader->config.lds_size));
+
+ radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
+ ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG,
+ MAX_WAVES_PER_SH, THREADGROUPS_PER_CU));
+ sctx->compute_ib_last_shader = shader;
+ }
+
+ STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0);
+
+ /* Big draw calls are split into smaller dispatches and draw packets. */
+ for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) {
+ unsigned num_subdraw_prims;
+
+ if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims)
+ num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL;
+ else
+ num_subdraw_prims = num_prims - start_prim;
+
+ /* Small dispatches are executed back to back until a specific primitive
+ * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
+ * to start drawing the batch. This batching adds latency to the gfx IB,
+ * but CS_DONE and REWIND are too slow.
+ */
+ if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
+ si_compute_signal_gfx(sctx);
+
+ if (sctx->compute_num_prims_in_batch == 0) {
+ assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
+ sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
+
+ if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
+ radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
+ radeon_emit(gfx_cs, 0);
+
+ si_cp_wait_mem(
+ sctx, gfx_cs,
+ sctx->compute_rewind_va | (uint64_t)sctx->screen->info.address32_hi << 32,
+ REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);
+
+ /* Use INDIRECT_BUFFER to chain to a different buffer
+ * to discard the CP prefetch cache.
+ */
+ sctx->ws->cs_check_space(gfx_cs, 0, true);
+ } else {
+ radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
+ radeon_emit(gfx_cs, 0);
+ }
+ }
+
+ sctx->compute_num_prims_in_batch += num_subdraw_prims;
+
+ uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
+ uint64_t index_va = out_indexbuf_va + start_prim * 12;
+
+ /* Emit the draw packet into the gfx IB. */
+ radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
+ radeon_emit(gfx_cs, num_prims * vertices_per_prim);
+ radeon_emit(gfx_cs, index_va);
+ radeon_emit(gfx_cs, index_va >> 32);
+ radeon_emit(gfx_cs, 0);
+ radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
+
+ /* Continue with the compute IB. */
+ if (start_prim == 0) {
+ uint32_t gds_prim_restart_continue_bit = 0;
+
+ if (sctx->preserve_prim_restart_gds_at_flush) {
+ assert(primitive_restart && info->mode == PIPE_PRIM_TRIANGLE_STRIP);
+ assert(start_prim < 1 << 31);
+ gds_prim_restart_continue_bit = 1 << 31;
+ }
+
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
+ radeon_emit(cs, index_buffers_va);
+ radeon_emit(cs, VERTEX_COUNTER_GDS_MODE == 0
+ ? count_va
+ : VERTEX_COUNTER_GDS_MODE == 1
+ ? gds_offset
+ : start_prim | gds_prim_restart_continue_bit);
+ radeon_emit(cs, start_prim + num_subdraw_prims - 1);
+ radeon_emit(cs, count_va);
+ radeon_emit(cs, vb_desc_va);
+ radeon_emit(cs, vs_const_desc_va);
+ radeon_emit(cs, vs_sampler_desc_va);
+ radeon_emit(cs, base_vertex);
+ radeon_emit(cs, info->start_instance);
+ radeon_emit(cs, num_prims_udiv.multiplier);
+ radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
+ radeon_emit(cs, info->restart_index);
+ /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
+ radeon_emit(cs, fui(small_prim_cull_precision));
+ } else {
+ assert(VERTEX_COUNTER_GDS_MODE == 2);
+ /* Only update the SGPRs that changed. */
+ radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3);
+ radeon_emit(cs, start_prim);
+ radeon_emit(cs, start_prim + num_subdraw_prims - 1);
+ radeon_emit(cs, count_va);
+ }
+
+ /* Set grid dimensions. */
+ unsigned start_block = start_prim / THREADGROUP_SIZE;
+ unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
+ unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
+
+ radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
+ radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
+ S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
+ S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
+
+ radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
+ radeon_emit(cs, 1);
+ radeon_emit(cs, 1);
+ radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
+ S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
+ S_00B800_ORDER_MODE(0 /* launch in order */));
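/* Illustrative sketch, not part of this patch: how the grid dimensions above
 * come out for one subdraw. The threadgroup size used in the example comment
 * is hypothetical; the real THREADGROUP_SIZE is defined earlier in this file.
 */
static unsigned example_dispatch_dim_x(unsigned start_prim, unsigned num_subdraw_prims,
                                       unsigned threadgroup_size)
{
   unsigned start_block = start_prim / threadgroup_size;
   unsigned num_full_blocks = num_subdraw_prims / threadgroup_size;
   unsigned partial_block_size = num_subdraw_prims % threadgroup_size;

   /* E.g. 150 prims with a 64-thread group: 2 full groups + a 22-thread
    * partial group, so the dispatch covers start_block + 3 blocks. */
   return start_block + num_full_blocks + !!partial_block_size;
}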
+
+ /* This is only for unordered append. Ordered append writes this from
+ * the shader.
+ *
+ * Note that EOP and EOS events are super slow, so emulating the event
+ * in a shader is an important optimization.
+ */
+ if (VERTEX_COUNTER_GDS_MODE == 1) {
+ si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0,
+ sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2,
+ EOP_INT_SEL_NONE, EOP_DATA_SEL_GDS, NULL,
+ count_va | ((uint64_t)sctx->screen->info.address32_hi << 32),
+ EOP_DATA_GDS(gds_offset / 4, 1), SI_NOT_QUERY);
+
+ /* Now that compute shaders are running, clear the remainder of GDS. */
+ if (first_dispatch) {
+ unsigned offset = gds_offset + gds_size;
+ si_cp_dma_clear_buffer(
+ sctx, cs, NULL, offset, GDS_SIZE_UNORDERED - offset, 0,
+ SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_GFX_SYNC | SI_CPDMA_SKIP_SYNC_BEFORE,
+ SI_COHERENCY_NONE, L2_BYPASS);
+ }
+ }
+ first_dispatch = false;
+
+ assert(cs->current.cdw <= cs->current.max_dw);
+ assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
+ }
}
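Each subdraw above is dispatched as whole threadgroups: COMPUTE_START_X keeps counting blocks across subdraws, NUM_THREAD_PARTIAL covers the tail that does not fill a full group, and the dispatch adds one extra group when that tail exists. A standalone sketch of the group-count arithmetic; the THREADGROUP_SIZE value and the sample primitive counts are illustrative, not taken from the driver:

#include <assert.h>
#include <stdio.h>

#define THREADGROUP_SIZE 256 /* illustrative; the driver's value may differ */

/* One thread processes one primitive, so a subdraw of N prims needs
 * N / THREADGROUP_SIZE full groups plus one partial group for the remainder. */
static unsigned num_threadgroups(unsigned num_subdraw_prims)
{
   unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
   unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;

   return num_full_blocks + !!partial_block_size;
}

int main(void)
{
   assert(num_threadgroups(256) == 1); /* exactly one full group */
   assert(num_threadgroups(300) == 2); /* one full + one partial group */
   assert(num_threadgroups(10) == 1);  /* partial group only */
   printf("threadgroup sizing ok\n");
   return 0;
}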
/* Set this if you want the ME to wait until CP DMA is done.
* It should be set on the last CP DMA packet. */
-#define CP_DMA_SYNC (1 << 0)
+#define CP_DMA_SYNC (1 << 0)
/* Set this if the source data was used as a destination in a previous CP DMA
* packet. It's for preventing a read-after-write (RAW) hazard between two
* CP DMA packets. */
-#define CP_DMA_RAW_WAIT (1 << 1)
-#define CP_DMA_DST_IS_GDS (1 << 2)
-#define CP_DMA_CLEAR (1 << 3)
-#define CP_DMA_PFP_SYNC_ME (1 << 4)
-#define CP_DMA_SRC_IS_GDS (1 << 5)
+#define CP_DMA_RAW_WAIT (1 << 1)
+#define CP_DMA_DST_IS_GDS (1 << 2)
+#define CP_DMA_CLEAR (1 << 3)
+#define CP_DMA_PFP_SYNC_ME (1 << 4)
+#define CP_DMA_SRC_IS_GDS (1 << 5)
/* The max number of bytes that can be copied per packet. */
static inline unsigned cp_dma_max_byte_count(struct si_context *sctx)
{
- unsigned max = sctx->chip_class >= GFX9 ?
- S_414_BYTE_COUNT_GFX9(~0u) :
- S_414_BYTE_COUNT_GFX6(~0u);
+ unsigned max =
+ sctx->chip_class >= GFX9 ? S_414_BYTE_COUNT_GFX9(~0u) : S_414_BYTE_COUNT_GFX6(~0u);
- /* make it aligned for optimal performance */
- return max & ~(SI_CPDMA_ALIGNMENT - 1);
+ /* make it aligned for optimal performance */
+ return max & ~(SI_CPDMA_ALIGNMENT - 1);
}
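The mask applied above rounds the hardware byte-count limit down to a multiple of SI_CPDMA_ALIGNMENT, so every full-sized packet of a chunked transfer stays aligned. A minimal sketch of that rounding, assuming a 32-byte alignment rather than whatever the driver header defines:

#include <assert.h>

#define SI_CPDMA_ALIGNMENT 32 /* assumed here; the real value comes from the driver headers */

/* Round a byte count down to the CP DMA alignment (alignment must be a power of two). */
static unsigned align_down(unsigned max_bytes)
{
   return max_bytes & ~(SI_CPDMA_ALIGNMENT - 1);
}

int main(void)
{
   assert(align_down(0x1FFFFF) == 0x1FFFE0); /* a 21-bit limit, aligned down */
   assert(align_down(64) == 64);             /* already aligned */
   assert(align_down(33) == 32);
   return 0;
}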
-
/* Emit a CP DMA packet to do a copy from one buffer to another, or to clear
* a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, src_va is a 32-bit
* clear value.
*/
-static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs,
- uint64_t dst_va, uint64_t src_va, unsigned size,
- unsigned flags, enum si_cache_policy cache_policy)
+static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, uint64_t dst_va,
+ uint64_t src_va, unsigned size, unsigned flags,
+ enum si_cache_policy cache_policy)
{
- uint32_t header = 0, command = 0;
-
- assert(size <= cp_dma_max_byte_count(sctx));
- assert(sctx->chip_class != GFX6 || cache_policy == L2_BYPASS);
-
- if (sctx->chip_class >= GFX9)
- command |= S_414_BYTE_COUNT_GFX9(size);
- else
- command |= S_414_BYTE_COUNT_GFX6(size);
-
- /* Sync flags. */
- if (flags & CP_DMA_SYNC)
- header |= S_411_CP_SYNC(1);
- else {
- if (sctx->chip_class >= GFX9)
- command |= S_414_DISABLE_WR_CONFIRM_GFX9(1);
- else
- command |= S_414_DISABLE_WR_CONFIRM_GFX6(1);
- }
-
- if (flags & CP_DMA_RAW_WAIT)
- command |= S_414_RAW_WAIT(1);
-
- /* Src and dst flags. */
- if (sctx->chip_class >= GFX9 && !(flags & CP_DMA_CLEAR) &&
- src_va == dst_va) {
- header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */
- } else if (flags & CP_DMA_DST_IS_GDS) {
- header |= S_411_DST_SEL(V_411_GDS);
- /* GDS increments the address, not CP. */
- command |= S_414_DAS(V_414_REGISTER) |
- S_414_DAIC(V_414_NO_INCREMENT);
- } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) {
- header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2) |
- S_500_DST_CACHE_POLICY(cache_policy == L2_STREAM);
- }
-
- if (flags & CP_DMA_CLEAR) {
- header |= S_411_SRC_SEL(V_411_DATA);
- } else if (flags & CP_DMA_SRC_IS_GDS) {
- header |= S_411_SRC_SEL(V_411_GDS);
- /* Both of these are required for GDS. It does increment the address. */
- command |= S_414_SAS(V_414_REGISTER) |
- S_414_SAIC(V_414_NO_INCREMENT);
- } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) {
- header |= S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) |
- S_500_SRC_CACHE_POLICY(cache_policy == L2_STREAM);
- }
-
- if (sctx->chip_class >= GFX7) {
- radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
- radeon_emit(cs, header);
- radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */
- radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */
- radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
- radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */
- radeon_emit(cs, command);
- } else {
- header |= S_411_SRC_ADDR_HI(src_va >> 32);
-
- radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
- radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */
- radeon_emit(cs, header); /* SRC_ADDR_HI [15:0] + flags. */
- radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
- radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
- radeon_emit(cs, command);
- }
-
- /* CP DMA is executed in ME, but index buffers are read by PFP.
- * This ensures that ME (CP DMA) is idle before PFP starts fetching
- * indices. If we wanted to execute CP DMA in PFP, this packet
- * should precede it.
- */
- if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) {
- radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
- radeon_emit(cs, 0);
- }
+ uint32_t header = 0, command = 0;
+
+ assert(size <= cp_dma_max_byte_count(sctx));
+ assert(sctx->chip_class != GFX6 || cache_policy == L2_BYPASS);
+
+ if (sctx->chip_class >= GFX9)
+ command |= S_414_BYTE_COUNT_GFX9(size);
+ else
+ command |= S_414_BYTE_COUNT_GFX6(size);
+
+ /* Sync flags. */
+ if (flags & CP_DMA_SYNC)
+ header |= S_411_CP_SYNC(1);
+ else {
+ if (sctx->chip_class >= GFX9)
+ command |= S_414_DISABLE_WR_CONFIRM_GFX9(1);
+ else
+ command |= S_414_DISABLE_WR_CONFIRM_GFX6(1);
+ }
+
+ if (flags & CP_DMA_RAW_WAIT)
+ command |= S_414_RAW_WAIT(1);
+
+ /* Src and dst flags. */
+ if (sctx->chip_class >= GFX9 && !(flags & CP_DMA_CLEAR) && src_va == dst_va) {
+ header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */
+ } else if (flags & CP_DMA_DST_IS_GDS) {
+ header |= S_411_DST_SEL(V_411_GDS);
+ /* GDS increments the address, not CP. */
+ command |= S_414_DAS(V_414_REGISTER) | S_414_DAIC(V_414_NO_INCREMENT);
+ } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) {
+ header |=
+ S_411_DST_SEL(V_411_DST_ADDR_TC_L2) | S_500_DST_CACHE_POLICY(cache_policy == L2_STREAM);
+ }
+
+ if (flags & CP_DMA_CLEAR) {
+ header |= S_411_SRC_SEL(V_411_DATA);
+ } else if (flags & CP_DMA_SRC_IS_GDS) {
+ header |= S_411_SRC_SEL(V_411_GDS);
+ /* Both of these are required for GDS. It does increment the address. */
+ command |= S_414_SAS(V_414_REGISTER) | S_414_SAIC(V_414_NO_INCREMENT);
+ } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) {
+ header |=
+ S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | S_500_SRC_CACHE_POLICY(cache_policy == L2_STREAM);
+ }
+
+ if (sctx->chip_class >= GFX7) {
+ radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+ radeon_emit(cs, header);
+ radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */
+ radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */
+ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
+ radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */
+ radeon_emit(cs, command);
+ } else {
+ header |= S_411_SRC_ADDR_HI(src_va >> 32);
+
+ radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
+ radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */
+ radeon_emit(cs, header); /* SRC_ADDR_HI [15:0] + flags. */
+ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
+ radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
+ radeon_emit(cs, command);
+ }
+
+ /* CP DMA is executed in ME, but index buffers are read by PFP.
+ * This ensures that ME (CP DMA) is idle before PFP starts fetching
+ * indices. If we wanted to execute CP DMA in PFP, this packet
+ * should precede it.
+ */
+ if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) {
+ radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+ radeon_emit(cs, 0);
+ }
}
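The header construction in si_emit_cp_dma() picks the source and destination selects in a fixed priority order: a GFX9 same-address, non-clear request becomes a pure L2 prefetch, GDS targets override memory, and otherwise the TC L2 path is used when the chip and cache policy allow it. A compact standalone restatement of that decision order, using placeholder enums instead of the real V_411_* values from sid.h:

#include <stdbool.h>
#include <stdio.h>

/* Placeholder enums; the real selects are the V_411_* values from sid.h. */
enum dst_sel { DST_NOWHERE, DST_GDS, DST_MEM_TC_L2, DST_MEM_BYPASS };
enum src_sel { SRC_DATA, SRC_GDS, SRC_MEM_TC_L2, SRC_MEM_BYPASS };

struct dma_request {
   bool gfx9_or_later, gfx7_or_later;
   bool is_clear, dst_is_gds, src_is_gds, use_tc_l2;
   unsigned long long src_va, dst_va;
};

static enum dst_sel pick_dst(const struct dma_request *r)
{
   if (r->gfx9_or_later && !r->is_clear && r->src_va == r->dst_va)
      return DST_NOWHERE; /* same-address copy == L2 prefetch only */
   if (r->dst_is_gds)
      return DST_GDS;
   if (r->gfx7_or_later && r->use_tc_l2)
      return DST_MEM_TC_L2;
   return DST_MEM_BYPASS;
}

static enum src_sel pick_src(const struct dma_request *r)
{
   if (r->is_clear)
      return SRC_DATA; /* src_va holds the 32-bit clear value */
   if (r->src_is_gds)
      return SRC_GDS;
   if (r->gfx7_or_later && r->use_tc_l2)
      return SRC_MEM_TC_L2;
   return SRC_MEM_BYPASS;
}

int main(void)
{
   struct dma_request prefetch = {.gfx9_or_later = true, .gfx7_or_later = true,
                                  .use_tc_l2 = true, .src_va = 0x1000, .dst_va = 0x1000};
   printf("prefetch dst sel = %d (expect %d)\n", pick_dst(&prefetch), DST_NOWHERE);
   printf("prefetch src sel = %d (expect %d)\n", pick_src(&prefetch), SRC_MEM_TC_L2);
   return 0;
}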
void si_cp_dma_wait_for_idle(struct si_context *sctx)
{
- /* Issue a dummy DMA that copies zero bytes.
- *
- * The DMA engine will see that there's no work to do and skip this
- * DMA request, however, the CP will see the sync flag and still wait
- * for all DMAs to complete.
- */
- si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
+ /* Issue a dummy DMA that copies zero bytes.
+ *
+ * The DMA engine will see that there's no work to do and skip this
+ * DMA request, however, the CP will see the sync flag and still wait
+ * for all DMAs to complete.
+ */
+ si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
}
static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
- struct pipe_resource *src, unsigned byte_count,
- uint64_t remaining_size, unsigned user_flags,
- enum si_coherency coher, bool *is_first,
- unsigned *packet_flags)
+ struct pipe_resource *src, unsigned byte_count,
+ uint64_t remaining_size, unsigned user_flags, enum si_coherency coher,
+ bool *is_first, unsigned *packet_flags)
{
- /* Fast exit for a CPDMA prefetch. */
- if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) {
- *is_first = false;
- return;
- }
-
- if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) {
- /* Count memory usage in so that need_cs_space can take it into account. */
- if (dst)
- si_context_add_resource_size(sctx, dst);
- if (src)
- si_context_add_resource_size(sctx, src);
- }
-
- if (!(user_flags & SI_CPDMA_SKIP_CHECK_CS_SPACE))
- si_need_gfx_cs_space(sctx);
-
- /* This must be done after need_cs_space. */
- if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) {
- if (dst)
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- si_resource(dst),
- RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
- if (src)
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- si_resource(src),
- RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
- }
-
- /* Flush the caches for the first copy only.
- * Also wait for the previous CP DMA operations.
- */
- if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC) && sctx->flags)
- sctx->emit_cache_flush(sctx);
-
- if (!(user_flags & SI_CPDMA_SKIP_SYNC_BEFORE) && *is_first &&
- !(*packet_flags & CP_DMA_CLEAR))
- *packet_flags |= CP_DMA_RAW_WAIT;
-
- *is_first = false;
-
- /* Do the synchronization after the last dma, so that all data
- * is written to memory.
- */
- if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) &&
- byte_count == remaining_size) {
- *packet_flags |= CP_DMA_SYNC;
-
- if (coher == SI_COHERENCY_SHADER)
- *packet_flags |= CP_DMA_PFP_SYNC_ME;
- }
+ /* Fast exit for a CPDMA prefetch. */
+ if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) {
+ *is_first = false;
+ return;
+ }
+
+ if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) {
+ /* Count memory usage so that need_cs_space can take it into account. */
+ if (dst)
+ si_context_add_resource_size(sctx, dst);
+ if (src)
+ si_context_add_resource_size(sctx, src);
+ }
+
+ if (!(user_flags & SI_CPDMA_SKIP_CHECK_CS_SPACE))
+ si_need_gfx_cs_space(sctx);
+
+ /* This must be done after need_cs_space. */
+ if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) {
+ if (dst)
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(dst), RADEON_USAGE_WRITE,
+ RADEON_PRIO_CP_DMA);
+ if (src)
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(src), RADEON_USAGE_READ,
+ RADEON_PRIO_CP_DMA);
+ }
+
+ /* Flush the caches for the first copy only.
+ * Also wait for the previous CP DMA operations.
+ */
+ if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC) && sctx->flags)
+ sctx->emit_cache_flush(sctx);
+
+ if (!(user_flags & SI_CPDMA_SKIP_SYNC_BEFORE) && *is_first && !(*packet_flags & CP_DMA_CLEAR))
+ *packet_flags |= CP_DMA_RAW_WAIT;
+
+ *is_first = false;
+
+ /* Do the synchronization after the last dma, so that all data
+ * is written to memory.
+ */
+ if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) && byte_count == remaining_size) {
+ *packet_flags |= CP_DMA_SYNC;
+
+ if (coher == SI_COHERENCY_SHADER)
+ *packet_flags |= CP_DMA_PFP_SYNC_ME;
+ }
}
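si_cp_dma_prepare() centralizes the per-packet flag policy of a multi-packet transfer: only the first packet may get CP_DMA_RAW_WAIT, and only the packet that consumes the remaining size gets CP_DMA_SYNC (plus CP_DMA_PFP_SYNC_ME for shader coherency). A standalone sketch of how those flags fall out across a chunked copy; the flag values match the defines above, while MAX_PACKET_BYTES and the transfer size are illustrative:

#include <stdbool.h>
#include <stdio.h>

#define CP_DMA_SYNC        (1 << 0)
#define CP_DMA_RAW_WAIT    (1 << 1)
#define CP_DMA_PFP_SYNC_ME (1 << 4)

#define MAX_PACKET_BYTES (1u << 18) /* illustrative per-packet limit */

static unsigned packet_flags(bool *is_first, unsigned byte_count, unsigned long long remaining,
                             bool shader_coherent)
{
   unsigned flags = 0;

   if (*is_first)
      flags |= CP_DMA_RAW_WAIT; /* guard against a previous CP DMA write */
   *is_first = false;

   if (byte_count == remaining) { /* last packet of the transfer */
      flags |= CP_DMA_SYNC;
      if (shader_coherent)
         flags |= CP_DMA_PFP_SYNC_ME;
   }
   return flags;
}

int main(void)
{
   unsigned long long size = 3 * MAX_PACKET_BYTES + 100;
   bool is_first = true;

   while (size) {
      unsigned byte_count = size < MAX_PACKET_BYTES ? (unsigned)size : MAX_PACKET_BYTES;
      unsigned flags = packet_flags(&is_first, byte_count, size, true);

      printf("packet of %u bytes, flags 0x%x\n", byte_count, flags);
      size -= byte_count;
   }
   return 0;
}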
void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
- struct pipe_resource *dst, uint64_t offset,
- uint64_t size, unsigned value, unsigned user_flags,
- enum si_coherency coher, enum si_cache_policy cache_policy)
+ struct pipe_resource *dst, uint64_t offset, uint64_t size,
+ unsigned value, unsigned user_flags, enum si_coherency coher,
+ enum si_cache_policy cache_policy)
{
- struct si_resource *sdst = si_resource(dst);
- uint64_t va = (sdst ? sdst->gpu_address : 0) + offset;
- bool is_first = true;
-
- assert(size && size % 4 == 0);
-
- /* Mark the buffer range of destination as valid (initialized),
- * so that transfer_map knows it should wait for the GPU when mapping
- * that range. */
- if (sdst)
- util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
-
- /* Flush the caches. */
- if (sdst && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
- sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH |
- si_get_flush_flags(sctx, coher, cache_policy);
- }
-
- while (size) {
- unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
- unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS);
-
- si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, user_flags,
- coher, &is_first, &dma_flags);
-
- /* Emit the clear packet. */
- si_emit_cp_dma(sctx, cs, va, value, byte_count, dma_flags, cache_policy);
-
- size -= byte_count;
- va += byte_count;
- }
-
- if (sdst && cache_policy != L2_BYPASS)
- sdst->TC_L2_dirty = true;
-
- /* If it's not a framebuffer fast clear... */
- if (coher == SI_COHERENCY_SHADER) {
- sctx->num_cp_dma_calls++;
- si_prim_discard_signal_next_compute_ib_start(sctx);
- }
+ struct si_resource *sdst = si_resource(dst);
+ uint64_t va = (sdst ? sdst->gpu_address : 0) + offset;
+ bool is_first = true;
+
+ assert(size && size % 4 == 0);
+
+ /* Mark the buffer range of destination as valid (initialized),
+ * so that transfer_map knows it should wait for the GPU when mapping
+ * that range. */
+ if (sdst)
+ util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
+
+ /* Flush the caches. */
+ if (sdst && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+ si_get_flush_flags(sctx, coher, cache_policy);
+ }
+
+ while (size) {
+ unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
+ unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS);
+
+ si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, user_flags, coher, &is_first,
+ &dma_flags);
+
+ /* Emit the clear packet. */
+ si_emit_cp_dma(sctx, cs, va, value, byte_count, dma_flags, cache_policy);
+
+ size -= byte_count;
+ va += byte_count;
+ }
+
+ if (sdst && cache_policy != L2_BYPASS)
+ sdst->TC_L2_dirty = true;
+
+ /* If it's not a framebuffer fast clear... */
+ if (coher == SI_COHERENCY_SHADER) {
+ sctx->num_cp_dma_calls++;
+ si_prim_discard_signal_next_compute_ib_start(sctx);
+ }
}
/**
*
* \param size Remaining size to the CP DMA alignment.
*/
-static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
- unsigned user_flags, enum si_coherency coher,
- enum si_cache_policy cache_policy,
- bool *is_first)
+static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, unsigned user_flags,
+ enum si_coherency coher, enum si_cache_policy cache_policy,
+ bool *is_first)
{
- uint64_t va;
- unsigned dma_flags = 0;
- unsigned scratch_size = SI_CPDMA_ALIGNMENT * 2;
-
- assert(size < SI_CPDMA_ALIGNMENT);
-
- /* Use the scratch buffer as the dummy buffer. The 3D engine should be
- * idle at this point.
- */
- if (!sctx->scratch_buffer ||
- sctx->scratch_buffer->b.b.width0 < scratch_size) {
- si_resource_reference(&sctx->scratch_buffer, NULL);
- sctx->scratch_buffer =
- si_aligned_buffer_create(&sctx->screen->b,
- SI_RESOURCE_FLAG_UNMAPPABLE,
- PIPE_USAGE_DEFAULT,
- scratch_size, 256);
- if (!sctx->scratch_buffer)
- return;
-
- si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
- }
-
- si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
- &sctx->scratch_buffer->b.b, size, size, user_flags,
- coher, is_first, &dma_flags);
-
- va = sctx->scratch_buffer->gpu_address;
- si_emit_cp_dma(sctx, sctx->gfx_cs, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags,
- cache_policy);
+ uint64_t va;
+ unsigned dma_flags = 0;
+ unsigned scratch_size = SI_CPDMA_ALIGNMENT * 2;
+
+ assert(size < SI_CPDMA_ALIGNMENT);
+
+ /* Use the scratch buffer as the dummy buffer. The 3D engine should be
+ * idle at this point.
+ */
+ if (!sctx->scratch_buffer || sctx->scratch_buffer->b.b.width0 < scratch_size) {
+ si_resource_reference(&sctx->scratch_buffer, NULL);
+ sctx->scratch_buffer = si_aligned_buffer_create(&sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE,
+ PIPE_USAGE_DEFAULT, scratch_size, 256);
+ if (!sctx->scratch_buffer)
+ return;
+
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
+ }
+
+ si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b, &sctx->scratch_buffer->b.b, size, size,
+ user_flags, coher, is_first, &dma_flags);
+
+ va = sctx->scratch_buffer->gpu_address;
+ si_emit_cp_dma(sctx, sctx->gfx_cs, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags, cache_policy);
}
/**
*
* \param user_flags bitmask of SI_CPDMA_*
*/
-void si_cp_dma_copy_buffer(struct si_context *sctx,
- struct pipe_resource *dst, struct pipe_resource *src,
- uint64_t dst_offset, uint64_t src_offset, unsigned size,
- unsigned user_flags, enum si_coherency coher,
- enum si_cache_policy cache_policy)
+void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
+ struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
+ unsigned size, unsigned user_flags, enum si_coherency coher,
+ enum si_cache_policy cache_policy)
{
- uint64_t main_dst_offset, main_src_offset;
- unsigned skipped_size = 0;
- unsigned realign_size = 0;
- unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) |
- (src ? 0 : CP_DMA_SRC_IS_GDS);
- bool is_first = true;
-
- assert(size);
-
- if (dst) {
- /* Skip this for the L2 prefetch. */
- if (dst != src || dst_offset != src_offset) {
- /* Mark the buffer range of destination as valid (initialized),
- * so that transfer_map knows it should wait for the GPU when mapping
- * that range. */
- util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset,
- dst_offset + size);
- }
-
- dst_offset += si_resource(dst)->gpu_address;
- }
- if (src)
- src_offset += si_resource(src)->gpu_address;
-
- /* The workarounds aren't needed on Fiji and beyond. */
- if (sctx->family <= CHIP_CARRIZO ||
- sctx->family == CHIP_STONEY) {
- /* If the size is not aligned, we must add a dummy copy at the end
- * just to align the internal counter. Otherwise, the DMA engine
- * would slow down by an order of magnitude for following copies.
- */
- if (size % SI_CPDMA_ALIGNMENT)
- realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT);
-
- /* If the copy begins unaligned, we must start copying from the next
- * aligned block and the skipped part should be copied after everything
- * else has been copied. Only the src alignment matters, not dst.
- *
- * GDS doesn't need the source address to be aligned.
- */
- if (src && src_offset % SI_CPDMA_ALIGNMENT) {
- skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT);
- /* The main part will be skipped if the size is too small. */
- skipped_size = MIN2(skipped_size, size);
- size -= skipped_size;
- }
- }
-
- /* Flush the caches. */
- if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
- sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH |
- si_get_flush_flags(sctx, coher, cache_policy);
- }
-
- /* This is the main part doing the copying. Src is always aligned. */
- main_dst_offset = dst_offset + skipped_size;
- main_src_offset = src_offset + skipped_size;
-
- while (size) {
- unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
- unsigned dma_flags = gds_flags;
-
- si_cp_dma_prepare(sctx, dst, src, byte_count,
- size + skipped_size + realign_size,
- user_flags, coher, &is_first, &dma_flags);
-
- si_emit_cp_dma(sctx, sctx->gfx_cs, main_dst_offset, main_src_offset,
- byte_count, dma_flags, cache_policy);
-
- size -= byte_count;
- main_src_offset += byte_count;
- main_dst_offset += byte_count;
- }
-
- /* Copy the part we skipped because src wasn't aligned. */
- if (skipped_size) {
- unsigned dma_flags = gds_flags;
-
- si_cp_dma_prepare(sctx, dst, src, skipped_size,
- skipped_size + realign_size, user_flags,
- coher, &is_first, &dma_flags);
-
- si_emit_cp_dma(sctx, sctx->gfx_cs, dst_offset, src_offset, skipped_size,
- dma_flags, cache_policy);
- }
-
- /* Finally, realign the engine if the size wasn't aligned. */
- if (realign_size) {
- si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher,
- cache_policy, &is_first);
- }
-
- if (dst && cache_policy != L2_BYPASS)
- si_resource(dst)->TC_L2_dirty = true;
-
- /* If it's not a prefetch or GDS copy... */
- if (dst && src && (dst != src || dst_offset != src_offset)) {
- sctx->num_cp_dma_calls++;
- si_prim_discard_signal_next_compute_ib_start(sctx);
- }
+ uint64_t main_dst_offset, main_src_offset;
+ unsigned skipped_size = 0;
+ unsigned realign_size = 0;
+ unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) | (src ? 0 : CP_DMA_SRC_IS_GDS);
+ bool is_first = true;
+
+ assert(size);
+
+ if (dst) {
+ /* Skip this for the L2 prefetch. */
+ if (dst != src || dst_offset != src_offset) {
+ /* Mark the buffer range of destination as valid (initialized),
+ * so that transfer_map knows it should wait for the GPU when mapping
+ * that range. */
+ util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size);
+ }
+
+ dst_offset += si_resource(dst)->gpu_address;
+ }
+ if (src)
+ src_offset += si_resource(src)->gpu_address;
+
+ /* The workarounds aren't needed on Fiji and beyond. */
+ if (sctx->family <= CHIP_CARRIZO || sctx->family == CHIP_STONEY) {
+ /* If the size is not aligned, we must add a dummy copy at the end
+ * just to align the internal counter. Otherwise, the DMA engine
+ * would slow down by an order of magnitude for following copies.
+ */
+ if (size % SI_CPDMA_ALIGNMENT)
+ realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT);
+
+ /* If the copy begins unaligned, we must start copying from the next
+ * aligned block and the skipped part should be copied after everything
+ * else has been copied. Only the src alignment matters, not dst.
+ *
+ * GDS doesn't need the source address to be aligned.
+ */
+ if (src && src_offset % SI_CPDMA_ALIGNMENT) {
+ skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT);
+ /* The main part will be skipped if the size is too small. */
+ skipped_size = MIN2(skipped_size, size);
+ size -= skipped_size;
+ }
+ }
+
+ /* Flush the caches. */
+ if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+ si_get_flush_flags(sctx, coher, cache_policy);
+ }
+
+ /* This is the main part doing the copying. Src is always aligned. */
+ main_dst_offset = dst_offset + skipped_size;
+ main_src_offset = src_offset + skipped_size;
+
+ while (size) {
+ unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
+ unsigned dma_flags = gds_flags;
+
+ si_cp_dma_prepare(sctx, dst, src, byte_count, size + skipped_size + realign_size, user_flags,
+ coher, &is_first, &dma_flags);
+
+ si_emit_cp_dma(sctx, sctx->gfx_cs, main_dst_offset, main_src_offset, byte_count, dma_flags,
+ cache_policy);
+
+ size -= byte_count;
+ main_src_offset += byte_count;
+ main_dst_offset += byte_count;
+ }
+
+ /* Copy the part we skipped because src wasn't aligned. */
+ if (skipped_size) {
+ unsigned dma_flags = gds_flags;
+
+ si_cp_dma_prepare(sctx, dst, src, skipped_size, skipped_size + realign_size, user_flags,
+ coher, &is_first, &dma_flags);
+
+ si_emit_cp_dma(sctx, sctx->gfx_cs, dst_offset, src_offset, skipped_size, dma_flags,
+ cache_policy);
+ }
+
+ /* Finally, realign the engine if the size wasn't aligned. */
+ if (realign_size) {
+ si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher, cache_policy, &is_first);
+ }
+
+ if (dst && cache_policy != L2_BYPASS)
+ si_resource(dst)->TC_L2_dirty = true;
+
+ /* If it's not a prefetch or GDS copy... */
+ if (dst && src && (dst != src || dst_offset != src_offset)) {
+ sctx->num_cp_dma_calls++;
+ si_prim_discard_signal_next_compute_ib_start(sctx);
+ }
}
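si_cp_dma_copy_buffer() handles an unaligned source by copying a small skipped head separately and appending a dummy realignment copy, so the main loop only ever sees aligned chunks. A standalone sketch of that split; SI_CPDMA_ALIGNMENT is assumed to be 32 bytes here:

#include <assert.h>

#define SI_CPDMA_ALIGNMENT 32 /* assumed; the driver defines the real value */

struct copy_split {
   unsigned skipped_size; /* unaligned head, copied after the main part */
   unsigned realign_size; /* dummy copy appended to realign the engine */
   unsigned long long main_size;
};

static struct copy_split split_copy(unsigned long long src_offset, unsigned long long size)
{
   struct copy_split s = {0, 0, size};

   if (size % SI_CPDMA_ALIGNMENT)
      s.realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT);

   if (src_offset % SI_CPDMA_ALIGNMENT) {
      s.skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT);
      if (s.skipped_size > size)
         s.skipped_size = (unsigned)size; /* the whole copy fits in the head */
      s.main_size = size - s.skipped_size;
   }
   return s;
}

int main(void)
{
   struct copy_split s = split_copy(/*src_offset=*/10, /*size=*/100);
   assert(s.skipped_size == 22); /* bytes up to the next 32-byte boundary */
   assert(s.main_size == 78);
   assert(s.realign_size == 28); /* 100 % 32 == 4, so pad by 28 */
   return 0;
}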
-void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
- uint64_t offset, unsigned size)
+void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset,
+ unsigned size)
{
- assert(sctx->chip_class >= GFX7);
+ assert(sctx->chip_class >= GFX7);
- si_cp_dma_copy_buffer(sctx, buf, buf, offset, offset, size,
- SI_CPDMA_SKIP_ALL, SI_COHERENCY_SHADER, L2_LRU);
+ si_cp_dma_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL,
+ SI_COHERENCY_SHADER, L2_LRU);
}
-static void cik_prefetch_shader_async(struct si_context *sctx,
- struct si_pm4_state *state)
+static void cik_prefetch_shader_async(struct si_context *sctx, struct si_pm4_state *state)
{
- struct pipe_resource *bo = &state->bo[0]->b.b;
- assert(state->nbo == 1);
+ struct pipe_resource *bo = &state->bo[0]->b.b;
+ assert(state->nbo == 1);
- cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
+ cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
}
static void cik_prefetch_VBO_descriptors(struct si_context *sctx)
{
- if (!sctx->vertex_elements || !sctx->vertex_elements->vb_desc_list_alloc_size)
- return;
+ if (!sctx->vertex_elements || !sctx->vertex_elements->vb_desc_list_alloc_size)
+ return;
- cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b,
- sctx->vb_descriptors_offset,
- sctx->vertex_elements->vb_desc_list_alloc_size);
+ cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
+ sctx->vertex_elements->vb_desc_list_alloc_size);
}
/**
*/
void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only)
{
- unsigned mask = sctx->prefetch_L2_mask;
- assert(mask);
-
- /* Prefetch shaders and VBO descriptors to TC L2. */
- if (sctx->chip_class >= GFX9) {
- /* Choose the right spot for the VBO prefetch. */
- if (sctx->queued.named.hs) {
- if (mask & SI_PREFETCH_HS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
- if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
- cik_prefetch_VBO_descriptors(sctx);
- if (vertex_stage_only) {
- sctx->prefetch_L2_mask &= ~(SI_PREFETCH_HS |
- SI_PREFETCH_VBO_DESCRIPTORS);
- return;
- }
-
- if (mask & SI_PREFETCH_GS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
- if (mask & SI_PREFETCH_VS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
- } else if (sctx->queued.named.gs) {
- if (mask & SI_PREFETCH_GS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
- if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
- cik_prefetch_VBO_descriptors(sctx);
- if (vertex_stage_only) {
- sctx->prefetch_L2_mask &= ~(SI_PREFETCH_GS |
- SI_PREFETCH_VBO_DESCRIPTORS);
- return;
- }
-
- if (mask & SI_PREFETCH_VS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
- } else {
- if (mask & SI_PREFETCH_VS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
- if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
- cik_prefetch_VBO_descriptors(sctx);
- if (vertex_stage_only) {
- sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS |
- SI_PREFETCH_VBO_DESCRIPTORS);
- return;
- }
- }
- } else {
- /* GFX6-GFX8 */
- /* Choose the right spot for the VBO prefetch. */
- if (sctx->tes_shader.cso) {
- if (mask & SI_PREFETCH_LS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
- if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
- cik_prefetch_VBO_descriptors(sctx);
- if (vertex_stage_only) {
- sctx->prefetch_L2_mask &= ~(SI_PREFETCH_LS |
- SI_PREFETCH_VBO_DESCRIPTORS);
- return;
- }
-
- if (mask & SI_PREFETCH_HS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
- if (mask & SI_PREFETCH_ES)
- cik_prefetch_shader_async(sctx, sctx->queued.named.es);
- if (mask & SI_PREFETCH_GS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
- if (mask & SI_PREFETCH_VS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
- } else if (sctx->gs_shader.cso) {
- if (mask & SI_PREFETCH_ES)
- cik_prefetch_shader_async(sctx, sctx->queued.named.es);
- if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
- cik_prefetch_VBO_descriptors(sctx);
- if (vertex_stage_only) {
- sctx->prefetch_L2_mask &= ~(SI_PREFETCH_ES |
- SI_PREFETCH_VBO_DESCRIPTORS);
- return;
- }
-
- if (mask & SI_PREFETCH_GS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
- if (mask & SI_PREFETCH_VS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
- } else {
- if (mask & SI_PREFETCH_VS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
- if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
- cik_prefetch_VBO_descriptors(sctx);
- if (vertex_stage_only) {
- sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS |
- SI_PREFETCH_VBO_DESCRIPTORS);
- return;
- }
- }
- }
-
- if (mask & SI_PREFETCH_PS)
- cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
-
- sctx->prefetch_L2_mask = 0;
+ unsigned mask = sctx->prefetch_L2_mask;
+ assert(mask);
+
+ /* Prefetch shaders and VBO descriptors to TC L2. */
+ if (sctx->chip_class >= GFX9) {
+ /* Choose the right spot for the VBO prefetch. */
+ if (sctx->queued.named.hs) {
+ if (mask & SI_PREFETCH_HS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
+ if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
+ cik_prefetch_VBO_descriptors(sctx);
+ if (vertex_stage_only) {
+ sctx->prefetch_L2_mask &= ~(SI_PREFETCH_HS | SI_PREFETCH_VBO_DESCRIPTORS);
+ return;
+ }
+
+ if (mask & SI_PREFETCH_GS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
+ if (mask & SI_PREFETCH_VS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+ } else if (sctx->queued.named.gs) {
+ if (mask & SI_PREFETCH_GS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
+ if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
+ cik_prefetch_VBO_descriptors(sctx);
+ if (vertex_stage_only) {
+ sctx->prefetch_L2_mask &= ~(SI_PREFETCH_GS | SI_PREFETCH_VBO_DESCRIPTORS);
+ return;
+ }
+
+ if (mask & SI_PREFETCH_VS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+ } else {
+ if (mask & SI_PREFETCH_VS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+ if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
+ cik_prefetch_VBO_descriptors(sctx);
+ if (vertex_stage_only) {
+ sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | SI_PREFETCH_VBO_DESCRIPTORS);
+ return;
+ }
+ }
+ } else {
+ /* GFX6-GFX8 */
+ /* Choose the right spot for the VBO prefetch. */
+ if (sctx->tes_shader.cso) {
+ if (mask & SI_PREFETCH_LS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
+ if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
+ cik_prefetch_VBO_descriptors(sctx);
+ if (vertex_stage_only) {
+ sctx->prefetch_L2_mask &= ~(SI_PREFETCH_LS | SI_PREFETCH_VBO_DESCRIPTORS);
+ return;
+ }
+
+ if (mask & SI_PREFETCH_HS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
+ if (mask & SI_PREFETCH_ES)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.es);
+ if (mask & SI_PREFETCH_GS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
+ if (mask & SI_PREFETCH_VS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+ } else if (sctx->gs_shader.cso) {
+ if (mask & SI_PREFETCH_ES)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.es);
+ if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
+ cik_prefetch_VBO_descriptors(sctx);
+ if (vertex_stage_only) {
+ sctx->prefetch_L2_mask &= ~(SI_PREFETCH_ES | SI_PREFETCH_VBO_DESCRIPTORS);
+ return;
+ }
+
+ if (mask & SI_PREFETCH_GS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
+ if (mask & SI_PREFETCH_VS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+ } else {
+ if (mask & SI_PREFETCH_VS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+ if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
+ cik_prefetch_VBO_descriptors(sctx);
+ if (vertex_stage_only) {
+ sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | SI_PREFETCH_VBO_DESCRIPTORS);
+ return;
+ }
+ }
+ }
+
+ if (mask & SI_PREFETCH_PS)
+ cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
+
+ sctx->prefetch_L2_mask = 0;
}
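The GFX9 branch above always places the VBO-descriptor prefetch right after the hardware stage that performs the vertex fetch (HS with tessellation, GS with a merged ES, otherwise VS), which is also where a vertex-stage-only prefetch stops. An illustrative summary of that ordering (the '|' marks the vertex-stage-only cut-off); this is a restatement, not driver code:

#include <stdbool.h>
#include <stdio.h>

static void print_gfx9_prefetch_order(bool has_tess, bool has_gs)
{
   if (has_tess)
      printf("HS, VBO descriptors | GS, VS, PS\n");
   else if (has_gs)
      printf("GS, VBO descriptors | VS, PS\n");
   else
      printf("VS, VBO descriptors | PS\n");
}

int main(void)
{
   print_gfx9_prefetch_order(true, false);
   print_gfx9_prefetch_order(false, true);
   print_gfx9_prefetch_order(false, false);
   return 0;
}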
void si_test_gds(struct si_context *sctx)
{
- struct pipe_context *ctx = &sctx->b;
- struct pipe_resource *src, *dst;
- unsigned r[4] = {};
- unsigned offset = debug_get_num_option("OFFSET", 16);
-
- src = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
- dst = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
- si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 0, 4, 0xabcdef01, 0, SI_COHERENCY_SHADER, L2_BYPASS);
- si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 4, 4, 0x23456789, 0, SI_COHERENCY_SHADER, L2_BYPASS);
- si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 8, 4, 0x87654321, 0, SI_COHERENCY_SHADER, L2_BYPASS);
- si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 12, 4, 0xfedcba98, 0, SI_COHERENCY_SHADER, L2_BYPASS);
- si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, 16, 0xdeadbeef, 0, SI_COHERENCY_SHADER, L2_BYPASS);
-
- si_cp_dma_copy_buffer(sctx, NULL, src, offset, 0, 16, 0, SI_COHERENCY_NONE, L2_BYPASS);
- si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS);
-
- pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
- printf("GDS copy = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
- r[0] == 0xabcdef01 && r[1] == 0x23456789 &&
- r[2] == 0x87654321 && r[3] == 0xfedcba98 ? "pass" : "fail");
-
- si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, NULL, offset, 16, 0xc1ea4146, 0, SI_COHERENCY_NONE, L2_BYPASS);
- si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS);
-
- pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
- printf("GDS clear = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
- r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 &&
- r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146 ? "pass" : "fail");
-
- pipe_resource_reference(&src, NULL);
- pipe_resource_reference(&dst, NULL);
- exit(0);
+ struct pipe_context *ctx = &sctx->b;
+ struct pipe_resource *src, *dst;
+ unsigned r[4] = {};
+ unsigned offset = debug_get_num_option("OFFSET", 16);
+
+ src = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
+ dst = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
+ si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 0, 4, 0xabcdef01, 0, SI_COHERENCY_SHADER,
+ L2_BYPASS);
+ si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 4, 4, 0x23456789, 0, SI_COHERENCY_SHADER,
+ L2_BYPASS);
+ si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 8, 4, 0x87654321, 0, SI_COHERENCY_SHADER,
+ L2_BYPASS);
+ si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 12, 4, 0xfedcba98, 0, SI_COHERENCY_SHADER,
+ L2_BYPASS);
+ si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, 16, 0xdeadbeef, 0, SI_COHERENCY_SHADER,
+ L2_BYPASS);
+
+ si_cp_dma_copy_buffer(sctx, NULL, src, offset, 0, 16, 0, SI_COHERENCY_NONE, L2_BYPASS);
+ si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS);
+
+ pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
+ printf("GDS copy = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
+ r[0] == 0xabcdef01 && r[1] == 0x23456789 && r[2] == 0x87654321 && r[3] == 0xfedcba98
+ ? "pass"
+ : "fail");
+
+ si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, NULL, offset, 16, 0xc1ea4146, 0, SI_COHERENCY_NONE,
+ L2_BYPASS);
+ si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS);
+
+ pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
+ printf("GDS clear = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
+ r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 && r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146
+ ? "pass"
+ : "fail");
+
+ pipe_resource_reference(&src, NULL);
+ pipe_resource_reference(&dst, NULL);
+ exit(0);
}
-void si_cp_write_data(struct si_context *sctx, struct si_resource *buf,
- unsigned offset, unsigned size, unsigned dst_sel,
- unsigned engine, const void *data)
+void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset,
+ unsigned size, unsigned dst_sel, unsigned engine, const void *data)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
- assert(offset % 4 == 0);
- assert(size % 4 == 0);
+ assert(offset % 4 == 0);
+ assert(size % 4 == 0);
- if (sctx->chip_class == GFX6 && dst_sel == V_370_MEM)
- dst_sel = V_370_MEM_GRBM;
+ if (sctx->chip_class == GFX6 && dst_sel == V_370_MEM)
+ dst_sel = V_370_MEM_GRBM;
- radeon_add_to_buffer_list(sctx, cs, buf,
- RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
- uint64_t va = buf->gpu_address + offset;
+ radeon_add_to_buffer_list(sctx, cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+ uint64_t va = buf->gpu_address + offset;
- radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size/4, 0));
- radeon_emit(cs, S_370_DST_SEL(dst_sel) |
- S_370_WR_CONFIRM(1) |
- S_370_ENGINE_SEL(engine));
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
- radeon_emit_array(cs, (const uint32_t*)data, size/4);
+ radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0));
+ radeon_emit(cs, S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ radeon_emit_array(cs, (const uint32_t *)data, size / 4);
}
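si_cp_write_data() emits one WRITE_DATA packet whose PKT3 count field is 2 + size/4 because the packet body holds a control dword, a 64-bit address and size/4 payload dwords. A small standalone check of that dword accounting, grounded only in the emit sequence above:

#include <assert.h>

/* Total dwords the function adds to the IB for a write of `size_bytes`:
 * PKT3 header + control dword + 64-bit VA + size/4 payload dwords.
 * The PKT3 count field holds "body dwords - 1", i.e. 2 + size/4. */
static unsigned write_data_packet_dwords(unsigned size_bytes)
{
   assert(size_bytes % 4 == 0);
   return 1 /* header */ + 1 /* DST_SEL/WR_CONFIRM/ENGINE */ + 2 /* VA */ + size_bytes / 4;
}

int main(void)
{
   assert(write_data_packet_dwords(4) == 5);
   assert(write_data_packet_dwords(16) == 8);
   return 0;
}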
-void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs,
- unsigned dst_sel, struct si_resource *dst, unsigned dst_offset,
- unsigned src_sel, struct si_resource *src, unsigned src_offset)
+void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned dst_sel,
+ struct si_resource *dst, unsigned dst_offset, unsigned src_sel,
+ struct si_resource *src, unsigned src_offset)
{
- /* cs can point to the compute IB, which has the buffer list in gfx_cs. */
- if (dst) {
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dst,
- RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
- }
- if (src) {
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, src,
- RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
- }
-
- uint64_t dst_va = (dst ? dst->gpu_address : 0ull) + dst_offset;
- uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset;
-
- radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
- radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) |
- COPY_DATA_DST_SEL(dst_sel) |
- COPY_DATA_WR_CONFIRM);
- radeon_emit(cs, src_va);
- radeon_emit(cs, src_va >> 32);
- radeon_emit(cs, dst_va);
- radeon_emit(cs, dst_va >> 32);
+ /* cs can point to the compute IB, which has the buffer list in gfx_cs. */
+ if (dst) {
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+ }
+ if (src) {
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
+ }
+
+ uint64_t dst_va = (dst ? dst->gpu_address : 0ull) + dst_offset;
+ uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset;
+
+ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+ radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM);
+ radeon_emit(cs, src_va);
+ radeon_emit(cs, src_va >> 32);
+ radeon_emit(cs, dst_va);
+ radeon_emit(cs, dst_va >> 32);
}
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_pipe.h"
+#include "ac_debug.h"
+#include "ac_rtld.h"
+#include "driver_ddebug/dd_util.h"
#include "si_compute.h"
+#include "si_pipe.h"
#include "sid.h"
#include "sid_tables.h"
#include "tgsi/tgsi_from_mesa.h"
-#include "driver_ddebug/dd_util.h"
#include "util/u_dump.h"
#include "util/u_log.h"
#include "util/u_memory.h"
#include "util/u_string.h"
-#include "ac_debug.h"
-#include "ac_rtld.h"
-static void si_dump_bo_list(struct si_context *sctx,
- const struct radeon_saved_cs *saved, FILE *f);
+static void si_dump_bo_list(struct si_context *sctx, const struct radeon_saved_cs *saved, FILE *f);
DEBUG_GET_ONCE_OPTION(replace_shaders, "RADEON_REPLACE_SHADERS", NULL)
* Store a linearized copy of all chunks of \p cs together with the buffer
* list in \p saved.
*/
-void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
- struct radeon_saved_cs *saved, bool get_buffer_list)
+void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, struct radeon_saved_cs *saved,
+ bool get_buffer_list)
{
- uint32_t *buf;
- unsigned i;
-
- /* Save the IB chunks. */
- saved->num_dw = cs->prev_dw + cs->current.cdw;
- saved->ib = MALLOC(4 * saved->num_dw);
- if (!saved->ib)
- goto oom;
-
- buf = saved->ib;
- for (i = 0; i < cs->num_prev; ++i) {
- memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4);
- buf += cs->prev[i].cdw;
- }
- memcpy(buf, cs->current.buf, cs->current.cdw * 4);
-
- if (!get_buffer_list)
- return;
-
- /* Save the buffer list. */
- saved->bo_count = ws->cs_get_buffer_list(cs, NULL);
- saved->bo_list = CALLOC(saved->bo_count,
- sizeof(saved->bo_list[0]));
- if (!saved->bo_list) {
- FREE(saved->ib);
- goto oom;
- }
- ws->cs_get_buffer_list(cs, saved->bo_list);
-
- return;
+ uint32_t *buf;
+ unsigned i;
+
+ /* Save the IB chunks. */
+ saved->num_dw = cs->prev_dw + cs->current.cdw;
+ saved->ib = MALLOC(4 * saved->num_dw);
+ if (!saved->ib)
+ goto oom;
+
+ buf = saved->ib;
+ for (i = 0; i < cs->num_prev; ++i) {
+ memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4);
+ buf += cs->prev[i].cdw;
+ }
+ memcpy(buf, cs->current.buf, cs->current.cdw * 4);
+
+ if (!get_buffer_list)
+ return;
+
+ /* Save the buffer list. */
+ saved->bo_count = ws->cs_get_buffer_list(cs, NULL);
+ saved->bo_list = CALLOC(saved->bo_count, sizeof(saved->bo_list[0]));
+ if (!saved->bo_list) {
+ FREE(saved->ib);
+ goto oom;
+ }
+ ws->cs_get_buffer_list(cs, saved->bo_list);
+
+ return;
oom:
- fprintf(stderr, "%s: out of memory\n", __func__);
- memset(saved, 0, sizeof(*saved));
+ fprintf(stderr, "%s: out of memory\n", __func__);
+ memset(saved, 0, sizeof(*saved));
}
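si_save_cs() linearizes an IB that may consist of several previously chained chunks plus the current one by copying them back to back. A standalone sketch of that concatenation, with plain arrays standing in for the radeon_cmdbuf chunks:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct chunk {
   const uint32_t *buf;
   unsigned cdw;
};

/* Copy every chunk back to back into one freshly allocated dword array. */
static uint32_t *linearize(const struct chunk *chunks, unsigned num_chunks, unsigned *num_dw)
{
   unsigned total = 0;
   for (unsigned i = 0; i < num_chunks; i++)
      total += chunks[i].cdw;

   uint32_t *out = malloc(total * sizeof(uint32_t));
   if (!out)
      return NULL;

   uint32_t *p = out;
   for (unsigned i = 0; i < num_chunks; i++) {
      memcpy(p, chunks[i].buf, chunks[i].cdw * sizeof(uint32_t));
      p += chunks[i].cdw;
   }
   *num_dw = total;
   return out;
}

int main(void)
{
   uint32_t a[] = {1, 2, 3}, b[] = {4, 5};
   struct chunk chunks[] = {{a, 3}, {b, 2}};
   unsigned n;
   uint32_t *ib = linearize(chunks, 2, &n);

   assert(ib && n == 5 && ib[3] == 4);
   free(ib);
   return 0;
}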
void si_clear_saved_cs(struct radeon_saved_cs *saved)
{
- FREE(saved->ib);
- FREE(saved->bo_list);
+ FREE(saved->ib);
+ FREE(saved->bo_list);
- memset(saved, 0, sizeof(*saved));
+ memset(saved, 0, sizeof(*saved));
}
void si_destroy_saved_cs(struct si_saved_cs *scs)
{
- si_clear_saved_cs(&scs->gfx);
- si_resource_reference(&scs->trace_buf, NULL);
- free(scs);
+ si_clear_saved_cs(&scs->gfx);
+ si_resource_reference(&scs->trace_buf, NULL);
+ free(scs);
}
-static void si_dump_shader(struct si_screen *sscreen,
- struct si_shader *shader, FILE *f)
+static void si_dump_shader(struct si_screen *sscreen, struct si_shader *shader, FILE *f)
{
- if (shader->shader_log)
- fwrite(shader->shader_log, shader->shader_log_size, 1, f);
- else
- si_shader_dump(sscreen, shader, NULL, f, false);
+ if (shader->shader_log)
+ fwrite(shader->shader_log, shader->shader_log_size, 1, f);
+ else
+ si_shader_dump(sscreen, shader, NULL, f, false);
- if (shader->bo && sscreen->options.dump_shader_binary) {
- unsigned size = shader->bo->b.b.width0;
- fprintf(f, "BO: VA=%"PRIx64" Size=%u\n", shader->bo->gpu_address, size);
+ if (shader->bo && sscreen->options.dump_shader_binary) {
+ unsigned size = shader->bo->b.b.width0;
+ fprintf(f, "BO: VA=%" PRIx64 " Size=%u\n", shader->bo->gpu_address, size);
- const char *mapped = sscreen->ws->buffer_map(shader->bo->buf, NULL,
- PIPE_TRANSFER_UNSYNCHRONIZED |
- PIPE_TRANSFER_READ |
- RADEON_TRANSFER_TEMPORARY);
+ const char *mapped = sscreen->ws->buffer_map(
+ shader->bo->buf, NULL,
+ PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ | RADEON_TRANSFER_TEMPORARY);
- for (unsigned i = 0; i < size; i += 4) {
- fprintf(f, " %4x: %08x\n", i, *(uint32_t*)(mapped + i));
- }
+ for (unsigned i = 0; i < size; i += 4) {
+ fprintf(f, " %4x: %08x\n", i, *(uint32_t *)(mapped + i));
+ }
- sscreen->ws->buffer_unmap(shader->bo->buf);
+ sscreen->ws->buffer_unmap(shader->bo->buf);
- fprintf(f, "\n");
- }
+ fprintf(f, "\n");
+ }
}
struct si_log_chunk_shader {
- /* The shader destroy code assumes a current context for unlinking of
- * PM4 packets etc.
- *
- * While we should be able to destroy shaders without a context, doing
- * so would happen only very rarely and be therefore likely to fail
- * just when you're trying to debug something. Let's just remember the
- * current context in the chunk.
- */
- struct si_context *ctx;
- struct si_shader *shader;
-
- /* For keep-alive reference counts */
- struct si_shader_selector *sel;
- struct si_compute *program;
+ /* The shader destroy code assumes a current context for unlinking of
+ * PM4 packets etc.
+ *
+ * While we should be able to destroy shaders without a context, doing
+ * so would happen only very rarely and would therefore be likely to fail
+ * just when you're trying to debug something. Let's just remember the
+ * current context in the chunk.
+ */
+ struct si_context *ctx;
+ struct si_shader *shader;
+
+ /* For keep-alive reference counts */
+ struct si_shader_selector *sel;
+ struct si_compute *program;
};
-static void
-si_log_chunk_shader_destroy(void *data)
+static void si_log_chunk_shader_destroy(void *data)
{
- struct si_log_chunk_shader *chunk = data;
- si_shader_selector_reference(chunk->ctx, &chunk->sel, NULL);
- si_compute_reference(&chunk->program, NULL);
- FREE(chunk);
+ struct si_log_chunk_shader *chunk = data;
+ si_shader_selector_reference(chunk->ctx, &chunk->sel, NULL);
+ si_compute_reference(&chunk->program, NULL);
+ FREE(chunk);
}
-static void
-si_log_chunk_shader_print(void *data, FILE *f)
+static void si_log_chunk_shader_print(void *data, FILE *f)
{
- struct si_log_chunk_shader *chunk = data;
- struct si_screen *sscreen = chunk->ctx->screen;
- si_dump_shader(sscreen, chunk->shader, f);
+ struct si_log_chunk_shader *chunk = data;
+ struct si_screen *sscreen = chunk->ctx->screen;
+ si_dump_shader(sscreen, chunk->shader, f);
}
static struct u_log_chunk_type si_log_chunk_type_shader = {
- .destroy = si_log_chunk_shader_destroy,
- .print = si_log_chunk_shader_print,
+ .destroy = si_log_chunk_shader_destroy,
+ .print = si_log_chunk_shader_print,
};
-static void si_dump_gfx_shader(struct si_context *ctx,
- const struct si_shader_ctx_state *state,
- struct u_log_context *log)
+static void si_dump_gfx_shader(struct si_context *ctx, const struct si_shader_ctx_state *state,
+ struct u_log_context *log)
{
- struct si_shader *current = state->current;
+ struct si_shader *current = state->current;
- if (!state->cso || !current)
- return;
+ if (!state->cso || !current)
+ return;
- struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader);
- chunk->ctx = ctx;
- chunk->shader = current;
- si_shader_selector_reference(ctx, &chunk->sel, current->selector);
- u_log_chunk(log, &si_log_chunk_type_shader, chunk);
+ struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader);
+ chunk->ctx = ctx;
+ chunk->shader = current;
+ si_shader_selector_reference(ctx, &chunk->sel, current->selector);
+ u_log_chunk(log, &si_log_chunk_type_shader, chunk);
}
-static void si_dump_compute_shader(struct si_context *ctx,
- struct u_log_context *log)
+static void si_dump_compute_shader(struct si_context *ctx, struct u_log_context *log)
{
- const struct si_cs_shader_state *state = &ctx->cs_shader_state;
+ const struct si_cs_shader_state *state = &ctx->cs_shader_state;
- if (!state->program)
- return;
+ if (!state->program)
+ return;
- struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader);
- chunk->ctx = ctx;
- chunk->shader = &state->program->shader;
- si_compute_reference(&chunk->program, state->program);
- u_log_chunk(log, &si_log_chunk_type_shader, chunk);
+ struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader);
+ chunk->ctx = ctx;
+ chunk->shader = &state->program->shader;
+ si_compute_reference(&chunk->program, state->program);
+ u_log_chunk(log, &si_log_chunk_type_shader, chunk);
}
/**
*/
bool si_replace_shader(unsigned num, struct si_shader_binary *binary)
{
- const char *p = debug_get_option_replace_shaders();
- const char *semicolon;
- char *copy = NULL;
- FILE *f;
- long filesize, nread;
- bool replaced = false;
-
- if (!p)
- return false;
-
- while (*p) {
- unsigned long i;
- char *endp;
- i = strtoul(p, &endp, 0);
-
- p = endp;
- if (*p != ':') {
- fprintf(stderr, "RADEON_REPLACE_SHADERS formatted badly.\n");
- exit(1);
- }
- ++p;
-
- if (i == num)
- break;
-
- p = strchr(p, ';');
- if (!p)
- return false;
- ++p;
- }
- if (!*p)
- return false;
-
- semicolon = strchr(p, ';');
- if (semicolon) {
- p = copy = strndup(p, semicolon - p);
- if (!copy) {
- fprintf(stderr, "out of memory\n");
- return false;
- }
- }
-
- fprintf(stderr, "radeonsi: replace shader %u by %s\n", num, p);
-
- f = fopen(p, "r");
- if (!f) {
- perror("radeonsi: failed to open file");
- goto out_free;
- }
-
- if (fseek(f, 0, SEEK_END) != 0)
- goto file_error;
-
- filesize = ftell(f);
- if (filesize < 0)
- goto file_error;
-
- if (fseek(f, 0, SEEK_SET) != 0)
- goto file_error;
-
- binary->elf_buffer = MALLOC(filesize);
- if (!binary->elf_buffer) {
- fprintf(stderr, "out of memory\n");
- goto out_close;
- }
-
- nread = fread((void*)binary->elf_buffer, 1, filesize, f);
- if (nread != filesize) {
- FREE((void*)binary->elf_buffer);
- binary->elf_buffer = NULL;
- goto file_error;
- }
-
- binary->elf_size = nread;
- replaced = true;
+ const char *p = debug_get_option_replace_shaders();
+ const char *semicolon;
+ char *copy = NULL;
+ FILE *f;
+ long filesize, nread;
+ bool replaced = false;
+
+ if (!p)
+ return false;
+
+ while (*p) {
+ unsigned long i;
+ char *endp;
+ i = strtoul(p, &endp, 0);
+
+ p = endp;
+ if (*p != ':') {
+ fprintf(stderr, "RADEON_REPLACE_SHADERS formatted badly.\n");
+ exit(1);
+ }
+ ++p;
+
+ if (i == num)
+ break;
+
+ p = strchr(p, ';');
+ if (!p)
+ return false;
+ ++p;
+ }
+ if (!*p)
+ return false;
+
+ semicolon = strchr(p, ';');
+ if (semicolon) {
+ p = copy = strndup(p, semicolon - p);
+ if (!copy) {
+ fprintf(stderr, "out of memory\n");
+ return false;
+ }
+ }
+
+ fprintf(stderr, "radeonsi: replace shader %u by %s\n", num, p);
+
+ f = fopen(p, "r");
+ if (!f) {
+ perror("radeonsi: failed to open file");
+ goto out_free;
+ }
+
+ if (fseek(f, 0, SEEK_END) != 0)
+ goto file_error;
+
+ filesize = ftell(f);
+ if (filesize < 0)
+ goto file_error;
+
+ if (fseek(f, 0, SEEK_SET) != 0)
+ goto file_error;
+
+ binary->elf_buffer = MALLOC(filesize);
+ if (!binary->elf_buffer) {
+ fprintf(stderr, "out of memory\n");
+ goto out_close;
+ }
+
+ nread = fread((void *)binary->elf_buffer, 1, filesize, f);
+ if (nread != filesize) {
+ FREE((void *)binary->elf_buffer);
+ binary->elf_buffer = NULL;
+ goto file_error;
+ }
+
+ binary->elf_size = nread;
+ replaced = true;
out_close:
- fclose(f);
+ fclose(f);
out_free:
- free(copy);
- return replaced;
+ free(copy);
+ return replaced;
file_error:
- perror("radeonsi: reading shader");
- goto out_close;
+ perror("radeonsi: reading shader");
+ goto out_close;
}
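si_replace_shader() walks RADEON_REPLACE_SHADERS entries of the form id:path, separated by semicolons, where id is the shader number printed in the dumps and path is an ELF file to load instead of the compiled binary. A minimal standalone parser for the same format, shown only to illustrate the expected syntax (the example paths are hypothetical):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Scan an "id:path;id:path" list and print which file would replace which shader. */
static void parse_replacements(const char *p)
{
   while (p && *p) {
      char *endp;
      unsigned long id = strtoul(p, &endp, 0);

      if (*endp != ':') {
         fprintf(stderr, "badly formatted entry: %s\n", p);
         return;
      }
      p = endp + 1;

      const char *semicolon = strchr(p, ';');
      size_t len = semicolon ? (size_t)(semicolon - p) : strlen(p);

      printf("shader %lu -> %.*s\n", id, (int)len, p);
      p = semicolon ? semicolon + 1 : NULL;
   }
}

int main(void)
{
   parse_replacements("3:/tmp/shader3.elf;7:/tmp/shader7.elf");
   return 0;
}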
/* Parsed IBs are difficult to read without colors. Use "less -R file" to
* read them, or use "aha -b -f file" to convert them to html.
*/
-#define COLOR_RESET "\033[0m"
-#define COLOR_RED "\033[31m"
-#define COLOR_GREEN "\033[1;32m"
-#define COLOR_YELLOW "\033[1;33m"
-#define COLOR_CYAN "\033[1;36m"
-
-static void si_dump_mmapped_reg(struct si_context *sctx, FILE *f,
- unsigned offset)
+#define COLOR_RESET "\033[0m"
+#define COLOR_RED "\033[31m"
+#define COLOR_GREEN "\033[1;32m"
+#define COLOR_YELLOW "\033[1;33m"
+#define COLOR_CYAN "\033[1;36m"
+
+static void si_dump_mmapped_reg(struct si_context *sctx, FILE *f, unsigned offset)
{
- struct radeon_winsys *ws = sctx->ws;
- uint32_t value;
+ struct radeon_winsys *ws = sctx->ws;
+ uint32_t value;
- if (ws->read_registers(ws, offset, 1, &value))
- ac_dump_reg(f, sctx->chip_class, offset, value, ~0);
+ if (ws->read_registers(ws, offset, 1, &value))
+ ac_dump_reg(f, sctx->chip_class, offset, value, ~0);
}
static void si_dump_debug_registers(struct si_context *sctx, FILE *f)
{
- if (!sctx->screen->info.has_read_registers_query)
- return;
-
- fprintf(f, "Memory-mapped registers:\n");
- si_dump_mmapped_reg(sctx, f, R_008010_GRBM_STATUS);
-
- /* No other registers can be read on DRM < 3.1.0. */
- if (!sctx->screen->info.is_amdgpu ||
- sctx->screen->info.drm_minor < 1) {
- fprintf(f, "\n");
- return;
- }
-
- si_dump_mmapped_reg(sctx, f, R_008008_GRBM_STATUS2);
- si_dump_mmapped_reg(sctx, f, R_008014_GRBM_STATUS_SE0);
- si_dump_mmapped_reg(sctx, f, R_008018_GRBM_STATUS_SE1);
- si_dump_mmapped_reg(sctx, f, R_008038_GRBM_STATUS_SE2);
- si_dump_mmapped_reg(sctx, f, R_00803C_GRBM_STATUS_SE3);
- si_dump_mmapped_reg(sctx, f, R_00D034_SDMA0_STATUS_REG);
- si_dump_mmapped_reg(sctx, f, R_00D834_SDMA1_STATUS_REG);
- if (sctx->chip_class <= GFX8) {
- si_dump_mmapped_reg(sctx, f, R_000E50_SRBM_STATUS);
- si_dump_mmapped_reg(sctx, f, R_000E4C_SRBM_STATUS2);
- si_dump_mmapped_reg(sctx, f, R_000E54_SRBM_STATUS3);
- }
- si_dump_mmapped_reg(sctx, f, R_008680_CP_STAT);
- si_dump_mmapped_reg(sctx, f, R_008674_CP_STALLED_STAT1);
- si_dump_mmapped_reg(sctx, f, R_008678_CP_STALLED_STAT2);
- si_dump_mmapped_reg(sctx, f, R_008670_CP_STALLED_STAT3);
- si_dump_mmapped_reg(sctx, f, R_008210_CP_CPC_STATUS);
- si_dump_mmapped_reg(sctx, f, R_008214_CP_CPC_BUSY_STAT);
- si_dump_mmapped_reg(sctx, f, R_008218_CP_CPC_STALLED_STAT1);
- si_dump_mmapped_reg(sctx, f, R_00821C_CP_CPF_STATUS);
- si_dump_mmapped_reg(sctx, f, R_008220_CP_CPF_BUSY_STAT);
- si_dump_mmapped_reg(sctx, f, R_008224_CP_CPF_STALLED_STAT1);
- fprintf(f, "\n");
+ if (!sctx->screen->info.has_read_registers_query)
+ return;
+
+ fprintf(f, "Memory-mapped registers:\n");
+ si_dump_mmapped_reg(sctx, f, R_008010_GRBM_STATUS);
+
+ /* No other registers can be read on DRM < 3.1.0. */
+ if (!sctx->screen->info.is_amdgpu || sctx->screen->info.drm_minor < 1) {
+ fprintf(f, "\n");
+ return;
+ }
+
+ si_dump_mmapped_reg(sctx, f, R_008008_GRBM_STATUS2);
+ si_dump_mmapped_reg(sctx, f, R_008014_GRBM_STATUS_SE0);
+ si_dump_mmapped_reg(sctx, f, R_008018_GRBM_STATUS_SE1);
+ si_dump_mmapped_reg(sctx, f, R_008038_GRBM_STATUS_SE2);
+ si_dump_mmapped_reg(sctx, f, R_00803C_GRBM_STATUS_SE3);
+ si_dump_mmapped_reg(sctx, f, R_00D034_SDMA0_STATUS_REG);
+ si_dump_mmapped_reg(sctx, f, R_00D834_SDMA1_STATUS_REG);
+ if (sctx->chip_class <= GFX8) {
+ si_dump_mmapped_reg(sctx, f, R_000E50_SRBM_STATUS);
+ si_dump_mmapped_reg(sctx, f, R_000E4C_SRBM_STATUS2);
+ si_dump_mmapped_reg(sctx, f, R_000E54_SRBM_STATUS3);
+ }
+ si_dump_mmapped_reg(sctx, f, R_008680_CP_STAT);
+ si_dump_mmapped_reg(sctx, f, R_008674_CP_STALLED_STAT1);
+ si_dump_mmapped_reg(sctx, f, R_008678_CP_STALLED_STAT2);
+ si_dump_mmapped_reg(sctx, f, R_008670_CP_STALLED_STAT3);
+ si_dump_mmapped_reg(sctx, f, R_008210_CP_CPC_STATUS);
+ si_dump_mmapped_reg(sctx, f, R_008214_CP_CPC_BUSY_STAT);
+ si_dump_mmapped_reg(sctx, f, R_008218_CP_CPC_STALLED_STAT1);
+ si_dump_mmapped_reg(sctx, f, R_00821C_CP_CPF_STATUS);
+ si_dump_mmapped_reg(sctx, f, R_008220_CP_CPF_BUSY_STAT);
+ si_dump_mmapped_reg(sctx, f, R_008224_CP_CPF_STALLED_STAT1);
+ fprintf(f, "\n");
}
struct si_log_chunk_cs {
- struct si_context *ctx;
- struct si_saved_cs *cs;
- bool dump_bo_list;
- unsigned gfx_begin, gfx_end;
- unsigned compute_begin, compute_end;
+ struct si_context *ctx;
+ struct si_saved_cs *cs;
+ bool dump_bo_list;
+ unsigned gfx_begin, gfx_end;
+ unsigned compute_begin, compute_end;
};
static void si_log_chunk_type_cs_destroy(void *data)
{
- struct si_log_chunk_cs *chunk = data;
- si_saved_cs_reference(&chunk->cs, NULL);
- free(chunk);
+ struct si_log_chunk_cs *chunk = data;
+ si_saved_cs_reference(&chunk->cs, NULL);
+ free(chunk);
}
-static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs,
- unsigned begin, unsigned end,
- int *last_trace_id, unsigned trace_id_count,
- const char *name, enum chip_class chip_class)
+static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs, unsigned begin, unsigned end,
+ int *last_trace_id, unsigned trace_id_count, const char *name,
+ enum chip_class chip_class)
{
- unsigned orig_end = end;
+ unsigned orig_end = end;
- assert(begin <= end);
+ assert(begin <= end);
- fprintf(f, "------------------ %s begin (dw = %u) ------------------\n",
- name, begin);
+ fprintf(f, "------------------ %s begin (dw = %u) ------------------\n", name, begin);
- for (unsigned prev_idx = 0; prev_idx < cs->num_prev; ++prev_idx) {
- struct radeon_cmdbuf_chunk *chunk = &cs->prev[prev_idx];
+ for (unsigned prev_idx = 0; prev_idx < cs->num_prev; ++prev_idx) {
+ struct radeon_cmdbuf_chunk *chunk = &cs->prev[prev_idx];
- if (begin < chunk->cdw) {
- ac_parse_ib_chunk(f, chunk->buf + begin,
- MIN2(end, chunk->cdw) - begin,
- last_trace_id, trace_id_count,
- chip_class, NULL, NULL);
- }
+ if (begin < chunk->cdw) {
+ ac_parse_ib_chunk(f, chunk->buf + begin, MIN2(end, chunk->cdw) - begin, last_trace_id,
+ trace_id_count, chip_class, NULL, NULL);
+ }
- if (end <= chunk->cdw)
- return;
+ if (end <= chunk->cdw)
+ return;
- if (begin < chunk->cdw)
- fprintf(f, "\n---------- Next %s Chunk ----------\n\n",
- name);
+ if (begin < chunk->cdw)
+ fprintf(f, "\n---------- Next %s Chunk ----------\n\n", name);
- begin -= MIN2(begin, chunk->cdw);
- end -= chunk->cdw;
- }
+ begin -= MIN2(begin, chunk->cdw);
+ end -= chunk->cdw;
+ }
- assert(end <= cs->current.cdw);
+ assert(end <= cs->current.cdw);
- ac_parse_ib_chunk(f, cs->current.buf + begin, end - begin, last_trace_id,
- trace_id_count, chip_class, NULL, NULL);
+ ac_parse_ib_chunk(f, cs->current.buf + begin, end - begin, last_trace_id, trace_id_count,
+ chip_class, NULL, NULL);
- fprintf(f, "------------------- %s end (dw = %u) -------------------\n\n",
- name, orig_end);
+ fprintf(f, "------------------- %s end (dw = %u) -------------------\n\n", name, orig_end);
}
static void si_log_chunk_type_cs_print(void *data, FILE *f)
{
- struct si_log_chunk_cs *chunk = data;
- struct si_context *ctx = chunk->ctx;
- struct si_saved_cs *scs = chunk->cs;
- int last_trace_id = -1;
- int last_compute_trace_id = -1;
-
- /* We are expecting that the ddebug pipe has already
- * waited for the context, so this buffer should be idle.
- * If the GPU is hung, there is no point in waiting for it.
- */
- uint32_t *map = ctx->ws->buffer_map(scs->trace_buf->buf,
- NULL,
- PIPE_TRANSFER_UNSYNCHRONIZED |
- PIPE_TRANSFER_READ);
- if (map) {
- last_trace_id = map[0];
- last_compute_trace_id = map[1];
- }
-
- if (chunk->gfx_end != chunk->gfx_begin) {
- if (chunk->gfx_begin == 0) {
- if (ctx->init_config)
- ac_parse_ib(f, ctx->init_config->pm4, ctx->init_config->ndw,
- NULL, 0, "IB2: Init config", ctx->chip_class,
- NULL, NULL);
-
- if (ctx->init_config_gs_rings)
- ac_parse_ib(f, ctx->init_config_gs_rings->pm4,
- ctx->init_config_gs_rings->ndw,
- NULL, 0, "IB2: Init GS rings", ctx->chip_class,
- NULL, NULL);
- }
-
- if (scs->flushed) {
- ac_parse_ib(f, scs->gfx.ib + chunk->gfx_begin,
- chunk->gfx_end - chunk->gfx_begin,
- &last_trace_id, map ? 1 : 0, "IB", ctx->chip_class,
- NULL, NULL);
- } else {
- si_parse_current_ib(f, ctx->gfx_cs, chunk->gfx_begin,
- chunk->gfx_end, &last_trace_id, map ? 1 : 0,
- "IB", ctx->chip_class);
- }
- }
-
- if (chunk->compute_end != chunk->compute_begin) {
- assert(ctx->prim_discard_compute_cs);
-
- if (scs->flushed) {
- ac_parse_ib(f, scs->compute.ib + chunk->compute_begin,
- chunk->compute_end - chunk->compute_begin,
- &last_compute_trace_id, map ? 1 : 0, "Compute IB", ctx->chip_class,
- NULL, NULL);
- } else {
- si_parse_current_ib(f, ctx->prim_discard_compute_cs, chunk->compute_begin,
- chunk->compute_end, &last_compute_trace_id,
- map ? 1 : 0, "Compute IB", ctx->chip_class);
- }
- }
-
- if (chunk->dump_bo_list) {
- fprintf(f, "Flushing. Time: ");
- util_dump_ns(f, scs->time_flush);
- fprintf(f, "\n\n");
- si_dump_bo_list(ctx, &scs->gfx, f);
- }
+ struct si_log_chunk_cs *chunk = data;
+ struct si_context *ctx = chunk->ctx;
+ struct si_saved_cs *scs = chunk->cs;
+ int last_trace_id = -1;
+ int last_compute_trace_id = -1;
+
+ /* We are expecting that the ddebug pipe has already
+ * waited for the context, so this buffer should be idle.
+ * If the GPU is hung, there is no point in waiting for it.
+ */
+ uint32_t *map = ctx->ws->buffer_map(scs->trace_buf->buf, NULL,
+ PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ);
+ if (map) {
+ last_trace_id = map[0];
+ last_compute_trace_id = map[1];
+ }
+
+ if (chunk->gfx_end != chunk->gfx_begin) {
+ if (chunk->gfx_begin == 0) {
+ if (ctx->init_config)
+ ac_parse_ib(f, ctx->init_config->pm4, ctx->init_config->ndw, NULL, 0,
+ "IB2: Init config", ctx->chip_class, NULL, NULL);
+
+ if (ctx->init_config_gs_rings)
+ ac_parse_ib(f, ctx->init_config_gs_rings->pm4, ctx->init_config_gs_rings->ndw, NULL, 0,
+ "IB2: Init GS rings", ctx->chip_class, NULL, NULL);
+ }
+
+ if (scs->flushed) {
+ ac_parse_ib(f, scs->gfx.ib + chunk->gfx_begin, chunk->gfx_end - chunk->gfx_begin,
+ &last_trace_id, map ? 1 : 0, "IB", ctx->chip_class, NULL, NULL);
+ } else {
+ si_parse_current_ib(f, ctx->gfx_cs, chunk->gfx_begin, chunk->gfx_end, &last_trace_id,
+ map ? 1 : 0, "IB", ctx->chip_class);
+ }
+ }
+
+ if (chunk->compute_end != chunk->compute_begin) {
+ assert(ctx->prim_discard_compute_cs);
+
+ if (scs->flushed) {
+ ac_parse_ib(f, scs->compute.ib + chunk->compute_begin,
+ chunk->compute_end - chunk->compute_begin, &last_compute_trace_id, map ? 1 : 0,
+ "Compute IB", ctx->chip_class, NULL, NULL);
+ } else {
+ si_parse_current_ib(f, ctx->prim_discard_compute_cs, chunk->compute_begin,
+ chunk->compute_end, &last_compute_trace_id, map ? 1 : 0, "Compute IB",
+ ctx->chip_class);
+ }
+ }
+
+ if (chunk->dump_bo_list) {
+ fprintf(f, "Flushing. Time: ");
+ util_dump_ns(f, scs->time_flush);
+ fprintf(f, "\n\n");
+ si_dump_bo_list(ctx, &scs->gfx, f);
+ }
}
static const struct u_log_chunk_type si_log_chunk_type_cs = {
- .destroy = si_log_chunk_type_cs_destroy,
- .print = si_log_chunk_type_cs_print,
+ .destroy = si_log_chunk_type_cs_destroy,
+ .print = si_log_chunk_type_cs_print,
};
-static void si_log_cs(struct si_context *ctx, struct u_log_context *log,
- bool dump_bo_list)
+static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool dump_bo_list)
{
- assert(ctx->current_saved_cs);
+ assert(ctx->current_saved_cs);
- struct si_saved_cs *scs = ctx->current_saved_cs;
- unsigned gfx_cur = ctx->gfx_cs->prev_dw + ctx->gfx_cs->current.cdw;
- unsigned compute_cur = 0;
+ struct si_saved_cs *scs = ctx->current_saved_cs;
+ unsigned gfx_cur = ctx->gfx_cs->prev_dw + ctx->gfx_cs->current.cdw;
+ unsigned compute_cur = 0;
- if (ctx->prim_discard_compute_cs)
- compute_cur = ctx->prim_discard_compute_cs->prev_dw + ctx->prim_discard_compute_cs->current.cdw;
+ if (ctx->prim_discard_compute_cs)
+ compute_cur =
+ ctx->prim_discard_compute_cs->prev_dw + ctx->prim_discard_compute_cs->current.cdw;
- if (!dump_bo_list &&
- gfx_cur == scs->gfx_last_dw &&
- compute_cur == scs->compute_last_dw)
- return;
+ if (!dump_bo_list && gfx_cur == scs->gfx_last_dw && compute_cur == scs->compute_last_dw)
+ return;
- struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk));
+ struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk));
- chunk->ctx = ctx;
- si_saved_cs_reference(&chunk->cs, scs);
- chunk->dump_bo_list = dump_bo_list;
+ chunk->ctx = ctx;
+ si_saved_cs_reference(&chunk->cs, scs);
+ chunk->dump_bo_list = dump_bo_list;
- chunk->gfx_begin = scs->gfx_last_dw;
- chunk->gfx_end = gfx_cur;
- scs->gfx_last_dw = gfx_cur;
+ chunk->gfx_begin = scs->gfx_last_dw;
+ chunk->gfx_end = gfx_cur;
+ scs->gfx_last_dw = gfx_cur;
- chunk->compute_begin = scs->compute_last_dw;
- chunk->compute_end = compute_cur;
- scs->compute_last_dw = compute_cur;
+ chunk->compute_begin = scs->compute_last_dw;
+ chunk->compute_end = compute_cur;
+ scs->compute_last_dw = compute_cur;
- u_log_chunk(log, &si_log_chunk_type_cs, chunk);
+ u_log_chunk(log, &si_log_chunk_type_cs, chunk);
}
void si_auto_log_cs(void *data, struct u_log_context *log)
{
- struct si_context *ctx = (struct si_context *)data;
- si_log_cs(ctx, log, false);
+ struct si_context *ctx = (struct si_context *)data;
+ si_log_cs(ctx, log, false);
}
void si_log_hw_flush(struct si_context *sctx)
{
- if (!sctx->log)
- return;
-
- si_log_cs(sctx, sctx->log, true);
-
- if (&sctx->b == sctx->screen->aux_context) {
- /* The aux context isn't captured by the ddebug wrapper,
- * so we dump it on a flush-by-flush basis here.
- */
- FILE *f = dd_get_debug_file(false);
- if (!f) {
- fprintf(stderr, "radeonsi: error opening aux context dump file.\n");
- } else {
- dd_write_header(f, &sctx->screen->b, 0);
-
- fprintf(f, "Aux context dump:\n\n");
- u_log_new_page_print(sctx->log, f);
-
- fclose(f);
- }
- }
+ if (!sctx->log)
+ return;
+
+ si_log_cs(sctx, sctx->log, true);
+
+ if (&sctx->b == sctx->screen->aux_context) {
+ /* The aux context isn't captured by the ddebug wrapper,
+ * so we dump it on a flush-by-flush basis here.
+ */
+ FILE *f = dd_get_debug_file(false);
+ if (!f) {
+ fprintf(stderr, "radeonsi: error opening aux context dump file.\n");
+ } else {
+ dd_write_header(f, &sctx->screen->b, 0);
+
+ fprintf(f, "Aux context dump:\n\n");
+ u_log_new_page_print(sctx->log, f);
+
+ fclose(f);
+ }
+ }
}
static const char *priority_to_string(enum radeon_bo_priority priority)
{
#define ITEM(x) [RADEON_PRIO_##x] = #x
- static const char *table[64] = {
- ITEM(FENCE),
- ITEM(TRACE),
- ITEM(SO_FILLED_SIZE),
- ITEM(QUERY),
- ITEM(IB1),
- ITEM(IB2),
- ITEM(DRAW_INDIRECT),
- ITEM(INDEX_BUFFER),
- ITEM(CP_DMA),
- ITEM(CONST_BUFFER),
- ITEM(DESCRIPTORS),
- ITEM(BORDER_COLORS),
- ITEM(SAMPLER_BUFFER),
- ITEM(VERTEX_BUFFER),
- ITEM(SHADER_RW_BUFFER),
- ITEM(COMPUTE_GLOBAL),
- ITEM(SAMPLER_TEXTURE),
- ITEM(SHADER_RW_IMAGE),
- ITEM(SAMPLER_TEXTURE_MSAA),
- ITEM(COLOR_BUFFER),
- ITEM(DEPTH_BUFFER),
- ITEM(COLOR_BUFFER_MSAA),
- ITEM(DEPTH_BUFFER_MSAA),
- ITEM(SEPARATE_META),
- ITEM(SHADER_BINARY),
- ITEM(SHADER_RINGS),
- ITEM(SCRATCH_BUFFER),
- };
+ static const char *table[64] = {
+ ITEM(FENCE),
+ ITEM(TRACE),
+ ITEM(SO_FILLED_SIZE),
+ ITEM(QUERY),
+ ITEM(IB1),
+ ITEM(IB2),
+ ITEM(DRAW_INDIRECT),
+ ITEM(INDEX_BUFFER),
+ ITEM(CP_DMA),
+ ITEM(CONST_BUFFER),
+ ITEM(DESCRIPTORS),
+ ITEM(BORDER_COLORS),
+ ITEM(SAMPLER_BUFFER),
+ ITEM(VERTEX_BUFFER),
+ ITEM(SHADER_RW_BUFFER),
+ ITEM(COMPUTE_GLOBAL),
+ ITEM(SAMPLER_TEXTURE),
+ ITEM(SHADER_RW_IMAGE),
+ ITEM(SAMPLER_TEXTURE_MSAA),
+ ITEM(COLOR_BUFFER),
+ ITEM(DEPTH_BUFFER),
+ ITEM(COLOR_BUFFER_MSAA),
+ ITEM(DEPTH_BUFFER_MSAA),
+ ITEM(SEPARATE_META),
+ ITEM(SHADER_BINARY),
+ ITEM(SHADER_RINGS),
+ ITEM(SCRATCH_BUFFER),
+ };
#undef ITEM
- assert(priority < ARRAY_SIZE(table));
- return table[priority];
+ assert(priority < ARRAY_SIZE(table));
+ return table[priority];
}
static int bo_list_compare_va(const struct radeon_bo_list_item *a,
- const struct radeon_bo_list_item *b)
+ const struct radeon_bo_list_item *b)
{
- return a->vm_address < b->vm_address ? -1 :
- a->vm_address > b->vm_address ? 1 : 0;
+ return a->vm_address < b->vm_address ? -1 : a->vm_address > b->vm_address ? 1 : 0;
}
-static void si_dump_bo_list(struct si_context *sctx,
- const struct radeon_saved_cs *saved, FILE *f)
+static void si_dump_bo_list(struct si_context *sctx, const struct radeon_saved_cs *saved, FILE *f)
{
- unsigned i,j;
-
- if (!saved->bo_list)
- return;
-
- /* Sort the list according to VM adddresses first. */
- qsort(saved->bo_list, saved->bo_count,
- sizeof(saved->bo_list[0]), (void*)bo_list_compare_va);
-
- fprintf(f, "Buffer list (in units of pages = 4kB):\n"
- COLOR_YELLOW " Size VM start page "
- "VM end page Usage" COLOR_RESET "\n");
-
- for (i = 0; i < saved->bo_count; i++) {
- /* Note: Buffer sizes are expected to be aligned to 4k by the winsys. */
- const unsigned page_size = sctx->screen->info.gart_page_size;
- uint64_t va = saved->bo_list[i].vm_address;
- uint64_t size = saved->bo_list[i].bo_size;
- bool hit = false;
-
- /* If there's unused virtual memory between 2 buffers, print it. */
- if (i) {
- uint64_t previous_va_end = saved->bo_list[i-1].vm_address +
- saved->bo_list[i-1].bo_size;
-
- if (va > previous_va_end) {
- fprintf(f, " %10"PRIu64" -- hole --\n",
- (va - previous_va_end) / page_size);
- }
- }
-
- /* Print the buffer. */
- fprintf(f, " %10"PRIu64" 0x%013"PRIX64" 0x%013"PRIX64" ",
- size / page_size, va / page_size, (va + size) / page_size);
-
- /* Print the usage. */
- for (j = 0; j < 32; j++) {
- if (!(saved->bo_list[i].priority_usage & (1u << j)))
- continue;
-
- fprintf(f, "%s%s", !hit ? "" : ", ", priority_to_string(j));
- hit = true;
- }
- fprintf(f, "\n");
- }
- fprintf(f, "\nNote: The holes represent memory not used by the IB.\n"
- " Other buffers can still be allocated there.\n\n");
+ unsigned i, j;
+
+ if (!saved->bo_list)
+ return;
+
+ /* Sort the list according to VM addresses first. */
+ qsort(saved->bo_list, saved->bo_count, sizeof(saved->bo_list[0]), (void *)bo_list_compare_va);
+
+ fprintf(f, "Buffer list (in units of pages = 4kB):\n" COLOR_YELLOW
+ " Size VM start page "
+ "VM end page Usage" COLOR_RESET "\n");
+
+ for (i = 0; i < saved->bo_count; i++) {
+ /* Note: Buffer sizes are expected to be aligned to 4k by the winsys. */
+ const unsigned page_size = sctx->screen->info.gart_page_size;
+ uint64_t va = saved->bo_list[i].vm_address;
+ uint64_t size = saved->bo_list[i].bo_size;
+ bool hit = false;
+
+ /* If there's unused virtual memory between 2 buffers, print it. */
+ if (i) {
+ uint64_t previous_va_end =
+ saved->bo_list[i - 1].vm_address + saved->bo_list[i - 1].bo_size;
+
+ if (va > previous_va_end) {
+ fprintf(f, " %10" PRIu64 " -- hole --\n", (va - previous_va_end) / page_size);
+ }
+ }
+
+ /* Print the buffer. */
+ fprintf(f, " %10" PRIu64 " 0x%013" PRIX64 " 0x%013" PRIX64 " ",
+ size / page_size, va / page_size, (va + size) / page_size);
+
+ /* Print the usage. */
+ for (j = 0; j < 32; j++) {
+ if (!(saved->bo_list[i].priority_usage & (1u << j)))
+ continue;
+
+ fprintf(f, "%s%s", !hit ? "" : ", ", priority_to_string(j));
+ hit = true;
+ }
+ fprintf(f, "\n");
+ }
+ fprintf(f, "\nNote: The holes represent memory not used by the IB.\n"
+ " Other buffers can still be allocated there.\n\n");
}
static void si_dump_framebuffer(struct si_context *sctx, struct u_log_context *log)
{
- struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
- struct si_texture *tex;
- int i;
-
- for (i = 0; i < state->nr_cbufs; i++) {
- if (!state->cbufs[i])
- continue;
-
- tex = (struct si_texture*)state->cbufs[i]->texture;
- u_log_printf(log, COLOR_YELLOW "Color buffer %i:" COLOR_RESET "\n", i);
- si_print_texture_info(sctx->screen, tex, log);
- u_log_printf(log, "\n");
- }
-
- if (state->zsbuf) {
- tex = (struct si_texture*)state->zsbuf->texture;
- u_log_printf(log, COLOR_YELLOW "Depth-stencil buffer:" COLOR_RESET "\n");
- si_print_texture_info(sctx->screen, tex, log);
- u_log_printf(log, "\n");
- }
+ struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
+ struct si_texture *tex;
+ int i;
+
+ for (i = 0; i < state->nr_cbufs; i++) {
+ if (!state->cbufs[i])
+ continue;
+
+ tex = (struct si_texture *)state->cbufs[i]->texture;
+ u_log_printf(log, COLOR_YELLOW "Color buffer %i:" COLOR_RESET "\n", i);
+ si_print_texture_info(sctx->screen, tex, log);
+ u_log_printf(log, "\n");
+ }
+
+ if (state->zsbuf) {
+ tex = (struct si_texture *)state->zsbuf->texture;
+ u_log_printf(log, COLOR_YELLOW "Depth-stencil buffer:" COLOR_RESET "\n");
+ si_print_texture_info(sctx->screen, tex, log);
+ u_log_printf(log, "\n");
+ }
}
typedef unsigned (*slot_remap_func)(unsigned);
struct si_log_chunk_desc_list {
- /** Pointer to memory map of buffer where the list is uploader */
- uint32_t *gpu_list;
- /** Reference of buffer where the list is uploaded, so that gpu_list
- * is kept live. */
- struct si_resource *buf;
-
- const char *shader_name;
- const char *elem_name;
- slot_remap_func slot_remap;
- enum chip_class chip_class;
- unsigned element_dw_size;
- unsigned num_elements;
-
- uint32_t list[0];
+ /** Pointer to the memory map of the buffer where the list is uploaded */
+ uint32_t *gpu_list;
+ /** Reference to the buffer where the list is uploaded, so that gpu_list
+ * stays valid. */
+ struct si_resource *buf;
+
+ const char *shader_name;
+ const char *elem_name;
+ slot_remap_func slot_remap;
+ enum chip_class chip_class;
+ unsigned element_dw_size;
+ unsigned num_elements;
+
+ uint32_t list[0];
};
-static void
-si_log_chunk_desc_list_destroy(void *data)
+static void si_log_chunk_desc_list_destroy(void *data)
{
- struct si_log_chunk_desc_list *chunk = data;
- si_resource_reference(&chunk->buf, NULL);
- FREE(chunk);
+ struct si_log_chunk_desc_list *chunk = data;
+ si_resource_reference(&chunk->buf, NULL);
+ FREE(chunk);
}
-static void
-si_log_chunk_desc_list_print(void *data, FILE *f)
+static void si_log_chunk_desc_list_print(void *data, FILE *f)
{
- struct si_log_chunk_desc_list *chunk = data;
- unsigned sq_img_rsrc_word0 = chunk->chip_class >= GFX10 ? R_00A000_SQ_IMG_RSRC_WORD0
- : R_008F10_SQ_IMG_RSRC_WORD0;
-
- for (unsigned i = 0; i < chunk->num_elements; i++) {
- unsigned cpu_dw_offset = i * chunk->element_dw_size;
- unsigned gpu_dw_offset = chunk->slot_remap(i) * chunk->element_dw_size;
- const char *list_note = chunk->gpu_list ? "GPU list" : "CPU list";
- uint32_t *cpu_list = chunk->list + cpu_dw_offset;
- uint32_t *gpu_list = chunk->gpu_list ? chunk->gpu_list + gpu_dw_offset : cpu_list;
-
- fprintf(f, COLOR_GREEN "%s%s slot %u (%s):" COLOR_RESET "\n",
- chunk->shader_name, chunk->elem_name, i, list_note);
-
- switch (chunk->element_dw_size) {
- case 4:
- for (unsigned j = 0; j < 4; j++)
- ac_dump_reg(f, chunk->chip_class,
- R_008F00_SQ_BUF_RSRC_WORD0 + j*4,
- gpu_list[j], 0xffffffff);
- break;
- case 8:
- for (unsigned j = 0; j < 8; j++)
- ac_dump_reg(f, chunk->chip_class,
- sq_img_rsrc_word0 + j*4,
- gpu_list[j], 0xffffffff);
-
- fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n");
- for (unsigned j = 0; j < 4; j++)
- ac_dump_reg(f, chunk->chip_class,
- R_008F00_SQ_BUF_RSRC_WORD0 + j*4,
- gpu_list[4+j], 0xffffffff);
- break;
- case 16:
- for (unsigned j = 0; j < 8; j++)
- ac_dump_reg(f, chunk->chip_class,
- sq_img_rsrc_word0 + j*4,
- gpu_list[j], 0xffffffff);
-
- fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n");
- for (unsigned j = 0; j < 4; j++)
- ac_dump_reg(f, chunk->chip_class,
- R_008F00_SQ_BUF_RSRC_WORD0 + j*4,
- gpu_list[4+j], 0xffffffff);
-
- fprintf(f, COLOR_CYAN " FMASK:" COLOR_RESET "\n");
- for (unsigned j = 0; j < 8; j++)
- ac_dump_reg(f, chunk->chip_class,
- sq_img_rsrc_word0 + j*4,
- gpu_list[8+j], 0xffffffff);
-
- fprintf(f, COLOR_CYAN " Sampler state:" COLOR_RESET "\n");
- for (unsigned j = 0; j < 4; j++)
- ac_dump_reg(f, chunk->chip_class,
- R_008F30_SQ_IMG_SAMP_WORD0 + j*4,
- gpu_list[12+j], 0xffffffff);
- break;
- }
-
- if (memcmp(gpu_list, cpu_list, chunk->element_dw_size * 4) != 0) {
- fprintf(f, COLOR_RED "!!!!! This slot was corrupted in GPU memory !!!!!"
- COLOR_RESET "\n");
- }
-
- fprintf(f, "\n");
- }
-
+ struct si_log_chunk_desc_list *chunk = data;
+ unsigned sq_img_rsrc_word0 =
+ chunk->chip_class >= GFX10 ? R_00A000_SQ_IMG_RSRC_WORD0 : R_008F10_SQ_IMG_RSRC_WORD0;
+
+ for (unsigned i = 0; i < chunk->num_elements; i++) {
+ unsigned cpu_dw_offset = i * chunk->element_dw_size;
+ unsigned gpu_dw_offset = chunk->slot_remap(i) * chunk->element_dw_size;
+ const char *list_note = chunk->gpu_list ? "GPU list" : "CPU list";
+ uint32_t *cpu_list = chunk->list + cpu_dw_offset;
+ uint32_t *gpu_list = chunk->gpu_list ? chunk->gpu_list + gpu_dw_offset : cpu_list;
+
+ fprintf(f, COLOR_GREEN "%s%s slot %u (%s):" COLOR_RESET "\n", chunk->shader_name,
+ chunk->elem_name, i, list_note);
+
+ switch (chunk->element_dw_size) {
+ case 4:
+ for (unsigned j = 0; j < 4; j++)
+ ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[j],
+ 0xffffffff);
+ break;
+ case 8:
+ for (unsigned j = 0; j < 8; j++)
+ ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[j], 0xffffffff);
+
+ fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n");
+ for (unsigned j = 0; j < 4; j++)
+ ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[4 + j],
+ 0xffffffff);
+ break;
+ case 16:
+ for (unsigned j = 0; j < 8; j++)
+ ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[j], 0xffffffff);
+
+ fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n");
+ for (unsigned j = 0; j < 4; j++)
+ ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[4 + j],
+ 0xffffffff);
+
+ fprintf(f, COLOR_CYAN " FMASK:" COLOR_RESET "\n");
+ for (unsigned j = 0; j < 8; j++)
+ ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[8 + j],
+ 0xffffffff);
+
+ fprintf(f, COLOR_CYAN " Sampler state:" COLOR_RESET "\n");
+ for (unsigned j = 0; j < 4; j++)
+ ac_dump_reg(f, chunk->chip_class, R_008F30_SQ_IMG_SAMP_WORD0 + j * 4, gpu_list[12 + j],
+ 0xffffffff);
+ break;
+ }
+
+ if (memcmp(gpu_list, cpu_list, chunk->element_dw_size * 4) != 0) {
+ fprintf(f, COLOR_RED "!!!!! This slot was corrupted in GPU memory !!!!!" COLOR_RESET "\n");
+ }
+
+ fprintf(f, "\n");
+ }
}
static const struct u_log_chunk_type si_log_chunk_type_descriptor_list = {
- .destroy = si_log_chunk_desc_list_destroy,
- .print = si_log_chunk_desc_list_print,
+ .destroy = si_log_chunk_desc_list_destroy,
+ .print = si_log_chunk_desc_list_print,
};
-static void si_dump_descriptor_list(struct si_screen *screen,
- struct si_descriptors *desc,
- const char *shader_name,
- const char *elem_name,
- unsigned element_dw_size,
- unsigned num_elements,
- slot_remap_func slot_remap,
- struct u_log_context *log)
+static void si_dump_descriptor_list(struct si_screen *screen, struct si_descriptors *desc,
+ const char *shader_name, const char *elem_name,
+ unsigned element_dw_size, unsigned num_elements,
+ slot_remap_func slot_remap, struct u_log_context *log)
{
- if (!desc->list)
- return;
-
- /* In some cases, the caller doesn't know how many elements are really
- * uploaded. Reduce num_elements to fit in the range of active slots. */
- unsigned active_range_dw_begin =
- desc->first_active_slot * desc->element_dw_size;
- unsigned active_range_dw_end =
- active_range_dw_begin + desc->num_active_slots * desc->element_dw_size;
-
- while (num_elements > 0) {
- int i = slot_remap(num_elements - 1);
- unsigned dw_begin = i * element_dw_size;
- unsigned dw_end = dw_begin + element_dw_size;
-
- if (dw_begin >= active_range_dw_begin && dw_end <= active_range_dw_end)
- break;
-
- num_elements--;
- }
-
- struct si_log_chunk_desc_list *chunk =
- CALLOC_VARIANT_LENGTH_STRUCT(si_log_chunk_desc_list,
- 4 * element_dw_size * num_elements);
- chunk->shader_name = shader_name;
- chunk->elem_name = elem_name;
- chunk->element_dw_size = element_dw_size;
- chunk->num_elements = num_elements;
- chunk->slot_remap = slot_remap;
- chunk->chip_class = screen->info.chip_class;
-
- si_resource_reference(&chunk->buf, desc->buffer);
- chunk->gpu_list = desc->gpu_list;
-
- for (unsigned i = 0; i < num_elements; ++i) {
- memcpy(&chunk->list[i * element_dw_size],
- &desc->list[slot_remap(i) * element_dw_size],
- 4 * element_dw_size);
- }
-
- u_log_chunk(log, &si_log_chunk_type_descriptor_list, chunk);
+ if (!desc->list)
+ return;
+
+ /* In some cases, the caller doesn't know how many elements are really
+ * uploaded. Reduce num_elements to fit in the range of active slots. */
+ unsigned active_range_dw_begin = desc->first_active_slot * desc->element_dw_size;
+ unsigned active_range_dw_end =
+ active_range_dw_begin + desc->num_active_slots * desc->element_dw_size;
+
+ while (num_elements > 0) {
+ int i = slot_remap(num_elements - 1);
+ unsigned dw_begin = i * element_dw_size;
+ unsigned dw_end = dw_begin + element_dw_size;
+
+ if (dw_begin >= active_range_dw_begin && dw_end <= active_range_dw_end)
+ break;
+
+ num_elements--;
+ }
+
+ struct si_log_chunk_desc_list *chunk =
+ CALLOC_VARIANT_LENGTH_STRUCT(si_log_chunk_desc_list, 4 * element_dw_size * num_elements);
+ chunk->shader_name = shader_name;
+ chunk->elem_name = elem_name;
+ chunk->element_dw_size = element_dw_size;
+ chunk->num_elements = num_elements;
+ chunk->slot_remap = slot_remap;
+ chunk->chip_class = screen->info.chip_class;
+
+ si_resource_reference(&chunk->buf, desc->buffer);
+ chunk->gpu_list = desc->gpu_list;
+
+ for (unsigned i = 0; i < num_elements; ++i) {
+ memcpy(&chunk->list[i * element_dw_size], &desc->list[slot_remap(i) * element_dw_size],
+ 4 * element_dw_size);
+ }
+
+ u_log_chunk(log, &si_log_chunk_type_descriptor_list, chunk);
}
static unsigned si_identity(unsigned slot)
{
- return slot;
+ return slot;
}
-static void si_dump_descriptors(struct si_context *sctx,
- enum pipe_shader_type processor,
- const struct si_shader_info *info,
- struct u_log_context *log)
+static void si_dump_descriptors(struct si_context *sctx, enum pipe_shader_type processor,
+ const struct si_shader_info *info, struct u_log_context *log)
{
- struct si_descriptors *descs =
- &sctx->descriptors[SI_DESCS_FIRST_SHADER +
- processor * SI_NUM_SHADER_DESCS];
- static const char *shader_name[] = {"VS", "PS", "GS", "TCS", "TES", "CS"};
- const char *name = shader_name[processor];
- unsigned enabled_constbuf, enabled_shaderbuf, enabled_samplers;
- unsigned enabled_images;
-
- if (info) {
- enabled_constbuf = info->const_buffers_declared;
- enabled_shaderbuf = info->shader_buffers_declared;
- enabled_samplers = info->samplers_declared;
- enabled_images = info->images_declared;
- } else {
- enabled_constbuf = sctx->const_and_shader_buffers[processor].enabled_mask >>
- SI_NUM_SHADER_BUFFERS;
- enabled_shaderbuf = sctx->const_and_shader_buffers[processor].enabled_mask &
- u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS);
- enabled_shaderbuf = util_bitreverse(enabled_shaderbuf) >>
- (32 - SI_NUM_SHADER_BUFFERS);
- enabled_samplers = sctx->samplers[processor].enabled_mask;
- enabled_images = sctx->images[processor].enabled_mask;
- }
-
- if (processor == PIPE_SHADER_VERTEX &&
- sctx->vb_descriptors_buffer &&
- sctx->vb_descriptors_gpu_list &&
- sctx->vertex_elements) {
- assert(info); /* only CS may not have an info struct */
- struct si_descriptors desc = {};
-
- desc.buffer = sctx->vb_descriptors_buffer;
- desc.list = sctx->vb_descriptors_gpu_list;
- desc.gpu_list = sctx->vb_descriptors_gpu_list;
- desc.element_dw_size = 4;
- desc.num_active_slots = sctx->vertex_elements->vb_desc_list_alloc_size / 16;
-
- si_dump_descriptor_list(sctx->screen, &desc, name,
- " - Vertex buffer", 4, info->num_inputs,
- si_identity, log);
- }
-
- si_dump_descriptor_list(sctx->screen,
- &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS],
- name, " - Constant buffer", 4,
- util_last_bit(enabled_constbuf),
- si_get_constbuf_slot, log);
- si_dump_descriptor_list(sctx->screen,
- &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS],
- name, " - Shader buffer", 4,
- util_last_bit(enabled_shaderbuf),
- si_get_shaderbuf_slot, log);
- si_dump_descriptor_list(sctx->screen,
- &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES],
- name, " - Sampler", 16,
- util_last_bit(enabled_samplers),
- si_get_sampler_slot, log);
- si_dump_descriptor_list(sctx->screen,
- &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES],
- name, " - Image", 8,
- util_last_bit(enabled_images),
- si_get_image_slot, log);
+ struct si_descriptors *descs =
+ &sctx->descriptors[SI_DESCS_FIRST_SHADER + processor * SI_NUM_SHADER_DESCS];
+ static const char *shader_name[] = {"VS", "PS", "GS", "TCS", "TES", "CS"};
+ const char *name = shader_name[processor];
+ unsigned enabled_constbuf, enabled_shaderbuf, enabled_samplers;
+ unsigned enabled_images;
+
+ if (info) {
+ enabled_constbuf = info->const_buffers_declared;
+ enabled_shaderbuf = info->shader_buffers_declared;
+ enabled_samplers = info->samplers_declared;
+ enabled_images = info->images_declared;
+ } else {
+ enabled_constbuf =
+ sctx->const_and_shader_buffers[processor].enabled_mask >> SI_NUM_SHADER_BUFFERS;
+ enabled_shaderbuf = sctx->const_and_shader_buffers[processor].enabled_mask &
+ u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS);
+ enabled_shaderbuf = util_bitreverse(enabled_shaderbuf) >> (32 - SI_NUM_SHADER_BUFFERS);
+ enabled_samplers = sctx->samplers[processor].enabled_mask;
+ enabled_images = sctx->images[processor].enabled_mask;
+ }
+
+ if (processor == PIPE_SHADER_VERTEX && sctx->vb_descriptors_buffer &&
+ sctx->vb_descriptors_gpu_list && sctx->vertex_elements) {
+ assert(info); /* only CS may not have an info struct */
+ struct si_descriptors desc = {};
+
+ desc.buffer = sctx->vb_descriptors_buffer;
+ desc.list = sctx->vb_descriptors_gpu_list;
+ desc.gpu_list = sctx->vb_descriptors_gpu_list;
+ desc.element_dw_size = 4;
+ desc.num_active_slots = sctx->vertex_elements->vb_desc_list_alloc_size / 16;
+
+ si_dump_descriptor_list(sctx->screen, &desc, name, " - Vertex buffer", 4, info->num_inputs,
+ si_identity, log);
+ }
+
+ si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], name,
+ " - Constant buffer", 4, util_last_bit(enabled_constbuf),
+ si_get_constbuf_slot, log);
+ si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], name,
+ " - Shader buffer", 4, util_last_bit(enabled_shaderbuf),
+ si_get_shaderbuf_slot, log);
+ si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], name,
+ " - Sampler", 16, util_last_bit(enabled_samplers), si_get_sampler_slot,
+ log);
+ si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], name,
+ " - Image", 8, util_last_bit(enabled_images), si_get_image_slot, log);
}
static void si_dump_gfx_descriptors(struct si_context *sctx,
- const struct si_shader_ctx_state *state,
- struct u_log_context *log)
+ const struct si_shader_ctx_state *state,
+ struct u_log_context *log)
{
- if (!state->cso || !state->current)
- return;
+ if (!state->cso || !state->current)
+ return;
- si_dump_descriptors(sctx, state->cso->type, &state->cso->info, log);
+ si_dump_descriptors(sctx, state->cso->type, &state->cso->info, log);
}
-static void si_dump_compute_descriptors(struct si_context *sctx,
- struct u_log_context *log)
+static void si_dump_compute_descriptors(struct si_context *sctx, struct u_log_context *log)
{
- if (!sctx->cs_shader_state.program)
- return;
+ if (!sctx->cs_shader_state.program)
+ return;
- si_dump_descriptors(sctx, PIPE_SHADER_COMPUTE, NULL, log);
+ si_dump_descriptors(sctx, PIPE_SHADER_COMPUTE, NULL, log);
}
struct si_shader_inst {
- const char *text; /* start of disassembly for this instruction */
- unsigned textlen;
- unsigned size; /* instruction size = 4 or 8 */
- uint64_t addr; /* instruction address */
+ const char *text; /* start of disassembly for this instruction */
+ unsigned textlen;
+ unsigned size; /* instruction size = 4 or 8 */
+ uint64_t addr; /* instruction address */
};
/**
* The caller must keep \p rtld_binary alive as long as \p instructions are
* used and then close it afterwards.
*/
-static void si_add_split_disasm(struct si_screen *screen,
- struct ac_rtld_binary *rtld_binary,
- struct si_shader_binary *binary,
- uint64_t *addr,
- unsigned *num,
- struct si_shader_inst *instructions,
- enum pipe_shader_type shader_type,
- unsigned wave_size)
+static void si_add_split_disasm(struct si_screen *screen, struct ac_rtld_binary *rtld_binary,
+ struct si_shader_binary *binary, uint64_t *addr, unsigned *num,
+ struct si_shader_inst *instructions,
+ enum pipe_shader_type shader_type, unsigned wave_size)
{
- if (!ac_rtld_open(rtld_binary, (struct ac_rtld_open_info){
- .info = &screen->info,
- .shader_type = tgsi_processor_to_shader_stage(shader_type),
- .wave_size = wave_size,
- .num_parts = 1,
- .elf_ptrs = &binary->elf_buffer,
- .elf_sizes = &binary->elf_size }))
- return;
-
- const char *disasm;
- size_t nbytes;
- if (!ac_rtld_get_section_by_name(rtld_binary, ".AMDGPU.disasm",
- &disasm, &nbytes))
- return;
-
- const char *end = disasm + nbytes;
- while (disasm < end) {
- const char *semicolon = memchr(disasm, ';', end - disasm);
- if (!semicolon)
- break;
-
- struct si_shader_inst *inst = &instructions[(*num)++];
- const char *inst_end = memchr(semicolon + 1, '\n', end - semicolon - 1);
- if (!inst_end)
- inst_end = end;
-
- inst->text = disasm;
- inst->textlen = inst_end - disasm;
-
- inst->addr = *addr;
- /* More than 16 chars after ";" means the instruction is 8 bytes long. */
- inst->size = inst_end - semicolon > 16 ? 8 : 4;
- *addr += inst->size;
-
- if (inst_end == end)
- break;
- disasm = inst_end + 1;
- }
+ if (!ac_rtld_open(rtld_binary, (struct ac_rtld_open_info){
+ .info = &screen->info,
+ .shader_type = tgsi_processor_to_shader_stage(shader_type),
+ .wave_size = wave_size,
+ .num_parts = 1,
+ .elf_ptrs = &binary->elf_buffer,
+ .elf_sizes = &binary->elf_size}))
+ return;
+
+ const char *disasm;
+ size_t nbytes;
+ if (!ac_rtld_get_section_by_name(rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
+ return;
+
+ const char *end = disasm + nbytes;
+ while (disasm < end) {
+ const char *semicolon = memchr(disasm, ';', end - disasm);
+ if (!semicolon)
+ break;
+
+ struct si_shader_inst *inst = &instructions[(*num)++];
+ const char *inst_end = memchr(semicolon + 1, '\n', end - semicolon - 1);
+ if (!inst_end)
+ inst_end = end;
+
+ inst->text = disasm;
+ inst->textlen = inst_end - disasm;
+
+ inst->addr = *addr;
+ /* More than 16 chars after ";" means the instruction is 8 bytes long. */
+ inst->size = inst_end - semicolon > 16 ? 8 : 4;
+ *addr += inst->size;
+
+ if (inst_end == end)
+ break;
+ disasm = inst_end + 1;
+ }
}
/* If the shader is being executed, print its asm instructions, and annotate
* those that are being executed right now with information about waves that
* execute them. This is most useful during a GPU hang.
*/
-static void si_print_annotated_shader(struct si_shader *shader,
- struct ac_wave_info *waves,
- unsigned num_waves,
- FILE *f)
+static void si_print_annotated_shader(struct si_shader *shader, struct ac_wave_info *waves,
+ unsigned num_waves, FILE *f)
{
- if (!shader)
- return;
-
- struct si_screen *screen = shader->selector->screen;
- enum pipe_shader_type shader_type = shader->selector->type;
- uint64_t start_addr = shader->bo->gpu_address;
- uint64_t end_addr = start_addr + shader->bo->b.b.width0;
- unsigned i;
-
- /* See if any wave executes the shader. */
- for (i = 0; i < num_waves; i++) {
- if (start_addr <= waves[i].pc && waves[i].pc <= end_addr)
- break;
- }
- if (i == num_waves)
- return; /* the shader is not being executed */
-
- /* Remember the first found wave. The waves are sorted according to PC. */
- waves = &waves[i];
- num_waves -= i;
-
- /* Get the list of instructions.
- * Buffer size / 4 is the upper bound of the instruction count.
- */
- unsigned num_inst = 0;
- uint64_t inst_addr = start_addr;
- unsigned wave_size = si_get_shader_wave_size(shader);
- struct ac_rtld_binary rtld_binaries[5] = {};
- struct si_shader_inst *instructions =
- calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst));
-
- if (shader->prolog) {
- si_add_split_disasm(screen, &rtld_binaries[0], &shader->prolog->binary,
- &inst_addr, &num_inst, instructions, shader_type, wave_size);
- }
- if (shader->previous_stage) {
- si_add_split_disasm(screen, &rtld_binaries[1], &shader->previous_stage->binary,
- &inst_addr, &num_inst, instructions, shader_type, wave_size);
- }
- if (shader->prolog2) {
- si_add_split_disasm(screen, &rtld_binaries[2], &shader->prolog2->binary,
- &inst_addr, &num_inst, instructions, shader_type, wave_size);
- }
- si_add_split_disasm(screen, &rtld_binaries[3], &shader->binary,
- &inst_addr, &num_inst, instructions, shader_type, wave_size);
- if (shader->epilog) {
- si_add_split_disasm(screen, &rtld_binaries[4], &shader->epilog->binary,
- &inst_addr, &num_inst, instructions, shader_type, wave_size);
- }
-
- fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n",
- si_get_shader_name(shader));
-
- /* Print instructions with annotations. */
- for (i = 0; i < num_inst; i++) {
- struct si_shader_inst *inst = &instructions[i];
-
- fprintf(f, "%.*s [PC=0x%"PRIx64", size=%u]\n",
- inst->textlen, inst->text, inst->addr, inst->size);
-
- /* Print which waves execute the instruction right now. */
- while (num_waves && inst->addr == waves->pc) {
- fprintf(f,
- " " COLOR_GREEN "^ SE%u SH%u CU%u "
- "SIMD%u WAVE%u EXEC=%016"PRIx64 " ",
- waves->se, waves->sh, waves->cu, waves->simd,
- waves->wave, waves->exec);
-
- if (inst->size == 4) {
- fprintf(f, "INST32=%08X" COLOR_RESET "\n",
- waves->inst_dw0);
- } else {
- fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n",
- waves->inst_dw0, waves->inst_dw1);
- }
-
- waves->matched = true;
- waves = &waves[1];
- num_waves--;
- }
- }
-
- fprintf(f, "\n\n");
- free(instructions);
- for (unsigned i = 0; i < ARRAY_SIZE(rtld_binaries); ++i)
- ac_rtld_close(&rtld_binaries[i]);
+ if (!shader)
+ return;
+
+ struct si_screen *screen = shader->selector->screen;
+ enum pipe_shader_type shader_type = shader->selector->type;
+ uint64_t start_addr = shader->bo->gpu_address;
+ uint64_t end_addr = start_addr + shader->bo->b.b.width0;
+ unsigned i;
+
+ /* See if any wave executes the shader. */
+ for (i = 0; i < num_waves; i++) {
+ if (start_addr <= waves[i].pc && waves[i].pc <= end_addr)
+ break;
+ }
+ if (i == num_waves)
+ return; /* the shader is not being executed */
+
+ /* Remember the first found wave. The waves are sorted according to PC. */
+ waves = &waves[i];
+ num_waves -= i;
+
+ /* Get the list of instructions.
+ * Buffer size / 4 is the upper bound of the instruction count.
+ */
+ unsigned num_inst = 0;
+ uint64_t inst_addr = start_addr;
+ unsigned wave_size = si_get_shader_wave_size(shader);
+ struct ac_rtld_binary rtld_binaries[5] = {};
+ struct si_shader_inst *instructions =
+ calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst));
+
+ if (shader->prolog) {
+ si_add_split_disasm(screen, &rtld_binaries[0], &shader->prolog->binary, &inst_addr, &num_inst,
+ instructions, shader_type, wave_size);
+ }
+ if (shader->previous_stage) {
+ si_add_split_disasm(screen, &rtld_binaries[1], &shader->previous_stage->binary, &inst_addr,
+ &num_inst, instructions, shader_type, wave_size);
+ }
+ if (shader->prolog2) {
+ si_add_split_disasm(screen, &rtld_binaries[2], &shader->prolog2->binary, &inst_addr,
+ &num_inst, instructions, shader_type, wave_size);
+ }
+ si_add_split_disasm(screen, &rtld_binaries[3], &shader->binary, &inst_addr, &num_inst,
+ instructions, shader_type, wave_size);
+ if (shader->epilog) {
+ si_add_split_disasm(screen, &rtld_binaries[4], &shader->epilog->binary, &inst_addr, &num_inst,
+ instructions, shader_type, wave_size);
+ }
+
+ fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n",
+ si_get_shader_name(shader));
+
+ /* Print instructions with annotations. */
+ for (i = 0; i < num_inst; i++) {
+ struct si_shader_inst *inst = &instructions[i];
+
+ fprintf(f, "%.*s [PC=0x%" PRIx64 ", size=%u]\n", inst->textlen, inst->text, inst->addr,
+ inst->size);
+
+ /* Print which waves execute the instruction right now. */
+ while (num_waves && inst->addr == waves->pc) {
+ fprintf(f,
+ " " COLOR_GREEN "^ SE%u SH%u CU%u "
+ "SIMD%u WAVE%u EXEC=%016" PRIx64 " ",
+ waves->se, waves->sh, waves->cu, waves->simd, waves->wave, waves->exec);
+
+ if (inst->size == 4) {
+ fprintf(f, "INST32=%08X" COLOR_RESET "\n", waves->inst_dw0);
+ } else {
+ fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n", waves->inst_dw0, waves->inst_dw1);
+ }
+
+ waves->matched = true;
+ waves = &waves[1];
+ num_waves--;
+ }
+ }
+
+ fprintf(f, "\n\n");
+ free(instructions);
+ for (unsigned i = 0; i < ARRAY_SIZE(rtld_binaries); ++i)
+ ac_rtld_close(&rtld_binaries[i]);
}
static void si_dump_annotated_shaders(struct si_context *sctx, FILE *f)
{
- struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP];
- unsigned num_waves = ac_get_wave_info(sctx->chip_class, waves);
-
- fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET
- "\n\n", num_waves);
-
- si_print_annotated_shader(sctx->vs_shader.current, waves, num_waves, f);
- si_print_annotated_shader(sctx->tcs_shader.current, waves, num_waves, f);
- si_print_annotated_shader(sctx->tes_shader.current, waves, num_waves, f);
- si_print_annotated_shader(sctx->gs_shader.current, waves, num_waves, f);
- si_print_annotated_shader(sctx->ps_shader.current, waves, num_waves, f);
-
- /* Print waves executing shaders that are not currently bound. */
- unsigned i;
- bool found = false;
- for (i = 0; i < num_waves; i++) {
- if (waves[i].matched)
- continue;
-
- if (!found) {
- fprintf(f, COLOR_CYAN
- "Waves not executing currently-bound shaders:"
- COLOR_RESET "\n");
- found = true;
- }
- fprintf(f, " SE%u SH%u CU%u SIMD%u WAVE%u EXEC=%016"PRIx64
- " INST=%08X %08X PC=%"PRIx64"\n",
- waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd,
- waves[i].wave, waves[i].exec, waves[i].inst_dw0,
- waves[i].inst_dw1, waves[i].pc);
- }
- if (found)
- fprintf(f, "\n\n");
+ struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP];
+ unsigned num_waves = ac_get_wave_info(sctx->chip_class, waves);
+
+ fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET "\n\n", num_waves);
+
+ si_print_annotated_shader(sctx->vs_shader.current, waves, num_waves, f);
+ si_print_annotated_shader(sctx->tcs_shader.current, waves, num_waves, f);
+ si_print_annotated_shader(sctx->tes_shader.current, waves, num_waves, f);
+ si_print_annotated_shader(sctx->gs_shader.current, waves, num_waves, f);
+ si_print_annotated_shader(sctx->ps_shader.current, waves, num_waves, f);
+
+ /* Print waves executing shaders that are not currently bound. */
+ unsigned i;
+ bool found = false;
+ for (i = 0; i < num_waves; i++) {
+ if (waves[i].matched)
+ continue;
+
+ if (!found) {
+ fprintf(f, COLOR_CYAN "Waves not executing currently-bound shaders:" COLOR_RESET "\n");
+ found = true;
+ }
+ fprintf(f,
+ " SE%u SH%u CU%u SIMD%u WAVE%u EXEC=%016" PRIx64 " INST=%08X %08X PC=%" PRIx64
+ "\n",
+ waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd, waves[i].wave, waves[i].exec,
+ waves[i].inst_dw0, waves[i].inst_dw1, waves[i].pc);
+ }
+ if (found)
+ fprintf(f, "\n\n");
}
static void si_dump_command(const char *title, const char *command, FILE *f)
{
- char line[2000];
+ char line[2000];
- FILE *p = popen(command, "r");
- if (!p)
- return;
+ FILE *p = popen(command, "r");
+ if (!p)
+ return;
- fprintf(f, COLOR_YELLOW "%s: " COLOR_RESET "\n", title);
- while (fgets(line, sizeof(line), p))
- fputs(line, f);
- fprintf(f, "\n\n");
- pclose(p);
+ fprintf(f, COLOR_YELLOW "%s: " COLOR_RESET "\n", title);
+ while (fgets(line, sizeof(line), p))
+ fputs(line, f);
+ fprintf(f, "\n\n");
+ pclose(p);
}
-static void si_dump_debug_state(struct pipe_context *ctx, FILE *f,
- unsigned flags)
+static void si_dump_debug_state(struct pipe_context *ctx, FILE *f, unsigned flags)
{
- struct si_context *sctx = (struct si_context*)ctx;
+ struct si_context *sctx = (struct si_context *)ctx;
- if (sctx->log)
- u_log_flush(sctx->log);
+ if (sctx->log)
+ u_log_flush(sctx->log);
- if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) {
- si_dump_debug_registers(sctx, f);
+ if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) {
+ si_dump_debug_registers(sctx, f);
- si_dump_annotated_shaders(sctx, f);
- si_dump_command("Active waves (raw data)", "umr -O halt_waves -wa | column -t", f);
- si_dump_command("Wave information", "umr -O halt_waves,bits -wa", f);
- }
+ si_dump_annotated_shaders(sctx, f);
+ si_dump_command("Active waves (raw data)", "umr -O halt_waves -wa | column -t", f);
+ si_dump_command("Wave information", "umr -O halt_waves,bits -wa", f);
+ }
}
void si_log_draw_state(struct si_context *sctx, struct u_log_context *log)
{
- struct si_shader_ctx_state *tcs_shader;
-
- if (!log)
- return;
-
- tcs_shader = &sctx->tcs_shader;
- if (sctx->tes_shader.cso && !sctx->tcs_shader.cso)
- tcs_shader = &sctx->fixed_func_tcs_shader;
-
- si_dump_framebuffer(sctx, log);
-
- si_dump_gfx_shader(sctx, &sctx->vs_shader, log);
- si_dump_gfx_shader(sctx, tcs_shader, log);
- si_dump_gfx_shader(sctx, &sctx->tes_shader, log);
- si_dump_gfx_shader(sctx, &sctx->gs_shader, log);
- si_dump_gfx_shader(sctx, &sctx->ps_shader, log);
-
- si_dump_descriptor_list(sctx->screen,
- &sctx->descriptors[SI_DESCS_RW_BUFFERS],
- "", "RW buffers", 4,
- sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots,
- si_identity, log);
- si_dump_gfx_descriptors(sctx, &sctx->vs_shader, log);
- si_dump_gfx_descriptors(sctx, tcs_shader, log);
- si_dump_gfx_descriptors(sctx, &sctx->tes_shader, log);
- si_dump_gfx_descriptors(sctx, &sctx->gs_shader, log);
- si_dump_gfx_descriptors(sctx, &sctx->ps_shader, log);
+ struct si_shader_ctx_state *tcs_shader;
+
+ if (!log)
+ return;
+
+ tcs_shader = &sctx->tcs_shader;
+ if (sctx->tes_shader.cso && !sctx->tcs_shader.cso)
+ tcs_shader = &sctx->fixed_func_tcs_shader;
+
+ si_dump_framebuffer(sctx, log);
+
+ si_dump_gfx_shader(sctx, &sctx->vs_shader, log);
+ si_dump_gfx_shader(sctx, tcs_shader, log);
+ si_dump_gfx_shader(sctx, &sctx->tes_shader, log);
+ si_dump_gfx_shader(sctx, &sctx->gs_shader, log);
+ si_dump_gfx_shader(sctx, &sctx->ps_shader, log);
+
+ si_dump_descriptor_list(sctx->screen, &sctx->descriptors[SI_DESCS_RW_BUFFERS], "", "RW buffers",
+ 4, sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots, si_identity,
+ log);
+ si_dump_gfx_descriptors(sctx, &sctx->vs_shader, log);
+ si_dump_gfx_descriptors(sctx, tcs_shader, log);
+ si_dump_gfx_descriptors(sctx, &sctx->tes_shader, log);
+ si_dump_gfx_descriptors(sctx, &sctx->gs_shader, log);
+ si_dump_gfx_descriptors(sctx, &sctx->ps_shader, log);
}
void si_log_compute_state(struct si_context *sctx, struct u_log_context *log)
{
- if (!log)
- return;
+ if (!log)
+ return;
- si_dump_compute_shader(sctx, log);
- si_dump_compute_descriptors(sctx, log);
+ si_dump_compute_shader(sctx, log);
+ si_dump_compute_descriptors(sctx, log);
}
-static void si_dump_dma(struct si_context *sctx,
- struct radeon_saved_cs *saved, FILE *f)
+static void si_dump_dma(struct si_context *sctx, struct radeon_saved_cs *saved, FILE *f)
{
- static const char ib_name[] = "sDMA IB";
- unsigned i;
+ static const char ib_name[] = "sDMA IB";
+ unsigned i;
- si_dump_bo_list(sctx, saved, f);
+ si_dump_bo_list(sctx, saved, f);
- fprintf(f, "------------------ %s begin ------------------\n", ib_name);
+ fprintf(f, "------------------ %s begin ------------------\n", ib_name);
- for (i = 0; i < saved->num_dw; ++i) {
- fprintf(f, " %08x\n", saved->ib[i]);
- }
+ for (i = 0; i < saved->num_dw; ++i) {
+ fprintf(f, " %08x\n", saved->ib[i]);
+ }
- fprintf(f, "------------------- %s end -------------------\n", ib_name);
- fprintf(f, "\n");
+ fprintf(f, "------------------- %s end -------------------\n", ib_name);
+ fprintf(f, "\n");
- fprintf(f, "SDMA Dump Done.\n");
+ fprintf(f, "SDMA Dump Done.\n");
}
-void si_check_vm_faults(struct si_context *sctx,
- struct radeon_saved_cs *saved, enum ring_type ring)
+void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring)
{
- struct pipe_screen *screen = sctx->b.screen;
- FILE *f;
- uint64_t addr;
- char cmd_line[4096];
-
- if (!ac_vm_fault_occured(sctx->chip_class,
- &sctx->dmesg_timestamp, &addr))
- return;
-
- f = dd_get_debug_file(false);
- if (!f)
- return;
-
- fprintf(f, "VM fault report.\n\n");
- if (os_get_command_line(cmd_line, sizeof(cmd_line)))
- fprintf(f, "Command: %s\n", cmd_line);
- fprintf(f, "Driver vendor: %s\n", screen->get_vendor(screen));
- fprintf(f, "Device vendor: %s\n", screen->get_device_vendor(screen));
- fprintf(f, "Device name: %s\n\n", screen->get_name(screen));
- fprintf(f, "Failing VM page: 0x%08"PRIx64"\n\n", addr);
-
- if (sctx->apitrace_call_number)
- fprintf(f, "Last apitrace call: %u\n\n",
- sctx->apitrace_call_number);
-
- switch (ring) {
- case RING_GFX: {
- struct u_log_context log;
- u_log_context_init(&log);
-
- si_log_draw_state(sctx, &log);
- si_log_compute_state(sctx, &log);
- si_log_cs(sctx, &log, true);
-
- u_log_new_page_print(&log, f);
- u_log_context_destroy(&log);
- break;
- }
- case RING_DMA:
- si_dump_dma(sctx, saved, f);
- break;
-
- default:
- break;
- }
-
- fclose(f);
-
- fprintf(stderr, "Detected a VM fault, exiting...\n");
- exit(0);
+ struct pipe_screen *screen = sctx->b.screen;
+ FILE *f;
+ uint64_t addr;
+ char cmd_line[4096];
+
+ if (!ac_vm_fault_occured(sctx->chip_class, &sctx->dmesg_timestamp, &addr))
+ return;
+
+ f = dd_get_debug_file(false);
+ if (!f)
+ return;
+
+ fprintf(f, "VM fault report.\n\n");
+ if (os_get_command_line(cmd_line, sizeof(cmd_line)))
+ fprintf(f, "Command: %s\n", cmd_line);
+ fprintf(f, "Driver vendor: %s\n", screen->get_vendor(screen));
+ fprintf(f, "Device vendor: %s\n", screen->get_device_vendor(screen));
+ fprintf(f, "Device name: %s\n\n", screen->get_name(screen));
+ fprintf(f, "Failing VM page: 0x%08" PRIx64 "\n\n", addr);
+
+ if (sctx->apitrace_call_number)
+ fprintf(f, "Last apitrace call: %u\n\n", sctx->apitrace_call_number);
+
+ switch (ring) {
+ case RING_GFX: {
+ struct u_log_context log;
+ u_log_context_init(&log);
+
+ si_log_draw_state(sctx, &log);
+ si_log_compute_state(sctx, &log);
+ si_log_cs(sctx, &log, true);
+
+ u_log_new_page_print(&log, f);
+ u_log_context_destroy(&log);
+ break;
+ }
+ case RING_DMA:
+ si_dump_dma(sctx, saved, f);
+ break;
+
+ default:
+ break;
+ }
+
+ fclose(f);
+
+ fprintf(stderr, "Detected a VM fault, exiting...\n");
+ exit(0);
}
void si_init_debug_functions(struct si_context *sctx)
{
- sctx->b.dump_debug_state = si_dump_debug_state;
-
- /* Set the initial dmesg timestamp for this context, so that
- * only new messages will be checked for VM faults.
- */
- if (sctx->screen->debug_flags & DBG(CHECK_VM))
- ac_vm_fault_occured(sctx->chip_class,
- &sctx->dmesg_timestamp, NULL);
+ sctx->b.dump_debug_state = si_dump_debug_state;
+
+ /* Set the initial dmesg timestamp for this context, so that
+ * only new messages will be checked for VM faults.
+ */
+ if (sctx->screen->debug_flags & DBG(CHECK_VM))
+ ac_vm_fault_occured(sctx->chip_class, &sctx->dmesg_timestamp, NULL);
}
OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context")
OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)")
OPT_BOOL(dump_shader_binary, false, "Dump shader binary as part of ddebug_dumps")
-OPT_BOOL(debug_disassembly, false, "Report shader disassembly as part of driver debug messages (for shader db)")
+OPT_BOOL(debug_disassembly, false,
+ "Report shader disassembly as part of driver debug messages (for shader db)")
OPT_BOOL(halt_shaders, false, "Halt shaders at the start (will hang)")
-OPT_BOOL(vs_fetch_always_opencode, false, "Always open code vertex fetches (less efficient, purely for testing)")
+OPT_BOOL(vs_fetch_always_opencode, false,
+ "Always open code vertex fetches (less efficient, purely for testing)")
OPT_BOOL(prim_restart_tri_strips_only, false, "Only enable primitive restart for triangle strips")
#undef OPT_BOOL
#include "si_pipe.h"
#include "sid.h"
-
+#include "util/format/u_format.h"
#include "util/hash_table.h"
#include "util/u_idalloc.h"
-#include "util/format/u_format.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
-
/* NULL image and buffer descriptor for textures (alpha = 1) and images
* (alpha = 0).
*
* This is the only reason why the buffer descriptor must be in words [4:7].
*/
static uint32_t null_texture_descriptor[8] = {
- 0,
- 0,
- 0,
- S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) |
- S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
- /* the rest must contain zeros, which is also used by the buffer
- * descriptor */
+ 0, 0, 0, S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) | S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
+ /* the rest must contain zeros, which is also used by the buffer
+ * descriptor */
};
static uint32_t null_image_descriptor[8] = {
- 0,
- 0,
- 0,
- S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
- /* the rest must contain zeros, which is also used by the buffer
- * descriptor */
+ 0, 0, 0, S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
+ /* the rest must contain zeros, which is also used by the buffer
+ * descriptor */
};
static uint64_t si_desc_extract_buffer_address(const uint32_t *desc)
{
- uint64_t va = desc[0] |
- ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);
+ uint64_t va = desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);
- /* Sign-extend the 48-bit address. */
- va <<= 16;
- va = (int64_t)va >> 16;
- return va;
+ /* Sign-extend the 48-bit address. */
+ va <<= 16;
+ va = (int64_t)va >> 16;
+ return va;
}
-static void si_init_descriptor_list(uint32_t *desc_list,
- unsigned element_dw_size,
- unsigned num_elements,
- const uint32_t *null_descriptor)
+static void si_init_descriptor_list(uint32_t *desc_list, unsigned element_dw_size,
+ unsigned num_elements, const uint32_t *null_descriptor)
{
- int i;
+ int i;
- /* Initialize the array to NULL descriptors if the element size is 8. */
- if (null_descriptor) {
- assert(element_dw_size % 8 == 0);
- for (i = 0; i < num_elements * element_dw_size / 8; i++)
- memcpy(desc_list + i * 8, null_descriptor, 8 * 4);
- }
+ /* Initialize the array to NULL descriptors if the element size is 8. */
+ if (null_descriptor) {
+ assert(element_dw_size % 8 == 0);
+ for (i = 0; i < num_elements * element_dw_size / 8; i++)
+ memcpy(desc_list + i * 8, null_descriptor, 8 * 4);
+ }
}
-static void si_init_descriptors(struct si_descriptors *desc,
- short shader_userdata_rel_index,
- unsigned element_dw_size,
- unsigned num_elements)
+static void si_init_descriptors(struct si_descriptors *desc, short shader_userdata_rel_index,
+ unsigned element_dw_size, unsigned num_elements)
{
- desc->list = CALLOC(num_elements, element_dw_size * 4);
- desc->element_dw_size = element_dw_size;
- desc->num_elements = num_elements;
- desc->shader_userdata_offset = shader_userdata_rel_index * 4;
- desc->slot_index_to_bind_directly = -1;
+ desc->list = CALLOC(num_elements, element_dw_size * 4);
+ desc->element_dw_size = element_dw_size;
+ desc->num_elements = num_elements;
+ desc->shader_userdata_offset = shader_userdata_rel_index * 4;
+ desc->slot_index_to_bind_directly = -1;
}
static void si_release_descriptors(struct si_descriptors *desc)
{
- si_resource_reference(&desc->buffer, NULL);
- FREE(desc->list);
+ si_resource_reference(&desc->buffer, NULL);
+ FREE(desc->list);
}
-static bool si_upload_descriptors(struct si_context *sctx,
- struct si_descriptors *desc)
+static bool si_upload_descriptors(struct si_context *sctx, struct si_descriptors *desc)
{
- unsigned slot_size = desc->element_dw_size * 4;
- unsigned first_slot_offset = desc->first_active_slot * slot_size;
- unsigned upload_size = desc->num_active_slots * slot_size;
+ unsigned slot_size = desc->element_dw_size * 4;
+ unsigned first_slot_offset = desc->first_active_slot * slot_size;
+ unsigned upload_size = desc->num_active_slots * slot_size;
- /* Skip the upload if no shader is using the descriptors. dirty_mask
- * will stay dirty and the descriptors will be uploaded when there is
- * a shader using them.
- */
- if (!upload_size)
- return true;
+ /* Skip the upload if no shader is using the descriptors. dirty_mask
+ * will stay dirty and the descriptors will be uploaded when there is
+ * a shader using them.
+ */
+ if (!upload_size)
+ return true;
- /* If there is just one active descriptor, bind it directly. */
- if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly &&
- desc->num_active_slots == 1) {
- uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly *
- desc->element_dw_size];
+ /* If there is just one active descriptor, bind it directly. */
+ if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly &&
+ desc->num_active_slots == 1) {
+ uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly * desc->element_dw_size];
- /* The buffer is already in the buffer list. */
- si_resource_reference(&desc->buffer, NULL);
- desc->gpu_list = NULL;
- desc->gpu_address = si_desc_extract_buffer_address(descriptor);
- si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
- return true;
- }
+ /* The buffer is already in the buffer list. */
+ si_resource_reference(&desc->buffer, NULL);
+ desc->gpu_list = NULL;
+ desc->gpu_address = si_desc_extract_buffer_address(descriptor);
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+ return true;
+ }
- uint32_t *ptr;
- unsigned buffer_offset;
- u_upload_alloc(sctx->b.const_uploader, first_slot_offset, upload_size,
- si_optimal_tcc_alignment(sctx, upload_size),
- &buffer_offset, (struct pipe_resource**)&desc->buffer,
- (void**)&ptr);
- if (!desc->buffer) {
- desc->gpu_address = 0;
- return false; /* skip the draw call */
- }
+ uint32_t *ptr;
+ unsigned buffer_offset;
+ u_upload_alloc(sctx->b.const_uploader, first_slot_offset, upload_size,
+ si_optimal_tcc_alignment(sctx, upload_size), &buffer_offset,
+ (struct pipe_resource **)&desc->buffer, (void **)&ptr);
+ if (!desc->buffer) {
+ desc->gpu_address = 0;
+ return false; /* skip the draw call */
+ }
- util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset,
- upload_size);
- desc->gpu_list = ptr - first_slot_offset / 4;
+ util_memcpy_cpu_to_le32(ptr, (char *)desc->list + first_slot_offset, upload_size);
+ desc->gpu_list = ptr - first_slot_offset / 4;
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer,
- RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, RADEON_USAGE_READ,
+ RADEON_PRIO_DESCRIPTORS);
- /* The shader pointer should point to slot 0. */
- buffer_offset -= first_slot_offset;
- desc->gpu_address = desc->buffer->gpu_address + buffer_offset;
+ /* The shader pointer should point to slot 0. */
+ buffer_offset -= first_slot_offset;
+ desc->gpu_address = desc->buffer->gpu_address + buffer_offset;
- assert(desc->buffer->flags & RADEON_FLAG_32BIT);
- assert((desc->buffer->gpu_address >> 32) == sctx->screen->info.address32_hi);
- assert((desc->gpu_address >> 32) == sctx->screen->info.address32_hi);
+ assert(desc->buffer->flags & RADEON_FLAG_32BIT);
+ assert((desc->buffer->gpu_address >> 32) == sctx->screen->info.address32_hi);
+ assert((desc->gpu_address >> 32) == sctx->screen->info.address32_hi);
- si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
- return true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+ return true;
}
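/* A minimal illustrative sketch (not from the patch): how the uploaded
 * descriptor buffer's GPU address is rebased so the shader pointer addresses
 * slot 0.  Only slots >= first_active_slot are copied into the upload, but
 * shaders index descriptors from slot 0, so the address is shifted back by
 * first_slot_offset.  All names here are hypothetical stand-ins for the
 * driver state handled above; assumes <stdint.h>.
 */
static uint64_t example_rebase_to_slot0(uint64_t buffer_gpu_address,
                                        unsigned buffer_offset, /* from the uploader */
                                        unsigned first_active_slot,
                                        unsigned slot_size)
{
   unsigned first_slot_offset = first_active_slot * slot_size;

   /* Slot N then lives at the returned address + N * slot_size, matching
    * the "buffer_offset -= first_slot_offset" step in the code above. */
   return buffer_gpu_address + buffer_offset - first_slot_offset;
}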
-static void
-si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
+static void si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
{
- if (!desc->buffer)
- return;
+ if (!desc->buffer)
+ return;
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer,
- RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, RADEON_USAGE_READ,
+ RADEON_PRIO_DESCRIPTORS);
}
/* SAMPLER VIEWS */
-static inline enum radeon_bo_priority
-si_get_sampler_view_priority(struct si_resource *res)
+static inline enum radeon_bo_priority si_get_sampler_view_priority(struct si_resource *res)
{
- if (res->b.b.target == PIPE_BUFFER)
- return RADEON_PRIO_SAMPLER_BUFFER;
+ if (res->b.b.target == PIPE_BUFFER)
+ return RADEON_PRIO_SAMPLER_BUFFER;
- if (res->b.b.nr_samples > 1)
- return RADEON_PRIO_SAMPLER_TEXTURE_MSAA;
+ if (res->b.b.nr_samples > 1)
+ return RADEON_PRIO_SAMPLER_TEXTURE_MSAA;
- return RADEON_PRIO_SAMPLER_TEXTURE;
+ return RADEON_PRIO_SAMPLER_TEXTURE;
}
-static struct si_descriptors *
-si_sampler_and_image_descriptors(struct si_context *sctx, unsigned shader)
+static struct si_descriptors *si_sampler_and_image_descriptors(struct si_context *sctx,
+ unsigned shader)
{
- return &sctx->descriptors[si_sampler_and_image_descriptors_idx(shader)];
+ return &sctx->descriptors[si_sampler_and_image_descriptors_idx(shader)];
}
static void si_release_sampler_views(struct si_samplers *samplers)
{
- int i;
+ int i;
- for (i = 0; i < ARRAY_SIZE(samplers->views); i++) {
- pipe_sampler_view_reference(&samplers->views[i], NULL);
- }
+ for (i = 0; i < ARRAY_SIZE(samplers->views); i++) {
+ pipe_sampler_view_reference(&samplers->views[i], NULL);
+ }
}
-static void si_sampler_view_add_buffer(struct si_context *sctx,
- struct pipe_resource *resource,
- enum radeon_bo_usage usage,
- bool is_stencil_sampler,
- bool check_mem)
+static void si_sampler_view_add_buffer(struct si_context *sctx, struct pipe_resource *resource,
+ enum radeon_bo_usage usage, bool is_stencil_sampler,
+ bool check_mem)
{
- struct si_texture *tex = (struct si_texture*)resource;
- enum radeon_bo_priority priority;
+ struct si_texture *tex = (struct si_texture *)resource;
+ enum radeon_bo_priority priority;
- if (!resource)
- return;
+ if (!resource)
+ return;
- /* Use the flushed depth texture if direct sampling is unsupported. */
- if (resource->target != PIPE_BUFFER &&
- tex->is_depth && !si_can_sample_zs(tex, is_stencil_sampler))
- tex = tex->flushed_depth_texture;
+ /* Use the flushed depth texture if direct sampling is unsupported. */
+ if (resource->target != PIPE_BUFFER && tex->is_depth &&
+ !si_can_sample_zs(tex, is_stencil_sampler))
+ tex = tex->flushed_depth_texture;
- priority = si_get_sampler_view_priority(&tex->buffer);
- radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority,
- check_mem);
+ priority = si_get_sampler_view_priority(&tex->buffer);
+ radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority, check_mem);
- if (resource->target == PIPE_BUFFER)
- return;
+ if (resource->target == PIPE_BUFFER)
+ return;
- /* Add separate DCC. */
- if (tex->dcc_separate_buffer) {
- radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer,
- usage, RADEON_PRIO_SEPARATE_META, check_mem);
- }
+ /* Add separate DCC. */
+ if (tex->dcc_separate_buffer) {
+ radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer, usage,
+ RADEON_PRIO_SEPARATE_META, check_mem);
+ }
}
-static void si_sampler_views_begin_new_cs(struct si_context *sctx,
- struct si_samplers *samplers)
+static void si_sampler_views_begin_new_cs(struct si_context *sctx, struct si_samplers *samplers)
{
- unsigned mask = samplers->enabled_mask;
+ unsigned mask = samplers->enabled_mask;
- /* Add buffers to the CS. */
- while (mask) {
- int i = u_bit_scan(&mask);
- struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[i];
+ /* Add buffers to the CS. */
+ while (mask) {
+ int i = u_bit_scan(&mask);
+ struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[i];
- si_sampler_view_add_buffer(sctx, sview->base.texture,
- RADEON_USAGE_READ,
- sview->is_stencil_sampler, false);
- }
+ si_sampler_view_add_buffer(sctx, sview->base.texture, RADEON_USAGE_READ,
+ sview->is_stencil_sampler, false);
+ }
}
/* Set buffer descriptor fields that can be changed by reallocations. */
-static void si_set_buf_desc_address(struct si_resource *buf,
- uint64_t offset, uint32_t *state)
+static void si_set_buf_desc_address(struct si_resource *buf, uint64_t offset, uint32_t *state)
{
- uint64_t va = buf->gpu_address + offset;
+ uint64_t va = buf->gpu_address + offset;
- state[0] = va;
- state[1] &= C_008F04_BASE_ADDRESS_HI;
- state[1] |= S_008F04_BASE_ADDRESS_HI(va >> 32);
+ state[0] = va;
+ state[1] &= C_008F04_BASE_ADDRESS_HI;
+ state[1] |= S_008F04_BASE_ADDRESS_HI(va >> 32);
}
/* Set texture descriptor fields that can be changed by reallocations.
* \param is_stencil select between separate Z & Stencil
* \param state descriptor to update
*/
-void si_set_mutable_tex_desc_fields(struct si_screen *sscreen,
- struct si_texture *tex,
- const struct legacy_surf_level *base_level_info,
- unsigned base_level, unsigned first_level,
- unsigned block_width, bool is_stencil,
- uint32_t *state)
-{
- uint64_t va, meta_va = 0;
-
- if (tex->is_depth && !si_can_sample_zs(tex, is_stencil)) {
- tex = tex->flushed_depth_texture;
- is_stencil = false;
- }
-
- va = tex->buffer.gpu_address;
-
- if (sscreen->info.chip_class >= GFX9) {
- /* Only stencil_offset needs to be added here. */
- if (is_stencil)
- va += tex->surface.u.gfx9.stencil_offset;
- else
- va += tex->surface.u.gfx9.surf_offset;
- } else {
- va += base_level_info->offset;
- }
-
- state[0] = va >> 8;
- state[1] &= C_008F14_BASE_ADDRESS_HI;
- state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
-
- /* Only macrotiled modes can set tile swizzle.
- * GFX9 doesn't use (legacy) base_level_info.
- */
- if (sscreen->info.chip_class >= GFX9 ||
- base_level_info->mode == RADEON_SURF_MODE_2D)
- state[0] |= tex->surface.tile_swizzle;
-
- if (sscreen->info.chip_class >= GFX8) {
- state[6] &= C_008F28_COMPRESSION_EN;
-
- if (vi_dcc_enabled(tex, first_level)) {
- meta_va = (!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) +
- tex->surface.dcc_offset;
-
- if (sscreen->info.chip_class == GFX8) {
- meta_va += base_level_info->dcc_offset;
- assert(base_level_info->mode == RADEON_SURF_MODE_2D);
- }
-
- unsigned dcc_tile_swizzle = tex->surface.tile_swizzle << 8;
- dcc_tile_swizzle &= tex->surface.dcc_alignment - 1;
- meta_va |= dcc_tile_swizzle;
- } else if (vi_tc_compat_htile_enabled(tex, first_level,
- is_stencil ? PIPE_MASK_S : PIPE_MASK_Z)) {
- meta_va = tex->buffer.gpu_address + tex->surface.htile_offset;
- }
-
- if (meta_va)
- state[6] |= S_008F28_COMPRESSION_EN(1);
- }
-
- if (sscreen->info.chip_class >= GFX8 && sscreen->info.chip_class <= GFX9)
- state[7] = meta_va >> 8;
-
- if (sscreen->info.chip_class >= GFX10) {
- state[3] &= C_00A00C_SW_MODE;
-
- if (is_stencil) {
- state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
- } else {
- state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode);
- }
-
- state[6] &= C_00A018_META_DATA_ADDRESS_LO &
- C_00A018_META_PIPE_ALIGNED;
-
- if (meta_va) {
- struct gfx9_surf_meta_flags meta;
-
- if (tex->surface.dcc_offset)
- meta = tex->surface.u.gfx9.dcc;
- else
- meta = tex->surface.u.gfx9.htile;
-
- state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) |
- S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8);
- }
-
- state[7] = meta_va >> 16;
- } else if (sscreen->info.chip_class == GFX9) {
- state[3] &= C_008F1C_SW_MODE;
- state[4] &= C_008F20_PITCH;
-
- if (is_stencil) {
- state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
- state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.stencil.epitch);
- } else {
- state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode);
- state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.surf.epitch);
- }
-
- state[5] &= C_008F24_META_DATA_ADDRESS &
- C_008F24_META_PIPE_ALIGNED &
- C_008F24_META_RB_ALIGNED;
- if (meta_va) {
- struct gfx9_surf_meta_flags meta;
-
- if (tex->surface.dcc_offset)
- meta = tex->surface.u.gfx9.dcc;
- else
- meta = tex->surface.u.gfx9.htile;
-
- state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) |
- S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) |
- S_008F24_META_RB_ALIGNED(meta.rb_aligned);
- }
- } else {
- /* GFX6-GFX8 */
- unsigned pitch = base_level_info->nblk_x * block_width;
- unsigned index = si_tile_mode_index(tex, base_level, is_stencil);
-
- state[3] &= C_008F1C_TILING_INDEX;
- state[3] |= S_008F1C_TILING_INDEX(index);
- state[4] &= C_008F20_PITCH;
- state[4] |= S_008F20_PITCH(pitch - 1);
- }
+void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex,
+ const struct legacy_surf_level *base_level_info,
+ unsigned base_level, unsigned first_level, unsigned block_width,
+ bool is_stencil, uint32_t *state)
+{
+ uint64_t va, meta_va = 0;
+
+ if (tex->is_depth && !si_can_sample_zs(tex, is_stencil)) {
+ tex = tex->flushed_depth_texture;
+ is_stencil = false;
+ }
+
+ va = tex->buffer.gpu_address;
+
+ if (sscreen->info.chip_class >= GFX9) {
+ /* Only stencil_offset needs to be added here. */
+ if (is_stencil)
+ va += tex->surface.u.gfx9.stencil_offset;
+ else
+ va += tex->surface.u.gfx9.surf_offset;
+ } else {
+ va += base_level_info->offset;
+ }
+
+ state[0] = va >> 8;
+ state[1] &= C_008F14_BASE_ADDRESS_HI;
+ state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
+
+ /* Only macrotiled modes can set tile swizzle.
+ * GFX9 doesn't use (legacy) base_level_info.
+ */
+ if (sscreen->info.chip_class >= GFX9 || base_level_info->mode == RADEON_SURF_MODE_2D)
+ state[0] |= tex->surface.tile_swizzle;
+
+ if (sscreen->info.chip_class >= GFX8) {
+ state[6] &= C_008F28_COMPRESSION_EN;
+
+ if (vi_dcc_enabled(tex, first_level)) {
+ meta_va =
+ (!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.dcc_offset;
+
+ if (sscreen->info.chip_class == GFX8) {
+ meta_va += base_level_info->dcc_offset;
+ assert(base_level_info->mode == RADEON_SURF_MODE_2D);
+ }
+
+ unsigned dcc_tile_swizzle = tex->surface.tile_swizzle << 8;
+ dcc_tile_swizzle &= tex->surface.dcc_alignment - 1;
+ meta_va |= dcc_tile_swizzle;
+ } else if (vi_tc_compat_htile_enabled(tex, first_level,
+ is_stencil ? PIPE_MASK_S : PIPE_MASK_Z)) {
+ meta_va = tex->buffer.gpu_address + tex->surface.htile_offset;
+ }
+
+ if (meta_va)
+ state[6] |= S_008F28_COMPRESSION_EN(1);
+ }
+
+ if (sscreen->info.chip_class >= GFX8 && sscreen->info.chip_class <= GFX9)
+ state[7] = meta_va >> 8;
+
+ if (sscreen->info.chip_class >= GFX10) {
+ state[3] &= C_00A00C_SW_MODE;
+
+ if (is_stencil) {
+ state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
+ } else {
+ state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode);
+ }
+
+ state[6] &= C_00A018_META_DATA_ADDRESS_LO & C_00A018_META_PIPE_ALIGNED;
+
+ if (meta_va) {
+ struct gfx9_surf_meta_flags meta;
+
+ if (tex->surface.dcc_offset)
+ meta = tex->surface.u.gfx9.dcc;
+ else
+ meta = tex->surface.u.gfx9.htile;
+
+ state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) |
+ S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8);
+ }
+
+ state[7] = meta_va >> 16;
+ } else if (sscreen->info.chip_class == GFX9) {
+ state[3] &= C_008F1C_SW_MODE;
+ state[4] &= C_008F20_PITCH;
+
+ if (is_stencil) {
+ state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
+ state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.stencil.epitch);
+ } else {
+ state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode);
+ state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.surf.epitch);
+ }
+
+ state[5] &=
+ C_008F24_META_DATA_ADDRESS & C_008F24_META_PIPE_ALIGNED & C_008F24_META_RB_ALIGNED;
+ if (meta_va) {
+ struct gfx9_surf_meta_flags meta;
+
+ if (tex->surface.dcc_offset)
+ meta = tex->surface.u.gfx9.dcc;
+ else
+ meta = tex->surface.u.gfx9.htile;
+
+ state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) |
+ S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) |
+ S_008F24_META_RB_ALIGNED(meta.rb_aligned);
+ }
+ } else {
+ /* GFX6-GFX8 */
+ unsigned pitch = base_level_info->nblk_x * block_width;
+ unsigned index = si_tile_mode_index(tex, base_level, is_stencil);
+
+ state[3] &= C_008F1C_TILING_INDEX;
+ state[3] |= S_008F1C_TILING_INDEX(index);
+ state[4] &= C_008F20_PITCH;
+ state[4] |= S_008F20_PITCH(pitch - 1);
+ }
}
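/* A minimal illustrative sketch (not from the patch): the DCC tile-swizzle
 * packing used above.  meta_va is aligned to the DCC alignment, so its low
 * log2(alignment) bits are zero and the swizzle can be OR'ed into them
 * without disturbing the address.  "alignment" is assumed to be a power of
 * two; assumes <stdint.h>.
 */
static uint64_t example_pack_tile_swizzle(uint64_t aligned_meta_va,
                                          unsigned tile_swizzle,
                                          unsigned alignment)
{
   unsigned packed = (tile_swizzle << 8) & (alignment - 1);
   return aligned_meta_va | packed; /* low bits of aligned_meta_va are zero */
}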
static void si_set_sampler_state_desc(struct si_sampler_state *sstate,
- struct si_sampler_view *sview,
- struct si_texture *tex,
- uint32_t *desc)
-{
- if (sview && sview->is_integer)
- memcpy(desc, sstate->integer_val, 4*4);
- else if (tex && tex->upgraded_depth &&
- (!sview || !sview->is_stencil_sampler))
- memcpy(desc, sstate->upgraded_depth_val, 4*4);
- else
- memcpy(desc, sstate->val, 4*4);
-}
-
-static void si_set_sampler_view_desc(struct si_context *sctx,
- struct si_sampler_view *sview,
- struct si_sampler_state *sstate,
- uint32_t *desc)
-{
- struct pipe_sampler_view *view = &sview->base;
- struct si_texture *tex = (struct si_texture *)view->texture;
- bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER;
-
- if (unlikely(!is_buffer && sview->dcc_incompatible)) {
- if (vi_dcc_enabled(tex, view->u.tex.first_level))
- if (!si_texture_disable_dcc(sctx, tex))
- si_decompress_dcc(sctx, tex);
-
- sview->dcc_incompatible = false;
- }
-
- assert(tex); /* views with texture == NULL aren't supported */
- memcpy(desc, sview->state, 8*4);
-
- if (is_buffer) {
- si_set_buf_desc_address(&tex->buffer,
- sview->base.u.buf.offset,
- desc + 4);
- } else {
- bool is_separate_stencil = tex->db_compatible &&
- sview->is_stencil_sampler;
-
- si_set_mutable_tex_desc_fields(sctx->screen, tex,
- sview->base_level_info,
- sview->base_level,
- sview->base.u.tex.first_level,
- sview->block_width,
- is_separate_stencil,
- desc);
- }
-
- if (!is_buffer && tex->surface.fmask_size) {
- memcpy(desc + 8, sview->fmask_state, 8*4);
- } else {
- /* Disable FMASK and bind sampler state in [12:15]. */
- memcpy(desc + 8, null_texture_descriptor, 4*4);
-
- if (sstate)
- si_set_sampler_state_desc(sstate, sview,
- is_buffer ? NULL : tex,
- desc + 12);
- }
+ struct si_sampler_view *sview, struct si_texture *tex,
+ uint32_t *desc)
+{
+ if (sview && sview->is_integer)
+ memcpy(desc, sstate->integer_val, 4 * 4);
+ else if (tex && tex->upgraded_depth && (!sview || !sview->is_stencil_sampler))
+ memcpy(desc, sstate->upgraded_depth_val, 4 * 4);
+ else
+ memcpy(desc, sstate->val, 4 * 4);
+}
+
+static void si_set_sampler_view_desc(struct si_context *sctx, struct si_sampler_view *sview,
+ struct si_sampler_state *sstate, uint32_t *desc)
+{
+ struct pipe_sampler_view *view = &sview->base;
+ struct si_texture *tex = (struct si_texture *)view->texture;
+ bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER;
+
+ if (unlikely(!is_buffer && sview->dcc_incompatible)) {
+ if (vi_dcc_enabled(tex, view->u.tex.first_level))
+ if (!si_texture_disable_dcc(sctx, tex))
+ si_decompress_dcc(sctx, tex);
+
+ sview->dcc_incompatible = false;
+ }
+
+ assert(tex); /* views with texture == NULL aren't supported */
+ memcpy(desc, sview->state, 8 * 4);
+
+ if (is_buffer) {
+ si_set_buf_desc_address(&tex->buffer, sview->base.u.buf.offset, desc + 4);
+ } else {
+ bool is_separate_stencil = tex->db_compatible && sview->is_stencil_sampler;
+
+ si_set_mutable_tex_desc_fields(sctx->screen, tex, sview->base_level_info, sview->base_level,
+ sview->base.u.tex.first_level, sview->block_width,
+ is_separate_stencil, desc);
+ }
+
+ if (!is_buffer && tex->surface.fmask_size) {
+ memcpy(desc + 8, sview->fmask_state, 8 * 4);
+ } else {
+ /* Disable FMASK and bind sampler state in [12:15]. */
+ memcpy(desc + 8, null_texture_descriptor, 4 * 4);
+
+ if (sstate)
+ si_set_sampler_state_desc(sstate, sview, is_buffer ? NULL : tex, desc + 12);
+ }
}
static bool color_needs_decompression(struct si_texture *tex)
{
- return tex->surface.fmask_size ||
- (tex->dirty_level_mask &&
- (tex->cmask_buffer || tex->surface.dcc_offset));
+ return tex->surface.fmask_size ||
+ (tex->dirty_level_mask && (tex->cmask_buffer || tex->surface.dcc_offset));
}
static bool depth_needs_decompression(struct si_texture *tex)
{
- /* If the depth/stencil texture is TC-compatible, no decompression
- * will be done. The decompression function will only flush DB caches
- * to make it coherent with shaders. That's necessary because the driver
- * doesn't flush DB caches in any other case.
- */
- return tex->db_compatible;
-}
-
-static void si_set_sampler_view(struct si_context *sctx,
- unsigned shader,
- unsigned slot, struct pipe_sampler_view *view,
- bool disallow_early_out)
-{
- struct si_samplers *samplers = &sctx->samplers[shader];
- struct si_sampler_view *sview = (struct si_sampler_view*)view;
- struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader);
- unsigned desc_slot = si_get_sampler_slot(slot);
- uint32_t *desc = descs->list + desc_slot * 16;
-
- if (samplers->views[slot] == view && !disallow_early_out)
- return;
-
- if (view) {
- struct si_texture *tex = (struct si_texture *)view->texture;
-
- si_set_sampler_view_desc(sctx, sview,
- samplers->sampler_states[slot], desc);
-
- if (tex->buffer.b.b.target == PIPE_BUFFER) {
- tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW;
- samplers->needs_depth_decompress_mask &= ~(1u << slot);
- samplers->needs_color_decompress_mask &= ~(1u << slot);
- } else {
- if (depth_needs_decompression(tex)) {
- samplers->needs_depth_decompress_mask |= 1u << slot;
- } else {
- samplers->needs_depth_decompress_mask &= ~(1u << slot);
- }
- if (color_needs_decompression(tex)) {
- samplers->needs_color_decompress_mask |= 1u << slot;
- } else {
- samplers->needs_color_decompress_mask &= ~(1u << slot);
- }
-
- if (tex->surface.dcc_offset &&
- p_atomic_read(&tex->framebuffers_bound))
- sctx->need_check_render_feedback = true;
- }
-
- pipe_sampler_view_reference(&samplers->views[slot], view);
- samplers->enabled_mask |= 1u << slot;
-
- /* Since this can flush, it must be done after enabled_mask is
- * updated. */
- si_sampler_view_add_buffer(sctx, view->texture,
- RADEON_USAGE_READ,
- sview->is_stencil_sampler, true);
- } else {
- pipe_sampler_view_reference(&samplers->views[slot], NULL);
- memcpy(desc, null_texture_descriptor, 8*4);
- /* Only clear the lower dwords of FMASK. */
- memcpy(desc + 8, null_texture_descriptor, 4*4);
- /* Re-set the sampler state if we are transitioning from FMASK. */
- if (samplers->sampler_states[slot])
- si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL,
- desc + 12);
-
- samplers->enabled_mask &= ~(1u << slot);
- samplers->needs_depth_decompress_mask &= ~(1u << slot);
- samplers->needs_color_decompress_mask &= ~(1u << slot);
- }
-
- sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
-}
-
-static void si_update_shader_needs_decompress_mask(struct si_context *sctx,
- unsigned shader)
-{
- struct si_samplers *samplers = &sctx->samplers[shader];
- unsigned shader_bit = 1 << shader;
-
- if (samplers->needs_depth_decompress_mask ||
- samplers->needs_color_decompress_mask ||
- sctx->images[shader].needs_color_decompress_mask)
- sctx->shader_needs_decompress_mask |= shader_bit;
- else
- sctx->shader_needs_decompress_mask &= ~shader_bit;
-}
-
-static void si_set_sampler_views(struct pipe_context *ctx,
- enum pipe_shader_type shader, unsigned start,
- unsigned count,
- struct pipe_sampler_view **views)
-{
- struct si_context *sctx = (struct si_context *)ctx;
- int i;
-
- if (!count || shader >= SI_NUM_SHADERS)
- return;
-
- if (views) {
- for (i = 0; i < count; i++)
- si_set_sampler_view(sctx, shader, start + i, views[i], false);
- } else {
- for (i = 0; i < count; i++)
- si_set_sampler_view(sctx, shader, start + i, NULL, false);
- }
-
- si_update_shader_needs_decompress_mask(sctx, shader);
-}
-
-static void
-si_samplers_update_needs_color_decompress_mask(struct si_samplers *samplers)
-{
- unsigned mask = samplers->enabled_mask;
-
- while (mask) {
- int i = u_bit_scan(&mask);
- struct pipe_resource *res = samplers->views[i]->texture;
-
- if (res && res->target != PIPE_BUFFER) {
- struct si_texture *tex = (struct si_texture *)res;
-
- if (color_needs_decompression(tex)) {
- samplers->needs_color_decompress_mask |= 1u << i;
- } else {
- samplers->needs_color_decompress_mask &= ~(1u << i);
- }
- }
- }
+ /* If the depth/stencil texture is TC-compatible, no decompression
+ * will be done. The decompression function will only flush DB caches
+ * to make it coherent with shaders. That's necessary because the driver
+ * doesn't flush DB caches in any other case.
+ */
+ return tex->db_compatible;
+}
+
+static void si_set_sampler_view(struct si_context *sctx, unsigned shader, unsigned slot,
+ struct pipe_sampler_view *view, bool disallow_early_out)
+{
+ struct si_samplers *samplers = &sctx->samplers[shader];
+ struct si_sampler_view *sview = (struct si_sampler_view *)view;
+ struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader);
+ unsigned desc_slot = si_get_sampler_slot(slot);
+ uint32_t *desc = descs->list + desc_slot * 16;
+
+ if (samplers->views[slot] == view && !disallow_early_out)
+ return;
+
+ if (view) {
+ struct si_texture *tex = (struct si_texture *)view->texture;
+
+ si_set_sampler_view_desc(sctx, sview, samplers->sampler_states[slot], desc);
+
+ if (tex->buffer.b.b.target == PIPE_BUFFER) {
+ tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW;
+ samplers->needs_depth_decompress_mask &= ~(1u << slot);
+ samplers->needs_color_decompress_mask &= ~(1u << slot);
+ } else {
+ if (depth_needs_decompression(tex)) {
+ samplers->needs_depth_decompress_mask |= 1u << slot;
+ } else {
+ samplers->needs_depth_decompress_mask &= ~(1u << slot);
+ }
+ if (color_needs_decompression(tex)) {
+ samplers->needs_color_decompress_mask |= 1u << slot;
+ } else {
+ samplers->needs_color_decompress_mask &= ~(1u << slot);
+ }
+
+ if (tex->surface.dcc_offset && p_atomic_read(&tex->framebuffers_bound))
+ sctx->need_check_render_feedback = true;
+ }
+
+ pipe_sampler_view_reference(&samplers->views[slot], view);
+ samplers->enabled_mask |= 1u << slot;
+
+ /* Since this can flush, it must be done after enabled_mask is
+ * updated. */
+ si_sampler_view_add_buffer(sctx, view->texture, RADEON_USAGE_READ, sview->is_stencil_sampler,
+ true);
+ } else {
+ pipe_sampler_view_reference(&samplers->views[slot], NULL);
+ memcpy(desc, null_texture_descriptor, 8 * 4);
+ /* Only clear the lower dwords of FMASK. */
+ memcpy(desc + 8, null_texture_descriptor, 4 * 4);
+ /* Re-set the sampler state if we are transitioning from FMASK. */
+ if (samplers->sampler_states[slot])
+ si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, desc + 12);
+
+ samplers->enabled_mask &= ~(1u << slot);
+ samplers->needs_depth_decompress_mask &= ~(1u << slot);
+ samplers->needs_color_decompress_mask &= ~(1u << slot);
+ }
+
+ sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+}
+
+static void si_update_shader_needs_decompress_mask(struct si_context *sctx, unsigned shader)
+{
+ struct si_samplers *samplers = &sctx->samplers[shader];
+ unsigned shader_bit = 1 << shader;
+
+ if (samplers->needs_depth_decompress_mask || samplers->needs_color_decompress_mask ||
+ sctx->images[shader].needs_color_decompress_mask)
+ sctx->shader_needs_decompress_mask |= shader_bit;
+ else
+ sctx->shader_needs_decompress_mask &= ~shader_bit;
+}
+
+static void si_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type shader,
+ unsigned start, unsigned count, struct pipe_sampler_view **views)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ int i;
+
+ if (!count || shader >= SI_NUM_SHADERS)
+ return;
+
+ if (views) {
+ for (i = 0; i < count; i++)
+ si_set_sampler_view(sctx, shader, start + i, views[i], false);
+ } else {
+ for (i = 0; i < count; i++)
+ si_set_sampler_view(sctx, shader, start + i, NULL, false);
+ }
+
+ si_update_shader_needs_decompress_mask(sctx, shader);
+}
+
+static void si_samplers_update_needs_color_decompress_mask(struct si_samplers *samplers)
+{
+ unsigned mask = samplers->enabled_mask;
+
+ while (mask) {
+ int i = u_bit_scan(&mask);
+ struct pipe_resource *res = samplers->views[i]->texture;
+
+ if (res && res->target != PIPE_BUFFER) {
+ struct si_texture *tex = (struct si_texture *)res;
+
+ if (color_needs_decompression(tex)) {
+ samplers->needs_color_decompress_mask |= 1u << i;
+ } else {
+ samplers->needs_color_decompress_mask &= ~(1u << i);
+ }
+ }
+ }
}
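/* A minimal illustrative sketch (not from the patch): the enabled_mask loops
 * in this file all use the same idiom - u_bit_scan() returns the index of the
 * lowest set bit and clears it from the mask.  An equivalent standalone
 * version, assuming the GCC/Clang __builtin_ctz() intrinsic is available:
 */
static int example_bit_scan(unsigned *mask)
{
   int i = __builtin_ctz(*mask); /* index of the lowest set bit; *mask must be nonzero */
   *mask &= *mask - 1;           /* clear that bit */
   return i;
}
/* Usage, mirroring the loops above:
 *    unsigned mask = enabled_mask;
 *    while (mask) {
 *       int slot = example_bit_scan(&mask);
 *       ...process slot...
 *    }
 */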
/* IMAGE VIEWS */
-static void
-si_release_image_views(struct si_images *images)
+static void si_release_image_views(struct si_images *images)
{
- unsigned i;
+ unsigned i;
- for (i = 0; i < SI_NUM_IMAGES; ++i) {
- struct pipe_image_view *view = &images->views[i];
+ for (i = 0; i < SI_NUM_IMAGES; ++i) {
+ struct pipe_image_view *view = &images->views[i];
- pipe_resource_reference(&view->resource, NULL);
- }
+ pipe_resource_reference(&view->resource, NULL);
+ }
}
-static void
-si_image_views_begin_new_cs(struct si_context *sctx, struct si_images *images)
+static void si_image_views_begin_new_cs(struct si_context *sctx, struct si_images *images)
{
- uint mask = images->enabled_mask;
+ uint mask = images->enabled_mask;
+
+ /* Add buffers to the CS. */
+ while (mask) {
+ int i = u_bit_scan(&mask);
+ struct pipe_image_view *view = &images->views[i];
- /* Add buffers to the CS. */
- while (mask) {
- int i = u_bit_scan(&mask);
- struct pipe_image_view *view = &images->views[i];
+ assert(view->resource);
- assert(view->resource);
+ si_sampler_view_add_buffer(sctx, view->resource, RADEON_USAGE_READWRITE, false, false);
+ }
+}
- si_sampler_view_add_buffer(sctx, view->resource,
- RADEON_USAGE_READWRITE, false, false);
- }
-}
-
-static void
-si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot)
-{
- struct si_images *images = &ctx->images[shader];
-
- if (images->enabled_mask & (1u << slot)) {
- struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader);
- unsigned desc_slot = si_get_image_slot(slot);
-
- pipe_resource_reference(&images->views[slot].resource, NULL);
- images->needs_color_decompress_mask &= ~(1 << slot);
-
- memcpy(descs->list + desc_slot*8, null_image_descriptor, 8*4);
- images->enabled_mask &= ~(1u << slot);
- ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
- }
-}
-
-static void
-si_mark_image_range_valid(const struct pipe_image_view *view)
-{
- struct si_resource *res = si_resource(view->resource);
-
- if (res->b.b.target != PIPE_BUFFER)
- return;
-
- util_range_add(&res->b.b, &res->valid_buffer_range,
- view->u.buf.offset,
- view->u.buf.offset + view->u.buf.size);
-}
-
-static void si_set_shader_image_desc(struct si_context *ctx,
- const struct pipe_image_view *view,
- bool skip_decompress,
- uint32_t *desc, uint32_t *fmask_desc)
-{
- struct si_screen *screen = ctx->screen;
- struct si_resource *res;
-
- res = si_resource(view->resource);
-
- if (res->b.b.target == PIPE_BUFFER ||
- view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) {
- if (view->access & PIPE_IMAGE_ACCESS_WRITE)
- si_mark_image_range_valid(view);
-
- si_make_buffer_descriptor(screen, res,
- view->format,
- view->u.buf.offset,
- view->u.buf.size, desc);
- si_set_buf_desc_address(res, view->u.buf.offset, desc + 4);
- } else {
- static const unsigned char swizzle[4] = { 0, 1, 2, 3 };
- struct si_texture *tex = (struct si_texture *)res;
- unsigned level = view->u.tex.level;
- unsigned width, height, depth, hw_level;
- bool uses_dcc = vi_dcc_enabled(tex, level);
- unsigned access = view->access;
-
- assert(!tex->is_depth);
- assert(fmask_desc || tex->surface.fmask_offset == 0);
-
- if (uses_dcc && !skip_decompress &&
- (access & PIPE_IMAGE_ACCESS_WRITE ||
- !vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) {
- /* If DCC can't be disabled, at least decompress it.
- * The decompression is relatively cheap if the surface
- * has been decompressed already.
- */
- if (!si_texture_disable_dcc(ctx, tex))
- si_decompress_dcc(ctx, tex);
- }
-
- if (ctx->chip_class >= GFX9) {
- /* Always set the base address. The swizzle modes don't
- * allow setting mipmap level offsets as the base.
- */
- width = res->b.b.width0;
- height = res->b.b.height0;
- depth = res->b.b.depth0;
- hw_level = level;
- } else {
- /* Always force the base level to the selected level.
- *
- * This is required for 3D textures, where otherwise
- * selecting a single slice for non-layered bindings
- * fails. It doesn't hurt the other targets.
- */
- width = u_minify(res->b.b.width0, level);
- height = u_minify(res->b.b.height0, level);
- depth = u_minify(res->b.b.depth0, level);
- hw_level = 0;
- }
-
- screen->make_texture_descriptor(screen, tex,
- false, res->b.b.target,
- view->format, swizzle,
- hw_level, hw_level,
- view->u.tex.first_layer,
- view->u.tex.last_layer,
- width, height, depth,
- desc, fmask_desc);
- si_set_mutable_tex_desc_fields(screen, tex,
- &tex->surface.u.legacy.level[level],
- level, level,
- util_format_get_blockwidth(view->format),
- false, desc);
- }
-}
-
-static void si_set_shader_image(struct si_context *ctx,
- unsigned shader,
- unsigned slot, const struct pipe_image_view *view,
- bool skip_decompress)
-{
- struct si_images *images = &ctx->images[shader];
- struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader);
- struct si_resource *res;
-
- if (!view || !view->resource) {
- si_disable_shader_image(ctx, shader, slot);
- return;
- }
-
- res = si_resource(view->resource);
-
- if (&images->views[slot] != view)
- util_copy_image_view(&images->views[slot], view);
-
- si_set_shader_image_desc(ctx, view, skip_decompress,
- descs->list + si_get_image_slot(slot) * 8,
- descs->list + si_get_image_slot(slot + SI_NUM_IMAGES) * 8);
-
- if (res->b.b.target == PIPE_BUFFER ||
- view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) {
- images->needs_color_decompress_mask &= ~(1 << slot);
- res->bind_history |= PIPE_BIND_SHADER_IMAGE;
- } else {
- struct si_texture *tex = (struct si_texture *)res;
- unsigned level = view->u.tex.level;
-
- if (color_needs_decompression(tex)) {
- images->needs_color_decompress_mask |= 1 << slot;
- } else {
- images->needs_color_decompress_mask &= ~(1 << slot);
- }
-
- if (vi_dcc_enabled(tex, level) &&
- p_atomic_read(&tex->framebuffers_bound))
- ctx->need_check_render_feedback = true;
- }
-
- images->enabled_mask |= 1u << slot;
- ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
-
- /* Since this can flush, it must be done after enabled_mask is updated. */
- si_sampler_view_add_buffer(ctx, &res->b.b,
- (view->access & PIPE_IMAGE_ACCESS_WRITE) ?
- RADEON_USAGE_READWRITE : RADEON_USAGE_READ,
- false, true);
-}
-
-static void
-si_set_shader_images(struct pipe_context *pipe,
- enum pipe_shader_type shader,
- unsigned start_slot, unsigned count,
- const struct pipe_image_view *views)
-{
- struct si_context *ctx = (struct si_context *)pipe;
- unsigned i, slot;
-
- assert(shader < SI_NUM_SHADERS);
-
- if (!count)
- return;
-
- assert(start_slot + count <= SI_NUM_IMAGES);
-
- if (views) {
- for (i = 0, slot = start_slot; i < count; ++i, ++slot)
- si_set_shader_image(ctx, shader, slot, &views[i], false);
- } else {
- for (i = 0, slot = start_slot; i < count; ++i, ++slot)
- si_set_shader_image(ctx, shader, slot, NULL, false);
- }
-
- si_update_shader_needs_decompress_mask(ctx, shader);
-}
-
-static void
-si_images_update_needs_color_decompress_mask(struct si_images *images)
+static void si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot)
{
- unsigned mask = images->enabled_mask;
+ struct si_images *images = &ctx->images[shader];
+
+ if (images->enabled_mask & (1u << slot)) {
+ struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader);
+ unsigned desc_slot = si_get_image_slot(slot);
+
+ pipe_resource_reference(&images->views[slot].resource, NULL);
+ images->needs_color_decompress_mask &= ~(1 << slot);
+
+ memcpy(descs->list + desc_slot * 8, null_image_descriptor, 8 * 4);
+ images->enabled_mask &= ~(1u << slot);
+ ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+ }
+}
- while (mask) {
- int i = u_bit_scan(&mask);
- struct pipe_resource *res = images->views[i].resource;
+static void si_mark_image_range_valid(const struct pipe_image_view *view)
+{
+ struct si_resource *res = si_resource(view->resource);
- if (res && res->target != PIPE_BUFFER) {
- struct si_texture *tex = (struct si_texture *)res;
+ if (res->b.b.target != PIPE_BUFFER)
+ return;
- if (color_needs_decompression(tex)) {
- images->needs_color_decompress_mask |= 1 << i;
- } else {
- images->needs_color_decompress_mask &= ~(1 << i);
- }
- }
- }
+ util_range_add(&res->b.b, &res->valid_buffer_range, view->u.buf.offset,
+ view->u.buf.offset + view->u.buf.size);
+}
+
+static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_image_view *view,
+ bool skip_decompress, uint32_t *desc, uint32_t *fmask_desc)
+{
+ struct si_screen *screen = ctx->screen;
+ struct si_resource *res;
+
+ res = si_resource(view->resource);
+
+ if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) {
+ if (view->access & PIPE_IMAGE_ACCESS_WRITE)
+ si_mark_image_range_valid(view);
+
+ si_make_buffer_descriptor(screen, res, view->format, view->u.buf.offset, view->u.buf.size,
+ desc);
+ si_set_buf_desc_address(res, view->u.buf.offset, desc + 4);
+ } else {
+ static const unsigned char swizzle[4] = {0, 1, 2, 3};
+ struct si_texture *tex = (struct si_texture *)res;
+ unsigned level = view->u.tex.level;
+ unsigned width, height, depth, hw_level;
+ bool uses_dcc = vi_dcc_enabled(tex, level);
+ unsigned access = view->access;
+
+ assert(!tex->is_depth);
+ assert(fmask_desc || tex->surface.fmask_offset == 0);
+
+ if (uses_dcc && !skip_decompress &&
+ (access & PIPE_IMAGE_ACCESS_WRITE ||
+ !vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) {
+ /* If DCC can't be disabled, at least decompress it.
+ * The decompression is relatively cheap if the surface
+ * has been decompressed already.
+ */
+ if (!si_texture_disable_dcc(ctx, tex))
+ si_decompress_dcc(ctx, tex);
+ }
+
+ if (ctx->chip_class >= GFX9) {
+ /* Always set the base address. The swizzle modes don't
+ * allow setting mipmap level offsets as the base.
+ */
+ width = res->b.b.width0;
+ height = res->b.b.height0;
+ depth = res->b.b.depth0;
+ hw_level = level;
+ } else {
+ /* Always force the base level to the selected level.
+ *
+ * This is required for 3D textures, where otherwise
+ * selecting a single slice for non-layered bindings
+ * fails. It doesn't hurt the other targets.
+ */
+ width = u_minify(res->b.b.width0, level);
+ height = u_minify(res->b.b.height0, level);
+ depth = u_minify(res->b.b.depth0, level);
+ hw_level = 0;
+ }
+
+ screen->make_texture_descriptor(
+ screen, tex, false, res->b.b.target, view->format, swizzle, hw_level, hw_level,
+ view->u.tex.first_layer, view->u.tex.last_layer, width, height, depth, desc, fmask_desc);
+ si_set_mutable_tex_desc_fields(screen, tex, &tex->surface.u.legacy.level[level], level, level,
+ util_format_get_blockwidth(view->format), false, desc);
+ }
+}
+
+static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigned slot,
+ const struct pipe_image_view *view, bool skip_decompress)
+{
+ struct si_images *images = &ctx->images[shader];
+ struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader);
+ struct si_resource *res;
+
+ if (!view || !view->resource) {
+ si_disable_shader_image(ctx, shader, slot);
+ return;
+ }
+
+ res = si_resource(view->resource);
+
+ if (&images->views[slot] != view)
+ util_copy_image_view(&images->views[slot], view);
+
+ si_set_shader_image_desc(ctx, view, skip_decompress, descs->list + si_get_image_slot(slot) * 8,
+ descs->list + si_get_image_slot(slot + SI_NUM_IMAGES) * 8);
+
+ if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) {
+ images->needs_color_decompress_mask &= ~(1 << slot);
+ res->bind_history |= PIPE_BIND_SHADER_IMAGE;
+ } else {
+ struct si_texture *tex = (struct si_texture *)res;
+ unsigned level = view->u.tex.level;
+
+ if (color_needs_decompression(tex)) {
+ images->needs_color_decompress_mask |= 1 << slot;
+ } else {
+ images->needs_color_decompress_mask &= ~(1 << slot);
+ }
+
+ if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound))
+ ctx->need_check_render_feedback = true;
+ }
+
+ images->enabled_mask |= 1u << slot;
+ ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+
+ /* Since this can flush, it must be done after enabled_mask is updated. */
+ si_sampler_view_add_buffer(
+ ctx, &res->b.b,
+ (view->access & PIPE_IMAGE_ACCESS_WRITE) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, false,
+ true);
+}
+
+static void si_set_shader_images(struct pipe_context *pipe, enum pipe_shader_type shader,
+ unsigned start_slot, unsigned count,
+ const struct pipe_image_view *views)
+{
+ struct si_context *ctx = (struct si_context *)pipe;
+ unsigned i, slot;
+
+ assert(shader < SI_NUM_SHADERS);
+
+ if (!count)
+ return;
+
+ assert(start_slot + count <= SI_NUM_IMAGES);
+
+ if (views) {
+ for (i = 0, slot = start_slot; i < count; ++i, ++slot)
+ si_set_shader_image(ctx, shader, slot, &views[i], false);
+ } else {
+ for (i = 0, slot = start_slot; i < count; ++i, ++slot)
+ si_set_shader_image(ctx, shader, slot, NULL, false);
+ }
+
+ si_update_shader_needs_decompress_mask(ctx, shader);
+}
+
+static void si_images_update_needs_color_decompress_mask(struct si_images *images)
+{
+ unsigned mask = images->enabled_mask;
+
+ while (mask) {
+ int i = u_bit_scan(&mask);
+ struct pipe_resource *res = images->views[i].resource;
+
+ if (res && res->target != PIPE_BUFFER) {
+ struct si_texture *tex = (struct si_texture *)res;
+
+ if (color_needs_decompression(tex)) {
+ images->needs_color_decompress_mask |= 1 << i;
+ } else {
+ images->needs_color_decompress_mask &= ~(1 << i);
+ }
+ }
+ }
}
void si_update_ps_colorbuf0_slot(struct si_context *sctx)
{
- struct si_buffer_resources *buffers = &sctx->rw_buffers;
- struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
- unsigned slot = SI_PS_IMAGE_COLORBUF0;
- struct pipe_surface *surf = NULL;
-
- /* si_texture_disable_dcc can get us here again. */
- if (sctx->blitter->running)
- return;
-
- /* See whether FBFETCH is used and color buffer 0 is set. */
- if (sctx->ps_shader.cso &&
- sctx->ps_shader.cso->info.uses_fbfetch &&
- sctx->framebuffer.state.nr_cbufs &&
- sctx->framebuffer.state.cbufs[0])
- surf = sctx->framebuffer.state.cbufs[0];
-
- /* Return if FBFETCH transitions from disabled to disabled. */
- if (!buffers->buffers[slot] && !surf)
- return;
-
- sctx->ps_uses_fbfetch = surf != NULL;
- si_update_ps_iter_samples(sctx);
-
- if (surf) {
- struct si_texture *tex = (struct si_texture*)surf->texture;
- struct pipe_image_view view = {0};
-
- assert(tex);
- assert(!tex->is_depth);
-
- /* Disable DCC, because the texture is used as both a sampler
- * and color buffer.
- */
- si_texture_disable_dcc(sctx, tex);
-
- if (tex->buffer.b.b.nr_samples <= 1 && tex->cmask_buffer) {
- /* Disable CMASK. */
- assert(tex->cmask_buffer != &tex->buffer);
- si_eliminate_fast_color_clear(sctx, tex);
- si_texture_discard_cmask(sctx->screen, tex);
- }
-
- view.resource = surf->texture;
- view.format = surf->format;
- view.access = PIPE_IMAGE_ACCESS_READ;
- view.u.tex.first_layer = surf->u.tex.first_layer;
- view.u.tex.last_layer = surf->u.tex.last_layer;
- view.u.tex.level = surf->u.tex.level;
-
- /* Set the descriptor. */
- uint32_t *desc = descs->list + slot*4;
- memset(desc, 0, 16 * 4);
- si_set_shader_image_desc(sctx, &view, true, desc, desc + 8);
-
- pipe_resource_reference(&buffers->buffers[slot], &tex->buffer.b.b);
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- &tex->buffer, RADEON_USAGE_READ,
- RADEON_PRIO_SHADER_RW_IMAGE);
- buffers->enabled_mask |= 1u << slot;
- } else {
- /* Clear the descriptor. */
- memset(descs->list + slot*4, 0, 8*4);
- pipe_resource_reference(&buffers->buffers[slot], NULL);
- buffers->enabled_mask &= ~(1u << slot);
- }
-
- sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
+ struct si_buffer_resources *buffers = &sctx->rw_buffers;
+ struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
+ unsigned slot = SI_PS_IMAGE_COLORBUF0;
+ struct pipe_surface *surf = NULL;
+
+ /* si_texture_disable_dcc can get us here again. */
+ if (sctx->blitter->running)
+ return;
+
+ /* See whether FBFETCH is used and color buffer 0 is set. */
+ if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_fbfetch &&
+ sctx->framebuffer.state.nr_cbufs && sctx->framebuffer.state.cbufs[0])
+ surf = sctx->framebuffer.state.cbufs[0];
+
+ /* Return if FBFETCH transitions from disabled to disabled. */
+ if (!buffers->buffers[slot] && !surf)
+ return;
+
+ sctx->ps_uses_fbfetch = surf != NULL;
+ si_update_ps_iter_samples(sctx);
+
+ if (surf) {
+ struct si_texture *tex = (struct si_texture *)surf->texture;
+ struct pipe_image_view view = {0};
+
+ assert(tex);
+ assert(!tex->is_depth);
+
+ /* Disable DCC, because the texture is used as both a sampler
+ * and color buffer.
+ */
+ si_texture_disable_dcc(sctx, tex);
+
+ if (tex->buffer.b.b.nr_samples <= 1 && tex->cmask_buffer) {
+ /* Disable CMASK. */
+ assert(tex->cmask_buffer != &tex->buffer);
+ si_eliminate_fast_color_clear(sctx, tex);
+ si_texture_discard_cmask(sctx->screen, tex);
+ }
+
+ view.resource = surf->texture;
+ view.format = surf->format;
+ view.access = PIPE_IMAGE_ACCESS_READ;
+ view.u.tex.first_layer = surf->u.tex.first_layer;
+ view.u.tex.last_layer = surf->u.tex.last_layer;
+ view.u.tex.level = surf->u.tex.level;
+
+ /* Set the descriptor. */
+ uint32_t *desc = descs->list + slot * 4;
+ memset(desc, 0, 16 * 4);
+ si_set_shader_image_desc(sctx, &view, true, desc, desc + 8);
+
+ pipe_resource_reference(&buffers->buffers[slot], &tex->buffer.b.b);
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READ,
+ RADEON_PRIO_SHADER_RW_IMAGE);
+ buffers->enabled_mask |= 1u << slot;
+ } else {
+ /* Clear the descriptor. */
+ memset(descs->list + slot * 4, 0, 8 * 4);
+ pipe_resource_reference(&buffers->buffers[slot], NULL);
+ buffers->enabled_mask &= ~(1u << slot);
+ }
+
+ sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
}
/* SAMPLER STATES */
-static void si_bind_sampler_states(struct pipe_context *ctx,
- enum pipe_shader_type shader,
+static void si_bind_sampler_states(struct pipe_context *ctx, enum pipe_shader_type shader,
unsigned start, unsigned count, void **states)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_samplers *samplers = &sctx->samplers[shader];
- struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, shader);
- struct si_sampler_state **sstates = (struct si_sampler_state**)states;
- int i;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_samplers *samplers = &sctx->samplers[shader];
+ struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, shader);
+ struct si_sampler_state **sstates = (struct si_sampler_state **)states;
+ int i;
- if (!count || shader >= SI_NUM_SHADERS || !sstates)
- return;
+ if (!count || shader >= SI_NUM_SHADERS || !sstates)
+ return;
- for (i = 0; i < count; i++) {
- unsigned slot = start + i;
- unsigned desc_slot = si_get_sampler_slot(slot);
+ for (i = 0; i < count; i++) {
+ unsigned slot = start + i;
+ unsigned desc_slot = si_get_sampler_slot(slot);
- if (!sstates[i] ||
- sstates[i] == samplers->sampler_states[slot])
- continue;
+ if (!sstates[i] || sstates[i] == samplers->sampler_states[slot])
+ continue;
#ifndef NDEBUG
- assert(sstates[i]->magic == SI_SAMPLER_STATE_MAGIC);
+ assert(sstates[i]->magic == SI_SAMPLER_STATE_MAGIC);
#endif
- samplers->sampler_states[slot] = sstates[i];
+ samplers->sampler_states[slot] = sstates[i];
- /* If FMASK is bound, don't overwrite it.
- * The sampler state will be set after FMASK is unbound.
- */
- struct si_sampler_view *sview =
- (struct si_sampler_view *)samplers->views[slot];
+ /* If FMASK is bound, don't overwrite it.
+ * The sampler state will be set after FMASK is unbound.
+ */
+ struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[slot];
- struct si_texture *tex = NULL;
+ struct si_texture *tex = NULL;
- if (sview && sview->base.texture &&
- sview->base.texture->target != PIPE_BUFFER)
- tex = (struct si_texture *)sview->base.texture;
+ if (sview && sview->base.texture && sview->base.texture->target != PIPE_BUFFER)
+ tex = (struct si_texture *)sview->base.texture;
- if (tex && tex->surface.fmask_size)
- continue;
+ if (tex && tex->surface.fmask_size)
+ continue;
- si_set_sampler_state_desc(sstates[i], sview, tex,
- desc->list + desc_slot * 16 + 12);
+ si_set_sampler_state_desc(sstates[i], sview, tex, desc->list + desc_slot * 16 + 12);
- sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
- }
+ sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+ }
}
/* BUFFER RESOURCES */
static void si_init_buffer_resources(struct si_buffer_resources *buffers,
- struct si_descriptors *descs,
- unsigned num_buffers,
- short shader_userdata_rel_index,
- enum radeon_bo_priority priority,
- enum radeon_bo_priority priority_constbuf)
+ struct si_descriptors *descs, unsigned num_buffers,
+ short shader_userdata_rel_index,
+ enum radeon_bo_priority priority,
+ enum radeon_bo_priority priority_constbuf)
{
- buffers->priority = priority;
- buffers->priority_constbuf = priority_constbuf;
- buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
- buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0]));
+ buffers->priority = priority;
+ buffers->priority_constbuf = priority_constbuf;
+ buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource *));
+ buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0]));
- si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers);
+ si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers);
}
static void si_release_buffer_resources(struct si_buffer_resources *buffers,
- struct si_descriptors *descs)
+ struct si_descriptors *descs)
{
- int i;
+ int i;
- for (i = 0; i < descs->num_elements; i++) {
- pipe_resource_reference(&buffers->buffers[i], NULL);
- }
+ for (i = 0; i < descs->num_elements; i++) {
+ pipe_resource_reference(&buffers->buffers[i], NULL);
+ }
- FREE(buffers->buffers);
- FREE(buffers->offsets);
+ FREE(buffers->buffers);
+ FREE(buffers->offsets);
}
static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
- struct si_buffer_resources *buffers)
+ struct si_buffer_resources *buffers)
{
- unsigned mask = buffers->enabled_mask;
+ unsigned mask = buffers->enabled_mask;
- /* Add buffers to the CS. */
- while (mask) {
- int i = u_bit_scan(&mask);
+ /* Add buffers to the CS. */
+ while (mask) {
+ int i = u_bit_scan(&mask);
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- si_resource(buffers->buffers[i]),
- buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE :
- RADEON_USAGE_READ,
- i < SI_NUM_SHADER_BUFFERS ? buffers->priority :
- buffers->priority_constbuf);
- }
+ radeon_add_to_buffer_list(
+ sctx, sctx->gfx_cs, si_resource(buffers->buffers[i]),
+ buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ,
+ i < SI_NUM_SHADER_BUFFERS ? buffers->priority : buffers->priority_constbuf);
+ }
}
static void si_get_buffer_from_descriptors(struct si_buffer_resources *buffers,
- struct si_descriptors *descs,
- unsigned idx, struct pipe_resource **buf,
- unsigned *offset, unsigned *size)
+ struct si_descriptors *descs, unsigned idx,
+ struct pipe_resource **buf, unsigned *offset,
+ unsigned *size)
{
- pipe_resource_reference(buf, buffers->buffers[idx]);
- if (*buf) {
- struct si_resource *res = si_resource(*buf);
- const uint32_t *desc = descs->list + idx * 4;
- uint64_t va;
+ pipe_resource_reference(buf, buffers->buffers[idx]);
+ if (*buf) {
+ struct si_resource *res = si_resource(*buf);
+ const uint32_t *desc = descs->list + idx * 4;
+ uint64_t va;
- *size = desc[2];
+ *size = desc[2];
- assert(G_008F04_STRIDE(desc[1]) == 0);
- va = si_desc_extract_buffer_address(desc);
+ assert(G_008F04_STRIDE(desc[1]) == 0);
+ va = si_desc_extract_buffer_address(desc);
- assert(va >= res->gpu_address && va + *size <= res->gpu_address + res->bo_size);
- *offset = va - res->gpu_address;
- }
+ assert(va >= res->gpu_address && va + *size <= res->gpu_address + res->bo_size);
+ *offset = va - res->gpu_address;
+ }
}
/* VERTEX BUFFERS */
static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
{
- int count = sctx->num_vertex_elements;
- int i;
+ int count = sctx->num_vertex_elements;
+ int i;
- for (i = 0; i < count; i++) {
- int vb = sctx->vertex_elements->vertex_buffer_index[i];
+ for (i = 0; i < count; i++) {
+ int vb = sctx->vertex_elements->vertex_buffer_index[i];
- if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
- continue;
- if (!sctx->vertex_buffer[vb].buffer.resource)
- continue;
+ if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
+ continue;
+ if (!sctx->vertex_buffer[vb].buffer.resource)
+ continue;
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- si_resource(sctx->vertex_buffer[vb].buffer.resource),
- RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
- }
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
+ si_resource(sctx->vertex_buffer[vb].buffer.resource),
+ RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
+ }
- if (!sctx->vb_descriptors_buffer)
- return;
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
- RADEON_PRIO_DESCRIPTORS);
+ if (!sctx->vb_descriptors_buffer)
+ return;
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
+ RADEON_PRIO_DESCRIPTORS);
}
bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
{
- unsigned i, count = sctx->num_vertex_elements;
- uint32_t *ptr;
-
- if (!sctx->vertex_buffers_dirty || !count)
- return true;
-
- struct si_vertex_elements *velems = sctx->vertex_elements;
- unsigned alloc_size = velems->vb_desc_list_alloc_size;
-
- if (alloc_size) {
- /* Vertex buffer descriptors are the only ones which are uploaded
- * directly through a staging buffer and don't go through
- * the fine-grained upload path.
- */
- u_upload_alloc(sctx->b.const_uploader, 0,
- alloc_size,
- si_optimal_tcc_alignment(sctx, alloc_size),
- &sctx->vb_descriptors_offset,
- (struct pipe_resource**)&sctx->vb_descriptors_buffer,
- (void**)&ptr);
- if (!sctx->vb_descriptors_buffer) {
- sctx->vb_descriptors_offset = 0;
- sctx->vb_descriptors_gpu_list = NULL;
- return false;
- }
-
- sctx->vb_descriptors_gpu_list = ptr;
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
- RADEON_PRIO_DESCRIPTORS);
- sctx->vertex_buffer_pointer_dirty = true;
- sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
- } else {
- si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
- sctx->vertex_buffer_pointer_dirty = false;
- sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS;
- }
-
- assert(count <= SI_MAX_ATTRIBS);
-
- unsigned first_vb_use_mask = velems->first_vb_use_mask;
- unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs;
-
- for (i = 0; i < count; i++) {
- struct pipe_vertex_buffer *vb;
- struct si_resource *buf;
- unsigned vbo_index = velems->vertex_buffer_index[i];
- uint32_t *desc = i < num_vbos_in_user_sgprs ?
- &sctx->vb_descriptor_user_sgprs[i * 4] :
- &ptr[(i - num_vbos_in_user_sgprs) * 4];
-
- vb = &sctx->vertex_buffer[vbo_index];
- buf = si_resource(vb->buffer.resource);
- if (!buf) {
- memset(desc, 0, 16);
- continue;
- }
-
- int64_t offset = (int64_t)((int)vb->buffer_offset) +
- velems->src_offset[i];
-
- if (offset >= buf->b.b.width0) {
- assert(offset < buf->b.b.width0);
- memset(desc, 0, 16);
- continue;
- }
-
- uint64_t va = buf->gpu_address + offset;
-
- int64_t num_records = (int64_t)buf->b.b.width0 - offset;
- if (sctx->chip_class != GFX8 && vb->stride) {
- /* Round up by rounding down and adding 1 */
- num_records = (num_records - velems->format_size[i]) /
- vb->stride + 1;
- }
- assert(num_records >= 0 && num_records <= UINT_MAX);
-
- uint32_t rsrc_word3 = velems->rsrc_word3[i];
-
- /* OOB_SELECT chooses the out-of-bounds check:
- * - 1: index >= NUM_RECORDS (Structured)
- * - 3: offset >= NUM_RECORDS (Raw)
- */
- if (sctx->chip_class >= GFX10)
- rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW);
-
- desc[0] = va;
- desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
- S_008F04_STRIDE(vb->stride);
- desc[2] = num_records;
- desc[3] = rsrc_word3;
-
- if (first_vb_use_mask & (1 << i)) {
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- si_resource(vb->buffer.resource),
- RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
- }
- }
-
- /* Don't flush the const cache. It would have a very negative effect
- * on performance (confirmed by testing). New descriptors are always
- * uploaded to a fresh new buffer, so I don't think flushing the const
- * cache is needed. */
- si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
- sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0;
- sctx->vertex_buffers_dirty = false;
- return true;
+ unsigned i, count = sctx->num_vertex_elements;
+ uint32_t *ptr;
+
+ if (!sctx->vertex_buffers_dirty || !count)
+ return true;
+
+ struct si_vertex_elements *velems = sctx->vertex_elements;
+ unsigned alloc_size = velems->vb_desc_list_alloc_size;
+
+ if (alloc_size) {
+ /* Vertex buffer descriptors are the only ones which are uploaded
+ * directly through a staging buffer and don't go through
+ * the fine-grained upload path.
+ */
+ u_upload_alloc(sctx->b.const_uploader, 0, alloc_size,
+ si_optimal_tcc_alignment(sctx, alloc_size), &sctx->vb_descriptors_offset,
+ (struct pipe_resource **)&sctx->vb_descriptors_buffer, (void **)&ptr);
+ if (!sctx->vb_descriptors_buffer) {
+ sctx->vb_descriptors_offset = 0;
+ sctx->vb_descriptors_gpu_list = NULL;
+ return false;
+ }
+
+ sctx->vb_descriptors_gpu_list = ptr;
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->vb_descriptors_buffer, RADEON_USAGE_READ,
+ RADEON_PRIO_DESCRIPTORS);
+ sctx->vertex_buffer_pointer_dirty = true;
+ sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
+ } else {
+ si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
+ sctx->vertex_buffer_pointer_dirty = false;
+ sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS;
+ }
+
+ assert(count <= SI_MAX_ATTRIBS);
+
+ unsigned first_vb_use_mask = velems->first_vb_use_mask;
+ unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs;
+
+ for (i = 0; i < count; i++) {
+ struct pipe_vertex_buffer *vb;
+ struct si_resource *buf;
+ unsigned vbo_index = velems->vertex_buffer_index[i];
+ uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4]
+ : &ptr[(i - num_vbos_in_user_sgprs) * 4];
+
+ vb = &sctx->vertex_buffer[vbo_index];
+ buf = si_resource(vb->buffer.resource);
+ if (!buf) {
+ memset(desc, 0, 16);
+ continue;
+ }
+
+ int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[i];
+
+ if (offset >= buf->b.b.width0) {
+ assert(offset < buf->b.b.width0);
+ memset(desc, 0, 16);
+ continue;
+ }
+
+ uint64_t va = buf->gpu_address + offset;
+
+ int64_t num_records = (int64_t)buf->b.b.width0 - offset;
+ if (sctx->chip_class != GFX8 && vb->stride) {
+ /* Round up by rounding down and adding 1 */
+ num_records = (num_records - velems->format_size[i]) / vb->stride + 1;
+ }
+ assert(num_records >= 0 && num_records <= UINT_MAX);
+
+ uint32_t rsrc_word3 = velems->rsrc_word3[i];
+
+ /* OOB_SELECT chooses the out-of-bounds check:
+ * - 1: index >= NUM_RECORDS (Structured)
+ * - 3: offset >= NUM_RECORDS (Raw)
+ */
+ if (sctx->chip_class >= GFX10)
+ rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED
+ : V_008F0C_OOB_SELECT_RAW);
+
+ desc[0] = va;
+ desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride);
+ desc[2] = num_records;
+ desc[3] = rsrc_word3;
+
+ if (first_vb_use_mask & (1 << i)) {
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(vb->buffer.resource),
+ RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
+ }
+ }
+
+ /* Don't flush the const cache. It would have a very negative effect
+ * on performance (confirmed by testing). New descriptors are always
+ * uploaded to a fresh new buffer, so I don't think flushing the const
+ * cache is needed. */
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+ sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0;
+ sctx->vertex_buffers_dirty = false;
+ return true;
}
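/* Illustrative sketch (not driver code): the "round up by rounding down and
* adding 1" arithmetic above converts the bytes remaining in a vertex buffer
* into a record (vertex) count. vb_num_records is a local name invented for
* this example; only the quantities used by the code above are assumed.
*/
#include <assert.h>
#include <stdint.h>

static uint32_t vb_num_records(int64_t bytes_left, unsigned stride, unsigned format_size)
{
   /* The last vertex only needs format_size bytes; the stride padding after
    * it is not required, hence floor((bytes_left - format_size) / stride) + 1. */
   assert(stride != 0 && bytes_left >= format_size);
   return (uint32_t)((bytes_left - (int64_t)format_size) / stride + 1);
}

/* Example: 100 bytes left, stride 16, 12-byte format gives floor(88/16) + 1 = 6.
* Vertex 5 (0-based) starts at byte 80 and ends at byte 92 <= 100, while a 7th
* vertex would start at byte 96 and end at 108 > 100, so 6 records is exact.
*/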
-
/* CONSTANT BUFFERS */
-static struct si_descriptors *
-si_const_and_shader_buffer_descriptors(struct si_context *sctx, unsigned shader)
-{
- return &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(shader)];
-}
-
-void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf,
- const uint8_t *ptr, unsigned size, uint32_t *const_offset)
-{
- void *tmp;
-
- u_upload_alloc(sctx->b.const_uploader, 0, size,
- si_optimal_tcc_alignment(sctx, size),
- const_offset,
- (struct pipe_resource**)buf, &tmp);
- if (*buf)
- util_memcpy_cpu_to_le32(tmp, ptr, size);
-}
-
-static void si_set_constant_buffer(struct si_context *sctx,
- struct si_buffer_resources *buffers,
- unsigned descriptors_idx,
- uint slot, const struct pipe_constant_buffer *input)
-{
- struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
- assert(slot < descs->num_elements);
- pipe_resource_reference(&buffers->buffers[slot], NULL);
-
- /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
- * with a NULL buffer). We need to use a dummy buffer instead. */
- if (sctx->chip_class == GFX7 &&
- (!input || (!input->buffer && !input->user_buffer)))
- input = &sctx->null_const_buf;
-
- if (input && (input->buffer || input->user_buffer)) {
- struct pipe_resource *buffer = NULL;
- uint64_t va;
- unsigned buffer_offset;
-
- /* Upload the user buffer if needed. */
- if (input->user_buffer) {
- si_upload_const_buffer(sctx,
- (struct si_resource**)&buffer, input->user_buffer,
- input->buffer_size, &buffer_offset);
- if (!buffer) {
- /* Just unbind on failure. */
- si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL);
- return;
- }
- } else {
- pipe_resource_reference(&buffer, input->buffer);
- buffer_offset = input->buffer_offset;
- }
-
- va = si_resource(buffer)->gpu_address + buffer_offset;
-
- /* Set the descriptor. */
- uint32_t *desc = descs->list + slot*4;
- desc[0] = va;
- desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
- S_008F04_STRIDE(0);
- desc[2] = input->buffer_size;
- desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
- S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
-
- if (sctx->chip_class >= GFX10) {
- desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
- S_008F0C_RESOURCE_LEVEL(1);
- } else {
- desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
- }
-
- buffers->buffers[slot] = buffer;
- buffers->offsets[slot] = buffer_offset;
- radeon_add_to_gfx_buffer_list_check_mem(sctx,
- si_resource(buffer),
- RADEON_USAGE_READ,
- buffers->priority_constbuf, true);
- buffers->enabled_mask |= 1u << slot;
- } else {
- /* Clear the descriptor. */
- memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4);
- buffers->enabled_mask &= ~(1u << slot);
- }
-
- sctx->descriptors_dirty |= 1u << descriptors_idx;
-}
-
-static void si_pipe_set_constant_buffer(struct pipe_context *ctx,
- enum pipe_shader_type shader, uint slot,
- const struct pipe_constant_buffer *input)
-{
- struct si_context *sctx = (struct si_context *)ctx;
-
- if (shader >= SI_NUM_SHADERS)
- return;
-
- if (slot == 0 && input && input->buffer &&
- !(si_resource(input->buffer)->flags & RADEON_FLAG_32BIT)) {
- assert(!"constant buffer 0 must have a 32-bit VM address, use const_uploader");
- return;
- }
-
- if (input && input->buffer)
- si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
-
- slot = si_get_constbuf_slot(slot);
- si_set_constant_buffer(sctx, &sctx->const_and_shader_buffers[shader],
- si_const_and_shader_buffer_descriptors_idx(shader),
- slot, input);
-}
-
-void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader,
- uint slot, struct pipe_constant_buffer *cbuf)
-{
- cbuf->user_buffer = NULL;
- si_get_buffer_from_descriptors(
- &sctx->const_and_shader_buffers[shader],
- si_const_and_shader_buffer_descriptors(sctx, shader),
- si_get_constbuf_slot(slot),
- &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size);
+static struct si_descriptors *si_const_and_shader_buffer_descriptors(struct si_context *sctx,
+ unsigned shader)
+{
+ return &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(shader)];
+}
+
+void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf, const uint8_t *ptr,
+ unsigned size, uint32_t *const_offset)
+{
+ void *tmp;
+
+ u_upload_alloc(sctx->b.const_uploader, 0, size, si_optimal_tcc_alignment(sctx, size),
+ const_offset, (struct pipe_resource **)buf, &tmp);
+ if (*buf)
+ util_memcpy_cpu_to_le32(tmp, ptr, size);
+}
+
+static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_resources *buffers,
+ unsigned descriptors_idx, uint slot,
+ const struct pipe_constant_buffer *input)
+{
+ struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
+ assert(slot < descs->num_elements);
+ pipe_resource_reference(&buffers->buffers[slot], NULL);
+
+ /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
+ * with a NULL buffer). We need to use a dummy buffer instead. */
+ if (sctx->chip_class == GFX7 && (!input || (!input->buffer && !input->user_buffer)))
+ input = &sctx->null_const_buf;
+
+ if (input && (input->buffer || input->user_buffer)) {
+ struct pipe_resource *buffer = NULL;
+ uint64_t va;
+ unsigned buffer_offset;
+
+ /* Upload the user buffer if needed. */
+ if (input->user_buffer) {
+ si_upload_const_buffer(sctx, (struct si_resource **)&buffer, input->user_buffer,
+ input->buffer_size, &buffer_offset);
+ if (!buffer) {
+ /* Just unbind on failure. */
+ si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL);
+ return;
+ }
+ } else {
+ pipe_resource_reference(&buffer, input->buffer);
+ buffer_offset = input->buffer_offset;
+ }
+
+ va = si_resource(buffer)->gpu_address + buffer_offset;
+
+ /* Set the descriptor. */
+ uint32_t *desc = descs->list + slot * 4;
+ desc[0] = va;
+ desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0);
+ desc[2] = input->buffer_size;
+ desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+ if (sctx->chip_class >= GFX10) {
+ desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
+ } else {
+ desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+ }
+
+ buffers->buffers[slot] = buffer;
+ buffers->offsets[slot] = buffer_offset;
+ radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ,
+ buffers->priority_constbuf, true);
+ buffers->enabled_mask |= 1u << slot;
+ } else {
+ /* Clear the descriptor. */
+ memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 4);
+ buffers->enabled_mask &= ~(1u << slot);
+ }
+
+ sctx->descriptors_dirty |= 1u << descriptors_idx;
+}
+
+static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shader_type shader,
+ uint slot, const struct pipe_constant_buffer *input)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ if (shader >= SI_NUM_SHADERS)
+ return;
+
+ if (slot == 0 && input && input->buffer &&
+ !(si_resource(input->buffer)->flags & RADEON_FLAG_32BIT)) {
+ assert(!"constant buffer 0 must have a 32-bit VM address, use const_uploader");
+ return;
+ }
+
+ if (input && input->buffer)
+ si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
+
+ slot = si_get_constbuf_slot(slot);
+ si_set_constant_buffer(sctx, &sctx->const_and_shader_buffers[shader],
+ si_const_and_shader_buffer_descriptors_idx(shader), slot, input);
+}
+
+void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot,
+ struct pipe_constant_buffer *cbuf)
+{
+ cbuf->user_buffer = NULL;
+ si_get_buffer_from_descriptors(
+ &sctx->const_and_shader_buffers[shader], si_const_and_shader_buffer_descriptors(sctx, shader),
+ si_get_constbuf_slot(slot), &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size);
}
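/* Illustrative sketch (not driver code): the descriptor that
* si_set_constant_buffer writes is four dwords: a 64-bit GPU address split
* across desc[0] and the low bits of desc[1], the size in bytes in desc[2],
* and format/destination-swizzle controls in desc[3]. Only the address/size
* packing is shown; pack_buffer_desc_addr_size is a local name, and the
* S_008F04_ / S_008F0C_ field encodings are deliberately not reproduced here.
*/
#include <stdint.h>

static void pack_buffer_desc_addr_size(uint32_t desc[4], uint64_t va, uint32_t size_bytes)
{
   desc[0] = (uint32_t)va;                  /* base address, low 32 bits */
   desc[1] = (uint32_t)(va >> 32) & 0xffff; /* high address bits; STRIDE above them stays 0 */
   desc[2] = size_bytes;                    /* NUM_RECORDS is a byte count when STRIDE == 0 */
   desc[3] = 0;                             /* dst_sel / format bits omitted in this sketch */
}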
/* SHADER BUFFERS */
-static void si_set_shader_buffer(struct si_context *sctx,
- struct si_buffer_resources *buffers,
- unsigned descriptors_idx,
- uint slot, const struct pipe_shader_buffer *sbuffer,
- bool writable, enum radeon_bo_priority priority)
-{
- struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
- uint32_t *desc = descs->list + slot * 4;
-
- if (!sbuffer || !sbuffer->buffer) {
- pipe_resource_reference(&buffers->buffers[slot], NULL);
- memset(desc, 0, sizeof(uint32_t) * 4);
- buffers->enabled_mask &= ~(1u << slot);
- buffers->writable_mask &= ~(1u << slot);
- sctx->descriptors_dirty |= 1u << descriptors_idx;
- return;
- }
-
- struct si_resource *buf = si_resource(sbuffer->buffer);
- uint64_t va = buf->gpu_address + sbuffer->buffer_offset;
-
- desc[0] = va;
- desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
- S_008F04_STRIDE(0);
- desc[2] = sbuffer->buffer_size;
- desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
- S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
-
- if (sctx->chip_class >= GFX10) {
- desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
- S_008F0C_RESOURCE_LEVEL(1);
- } else {
- desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
- }
-
- pipe_resource_reference(&buffers->buffers[slot], &buf->b.b);
- buffers->offsets[slot] = sbuffer->buffer_offset;
- radeon_add_to_gfx_buffer_list_check_mem(sctx, buf,
- writable ? RADEON_USAGE_READWRITE :
- RADEON_USAGE_READ,
- priority, true);
- if (writable)
- buffers->writable_mask |= 1u << slot;
- else
- buffers->writable_mask &= ~(1u << slot);
-
- buffers->enabled_mask |= 1u << slot;
- sctx->descriptors_dirty |= 1u << descriptors_idx;
-
- util_range_add(&buf->b.b, &buf->valid_buffer_range, sbuffer->buffer_offset,
- sbuffer->buffer_offset + sbuffer->buffer_size);
-}
-
-static void si_set_shader_buffers(struct pipe_context *ctx,
- enum pipe_shader_type shader,
- unsigned start_slot, unsigned count,
- const struct pipe_shader_buffer *sbuffers,
- unsigned writable_bitmask)
-{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader];
- unsigned descriptors_idx = si_const_and_shader_buffer_descriptors_idx(shader);
- unsigned i;
-
- assert(start_slot + count <= SI_NUM_SHADER_BUFFERS);
-
- for (i = 0; i < count; ++i) {
- const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL;
- unsigned slot = si_get_shaderbuf_slot(start_slot + i);
-
- if (sbuffer && sbuffer->buffer)
- si_resource(sbuffer->buffer)->bind_history |= PIPE_BIND_SHADER_BUFFER;
-
- si_set_shader_buffer(sctx, buffers, descriptors_idx, slot, sbuffer,
- !!(writable_bitmask & (1u << i)),
- buffers->priority);
- }
-}
-
-void si_get_shader_buffers(struct si_context *sctx,
- enum pipe_shader_type shader,
- uint start_slot, uint count,
- struct pipe_shader_buffer *sbuf)
-{
- struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader];
- struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader);
-
- for (unsigned i = 0; i < count; ++i) {
- si_get_buffer_from_descriptors(
- buffers, descs,
- si_get_shaderbuf_slot(start_slot + i),
- &sbuf[i].buffer, &sbuf[i].buffer_offset,
- &sbuf[i].buffer_size);
- }
+static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resources *buffers,
+ unsigned descriptors_idx, uint slot,
+ const struct pipe_shader_buffer *sbuffer, bool writable,
+ enum radeon_bo_priority priority)
+{
+ struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
+ uint32_t *desc = descs->list + slot * 4;
+
+ if (!sbuffer || !sbuffer->buffer) {
+ pipe_resource_reference(&buffers->buffers[slot], NULL);
+ memset(desc, 0, sizeof(uint32_t) * 4);
+ buffers->enabled_mask &= ~(1u << slot);
+ buffers->writable_mask &= ~(1u << slot);
+ sctx->descriptors_dirty |= 1u << descriptors_idx;
+ return;
+ }
+
+ struct si_resource *buf = si_resource(sbuffer->buffer);
+ uint64_t va = buf->gpu_address + sbuffer->buffer_offset;
+
+ desc[0] = va;
+ desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0);
+ desc[2] = sbuffer->buffer_size;
+ desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+ if (sctx->chip_class >= GFX10) {
+ desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
+ } else {
+ desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+ }
+
+ pipe_resource_reference(&buffers->buffers[slot], &buf->b.b);
+ buffers->offsets[slot] = sbuffer->buffer_offset;
+ radeon_add_to_gfx_buffer_list_check_mem(
+ sctx, buf, writable ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, priority, true);
+ if (writable)
+ buffers->writable_mask |= 1u << slot;
+ else
+ buffers->writable_mask &= ~(1u << slot);
+
+ buffers->enabled_mask |= 1u << slot;
+ sctx->descriptors_dirty |= 1u << descriptors_idx;
+
+ util_range_add(&buf->b.b, &buf->valid_buffer_range, sbuffer->buffer_offset,
+ sbuffer->buffer_offset + sbuffer->buffer_size);
+}
+
+static void si_set_shader_buffers(struct pipe_context *ctx, enum pipe_shader_type shader,
+ unsigned start_slot, unsigned count,
+ const struct pipe_shader_buffer *sbuffers,
+ unsigned writable_bitmask)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader];
+ unsigned descriptors_idx = si_const_and_shader_buffer_descriptors_idx(shader);
+ unsigned i;
+
+ assert(start_slot + count <= SI_NUM_SHADER_BUFFERS);
+
+ for (i = 0; i < count; ++i) {
+ const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL;
+ unsigned slot = si_get_shaderbuf_slot(start_slot + i);
+
+ if (sbuffer && sbuffer->buffer)
+ si_resource(sbuffer->buffer)->bind_history |= PIPE_BIND_SHADER_BUFFER;
+
+ si_set_shader_buffer(sctx, buffers, descriptors_idx, slot, sbuffer,
+ !!(writable_bitmask & (1u << i)), buffers->priority);
+ }
+}
+
+void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot,
+ uint count, struct pipe_shader_buffer *sbuf)
+{
+ struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader];
+ struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader);
+
+ for (unsigned i = 0; i < count; ++i) {
+ si_get_buffer_from_descriptors(buffers, descs, si_get_shaderbuf_slot(start_slot + i),
+ &sbuf[i].buffer, &sbuf[i].buffer_offset, &sbuf[i].buffer_size);
+ }
}
/* RING BUFFERS */
-void si_set_rw_buffer(struct si_context *sctx,
- uint slot, const struct pipe_constant_buffer *input)
+void si_set_rw_buffer(struct si_context *sctx, uint slot, const struct pipe_constant_buffer *input)
{
- si_set_constant_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS,
- slot, input);
+ si_set_constant_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, slot, input);
}
void si_set_rw_shader_buffer(struct si_context *sctx, uint slot,
- const struct pipe_shader_buffer *sbuffer)
-{
- si_set_shader_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS,
- slot, sbuffer, true, RADEON_PRIO_SHADER_RW_BUFFER);
-}
-
-void si_set_ring_buffer(struct si_context *sctx, uint slot,
- struct pipe_resource *buffer,
- unsigned stride, unsigned num_records,
- bool add_tid, bool swizzle,
- unsigned element_size, unsigned index_stride, uint64_t offset)
-{
- struct si_buffer_resources *buffers = &sctx->rw_buffers;
- struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
-
- /* The stride field in the resource descriptor has 14 bits */
- assert(stride < (1 << 14));
-
- assert(slot < descs->num_elements);
- pipe_resource_reference(&buffers->buffers[slot], NULL);
-
- if (buffer) {
- uint64_t va;
-
- va = si_resource(buffer)->gpu_address + offset;
-
- switch (element_size) {
- default:
- assert(!"Unsupported ring buffer element size");
- case 0:
- case 2:
- element_size = 0;
- break;
- case 4:
- element_size = 1;
- break;
- case 8:
- element_size = 2;
- break;
- case 16:
- element_size = 3;
- break;
- }
-
- switch (index_stride) {
- default:
- assert(!"Unsupported ring buffer index stride");
- case 0:
- case 8:
- index_stride = 0;
- break;
- case 16:
- index_stride = 1;
- break;
- case 32:
- index_stride = 2;
- break;
- case 64:
- index_stride = 3;
- break;
- }
-
- if (sctx->chip_class >= GFX8 && stride)
- num_records *= stride;
-
- /* Set the descriptor. */
- uint32_t *desc = descs->list + slot*4;
- desc[0] = va;
- desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
- S_008F04_STRIDE(stride) |
- S_008F04_SWIZZLE_ENABLE(swizzle);
- desc[2] = num_records;
- desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
- S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
- S_008F0C_INDEX_STRIDE(index_stride) |
- S_008F0C_ADD_TID_ENABLE(add_tid);
-
- if (sctx->chip_class >= GFX9)
- assert(!swizzle || element_size == 1); /* always 4 bytes on GFX9 */
- else
- desc[3] |= S_008F0C_ELEMENT_SIZE(element_size);
-
- if (sctx->chip_class >= GFX10) {
- desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) |
- S_008F0C_RESOURCE_LEVEL(1);
- } else {
- desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
- }
-
- pipe_resource_reference(&buffers->buffers[slot], buffer);
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- si_resource(buffer),
- RADEON_USAGE_READWRITE, buffers->priority);
- buffers->enabled_mask |= 1u << slot;
- } else {
- /* Clear the descriptor. */
- memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4);
- buffers->enabled_mask &= ~(1u << slot);
- }
-
- sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
+ const struct pipe_shader_buffer *sbuffer)
+{
+ si_set_shader_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, slot, sbuffer, true,
+ RADEON_PRIO_SHADER_RW_BUFFER);
+}
+
+void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource *buffer,
+ unsigned stride, unsigned num_records, bool add_tid, bool swizzle,
+ unsigned element_size, unsigned index_stride, uint64_t offset)
+{
+ struct si_buffer_resources *buffers = &sctx->rw_buffers;
+ struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
+
+ /* The stride field in the resource descriptor has 14 bits */
+ assert(stride < (1 << 14));
+
+ assert(slot < descs->num_elements);
+ pipe_resource_reference(&buffers->buffers[slot], NULL);
+
+ if (buffer) {
+ uint64_t va;
+
+ va = si_resource(buffer)->gpu_address + offset;
+
+ switch (element_size) {
+ default:
+ assert(!"Unsupported ring buffer element size");
+ case 0:
+ case 2:
+ element_size = 0;
+ break;
+ case 4:
+ element_size = 1;
+ break;
+ case 8:
+ element_size = 2;
+ break;
+ case 16:
+ element_size = 3;
+ break;
+ }
+
+ switch (index_stride) {
+ default:
+ assert(!"Unsupported ring buffer index stride");
+ case 0:
+ case 8:
+ index_stride = 0;
+ break;
+ case 16:
+ index_stride = 1;
+ break;
+ case 32:
+ index_stride = 2;
+ break;
+ case 64:
+ index_stride = 3;
+ break;
+ }
+
+ if (sctx->chip_class >= GFX8 && stride)
+ num_records *= stride;
+
+ /* Set the descriptor. */
+ uint32_t *desc = descs->list + slot * 4;
+ desc[0] = va;
+ desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride) |
+ S_008F04_SWIZZLE_ENABLE(swizzle);
+ desc[2] = num_records;
+ desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+ S_008F0C_INDEX_STRIDE(index_stride) | S_008F0C_ADD_TID_ENABLE(add_tid);
+
+ if (sctx->chip_class >= GFX9)
+ assert(!swizzle || element_size == 1); /* always 4 bytes on GFX9 */
+ else
+ desc[3] |= S_008F0C_ELEMENT_SIZE(element_size);
+
+ if (sctx->chip_class >= GFX10) {
+ desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
+ } else {
+ desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+ }
+
+ pipe_resource_reference(&buffers->buffers[slot], buffer);
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(buffer), RADEON_USAGE_READWRITE,
+ buffers->priority);
+ buffers->enabled_mask |= 1u << slot;
+ } else {
+ /* Clear the descriptor. */
+ memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 4);
+ buffers->enabled_mask &= ~(1u << slot);
+ }
+
+ sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
}
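/* Illustrative sketch (not driver code): the two switch statements above map
* power-of-two byte sizes onto 2-bit descriptor fields (ELEMENT_SIZE: 2/4/8/16
* bytes -> 0..3, INDEX_STRIDE: 8/16/32/64 bytes -> 0..3). encode_pow2_field is
* a local helper showing the same mapping as a log2-style lookup; the driver's
* extra "0 means the smallest encoding" case is not handled here.
*/
#include <assert.h>

static unsigned encode_pow2_field(unsigned bytes, unsigned smallest)
{
   unsigned code = 0;

   assert(bytes >= smallest && (bytes & (bytes - 1)) == 0);
   while ((smallest << code) != bytes)
      code++;
   assert(code <= 3);
   return code;
}

/* encode_pow2_field(8, 2) == 2, matching ELEMENT_SIZE for 8-byte elements;
* encode_pow2_field(32, 8) == 2, matching INDEX_STRIDE for a 32-byte stride.
*/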
/* INTERNAL CONST BUFFERS */
-static void si_set_polygon_stipple(struct pipe_context *ctx,
- const struct pipe_poly_stipple *state)
+static void si_set_polygon_stipple(struct pipe_context *ctx, const struct pipe_poly_stipple *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct pipe_constant_buffer cb = {};
- unsigned stipple[32];
- int i;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct pipe_constant_buffer cb = {};
+ unsigned stipple[32];
+ int i;
- for (i = 0; i < 32; i++)
- stipple[i] = util_bitreverse(state->stipple[i]);
+ for (i = 0; i < 32; i++)
+ stipple[i] = util_bitreverse(state->stipple[i]);
- cb.user_buffer = stipple;
- cb.buffer_size = sizeof(stipple);
+ cb.user_buffer = stipple;
+ cb.buffer_size = sizeof(stipple);
- si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb);
+ si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb);
}
/* TEXTURE METADATA ENABLE/DISABLE */
-static void
-si_resident_handles_update_needs_color_decompress(struct si_context *sctx)
+static void si_resident_handles_update_needs_color_decompress(struct si_context *sctx)
{
- util_dynarray_clear(&sctx->resident_tex_needs_color_decompress);
- util_dynarray_clear(&sctx->resident_img_needs_color_decompress);
+ util_dynarray_clear(&sctx->resident_tex_needs_color_decompress);
+ util_dynarray_clear(&sctx->resident_img_needs_color_decompress);
- util_dynarray_foreach(&sctx->resident_tex_handles,
- struct si_texture_handle *, tex_handle) {
- struct pipe_resource *res = (*tex_handle)->view->texture;
- struct si_texture *tex;
+ util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
+ struct pipe_resource *res = (*tex_handle)->view->texture;
+ struct si_texture *tex;
- if (!res || res->target == PIPE_BUFFER)
- continue;
+ if (!res || res->target == PIPE_BUFFER)
+ continue;
- tex = (struct si_texture *)res;
- if (!color_needs_decompression(tex))
- continue;
+ tex = (struct si_texture *)res;
+ if (!color_needs_decompression(tex))
+ continue;
- util_dynarray_append(&sctx->resident_tex_needs_color_decompress,
- struct si_texture_handle *, *tex_handle);
- }
+ util_dynarray_append(&sctx->resident_tex_needs_color_decompress, struct si_texture_handle *,
+ *tex_handle);
+ }
- util_dynarray_foreach(&sctx->resident_img_handles,
- struct si_image_handle *, img_handle) {
- struct pipe_image_view *view = &(*img_handle)->view;
- struct pipe_resource *res = view->resource;
- struct si_texture *tex;
+ util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
+ struct pipe_image_view *view = &(*img_handle)->view;
+ struct pipe_resource *res = view->resource;
+ struct si_texture *tex;
- if (!res || res->target == PIPE_BUFFER)
- continue;
+ if (!res || res->target == PIPE_BUFFER)
+ continue;
- tex = (struct si_texture *)res;
- if (!color_needs_decompression(tex))
- continue;
+ tex = (struct si_texture *)res;
+ if (!color_needs_decompression(tex))
+ continue;
- util_dynarray_append(&sctx->resident_img_needs_color_decompress,
- struct si_image_handle *, *img_handle);
- }
+ util_dynarray_append(&sctx->resident_img_needs_color_decompress, struct si_image_handle *,
+ *img_handle);
+ }
}
/* CMASK can be enabled (for fast clear) and disabled (for texture export)
* while the texture is bound, possibly by a different context. In that case,
* call this function to update needs_*_decompress_masks.
*/
void si_update_needs_color_decompress_masks(struct si_context *sctx)
{
- for (int i = 0; i < SI_NUM_SHADERS; ++i) {
- si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]);
- si_images_update_needs_color_decompress_mask(&sctx->images[i]);
- si_update_shader_needs_decompress_mask(sctx, i);
- }
+ for (int i = 0; i < SI_NUM_SHADERS; ++i) {
+ si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]);
+ si_images_update_needs_color_decompress_mask(&sctx->images[i]);
+ si_update_shader_needs_decompress_mask(sctx, i);
+ }
- si_resident_handles_update_needs_color_decompress(sctx);
+ si_resident_handles_update_needs_color_decompress(sctx);
}
/* BUFFER DISCARD/INVALIDATION */
/* Reset descriptors of buffer resources after \p buf has been invalidated.
* If buf == NULL, reset all descriptors.
*/
-static void si_reset_buffer_resources(struct si_context *sctx,
- struct si_buffer_resources *buffers,
- unsigned descriptors_idx,
- unsigned slot_mask,
- struct pipe_resource *buf,
- enum radeon_bo_priority priority)
-{
- struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
- unsigned mask = buffers->enabled_mask & slot_mask;
-
- while (mask) {
- unsigned i = u_bit_scan(&mask);
- struct pipe_resource *buffer = buffers->buffers[i];
-
- if (buffer && (!buf || buffer == buf)) {
- si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i],
- descs->list + i*4);
- sctx->descriptors_dirty |= 1u << descriptors_idx;
-
- radeon_add_to_gfx_buffer_list_check_mem(sctx,
- si_resource(buffer),
- buffers->writable_mask & (1u << i) ?
- RADEON_USAGE_READWRITE :
- RADEON_USAGE_READ,
- priority, true);
- }
- }
+static void si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_resources *buffers,
+ unsigned descriptors_idx, unsigned slot_mask,
+ struct pipe_resource *buf, enum radeon_bo_priority priority)
+{
+ struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
+ unsigned mask = buffers->enabled_mask & slot_mask;
+
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+ struct pipe_resource *buffer = buffers->buffers[i];
+
+ if (buffer && (!buf || buffer == buf)) {
+ si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], descs->list + i * 4);
+ sctx->descriptors_dirty |= 1u << descriptors_idx;
+
+ radeon_add_to_gfx_buffer_list_check_mem(
+ sctx, si_resource(buffer),
+ buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ,
+ priority, true);
+ }
+ }
}
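/* Illustrative sketch (not driver code): si_reset_buffer_resources and the
* rebind loops below walk a bitmask of enabled slots with u_bit_scan, visiting
* each set bit exactly once. bit_scan_lowest is a local stand-in assumed to
* behave like Mesa's u_bit_scan, written with the GCC/Clang ctz builtin.
*/
static inline unsigned bit_scan_lowest(unsigned *mask)
{
   unsigned i = (unsigned)__builtin_ctz(*mask); /* index of the lowest set bit */
   *mask &= *mask - 1;                          /* clear that bit */
   return i;
}

/* Typical use, mirroring the loops in this file:
*
*   unsigned mask = enabled_mask & slot_mask;
*   while (mask) {
*      unsigned slot = bit_scan_lowest(&mask);
*      ... update the descriptor for "slot" ...
*   }
*/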
/* Update all buffer bindings where the buffer is bound, including
* all resource descriptors. This is invalidate_buffer without
* the invalidation.
*
* If buf == NULL, update all buffer bindings.
*/
void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
{
- struct si_resource *buffer = si_resource(buf);
- unsigned i, shader;
- unsigned num_elems = sctx->num_vertex_elements;
-
- /* We changed the buffer, now we need to bind it where the old one
- * was bound. This consists of 2 things:
- * 1) Updating the resource descriptor and dirtying it.
- * 2) Adding a relocation to the CS, so that it's usable.
- */
-
- /* Vertex buffers. */
- if (!buffer) {
- if (num_elems)
- sctx->vertex_buffers_dirty = true;
- } else if (buffer->bind_history & PIPE_BIND_VERTEX_BUFFER) {
- for (i = 0; i < num_elems; i++) {
- int vb = sctx->vertex_elements->vertex_buffer_index[i];
-
- if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
- continue;
- if (!sctx->vertex_buffer[vb].buffer.resource)
- continue;
-
- if (sctx->vertex_buffer[vb].buffer.resource == buf) {
- sctx->vertex_buffers_dirty = true;
- break;
- }
- }
- }
-
- /* Streamout buffers. (other internal buffers can't be invalidated) */
- if (!buffer || buffer->bind_history & PIPE_BIND_STREAM_OUTPUT) {
- for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) {
- struct si_buffer_resources *buffers = &sctx->rw_buffers;
- struct si_descriptors *descs =
- &sctx->descriptors[SI_DESCS_RW_BUFFERS];
- struct pipe_resource *buffer = buffers->buffers[i];
-
- if (!buffer || (buf && buffer != buf))
- continue;
-
- si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i],
- descs->list + i*4);
- sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
-
- radeon_add_to_gfx_buffer_list_check_mem(sctx,
- si_resource(buffer),
- RADEON_USAGE_WRITE,
- RADEON_PRIO_SHADER_RW_BUFFER,
- true);
-
- /* Update the streamout state. */
- if (sctx->streamout.begin_emitted)
- si_emit_streamout_end(sctx);
- sctx->streamout.append_bitmask =
- sctx->streamout.enabled_mask;
- si_streamout_buffers_dirty(sctx);
- }
- }
-
- /* Constant and shader buffers. */
- if (!buffer || buffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
- for (shader = 0; shader < SI_NUM_SHADERS; shader++)
- si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
- si_const_and_shader_buffer_descriptors_idx(shader),
- u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS),
- buf,
- sctx->const_and_shader_buffers[shader].priority_constbuf);
- }
-
- if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) {
- for (shader = 0; shader < SI_NUM_SHADERS; shader++)
- si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
- si_const_and_shader_buffer_descriptors_idx(shader),
- u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS),
- buf,
- sctx->const_and_shader_buffers[shader].priority);
- }
-
- if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) {
- /* Texture buffers - update bindings. */
- for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
- struct si_samplers *samplers = &sctx->samplers[shader];
- struct si_descriptors *descs =
- si_sampler_and_image_descriptors(sctx, shader);
- unsigned mask = samplers->enabled_mask;
-
- while (mask) {
- unsigned i = u_bit_scan(&mask);
- struct pipe_resource *buffer = samplers->views[i]->texture;
-
- if (buffer && buffer->target == PIPE_BUFFER &&
- (!buf || buffer == buf)) {
- unsigned desc_slot = si_get_sampler_slot(i);
-
- si_set_buf_desc_address(si_resource(buffer),
- samplers->views[i]->u.buf.offset,
- descs->list + desc_slot * 16 + 4);
- sctx->descriptors_dirty |=
- 1u << si_sampler_and_image_descriptors_idx(shader);
-
- radeon_add_to_gfx_buffer_list_check_mem(
- sctx, si_resource(buffer),
- RADEON_USAGE_READ,
- RADEON_PRIO_SAMPLER_BUFFER, true);
- }
- }
- }
- }
-
- /* Shader images */
- if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_IMAGE) {
- for (shader = 0; shader < SI_NUM_SHADERS; ++shader) {
- struct si_images *images = &sctx->images[shader];
- struct si_descriptors *descs =
- si_sampler_and_image_descriptors(sctx, shader);
- unsigned mask = images->enabled_mask;
-
- while (mask) {
- unsigned i = u_bit_scan(&mask);
- struct pipe_resource *buffer = images->views[i].resource;
-
- if (buffer && buffer->target == PIPE_BUFFER &&
- (!buf || buffer == buf)) {
- unsigned desc_slot = si_get_image_slot(i);
-
- if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE)
- si_mark_image_range_valid(&images->views[i]);
-
- si_set_buf_desc_address(si_resource(buffer),
- images->views[i].u.buf.offset,
- descs->list + desc_slot * 8 + 4);
- sctx->descriptors_dirty |=
- 1u << si_sampler_and_image_descriptors_idx(shader);
-
- radeon_add_to_gfx_buffer_list_check_mem(
- sctx, si_resource(buffer),
- RADEON_USAGE_READWRITE,
- RADEON_PRIO_SAMPLER_BUFFER, true);
- }
- }
- }
- }
-
- /* Bindless texture handles */
- if (!buffer || buffer->texture_handle_allocated) {
- struct si_descriptors *descs = &sctx->bindless_descriptors;
-
- util_dynarray_foreach(&sctx->resident_tex_handles,
- struct si_texture_handle *, tex_handle) {
- struct pipe_sampler_view *view = (*tex_handle)->view;
- unsigned desc_slot = (*tex_handle)->desc_slot;
- struct pipe_resource *buffer = view->texture;
-
- if (buffer && buffer->target == PIPE_BUFFER &&
- (!buf || buffer == buf)) {
- si_set_buf_desc_address(si_resource(buffer),
- view->u.buf.offset,
- descs->list +
- desc_slot * 16 + 4);
-
- (*tex_handle)->desc_dirty = true;
- sctx->bindless_descriptors_dirty = true;
-
- radeon_add_to_gfx_buffer_list_check_mem(
- sctx, si_resource(buffer),
- RADEON_USAGE_READ,
- RADEON_PRIO_SAMPLER_BUFFER, true);
- }
- }
- }
-
- /* Bindless image handles */
- if (!buffer || buffer->image_handle_allocated) {
- struct si_descriptors *descs = &sctx->bindless_descriptors;
-
- util_dynarray_foreach(&sctx->resident_img_handles,
- struct si_image_handle *, img_handle) {
- struct pipe_image_view *view = &(*img_handle)->view;
- unsigned desc_slot = (*img_handle)->desc_slot;
- struct pipe_resource *buffer = view->resource;
-
- if (buffer && buffer->target == PIPE_BUFFER &&
- (!buf || buffer == buf)) {
- if (view->access & PIPE_IMAGE_ACCESS_WRITE)
- si_mark_image_range_valid(view);
-
- si_set_buf_desc_address(si_resource(buffer),
- view->u.buf.offset,
- descs->list +
- desc_slot * 16 + 4);
-
- (*img_handle)->desc_dirty = true;
- sctx->bindless_descriptors_dirty = true;
-
- radeon_add_to_gfx_buffer_list_check_mem(
- sctx, si_resource(buffer),
- RADEON_USAGE_READWRITE,
- RADEON_PRIO_SAMPLER_BUFFER, true);
- }
- }
- }
-
- if (buffer) {
- /* Do the same for other contexts. They will invoke this function
- * with buffer == NULL.
- */
- unsigned new_counter = p_atomic_inc_return(&sctx->screen->dirty_buf_counter);
-
- /* Skip the update for the current context, because we have already updated
- * the buffer bindings.
- */
- if (new_counter == sctx->last_dirty_buf_counter + 1)
- sctx->last_dirty_buf_counter = new_counter;
- }
-}
-
-static void si_upload_bindless_descriptor(struct si_context *sctx,
- unsigned desc_slot,
- unsigned num_dwords)
-{
- struct si_descriptors *desc = &sctx->bindless_descriptors;
- unsigned desc_slot_offset = desc_slot * 16;
- uint32_t *data;
- uint64_t va;
-
- data = desc->list + desc_slot_offset;
- va = desc->gpu_address + desc_slot_offset * 4;
-
- si_cp_write_data(sctx, desc->buffer, va - desc->buffer->gpu_address,
- num_dwords * 4, V_370_TC_L2, V_370_ME, data);
+ struct si_resource *buffer = si_resource(buf);
+ unsigned i, shader;
+ unsigned num_elems = sctx->num_vertex_elements;
+
+ /* We changed the buffer, now we need to bind it where the old one
+ * was bound. This consists of 2 things:
+ * 1) Updating the resource descriptor and dirtying it.
+ * 2) Adding a relocation to the CS, so that it's usable.
+ */
+
+ /* Vertex buffers. */
+ if (!buffer) {
+ if (num_elems)
+ sctx->vertex_buffers_dirty = true;
+ } else if (buffer->bind_history & PIPE_BIND_VERTEX_BUFFER) {
+ for (i = 0; i < num_elems; i++) {
+ int vb = sctx->vertex_elements->vertex_buffer_index[i];
+
+ if (vb >= ARRAY_SIZE(sctx->vertex_buffer))
+ continue;
+ if (!sctx->vertex_buffer[vb].buffer.resource)
+ continue;
+
+ if (sctx->vertex_buffer[vb].buffer.resource == buf) {
+ sctx->vertex_buffers_dirty = true;
+ break;
+ }
+ }
+ }
+
+ /* Streamout buffers. (other internal buffers can't be invalidated) */
+ if (!buffer || buffer->bind_history & PIPE_BIND_STREAM_OUTPUT) {
+ for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) {
+ struct si_buffer_resources *buffers = &sctx->rw_buffers;
+ struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
+ struct pipe_resource *buffer = buffers->buffers[i];
+
+ if (!buffer || (buf && buffer != buf))
+ continue;
+
+ si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], descs->list + i * 4);
+ sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
+
+ radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_WRITE,
+ RADEON_PRIO_SHADER_RW_BUFFER, true);
+
+ /* Update the streamout state. */
+ if (sctx->streamout.begin_emitted)
+ si_emit_streamout_end(sctx);
+ sctx->streamout.append_bitmask = sctx->streamout.enabled_mask;
+ si_streamout_buffers_dirty(sctx);
+ }
+ }
+
+ /* Constant and shader buffers. */
+ if (!buffer || buffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
+ for (shader = 0; shader < SI_NUM_SHADERS; shader++)
+ si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
+ si_const_and_shader_buffer_descriptors_idx(shader),
+ u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS),
+ buf, sctx->const_and_shader_buffers[shader].priority_constbuf);
+ }
+
+ if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) {
+ for (shader = 0; shader < SI_NUM_SHADERS; shader++)
+ si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
+ si_const_and_shader_buffer_descriptors_idx(shader),
+ u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS), buf,
+ sctx->const_and_shader_buffers[shader].priority);
+ }
+
+ if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) {
+ /* Texture buffers - update bindings. */
+ for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
+ struct si_samplers *samplers = &sctx->samplers[shader];
+ struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader);
+ unsigned mask = samplers->enabled_mask;
+
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+ struct pipe_resource *buffer = samplers->views[i]->texture;
+
+ if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) {
+ unsigned desc_slot = si_get_sampler_slot(i);
+
+ si_set_buf_desc_address(si_resource(buffer), samplers->views[i]->u.buf.offset,
+ descs->list + desc_slot * 16 + 4);
+ sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+
+ radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ,
+ RADEON_PRIO_SAMPLER_BUFFER, true);
+ }
+ }
+ }
+ }
+
+ /* Shader images */
+ if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_IMAGE) {
+ for (shader = 0; shader < SI_NUM_SHADERS; ++shader) {
+ struct si_images *images = &sctx->images[shader];
+ struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader);
+ unsigned mask = images->enabled_mask;
+
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+ struct pipe_resource *buffer = images->views[i].resource;
+
+ if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) {
+ unsigned desc_slot = si_get_image_slot(i);
+
+ if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE)
+ si_mark_image_range_valid(&images->views[i]);
+
+ si_set_buf_desc_address(si_resource(buffer), images->views[i].u.buf.offset,
+ descs->list + desc_slot * 8 + 4);
+ sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
+
+ radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer),
+ RADEON_USAGE_READWRITE,
+ RADEON_PRIO_SAMPLER_BUFFER, true);
+ }
+ }
+ }
+ }
+
+ /* Bindless texture handles */
+ if (!buffer || buffer->texture_handle_allocated) {
+ struct si_descriptors *descs = &sctx->bindless_descriptors;
+
+ util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
+ struct pipe_sampler_view *view = (*tex_handle)->view;
+ unsigned desc_slot = (*tex_handle)->desc_slot;
+ struct pipe_resource *buffer = view->texture;
+
+ if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) {
+ si_set_buf_desc_address(si_resource(buffer), view->u.buf.offset,
+ descs->list + desc_slot * 16 + 4);
+
+ (*tex_handle)->desc_dirty = true;
+ sctx->bindless_descriptors_dirty = true;
+
+ radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ,
+ RADEON_PRIO_SAMPLER_BUFFER, true);
+ }
+ }
+ }
+
+ /* Bindless image handles */
+ if (!buffer || buffer->image_handle_allocated) {
+ struct si_descriptors *descs = &sctx->bindless_descriptors;
+
+ util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
+ struct pipe_image_view *view = &(*img_handle)->view;
+ unsigned desc_slot = (*img_handle)->desc_slot;
+ struct pipe_resource *buffer = view->resource;
+
+ if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) {
+ if (view->access & PIPE_IMAGE_ACCESS_WRITE)
+ si_mark_image_range_valid(view);
+
+ si_set_buf_desc_address(si_resource(buffer), view->u.buf.offset,
+ descs->list + desc_slot * 16 + 4);
+
+ (*img_handle)->desc_dirty = true;
+ sctx->bindless_descriptors_dirty = true;
+
+ radeon_add_to_gfx_buffer_list_check_mem(
+ sctx, si_resource(buffer), RADEON_USAGE_READWRITE, RADEON_PRIO_SAMPLER_BUFFER, true);
+ }
+ }
+ }
+
+ if (buffer) {
+ /* Do the same for other contexts. They will invoke this function
+ * with buffer == NULL.
+ */
+ unsigned new_counter = p_atomic_inc_return(&sctx->screen->dirty_buf_counter);
+
+ /* Skip the update for the current context, because we have already updated
+ * the buffer bindings.
+ */
+ if (new_counter == sctx->last_dirty_buf_counter + 1)
+ sctx->last_dirty_buf_counter = new_counter;
+ }
+}
+
+static void si_upload_bindless_descriptor(struct si_context *sctx, unsigned desc_slot,
+ unsigned num_dwords)
+{
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
+ unsigned desc_slot_offset = desc_slot * 16;
+ uint32_t *data;
+ uint64_t va;
+
+ data = desc->list + desc_slot_offset;
+ va = desc->gpu_address + desc_slot_offset * 4;
+
+ si_cp_write_data(sctx, desc->buffer, va - desc->buffer->gpu_address, num_dwords * 4, V_370_TC_L2,
+ V_370_ME, data);
}
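/* Illustrative sketch (not driver code): bindless descriptor slots are laid
* out as 16-dword records, so a slot index becomes a dword offset (slot * 16)
* into desc->list and a byte offset (dword offset * 4) from desc->gpu_address,
* which is exactly the va computed above. bindless_desc_gpu_va is a local name.
*/
#include <stdint.h>

static uint64_t bindless_desc_gpu_va(uint64_t list_gpu_address, unsigned desc_slot)
{
   unsigned dword_offset = desc_slot * 16;               /* 16 dwords per slot */
   return list_gpu_address + (uint64_t)dword_offset * 4; /* 4 bytes per dword */
}

/* Example: slot 3 starts 3 * 16 = 48 dwords, i.e. 192 bytes, into the list. */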
static void si_upload_bindless_descriptors(struct si_context *sctx)
{
- if (!sctx->bindless_descriptors_dirty)
- return;
+ if (!sctx->bindless_descriptors_dirty)
+ return;
- /* Wait for graphics/compute to be idle before updating the resident
- * descriptors directly in memory, in case the GPU is using them.
- */
- sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH;
- sctx->emit_cache_flush(sctx);
+ /* Wait for graphics/compute to be idle before updating the resident
+ * descriptors directly in memory, in case the GPU is using them.
+ */
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+ sctx->emit_cache_flush(sctx);
- util_dynarray_foreach(&sctx->resident_tex_handles,
- struct si_texture_handle *, tex_handle) {
- unsigned desc_slot = (*tex_handle)->desc_slot;
+ util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
+ unsigned desc_slot = (*tex_handle)->desc_slot;
- if (!(*tex_handle)->desc_dirty)
- continue;
+ if (!(*tex_handle)->desc_dirty)
+ continue;
- si_upload_bindless_descriptor(sctx, desc_slot, 16);
- (*tex_handle)->desc_dirty = false;
- }
+ si_upload_bindless_descriptor(sctx, desc_slot, 16);
+ (*tex_handle)->desc_dirty = false;
+ }
- util_dynarray_foreach(&sctx->resident_img_handles,
- struct si_image_handle *, img_handle) {
- unsigned desc_slot = (*img_handle)->desc_slot;
+ util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
+ unsigned desc_slot = (*img_handle)->desc_slot;
- if (!(*img_handle)->desc_dirty)
- continue;
+ if (!(*img_handle)->desc_dirty)
+ continue;
- si_upload_bindless_descriptor(sctx, desc_slot, 8);
- (*img_handle)->desc_dirty = false;
- }
+ si_upload_bindless_descriptor(sctx, desc_slot, 8);
+ (*img_handle)->desc_dirty = false;
+ }
- /* Invalidate L1 because it doesn't know that L2 changed. */
- sctx->flags |= SI_CONTEXT_INV_SCACHE;
- sctx->emit_cache_flush(sctx);
+ /* Invalidate L1 because it doesn't know that L2 changed. */
+ sctx->flags |= SI_CONTEXT_INV_SCACHE;
+ sctx->emit_cache_flush(sctx);
- sctx->bindless_descriptors_dirty = false;
+ sctx->bindless_descriptors_dirty = false;
}
/* Update mutable image descriptor fields of all resident textures. */
static void si_update_bindless_texture_descriptor(struct si_context *sctx,
- struct si_texture_handle *tex_handle)
+ struct si_texture_handle *tex_handle)
{
- struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view;
- struct si_descriptors *desc = &sctx->bindless_descriptors;
- unsigned desc_slot_offset = tex_handle->desc_slot * 16;
- uint32_t desc_list[16];
+ struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view;
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
+ unsigned desc_slot_offset = tex_handle->desc_slot * 16;
+ uint32_t desc_list[16];
- if (sview->base.texture->target == PIPE_BUFFER)
- return;
+ if (sview->base.texture->target == PIPE_BUFFER)
+ return;
- memcpy(desc_list, desc->list + desc_slot_offset, sizeof(desc_list));
- si_set_sampler_view_desc(sctx, sview, &tex_handle->sstate,
- desc->list + desc_slot_offset);
+ memcpy(desc_list, desc->list + desc_slot_offset, sizeof(desc_list));
+ si_set_sampler_view_desc(sctx, sview, &tex_handle->sstate, desc->list + desc_slot_offset);
- if (memcmp(desc_list, desc->list + desc_slot_offset,
- sizeof(desc_list))) {
- tex_handle->desc_dirty = true;
- sctx->bindless_descriptors_dirty = true;
- }
+ if (memcmp(desc_list, desc->list + desc_slot_offset, sizeof(desc_list))) {
+ tex_handle->desc_dirty = true;
+ sctx->bindless_descriptors_dirty = true;
+ }
}
static void si_update_bindless_image_descriptor(struct si_context *sctx,
- struct si_image_handle *img_handle)
+ struct si_image_handle *img_handle)
{
- struct si_descriptors *desc = &sctx->bindless_descriptors;
- unsigned desc_slot_offset = img_handle->desc_slot * 16;
- struct pipe_image_view *view = &img_handle->view;
- struct pipe_resource *res = view->resource;
- uint32_t image_desc[16];
- unsigned desc_size = (res->nr_samples >= 2 ? 16 : 8) * 4;
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
+ unsigned desc_slot_offset = img_handle->desc_slot * 16;
+ struct pipe_image_view *view = &img_handle->view;
+ struct pipe_resource *res = view->resource;
+ uint32_t image_desc[16];
+ unsigned desc_size = (res->nr_samples >= 2 ? 16 : 8) * 4;
- if (res->target == PIPE_BUFFER)
- return;
+ if (res->target == PIPE_BUFFER)
+ return;
- memcpy(image_desc, desc->list + desc_slot_offset, desc_size);
- si_set_shader_image_desc(sctx, view, true,
- desc->list + desc_slot_offset,
- desc->list + desc_slot_offset + 8);
+ memcpy(image_desc, desc->list + desc_slot_offset, desc_size);
+ si_set_shader_image_desc(sctx, view, true, desc->list + desc_slot_offset,
+ desc->list + desc_slot_offset + 8);
- if (memcmp(image_desc, desc->list + desc_slot_offset, desc_size)) {
- img_handle->desc_dirty = true;
- sctx->bindless_descriptors_dirty = true;
- }
+ if (memcmp(image_desc, desc->list + desc_slot_offset, desc_size)) {
+ img_handle->desc_dirty = true;
+ sctx->bindless_descriptors_dirty = true;
+ }
}
static void si_update_all_resident_texture_descriptors(struct si_context *sctx)
{
- util_dynarray_foreach(&sctx->resident_tex_handles,
- struct si_texture_handle *, tex_handle) {
- si_update_bindless_texture_descriptor(sctx, *tex_handle);
- }
+ util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
+ si_update_bindless_texture_descriptor(sctx, *tex_handle);
+ }
- util_dynarray_foreach(&sctx->resident_img_handles,
- struct si_image_handle *, img_handle) {
- si_update_bindless_image_descriptor(sctx, *img_handle);
- }
+ util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
+ si_update_bindless_image_descriptor(sctx, *img_handle);
+ }
- si_upload_bindless_descriptors(sctx);
+ si_upload_bindless_descriptors(sctx);
}
/* Update mutable image descriptor fields of all bound textures. */
void si_update_all_texture_descriptors(struct si_context *sctx)
{
- unsigned shader;
+ unsigned shader;
- for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
- struct si_samplers *samplers = &sctx->samplers[shader];
- struct si_images *images = &sctx->images[shader];
- unsigned mask;
+ for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
+ struct si_samplers *samplers = &sctx->samplers[shader];
+ struct si_images *images = &sctx->images[shader];
+ unsigned mask;
- /* Images. */
- mask = images->enabled_mask;
- while (mask) {
- unsigned i = u_bit_scan(&mask);
- struct pipe_image_view *view = &images->views[i];
+ /* Images. */
+ mask = images->enabled_mask;
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+ struct pipe_image_view *view = &images->views[i];
- if (!view->resource ||
- view->resource->target == PIPE_BUFFER)
- continue;
+ if (!view->resource || view->resource->target == PIPE_BUFFER)
+ continue;
- si_set_shader_image(sctx, shader, i, view, true);
- }
+ si_set_shader_image(sctx, shader, i, view, true);
+ }
- /* Sampler views. */
- mask = samplers->enabled_mask;
- while (mask) {
- unsigned i = u_bit_scan(&mask);
- struct pipe_sampler_view *view = samplers->views[i];
+ /* Sampler views. */
+ mask = samplers->enabled_mask;
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+ struct pipe_sampler_view *view = samplers->views[i];
- if (!view ||
- !view->texture ||
- view->texture->target == PIPE_BUFFER)
- continue;
+ if (!view || !view->texture || view->texture->target == PIPE_BUFFER)
+ continue;
- si_set_sampler_view(sctx, shader, i,
- samplers->views[i], true);
- }
+ si_set_sampler_view(sctx, shader, i, samplers->views[i], true);
+ }
- si_update_shader_needs_decompress_mask(sctx, shader);
- }
+ si_update_shader_needs_decompress_mask(sctx, shader);
+ }
- si_update_all_resident_texture_descriptors(sctx);
- si_update_ps_colorbuf0_slot(sctx);
+ si_update_all_resident_texture_descriptors(sctx);
+ si_update_ps_colorbuf0_slot(sctx);
}
/* SHADER USER DATA */
-static void si_mark_shader_pointers_dirty(struct si_context *sctx,
- unsigned shader)
+static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shader)
{
- sctx->shader_pointers_dirty |=
- u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS,
- SI_NUM_SHADER_DESCS);
+ sctx->shader_pointers_dirty |=
+ u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, SI_NUM_SHADER_DESCS);
- if (shader == PIPE_SHADER_VERTEX) {
- sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
- sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 &&
- sctx->screen->num_vbos_in_user_sgprs;
- }
+ if (shader == PIPE_SHADER_VERTEX) {
+ sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
+ sctx->vertex_buffer_user_sgprs_dirty =
+ sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs;
+ }
- si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
}
static void si_shader_pointers_begin_new_cs(struct si_context *sctx)
{
- sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
- sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
- sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 &&
- sctx->screen->num_vbos_in_user_sgprs;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
- sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
- sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
+ sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
+ sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
+ sctx->vertex_buffer_user_sgprs_dirty =
+ sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+ sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
+ sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
}
/* Set a base register address for user data constants in the given shader.
* This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*.
*/
-static void si_set_user_data_base(struct si_context *sctx,
- unsigned shader, uint32_t new_base)
+static void si_set_user_data_base(struct si_context *sctx, unsigned shader, uint32_t new_base)
{
- uint32_t *base = &sctx->shader_pointers.sh_base[shader];
+ uint32_t *base = &sctx->shader_pointers.sh_base[shader];
- if (*base != new_base) {
- *base = new_base;
+ if (*base != new_base) {
+ *base = new_base;
- if (new_base)
- si_mark_shader_pointers_dirty(sctx, shader);
+ if (new_base)
+ si_mark_shader_pointers_dirty(sctx, shader);
- /* Any change in enabled shader stages requires re-emitting
- * the VS state SGPR, because it contains the clamp_vertex_color
- * state, which can be done in VS, TES, and GS.
- */
- sctx->last_vs_state = ~0;
- }
+ /* Any change in enabled shader stages requires re-emitting
+ * the VS state SGPR, because it contains the clamp_vertex_color
+ * state, which can be done in VS, TES, and GS.
+ */
+ sctx->last_vs_state = ~0;
+ }
}
/* This must be called when these shader stages are changed between enabled
* and disabled: LS, HS, ES, GS.
*/
void si_shader_change_notify(struct si_context *sctx)
{
- /* VS can be bound as VS, ES, or LS. */
- if (sctx->tes_shader.cso) {
- if (sctx->chip_class >= GFX10) {
- si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
- R_00B430_SPI_SHADER_USER_DATA_HS_0);
- } else if (sctx->chip_class == GFX9) {
- si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
- R_00B430_SPI_SHADER_USER_DATA_LS_0);
- } else {
- si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
- R_00B530_SPI_SHADER_USER_DATA_LS_0);
- }
- } else if (sctx->chip_class >= GFX10) {
- if (sctx->ngg || sctx->gs_shader.cso) {
- si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
- R_00B230_SPI_SHADER_USER_DATA_GS_0);
- } else {
- si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
- R_00B130_SPI_SHADER_USER_DATA_VS_0);
- }
- } else if (sctx->gs_shader.cso) {
- si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
- R_00B330_SPI_SHADER_USER_DATA_ES_0);
- } else {
- si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
- R_00B130_SPI_SHADER_USER_DATA_VS_0);
- }
-
- /* TES can be bound as ES, VS, or not bound. */
- if (sctx->tes_shader.cso) {
- if (sctx->chip_class >= GFX10) {
- if (sctx->ngg || sctx->gs_shader.cso) {
- si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
- R_00B230_SPI_SHADER_USER_DATA_GS_0);
- } else {
- si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
- R_00B130_SPI_SHADER_USER_DATA_VS_0);
- }
- } else if (sctx->gs_shader.cso) {
- si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
- R_00B330_SPI_SHADER_USER_DATA_ES_0);
- } else {
- si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
- R_00B130_SPI_SHADER_USER_DATA_VS_0);
- }
- } else {
- si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0);
- }
-}
-
-static void si_emit_shader_pointer_head(struct radeon_cmdbuf *cs,
- unsigned sh_offset,
- unsigned pointer_count)
-{
- radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count, 0));
- radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2);
-}
-
-static void si_emit_shader_pointer_body(struct si_screen *sscreen,
- struct radeon_cmdbuf *cs,
- uint64_t va)
-{
- radeon_emit(cs, va);
-
- assert(va == 0 || (va >> 32) == sscreen->info.address32_hi);
-}
-
-static void si_emit_shader_pointer(struct si_context *sctx,
- struct si_descriptors *desc,
- unsigned sh_base)
-{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned sh_offset = sh_base + desc->shader_userdata_offset;
-
- si_emit_shader_pointer_head(cs, sh_offset, 1);
- si_emit_shader_pointer_body(sctx->screen, cs, desc->gpu_address);
-}
-
-static void si_emit_consecutive_shader_pointers(struct si_context *sctx,
- unsigned pointer_mask,
- unsigned sh_base)
-{
- if (!sh_base)
- return;
-
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned mask = sctx->shader_pointers_dirty & pointer_mask;
-
- while (mask) {
- int start, count;
- u_bit_scan_consecutive_range(&mask, &start, &count);
-
- struct si_descriptors *descs = &sctx->descriptors[start];
- unsigned sh_offset = sh_base + descs->shader_userdata_offset;
-
- si_emit_shader_pointer_head(cs, sh_offset, count);
- for (int i = 0; i < count; i++)
- si_emit_shader_pointer_body(sctx->screen, cs,
- descs[i].gpu_address);
- }
-}
-
-static void si_emit_global_shader_pointers(struct si_context *sctx,
- struct si_descriptors *descs)
-{
- if (sctx->chip_class >= GFX10) {
- si_emit_shader_pointer(sctx, descs,
- R_00B030_SPI_SHADER_USER_DATA_PS_0);
- /* HW VS stage only used in non-NGG mode. */
- si_emit_shader_pointer(sctx, descs,
- R_00B130_SPI_SHADER_USER_DATA_VS_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B230_SPI_SHADER_USER_DATA_GS_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B430_SPI_SHADER_USER_DATA_HS_0);
- return;
- } else if (sctx->chip_class == GFX9) {
- /* Broadcast it to all shader stages. */
- si_emit_shader_pointer(sctx, descs,
- R_00B530_SPI_SHADER_USER_DATA_COMMON_0);
- return;
- }
-
- si_emit_shader_pointer(sctx, descs,
- R_00B030_SPI_SHADER_USER_DATA_PS_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B130_SPI_SHADER_USER_DATA_VS_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B330_SPI_SHADER_USER_DATA_ES_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B230_SPI_SHADER_USER_DATA_GS_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B430_SPI_SHADER_USER_DATA_HS_0);
- si_emit_shader_pointer(sctx, descs,
- R_00B530_SPI_SHADER_USER_DATA_LS_0);
+ /* VS can be bound as VS, ES, or LS. */
+ if (sctx->tes_shader.cso) {
+ if (sctx->chip_class >= GFX10) {
+ si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B430_SPI_SHADER_USER_DATA_HS_0);
+ } else if (sctx->chip_class == GFX9) {
+ si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B430_SPI_SHADER_USER_DATA_LS_0);
+ } else {
+ si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B530_SPI_SHADER_USER_DATA_LS_0);
+ }
+ } else if (sctx->chip_class >= GFX10) {
+ if (sctx->ngg || sctx->gs_shader.cso) {
+ si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+ } else {
+ si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+ }
+ } else if (sctx->gs_shader.cso) {
+ si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B330_SPI_SHADER_USER_DATA_ES_0);
+ } else {
+ si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+ }
+
+ /* TES can be bound as ES, VS, or not bound. */
+ if (sctx->tes_shader.cso) {
+ if (sctx->chip_class >= GFX10) {
+ if (sctx->ngg || sctx->gs_shader.cso) {
+ si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+ } else {
+ si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+ }
+ } else if (sctx->gs_shader.cso) {
+ si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B330_SPI_SHADER_USER_DATA_ES_0);
+ } else {
+ si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+ }
+ } else {
+ si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0);
+ }
+}
+
+static void si_emit_shader_pointer_head(struct radeon_cmdbuf *cs, unsigned sh_offset,
+ unsigned pointer_count)
+{
+ radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count, 0));
+ radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2);
+}
+
+static void si_emit_shader_pointer_body(struct si_screen *sscreen, struct radeon_cmdbuf *cs,
+ uint64_t va)
+{
+ radeon_emit(cs, va);
+
+ assert(va == 0 || (va >> 32) == sscreen->info.address32_hi);
+}
+
+static void si_emit_shader_pointer(struct si_context *sctx, struct si_descriptors *desc,
+ unsigned sh_base)
+{
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned sh_offset = sh_base + desc->shader_userdata_offset;
+
+ si_emit_shader_pointer_head(cs, sh_offset, 1);
+ si_emit_shader_pointer_body(sctx->screen, cs, desc->gpu_address);
+}
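/* Illustrative standalone sketch, not part of this patch: the dword index that
 * si_emit_shader_pointer_head() writes after the SET_SH_REG header is simply
 * (register byte address - start of the SH register range) / 4.  The register
 * address and the SH range base used below are assumptions for the example
 * (0xB030 is the address conventionally encoded in the name
 * R_00B030_SPI_SHADER_USER_DATA_PS_0; 0xB000 stands in for SI_SH_REG_OFFSET).
 */
#include <assert.h>
#include <stdio.h>

#define EXAMPLE_SH_REG_OFFSET 0x0000B000u /* assumed value of SI_SH_REG_OFFSET */

static unsigned sh_reg_dword_index(unsigned sh_offset)
{
   /* Same arithmetic as si_emit_shader_pointer_head(). */
   return (sh_offset - EXAMPLE_SH_REG_OFFSET) >> 2;
}

int main(void)
{
   /* SPI_SHADER_USER_DATA_PS_0 at byte address 0xB030 -> dword index 0x0C. */
   unsigned idx = sh_reg_dword_index(0x0000B030u);
   assert(idx == 0x0C);
   printf("SET_SH_REG dword index: 0x%02X\n", idx);
   return 0;
}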
+
+static void si_emit_consecutive_shader_pointers(struct si_context *sctx, unsigned pointer_mask,
+ unsigned sh_base)
+{
+ if (!sh_base)
+ return;
+
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned mask = sctx->shader_pointers_dirty & pointer_mask;
+
+ while (mask) {
+ int start, count;
+ u_bit_scan_consecutive_range(&mask, &start, &count);
+
+ struct si_descriptors *descs = &sctx->descriptors[start];
+ unsigned sh_offset = sh_base + descs->shader_userdata_offset;
+
+ si_emit_shader_pointer_head(cs, sh_offset, count);
+ for (int i = 0; i < count; i++)
+ si_emit_shader_pointer_body(sctx->screen, cs, descs[i].gpu_address);
+ }
+}
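/* Standalone sketch (not from this patch) of the dirty-mask walk done by
 * si_emit_consecutive_shader_pointers(): consecutive dirty descriptor sets are
 * grouped so that one SET_SH_REG packet can carry several pointers.  The mask
 * value and the printf stand in for the real packet emission and are purely
 * illustrative; u_bit_scan_consecutive_range() is assumed to behave like the
 * loop below.  __builtin_ctz is the GCC/Clang count-trailing-zeros builtin.
 */
#include <stdio.h>

static void scan_consecutive_ranges(unsigned mask)
{
   while (mask) {
      /* Lowest set bit starts the next range. */
      int start = __builtin_ctz(mask);
      int count = 0;

      /* Extend the range while the following bits are also set. */
      while (start + count < 32 && (mask & (1u << (start + count))))
         count++;

      /* Clear the consumed range before looking for the next one. */
      mask &= ~(((1ull << count) - 1) << start);

      printf("one SET_SH_REG packet: start=%d, count=%d\n", start, count);
   }
}

int main(void)
{
   scan_consecutive_ranges(0x3a); /* bits 1 and 3..5 -> ranges (1,1) and (3,3) */
   return 0;
}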
+
+static void si_emit_global_shader_pointers(struct si_context *sctx, struct si_descriptors *descs)
+{
+ if (sctx->chip_class >= GFX10) {
+ si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
+ /* The HW VS stage is only used in non-NGG mode. */
+ si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+ si_emit_shader_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+ si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0);
+ return;
+ } else if (sctx->chip_class == GFX9) {
+ /* Broadcast it to all shader stages. */
+ si_emit_shader_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_COMMON_0);
+ return;
+ }
+
+ si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
+ si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+ si_emit_shader_pointer(sctx, descs, R_00B330_SPI_SHADER_USER_DATA_ES_0);
+ si_emit_shader_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+ si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0);
+ si_emit_shader_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_LS_0);
}
void si_emit_graphics_shader_pointers(struct si_context *sctx)
{
- uint32_t *sh_base = sctx->shader_pointers.sh_base;
-
- if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) {
- si_emit_global_shader_pointers(sctx,
- &sctx->descriptors[SI_DESCS_RW_BUFFERS]);
- }
-
- si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX),
- sh_base[PIPE_SHADER_VERTEX]);
- si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL),
- sh_base[PIPE_SHADER_TESS_EVAL]);
- si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT),
- sh_base[PIPE_SHADER_FRAGMENT]);
- si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL),
- sh_base[PIPE_SHADER_TESS_CTRL]);
- si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY),
- sh_base[PIPE_SHADER_GEOMETRY]);
-
- sctx->shader_pointers_dirty &=
- ~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE);
-
- if (sctx->vertex_buffer_pointer_dirty && sctx->num_vertex_elements) {
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
- /* Find the location of the VB descriptor pointer. */
- unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR;
- if (sctx->chip_class >= GFX9) {
- if (sctx->tes_shader.cso)
- sh_dw_offset = GFX9_TCS_NUM_USER_SGPR;
- else if (sctx->gs_shader.cso)
- sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR;
- }
-
- unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4;
- si_emit_shader_pointer_head(cs, sh_offset, 1);
- si_emit_shader_pointer_body(sctx->screen, cs,
- sctx->vb_descriptors_buffer->gpu_address +
- sctx->vb_descriptors_offset);
- sctx->vertex_buffer_pointer_dirty = false;
- }
-
- if (sctx->vertex_buffer_user_sgprs_dirty &&
- sctx->num_vertex_elements &&
- sctx->screen->num_vbos_in_user_sgprs) {
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned num_desc = MIN2(sctx->num_vertex_elements,
- sctx->screen->num_vbos_in_user_sgprs);
- unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4;
-
- si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4);
- radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4);
- sctx->vertex_buffer_user_sgprs_dirty = false;
- }
-
- if (sctx->graphics_bindless_pointer_dirty) {
- si_emit_global_shader_pointers(sctx,
- &sctx->bindless_descriptors);
- sctx->graphics_bindless_pointer_dirty = false;
- }
+ uint32_t *sh_base = sctx->shader_pointers.sh_base;
+
+ if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) {
+ si_emit_global_shader_pointers(sctx, &sctx->descriptors[SI_DESCS_RW_BUFFERS]);
+ }
+
+ si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX),
+ sh_base[PIPE_SHADER_VERTEX]);
+ si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL),
+ sh_base[PIPE_SHADER_TESS_EVAL]);
+ si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT),
+ sh_base[PIPE_SHADER_FRAGMENT]);
+ si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL),
+ sh_base[PIPE_SHADER_TESS_CTRL]);
+ si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY),
+ sh_base[PIPE_SHADER_GEOMETRY]);
+
+ sctx->shader_pointers_dirty &= ~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE);
+
+ if (sctx->vertex_buffer_pointer_dirty && sctx->num_vertex_elements) {
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ /* Find the location of the VB descriptor pointer. */
+ unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR;
+ if (sctx->chip_class >= GFX9) {
+ if (sctx->tes_shader.cso)
+ sh_dw_offset = GFX9_TCS_NUM_USER_SGPR;
+ else if (sctx->gs_shader.cso)
+ sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR;
+ }
+
+ unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4;
+ si_emit_shader_pointer_head(cs, sh_offset, 1);
+ si_emit_shader_pointer_body(
+ sctx->screen, cs, sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset);
+ sctx->vertex_buffer_pointer_dirty = false;
+ }
+
+ if (sctx->vertex_buffer_user_sgprs_dirty && sctx->num_vertex_elements &&
+ sctx->screen->num_vbos_in_user_sgprs) {
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned num_desc = MIN2(sctx->num_vertex_elements, sctx->screen->num_vbos_in_user_sgprs);
+ unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4;
+
+ si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4);
+ radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4);
+ sctx->vertex_buffer_user_sgprs_dirty = false;
+ }
+
+ if (sctx->graphics_bindless_pointer_dirty) {
+ si_emit_global_shader_pointers(sctx, &sctx->bindless_descriptors);
+ sctx->graphics_bindless_pointer_dirty = false;
+ }
}
void si_emit_compute_shader_pointers(struct si_context *sctx)
{
- unsigned base = R_00B900_COMPUTE_USER_DATA_0;
+ unsigned base = R_00B900_COMPUTE_USER_DATA_0;
- si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE),
- R_00B900_COMPUTE_USER_DATA_0);
- sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE);
+ si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE),
+ R_00B900_COMPUTE_USER_DATA_0);
+ sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE);
- if (sctx->compute_bindless_pointer_dirty) {
- si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base);
- sctx->compute_bindless_pointer_dirty = false;
- }
+ if (sctx->compute_bindless_pointer_dirty) {
+ si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base);
+ sctx->compute_bindless_pointer_dirty = false;
+ }
}
/* BINDLESS */
-static void si_init_bindless_descriptors(struct si_context *sctx,
- struct si_descriptors *desc,
- short shader_userdata_rel_index,
- unsigned num_elements)
+static void si_init_bindless_descriptors(struct si_context *sctx, struct si_descriptors *desc,
+ short shader_userdata_rel_index, unsigned num_elements)
{
- ASSERTED unsigned desc_slot;
+ ASSERTED unsigned desc_slot;
- si_init_descriptors(desc, shader_userdata_rel_index, 16, num_elements);
- sctx->bindless_descriptors.num_active_slots = num_elements;
+ si_init_descriptors(desc, shader_userdata_rel_index, 16, num_elements);
+ sctx->bindless_descriptors.num_active_slots = num_elements;
- /* The first bindless descriptor is stored at slot 1, because 0 is not
- * considered to be a valid handle.
- */
- sctx->num_bindless_descriptors = 1;
+ /* The first bindless descriptor is stored at slot 1, because 0 is not
+ * considered to be a valid handle.
+ */
+ sctx->num_bindless_descriptors = 1;
- /* Track which bindless slots are used (or not). */
- util_idalloc_init(&sctx->bindless_used_slots);
- util_idalloc_resize(&sctx->bindless_used_slots, num_elements);
+ /* Track which bindless slots are used (or not). */
+ util_idalloc_init(&sctx->bindless_used_slots);
+ util_idalloc_resize(&sctx->bindless_used_slots, num_elements);
- /* Reserve slot 0 because it's an invalid handle for bindless. */
- desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots);
- assert(desc_slot == 0);
+ /* Reserve slot 0 because it's an invalid handle for bindless. */
+ desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots);
+ assert(desc_slot == 0);
}
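/* Minimal standalone sketch (not part of this patch) of why slot 0 is reserved
 * above: a bindless handle of 0 is invalid, so the allocator hands out slots
 * starting at 1, and the handle is simply the slot index.  Each slot is 16
 * dwords in the descriptor list, as in si_create_bindless_descriptor().  The
 * bitmap allocator below is a toy stand-in for util_idalloc.
 */
#include <assert.h>
#include <stdint.h>

#define NUM_SLOTS 1024 /* matches the initial bindless array size */

static uint32_t used[NUM_SLOTS / 32];

static unsigned alloc_slot(void)
{
   for (unsigned i = 0; i < NUM_SLOTS; i++) {
      if (!(used[i / 32] & (1u << (i % 32)))) {
         used[i / 32] |= 1u << (i % 32);
         return i;
      }
   }
   return ~0u; /* full */
}

int main(void)
{
   unsigned reserved = alloc_slot(); /* slot 0: never exposed as a handle */
   unsigned first = alloc_slot();    /* first real bindless handle */

   assert(reserved == 0 && first == 1);

   /* Dword offset of this handle's descriptor in the bindless list. */
   unsigned desc_dw_offset = first * 16;
   assert(desc_dw_offset == 16);
   return 0;
}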
static void si_release_bindless_descriptors(struct si_context *sctx)
{
- si_release_descriptors(&sctx->bindless_descriptors);
- util_idalloc_fini(&sctx->bindless_used_slots);
+ si_release_descriptors(&sctx->bindless_descriptors);
+ util_idalloc_fini(&sctx->bindless_used_slots);
}
static unsigned si_get_first_free_bindless_slot(struct si_context *sctx)
{
- struct si_descriptors *desc = &sctx->bindless_descriptors;
- unsigned desc_slot;
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
+ unsigned desc_slot;
- desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots);
- if (desc_slot >= desc->num_elements) {
- /* The array of bindless descriptors is full, resize it. */
- unsigned slot_size = desc->element_dw_size * 4;
- unsigned new_num_elements = desc->num_elements * 2;
+ desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots);
+ if (desc_slot >= desc->num_elements) {
+ /* The array of bindless descriptors is full, resize it. */
+ unsigned slot_size = desc->element_dw_size * 4;
+ unsigned new_num_elements = desc->num_elements * 2;
- desc->list = REALLOC(desc->list, desc->num_elements * slot_size,
- new_num_elements * slot_size);
- desc->num_elements = new_num_elements;
- desc->num_active_slots = new_num_elements;
- }
+ desc->list =
+ REALLOC(desc->list, desc->num_elements * slot_size, new_num_elements * slot_size);
+ desc->num_elements = new_num_elements;
+ desc->num_active_slots = new_num_elements;
+ }
- assert(desc_slot);
- return desc_slot;
+ assert(desc_slot);
+ return desc_slot;
}
-static unsigned
-si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list,
- unsigned size)
+static unsigned si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list,
+ unsigned size)
{
- struct si_descriptors *desc = &sctx->bindless_descriptors;
- unsigned desc_slot, desc_slot_offset;
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
+ unsigned desc_slot, desc_slot_offset;
- /* Find a free slot. */
- desc_slot = si_get_first_free_bindless_slot(sctx);
+ /* Find a free slot. */
+ desc_slot = si_get_first_free_bindless_slot(sctx);
- /* For simplicity, sampler and image bindless descriptors use fixed
- * 16-dword slots for now. Image descriptors only need 8-dword but this
- * doesn't really matter because no real apps use image handles.
- */
- desc_slot_offset = desc_slot * 16;
+ /* For simplicity, sampler and image bindless descriptors use fixed
+ * 16-dword slots for now. Image descriptors only need 8 dwords, but this
+ * doesn't really matter because no real apps use image handles.
+ */
+ desc_slot_offset = desc_slot * 16;
- /* Copy the descriptor into the array. */
- memcpy(desc->list + desc_slot_offset, desc_list, size);
+ /* Copy the descriptor into the array. */
+ memcpy(desc->list + desc_slot_offset, desc_list, size);
- /* Re-upload the whole array of bindless descriptors into a new buffer.
- */
- if (!si_upload_descriptors(sctx, desc))
- return 0;
+ /* Re-upload the whole array of bindless descriptors into a new buffer.
+ */
+ if (!si_upload_descriptors(sctx, desc))
+ return 0;
- /* Make sure to re-emit the shader pointers for all stages. */
- sctx->graphics_bindless_pointer_dirty = true;
- sctx->compute_bindless_pointer_dirty = true;
+ /* Make sure to re-emit the shader pointers for all stages. */
+ sctx->graphics_bindless_pointer_dirty = true;
+ sctx->compute_bindless_pointer_dirty = true;
- return desc_slot;
+ return desc_slot;
}
-static void si_update_bindless_buffer_descriptor(struct si_context *sctx,
- unsigned desc_slot,
- struct pipe_resource *resource,
- uint64_t offset,
- bool *desc_dirty)
+static void si_update_bindless_buffer_descriptor(struct si_context *sctx, unsigned desc_slot,
+ struct pipe_resource *resource, uint64_t offset,
+ bool *desc_dirty)
{
- struct si_descriptors *desc = &sctx->bindless_descriptors;
- struct si_resource *buf = si_resource(resource);
- unsigned desc_slot_offset = desc_slot * 16;
- uint32_t *desc_list = desc->list + desc_slot_offset + 4;
- uint64_t old_desc_va;
+ struct si_descriptors *desc = &sctx->bindless_descriptors;
+ struct si_resource *buf = si_resource(resource);
+ unsigned desc_slot_offset = desc_slot * 16;
+ uint32_t *desc_list = desc->list + desc_slot_offset + 4;
+ uint64_t old_desc_va;
- assert(resource->target == PIPE_BUFFER);
+ assert(resource->target == PIPE_BUFFER);
- /* Retrieve the old buffer addr from the descriptor. */
- old_desc_va = si_desc_extract_buffer_address(desc_list);
+ /* Retrieve the old buffer addr from the descriptor. */
+ old_desc_va = si_desc_extract_buffer_address(desc_list);
- if (old_desc_va != buf->gpu_address + offset) {
- /* The buffer has been invalidated when the handle wasn't
- * resident, update the descriptor and the dirty flag.
- */
- si_set_buf_desc_address(buf, offset, &desc_list[0]);
+ if (old_desc_va != buf->gpu_address + offset) {
+ /* The buffer has been invalidated while the handle wasn't
+ * resident; update the descriptor and the dirty flag.
+ */
+ si_set_buf_desc_address(buf, offset, &desc_list[0]);
- *desc_dirty = true;
- }
+ *desc_dirty = true;
+ }
}
-static uint64_t si_create_texture_handle(struct pipe_context *ctx,
- struct pipe_sampler_view *view,
- const struct pipe_sampler_state *state)
+static uint64_t si_create_texture_handle(struct pipe_context *ctx, struct pipe_sampler_view *view,
+ const struct pipe_sampler_state *state)
{
- struct si_sampler_view *sview = (struct si_sampler_view *)view;
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_texture_handle *tex_handle;
- struct si_sampler_state *sstate;
- uint32_t desc_list[16];
- uint64_t handle;
+ struct si_sampler_view *sview = (struct si_sampler_view *)view;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_texture_handle *tex_handle;
+ struct si_sampler_state *sstate;
+ uint32_t desc_list[16];
+ uint64_t handle;
- tex_handle = CALLOC_STRUCT(si_texture_handle);
- if (!tex_handle)
- return 0;
+ tex_handle = CALLOC_STRUCT(si_texture_handle);
+ if (!tex_handle)
+ return 0;
- memset(desc_list, 0, sizeof(desc_list));
- si_init_descriptor_list(&desc_list[0], 16, 1, null_texture_descriptor);
+ memset(desc_list, 0, sizeof(desc_list));
+ si_init_descriptor_list(&desc_list[0], 16, 1, null_texture_descriptor);
- sstate = ctx->create_sampler_state(ctx, state);
- if (!sstate) {
- FREE(tex_handle);
- return 0;
- }
+ sstate = ctx->create_sampler_state(ctx, state);
+ if (!sstate) {
+ FREE(tex_handle);
+ return 0;
+ }
- si_set_sampler_view_desc(sctx, sview, sstate, &desc_list[0]);
- memcpy(&tex_handle->sstate, sstate, sizeof(*sstate));
- ctx->delete_sampler_state(ctx, sstate);
+ si_set_sampler_view_desc(sctx, sview, sstate, &desc_list[0]);
+ memcpy(&tex_handle->sstate, sstate, sizeof(*sstate));
+ ctx->delete_sampler_state(ctx, sstate);
- tex_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list,
- sizeof(desc_list));
- if (!tex_handle->desc_slot) {
- FREE(tex_handle);
- return 0;
- }
+ tex_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, sizeof(desc_list));
+ if (!tex_handle->desc_slot) {
+ FREE(tex_handle);
+ return 0;
+ }
- handle = tex_handle->desc_slot;
+ handle = tex_handle->desc_slot;
- if (!_mesa_hash_table_insert(sctx->tex_handles,
- (void *)(uintptr_t)handle,
- tex_handle)) {
- FREE(tex_handle);
- return 0;
- }
+ if (!_mesa_hash_table_insert(sctx->tex_handles, (void *)(uintptr_t)handle, tex_handle)) {
+ FREE(tex_handle);
+ return 0;
+ }
- pipe_sampler_view_reference(&tex_handle->view, view);
+ pipe_sampler_view_reference(&tex_handle->view, view);
- si_resource(sview->base.texture)->texture_handle_allocated = true;
+ si_resource(sview->base.texture)->texture_handle_allocated = true;
- return handle;
+ return handle;
}
static void si_delete_texture_handle(struct pipe_context *ctx, uint64_t handle)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_texture_handle *tex_handle;
- struct hash_entry *entry;
-
- entry = _mesa_hash_table_search(sctx->tex_handles,
- (void *)(uintptr_t)handle);
- if (!entry)
- return;
-
- tex_handle = (struct si_texture_handle *)entry->data;
-
- /* Allow this descriptor slot to be re-used. */
- util_idalloc_free(&sctx->bindless_used_slots, tex_handle->desc_slot);
-
- pipe_sampler_view_reference(&tex_handle->view, NULL);
- _mesa_hash_table_remove(sctx->tex_handles, entry);
- FREE(tex_handle);
-}
-
-static void si_make_texture_handle_resident(struct pipe_context *ctx,
- uint64_t handle, bool resident)
-{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_texture_handle *tex_handle;
- struct si_sampler_view *sview;
- struct hash_entry *entry;
-
- entry = _mesa_hash_table_search(sctx->tex_handles,
- (void *)(uintptr_t)handle);
- if (!entry)
- return;
-
- tex_handle = (struct si_texture_handle *)entry->data;
- sview = (struct si_sampler_view *)tex_handle->view;
-
- if (resident) {
- if (sview->base.texture->target != PIPE_BUFFER) {
- struct si_texture *tex =
- (struct si_texture *)sview->base.texture;
-
- if (depth_needs_decompression(tex)) {
- util_dynarray_append(
- &sctx->resident_tex_needs_depth_decompress,
- struct si_texture_handle *,
- tex_handle);
- }
-
- if (color_needs_decompression(tex)) {
- util_dynarray_append(
- &sctx->resident_tex_needs_color_decompress,
- struct si_texture_handle *,
- tex_handle);
- }
-
- if (tex->surface.dcc_offset &&
- p_atomic_read(&tex->framebuffers_bound))
- sctx->need_check_render_feedback = true;
-
- si_update_bindless_texture_descriptor(sctx, tex_handle);
- } else {
- si_update_bindless_buffer_descriptor(sctx,
- tex_handle->desc_slot,
- sview->base.texture,
- sview->base.u.buf.offset,
- &tex_handle->desc_dirty);
- }
-
- /* Re-upload the descriptor if it has been updated while it
- * wasn't resident.
- */
- if (tex_handle->desc_dirty)
- sctx->bindless_descriptors_dirty = true;
-
- /* Add the texture handle to the per-context list. */
- util_dynarray_append(&sctx->resident_tex_handles,
- struct si_texture_handle *, tex_handle);
-
- /* Add the buffers to the current CS in case si_begin_new_cs()
- * is not going to be called.
- */
- si_sampler_view_add_buffer(sctx, sview->base.texture,
- RADEON_USAGE_READ,
- sview->is_stencil_sampler, false);
- } else {
- /* Remove the texture handle from the per-context list. */
- util_dynarray_delete_unordered(&sctx->resident_tex_handles,
- struct si_texture_handle *,
- tex_handle);
-
- if (sview->base.texture->target != PIPE_BUFFER) {
- util_dynarray_delete_unordered(
- &sctx->resident_tex_needs_depth_decompress,
- struct si_texture_handle *, tex_handle);
-
- util_dynarray_delete_unordered(
- &sctx->resident_tex_needs_color_decompress,
- struct si_texture_handle *, tex_handle);
- }
- }
-}
-
-static uint64_t si_create_image_handle(struct pipe_context *ctx,
- const struct pipe_image_view *view)
-{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_image_handle *img_handle;
- uint32_t desc_list[16];
- uint64_t handle;
-
- if (!view || !view->resource)
- return 0;
-
- img_handle = CALLOC_STRUCT(si_image_handle);
- if (!img_handle)
- return 0;
-
- memset(desc_list, 0, sizeof(desc_list));
- si_init_descriptor_list(&desc_list[0], 8, 2, null_image_descriptor);
-
- si_set_shader_image_desc(sctx, view, false, &desc_list[0], &desc_list[8]);
-
- img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list,
- sizeof(desc_list));
- if (!img_handle->desc_slot) {
- FREE(img_handle);
- return 0;
- }
-
- handle = img_handle->desc_slot;
-
- if (!_mesa_hash_table_insert(sctx->img_handles,
- (void *)(uintptr_t)handle,
- img_handle)) {
- FREE(img_handle);
- return 0;
- }
-
- util_copy_image_view(&img_handle->view, view);
-
- si_resource(view->resource)->image_handle_allocated = true;
-
- return handle;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_texture_handle *tex_handle;
+ struct hash_entry *entry;
+
+ entry = _mesa_hash_table_search(sctx->tex_handles, (void *)(uintptr_t)handle);
+ if (!entry)
+ return;
+
+ tex_handle = (struct si_texture_handle *)entry->data;
+
+ /* Allow this descriptor slot to be re-used. */
+ util_idalloc_free(&sctx->bindless_used_slots, tex_handle->desc_slot);
+
+ pipe_sampler_view_reference(&tex_handle->view, NULL);
+ _mesa_hash_table_remove(sctx->tex_handles, entry);
+ FREE(tex_handle);
+}
+
+static void si_make_texture_handle_resident(struct pipe_context *ctx, uint64_t handle,
+ bool resident)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_texture_handle *tex_handle;
+ struct si_sampler_view *sview;
+ struct hash_entry *entry;
+
+ entry = _mesa_hash_table_search(sctx->tex_handles, (void *)(uintptr_t)handle);
+ if (!entry)
+ return;
+
+ tex_handle = (struct si_texture_handle *)entry->data;
+ sview = (struct si_sampler_view *)tex_handle->view;
+
+ if (resident) {
+ if (sview->base.texture->target != PIPE_BUFFER) {
+ struct si_texture *tex = (struct si_texture *)sview->base.texture;
+
+ if (depth_needs_decompression(tex)) {
+ util_dynarray_append(&sctx->resident_tex_needs_depth_decompress,
+ struct si_texture_handle *, tex_handle);
+ }
+
+ if (color_needs_decompression(tex)) {
+ util_dynarray_append(&sctx->resident_tex_needs_color_decompress,
+ struct si_texture_handle *, tex_handle);
+ }
+
+ if (tex->surface.dcc_offset && p_atomic_read(&tex->framebuffers_bound))
+ sctx->need_check_render_feedback = true;
+
+ si_update_bindless_texture_descriptor(sctx, tex_handle);
+ } else {
+ si_update_bindless_buffer_descriptor(sctx, tex_handle->desc_slot, sview->base.texture,
+ sview->base.u.buf.offset, &tex_handle->desc_dirty);
+ }
+
+ /* Re-upload the descriptor if it has been updated while it
+ * wasn't resident.
+ */
+ if (tex_handle->desc_dirty)
+ sctx->bindless_descriptors_dirty = true;
+
+ /* Add the texture handle to the per-context list. */
+ util_dynarray_append(&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle);
+
+ /* Add the buffers to the current CS in case si_begin_new_cs()
+ * is not going to be called.
+ */
+ si_sampler_view_add_buffer(sctx, sview->base.texture, RADEON_USAGE_READ,
+ sview->is_stencil_sampler, false);
+ } else {
+ /* Remove the texture handle from the per-context list. */
+ util_dynarray_delete_unordered(&sctx->resident_tex_handles, struct si_texture_handle *,
+ tex_handle);
+
+ if (sview->base.texture->target != PIPE_BUFFER) {
+ util_dynarray_delete_unordered(&sctx->resident_tex_needs_depth_decompress,
+ struct si_texture_handle *, tex_handle);
+
+ util_dynarray_delete_unordered(&sctx->resident_tex_needs_color_decompress,
+ struct si_texture_handle *, tex_handle);
+ }
+ }
+}
+
+static uint64_t si_create_image_handle(struct pipe_context *ctx, const struct pipe_image_view *view)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_image_handle *img_handle;
+ uint32_t desc_list[16];
+ uint64_t handle;
+
+ if (!view || !view->resource)
+ return 0;
+
+ img_handle = CALLOC_STRUCT(si_image_handle);
+ if (!img_handle)
+ return 0;
+
+ memset(desc_list, 0, sizeof(desc_list));
+ si_init_descriptor_list(&desc_list[0], 8, 2, null_image_descriptor);
+
+ si_set_shader_image_desc(sctx, view, false, &desc_list[0], &desc_list[8]);
+
+ img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, sizeof(desc_list));
+ if (!img_handle->desc_slot) {
+ FREE(img_handle);
+ return 0;
+ }
+
+ handle = img_handle->desc_slot;
+
+ if (!_mesa_hash_table_insert(sctx->img_handles, (void *)(uintptr_t)handle, img_handle)) {
+ FREE(img_handle);
+ return 0;
+ }
+
+ util_copy_image_view(&img_handle->view, view);
+
+ si_resource(view->resource)->image_handle_allocated = true;
+
+ return handle;
}
static void si_delete_image_handle(struct pipe_context *ctx, uint64_t handle)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_image_handle *img_handle;
- struct hash_entry *entry;
-
- entry = _mesa_hash_table_search(sctx->img_handles,
- (void *)(uintptr_t)handle);
- if (!entry)
- return;
-
- img_handle = (struct si_image_handle *)entry->data;
-
- util_copy_image_view(&img_handle->view, NULL);
- _mesa_hash_table_remove(sctx->img_handles, entry);
- FREE(img_handle);
-}
-
-static void si_make_image_handle_resident(struct pipe_context *ctx,
- uint64_t handle, unsigned access,
- bool resident)
-{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_image_handle *img_handle;
- struct pipe_image_view *view;
- struct si_resource *res;
- struct hash_entry *entry;
-
- entry = _mesa_hash_table_search(sctx->img_handles,
- (void *)(uintptr_t)handle);
- if (!entry)
- return;
-
- img_handle = (struct si_image_handle *)entry->data;
- view = &img_handle->view;
- res = si_resource(view->resource);
-
- if (resident) {
- if (res->b.b.target != PIPE_BUFFER) {
- struct si_texture *tex = (struct si_texture *)res;
- unsigned level = view->u.tex.level;
-
- if (color_needs_decompression(tex)) {
- util_dynarray_append(
- &sctx->resident_img_needs_color_decompress,
- struct si_image_handle *,
- img_handle);
- }
-
- if (vi_dcc_enabled(tex, level) &&
- p_atomic_read(&tex->framebuffers_bound))
- sctx->need_check_render_feedback = true;
-
- si_update_bindless_image_descriptor(sctx, img_handle);
- } else {
- si_update_bindless_buffer_descriptor(sctx,
- img_handle->desc_slot,
- view->resource,
- view->u.buf.offset,
- &img_handle->desc_dirty);
- }
-
- /* Re-upload the descriptor if it has been updated while it
- * wasn't resident.
- */
- if (img_handle->desc_dirty)
- sctx->bindless_descriptors_dirty = true;
-
- /* Add the image handle to the per-context list. */
- util_dynarray_append(&sctx->resident_img_handles,
- struct si_image_handle *, img_handle);
-
- /* Add the buffers to the current CS in case si_begin_new_cs()
- * is not going to be called.
- */
- si_sampler_view_add_buffer(sctx, view->resource,
- (access & PIPE_IMAGE_ACCESS_WRITE) ?
- RADEON_USAGE_READWRITE :
- RADEON_USAGE_READ, false, false);
- } else {
- /* Remove the image handle from the per-context list. */
- util_dynarray_delete_unordered(&sctx->resident_img_handles,
- struct si_image_handle *,
- img_handle);
-
- if (res->b.b.target != PIPE_BUFFER) {
- util_dynarray_delete_unordered(
- &sctx->resident_img_needs_color_decompress,
- struct si_image_handle *,
- img_handle);
- }
- }
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_image_handle *img_handle;
+ struct hash_entry *entry;
+
+ entry = _mesa_hash_table_search(sctx->img_handles, (void *)(uintptr_t)handle);
+ if (!entry)
+ return;
+
+ img_handle = (struct si_image_handle *)entry->data;
+
+ util_copy_image_view(&img_handle->view, NULL);
+ _mesa_hash_table_remove(sctx->img_handles, entry);
+ FREE(img_handle);
+}
+
+static void si_make_image_handle_resident(struct pipe_context *ctx, uint64_t handle,
+ unsigned access, bool resident)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_image_handle *img_handle;
+ struct pipe_image_view *view;
+ struct si_resource *res;
+ struct hash_entry *entry;
+
+ entry = _mesa_hash_table_search(sctx->img_handles, (void *)(uintptr_t)handle);
+ if (!entry)
+ return;
+
+ img_handle = (struct si_image_handle *)entry->data;
+ view = &img_handle->view;
+ res = si_resource(view->resource);
+
+ if (resident) {
+ if (res->b.b.target != PIPE_BUFFER) {
+ struct si_texture *tex = (struct si_texture *)res;
+ unsigned level = view->u.tex.level;
+
+ if (color_needs_decompression(tex)) {
+ util_dynarray_append(&sctx->resident_img_needs_color_decompress,
+ struct si_image_handle *, img_handle);
+ }
+
+ if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound))
+ sctx->need_check_render_feedback = true;
+
+ si_update_bindless_image_descriptor(sctx, img_handle);
+ } else {
+ si_update_bindless_buffer_descriptor(sctx, img_handle->desc_slot, view->resource,
+ view->u.buf.offset, &img_handle->desc_dirty);
+ }
+
+ /* Re-upload the descriptor if it has been updated while it
+ * wasn't resident.
+ */
+ if (img_handle->desc_dirty)
+ sctx->bindless_descriptors_dirty = true;
+
+ /* Add the image handle to the per-context list. */
+ util_dynarray_append(&sctx->resident_img_handles, struct si_image_handle *, img_handle);
+
+ /* Add the buffers to the current CS in case si_begin_new_cs()
+ * is not going to be called.
+ */
+ si_sampler_view_add_buffer(
+ sctx, view->resource,
+ (access & PIPE_IMAGE_ACCESS_WRITE) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, false,
+ false);
+ } else {
+ /* Remove the image handle from the per-context list. */
+ util_dynarray_delete_unordered(&sctx->resident_img_handles, struct si_image_handle *,
+ img_handle);
+
+ if (res->b.b.target != PIPE_BUFFER) {
+ util_dynarray_delete_unordered(&sctx->resident_img_needs_color_decompress,
+ struct si_image_handle *, img_handle);
+ }
+ }
}
static void si_resident_buffers_add_all_to_bo_list(struct si_context *sctx)
{
- unsigned num_resident_tex_handles, num_resident_img_handles;
+ unsigned num_resident_tex_handles, num_resident_img_handles;
- num_resident_tex_handles = sctx->resident_tex_handles.size /
- sizeof(struct si_texture_handle *);
- num_resident_img_handles = sctx->resident_img_handles.size /
- sizeof(struct si_image_handle *);
+ num_resident_tex_handles = sctx->resident_tex_handles.size / sizeof(struct si_texture_handle *);
+ num_resident_img_handles = sctx->resident_img_handles.size / sizeof(struct si_image_handle *);
- /* Add all resident texture handles. */
- util_dynarray_foreach(&sctx->resident_tex_handles,
- struct si_texture_handle *, tex_handle) {
- struct si_sampler_view *sview =
- (struct si_sampler_view *)(*tex_handle)->view;
+ /* Add all resident texture handles. */
+ util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
+ struct si_sampler_view *sview = (struct si_sampler_view *)(*tex_handle)->view;
- si_sampler_view_add_buffer(sctx, sview->base.texture,
- RADEON_USAGE_READ,
- sview->is_stencil_sampler, false);
- }
+ si_sampler_view_add_buffer(sctx, sview->base.texture, RADEON_USAGE_READ,
+ sview->is_stencil_sampler, false);
+ }
- /* Add all resident image handles. */
- util_dynarray_foreach(&sctx->resident_img_handles,
- struct si_image_handle *, img_handle) {
- struct pipe_image_view *view = &(*img_handle)->view;
+ /* Add all resident image handles. */
+ util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) {
+ struct pipe_image_view *view = &(*img_handle)->view;
- si_sampler_view_add_buffer(sctx, view->resource,
- RADEON_USAGE_READWRITE,
- false, false);
- }
+ si_sampler_view_add_buffer(sctx, view->resource, RADEON_USAGE_READWRITE, false, false);
+ }
- sctx->num_resident_handles += num_resident_tex_handles +
- num_resident_img_handles;
- assert(sctx->bo_list_add_all_resident_resources);
- sctx->bo_list_add_all_resident_resources = false;
+ sctx->num_resident_handles += num_resident_tex_handles + num_resident_img_handles;
+ assert(sctx->bo_list_add_all_resident_resources);
+ sctx->bo_list_add_all_resident_resources = false;
}
/* INIT/DEINIT/UPLOAD */
void si_init_all_descriptors(struct si_context *sctx)
{
- int i;
- unsigned first_shader =
- sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE;
-
- for (i = first_shader; i < SI_NUM_SHADERS; i++) {
- bool is_2nd = sctx->chip_class >= GFX9 &&
- (i == PIPE_SHADER_TESS_CTRL ||
- i == PIPE_SHADER_GEOMETRY);
- unsigned num_sampler_slots = SI_NUM_IMAGE_SLOTS / 2 + SI_NUM_SAMPLERS;
- unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
- int rel_dw_offset;
- struct si_descriptors *desc;
-
- if (is_2nd) {
- if (i == PIPE_SHADER_TESS_CTRL) {
- rel_dw_offset = (R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS -
- R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4;
- } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */
- rel_dw_offset = (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS -
- R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4;
- } else {
- rel_dw_offset = (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS -
- R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4;
- }
- } else {
- rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS;
- }
- desc = si_const_and_shader_buffer_descriptors(sctx, i);
- si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc,
- num_buffer_slots, rel_dw_offset,
- RADEON_PRIO_SHADER_RW_BUFFER,
- RADEON_PRIO_CONST_BUFFER);
- desc->slot_index_to_bind_directly = si_get_constbuf_slot(0);
-
- if (is_2nd) {
- if (i == PIPE_SHADER_TESS_CTRL) {
- rel_dw_offset = (R_00B40C_SPI_SHADER_USER_DATA_ADDR_HI_HS -
- R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4;
- } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */
- rel_dw_offset = (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS -
- R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4;
- } else {
- rel_dw_offset = (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS -
- R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4;
- }
- } else {
- rel_dw_offset = SI_SGPR_SAMPLERS_AND_IMAGES;
- }
-
- desc = si_sampler_and_image_descriptors(sctx, i);
- si_init_descriptors(desc, rel_dw_offset, 16, num_sampler_slots);
-
- int j;
- for (j = 0; j < SI_NUM_IMAGE_SLOTS; j++)
- memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4);
- for (; j < SI_NUM_IMAGE_SLOTS + SI_NUM_SAMPLERS * 2; j++)
- memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4);
- }
-
- si_init_buffer_resources(&sctx->rw_buffers,
- &sctx->descriptors[SI_DESCS_RW_BUFFERS],
- SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
- /* The second priority is used by
- * const buffers in RW buffer slots. */
- RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER);
- sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
-
- /* Initialize an array of 1024 bindless descriptors, when the limit is
- * reached, just make it larger and re-upload the whole array.
- */
- si_init_bindless_descriptors(sctx, &sctx->bindless_descriptors,
- SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
- 1024);
-
- sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
-
- /* Set pipe_context functions. */
- sctx->b.bind_sampler_states = si_bind_sampler_states;
- sctx->b.set_shader_images = si_set_shader_images;
- sctx->b.set_constant_buffer = si_pipe_set_constant_buffer;
- sctx->b.set_shader_buffers = si_set_shader_buffers;
- sctx->b.set_sampler_views = si_set_sampler_views;
- sctx->b.create_texture_handle = si_create_texture_handle;
- sctx->b.delete_texture_handle = si_delete_texture_handle;
- sctx->b.make_texture_handle_resident = si_make_texture_handle_resident;
- sctx->b.create_image_handle = si_create_image_handle;
- sctx->b.delete_image_handle = si_delete_image_handle;
- sctx->b.make_image_handle_resident = si_make_image_handle_resident;
-
- if (!sctx->has_graphics)
- return;
-
- sctx->b.set_polygon_stipple = si_set_polygon_stipple;
-
- /* Shader user data. */
- sctx->atoms.s.shader_pointers.emit = si_emit_graphics_shader_pointers;
-
- /* Set default and immutable mappings. */
- if (sctx->ngg) {
- assert(sctx->chip_class >= GFX10);
- si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0);
- } else {
- si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
- }
-
- if (sctx->chip_class == GFX9) {
- si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL,
- R_00B430_SPI_SHADER_USER_DATA_LS_0);
- si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY,
- R_00B330_SPI_SHADER_USER_DATA_ES_0);
- } else {
- si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL,
- R_00B430_SPI_SHADER_USER_DATA_HS_0);
- si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY,
- R_00B230_SPI_SHADER_USER_DATA_GS_0);
- }
- si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
+ int i;
+ unsigned first_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE;
+
+ for (i = first_shader; i < SI_NUM_SHADERS; i++) {
+ bool is_2nd =
+ sctx->chip_class >= GFX9 && (i == PIPE_SHADER_TESS_CTRL || i == PIPE_SHADER_GEOMETRY);
+ unsigned num_sampler_slots = SI_NUM_IMAGE_SLOTS / 2 + SI_NUM_SAMPLERS;
+ unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
+ int rel_dw_offset;
+ struct si_descriptors *desc;
+
+ if (is_2nd) {
+ if (i == PIPE_SHADER_TESS_CTRL) {
+ rel_dw_offset =
+ (R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4;
+ } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */
+ rel_dw_offset =
+ (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4;
+ } else {
+ rel_dw_offset =
+ (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4;
+ }
+ } else {
+ rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS;
+ }
+ desc = si_const_and_shader_buffer_descriptors(sctx, i);
+ si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc, num_buffer_slots,
+ rel_dw_offset, RADEON_PRIO_SHADER_RW_BUFFER,
+ RADEON_PRIO_CONST_BUFFER);
+ desc->slot_index_to_bind_directly = si_get_constbuf_slot(0);
+
+ if (is_2nd) {
+ if (i == PIPE_SHADER_TESS_CTRL) {
+ rel_dw_offset =
+ (R_00B40C_SPI_SHADER_USER_DATA_ADDR_HI_HS - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4;
+ } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */
+ rel_dw_offset =
+ (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4;
+ } else {
+ rel_dw_offset =
+ (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4;
+ }
+ } else {
+ rel_dw_offset = SI_SGPR_SAMPLERS_AND_IMAGES;
+ }
+
+ desc = si_sampler_and_image_descriptors(sctx, i);
+ si_init_descriptors(desc, rel_dw_offset, 16, num_sampler_slots);
+
+ int j;
+ for (j = 0; j < SI_NUM_IMAGE_SLOTS; j++)
+ memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4);
+ for (; j < SI_NUM_IMAGE_SLOTS + SI_NUM_SAMPLERS * 2; j++)
+ memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4);
+ }
+
+ si_init_buffer_resources(&sctx->rw_buffers, &sctx->descriptors[SI_DESCS_RW_BUFFERS],
+ SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
+ /* The second priority is used by
+ * const buffers in RW buffer slots. */
+ RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER);
+ sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
+
+ /* Initialize an array of 1024 bindless descriptors. When the limit is
+ * reached, just make it larger and re-upload the whole array.
+ */
+ si_init_bindless_descriptors(sctx, &sctx->bindless_descriptors,
+ SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, 1024);
+
+ sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
+
+ /* Set pipe_context functions. */
+ sctx->b.bind_sampler_states = si_bind_sampler_states;
+ sctx->b.set_shader_images = si_set_shader_images;
+ sctx->b.set_constant_buffer = si_pipe_set_constant_buffer;
+ sctx->b.set_shader_buffers = si_set_shader_buffers;
+ sctx->b.set_sampler_views = si_set_sampler_views;
+ sctx->b.create_texture_handle = si_create_texture_handle;
+ sctx->b.delete_texture_handle = si_delete_texture_handle;
+ sctx->b.make_texture_handle_resident = si_make_texture_handle_resident;
+ sctx->b.create_image_handle = si_create_image_handle;
+ sctx->b.delete_image_handle = si_delete_image_handle;
+ sctx->b.make_image_handle_resident = si_make_image_handle_resident;
+
+ if (!sctx->has_graphics)
+ return;
+
+ sctx->b.set_polygon_stipple = si_set_polygon_stipple;
+
+ /* Shader user data. */
+ sctx->atoms.s.shader_pointers.emit = si_emit_graphics_shader_pointers;
+
+ /* Set default and immutable mappings. */
+ if (sctx->ngg) {
+ assert(sctx->chip_class >= GFX10);
+ si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+ } else {
+ si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+ }
+
+ if (sctx->chip_class == GFX9) {
+ si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_LS_0);
+ si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B330_SPI_SHADER_USER_DATA_ES_0);
+ } else {
+ si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0);
+ si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+ }
+ si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
}
static bool si_upload_shader_descriptors(struct si_context *sctx, unsigned mask)
{
- unsigned dirty = sctx->descriptors_dirty & mask;
+ unsigned dirty = sctx->descriptors_dirty & mask;
- /* Assume nothing will go wrong: */
- sctx->shader_pointers_dirty |= dirty;
+ /* Assume nothing will go wrong: */
+ sctx->shader_pointers_dirty |= dirty;
- while (dirty) {
- unsigned i = u_bit_scan(&dirty);
+ while (dirty) {
+ unsigned i = u_bit_scan(&dirty);
- if (!si_upload_descriptors(sctx, &sctx->descriptors[i]))
- return false;
- }
+ if (!si_upload_descriptors(sctx, &sctx->descriptors[i]))
+ return false;
+ }
- sctx->descriptors_dirty &= ~mask;
+ sctx->descriptors_dirty &= ~mask;
- si_upload_bindless_descriptors(sctx);
+ si_upload_bindless_descriptors(sctx);
- return true;
+ return true;
}
bool si_upload_graphics_shader_descriptors(struct si_context *sctx)
{
- const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE);
- return si_upload_shader_descriptors(sctx, mask);
+ const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE);
+ return si_upload_shader_descriptors(sctx, mask);
}
bool si_upload_compute_shader_descriptors(struct si_context *sctx)
{
- /* Does not update rw_buffers as that is not needed for compute shaders
- * and the input buffer is using the same SGPR's anyway.
- */
- const unsigned mask = u_bit_consecutive(SI_DESCS_FIRST_COMPUTE,
- SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE);
- return si_upload_shader_descriptors(sctx, mask);
+ /* Does not update rw_buffers, as that is not needed for compute shaders
+ * and the input buffer uses the same SGPRs anyway.
+ */
+ const unsigned mask =
+ u_bit_consecutive(SI_DESCS_FIRST_COMPUTE, SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE);
+ return si_upload_shader_descriptors(sctx, mask);
}
void si_release_all_descriptors(struct si_context *sctx)
{
- int i;
+ int i;
- for (i = 0; i < SI_NUM_SHADERS; i++) {
- si_release_buffer_resources(&sctx->const_and_shader_buffers[i],
- si_const_and_shader_buffer_descriptors(sctx, i));
- si_release_sampler_views(&sctx->samplers[i]);
- si_release_image_views(&sctx->images[i]);
- }
- si_release_buffer_resources(&sctx->rw_buffers,
- &sctx->descriptors[SI_DESCS_RW_BUFFERS]);
- for (i = 0; i < SI_NUM_VERTEX_BUFFERS; i++)
- pipe_vertex_buffer_unreference(&sctx->vertex_buffer[i]);
+ for (i = 0; i < SI_NUM_SHADERS; i++) {
+ si_release_buffer_resources(&sctx->const_and_shader_buffers[i],
+ si_const_and_shader_buffer_descriptors(sctx, i));
+ si_release_sampler_views(&sctx->samplers[i]);
+ si_release_image_views(&sctx->images[i]);
+ }
+ si_release_buffer_resources(&sctx->rw_buffers, &sctx->descriptors[SI_DESCS_RW_BUFFERS]);
+ for (i = 0; i < SI_NUM_VERTEX_BUFFERS; i++)
+ pipe_vertex_buffer_unreference(&sctx->vertex_buffer[i]);
- for (i = 0; i < SI_NUM_DESCS; ++i)
- si_release_descriptors(&sctx->descriptors[i]);
+ for (i = 0; i < SI_NUM_DESCS; ++i)
+ si_release_descriptors(&sctx->descriptors[i]);
- si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
- sctx->vb_descriptors_gpu_list = NULL; /* points into a mapped buffer */
+ si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
+ sctx->vb_descriptors_gpu_list = NULL; /* points into a mapped buffer */
- si_release_bindless_descriptors(sctx);
+ si_release_bindless_descriptors(sctx);
}
void si_gfx_resources_add_all_to_bo_list(struct si_context *sctx)
{
- for (unsigned i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
- si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[i]);
- si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i]);
- si_image_views_begin_new_cs(sctx, &sctx->images[i]);
- }
- si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
- si_vertex_buffers_begin_new_cs(sctx);
+ for (unsigned i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
+ si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[i]);
+ si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i]);
+ si_image_views_begin_new_cs(sctx, &sctx->images[i]);
+ }
+ si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
+ si_vertex_buffers_begin_new_cs(sctx);
- if (sctx->bo_list_add_all_resident_resources)
- si_resident_buffers_add_all_to_bo_list(sctx);
+ if (sctx->bo_list_add_all_resident_resources)
+ si_resident_buffers_add_all_to_bo_list(sctx);
- assert(sctx->bo_list_add_all_gfx_resources);
- sctx->bo_list_add_all_gfx_resources = false;
+ assert(sctx->bo_list_add_all_gfx_resources);
+ sctx->bo_list_add_all_gfx_resources = false;
}
void si_compute_resources_add_all_to_bo_list(struct si_context *sctx)
{
- unsigned sh = PIPE_SHADER_COMPUTE;
+ unsigned sh = PIPE_SHADER_COMPUTE;
- si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[sh]);
- si_sampler_views_begin_new_cs(sctx, &sctx->samplers[sh]);
- si_image_views_begin_new_cs(sctx, &sctx->images[sh]);
- si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
+ si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[sh]);
+ si_sampler_views_begin_new_cs(sctx, &sctx->samplers[sh]);
+ si_image_views_begin_new_cs(sctx, &sctx->images[sh]);
+ si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
- if (sctx->bo_list_add_all_resident_resources)
- si_resident_buffers_add_all_to_bo_list(sctx);
+ if (sctx->bo_list_add_all_resident_resources)
+ si_resident_buffers_add_all_to_bo_list(sctx);
- assert(sctx->bo_list_add_all_compute_resources);
- sctx->bo_list_add_all_compute_resources = false;
+ assert(sctx->bo_list_add_all_compute_resources);
+ sctx->bo_list_add_all_compute_resources = false;
}
void si_all_descriptors_begin_new_cs(struct si_context *sctx)
{
- for (unsigned i = 0; i < SI_NUM_DESCS; ++i)
- si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]);
- si_descriptors_begin_new_cs(sctx, &sctx->bindless_descriptors);
+ for (unsigned i = 0; i < SI_NUM_DESCS; ++i)
+ si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]);
+ si_descriptors_begin_new_cs(sctx, &sctx->bindless_descriptors);
- si_shader_pointers_begin_new_cs(sctx);
+ si_shader_pointers_begin_new_cs(sctx);
- sctx->bo_list_add_all_resident_resources = true;
- sctx->bo_list_add_all_gfx_resources = true;
- sctx->bo_list_add_all_compute_resources = true;
+ sctx->bo_list_add_all_resident_resources = true;
+ sctx->bo_list_add_all_gfx_resources = true;
+ sctx->bo_list_add_all_compute_resources = true;
}
-void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
- uint64_t new_active_mask)
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, uint64_t new_active_mask)
{
- struct si_descriptors *desc = &sctx->descriptors[desc_idx];
+ struct si_descriptors *desc = &sctx->descriptors[desc_idx];
- /* Ignore no-op updates and updates that disable all slots. */
- if (!new_active_mask ||
- new_active_mask == u_bit_consecutive64(desc->first_active_slot,
- desc->num_active_slots))
- return;
+ /* Ignore no-op updates and updates that disable all slots. */
+ if (!new_active_mask ||
+ new_active_mask == u_bit_consecutive64(desc->first_active_slot, desc->num_active_slots))
+ return;
- int first, count;
- u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
- assert(new_active_mask == 0);
+ int first, count;
+ u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
+ assert(new_active_mask == 0);
- /* Upload/dump descriptors if slots are being enabled. */
- if (first < desc->first_active_slot ||
- first + count > desc->first_active_slot + desc->num_active_slots)
- sctx->descriptors_dirty |= 1u << desc_idx;
+ /* Upload/dump descriptors if slots are being enabled. */
+ if (first < desc->first_active_slot ||
+ first + count > desc->first_active_slot + desc->num_active_slots)
+ sctx->descriptors_dirty |= 1u << desc_idx;
- desc->first_active_slot = first;
- desc->num_active_slots = count;
+ desc->first_active_slot = first;
+ desc->num_active_slots = count;
}
-void si_set_active_descriptors_for_shader(struct si_context *sctx,
- struct si_shader_selector *sel)
+void si_set_active_descriptors_for_shader(struct si_context *sctx, struct si_shader_selector *sel)
{
- if (!sel)
- return;
+ if (!sel)
+ return;
- si_set_active_descriptors(sctx,
- si_const_and_shader_buffer_descriptors_idx(sel->type),
- sel->active_const_and_shader_buffers);
- si_set_active_descriptors(sctx,
- si_sampler_and_image_descriptors_idx(sel->type),
- sel->active_samplers_and_images);
+ si_set_active_descriptors(sctx, si_const_and_shader_buffer_descriptors_idx(sel->type),
+ sel->active_const_and_shader_buffers);
+ si_set_active_descriptors(sctx, si_sampler_and_image_descriptors_idx(sel->type),
+ sel->active_samplers_and_images);
}
static void si_dma_emit_wait_idle(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->sdma_cs;
+ struct radeon_cmdbuf *cs = sctx->sdma_cs;
- /* NOP waits for idle. */
- if (sctx->chip_class >= GFX7)
- radeon_emit(cs, 0x00000000); /* NOP */
- else
- radeon_emit(cs, 0xf0000000); /* NOP */
+ /* NOP waits for idle. */
+ if (sctx->chip_class >= GFX7)
+ radeon_emit(cs, 0x00000000); /* NOP */
+ else
+ radeon_emit(cs, 0xf0000000); /* NOP */
}
-void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst,
- uint64_t offset)
+void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset)
{
- struct radeon_cmdbuf *cs = sctx->sdma_cs;
- uint64_t va = dst->gpu_address + offset;
+ struct radeon_cmdbuf *cs = sctx->sdma_cs;
+ uint64_t va = dst->gpu_address + offset;
- if (sctx->chip_class == GFX6) {
- unreachable("SI DMA doesn't support the timestamp packet.");
- return;
- }
+ if (sctx->chip_class == GFX6) {
+ unreachable("SI DMA doesn't support the timestamp packet.");
+ return;
+ }
- /* Mark the buffer range of destination as valid (initialized),
- * so that transfer_map knows it should wait for the GPU when mapping
- * that range. */
- util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8);
+ /* Mark the buffer range of the destination as valid (initialized),
+ * so that transfer_map knows it should wait for the GPU when mapping
+ * that range. */
+ util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8);
- assert(va % 8 == 0);
+ assert(va % 8 == 0);
- si_need_dma_space(sctx, 4, dst, NULL);
- si_dma_emit_wait_idle(sctx);
+ si_need_dma_space(sctx, 4, dst, NULL);
+ si_dma_emit_wait_idle(sctx);
- radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP,
- SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP,
- 0));
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
+ radeon_emit(
+ cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
}
-void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
- uint64_t offset, uint64_t size, unsigned clear_value)
+void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
+ uint64_t size, unsigned clear_value)
{
- struct radeon_cmdbuf *cs = sctx->sdma_cs;
- unsigned i, ncopy, csize;
- struct si_resource *sdst = si_resource(dst);
-
- assert(offset % 4 == 0);
- assert(size);
- assert(size % 4 == 0);
-
- if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
- sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS)) {
- sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
- return;
- }
-
- /* Mark the buffer range of destination as valid (initialized),
- * so that transfer_map knows it should wait for the GPU when mapping
- * that range. */
- util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
-
- offset += sdst->gpu_address;
-
- if (sctx->chip_class == GFX6) {
- /* the same maximum size as for copying */
- ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
- si_need_dma_space(sctx, ncopy * 4, sdst, NULL);
-
- for (i = 0; i < ncopy; i++) {
- csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
- radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0,
- csize / 4));
- radeon_emit(cs, offset);
- radeon_emit(cs, clear_value);
- radeon_emit(cs, (offset >> 32) << 16);
- offset += csize;
- size -= csize;
- }
- return;
- }
-
- /* The following code is for Sea Islands and later. */
- /* the same maximum size as for copying */
- ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
- si_need_dma_space(sctx, ncopy * 5, sdst, NULL);
-
- for (i = 0; i < ncopy; i++) {
- csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
- radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0,
- 0x8000 /* dword copy */));
- radeon_emit(cs, offset);
- radeon_emit(cs, offset >> 32);
- radeon_emit(cs, clear_value);
- /* dw count */
- radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc);
- offset += csize;
- size -= csize;
- }
+ struct radeon_cmdbuf *cs = sctx->sdma_cs;
+ unsigned i, ncopy, csize;
+ struct si_resource *sdst = si_resource(dst);
+
+ assert(offset % 4 == 0);
+ assert(size);
+ assert(size % 4 == 0);
+
+ if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
+ sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS)) {
+ sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
+ return;
+ }
+
+ /* Mark the buffer range of the destination as valid (initialized),
+ * so that transfer_map knows it should wait for the GPU when mapping
+ * that range. */
+ util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
+
+ offset += sdst->gpu_address;
+
+ if (sctx->chip_class == GFX6) {
+ /* the same maximum size as for copying */
+ ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
+ si_need_dma_space(sctx, ncopy * 4, sdst, NULL);
+
+ for (i = 0; i < ncopy; i++) {
+ csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
+ radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, csize / 4));
+ radeon_emit(cs, offset);
+ radeon_emit(cs, clear_value);
+ radeon_emit(cs, (offset >> 32) << 16);
+ offset += csize;
+ size -= csize;
+ }
+ return;
+ }
+
+ /* The following code is for Sea Islands and later. */
+ /* the same maximum size as for copying */
+ ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
+ si_need_dma_space(sctx, ncopy * 5, sdst, NULL);
+
+ for (i = 0; i < ncopy; i++) {
+ csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
+ radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, 0x8000 /* dword copy */));
+ radeon_emit(cs, offset);
+ radeon_emit(cs, offset >> 32);
+ radeon_emit(cs, clear_value);
+ /* dw count */
+ radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc);
+ offset += csize;
+ size -= csize;
+ }
}
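
To make the packet-splitting arithmetic in si_sdma_clear_buffer easier to follow, here is a minimal standalone sketch of the CIK-and-later constant-fill loop. MAX_FILL_SIZE, emit_fill() and the example offset/size are illustrative assumptions, not the driver's real per-packet limit or emit path.

   /* Minimal sketch of the CIK+ constant-fill split above. MAX_FILL_SIZE and
    * emit_fill() are placeholders, not the driver's real limit or emit path. */
   #include <stdint.h>
   #include <stdio.h>

   #define MAX_FILL_SIZE (1u << 20) /* assumed per-packet byte limit */
   #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
   #define MIN2(a, b) ((a) < (b) ? (a) : (b))

   static void emit_fill(uint64_t offset, unsigned csize)
   {
      /* Stands in for the radeon_emit() calls that build one CONSTANT_FILL packet. */
      printf("CONSTANT_FILL: %u bytes at 0x%llx\n", csize, (unsigned long long)offset);
   }

   int main(void)
   {
      uint64_t offset = 0x800000000ull;         /* example GPU address */
      uint64_t size = 3 * MAX_FILL_SIZE + 4096; /* dword-aligned clear size */
      unsigned ncopy = DIV_ROUND_UP(size, MAX_FILL_SIZE);

      for (unsigned i = 0; i < ncopy; i++) {
         unsigned csize = MIN2(size, MAX_FILL_SIZE);
         emit_fill(offset, csize);
         offset += csize;
         size -= csize;
      }
      return 0; /* prints four packets: 1 MiB, 1 MiB, 1 MiB, 4 KiB */
   }

The two hardware paths differ only in packet layout: the GFX6 branch packs the dword count into the packet header and emits 4 dwords per packet (hence ncopy * 4 reserved), while the CIK+ branch emits 5 dwords with an explicit count word (csize - 1 on GFX9).
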
void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
- struct pipe_resource *src, uint64_t dst_offset,
- uint64_t src_offset, uint64_t size)
+ struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
+ uint64_t size)
{
- struct radeon_cmdbuf *cs = sctx->sdma_cs;
- unsigned i, ncopy, csize;
- struct si_resource *sdst = si_resource(dst);
- struct si_resource *ssrc = si_resource(src);
-
- if (!cs ||
- dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
- src->flags & PIPE_RESOURCE_FLAG_SPARSE) {
- si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
- return;
- }
-
- /* Mark the buffer range of destination as valid (initialized),
- * so that transfer_map knows it should wait for the GPU when mapping
- * that range. */
- util_range_add(dst, &sdst->valid_buffer_range, dst_offset,
- dst_offset + size);
-
- dst_offset += sdst->gpu_address;
- src_offset += ssrc->gpu_address;
-
- if (sctx->chip_class == GFX6) {
- unsigned max_size, sub_cmd, shift;
-
- /* see whether we should use the dword-aligned or byte-aligned copy */
- if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
- sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
- shift = 2;
- max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
- } else {
- sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
- shift = 0;
- max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
- }
-
- ncopy = DIV_ROUND_UP(size, max_size);
- si_need_dma_space(sctx, ncopy * 5, sdst, ssrc);
-
- for (i = 0; i < ncopy; i++) {
- csize = MIN2(size, max_size);
- radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd,
- csize >> shift));
- radeon_emit(cs, dst_offset);
- radeon_emit(cs, src_offset);
- radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
- radeon_emit(cs, (src_offset >> 32UL) & 0xff);
- dst_offset += csize;
- src_offset += csize;
- size -= csize;
- }
- return;
- }
-
- /* The following code is for CI and later. */
- unsigned align = ~0u;
- ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
-
- /* Align copy size to dw if src/dst address are dw aligned */
- if ((src_offset & 0x3) == 0 &&
- (dst_offset & 0x3) == 0 &&
- size > 4 &&
- (size & 3) != 0) {
- align = ~0x3u;
- ncopy++;
- }
-
- si_need_dma_space(sctx, ncopy * 7, sdst, ssrc);
-
- for (i = 0; i < ncopy; i++) {
- csize = size >= 4 ? MIN2(size & align, CIK_SDMA_COPY_MAX_SIZE) : size;
- radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
- CIK_SDMA_COPY_SUB_OPCODE_LINEAR,
- 0));
- radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
- radeon_emit(cs, 0); /* src/dst endian swap */
- radeon_emit(cs, src_offset);
- radeon_emit(cs, src_offset >> 32);
- radeon_emit(cs, dst_offset);
- radeon_emit(cs, dst_offset >> 32);
- dst_offset += csize;
- src_offset += csize;
- size -= csize;
- }
+ struct radeon_cmdbuf *cs = sctx->sdma_cs;
+ unsigned i, ncopy, csize;
+ struct si_resource *sdst = si_resource(dst);
+ struct si_resource *ssrc = si_resource(src);
+
+ if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || src->flags & PIPE_RESOURCE_FLAG_SPARSE) {
+ si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
+ return;
+ }
+
+ /* Mark the buffer range of destination as valid (initialized),
+ * so that transfer_map knows it should wait for the GPU when mapping
+ * that range. */
+ util_range_add(dst, &sdst->valid_buffer_range, dst_offset, dst_offset + size);
+
+ dst_offset += sdst->gpu_address;
+ src_offset += ssrc->gpu_address;
+
+ if (sctx->chip_class == GFX6) {
+ unsigned max_size, sub_cmd, shift;
+
+ /* see whether we should use the dword-aligned or byte-aligned copy */
+ if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
+ sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
+ shift = 2;
+ max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
+ } else {
+ sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
+ shift = 0;
+ max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
+ }
+
+ ncopy = DIV_ROUND_UP(size, max_size);
+ si_need_dma_space(sctx, ncopy * 5, sdst, ssrc);
+
+ for (i = 0; i < ncopy; i++) {
+ csize = MIN2(size, max_size);
+ radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize >> shift));
+ radeon_emit(cs, dst_offset);
+ radeon_emit(cs, src_offset);
+ radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
+ radeon_emit(cs, (src_offset >> 32UL) & 0xff);
+ dst_offset += csize;
+ src_offset += csize;
+ size -= csize;
+ }
+ return;
+ }
+
+ /* The following code is for CI and later. */
+ unsigned align = ~0u;
+ ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
+
+ /* Align copy size to dw if src/dst address are dw aligned */
+ if ((src_offset & 0x3) == 0 && (dst_offset & 0x3) == 0 && size > 4 && (size & 3) != 0) {
+ align = ~0x3u;
+ ncopy++;
+ }
+
+ si_need_dma_space(sctx, ncopy * 7, sdst, ssrc);
+
+ for (i = 0; i < ncopy; i++) {
+ csize = size >= 4 ? MIN2(size & align, CIK_SDMA_COPY_MAX_SIZE) : size;
+ radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR, 0));
+ radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
+ radeon_emit(cs, 0); /* src/dst endian swap */
+ radeon_emit(cs, src_offset);
+ radeon_emit(cs, src_offset >> 32);
+ radeon_emit(cs, dst_offset);
+ radeon_emit(cs, dst_offset >> 32);
+ dst_offset += csize;
+ src_offset += csize;
+ size -= csize;
+ }
}
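
The CIK-and-later path above reserves one extra packet when both addresses are dword aligned but the size is not, so that every packet except the last copies a dword-multiple amount and a single small packet handles the 1-3 byte tail. A standalone sketch of that size math follows; MAX_COPY_SIZE and the example addresses/sizes are illustrative assumptions.

   /* Standalone sketch of the CIK+ copy-size alignment trick in
    * si_sdma_copy_buffer. MAX_COPY_SIZE is an assumed per-packet limit. */
   #include <stdint.h>
   #include <stdio.h>

   #define MAX_COPY_SIZE (1u << 20)
   #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
   #define MIN2(a, b) ((a) < (b) ? (a) : (b))

   int main(void)
   {
      uint64_t src = 0x1000, dst = 0x2000;    /* dword-aligned example addresses */
      uint64_t size = 2 * MAX_COPY_SIZE + 10; /* not a multiple of 4 */
      unsigned align = ~0u;
      unsigned ncopy = DIV_ROUND_UP(size, MAX_COPY_SIZE);

      if ((src & 3) == 0 && (dst & 3) == 0 && size > 4 && (size & 3) != 0) {
         align = ~0x3u; /* round each chunk down to a dword multiple */
         ncopy++;       /* reserve one extra packet for the byte tail */
      }

      for (unsigned i = 0; i < ncopy; i++) {
         unsigned csize = size >= 4 ? MIN2(size & align, MAX_COPY_SIZE) : (unsigned)size;
         printf("packet %u: %u bytes\n", i, csize);
         src += csize;
         dst += csize;
         size -= csize;
      }
      return 0; /* prints 1048576, 1048576, 8, 2 */
   }

Reserving the extra packet up front keeps the dword estimate passed to si_need_dma_space (ncopy * 7) valid even when the tail needs its own copy, and presumably lets the bulk of the transfer stay dword-aligned.
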
-void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
- struct si_resource *dst, struct si_resource *src)
+void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst,
+ struct si_resource *src)
{
- struct radeon_winsys *ws = ctx->ws;
- uint64_t vram = ctx->sdma_cs->used_vram;
- uint64_t gtt = ctx->sdma_cs->used_gart;
-
- if (dst) {
- vram += dst->vram_usage;
- gtt += dst->gart_usage;
- }
- if (src) {
- vram += src->vram_usage;
- gtt += src->gart_usage;
- }
-
- /* Flush the GFX IB if DMA depends on it. */
- if (!ctx->sdma_uploads_in_progress &&
- radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
- ((dst &&
- ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
- RADEON_USAGE_READWRITE)) ||
- (src &&
- ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
- RADEON_USAGE_WRITE))))
- si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-
- /* Flush if there's not enough space, or if the memory usage per IB
- * is too large.
- *
- * IBs using too little memory are limited by the IB submission overhead.
- * IBs using too much memory are limited by the kernel/TTM overhead.
- * Too long IBs create CPU-GPU pipeline bubbles and add latency.
- *
- * This heuristic makes sure that DMA requests are executed
- * very soon after the call is made and lowers memory usage.
- * It improves texture upload performance by keeping the DMA
- * engine busy while uploads are being submitted.
- */
- num_dw++; /* for emit_wait_idle below */
- if (!ctx->sdma_uploads_in_progress &&
- (!ws->cs_check_space(ctx->sdma_cs, num_dw, false) ||
- ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 ||
- !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) {
- si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
- assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw);
- }
-
- /* Wait for idle if either buffer has been used in the IB before to
- * prevent read-after-write hazards.
- */
- if ((dst &&
- ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf,
- RADEON_USAGE_READWRITE)) ||
- (src &&
- ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf,
- RADEON_USAGE_WRITE)))
- si_dma_emit_wait_idle(ctx);
-
- unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
- if (dst) {
- ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync,
- dst->domains, 0);
- }
- if (src) {
- ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync,
- src->domains, 0);
- }
-
- /* this function is called before all DMA calls, so increment this. */
- ctx->num_dma_calls++;
+ struct radeon_winsys *ws = ctx->ws;
+ uint64_t vram = ctx->sdma_cs->used_vram;
+ uint64_t gtt = ctx->sdma_cs->used_gart;
+
+ if (dst) {
+ vram += dst->vram_usage;
+ gtt += dst->gart_usage;
+ }
+ if (src) {
+ vram += src->vram_usage;
+ gtt += src->gart_usage;
+ }
+
+ /* Flush the GFX IB if DMA depends on it. */
+ if (!ctx->sdma_uploads_in_progress && radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
+ ((dst && ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
+ (src && ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf, RADEON_USAGE_WRITE))))
+ si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+
+ /* Flush if there's not enough space, or if the memory usage per IB
+ * is too large.
+ *
+ * IBs using too little memory are limited by the IB submission overhead.
+ * IBs using too much memory are limited by the kernel/TTM overhead.
+ * Too long IBs create CPU-GPU pipeline bubbles and add latency.
+ *
+ * This heuristic makes sure that DMA requests are executed
+ * very soon after the call is made and lowers memory usage.
+ * It improves texture upload performance by keeping the DMA
+ * engine busy while uploads are being submitted.
+ */
+ num_dw++; /* for emit_wait_idle below */
+ if (!ctx->sdma_uploads_in_progress &&
+ (!ws->cs_check_space(ctx->sdma_cs, num_dw, false) ||
+ ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 ||
+ !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) {
+ si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+ assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw);
+ }
+
+ /* Wait for idle if either buffer has been used in the IB before to
+ * prevent read-after-write hazards.
+ */
+ if ((dst && ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
+ (src && ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf, RADEON_USAGE_WRITE)))
+ si_dma_emit_wait_idle(ctx);
+
+ unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
+ if (dst) {
+ ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, dst->domains, 0);
+ }
+ if (src) {
+ ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, src->domains, 0);
+ }
+
+ /* this function is called before all DMA calls, so increment this. */
+ ctx->num_dma_calls++;
}
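
The heuristic above boils down to three tests, any one of which triggers a flush unless an SDMA upload batch is currently being recorded. A compact restatement, with the winsys queries reduced to plain booleans; should_flush_sdma, has_space and memory_below_limit are illustrative names standing in for cs_check_space() and radeon_cs_memory_below_limit().

   /* Sketch of the "flush the SDMA IB now?" decision in si_need_dma_space. */
   #include <stdbool.h>
   #include <stdint.h>

   static bool should_flush_sdma(bool uploads_in_progress, bool has_space,
                                 uint64_t ib_vram, uint64_t ib_gart, bool memory_below_limit)
   {
      if (uploads_in_progress)
         return false; /* never flush while SDMA uploads are being recorded */

      return !has_space ||                           /* IB is out of dwords */
             ib_vram + ib_gart > 64 * 1024 * 1024 || /* keep per-IB memory usage small */
             !memory_below_limit;                    /* global VRAM/GTT budget exceeded */
   }

   int main(void)
   {
      /* An IB already referencing 80 MiB of VRAM should be flushed. */
      return should_flush_sdma(false, true, 80ull << 20, 0, true) ? 0 : 1;
   }
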
-void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
- struct pipe_fence_handle **fence)
+void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
{
- struct radeon_cmdbuf *cs = ctx->sdma_cs;
- struct radeon_saved_cs saved;
- bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;
-
- if (!radeon_emitted(cs, 0)) {
- if (fence)
- ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
- return;
- }
-
- if (check_vm)
- si_save_cs(ctx->ws, cs, &saved, true);
-
- ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
- if (fence)
- ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
-
- if (check_vm) {
- /* Use conservative timeout 800ms, after which we won't wait any
- * longer and assume the GPU is hung.
- */
- ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800*1000*1000);
-
- si_check_vm_faults(ctx, &saved, RING_DMA);
- si_clear_saved_cs(&saved);
- }
+ struct radeon_cmdbuf *cs = ctx->sdma_cs;
+ struct radeon_saved_cs saved;
+ bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;
+
+ if (!radeon_emitted(cs, 0)) {
+ if (fence)
+ ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
+ return;
+ }
+
+ if (check_vm)
+ si_save_cs(ctx->ws, cs, &saved, true);
+
+ ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
+ if (fence)
+ ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
+
+ if (check_vm) {
+ /* Use conservative timeout 800ms, after which we won't wait any
+ * longer and assume the GPU is hung.
+ */
+ ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800 * 1000 * 1000);
+
+ si_check_vm_faults(ctx, &saved, RING_DMA);
+ si_clear_saved_cs(&saved);
+ }
}
-void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
- uint64_t offset, uint64_t size, unsigned value)
+void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
+ uint64_t size, unsigned value)
{
- struct si_context *ctx = (struct si_context*)sscreen->aux_context;
+ struct si_context *ctx = (struct si_context *)sscreen->aux_context;
- simple_mtx_lock(&sscreen->aux_context_lock);
- si_sdma_clear_buffer(ctx, dst, offset, size, value);
- sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
- simple_mtx_unlock(&sscreen->aux_context_lock);
+ simple_mtx_lock(&sscreen->aux_context_lock);
+ si_sdma_clear_buffer(ctx, dst, offset, size, value);
+ sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
+ simple_mtx_unlock(&sscreen->aux_context_lock);
}
-#include <libsync.h>
-
+#include "si_build_pm4.h"
#include "util/os_time.h"
#include "util/u_memory.h"
#include "util/u_queue.h"
#include "util/u_upload_mgr.h"
-#include "si_build_pm4.h"
+#include <libsync.h>
struct si_fine_fence {
- struct si_resource *buf;
- unsigned offset;
+ struct si_resource *buf;
+ unsigned offset;
};
struct si_multi_fence {
- struct pipe_reference reference;
- struct pipe_fence_handle *gfx;
- struct pipe_fence_handle *sdma;
- struct tc_unflushed_batch_token *tc_token;
- struct util_queue_fence ready;
-
- /* If the context wasn't flushed at fence creation, this is non-NULL. */
- struct {
- struct si_context *ctx;
- unsigned ib_index;
- } gfx_unflushed;
-
- struct si_fine_fence fine;
+ struct pipe_reference reference;
+ struct pipe_fence_handle *gfx;
+ struct pipe_fence_handle *sdma;
+ struct tc_unflushed_batch_token *tc_token;
+ struct util_queue_fence ready;
+
+ /* If the context wasn't flushed at fence creation, this is non-NULL. */
+ struct {
+ struct si_context *ctx;
+ unsigned ib_index;
+ } gfx_unflushed;
+
+ struct si_fine_fence fine;
};
/**
* \param old_value Previous fence value (for a bug workaround)
* \param new_value Fence value to write for this event.
*/
-void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
- unsigned event, unsigned event_flags,
- unsigned dst_sel, unsigned int_sel, unsigned data_sel,
- struct si_resource *buf, uint64_t va,
- uint32_t new_fence, unsigned query_type)
+void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event,
+ unsigned event_flags, unsigned dst_sel, unsigned int_sel, unsigned data_sel,
+ struct si_resource *buf, uint64_t va, uint32_t new_fence,
+ unsigned query_type)
{
- unsigned op = EVENT_TYPE(event) |
- EVENT_INDEX(event == V_028A90_CS_DONE ||
- event == V_028A90_PS_DONE ? 6 : 5) |
- event_flags;
- unsigned sel = EOP_DST_SEL(dst_sel) |
- EOP_INT_SEL(int_sel) |
- EOP_DATA_SEL(data_sel);
- bool compute_ib = !ctx->has_graphics ||
- cs == ctx->prim_discard_compute_cs;
-
- if (ctx->chip_class >= GFX9 ||
- (compute_ib && ctx->chip_class >= GFX7)) {
- /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
- * counters) must immediately precede every timestamp event to
- * prevent a GPU hang on GFX9.
- *
- * Occlusion queries don't need to do it here, because they
- * always do ZPASS_DONE before the timestamp.
- */
- if (ctx->chip_class == GFX9 && !compute_ib &&
- query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
- query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
- query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
- struct si_resource *scratch = ctx->eop_bug_scratch;
-
- assert(16 * ctx->screen->info.num_render_backends <=
- scratch->b.b.width0);
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
- radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
- radeon_emit(cs, scratch->gpu_address);
- radeon_emit(cs, scratch->gpu_address >> 32);
-
- radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch,
- RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
- }
-
- radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 6 : 5, 0));
- radeon_emit(cs, op);
- radeon_emit(cs, sel);
- radeon_emit(cs, va); /* address lo */
- radeon_emit(cs, va >> 32); /* address hi */
- radeon_emit(cs, new_fence); /* immediate data lo */
- radeon_emit(cs, 0); /* immediate data hi */
- if (ctx->chip_class >= GFX9)
- radeon_emit(cs, 0); /* unused */
- } else {
- if (ctx->chip_class == GFX7 ||
- ctx->chip_class == GFX8) {
- struct si_resource *scratch = ctx->eop_bug_scratch;
- uint64_t va = scratch->gpu_address;
-
- /* Two EOP events are required to make all engines go idle
- * (and optional cache flushes executed) before the timestamp
- * is written.
- */
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
- radeon_emit(cs, op);
- radeon_emit(cs, va);
- radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
- radeon_emit(cs, 0); /* immediate data */
- radeon_emit(cs, 0); /* unused */
-
- radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch,
- RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
- }
-
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
- radeon_emit(cs, op);
- radeon_emit(cs, va);
- radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
- radeon_emit(cs, new_fence); /* immediate data */
- radeon_emit(cs, 0); /* unused */
- }
-
- if (buf) {
- radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_WRITE,
- RADEON_PRIO_QUERY);
- }
+ unsigned op = EVENT_TYPE(event) |
+ EVENT_INDEX(event == V_028A90_CS_DONE || event == V_028A90_PS_DONE ? 6 : 5) |
+ event_flags;
+ unsigned sel = EOP_DST_SEL(dst_sel) | EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel);
+ bool compute_ib = !ctx->has_graphics || cs == ctx->prim_discard_compute_cs;
+
+ if (ctx->chip_class >= GFX9 || (compute_ib && ctx->chip_class >= GFX7)) {
+ /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
+ * counters) must immediately precede every timestamp event to
+ * prevent a GPU hang on GFX9.
+ *
+ * Occlusion queries don't need to do it here, because they
+ * always do ZPASS_DONE before the timestamp.
+ */
+ if (ctx->chip_class == GFX9 && !compute_ib && query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
+ query_type != PIPE_QUERY_OCCLUSION_PREDICATE &&
+ query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+ struct si_resource *scratch = ctx->eop_bug_scratch;
+
+ assert(16 * ctx->screen->info.num_render_backends <= scratch->b.b.width0);
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+ radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
+ radeon_emit(cs, scratch->gpu_address);
+ radeon_emit(cs, scratch->gpu_address >> 32);
+
+ radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch, RADEON_USAGE_WRITE,
+ RADEON_PRIO_QUERY);
+ }
+
+ radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 6 : 5, 0));
+ radeon_emit(cs, op);
+ radeon_emit(cs, sel);
+ radeon_emit(cs, va); /* address lo */
+ radeon_emit(cs, va >> 32); /* address hi */
+ radeon_emit(cs, new_fence); /* immediate data lo */
+ radeon_emit(cs, 0); /* immediate data hi */
+ if (ctx->chip_class >= GFX9)
+ radeon_emit(cs, 0); /* unused */
+ } else {
+ if (ctx->chip_class == GFX7 || ctx->chip_class == GFX8) {
+ struct si_resource *scratch = ctx->eop_bug_scratch;
+ uint64_t va = scratch->gpu_address;
+
+ /* Two EOP events are required to make all engines go idle
+ * (and optional cache flushes executed) before the timestamp
+ * is written.
+ */
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+ radeon_emit(cs, op);
+ radeon_emit(cs, va);
+ radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
+ radeon_emit(cs, 0); /* immediate data */
+ radeon_emit(cs, 0); /* unused */
+
+ radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch, RADEON_USAGE_WRITE,
+ RADEON_PRIO_QUERY);
+ }
+
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+ radeon_emit(cs, op);
+ radeon_emit(cs, va);
+ radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
+ radeon_emit(cs, new_fence); /* immediate data */
+ radeon_emit(cs, 0); /* unused */
+ }
+
+ if (buf) {
+ radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+ }
}
unsigned si_cp_write_fence_dwords(struct si_screen *screen)
{
- unsigned dwords = 6;
+ unsigned dwords = 6;
- if (screen->info.chip_class == GFX7 ||
- screen->info.chip_class == GFX8)
- dwords *= 2;
+ if (screen->info.chip_class == GFX7 || screen->info.chip_class == GFX8)
+ dwords *= 2;
- return dwords;
+ return dwords;
}
-void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
- uint64_t va, uint32_t ref, uint32_t mask, unsigned flags)
+void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, uint64_t va, uint32_t ref,
+ uint32_t mask, unsigned flags)
{
- radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
- radeon_emit(cs, WAIT_REG_MEM_MEM_SPACE(1) | flags);
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
- radeon_emit(cs, ref); /* reference value */
- radeon_emit(cs, mask); /* mask */
- radeon_emit(cs, 4); /* poll interval */
+ radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+ radeon_emit(cs, WAIT_REG_MEM_MEM_SPACE(1) | flags);
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ radeon_emit(cs, ref); /* reference value */
+ radeon_emit(cs, mask); /* mask */
+ radeon_emit(cs, 4); /* poll interval */
}
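
si_cp_wait_mem asks the CP to poll a memory word until (value & mask) compares against ref; the flags argument selects the compare function and the final dword is the poll interval. A CPU-side analog of what the packet requests, shown for an "equal" compare; wait_mem_equal and the busy loop are illustrative, not driver code.

   /* CPU-side analog of WAIT_REG_MEM with an "equal" compare: spin until
    * (*addr & mask) == ref. The CP does this at the configured poll interval
    * instead of busy-waiting. */
   #include <stdint.h>

   static void wait_mem_equal(volatile const uint32_t *addr, uint32_t ref, uint32_t mask)
   {
      while ((*addr & mask) != ref)
         ; /* a real implementation would pause/yield between polls */
   }

   int main(void)
   {
      uint32_t fence_word = 1; /* pretend the GPU already wrote the fence */
      wait_mem_equal(&fence_word, 1, 0xffffffff);
      return 0;
   }
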
-static void si_add_fence_dependency(struct si_context *sctx,
- struct pipe_fence_handle *fence)
+static void si_add_fence_dependency(struct si_context *sctx, struct pipe_fence_handle *fence)
{
- struct radeon_winsys *ws = sctx->ws;
+ struct radeon_winsys *ws = sctx->ws;
- if (sctx->sdma_cs)
- ws->cs_add_fence_dependency(sctx->sdma_cs, fence, 0);
- ws->cs_add_fence_dependency(sctx->gfx_cs, fence, 0);
+ if (sctx->sdma_cs)
+ ws->cs_add_fence_dependency(sctx->sdma_cs, fence, 0);
+ ws->cs_add_fence_dependency(sctx->gfx_cs, fence, 0);
}
-static void si_add_syncobj_signal(struct si_context *sctx,
- struct pipe_fence_handle *fence)
+static void si_add_syncobj_signal(struct si_context *sctx, struct pipe_fence_handle *fence)
{
- sctx->ws->cs_add_syncobj_signal(sctx->gfx_cs, fence);
+ sctx->ws->cs_add_syncobj_signal(sctx->gfx_cs, fence);
}
-static void si_fence_reference(struct pipe_screen *screen,
- struct pipe_fence_handle **dst,
- struct pipe_fence_handle *src)
+static void si_fence_reference(struct pipe_screen *screen, struct pipe_fence_handle **dst,
+ struct pipe_fence_handle *src)
{
- struct radeon_winsys *ws = ((struct si_screen*)screen)->ws;
- struct si_multi_fence **sdst = (struct si_multi_fence **)dst;
- struct si_multi_fence *ssrc = (struct si_multi_fence *)src;
-
- if (pipe_reference(&(*sdst)->reference, &ssrc->reference)) {
- ws->fence_reference(&(*sdst)->gfx, NULL);
- ws->fence_reference(&(*sdst)->sdma, NULL);
- tc_unflushed_batch_token_reference(&(*sdst)->tc_token, NULL);
- si_resource_reference(&(*sdst)->fine.buf, NULL);
- FREE(*sdst);
- }
- *sdst = ssrc;
+ struct radeon_winsys *ws = ((struct si_screen *)screen)->ws;
+ struct si_multi_fence **sdst = (struct si_multi_fence **)dst;
+ struct si_multi_fence *ssrc = (struct si_multi_fence *)src;
+
+ if (pipe_reference(&(*sdst)->reference, &ssrc->reference)) {
+ ws->fence_reference(&(*sdst)->gfx, NULL);
+ ws->fence_reference(&(*sdst)->sdma, NULL);
+ tc_unflushed_batch_token_reference(&(*sdst)->tc_token, NULL);
+ si_resource_reference(&(*sdst)->fine.buf, NULL);
+ FREE(*sdst);
+ }
+ *sdst = ssrc;
}
static struct si_multi_fence *si_create_multi_fence()
{
- struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
- if (!fence)
- return NULL;
+ struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
+ if (!fence)
+ return NULL;
- pipe_reference_init(&fence->reference, 1);
- util_queue_fence_init(&fence->ready);
+ pipe_reference_init(&fence->reference, 1);
+ util_queue_fence_init(&fence->ready);
- return fence;
+ return fence;
}
struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
- struct tc_unflushed_batch_token *tc_token)
+ struct tc_unflushed_batch_token *tc_token)
{
- struct si_multi_fence *fence = si_create_multi_fence();
- if (!fence)
- return NULL;
+ struct si_multi_fence *fence = si_create_multi_fence();
+ if (!fence)
+ return NULL;
- util_queue_fence_reset(&fence->ready);
- tc_unflushed_batch_token_reference(&fence->tc_token, tc_token);
+ util_queue_fence_reset(&fence->ready);
+ tc_unflushed_batch_token_reference(&fence->tc_token, tc_token);
- return (struct pipe_fence_handle *)fence;
+ return (struct pipe_fence_handle *)fence;
}
-static bool si_fine_fence_signaled(struct radeon_winsys *rws,
- const struct si_fine_fence *fine)
+static bool si_fine_fence_signaled(struct radeon_winsys *rws, const struct si_fine_fence *fine)
{
- char *map = rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ |
- PIPE_TRANSFER_UNSYNCHRONIZED);
- if (!map)
- return false;
+ char *map =
+ rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ | PIPE_TRANSFER_UNSYNCHRONIZED);
+ if (!map)
+ return false;
- uint32_t *fence = (uint32_t*)(map + fine->offset);
- return *fence != 0;
+ uint32_t *fence = (uint32_t *)(map + fine->offset);
+ return *fence != 0;
}
-static void si_fine_fence_set(struct si_context *ctx,
- struct si_fine_fence *fine,
- unsigned flags)
+static void si_fine_fence_set(struct si_context *ctx, struct si_fine_fence *fine, unsigned flags)
{
- uint32_t *fence_ptr;
-
- assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1);
-
- /* Use cached system memory for the fence. */
- u_upload_alloc(ctx->cached_gtt_allocator, 0, 4, 4,
- &fine->offset, (struct pipe_resource **)&fine->buf, (void **)&fence_ptr);
- if (!fine->buf)
- return;
-
- *fence_ptr = 0;
-
- if (flags & PIPE_FLUSH_TOP_OF_PIPE) {
- uint32_t value = 0x80000000;
-
- si_cp_write_data(ctx, fine->buf, fine->offset, 4,
- V_370_MEM, V_370_PFP, &value);
- } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) {
- uint64_t fence_va = fine->buf->gpu_address + fine->offset;
-
- radeon_add_to_buffer_list(ctx, ctx->gfx_cs, fine->buf,
- RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
- si_cp_release_mem(ctx, ctx->gfx_cs,
- V_028A90_BOTTOM_OF_PIPE_TS, 0,
- EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
- EOP_DATA_SEL_VALUE_32BIT,
- NULL, fence_va, 0x80000000,
- PIPE_QUERY_GPU_FINISHED);
- } else {
- assert(false);
- }
+ uint32_t *fence_ptr;
+
+ assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1);
+
+ /* Use cached system memory for the fence. */
+ u_upload_alloc(ctx->cached_gtt_allocator, 0, 4, 4, &fine->offset,
+ (struct pipe_resource **)&fine->buf, (void **)&fence_ptr);
+ if (!fine->buf)
+ return;
+
+ *fence_ptr = 0;
+
+ if (flags & PIPE_FLUSH_TOP_OF_PIPE) {
+ uint32_t value = 0x80000000;
+
+ si_cp_write_data(ctx, fine->buf, fine->offset, 4, V_370_MEM, V_370_PFP, &value);
+ } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) {
+ uint64_t fence_va = fine->buf->gpu_address + fine->offset;
+
+ radeon_add_to_buffer_list(ctx, ctx->gfx_cs, fine->buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+ si_cp_release_mem(ctx, ctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
+ EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, NULL, fence_va, 0x80000000,
+ PIPE_QUERY_GPU_FINISHED);
+ } else {
+ assert(false);
+ }
}
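
A fine fence is a single dword in cached GTT memory: it starts at 0, and either the PFP writes 0x80000000 immediately (top of pipe) or a bottom-of-pipe RELEASE_MEM writes it once all prior work has retired; si_fine_fence_signaled then simply checks the mapped word for a non-zero value. A minimal host-visible model of that handshake, with plain memory standing in for the GTT buffer:

   /* Host-side model of the fine-fence handshake: the writer sets the sentinel,
    * the reader treats any non-zero value as "signaled". */
   #include <stdbool.h>
   #include <stdint.h>
   #include <stdio.h>

   #define FINE_FENCE_SIGNALED 0x80000000u

   static bool fine_fence_signaled(const uint32_t *fence_word)
   {
      return *fence_word != 0; /* matches the non-zero test in si_fine_fence_signaled */
   }

   int main(void)
   {
      uint32_t fence_word = 0;          /* the uploaded dword, cleared before use */
      printf("before: %d\n", fine_fence_signaled(&fence_word));
      fence_word = FINE_FENCE_SIGNALED; /* what the GPU write lands as */
      printf("after:  %d\n", fine_fence_signaled(&fence_word));
      return 0;
   }
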
-static bool si_fence_finish(struct pipe_screen *screen,
- struct pipe_context *ctx,
- struct pipe_fence_handle *fence,
- uint64_t timeout)
+static bool si_fence_finish(struct pipe_screen *screen, struct pipe_context *ctx,
+ struct pipe_fence_handle *fence, uint64_t timeout)
{
- struct radeon_winsys *rws = ((struct si_screen*)screen)->ws;
- struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
- struct si_context *sctx;
- int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
-
- ctx = threaded_context_unwrap_sync(ctx);
- sctx = (struct si_context*)(ctx ? ctx : NULL);
-
- if (!util_queue_fence_is_signalled(&sfence->ready)) {
- if (sfence->tc_token) {
- /* Ensure that si_flush_from_st will be called for
- * this fence, but only if we're in the API thread
- * where the context is current.
- *
- * Note that the batch containing the flush may already
- * be in flight in the driver thread, so the fence
- * may not be ready yet when this call returns.
- */
- threaded_context_flush(ctx, sfence->tc_token,
- timeout == 0);
- }
-
- if (!timeout)
- return false;
-
- if (timeout == PIPE_TIMEOUT_INFINITE) {
- util_queue_fence_wait(&sfence->ready);
- } else {
- if (!util_queue_fence_wait_timeout(&sfence->ready, abs_timeout))
- return false;
- }
-
- if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
- int64_t time = os_time_get_nano();
- timeout = abs_timeout > time ? abs_timeout - time : 0;
- }
- }
-
- if (sfence->sdma) {
- if (!rws->fence_wait(rws, sfence->sdma, timeout))
- return false;
-
- /* Recompute the timeout after waiting. */
- if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
- int64_t time = os_time_get_nano();
- timeout = abs_timeout > time ? abs_timeout - time : 0;
- }
- }
-
- if (!sfence->gfx)
- return true;
-
- if (sfence->fine.buf &&
- si_fine_fence_signaled(rws, &sfence->fine)) {
- rws->fence_reference(&sfence->gfx, NULL);
- si_resource_reference(&sfence->fine.buf, NULL);
- return true;
- }
-
- /* Flush the gfx IB if it hasn't been flushed yet. */
- if (sctx && sfence->gfx_unflushed.ctx == sctx &&
- sfence->gfx_unflushed.ib_index == sctx->num_gfx_cs_flushes) {
- /* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile)
- * spec says:
- *
- * "If the sync object being blocked upon will not be
- * signaled in finite time (for example, by an associated
- * fence command issued previously, but not yet flushed to
- * the graphics pipeline), then ClientWaitSync may hang
- * forever. To help prevent this behavior, if
- * ClientWaitSync is called and all of the following are
- * true:
- *
- * * the SYNC_FLUSH_COMMANDS_BIT bit is set in flags,
- * * sync is unsignaled when ClientWaitSync is called,
- * * and the calls to ClientWaitSync and FenceSync were
- * issued from the same context,
- *
- * then the GL will behave as if the equivalent of Flush
- * were inserted immediately after the creation of sync."
- *
- * This means we need to flush for such fences even when we're
- * not going to wait.
- */
- si_flush_gfx_cs(sctx,
- (timeout ? 0 : PIPE_FLUSH_ASYNC) |
- RADEON_FLUSH_START_NEXT_GFX_IB_NOW,
- NULL);
- sfence->gfx_unflushed.ctx = NULL;
-
- if (!timeout)
- return false;
-
- /* Recompute the timeout after all that. */
- if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
- int64_t time = os_time_get_nano();
- timeout = abs_timeout > time ? abs_timeout - time : 0;
- }
- }
-
- if (rws->fence_wait(rws, sfence->gfx, timeout))
- return true;
-
- /* Re-check in case the GPU is slow or hangs, but the commands before
- * the fine-grained fence have completed. */
- if (sfence->fine.buf &&
- si_fine_fence_signaled(rws, &sfence->fine))
- return true;
-
- return false;
+ struct radeon_winsys *rws = ((struct si_screen *)screen)->ws;
+ struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+ struct si_context *sctx;
+ int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
+
+ ctx = threaded_context_unwrap_sync(ctx);
+ sctx = (struct si_context *)(ctx ? ctx : NULL);
+
+ if (!util_queue_fence_is_signalled(&sfence->ready)) {
+ if (sfence->tc_token) {
+ /* Ensure that si_flush_from_st will be called for
+ * this fence, but only if we're in the API thread
+ * where the context is current.
+ *
+ * Note that the batch containing the flush may already
+ * be in flight in the driver thread, so the fence
+ * may not be ready yet when this call returns.
+ */
+ threaded_context_flush(ctx, sfence->tc_token, timeout == 0);
+ }
+
+ if (!timeout)
+ return false;
+
+ if (timeout == PIPE_TIMEOUT_INFINITE) {
+ util_queue_fence_wait(&sfence->ready);
+ } else {
+ if (!util_queue_fence_wait_timeout(&sfence->ready, abs_timeout))
+ return false;
+ }
+
+ if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+ int64_t time = os_time_get_nano();
+ timeout = abs_timeout > time ? abs_timeout - time : 0;
+ }
+ }
+
+ if (sfence->sdma) {
+ if (!rws->fence_wait(rws, sfence->sdma, timeout))
+ return false;
+
+ /* Recompute the timeout after waiting. */
+ if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+ int64_t time = os_time_get_nano();
+ timeout = abs_timeout > time ? abs_timeout - time : 0;
+ }
+ }
+
+ if (!sfence->gfx)
+ return true;
+
+ if (sfence->fine.buf && si_fine_fence_signaled(rws, &sfence->fine)) {
+ rws->fence_reference(&sfence->gfx, NULL);
+ si_resource_reference(&sfence->fine.buf, NULL);
+ return true;
+ }
+
+ /* Flush the gfx IB if it hasn't been flushed yet. */
+ if (sctx && sfence->gfx_unflushed.ctx == sctx &&
+ sfence->gfx_unflushed.ib_index == sctx->num_gfx_cs_flushes) {
+ /* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile)
+ * spec says:
+ *
+ * "If the sync object being blocked upon will not be
+ * signaled in finite time (for example, by an associated
+ * fence command issued previously, but not yet flushed to
+ * the graphics pipeline), then ClientWaitSync may hang
+ * forever. To help prevent this behavior, if
+ * ClientWaitSync is called and all of the following are
+ * true:
+ *
+ * * the SYNC_FLUSH_COMMANDS_BIT bit is set in flags,
+ * * sync is unsignaled when ClientWaitSync is called,
+ * * and the calls to ClientWaitSync and FenceSync were
+ * issued from the same context,
+ *
+ * then the GL will behave as if the equivalent of Flush
+ * were inserted immediately after the creation of sync."
+ *
+ * This means we need to flush for such fences even when we're
+ * not going to wait.
+ */
+ si_flush_gfx_cs(sctx, (timeout ? 0 : PIPE_FLUSH_ASYNC) | RADEON_FLUSH_START_NEXT_GFX_IB_NOW,
+ NULL);
+ sfence->gfx_unflushed.ctx = NULL;
+
+ if (!timeout)
+ return false;
+
+ /* Recompute the timeout after all that. */
+ if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+ int64_t time = os_time_get_nano();
+ timeout = abs_timeout > time ? abs_timeout - time : 0;
+ }
+ }
+
+ if (rws->fence_wait(rws, sfence->gfx, timeout))
+ return true;
+
+ /* Re-check in case the GPU is slow or hangs, but the commands before
+ * the fine-grained fence have completed. */
+ if (sfence->fine.buf && si_fine_fence_signaled(rws, &sfence->fine))
+ return true;
+
+ return false;
}
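
si_fence_finish converts the relative timeout to an absolute deadline once, then recomputes the remaining budget after each wait (the thread-queue fence, the SDMA fence, and the deferred gfx flush) so the total never exceeds what the caller asked for. The recurring three lines, factored out as a sketch; now_ns() is a stand-in for os_time_get_nano() and remaining_timeout is an illustrative helper, not driver API.

   /* Sketch of the "remaining timeout" recomputation used after each wait. */
   #include <stdint.h>
   #include <time.h>

   static int64_t now_ns(void)
   {
      struct timespec ts;
      clock_gettime(CLOCK_MONOTONIC, &ts);
      return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
   }

   static uint64_t remaining_timeout(uint64_t timeout, int64_t abs_timeout)
   {
      if (!timeout || timeout == UINT64_MAX) /* 0 and the infinite sentinel pass through */
         return timeout;
      int64_t time = now_ns();
      return abs_timeout > time ? (uint64_t)(abs_timeout - time) : 0;
   }

   int main(void)
   {
      int64_t deadline = now_ns() + 5000000; /* 5 ms from now */
      return remaining_timeout(5000000, deadline) <= 5000000 ? 0 : 1;
   }
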
-static void si_create_fence_fd(struct pipe_context *ctx,
- struct pipe_fence_handle **pfence, int fd,
- enum pipe_fd_type type)
+static void si_create_fence_fd(struct pipe_context *ctx, struct pipe_fence_handle **pfence, int fd,
+ enum pipe_fd_type type)
{
- struct si_screen *sscreen = (struct si_screen*)ctx->screen;
- struct radeon_winsys *ws = sscreen->ws;
- struct si_multi_fence *sfence;
+ struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+ struct radeon_winsys *ws = sscreen->ws;
+ struct si_multi_fence *sfence;
- *pfence = NULL;
+ *pfence = NULL;
- sfence = si_create_multi_fence();
- if (!sfence)
- return;
+ sfence = si_create_multi_fence();
+ if (!sfence)
+ return;
- switch (type) {
- case PIPE_FD_TYPE_NATIVE_SYNC:
- if (!sscreen->info.has_fence_to_handle)
- goto finish;
+ switch (type) {
+ case PIPE_FD_TYPE_NATIVE_SYNC:
+ if (!sscreen->info.has_fence_to_handle)
+ goto finish;
- sfence->gfx = ws->fence_import_sync_file(ws, fd);
- break;
+ sfence->gfx = ws->fence_import_sync_file(ws, fd);
+ break;
- case PIPE_FD_TYPE_SYNCOBJ:
- if (!sscreen->info.has_syncobj)
- goto finish;
+ case PIPE_FD_TYPE_SYNCOBJ:
+ if (!sscreen->info.has_syncobj)
+ goto finish;
- sfence->gfx = ws->fence_import_syncobj(ws, fd);
- break;
+ sfence->gfx = ws->fence_import_syncobj(ws, fd);
+ break;
- default:
- unreachable("bad fence fd type when importing");
- }
+ default:
+ unreachable("bad fence fd type when importing");
+ }
finish:
- if (!sfence->gfx) {
- FREE(sfence);
- return;
- }
+ if (!sfence->gfx) {
+ FREE(sfence);
+ return;
+ }
- *pfence = (struct pipe_fence_handle*)sfence;
+ *pfence = (struct pipe_fence_handle *)sfence;
}
-static int si_fence_get_fd(struct pipe_screen *screen,
- struct pipe_fence_handle *fence)
+static int si_fence_get_fd(struct pipe_screen *screen, struct pipe_fence_handle *fence)
{
- struct si_screen *sscreen = (struct si_screen*)screen;
- struct radeon_winsys *ws = sscreen->ws;
- struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
- int gfx_fd = -1, sdma_fd = -1;
-
- if (!sscreen->info.has_fence_to_handle)
- return -1;
-
- util_queue_fence_wait(&sfence->ready);
-
- /* Deferred fences aren't supported. */
- assert(!sfence->gfx_unflushed.ctx);
- if (sfence->gfx_unflushed.ctx)
- return -1;
-
- if (sfence->sdma) {
- sdma_fd = ws->fence_export_sync_file(ws, sfence->sdma);
- if (sdma_fd == -1)
- return -1;
- }
- if (sfence->gfx) {
- gfx_fd = ws->fence_export_sync_file(ws, sfence->gfx);
- if (gfx_fd == -1) {
- if (sdma_fd != -1)
- close(sdma_fd);
- return -1;
- }
- }
-
- /* If we don't have FDs at this point, it means we don't have fences
- * either. */
- if (sdma_fd == -1 && gfx_fd == -1)
- return ws->export_signalled_sync_file(ws);
- if (sdma_fd == -1)
- return gfx_fd;
- if (gfx_fd == -1)
- return sdma_fd;
-
- /* Get a fence that will be a combination of both fences. */
- sync_accumulate("radeonsi", &gfx_fd, sdma_fd);
- close(sdma_fd);
- return gfx_fd;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct radeon_winsys *ws = sscreen->ws;
+ struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+ int gfx_fd = -1, sdma_fd = -1;
+
+ if (!sscreen->info.has_fence_to_handle)
+ return -1;
+
+ util_queue_fence_wait(&sfence->ready);
+
+ /* Deferred fences aren't supported. */
+ assert(!sfence->gfx_unflushed.ctx);
+ if (sfence->gfx_unflushed.ctx)
+ return -1;
+
+ if (sfence->sdma) {
+ sdma_fd = ws->fence_export_sync_file(ws, sfence->sdma);
+ if (sdma_fd == -1)
+ return -1;
+ }
+ if (sfence->gfx) {
+ gfx_fd = ws->fence_export_sync_file(ws, sfence->gfx);
+ if (gfx_fd == -1) {
+ if (sdma_fd != -1)
+ close(sdma_fd);
+ return -1;
+ }
+ }
+
+ /* If we don't have FDs at this point, it means we don't have fences
+ * either. */
+ if (sdma_fd == -1 && gfx_fd == -1)
+ return ws->export_signalled_sync_file(ws);
+ if (sdma_fd == -1)
+ return gfx_fd;
+ if (gfx_fd == -1)
+ return sdma_fd;
+
+ /* Get a fence that will be a combination of both fences. */
+ sync_accumulate("radeonsi", &gfx_fd, sdma_fd);
+ close(sdma_fd);
+ return gfx_fd;
}
-static void si_flush_from_st(struct pipe_context *ctx,
- struct pipe_fence_handle **fence,
- unsigned flags)
+static void si_flush_from_st(struct pipe_context *ctx, struct pipe_fence_handle **fence,
+ unsigned flags)
{
- struct pipe_screen *screen = ctx->screen;
- struct si_context *sctx = (struct si_context *)ctx;
- struct radeon_winsys *ws = sctx->ws;
- struct pipe_fence_handle *gfx_fence = NULL;
- struct pipe_fence_handle *sdma_fence = NULL;
- bool deferred_fence = false;
- struct si_fine_fence fine = {};
- unsigned rflags = PIPE_FLUSH_ASYNC;
-
- if (flags & PIPE_FLUSH_END_OF_FRAME)
- rflags |= PIPE_FLUSH_END_OF_FRAME;
-
- if (flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) {
- assert(flags & PIPE_FLUSH_DEFERRED);
- assert(fence);
-
- si_fine_fence_set(sctx, &fine, flags);
- }
-
- /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
- if (sctx->sdma_cs)
- si_flush_dma_cs(sctx, rflags, fence ? &sdma_fence : NULL);
-
- if (!radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size)) {
- if (fence)
- ws->fence_reference(&gfx_fence, sctx->last_gfx_fence);
- if (!(flags & PIPE_FLUSH_DEFERRED))
- ws->cs_sync_flush(sctx->gfx_cs);
- } else {
- /* Instead of flushing, create a deferred fence. Constraints:
- * - The state tracker must allow a deferred flush.
- * - The state tracker must request a fence.
- * - fence_get_fd is not allowed.
- * Thread safety in fence_finish must be ensured by the state tracker.
- */
- if (flags & PIPE_FLUSH_DEFERRED &&
- !(flags & PIPE_FLUSH_FENCE_FD) &&
- fence) {
- gfx_fence = sctx->ws->cs_get_next_fence(sctx->gfx_cs);
- deferred_fence = true;
- } else {
- si_flush_gfx_cs(sctx, rflags, fence ? &gfx_fence : NULL);
- }
- }
-
- /* Both engines can signal out of order, so we need to keep both fences. */
- if (fence) {
- struct si_multi_fence *multi_fence;
-
- if (flags & TC_FLUSH_ASYNC) {
- multi_fence = (struct si_multi_fence *)*fence;
- assert(multi_fence);
- } else {
- multi_fence = si_create_multi_fence();
- if (!multi_fence) {
- ws->fence_reference(&sdma_fence, NULL);
- ws->fence_reference(&gfx_fence, NULL);
- goto finish;
- }
-
- screen->fence_reference(screen, fence, NULL);
- *fence = (struct pipe_fence_handle*)multi_fence;
- }
-
- /* If both fences are NULL, fence_finish will always return true. */
- multi_fence->gfx = gfx_fence;
- multi_fence->sdma = sdma_fence;
-
- if (deferred_fence) {
- multi_fence->gfx_unflushed.ctx = sctx;
- multi_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes;
- }
-
- multi_fence->fine = fine;
- fine.buf = NULL;
-
- if (flags & TC_FLUSH_ASYNC) {
- util_queue_fence_signal(&multi_fence->ready);
- tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL);
- }
- }
- assert(!fine.buf);
+ struct pipe_screen *screen = ctx->screen;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct radeon_winsys *ws = sctx->ws;
+ struct pipe_fence_handle *gfx_fence = NULL;
+ struct pipe_fence_handle *sdma_fence = NULL;
+ bool deferred_fence = false;
+ struct si_fine_fence fine = {};
+ unsigned rflags = PIPE_FLUSH_ASYNC;
+
+ if (flags & PIPE_FLUSH_END_OF_FRAME)
+ rflags |= PIPE_FLUSH_END_OF_FRAME;
+
+ if (flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) {
+ assert(flags & PIPE_FLUSH_DEFERRED);
+ assert(fence);
+
+ si_fine_fence_set(sctx, &fine, flags);
+ }
+
+ /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
+ if (sctx->sdma_cs)
+ si_flush_dma_cs(sctx, rflags, fence ? &sdma_fence : NULL);
+
+ if (!radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size)) {
+ if (fence)
+ ws->fence_reference(&gfx_fence, sctx->last_gfx_fence);
+ if (!(flags & PIPE_FLUSH_DEFERRED))
+ ws->cs_sync_flush(sctx->gfx_cs);
+ } else {
+ /* Instead of flushing, create a deferred fence. Constraints:
+ * - The state tracker must allow a deferred flush.
+ * - The state tracker must request a fence.
+ * - fence_get_fd is not allowed.
+ * Thread safety in fence_finish must be ensured by the state tracker.
+ */
+ if (flags & PIPE_FLUSH_DEFERRED && !(flags & PIPE_FLUSH_FENCE_FD) && fence) {
+ gfx_fence = sctx->ws->cs_get_next_fence(sctx->gfx_cs);
+ deferred_fence = true;
+ } else {
+ si_flush_gfx_cs(sctx, rflags, fence ? &gfx_fence : NULL);
+ }
+ }
+
+ /* Both engines can signal out of order, so we need to keep both fences. */
+ if (fence) {
+ struct si_multi_fence *multi_fence;
+
+ if (flags & TC_FLUSH_ASYNC) {
+ multi_fence = (struct si_multi_fence *)*fence;
+ assert(multi_fence);
+ } else {
+ multi_fence = si_create_multi_fence();
+ if (!multi_fence) {
+ ws->fence_reference(&sdma_fence, NULL);
+ ws->fence_reference(&gfx_fence, NULL);
+ goto finish;
+ }
+
+ screen->fence_reference(screen, fence, NULL);
+ *fence = (struct pipe_fence_handle *)multi_fence;
+ }
+
+ /* If both fences are NULL, fence_finish will always return true. */
+ multi_fence->gfx = gfx_fence;
+ multi_fence->sdma = sdma_fence;
+
+ if (deferred_fence) {
+ multi_fence->gfx_unflushed.ctx = sctx;
+ multi_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes;
+ }
+
+ multi_fence->fine = fine;
+ fine.buf = NULL;
+
+ if (flags & TC_FLUSH_ASYNC) {
+ util_queue_fence_signal(&multi_fence->ready);
+ tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL);
+ }
+ }
+ assert(!fine.buf);
finish:
- if (!(flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC))) {
- if (sctx->sdma_cs)
- ws->cs_sync_flush(sctx->sdma_cs);
- ws->cs_sync_flush(sctx->gfx_cs);
- }
+ if (!(flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC))) {
+ if (sctx->sdma_cs)
+ ws->cs_sync_flush(sctx->sdma_cs);
+ ws->cs_sync_flush(sctx->gfx_cs);
+ }
}
-static void si_fence_server_signal(struct pipe_context *ctx,
- struct pipe_fence_handle *fence)
+static void si_fence_server_signal(struct pipe_context *ctx, struct pipe_fence_handle *fence)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
-
- /* We should have at least one syncobj to signal */
- assert(sfence->sdma || sfence->gfx);
-
- if (sfence->sdma)
- si_add_syncobj_signal(sctx, sfence->sdma);
- if (sfence->gfx)
- si_add_syncobj_signal(sctx, sfence->gfx);
-
- /**
- * The spec does not require a flush here. We insert a flush
- * because syncobj based signals are not directly placed into
- * the command stream. Instead the signal happens when the
- * submission associated with the syncobj finishes execution.
- *
- * Therefore, we must make sure that we flush the pipe to avoid
- * new work being emitted and getting executed before the signal
- * operation.
- *
- * Set sctx->initial_gfx_cs_size to force IB submission even if
- * it is empty.
- */
- sctx->initial_gfx_cs_size = 0;
- si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+
+ /* We should have at least one syncobj to signal */
+ assert(sfence->sdma || sfence->gfx);
+
+ if (sfence->sdma)
+ si_add_syncobj_signal(sctx, sfence->sdma);
+ if (sfence->gfx)
+ si_add_syncobj_signal(sctx, sfence->gfx);
+
+ /**
+ * The spec does not require a flush here. We insert a flush
+ * because syncobj based signals are not directly placed into
+ * the command stream. Instead the signal happens when the
+ * submission associated with the syncobj finishes execution.
+ *
+ * Therefore, we must make sure that we flush the pipe to avoid
+ * new work being emitted and getting executed before the signal
+ * operation.
+ *
+ * Set sctx->initial_gfx_cs_size to force IB submission even if
+ * it is empty.
+ */
+ sctx->initial_gfx_cs_size = 0;
+ si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC);
}
-static void si_fence_server_sync(struct pipe_context *ctx,
- struct pipe_fence_handle *fence)
+static void si_fence_server_sync(struct pipe_context *ctx, struct pipe_fence_handle *fence)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
-
- util_queue_fence_wait(&sfence->ready);
-
- /* Unflushed fences from the same context are no-ops. */
- if (sfence->gfx_unflushed.ctx &&
- sfence->gfx_unflushed.ctx == sctx)
- return;
-
- /* All unflushed commands will not start execution before
- * this fence dependency is signalled.
- *
- * Therefore we must flush before inserting the dependency
- */
- si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC);
-
- if (sfence->sdma)
- si_add_fence_dependency(sctx, sfence->sdma);
- if (sfence->gfx)
- si_add_fence_dependency(sctx, sfence->gfx);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+
+ util_queue_fence_wait(&sfence->ready);
+
+ /* Unflushed fences from the same context are no-ops. */
+ if (sfence->gfx_unflushed.ctx && sfence->gfx_unflushed.ctx == sctx)
+ return;
+
+ /* All unflushed commands will not start execution before
+ * this fence dependency is signalled.
+ *
+ * Therefore we must flush before inserting the dependency
+ */
+ si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC);
+
+ if (sfence->sdma)
+ si_add_fence_dependency(sctx, sfence->sdma);
+ if (sfence->gfx)
+ si_add_fence_dependency(sctx, sfence->gfx);
}
void si_init_fence_functions(struct si_context *ctx)
{
- ctx->b.flush = si_flush_from_st;
- ctx->b.create_fence_fd = si_create_fence_fd;
- ctx->b.fence_server_sync = si_fence_server_sync;
- ctx->b.fence_server_signal = si_fence_server_signal;
+ ctx->b.flush = si_flush_from_st;
+ ctx->b.create_fence_fd = si_create_fence_fd;
+ ctx->b.fence_server_sync = si_fence_server_sync;
+ ctx->b.fence_server_signal = si_fence_server_signal;
}
void si_init_screen_fence_functions(struct si_screen *screen)
{
- screen->b.fence_finish = si_fence_finish;
- screen->b.fence_reference = si_fence_reference;
- screen->b.fence_get_fd = si_fence_get_fd;
+ screen->b.fence_finish = si_fence_finish;
+ screen->b.fence_reference = si_fence_reference;
+ screen->b.fence_get_fd = si_fence_get_fd;
}
-#include "si_pipe.h"
-#include "radeon/radeon_video.h"
-#include "radeon/radeon_vce.h"
+#include "compiler/nir/nir.h"
#include "radeon/radeon_uvd_enc.h"
-#include "vl/vl_decoder.h"
-#include "vl/vl_video_buffer.h"
+#include "radeon/radeon_vce.h"
+#include "radeon/radeon_video.h"
+#include "si_pipe.h"
#include "util/u_screen.h"
#include "util/u_video.h"
-#include "compiler/nir/nir.h"
-
+#include "vl/vl_decoder.h"
+#include "vl/vl_video_buffer.h"
#include <sys/utsname.h>
static const char *si_get_vendor(struct pipe_screen *pscreen)
{
- /* Don't change this. Games such as Alien Isolation are broken if this
- * returns "Advanced Micro Devices, Inc."
- */
- return "X.Org";
+ /* Don't change this. Games such as Alien Isolation are broken if this
+ * returns "Advanced Micro Devices, Inc."
+ */
+ return "X.Org";
}
static const char *si_get_device_vendor(struct pipe_screen *pscreen)
{
- return "AMD";
+ return "AMD";
}
static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
{
- struct si_screen *sscreen = (struct si_screen *)pscreen;
-
- switch (param) {
- /* Supported features (boolean caps). */
- case PIPE_CAP_ACCELERATED:
- case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
- case PIPE_CAP_ANISOTROPIC_FILTER:
- case PIPE_CAP_POINT_SPRITE:
- case PIPE_CAP_OCCLUSION_QUERY:
- case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
- case PIPE_CAP_TEXTURE_SHADOW_LOD:
- case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE:
- case PIPE_CAP_BLEND_EQUATION_SEPARATE:
- case PIPE_CAP_TEXTURE_SWIZZLE:
- case PIPE_CAP_DEPTH_CLIP_DISABLE:
- case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE:
- case PIPE_CAP_SHADER_STENCIL_EXPORT:
- case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
- case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
- case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
- case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
- case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
- case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD:
- case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES:
- case PIPE_CAP_VERTEX_SHADER_SATURATE:
- case PIPE_CAP_SEAMLESS_CUBE_MAP:
- case PIPE_CAP_PRIMITIVE_RESTART:
- case PIPE_CAP_CONDITIONAL_RENDER:
- case PIPE_CAP_TEXTURE_BARRIER:
- case PIPE_CAP_INDEP_BLEND_ENABLE:
- case PIPE_CAP_INDEP_BLEND_FUNC:
- case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
- case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
- case PIPE_CAP_START_INSTANCE:
- case PIPE_CAP_NPOT_TEXTURES:
- case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
- case PIPE_CAP_MIXED_COLOR_DEPTH_BITS:
- case PIPE_CAP_VERTEX_COLOR_CLAMPED:
- case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
- case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
- case PIPE_CAP_TGSI_INSTANCEID:
- case PIPE_CAP_COMPUTE:
- case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
- case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
- case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
- case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
- case PIPE_CAP_CUBE_MAP_ARRAY:
- case PIPE_CAP_SAMPLE_SHADING:
- case PIPE_CAP_DRAW_INDIRECT:
- case PIPE_CAP_CLIP_HALFZ:
- case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
- case PIPE_CAP_POLYGON_OFFSET_CLAMP:
- case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
- case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
- case PIPE_CAP_TGSI_TEXCOORD:
- case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
- case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
- case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
- case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
- case PIPE_CAP_SHAREABLE_SHADERS:
- case PIPE_CAP_DEPTH_BOUNDS_TEST:
- case PIPE_CAP_SAMPLER_VIEW_TARGET:
- case PIPE_CAP_TEXTURE_QUERY_LOD:
- case PIPE_CAP_TEXTURE_GATHER_SM5:
- case PIPE_CAP_TGSI_TXQS:
- case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
- case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
- case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
- case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
- case PIPE_CAP_INVALIDATE_BUFFER:
- case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
- case PIPE_CAP_QUERY_BUFFER_OBJECT:
- case PIPE_CAP_QUERY_MEMORY_INFO:
- case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
- case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
- case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
- case PIPE_CAP_GENERATE_MIPMAP:
- case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED:
- case PIPE_CAP_STRING_MARKER:
- case PIPE_CAP_CLEAR_TEXTURE:
- case PIPE_CAP_CULL_DISTANCE:
- case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
- case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
- case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
- case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
- case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
- case PIPE_CAP_DOUBLES:
- case PIPE_CAP_TGSI_TEX_TXF_LZ:
- case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
- case PIPE_CAP_BINDLESS_TEXTURE:
- case PIPE_CAP_QUERY_TIMESTAMP:
- case PIPE_CAP_QUERY_TIME_ELAPSED:
- case PIPE_CAP_NIR_SAMPLERS_AS_DEREF:
- case PIPE_CAP_MEMOBJ:
- case PIPE_CAP_LOAD_CONSTBUF:
- case PIPE_CAP_INT64:
- case PIPE_CAP_INT64_DIVMOD:
- case PIPE_CAP_TGSI_CLOCK:
- case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
- case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
- case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
- case PIPE_CAP_TGSI_BALLOT:
- case PIPE_CAP_TGSI_VOTE:
- case PIPE_CAP_FBFETCH:
- case PIPE_CAP_COMPUTE_GRID_INFO_LAST_BLOCK:
- case PIPE_CAP_IMAGE_LOAD_FORMATTED:
- case PIPE_CAP_PREFER_COMPUTE_FOR_MULTIMEDIA:
- case PIPE_CAP_TGSI_DIV:
- case PIPE_CAP_PACKED_UNIFORMS:
- case PIPE_CAP_SHADER_SAMPLES_IDENTICAL:
- case PIPE_CAP_GL_SPIRV:
- case PIPE_CAP_DRAW_INFO_START_WITH_USER_INDICES:
- return 1;
-
- case PIPE_CAP_QUERY_SO_OVERFLOW:
- return !sscreen->use_ngg_streamout;
-
- case PIPE_CAP_POST_DEPTH_COVERAGE:
- return sscreen->info.chip_class >= GFX10;
-
- case PIPE_CAP_GRAPHICS:
- return sscreen->info.has_graphics;
-
- case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
- return !SI_BIG_ENDIAN && sscreen->info.has_userptr;
-
- case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
- return sscreen->info.has_gpu_reset_status_query;
-
- case PIPE_CAP_TEXTURE_MULTISAMPLE:
- return sscreen->info.has_2d_tiling;
-
- case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
- return SI_MAP_BUFFER_ALIGNMENT;
-
- case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
- case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
- case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
- case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
- case PIPE_CAP_MAX_VERTEX_STREAMS:
- case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
- case PIPE_CAP_MAX_WINDOW_RECTANGLES:
- return 4;
-
- case PIPE_CAP_GLSL_FEATURE_LEVEL:
- case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
- if (!sscreen->info.has_indirect_compute_dispatch)
- return 420;
- return 460;
-
- case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
- /* Optimal number for good TexSubImage performance on Polaris10. */
- return 64 * 1024 * 1024;
-
- case PIPE_CAP_GL_BEGIN_END_BUFFER_SIZE:
- return 4096 * 1024;
-
- case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
- case PIPE_CAP_MAX_SHADER_BUFFER_SIZE:
- return MIN2(sscreen->info.max_alloc_size, INT_MAX);
-
- case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
- case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
- case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
- return LLVM_VERSION_MAJOR < 9 && !sscreen->info.has_unaligned_shader_loads;
-
- case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
- return sscreen->info.has_sparse_vm_mappings ?
- RADEON_SPARSE_PAGE_SIZE : 0;
-
-
- case PIPE_CAP_UMA:
- return 0;
-
- case PIPE_CAP_FENCE_SIGNAL:
- return sscreen->info.has_syncobj;
-
- case PIPE_CAP_CONSTBUF0_FLAGS:
- return SI_RESOURCE_FLAG_32BIT;
-
- case PIPE_CAP_NATIVE_FENCE_FD:
- return sscreen->info.has_fence_to_handle;
-
- case PIPE_CAP_DRAW_PARAMETERS:
- case PIPE_CAP_MULTI_DRAW_INDIRECT:
- case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
- return sscreen->has_draw_indirect_multi;
-
- case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
- return 30;
-
- case PIPE_CAP_MAX_VARYINGS:
- return 32;
-
- case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
- return sscreen->info.chip_class <= GFX8 ?
- PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0;
-
- /* Stream output. */
- case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
- case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
- return 32*4;
-
- /* Geometry shader output. */
- case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES:
- /* gfx9 has to report 256 to make piglit/gs-max-output pass.
- * gfx8 and earlier can do 1024.
- */
- return 256;
- case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
- return 4095;
- case PIPE_CAP_MAX_GS_INVOCATIONS:
- /* The closed driver exposes 127, but 125 is the greatest
- * number that works. */
- return 125;
-
- case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
- return 2048;
-
- /* Texturing. */
- case PIPE_CAP_MAX_TEXTURE_2D_SIZE:
- return 16384;
- case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
- return 15; /* 16384 */
- case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
- if (sscreen->info.chip_class >= GFX10)
- return 14;
- /* textures support 8192, but layered rendering supports 2048 */
- return 12;
- case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
- if (sscreen->info.chip_class >= GFX10)
- return 8192;
- /* textures support 8192, but layered rendering supports 2048 */
- return 2048;
-
- /* Viewports and render targets. */
- case PIPE_CAP_MAX_VIEWPORTS:
- return SI_MAX_VIEWPORTS;
- case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS:
- case PIPE_CAP_RASTERIZER_SUBPIXEL_BITS:
- case PIPE_CAP_MAX_RENDER_TARGETS:
- return 8;
- case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
- return sscreen->info.has_eqaa_surface_allocator ? 2 : 0;
-
- case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
- case PIPE_CAP_MIN_TEXEL_OFFSET:
- return -32;
-
- case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
- case PIPE_CAP_MAX_TEXEL_OFFSET:
- return 31;
-
- case PIPE_CAP_ENDIANNESS:
- return PIPE_ENDIAN_LITTLE;
-
- case PIPE_CAP_VENDOR_ID:
- return ATI_VENDOR_ID;
- case PIPE_CAP_DEVICE_ID:
- return sscreen->info.pci_id;
- case PIPE_CAP_VIDEO_MEMORY:
- return sscreen->info.vram_size >> 20;
- case PIPE_CAP_PCI_GROUP:
- return sscreen->info.pci_domain;
- case PIPE_CAP_PCI_BUS:
- return sscreen->info.pci_bus;
- case PIPE_CAP_PCI_DEVICE:
- return sscreen->info.pci_dev;
- case PIPE_CAP_PCI_FUNCTION:
- return sscreen->info.pci_func;
- case PIPE_CAP_TGSI_ATOMINC_WRAP:
- return LLVM_VERSION_MAJOR >= 10;
-
- default:
- return u_pipe_screen_get_param_defaults(pscreen, param);
- }
+ struct si_screen *sscreen = (struct si_screen *)pscreen;
+
+ switch (param) {
+ /* Supported features (boolean caps). */
+ case PIPE_CAP_ACCELERATED:
+ case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+ case PIPE_CAP_ANISOTROPIC_FILTER:
+ case PIPE_CAP_POINT_SPRITE:
+ case PIPE_CAP_OCCLUSION_QUERY:
+ case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+ case PIPE_CAP_TEXTURE_SHADOW_LOD:
+ case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE:
+ case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+ case PIPE_CAP_TEXTURE_SWIZZLE:
+ case PIPE_CAP_DEPTH_CLIP_DISABLE:
+ case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE:
+ case PIPE_CAP_SHADER_STENCIL_EXPORT:
+ case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
+ case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
+ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+ case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+ case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+ case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD:
+ case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES:
+ case PIPE_CAP_VERTEX_SHADER_SATURATE:
+ case PIPE_CAP_SEAMLESS_CUBE_MAP:
+ case PIPE_CAP_PRIMITIVE_RESTART:
+ case PIPE_CAP_CONDITIONAL_RENDER:
+ case PIPE_CAP_TEXTURE_BARRIER:
+ case PIPE_CAP_INDEP_BLEND_ENABLE:
+ case PIPE_CAP_INDEP_BLEND_FUNC:
+ case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
+ case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
+ case PIPE_CAP_START_INSTANCE:
+ case PIPE_CAP_NPOT_TEXTURES:
+ case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
+ case PIPE_CAP_MIXED_COLOR_DEPTH_BITS:
+ case PIPE_CAP_VERTEX_COLOR_CLAMPED:
+ case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
+ case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+ case PIPE_CAP_TGSI_INSTANCEID:
+ case PIPE_CAP_COMPUTE:
+ case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+ case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
+ case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
+ case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
+ case PIPE_CAP_CUBE_MAP_ARRAY:
+ case PIPE_CAP_SAMPLE_SHADING:
+ case PIPE_CAP_DRAW_INDIRECT:
+ case PIPE_CAP_CLIP_HALFZ:
+ case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
+ case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+ case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+ case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+ case PIPE_CAP_TGSI_TEXCOORD:
+ case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
+ case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+ case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+ case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+ case PIPE_CAP_SHAREABLE_SHADERS:
+ case PIPE_CAP_DEPTH_BOUNDS_TEST:
+ case PIPE_CAP_SAMPLER_VIEW_TARGET:
+ case PIPE_CAP_TEXTURE_QUERY_LOD:
+ case PIPE_CAP_TEXTURE_GATHER_SM5:
+ case PIPE_CAP_TGSI_TXQS:
+ case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+ case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+ case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+ case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+ case PIPE_CAP_INVALIDATE_BUFFER:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
+ case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+ case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+ case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
+ case PIPE_CAP_GENERATE_MIPMAP:
+ case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED:
+ case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_CLEAR_TEXTURE:
+ case PIPE_CAP_CULL_DISTANCE:
+ case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
+ case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
+ case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
+ case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+ case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
+ case PIPE_CAP_DOUBLES:
+ case PIPE_CAP_TGSI_TEX_TXF_LZ:
+ case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+ case PIPE_CAP_BINDLESS_TEXTURE:
+ case PIPE_CAP_QUERY_TIMESTAMP:
+ case PIPE_CAP_QUERY_TIME_ELAPSED:
+ case PIPE_CAP_NIR_SAMPLERS_AS_DEREF:
+ case PIPE_CAP_MEMOBJ:
+ case PIPE_CAP_LOAD_CONSTBUF:
+ case PIPE_CAP_INT64:
+ case PIPE_CAP_INT64_DIVMOD:
+ case PIPE_CAP_TGSI_CLOCK:
+ case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
+ case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
+ case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
+ case PIPE_CAP_TGSI_BALLOT:
+ case PIPE_CAP_TGSI_VOTE:
+ case PIPE_CAP_FBFETCH:
+ case PIPE_CAP_COMPUTE_GRID_INFO_LAST_BLOCK:
+ case PIPE_CAP_IMAGE_LOAD_FORMATTED:
+ case PIPE_CAP_PREFER_COMPUTE_FOR_MULTIMEDIA:
+ case PIPE_CAP_TGSI_DIV:
+ case PIPE_CAP_PACKED_UNIFORMS:
+ case PIPE_CAP_SHADER_SAMPLES_IDENTICAL:
+ case PIPE_CAP_GL_SPIRV:
+ case PIPE_CAP_DRAW_INFO_START_WITH_USER_INDICES:
+ return 1;
+
+ case PIPE_CAP_QUERY_SO_OVERFLOW:
+ return !sscreen->use_ngg_streamout;
+
+ case PIPE_CAP_POST_DEPTH_COVERAGE:
+ return sscreen->info.chip_class >= GFX10;
+
+ case PIPE_CAP_GRAPHICS:
+ return sscreen->info.has_graphics;
+
+ case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ return !SI_BIG_ENDIAN && sscreen->info.has_userptr;
+
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+ return sscreen->info.has_gpu_reset_status_query;
+
+ case PIPE_CAP_TEXTURE_MULTISAMPLE:
+ return sscreen->info.has_2d_tiling;
+
+ case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+ return SI_MAP_BUFFER_ALIGNMENT;
+
+ case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
+ case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
+ case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
+ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+ case PIPE_CAP_MAX_VERTEX_STREAMS:
+ case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+ case PIPE_CAP_MAX_WINDOW_RECTANGLES:
+ return 4;
+
+ case PIPE_CAP_GLSL_FEATURE_LEVEL:
+ case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+ if (!sscreen->info.has_indirect_compute_dispatch)
+ return 420;
+ return 460;
+
+ case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
+ /* Optimal number for good TexSubImage performance on Polaris10. */
+ return 64 * 1024 * 1024;
+
+ case PIPE_CAP_GL_BEGIN_END_BUFFER_SIZE:
+ return 4096 * 1024;
+
+ case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
+ case PIPE_CAP_MAX_SHADER_BUFFER_SIZE:
+ return MIN2(sscreen->info.max_alloc_size, INT_MAX);
+
+ case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
+ case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
+ case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
+ return LLVM_VERSION_MAJOR < 9 && !sscreen->info.has_unaligned_shader_loads;
+
+ case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
+ return sscreen->info.has_sparse_vm_mappings ? RADEON_SPARSE_PAGE_SIZE : 0;
+
+ case PIPE_CAP_UMA:
+ return 0;
+
+ case PIPE_CAP_FENCE_SIGNAL:
+ return sscreen->info.has_syncobj;
+
+ case PIPE_CAP_CONSTBUF0_FLAGS:
+ return SI_RESOURCE_FLAG_32BIT;
+
+ case PIPE_CAP_NATIVE_FENCE_FD:
+ return sscreen->info.has_fence_to_handle;
+
+ case PIPE_CAP_DRAW_PARAMETERS:
+ case PIPE_CAP_MULTI_DRAW_INDIRECT:
+ case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+ return sscreen->has_draw_indirect_multi;
+
+ case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+ return 30;
+
+ case PIPE_CAP_MAX_VARYINGS:
+ return 32;
+
+ case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
+ return sscreen->info.chip_class <= GFX8 ? PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0;
+
+ /* Stream output. */
+ case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+ case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+ return 32 * 4;
+
+ /* Geometry shader output. */
+ case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES:
+ /* gfx9 has to report 256 to make piglit/gs-max-output pass.
+ * gfx8 and earlier can do 1024.
+ */
+ return 256;
+ case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
+ return 4095;
+ case PIPE_CAP_MAX_GS_INVOCATIONS:
+ /* The closed driver exposes 127, but 125 is the greatest
+ * number that works. */
+ return 125;
+
+ case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
+ return 2048;
+
+ /* Texturing. */
+ case PIPE_CAP_MAX_TEXTURE_2D_SIZE:
+ return 16384;
+ case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+ return 15; /* 16384 */
+ case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+ if (sscreen->info.chip_class >= GFX10)
+ return 14;
+ /* textures support 8192, but layered rendering supports 2048 */
+ return 12;
+ case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+ if (sscreen->info.chip_class >= GFX10)
+ return 8192;
+ /* textures support 8192, but layered rendering supports 2048 */
+ return 2048;
+
+ /* Viewports and render targets. */
+ case PIPE_CAP_MAX_VIEWPORTS:
+ return SI_MAX_VIEWPORTS;
+ case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS:
+ case PIPE_CAP_RASTERIZER_SUBPIXEL_BITS:
+ case PIPE_CAP_MAX_RENDER_TARGETS:
+ return 8;
+ case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
+ return sscreen->info.has_eqaa_surface_allocator ? 2 : 0;
+
+ case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
+ case PIPE_CAP_MIN_TEXEL_OFFSET:
+ return -32;
+
+ case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
+ case PIPE_CAP_MAX_TEXEL_OFFSET:
+ return 31;
+
+ case PIPE_CAP_ENDIANNESS:
+ return PIPE_ENDIAN_LITTLE;
+
+ case PIPE_CAP_VENDOR_ID:
+ return ATI_VENDOR_ID;
+ case PIPE_CAP_DEVICE_ID:
+ return sscreen->info.pci_id;
+ case PIPE_CAP_VIDEO_MEMORY:
+ return sscreen->info.vram_size >> 20;
+ case PIPE_CAP_PCI_GROUP:
+ return sscreen->info.pci_domain;
+ case PIPE_CAP_PCI_BUS:
+ return sscreen->info.pci_bus;
+ case PIPE_CAP_PCI_DEVICE:
+ return sscreen->info.pci_dev;
+ case PIPE_CAP_PCI_FUNCTION:
+ return sscreen->info.pci_func;
+ case PIPE_CAP_TGSI_ATOMINC_WRAP:
+ return LLVM_VERSION_MAJOR >= 10;
+
+ default:
+ return u_pipe_screen_get_param_defaults(pscreen, param);
+ }
}
-static float si_get_paramf(struct pipe_screen* pscreen, enum pipe_capf param)
+static float si_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
{
- switch (param) {
- case PIPE_CAPF_MAX_LINE_WIDTH:
- case PIPE_CAPF_MAX_LINE_WIDTH_AA:
- /* This depends on the quant mode, though the precise interactions
- * are unknown. */
- return 2048;
- case PIPE_CAPF_MAX_POINT_WIDTH:
- case PIPE_CAPF_MAX_POINT_WIDTH_AA:
- return SI_MAX_POINT_SIZE;
- case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
- return 16.0f;
- case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
- return 16.0f;
- case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
- case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
- case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
- return 0.0f;
- }
- return 0.0f;
+ switch (param) {
+ case PIPE_CAPF_MAX_LINE_WIDTH:
+ case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+ /* This depends on the quant mode, though the precise interactions
+ * are unknown. */
+ return 2048;
+ case PIPE_CAPF_MAX_POINT_WIDTH:
+ case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+ return SI_MAX_POINT_SIZE;
+ case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+ return 16.0f;
+ case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+ return 16.0f;
+ case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+ case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+ case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+ return 0.0f;
+ }
+ return 0.0f;
}
-static int si_get_shader_param(struct pipe_screen* pscreen,
- enum pipe_shader_type shader,
- enum pipe_shader_cap param)
+static int si_get_shader_param(struct pipe_screen *pscreen, enum pipe_shader_type shader,
+ enum pipe_shader_cap param)
{
- struct si_screen *sscreen = (struct si_screen *)pscreen;
-
- switch(shader)
- {
- case PIPE_SHADER_FRAGMENT:
- case PIPE_SHADER_VERTEX:
- case PIPE_SHADER_GEOMETRY:
- case PIPE_SHADER_TESS_CTRL:
- case PIPE_SHADER_TESS_EVAL:
- break;
- case PIPE_SHADER_COMPUTE:
- switch (param) {
- case PIPE_SHADER_CAP_SUPPORTED_IRS: {
- int ir = 1 << PIPE_SHADER_IR_NATIVE;
-
- if (sscreen->info.has_indirect_compute_dispatch)
- ir |= 1 << PIPE_SHADER_IR_NIR;
-
- return ir;
- }
-
- case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: {
- uint64_t max_const_buffer_size;
- pscreen->get_compute_param(pscreen, PIPE_SHADER_IR_NIR,
- PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
- &max_const_buffer_size);
- return MIN2(max_const_buffer_size, INT_MAX);
- }
- default:
- /* If compute shaders don't require a special value
- * for this cap, we can return the same value we
- * do for other shader types. */
- break;
- }
- break;
- default:
- return 0;
- }
-
- switch (param) {
- /* Shader limits. */
- case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
- case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
- case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
- case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
- case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
- return 16384;
- case PIPE_SHADER_CAP_MAX_INPUTS:
- return shader == PIPE_SHADER_VERTEX ? SI_MAX_ATTRIBS : 32;
- case PIPE_SHADER_CAP_MAX_OUTPUTS:
- return shader == PIPE_SHADER_FRAGMENT ? 8 : 32;
- case PIPE_SHADER_CAP_MAX_TEMPS:
- return 256; /* Max native temporaries. */
- case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
- return MIN2(sscreen->info.max_alloc_size, INT_MAX - 3); /* aligned to 4 */
- case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
- return SI_NUM_CONST_BUFFERS;
- case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
- case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
- return SI_NUM_SAMPLERS;
- case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
- return SI_NUM_SHADER_BUFFERS;
- case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
- return SI_NUM_IMAGES;
- case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
- return 0;
- case PIPE_SHADER_CAP_PREFERRED_IR:
- return PIPE_SHADER_IR_NIR;
- case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
- return 4;
-
- /* Supported boolean features. */
- case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
- case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
- case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
- case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
- case PIPE_SHADER_CAP_INTEGERS:
- case PIPE_SHADER_CAP_INT64_ATOMICS:
- case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
- case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
- case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
- case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
- case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED:
- case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
- return 1;
-
- case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
- /* TODO: Indirect indexing of GS inputs is unimplemented. */
- if (shader == PIPE_SHADER_GEOMETRY)
- return 0;
-
- if (shader == PIPE_SHADER_VERTEX &&
- !sscreen->llvm_has_working_vgpr_indexing)
- return 0;
-
- /* TCS and TES load inputs directly from LDS or offchip
- * memory, so indirect indexing is always supported.
- * PS has to support indirect indexing, because we can't
- * lower that to TEMPs for INTERP instructions.
- */
- return 1;
-
- case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
- return sscreen->llvm_has_working_vgpr_indexing ||
- /* TCS stores outputs directly to memory. */
- shader == PIPE_SHADER_TESS_CTRL;
-
- /* Unsupported boolean features. */
- case PIPE_SHADER_CAP_FP16:
- case PIPE_SHADER_CAP_SUBROUTINES:
- case PIPE_SHADER_CAP_SUPPORTED_IRS:
- case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
- case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
- return 0;
- }
- return 0;
+ struct si_screen *sscreen = (struct si_screen *)pscreen;
+
+ switch (shader) {
+ case PIPE_SHADER_FRAGMENT:
+ case PIPE_SHADER_VERTEX:
+ case PIPE_SHADER_GEOMETRY:
+ case PIPE_SHADER_TESS_CTRL:
+ case PIPE_SHADER_TESS_EVAL:
+ break;
+ case PIPE_SHADER_COMPUTE:
+ switch (param) {
+ case PIPE_SHADER_CAP_SUPPORTED_IRS: {
+ int ir = 1 << PIPE_SHADER_IR_NATIVE;
+
+ if (sscreen->info.has_indirect_compute_dispatch)
+ ir |= 1 << PIPE_SHADER_IR_NIR;
+
+ return ir;
+ }
+
+ case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: {
+ uint64_t max_const_buffer_size;
+ pscreen->get_compute_param(pscreen, PIPE_SHADER_IR_NIR,
+ PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, &max_const_buffer_size);
+ return MIN2(max_const_buffer_size, INT_MAX);
+ }
+ default:
+ /* If compute shaders don't require a special value
+ * for this cap, we can return the same value we
+ * do for other shader types. */
+ break;
+ }
+ break;
+ default:
+ return 0;
+ }
+
+ switch (param) {
+ /* Shader limits. */
+ case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+ case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+ case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+ case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+ case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+ return 16384;
+ case PIPE_SHADER_CAP_MAX_INPUTS:
+ return shader == PIPE_SHADER_VERTEX ? SI_MAX_ATTRIBS : 32;
+ case PIPE_SHADER_CAP_MAX_OUTPUTS:
+ return shader == PIPE_SHADER_FRAGMENT ? 8 : 32;
+ case PIPE_SHADER_CAP_MAX_TEMPS:
+ return 256; /* Max native temporaries. */
+ case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
+ return MIN2(sscreen->info.max_alloc_size, INT_MAX - 3); /* aligned to 4 */
+ case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+ return SI_NUM_CONST_BUFFERS;
+ case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+ case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
+ return SI_NUM_SAMPLERS;
+ case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+ return SI_NUM_SHADER_BUFFERS;
+ case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+ return SI_NUM_IMAGES;
+ case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+ return 0;
+ case PIPE_SHADER_CAP_PREFERRED_IR:
+ return PIPE_SHADER_IR_NIR;
+ case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+ return 4;
+
+ /* Supported boolean features. */
+ case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
+ case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+ case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+ case PIPE_SHADER_CAP_INTEGERS:
+ case PIPE_SHADER_CAP_INT64_ATOMICS:
+ case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+ case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+ case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
+ return 1;
+
+ case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+ /* TODO: Indirect indexing of GS inputs is unimplemented. */
+ if (shader == PIPE_SHADER_GEOMETRY)
+ return 0;
+
+ if (shader == PIPE_SHADER_VERTEX && !sscreen->llvm_has_working_vgpr_indexing)
+ return 0;
+
+ /* TCS and TES load inputs directly from LDS or offchip
+ * memory, so indirect indexing is always supported.
+ * PS has to support indirect indexing, because we can't
+ * lower that to TEMPs for INTERP instructions.
+ */
+ return 1;
+
+ case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+ return sscreen->llvm_has_working_vgpr_indexing ||
+ /* TCS stores outputs directly to memory. */
+ shader == PIPE_SHADER_TESS_CTRL;
+
+ /* Unsupported boolean features. */
+ case PIPE_SHADER_CAP_FP16:
+ case PIPE_SHADER_CAP_SUBROUTINES:
+ case PIPE_SHADER_CAP_SUPPORTED_IRS:
+ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
+ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+ return 0;
+ }
+ return 0;
}
static const struct nir_shader_compiler_options nir_options = {
- .lower_scmp = true,
- .lower_flrp32 = true,
- .lower_flrp64 = true,
- .lower_fsat = true,
- .lower_fdiv = true,
- .lower_bitfield_insert_to_bitfield_select = true,
- .lower_bitfield_extract = true,
- .lower_sub = true,
- .fuse_ffma = true,
- .lower_fmod = true,
- .lower_pack_snorm_4x8 = true,
- .lower_pack_unorm_4x8 = true,
- .lower_unpack_snorm_2x16 = true,
- .lower_unpack_snorm_4x8 = true,
- .lower_unpack_unorm_2x16 = true,
- .lower_unpack_unorm_4x8 = true,
- .lower_extract_byte = true,
- .lower_extract_word = true,
- .lower_rotate = true,
- .lower_to_scalar = true,
- .optimize_sample_mask_in = true,
- .max_unroll_iterations = 32,
- .use_interpolated_input_intrinsics = true,
+ .lower_scmp = true,
+ .lower_flrp32 = true,
+ .lower_flrp64 = true,
+ .lower_fsat = true,
+ .lower_fdiv = true,
+ .lower_bitfield_insert_to_bitfield_select = true,
+ .lower_bitfield_extract = true,
+ .lower_sub = true,
+ .fuse_ffma = true,
+ .lower_fmod = true,
+ .lower_pack_snorm_4x8 = true,
+ .lower_pack_unorm_4x8 = true,
+ .lower_unpack_snorm_2x16 = true,
+ .lower_unpack_snorm_4x8 = true,
+ .lower_unpack_unorm_2x16 = true,
+ .lower_unpack_unorm_4x8 = true,
+ .lower_extract_byte = true,
+ .lower_extract_word = true,
+ .lower_rotate = true,
+ .lower_to_scalar = true,
+ .optimize_sample_mask_in = true,
+ .max_unroll_iterations = 32,
+ .use_interpolated_input_intrinsics = true,
};
-static const void *
-si_get_compiler_options(struct pipe_screen *screen,
- enum pipe_shader_ir ir,
- enum pipe_shader_type shader)
+static const void *si_get_compiler_options(struct pipe_screen *screen, enum pipe_shader_ir ir,
+ enum pipe_shader_type shader)
{
- assert(ir == PIPE_SHADER_IR_NIR);
- return &nir_options;
+ assert(ir == PIPE_SHADER_IR_NIR);
+ return &nir_options;
}
static void si_get_driver_uuid(struct pipe_screen *pscreen, char *uuid)
{
- ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE);
+ ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE);
}
static void si_get_device_uuid(struct pipe_screen *pscreen, char *uuid)
{
- struct si_screen *sscreen = (struct si_screen *)pscreen;
+ struct si_screen *sscreen = (struct si_screen *)pscreen;
- ac_compute_device_uuid(&sscreen->info, uuid, PIPE_UUID_SIZE);
+ ac_compute_device_uuid(&sscreen->info, uuid, PIPE_UUID_SIZE);
}
-static const char* si_get_name(struct pipe_screen *pscreen)
+static const char *si_get_name(struct pipe_screen *pscreen)
{
- struct si_screen *sscreen = (struct si_screen*)pscreen;
+ struct si_screen *sscreen = (struct si_screen *)pscreen;
- return sscreen->renderer_string;
+ return sscreen->renderer_string;
}
-static int si_get_video_param_no_decode(struct pipe_screen *screen,
- enum pipe_video_profile profile,
- enum pipe_video_entrypoint entrypoint,
- enum pipe_video_cap param)
+static int si_get_video_param_no_decode(struct pipe_screen *screen, enum pipe_video_profile profile,
+ enum pipe_video_entrypoint entrypoint,
+ enum pipe_video_cap param)
{
- switch (param) {
- case PIPE_VIDEO_CAP_SUPPORTED:
- return vl_profile_supported(screen, profile, entrypoint);
- case PIPE_VIDEO_CAP_NPOT_TEXTURES:
- return 1;
- case PIPE_VIDEO_CAP_MAX_WIDTH:
- case PIPE_VIDEO_CAP_MAX_HEIGHT:
- return vl_video_buffer_max_size(screen);
- case PIPE_VIDEO_CAP_PREFERED_FORMAT:
- return PIPE_FORMAT_NV12;
- case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
- return false;
- case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
- return false;
- case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
- return true;
- case PIPE_VIDEO_CAP_MAX_LEVEL:
- return vl_level_supported(screen, profile);
- default:
- return 0;
- }
+ switch (param) {
+ case PIPE_VIDEO_CAP_SUPPORTED:
+ return vl_profile_supported(screen, profile, entrypoint);
+ case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+ return 1;
+ case PIPE_VIDEO_CAP_MAX_WIDTH:
+ case PIPE_VIDEO_CAP_MAX_HEIGHT:
+ return vl_video_buffer_max_size(screen);
+ case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+ return PIPE_FORMAT_NV12;
+ case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+ return false;
+ case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
+ return false;
+ case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+ return true;
+ case PIPE_VIDEO_CAP_MAX_LEVEL:
+ return vl_level_supported(screen, profile);
+ default:
+ return 0;
+ }
}
-static int si_get_video_param(struct pipe_screen *screen,
- enum pipe_video_profile profile,
- enum pipe_video_entrypoint entrypoint,
- enum pipe_video_cap param)
+static int si_get_video_param(struct pipe_screen *screen, enum pipe_video_profile profile,
+ enum pipe_video_entrypoint entrypoint, enum pipe_video_cap param)
{
- struct si_screen *sscreen = (struct si_screen *)screen;
- enum pipe_video_format codec = u_reduce_video_profile(profile);
-
- if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
- switch (param) {
- case PIPE_VIDEO_CAP_SUPPORTED:
- return ((codec == PIPE_VIDEO_FORMAT_MPEG4_AVC &&
- (sscreen->info.family >= CHIP_RAVEN ||
- si_vce_is_fw_version_supported(sscreen))) ||
- (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN &&
- (sscreen->info.family >= CHIP_RAVEN ||
- si_radeon_uvd_enc_supported(sscreen))) ||
- (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10 &&
- sscreen->info.family >= CHIP_RENOIR));
- case PIPE_VIDEO_CAP_NPOT_TEXTURES:
- return 1;
- case PIPE_VIDEO_CAP_MAX_WIDTH:
- return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096;
- case PIPE_VIDEO_CAP_MAX_HEIGHT:
- return (sscreen->info.family < CHIP_TONGA) ? 1152 : 2304;
- case PIPE_VIDEO_CAP_PREFERED_FORMAT:
- return PIPE_FORMAT_NV12;
- case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
- return false;
- case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
- return false;
- case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
- return true;
- case PIPE_VIDEO_CAP_STACKED_FRAMES:
- return (sscreen->info.family < CHIP_TONGA) ? 1 : 2;
- default:
- return 0;
- }
- }
-
- switch (param) {
- case PIPE_VIDEO_CAP_SUPPORTED:
- switch (codec) {
- case PIPE_VIDEO_FORMAT_MPEG12:
- return profile != PIPE_VIDEO_PROFILE_MPEG1;
- case PIPE_VIDEO_FORMAT_MPEG4:
- return 1;
- case PIPE_VIDEO_FORMAT_MPEG4_AVC:
- if ((sscreen->info.family == CHIP_POLARIS10 ||
- sscreen->info.family == CHIP_POLARIS11) &&
- sscreen->info.uvd_fw_version < UVD_FW_1_66_16 ) {
- RVID_ERR("POLARIS10/11 firmware version need to be updated.\n");
- return false;
- }
- return true;
- case PIPE_VIDEO_FORMAT_VC1:
- return true;
- case PIPE_VIDEO_FORMAT_HEVC:
- /* Carrizo only supports HEVC Main */
- if (sscreen->info.family >= CHIP_STONEY)
- return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN ||
- profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10);
- else if (sscreen->info.family >= CHIP_CARRIZO)
- return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN;
- return false;
- case PIPE_VIDEO_FORMAT_JPEG:
- if (sscreen->info.family >= CHIP_RAVEN)
- return true;
- if (sscreen->info.family < CHIP_CARRIZO || sscreen->info.family >= CHIP_VEGA10)
- return false;
- if (!(sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 19)) {
- RVID_ERR("No MJPEG support for the kernel version\n");
- return false;
- }
- return true;
- case PIPE_VIDEO_FORMAT_VP9:
- if (sscreen->info.family < CHIP_RAVEN)
- return false;
- return true;
- default:
- return false;
- }
- case PIPE_VIDEO_CAP_NPOT_TEXTURES:
- return 1;
- case PIPE_VIDEO_CAP_MAX_WIDTH:
- switch (codec) {
- case PIPE_VIDEO_FORMAT_HEVC:
- case PIPE_VIDEO_FORMAT_VP9:
- return (sscreen->info.family < CHIP_RENOIR) ?
- ((sscreen->info.family < CHIP_TONGA) ? 2048 : 4096) :
- 8192;
- default:
- return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096;
- }
- case PIPE_VIDEO_CAP_MAX_HEIGHT:
- switch (codec) {
- case PIPE_VIDEO_FORMAT_HEVC:
- case PIPE_VIDEO_FORMAT_VP9:
- return (sscreen->info.family < CHIP_RENOIR) ?
- ((sscreen->info.family < CHIP_TONGA) ? 1152 : 4096) :
- 4352;
- default:
- return (sscreen->info.family < CHIP_TONGA) ? 1152 : 4096;
- }
- case PIPE_VIDEO_CAP_PREFERED_FORMAT:
- if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
- return PIPE_FORMAT_P010;
- else if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2)
- return PIPE_FORMAT_P016;
- else
- return PIPE_FORMAT_NV12;
-
- case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
- case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: {
- enum pipe_video_format format = u_reduce_video_profile(profile);
-
- if (format == PIPE_VIDEO_FORMAT_HEVC)
- return false; //The firmware doesn't support interlaced HEVC.
- else if (format == PIPE_VIDEO_FORMAT_JPEG)
- return false;
- else if (format == PIPE_VIDEO_FORMAT_VP9)
- return false;
- return true;
- }
- case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
- return true;
- case PIPE_VIDEO_CAP_MAX_LEVEL:
- switch (profile) {
- case PIPE_VIDEO_PROFILE_MPEG1:
- return 0;
- case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE:
- case PIPE_VIDEO_PROFILE_MPEG2_MAIN:
- return 3;
- case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE:
- return 3;
- case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE:
- return 5;
- case PIPE_VIDEO_PROFILE_VC1_SIMPLE:
- return 1;
- case PIPE_VIDEO_PROFILE_VC1_MAIN:
- return 2;
- case PIPE_VIDEO_PROFILE_VC1_ADVANCED:
- return 4;
- case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
- case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
- case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
- return (sscreen->info.family < CHIP_TONGA) ? 41 : 52;
- case PIPE_VIDEO_PROFILE_HEVC_MAIN:
- case PIPE_VIDEO_PROFILE_HEVC_MAIN_10:
- return 186;
- default:
- return 0;
- }
- default:
- return 0;
- }
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ enum pipe_video_format codec = u_reduce_video_profile(profile);
+
+ if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
+ switch (param) {
+ case PIPE_VIDEO_CAP_SUPPORTED:
+ return (
+ (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC &&
+ (sscreen->info.family >= CHIP_RAVEN || si_vce_is_fw_version_supported(sscreen))) ||
+ (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN &&
+ (sscreen->info.family >= CHIP_RAVEN || si_radeon_uvd_enc_supported(sscreen))) ||
+ (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10 && sscreen->info.family >= CHIP_RENOIR));
+ case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+ return 1;
+ case PIPE_VIDEO_CAP_MAX_WIDTH:
+ return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096;
+ case PIPE_VIDEO_CAP_MAX_HEIGHT:
+ return (sscreen->info.family < CHIP_TONGA) ? 1152 : 2304;
+ case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+ return PIPE_FORMAT_NV12;
+ case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+ return false;
+ case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
+ return false;
+ case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+ return true;
+ case PIPE_VIDEO_CAP_STACKED_FRAMES:
+ return (sscreen->info.family < CHIP_TONGA) ? 1 : 2;
+ default:
+ return 0;
+ }
+ }
+
+ switch (param) {
+ case PIPE_VIDEO_CAP_SUPPORTED:
+ switch (codec) {
+ case PIPE_VIDEO_FORMAT_MPEG12:
+ return profile != PIPE_VIDEO_PROFILE_MPEG1;
+ case PIPE_VIDEO_FORMAT_MPEG4:
+ return 1;
+ case PIPE_VIDEO_FORMAT_MPEG4_AVC:
+ if ((sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11) &&
+ sscreen->info.uvd_fw_version < UVD_FW_1_66_16) {
+ RVID_ERR("POLARIS10/11 firmware version need to be updated.\n");
+ return false;
+ }
+ return true;
+ case PIPE_VIDEO_FORMAT_VC1:
+ return true;
+ case PIPE_VIDEO_FORMAT_HEVC:
+ /* Carrizo only supports HEVC Main */
+ if (sscreen->info.family >= CHIP_STONEY)
+ return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN ||
+ profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10);
+ else if (sscreen->info.family >= CHIP_CARRIZO)
+ return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN;
+ return false;
+ case PIPE_VIDEO_FORMAT_JPEG:
+ if (sscreen->info.family >= CHIP_RAVEN)
+ return true;
+ if (sscreen->info.family < CHIP_CARRIZO || sscreen->info.family >= CHIP_VEGA10)
+ return false;
+ if (!(sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 19)) {
+ RVID_ERR("No MJPEG support for the kernel version\n");
+ return false;
+ }
+ return true;
+ case PIPE_VIDEO_FORMAT_VP9:
+ if (sscreen->info.family < CHIP_RAVEN)
+ return false;
+ return true;
+ default:
+ return false;
+ }
+ case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+ return 1;
+ case PIPE_VIDEO_CAP_MAX_WIDTH:
+ switch (codec) {
+ case PIPE_VIDEO_FORMAT_HEVC:
+ case PIPE_VIDEO_FORMAT_VP9:
+ return (sscreen->info.family < CHIP_RENOIR)
+ ? ((sscreen->info.family < CHIP_TONGA) ? 2048 : 4096)
+ : 8192;
+ default:
+ return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096;
+ }
+ case PIPE_VIDEO_CAP_MAX_HEIGHT:
+ switch (codec) {
+ case PIPE_VIDEO_FORMAT_HEVC:
+ case PIPE_VIDEO_FORMAT_VP9:
+ return (sscreen->info.family < CHIP_RENOIR)
+ ? ((sscreen->info.family < CHIP_TONGA) ? 1152 : 4096)
+ : 4352;
+ default:
+ return (sscreen->info.family < CHIP_TONGA) ? 1152 : 4096;
+ }
+ case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+ if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
+ return PIPE_FORMAT_P010;
+ else if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2)
+ return PIPE_FORMAT_P016;
+ else
+ return PIPE_FORMAT_NV12;
+
+ case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+ case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: {
+ enum pipe_video_format format = u_reduce_video_profile(profile);
+
+ if (format == PIPE_VIDEO_FORMAT_HEVC)
+ return false; // The firmware doesn't support interlaced HEVC.
+ else if (format == PIPE_VIDEO_FORMAT_JPEG)
+ return false;
+ else if (format == PIPE_VIDEO_FORMAT_VP9)
+ return false;
+ return true;
+ }
+ case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+ return true;
+ case PIPE_VIDEO_CAP_MAX_LEVEL:
+ switch (profile) {
+ case PIPE_VIDEO_PROFILE_MPEG1:
+ return 0;
+ case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE:
+ case PIPE_VIDEO_PROFILE_MPEG2_MAIN:
+ return 3;
+ case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE:
+ return 3;
+ case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE:
+ return 5;
+ case PIPE_VIDEO_PROFILE_VC1_SIMPLE:
+ return 1;
+ case PIPE_VIDEO_PROFILE_VC1_MAIN:
+ return 2;
+ case PIPE_VIDEO_PROFILE_VC1_ADVANCED:
+ return 4;
+ case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
+ case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
+ case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
+ return (sscreen->info.family < CHIP_TONGA) ? 41 : 52;
+ case PIPE_VIDEO_PROFILE_HEVC_MAIN:
+ case PIPE_VIDEO_PROFILE_HEVC_MAIN_10:
+ return 186;
+ default:
+ return 0;
+ }
+ default:
+ return 0;
+ }
}
-static bool si_vid_is_format_supported(struct pipe_screen *screen,
- enum pipe_format format,
- enum pipe_video_profile profile,
- enum pipe_video_entrypoint entrypoint)
+static bool si_vid_is_format_supported(struct pipe_screen *screen, enum pipe_format format,
+ enum pipe_video_profile profile,
+ enum pipe_video_entrypoint entrypoint)
{
- /* HEVC 10 bit decoding should use P010 instead of NV12 if possible */
- if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
- return (format == PIPE_FORMAT_NV12) ||
- (format == PIPE_FORMAT_P010) ||
- (format == PIPE_FORMAT_P016);
-
- /* Vp9 profile 2 supports 10 bit decoding using P016 */
- if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2)
- return format == PIPE_FORMAT_P016;
+ /* HEVC 10 bit decoding should use P010 instead of NV12 if possible */
+ if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
+ return (format == PIPE_FORMAT_NV12) || (format == PIPE_FORMAT_P010) ||
+ (format == PIPE_FORMAT_P016);
+ /* Vp9 profile 2 supports 10 bit decoding using P016 */
+ if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2)
+ return format == PIPE_FORMAT_P016;
- /* we can only handle this one with UVD */
- if (profile != PIPE_VIDEO_PROFILE_UNKNOWN)
- return format == PIPE_FORMAT_NV12;
+ /* we can only handle this one with UVD */
+ if (profile != PIPE_VIDEO_PROFILE_UNKNOWN)
+ return format == PIPE_FORMAT_NV12;
- return vl_video_buffer_is_format_supported(screen, format, profile, entrypoint);
+ return vl_video_buffer_is_format_supported(screen, format, profile, entrypoint);
}
-static unsigned get_max_threads_per_block(struct si_screen *screen,
- enum pipe_shader_ir ir_type)
+static unsigned get_max_threads_per_block(struct si_screen *screen, enum pipe_shader_ir ir_type)
{
- if (ir_type == PIPE_SHADER_IR_NATIVE)
- return 256;
+ if (ir_type == PIPE_SHADER_IR_NATIVE)
+ return 256;
- /* LLVM 10 only supports 1024 threads per block. */
- return 1024;
+ /* LLVM 10 only supports 1024 threads per block. */
+ return 1024;
}
-static int si_get_compute_param(struct pipe_screen *screen,
- enum pipe_shader_ir ir_type,
- enum pipe_compute_cap param,
- void *ret)
+static int si_get_compute_param(struct pipe_screen *screen, enum pipe_shader_ir ir_type,
+ enum pipe_compute_cap param, void *ret)
{
- struct si_screen *sscreen = (struct si_screen *)screen;
-
- //TODO: select these params by asic
- switch (param) {
- case PIPE_COMPUTE_CAP_IR_TARGET: {
- const char *gpu, *triple;
-
- triple = "amdgcn-mesa-mesa3d";
- gpu = ac_get_llvm_processor_name(sscreen->info.family);
- if (ret) {
- sprintf(ret, "%s-%s", gpu, triple);
- }
- /* +2 for dash and terminating NIL byte */
- return (strlen(triple) + strlen(gpu) + 2) * sizeof(char);
- }
- case PIPE_COMPUTE_CAP_GRID_DIMENSION:
- if (ret) {
- uint64_t *grid_dimension = ret;
- grid_dimension[0] = 3;
- }
- return 1 * sizeof(uint64_t);
-
- case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
- if (ret) {
- uint64_t *grid_size = ret;
- grid_size[0] = 65535;
- grid_size[1] = 65535;
- grid_size[2] = 65535;
- }
- return 3 * sizeof(uint64_t) ;
-
- case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
- if (ret) {
- uint64_t *block_size = ret;
- unsigned threads_per_block = get_max_threads_per_block(sscreen, ir_type);
- block_size[0] = threads_per_block;
- block_size[1] = threads_per_block;
- block_size[2] = threads_per_block;
- }
- return 3 * sizeof(uint64_t);
-
- case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
- if (ret) {
- uint64_t *max_threads_per_block = ret;
- *max_threads_per_block = get_max_threads_per_block(sscreen, ir_type);
- }
- return sizeof(uint64_t);
- case PIPE_COMPUTE_CAP_ADDRESS_BITS:
- if (ret) {
- uint32_t *address_bits = ret;
- address_bits[0] = 64;
- }
- return 1 * sizeof(uint32_t);
-
- case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
- if (ret) {
- uint64_t *max_global_size = ret;
- uint64_t max_mem_alloc_size;
-
- si_get_compute_param(screen, ir_type,
- PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
- &max_mem_alloc_size);
-
- /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least
- * 1/4 of the MAX_GLOBAL_SIZE. Since the
- * MAX_MEM_ALLOC_SIZE is fixed for older kernels,
- * make sure we never report more than
- * 4 * MAX_MEM_ALLOC_SIZE.
- */
- *max_global_size = MIN2(4 * max_mem_alloc_size,
- MAX2(sscreen->info.gart_size,
- sscreen->info.vram_size));
- }
- return sizeof(uint64_t);
-
- case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
- if (ret) {
- uint64_t *max_local_size = ret;
- /* Value reported by the closed source driver. */
- *max_local_size = 32768;
- }
- return sizeof(uint64_t);
-
- case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
- if (ret) {
- uint64_t *max_input_size = ret;
- /* Value reported by the closed source driver. */
- *max_input_size = 1024;
- }
- return sizeof(uint64_t);
-
- case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
- if (ret) {
- uint64_t *max_mem_alloc_size = ret;
-
- *max_mem_alloc_size = sscreen->info.max_alloc_size;
- }
- return sizeof(uint64_t);
-
- case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
- if (ret) {
- uint32_t *max_clock_frequency = ret;
- *max_clock_frequency = sscreen->info.max_shader_clock;
- }
- return sizeof(uint32_t);
-
- case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
- if (ret) {
- uint32_t *max_compute_units = ret;
- *max_compute_units = sscreen->info.num_good_compute_units;
- }
- return sizeof(uint32_t);
-
- case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
- if (ret) {
- uint32_t *images_supported = ret;
- *images_supported = 0;
- }
- return sizeof(uint32_t);
- case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
- break; /* unused */
- case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
- if (ret) {
- uint32_t *subgroup_size = ret;
- *subgroup_size = sscreen->compute_wave_size;
- }
- return sizeof(uint32_t);
- case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
- if (ret) {
- uint64_t *max_variable_threads_per_block = ret;
- if (ir_type == PIPE_SHADER_IR_NATIVE)
- *max_variable_threads_per_block = 0;
- else
- *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
- }
- return sizeof(uint64_t);
- }
-
- fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
- return 0;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+
+ // TODO: select these params by asic
+ switch (param) {
+ case PIPE_COMPUTE_CAP_IR_TARGET: {
+ const char *gpu, *triple;
+
+ triple = "amdgcn-mesa-mesa3d";
+ gpu = ac_get_llvm_processor_name(sscreen->info.family);
+ if (ret) {
+ sprintf(ret, "%s-%s", gpu, triple);
+ }
+ /* +2 for dash and terminating NUL byte */
+ return (strlen(triple) + strlen(gpu) + 2) * sizeof(char);
+ }
+ case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+ if (ret) {
+ uint64_t *grid_dimension = ret;
+ grid_dimension[0] = 3;
+ }
+ return 1 * sizeof(uint64_t);
+
+ case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+ if (ret) {
+ uint64_t *grid_size = ret;
+ grid_size[0] = 65535;
+ grid_size[1] = 65535;
+ grid_size[2] = 65535;
+ }
+ return 3 * sizeof(uint64_t);
+
+ case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+ if (ret) {
+ uint64_t *block_size = ret;
+ unsigned threads_per_block = get_max_threads_per_block(sscreen, ir_type);
+ block_size[0] = threads_per_block;
+ block_size[1] = threads_per_block;
+ block_size[2] = threads_per_block;
+ }
+ return 3 * sizeof(uint64_t);
+
+ case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+ if (ret) {
+ uint64_t *max_threads_per_block = ret;
+ *max_threads_per_block = get_max_threads_per_block(sscreen, ir_type);
+ }
+ return sizeof(uint64_t);
+ case PIPE_COMPUTE_CAP_ADDRESS_BITS:
+ if (ret) {
+ uint32_t *address_bits = ret;
+ address_bits[0] = 64;
+ }
+ return 1 * sizeof(uint32_t);
+
+ case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
+ if (ret) {
+ uint64_t *max_global_size = ret;
+ uint64_t max_mem_alloc_size;
+
+ si_get_compute_param(screen, ir_type, PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
+ &max_mem_alloc_size);
+
+ /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least
+ * 1/4 of the MAX_GLOBAL_SIZE. Since the
+ * MAX_MEM_ALLOC_SIZE is fixed for older kernels,
+ * make sure we never report more than
+ * 4 * MAX_MEM_ALLOC_SIZE.
+ */
+ *max_global_size =
+ MIN2(4 * max_mem_alloc_size, MAX2(sscreen->info.gart_size, sscreen->info.vram_size));
+ }
+ return sizeof(uint64_t);
+
+ case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
+ if (ret) {
+ uint64_t *max_local_size = ret;
+ /* Value reported by the closed source driver. */
+ *max_local_size = 32768;
+ }
+ return sizeof(uint64_t);
+
+ case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
+ if (ret) {
+ uint64_t *max_input_size = ret;
+ /* Value reported by the closed source driver. */
+ *max_input_size = 1024;
+ }
+ return sizeof(uint64_t);
+
+ case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
+ if (ret) {
+ uint64_t *max_mem_alloc_size = ret;
+
+ *max_mem_alloc_size = sscreen->info.max_alloc_size;
+ }
+ return sizeof(uint64_t);
+
+ case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
+ if (ret) {
+ uint32_t *max_clock_frequency = ret;
+ *max_clock_frequency = sscreen->info.max_shader_clock;
+ }
+ return sizeof(uint32_t);
+
+ case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
+ if (ret) {
+ uint32_t *max_compute_units = ret;
+ *max_compute_units = sscreen->info.num_good_compute_units;
+ }
+ return sizeof(uint32_t);
+
+ case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
+ if (ret) {
+ uint32_t *images_supported = ret;
+ *images_supported = 0;
+ }
+ return sizeof(uint32_t);
+ case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
+ break; /* unused */
+ case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+ if (ret) {
+ uint32_t *subgroup_size = ret;
+ *subgroup_size = sscreen->compute_wave_size;
+ }
+ return sizeof(uint32_t);
+ case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
+ if (ret) {
+ uint64_t *max_variable_threads_per_block = ret;
+ if (ir_type == PIPE_SHADER_IR_NATIVE)
+ *max_variable_threads_per_block = 0;
+ else
+ *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
+ }
+ return sizeof(uint64_t);
+ }
+
+ fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
+ return 0;
}
static uint64_t si_get_timestamp(struct pipe_screen *screen)
{
- struct si_screen *sscreen = (struct si_screen*)screen;
+ struct si_screen *sscreen = (struct si_screen *)screen;
- return 1000000 * sscreen->ws->query_value(sscreen->ws, RADEON_TIMESTAMP) /
- sscreen->info.clock_crystal_freq;
+ return 1000000 * sscreen->ws->query_value(sscreen->ws, RADEON_TIMESTAMP) /
+ sscreen->info.clock_crystal_freq;
}
-static void si_query_memory_info(struct pipe_screen *screen,
- struct pipe_memory_info *info)
+static void si_query_memory_info(struct pipe_screen *screen, struct pipe_memory_info *info)
{
- struct si_screen *sscreen = (struct si_screen*)screen;
- struct radeon_winsys *ws = sscreen->ws;
- unsigned vram_usage, gtt_usage;
-
- info->total_device_memory = sscreen->info.vram_size / 1024;
- info->total_staging_memory = sscreen->info.gart_size / 1024;
-
- /* The real TTM memory usage is somewhat random, because:
- *
- * 1) TTM delays freeing memory, because it can only free it after
- * fences expire.
- *
- * 2) The memory usage can be really low if big VRAM evictions are
- * taking place, but the real usage is well above the size of VRAM.
- *
- * Instead, return statistics of this process.
- */
- vram_usage = ws->query_value(ws, RADEON_VRAM_USAGE) / 1024;
- gtt_usage = ws->query_value(ws, RADEON_GTT_USAGE) / 1024;
-
- info->avail_device_memory =
- vram_usage <= info->total_device_memory ?
- info->total_device_memory - vram_usage : 0;
- info->avail_staging_memory =
- gtt_usage <= info->total_staging_memory ?
- info->total_staging_memory - gtt_usage : 0;
-
- info->device_memory_evicted =
- ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024;
-
- if (sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 4)
- info->nr_device_memory_evictions =
- ws->query_value(ws, RADEON_NUM_EVICTIONS);
- else
- /* Just return the number of evicted 64KB pages. */
- info->nr_device_memory_evictions = info->device_memory_evicted / 64;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct radeon_winsys *ws = sscreen->ws;
+ unsigned vram_usage, gtt_usage;
+
+ info->total_device_memory = sscreen->info.vram_size / 1024;
+ info->total_staging_memory = sscreen->info.gart_size / 1024;
+
+ /* The real TTM memory usage is somewhat random, because:
+ *
+ * 1) TTM delays freeing memory, because it can only free it after
+ * fences expire.
+ *
+ * 2) The memory usage can be really low if big VRAM evictions are
+ * taking place, but the real usage is well above the size of VRAM.
+ *
+ * Instead, return statistics of this process.
+ */
+ vram_usage = ws->query_value(ws, RADEON_VRAM_USAGE) / 1024;
+ gtt_usage = ws->query_value(ws, RADEON_GTT_USAGE) / 1024;
+
+ info->avail_device_memory =
+ vram_usage <= info->total_device_memory ? info->total_device_memory - vram_usage : 0;
+ info->avail_staging_memory =
+ gtt_usage <= info->total_staging_memory ? info->total_staging_memory - gtt_usage : 0;
+
+ info->device_memory_evicted = ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024;
+
+ if (sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 4)
+ info->nr_device_memory_evictions = ws->query_value(ws, RADEON_NUM_EVICTIONS);
+ else
+ /* Just return the number of evicted 64KB pages. */
+ info->nr_device_memory_evictions = info->device_memory_evicted / 64;
}
static struct disk_cache *si_get_disk_shader_cache(struct pipe_screen *pscreen)
{
- struct si_screen *sscreen = (struct si_screen*)pscreen;
+ struct si_screen *sscreen = (struct si_screen *)pscreen;
- return sscreen->disk_shader_cache;
+ return sscreen->disk_shader_cache;
}
static void si_init_renderer_string(struct si_screen *sscreen)
{
- char first_name[256], second_name[32] = {}, kernel_version[128] = {};
- struct utsname uname_data;
-
- if (sscreen->info.marketing_name) {
- snprintf(first_name, sizeof(first_name), "%s",
- sscreen->info.marketing_name);
- snprintf(second_name, sizeof(second_name), "%s, ",
- sscreen->info.name);
- } else {
- snprintf(first_name, sizeof(first_name), "AMD %s",
- sscreen->info.name);
- }
-
- if (uname(&uname_data) == 0)
- snprintf(kernel_version, sizeof(kernel_version),
- ", %s", uname_data.release);
-
- snprintf(sscreen->renderer_string, sizeof(sscreen->renderer_string),
- "%s (%sDRM %i.%i.%i%s, LLVM " MESA_LLVM_VERSION_STRING ")",
- first_name, second_name, sscreen->info.drm_major,
- sscreen->info.drm_minor, sscreen->info.drm_patchlevel,
- kernel_version);
+ char first_name[256], second_name[32] = {}, kernel_version[128] = {};
+ struct utsname uname_data;
+
+ if (sscreen->info.marketing_name) {
+ snprintf(first_name, sizeof(first_name), "%s", sscreen->info.marketing_name);
+ snprintf(second_name, sizeof(second_name), "%s, ", sscreen->info.name);
+ } else {
+ snprintf(first_name, sizeof(first_name), "AMD %s", sscreen->info.name);
+ }
+
+ if (uname(&uname_data) == 0)
+ snprintf(kernel_version, sizeof(kernel_version), ", %s", uname_data.release);
+
+ snprintf(sscreen->renderer_string, sizeof(sscreen->renderer_string),
+ "%s (%sDRM %i.%i.%i%s, LLVM " MESA_LLVM_VERSION_STRING ")", first_name, second_name,
+ sscreen->info.drm_major, sscreen->info.drm_minor, sscreen->info.drm_patchlevel,
+ kernel_version);
}
void si_init_screen_get_functions(struct si_screen *sscreen)
{
- sscreen->b.get_name = si_get_name;
- sscreen->b.get_vendor = si_get_vendor;
- sscreen->b.get_device_vendor = si_get_device_vendor;
- sscreen->b.get_param = si_get_param;
- sscreen->b.get_paramf = si_get_paramf;
- sscreen->b.get_compute_param = si_get_compute_param;
- sscreen->b.get_timestamp = si_get_timestamp;
- sscreen->b.get_shader_param = si_get_shader_param;
- sscreen->b.get_compiler_options = si_get_compiler_options;
- sscreen->b.get_device_uuid = si_get_device_uuid;
- sscreen->b.get_driver_uuid = si_get_driver_uuid;
- sscreen->b.query_memory_info = si_query_memory_info;
- sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache;
-
- if (sscreen->info.has_hw_decode) {
- sscreen->b.get_video_param = si_get_video_param;
- sscreen->b.is_video_format_supported = si_vid_is_format_supported;
- } else {
- sscreen->b.get_video_param = si_get_video_param_no_decode;
- sscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported;
- }
-
- si_init_renderer_string(sscreen);
+ sscreen->b.get_name = si_get_name;
+ sscreen->b.get_vendor = si_get_vendor;
+ sscreen->b.get_device_vendor = si_get_device_vendor;
+ sscreen->b.get_param = si_get_param;
+ sscreen->b.get_paramf = si_get_paramf;
+ sscreen->b.get_compute_param = si_get_compute_param;
+ sscreen->b.get_timestamp = si_get_timestamp;
+ sscreen->b.get_shader_param = si_get_shader_param;
+ sscreen->b.get_compiler_options = si_get_compiler_options;
+ sscreen->b.get_device_uuid = si_get_device_uuid;
+ sscreen->b.get_driver_uuid = si_get_driver_uuid;
+ sscreen->b.query_memory_info = si_query_memory_info;
+ sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache;
+
+ if (sscreen->info.has_hw_decode) {
+ sscreen->b.get_video_param = si_get_video_param;
+ sscreen->b.is_video_format_supported = si_vid_is_format_supported;
+ } else {
+ sscreen->b.get_video_param = si_get_video_param_no_decode;
+ sscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported;
+ }
+
+ si_init_renderer_string(sscreen);
}
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_pipe.h"
#include "si_build_pm4.h"
+#include "si_pipe.h"
#include "sid.h"
-
#include "util/os_time.h"
#include "util/u_upload_mgr.h"
/* initialize */
void si_need_gfx_cs_space(struct si_context *ctx)
{
- struct radeon_cmdbuf *cs = ctx->gfx_cs;
-
- /* There is no need to flush the DMA IB here, because
- * si_need_dma_space always flushes the GFX IB if there is
- * a conflict, which means any unflushed DMA commands automatically
- * precede the GFX IB (= they had no dependency on the GFX IB when
- * they were submitted).
- */
-
- /* There are two memory usage counters in the winsys for all buffers
- * that have been added (cs_add_buffer) and two counters in the pipe
- * driver for those that haven't been added yet.
- */
- if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs,
- ctx->vram, ctx->gtt))) {
- ctx->gtt = 0;
- ctx->vram = 0;
- si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
- return;
- }
- ctx->gtt = 0;
- ctx->vram = 0;
-
- unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx);
- if (!ctx->ws->cs_check_space(cs, need_dwords, false))
- si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+
+ /* There is no need to flush the DMA IB here, because
+ * si_need_dma_space always flushes the GFX IB if there is
+ * a conflict, which means any unflushed DMA commands automatically
+ * precede the GFX IB (= they had no dependency on the GFX IB when
+ * they were submitted).
+ */
+
+ /* There are two memory usage counters in the winsys for all buffers
+ * that have been added (cs_add_buffer) and two counters in the pipe
+ * driver for those that haven't been added yet.
+ */
+ if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs, ctx->vram, ctx->gtt))) {
+ ctx->gtt = 0;
+ ctx->vram = 0;
+ si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ return;
+ }
+ ctx->gtt = 0;
+ ctx->vram = 0;
+
+ unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx);
+ if (!ctx->ws->cs_check_space(cs, need_dwords, false))
+ si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}
void si_unref_sdma_uploads(struct si_context *sctx)
{
- for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
- si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
- si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
- }
- sctx->num_sdma_uploads = 0;
+ for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
+ si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
+ si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
+ }
+ sctx->num_sdma_uploads = 0;
}
-void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
- struct pipe_fence_handle **fence)
+void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
{
- struct radeon_cmdbuf *cs = ctx->gfx_cs;
- struct radeon_winsys *ws = ctx->ws;
- const unsigned wait_ps_cs = SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH;
- unsigned wait_flags = 0;
-
- if (ctx->gfx_flush_in_progress)
- return;
-
- if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) {
- wait_flags |= wait_ps_cs |
- SI_CONTEXT_INV_L2;
- } else if (ctx->chip_class == GFX6) {
- /* The kernel flushes L2 before shaders are finished. */
- wait_flags |= wait_ps_cs;
- } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
- wait_flags |= wait_ps_cs;
- }
-
- /* Drop this flush if it's a no-op. */
- if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&
- (!wait_flags || !ctx->gfx_last_ib_is_busy))
- return;
-
- if (ctx->b.get_device_reset_status(&ctx->b) != PIPE_NO_RESET)
- return;
-
- if (ctx->screen->debug_flags & DBG(CHECK_VM))
- flags &= ~PIPE_FLUSH_ASYNC;
-
- ctx->gfx_flush_in_progress = true;
-
- /* If the state tracker is flushing the GFX IB, si_flush_from_st is
- * responsible for flushing the DMA IB and merging the fences from both.
- * If the driver flushes the GFX IB internally, and it should never ask
- * for a fence handle.
- */
- assert(!radeon_emitted(ctx->sdma_cs, 0) || fence == NULL);
-
- /* Update the sdma_uploads list by flushing the uploader. */
- u_upload_unmap(ctx->b.const_uploader);
-
- /* Execute SDMA uploads. */
- ctx->sdma_uploads_in_progress = true;
- for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
- struct si_sdma_upload *up = &ctx->sdma_uploads[i];
-
- assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 &&
- up->size % 4 == 0);
-
- si_sdma_copy_buffer(ctx, &up->dst->b.b, &up->src->b.b,
- up->dst_offset, up->src_offset, up->size);
- }
- ctx->sdma_uploads_in_progress = false;
- si_unref_sdma_uploads(ctx);
-
- /* Flush SDMA (preamble IB). */
- if (radeon_emitted(ctx->sdma_cs, 0))
- si_flush_dma_cs(ctx, flags, NULL);
-
- if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) {
- struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs;
- si_compute_signal_gfx(ctx);
-
- /* Make sure compute shaders are idle before leaving the IB, so that
- * the next IB doesn't overwrite GDS that might be in use. */
- radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) |
- EVENT_INDEX(4));
-
- /* Save the GDS prim restart counter if needed. */
- if (ctx->preserve_prim_restart_gds_at_flush) {
- si_cp_copy_data(ctx, compute_cs,
- COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4,
- COPY_DATA_GDS, NULL, 4);
- }
- }
-
- if (ctx->has_graphics) {
- if (!list_is_empty(&ctx->active_queries))
- si_suspend_queries(ctx);
-
- ctx->streamout.suspended = false;
- if (ctx->streamout.begin_emitted) {
- si_emit_streamout_end(ctx);
- ctx->streamout.suspended = true;
-
- /* Since NGG streamout uses GDS, we need to make GDS
- * idle when we leave the IB, otherwise another process
- * might overwrite it while our shaders are busy.
- */
- if (ctx->screen->use_ngg_streamout)
- wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
- }
- }
-
- /* Make sure CP DMA is idle at the end of IBs after L2 prefetches
- * because the kernel doesn't wait for it. */
- if (ctx->chip_class >= GFX7)
- si_cp_dma_wait_for_idle(ctx);
-
- /* Wait for draw calls to finish if needed. */
- if (wait_flags) {
- ctx->flags |= wait_flags;
- ctx->emit_cache_flush(ctx);
- }
- ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs;
-
- if (ctx->current_saved_cs) {
- si_trace_emit(ctx);
-
- /* Save the IB for debug contexts. */
- si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true);
- ctx->current_saved_cs->flushed = true;
- ctx->current_saved_cs->time_flush = os_time_get_nano();
-
- si_log_hw_flush(ctx);
- }
-
- if (si_compute_prim_discard_enabled(ctx)) {
- /* The compute IB can start after the previous gfx IB starts. */
- if (radeon_emitted(ctx->prim_discard_compute_cs, 0) &&
- ctx->last_gfx_fence) {
- ctx->ws->cs_add_fence_dependency(ctx->gfx_cs,
- ctx->last_gfx_fence,
- RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY |
- RADEON_DEPENDENCY_START_FENCE);
- }
-
- /* Remember the last execution barrier. It's in the IB.
- * It will signal the start of the next compute IB.
- */
- if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW &&
- ctx->last_pkt3_write_data) {
- *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0);
- ctx->last_pkt3_write_data = NULL;
-
- si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf);
- ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset;
- si_resource_reference(&ctx->barrier_buf, NULL);
-
- ws->fence_reference(&ctx->last_ib_barrier_fence, NULL);
- }
- }
-
- /* Flush the CS. */
- ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
- if (fence)
- ws->fence_reference(fence, ctx->last_gfx_fence);
-
- ctx->num_gfx_cs_flushes++;
-
- if (si_compute_prim_discard_enabled(ctx)) {
- /* Remember the last execution barrier, which is the last fence
- * in this case.
- */
- if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
- ctx->last_pkt3_write_data = NULL;
- si_resource_reference(&ctx->last_ib_barrier_buf, NULL);
- ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence);
- }
- }
-
- /* Check VM faults if needed. */
- if (ctx->screen->debug_flags & DBG(CHECK_VM)) {
- /* Use conservative timeout 800ms, after which we won't wait any
- * longer and assume the GPU is hung.
- */
- ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800*1000*1000);
-
- si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX);
- }
-
- if (ctx->current_saved_cs)
- si_saved_cs_reference(&ctx->current_saved_cs, NULL);
-
- si_begin_new_gfx_cs(ctx);
- ctx->gfx_flush_in_progress = false;
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+ struct radeon_winsys *ws = ctx->ws;
+ const unsigned wait_ps_cs = SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+ unsigned wait_flags = 0;
+
+ if (ctx->gfx_flush_in_progress)
+ return;
+
+ if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) {
+ wait_flags |= wait_ps_cs | SI_CONTEXT_INV_L2;
+ } else if (ctx->chip_class == GFX6) {
+ /* The kernel flushes L2 before shaders are finished. */
+ wait_flags |= wait_ps_cs;
+ } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
+ wait_flags |= wait_ps_cs;
+ }
+
+ /* Drop this flush if it's a no-op. */
+ if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) && (!wait_flags || !ctx->gfx_last_ib_is_busy))
+ return;
+
+ if (ctx->b.get_device_reset_status(&ctx->b) != PIPE_NO_RESET)
+ return;
+
+ if (ctx->screen->debug_flags & DBG(CHECK_VM))
+ flags &= ~PIPE_FLUSH_ASYNC;
+
+ ctx->gfx_flush_in_progress = true;
+
+ /* If the state tracker is flushing the GFX IB, si_flush_from_st is
+ * responsible for flushing the DMA IB and merging the fences from both.
+ * If the driver flushes the GFX IB internally, it should never ask
+ * for a fence handle.
+ */
+ assert(!radeon_emitted(ctx->sdma_cs, 0) || fence == NULL);
+
+ /* Update the sdma_uploads list by flushing the uploader. */
+ u_upload_unmap(ctx->b.const_uploader);
+
+ /* Execute SDMA uploads. */
+ ctx->sdma_uploads_in_progress = true;
+ for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
+ struct si_sdma_upload *up = &ctx->sdma_uploads[i];
+
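+      /* SDMA uploads are expected to have dword-aligned offsets and sizes. */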
+ assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 && up->size % 4 == 0);
+
+ si_sdma_copy_buffer(ctx, &up->dst->b.b, &up->src->b.b, up->dst_offset, up->src_offset,
+ up->size);
+ }
+ ctx->sdma_uploads_in_progress = false;
+ si_unref_sdma_uploads(ctx);
+
+ /* Flush SDMA (preamble IB). */
+ if (radeon_emitted(ctx->sdma_cs, 0))
+ si_flush_dma_cs(ctx, flags, NULL);
+
+ if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) {
+ struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs;
+ si_compute_signal_gfx(ctx);
+
+ /* Make sure compute shaders are idle before leaving the IB, so that
+ * the next IB doesn't overwrite GDS that might be in use. */
+ radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+ /* Save the GDS prim restart counter if needed. */
+ if (ctx->preserve_prim_restart_gds_at_flush) {
+ si_cp_copy_data(ctx, compute_cs, COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4,
+ COPY_DATA_GDS, NULL, 4);
+ }
+ }
+
+ if (ctx->has_graphics) {
+ if (!list_is_empty(&ctx->active_queries))
+ si_suspend_queries(ctx);
+
+ ctx->streamout.suspended = false;
+ if (ctx->streamout.begin_emitted) {
+ si_emit_streamout_end(ctx);
+ ctx->streamout.suspended = true;
+
+ /* Since NGG streamout uses GDS, we need to make GDS
+ * idle when we leave the IB, otherwise another process
+ * might overwrite it while our shaders are busy.
+ */
+ if (ctx->screen->use_ngg_streamout)
+ wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+ }
+ }
+
+ /* Make sure CP DMA is idle at the end of IBs after L2 prefetches
+ * because the kernel doesn't wait for it. */
+ if (ctx->chip_class >= GFX7)
+ si_cp_dma_wait_for_idle(ctx);
+
+ /* Wait for draw calls to finish if needed. */
+ if (wait_flags) {
+ ctx->flags |= wait_flags;
+ ctx->emit_cache_flush(ctx);
+ }
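+   /* The IB counts as idle only if both PS and CS partial flushes were included above. */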
+ ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs;
+
+ if (ctx->current_saved_cs) {
+ si_trace_emit(ctx);
+
+ /* Save the IB for debug contexts. */
+ si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true);
+ ctx->current_saved_cs->flushed = true;
+ ctx->current_saved_cs->time_flush = os_time_get_nano();
+
+ si_log_hw_flush(ctx);
+ }
+
+ if (si_compute_prim_discard_enabled(ctx)) {
+ /* The compute IB can start after the previous gfx IB starts. */
+ if (radeon_emitted(ctx->prim_discard_compute_cs, 0) && ctx->last_gfx_fence) {
+ ctx->ws->cs_add_fence_dependency(
+ ctx->gfx_cs, ctx->last_gfx_fence,
+ RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | RADEON_DEPENDENCY_START_FENCE);
+ }
+
+ /* Remember the last execution barrier. It's in the IB.
+ * It will signal the start of the next compute IB.
+ */
+ if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && ctx->last_pkt3_write_data) {
+ *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0);
+ ctx->last_pkt3_write_data = NULL;
+
+ si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf);
+ ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset;
+ si_resource_reference(&ctx->barrier_buf, NULL);
+
+ ws->fence_reference(&ctx->last_ib_barrier_fence, NULL);
+ }
+ }
+
+ /* Flush the CS. */
+ ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
+ if (fence)
+ ws->fence_reference(fence, ctx->last_gfx_fence);
+
+ ctx->num_gfx_cs_flushes++;
+
+ if (si_compute_prim_discard_enabled(ctx)) {
+ /* Remember the last execution barrier, which is the last fence
+ * in this case.
+ */
+ if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
+ ctx->last_pkt3_write_data = NULL;
+ si_resource_reference(&ctx->last_ib_barrier_buf, NULL);
+ ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence);
+ }
+ }
+
+ /* Check VM faults if needed. */
+ if (ctx->screen->debug_flags & DBG(CHECK_VM)) {
+      /* Use a conservative timeout of 800 ms, after which we stop waiting
+       * and assume the GPU is hung.
+ */
+ ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800 * 1000 * 1000);
+
+ si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX);
+ }
+
+ if (ctx->current_saved_cs)
+ si_saved_cs_reference(&ctx->current_saved_cs, NULL);
+
+ si_begin_new_gfx_cs(ctx);
+ ctx->gfx_flush_in_progress = false;
}
static void si_begin_gfx_cs_debug(struct si_context *ctx)
{
- static const uint32_t zeros[1];
- assert(!ctx->current_saved_cs);
+ static const uint32_t zeros[1];
+ assert(!ctx->current_saved_cs);
- ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs));
- if (!ctx->current_saved_cs)
- return;
+ ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs));
+ if (!ctx->current_saved_cs)
+ return;
- pipe_reference_init(&ctx->current_saved_cs->reference, 1);
+ pipe_reference_init(&ctx->current_saved_cs->reference, 1);
- ctx->current_saved_cs->trace_buf = si_resource(
- pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
- if (!ctx->current_saved_cs->trace_buf) {
- free(ctx->current_saved_cs);
- ctx->current_saved_cs = NULL;
- return;
- }
+ ctx->current_saved_cs->trace_buf =
+ si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
+ if (!ctx->current_saved_cs->trace_buf) {
+ free(ctx->current_saved_cs);
+ ctx->current_saved_cs = NULL;
+ return;
+ }
- pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b,
- 0, sizeof(zeros), zeros);
- ctx->current_saved_cs->trace_id = 0;
+ pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b, 0, sizeof(zeros),
+ zeros);
+ ctx->current_saved_cs->trace_id = 0;
- si_trace_emit(ctx);
+ si_trace_emit(ctx);
- radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf,
- RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
+ radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf,
+ RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
}
static void si_add_gds_to_buffer_list(struct si_context *sctx)
{
- if (sctx->gds) {
- sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds,
- RADEON_USAGE_READWRITE, 0, 0);
- if (sctx->gds_oa) {
- sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa,
- RADEON_USAGE_READWRITE, 0, 0);
- }
- }
+ if (sctx->gds) {
+ sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0);
+ if (sctx->gds_oa) {
+ sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0);
+ }
+ }
}
void si_allocate_gds(struct si_context *sctx)
{
- struct radeon_winsys *ws = sctx->ws;
+ struct radeon_winsys *ws = sctx->ws;
- if (sctx->gds)
- return;
+ if (sctx->gds)
+ return;
- assert(sctx->screen->use_ngg_streamout);
+ assert(sctx->screen->use_ngg_streamout);
- /* 4 streamout GDS counters.
- * We need 256B (64 dw) of GDS, otherwise streamout hangs.
- */
- sctx->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, 0);
- sctx->gds_oa = ws->buffer_create(ws, 4, 1, RADEON_DOMAIN_OA, 0);
+ /* 4 streamout GDS counters.
+ * We need 256B (64 dw) of GDS, otherwise streamout hangs.
+ */
+ sctx->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, 0);
+ sctx->gds_oa = ws->buffer_create(ws, 4, 1, RADEON_DOMAIN_OA, 0);
- assert(sctx->gds && sctx->gds_oa);
- si_add_gds_to_buffer_list(sctx);
+ assert(sctx->gds && sctx->gds_oa);
+ si_add_gds_to_buffer_list(sctx);
}
void si_begin_new_gfx_cs(struct si_context *ctx)
{
- if (ctx->is_debug)
- si_begin_gfx_cs_debug(ctx);
-
- si_add_gds_to_buffer_list(ctx);
-
- /* Always invalidate caches at the beginning of IBs, because external
- * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
- * buffers.
- *
- * Note that the cache flush done by the kernel at the end of GFX IBs
- * isn't useful here, because that flush can finish after the following
- * IB starts drawing.
- *
- * TODO: Do we also need to invalidate CB & DB caches?
- */
- ctx->flags |= SI_CONTEXT_INV_ICACHE |
- SI_CONTEXT_INV_SCACHE |
- SI_CONTEXT_INV_VCACHE |
- SI_CONTEXT_INV_L2 |
- SI_CONTEXT_START_PIPELINE_STATS;
-
- ctx->cs_shader_state.initialized = false;
- si_all_descriptors_begin_new_cs(ctx);
-
- if (!ctx->has_graphics) {
- ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
- return;
- }
-
- /* set all valid group as dirty so they get reemited on
- * next draw command
- */
- si_pm4_reset_emitted(ctx);
-
- /* The CS initialization should be emitted before everything else. */
- si_pm4_emit(ctx, ctx->init_config);
- if (ctx->init_config_gs_rings)
- si_pm4_emit(ctx, ctx->init_config_gs_rings);
-
- if (ctx->queued.named.ls)
- ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
- if (ctx->queued.named.hs)
- ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
- if (ctx->queued.named.es)
- ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
- if (ctx->queued.named.gs)
- ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
- if (ctx->queued.named.vs)
- ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
- if (ctx->queued.named.ps)
- ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
- if (ctx->vb_descriptors_buffer && ctx->vertex_elements)
- ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
-
- /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
- bool has_clear_state = ctx->screen->info.has_clear_state;
- if (has_clear_state) {
- ctx->framebuffer.dirty_cbufs =
- u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
- /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
- ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
- } else {
- ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
- ctx->framebuffer.dirty_zsbuf = true;
- }
- /* This should always be marked as dirty to set the framebuffer scissor
- * at least. */
- si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
-
- si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
- /* CLEAR_STATE sets zeros. */
- if (!has_clear_state || ctx->clip_state.any_nonzeros)
- si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
- ctx->sample_locs_num_samples = 0;
- si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
- si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
- /* CLEAR_STATE sets 0xffff. */
- if (!has_clear_state || ctx->sample_mask != 0xffff)
- si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
- si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
- /* CLEAR_STATE sets zeros. */
- if (!has_clear_state || ctx->blend_color.any_nonzeros)
- si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
- si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
- if (ctx->chip_class >= GFX9)
- si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
- si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
- si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
- if (!ctx->screen->use_ngg_streamout)
- si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
- si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
- /* CLEAR_STATE disables all window rectangles. */
- if (!has_clear_state || ctx->num_window_rectangles > 0)
- si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
-
- si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
- si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
- si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
-
- si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
- if (ctx->scratch_buffer) {
- si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
- }
-
- if (ctx->streamout.suspended) {
- ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
- si_streamout_buffers_dirty(ctx);
- }
-
- if (!list_is_empty(&ctx->active_queries))
- si_resume_queries(ctx);
-
- assert(!ctx->gfx_cs->prev_dw);
- ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
-
- /* Invalidate various draw states so that they are emitted before
- * the first draw call. */
- si_invalidate_draw_sh_constants(ctx);
- ctx->last_index_size = -1;
- ctx->last_primitive_restart_en = -1;
- ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
- ctx->last_prim = -1;
- ctx->last_multi_vgt_param = -1;
- ctx->last_vs_state = ~0;
- ctx->last_ls = NULL;
- ctx->last_tcs = NULL;
- ctx->last_tes_sh_base = -1;
- ctx->last_num_tcs_input_cp = -1;
- ctx->last_ls_hs_config = -1; /* impossible value */
- ctx->last_binning_enabled = -1;
- ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL;
-
- ctx->prim_discard_compute_ib_initialized = false;
-
- /* Compute-based primitive discard:
- * The index ring is divided into 2 halves. Switch between the halves
- * in the same fashion as doublebuffering.
- */
- if (ctx->index_ring_base)
- ctx->index_ring_base = 0;
- else
- ctx->index_ring_base = ctx->index_ring_size_per_ib;
-
- ctx->index_ring_offset = 0;
-
- STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8);
-
- if (has_clear_state) {
- ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
- ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
- ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002;
- ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000;
- ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x0000001e; /* From GFX8 */
-
- /* Set all cleared context registers to saved. */
- ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */
- ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */
- } else {
- /* Set all register values to unknown. */
- ctx->tracked_regs.reg_saved = 0;
- ctx->last_gs_out_prim = -1; /* unknown */
- }
-
- /* 0xffffffff is a impossible value to register SPI_PS_INPUT_CNTL_n */
- memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
+ if (ctx->is_debug)
+ si_begin_gfx_cs_debug(ctx);
+
+ si_add_gds_to_buffer_list(ctx);
+
+ /* Always invalidate caches at the beginning of IBs, because external
+ * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
+ * buffers.
+ *
+ * Note that the cache flush done by the kernel at the end of GFX IBs
+ * isn't useful here, because that flush can finish after the following
+ * IB starts drawing.
+ *
+ * TODO: Do we also need to invalidate CB & DB caches?
+ */
+ ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
+ SI_CONTEXT_INV_L2 | SI_CONTEXT_START_PIPELINE_STATS;
+
+ ctx->cs_shader_state.initialized = false;
+ si_all_descriptors_begin_new_cs(ctx);
+
+ if (!ctx->has_graphics) {
+ ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
+ return;
+ }
+
+   /* Set all valid groups as dirty so they get re-emitted on the
+    * next draw command.
+    */
+ si_pm4_reset_emitted(ctx);
+
+ /* The CS initialization should be emitted before everything else. */
+ si_pm4_emit(ctx, ctx->init_config);
+ if (ctx->init_config_gs_rings)
+ si_pm4_emit(ctx, ctx->init_config_gs_rings);
+
+ if (ctx->queued.named.ls)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
+ if (ctx->queued.named.hs)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
+ if (ctx->queued.named.es)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
+ if (ctx->queued.named.gs)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
+ if (ctx->queued.named.vs)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
+ if (ctx->queued.named.ps)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
+ if (ctx->vb_descriptors_buffer && ctx->vertex_elements)
+ ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
+
+ /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
+ bool has_clear_state = ctx->screen->info.has_clear_state;
+ if (has_clear_state) {
+ ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
+ /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
+ ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
+ } else {
+ ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
+ ctx->framebuffer.dirty_zsbuf = true;
+ }
+   /* This should always be marked dirty so that at least the
+    * framebuffer scissor is set. */
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
+
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
+ /* CLEAR_STATE sets zeros. */
+ if (!has_clear_state || ctx->clip_state.any_nonzeros)
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
+ ctx->sample_locs_num_samples = 0;
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
+ /* CLEAR_STATE sets 0xffff. */
+ if (!has_clear_state || ctx->sample_mask != 0xffff)
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
+ /* CLEAR_STATE sets zeros. */
+ if (!has_clear_state || ctx->blend_color.any_nonzeros)
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
+ if (ctx->chip_class >= GFX9)
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
+ if (!ctx->screen->use_ngg_streamout)
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
+ /* CLEAR_STATE disables all window rectangles. */
+ if (!has_clear_state || ctx->num_window_rectangles > 0)
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
+
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
+ if (ctx->scratch_buffer) {
+ si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
+ }
+
+ if (ctx->streamout.suspended) {
+ ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
+ si_streamout_buffers_dirty(ctx);
+ }
+
+ if (!list_is_empty(&ctx->active_queries))
+ si_resume_queries(ctx);
+
+ assert(!ctx->gfx_cs->prev_dw);
+ ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
+
+ /* Invalidate various draw states so that they are emitted before
+ * the first draw call. */
+ si_invalidate_draw_sh_constants(ctx);
+ ctx->last_index_size = -1;
+ ctx->last_primitive_restart_en = -1;
+ ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
+ ctx->last_prim = -1;
+ ctx->last_multi_vgt_param = -1;
+ ctx->last_vs_state = ~0;
+ ctx->last_ls = NULL;
+ ctx->last_tcs = NULL;
+ ctx->last_tes_sh_base = -1;
+ ctx->last_num_tcs_input_cp = -1;
+ ctx->last_ls_hs_config = -1; /* impossible value */
+ ctx->last_binning_enabled = -1;
+ ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL;
+
+ ctx->prim_discard_compute_ib_initialized = false;
+
+ /* Compute-based primitive discard:
+ * The index ring is divided into 2 halves. Switch between the halves
+    * in the same fashion as double buffering.
+ */
+ if (ctx->index_ring_base)
+ ctx->index_ring_base = 0;
+ else
+ ctx->index_ring_base = ctx->index_ring_size_per_ib;
+
+ ctx->index_ring_offset = 0;
+
+ STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8);
+
+ if (has_clear_state) {
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
+ ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
+ ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000;
+ ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] =
+ 0x0000001e; /* From GFX8 */
+
+ /* Set all cleared context registers to saved. */
+ ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */
+ ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */
+ } else {
+ /* Set all register values to unknown. */
+ ctx->tracked_regs.reg_saved = 0;
+ ctx->last_gs_out_prim = -1; /* unknown */
+ }
+
+   /* 0xffffffff is an impossible value for register SPI_PS_INPUT_CNTL_n. */
+ memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
}
* fps (there are too few samples per frame). */
#define SAMPLES_PER_SEC 10000
-#define GRBM_STATUS 0x8010
-#define TA_BUSY(x) (((x) >> 14) & 0x1)
-#define GDS_BUSY(x) (((x) >> 15) & 0x1)
-#define VGT_BUSY(x) (((x) >> 17) & 0x1)
-#define IA_BUSY(x) (((x) >> 19) & 0x1)
-#define SX_BUSY(x) (((x) >> 20) & 0x1)
-#define WD_BUSY(x) (((x) >> 21) & 0x1)
-#define SPI_BUSY(x) (((x) >> 22) & 0x1)
-#define BCI_BUSY(x) (((x) >> 23) & 0x1)
-#define SC_BUSY(x) (((x) >> 24) & 0x1)
-#define PA_BUSY(x) (((x) >> 25) & 0x1)
-#define DB_BUSY(x) (((x) >> 26) & 0x1)
-#define CP_BUSY(x) (((x) >> 29) & 0x1)
-#define CB_BUSY(x) (((x) >> 30) & 0x1)
-#define GUI_ACTIVE(x) (((x) >> 31) & 0x1)
-
-#define SRBM_STATUS2 0x0e4c
-#define SDMA_BUSY(x) (((x) >> 5) & 0x1)
-
-#define CP_STAT 0x8680
-#define PFP_BUSY(x) (((x) >> 15) & 0x1)
-#define MEQ_BUSY(x) (((x) >> 16) & 0x1)
-#define ME_BUSY(x) (((x) >> 17) & 0x1)
-#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1)
-#define DMA_BUSY(x) (((x) >> 22) & 0x1)
-#define SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1)
+#define GRBM_STATUS 0x8010
+#define TA_BUSY(x) (((x) >> 14) & 0x1)
+#define GDS_BUSY(x) (((x) >> 15) & 0x1)
+#define VGT_BUSY(x) (((x) >> 17) & 0x1)
+#define IA_BUSY(x) (((x) >> 19) & 0x1)
+#define SX_BUSY(x) (((x) >> 20) & 0x1)
+#define WD_BUSY(x) (((x) >> 21) & 0x1)
+#define SPI_BUSY(x) (((x) >> 22) & 0x1)
+#define BCI_BUSY(x) (((x) >> 23) & 0x1)
+#define SC_BUSY(x) (((x) >> 24) & 0x1)
+#define PA_BUSY(x) (((x) >> 25) & 0x1)
+#define DB_BUSY(x) (((x) >> 26) & 0x1)
+#define CP_BUSY(x) (((x) >> 29) & 0x1)
+#define CB_BUSY(x) (((x) >> 30) & 0x1)
+#define GUI_ACTIVE(x) (((x) >> 31) & 0x1)
+
+#define SRBM_STATUS2 0x0e4c
+#define SDMA_BUSY(x) (((x) >> 5) & 0x1)
+
+#define CP_STAT 0x8680
+#define PFP_BUSY(x) (((x) >> 15) & 0x1)
+#define MEQ_BUSY(x) (((x) >> 16) & 0x1)
+#define ME_BUSY(x) (((x) >> 17) & 0x1)
+#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1)
+#define DMA_BUSY(x) (((x) >> 22) & 0x1)
+#define SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1)
#define IDENTITY(x) x
-#define UPDATE_COUNTER(field, mask) \
- do { \
- if (mask(value)) \
- p_atomic_inc(&counters->named.field.busy); \
- else \
- p_atomic_inc(&counters->named.field.idle); \
- } while (0)
+#define UPDATE_COUNTER(field, mask) \
+ do { \
+ if (mask(value)) \
+ p_atomic_inc(&counters->named.field.busy); \
+ else \
+ p_atomic_inc(&counters->named.field.idle); \
+ } while (0)
-static void si_update_mmio_counters(struct si_screen *sscreen,
- union si_mmio_counters *counters)
+static void si_update_mmio_counters(struct si_screen *sscreen, union si_mmio_counters *counters)
{
- uint32_t value = 0;
- bool gui_busy, sdma_busy = false;
-
- /* GRBM_STATUS */
- sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value);
-
- UPDATE_COUNTER(ta, TA_BUSY);
- UPDATE_COUNTER(gds, GDS_BUSY);
- UPDATE_COUNTER(vgt, VGT_BUSY);
- UPDATE_COUNTER(ia, IA_BUSY);
- UPDATE_COUNTER(sx, SX_BUSY);
- UPDATE_COUNTER(wd, WD_BUSY);
- UPDATE_COUNTER(spi, SPI_BUSY);
- UPDATE_COUNTER(bci, BCI_BUSY);
- UPDATE_COUNTER(sc, SC_BUSY);
- UPDATE_COUNTER(pa, PA_BUSY);
- UPDATE_COUNTER(db, DB_BUSY);
- UPDATE_COUNTER(cp, CP_BUSY);
- UPDATE_COUNTER(cb, CB_BUSY);
- UPDATE_COUNTER(gui, GUI_ACTIVE);
- gui_busy = GUI_ACTIVE(value);
-
- if (sscreen->info.chip_class == GFX7 || sscreen->info.chip_class == GFX8) {
- /* SRBM_STATUS2 */
- sscreen->ws->read_registers(sscreen->ws, SRBM_STATUS2, 1, &value);
-
- UPDATE_COUNTER(sdma, SDMA_BUSY);
- sdma_busy = SDMA_BUSY(value);
- }
-
- if (sscreen->info.chip_class >= GFX8) {
- /* CP_STAT */
- sscreen->ws->read_registers(sscreen->ws, CP_STAT, 1, &value);
-
- UPDATE_COUNTER(pfp, PFP_BUSY);
- UPDATE_COUNTER(meq, MEQ_BUSY);
- UPDATE_COUNTER(me, ME_BUSY);
- UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY);
- UPDATE_COUNTER(cp_dma, DMA_BUSY);
- UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY);
- }
-
- value = gui_busy || sdma_busy;
- UPDATE_COUNTER(gpu, IDENTITY);
+ uint32_t value = 0;
+ bool gui_busy, sdma_busy = false;
+
+ /* GRBM_STATUS */
+ sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value);
+
+ UPDATE_COUNTER(ta, TA_BUSY);
+ UPDATE_COUNTER(gds, GDS_BUSY);
+ UPDATE_COUNTER(vgt, VGT_BUSY);
+ UPDATE_COUNTER(ia, IA_BUSY);
+ UPDATE_COUNTER(sx, SX_BUSY);
+ UPDATE_COUNTER(wd, WD_BUSY);
+ UPDATE_COUNTER(spi, SPI_BUSY);
+ UPDATE_COUNTER(bci, BCI_BUSY);
+ UPDATE_COUNTER(sc, SC_BUSY);
+ UPDATE_COUNTER(pa, PA_BUSY);
+ UPDATE_COUNTER(db, DB_BUSY);
+ UPDATE_COUNTER(cp, CP_BUSY);
+ UPDATE_COUNTER(cb, CB_BUSY);
+ UPDATE_COUNTER(gui, GUI_ACTIVE);
+ gui_busy = GUI_ACTIVE(value);
+
+ if (sscreen->info.chip_class == GFX7 || sscreen->info.chip_class == GFX8) {
+ /* SRBM_STATUS2 */
+ sscreen->ws->read_registers(sscreen->ws, SRBM_STATUS2, 1, &value);
+
+ UPDATE_COUNTER(sdma, SDMA_BUSY);
+ sdma_busy = SDMA_BUSY(value);
+ }
+
+ if (sscreen->info.chip_class >= GFX8) {
+ /* CP_STAT */
+ sscreen->ws->read_registers(sscreen->ws, CP_STAT, 1, &value);
+
+ UPDATE_COUNTER(pfp, PFP_BUSY);
+ UPDATE_COUNTER(meq, MEQ_BUSY);
+ UPDATE_COUNTER(me, ME_BUSY);
+ UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY);
+ UPDATE_COUNTER(cp_dma, DMA_BUSY);
+ UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY);
+ }
+
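+   /* Overall GPU load: busy if either the graphics pipeline (GUI_ACTIVE) or SDMA is busy. */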
+ value = gui_busy || sdma_busy;
+ UPDATE_COUNTER(gpu, IDENTITY);
}
#undef UPDATE_COUNTER
-static int
-si_gpu_load_thread(void *param)
+static int si_gpu_load_thread(void *param)
{
- struct si_screen *sscreen = (struct si_screen*)param;
- const int period_us = 1000000 / SAMPLES_PER_SEC;
- int sleep_us = period_us;
- int64_t cur_time, last_time = os_time_get();
-
- while (!p_atomic_read(&sscreen->gpu_load_stop_thread)) {
- if (sleep_us)
- os_time_sleep(sleep_us);
-
- /* Make sure we sleep the ideal amount of time to match
- * the expected frequency. */
- cur_time = os_time_get();
-
- if (os_time_timeout(last_time, last_time + period_us,
- cur_time))
- sleep_us = MAX2(sleep_us - 1, 1);
- else
- sleep_us += 1;
-
- /*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/
- last_time = cur_time;
-
- /* Update the counters. */
- si_update_mmio_counters(sscreen, &sscreen->mmio_counters);
- }
- p_atomic_dec(&sscreen->gpu_load_stop_thread);
- return 0;
+ struct si_screen *sscreen = (struct si_screen *)param;
+ const int period_us = 1000000 / SAMPLES_PER_SEC;
+ int sleep_us = period_us;
+ int64_t cur_time, last_time = os_time_get();
+
+ while (!p_atomic_read(&sscreen->gpu_load_stop_thread)) {
+ if (sleep_us)
+ os_time_sleep(sleep_us);
+
+ /* Make sure we sleep the ideal amount of time to match
+ * the expected frequency. */
+ cur_time = os_time_get();
+
+ if (os_time_timeout(last_time, last_time + period_us, cur_time))
+ sleep_us = MAX2(sleep_us - 1, 1);
+ else
+ sleep_us += 1;
+
+ /*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/
+ last_time = cur_time;
+
+ /* Update the counters. */
+ si_update_mmio_counters(sscreen, &sscreen->mmio_counters);
+ }
+ p_atomic_dec(&sscreen->gpu_load_stop_thread);
+ return 0;
}
void si_gpu_load_kill_thread(struct si_screen *sscreen)
{
- if (!sscreen->gpu_load_thread)
- return;
+ if (!sscreen->gpu_load_thread)
+ return;
- p_atomic_inc(&sscreen->gpu_load_stop_thread);
- thrd_join(sscreen->gpu_load_thread, NULL);
- sscreen->gpu_load_thread = 0;
+ p_atomic_inc(&sscreen->gpu_load_stop_thread);
+ thrd_join(sscreen->gpu_load_thread, NULL);
+ sscreen->gpu_load_thread = 0;
}
-static uint64_t si_read_mmio_counter(struct si_screen *sscreen,
- unsigned busy_index)
+static uint64_t si_read_mmio_counter(struct si_screen *sscreen, unsigned busy_index)
{
- /* Start the thread if needed. */
- if (!sscreen->gpu_load_thread) {
- simple_mtx_lock(&sscreen->gpu_load_mutex);
- /* Check again inside the mutex. */
- if (!sscreen->gpu_load_thread)
- sscreen->gpu_load_thread =
- u_thread_create(si_gpu_load_thread, sscreen);
- simple_mtx_unlock(&sscreen->gpu_load_mutex);
- }
-
- unsigned busy = p_atomic_read(&sscreen->mmio_counters.array[busy_index]);
- unsigned idle = p_atomic_read(&sscreen->mmio_counters.array[busy_index + 1]);
-
- return busy | ((uint64_t)idle << 32);
+ /* Start the thread if needed. */
+ if (!sscreen->gpu_load_thread) {
+ simple_mtx_lock(&sscreen->gpu_load_mutex);
+ /* Check again inside the mutex. */
+ if (!sscreen->gpu_load_thread)
+ sscreen->gpu_load_thread = u_thread_create(si_gpu_load_thread, sscreen);
+ simple_mtx_unlock(&sscreen->gpu_load_mutex);
+ }
+
+ unsigned busy = p_atomic_read(&sscreen->mmio_counters.array[busy_index]);
+ unsigned idle = p_atomic_read(&sscreen->mmio_counters.array[busy_index + 1]);
+
+ return busy | ((uint64_t)idle << 32);
}
-static unsigned si_end_mmio_counter(struct si_screen *sscreen,
- uint64_t begin, unsigned busy_index)
+static unsigned si_end_mmio_counter(struct si_screen *sscreen, uint64_t begin, unsigned busy_index)
{
- uint64_t end = si_read_mmio_counter(sscreen, busy_index);
- unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff);
- unsigned idle = (end >> 32) - (begin >> 32);
-
- /* Calculate the % of time the busy counter was being incremented.
- *
- * If no counters were incremented, return the current counter status.
- * It's for the case when the load is queried faster than
- * the counters are updated.
- */
- if (idle || busy) {
- return busy*100 / (busy + idle);
- } else {
- union si_mmio_counters counters;
-
- memset(&counters, 0, sizeof(counters));
- si_update_mmio_counters(sscreen, &counters);
- return counters.array[busy_index] ? 100 : 0;
- }
+ uint64_t end = si_read_mmio_counter(sscreen, busy_index);
+ unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff);
+ unsigned idle = (end >> 32) - (begin >> 32);
+
+ /* Calculate the % of time the busy counter was being incremented.
+ *
+ * If no counters were incremented, return the current counter status.
+    * This covers the case where the load is queried faster than
+    * the counters are updated.
+ */
+ if (idle || busy) {
+ return busy * 100 / (busy + idle);
+ } else {
+ union si_mmio_counters counters;
+
+ memset(&counters, 0, sizeof(counters));
+ si_update_mmio_counters(sscreen, &counters);
+ return counters.array[busy_index] ? 100 : 0;
+ }
}
-#define BUSY_INDEX(sscreen, field) (&sscreen->mmio_counters.named.field.busy - \
- sscreen->mmio_counters.array)
+#define BUSY_INDEX(sscreen, field) \
+ (&sscreen->mmio_counters.named.field.busy - sscreen->mmio_counters.array)
-static unsigned busy_index_from_type(struct si_screen *sscreen,
- unsigned type)
+static unsigned busy_index_from_type(struct si_screen *sscreen, unsigned type)
{
- switch (type) {
- case SI_QUERY_GPU_LOAD:
- return BUSY_INDEX(sscreen, gpu);
- case SI_QUERY_GPU_SHADERS_BUSY:
- return BUSY_INDEX(sscreen, spi);
- case SI_QUERY_GPU_TA_BUSY:
- return BUSY_INDEX(sscreen, ta);
- case SI_QUERY_GPU_GDS_BUSY:
- return BUSY_INDEX(sscreen, gds);
- case SI_QUERY_GPU_VGT_BUSY:
- return BUSY_INDEX(sscreen, vgt);
- case SI_QUERY_GPU_IA_BUSY:
- return BUSY_INDEX(sscreen, ia);
- case SI_QUERY_GPU_SX_BUSY:
- return BUSY_INDEX(sscreen, sx);
- case SI_QUERY_GPU_WD_BUSY:
- return BUSY_INDEX(sscreen, wd);
- case SI_QUERY_GPU_BCI_BUSY:
- return BUSY_INDEX(sscreen, bci);
- case SI_QUERY_GPU_SC_BUSY:
- return BUSY_INDEX(sscreen, sc);
- case SI_QUERY_GPU_PA_BUSY:
- return BUSY_INDEX(sscreen, pa);
- case SI_QUERY_GPU_DB_BUSY:
- return BUSY_INDEX(sscreen, db);
- case SI_QUERY_GPU_CP_BUSY:
- return BUSY_INDEX(sscreen, cp);
- case SI_QUERY_GPU_CB_BUSY:
- return BUSY_INDEX(sscreen, cb);
- case SI_QUERY_GPU_SDMA_BUSY:
- return BUSY_INDEX(sscreen, sdma);
- case SI_QUERY_GPU_PFP_BUSY:
- return BUSY_INDEX(sscreen, pfp);
- case SI_QUERY_GPU_MEQ_BUSY:
- return BUSY_INDEX(sscreen, meq);
- case SI_QUERY_GPU_ME_BUSY:
- return BUSY_INDEX(sscreen, me);
- case SI_QUERY_GPU_SURF_SYNC_BUSY:
- return BUSY_INDEX(sscreen, surf_sync);
- case SI_QUERY_GPU_CP_DMA_BUSY:
- return BUSY_INDEX(sscreen, cp_dma);
- case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
- return BUSY_INDEX(sscreen, scratch_ram);
- default:
- unreachable("invalid query type");
- }
+ switch (type) {
+ case SI_QUERY_GPU_LOAD:
+ return BUSY_INDEX(sscreen, gpu);
+ case SI_QUERY_GPU_SHADERS_BUSY:
+ return BUSY_INDEX(sscreen, spi);
+ case SI_QUERY_GPU_TA_BUSY:
+ return BUSY_INDEX(sscreen, ta);
+ case SI_QUERY_GPU_GDS_BUSY:
+ return BUSY_INDEX(sscreen, gds);
+ case SI_QUERY_GPU_VGT_BUSY:
+ return BUSY_INDEX(sscreen, vgt);
+ case SI_QUERY_GPU_IA_BUSY:
+ return BUSY_INDEX(sscreen, ia);
+ case SI_QUERY_GPU_SX_BUSY:
+ return BUSY_INDEX(sscreen, sx);
+ case SI_QUERY_GPU_WD_BUSY:
+ return BUSY_INDEX(sscreen, wd);
+ case SI_QUERY_GPU_BCI_BUSY:
+ return BUSY_INDEX(sscreen, bci);
+ case SI_QUERY_GPU_SC_BUSY:
+ return BUSY_INDEX(sscreen, sc);
+ case SI_QUERY_GPU_PA_BUSY:
+ return BUSY_INDEX(sscreen, pa);
+ case SI_QUERY_GPU_DB_BUSY:
+ return BUSY_INDEX(sscreen, db);
+ case SI_QUERY_GPU_CP_BUSY:
+ return BUSY_INDEX(sscreen, cp);
+ case SI_QUERY_GPU_CB_BUSY:
+ return BUSY_INDEX(sscreen, cb);
+ case SI_QUERY_GPU_SDMA_BUSY:
+ return BUSY_INDEX(sscreen, sdma);
+ case SI_QUERY_GPU_PFP_BUSY:
+ return BUSY_INDEX(sscreen, pfp);
+ case SI_QUERY_GPU_MEQ_BUSY:
+ return BUSY_INDEX(sscreen, meq);
+ case SI_QUERY_GPU_ME_BUSY:
+ return BUSY_INDEX(sscreen, me);
+ case SI_QUERY_GPU_SURF_SYNC_BUSY:
+ return BUSY_INDEX(sscreen, surf_sync);
+ case SI_QUERY_GPU_CP_DMA_BUSY:
+ return BUSY_INDEX(sscreen, cp_dma);
+ case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
+ return BUSY_INDEX(sscreen, scratch_ram);
+ default:
+ unreachable("invalid query type");
+ }
}
uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type)
{
- unsigned busy_index = busy_index_from_type(sscreen, type);
- return si_read_mmio_counter(sscreen, busy_index);
+ unsigned busy_index = busy_index_from_type(sscreen, type);
+ return si_read_mmio_counter(sscreen, busy_index);
}
-unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
- uint64_t begin)
+unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin)
{
- unsigned busy_index = busy_index_from_type(sscreen, type);
- return si_end_mmio_counter(sscreen, begin, busy_index);
+ unsigned busy_index = busy_index_from_type(sscreen, type);
+ return si_end_mmio_counter(sscreen, begin, busy_index);
}
#include "si_query.h"
#include "util/u_memory.h"
+enum si_pc_block_flags
+{
+ /* This block is part of the shader engine */
+ SI_PC_BLOCK_SE = (1 << 0),
-enum si_pc_block_flags {
- /* This block is part of the shader engine */
- SI_PC_BLOCK_SE = (1 << 0),
-
- /* Expose per-instance groups instead of summing all instances (within
- * an SE). */
- SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),
+ /* Expose per-instance groups instead of summing all instances (within
+ * an SE). */
+ SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),
- /* Expose per-SE groups instead of summing instances across SEs. */
- SI_PC_BLOCK_SE_GROUPS = (1 << 2),
+ /* Expose per-SE groups instead of summing instances across SEs. */
+ SI_PC_BLOCK_SE_GROUPS = (1 << 2),
- /* Shader block */
- SI_PC_BLOCK_SHADER = (1 << 3),
+ /* Shader block */
+ SI_PC_BLOCK_SHADER = (1 << 3),
- /* Non-shader block with perfcounters windowed by shaders. */
- SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
+ /* Non-shader block with perfcounters windowed by shaders. */
+ SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
};
-enum si_pc_reg_layout {
- /* All secondary selector dwords follow as one block after the primary
- * selector dwords for the counters that have secondary selectors.
- */
- SI_PC_MULTI_BLOCK = 0,
+enum si_pc_reg_layout
+{
+ /* All secondary selector dwords follow as one block after the primary
+ * selector dwords for the counters that have secondary selectors.
+ */
+ SI_PC_MULTI_BLOCK = 0,
- /* Each secondary selector dword follows immediately afters the
- * corresponding primary.
- */
- SI_PC_MULTI_ALTERNATE = 1,
+   /* Each secondary selector dword follows immediately after the
+    * corresponding primary.
+ */
+ SI_PC_MULTI_ALTERNATE = 1,
- /* All secondary selector dwords follow as one block after all primary
- * selector dwords.
- */
- SI_PC_MULTI_TAIL = 2,
+ /* All secondary selector dwords follow as one block after all primary
+ * selector dwords.
+ */
+ SI_PC_MULTI_TAIL = 2,
- /* Free-form arrangement of selector registers. */
- SI_PC_MULTI_CUSTOM = 3,
+ /* Free-form arrangement of selector registers. */
+ SI_PC_MULTI_CUSTOM = 3,
- SI_PC_MULTI_MASK = 3,
+ SI_PC_MULTI_MASK = 3,
- /* Registers are laid out in decreasing rather than increasing order. */
- SI_PC_REG_REVERSE = 4,
+ /* Registers are laid out in decreasing rather than increasing order. */
+ SI_PC_REG_REVERSE = 4,
- SI_PC_FAKE = 8,
+ SI_PC_FAKE = 8,
};
struct si_pc_block_base {
- const char *name;
- unsigned num_counters;
- unsigned flags;
-
- unsigned select_or;
- unsigned select0;
- unsigned counter0_lo;
- unsigned *select;
- unsigned *counters;
- unsigned num_multi;
- unsigned num_prelude;
- unsigned layout;
+ const char *name;
+ unsigned num_counters;
+ unsigned flags;
+
+ unsigned select_or;
+ unsigned select0;
+ unsigned counter0_lo;
+ unsigned *select;
+ unsigned *counters;
+ unsigned num_multi;
+ unsigned num_prelude;
+ unsigned layout;
};
struct si_pc_block_gfxdescr {
- struct si_pc_block_base *b;
- unsigned selectors;
- unsigned instances;
+ struct si_pc_block_base *b;
+ unsigned selectors;
+ unsigned instances;
};
struct si_pc_block {
- const struct si_pc_block_gfxdescr *b;
- unsigned num_instances;
+ const struct si_pc_block_gfxdescr *b;
+ unsigned num_instances;
- unsigned num_groups;
- char *group_names;
- unsigned group_name_stride;
+ unsigned num_groups;
+ char *group_names;
+ unsigned group_name_stride;
- char *selector_names;
- unsigned selector_name_stride;
+ char *selector_names;
+ unsigned selector_name_stride;
};
/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
* performance counter group IDs.
*/
-static const char * const si_pc_shader_type_suffixes[] = {
- "", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS"
-};
+static const char *const si_pc_shader_type_suffixes[] = {"", "_ES", "_GS", "_VS",
+ "_PS", "_LS", "_HS", "_CS"};
static const unsigned si_pc_shader_type_bits[] = {
- 0x7f,
- S_036780_ES_EN(1),
- S_036780_GS_EN(1),
- S_036780_VS_EN(1),
- S_036780_PS_EN(1),
- S_036780_LS_EN(1),
- S_036780_HS_EN(1),
- S_036780_CS_EN(1),
+ 0x7f,
+ S_036780_ES_EN(1),
+ S_036780_GS_EN(1),
+ S_036780_VS_EN(1),
+ S_036780_PS_EN(1),
+ S_036780_LS_EN(1),
+ S_036780_HS_EN(1),
+ S_036780_CS_EN(1),
};
/* Max counters per HW block */
#define SI_PC_SHADERS_WINDOWING (1u << 31)
struct si_query_group {
- struct si_query_group *next;
- struct si_pc_block *block;
- unsigned sub_gid; /* only used during init */
- unsigned result_base; /* only used during init */
- int se;
- int instance;
- unsigned num_counters;
- unsigned selectors[SI_QUERY_MAX_COUNTERS];
+ struct si_query_group *next;
+ struct si_pc_block *block;
+ unsigned sub_gid; /* only used during init */
+ unsigned result_base; /* only used during init */
+ int se;
+ int instance;
+ unsigned num_counters;
+ unsigned selectors[SI_QUERY_MAX_COUNTERS];
};
struct si_query_counter {
- unsigned base;
- unsigned qwords;
- unsigned stride; /* in uint64s */
+ unsigned base;
+ unsigned qwords;
+ unsigned stride; /* in uint64s */
};
struct si_query_pc {
- struct si_query b;
- struct si_query_buffer buffer;
+ struct si_query b;
+ struct si_query_buffer buffer;
- /* Size of the results in memory, in bytes. */
- unsigned result_size;
+ /* Size of the results in memory, in bytes. */
+ unsigned result_size;
- unsigned shaders;
- unsigned num_counters;
- struct si_query_counter *counters;
- struct si_query_group *groups;
+ unsigned shaders;
+ unsigned num_counters;
+ struct si_query_counter *counters;
+ struct si_query_group *groups;
};
-
static struct si_pc_block_base cik_CB = {
- .name = "CB",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,
-
- .select0 = R_037000_CB_PERFCOUNTER_FILTER,
- .counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
- .num_multi = 1,
- .num_prelude = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
+ .name = "CB",
+ .num_counters = 4,
+ .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,
+
+ .select0 = R_037000_CB_PERFCOUNTER_FILTER,
+ .counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
+ .num_multi = 1,
+ .num_prelude = 1,
+ .layout = SI_PC_MULTI_ALTERNATE,
};
static unsigned cik_CPC_select[] = {
- R_036024_CPC_PERFCOUNTER0_SELECT,
- R_036010_CPC_PERFCOUNTER0_SELECT1,
- R_03600C_CPC_PERFCOUNTER1_SELECT,
+ R_036024_CPC_PERFCOUNTER0_SELECT,
+ R_036010_CPC_PERFCOUNTER0_SELECT1,
+ R_03600C_CPC_PERFCOUNTER1_SELECT,
};
static struct si_pc_block_base cik_CPC = {
- .name = "CPC",
- .num_counters = 2,
+ .name = "CPC",
+ .num_counters = 2,
- .select = cik_CPC_select,
- .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
+ .select = cik_CPC_select,
+ .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
+ .num_multi = 1,
+ .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
};
static struct si_pc_block_base cik_CPF = {
- .name = "CPF",
- .num_counters = 2,
+ .name = "CPF",
+ .num_counters = 2,
- .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
+ .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
+ .num_multi = 1,
+ .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};
static struct si_pc_block_base cik_CPG = {
- .name = "CPG",
- .num_counters = 2,
+ .name = "CPG",
+ .num_counters = 2,
- .select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
+ .select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
+ .num_multi = 1,
+ .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};
static struct si_pc_block_base cik_DB = {
- .name = "DB",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,
-
- .select0 = R_037100_DB_PERFCOUNTER0_SELECT,
- .counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
- .num_multi = 3, // really only 2, but there's a gap between registers
- .layout = SI_PC_MULTI_ALTERNATE,
+ .name = "DB",
+ .num_counters = 4,
+ .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,
+
+ .select0 = R_037100_DB_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
+ .num_multi = 3, // really only 2, but there's a gap between registers
+ .layout = SI_PC_MULTI_ALTERNATE,
};
static struct si_pc_block_base cik_GDS = {
- .name = "GDS",
- .num_counters = 4,
+ .name = "GDS",
+ .num_counters = 4,
- .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_TAIL,
+ .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
+ .num_multi = 1,
+ .layout = SI_PC_MULTI_TAIL,
};
static unsigned cik_GRBM_counters[] = {
- R_034100_GRBM_PERFCOUNTER0_LO,
- R_03410C_GRBM_PERFCOUNTER1_LO,
+ R_034100_GRBM_PERFCOUNTER0_LO,
+ R_03410C_GRBM_PERFCOUNTER1_LO,
};
static struct si_pc_block_base cik_GRBM = {
- .name = "GRBM",
- .num_counters = 2,
+ .name = "GRBM",
+ .num_counters = 2,
- .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
- .counters = cik_GRBM_counters,
+ .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
+ .counters = cik_GRBM_counters,
};
static struct si_pc_block_base cik_GRBMSE = {
- .name = "GRBMSE",
- .num_counters = 4,
+ .name = "GRBMSE",
+ .num_counters = 4,
- .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
- .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
+ .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
+ .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
};
static struct si_pc_block_base cik_IA = {
- .name = "IA",
- .num_counters = 4,
+ .name = "IA",
+ .num_counters = 4,
- .select0 = R_036210_IA_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_TAIL,
+ .select0 = R_036210_IA_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
+ .num_multi = 1,
+ .layout = SI_PC_MULTI_TAIL,
};
static struct si_pc_block_base cik_PA_SC = {
- .name = "PA_SC",
- .num_counters = 8,
- .flags = SI_PC_BLOCK_SE,
-
- .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
+ .name = "PA_SC",
+ .num_counters = 8,
+ .flags = SI_PC_BLOCK_SE,
+
+ .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
+ .num_multi = 1,
+ .layout = SI_PC_MULTI_ALTERNATE,
};
/* According to docs, PA_SU counters are only 48 bits wide. */
static struct si_pc_block_base cik_PA_SU = {
- .name = "PA_SU",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE,
-
- .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
- .num_multi = 2,
- .layout = SI_PC_MULTI_ALTERNATE,
+ .name = "PA_SU",
+ .num_counters = 4,
+ .flags = SI_PC_BLOCK_SE,
+
+ .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
+ .num_multi = 2,
+ .layout = SI_PC_MULTI_ALTERNATE,
};
static struct si_pc_block_base cik_SPI = {
- .name = "SPI",
- .num_counters = 6,
- .flags = SI_PC_BLOCK_SE,
-
- .select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
- .num_multi = 4,
- .layout = SI_PC_MULTI_BLOCK,
+ .name = "SPI",
+ .num_counters = 6,
+ .flags = SI_PC_BLOCK_SE,
+
+ .select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
+ .num_multi = 4,
+ .layout = SI_PC_MULTI_BLOCK,
};
static struct si_pc_block_base cik_SQ = {
- .name = "SQ",
- .num_counters = 16,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER,
-
- .select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
- .select_or = S_036700_SQC_BANK_MASK(15) |
- S_036700_SQC_CLIENT_MASK(15) |
- S_036700_SIMD_MASK(15),
- .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
+ .name = "SQ",
+ .num_counters = 16,
+ .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER,
+
+ .select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
+ .select_or = S_036700_SQC_BANK_MASK(15) | S_036700_SQC_CLIENT_MASK(15) | S_036700_SIMD_MASK(15),
+ .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
};
static struct si_pc_block_base cik_SX = {
- .name = "SX",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE,
-
- .select0 = R_036900_SX_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
- .num_multi = 2,
- .layout = SI_PC_MULTI_TAIL,
+ .name = "SX",
+ .num_counters = 4,
+ .flags = SI_PC_BLOCK_SE,
+
+ .select0 = R_036900_SX_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
+ .num_multi = 2,
+ .layout = SI_PC_MULTI_TAIL,
};
static struct si_pc_block_base cik_TA = {
- .name = "TA",
- .num_counters = 2,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
-
- .select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
+ .name = "TA",
+ .num_counters = 2,
+ .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
+
+ .select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
+ .num_multi = 1,
+ .layout = SI_PC_MULTI_ALTERNATE,
};
static struct si_pc_block_base cik_TD = {
- .name = "TD",
- .num_counters = 2,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
-
- .select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_ALTERNATE,
+ .name = "TD",
+ .num_counters = 2,
+ .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
+
+ .select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
+ .num_multi = 1,
+ .layout = SI_PC_MULTI_ALTERNATE,
};
static struct si_pc_block_base cik_TCA = {
- .name = "TCA",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_INSTANCE_GROUPS,
-
- .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
- .num_multi = 2,
- .layout = SI_PC_MULTI_ALTERNATE,
+ .name = "TCA",
+ .num_counters = 4,
+ .flags = SI_PC_BLOCK_INSTANCE_GROUPS,
+
+ .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
+ .num_multi = 2,
+ .layout = SI_PC_MULTI_ALTERNATE,
};
static struct si_pc_block_base cik_TCC = {
- .name = "TCC",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_INSTANCE_GROUPS,
-
- .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
- .num_multi = 2,
- .layout = SI_PC_MULTI_ALTERNATE,
+ .name = "TCC",
+ .num_counters = 4,
+ .flags = SI_PC_BLOCK_INSTANCE_GROUPS,
+
+ .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
+ .num_multi = 2,
+ .layout = SI_PC_MULTI_ALTERNATE,
};
static struct si_pc_block_base cik_TCP = {
- .name = "TCP",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
-
- .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
- .num_multi = 2,
- .layout = SI_PC_MULTI_ALTERNATE,
+ .name = "TCP",
+ .num_counters = 4,
+ .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,
+
+ .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
+ .num_multi = 2,
+ .layout = SI_PC_MULTI_ALTERNATE,
};
static struct si_pc_block_base cik_VGT = {
- .name = "VGT",
- .num_counters = 4,
- .flags = SI_PC_BLOCK_SE,
-
- .select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
- .num_multi = 1,
- .layout = SI_PC_MULTI_TAIL,
+ .name = "VGT",
+ .num_counters = 4,
+ .flags = SI_PC_BLOCK_SE,
+
+ .select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
+ .num_multi = 1,
+ .layout = SI_PC_MULTI_TAIL,
};
static struct si_pc_block_base cik_WD = {
- .name = "WD",
- .num_counters = 4,
+ .name = "WD",
+ .num_counters = 4,
- .select0 = R_036200_WD_PERFCOUNTER0_SELECT,
- .counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
+ .select0 = R_036200_WD_PERFCOUNTER0_SELECT,
+ .counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
};
static struct si_pc_block_base cik_MC = {
- .name = "MC",
- .num_counters = 4,
+ .name = "MC",
+ .num_counters = 4,
- .layout = SI_PC_FAKE,
+ .layout = SI_PC_FAKE,
};
static struct si_pc_block_base cik_SRBM = {
- .name = "SRBM",
- .num_counters = 2,
+ .name = "SRBM",
+ .num_counters = 2,
- .layout = SI_PC_FAKE,
+ .layout = SI_PC_FAKE,
};
/* Both the number of instances and selectors vary between chips of the same
 * class. The order of blocks here matters.
 */
static struct si_pc_block_gfxdescr groups_CIK[] = {
- { &cik_CB, 226},
- { &cik_CPF, 17 },
- { &cik_DB, 257},
- { &cik_GRBM, 34 },
- { &cik_GRBMSE, 15 },
- { &cik_PA_SU, 153 },
- { &cik_PA_SC, 395 },
- { &cik_SPI, 186 },
- { &cik_SQ, 252 },
- { &cik_SX, 32 },
- { &cik_TA, 111, 11 },
- { &cik_TCA, 39, 2 },
- { &cik_TCC, 160},
- { &cik_TD, 55, 11 },
- { &cik_TCP, 154, 11 },
- { &cik_GDS, 121 },
- { &cik_VGT, 140 },
- { &cik_IA, 22 },
- { &cik_MC, 22 },
- { &cik_SRBM, 19 },
- { &cik_WD, 22 },
- { &cik_CPG, 46 },
- { &cik_CPC, 22 },
+ {&cik_CB, 226}, {&cik_CPF, 17}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15},
+ {&cik_PA_SU, 153}, {&cik_PA_SC, 395}, {&cik_SPI, 186}, {&cik_SQ, 252}, {&cik_SX, 32},
+ {&cik_TA, 111, 11}, {&cik_TCA, 39, 2}, {&cik_TCC, 160}, {&cik_TD, 55, 11}, {&cik_TCP, 154, 11},
+ {&cik_GDS, 121}, {&cik_VGT, 140}, {&cik_IA, 22}, {&cik_MC, 22}, {&cik_SRBM, 19},
+ {&cik_WD, 22}, {&cik_CPG, 46}, {&cik_CPC, 22},
};
static struct si_pc_block_gfxdescr groups_VI[] = {
- { &cik_CB, 405},
- { &cik_CPF, 19 },
- { &cik_DB, 257},
- { &cik_GRBM, 34 },
- { &cik_GRBMSE, 15 },
- { &cik_PA_SU, 154 },
- { &cik_PA_SC, 397 },
- { &cik_SPI, 197 },
- { &cik_SQ, 273 },
- { &cik_SX, 34 },
- { &cik_TA, 119, 16 },
- { &cik_TCA, 35, 2 },
- { &cik_TCC, 192},
- { &cik_TD, 55, 16 },
- { &cik_TCP, 180, 16 },
- { &cik_GDS, 121 },
- { &cik_VGT, 147 },
- { &cik_IA, 24 },
- { &cik_MC, 22 },
- { &cik_SRBM, 27 },
- { &cik_WD, 37 },
- { &cik_CPG, 48 },
- { &cik_CPC, 24 },
+ {&cik_CB, 405}, {&cik_CPF, 19}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15},
+ {&cik_PA_SU, 154}, {&cik_PA_SC, 397}, {&cik_SPI, 197}, {&cik_SQ, 273}, {&cik_SX, 34},
+ {&cik_TA, 119, 16}, {&cik_TCA, 35, 2}, {&cik_TCC, 192}, {&cik_TD, 55, 16}, {&cik_TCP, 180, 16},
+ {&cik_GDS, 121}, {&cik_VGT, 147}, {&cik_IA, 24}, {&cik_MC, 22}, {&cik_SRBM, 27},
+ {&cik_WD, 37}, {&cik_CPG, 48}, {&cik_CPC, 24},
};
static struct si_pc_block_gfxdescr groups_gfx9[] = {
- { &cik_CB, 438},
- { &cik_CPF, 32 },
- { &cik_DB, 328},
- { &cik_GRBM, 38 },
- { &cik_GRBMSE, 16 },
- { &cik_PA_SU, 292 },
- { &cik_PA_SC, 491 },
- { &cik_SPI, 196 },
- { &cik_SQ, 374 },
- { &cik_SX, 208 },
- { &cik_TA, 119, 16 },
- { &cik_TCA, 35, 2 },
- { &cik_TCC, 256},
- { &cik_TD, 57, 16 },
- { &cik_TCP, 85, 16 },
- { &cik_GDS, 121 },
- { &cik_VGT, 148 },
- { &cik_IA, 32 },
- { &cik_WD, 58 },
- { &cik_CPG, 59 },
- { &cik_CPC, 35 },
+ {&cik_CB, 438}, {&cik_CPF, 32}, {&cik_DB, 328}, {&cik_GRBM, 38}, {&cik_GRBMSE, 16},
+ {&cik_PA_SU, 292}, {&cik_PA_SC, 491}, {&cik_SPI, 196}, {&cik_SQ, 374}, {&cik_SX, 208},
+ {&cik_TA, 119, 16}, {&cik_TCA, 35, 2}, {&cik_TCC, 256}, {&cik_TD, 57, 16}, {&cik_TCP, 85, 16},
+ {&cik_GDS, 121}, {&cik_VGT, 148}, {&cik_IA, 32}, {&cik_WD, 58}, {&cik_CPG, 59},
+ {&cik_CPC, 35},
};
static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc,
- const struct si_pc_block *block)
+ const struct si_pc_block *block)
{
- return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS ||
- (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se);
+ return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS ||
+ (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se);
}
static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters *pc,
- const struct si_pc_block *block)
+ const struct si_pc_block *block)
{
- return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS ||
- (block->num_instances > 1 && pc->separate_instance);
+ return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS ||
+ (block->num_instances > 1 && pc->separate_instance);
}
-static struct si_pc_block *
-lookup_counter(struct si_perfcounters *pc, unsigned index,
- unsigned *base_gid, unsigned *sub_index)
+static struct si_pc_block *lookup_counter(struct si_perfcounters *pc, unsigned index,
+ unsigned *base_gid, unsigned *sub_index)
{
- struct si_pc_block *block = pc->blocks;
- unsigned bid;
+ struct si_pc_block *block = pc->blocks;
+ unsigned bid;
- *base_gid = 0;
- for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
- unsigned total = block->num_groups * block->b->selectors;
+ *base_gid = 0;
+ for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
+ unsigned total = block->num_groups * block->b->selectors;
- if (index < total) {
- *sub_index = index;
- return block;
- }
+ if (index < total) {
+ *sub_index = index;
+ return block;
+ }
- index -= total;
- *base_gid += block->num_groups;
- }
+ index -= total;
+ *base_gid += block->num_groups;
+ }
- return NULL;
+ return NULL;
}
-static struct si_pc_block *
-lookup_group(struct si_perfcounters *pc, unsigned *index)
+static struct si_pc_block *lookup_group(struct si_perfcounters *pc, unsigned *index)
{
- unsigned bid;
- struct si_pc_block *block = pc->blocks;
+ unsigned bid;
+ struct si_pc_block *block = pc->blocks;
- for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
- if (*index < block->num_groups)
- return block;
- *index -= block->num_groups;
- }
+ for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
+ if (*index < block->num_groups)
+ return block;
+ *index -= block->num_groups;
+ }
- return NULL;
+ return NULL;
}
-static void si_pc_emit_instance(struct si_context *sctx,
- int se, int instance)
+static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned value = S_030800_SH_BROADCAST_WRITES(1);
-
- if (se >= 0) {
- value |= S_030800_SE_INDEX(se);
- } else {
- value |= S_030800_SE_BROADCAST_WRITES(1);
- }
-
- if (instance >= 0) {
- value |= S_030800_INSTANCE_INDEX(instance);
- } else {
- value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
- }
-
- radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned value = S_030800_SH_BROADCAST_WRITES(1);
+
+ if (se >= 0) {
+ value |= S_030800_SE_INDEX(se);
+ } else {
+ value |= S_030800_SE_BROADCAST_WRITES(1);
+ }
+
+ if (instance >= 0) {
+ value |= S_030800_INSTANCE_INDEX(instance);
+ } else {
+ value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
+ }
+
+ radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
}
-static void si_pc_emit_shaders(struct si_context *sctx,
- unsigned shaders)
+static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
- radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
- radeon_emit(cs, shaders & 0x7f);
- radeon_emit(cs, 0xffffffff);
+ radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
+ radeon_emit(cs, shaders & 0x7f);
+ radeon_emit(cs, 0xffffffff);
}
-static void si_pc_emit_select(struct si_context *sctx,
- struct si_pc_block *block,
- unsigned count, unsigned *selectors)
+static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block, unsigned count,
+ unsigned *selectors)
{
- struct si_pc_block_base *regs = block->b->b;
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned idx;
- unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
- unsigned dw;
-
- assert(count <= regs->num_counters);
-
- if (regs->layout & SI_PC_FAKE)
- return;
-
- if (layout_multi == SI_PC_MULTI_BLOCK) {
- assert(!(regs->layout & SI_PC_REG_REVERSE));
-
- dw = count + regs->num_prelude;
- if (count >= regs->num_multi)
- dw += regs->num_multi;
- radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
- for (idx = 0; idx < regs->num_prelude; ++idx)
- radeon_emit(cs, 0);
- for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
- radeon_emit(cs, selectors[idx] | regs->select_or);
-
- if (count < regs->num_multi) {
- unsigned select1 =
- regs->select0 + 4 * regs->num_multi;
- radeon_set_uconfig_reg_seq(cs, select1, count);
- }
-
- for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
- radeon_emit(cs, 0);
-
- if (count > regs->num_multi) {
- for (idx = regs->num_multi; idx < count; ++idx)
- radeon_emit(cs, selectors[idx] | regs->select_or);
- }
- } else if (layout_multi == SI_PC_MULTI_TAIL) {
- unsigned select1, select1_count;
-
- assert(!(regs->layout & SI_PC_REG_REVERSE));
-
- radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude);
- for (idx = 0; idx < regs->num_prelude; ++idx)
- radeon_emit(cs, 0);
- for (idx = 0; idx < count; ++idx)
- radeon_emit(cs, selectors[idx] | regs->select_or);
-
- select1 = regs->select0 + 4 * regs->num_counters;
- select1_count = MIN2(count, regs->num_multi);
- radeon_set_uconfig_reg_seq(cs, select1, select1_count);
- for (idx = 0; idx < select1_count; ++idx)
- radeon_emit(cs, 0);
- } else if (layout_multi == SI_PC_MULTI_CUSTOM) {
- unsigned *reg = regs->select;
- for (idx = 0; idx < count; ++idx) {
- radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
- if (idx < regs->num_multi)
- radeon_set_uconfig_reg(cs, *reg++, 0);
- }
- } else {
- assert(layout_multi == SI_PC_MULTI_ALTERNATE);
-
- unsigned reg_base = regs->select0;
- unsigned reg_count = count + MIN2(count, regs->num_multi);
- reg_count += regs->num_prelude;
-
- if (!(regs->layout & SI_PC_REG_REVERSE)) {
- radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);
-
- for (idx = 0; idx < regs->num_prelude; ++idx)
- radeon_emit(cs, 0);
- for (idx = 0; idx < count; ++idx) {
- radeon_emit(cs, selectors[idx] | regs->select_or);
- if (idx < regs->num_multi)
- radeon_emit(cs, 0);
- }
- } else {
- reg_base -= (reg_count - 1) * 4;
- radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);
-
- for (idx = count; idx > 0; --idx) {
- if (idx <= regs->num_multi)
- radeon_emit(cs, 0);
- radeon_emit(cs, selectors[idx - 1] | regs->select_or);
- }
- for (idx = 0; idx < regs->num_prelude; ++idx)
- radeon_emit(cs, 0);
- }
- }
+ struct si_pc_block_base *regs = block->b->b;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned idx;
+ unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
+ unsigned dw;
+
+ assert(count <= regs->num_counters);
+
+ if (regs->layout & SI_PC_FAKE)
+ return;
+
+ if (layout_multi == SI_PC_MULTI_BLOCK) {
+ assert(!(regs->layout & SI_PC_REG_REVERSE));
+
+ dw = count + regs->num_prelude;
+ if (count >= regs->num_multi)
+ dw += regs->num_multi;
+ radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
+ for (idx = 0; idx < regs->num_prelude; ++idx)
+ radeon_emit(cs, 0);
+ for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
+ radeon_emit(cs, selectors[idx] | regs->select_or);
+
+ if (count < regs->num_multi) {
+ unsigned select1 = regs->select0 + 4 * regs->num_multi;
+ radeon_set_uconfig_reg_seq(cs, select1, count);
+ }
+
+ for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
+ radeon_emit(cs, 0);
+
+ if (count > regs->num_multi) {
+ for (idx = regs->num_multi; idx < count; ++idx)
+ radeon_emit(cs, selectors[idx] | regs->select_or);
+ }
+ } else if (layout_multi == SI_PC_MULTI_TAIL) {
+ unsigned select1, select1_count;
+
+ assert(!(regs->layout & SI_PC_REG_REVERSE));
+
+ radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude);
+ for (idx = 0; idx < regs->num_prelude; ++idx)
+ radeon_emit(cs, 0);
+ for (idx = 0; idx < count; ++idx)
+ radeon_emit(cs, selectors[idx] | regs->select_or);
+
+ select1 = regs->select0 + 4 * regs->num_counters;
+ select1_count = MIN2(count, regs->num_multi);
+ radeon_set_uconfig_reg_seq(cs, select1, select1_count);
+ for (idx = 0; idx < select1_count; ++idx)
+ radeon_emit(cs, 0);
+ } else if (layout_multi == SI_PC_MULTI_CUSTOM) {
+ unsigned *reg = regs->select;
+ for (idx = 0; idx < count; ++idx) {
+ radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
+ if (idx < regs->num_multi)
+ radeon_set_uconfig_reg(cs, *reg++, 0);
+ }
+ } else {
+ assert(layout_multi == SI_PC_MULTI_ALTERNATE);
+
+ unsigned reg_base = regs->select0;
+ unsigned reg_count = count + MIN2(count, regs->num_multi);
+ reg_count += regs->num_prelude;
+
+ if (!(regs->layout & SI_PC_REG_REVERSE)) {
+ radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);
+
+ for (idx = 0; idx < regs->num_prelude; ++idx)
+ radeon_emit(cs, 0);
+ for (idx = 0; idx < count; ++idx) {
+ radeon_emit(cs, selectors[idx] | regs->select_or);
+ if (idx < regs->num_multi)
+ radeon_emit(cs, 0);
+ }
+ } else {
+ reg_base -= (reg_count - 1) * 4;
+ radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);
+
+ for (idx = count; idx > 0; --idx) {
+ if (idx <= regs->num_multi)
+ radeon_emit(cs, 0);
+ radeon_emit(cs, selectors[idx - 1] | regs->select_or);
+ }
+ for (idx = 0; idx < regs->num_prelude; ++idx)
+ radeon_emit(cs, 0);
+ }
+ }
}
-static void si_pc_emit_start(struct si_context *sctx,
- struct si_resource *buffer, uint64_t va)
+static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
- si_cp_copy_data(sctx, sctx->gfx_cs,
- COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
- COPY_DATA_IMM, NULL, 1);
-
- radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
- S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
- radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
- S_036020_PERFMON_STATE(V_036020_START_COUNTING));
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
+ COPY_DATA_IMM, NULL, 1);
+
+ radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
+ S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
+ radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
+ S_036020_PERFMON_STATE(V_036020_START_COUNTING));
}
/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
* do it again in here. */
-static void si_pc_emit_stop(struct si_context *sctx,
- struct si_resource *buffer, uint64_t va)
+static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
- si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
- EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
- EOP_DATA_SEL_VALUE_32BIT,
- buffer, va, 0, SI_NOT_QUERY);
- si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);
-
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
- radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
- S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) |
- S_036020_PERFMON_SAMPLE_ENABLE(1));
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+ EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
+ si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);
+
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
+ radeon_set_uconfig_reg(
+ cs, R_036020_CP_PERFMON_CNTL,
+ S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1));
}
-static void si_pc_emit_read(struct si_context *sctx,
- struct si_pc_block *block,
- unsigned count, uint64_t va)
+static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, unsigned count,
+ uint64_t va)
{
- struct si_pc_block_base *regs = block->b->b;
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned idx;
- unsigned reg = regs->counter0_lo;
- unsigned reg_delta = 8;
-
- if (!(regs->layout & SI_PC_FAKE)) {
- if (regs->layout & SI_PC_REG_REVERSE)
- reg_delta = -reg_delta;
-
- for (idx = 0; idx < count; ++idx) {
- if (regs->counters)
- reg = regs->counters[idx];
-
- radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
- radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
- COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
- COPY_DATA_COUNT_SEL); /* 64 bits */
- radeon_emit(cs, reg >> 2);
- radeon_emit(cs, 0); /* unused */
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
- va += sizeof(uint64_t);
- reg += reg_delta;
- }
- } else {
- for (idx = 0; idx < count; ++idx) {
- radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
- radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
- COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
- COPY_DATA_COUNT_SEL);
- radeon_emit(cs, 0); /* immediate */
- radeon_emit(cs, 0);
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
- va += sizeof(uint64_t);
- }
- }
+ struct si_pc_block_base *regs = block->b->b;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned idx;
+ unsigned reg = regs->counter0_lo;
+ unsigned reg_delta = 8;
+
+ if (!(regs->layout & SI_PC_FAKE)) {
+ if (regs->layout & SI_PC_REG_REVERSE)
+ reg_delta = -reg_delta;
+
+ for (idx = 0; idx < count; ++idx) {
+ if (regs->counters)
+ reg = regs->counters[idx];
+
+ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+ radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
+ COPY_DATA_COUNT_SEL); /* 64 bits */
+ radeon_emit(cs, reg >> 2);
+ radeon_emit(cs, 0); /* unused */
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ va += sizeof(uint64_t);
+ reg += reg_delta;
+ }
+ } else {
+ for (idx = 0; idx < count; ++idx) {
+ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+ radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
+ COPY_DATA_COUNT_SEL);
+ radeon_emit(cs, 0); /* immediate */
+ radeon_emit(cs, 0);
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ va += sizeof(uint64_t);
+ }
+ }
}
-static void si_pc_query_destroy(struct si_context *sctx,
- struct si_query *squery)
+static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
{
- struct si_query_pc *query = (struct si_query_pc *)squery;
+ struct si_query_pc *query = (struct si_query_pc *)squery;
- while (query->groups) {
- struct si_query_group *group = query->groups;
- query->groups = group->next;
- FREE(group);
- }
+ while (query->groups) {
+ struct si_query_group *group = query->groups;
+ query->groups = group->next;
+ FREE(group);
+ }
- FREE(query->counters);
+ FREE(query->counters);
- si_query_buffer_destroy(sctx->screen, &query->buffer);
- FREE(query);
+ si_query_buffer_destroy(sctx->screen, &query->buffer);
+ FREE(query);
}
static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
/*
- struct si_query_hw *hwquery,
- struct si_resource *buffer, uint64_t va)*/
+ struct si_query_hw *hwquery,
+ struct si_resource *buffer, uint64_t va)*/
{
- struct si_query_pc *query = (struct si_query_pc *)squery;
- int current_se = -1;
- int current_instance = -1;
+ struct si_query_pc *query = (struct si_query_pc *)squery;
+ int current_se = -1;
+ int current_instance = -1;
- if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
- return;
- si_need_gfx_cs_space(sctx);
+ if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
+ return;
+ si_need_gfx_cs_space(sctx);
- if (query->shaders)
- si_pc_emit_shaders(sctx, query->shaders);
+ if (query->shaders)
+ si_pc_emit_shaders(sctx, query->shaders);
- for (struct si_query_group *group = query->groups; group; group = group->next) {
- struct si_pc_block *block = group->block;
+ for (struct si_query_group *group = query->groups; group; group = group->next) {
+ struct si_pc_block *block = group->block;
- if (group->se != current_se || group->instance != current_instance) {
- current_se = group->se;
- current_instance = group->instance;
- si_pc_emit_instance(sctx, group->se, group->instance);
- }
+ if (group->se != current_se || group->instance != current_instance) {
+ current_se = group->se;
+ current_instance = group->instance;
+ si_pc_emit_instance(sctx, group->se, group->instance);
+ }
- si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
- }
+ si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
+ }
- if (current_se != -1 || current_instance != -1)
- si_pc_emit_instance(sctx, -1, -1);
+ if (current_se != -1 || current_instance != -1)
+ si_pc_emit_instance(sctx, -1, -1);
- uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
- si_pc_emit_start(sctx, query->buffer.buf, va);
+ uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
+ si_pc_emit_start(sctx, query->buffer.buf, va);
}
static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
{
- struct si_query_pc *query = (struct si_query_pc *)squery;
+ struct si_query_pc *query = (struct si_query_pc *)squery;
- if (!query->buffer.buf)
- return;
+ if (!query->buffer.buf)
+ return;
- uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
- query->buffer.results_end += query->result_size;
+ uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
+ query->buffer.results_end += query->result_size;
- si_pc_emit_stop(sctx, query->buffer.buf, va);
+ si_pc_emit_stop(sctx, query->buffer.buf, va);
- for (struct si_query_group *group = query->groups; group; group = group->next) {
- struct si_pc_block *block = group->block;
- unsigned se = group->se >= 0 ? group->se : 0;
- unsigned se_end = se + 1;
+ for (struct si_query_group *group = query->groups; group; group = group->next) {
+ struct si_pc_block *block = group->block;
+ unsigned se = group->se >= 0 ? group->se : 0;
+ unsigned se_end = se + 1;
- if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0))
- se_end = sctx->screen->info.max_se;
+ if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0))
+ se_end = sctx->screen->info.max_se;
- do {
- unsigned instance = group->instance >= 0 ? group->instance : 0;
+ do {
+ unsigned instance = group->instance >= 0 ? group->instance : 0;
- do {
- si_pc_emit_instance(sctx, se, instance);
- si_pc_emit_read(sctx, block, group->num_counters, va);
- va += sizeof(uint64_t) * group->num_counters;
- } while (group->instance < 0 && ++instance < block->num_instances);
- } while (++se < se_end);
- }
+ do {
+ si_pc_emit_instance(sctx, se, instance);
+ si_pc_emit_read(sctx, block, group->num_counters, va);
+ va += sizeof(uint64_t) * group->num_counters;
+ } while (group->instance < 0 && ++instance < block->num_instances);
+ } while (++se < se_end);
+ }
- si_pc_emit_instance(sctx, -1, -1);
+ si_pc_emit_instance(sctx, -1, -1);
}
static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
{
- struct si_query_pc *query = (struct si_query_pc *)squery;
+ struct si_query_pc *query = (struct si_query_pc *)squery;
- si_query_buffer_reset(ctx, &query->buffer);
+ si_query_buffer_reset(ctx, &query->buffer);
- list_addtail(&query->b.active_list, &ctx->active_queries);
- ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
+ list_addtail(&query->b.active_list, &ctx->active_queries);
+ ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
- si_pc_query_resume(ctx, squery);
+ si_pc_query_resume(ctx, squery);
- return true;
+ return true;
}
static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
{
- struct si_query_pc *query = (struct si_query_pc *)squery;
+ struct si_query_pc *query = (struct si_query_pc *)squery;
- si_pc_query_suspend(ctx, squery);
+ si_pc_query_suspend(ctx, squery);
- list_del(&squery->active_list);
- ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;
+ list_del(&squery->active_list);
+ ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;
- return query->buffer.buf != NULL;
+ return query->buffer.buf != NULL;
}
-static void si_pc_query_add_result(struct si_query_pc *query,
- void *buffer,
- union pipe_query_result *result)
+static void si_pc_query_add_result(struct si_query_pc *query, void *buffer,
+ union pipe_query_result *result)
{
- uint64_t *results = buffer;
- unsigned i, j;
+ uint64_t *results = buffer;
+ unsigned i, j;
- for (i = 0; i < query->num_counters; ++i) {
- struct si_query_counter *counter = &query->counters[i];
+ for (i = 0; i < query->num_counters; ++i) {
+ struct si_query_counter *counter = &query->counters[i];
- for (j = 0; j < counter->qwords; ++j) {
- uint32_t value = results[counter->base + j * counter->stride];
- result->batch[i].u64 += value;
- }
- }
+ for (j = 0; j < counter->qwords; ++j) {
+ uint32_t value = results[counter->base + j * counter->stride];
+ result->batch[i].u64 += value;
+ }
+ }
}
-static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery,
- bool wait, union pipe_query_result *result)
+static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
+ union pipe_query_result *result)
{
- struct si_query_pc *query = (struct si_query_pc *)squery;
+ struct si_query_pc *query = (struct si_query_pc *)squery;
- memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
+ memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
- for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
- unsigned usage = PIPE_TRANSFER_READ |
- (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
- unsigned results_base = 0;
- void *map;
+ for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+ unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+ unsigned results_base = 0;
+ void *map;
- if (squery->b.flushed)
- map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
- else
- map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+ if (squery->b.flushed)
+ map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+ else
+ map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
- if (!map)
- return false;
+ if (!map)
+ return false;
- while (results_base != qbuf->results_end) {
- si_pc_query_add_result(query, map + results_base, result);
- results_base += query->result_size;
- }
- }
+ while (results_base != qbuf->results_end) {
+ si_pc_query_add_result(query, map + results_base, result);
+ results_base += query->result_size;
+ }
+ }
- return true;
+ return true;
}
static const struct si_query_ops batch_query_ops = {
- .destroy = si_pc_query_destroy,
- .begin = si_pc_query_begin,
- .end = si_pc_query_end,
- .get_result = si_pc_query_get_result,
+ .destroy = si_pc_query_destroy,
+ .begin = si_pc_query_begin,
+ .end = si_pc_query_end,
+ .get_result = si_pc_query_get_result,
- .suspend = si_pc_query_suspend,
- .resume = si_pc_query_resume,
+ .suspend = si_pc_query_suspend,
+ .resume = si_pc_query_resume,
};
-static struct si_query_group *get_group_state(struct si_screen *screen,
- struct si_query_pc *query,
- struct si_pc_block *block,
- unsigned sub_gid)
+static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
+ struct si_pc_block *block, unsigned sub_gid)
{
- struct si_query_group *group = query->groups;
-
- while (group) {
- if (group->block == block && group->sub_gid == sub_gid)
- return group;
- group = group->next;
- }
-
- group = CALLOC_STRUCT(si_query_group);
- if (!group)
- return NULL;
-
- group->block = block;
- group->sub_gid = sub_gid;
-
- if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
- unsigned sub_gids = block->num_instances;
- unsigned shader_id;
- unsigned shaders;
- unsigned query_shaders;
-
- if (si_pc_block_has_per_se_groups(screen->perfcounters, block))
- sub_gids = sub_gids * screen->info.max_se;
- shader_id = sub_gid / sub_gids;
- sub_gid = sub_gid % sub_gids;
-
- shaders = si_pc_shader_type_bits[shader_id];
-
- query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING;
- if (query_shaders && query_shaders != shaders) {
- fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
- FREE(group);
- return NULL;
- }
- query->shaders = shaders;
- }
-
- if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
- // A non-zero value in query->shaders ensures that the shader
- // masking is reset unless the user explicitly requests one.
- query->shaders = SI_PC_SHADERS_WINDOWING;
- }
-
- if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) {
- group->se = sub_gid / block->num_instances;
- sub_gid = sub_gid % block->num_instances;
- } else {
- group->se = -1;
- }
-
- if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) {
- group->instance = sub_gid;
- } else {
- group->instance = -1;
- }
-
- group->next = query->groups;
- query->groups = group;
-
- return group;
+ struct si_query_group *group = query->groups;
+
+ while (group) {
+ if (group->block == block && group->sub_gid == sub_gid)
+ return group;
+ group = group->next;
+ }
+
+ group = CALLOC_STRUCT(si_query_group);
+ if (!group)
+ return NULL;
+
+ group->block = block;
+ group->sub_gid = sub_gid;
+
+ if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
+ unsigned sub_gids = block->num_instances;
+ unsigned shader_id;
+ unsigned shaders;
+ unsigned query_shaders;
+
+ if (si_pc_block_has_per_se_groups(screen->perfcounters, block))
+ sub_gids = sub_gids * screen->info.max_se;
+ shader_id = sub_gid / sub_gids;
+ sub_gid = sub_gid % sub_gids;
+
+ shaders = si_pc_shader_type_bits[shader_id];
+
+ query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING;
+ if (query_shaders && query_shaders != shaders) {
+ fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
+ FREE(group);
+ return NULL;
+ }
+ query->shaders = shaders;
+ }
+
+ if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
+ // A non-zero value in query->shaders ensures that the shader
+ // masking is reset unless the user explicitly requests one.
+ query->shaders = SI_PC_SHADERS_WINDOWING;
+ }
+
+ if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) {
+ group->se = sub_gid / block->num_instances;
+ sub_gid = sub_gid % block->num_instances;
+ } else {
+ group->se = -1;
+ }
+
+ if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) {
+ group->instance = sub_gid;
+ } else {
+ group->instance = -1;
+ }
+
+ group->next = query->groups;
+ query->groups = group;
+
+ return group;
}
-struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
- unsigned num_queries,
- unsigned *query_types)
+struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
+ unsigned *query_types)
{
- struct si_screen *screen =
- (struct si_screen *)ctx->screen;
- struct si_perfcounters *pc = screen->perfcounters;
- struct si_pc_block *block;
- struct si_query_group *group;
- struct si_query_pc *query;
- unsigned base_gid, sub_gid, sub_index;
- unsigned i, j;
-
- if (!pc)
- return NULL;
-
- query = CALLOC_STRUCT(si_query_pc);
- if (!query)
- return NULL;
-
- query->b.ops = &batch_query_ops;
-
- query->num_counters = num_queries;
-
- /* Collect selectors per group */
- for (i = 0; i < num_queries; ++i) {
- unsigned sub_gid;
-
- if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
- goto error;
-
- block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER,
- &base_gid, &sub_index);
- if (!block)
- goto error;
-
- sub_gid = sub_index / block->b->selectors;
- sub_index = sub_index % block->b->selectors;
-
- group = get_group_state(screen, query, block, sub_gid);
- if (!group)
- goto error;
-
- if (group->num_counters >= block->b->b->num_counters) {
- fprintf(stderr,
- "perfcounter group %s: too many selected\n",
- block->b->b->name);
- goto error;
- }
- group->selectors[group->num_counters] = sub_index;
- ++group->num_counters;
- }
-
- /* Compute result bases and CS size per group */
- query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
- query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;
-
- i = 0;
- for (group = query->groups; group; group = group->next) {
- struct si_pc_block *block = group->block;
- unsigned read_dw;
- unsigned instances = 1;
-
- if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
- instances = screen->info.max_se;
- if (group->instance < 0)
- instances *= block->num_instances;
-
- group->result_base = i;
- query->result_size += sizeof(uint64_t) * instances * group->num_counters;
- i += instances * group->num_counters;
-
- read_dw = 6 * group->num_counters;
- query->b.num_cs_dw_suspend += instances * read_dw;
- query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
- }
-
- if (query->shaders) {
- if (query->shaders == SI_PC_SHADERS_WINDOWING)
- query->shaders = 0xffffffff;
- }
-
- /* Map user-supplied query array to result indices */
- query->counters = CALLOC(num_queries, sizeof(*query->counters));
- for (i = 0; i < num_queries; ++i) {
- struct si_query_counter *counter = &query->counters[i];
- struct si_pc_block *block;
-
- block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER,
- &base_gid, &sub_index);
-
- sub_gid = sub_index / block->b->selectors;
- sub_index = sub_index % block->b->selectors;
-
- group = get_group_state(screen, query, block, sub_gid);
- assert(group != NULL);
-
- for (j = 0; j < group->num_counters; ++j) {
- if (group->selectors[j] == sub_index)
- break;
- }
-
- counter->base = group->result_base + j;
- counter->stride = group->num_counters;
-
- counter->qwords = 1;
- if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
- counter->qwords = screen->info.max_se;
- if (group->instance < 0)
- counter->qwords *= block->num_instances;
- }
+ struct si_screen *screen = (struct si_screen *)ctx->screen;
+ struct si_perfcounters *pc = screen->perfcounters;
+ struct si_pc_block *block;
+ struct si_query_group *group;
+ struct si_query_pc *query;
+ unsigned base_gid, sub_gid, sub_index;
+ unsigned i, j;
+
+ if (!pc)
+ return NULL;
+
+ query = CALLOC_STRUCT(si_query_pc);
+ if (!query)
+ return NULL;
+
+ query->b.ops = &batch_query_ops;
+
+ query->num_counters = num_queries;
+
+ /* Collect selectors per group */
+ for (i = 0; i < num_queries; ++i) {
+ unsigned sub_gid;
+
+ if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
+ goto error;
+
+ block =
+ lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
+ if (!block)
+ goto error;
+
+ sub_gid = sub_index / block->b->selectors;
+ sub_index = sub_index % block->b->selectors;
+
+ group = get_group_state(screen, query, block, sub_gid);
+ if (!group)
+ goto error;
+
+ if (group->num_counters >= block->b->b->num_counters) {
+ fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name);
+ goto error;
+ }
+ group->selectors[group->num_counters] = sub_index;
+ ++group->num_counters;
+ }
+
+ /* Compute result bases and CS size per group */
+ query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
+ query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;
+
+ i = 0;
+ for (group = query->groups; group; group = group->next) {
+ struct si_pc_block *block = group->block;
+ unsigned read_dw;
+ unsigned instances = 1;
+
+ if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
+ instances = screen->info.max_se;
+ if (group->instance < 0)
+ instances *= block->num_instances;
+
+ group->result_base = i;
+ query->result_size += sizeof(uint64_t) * instances * group->num_counters;
+ i += instances * group->num_counters;
+
+ read_dw = 6 * group->num_counters;
+ query->b.num_cs_dw_suspend += instances * read_dw;
+ query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
+ }
+
+ if (query->shaders) {
+ if (query->shaders == SI_PC_SHADERS_WINDOWING)
+ query->shaders = 0xffffffff;
+ }
+
+ /* Map user-supplied query array to result indices */
+ query->counters = CALLOC(num_queries, sizeof(*query->counters));
+ for (i = 0; i < num_queries; ++i) {
+ struct si_query_counter *counter = &query->counters[i];
+ struct si_pc_block *block;
+
+ block =
+ lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
+
+ sub_gid = sub_index / block->b->selectors;
+ sub_index = sub_index % block->b->selectors;
+
+ group = get_group_state(screen, query, block, sub_gid);
+ assert(group != NULL);
+
+ for (j = 0; j < group->num_counters; ++j) {
+ if (group->selectors[j] == sub_index)
+ break;
+ }
+
+ counter->base = group->result_base + j;
+ counter->stride = group->num_counters;
+
+ counter->qwords = 1;
+ if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
+ counter->qwords = screen->info.max_se;
+ if (group->instance < 0)
+ counter->qwords *= block->num_instances;
+ }
- return (struct pipe_query *)query;
+ return (struct pipe_query *)query;
error:
- si_pc_query_destroy((struct si_context *)ctx, &query->b);
- return NULL;
+ si_pc_query_destroy((struct si_context *)ctx, &query->b);
+ return NULL;
}
-static bool si_init_block_names(struct si_screen *screen,
- struct si_pc_block *block)
+static bool si_init_block_names(struct si_screen *screen, struct si_pc_block *block)
{
- bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block);
- bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block);
- unsigned i, j, k;
- unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
- unsigned namelen;
- char *groupname;
- char *p;
-
- if (per_instance_groups)
- groups_instance = block->num_instances;
- if (per_se_groups)
- groups_se = screen->info.max_se;
- if (block->b->b->flags & SI_PC_BLOCK_SHADER)
- groups_shader = ARRAY_SIZE(si_pc_shader_type_bits);
-
- namelen = strlen(block->b->b->name);
- block->group_name_stride = namelen + 1;
- if (block->b->b->flags & SI_PC_BLOCK_SHADER)
- block->group_name_stride += 3;
- if (per_se_groups) {
- assert(groups_se <= 10);
- block->group_name_stride += 1;
-
- if (per_instance_groups)
- block->group_name_stride += 1;
- }
- if (per_instance_groups) {
- assert(groups_instance <= 100);
- block->group_name_stride += 2;
- }
-
- block->group_names = MALLOC(block->num_groups * block->group_name_stride);
- if (!block->group_names)
- return false;
-
- groupname = block->group_names;
- for (i = 0; i < groups_shader; ++i) {
- const char *shader_suffix = si_pc_shader_type_suffixes[i];
- unsigned shaderlen = strlen(shader_suffix);
- for (j = 0; j < groups_se; ++j) {
- for (k = 0; k < groups_instance; ++k) {
- strcpy(groupname, block->b->b->name);
- p = groupname + namelen;
-
- if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
- strcpy(p, shader_suffix);
- p += shaderlen;
- }
-
- if (per_se_groups) {
- p += sprintf(p, "%d", j);
- if (per_instance_groups)
- *p++ = '_';
- }
-
- if (per_instance_groups)
- p += sprintf(p, "%d", k);
-
- groupname += block->group_name_stride;
- }
- }
- }
-
- assert(block->b->selectors <= 1000);
- block->selector_name_stride = block->group_name_stride + 4;
- block->selector_names = MALLOC(block->num_groups * block->b->selectors *
- block->selector_name_stride);
- if (!block->selector_names)
- return false;
-
- groupname = block->group_names;
- p = block->selector_names;
- for (i = 0; i < block->num_groups; ++i) {
- for (j = 0; j < block->b->selectors; ++j) {
- sprintf(p, "%s_%03d", groupname, j);
- p += block->selector_name_stride;
- }
- groupname += block->group_name_stride;
- }
-
- return true;
+ bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block);
+ bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block);
+ unsigned i, j, k;
+ unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
+ unsigned namelen;
+ char *groupname;
+ char *p;
+
+ if (per_instance_groups)
+ groups_instance = block->num_instances;
+ if (per_se_groups)
+ groups_se = screen->info.max_se;
+ if (block->b->b->flags & SI_PC_BLOCK_SHADER)
+ groups_shader = ARRAY_SIZE(si_pc_shader_type_bits);
+
+ namelen = strlen(block->b->b->name);
+ block->group_name_stride = namelen + 1;
+ if (block->b->b->flags & SI_PC_BLOCK_SHADER)
+ block->group_name_stride += 3;
+ if (per_se_groups) {
+ assert(groups_se <= 10);
+ block->group_name_stride += 1;
+
+ if (per_instance_groups)
+ block->group_name_stride += 1;
+ }
+ if (per_instance_groups) {
+ assert(groups_instance <= 100);
+ block->group_name_stride += 2;
+ }
+
+ block->group_names = MALLOC(block->num_groups * block->group_name_stride);
+ if (!block->group_names)
+ return false;
+
+ groupname = block->group_names;
+ for (i = 0; i < groups_shader; ++i) {
+ const char *shader_suffix = si_pc_shader_type_suffixes[i];
+ unsigned shaderlen = strlen(shader_suffix);
+ for (j = 0; j < groups_se; ++j) {
+ for (k = 0; k < groups_instance; ++k) {
+ strcpy(groupname, block->b->b->name);
+ p = groupname + namelen;
+
+ if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
+ strcpy(p, shader_suffix);
+ p += shaderlen;
+ }
+
+ if (per_se_groups) {
+ p += sprintf(p, "%d", j);
+ if (per_instance_groups)
+ *p++ = '_';
+ }
+
+ if (per_instance_groups)
+ p += sprintf(p, "%d", k);
+
+ groupname += block->group_name_stride;
+ }
+ }
+ }
+
+ assert(block->b->selectors <= 1000);
+ block->selector_name_stride = block->group_name_stride + 4;
+ block->selector_names =
+ MALLOC(block->num_groups * block->b->selectors * block->selector_name_stride);
+ if (!block->selector_names)
+ return false;
+
+ groupname = block->group_names;
+ p = block->selector_names;
+ for (i = 0; i < block->num_groups; ++i) {
+ for (j = 0; j < block->b->selectors; ++j) {
+ sprintf(p, "%s_%03d", groupname, j);
+ p += block->selector_name_stride;
+ }
+ groupname += block->group_name_stride;
+ }
+
+ return true;
}
-int si_get_perfcounter_info(struct si_screen *screen,
- unsigned index,
- struct pipe_driver_query_info *info)
+int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
+ struct pipe_driver_query_info *info)
{
- struct si_perfcounters *pc = screen->perfcounters;
- struct si_pc_block *block;
- unsigned base_gid, sub;
-
- if (!pc)
- return 0;
-
- if (!info) {
- unsigned bid, num_queries = 0;
-
- for (bid = 0; bid < pc->num_blocks; ++bid) {
- num_queries += pc->blocks[bid].b->selectors *
- pc->blocks[bid].num_groups;
- }
-
- return num_queries;
- }
-
- block = lookup_counter(pc, index, &base_gid, &sub);
- if (!block)
- return 0;
-
- if (!block->selector_names) {
- if (!si_init_block_names(screen, block))
- return 0;
- }
- info->name = block->selector_names + sub * block->selector_name_stride;
- info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
- info->max_value.u64 = 0;
- info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
- info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
- info->group_id = base_gid + sub / block->b->selectors;
- info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
- if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
- info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
- return 1;
+ struct si_perfcounters *pc = screen->perfcounters;
+ struct si_pc_block *block;
+ unsigned base_gid, sub;
+
+ if (!pc)
+ return 0;
+
+ if (!info) {
+ unsigned bid, num_queries = 0;
+
+ for (bid = 0; bid < pc->num_blocks; ++bid) {
+ num_queries += pc->blocks[bid].b->selectors * pc->blocks[bid].num_groups;
+ }
+
+ return num_queries;
+ }
+
+ block = lookup_counter(pc, index, &base_gid, &sub);
+ if (!block)
+ return 0;
+
+ if (!block->selector_names) {
+ if (!si_init_block_names(screen, block))
+ return 0;
+ }
+ info->name = block->selector_names + sub * block->selector_name_stride;
+ info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
+ info->max_value.u64 = 0;
+ info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
+ info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
+ info->group_id = base_gid + sub / block->b->selectors;
+ info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
+ if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
+ info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
+ return 1;
}
-int si_get_perfcounter_group_info(struct si_screen *screen,
- unsigned index,
- struct pipe_driver_query_group_info *info)
+int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
+ struct pipe_driver_query_group_info *info)
{
- struct si_perfcounters *pc = screen->perfcounters;
- struct si_pc_block *block;
-
- if (!pc)
- return 0;
-
- if (!info)
- return pc->num_groups;
-
- block = lookup_group(pc, &index);
- if (!block)
- return 0;
-
- if (!block->group_names) {
- if (!si_init_block_names(screen, block))
- return 0;
- }
- info->name = block->group_names + index * block->group_name_stride;
- info->num_queries = block->b->selectors;
- info->max_active_queries = block->b->b->num_counters;
- return 1;
+ struct si_perfcounters *pc = screen->perfcounters;
+ struct si_pc_block *block;
+
+ if (!pc)
+ return 0;
+
+ if (!info)
+ return pc->num_groups;
+
+ block = lookup_group(pc, &index);
+ if (!block)
+ return 0;
+
+ if (!block->group_names) {
+ if (!si_init_block_names(screen, block))
+ return 0;
+ }
+ info->name = block->group_names + index * block->group_name_stride;
+ info->num_queries = block->b->selectors;
+ info->max_active_queries = block->b->b->num_counters;
+ return 1;
}
void si_destroy_perfcounters(struct si_screen *screen)
{
- struct si_perfcounters *pc = screen->perfcounters;
- unsigned i;
-
- if (!pc)
- return;
-
- for (i = 0; i < pc->num_blocks; ++i) {
- FREE(pc->blocks[i].group_names);
- FREE(pc->blocks[i].selector_names);
- }
- FREE(pc->blocks);
- FREE(pc);
- screen->perfcounters = NULL;
+ struct si_perfcounters *pc = screen->perfcounters;
+ unsigned i;
+
+ if (!pc)
+ return;
+
+ for (i = 0; i < pc->num_blocks; ++i) {
+ FREE(pc->blocks[i].group_names);
+ FREE(pc->blocks[i].selector_names);
+ }
+ FREE(pc->blocks);
+ FREE(pc);
+ screen->perfcounters = NULL;
}
void si_init_perfcounters(struct si_screen *screen)
{
- struct si_perfcounters *pc;
- const struct si_pc_block_gfxdescr *blocks;
- unsigned num_blocks;
- unsigned i;
-
- switch (screen->info.chip_class) {
- case GFX7:
- blocks = groups_CIK;
- num_blocks = ARRAY_SIZE(groups_CIK);
- break;
- case GFX8:
- blocks = groups_VI;
- num_blocks = ARRAY_SIZE(groups_VI);
- break;
- case GFX9:
- blocks = groups_gfx9;
- num_blocks = ARRAY_SIZE(groups_gfx9);
- break;
- case GFX6:
- default:
- return; /* not implemented */
- }
-
- if (screen->info.max_sh_per_se != 1) {
- /* This should not happen on non-GFX6 chips. */
- fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not "
- "supported (inaccurate performance counters)\n",
- screen->info.max_sh_per_se);
- }
-
- screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters);
- if (!pc)
- return;
-
- pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
- pc->num_instance_cs_dwords = 3;
-
- pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
- pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
-
- pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block));
- if (!pc->blocks)
- goto error;
- pc->num_blocks = num_blocks;
-
- for (i = 0; i < num_blocks; ++i) {
- struct si_pc_block *block = &pc->blocks[i];
- block->b = &blocks[i];
- block->num_instances = MAX2(1, block->b->instances);
-
- if (!strcmp(block->b->b->name, "CB") ||
- !strcmp(block->b->b->name, "DB"))
- block->num_instances = screen->info.max_se;
- else if (!strcmp(block->b->b->name, "TCC"))
- block->num_instances = screen->info.num_tcc_blocks;
- else if (!strcmp(block->b->b->name, "IA"))
- block->num_instances = MAX2(1, screen->info.max_se / 2);
-
- if (si_pc_block_has_per_instance_groups(pc, block)) {
- block->num_groups = block->num_instances;
- } else {
- block->num_groups = 1;
- }
-
- if (si_pc_block_has_per_se_groups(pc, block))
- block->num_groups *= screen->info.max_se;
- if (block->b->b->flags & SI_PC_BLOCK_SHADER)
- block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits);
-
- pc->num_groups += block->num_groups;
- }
-
- return;
+ struct si_perfcounters *pc;
+ const struct si_pc_block_gfxdescr *blocks;
+ unsigned num_blocks;
+ unsigned i;
+
+ switch (screen->info.chip_class) {
+ case GFX7:
+ blocks = groups_CIK;
+ num_blocks = ARRAY_SIZE(groups_CIK);
+ break;
+ case GFX8:
+ blocks = groups_VI;
+ num_blocks = ARRAY_SIZE(groups_VI);
+ break;
+ case GFX9:
+ blocks = groups_gfx9;
+ num_blocks = ARRAY_SIZE(groups_gfx9);
+ break;
+ case GFX6:
+ default:
+ return; /* not implemented */
+ }
+
+ if (screen->info.max_sh_per_se != 1) {
+ /* This should not happen on non-GFX6 chips. */
+ fprintf(stderr,
+ "si_init_perfcounters: max_sh_per_se = %d not "
+ "supported (inaccurate performance counters)\n",
+ screen->info.max_sh_per_se);
+ }
+
+ screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters);
+ if (!pc)
+ return;
+
+ pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
+ pc->num_instance_cs_dwords = 3;
+
+ pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
+ pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
+
+ pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block));
+ if (!pc->blocks)
+ goto error;
+ pc->num_blocks = num_blocks;
+
+ for (i = 0; i < num_blocks; ++i) {
+ struct si_pc_block *block = &pc->blocks[i];
+ block->b = &blocks[i];
+ block->num_instances = MAX2(1, block->b->instances);
+
+ if (!strcmp(block->b->b->name, "CB") || !strcmp(block->b->b->name, "DB"))
+ block->num_instances = screen->info.max_se;
+ else if (!strcmp(block->b->b->name, "TCC"))
+ block->num_instances = screen->info.num_tcc_blocks;
+ else if (!strcmp(block->b->b->name, "IA"))
+ block->num_instances = MAX2(1, screen->info.max_se / 2);
+
+ if (si_pc_block_has_per_instance_groups(pc, block)) {
+ block->num_groups = block->num_instances;
+ } else {
+ block->num_groups = 1;
+ }
+
+ if (si_pc_block_has_per_se_groups(pc, block))
+ block->num_groups *= screen->info.max_se;
+ if (block->b->b->flags & SI_PC_BLOCK_SHADER)
+ block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits);
+
+ pc->num_groups += block->num_groups;
+ }
+
+ return;
error:
- si_destroy_perfcounters(screen);
+ si_destroy_perfcounters(screen);
}
*/
#include "si_pipe.h"
+
+#include "driver_ddebug/dd_util.h"
+#include "gallium/winsys/amdgpu/drm/amdgpu_public.h"
+#include "gallium/winsys/radeon/drm/radeon_drm_public.h"
+#include "radeon/radeon_uvd.h"
+#include "si_compute.h"
#include "si_public.h"
#include "si_shader_internal.h"
-#include "si_compute.h"
#include "sid.h"
-
-#include "radeon/radeon_uvd.h"
#include "util/disk_cache.h"
#include "util/u_log.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
#include "util/xmlconfig.h"
#include "vl/vl_decoder.h"
-#include "driver_ddebug/dd_util.h"
-#include "gallium/winsys/radeon/drm/radeon_drm_public.h"
-#include "gallium/winsys/amdgpu/drm/amdgpu_public.h"
#include <xf86drm.h>
-static struct pipe_context *si_create_context(struct pipe_screen *screen,
- unsigned flags);
+static struct pipe_context *si_create_context(struct pipe_screen *screen, unsigned flags);
static const struct debug_named_value debug_options[] = {
- /* Shader logging options: */
- { "vs", DBG(VS), "Print vertex shaders" },
- { "ps", DBG(PS), "Print pixel shaders" },
- { "gs", DBG(GS), "Print geometry shaders" },
- { "tcs", DBG(TCS), "Print tessellation control shaders" },
- { "tes", DBG(TES), "Print tessellation evaluation shaders" },
- { "cs", DBG(CS), "Print compute shaders" },
- { "noir", DBG(NO_IR), "Don't print the LLVM IR"},
- { "nonir", DBG(NO_NIR), "Don't print NIR when printing shaders"},
- { "noasm", DBG(NO_ASM), "Don't print disassembled shaders"},
- { "preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations" },
-
- /* Shader compiler options the shader cache should be aware of: */
- { "gisel", DBG(GISEL), "Enable LLVM global instruction selector." },
- { "w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders." },
- { "w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders." },
- { "w32cs", DBG(W32_CS), "Use Wave32 for computes shaders." },
- { "w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders." },
- { "w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders." },
- { "w64cs", DBG(W64_CS), "Use Wave64 for computes shaders." },
-
- /* Shader compiler options (with no effect on the shader cache): */
- { "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" },
- { "mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand" },
- { "nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants." },
-
- /* Information logging options: */
- { "info", DBG(INFO), "Print driver information" },
- { "tex", DBG(TEX), "Print texture info" },
- { "compute", DBG(COMPUTE), "Print compute info" },
- { "vm", DBG(VM), "Print virtual addresses when creating resources" },
- { "cache_stats", DBG(CACHE_STATS), "Print shader cache statistics." },
-
- /* Driver options: */
- { "forcedma", DBG(FORCE_SDMA), "Use SDMA for all operations when possible." },
- { "nodma", DBG(NO_SDMA), "Disable SDMA" },
- { "nodmaclear", DBG(NO_SDMA_CLEARS), "Disable SDMA clears" },
- { "nodmacopyimage", DBG(NO_SDMA_COPY_IMAGE), "Disable SDMA image copies" },
- { "nowc", DBG(NO_WC), "Disable GTT write combining" },
- { "check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info." },
- { "reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context." },
- { "zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations." },
-
- /* 3D engine options: */
- { "nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used." },
- { "nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline." },
- { "nggc", DBG(ALWAYS_NGG_CULLING), "Always use NGG culling even when it can hurt." },
- { "nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling." },
- { "alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader." },
- { "pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls." },
- { "nopd", DBG(NO_PD), "Disable the primitive discard compute shader." },
- { "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." },
- { "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" },
- { "nodpbb", DBG(NO_DPBB), "Disable DPBB." },
- { "nodfsm", DBG(NO_DFSM), "Disable DFSM." },
- { "dpbb", DBG(DPBB), "Enable DPBB." },
- { "dfsm", DBG(DFSM), "Enable DFSM." },
- { "nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z" },
- { "norbplus", DBG(NO_RB_PLUS), "Disable RB+." },
- { "no2d", DBG(NO_2D_TILING), "Disable 2D tiling" },
- { "notiling", DBG(NO_TILING), "Disable tiling" },
- { "nodcc", DBG(NO_DCC), "Disable DCC." },
- { "nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear." },
- { "nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer" },
- { "nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA" },
- { "nofmask", DBG(NO_FMASK), "Disable MSAA compression" },
-
- DEBUG_NAMED_VALUE_END /* must be last */
+ /* Shader logging options: */
+ {"vs", DBG(VS), "Print vertex shaders"},
+ {"ps", DBG(PS), "Print pixel shaders"},
+ {"gs", DBG(GS), "Print geometry shaders"},
+ {"tcs", DBG(TCS), "Print tessellation control shaders"},
+ {"tes", DBG(TES), "Print tessellation evaluation shaders"},
+ {"cs", DBG(CS), "Print compute shaders"},
+ {"noir", DBG(NO_IR), "Don't print the LLVM IR"},
+ {"nonir", DBG(NO_NIR), "Don't print NIR when printing shaders"},
+ {"noasm", DBG(NO_ASM), "Don't print disassembled shaders"},
+ {"preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations"},
+
+ /* Shader compiler options the shader cache should be aware of: */
+ {"gisel", DBG(GISEL), "Enable LLVM global instruction selector."},
+ {"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."},
+ {"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."},
+ {"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."},
+ {"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."},
+ {"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."},
+ {"w64cs", DBG(W64_CS), "Use Wave64 for computes shaders."},
+
+ /* Shader compiler options (with no effect on the shader cache): */
+ {"checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR"},
+ {"mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand"},
+ {"nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants."},
+
+ /* Information logging options: */
+ {"info", DBG(INFO), "Print driver information"},
+ {"tex", DBG(TEX), "Print texture info"},
+ {"compute", DBG(COMPUTE), "Print compute info"},
+ {"vm", DBG(VM), "Print virtual addresses when creating resources"},
+ {"cache_stats", DBG(CACHE_STATS), "Print shader cache statistics."},
+
+ /* Driver options: */
+ {"forcedma", DBG(FORCE_SDMA), "Use SDMA for all operations when possible."},
+ {"nodma", DBG(NO_SDMA), "Disable SDMA"},
+ {"nodmaclear", DBG(NO_SDMA_CLEARS), "Disable SDMA clears"},
+ {"nodmacopyimage", DBG(NO_SDMA_COPY_IMAGE), "Disable SDMA image copies"},
+ {"nowc", DBG(NO_WC), "Disable GTT write combining"},
+ {"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."},
+ {"reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context."},
+ {"zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations."},
+
+ /* 3D engine options: */
+ {"nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used."},
+ {"nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline."},
+ {"nggc", DBG(ALWAYS_NGG_CULLING), "Always use NGG culling even when it can hurt."},
+ {"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."},
+ {"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."},
+ {"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."},
+ {"nopd", DBG(NO_PD), "Disable the primitive discard compute shader."},
+ {"switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet."},
+ {"nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization"},
+ {"nodpbb", DBG(NO_DPBB), "Disable DPBB."},
+ {"nodfsm", DBG(NO_DFSM), "Disable DFSM."},
+ {"dpbb", DBG(DPBB), "Enable DPBB."},
+ {"dfsm", DBG(DFSM), "Enable DFSM."},
+ {"nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z"},
+ {"norbplus", DBG(NO_RB_PLUS), "Disable RB+."},
+ {"no2d", DBG(NO_2D_TILING), "Disable 2D tiling"},
+ {"notiling", DBG(NO_TILING), "Disable tiling"},
+ {"nodcc", DBG(NO_DCC), "Disable DCC."},
+ {"nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear."},
+ {"nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer"},
+ {"nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA"},
+ {"nofmask", DBG(NO_FMASK), "Disable MSAA compression"},
+
+ DEBUG_NAMED_VALUE_END /* must be last */
};
static const struct debug_named_value test_options[] = {
- /* Tests: */
- { "testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit." },
- { "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit." },
- { "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit." },
- { "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit." },
- { "testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance" },
- { "testgds", DBG(TEST_GDS), "Test GDS." },
- { "testgdsmm", DBG(TEST_GDS_MM), "Test GDS memory management." },
- { "testgdsoamm", DBG(TEST_GDS_OA_MM), "Test GDS OA memory management." },
-
- DEBUG_NAMED_VALUE_END /* must be last */
+ /* Tests: */
+ {"testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit."},
+ {"testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit."},
+ {"testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit."},
+ {"testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit."},
+ {"testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance"},
+ {"testgds", DBG(TEST_GDS), "Test GDS."},
+ {"testgdsmm", DBG(TEST_GDS_MM), "Test GDS memory management."},
+ {"testgdsoamm", DBG(TEST_GDS_OA_MM), "Test GDS OA memory management."},
+
+ DEBUG_NAMED_VALUE_END /* must be last */
};
void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler)
{
- /* Only create the less-optimizing version of the compiler on APUs
- * predating Ryzen (Raven). */
- bool create_low_opt_compiler = !sscreen->info.has_dedicated_vram &&
- sscreen->info.chip_class <= GFX8;
-
- enum ac_target_machine_options tm_options =
- (sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) |
- (sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) |
- (sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) |
- (!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) |
- (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) |
- (create_low_opt_compiler ? AC_TM_CREATE_LOW_OPT : 0);
-
- ac_init_llvm_once();
- ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options);
- compiler->passes = ac_create_llvm_passes(compiler->tm);
-
- if (compiler->tm_wave32)
- compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32);
- if (compiler->low_opt_tm)
- compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm);
+ /* Only create the less-optimizing version of the compiler on APUs
+ * predating Ryzen (Raven). */
+ bool create_low_opt_compiler =
+ !sscreen->info.has_dedicated_vram && sscreen->info.chip_class <= GFX8;
+
+ enum ac_target_machine_options tm_options =
+ (sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) |
+ (sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) |
+ (sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) |
+ (!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) |
+ (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) |
+ (create_low_opt_compiler ? AC_TM_CREATE_LOW_OPT : 0);
+
+ ac_init_llvm_once();
+ ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options);
+ compiler->passes = ac_create_llvm_passes(compiler->tm);
+
+ if (compiler->tm_wave32)
+ compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32);
+ if (compiler->low_opt_tm)
+ compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm);
}
static void si_destroy_compiler(struct ac_llvm_compiler *compiler)
{
- ac_destroy_llvm_compiler(compiler);
+ ac_destroy_llvm_compiler(compiler);
}
/*
 * pipe_context
 */
static void si_destroy_context(struct pipe_context *context)
{
- struct si_context *sctx = (struct si_context *)context;
- int i;
-
- /* Unreference the framebuffer normally to disable related logic
- * properly.
- */
- struct pipe_framebuffer_state fb = {};
- if (context->set_framebuffer_state)
- context->set_framebuffer_state(context, &fb);
-
- si_release_all_descriptors(sctx);
-
- if (sctx->chip_class >= GFX10 && sctx->has_graphics)
- gfx10_destroy_query(sctx);
-
- pipe_resource_reference(&sctx->esgs_ring, NULL);
- pipe_resource_reference(&sctx->gsvs_ring, NULL);
- pipe_resource_reference(&sctx->tess_rings, NULL);
- pipe_resource_reference(&sctx->null_const_buf.buffer, NULL);
- pipe_resource_reference(&sctx->sample_pos_buffer, NULL);
- si_resource_reference(&sctx->border_color_buffer, NULL);
- free(sctx->border_color_table);
- si_resource_reference(&sctx->scratch_buffer, NULL);
- si_resource_reference(&sctx->compute_scratch_buffer, NULL);
- si_resource_reference(&sctx->wait_mem_scratch, NULL);
- si_resource_reference(&sctx->small_prim_cull_info_buf, NULL);
-
- si_pm4_free_state(sctx, sctx->init_config, ~0);
- if (sctx->init_config_gs_rings)
- si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
- for (i = 0; i < ARRAY_SIZE(sctx->vgt_shader_config); i++)
- si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]);
-
- if (sctx->fixed_func_tcs_shader.cso)
- sctx->b.delete_tcs_state(&sctx->b, sctx->fixed_func_tcs_shader.cso);
- if (sctx->custom_dsa_flush)
- sctx->b.delete_depth_stencil_alpha_state(&sctx->b, sctx->custom_dsa_flush);
- if (sctx->custom_blend_resolve)
- sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_resolve);
- if (sctx->custom_blend_fmask_decompress)
- sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_fmask_decompress);
- if (sctx->custom_blend_eliminate_fastclear)
- sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_eliminate_fastclear);
- if (sctx->custom_blend_dcc_decompress)
- sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_dcc_decompress);
- if (sctx->vs_blit_pos)
- sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos);
- if (sctx->vs_blit_pos_layered)
- sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos_layered);
- if (sctx->vs_blit_color)
- sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color);
- if (sctx->vs_blit_color_layered)
- sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered);
- if (sctx->vs_blit_texcoord)
- sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
- if (sctx->cs_clear_buffer)
- sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
- if (sctx->cs_copy_buffer)
- sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
- if (sctx->cs_copy_image)
- sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image);
- if (sctx->cs_copy_image_1d_array)
- sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array);
- if (sctx->cs_clear_render_target)
- sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target);
- if (sctx->cs_clear_render_target_1d_array)
- sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array);
- if (sctx->cs_clear_12bytes_buffer)
- sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer);
- if (sctx->cs_dcc_retile)
- sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile);
-
- for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_fmask_expand); i++) {
- for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_fmask_expand[i]); j++) {
- if (sctx->cs_fmask_expand[i][j]) {
- sctx->b.delete_compute_state(&sctx->b,
- sctx->cs_fmask_expand[i][j]);
- }
- }
- }
-
- if (sctx->blitter)
- util_blitter_destroy(sctx->blitter);
-
- /* Release DCC stats. */
- for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
- assert(!sctx->dcc_stats[i].query_active);
-
- for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++)
- if (sctx->dcc_stats[i].ps_stats[j])
- sctx->b.destroy_query(&sctx->b,
- sctx->dcc_stats[i].ps_stats[j]);
-
- si_texture_reference(&sctx->dcc_stats[i].tex, NULL);
- }
-
- if (sctx->query_result_shader)
- sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader);
- if (sctx->sh_query_result_shader)
- sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader);
-
- if (sctx->gfx_cs)
- sctx->ws->cs_destroy(sctx->gfx_cs);
- if (sctx->sdma_cs)
- sctx->ws->cs_destroy(sctx->sdma_cs);
- if (sctx->ctx)
- sctx->ws->ctx_destroy(sctx->ctx);
-
- if (sctx->b.stream_uploader)
- u_upload_destroy(sctx->b.stream_uploader);
- if (sctx->b.const_uploader)
- u_upload_destroy(sctx->b.const_uploader);
- if (sctx->cached_gtt_allocator)
- u_upload_destroy(sctx->cached_gtt_allocator);
-
- slab_destroy_child(&sctx->pool_transfers);
- slab_destroy_child(&sctx->pool_transfers_unsync);
-
- if (sctx->allocator_zeroed_memory)
- u_suballocator_destroy(sctx->allocator_zeroed_memory);
-
- sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL);
- sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL);
- sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL);
- si_resource_reference(&sctx->eop_bug_scratch, NULL);
- si_resource_reference(&sctx->index_ring, NULL);
- si_resource_reference(&sctx->barrier_buf, NULL);
- si_resource_reference(&sctx->last_ib_barrier_buf, NULL);
- pb_reference(&sctx->gds, NULL);
- pb_reference(&sctx->gds_oa, NULL);
-
- si_destroy_compiler(&sctx->compiler);
-
- si_saved_cs_reference(&sctx->current_saved_cs, NULL);
-
- _mesa_hash_table_destroy(sctx->tex_handles, NULL);
- _mesa_hash_table_destroy(sctx->img_handles, NULL);
-
- util_dynarray_fini(&sctx->resident_tex_handles);
- util_dynarray_fini(&sctx->resident_img_handles);
- util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
- util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
- util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
- si_unref_sdma_uploads(sctx);
- free(sctx->sdma_uploads);
- FREE(sctx);
+ struct si_context *sctx = (struct si_context *)context;
+ int i;
+
+ /* Unreference the framebuffer normally to disable related logic
+ * properly.
+ */
+ struct pipe_framebuffer_state fb = {};
+ if (context->set_framebuffer_state)
+ context->set_framebuffer_state(context, &fb);
+
+ si_release_all_descriptors(sctx);
+
+ if (sctx->chip_class >= GFX10 && sctx->has_graphics)
+ gfx10_destroy_query(sctx);
+
+ pipe_resource_reference(&sctx->esgs_ring, NULL);
+ pipe_resource_reference(&sctx->gsvs_ring, NULL);
+ pipe_resource_reference(&sctx->tess_rings, NULL);
+ pipe_resource_reference(&sctx->null_const_buf.buffer, NULL);
+ pipe_resource_reference(&sctx->sample_pos_buffer, NULL);
+ si_resource_reference(&sctx->border_color_buffer, NULL);
+ free(sctx->border_color_table);
+ si_resource_reference(&sctx->scratch_buffer, NULL);
+ si_resource_reference(&sctx->compute_scratch_buffer, NULL);
+ si_resource_reference(&sctx->wait_mem_scratch, NULL);
+ si_resource_reference(&sctx->small_prim_cull_info_buf, NULL);
+
+ si_pm4_free_state(sctx, sctx->init_config, ~0);
+ if (sctx->init_config_gs_rings)
+ si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
+ for (i = 0; i < ARRAY_SIZE(sctx->vgt_shader_config); i++)
+ si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]);
+
+ if (sctx->fixed_func_tcs_shader.cso)
+ sctx->b.delete_tcs_state(&sctx->b, sctx->fixed_func_tcs_shader.cso);
+ if (sctx->custom_dsa_flush)
+ sctx->b.delete_depth_stencil_alpha_state(&sctx->b, sctx->custom_dsa_flush);
+ if (sctx->custom_blend_resolve)
+ sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_resolve);
+ if (sctx->custom_blend_fmask_decompress)
+ sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_fmask_decompress);
+ if (sctx->custom_blend_eliminate_fastclear)
+ sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_eliminate_fastclear);
+ if (sctx->custom_blend_dcc_decompress)
+ sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_dcc_decompress);
+ if (sctx->vs_blit_pos)
+ sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos);
+ if (sctx->vs_blit_pos_layered)
+ sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos_layered);
+ if (sctx->vs_blit_color)
+ sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color);
+ if (sctx->vs_blit_color_layered)
+ sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered);
+ if (sctx->vs_blit_texcoord)
+ sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
+ if (sctx->cs_clear_buffer)
+ sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
+ if (sctx->cs_copy_buffer)
+ sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
+ if (sctx->cs_copy_image)
+ sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image);
+ if (sctx->cs_copy_image_1d_array)
+ sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array);
+ if (sctx->cs_clear_render_target)
+ sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target);
+ if (sctx->cs_clear_render_target_1d_array)
+ sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array);
+ if (sctx->cs_clear_12bytes_buffer)
+ sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer);
+ if (sctx->cs_dcc_retile)
+ sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_fmask_expand); i++) {
+ for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_fmask_expand[i]); j++) {
+ if (sctx->cs_fmask_expand[i][j]) {
+ sctx->b.delete_compute_state(&sctx->b, sctx->cs_fmask_expand[i][j]);
+ }
+ }
+ }
+
+ if (sctx->blitter)
+ util_blitter_destroy(sctx->blitter);
+
+ /* Release DCC stats. */
+ for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
+ assert(!sctx->dcc_stats[i].query_active);
+
+ for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++)
+ if (sctx->dcc_stats[i].ps_stats[j])
+ sctx->b.destroy_query(&sctx->b, sctx->dcc_stats[i].ps_stats[j]);
+
+ si_texture_reference(&sctx->dcc_stats[i].tex, NULL);
+ }
+
+ if (sctx->query_result_shader)
+ sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader);
+ if (sctx->sh_query_result_shader)
+ sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader);
+
+ if (sctx->gfx_cs)
+ sctx->ws->cs_destroy(sctx->gfx_cs);
+ if (sctx->sdma_cs)
+ sctx->ws->cs_destroy(sctx->sdma_cs);
+ if (sctx->ctx)
+ sctx->ws->ctx_destroy(sctx->ctx);
+
+ if (sctx->b.stream_uploader)
+ u_upload_destroy(sctx->b.stream_uploader);
+ if (sctx->b.const_uploader)
+ u_upload_destroy(sctx->b.const_uploader);
+ if (sctx->cached_gtt_allocator)
+ u_upload_destroy(sctx->cached_gtt_allocator);
+
+ slab_destroy_child(&sctx->pool_transfers);
+ slab_destroy_child(&sctx->pool_transfers_unsync);
+
+ if (sctx->allocator_zeroed_memory)
+ u_suballocator_destroy(sctx->allocator_zeroed_memory);
+
+ sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL);
+ sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL);
+ sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL);
+ si_resource_reference(&sctx->eop_bug_scratch, NULL);
+ si_resource_reference(&sctx->index_ring, NULL);
+ si_resource_reference(&sctx->barrier_buf, NULL);
+ si_resource_reference(&sctx->last_ib_barrier_buf, NULL);
+ pb_reference(&sctx->gds, NULL);
+ pb_reference(&sctx->gds_oa, NULL);
+
+ si_destroy_compiler(&sctx->compiler);
+
+ si_saved_cs_reference(&sctx->current_saved_cs, NULL);
+
+ _mesa_hash_table_destroy(sctx->tex_handles, NULL);
+ _mesa_hash_table_destroy(sctx->img_handles, NULL);
+
+ util_dynarray_fini(&sctx->resident_tex_handles);
+ util_dynarray_fini(&sctx->resident_img_handles);
+ util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
+ util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
+ util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
+ si_unref_sdma_uploads(sctx);
+ free(sctx->sdma_uploads);
+ FREE(sctx);
}
static enum pipe_reset_status si_get_reset_status(struct pipe_context *ctx)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_screen *sscreen = sctx->screen;
- enum pipe_reset_status status = sctx->ws->ctx_query_reset_status(sctx->ctx);
-
- if (status != PIPE_NO_RESET) {
- /* Call the state tracker to set a no-op API dispatch. */
- if (sctx->device_reset_callback.reset) {
- sctx->device_reset_callback.reset(sctx->device_reset_callback.data,
- status);
- }
-
- /* Re-create the auxiliary context, because it won't submit
- * any new IBs due to a GPU reset.
- */
- simple_mtx_lock(&sscreen->aux_context_lock);
-
- struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log;
- sscreen->aux_context->set_log_context(sscreen->aux_context, NULL);
- sscreen->aux_context->destroy(sscreen->aux_context);
-
- sscreen->aux_context = si_create_context(&sscreen->b,
- (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) |
- (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY));
- sscreen->aux_context->set_log_context(sscreen->aux_context, aux_log);
- simple_mtx_unlock(&sscreen->aux_context_lock);
- }
- return status;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_screen *sscreen = sctx->screen;
+ enum pipe_reset_status status = sctx->ws->ctx_query_reset_status(sctx->ctx);
+
+ if (status != PIPE_NO_RESET) {
+ /* Call the state tracker to set a no-op API dispatch. */
+ if (sctx->device_reset_callback.reset) {
+ sctx->device_reset_callback.reset(sctx->device_reset_callback.data, status);
+ }
+
+ /* Re-create the auxiliary context, because it won't submit
+ * any new IBs due to a GPU reset.
+ */
+ simple_mtx_lock(&sscreen->aux_context_lock);
+
+ struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log;
+ sscreen->aux_context->set_log_context(sscreen->aux_context, NULL);
+ sscreen->aux_context->destroy(sscreen->aux_context);
+
+ sscreen->aux_context = si_create_context(
+ &sscreen->b, (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) |
+ (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY));
+ sscreen->aux_context->set_log_context(sscreen->aux_context, aux_log);
+ simple_mtx_unlock(&sscreen->aux_context_lock);
+ }
+ return status;
}
static void si_set_device_reset_callback(struct pipe_context *ctx,
- const struct pipe_device_reset_callback *cb)
+ const struct pipe_device_reset_callback *cb)
{
- struct si_context *sctx = (struct si_context *)ctx;
+ struct si_context *sctx = (struct si_context *)ctx;
- if (cb)
- sctx->device_reset_callback = *cb;
- else
- memset(&sctx->device_reset_callback, 0,
- sizeof(sctx->device_reset_callback));
+ if (cb)
+ sctx->device_reset_callback = *cb;
+ else
+ memset(&sctx->device_reset_callback, 0, sizeof(sctx->device_reset_callback));
}
/* Apitrace profiling:
* call and print the results.
* 4) glretrace --benchmark --markers ..
*/
-static void si_emit_string_marker(struct pipe_context *ctx,
- const char *string, int len)
+static void si_emit_string_marker(struct pipe_context *ctx, const char *string, int len)
{
- struct si_context *sctx = (struct si_context *)ctx;
+ struct si_context *sctx = (struct si_context *)ctx;
- dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number);
+ dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number);
- if (sctx->log)
- u_log_printf(sctx->log, "\nString marker: %*s\n", len, string);
+ if (sctx->log)
+ u_log_printf(sctx->log, "\nString marker: %*s\n", len, string);
}
-static void si_set_debug_callback(struct pipe_context *ctx,
- const struct pipe_debug_callback *cb)
+static void si_set_debug_callback(struct pipe_context *ctx, const struct pipe_debug_callback *cb)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_screen *screen = sctx->screen;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_screen *screen = sctx->screen;
- util_queue_finish(&screen->shader_compiler_queue);
- util_queue_finish(&screen->shader_compiler_queue_low_priority);
+ util_queue_finish(&screen->shader_compiler_queue);
+ util_queue_finish(&screen->shader_compiler_queue_low_priority);
- if (cb)
- sctx->debug = *cb;
- else
- memset(&sctx->debug, 0, sizeof(sctx->debug));
+ if (cb)
+ sctx->debug = *cb;
+ else
+ memset(&sctx->debug, 0, sizeof(sctx->debug));
}
-static void si_set_log_context(struct pipe_context *ctx,
- struct u_log_context *log)
+static void si_set_log_context(struct pipe_context *ctx, struct u_log_context *log)
{
- struct si_context *sctx = (struct si_context *)ctx;
- sctx->log = log;
+ struct si_context *sctx = (struct si_context *)ctx;
+ sctx->log = log;
- if (log)
- u_log_add_auto_logger(log, si_auto_log_cs, sctx);
+ if (log)
+ u_log_add_auto_logger(log, si_auto_log_cs, sctx);
}
-static void si_set_context_param(struct pipe_context *ctx,
- enum pipe_context_param param,
- unsigned value)
+static void si_set_context_param(struct pipe_context *ctx, enum pipe_context_param param,
+ unsigned value)
{
- struct radeon_winsys *ws = ((struct si_context *)ctx)->ws;
-
- switch (param) {
- case PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE:
- ws->pin_threads_to_L3_cache(ws, value);
- break;
- default:;
- }
+ struct radeon_winsys *ws = ((struct si_context *)ctx)->ws;
+
+ switch (param) {
+ case PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE:
+ ws->pin_threads_to_L3_cache(ws, value);
+ break;
+ default:;
+ }
}
-static struct pipe_context *si_create_context(struct pipe_screen *screen,
- unsigned flags)
+static struct pipe_context *si_create_context(struct pipe_screen *screen, unsigned flags)
{
- struct si_screen* sscreen = (struct si_screen *)screen;
- STATIC_ASSERT(DBG_COUNT <= 64);
-
- /* Don't create a context if it's not compute-only and hw is compute-only. */
- if (!sscreen->info.has_graphics &&
- !(flags & PIPE_CONTEXT_COMPUTE_ONLY))
- return NULL;
-
- struct si_context *sctx = CALLOC_STRUCT(si_context);
- struct radeon_winsys *ws = sscreen->ws;
- int shader, i;
- bool stop_exec_on_failure = (flags & PIPE_CONTEXT_LOSE_CONTEXT_ON_RESET) != 0;
-
- if (!sctx)
- return NULL;
-
- sctx->has_graphics = sscreen->info.chip_class == GFX6 ||
- !(flags & PIPE_CONTEXT_COMPUTE_ONLY);
-
- if (flags & PIPE_CONTEXT_DEBUG)
- sscreen->record_llvm_ir = true; /* racy but not critical */
-
- sctx->b.screen = screen; /* this must be set first */
- sctx->b.priv = NULL;
- sctx->b.destroy = si_destroy_context;
- sctx->screen = sscreen; /* Easy accessing of screen/winsys. */
- sctx->is_debug = (flags & PIPE_CONTEXT_DEBUG) != 0;
-
- slab_create_child(&sctx->pool_transfers, &sscreen->pool_transfers);
- slab_create_child(&sctx->pool_transfers_unsync, &sscreen->pool_transfers);
-
- sctx->ws = sscreen->ws;
- sctx->family = sscreen->info.family;
- sctx->chip_class = sscreen->info.chip_class;
-
- if (sctx->chip_class == GFX7 ||
- sctx->chip_class == GFX8 ||
- sctx->chip_class == GFX9) {
- sctx->eop_bug_scratch = si_resource(
- pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
- 16 * sscreen->info.num_render_backends));
- if (!sctx->eop_bug_scratch)
- goto fail;
- }
-
- /* Initialize context allocators. */
- sctx->allocator_zeroed_memory =
- u_suballocator_create(&sctx->b, 128 * 1024,
- 0, PIPE_USAGE_DEFAULT,
- SI_RESOURCE_FLAG_UNMAPPABLE |
- SI_RESOURCE_FLAG_CLEAR, false);
- if (!sctx->allocator_zeroed_memory)
- goto fail;
-
- sctx->b.stream_uploader = u_upload_create(&sctx->b, 1024 * 1024,
- 0, PIPE_USAGE_STREAM,
- SI_RESOURCE_FLAG_READ_ONLY);
- if (!sctx->b.stream_uploader)
- goto fail;
-
- sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024,
- 0, PIPE_USAGE_STAGING, 0);
- if (!sctx->cached_gtt_allocator)
- goto fail;
-
- sctx->ctx = sctx->ws->ctx_create(sctx->ws);
- if (!sctx->ctx)
- goto fail;
-
- if (sscreen->info.num_rings[RING_DMA] &&
- !(sscreen->debug_flags & DBG(NO_SDMA)) &&
- /* SDMA causes corruption on RX 580:
- * https://gitlab.freedesktop.org/mesa/mesa/issues/1399
- * https://gitlab.freedesktop.org/mesa/mesa/issues/1889
- */
- (sctx->chip_class != GFX8 || sscreen->debug_flags & DBG(FORCE_SDMA)) &&
- /* SDMA timeouts sometimes on gfx10 so disable it for now. See:
- * https://bugs.freedesktop.org/show_bug.cgi?id=111481
- * https://gitlab.freedesktop.org/mesa/mesa/issues/1907
- */
- (sctx->chip_class != GFX10 || sscreen->debug_flags & DBG(FORCE_SDMA))) {
- sctx->sdma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA,
- (void*)si_flush_dma_cs,
- sctx, stop_exec_on_failure);
- }
-
- bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->sdma_cs;
- sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024,
- 0, PIPE_USAGE_DEFAULT,
- SI_RESOURCE_FLAG_32BIT |
- (use_sdma_upload ?
- SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0));
- if (!sctx->b.const_uploader)
- goto fail;
-
- if (use_sdma_upload)
- u_upload_enable_flush_explicit(sctx->b.const_uploader);
-
- sctx->gfx_cs = ws->cs_create(sctx->ctx,
- sctx->has_graphics ? RING_GFX : RING_COMPUTE,
- (void*)si_flush_gfx_cs, sctx, stop_exec_on_failure);
-
- /* Border colors. */
- sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS *
- sizeof(*sctx->border_color_table));
- if (!sctx->border_color_table)
- goto fail;
-
- sctx->border_color_buffer = si_resource(
- pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT,
- SI_MAX_BORDER_COLORS *
- sizeof(*sctx->border_color_table)));
- if (!sctx->border_color_buffer)
- goto fail;
-
- sctx->border_color_map =
- ws->buffer_map(sctx->border_color_buffer->buf,
- NULL, PIPE_TRANSFER_WRITE);
- if (!sctx->border_color_map)
- goto fail;
-
- sctx->ngg = sscreen->use_ngg;
-
- /* Initialize context functions used by graphics and compute. */
- if (sctx->chip_class >= GFX10)
- sctx->emit_cache_flush = gfx10_emit_cache_flush;
- else
- sctx->emit_cache_flush = si_emit_cache_flush;
-
- sctx->b.emit_string_marker = si_emit_string_marker;
- sctx->b.set_debug_callback = si_set_debug_callback;
- sctx->b.set_log_context = si_set_log_context;
- sctx->b.set_context_param = si_set_context_param;
- sctx->b.get_device_reset_status = si_get_reset_status;
- sctx->b.set_device_reset_callback = si_set_device_reset_callback;
-
- si_init_all_descriptors(sctx);
- si_init_buffer_functions(sctx);
- si_init_clear_functions(sctx);
- si_init_blit_functions(sctx);
- si_init_compute_functions(sctx);
- si_init_compute_blit_functions(sctx);
- si_init_debug_functions(sctx);
- si_init_fence_functions(sctx);
- si_init_query_functions(sctx);
- si_init_state_compute_functions(sctx);
- si_init_context_texture_functions(sctx);
-
- /* Initialize graphics-only context functions. */
- if (sctx->has_graphics) {
- if (sctx->chip_class >= GFX10)
- gfx10_init_query(sctx);
- si_init_msaa_functions(sctx);
- si_init_shader_functions(sctx);
- si_init_state_functions(sctx);
- si_init_streamout_functions(sctx);
- si_init_viewport_functions(sctx);
-
- sctx->blitter = util_blitter_create(&sctx->b);
- if (sctx->blitter == NULL)
- goto fail;
- sctx->blitter->skip_viewport_restore = true;
-
- /* Some states are expected to be always non-NULL. */
- sctx->noop_blend = util_blitter_get_noop_blend_state(sctx->blitter);
- sctx->queued.named.blend = sctx->noop_blend;
-
- sctx->noop_dsa = util_blitter_get_noop_dsa_state(sctx->blitter);
- sctx->queued.named.dsa = sctx->noop_dsa;
-
- sctx->discard_rasterizer_state =
- util_blitter_get_discard_rasterizer_state(sctx->blitter);
- sctx->queued.named.rasterizer = sctx->discard_rasterizer_state;
-
- si_init_draw_functions(sctx);
-
- /* If aux_context == NULL, we are initializing aux_context right now. */
- bool is_aux_context = !sscreen->aux_context;
- si_initialize_prim_discard_tunables(sscreen, is_aux_context,
- &sctx->prim_discard_vertex_count_threshold,
- &sctx->index_ring_size_per_ib);
- }
-
- /* Initialize SDMA functions. */
- if (sctx->chip_class >= GFX7)
- cik_init_sdma_functions(sctx);
- else
- sctx->dma_copy = si_resource_copy_region;
-
- if (sscreen->debug_flags & DBG(FORCE_SDMA))
- sctx->b.resource_copy_region = sctx->dma_copy;
-
- sctx->sample_mask = 0xffff;
-
- /* Initialize multimedia functions. */
- if (sscreen->info.has_hw_decode) {
- sctx->b.create_video_codec = si_uvd_create_decoder;
- sctx->b.create_video_buffer = si_video_buffer_create;
- } else {
- sctx->b.create_video_codec = vl_create_decoder;
- sctx->b.create_video_buffer = vl_video_buffer_create;
- }
-
- if (sctx->chip_class >= GFX9 ||
- si_compute_prim_discard_enabled(sctx)) {
- sctx->wait_mem_scratch = si_resource(
- pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8));
- if (!sctx->wait_mem_scratch)
- goto fail;
-
- /* Initialize the memory. */
- si_cp_write_data(sctx, sctx->wait_mem_scratch, 0, 4,
- V_370_MEM, V_370_ME, &sctx->wait_mem_number);
- }
-
- /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads
- * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */
- if (sctx->chip_class == GFX7) {
- sctx->null_const_buf.buffer =
- pipe_aligned_buffer_create(screen,
- SI_RESOURCE_FLAG_32BIT,
- PIPE_USAGE_DEFAULT, 16,
- sctx->screen->info.tcc_cache_line_size);
- if (!sctx->null_const_buf.buffer)
- goto fail;
- sctx->null_const_buf.buffer_size = sctx->null_const_buf.buffer->width0;
-
- unsigned start_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE;
- for (shader = start_shader; shader < SI_NUM_SHADERS; shader++) {
- for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) {
- sctx->b.set_constant_buffer(&sctx->b, shader, i,
- &sctx->null_const_buf);
- }
- }
-
- si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS,
- &sctx->null_const_buf);
- si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS,
- &sctx->null_const_buf);
- si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES,
- &sctx->null_const_buf);
- si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE,
- &sctx->null_const_buf);
- si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS,
- &sctx->null_const_buf);
- }
-
- uint64_t max_threads_per_block;
- screen->get_compute_param(screen, PIPE_SHADER_IR_NIR,
- PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK,
- &max_threads_per_block);
-
- /* The maximum number of scratch waves. Scratch space isn't divided
- * evenly between CUs. The number is only a function of the number of CUs.
- * We can decrease the constant to decrease the scratch buffer size.
- *
- * sctx->scratch_waves must be >= the maximum posible size of
- * 1 threadgroup, so that the hw doesn't hang from being unable
- * to start any.
- *
- * The recommended value is 4 per CU at most. Higher numbers don't
- * bring much benefit, but they still occupy chip resources (think
- * async compute). I've seen ~2% performance difference between 4 and 32.
- */
- sctx->scratch_waves = MAX2(32 * sscreen->info.num_good_compute_units,
- max_threads_per_block / 64);
-
- /* Bindless handles. */
- sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
- _mesa_key_pointer_equal);
- sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
- _mesa_key_pointer_equal);
-
- util_dynarray_init(&sctx->resident_tex_handles, NULL);
- util_dynarray_init(&sctx->resident_img_handles, NULL);
- util_dynarray_init(&sctx->resident_tex_needs_color_decompress, NULL);
- util_dynarray_init(&sctx->resident_img_needs_color_decompress, NULL);
- util_dynarray_init(&sctx->resident_tex_needs_depth_decompress, NULL);
-
- sctx->sample_pos_buffer =
- pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT,
- sizeof(sctx->sample_positions));
- pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0,
- sizeof(sctx->sample_positions), &sctx->sample_positions);
-
- /* this must be last */
- si_begin_new_gfx_cs(sctx);
-
- if (sctx->chip_class == GFX7) {
- /* Clear the NULL constant buffer, because loads should return zeros.
- * Note that this forces CP DMA to be used, because clover deadlocks
- * for some reason when the compute codepath is used.
- */
- uint32_t clear_value = 0;
- si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
- sctx->null_const_buf.buffer->width0,
- &clear_value, 4, SI_COHERENCY_SHADER, true);
- }
- return &sctx->b;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ STATIC_ASSERT(DBG_COUNT <= 64);
+
+ /* Don't create a context if it's not compute-only and hw is compute-only. */
+ if (!sscreen->info.has_graphics && !(flags & PIPE_CONTEXT_COMPUTE_ONLY))
+ return NULL;
+
+ struct si_context *sctx = CALLOC_STRUCT(si_context);
+ struct radeon_winsys *ws = sscreen->ws;
+ int shader, i;
+ bool stop_exec_on_failure = (flags & PIPE_CONTEXT_LOSE_CONTEXT_ON_RESET) != 0;
+
+ if (!sctx)
+ return NULL;
+
+ sctx->has_graphics = sscreen->info.chip_class == GFX6 || !(flags & PIPE_CONTEXT_COMPUTE_ONLY);
+
+ if (flags & PIPE_CONTEXT_DEBUG)
+ sscreen->record_llvm_ir = true; /* racy but not critical */
+
+ sctx->b.screen = screen; /* this must be set first */
+ sctx->b.priv = NULL;
+ sctx->b.destroy = si_destroy_context;
+ sctx->screen = sscreen; /* Easy access to the screen/winsys. */
+ sctx->is_debug = (flags & PIPE_CONTEXT_DEBUG) != 0;
+
+ slab_create_child(&sctx->pool_transfers, &sscreen->pool_transfers);
+ slab_create_child(&sctx->pool_transfers_unsync, &sscreen->pool_transfers);
+
+ sctx->ws = sscreen->ws;
+ sctx->family = sscreen->info.family;
+ sctx->chip_class = sscreen->info.chip_class;
+
+ if (sctx->chip_class == GFX7 || sctx->chip_class == GFX8 || sctx->chip_class == GFX9) {
+ sctx->eop_bug_scratch = si_resource(pipe_buffer_create(
+ &sscreen->b, 0, PIPE_USAGE_DEFAULT, 16 * sscreen->info.num_render_backends));
+ if (!sctx->eop_bug_scratch)
+ goto fail;
+ }
+
+ /* Initialize context allocators. */
+ sctx->allocator_zeroed_memory =
+ u_suballocator_create(&sctx->b, 128 * 1024, 0, PIPE_USAGE_DEFAULT,
+ SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_CLEAR, false);
+ if (!sctx->allocator_zeroed_memory)
+ goto fail;
+
+ sctx->b.stream_uploader =
+ u_upload_create(&sctx->b, 1024 * 1024, 0, PIPE_USAGE_STREAM, SI_RESOURCE_FLAG_READ_ONLY);
+ if (!sctx->b.stream_uploader)
+ goto fail;
+
+ sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024, 0, PIPE_USAGE_STAGING, 0);
+ if (!sctx->cached_gtt_allocator)
+ goto fail;
+
+ sctx->ctx = sctx->ws->ctx_create(sctx->ws);
+ if (!sctx->ctx)
+ goto fail;
+
+ if (sscreen->info.num_rings[RING_DMA] && !(sscreen->debug_flags & DBG(NO_SDMA)) &&
+ /* SDMA causes corruption on RX 580:
+ * https://gitlab.freedesktop.org/mesa/mesa/issues/1399
+ * https://gitlab.freedesktop.org/mesa/mesa/issues/1889
+ */
+ (sctx->chip_class != GFX8 || sscreen->debug_flags & DBG(FORCE_SDMA)) &&
+ /* SDMA sometimes times out on gfx10, so disable it for now. See:
+ * https://bugs.freedesktop.org/show_bug.cgi?id=111481
+ * https://gitlab.freedesktop.org/mesa/mesa/issues/1907
+ */
+ (sctx->chip_class != GFX10 || sscreen->debug_flags & DBG(FORCE_SDMA))) {
+ sctx->sdma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA, (void *)si_flush_dma_cs, sctx,
+ stop_exec_on_failure);
+ }
+
+ bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->sdma_cs;
+ sctx->b.const_uploader =
+ u_upload_create(&sctx->b, 256 * 1024, 0, PIPE_USAGE_DEFAULT,
+ SI_RESOURCE_FLAG_32BIT |
+ (use_sdma_upload ? SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0));
+ if (!sctx->b.const_uploader)
+ goto fail;
+
+ if (use_sdma_upload)
+ u_upload_enable_flush_explicit(sctx->b.const_uploader);
+
+ sctx->gfx_cs = ws->cs_create(sctx->ctx, sctx->has_graphics ? RING_GFX : RING_COMPUTE,
+ (void *)si_flush_gfx_cs, sctx, stop_exec_on_failure);
+
+ /* Border colors. */
+ sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS * sizeof(*sctx->border_color_table));
+ if (!sctx->border_color_table)
+ goto fail;
+
+ sctx->border_color_buffer = si_resource(pipe_buffer_create(
+ screen, 0, PIPE_USAGE_DEFAULT, SI_MAX_BORDER_COLORS * sizeof(*sctx->border_color_table)));
+ if (!sctx->border_color_buffer)
+ goto fail;
+
+ sctx->border_color_map =
+ ws->buffer_map(sctx->border_color_buffer->buf, NULL, PIPE_TRANSFER_WRITE);
+ if (!sctx->border_color_map)
+ goto fail;
+
+ sctx->ngg = sscreen->use_ngg;
+
+ /* Initialize context functions used by graphics and compute. */
+ if (sctx->chip_class >= GFX10)
+ sctx->emit_cache_flush = gfx10_emit_cache_flush;
+ else
+ sctx->emit_cache_flush = si_emit_cache_flush;
+
+ sctx->b.emit_string_marker = si_emit_string_marker;
+ sctx->b.set_debug_callback = si_set_debug_callback;
+ sctx->b.set_log_context = si_set_log_context;
+ sctx->b.set_context_param = si_set_context_param;
+ sctx->b.get_device_reset_status = si_get_reset_status;
+ sctx->b.set_device_reset_callback = si_set_device_reset_callback;
+
+ si_init_all_descriptors(sctx);
+ si_init_buffer_functions(sctx);
+ si_init_clear_functions(sctx);
+ si_init_blit_functions(sctx);
+ si_init_compute_functions(sctx);
+ si_init_compute_blit_functions(sctx);
+ si_init_debug_functions(sctx);
+ si_init_fence_functions(sctx);
+ si_init_query_functions(sctx);
+ si_init_state_compute_functions(sctx);
+ si_init_context_texture_functions(sctx);
+
+ /* Initialize graphics-only context functions. */
+ if (sctx->has_graphics) {
+ if (sctx->chip_class >= GFX10)
+ gfx10_init_query(sctx);
+ si_init_msaa_functions(sctx);
+ si_init_shader_functions(sctx);
+ si_init_state_functions(sctx);
+ si_init_streamout_functions(sctx);
+ si_init_viewport_functions(sctx);
+
+ sctx->blitter = util_blitter_create(&sctx->b);
+ if (sctx->blitter == NULL)
+ goto fail;
+ sctx->blitter->skip_viewport_restore = true;
+
+ /* Some states are expected to be always non-NULL. */
+ sctx->noop_blend = util_blitter_get_noop_blend_state(sctx->blitter);
+ sctx->queued.named.blend = sctx->noop_blend;
+
+ sctx->noop_dsa = util_blitter_get_noop_dsa_state(sctx->blitter);
+ sctx->queued.named.dsa = sctx->noop_dsa;
+
+ sctx->discard_rasterizer_state = util_blitter_get_discard_rasterizer_state(sctx->blitter);
+ sctx->queued.named.rasterizer = sctx->discard_rasterizer_state;
+
+ si_init_draw_functions(sctx);
+
+ /* If aux_context == NULL, we are initializing aux_context right now. */
+ bool is_aux_context = !sscreen->aux_context;
+ si_initialize_prim_discard_tunables(sscreen, is_aux_context,
+ &sctx->prim_discard_vertex_count_threshold,
+ &sctx->index_ring_size_per_ib);
+ }
+
+ /* Initialize SDMA functions. */
+ if (sctx->chip_class >= GFX7)
+ cik_init_sdma_functions(sctx);
+ else
+ sctx->dma_copy = si_resource_copy_region;
+
+ if (sscreen->debug_flags & DBG(FORCE_SDMA))
+ sctx->b.resource_copy_region = sctx->dma_copy;
+
+ sctx->sample_mask = 0xffff;
+
+ /* Initialize multimedia functions. */
+ if (sscreen->info.has_hw_decode) {
+ sctx->b.create_video_codec = si_uvd_create_decoder;
+ sctx->b.create_video_buffer = si_video_buffer_create;
+ } else {
+ sctx->b.create_video_codec = vl_create_decoder;
+ sctx->b.create_video_buffer = vl_video_buffer_create;
+ }
+
+ if (sctx->chip_class >= GFX9 || si_compute_prim_discard_enabled(sctx)) {
+ sctx->wait_mem_scratch = si_resource(pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8));
+ if (!sctx->wait_mem_scratch)
+ goto fail;
+
+ /* Initialize the memory. */
+ si_cp_write_data(sctx, sctx->wait_mem_scratch, 0, 4, V_370_MEM, V_370_ME,
+ &sctx->wait_mem_number);
+ }
+
+ /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads
+ * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */
+ if (sctx->chip_class == GFX7) {
+ sctx->null_const_buf.buffer =
+ pipe_aligned_buffer_create(screen, SI_RESOURCE_FLAG_32BIT, PIPE_USAGE_DEFAULT, 16,
+ sctx->screen->info.tcc_cache_line_size);
+ if (!sctx->null_const_buf.buffer)
+ goto fail;
+ sctx->null_const_buf.buffer_size = sctx->null_const_buf.buffer->width0;
+
+ unsigned start_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE;
+ for (shader = start_shader; shader < SI_NUM_SHADERS; shader++) {
+ for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) {
+ sctx->b.set_constant_buffer(&sctx->b, shader, i, &sctx->null_const_buf);
+ }
+ }
+
+ si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &sctx->null_const_buf);
+ si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &sctx->null_const_buf);
+ si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &sctx->null_const_buf);
+ si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &sctx->null_const_buf);
+ si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &sctx->null_const_buf);
+ }
+
+ uint64_t max_threads_per_block;
+ screen->get_compute_param(screen, PIPE_SHADER_IR_NIR, PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK,
+ &max_threads_per_block);
+
+ /* The maximum number of scratch waves. Scratch space isn't divided
+ * evenly between CUs. The number is only a function of the number of CUs.
+ * We can decrease the constant to decrease the scratch buffer size.
+ *
+ * sctx->scratch_waves must be >= the maximum possible size of
+ * 1 threadgroup, so that the hw doesn't hang from being unable
+ * to start any.
+ *
+ * The recommended value is 4 per CU at most. Higher numbers don't
+ * bring much benefit, but they still occupy chip resources (think
+ * async compute). I've seen ~2% performance difference between 4 and 32.
+ */
+ sctx->scratch_waves =
+ MAX2(32 * sscreen->info.num_good_compute_units, max_threads_per_block / 64);
+
+ /* Bindless handles. */
+ sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+ sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+
+ util_dynarray_init(&sctx->resident_tex_handles, NULL);
+ util_dynarray_init(&sctx->resident_img_handles, NULL);
+ util_dynarray_init(&sctx->resident_tex_needs_color_decompress, NULL);
+ util_dynarray_init(&sctx->resident_img_needs_color_decompress, NULL);
+ util_dynarray_init(&sctx->resident_tex_needs_depth_decompress, NULL);
+
+ sctx->sample_pos_buffer =
+ pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT, sizeof(sctx->sample_positions));
+ pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0, sizeof(sctx->sample_positions),
+ &sctx->sample_positions);
+
+ /* this must be last */
+ si_begin_new_gfx_cs(sctx);
+
+ if (sctx->chip_class == GFX7) {
+ /* Clear the NULL constant buffer, because loads should return zeros.
+ * Note that this forces CP DMA to be used, because clover deadlocks
+ * for some reason when the compute codepath is used.
+ */
+ uint32_t clear_value = 0;
+ si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0, sctx->null_const_buf.buffer->width0,
+ &clear_value, 4, SI_COHERENCY_SHADER, true);
+ }
+ return &sctx->b;
fail:
- fprintf(stderr, "radeonsi: Failed to create a context.\n");
- si_destroy_context(&sctx->b);
- return NULL;
+ fprintf(stderr, "radeonsi: Failed to create a context.\n");
+ si_destroy_context(&sctx->b);
+ return NULL;
}
-static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen,
- void *priv, unsigned flags)
+static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, void *priv,
+ unsigned flags)
{
- struct si_screen *sscreen = (struct si_screen *)screen;
- struct pipe_context *ctx;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct pipe_context *ctx;
- if (sscreen->debug_flags & DBG(CHECK_VM))
- flags |= PIPE_CONTEXT_DEBUG;
+ if (sscreen->debug_flags & DBG(CHECK_VM))
+ flags |= PIPE_CONTEXT_DEBUG;
- ctx = si_create_context(screen, flags);
+ ctx = si_create_context(screen, flags);
- if (!(flags & PIPE_CONTEXT_PREFER_THREADED))
- return ctx;
+ if (!(flags & PIPE_CONTEXT_PREFER_THREADED))
+ return ctx;
- /* Clover (compute-only) is unsupported. */
- if (flags & PIPE_CONTEXT_COMPUTE_ONLY)
- return ctx;
+ /* Clover (compute-only) is unsupported. */
+ if (flags & PIPE_CONTEXT_COMPUTE_ONLY)
+ return ctx;
- /* When shaders are logged to stderr, asynchronous compilation is
- * disabled too. */
- if (sscreen->debug_flags & DBG_ALL_SHADERS)
- return ctx;
+ /* When shaders are logged to stderr, asynchronous compilation is
+ * disabled too. */
+ if (sscreen->debug_flags & DBG_ALL_SHADERS)
+ return ctx;
- /* Use asynchronous flushes only on amdgpu, since the radeon
- * implementation for fence_server_sync is incomplete. */
- return threaded_context_create(ctx, &sscreen->pool_transfers,
- si_replace_buffer_storage,
- sscreen->info.is_amdgpu ? si_create_fence : NULL,
- &((struct si_context*)ctx)->tc);
+ /* Use asynchronous flushes only on amdgpu, since the radeon
+ * implementation for fence_server_sync is incomplete. */
+ return threaded_context_create(ctx, &sscreen->pool_transfers, si_replace_buffer_storage,
+ sscreen->info.is_amdgpu ? si_create_fence : NULL,
+ &((struct si_context *)ctx)->tc);
}
/*
* pipe_screen
*/
-static void si_destroy_screen(struct pipe_screen* pscreen)
+static void si_destroy_screen(struct pipe_screen *pscreen)
{
- struct si_screen *sscreen = (struct si_screen *)pscreen;
- struct si_shader_part *parts[] = {
- sscreen->vs_prologs,
- sscreen->tcs_epilogs,
- sscreen->gs_prologs,
- sscreen->ps_prologs,
- sscreen->ps_epilogs
- };
- unsigned i;
-
- if (!sscreen->ws->unref(sscreen->ws))
- return;
-
- if (sscreen->debug_flags & DBG(CACHE_STATS)) {
- printf("live shader cache: hits = %u, misses = %u\n",
- sscreen->live_shader_cache.hits,
- sscreen->live_shader_cache.misses);
- printf("memory shader cache: hits = %u, misses = %u\n",
- sscreen->num_memory_shader_cache_hits,
- sscreen->num_memory_shader_cache_misses);
- printf("disk shader cache: hits = %u, misses = %u\n",
- sscreen->num_disk_shader_cache_hits,
- sscreen->num_disk_shader_cache_misses);
- }
-
- simple_mtx_destroy(&sscreen->aux_context_lock);
-
- struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log;
- if (aux_log) {
- sscreen->aux_context->set_log_context(sscreen->aux_context, NULL);
- u_log_context_destroy(aux_log);
- FREE(aux_log);
- }
-
- sscreen->aux_context->destroy(sscreen->aux_context);
-
- util_queue_destroy(&sscreen->shader_compiler_queue);
- util_queue_destroy(&sscreen->shader_compiler_queue_low_priority);
-
- /* Release the reference on glsl types of the compiler threads. */
- glsl_type_singleton_decref();
-
- for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++)
- si_destroy_compiler(&sscreen->compiler[i]);
-
- for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++)
- si_destroy_compiler(&sscreen->compiler_lowp[i]);
-
- /* Free shader parts. */
- for (i = 0; i < ARRAY_SIZE(parts); i++) {
- while (parts[i]) {
- struct si_shader_part *part = parts[i];
-
- parts[i] = part->next;
- si_shader_binary_clean(&part->binary);
- FREE(part);
- }
- }
- simple_mtx_destroy(&sscreen->shader_parts_mutex);
- si_destroy_shader_cache(sscreen);
-
- si_destroy_perfcounters(sscreen);
- si_gpu_load_kill_thread(sscreen);
-
- simple_mtx_destroy(&sscreen->gpu_load_mutex);
-
- slab_destroy_parent(&sscreen->pool_transfers);
-
- disk_cache_destroy(sscreen->disk_shader_cache);
- util_live_shader_cache_deinit(&sscreen->live_shader_cache);
- sscreen->ws->destroy(sscreen->ws);
- FREE(sscreen);
+ struct si_screen *sscreen = (struct si_screen *)pscreen;
+ struct si_shader_part *parts[] = {sscreen->vs_prologs, sscreen->tcs_epilogs, sscreen->gs_prologs,
+ sscreen->ps_prologs, sscreen->ps_epilogs};
+ unsigned i;
+
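+ /* The winsys may be shared by multiple screens; unref() returns false while
+ * other references remain, in which case nothing must be torn down yet. */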
+ if (!sscreen->ws->unref(sscreen->ws))
+ return;
+
+ if (sscreen->debug_flags & DBG(CACHE_STATS)) {
+ printf("live shader cache: hits = %u, misses = %u\n", sscreen->live_shader_cache.hits,
+ sscreen->live_shader_cache.misses);
+ printf("memory shader cache: hits = %u, misses = %u\n", sscreen->num_memory_shader_cache_hits,
+ sscreen->num_memory_shader_cache_misses);
+ printf("disk shader cache: hits = %u, misses = %u\n", sscreen->num_disk_shader_cache_hits,
+ sscreen->num_disk_shader_cache_misses);
+ }
+
+ simple_mtx_destroy(&sscreen->aux_context_lock);
+
+ struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log;
+ if (aux_log) {
+ sscreen->aux_context->set_log_context(sscreen->aux_context, NULL);
+ u_log_context_destroy(aux_log);
+ FREE(aux_log);
+ }
+
+ sscreen->aux_context->destroy(sscreen->aux_context);
+
+ util_queue_destroy(&sscreen->shader_compiler_queue);
+ util_queue_destroy(&sscreen->shader_compiler_queue_low_priority);
+
+ /* Release the reference on glsl types of the compiler threads. */
+ glsl_type_singleton_decref();
+
+ for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++)
+ si_destroy_compiler(&sscreen->compiler[i]);
+
+ for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++)
+ si_destroy_compiler(&sscreen->compiler_lowp[i]);
+
+ /* Free shader parts. */
+ for (i = 0; i < ARRAY_SIZE(parts); i++) {
+ while (parts[i]) {
+ struct si_shader_part *part = parts[i];
+
+ parts[i] = part->next;
+ si_shader_binary_clean(&part->binary);
+ FREE(part);
+ }
+ }
+ simple_mtx_destroy(&sscreen->shader_parts_mutex);
+ si_destroy_shader_cache(sscreen);
+
+ si_destroy_perfcounters(sscreen);
+ si_gpu_load_kill_thread(sscreen);
+
+ simple_mtx_destroy(&sscreen->gpu_load_mutex);
+
+ slab_destroy_parent(&sscreen->pool_transfers);
+
+ disk_cache_destroy(sscreen->disk_shader_cache);
+ util_live_shader_cache_deinit(&sscreen->live_shader_cache);
+ sscreen->ws->destroy(sscreen->ws);
+ FREE(sscreen);
}
static void si_init_gs_info(struct si_screen *sscreen)
{
- sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.chip_class,
- sscreen->info.family);
+ sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.chip_class, sscreen->info.family);
}
static void si_test_vmfault(struct si_screen *sscreen, uint64_t test_flags)
{
- struct pipe_context *ctx = sscreen->aux_context;
- struct si_context *sctx = (struct si_context *)ctx;
- struct pipe_resource *buf =
- pipe_buffer_create_const0(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 64);
-
- if (!buf) {
- puts("Buffer allocation failed.");
- exit(1);
- }
-
- si_resource(buf)->gpu_address = 0; /* cause a VM fault */
-
- if (test_flags & DBG(TEST_VMFAULT_CP)) {
- si_cp_dma_copy_buffer(sctx, buf, buf, 0, 4, 4, 0,
- SI_COHERENCY_NONE, L2_BYPASS);
- ctx->flush(ctx, NULL, 0);
- puts("VM fault test: CP - done.");
- }
- if (test_flags & DBG(TEST_VMFAULT_SDMA)) {
- si_sdma_clear_buffer(sctx, buf, 0, 4, 0);
- ctx->flush(ctx, NULL, 0);
- puts("VM fault test: SDMA - done.");
- }
- if (test_flags & DBG(TEST_VMFAULT_SHADER)) {
- util_test_constant_buffer(ctx, buf);
- puts("VM fault test: Shader - done.");
- }
- exit(0);
+ struct pipe_context *ctx = sscreen->aux_context;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct pipe_resource *buf = pipe_buffer_create_const0(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 64);
+
+ if (!buf) {
+ puts("Buffer allocation failed.");
+ exit(1);
+ }
+
+ si_resource(buf)->gpu_address = 0; /* cause a VM fault */
+
+ if (test_flags & DBG(TEST_VMFAULT_CP)) {
+ si_cp_dma_copy_buffer(sctx, buf, buf, 0, 4, 4, 0, SI_COHERENCY_NONE, L2_BYPASS);
+ ctx->flush(ctx, NULL, 0);
+ puts("VM fault test: CP - done.");
+ }
+ if (test_flags & DBG(TEST_VMFAULT_SDMA)) {
+ si_sdma_clear_buffer(sctx, buf, 0, 4, 0);
+ ctx->flush(ctx, NULL, 0);
+ puts("VM fault test: SDMA - done.");
+ }
+ if (test_flags & DBG(TEST_VMFAULT_SHADER)) {
+ util_test_constant_buffer(ctx, buf);
+ puts("VM fault test: Shader - done.");
+ }
+ exit(0);
}
-static void si_test_gds_memory_management(struct si_context *sctx,
- unsigned alloc_size, unsigned alignment,
- enum radeon_bo_domain domain)
+static void si_test_gds_memory_management(struct si_context *sctx, unsigned alloc_size,
+ unsigned alignment, enum radeon_bo_domain domain)
{
- struct radeon_winsys *ws = sctx->ws;
- struct radeon_cmdbuf *cs[8];
- struct pb_buffer *gds_bo[ARRAY_SIZE(cs)];
-
- for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) {
- cs[i] = ws->cs_create(sctx->ctx, RING_COMPUTE,
- NULL, NULL, false);
- gds_bo[i] = ws->buffer_create(ws, alloc_size, alignment, domain, 0);
- assert(gds_bo[i]);
- }
-
- for (unsigned iterations = 0; iterations < 20000; iterations++) {
- for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) {
- /* This clears GDS with CP DMA.
- *
- * We don't care if GDS is present. Just add some packet
- * to make the GPU busy for a moment.
- */
- si_cp_dma_clear_buffer(sctx, cs[i], NULL, 0, alloc_size, 0,
- SI_CPDMA_SKIP_BO_LIST_UPDATE |
- SI_CPDMA_SKIP_CHECK_CS_SPACE |
- SI_CPDMA_SKIP_GFX_SYNC, 0, 0);
-
- ws->cs_add_buffer(cs[i], gds_bo[i], domain,
- RADEON_USAGE_READWRITE, 0);
- ws->cs_flush(cs[i], PIPE_FLUSH_ASYNC, NULL);
- }
- }
- exit(0);
+ struct radeon_winsys *ws = sctx->ws;
+ struct radeon_cmdbuf *cs[8];
+ struct pb_buffer *gds_bo[ARRAY_SIZE(cs)];
+
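+ /* Create one compute IB and one GDS buffer per slot, then submit work that
+ * references them in a long loop to stress GDS memory management. */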
+ for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) {
+ cs[i] = ws->cs_create(sctx->ctx, RING_COMPUTE, NULL, NULL, false);
+ gds_bo[i] = ws->buffer_create(ws, alloc_size, alignment, domain, 0);
+ assert(gds_bo[i]);
+ }
+
+ for (unsigned iterations = 0; iterations < 20000; iterations++) {
+ for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) {
+ /* This clears GDS with CP DMA.
+ *
+ * We don't care if GDS is present. Just add some packet
+ * to make the GPU busy for a moment.
+ */
+ si_cp_dma_clear_buffer(
+ sctx, cs[i], NULL, 0, alloc_size, 0,
+ SI_CPDMA_SKIP_BO_LIST_UPDATE | SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_GFX_SYNC, 0,
+ 0);
+
+ ws->cs_add_buffer(cs[i], gds_bo[i], domain, RADEON_USAGE_READWRITE, 0);
+ ws->cs_flush(cs[i], PIPE_FLUSH_ASYNC, NULL);
+ }
+ }
+ exit(0);
}
static void si_disk_cache_create(struct si_screen *sscreen)
{
- /* Don't use the cache if shader dumping is enabled. */
- if (sscreen->debug_flags & DBG_ALL_SHADERS)
- return;
-
- struct mesa_sha1 ctx;
- unsigned char sha1[20];
- char cache_id[20 * 2 + 1];
-
- _mesa_sha1_init(&ctx);
-
- if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx) ||
- !disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo,
- &ctx))
- return;
-
- _mesa_sha1_final(&ctx, sha1);
- disk_cache_format_hex_id(cache_id, sha1, 20 * 2);
-
- /* These flags affect shader compilation. */
- #define ALL_FLAGS (DBG(GISEL))
- uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS;
-
- /* Add the high bits of 32-bit addresses, which affects
- * how 32-bit addresses are expanded to 64 bits.
- */
- STATIC_ASSERT(ALL_FLAGS <= UINT_MAX);
- assert((int16_t)sscreen->info.address32_hi == (int32_t)sscreen->info.address32_hi);
- shader_debug_flags |= (uint64_t)(sscreen->info.address32_hi & 0xffff) << 32;
-
- sscreen->disk_shader_cache =
- disk_cache_create(sscreen->info.name,
- cache_id,
- shader_debug_flags);
+ /* Don't use the cache if shader dumping is enabled. */
+ if (sscreen->debug_flags & DBG_ALL_SHADERS)
+ return;
+
+ struct mesa_sha1 ctx;
+ unsigned char sha1[20];
+ char cache_id[20 * 2 + 1];
+
+ _mesa_sha1_init(&ctx);
+
+ if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx) ||
+ !disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo, &ctx))
+ return;
+
+ _mesa_sha1_final(&ctx, sha1);
+ disk_cache_format_hex_id(cache_id, sha1, 20 * 2);
+
+/* These flags affect shader compilation. */
+#define ALL_FLAGS (DBG(GISEL))
+ uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS;
+
+ /* Add the high bits of 32-bit addresses, which affects
+ * how 32-bit addresses are expanded to 64 bits.
+ */
+ STATIC_ASSERT(ALL_FLAGS <= UINT_MAX);
+ assert((int16_t)sscreen->info.address32_hi == (int32_t)sscreen->info.address32_hi);
+ shader_debug_flags |= (uint64_t)(sscreen->info.address32_hi & 0xffff) << 32;
+
+ sscreen->disk_shader_cache = disk_cache_create(sscreen->info.name, cache_id, shader_debug_flags);
}
-static void si_set_max_shader_compiler_threads(struct pipe_screen *screen,
- unsigned max_threads)
+static void si_set_max_shader_compiler_threads(struct pipe_screen *screen, unsigned max_threads)
{
- struct si_screen *sscreen = (struct si_screen *)screen;
+ struct si_screen *sscreen = (struct si_screen *)screen;
- /* This function doesn't allow a greater number of threads than
- * the queue had at its creation. */
- util_queue_adjust_num_threads(&sscreen->shader_compiler_queue,
- max_threads);
- /* Don't change the number of threads on the low priority queue. */
+ /* This function doesn't allow a greater number of threads than
+ * the queue had at its creation. */
+ util_queue_adjust_num_threads(&sscreen->shader_compiler_queue, max_threads);
+ /* Don't change the number of threads on the low priority queue. */
}
-static bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen,
- void *shader,
- enum pipe_shader_type shader_type)
+static bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *shader,
+ enum pipe_shader_type shader_type)
{
- struct si_shader_selector *sel = (struct si_shader_selector *)shader;
+ struct si_shader_selector *sel = (struct si_shader_selector *)shader;
- return util_queue_fence_is_signalled(&sel->ready);
+ return util_queue_fence_is_signalled(&sel->ready);
}
-static struct pipe_screen *
-radeonsi_screen_create_impl(struct radeon_winsys *ws,
- const struct pipe_screen_config *config)
+static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
+ const struct pipe_screen_config *config)
{
- struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
- unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads;
- uint64_t test_flags;
-
- if (!sscreen) {
- return NULL;
- }
-
- sscreen->ws = ws;
- ws->query_info(ws, &sscreen->info);
-
- if (sscreen->info.chip_class == GFX10 && LLVM_VERSION_MAJOR < 9) {
- fprintf(stderr, "radeonsi: Navi family support requires LLVM 9 or higher\n");
- FREE(sscreen);
- return NULL;
- }
-
- if (sscreen->info.chip_class >= GFX9) {
- sscreen->se_tile_repeat = 32 * sscreen->info.max_se;
- } else {
- ac_get_raster_config(&sscreen->info,
- &sscreen->pa_sc_raster_config,
- &sscreen->pa_sc_raster_config_1,
- &sscreen->se_tile_repeat);
- }
-
- sscreen->debug_flags = debug_get_flags_option("R600_DEBUG",
- debug_options, 0);
- sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG",
- debug_options, 0);
- test_flags = debug_get_flags_option("AMD_TEST",
- test_options, 0);
-
- if (sscreen->debug_flags & DBG(NO_GFX))
- sscreen->info.has_graphics = false;
-
- /* Set functions first. */
- sscreen->b.context_create = si_pipe_create_context;
- sscreen->b.destroy = si_destroy_screen;
- sscreen->b.set_max_shader_compiler_threads =
- si_set_max_shader_compiler_threads;
- sscreen->b.is_parallel_shader_compilation_finished =
- si_is_parallel_shader_compilation_finished;
- sscreen->b.finalize_nir = si_finalize_nir;
-
- si_init_screen_get_functions(sscreen);
- si_init_screen_buffer_functions(sscreen);
- si_init_screen_fence_functions(sscreen);
- si_init_screen_state_functions(sscreen);
- si_init_screen_texture_functions(sscreen);
- si_init_screen_query_functions(sscreen);
- si_init_screen_live_shader_cache(sscreen);
-
- /* Set these flags in debug_flags early, so that the shader cache takes
- * them into account.
- */
- if (driQueryOptionb(config->options,
- "glsl_correct_derivatives_after_discard"))
- sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL);
-
- if (sscreen->debug_flags & DBG(INFO))
- ac_print_gpu_info(&sscreen->info);
-
- slab_create_parent(&sscreen->pool_transfers,
- sizeof(struct si_transfer), 64);
-
- sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
- if (sscreen->force_aniso == -1) {
- sscreen->force_aniso = MIN2(16, debug_get_num_option("AMD_TEX_ANISO", -1));
- }
-
- if (sscreen->force_aniso >= 0) {
- printf("radeonsi: Forcing anisotropy filter to %ix\n",
- /* round down to a power of two */
- 1 << util_logbase2(sscreen->force_aniso));
- }
-
- (void) simple_mtx_init(&sscreen->aux_context_lock, mtx_plain);
- (void) simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain);
-
- si_init_gs_info(sscreen);
- if (!si_init_shader_cache(sscreen)) {
- FREE(sscreen);
- return NULL;
- }
-
- {
-#define OPT_BOOL(name, dflt, description) \
- sscreen->options.name = \
- driQueryOptionb(config->options, "radeonsi_"#name);
+ struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
+ unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads;
+ uint64_t test_flags;
+
+ if (!sscreen) {
+ return NULL;
+ }
+
+ sscreen->ws = ws;
+ ws->query_info(ws, &sscreen->info);
+
+ if (sscreen->info.chip_class == GFX10 && LLVM_VERSION_MAJOR < 9) {
+ fprintf(stderr, "radeonsi: Navi family support requires LLVM 9 or higher\n");
+ FREE(sscreen);
+ return NULL;
+ }
+
+ if (sscreen->info.chip_class >= GFX9) {
+ sscreen->se_tile_repeat = 32 * sscreen->info.max_se;
+ } else {
+ ac_get_raster_config(&sscreen->info, &sscreen->pa_sc_raster_config,
+ &sscreen->pa_sc_raster_config_1, &sscreen->se_tile_repeat);
+ }
+
+ sscreen->debug_flags = debug_get_flags_option("R600_DEBUG", debug_options, 0);
+ sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG", debug_options, 0);
+ test_flags = debug_get_flags_option("AMD_TEST", test_options, 0);
+
+ if (sscreen->debug_flags & DBG(NO_GFX))
+ sscreen->info.has_graphics = false;
+
+ /* Set functions first. */
+ sscreen->b.context_create = si_pipe_create_context;
+ sscreen->b.destroy = si_destroy_screen;
+ sscreen->b.set_max_shader_compiler_threads = si_set_max_shader_compiler_threads;
+ sscreen->b.is_parallel_shader_compilation_finished = si_is_parallel_shader_compilation_finished;
+ sscreen->b.finalize_nir = si_finalize_nir;
+
+ si_init_screen_get_functions(sscreen);
+ si_init_screen_buffer_functions(sscreen);
+ si_init_screen_fence_functions(sscreen);
+ si_init_screen_state_functions(sscreen);
+ si_init_screen_texture_functions(sscreen);
+ si_init_screen_query_functions(sscreen);
+ si_init_screen_live_shader_cache(sscreen);
+
+ /* Set these flags in debug_flags early, so that the shader cache takes
+ * them into account.
+ */
+ if (driQueryOptionb(config->options, "glsl_correct_derivatives_after_discard"))
+ sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL);
+
+ if (sscreen->debug_flags & DBG(INFO))
+ ac_print_gpu_info(&sscreen->info);
+
+ slab_create_parent(&sscreen->pool_transfers, sizeof(struct si_transfer), 64);
+
+ sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
+ if (sscreen->force_aniso == -1) {
+ sscreen->force_aniso = MIN2(16, debug_get_num_option("AMD_TEX_ANISO", -1));
+ }
+
+ if (sscreen->force_aniso >= 0) {
+ printf("radeonsi: Forcing anisotropy filter to %ix\n",
+ /* round down to a power of two */
+ 1 << util_logbase2(sscreen->force_aniso));
+ }
+
+ (void)simple_mtx_init(&sscreen->aux_context_lock, mtx_plain);
+ (void)simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain);
+
+ si_init_gs_info(sscreen);
+ if (!si_init_shader_cache(sscreen)) {
+ FREE(sscreen);
+ return NULL;
+ }
+
+ {
+#define OPT_BOOL(name, dflt, description) \
+ sscreen->options.name = driQueryOptionb(config->options, "radeonsi_" #name);
#include "si_debug_options.h"
- }
-
- si_disk_cache_create(sscreen);
-
- /* Determine the number of shader compiler threads. */
- hw_threads = sysconf(_SC_NPROCESSORS_ONLN);
-
- if (hw_threads >= 12) {
- num_comp_hi_threads = hw_threads * 3 / 4;
- num_comp_lo_threads = hw_threads / 3;
- } else if (hw_threads >= 6) {
- num_comp_hi_threads = hw_threads - 2;
- num_comp_lo_threads = hw_threads / 2;
- } else if (hw_threads >= 2) {
- num_comp_hi_threads = hw_threads - 1;
- num_comp_lo_threads = hw_threads / 2;
- } else {
- num_comp_hi_threads = 1;
- num_comp_lo_threads = 1;
- }
-
- num_comp_hi_threads = MIN2(num_comp_hi_threads,
- ARRAY_SIZE(sscreen->compiler));
- num_comp_lo_threads = MIN2(num_comp_lo_threads,
- ARRAY_SIZE(sscreen->compiler_lowp));
-
- /* Take a reference on the glsl types for the compiler threads. */
- glsl_type_singleton_init_or_ref();
-
- if (!util_queue_init(&sscreen->shader_compiler_queue, "sh",
- 64, num_comp_hi_threads,
- UTIL_QUEUE_INIT_RESIZE_IF_FULL |
- UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) {
- si_destroy_shader_cache(sscreen);
- FREE(sscreen);
- glsl_type_singleton_decref();
- return NULL;
- }
-
- if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority,
- "shlo",
- 64, num_comp_lo_threads,
- UTIL_QUEUE_INIT_RESIZE_IF_FULL |
- UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY |
- UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
- si_destroy_shader_cache(sscreen);
- FREE(sscreen);
- glsl_type_singleton_decref();
- return NULL;
- }
-
- if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
- si_init_perfcounters(sscreen);
-
- unsigned prim_discard_vertex_count_threshold, tmp;
- si_initialize_prim_discard_tunables(sscreen, false,
- &prim_discard_vertex_count_threshold,
- &tmp);
- /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */
- if (prim_discard_vertex_count_threshold == UINT_MAX)
- sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;
-
- /* Determine tessellation ring info. */
- bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 &&
- sscreen->info.family != CHIP_CARRIZO &&
- sscreen->info.family != CHIP_STONEY;
- /* This must be one less than the maximum number due to a hw limitation.
- * Various hardware bugs need this.
- */
- unsigned max_offchip_buffers_per_se;
-
- if (sscreen->info.chip_class >= GFX10)
- max_offchip_buffers_per_se = 256;
- /* Only certain chips can use the maximum value. */
- else if (sscreen->info.family == CHIP_VEGA12 ||
- sscreen->info.family == CHIP_VEGA20)
- max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
- else
- max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63;
-
- unsigned max_offchip_buffers = max_offchip_buffers_per_se *
- sscreen->info.max_se;
- unsigned offchip_granularity;
-
- /* Hawaii has a bug with offchip buffers > 256 that can be worked
- * around by setting 4K granularity.
- */
- if (sscreen->info.family == CHIP_HAWAII) {
- sscreen->tess_offchip_block_dw_size = 4096;
- offchip_granularity = V_03093C_X_4K_DWORDS;
- } else {
- sscreen->tess_offchip_block_dw_size = 8192;
- offchip_granularity = V_03093C_X_8K_DWORDS;
- }
-
- sscreen->tess_factor_ring_size = 32768 * sscreen->info.max_se;
- sscreen->tess_offchip_ring_size = max_offchip_buffers *
- sscreen->tess_offchip_block_dw_size * 4;
-
- if (sscreen->info.chip_class >= GFX7) {
- if (sscreen->info.chip_class >= GFX8)
- --max_offchip_buffers;
- sscreen->vgt_hs_offchip_param =
- S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) |
- S_03093C_OFFCHIP_GRANULARITY(offchip_granularity);
- } else {
- assert(offchip_granularity == V_03093C_X_8K_DWORDS);
- sscreen->vgt_hs_offchip_param =
- S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers);
- }
-
- sscreen->has_draw_indirect_multi =
- (sscreen->info.family >= CHIP_POLARIS10) ||
- (sscreen->info.chip_class == GFX8 &&
- sscreen->info.pfp_fw_version >= 121 &&
- sscreen->info.me_fw_version >= 87) ||
- (sscreen->info.chip_class == GFX7 &&
- sscreen->info.pfp_fw_version >= 211 &&
- sscreen->info.me_fw_version >= 173) ||
- (sscreen->info.chip_class == GFX6 &&
- sscreen->info.pfp_fw_version >= 79 &&
- sscreen->info.me_fw_version >= 142);
-
- sscreen->has_out_of_order_rast = sscreen->info.has_out_of_order_rast &&
- !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER));
- sscreen->assume_no_z_fights =
- driQueryOptionb(config->options, "radeonsi_assume_no_z_fights") ||
- driQueryOptionb(config->options, "allow_draw_out_of_order");
- sscreen->commutative_blend_add =
- driQueryOptionb(config->options, "radeonsi_commutative_blend_add") ||
- driQueryOptionb(config->options, "allow_draw_out_of_order");
-
- sscreen->use_ngg = sscreen->info.chip_class >= GFX10 &&
- sscreen->info.family != CHIP_NAVI14 &&
- !(sscreen->debug_flags & DBG(NO_NGG));
- sscreen->use_ngg_culling = sscreen->use_ngg &&
- !(sscreen->debug_flags & DBG(NO_NGG_CULLING));
- sscreen->always_use_ngg_culling = sscreen->use_ngg_culling &&
- sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING);
- sscreen->use_ngg_streamout = false;
-
- /* Only enable primitive binning on APUs by default. */
- if (sscreen->info.chip_class >= GFX10) {
- sscreen->dpbb_allowed = true;
- sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram;
- } else if (sscreen->info.chip_class == GFX9) {
- sscreen->dpbb_allowed = !sscreen->info.has_dedicated_vram;
- sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram;
- }
-
- /* Process DPBB enable flags. */
- if (sscreen->debug_flags & DBG(DPBB)) {
- sscreen->dpbb_allowed = true;
- if (sscreen->debug_flags & DBG(DFSM))
- sscreen->dfsm_allowed = true;
- }
-
- /* Process DPBB disable flags. */
- if (sscreen->debug_flags & DBG(NO_DPBB)) {
- sscreen->dpbb_allowed = false;
- sscreen->dfsm_allowed = false;
- } else if (sscreen->debug_flags & DBG(NO_DFSM)) {
- sscreen->dfsm_allowed = false;
- }
-
- /* While it would be nice not to have this flag, we are constrained
- * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9.
- */
- sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class != GFX9;
-
- sscreen->dcc_msaa_allowed =
- !(sscreen->debug_flags & DBG(NO_DCC_MSAA));
-
- (void) simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain);
- sscreen->use_monolithic_shaders =
- (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;
-
- sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE |
- SI_CONTEXT_INV_VCACHE;
- if (sscreen->info.chip_class <= GFX8) {
- sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2;
- sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2;
- }
-
- if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
- sscreen->debug_flags |= DBG_ALL_SHADERS;
-
- /* Syntax:
- * EQAA=s,z,c
- * Example:
- * EQAA=8,4,2
-
- * That means 8 coverage samples, 4 Z/S samples, and 2 color samples.
- * Constraints:
- * s >= z >= c (ignoring this only wastes memory)
- * s = [2..16]
- * z = [2..8]
- * c = [2..8]
- *
- * Only MSAA color and depth buffers are overriden.
- */
- if (sscreen->info.has_eqaa_surface_allocator) {
- const char *eqaa = debug_get_option("EQAA", NULL);
- unsigned s,z,f;
-
- if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) {
- sscreen->eqaa_force_coverage_samples = s;
- sscreen->eqaa_force_z_samples = z;
- sscreen->eqaa_force_color_samples = f;
- }
- }
-
- sscreen->ge_wave_size = 64;
- sscreen->ps_wave_size = 64;
- sscreen->compute_wave_size = 64;
-
- if (sscreen->info.chip_class >= GFX10) {
- /* Pixels shaders: Wave64 is recommended.
- * Compute shaders: There are piglit failures with Wave32.
- */
- sscreen->ge_wave_size = 32;
-
- if (sscreen->debug_flags & DBG(W32_GE))
- sscreen->ge_wave_size = 32;
- if (sscreen->debug_flags & DBG(W32_PS))
- sscreen->ps_wave_size = 32;
- if (sscreen->debug_flags & DBG(W32_CS))
- sscreen->compute_wave_size = 32;
-
- if (sscreen->debug_flags & DBG(W64_GE))
- sscreen->ge_wave_size = 64;
- if (sscreen->debug_flags & DBG(W64_PS))
- sscreen->ps_wave_size = 64;
- if (sscreen->debug_flags & DBG(W64_CS))
- sscreen->compute_wave_size = 64;
- }
-
- /* Create the auxiliary context. This must be done last. */
- sscreen->aux_context = si_create_context(&sscreen->b,
- (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) |
- (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY));
- if (sscreen->options.aux_debug) {
- struct u_log_context *log = CALLOC_STRUCT(u_log_context);
- u_log_context_init(log);
- sscreen->aux_context->set_log_context(sscreen->aux_context, log);
- }
-
- if (test_flags & DBG(TEST_DMA))
- si_test_dma(sscreen);
-
- if (test_flags & DBG(TEST_DMA_PERF)) {
- si_test_dma_perf(sscreen);
- }
-
- if (test_flags & (DBG(TEST_VMFAULT_CP) |
- DBG(TEST_VMFAULT_SDMA) |
- DBG(TEST_VMFAULT_SHADER)))
- si_test_vmfault(sscreen, test_flags);
-
- if (test_flags & DBG(TEST_GDS))
- si_test_gds((struct si_context*)sscreen->aux_context);
-
- if (test_flags & DBG(TEST_GDS_MM)) {
- si_test_gds_memory_management((struct si_context*)sscreen->aux_context,
- 32 * 1024, 4, RADEON_DOMAIN_GDS);
- }
- if (test_flags & DBG(TEST_GDS_OA_MM)) {
- si_test_gds_memory_management((struct si_context*)sscreen->aux_context,
- 4, 1, RADEON_DOMAIN_OA);
- }
-
- STATIC_ASSERT(sizeof(union si_vgt_stages_key) == 4);
- return &sscreen->b;
+ }
+
+ si_disk_cache_create(sscreen);
+
+ /* Determine the number of shader compiler threads. */
+ hw_threads = sysconf(_SC_NPROCESSORS_ONLN);
+
+ if (hw_threads >= 12) {
+ num_comp_hi_threads = hw_threads * 3 / 4;
+ num_comp_lo_threads = hw_threads / 3;
+ } else if (hw_threads >= 6) {
+ num_comp_hi_threads = hw_threads - 2;
+ num_comp_lo_threads = hw_threads / 2;
+ } else if (hw_threads >= 2) {
+ num_comp_hi_threads = hw_threads - 1;
+ num_comp_lo_threads = hw_threads / 2;
+ } else {
+ num_comp_hi_threads = 1;
+ num_comp_lo_threads = 1;
+ }
+
+ num_comp_hi_threads = MIN2(num_comp_hi_threads, ARRAY_SIZE(sscreen->compiler));
+ num_comp_lo_threads = MIN2(num_comp_lo_threads, ARRAY_SIZE(sscreen->compiler_lowp));
+
+ /* Take a reference on the glsl types for the compiler threads. */
+ glsl_type_singleton_init_or_ref();
+
+ if (!util_queue_init(
+ &sscreen->shader_compiler_queue, "sh", 64, num_comp_hi_threads,
+ UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) {
+ si_destroy_shader_cache(sscreen);
+ FREE(sscreen);
+ glsl_type_singleton_decref();
+ return NULL;
+ }
+
+ if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority, "shlo", 64,
+ num_comp_lo_threads,
+ UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY |
+ UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
+ si_destroy_shader_cache(sscreen);
+ FREE(sscreen);
+ glsl_type_singleton_decref();
+ return NULL;
+ }
+
+ if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
+ si_init_perfcounters(sscreen);
+
+ unsigned prim_discard_vertex_count_threshold, tmp;
+ si_initialize_prim_discard_tunables(sscreen, false, &prim_discard_vertex_count_threshold, &tmp);
+ /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */
+ if (prim_discard_vertex_count_threshold == UINT_MAX)
+ sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;
+
+ /* Determine tessellation ring info. */
+ bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 &&
+ sscreen->info.family != CHIP_CARRIZO &&
+ sscreen->info.family != CHIP_STONEY;
+ /* This must be one less than the maximum number due to a hw limitation.
+ * Various hardware bugs need this.
+ */
+ unsigned max_offchip_buffers_per_se;
+
+ if (sscreen->info.chip_class >= GFX10)
+ max_offchip_buffers_per_se = 256;
+ /* Only certain chips can use the maximum value. */
+ else if (sscreen->info.family == CHIP_VEGA12 || sscreen->info.family == CHIP_VEGA20)
+ max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
+ else
+ max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63;
+
+ unsigned max_offchip_buffers = max_offchip_buffers_per_se * sscreen->info.max_se;
+ unsigned offchip_granularity;
+
+ /* Hawaii has a bug with offchip buffers > 256 that can be worked
+ * around by setting 4K granularity.
+ */
+ if (sscreen->info.family == CHIP_HAWAII) {
+ sscreen->tess_offchip_block_dw_size = 4096;
+ offchip_granularity = V_03093C_X_4K_DWORDS;
+ } else {
+ sscreen->tess_offchip_block_dw_size = 8192;
+ offchip_granularity = V_03093C_X_8K_DWORDS;
+ }
+
+ sscreen->tess_factor_ring_size = 32768 * sscreen->info.max_se;
+ sscreen->tess_offchip_ring_size = max_offchip_buffers * sscreen->tess_offchip_block_dw_size * 4;
+
+ if (sscreen->info.chip_class >= GFX7) {
+ if (sscreen->info.chip_class >= GFX8)
+ --max_offchip_buffers;
+ sscreen->vgt_hs_offchip_param = S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) |
+ S_03093C_OFFCHIP_GRANULARITY(offchip_granularity);
+ } else {
+ assert(offchip_granularity == V_03093C_X_8K_DWORDS);
+ sscreen->vgt_hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers);
+ }
+
+ sscreen->has_draw_indirect_multi =
+ (sscreen->info.family >= CHIP_POLARIS10) ||
+ (sscreen->info.chip_class == GFX8 && sscreen->info.pfp_fw_version >= 121 &&
+ sscreen->info.me_fw_version >= 87) ||
+ (sscreen->info.chip_class == GFX7 && sscreen->info.pfp_fw_version >= 211 &&
+ sscreen->info.me_fw_version >= 173) ||
+ (sscreen->info.chip_class == GFX6 && sscreen->info.pfp_fw_version >= 79 &&
+ sscreen->info.me_fw_version >= 142);
+
+ sscreen->has_out_of_order_rast =
+ sscreen->info.has_out_of_order_rast && !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER));
+ sscreen->assume_no_z_fights = driQueryOptionb(config->options, "radeonsi_assume_no_z_fights") ||
+ driQueryOptionb(config->options, "allow_draw_out_of_order");
+ sscreen->commutative_blend_add =
+ driQueryOptionb(config->options, "radeonsi_commutative_blend_add") ||
+ driQueryOptionb(config->options, "allow_draw_out_of_order");
+
+ sscreen->use_ngg = sscreen->info.chip_class >= GFX10 && sscreen->info.family != CHIP_NAVI14 &&
+ !(sscreen->debug_flags & DBG(NO_NGG));
+ sscreen->use_ngg_culling = sscreen->use_ngg && !(sscreen->debug_flags & DBG(NO_NGG_CULLING));
+ sscreen->always_use_ngg_culling =
+ sscreen->use_ngg_culling && sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING);
+ sscreen->use_ngg_streamout = false;
+
+ /* Only enable primitive binning on APUs by default. */
+ if (sscreen->info.chip_class >= GFX10) {
+ sscreen->dpbb_allowed = true;
+ sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram;
+ } else if (sscreen->info.chip_class == GFX9) {
+ sscreen->dpbb_allowed = !sscreen->info.has_dedicated_vram;
+ sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram;
+ }
+
+ /* Process DPBB enable flags. */
+ if (sscreen->debug_flags & DBG(DPBB)) {
+ sscreen->dpbb_allowed = true;
+ if (sscreen->debug_flags & DBG(DFSM))
+ sscreen->dfsm_allowed = true;
+ }
+
+ /* Process DPBB disable flags. */
+ if (sscreen->debug_flags & DBG(NO_DPBB)) {
+ sscreen->dpbb_allowed = false;
+ sscreen->dfsm_allowed = false;
+ } else if (sscreen->debug_flags & DBG(NO_DFSM)) {
+ sscreen->dfsm_allowed = false;
+ }
+
+ /* While it would be nice not to have this flag, we are constrained
+ * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9.
+ */
+ sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class != GFX9;
+
+ sscreen->dcc_msaa_allowed = !(sscreen->debug_flags & DBG(NO_DCC_MSAA));
+
+ (void)simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain);
+ sscreen->use_monolithic_shaders = (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;
+
+ sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
+ if (sscreen->info.chip_class <= GFX8) {
+ sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2;
+ sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2;
+ }
+
+ if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
+ sscreen->debug_flags |= DBG_ALL_SHADERS;
+
+ /* Syntax:
+ * EQAA=s,z,c
+ * Example:
+ * EQAA=8,4,2
+ *
+ * That means 8 coverage samples, 4 Z/S samples, and 2 color samples.
+ * Constraints:
+ * s >= z >= c (ignoring this only wastes memory)
+ * s = [2..16]
+ * z = [2..8]
+ * c = [2..8]
+ *
+ * Only MSAA color and depth buffers are overridden.
+ */
+ if (sscreen->info.has_eqaa_surface_allocator) {
+ const char *eqaa = debug_get_option("EQAA", NULL);
+ unsigned s, z, f;
+
+ if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) {
+ sscreen->eqaa_force_coverage_samples = s;
+ sscreen->eqaa_force_z_samples = z;
+ sscreen->eqaa_force_color_samples = f;
+ }
+ }
+
+ sscreen->ge_wave_size = 64;
+ sscreen->ps_wave_size = 64;
+ sscreen->compute_wave_size = 64;
+
+ if (sscreen->info.chip_class >= GFX10) {
+ /* Pixel shaders: Wave64 is recommended.
+ * Compute shaders: There are piglit failures with Wave32.
+ */
+ sscreen->ge_wave_size = 32;
+
+ if (sscreen->debug_flags & DBG(W32_GE))
+ sscreen->ge_wave_size = 32;
+ if (sscreen->debug_flags & DBG(W32_PS))
+ sscreen->ps_wave_size = 32;
+ if (sscreen->debug_flags & DBG(W32_CS))
+ sscreen->compute_wave_size = 32;
+
+ if (sscreen->debug_flags & DBG(W64_GE))
+ sscreen->ge_wave_size = 64;
+ if (sscreen->debug_flags & DBG(W64_PS))
+ sscreen->ps_wave_size = 64;
+ if (sscreen->debug_flags & DBG(W64_CS))
+ sscreen->compute_wave_size = 64;
+ }
+
+ /* Create the auxiliary context. This must be done last. */
+ sscreen->aux_context = si_create_context(
+ &sscreen->b, (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) |
+ (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY));
+ if (sscreen->options.aux_debug) {
+ struct u_log_context *log = CALLOC_STRUCT(u_log_context);
+ u_log_context_init(log);
+ sscreen->aux_context->set_log_context(sscreen->aux_context, log);
+ }
+
+ if (test_flags & DBG(TEST_DMA))
+ si_test_dma(sscreen);
+
+ if (test_flags & DBG(TEST_DMA_PERF)) {
+ si_test_dma_perf(sscreen);
+ }
+
+ if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SDMA) | DBG(TEST_VMFAULT_SHADER)))
+ si_test_vmfault(sscreen, test_flags);
+
+ if (test_flags & DBG(TEST_GDS))
+ si_test_gds((struct si_context *)sscreen->aux_context);
+
+ if (test_flags & DBG(TEST_GDS_MM)) {
+ si_test_gds_memory_management((struct si_context *)sscreen->aux_context, 32 * 1024, 4,
+ RADEON_DOMAIN_GDS);
+ }
+ if (test_flags & DBG(TEST_GDS_OA_MM)) {
+ si_test_gds_memory_management((struct si_context *)sscreen->aux_context, 4, 1,
+ RADEON_DOMAIN_OA);
+ }
+
+ STATIC_ASSERT(sizeof(union si_vgt_stages_key) == 4);
+ return &sscreen->b;
}
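As a worked example of the compiler-thread heuristic above (illustrative arithmetic only; the actual values depend on the machine):

   /* hw_threads = 16:  num_comp_hi_threads = 16 * 3 / 4 = 12
    *                   num_comp_lo_threads = 16 / 3     = 5
    * hw_threads = 8:   num_comp_hi_threads = 8 - 2      = 6
    *                   num_comp_lo_threads = 8 / 2      = 4
    * Both results are then clamped to ARRAY_SIZE(sscreen->compiler) and
    * ARRAY_SIZE(sscreen->compiler_lowp) before the two util_queues are created.
    */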
struct pipe_screen *radeonsi_screen_create(int fd, const struct pipe_screen_config *config)
{
- drmVersionPtr version = drmGetVersion(fd);
- struct radeon_winsys *rw = NULL;
-
- switch (version->version_major) {
- case 2:
- rw = radeon_drm_winsys_create(fd, config, radeonsi_screen_create_impl);
- break;
- case 3:
- rw = amdgpu_winsys_create(fd, config, radeonsi_screen_create_impl);
- break;
- }
-
- drmFreeVersion(version);
- return rw ? rw->screen : NULL;
+ drmVersionPtr version = drmGetVersion(fd);
+ struct radeon_winsys *rw = NULL;
+
+ switch (version->version_major) {
+ case 2:
+ rw = radeon_drm_winsys_create(fd, config, radeonsi_screen_create_impl);
+ break;
+ case 3:
+ rw = amdgpu_winsys_create(fd, config, radeonsi_screen_create_impl);
+ break;
+ }
+
+ drmFreeVersion(version);
+ return rw ? rw->screen : NULL;
}
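A minimal calling sketch for the entry point above; the render-node path, the open flags, and the config pointer are assumptions standing in for what the gallium loader normally supplies:

   #include <fcntl.h>

   int fd = open("/dev/dri/renderD128", O_RDWR | O_CLOEXEC);       /* node path is an assumption */
   struct pipe_screen *screen = radeonsi_screen_create(fd, config); /* config: loader-provided pipe_screen_config */
   /* DRM major version 2 selects the radeon winsys, 3 selects amdgpu, as in the switch above. */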
#include "si_shader.h"
#include "si_state.h"
-
#include "util/u_dynarray.h"
#include "util/u_idalloc.h"
#include "util/u_threaded_context.h"
#define SI_BIG_ENDIAN 0
#endif
-#define ATI_VENDOR_ID 0x1002
-#define SI_PRIM_DISCARD_DEBUG 0
-#define SI_NOT_QUERY 0xffffffff
+#define ATI_VENDOR_ID 0x1002
+#define SI_PRIM_DISCARD_DEBUG 0
+#define SI_NOT_QUERY 0xffffffff
/* The base vertex and primitive restart can be any number, but we must pick
* one which will mean "unknown" for the purpose of state tracking and
* the number shouldn't be a commonly-used one. */
-#define SI_BASE_VERTEX_UNKNOWN INT_MIN
-#define SI_RESTART_INDEX_UNKNOWN INT_MIN
-#define SI_INSTANCE_COUNT_UNKNOWN INT_MIN
-#define SI_NUM_SMOOTH_AA_SAMPLES 8
-#define SI_MAX_POINT_SIZE 2048
-#define SI_GS_PER_ES 128
+#define SI_BASE_VERTEX_UNKNOWN INT_MIN
+#define SI_RESTART_INDEX_UNKNOWN INT_MIN
+#define SI_INSTANCE_COUNT_UNKNOWN INT_MIN
+#define SI_NUM_SMOOTH_AA_SAMPLES 8
+#define SI_MAX_POINT_SIZE 2048
+#define SI_GS_PER_ES 128
/* Alignment for optimal CP DMA performance. */
-#define SI_CPDMA_ALIGNMENT 32
+#define SI_CPDMA_ALIGNMENT 32
/* Tunables for compute-based clear_buffer and copy_buffer: */
-#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
-#define SI_COMPUTE_COPY_DW_PER_THREAD 4
-#define SI_COMPUTE_DST_CACHE_POLICY L2_STREAM
+#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
+#define SI_COMPUTE_COPY_DW_PER_THREAD 4
+#define SI_COMPUTE_DST_CACHE_POLICY L2_STREAM
/* Pipeline & streamout query controls. */
-#define SI_CONTEXT_START_PIPELINE_STATS (1 << 0)
-#define SI_CONTEXT_STOP_PIPELINE_STATS (1 << 1)
+#define SI_CONTEXT_START_PIPELINE_STATS (1 << 0)
+#define SI_CONTEXT_STOP_PIPELINE_STATS (1 << 1)
#define SI_CONTEXT_FLUSH_FOR_RENDER_COND (1 << 2)
/* Instruction cache. */
-#define SI_CONTEXT_INV_ICACHE (1 << 3)
+#define SI_CONTEXT_INV_ICACHE (1 << 3)
/* Scalar cache. (GFX6-9: scalar L1; GFX10: scalar L0)
* GFX10: This also invalidates the L1 shader array cache. */
-#define SI_CONTEXT_INV_SCACHE (1 << 4)
+#define SI_CONTEXT_INV_SCACHE (1 << 4)
/* Vector cache. (GFX6-9: vector L1; GFX10: vector L0)
* GFX10: This also invalidates the L1 shader array cache. */
-#define SI_CONTEXT_INV_VCACHE (1 << 5)
+#define SI_CONTEXT_INV_VCACHE (1 << 5)
/* L2 cache + L2 metadata cache writeback & invalidate.
* GFX6-8: Used by shaders only. GFX9-10: Used by everything. */
-#define SI_CONTEXT_INV_L2 (1 << 6)
+#define SI_CONTEXT_INV_L2 (1 << 6)
/* L2 writeback (write dirty L2 lines to memory for non-L2 clients).
* Only used for coherency with non-L2 clients like CB, DB, CP on GFX6-8.
* GFX6-7 will do complete invalidation, because the writeback is unsupported. */
-#define SI_CONTEXT_WB_L2 (1 << 7)
+#define SI_CONTEXT_WB_L2 (1 << 7)
/* Writeback & invalidate the L2 metadata cache only. It can only be coupled with
* a CB or DB flush. */
-#define SI_CONTEXT_INV_L2_METADATA (1 << 8)
+#define SI_CONTEXT_INV_L2_METADATA (1 << 8)
/* Framebuffer caches. */
-#define SI_CONTEXT_FLUSH_AND_INV_DB (1 << 9)
+#define SI_CONTEXT_FLUSH_AND_INV_DB (1 << 9)
#define SI_CONTEXT_FLUSH_AND_INV_DB_META (1 << 10)
-#define SI_CONTEXT_FLUSH_AND_INV_CB (1 << 11)
+#define SI_CONTEXT_FLUSH_AND_INV_CB (1 << 11)
/* Engine synchronization. */
-#define SI_CONTEXT_VS_PARTIAL_FLUSH (1 << 12)
-#define SI_CONTEXT_PS_PARTIAL_FLUSH (1 << 13)
-#define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14)
-#define SI_CONTEXT_VGT_FLUSH (1 << 15)
-#define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16)
-
-#define SI_PREFETCH_VBO_DESCRIPTORS (1 << 0)
-#define SI_PREFETCH_LS (1 << 1)
-#define SI_PREFETCH_HS (1 << 2)
-#define SI_PREFETCH_ES (1 << 3)
-#define SI_PREFETCH_GS (1 << 4)
-#define SI_PREFETCH_VS (1 << 5)
-#define SI_PREFETCH_PS (1 << 6)
-
-#define SI_MAX_BORDER_COLORS 4096
-#define SI_MAX_VIEWPORTS 16
-#define SIX_BITS 0x3F
-#define SI_MAP_BUFFER_ALIGNMENT 64
+#define SI_CONTEXT_VS_PARTIAL_FLUSH (1 << 12)
+#define SI_CONTEXT_PS_PARTIAL_FLUSH (1 << 13)
+#define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14)
+#define SI_CONTEXT_VGT_FLUSH (1 << 15)
+#define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16)
+
+#define SI_PREFETCH_VBO_DESCRIPTORS (1 << 0)
+#define SI_PREFETCH_LS (1 << 1)
+#define SI_PREFETCH_HS (1 << 2)
+#define SI_PREFETCH_ES (1 << 3)
+#define SI_PREFETCH_GS (1 << 4)
+#define SI_PREFETCH_VS (1 << 5)
+#define SI_PREFETCH_PS (1 << 6)
+
+#define SI_MAX_BORDER_COLORS 4096
+#define SI_MAX_VIEWPORTS 16
+#define SIX_BITS 0x3F
+#define SI_MAP_BUFFER_ALIGNMENT 64
#define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024
-#define SI_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
-#define SI_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
+#define SI_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
+#define SI_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
#define SI_RESOURCE_FLAG_FORCE_MSAA_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
-#define SI_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
-#define SI_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
-#define SI_RESOURCE_FLAG_READ_ONLY (PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
-#define SI_RESOURCE_FLAG_32BIT (PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
-#define SI_RESOURCE_FLAG_CLEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
+#define SI_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
+#define SI_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
+#define SI_RESOURCE_FLAG_READ_ONLY (PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
+#define SI_RESOURCE_FLAG_32BIT (PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
+#define SI_RESOURCE_FLAG_CLEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
/* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */
-#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8)
+#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8)
/* Set a micro tile mode: */
-#define SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE (PIPE_RESOURCE_FLAG_DRV_PRIV << 9)
-#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT (util_logbase2(PIPE_RESOURCE_FLAG_DRV_PRIV) + 10)
-#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(x) (((x) & 0x3) << SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT)
-#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(x) (((x) >> SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) & 0x3)
+#define SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE (PIPE_RESOURCE_FLAG_DRV_PRIV << 9)
+#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT (util_logbase2(PIPE_RESOURCE_FLAG_DRV_PRIV) + 10)
+#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(x) \
+ (((x)&0x3) << SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT)
+#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(x) \
+ (((x) >> SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) & 0x3)
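For illustration, the SET/GET pair above round-trips a 2-bit micro tile mode through the upper resource-flag bits (the value 2 is arbitrary):

   unsigned flags = SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE |
                    SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(2);
   assert(SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(flags) == 2); /* the 2-bit field does not overlap the FORCE bit */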
enum si_clear_code
{
- DCC_CLEAR_COLOR_0000 = 0x00000000,
- DCC_CLEAR_COLOR_0001 = 0x40404040,
- DCC_CLEAR_COLOR_1110 = 0x80808080,
- DCC_CLEAR_COLOR_1111 = 0xC0C0C0C0,
- DCC_CLEAR_COLOR_REG = 0x20202020,
- DCC_UNCOMPRESSED = 0xFFFFFFFF,
+ DCC_CLEAR_COLOR_0000 = 0x00000000,
+ DCC_CLEAR_COLOR_0001 = 0x40404040,
+ DCC_CLEAR_COLOR_1110 = 0x80808080,
+ DCC_CLEAR_COLOR_1111 = 0xC0C0C0C0,
+ DCC_CLEAR_COLOR_REG = 0x20202020,
+ DCC_UNCOMPRESSED = 0xFFFFFFFF,
};
-#define SI_IMAGE_ACCESS_AS_BUFFER (1 << 7)
+#define SI_IMAGE_ACCESS_AS_BUFFER (1 << 7)
/* Debug flags. */
-enum {
- /* Shader logging options: */
- DBG_VS = PIPE_SHADER_VERTEX,
- DBG_PS = PIPE_SHADER_FRAGMENT,
- DBG_GS = PIPE_SHADER_GEOMETRY,
- DBG_TCS = PIPE_SHADER_TESS_CTRL,
- DBG_TES = PIPE_SHADER_TESS_EVAL,
- DBG_CS = PIPE_SHADER_COMPUTE,
- DBG_NO_IR,
- DBG_NO_NIR,
- DBG_NO_ASM,
- DBG_PREOPT_IR,
-
- /* Shader compiler options the shader cache should be aware of: */
- DBG_FS_CORRECT_DERIVS_AFTER_KILL,
- DBG_GISEL,
- DBG_W32_GE,
- DBG_W32_PS,
- DBG_W32_CS,
- DBG_W64_GE,
- DBG_W64_PS,
- DBG_W64_CS,
-
- /* Shader compiler options (with no effect on the shader cache): */
- DBG_CHECK_IR,
- DBG_MONOLITHIC_SHADERS,
- DBG_NO_OPT_VARIANT,
-
- /* Information logging options: */
- DBG_INFO,
- DBG_TEX,
- DBG_COMPUTE,
- DBG_VM,
- DBG_CACHE_STATS,
-
- /* Driver options: */
- DBG_FORCE_SDMA,
- DBG_NO_SDMA,
- DBG_NO_SDMA_CLEARS,
- DBG_NO_SDMA_COPY_IMAGE,
- DBG_NO_WC,
- DBG_CHECK_VM,
- DBG_RESERVE_VMID,
- DBG_ZERO_VRAM,
-
- /* 3D engine options: */
- DBG_NO_GFX,
- DBG_NO_NGG,
- DBG_ALWAYS_NGG_CULLING,
- DBG_NO_NGG_CULLING,
- DBG_ALWAYS_PD,
- DBG_PD,
- DBG_NO_PD,
- DBG_SWITCH_ON_EOP,
- DBG_NO_OUT_OF_ORDER,
- DBG_NO_DPBB,
- DBG_NO_DFSM,
- DBG_DPBB,
- DBG_DFSM,
- DBG_NO_HYPERZ,
- DBG_NO_RB_PLUS,
- DBG_NO_2D_TILING,
- DBG_NO_TILING,
- DBG_NO_DCC,
- DBG_NO_DCC_CLEAR,
- DBG_NO_DCC_FB,
- DBG_NO_DCC_MSAA,
- DBG_NO_FMASK,
-
- DBG_COUNT
+enum
+{
+ /* Shader logging options: */
+ DBG_VS = PIPE_SHADER_VERTEX,
+ DBG_PS = PIPE_SHADER_FRAGMENT,
+ DBG_GS = PIPE_SHADER_GEOMETRY,
+ DBG_TCS = PIPE_SHADER_TESS_CTRL,
+ DBG_TES = PIPE_SHADER_TESS_EVAL,
+ DBG_CS = PIPE_SHADER_COMPUTE,
+ DBG_NO_IR,
+ DBG_NO_NIR,
+ DBG_NO_ASM,
+ DBG_PREOPT_IR,
+
+ /* Shader compiler options the shader cache should be aware of: */
+ DBG_FS_CORRECT_DERIVS_AFTER_KILL,
+ DBG_GISEL,
+ DBG_W32_GE,
+ DBG_W32_PS,
+ DBG_W32_CS,
+ DBG_W64_GE,
+ DBG_W64_PS,
+ DBG_W64_CS,
+
+ /* Shader compiler options (with no effect on the shader cache): */
+ DBG_CHECK_IR,
+ DBG_MONOLITHIC_SHADERS,
+ DBG_NO_OPT_VARIANT,
+
+ /* Information logging options: */
+ DBG_INFO,
+ DBG_TEX,
+ DBG_COMPUTE,
+ DBG_VM,
+ DBG_CACHE_STATS,
+
+ /* Driver options: */
+ DBG_FORCE_SDMA,
+ DBG_NO_SDMA,
+ DBG_NO_SDMA_CLEARS,
+ DBG_NO_SDMA_COPY_IMAGE,
+ DBG_NO_WC,
+ DBG_CHECK_VM,
+ DBG_RESERVE_VMID,
+ DBG_ZERO_VRAM,
+
+ /* 3D engine options: */
+ DBG_NO_GFX,
+ DBG_NO_NGG,
+ DBG_ALWAYS_NGG_CULLING,
+ DBG_NO_NGG_CULLING,
+ DBG_ALWAYS_PD,
+ DBG_PD,
+ DBG_NO_PD,
+ DBG_SWITCH_ON_EOP,
+ DBG_NO_OUT_OF_ORDER,
+ DBG_NO_DPBB,
+ DBG_NO_DFSM,
+ DBG_DPBB,
+ DBG_DFSM,
+ DBG_NO_HYPERZ,
+ DBG_NO_RB_PLUS,
+ DBG_NO_2D_TILING,
+ DBG_NO_TILING,
+ DBG_NO_DCC,
+ DBG_NO_DCC_CLEAR,
+ DBG_NO_DCC_FB,
+ DBG_NO_DCC_MSAA,
+ DBG_NO_FMASK,
+
+ DBG_COUNT
};
-enum {
- /* Tests: */
- DBG_TEST_DMA,
- DBG_TEST_VMFAULT_CP,
- DBG_TEST_VMFAULT_SDMA,
- DBG_TEST_VMFAULT_SHADER,
- DBG_TEST_DMA_PERF,
- DBG_TEST_GDS,
- DBG_TEST_GDS_MM,
- DBG_TEST_GDS_OA_MM,
+enum
+{
+ /* Tests: */
+ DBG_TEST_DMA,
+ DBG_TEST_VMFAULT_CP,
+ DBG_TEST_VMFAULT_SDMA,
+ DBG_TEST_VMFAULT_SHADER,
+ DBG_TEST_DMA_PERF,
+ DBG_TEST_GDS,
+ DBG_TEST_GDS_MM,
+ DBG_TEST_GDS_OA_MM,
};
-#define DBG_ALL_SHADERS (((1 << (DBG_CS + 1)) - 1))
-#define DBG(name) (1ull << DBG_##name)
+#define DBG_ALL_SHADERS (((1 << (DBG_CS + 1)) - 1))
+#define DBG(name) (1ull << DBG_##name)
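With the Gallium stage enums (PIPE_SHADER_VERTEX through PIPE_SHADER_COMPUTE being 0..5), the two macros above expand as follows; this is a worked expansion of the existing definitions, not new ones:

   /* DBG_ALL_SHADERS = (1 << (DBG_CS + 1)) - 1 = (1 << 6) - 1 = 0x3f   -> one bit per shader stage
    * DBG(name)       = 1ull << DBG_##name, e.g. DBG(GISEL) tests/sets the single DBG_GISEL bit */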
-enum si_cache_policy {
- L2_BYPASS,
- L2_STREAM, /* same as SLC=1 */
- L2_LRU, /* same as SLC=0 */
+enum si_cache_policy
+{
+ L2_BYPASS,
+ L2_STREAM, /* same as SLC=1 */
+ L2_LRU, /* same as SLC=0 */
};
-enum si_coherency {
- SI_COHERENCY_NONE, /* no cache flushes needed */
- SI_COHERENCY_SHADER,
- SI_COHERENCY_CB_META,
- SI_COHERENCY_CP,
+enum si_coherency
+{
+ SI_COHERENCY_NONE, /* no cache flushes needed */
+ SI_COHERENCY_SHADER,
+ SI_COHERENCY_CB_META,
+ SI_COHERENCY_CP,
};
struct si_compute;
* at the moment.
*/
struct si_resource {
- struct threaded_resource b;
-
- /* Winsys objects. */
- struct pb_buffer *buf;
- uint64_t gpu_address;
- /* Memory usage if the buffer placement is optimal. */
- uint64_t vram_usage;
- uint64_t gart_usage;
-
- /* Resource properties. */
- uint64_t bo_size;
- unsigned bo_alignment;
- enum radeon_bo_domain domains;
- enum radeon_bo_flag flags;
- unsigned bind_history;
- int max_forced_staging_uploads;
-
- /* The buffer range which is initialized (with a write transfer,
- * streamout, DMA, or as a random access target). The rest of
- * the buffer is considered invalid and can be mapped unsynchronized.
- *
- * This allows unsychronized mapping of a buffer range which hasn't
- * been used yet. It's for applications which forget to use
- * the unsynchronized map flag and expect the driver to figure it out.
- */
- struct util_range valid_buffer_range;
-
- /* For buffers only. This indicates that a write operation has been
- * performed by TC L2, but the cache hasn't been flushed.
- * Any hw block which doesn't use or bypasses TC L2 should check this
- * flag and flush the cache before using the buffer.
- *
- * For example, TC L2 must be flushed if a buffer which has been
- * modified by a shader store instruction is about to be used as
- * an index buffer. The reason is that VGT DMA index fetching doesn't
- * use TC L2.
- */
- bool TC_L2_dirty;
-
- /* Whether this resource is referenced by bindless handles. */
- bool texture_handle_allocated;
- bool image_handle_allocated;
-
- /* Whether the resource has been exported via resource_get_handle. */
- unsigned external_usage; /* PIPE_HANDLE_USAGE_* */
+ struct threaded_resource b;
+
+ /* Winsys objects. */
+ struct pb_buffer *buf;
+ uint64_t gpu_address;
+ /* Memory usage if the buffer placement is optimal. */
+ uint64_t vram_usage;
+ uint64_t gart_usage;
+
+ /* Resource properties. */
+ uint64_t bo_size;
+ unsigned bo_alignment;
+ enum radeon_bo_domain domains;
+ enum radeon_bo_flag flags;
+ unsigned bind_history;
+ int max_forced_staging_uploads;
+
+ /* The buffer range which is initialized (with a write transfer,
+ * streamout, DMA, or as a random access target). The rest of
+ * the buffer is considered invalid and can be mapped unsynchronized.
+ *
+ * This allows unsynchronized mapping of a buffer range which hasn't
+ * been used yet. It's for applications which forget to use
+ * the unsynchronized map flag and expect the driver to figure it out.
+ */
+ struct util_range valid_buffer_range;
+
+ /* For buffers only. This indicates that a write operation has been
+ * performed by TC L2, but the cache hasn't been flushed.
+ * Any hw block which doesn't use or bypasses TC L2 should check this
+ * flag and flush the cache before using the buffer.
+ *
+ * For example, TC L2 must be flushed if a buffer which has been
+ * modified by a shader store instruction is about to be used as
+ * an index buffer. The reason is that VGT DMA index fetching doesn't
+ * use TC L2.
+ */
+ bool TC_L2_dirty;
+
+ /* Whether this resource is referenced by bindless handles. */
+ bool texture_handle_allocated;
+ bool image_handle_allocated;
+
+ /* Whether the resource has been exported via resource_get_handle. */
+ unsigned external_usage; /* PIPE_HANDLE_USAGE_* */
};
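The valid_buffer_range comment above is what allows unsynchronized maps of never-written ranges; a minimal sketch of that check, assuming a transfer path with a pipe_box named box and a usage bitmask (not the driver's actual transfer code):

   if (!util_ranges_intersect(&buf->valid_buffer_range, box->x, box->x + box->width))
      usage |= PIPE_TRANSFER_UNSYNCHRONIZED; /* range never written, so mapping needs no GPU sync */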
struct si_transfer {
- struct threaded_transfer b;
- struct si_resource *staging;
- unsigned offset;
+ struct threaded_transfer b;
+ struct si_resource *staging;
+ unsigned offset;
};
struct si_texture {
- struct si_resource buffer;
-
- struct radeon_surf surface;
- struct si_texture *flushed_depth_texture;
-
- /* One texture allocation can contain these buffers:
- * - image (pixel data)
- * - FMASK buffer (MSAA compression)
- * - CMASK buffer (MSAA compression and/or legacy fast color clear)
- * - HTILE buffer (Z/S compression and fast Z/S clear)
- * - DCC buffer (color compression and new fast color clear)
- * - displayable DCC buffer (if the DCC buffer is not displayable)
- * - DCC retile mapping buffer (if the DCC buffer is not displayable)
- */
- uint64_t cmask_base_address_reg;
- struct si_resource *cmask_buffer;
- unsigned cb_color_info; /* fast clear enable bit */
- unsigned color_clear_value[2];
- unsigned last_msaa_resolve_target_micro_mode;
- unsigned num_level0_transfers;
- unsigned plane_index; /* other planes are different pipe_resources */
- unsigned num_planes;
-
- /* Depth buffer compression and fast clear. */
- float depth_clear_value;
- uint16_t dirty_level_mask; /* each bit says if that mipmap is compressed */
- uint16_t stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */
- enum pipe_format db_render_format:16;
- uint8_t stencil_clear_value;
- bool fmask_is_identity:1;
- bool tc_compatible_htile:1;
- bool htile_stencil_disabled:1;
- bool depth_cleared:1; /* if it was cleared at least once */
- bool stencil_cleared:1; /* if it was cleared at least once */
- bool upgraded_depth:1; /* upgraded from unorm to Z32_FLOAT */
- bool is_depth:1;
- bool db_compatible:1;
- bool can_sample_z:1;
- bool can_sample_s:1;
-
- /* We need to track DCC dirtiness, because st/dri usually calls
- * flush_resource twice per frame (not a bug) and we don't wanna
- * decompress DCC twice. Also, the dirty tracking must be done even
- * if DCC isn't used, because it's required by the DCC usage analysis
- * for a possible future enablement.
- */
- bool separate_dcc_dirty:1;
- bool displayable_dcc_dirty:1;
-
- /* Statistics gathering for the DCC enablement heuristic. */
- bool dcc_gather_statistics:1;
- /* Counter that should be non-zero if the texture is bound to a
- * framebuffer.
- */
- unsigned framebuffers_bound;
- /* Whether the texture is a displayable back buffer and needs DCC
- * decompression, which is expensive. Therefore, it's enabled only
- * if statistics suggest that it will pay off and it's allocated
- * separately. It can't be bound as a sampler by apps. Limited to
- * target == 2D and last_level == 0. If enabled, dcc_offset contains
- * the absolute GPUVM address, not the relative one.
- */
- struct si_resource *dcc_separate_buffer;
- /* When DCC is temporarily disabled, the separate buffer is here. */
- struct si_resource *last_dcc_separate_buffer;
- /* Estimate of how much this color buffer is written to in units of
- * full-screen draws: ps_invocations / (width * height)
- * Shader kills, late Z, and blending with trivial discards make it
- * inaccurate (we need to count CB updates, not PS invocations).
- */
- unsigned ps_draw_ratio;
- /* The number of clears since the last DCC usage analysis. */
- unsigned num_slow_clears;
+ struct si_resource buffer;
+
+ struct radeon_surf surface;
+ struct si_texture *flushed_depth_texture;
+
+ /* One texture allocation can contain these buffers:
+ * - image (pixel data)
+ * - FMASK buffer (MSAA compression)
+ * - CMASK buffer (MSAA compression and/or legacy fast color clear)
+ * - HTILE buffer (Z/S compression and fast Z/S clear)
+ * - DCC buffer (color compression and new fast color clear)
+ * - displayable DCC buffer (if the DCC buffer is not displayable)
+ * - DCC retile mapping buffer (if the DCC buffer is not displayable)
+ */
+ uint64_t cmask_base_address_reg;
+ struct si_resource *cmask_buffer;
+ unsigned cb_color_info; /* fast clear enable bit */
+ unsigned color_clear_value[2];
+ unsigned last_msaa_resolve_target_micro_mode;
+ unsigned num_level0_transfers;
+ unsigned plane_index; /* other planes are different pipe_resources */
+ unsigned num_planes;
+
+ /* Depth buffer compression and fast clear. */
+ float depth_clear_value;
+ uint16_t dirty_level_mask; /* each bit says if that mipmap is compressed */
+ uint16_t stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */
+ enum pipe_format db_render_format : 16;
+ uint8_t stencil_clear_value;
+ bool fmask_is_identity : 1;
+ bool tc_compatible_htile : 1;
+ bool htile_stencil_disabled : 1;
+ bool depth_cleared : 1; /* if it was cleared at least once */
+ bool stencil_cleared : 1; /* if it was cleared at least once */
+ bool upgraded_depth : 1; /* upgraded from unorm to Z32_FLOAT */
+ bool is_depth : 1;
+ bool db_compatible : 1;
+ bool can_sample_z : 1;
+ bool can_sample_s : 1;
+
+ /* We need to track DCC dirtiness, because st/dri usually calls
+ * flush_resource twice per frame (not a bug) and we don't wanna
+ * decompress DCC twice. Also, the dirty tracking must be done even
+ * if DCC isn't used, because it's required by the DCC usage analysis
+ * for a possible future enablement.
+ */
+ bool separate_dcc_dirty : 1;
+ bool displayable_dcc_dirty : 1;
+
+ /* Statistics gathering for the DCC enablement heuristic. */
+ bool dcc_gather_statistics : 1;
+ /* Counter that should be non-zero if the texture is bound to a
+ * framebuffer.
+ */
+ unsigned framebuffers_bound;
+ /* Whether the texture is a displayable back buffer and needs DCC
+ * decompression, which is expensive. Therefore, it's enabled only
+ * if statistics suggest that it will pay off and it's allocated
+ * separately. It can't be bound as a sampler by apps. Limited to
+ * target == 2D and last_level == 0. If enabled, dcc_offset contains
+ * the absolute GPUVM address, not the relative one.
+ */
+ struct si_resource *dcc_separate_buffer;
+ /* When DCC is temporarily disabled, the separate buffer is here. */
+ struct si_resource *last_dcc_separate_buffer;
+ /* Estimate of how much this color buffer is written to in units of
+ * full-screen draws: ps_invocations / (width * height)
+ * Shader kills, late Z, and blending with trivial discards make it
+ * inaccurate (we need to count CB updates, not PS invocations).
+ */
+ unsigned ps_draw_ratio;
+ /* The number of clears since the last DCC usage analysis. */
+ unsigned num_slow_clears;
};
struct si_surface {
- struct pipe_surface base;
-
- /* These can vary with block-compressed textures. */
- uint16_t width0;
- uint16_t height0;
-
- bool color_initialized:1;
- bool depth_initialized:1;
-
- /* Misc. color flags. */
- bool color_is_int8:1;
- bool color_is_int10:1;
- bool dcc_incompatible:1;
-
- /* Color registers. */
- unsigned cb_color_info;
- unsigned cb_color_view;
- unsigned cb_color_attrib;
- unsigned cb_color_attrib2; /* GFX9 and later */
- unsigned cb_color_attrib3; /* GFX10 and later */
- unsigned cb_dcc_control; /* GFX8 and later */
- unsigned spi_shader_col_format:8; /* no blending, no alpha-to-coverage. */
- unsigned spi_shader_col_format_alpha:8; /* alpha-to-coverage */
- unsigned spi_shader_col_format_blend:8; /* blending without alpha. */
- unsigned spi_shader_col_format_blend_alpha:8; /* blending with alpha. */
-
- /* DB registers. */
- uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE */
- uint64_t db_stencil_base;
- uint64_t db_htile_data_base;
- unsigned db_depth_info;
- unsigned db_z_info;
- unsigned db_z_info2; /* GFX9 only */
- unsigned db_depth_view;
- unsigned db_depth_size;
- unsigned db_depth_slice;
- unsigned db_stencil_info;
- unsigned db_stencil_info2; /* GFX9 only */
- unsigned db_htile_surface;
+ struct pipe_surface base;
+
+ /* These can vary with block-compressed textures. */
+ uint16_t width0;
+ uint16_t height0;
+
+ bool color_initialized : 1;
+ bool depth_initialized : 1;
+
+ /* Misc. color flags. */
+ bool color_is_int8 : 1;
+ bool color_is_int10 : 1;
+ bool dcc_incompatible : 1;
+
+ /* Color registers. */
+ unsigned cb_color_info;
+ unsigned cb_color_view;
+ unsigned cb_color_attrib;
+ unsigned cb_color_attrib2; /* GFX9 and later */
+ unsigned cb_color_attrib3; /* GFX10 and later */
+ unsigned cb_dcc_control; /* GFX8 and later */
+ unsigned spi_shader_col_format : 8; /* no blending, no alpha-to-coverage. */
+ unsigned spi_shader_col_format_alpha : 8; /* alpha-to-coverage */
+ unsigned spi_shader_col_format_blend : 8; /* blending without alpha. */
+ unsigned spi_shader_col_format_blend_alpha : 8; /* blending with alpha. */
+
+ /* DB registers. */
+ uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE */
+ uint64_t db_stencil_base;
+ uint64_t db_htile_data_base;
+ unsigned db_depth_info;
+ unsigned db_z_info;
+ unsigned db_z_info2; /* GFX9 only */
+ unsigned db_depth_view;
+ unsigned db_depth_size;
+ unsigned db_depth_slice;
+ unsigned db_stencil_info;
+ unsigned db_stencil_info2; /* GFX9 only */
+ unsigned db_htile_surface;
};
struct si_mmio_counter {
- unsigned busy;
- unsigned idle;
+ unsigned busy;
+ unsigned idle;
};
union si_mmio_counters {
- struct {
- /* For global GPU load including SDMA. */
- struct si_mmio_counter gpu;
-
- /* GRBM_STATUS */
- struct si_mmio_counter spi;
- struct si_mmio_counter gui;
- struct si_mmio_counter ta;
- struct si_mmio_counter gds;
- struct si_mmio_counter vgt;
- struct si_mmio_counter ia;
- struct si_mmio_counter sx;
- struct si_mmio_counter wd;
- struct si_mmio_counter bci;
- struct si_mmio_counter sc;
- struct si_mmio_counter pa;
- struct si_mmio_counter db;
- struct si_mmio_counter cp;
- struct si_mmio_counter cb;
-
- /* SRBM_STATUS2 */
- struct si_mmio_counter sdma;
-
- /* CP_STAT */
- struct si_mmio_counter pfp;
- struct si_mmio_counter meq;
- struct si_mmio_counter me;
- struct si_mmio_counter surf_sync;
- struct si_mmio_counter cp_dma;
- struct si_mmio_counter scratch_ram;
- } named;
- unsigned array[0];
+ struct {
+ /* For global GPU load including SDMA. */
+ struct si_mmio_counter gpu;
+
+ /* GRBM_STATUS */
+ struct si_mmio_counter spi;
+ struct si_mmio_counter gui;
+ struct si_mmio_counter ta;
+ struct si_mmio_counter gds;
+ struct si_mmio_counter vgt;
+ struct si_mmio_counter ia;
+ struct si_mmio_counter sx;
+ struct si_mmio_counter wd;
+ struct si_mmio_counter bci;
+ struct si_mmio_counter sc;
+ struct si_mmio_counter pa;
+ struct si_mmio_counter db;
+ struct si_mmio_counter cp;
+ struct si_mmio_counter cb;
+
+ /* SRBM_STATUS2 */
+ struct si_mmio_counter sdma;
+
+ /* CP_STAT */
+ struct si_mmio_counter pfp;
+ struct si_mmio_counter meq;
+ struct si_mmio_counter me;
+ struct si_mmio_counter surf_sync;
+ struct si_mmio_counter cp_dma;
+ struct si_mmio_counter scratch_ram;
+ } named;
+ unsigned array[0];
};
struct si_memory_object {
- struct pipe_memory_object b;
- struct pb_buffer *buf;
- uint32_t stride;
+ struct pipe_memory_object b;
+ struct pb_buffer *buf;
+ uint32_t stride;
};
/* Saved CS data for debugging features. */
struct radeon_saved_cs {
- uint32_t *ib;
- unsigned num_dw;
+ uint32_t *ib;
+ unsigned num_dw;
- struct radeon_bo_list_item *bo_list;
- unsigned bo_count;
+ struct radeon_bo_list_item *bo_list;
+ unsigned bo_count;
};
struct si_screen {
- struct pipe_screen b;
- struct radeon_winsys *ws;
- struct disk_cache *disk_shader_cache;
-
- struct radeon_info info;
- uint64_t debug_flags;
- char renderer_string[183];
-
- void (*make_texture_descriptor)(
- struct si_screen *screen,
- struct si_texture *tex,
- bool sampler,
- enum pipe_texture_target target,
- enum pipe_format pipe_format,
- const unsigned char state_swizzle[4],
- unsigned first_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer,
- unsigned width, unsigned height, unsigned depth,
- uint32_t *state,
- uint32_t *fmask_state);
-
- unsigned num_vbos_in_user_sgprs;
- unsigned pa_sc_raster_config;
- unsigned pa_sc_raster_config_1;
- unsigned se_tile_repeat;
- unsigned gs_table_depth;
- unsigned tess_offchip_block_dw_size;
- unsigned tess_offchip_ring_size;
- unsigned tess_factor_ring_size;
- unsigned vgt_hs_offchip_param;
- unsigned eqaa_force_coverage_samples;
- unsigned eqaa_force_z_samples;
- unsigned eqaa_force_color_samples;
- bool has_draw_indirect_multi;
- bool has_out_of_order_rast;
- bool assume_no_z_fights;
- bool commutative_blend_add;
- bool dpbb_allowed;
- bool dfsm_allowed;
- bool llvm_has_working_vgpr_indexing;
- bool use_ngg;
- bool use_ngg_culling;
- bool always_use_ngg_culling;
- bool use_ngg_streamout;
-
- struct {
-#define OPT_BOOL(name, dflt, description) bool name:1;
+ struct pipe_screen b;
+ struct radeon_winsys *ws;
+ struct disk_cache *disk_shader_cache;
+
+ struct radeon_info info;
+ uint64_t debug_flags;
+ char renderer_string[183];
+
+ void (*make_texture_descriptor)(struct si_screen *screen, struct si_texture *tex, bool sampler,
+ enum pipe_texture_target target, enum pipe_format pipe_format,
+ const unsigned char state_swizzle[4], unsigned first_level,
+ unsigned last_level, unsigned first_layer, unsigned last_layer,
+ unsigned width, unsigned height, unsigned depth, uint32_t *state,
+ uint32_t *fmask_state);
+
+ unsigned num_vbos_in_user_sgprs;
+ unsigned pa_sc_raster_config;
+ unsigned pa_sc_raster_config_1;
+ unsigned se_tile_repeat;
+ unsigned gs_table_depth;
+ unsigned tess_offchip_block_dw_size;
+ unsigned tess_offchip_ring_size;
+ unsigned tess_factor_ring_size;
+ unsigned vgt_hs_offchip_param;
+ unsigned eqaa_force_coverage_samples;
+ unsigned eqaa_force_z_samples;
+ unsigned eqaa_force_color_samples;
+ bool has_draw_indirect_multi;
+ bool has_out_of_order_rast;
+ bool assume_no_z_fights;
+ bool commutative_blend_add;
+ bool dpbb_allowed;
+ bool dfsm_allowed;
+ bool llvm_has_working_vgpr_indexing;
+ bool use_ngg;
+ bool use_ngg_culling;
+ bool always_use_ngg_culling;
+ bool use_ngg_streamout;
+
+ struct {
+#define OPT_BOOL(name, dflt, description) bool name : 1;
#include "si_debug_options.h"
- } options;
-
- /* Whether shaders are monolithic (1-part) or separate (3-part). */
- bool use_monolithic_shaders;
- bool record_llvm_ir;
- bool dcc_msaa_allowed;
-
- struct slab_parent_pool pool_transfers;
-
- /* Texture filter settings. */
- int force_aniso; /* -1 = disabled */
-
- /* Auxiliary context. Mainly used to initialize resources.
- * It must be locked prior to using and flushed before unlocking. */
- struct pipe_context *aux_context;
- simple_mtx_t aux_context_lock;
-
- /* This must be in the screen, because UE4 uses one context for
- * compilation and another one for rendering.
- */
- unsigned num_compilations;
- /* Along with ST_DEBUG=precompile, this should show if applications
- * are loading shaders on demand. This is a monotonic counter.
- */
- unsigned num_shaders_created;
- unsigned num_memory_shader_cache_hits;
- unsigned num_memory_shader_cache_misses;
- unsigned num_disk_shader_cache_hits;
- unsigned num_disk_shader_cache_misses;
-
- /* GPU load thread. */
- simple_mtx_t gpu_load_mutex;
- thrd_t gpu_load_thread;
- union si_mmio_counters mmio_counters;
- volatile unsigned gpu_load_stop_thread; /* bool */
-
- /* Performance counters. */
- struct si_perfcounters *perfcounters;
-
- /* If pipe_screen wants to recompute and re-emit the framebuffer,
- * sampler, and image states of all contexts, it should atomically
- * increment this.
- *
- * Each context will compare this with its own last known value of
- * the counter before drawing and re-emit the states accordingly.
- */
- unsigned dirty_tex_counter;
- unsigned dirty_buf_counter;
-
- /* Atomically increment this counter when an existing texture's
- * metadata is enabled or disabled in a way that requires changing
- * contexts' compressed texture binding masks.
- */
- unsigned compressed_colortex_counter;
-
- struct {
- /* Context flags to set so that all writes from earlier jobs
- * in the CP are seen by L2 clients.
- */
- unsigned cp_to_L2;
-
- /* Context flags to set so that all writes from earlier jobs
- * that end in L2 are seen by CP.
- */
- unsigned L2_to_cp;
- } barrier_flags;
-
- simple_mtx_t shader_parts_mutex;
- struct si_shader_part *vs_prologs;
- struct si_shader_part *tcs_epilogs;
- struct si_shader_part *gs_prologs;
- struct si_shader_part *ps_prologs;
- struct si_shader_part *ps_epilogs;
-
- /* Shader cache in memory.
- *
- * Design & limitations:
- * - The shader cache is per screen (= per process), never saved to
- * disk, and skips redundant shader compilations from NIR to bytecode.
- * - It can only be used with one-variant-per-shader support, in which
- * case only the main (typically middle) part of shaders is cached.
- * - Only VS, TCS, TES, PS are cached, out of which only the hw VS
- * variants of VS and TES are cached, so LS and ES aren't.
- * - GS and CS aren't cached, but it's certainly possible to cache
- * those as well.
- */
- simple_mtx_t shader_cache_mutex;
- struct hash_table *shader_cache;
-
- /* Shader cache of live shaders. */
- struct util_live_shader_cache live_shader_cache;
-
- /* Shader compiler queue for multithreaded compilation. */
- struct util_queue shader_compiler_queue;
- /* Use at most 3 normal compiler threads on quadcore and better.
- * Hyperthreaded CPUs report the number of threads, but we want
- * the number of cores. We only need this many threads for shader-db. */
- struct ac_llvm_compiler compiler[24]; /* used by the queue only */
-
- struct util_queue shader_compiler_queue_low_priority;
- /* Use at most 2 low priority threads on quadcore and better.
- * We want to minimize the impact on multithreaded Mesa. */
- struct ac_llvm_compiler compiler_lowp[10];
-
- unsigned compute_wave_size;
- unsigned ps_wave_size;
- unsigned ge_wave_size;
+ } options;
+
+ /* Whether shaders are monolithic (1-part) or separate (3-part). */
+ bool use_monolithic_shaders;
+ bool record_llvm_ir;
+ bool dcc_msaa_allowed;
+
+ struct slab_parent_pool pool_transfers;
+
+ /* Texture filter settings. */
+ int force_aniso; /* -1 = disabled */
+
+ /* Auxiliary context. Mainly used to initialize resources.
+ * It must be locked prior to using and flushed before unlocking. */
+ struct pipe_context *aux_context;
+ simple_mtx_t aux_context_lock;
+
+ /* This must be in the screen, because UE4 uses one context for
+ * compilation and another one for rendering.
+ */
+ unsigned num_compilations;
+ /* Along with ST_DEBUG=precompile, this should show if applications
+ * are loading shaders on demand. This is a monotonic counter.
+ */
+ unsigned num_shaders_created;
+ unsigned num_memory_shader_cache_hits;
+ unsigned num_memory_shader_cache_misses;
+ unsigned num_disk_shader_cache_hits;
+ unsigned num_disk_shader_cache_misses;
+
+ /* GPU load thread. */
+ simple_mtx_t gpu_load_mutex;
+ thrd_t gpu_load_thread;
+ union si_mmio_counters mmio_counters;
+ volatile unsigned gpu_load_stop_thread; /* bool */
+
+ /* Performance counters. */
+ struct si_perfcounters *perfcounters;
+
+ /* If pipe_screen wants to recompute and re-emit the framebuffer,
+ * sampler, and image states of all contexts, it should atomically
+ * increment this.
+ *
+ * Each context will compare this with its own last known value of
+ * the counter before drawing and re-emit the states accordingly.
+ */
+ unsigned dirty_tex_counter;
+ unsigned dirty_buf_counter;
+
+ /* Atomically increment this counter when an existing texture's
+ * metadata is enabled or disabled in a way that requires changing
+ * contexts' compressed texture binding masks.
+ */
+ unsigned compressed_colortex_counter;
+
+ struct {
+ /* Context flags to set so that all writes from earlier jobs
+ * in the CP are seen by L2 clients.
+ */
+ unsigned cp_to_L2;
+
+ /* Context flags to set so that all writes from earlier jobs
+ * that end in L2 are seen by CP.
+ */
+ unsigned L2_to_cp;
+ } barrier_flags;
+
+ simple_mtx_t shader_parts_mutex;
+ struct si_shader_part *vs_prologs;
+ struct si_shader_part *tcs_epilogs;
+ struct si_shader_part *gs_prologs;
+ struct si_shader_part *ps_prologs;
+ struct si_shader_part *ps_epilogs;
+
+ /* Shader cache in memory.
+ *
+ * Design & limitations:
+ * - The shader cache is per screen (= per process), never saved to
+ * disk, and skips redundant shader compilations from NIR to bytecode.
+ * - It can only be used with one-variant-per-shader support, in which
+ * case only the main (typically middle) part of shaders is cached.
+ * - Only VS, TCS, TES, PS are cached, out of which only the hw VS
+ * variants of VS and TES are cached, so LS and ES aren't.
+ * - GS and CS aren't cached, but it's certainly possible to cache
+ * those as well.
+ */
+ simple_mtx_t shader_cache_mutex;
+ struct hash_table *shader_cache;
+
+ /* Shader cache of live shaders. */
+ struct util_live_shader_cache live_shader_cache;
+
+ /* Shader compiler queue for multithreaded compilation. */
+ struct util_queue shader_compiler_queue;
+ /* Use at most 3 normal compiler threads on quadcore and better.
+ * Hyperthreaded CPUs report the number of threads, but we want
+ * the number of cores. We only need this many threads for shader-db. */
+ struct ac_llvm_compiler compiler[24]; /* used by the queue only */
+
+ struct util_queue shader_compiler_queue_low_priority;
+ /* Use at most 2 low priority threads on quadcore and better.
+ * We want to minimize the impact on multithreaded Mesa. */
+ struct ac_llvm_compiler compiler_lowp[10];
+
+ unsigned compute_wave_size;
+ unsigned ps_wave_size;
+ unsigned ge_wave_size;
};
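The dirty_tex_counter/dirty_buf_counter comments above describe a lock-free handshake between the screen and its contexts. A minimal sketch of the consumer side, assuming the usual p_atomic_read() helper from util/u_atomic.h; the function name is invented, struct si_context is only declared further down, and the real check lives in the draw path rather than in a helper like this:

   static void example_poll_dirty_tex_counter(struct si_context *sctx)
   {
      /* The producer increments screen->dirty_tex_counter atomically. */
      unsigned counter = p_atomic_read(&sctx->screen->dirty_tex_counter);

      if (counter != sctx->last_dirty_tex_counter) {
         sctx->last_dirty_tex_counter = counter;
         /* ... re-validate framebuffer, sampler and image bindings ... */
      }
   }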
struct si_blend_color {
- struct pipe_blend_color state;
- bool any_nonzeros;
+ struct pipe_blend_color state;
+ bool any_nonzeros;
};
struct si_sampler_view {
- struct pipe_sampler_view base;
- /* [0..7] = image descriptor
- * [4..7] = buffer descriptor */
- uint32_t state[8];
- uint32_t fmask_state[8];
- const struct legacy_surf_level *base_level_info;
- ubyte base_level;
- ubyte block_width;
- bool is_stencil_sampler;
- bool is_integer;
- bool dcc_incompatible;
+ struct pipe_sampler_view base;
+ /* [0..7] = image descriptor
+ * [4..7] = buffer descriptor */
+ uint32_t state[8];
+ uint32_t fmask_state[8];
+ const struct legacy_surf_level *base_level_info;
+ ubyte base_level;
+ ubyte block_width;
+ bool is_stencil_sampler;
+ bool is_integer;
+ bool dcc_incompatible;
};
#define SI_SAMPLER_STATE_MAGIC 0x34f1c35a
struct si_sampler_state {
#ifndef NDEBUG
- unsigned magic;
+ unsigned magic;
#endif
- uint32_t val[4];
- uint32_t integer_val[4];
- uint32_t upgraded_depth_val[4];
+ uint32_t val[4];
+ uint32_t integer_val[4];
+ uint32_t upgraded_depth_val[4];
};
struct si_cs_shader_state {
- struct si_compute *program;
- struct si_compute *emitted_program;
- unsigned offset;
- bool initialized;
- bool uses_scratch;
+ struct si_compute *program;
+ struct si_compute *emitted_program;
+ unsigned offset;
+ bool initialized;
+ bool uses_scratch;
};
struct si_samplers {
- struct pipe_sampler_view *views[SI_NUM_SAMPLERS];
- struct si_sampler_state *sampler_states[SI_NUM_SAMPLERS];
+ struct pipe_sampler_view *views[SI_NUM_SAMPLERS];
+ struct si_sampler_state *sampler_states[SI_NUM_SAMPLERS];
- /* The i-th bit is set if that element is enabled (non-NULL resource). */
- unsigned enabled_mask;
- uint32_t needs_depth_decompress_mask;
- uint32_t needs_color_decompress_mask;
+ /* The i-th bit is set if that element is enabled (non-NULL resource). */
+ unsigned enabled_mask;
+ uint32_t needs_depth_decompress_mask;
+ uint32_t needs_color_decompress_mask;
};
struct si_images {
- struct pipe_image_view views[SI_NUM_IMAGES];
- uint32_t needs_color_decompress_mask;
- unsigned enabled_mask;
+ struct pipe_image_view views[SI_NUM_IMAGES];
+ uint32_t needs_color_decompress_mask;
+ unsigned enabled_mask;
};
struct si_framebuffer {
- struct pipe_framebuffer_state state;
- unsigned colorbuf_enabled_4bit;
- unsigned spi_shader_col_format;
- unsigned spi_shader_col_format_alpha;
- unsigned spi_shader_col_format_blend;
- unsigned spi_shader_col_format_blend_alpha;
- ubyte nr_samples:5; /* at most 16xAA */
- ubyte log_samples:3; /* at most 4 = 16xAA */
- ubyte nr_color_samples; /* at most 8xAA */
- ubyte compressed_cb_mask;
- ubyte uncompressed_cb_mask;
- ubyte displayable_dcc_cb_mask;
- ubyte color_is_int8;
- ubyte color_is_int10;
- ubyte dirty_cbufs;
- ubyte dcc_overwrite_combiner_watermark;
- ubyte min_bytes_per_pixel;
- bool dirty_zsbuf;
- bool any_dst_linear;
- bool CB_has_shader_readable_metadata;
- bool DB_has_shader_readable_metadata;
- bool all_DCC_pipe_aligned;
+ struct pipe_framebuffer_state state;
+ unsigned colorbuf_enabled_4bit;
+ unsigned spi_shader_col_format;
+ unsigned spi_shader_col_format_alpha;
+ unsigned spi_shader_col_format_blend;
+ unsigned spi_shader_col_format_blend_alpha;
+ ubyte nr_samples : 5; /* at most 16xAA */
+ ubyte log_samples : 3; /* at most 4 = 16xAA */
+ ubyte nr_color_samples; /* at most 8xAA */
+ ubyte compressed_cb_mask;
+ ubyte uncompressed_cb_mask;
+ ubyte displayable_dcc_cb_mask;
+ ubyte color_is_int8;
+ ubyte color_is_int10;
+ ubyte dirty_cbufs;
+ ubyte dcc_overwrite_combiner_watermark;
+ ubyte min_bytes_per_pixel;
+ bool dirty_zsbuf;
+ bool any_dst_linear;
+ bool CB_has_shader_readable_metadata;
+ bool DB_has_shader_readable_metadata;
+ bool all_DCC_pipe_aligned;
};
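A short sketch of how the packed sample-count fields above relate; util_logbase2() is the helper already used elsewhere in this driver, and the function name is invented for illustration:

   static void example_set_fb_sample_counts(struct si_framebuffer *fb, unsigned nr_samples)
   {
      assert(nr_samples >= 1 && nr_samples <= 16);
      fb->nr_samples = nr_samples;                 /* 5-bit field: up to 16xAA */
      fb->log_samples = util_logbase2(nr_samples); /* 3-bit field: log2(16) = 4 */
   }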
-enum si_quant_mode {
- /* This is the list we want to support. */
- SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH,
- SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH,
- SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH,
+enum si_quant_mode
+{
+ /* This is the list we want to support. */
+ SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH,
+ SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH,
+ SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH,
};
struct si_signed_scissor {
- int minx;
- int miny;
- int maxx;
- int maxy;
- enum si_quant_mode quant_mode;
+ int minx;
+ int miny;
+ int maxx;
+ int maxy;
+ enum si_quant_mode quant_mode;
};
struct si_viewports {
- struct pipe_viewport_state states[SI_MAX_VIEWPORTS];
- struct si_signed_scissor as_scissor[SI_MAX_VIEWPORTS];
- bool y_inverted;
+ struct pipe_viewport_state states[SI_MAX_VIEWPORTS];
+ struct si_signed_scissor as_scissor[SI_MAX_VIEWPORTS];
+ bool y_inverted;
};
struct si_clip_state {
- struct pipe_clip_state state;
- bool any_nonzeros;
+ struct pipe_clip_state state;
+ bool any_nonzeros;
};
struct si_streamout_target {
- struct pipe_stream_output_target b;
+ struct pipe_stream_output_target b;
- /* The buffer where BUFFER_FILLED_SIZE is stored. */
- struct si_resource *buf_filled_size;
- unsigned buf_filled_size_offset;
- bool buf_filled_size_valid;
+ /* The buffer where BUFFER_FILLED_SIZE is stored. */
+ struct si_resource *buf_filled_size;
+ unsigned buf_filled_size_offset;
+ bool buf_filled_size_valid;
- unsigned stride_in_dw;
+ unsigned stride_in_dw;
};
struct si_streamout {
- bool begin_emitted;
+ bool begin_emitted;
- unsigned enabled_mask;
- unsigned num_targets;
- struct si_streamout_target *targets[PIPE_MAX_SO_BUFFERS];
+ unsigned enabled_mask;
+ unsigned num_targets;
+ struct si_streamout_target *targets[PIPE_MAX_SO_BUFFERS];
- unsigned append_bitmask;
- bool suspended;
+ unsigned append_bitmask;
+ bool suspended;
- /* External state which comes from the vertex shader,
- * it must be set explicitly when binding a shader. */
- uint16_t *stride_in_dw;
- unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */
+ /* External state which comes from the vertex shader,
+ * it must be set explicitly when binding a shader. */
+ uint16_t *stride_in_dw;
+ unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */
- /* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */
- unsigned hw_enabled_mask;
+ /* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */
+ unsigned hw_enabled_mask;
- /* The state of VGT_STRMOUT_(CONFIG|EN). */
- bool streamout_enabled;
- bool prims_gen_query_enabled;
- int num_prims_gen_queries;
+ /* The state of VGT_STRMOUT_(CONFIG|EN). */
+ bool streamout_enabled;
+ bool prims_gen_query_enabled;
+ int num_prims_gen_queries;
};
/* A shader state consists of the shader selector, which is a constant state
 * object, and the current shader variant selected for this context.
*/
struct si_shader_ctx_state {
- struct si_shader_selector *cso;
- struct si_shader *current;
+ struct si_shader_selector *cso;
+ struct si_shader *current;
};
#define SI_NUM_VGT_PARAM_KEY_BITS 12
-#define SI_NUM_VGT_PARAM_STATES (1 << SI_NUM_VGT_PARAM_KEY_BITS)
+#define SI_NUM_VGT_PARAM_STATES (1 << SI_NUM_VGT_PARAM_KEY_BITS)
/* The IA_MULTI_VGT_PARAM key used to index the table of precomputed values.
* Some fields are set by state-change calls, most are set by draw_vbo.
*/
union si_vgt_param_key {
- struct {
+ struct {
#if UTIL_ARCH_LITTLE_ENDIAN
- unsigned prim:4;
- unsigned uses_instancing:1;
- unsigned multi_instances_smaller_than_primgroup:1;
- unsigned primitive_restart:1;
- unsigned count_from_stream_output:1;
- unsigned line_stipple_enabled:1;
- unsigned uses_tess:1;
- unsigned tess_uses_prim_id:1;
- unsigned uses_gs:1;
- unsigned _pad:32 - SI_NUM_VGT_PARAM_KEY_BITS;
+ unsigned prim : 4;
+ unsigned uses_instancing : 1;
+ unsigned multi_instances_smaller_than_primgroup : 1;
+ unsigned primitive_restart : 1;
+ unsigned count_from_stream_output : 1;
+ unsigned line_stipple_enabled : 1;
+ unsigned uses_tess : 1;
+ unsigned tess_uses_prim_id : 1;
+ unsigned uses_gs : 1;
+ unsigned _pad : 32 - SI_NUM_VGT_PARAM_KEY_BITS;
#else /* UTIL_ARCH_BIG_ENDIAN */
- unsigned _pad:32 - SI_NUM_VGT_PARAM_KEY_BITS;
- unsigned uses_gs:1;
- unsigned tess_uses_prim_id:1;
- unsigned uses_tess:1;
- unsigned line_stipple_enabled:1;
- unsigned count_from_stream_output:1;
- unsigned primitive_restart:1;
- unsigned multi_instances_smaller_than_primgroup:1;
- unsigned uses_instancing:1;
- unsigned prim:4;
+ unsigned _pad : 32 - SI_NUM_VGT_PARAM_KEY_BITS;
+ unsigned uses_gs : 1;
+ unsigned tess_uses_prim_id : 1;
+ unsigned uses_tess : 1;
+ unsigned line_stipple_enabled : 1;
+ unsigned count_from_stream_output : 1;
+ unsigned primitive_restart : 1;
+ unsigned multi_instances_smaller_than_primgroup : 1;
+ unsigned uses_instancing : 1;
+ unsigned prim : 4;
#endif
- } u;
- uint32_t index;
+ } u;
+ uint32_t index;
};
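Because the bitfield view and the 32-bit index view alias each other, a packed key can index the precomputed IA_MULTI_VGT_PARAM table in si_context directly. A hedged sketch of that lookup (helper name invented; valid keys keep _pad at zero so the index stays within the table):

   static unsigned example_lookup_multi_vgt_param(struct si_context *sctx)
   {
      union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;

      assert(key.index < SI_NUM_VGT_PARAM_STATES); /* _pad == 0 for valid keys */
      return sctx->ia_multi_vgt_param[key.index];
   }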
#define SI_NUM_VGT_STAGES_KEY_BITS 6
-#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS)
+#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS)
/* The VGT_SHADER_STAGES key used to index the table of precomputed values.
* Some fields are set by state-change calls, most are set by draw_vbo.
*/
union si_vgt_stages_key {
- struct {
+ struct {
#if UTIL_ARCH_LITTLE_ENDIAN
- unsigned tess:1;
- unsigned gs:1;
- unsigned ngg_gs_fast_launch:1;
- unsigned ngg_passthrough:1;
- unsigned ngg:1; /* gfx10+ */
- unsigned streamout:1; /* only used with NGG */
- unsigned _pad:32 - SI_NUM_VGT_STAGES_KEY_BITS;
+ unsigned tess : 1;
+ unsigned gs : 1;
+ unsigned ngg_gs_fast_launch : 1;
+ unsigned ngg_passthrough : 1;
+ unsigned ngg : 1; /* gfx10+ */
+ unsigned streamout : 1; /* only used with NGG */
+ unsigned _pad : 32 - SI_NUM_VGT_STAGES_KEY_BITS;
#else /* UTIL_ARCH_BIG_ENDIAN */
- unsigned _pad:32 - SI_NUM_VGT_STAGES_KEY_BITS;
- unsigned streamout:1;
- unsigned ngg:1;
- unsigned ngg_passthrough:1;
- unsigned ngg_gs_fast_launch:1;
- unsigned gs:1;
- unsigned tess:1;
+ unsigned _pad : 32 - SI_NUM_VGT_STAGES_KEY_BITS;
+ unsigned streamout : 1;
+ unsigned ngg : 1;
+ unsigned ngg_passthrough : 1;
+ unsigned ngg_gs_fast_launch : 1;
+ unsigned gs : 1;
+ unsigned tess : 1;
#endif
- } u;
- uint32_t index;
+ } u;
+ uint32_t index;
};
-struct si_texture_handle
-{
- unsigned desc_slot;
- bool desc_dirty;
- struct pipe_sampler_view *view;
- struct si_sampler_state sstate;
+struct si_texture_handle {
+ unsigned desc_slot;
+ bool desc_dirty;
+ struct pipe_sampler_view *view;
+ struct si_sampler_state sstate;
};
-struct si_image_handle
-{
- unsigned desc_slot;
- bool desc_dirty;
- struct pipe_image_view view;
+struct si_image_handle {
+ unsigned desc_slot;
+ bool desc_dirty;
+ struct pipe_image_view view;
};
struct si_saved_cs {
- struct pipe_reference reference;
- struct si_context *ctx;
- struct radeon_saved_cs gfx;
- struct radeon_saved_cs compute;
- struct si_resource *trace_buf;
- unsigned trace_id;
-
- unsigned gfx_last_dw;
- unsigned compute_last_dw;
- bool flushed;
- int64_t time_flush;
+ struct pipe_reference reference;
+ struct si_context *ctx;
+ struct radeon_saved_cs gfx;
+ struct radeon_saved_cs compute;
+ struct si_resource *trace_buf;
+ unsigned trace_id;
+
+ unsigned gfx_last_dw;
+ unsigned compute_last_dw;
+ bool flushed;
+ int64_t time_flush;
};
struct si_sdma_upload {
- struct si_resource *dst;
- struct si_resource *src;
- unsigned src_offset;
- unsigned dst_offset;
- unsigned size;
+ struct si_resource *dst;
+ struct si_resource *src;
+ unsigned src_offset;
+ unsigned dst_offset;
+ unsigned size;
};
struct si_small_prim_cull_info {
- float scale[2], translate[2];
+ float scale[2], translate[2];
};
struct si_context {
- struct pipe_context b; /* base class */
-
- enum radeon_family family;
- enum chip_class chip_class;
-
- struct radeon_winsys *ws;
- struct radeon_winsys_ctx *ctx;
- struct radeon_cmdbuf *gfx_cs; /* compute IB if graphics is disabled */
- struct radeon_cmdbuf *sdma_cs;
- struct pipe_fence_handle *last_gfx_fence;
- struct pipe_fence_handle *last_sdma_fence;
- struct si_resource *eop_bug_scratch;
- struct u_upload_mgr *cached_gtt_allocator;
- struct threaded_context *tc;
- struct u_suballocator *allocator_zeroed_memory;
- struct slab_child_pool pool_transfers;
- struct slab_child_pool pool_transfers_unsync; /* for threaded_context */
- struct pipe_device_reset_callback device_reset_callback;
- struct u_log_context *log;
- void *query_result_shader;
- void *sh_query_result_shader;
-
- void (*emit_cache_flush)(struct si_context *ctx);
-
- struct blitter_context *blitter;
- void *noop_blend;
- void *noop_dsa;
- void *discard_rasterizer_state;
- void *custom_dsa_flush;
- void *custom_blend_resolve;
- void *custom_blend_fmask_decompress;
- void *custom_blend_eliminate_fastclear;
- void *custom_blend_dcc_decompress;
- void *vs_blit_pos;
- void *vs_blit_pos_layered;
- void *vs_blit_color;
- void *vs_blit_color_layered;
- void *vs_blit_texcoord;
- void *cs_clear_buffer;
- void *cs_copy_buffer;
- void *cs_copy_image;
- void *cs_copy_image_1d_array;
- void *cs_clear_render_target;
- void *cs_clear_render_target_1d_array;
- void *cs_clear_12bytes_buffer;
- void *cs_dcc_retile;
- void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */
- struct si_screen *screen;
- struct pipe_debug_callback debug;
- struct ac_llvm_compiler compiler; /* only non-threaded compilation */
- struct si_shader_ctx_state fixed_func_tcs_shader;
- /* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */
- struct si_resource *wait_mem_scratch;
- unsigned wait_mem_number;
- uint16_t prefetch_L2_mask;
-
- bool has_graphics;
- bool gfx_flush_in_progress:1;
- bool gfx_last_ib_is_busy:1;
- bool compute_is_busy:1;
-
- unsigned num_gfx_cs_flushes;
- unsigned initial_gfx_cs_size;
- unsigned last_dirty_tex_counter;
- unsigned last_dirty_buf_counter;
- unsigned last_compressed_colortex_counter;
- unsigned last_num_draw_calls;
- unsigned flags; /* flush flags */
- /* Current unaccounted memory usage. */
- uint64_t vram;
- uint64_t gtt;
-
- /* Compute-based primitive discard. */
- unsigned prim_discard_vertex_count_threshold;
- struct pb_buffer *gds;
- struct pb_buffer *gds_oa;
- struct radeon_cmdbuf *prim_discard_compute_cs;
- unsigned compute_gds_offset;
- struct si_shader *compute_ib_last_shader;
- uint32_t compute_rewind_va;
- unsigned compute_num_prims_in_batch;
- bool preserve_prim_restart_gds_at_flush;
- /* index_ring is divided into 2 halves for doublebuffering. */
- struct si_resource *index_ring;
- unsigned index_ring_base; /* offset of a per-IB portion */
- unsigned index_ring_offset; /* offset within a per-IB portion */
- unsigned index_ring_size_per_ib; /* max available size per IB */
- bool prim_discard_compute_ib_initialized;
- /* For tracking the last execution barrier - it can be either
- * a WRITE_DATA packet or a fence. */
- uint32_t *last_pkt3_write_data;
- struct si_resource *barrier_buf;
- unsigned barrier_buf_offset;
- struct pipe_fence_handle *last_ib_barrier_fence;
- struct si_resource *last_ib_barrier_buf;
- unsigned last_ib_barrier_buf_offset;
-
- /* Atoms (direct states). */
- union si_state_atoms atoms;
- unsigned dirty_atoms; /* mask */
- /* PM4 states (precomputed immutable states) */
- unsigned dirty_states;
- union si_state queued;
- union si_state emitted;
-
- /* Atom declarations. */
- struct si_framebuffer framebuffer;
- unsigned sample_locs_num_samples;
- uint16_t sample_mask;
- unsigned last_cb_target_mask;
- struct si_blend_color blend_color;
- struct si_clip_state clip_state;
- struct si_shader_data shader_pointers;
- struct si_stencil_ref stencil_ref;
- struct pipe_scissor_state scissors[SI_MAX_VIEWPORTS];
- struct si_streamout streamout;
- struct si_viewports viewports;
- unsigned num_window_rectangles;
- bool window_rectangles_include;
- struct pipe_scissor_state window_rectangles[4];
-
- /* Precomputed states. */
- struct si_pm4_state *init_config;
- struct si_pm4_state *init_config_gs_rings;
- bool init_config_has_vgt_flush;
- struct si_pm4_state *vgt_shader_config[SI_NUM_VGT_STAGES_STATES];
-
- /* shaders */
- struct si_shader_ctx_state ps_shader;
- struct si_shader_ctx_state gs_shader;
- struct si_shader_ctx_state vs_shader;
- struct si_shader_ctx_state tcs_shader;
- struct si_shader_ctx_state tes_shader;
- struct si_shader_ctx_state cs_prim_discard_state;
- struct si_cs_shader_state cs_shader_state;
-
- /* shader information */
- struct si_vertex_elements *vertex_elements;
- unsigned num_vertex_elements;
- unsigned sprite_coord_enable;
- unsigned cs_max_waves_per_sh;
- bool flatshade;
- bool do_update_shaders;
-
- /* shader descriptors */
- struct si_descriptors descriptors[SI_NUM_DESCS];
- unsigned descriptors_dirty;
- unsigned shader_pointers_dirty;
- unsigned shader_needs_decompress_mask;
- struct si_buffer_resources rw_buffers;
- struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS];
- struct si_samplers samplers[SI_NUM_SHADERS];
- struct si_images images[SI_NUM_SHADERS];
- bool bo_list_add_all_resident_resources;
- bool bo_list_add_all_gfx_resources;
- bool bo_list_add_all_compute_resources;
-
- /* other shader resources */
- struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on GFX7 */
- struct pipe_resource *esgs_ring;
- struct pipe_resource *gsvs_ring;
- struct pipe_resource *tess_rings;
- union pipe_color_union *border_color_table; /* in CPU memory, any endian */
- struct si_resource *border_color_buffer;
- union pipe_color_union *border_color_map; /* in VRAM (slow access), little endian */
- unsigned border_color_count;
- unsigned num_vs_blit_sgprs;
- uint32_t vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD];
- uint32_t cs_user_data[4];
-
- /* Vertex buffers. */
- bool vertex_buffers_dirty;
- bool vertex_buffer_pointer_dirty;
- bool vertex_buffer_user_sgprs_dirty;
- struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS];
- uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */
- uint32_t *vb_descriptors_gpu_list;
- struct si_resource *vb_descriptors_buffer;
- unsigned vb_descriptors_offset;
- unsigned vb_descriptor_user_sgprs[5*4];
-
- /* MSAA config state. */
- int ps_iter_samples;
- bool ps_uses_fbfetch;
- bool smoothing_enabled;
-
- /* DB render state. */
- unsigned ps_db_shader_control;
- unsigned dbcb_copy_sample;
- bool dbcb_depth_copy_enabled:1;
- bool dbcb_stencil_copy_enabled:1;
- bool db_flush_depth_inplace:1;
- bool db_flush_stencil_inplace:1;
- bool db_depth_clear:1;
- bool db_depth_disable_expclear:1;
- bool db_stencil_clear:1;
- bool db_stencil_disable_expclear:1;
- bool occlusion_queries_disabled:1;
- bool generate_mipmap_for_depth:1;
-
- /* Emitted draw state. */
- bool gs_tri_strip_adj_fix:1;
- bool ls_vgpr_fix:1;
- bool prim_discard_cs_instancing:1;
- bool ngg:1;
- uint8_t ngg_culling;
- int last_index_size;
- int last_base_vertex;
- int last_start_instance;
- int last_instance_count;
- int last_drawid;
- int last_sh_base_reg;
- int last_primitive_restart_en;
- int last_restart_index;
- int last_prim;
- int last_multi_vgt_param;
- int last_gs_out_prim;
- int last_binning_enabled;
- unsigned current_vs_state;
- unsigned last_vs_state;
- enum pipe_prim_type current_rast_prim; /* primitive type after TES, GS */
-
- struct si_small_prim_cull_info last_small_prim_cull_info;
- struct si_resource *small_prim_cull_info_buf;
- uint64_t small_prim_cull_info_address;
- bool small_prim_cull_info_dirty;
-
- /* Scratch buffer */
- struct si_resource *scratch_buffer;
- unsigned scratch_waves;
- unsigned spi_tmpring_size;
- unsigned max_seen_scratch_bytes_per_wave;
- unsigned max_seen_compute_scratch_bytes_per_wave;
-
- struct si_resource *compute_scratch_buffer;
-
- /* Emitted derived tessellation state. */
- /* Local shader (VS), or HS if LS-HS are merged. */
- struct si_shader *last_ls;
- struct si_shader_selector *last_tcs;
- int last_num_tcs_input_cp;
- int last_tes_sh_base;
- bool last_tess_uses_primid;
- unsigned last_num_patches;
- int last_ls_hs_config;
-
- /* Debug state. */
- bool is_debug;
- struct si_saved_cs *current_saved_cs;
- uint64_t dmesg_timestamp;
- unsigned apitrace_call_number;
-
- /* Other state */
- bool need_check_render_feedback;
- bool decompression_enabled;
- bool dpbb_force_off;
- bool vs_writes_viewport_index;
- bool vs_disables_clipping_viewport;
-
- /* Precomputed IA_MULTI_VGT_PARAM */
- union si_vgt_param_key ia_multi_vgt_param_key;
- unsigned ia_multi_vgt_param[SI_NUM_VGT_PARAM_STATES];
-
- /* Bindless descriptors. */
- struct si_descriptors bindless_descriptors;
- struct util_idalloc bindless_used_slots;
- unsigned num_bindless_descriptors;
- bool bindless_descriptors_dirty;
- bool graphics_bindless_pointer_dirty;
- bool compute_bindless_pointer_dirty;
-
- /* Allocated bindless handles */
- struct hash_table *tex_handles;
- struct hash_table *img_handles;
-
- /* Resident bindless handles */
- struct util_dynarray resident_tex_handles;
- struct util_dynarray resident_img_handles;
-
- /* Resident bindless handles which need decompression */
- struct util_dynarray resident_tex_needs_color_decompress;
- struct util_dynarray resident_img_needs_color_decompress;
- struct util_dynarray resident_tex_needs_depth_decompress;
-
- /* Bindless state */
- bool uses_bindless_samplers;
- bool uses_bindless_images;
-
- /* MSAA sample locations.
- * The first index is the sample index.
- * The second index is the coordinate: X, Y. */
- struct {
- float x1[1][2];
- float x2[2][2];
- float x4[4][2];
- float x8[8][2];
- float x16[16][2];
- } sample_positions;
- struct pipe_resource *sample_pos_buffer;
-
- /* Misc stats. */
- unsigned num_draw_calls;
- unsigned num_decompress_calls;
- unsigned num_mrt_draw_calls;
- unsigned num_prim_restart_calls;
- unsigned num_spill_draw_calls;
- unsigned num_compute_calls;
- unsigned num_spill_compute_calls;
- unsigned num_dma_calls;
- unsigned num_cp_dma_calls;
- unsigned num_vs_flushes;
- unsigned num_ps_flushes;
- unsigned num_cs_flushes;
- unsigned num_cb_cache_flushes;
- unsigned num_db_cache_flushes;
- unsigned num_L2_invalidates;
- unsigned num_L2_writebacks;
- unsigned num_resident_handles;
- uint64_t num_alloc_tex_transfer_bytes;
- unsigned last_tex_ps_draw_ratio; /* for query */
- unsigned compute_num_verts_accepted;
- unsigned compute_num_verts_rejected;
- unsigned compute_num_verts_ineligible; /* due to low vertex count */
- unsigned context_roll;
-
- /* Queries. */
- /* Maintain the list of active queries for pausing between IBs. */
- int num_occlusion_queries;
- int num_perfect_occlusion_queries;
- int num_pipeline_stat_queries;
- struct list_head active_queries;
- unsigned num_cs_dw_queries_suspend;
-
- /* Render condition. */
- struct pipe_query *render_cond;
- unsigned render_cond_mode;
- bool render_cond_invert;
- bool render_cond_force_off; /* for u_blitter */
-
- /* For uploading data via GTT and copy to VRAM on context flush via SDMA. */
- bool sdma_uploads_in_progress;
- struct si_sdma_upload *sdma_uploads;
- unsigned num_sdma_uploads;
- unsigned max_sdma_uploads;
-
- /* Shader-based queries. */
- struct list_head shader_query_buffers;
- unsigned num_active_shader_queries;
-
- /* Statistics gathering for the DCC enablement heuristic. It can't be
- * in si_texture because si_texture can be shared by multiple
- * contexts. This is for back buffers only. We shouldn't get too many
- * of those.
- *
- * X11 DRI3 rotates among a finite set of back buffers. They should
- * all fit in this array. If they don't, separate DCC might never be
- * enabled by DCC stat gathering.
- */
- struct {
- struct si_texture *tex;
- /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */
- struct pipe_query *ps_stats[3];
- /* If all slots are used and another slot is needed,
- * the least recently used slot is evicted based on this. */
- int64_t last_use_timestamp;
- bool query_active;
- } dcc_stats[5];
-
- /* Copy one resource to another using async DMA. */
- void (*dma_copy)(struct pipe_context *ctx,
- struct pipe_resource *dst,
- unsigned dst_level,
- unsigned dst_x, unsigned dst_y, unsigned dst_z,
- struct pipe_resource *src,
- unsigned src_level,
- const struct pipe_box *src_box);
-
- struct si_tracked_regs tracked_regs;
+ struct pipe_context b; /* base class */
+
+ enum radeon_family family;
+ enum chip_class chip_class;
+
+ struct radeon_winsys *ws;
+ struct radeon_winsys_ctx *ctx;
+ struct radeon_cmdbuf *gfx_cs; /* compute IB if graphics is disabled */
+ struct radeon_cmdbuf *sdma_cs;
+ struct pipe_fence_handle *last_gfx_fence;
+ struct pipe_fence_handle *last_sdma_fence;
+ struct si_resource *eop_bug_scratch;
+ struct u_upload_mgr *cached_gtt_allocator;
+ struct threaded_context *tc;
+ struct u_suballocator *allocator_zeroed_memory;
+ struct slab_child_pool pool_transfers;
+ struct slab_child_pool pool_transfers_unsync; /* for threaded_context */
+ struct pipe_device_reset_callback device_reset_callback;
+ struct u_log_context *log;
+ void *query_result_shader;
+ void *sh_query_result_shader;
+
+ void (*emit_cache_flush)(struct si_context *ctx);
+
+ struct blitter_context *blitter;
+ void *noop_blend;
+ void *noop_dsa;
+ void *discard_rasterizer_state;
+ void *custom_dsa_flush;
+ void *custom_blend_resolve;
+ void *custom_blend_fmask_decompress;
+ void *custom_blend_eliminate_fastclear;
+ void *custom_blend_dcc_decompress;
+ void *vs_blit_pos;
+ void *vs_blit_pos_layered;
+ void *vs_blit_color;
+ void *vs_blit_color_layered;
+ void *vs_blit_texcoord;
+ void *cs_clear_buffer;
+ void *cs_copy_buffer;
+ void *cs_copy_image;
+ void *cs_copy_image_1d_array;
+ void *cs_clear_render_target;
+ void *cs_clear_render_target_1d_array;
+ void *cs_clear_12bytes_buffer;
+ void *cs_dcc_retile;
+ void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */
+ struct si_screen *screen;
+ struct pipe_debug_callback debug;
+ struct ac_llvm_compiler compiler; /* only non-threaded compilation */
+ struct si_shader_ctx_state fixed_func_tcs_shader;
+ /* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */
+ struct si_resource *wait_mem_scratch;
+ unsigned wait_mem_number;
+ uint16_t prefetch_L2_mask;
+
+ bool has_graphics;
+ bool gfx_flush_in_progress : 1;
+ bool gfx_last_ib_is_busy : 1;
+ bool compute_is_busy : 1;
+
+ unsigned num_gfx_cs_flushes;
+ unsigned initial_gfx_cs_size;
+ unsigned last_dirty_tex_counter;
+ unsigned last_dirty_buf_counter;
+ unsigned last_compressed_colortex_counter;
+ unsigned last_num_draw_calls;
+ unsigned flags; /* flush flags */
+ /* Current unaccounted memory usage. */
+ uint64_t vram;
+ uint64_t gtt;
+
+ /* Compute-based primitive discard. */
+ unsigned prim_discard_vertex_count_threshold;
+ struct pb_buffer *gds;
+ struct pb_buffer *gds_oa;
+ struct radeon_cmdbuf *prim_discard_compute_cs;
+ unsigned compute_gds_offset;
+ struct si_shader *compute_ib_last_shader;
+ uint32_t compute_rewind_va;
+ unsigned compute_num_prims_in_batch;
+ bool preserve_prim_restart_gds_at_flush;
+ /* index_ring is divided into 2 halves for doublebuffering. */
+ struct si_resource *index_ring;
+ unsigned index_ring_base; /* offset of a per-IB portion */
+ unsigned index_ring_offset; /* offset within a per-IB portion */
+ unsigned index_ring_size_per_ib; /* max available size per IB */
+ bool prim_discard_compute_ib_initialized;
+ /* For tracking the last execution barrier - it can be either
+ * a WRITE_DATA packet or a fence. */
+ uint32_t *last_pkt3_write_data;
+ struct si_resource *barrier_buf;
+ unsigned barrier_buf_offset;
+ struct pipe_fence_handle *last_ib_barrier_fence;
+ struct si_resource *last_ib_barrier_buf;
+ unsigned last_ib_barrier_buf_offset;
+
+ /* Atoms (direct states). */
+ union si_state_atoms atoms;
+ unsigned dirty_atoms; /* mask */
+ /* PM4 states (precomputed immutable states) */
+ unsigned dirty_states;
+ union si_state queued;
+ union si_state emitted;
+
+ /* Atom declarations. */
+ struct si_framebuffer framebuffer;
+ unsigned sample_locs_num_samples;
+ uint16_t sample_mask;
+ unsigned last_cb_target_mask;
+ struct si_blend_color blend_color;
+ struct si_clip_state clip_state;
+ struct si_shader_data shader_pointers;
+ struct si_stencil_ref stencil_ref;
+ struct pipe_scissor_state scissors[SI_MAX_VIEWPORTS];
+ struct si_streamout streamout;
+ struct si_viewports viewports;
+ unsigned num_window_rectangles;
+ bool window_rectangles_include;
+ struct pipe_scissor_state window_rectangles[4];
+
+ /* Precomputed states. */
+ struct si_pm4_state *init_config;
+ struct si_pm4_state *init_config_gs_rings;
+ bool init_config_has_vgt_flush;
+ struct si_pm4_state *vgt_shader_config[SI_NUM_VGT_STAGES_STATES];
+
+ /* shaders */
+ struct si_shader_ctx_state ps_shader;
+ struct si_shader_ctx_state gs_shader;
+ struct si_shader_ctx_state vs_shader;
+ struct si_shader_ctx_state tcs_shader;
+ struct si_shader_ctx_state tes_shader;
+ struct si_shader_ctx_state cs_prim_discard_state;
+ struct si_cs_shader_state cs_shader_state;
+
+ /* shader information */
+ struct si_vertex_elements *vertex_elements;
+ unsigned num_vertex_elements;
+ unsigned sprite_coord_enable;
+ unsigned cs_max_waves_per_sh;
+ bool flatshade;
+ bool do_update_shaders;
+
+ /* shader descriptors */
+ struct si_descriptors descriptors[SI_NUM_DESCS];
+ unsigned descriptors_dirty;
+ unsigned shader_pointers_dirty;
+ unsigned shader_needs_decompress_mask;
+ struct si_buffer_resources rw_buffers;
+ struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS];
+ struct si_samplers samplers[SI_NUM_SHADERS];
+ struct si_images images[SI_NUM_SHADERS];
+ bool bo_list_add_all_resident_resources;
+ bool bo_list_add_all_gfx_resources;
+ bool bo_list_add_all_compute_resources;
+
+ /* other shader resources */
+ struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on GFX7 */
+ struct pipe_resource *esgs_ring;
+ struct pipe_resource *gsvs_ring;
+ struct pipe_resource *tess_rings;
+ union pipe_color_union *border_color_table; /* in CPU memory, any endian */
+ struct si_resource *border_color_buffer;
+ union pipe_color_union *border_color_map; /* in VRAM (slow access), little endian */
+ unsigned border_color_count;
+ unsigned num_vs_blit_sgprs;
+ uint32_t vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD];
+ uint32_t cs_user_data[4];
+
+ /* Vertex buffers. */
+ bool vertex_buffers_dirty;
+ bool vertex_buffer_pointer_dirty;
+ bool vertex_buffer_user_sgprs_dirty;
+ struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS];
+ uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */
+ uint32_t *vb_descriptors_gpu_list;
+ struct si_resource *vb_descriptors_buffer;
+ unsigned vb_descriptors_offset;
+ unsigned vb_descriptor_user_sgprs[5 * 4];
+
+ /* MSAA config state. */
+ int ps_iter_samples;
+ bool ps_uses_fbfetch;
+ bool smoothing_enabled;
+
+ /* DB render state. */
+ unsigned ps_db_shader_control;
+ unsigned dbcb_copy_sample;
+ bool dbcb_depth_copy_enabled : 1;
+ bool dbcb_stencil_copy_enabled : 1;
+ bool db_flush_depth_inplace : 1;
+ bool db_flush_stencil_inplace : 1;
+ bool db_depth_clear : 1;
+ bool db_depth_disable_expclear : 1;
+ bool db_stencil_clear : 1;
+ bool db_stencil_disable_expclear : 1;
+ bool occlusion_queries_disabled : 1;
+ bool generate_mipmap_for_depth : 1;
+
+ /* Emitted draw state. */
+ bool gs_tri_strip_adj_fix : 1;
+ bool ls_vgpr_fix : 1;
+ bool prim_discard_cs_instancing : 1;
+ bool ngg : 1;
+ uint8_t ngg_culling;
+ int last_index_size;
+ int last_base_vertex;
+ int last_start_instance;
+ int last_instance_count;
+ int last_drawid;
+ int last_sh_base_reg;
+ int last_primitive_restart_en;
+ int last_restart_index;
+ int last_prim;
+ int last_multi_vgt_param;
+ int last_gs_out_prim;
+ int last_binning_enabled;
+ unsigned current_vs_state;
+ unsigned last_vs_state;
+ enum pipe_prim_type current_rast_prim; /* primitive type after TES, GS */
+
+ struct si_small_prim_cull_info last_small_prim_cull_info;
+ struct si_resource *small_prim_cull_info_buf;
+ uint64_t small_prim_cull_info_address;
+ bool small_prim_cull_info_dirty;
+
+ /* Scratch buffer */
+ struct si_resource *scratch_buffer;
+ unsigned scratch_waves;
+ unsigned spi_tmpring_size;
+ unsigned max_seen_scratch_bytes_per_wave;
+ unsigned max_seen_compute_scratch_bytes_per_wave;
+
+ struct si_resource *compute_scratch_buffer;
+
+ /* Emitted derived tessellation state. */
+ /* Local shader (VS), or HS if LS-HS are merged. */
+ struct si_shader *last_ls;
+ struct si_shader_selector *last_tcs;
+ int last_num_tcs_input_cp;
+ int last_tes_sh_base;
+ bool last_tess_uses_primid;
+ unsigned last_num_patches;
+ int last_ls_hs_config;
+
+ /* Debug state. */
+ bool is_debug;
+ struct si_saved_cs *current_saved_cs;
+ uint64_t dmesg_timestamp;
+ unsigned apitrace_call_number;
+
+ /* Other state */
+ bool need_check_render_feedback;
+ bool decompression_enabled;
+ bool dpbb_force_off;
+ bool vs_writes_viewport_index;
+ bool vs_disables_clipping_viewport;
+
+ /* Precomputed IA_MULTI_VGT_PARAM */
+ union si_vgt_param_key ia_multi_vgt_param_key;
+ unsigned ia_multi_vgt_param[SI_NUM_VGT_PARAM_STATES];
+
+ /* Bindless descriptors. */
+ struct si_descriptors bindless_descriptors;
+ struct util_idalloc bindless_used_slots;
+ unsigned num_bindless_descriptors;
+ bool bindless_descriptors_dirty;
+ bool graphics_bindless_pointer_dirty;
+ bool compute_bindless_pointer_dirty;
+
+ /* Allocated bindless handles */
+ struct hash_table *tex_handles;
+ struct hash_table *img_handles;
+
+ /* Resident bindless handles */
+ struct util_dynarray resident_tex_handles;
+ struct util_dynarray resident_img_handles;
+
+ /* Resident bindless handles which need decompression */
+ struct util_dynarray resident_tex_needs_color_decompress;
+ struct util_dynarray resident_img_needs_color_decompress;
+ struct util_dynarray resident_tex_needs_depth_decompress;
+
+ /* Bindless state */
+ bool uses_bindless_samplers;
+ bool uses_bindless_images;
+
+ /* MSAA sample locations.
+ * The first index is the sample index.
+ * The second index is the coordinate: X, Y. */
+ struct {
+ float x1[1][2];
+ float x2[2][2];
+ float x4[4][2];
+ float x8[8][2];
+ float x16[16][2];
+ } sample_positions;
+ struct pipe_resource *sample_pos_buffer;
+
+ /* Misc stats. */
+ unsigned num_draw_calls;
+ unsigned num_decompress_calls;
+ unsigned num_mrt_draw_calls;
+ unsigned num_prim_restart_calls;
+ unsigned num_spill_draw_calls;
+ unsigned num_compute_calls;
+ unsigned num_spill_compute_calls;
+ unsigned num_dma_calls;
+ unsigned num_cp_dma_calls;
+ unsigned num_vs_flushes;
+ unsigned num_ps_flushes;
+ unsigned num_cs_flushes;
+ unsigned num_cb_cache_flushes;
+ unsigned num_db_cache_flushes;
+ unsigned num_L2_invalidates;
+ unsigned num_L2_writebacks;
+ unsigned num_resident_handles;
+ uint64_t num_alloc_tex_transfer_bytes;
+ unsigned last_tex_ps_draw_ratio; /* for query */
+ unsigned compute_num_verts_accepted;
+ unsigned compute_num_verts_rejected;
+ unsigned compute_num_verts_ineligible; /* due to low vertex count */
+ unsigned context_roll;
+
+ /* Queries. */
+ /* Maintain the list of active queries for pausing between IBs. */
+ int num_occlusion_queries;
+ int num_perfect_occlusion_queries;
+ int num_pipeline_stat_queries;
+ struct list_head active_queries;
+ unsigned num_cs_dw_queries_suspend;
+
+ /* Render condition. */
+ struct pipe_query *render_cond;
+ unsigned render_cond_mode;
+ bool render_cond_invert;
+ bool render_cond_force_off; /* for u_blitter */
+
+ /* For uploading data via GTT and copy to VRAM on context flush via SDMA. */
+ bool sdma_uploads_in_progress;
+ struct si_sdma_upload *sdma_uploads;
+ unsigned num_sdma_uploads;
+ unsigned max_sdma_uploads;
+
+ /* Shader-based queries. */
+ struct list_head shader_query_buffers;
+ unsigned num_active_shader_queries;
+
+ /* Statistics gathering for the DCC enablement heuristic. It can't be
+ * in si_texture because si_texture can be shared by multiple
+ * contexts. This is for back buffers only. We shouldn't get too many
+ * of those.
+ *
+ * X11 DRI3 rotates among a finite set of back buffers. They should
+ * all fit in this array. If they don't, separate DCC might never be
+ * enabled by DCC stat gathering.
+ */
+ struct {
+ struct si_texture *tex;
+ /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */
+ struct pipe_query *ps_stats[3];
+ /* If all slots are used and another slot is needed,
+ * the least recently used slot is evicted based on this. */
+ int64_t last_use_timestamp;
+ bool query_active;
+ } dcc_stats[5];
+
+ /* Copy one resource to another using async DMA. */
+ void (*dma_copy)(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level,
+ unsigned dst_x, unsigned dst_y, unsigned dst_z, struct pipe_resource *src,
+ unsigned src_level, const struct pipe_box *src_box);
+
+ struct si_tracked_regs tracked_regs;
};
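The dcc_stats comment above implies a small LRU over the five slots, keyed by last_use_timestamp. A sketch of the slot-selection policy it describes (helper name invented; ARRAY_SIZE from util/macros.h is assumed to be available):

   static int example_get_dcc_stats_slot(struct si_context *sctx, struct si_texture *tex)
   {
      unsigned lru = 0;

      for (unsigned i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
         if (sctx->dcc_stats[i].tex == tex)
            return i; /* already tracked */
         if (sctx->dcc_stats[i].last_use_timestamp <
             sctx->dcc_stats[lru].last_use_timestamp)
            lru = i;
      }
      return lru; /* reuse the least recently used slot */
   }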
/* cik_sdma.c */
/* si_blit.c */
enum si_blitter_op /* bitmask */
{
- SI_SAVE_TEXTURES = 1,
- SI_SAVE_FRAMEBUFFER = 2,
- SI_SAVE_FRAGMENT_STATE = 4,
- SI_DISABLE_RENDER_COND = 8,
+ SI_SAVE_TEXTURES = 1,
+ SI_SAVE_FRAMEBUFFER = 2,
+ SI_SAVE_FRAGMENT_STATE = 4,
+ SI_DISABLE_RENDER_COND = 8,
};
void si_blitter_begin(struct si_context *sctx, enum si_blitter_op op);
void si_blitter_end(struct si_context *sctx);
void si_init_blit_functions(struct si_context *sctx);
void si_decompress_textures(struct si_context *sctx, unsigned shader_mask);
-void si_decompress_subresource(struct pipe_context *ctx,
- struct pipe_resource *tex,
- unsigned planes, unsigned level,
- unsigned first_layer, unsigned last_layer);
-void si_resource_copy_region(struct pipe_context *ctx,
- struct pipe_resource *dst,
- unsigned dst_level,
- unsigned dstx, unsigned dsty, unsigned dstz,
- struct pipe_resource *src,
- unsigned src_level,
- const struct pipe_box *src_box);
+void si_decompress_subresource(struct pipe_context *ctx, struct pipe_resource *tex, unsigned planes,
+ unsigned level, unsigned first_layer, unsigned last_layer);
+void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst,
+ unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *src, unsigned src_level,
+ const struct pipe_box *src_box);
void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex);
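The si_blitter_begin/si_blitter_end pair above brackets u_blitter operations, with enum si_blitter_op selecting which states get saved and restored. A hedged usage example; util_blitter_clear_render_target() comes from u_blitter.h, and the flag set shown is only one plausible choice for a clear:

   si_blitter_begin(sctx, SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE |
                             SI_DISABLE_RENDER_COND);
   util_blitter_clear_render_target(sctx->blitter, dstsurf, &color,
                                    0, 0, width, height);
   si_blitter_end(sctx);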
/* si_buffer.c */
-bool si_rings_is_buffer_referenced(struct si_context *sctx,
- struct pb_buffer *buf,
- enum radeon_bo_usage usage);
-void *si_buffer_map_sync_with_rings(struct si_context *sctx,
- struct si_resource *resource,
- unsigned usage);
-void si_init_resource_fields(struct si_screen *sscreen,
- struct si_resource *res,
- uint64_t size, unsigned alignment);
-bool si_alloc_resource(struct si_screen *sscreen,
- struct si_resource *res);
-struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen,
- unsigned flags, unsigned usage,
- unsigned size, unsigned alignment);
-struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen,
- unsigned flags, unsigned usage,
- unsigned size, unsigned alignment);
-void si_replace_buffer_storage(struct pipe_context *ctx,
- struct pipe_resource *dst,
- struct pipe_resource *src);
+bool si_rings_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf,
+ enum radeon_bo_usage usage);
+void *si_buffer_map_sync_with_rings(struct si_context *sctx, struct si_resource *resource,
+ unsigned usage);
+void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res, uint64_t size,
+ unsigned alignment);
+bool si_alloc_resource(struct si_screen *sscreen, struct si_resource *res);
+struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, unsigned flags,
+ unsigned usage, unsigned size, unsigned alignment);
+struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags,
+ unsigned usage, unsigned size, unsigned alignment);
+void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst,
+ struct pipe_resource *src);
void si_init_screen_buffer_functions(struct si_screen *sscreen);
void si_init_buffer_functions(struct si_context *sctx);
/* si_clear.c */
enum pipe_format si_simplify_cb_format(enum pipe_format format);
bool vi_alpha_is_on_msb(struct si_screen *sscreen, enum pipe_format format);
-bool vi_dcc_clear_level(struct si_context *sctx,
- struct si_texture *tex,
- unsigned level, unsigned clear_value);
+bool vi_dcc_clear_level(struct si_context *sctx, struct si_texture *tex, unsigned level,
+ unsigned clear_value);
void si_init_clear_functions(struct si_context *sctx);
/* si_compute_blit.c */
unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
- enum si_cache_policy cache_policy);
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
- uint64_t offset, uint64_t size, uint32_t *clear_value,
- uint32_t clear_value_size, enum si_coherency coher,
- bool force_cpdma);
-void si_copy_buffer(struct si_context *sctx,
- struct pipe_resource *dst, struct pipe_resource *src,
- uint64_t dst_offset, uint64_t src_offset, unsigned size);
-void si_compute_copy_image(struct si_context *sctx,
- struct pipe_resource *dst,
- unsigned dst_level,
- struct pipe_resource *src,
- unsigned src_level,
- unsigned dstx, unsigned dsty, unsigned dstz,
- const struct pipe_box *src_box);
-void si_compute_clear_render_target(struct pipe_context *ctx,
- struct pipe_surface *dstsurf,
- const union pipe_color_union *color,
- unsigned dstx, unsigned dsty,
- unsigned width, unsigned height,
- bool render_condition_enabled);
+ enum si_cache_policy cache_policy);
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
+ uint64_t size, uint32_t *clear_value, uint32_t clear_value_size,
+ enum si_coherency coher, bool force_cpdma);
+void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
+ uint64_t dst_offset, uint64_t src_offset, unsigned size);
+void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
+ struct pipe_resource *src, unsigned src_level, unsigned dstx,
+ unsigned dsty, unsigned dstz, const struct pipe_box *src_box);
+void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf,
+ const union pipe_color_union *color, unsigned dstx,
+ unsigned dsty, unsigned width, unsigned height,
+ bool render_condition_enabled);
void si_retile_dcc(struct si_context *sctx, struct si_texture *tex);
void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex);
void si_init_compute_blit_functions(struct si_context *sctx);
/* si_cp_dma.c */
-#define SI_CPDMA_SKIP_CHECK_CS_SPACE (1 << 0) /* don't call need_cs_space */
-#define SI_CPDMA_SKIP_SYNC_AFTER (1 << 1) /* don't wait for DMA after the copy */
-#define SI_CPDMA_SKIP_SYNC_BEFORE (1 << 2) /* don't wait for DMA before the copy (RAW hazards) */
-#define SI_CPDMA_SKIP_GFX_SYNC (1 << 3) /* don't flush caches and don't wait for PS/CS */
-#define SI_CPDMA_SKIP_BO_LIST_UPDATE (1 << 4) /* don't update the BO list */
-#define SI_CPDMA_SKIP_ALL (SI_CPDMA_SKIP_CHECK_CS_SPACE | \
- SI_CPDMA_SKIP_SYNC_AFTER | \
- SI_CPDMA_SKIP_SYNC_BEFORE | \
- SI_CPDMA_SKIP_GFX_SYNC | \
- SI_CPDMA_SKIP_BO_LIST_UPDATE)
+#define SI_CPDMA_SKIP_CHECK_CS_SPACE (1 << 0) /* don't call need_cs_space */
+#define SI_CPDMA_SKIP_SYNC_AFTER (1 << 1) /* don't wait for DMA after the copy */
+#define SI_CPDMA_SKIP_SYNC_BEFORE (1 << 2) /* don't wait for DMA before the copy (RAW hazards) */
+#define SI_CPDMA_SKIP_GFX_SYNC (1 << 3) /* don't flush caches and don't wait for PS/CS */
+#define SI_CPDMA_SKIP_BO_LIST_UPDATE (1 << 4) /* don't update the BO list */
+#define SI_CPDMA_SKIP_ALL \
+ (SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_SYNC_AFTER | SI_CPDMA_SKIP_SYNC_BEFORE | \
+ SI_CPDMA_SKIP_GFX_SYNC | SI_CPDMA_SKIP_BO_LIST_UPDATE)
void si_cp_dma_wait_for_idle(struct si_context *sctx);
void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
- struct pipe_resource *dst, uint64_t offset,
- uint64_t size, unsigned value, unsigned user_flags,
- enum si_coherency coher, enum si_cache_policy cache_policy);
-void si_cp_dma_copy_buffer(struct si_context *sctx,
- struct pipe_resource *dst, struct pipe_resource *src,
- uint64_t dst_offset, uint64_t src_offset, unsigned size,
- unsigned user_flags, enum si_coherency coher,
- enum si_cache_policy cache_policy);
-void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
- uint64_t offset, unsigned size);
+ struct pipe_resource *dst, uint64_t offset, uint64_t size,
+ unsigned value, unsigned user_flags, enum si_coherency coher,
+ enum si_cache_policy cache_policy);
+void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
+ struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
+ unsigned size, unsigned user_flags, enum si_coherency coher,
+ enum si_cache_policy cache_policy);
+void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset,
+ unsigned size);
void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only);
void si_test_gds(struct si_context *sctx);
-void si_cp_write_data(struct si_context *sctx, struct si_resource *buf,
- unsigned offset, unsigned size, unsigned dst_sel,
- unsigned engine, const void *data);
-void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs,
- unsigned dst_sel, struct si_resource *dst, unsigned dst_offset,
- unsigned src_sel, struct si_resource *src, unsigned src_offset);
+void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset,
+ unsigned size, unsigned dst_sel, unsigned engine, const void *data);
+void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned dst_sel,
+ struct si_resource *dst, unsigned dst_offset, unsigned src_sel,
+ struct si_resource *src, unsigned src_offset);
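The SI_CPDMA_SKIP_* flags above are ORed into the user_flags argument of these CP DMA helpers. A hedged sketch of a call site; the coherency and cache-policy enumerators are assumed from their definitions earlier in this header, and the flag combination is illustrative only:

   /* Clear a range while skipping the pre-copy wait and the gfx cache flush,
    * on the assumption that the caller synchronizes explicitly. */
   si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset, size, /*value*/ 0,
                          SI_CPDMA_SKIP_SYNC_BEFORE | SI_CPDMA_SKIP_GFX_SYNC,
                          SI_COHERENCY_SHADER, L2_LRU);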
/* si_debug.c */
-void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
- struct radeon_saved_cs *saved, bool get_buffer_list);
+void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, struct radeon_saved_cs *saved,
+ bool get_buffer_list);
void si_clear_saved_cs(struct radeon_saved_cs *saved);
void si_destroy_saved_cs(struct si_saved_cs *scs);
void si_auto_log_cs(void *data, struct u_log_context *log);
void si_log_draw_state(struct si_context *sctx, struct u_log_context *log);
void si_log_compute_state(struct si_context *sctx, struct u_log_context *log);
void si_init_debug_functions(struct si_context *sctx);
-void si_check_vm_faults(struct si_context *sctx,
- struct radeon_saved_cs *saved, enum ring_type ring);
+void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved,
+ enum ring_type ring);
bool si_replace_shader(unsigned num, struct si_shader_binary *binary);
/* si_dma_cs.c */
-void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst,
- uint64_t offset);
-void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
- uint64_t offset, uint64_t size, unsigned clear_value);
+void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset);
+void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
+ uint64_t size, unsigned clear_value);
void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
- struct pipe_resource *src, uint64_t dst_offset,
- uint64_t src_offset, uint64_t size);
-void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
- struct si_resource *dst, struct si_resource *src);
-void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
- struct pipe_fence_handle **fence);
-void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
- uint64_t offset, uint64_t size, unsigned value);
+ struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
+ uint64_t size);
+void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst,
+ struct si_resource *src);
+void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence);
+void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
+ uint64_t size, unsigned value);
/* si_fence.c */
-void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
- unsigned event, unsigned event_flags,
- unsigned dst_sel, unsigned int_sel, unsigned data_sel,
- struct si_resource *buf, uint64_t va,
- uint32_t new_fence, unsigned query_type);
+void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event,
+ unsigned event_flags, unsigned dst_sel, unsigned int_sel, unsigned data_sel,
+ struct si_resource *buf, uint64_t va, uint32_t new_fence,
+ unsigned query_type);
unsigned si_cp_write_fence_dwords(struct si_screen *screen);
-void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
- uint64_t va, uint32_t ref, uint32_t mask, unsigned flags);
+void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, uint64_t va, uint32_t ref,
+ uint32_t mask, unsigned flags);
void si_init_fence_functions(struct si_context *ctx);
void si_init_screen_fence_functions(struct si_screen *screen);
struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
- struct tc_unflushed_batch_token *tc_token);
+ struct tc_unflushed_batch_token *tc_token);
/* si_get.c */
void si_init_screen_get_functions(struct si_screen *sscreen);
/* si_gfx_cs.c */
-void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
- struct pipe_fence_handle **fence);
+void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence);
void si_allocate_gds(struct si_context *ctx);
void si_begin_new_gfx_cs(struct si_context *ctx);
void si_need_gfx_cs_space(struct si_context *ctx);
/* si_gpu_load.c */
void si_gpu_load_kill_thread(struct si_screen *sscreen);
uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type);
-unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
- uint64_t begin);
+unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin);
/* si_compute.c */
void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs);
void si_init_compute_functions(struct si_context *sctx);
/* si_compute_prim_discard.c */
-enum si_prim_discard_outcome {
- SI_PRIM_DISCARD_ENABLED,
- SI_PRIM_DISCARD_DISABLED,
- SI_PRIM_DISCARD_DRAW_SPLIT,
+enum si_prim_discard_outcome
+{
+ SI_PRIM_DISCARD_ENABLED,
+ SI_PRIM_DISCARD_DISABLED,
+ SI_PRIM_DISCARD_DRAW_SPLIT,
};
void si_build_prim_discard_compute_shader(struct si_shader_context *ctx);
enum si_prim_discard_outcome
-si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
- const struct pipe_draw_info *info,
- bool primitive_restart);
+si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
+ bool primitive_restart);
void si_compute_signal_gfx(struct si_context *sctx);
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
- const struct pipe_draw_info *info,
- unsigned index_size,
- unsigned base_vertex,
- uint64_t input_indexbuf_va,
- unsigned input_indexbuf_max_elements);
-void si_initialize_prim_discard_tunables(struct si_screen *sscreen,
- bool is_aux_context,
- unsigned *prim_discard_vertex_count_threshold,
- unsigned *index_ring_size_per_ib);
+ const struct pipe_draw_info *info, unsigned index_size,
+ unsigned base_vertex, uint64_t input_indexbuf_va,
+ unsigned input_indexbuf_max_elements);
+void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
+ unsigned *prim_discard_vertex_count_threshold,
+ unsigned *index_ring_size_per_ib);
/* si_pipe.c */
void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler);
/* si_shaderlib_tgsi.c */
void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
- unsigned num_layers);
+ unsigned num_layers);
void *si_create_fixed_func_tcs(struct si_context *sctx);
-void *si_create_dma_compute_shader(struct pipe_context *ctx,
- unsigned num_dwords_per_thread,
- bool dst_stream_cache_policy, bool is_copy);
+void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
+ bool dst_stream_cache_policy, bool is_copy);
void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx);
void *si_clear_render_target_shader(struct pipe_context *ctx);
void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx);
void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx);
void *si_create_dcc_retile_cs(struct pipe_context *ctx);
-void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples,
- bool is_array);
+void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, bool is_array);
void *si_create_query_result_cs(struct si_context *sctx);
void *gfx10_create_sh_query_result_cs(struct si_context *sctx);
/* si_uvd.c */
struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
- const struct pipe_video_codec *templ);
+ const struct pipe_video_codec *templ);
struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
- const struct pipe_video_buffer *tmpl);
+ const struct pipe_video_buffer *tmpl);
/* si_viewport.c */
void si_update_ngg_small_prim_precision(struct si_context *ctx);
-void si_get_small_prim_cull_info(struct si_context *sctx,
- struct si_small_prim_cull_info *out);
+void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out);
void si_update_vs_viewport_state(struct si_context *ctx);
void si_init_viewport_functions(struct si_context *ctx);
/* si_texture.c */
-bool si_prepare_for_dma_blit(struct si_context *sctx,
- struct si_texture *dst,
- unsigned dst_level, unsigned dstx,
- unsigned dsty, unsigned dstz,
- struct si_texture *src,
- unsigned src_level,
- const struct pipe_box *src_box);
-void si_eliminate_fast_color_clear(struct si_context *sctx,
- struct si_texture *tex);
-void si_texture_discard_cmask(struct si_screen *sscreen,
- struct si_texture *tex);
-bool si_init_flushed_depth_texture(struct pipe_context *ctx,
- struct pipe_resource *texture);
-void si_print_texture_info(struct si_screen *sscreen,
- struct si_texture *tex, struct u_log_context *log);
+bool si_prepare_for_dma_blit(struct si_context *sctx, struct si_texture *dst, unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz, struct si_texture *src,
+ unsigned src_level, const struct pipe_box *src_box);
+void si_eliminate_fast_color_clear(struct si_context *sctx, struct si_texture *tex);
+void si_texture_discard_cmask(struct si_screen *sscreen, struct si_texture *tex);
+bool si_init_flushed_depth_texture(struct pipe_context *ctx, struct pipe_resource *texture);
+void si_print_texture_info(struct si_screen *sscreen, struct si_texture *tex,
+ struct u_log_context *log);
struct pipe_resource *si_texture_create(struct pipe_screen *screen,
- const struct pipe_resource *templ);
-bool vi_dcc_formats_compatible(struct si_screen *sscreen,
- enum pipe_format format1,
- enum pipe_format format2);
-bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex,
- unsigned level,
- enum pipe_format view_format);
-void vi_disable_dcc_if_incompatible_format(struct si_context *sctx,
- struct pipe_resource *tex,
- unsigned level,
- enum pipe_format view_format);
+ const struct pipe_resource *templ);
+bool vi_dcc_formats_compatible(struct si_screen *sscreen, enum pipe_format format1,
+ enum pipe_format format2);
+bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, unsigned level,
+ enum pipe_format view_format);
+void vi_disable_dcc_if_incompatible_format(struct si_context *sctx, struct pipe_resource *tex,
+ unsigned level, enum pipe_format view_format);
struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe,
- struct pipe_resource *texture,
- const struct pipe_surface *templ,
- unsigned width0, unsigned height0,
- unsigned width, unsigned height);
+ struct pipe_resource *texture,
+ const struct pipe_surface *templ, unsigned width0,
+ unsigned height0, unsigned width, unsigned height);
unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap);
-void vi_separate_dcc_try_enable(struct si_context *sctx,
- struct si_texture *tex);
-void vi_separate_dcc_start_query(struct si_context *sctx,
- struct si_texture *tex);
-void vi_separate_dcc_stop_query(struct si_context *sctx,
- struct si_texture *tex);
-void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
- struct si_texture *tex);
-bool si_texture_disable_dcc(struct si_context *sctx,
- struct si_texture *tex);
+void vi_separate_dcc_try_enable(struct si_context *sctx, struct si_texture *tex);
+void vi_separate_dcc_start_query(struct si_context *sctx, struct si_texture *tex);
+void vi_separate_dcc_stop_query(struct si_context *sctx, struct si_texture *tex);
+void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, struct si_texture *tex);
+bool si_texture_disable_dcc(struct si_context *sctx, struct si_texture *tex);
void si_init_screen_texture_functions(struct si_screen *sscreen);
void si_init_context_texture_functions(struct si_context *sctx);
-
/*
* common helpers
*/
static inline struct si_resource *si_resource(struct pipe_resource *r)
{
- return (struct si_resource*)r;
+ return (struct si_resource *)r;
}
-static inline void
-si_resource_reference(struct si_resource **ptr, struct si_resource *res)
+static inline void si_resource_reference(struct si_resource **ptr, struct si_resource *res)
{
- pipe_resource_reference((struct pipe_resource **)ptr,
- (struct pipe_resource *)res);
+ pipe_resource_reference((struct pipe_resource **)ptr, (struct pipe_resource *)res);
}
-static inline void
-si_texture_reference(struct si_texture **ptr, struct si_texture *res)
+static inline void si_texture_reference(struct si_texture **ptr, struct si_texture *res)
{
- pipe_resource_reference((struct pipe_resource **)ptr, &res->buffer.b.b);
+ pipe_resource_reference((struct pipe_resource **)ptr, &res->buffer.b.b);
}
static inline void
si_shader_selector_reference(struct si_context *sctx, /* sctx can optionally be NULL */
- struct si_shader_selector **dst,
- struct si_shader_selector *src)
+ struct si_shader_selector **dst, struct si_shader_selector *src)
{
- if (*dst == src)
- return;
+ if (*dst == src)
+ return;
- struct si_screen *sscreen = src ? src->screen : (*dst)->screen;
- util_shader_reference(&sctx->b, &sscreen->live_shader_cache,
- (void**)dst, src);
+ struct si_screen *sscreen = src ? src->screen : (*dst)->screen;
+ util_shader_reference(&sctx->b, &sscreen->live_shader_cache, (void **)dst, src);
}
-static inline bool
-vi_dcc_enabled(struct si_texture *tex, unsigned level)
+static inline bool vi_dcc_enabled(struct si_texture *tex, unsigned level)
{
- return tex->surface.dcc_offset && level < tex->surface.num_dcc_levels;
+ return tex->surface.dcc_offset && level < tex->surface.num_dcc_levels;
}
-static inline unsigned
-si_tile_mode_index(struct si_texture *tex, unsigned level, bool stencil)
+static inline unsigned si_tile_mode_index(struct si_texture *tex, unsigned level, bool stencil)
{
- if (stencil)
- return tex->surface.u.legacy.stencil_tiling_index[level];
- else
- return tex->surface.u.legacy.tiling_index[level];
+ if (stencil)
+ return tex->surface.u.legacy.stencil_tiling_index[level];
+ else
+ return tex->surface.u.legacy.tiling_index[level];
}
-static inline unsigned
-si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx)
+static inline unsigned si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx)
{
- /* Don't count the needed CS space exactly and just use an upper bound.
- *
- * Also reserve space for stopping queries at the end of IB, because
- * the number of active queries is unlimited in theory.
- */
- return 2048 + sctx->num_cs_dw_queries_suspend;
+ /* Don't count the needed CS space exactly and just use an upper bound.
+ *
+ * Also reserve space for stopping queries at the end of IB, because
+ * the number of active queries is unlimited in theory.
+ */
+ return 2048 + sctx->num_cs_dw_queries_suspend;
}
-static inline void
-si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r)
+static inline void si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r)
{
- if (r) {
- /* Add memory usage for need_gfx_cs_space */
- sctx->vram += si_resource(r)->vram_usage;
- sctx->gtt += si_resource(r)->gart_usage;
- }
+ if (r) {
+ /* Add memory usage for need_gfx_cs_space */
+ sctx->vram += si_resource(r)->vram_usage;
+ sctx->gtt += si_resource(r)->gart_usage;
+ }
}
-static inline void
-si_invalidate_draw_sh_constants(struct si_context *sctx)
+static inline void si_invalidate_draw_sh_constants(struct si_context *sctx)
{
- sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN;
- sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN;
+ sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN;
+ sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN;
}
-static inline unsigned
-si_get_atom_bit(struct si_context *sctx, struct si_atom *atom)
+static inline unsigned si_get_atom_bit(struct si_context *sctx, struct si_atom *atom)
{
- return 1 << (atom - sctx->atoms.array);
+ return 1 << (atom - sctx->atoms.array);
}
-static inline void
-si_set_atom_dirty(struct si_context *sctx, struct si_atom *atom, bool dirty)
+static inline void si_set_atom_dirty(struct si_context *sctx, struct si_atom *atom, bool dirty)
{
- unsigned bit = si_get_atom_bit(sctx, atom);
+ unsigned bit = si_get_atom_bit(sctx, atom);
- if (dirty)
- sctx->dirty_atoms |= bit;
- else
- sctx->dirty_atoms &= ~bit;
+ if (dirty)
+ sctx->dirty_atoms |= bit;
+ else
+ sctx->dirty_atoms &= ~bit;
}
-static inline bool
-si_is_atom_dirty(struct si_context *sctx, struct si_atom *atom)
+static inline bool si_is_atom_dirty(struct si_context *sctx, struct si_atom *atom)
{
- return (sctx->dirty_atoms & si_get_atom_bit(sctx, atom)) != 0;
+ return (sctx->dirty_atoms & si_get_atom_bit(sctx, atom)) != 0;
}
-static inline void
-si_mark_atom_dirty(struct si_context *sctx, struct si_atom *atom)
+static inline void si_mark_atom_dirty(struct si_context *sctx, struct si_atom *atom)
{
- si_set_atom_dirty(sctx, atom, true);
+ si_set_atom_dirty(sctx, atom, true);
}
static inline struct si_shader_ctx_state *si_get_vs(struct si_context *sctx)
{
- if (sctx->gs_shader.cso)
- return &sctx->gs_shader;
- if (sctx->tes_shader.cso)
- return &sctx->tes_shader;
+ if (sctx->gs_shader.cso)
+ return &sctx->gs_shader;
+ if (sctx->tes_shader.cso)
+ return &sctx->tes_shader;
- return &sctx->vs_shader;
+ return &sctx->vs_shader;
}
static inline struct si_shader_info *si_get_vs_info(struct si_context *sctx)
{
- struct si_shader_ctx_state *vs = si_get_vs(sctx);
+ struct si_shader_ctx_state *vs = si_get_vs(sctx);
- return vs->cso ? &vs->cso->info : NULL;
+ return vs->cso ? &vs->cso->info : NULL;
}
-static inline struct si_shader* si_get_vs_state(struct si_context *sctx)
+static inline struct si_shader *si_get_vs_state(struct si_context *sctx)
{
- if (sctx->gs_shader.cso &&
- sctx->gs_shader.current &&
- !sctx->gs_shader.current->key.as_ngg)
- return sctx->gs_shader.cso->gs_copy_shader;
+ if (sctx->gs_shader.cso && sctx->gs_shader.current && !sctx->gs_shader.current->key.as_ngg)
+ return sctx->gs_shader.cso->gs_copy_shader;
- struct si_shader_ctx_state *vs = si_get_vs(sctx);
- return vs->current ? vs->current : NULL;
+ struct si_shader_ctx_state *vs = si_get_vs(sctx);
+ return vs->current ? vs->current : NULL;
}
-static inline bool si_can_dump_shader(struct si_screen *sscreen,
- unsigned processor)
+static inline bool si_can_dump_shader(struct si_screen *sscreen, unsigned processor)
{
- return sscreen->debug_flags & (1 << processor);
+ return sscreen->debug_flags & (1 << processor);
}
static inline bool si_get_strmout_en(struct si_context *sctx)
{
- return sctx->streamout.streamout_enabled ||
- sctx->streamout.prims_gen_query_enabled;
+ return sctx->streamout.streamout_enabled || sctx->streamout.prims_gen_query_enabled;
}
-static inline unsigned
-si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size)
+static inline unsigned si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size)
{
- unsigned alignment, tcc_cache_line_size;
-
- /* If the upload size is less than the cache line size (e.g. 16, 32),
- * the whole thing will fit into a cache line if we align it to its size.
- * The idea is that multiple small uploads can share a cache line.
- * If the upload size is greater, align it to the cache line size.
- */
- alignment = util_next_power_of_two(upload_size);
- tcc_cache_line_size = sctx->screen->info.tcc_cache_line_size;
- return MIN2(alignment, tcc_cache_line_size);
+ unsigned alignment, tcc_cache_line_size;
+
+ /* If the upload size is less than the cache line size (e.g. 16, 32),
+ * the whole thing will fit into a cache line if we align it to its size.
+ * The idea is that multiple small uploads can share a cache line.
+ * If the upload size is greater, align it to the cache line size.
+ */
+ alignment = util_next_power_of_two(upload_size);
+ tcc_cache_line_size = sctx->screen->info.tcc_cache_line_size;
+ return MIN2(alignment, tcc_cache_line_size);
}
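As a minimal sketch of the rule above (illustrative only; the helpers are open-coded here so the two sizes in the comment can be checked by hand):
/* upload_size 24  -> 32-byte alignment (several small uploads share one 64-byte line)
 * upload_size 100 -> 128, clamped to a 64-byte cache line size */
static unsigned example_tcc_alignment(unsigned upload_size, unsigned tcc_cache_line)
{
   unsigned alignment = 1;
   while (alignment < upload_size) /* open-coded util_next_power_of_two */
      alignment <<= 1;
   return alignment < tcc_cache_line ? alignment : tcc_cache_line; /* open-coded MIN2 */
}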
-static inline void
-si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src)
+static inline void si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src)
{
- if (pipe_reference(&(*dst)->reference, &src->reference))
- si_destroy_saved_cs(*dst);
+ if (pipe_reference(&(*dst)->reference, &src->reference))
+ si_destroy_saved_cs(*dst);
- *dst = src;
+ *dst = src;
}
-static inline void
-si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples,
- bool shaders_read_metadata, bool dcc_pipe_aligned)
+static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples,
+ bool shaders_read_metadata, bool dcc_pipe_aligned)
{
- sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
- SI_CONTEXT_INV_VCACHE;
-
- if (sctx->chip_class >= GFX10) {
- if (sctx->screen->info.tcc_harvested)
- sctx->flags |= SI_CONTEXT_INV_L2;
- else if (shaders_read_metadata)
- sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
- } else if (sctx->chip_class == GFX9) {
- /* Single-sample color is coherent with shaders on GFX9, but
- * L2 metadata must be flushed if shaders read metadata.
- * (DCC, CMASK).
- */
- if (num_samples >= 2 ||
- (shaders_read_metadata && !dcc_pipe_aligned))
- sctx->flags |= SI_CONTEXT_INV_L2;
- else if (shaders_read_metadata)
- sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
- } else {
- /* GFX6-GFX8 */
- sctx->flags |= SI_CONTEXT_INV_L2;
- }
+ sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_INV_VCACHE;
+
+ if (sctx->chip_class >= GFX10) {
+ if (sctx->screen->info.tcc_harvested)
+ sctx->flags |= SI_CONTEXT_INV_L2;
+ else if (shaders_read_metadata)
+ sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
+ } else if (sctx->chip_class == GFX9) {
+ /* Single-sample color is coherent with shaders on GFX9, but
+ * L2 metadata must be flushed if shaders read metadata.
+ * (DCC, CMASK).
+ */
+ if (num_samples >= 2 || (shaders_read_metadata && !dcc_pipe_aligned))
+ sctx->flags |= SI_CONTEXT_INV_L2;
+ else if (shaders_read_metadata)
+ sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
+ } else {
+ /* GFX6-GFX8 */
+ sctx->flags |= SI_CONTEXT_INV_L2;
+ }
}
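The branches above collapse to a small set of outcomes; as a sketch (flag names as in the code above):
/* Illustrative only:
 *   GFX10, tcc_harvested                                       -> INV_L2
 *   GFX10, shaders_read_metadata                                -> INV_L2_METADATA
 *   GFX9,  num_samples >= 2 or metadata read, not pipe-aligned  -> INV_L2
 *   GFX9,  single sample, pipe-aligned, metadata read           -> INV_L2_METADATA
 *   GFX6-GFX8, always                                           -> INV_L2
 * FLUSH_AND_INV_CB | INV_VCACHE is set unconditionally. */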
-static inline void
-si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
- bool include_stencil, bool shaders_read_metadata)
+static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
+ bool include_stencil, bool shaders_read_metadata)
{
- sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB |
- SI_CONTEXT_INV_VCACHE;
-
- if (sctx->chip_class >= GFX10) {
- if (sctx->screen->info.tcc_harvested)
- sctx->flags |= SI_CONTEXT_INV_L2;
- else if (shaders_read_metadata)
- sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
- } else if (sctx->chip_class == GFX9) {
- /* Single-sample depth (not stencil) is coherent with shaders
- * on GFX9, but L2 metadata must be flushed if shaders read
- * metadata.
- */
- if (num_samples >= 2 || include_stencil)
- sctx->flags |= SI_CONTEXT_INV_L2;
- else if (shaders_read_metadata)
- sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
- } else {
- /* GFX6-GFX8 */
- sctx->flags |= SI_CONTEXT_INV_L2;
- }
+ sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_INV_VCACHE;
+
+ if (sctx->chip_class >= GFX10) {
+ if (sctx->screen->info.tcc_harvested)
+ sctx->flags |= SI_CONTEXT_INV_L2;
+ else if (shaders_read_metadata)
+ sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
+ } else if (sctx->chip_class == GFX9) {
+ /* Single-sample depth (not stencil) is coherent with shaders
+ * on GFX9, but L2 metadata must be flushed if shaders read
+ * metadata.
+ */
+ if (num_samples >= 2 || include_stencil)
+ sctx->flags |= SI_CONTEXT_INV_L2;
+ else if (shaders_read_metadata)
+ sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
+ } else {
+ /* GFX6-GFX8 */
+ sctx->flags |= SI_CONTEXT_INV_L2;
+ }
}
-static inline bool
-si_can_sample_zs(struct si_texture *tex, bool stencil_sampler)
+static inline bool si_can_sample_zs(struct si_texture *tex, bool stencil_sampler)
{
- return (stencil_sampler && tex->can_sample_s) ||
- (!stencil_sampler && tex->can_sample_z);
+ return (stencil_sampler && tex->can_sample_s) || (!stencil_sampler && tex->can_sample_z);
}
-static inline bool
-si_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask)
+static inline bool si_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask)
{
- if (zs_mask == PIPE_MASK_S && tex->htile_stencil_disabled)
- return false;
+ if (zs_mask == PIPE_MASK_S && tex->htile_stencil_disabled)
+ return false;
- return tex->surface.htile_offset && level == 0;
+ return tex->surface.htile_offset && level == 0;
}
-static inline bool
-vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask)
+static inline bool vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level,
+ unsigned zs_mask)
{
- assert(!tex->tc_compatible_htile || tex->surface.htile_offset);
- return tex->tc_compatible_htile && si_htile_enabled(tex, level, zs_mask);
+ assert(!tex->tc_compatible_htile || tex->surface.htile_offset);
+ return tex->tc_compatible_htile && si_htile_enabled(tex, level, zs_mask);
}
static inline unsigned si_get_ps_iter_samples(struct si_context *sctx)
{
- if (sctx->ps_uses_fbfetch)
- return sctx->framebuffer.nr_color_samples;
+ if (sctx->ps_uses_fbfetch)
+ return sctx->framebuffer.nr_color_samples;
- return MIN2(sctx->ps_iter_samples, sctx->framebuffer.nr_color_samples);
+ return MIN2(sctx->ps_iter_samples, sctx->framebuffer.nr_color_samples);
}
static inline unsigned si_get_total_colormask(struct si_context *sctx)
{
- if (sctx->queued.named.rasterizer->rasterizer_discard)
- return 0;
+ if (sctx->queued.named.rasterizer->rasterizer_discard)
+ return 0;
- struct si_shader_selector *ps = sctx->ps_shader.cso;
- if (!ps)
- return 0;
+ struct si_shader_selector *ps = sctx->ps_shader.cso;
+ if (!ps)
+ return 0;
- unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit &
- sctx->queued.named.blend->cb_target_mask;
+ unsigned colormask =
+ sctx->framebuffer.colorbuf_enabled_4bit & sctx->queued.named.blend->cb_target_mask;
- if (!ps->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
- colormask &= ps->colors_written_4bit;
- else if (!ps->colors_written_4bit)
- colormask = 0; /* color0 writes all cbufs, but it's not written */
+ if (!ps->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
+ colormask &= ps->colors_written_4bit;
+ else if (!ps->colors_written_4bit)
+ colormask = 0; /* color0 writes all cbufs, but it's not written */
- return colormask;
+ return colormask;
}
-#define UTIL_ALL_PRIM_LINE_MODES ((1 << PIPE_PRIM_LINES) | \
- (1 << PIPE_PRIM_LINE_LOOP) | \
- (1 << PIPE_PRIM_LINE_STRIP) | \
- (1 << PIPE_PRIM_LINES_ADJACENCY) | \
- (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY))
+#define UTIL_ALL_PRIM_LINE_MODES \
+ ((1 << PIPE_PRIM_LINES) | (1 << PIPE_PRIM_LINE_LOOP) | (1 << PIPE_PRIM_LINE_STRIP) | \
+ (1 << PIPE_PRIM_LINES_ADJACENCY) | (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY))
static inline bool util_prim_is_lines(unsigned prim)
{
- return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0;
+ return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0;
}
static inline bool util_prim_is_points_or_lines(unsigned prim)
{
- return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES |
- (1 << PIPE_PRIM_POINTS))) != 0;
+ return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES | (1 << PIPE_PRIM_POINTS))) != 0;
}
static inline bool util_rast_prim_is_triangles(unsigned prim)
{
- return ((1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
- (1 << PIPE_PRIM_TRIANGLE_STRIP) |
- (1 << PIPE_PRIM_TRIANGLE_FAN) |
- (1 << PIPE_PRIM_QUADS) |
- (1 << PIPE_PRIM_QUAD_STRIP) |
- (1 << PIPE_PRIM_POLYGON) |
- (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) |
- (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)));
+ return ((1 << prim) &
+ ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) |
+ (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) |
+ (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) |
+ (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)));
}
/**
* \param vram VRAM memory size not added to the buffer list yet
* \param gtt GTT memory size not added to the buffer list yet
*/
-static inline bool
-radeon_cs_memory_below_limit(struct si_screen *screen,
- struct radeon_cmdbuf *cs,
- uint64_t vram, uint64_t gtt)
+static inline bool radeon_cs_memory_below_limit(struct si_screen *screen, struct radeon_cmdbuf *cs,
+ uint64_t vram, uint64_t gtt)
{
- vram += cs->used_vram;
- gtt += cs->used_gart;
+ vram += cs->used_vram;
+ gtt += cs->used_gart;
- /* Anything that goes above the VRAM size should go to GTT. */
- if (vram > screen->info.vram_size)
- gtt += vram - screen->info.vram_size;
+ /* Anything that goes above the VRAM size should go to GTT. */
+ if (vram > screen->info.vram_size)
+ gtt += vram - screen->info.vram_size;
- /* Now we just need to check if we have enough GTT. */
- return gtt < screen->info.gart_size * 0.7;
+ /* Now we just need to check if we have enough GTT. */
+ return gtt < screen->info.gart_size * 0.7;
}
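As a worked example of the check above (illustrative numbers only): with vram_size = 8 GiB and gart_size = 8 GiB, a buffer list asking for 9 GiB of VRAM and 1 GiB of GTT is counted as 8 GiB of VRAM plus 2 GiB of GTT; 2 GiB is under the 0.7 * 8 GiB ≈ 5.6 GiB cutoff, so the function returns true and the caller does not need to flush.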
/**
* The buffer list becomes empty after every context flush and must be
* rebuilt.
*/
-static inline void radeon_add_to_buffer_list(struct si_context *sctx,
- struct radeon_cmdbuf *cs,
- struct si_resource *bo,
- enum radeon_bo_usage usage,
- enum radeon_bo_priority priority)
+static inline void radeon_add_to_buffer_list(struct si_context *sctx, struct radeon_cmdbuf *cs,
+ struct si_resource *bo, enum radeon_bo_usage usage,
+ enum radeon_bo_priority priority)
{
- assert(usage);
- sctx->ws->cs_add_buffer(
- cs, bo->buf,
- (enum radeon_bo_usage)(usage | RADEON_USAGE_SYNCHRONIZED),
- bo->domains, priority);
+ assert(usage);
+ sctx->ws->cs_add_buffer(cs, bo->buf, (enum radeon_bo_usage)(usage | RADEON_USAGE_SYNCHRONIZED),
+ bo->domains, priority);
}
/**
* - if shader resource "enabled_mask" is not up-to-date or there is
* a different constraint disallowing a context flush
*/
-static inline void
-radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
- struct si_resource *bo,
- enum radeon_bo_usage usage,
- enum radeon_bo_priority priority,
- bool check_mem)
+static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
+ struct si_resource *bo,
+ enum radeon_bo_usage usage,
+ enum radeon_bo_priority priority,
+ bool check_mem)
{
- if (check_mem &&
- !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs,
- sctx->vram + bo->vram_usage,
- sctx->gtt + bo->gart_usage))
- si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ if (check_mem &&
+ !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs, sctx->vram + bo->vram_usage,
+ sctx->gtt + bo->gart_usage))
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority);
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority);
}
static inline bool si_compute_prim_discard_enabled(struct si_context *sctx)
{
- return sctx->prim_discard_vertex_count_threshold != UINT_MAX;
+ return sctx->prim_discard_vertex_count_threshold != UINT_MAX;
}
static inline unsigned si_get_wave_size(struct si_screen *sscreen,
- enum pipe_shader_type shader_type,
- bool ngg, bool es, bool prim_discard_cs)
+ enum pipe_shader_type shader_type, bool ngg, bool es,
+ bool prim_discard_cs)
{
- if (shader_type == PIPE_SHADER_COMPUTE)
- return sscreen->compute_wave_size;
- else if (shader_type == PIPE_SHADER_FRAGMENT)
- return sscreen->ps_wave_size;
- else if ((shader_type == PIPE_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */
- (shader_type == PIPE_SHADER_VERTEX && es && !ngg) ||
- (shader_type == PIPE_SHADER_TESS_EVAL && es && !ngg) ||
- (shader_type == PIPE_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */
- return 64;
- else
- return sscreen->ge_wave_size;
+ if (shader_type == PIPE_SHADER_COMPUTE)
+ return sscreen->compute_wave_size;
+ else if (shader_type == PIPE_SHADER_FRAGMENT)
+ return sscreen->ps_wave_size;
+ else if ((shader_type == PIPE_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */
+ (shader_type == PIPE_SHADER_VERTEX && es && !ngg) ||
+ (shader_type == PIPE_SHADER_TESS_EVAL && es && !ngg) ||
+ (shader_type == PIPE_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */
+ return 64;
+ else
+ return sscreen->ge_wave_size;
}
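A few concrete outcomes of the selection above, as a sketch (a gfx10-like screen with ge_wave_size = 32 is assumed purely for illustration):
/* Illustrative only:
 *   PIPE_SHADER_COMPUTE,  any flags               -> sscreen->compute_wave_size
 *   PIPE_SHADER_FRAGMENT, any flags               -> sscreen->ps_wave_size
 *   PIPE_SHADER_GEOMETRY, ngg = false             -> 64 (legacy GS is Wave64-only)
 *   PIPE_SHADER_VERTEX,   es = true, ngg = false  -> 64 (legacy ES is Wave64-only)
 *   PIPE_SHADER_GEOMETRY, ngg = true              -> sscreen->ge_wave_size (32 here)
 */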
static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
{
- return si_get_wave_size(shader->selector->screen, shader->selector->type,
- shader->key.as_ngg, shader->key.as_es,
- shader->key.opt.vs_as_prim_discard_cs);
+ return si_get_wave_size(shader->selector->screen, shader->selector->type, shader->key.as_ngg,
+ shader->key.as_es, shader->key.opt.vs_as_prim_discard_cs);
}
-#define PRINT_ERR(fmt, args...) \
- fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
+#define PRINT_ERR(fmt, args...) \
+ fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
#endif
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "util/u_memory.h"
#include "si_pipe.h"
#include "sid.h"
+#include "util/u_memory.h"
void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode)
{
- state->last_opcode = opcode;
- state->last_pm4 = state->ndw++;
+ state->last_opcode = opcode;
+ state->last_pm4 = state->ndw++;
}
void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw)
{
- state->pm4[state->ndw++] = dw;
+ state->pm4[state->ndw++] = dw;
}
void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate)
{
- unsigned count;
- count = state->ndw - state->last_pm4 - 2;
- state->pm4[state->last_pm4] =
- PKT3(state->last_opcode, count, predicate);
+ unsigned count;
+ count = state->ndw - state->last_pm4 - 2;
+ state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate);
- assert(state->ndw <= SI_PM4_MAX_DW);
+ assert(state->ndw <= SI_PM4_MAX_DW);
}
void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val)
{
- unsigned opcode;
+ unsigned opcode;
- if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
- opcode = PKT3_SET_CONFIG_REG;
- reg -= SI_CONFIG_REG_OFFSET;
+ if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
+ opcode = PKT3_SET_CONFIG_REG;
+ reg -= SI_CONFIG_REG_OFFSET;
- } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
- opcode = PKT3_SET_SH_REG;
- reg -= SI_SH_REG_OFFSET;
+ } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
+ opcode = PKT3_SET_SH_REG;
+ reg -= SI_SH_REG_OFFSET;
- } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
- opcode = PKT3_SET_CONTEXT_REG;
- reg -= SI_CONTEXT_REG_OFFSET;
+ } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
+ opcode = PKT3_SET_CONTEXT_REG;
+ reg -= SI_CONTEXT_REG_OFFSET;
- } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
- opcode = PKT3_SET_UCONFIG_REG;
- reg -= CIK_UCONFIG_REG_OFFSET;
+ } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
+ opcode = PKT3_SET_UCONFIG_REG;
+ reg -= CIK_UCONFIG_REG_OFFSET;
- } else {
- PRINT_ERR("Invalid register offset %08x!\n", reg);
- return;
- }
+ } else {
+ PRINT_ERR("Invalid register offset %08x!\n", reg);
+ return;
+ }
- reg >>= 2;
+ reg >>= 2;
- if (opcode != state->last_opcode || reg != (state->last_reg + 1)) {
- si_pm4_cmd_begin(state, opcode);
- si_pm4_cmd_add(state, reg);
- }
+ if (opcode != state->last_opcode || reg != (state->last_reg + 1)) {
+ si_pm4_cmd_begin(state, opcode);
+ si_pm4_cmd_add(state, reg);
+ }
- state->last_reg = reg;
- si_pm4_cmd_add(state, val);
- si_pm4_cmd_end(state, false);
+ state->last_reg = reg;
+ si_pm4_cmd_add(state, val);
+ si_pm4_cmd_end(state, false);
}
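A minimal usage sketch for the builder above (R_028A40_VGT_GS_MODE from sid.h is only an example register; allocation-failure handling is reduced to a NULL check):
/* Builds one SET_CONTEXT_REG packet; writes to consecutive register offsets
 * would be appended to the same packet via the last_opcode/last_reg tracking. */
struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
if (pm4) {
   si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, 0);
   /* later, si_pm4_emit(sctx, pm4) copies pm4->pm4[0..ndw) into the gfx CS */
}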
-void si_pm4_add_bo(struct si_pm4_state *state,
- struct si_resource *bo,
- enum radeon_bo_usage usage,
- enum radeon_bo_priority priority)
+void si_pm4_add_bo(struct si_pm4_state *state, struct si_resource *bo, enum radeon_bo_usage usage,
+ enum radeon_bo_priority priority)
{
- unsigned idx = state->nbo++;
- assert(idx < SI_PM4_MAX_BO);
+ unsigned idx = state->nbo++;
+ assert(idx < SI_PM4_MAX_BO);
- si_resource_reference(&state->bo[idx], bo);
- state->bo_usage[idx] = usage;
- state->bo_priority[idx] = priority;
+ si_resource_reference(&state->bo[idx], bo);
+ state->bo_usage[idx] = usage;
+ state->bo_priority[idx] = priority;
}
void si_pm4_clear_state(struct si_pm4_state *state)
{
- for (int i = 0; i < state->nbo; ++i)
- si_resource_reference(&state->bo[i], NULL);
- si_resource_reference(&state->indirect_buffer, NULL);
- state->nbo = 0;
- state->ndw = 0;
+ for (int i = 0; i < state->nbo; ++i)
+ si_resource_reference(&state->bo[i], NULL);
+ si_resource_reference(&state->indirect_buffer, NULL);
+ state->nbo = 0;
+ state->ndw = 0;
}
-void si_pm4_free_state(struct si_context *sctx,
- struct si_pm4_state *state,
- unsigned idx)
+void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx)
{
- if (!state)
- return;
+ if (!state)
+ return;
- if (idx != ~0 && sctx->emitted.array[idx] == state) {
- sctx->emitted.array[idx] = NULL;
- }
+ if (idx != ~0 && sctx->emitted.array[idx] == state) {
+ sctx->emitted.array[idx] = NULL;
+ }
- si_pm4_clear_state(state);
- FREE(state);
+ si_pm4_clear_state(state);
+ FREE(state);
}
void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
- for (int i = 0; i < state->nbo; ++i) {
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, state->bo[i],
- state->bo_usage[i], state->bo_priority[i]);
- }
-
- if (!state->indirect_buffer) {
- radeon_emit_array(cs, state->pm4, state->ndw);
- } else {
- struct si_resource *ib = state->indirect_buffer;
-
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, ib,
- RADEON_USAGE_READ,
- RADEON_PRIO_IB2);
-
- radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
- radeon_emit(cs, ib->gpu_address);
- radeon_emit(cs, ib->gpu_address >> 32);
- radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff);
- }
-
- if (state->atom.emit)
- state->atom.emit(sctx);
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ for (int i = 0; i < state->nbo; ++i) {
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, state->bo[i], state->bo_usage[i],
+ state->bo_priority[i]);
+ }
+
+ if (!state->indirect_buffer) {
+ radeon_emit_array(cs, state->pm4, state->ndw);
+ } else {
+ struct si_resource *ib = state->indirect_buffer;
+
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, ib, RADEON_USAGE_READ, RADEON_PRIO_IB2);
+
+ radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
+ radeon_emit(cs, ib->gpu_address);
+ radeon_emit(cs, ib->gpu_address >> 32);
+ radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff);
+ }
+
+ if (state->atom.emit)
+ state->atom.emit(sctx);
}
void si_pm4_reset_emitted(struct si_context *sctx)
{
- memset(&sctx->emitted, 0, sizeof(sctx->emitted));
- sctx->dirty_states |= u_bit_consecutive(0, SI_NUM_STATES);
+ memset(&sctx->emitted, 0, sizeof(sctx->emitted));
+ sctx->dirty_states |= u_bit_consecutive(0, SI_NUM_STATES);
}
-void si_pm4_upload_indirect_buffer(struct si_context *sctx,
- struct si_pm4_state *state)
+void si_pm4_upload_indirect_buffer(struct si_context *sctx, struct si_pm4_state *state)
{
- struct pipe_screen *screen = sctx->b.screen;
- unsigned aligned_ndw = align(state->ndw, 8);
-
- /* only supported on GFX7 and later */
- if (sctx->chip_class < GFX7)
- return;
-
- assert(state->ndw);
- assert(aligned_ndw <= SI_PM4_MAX_DW);
-
- si_resource_reference(&state->indirect_buffer, NULL);
- /* TODO: this hangs with 1024 or higher alignment on GFX9. */
- state->indirect_buffer =
- si_aligned_buffer_create(screen, 0,
- PIPE_USAGE_DEFAULT, aligned_ndw * 4,
- 256);
- if (!state->indirect_buffer)
- return;
-
- /* Pad the IB to 8 DWs to meet CP fetch alignment requirements. */
- if (sctx->screen->info.gfx_ib_pad_with_type2) {
- for (int i = state->ndw; i < aligned_ndw; i++)
- state->pm4[i] = 0x80000000; /* type2 nop packet */
- } else {
- for (int i = state->ndw; i < aligned_ndw; i++)
- state->pm4[i] = 0xffff1000; /* type3 nop packet */
- }
-
- pipe_buffer_write(&sctx->b, &state->indirect_buffer->b.b,
- 0, aligned_ndw *4, state->pm4);
+ struct pipe_screen *screen = sctx->b.screen;
+ unsigned aligned_ndw = align(state->ndw, 8);
+
+ /* only supported on GFX7 and later */
+ if (sctx->chip_class < GFX7)
+ return;
+
+ assert(state->ndw);
+ assert(aligned_ndw <= SI_PM4_MAX_DW);
+
+ si_resource_reference(&state->indirect_buffer, NULL);
+ /* TODO: this hangs with 1024 or higher alignment on GFX9. */
+ state->indirect_buffer =
+ si_aligned_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, aligned_ndw * 4, 256);
+ if (!state->indirect_buffer)
+ return;
+
+ /* Pad the IB to 8 DWs to meet CP fetch alignment requirements. */
+ if (sctx->screen->info.gfx_ib_pad_with_type2) {
+ for (int i = state->ndw; i < aligned_ndw; i++)
+ state->pm4[i] = 0x80000000; /* type2 nop packet */
+ } else {
+ for (int i = state->ndw; i < aligned_ndw; i++)
+ state->pm4[i] = 0xffff1000; /* type3 nop packet */
+ }
+
+ pipe_buffer_write(&sctx->b, &state->indirect_buffer->b.b, 0, aligned_ndw * 4, state->pm4);
}
#include "radeon/radeon_winsys.h"
-#define SI_PM4_MAX_DW 176
-#define SI_PM4_MAX_BO 3
+#define SI_PM4_MAX_DW 176
+#define SI_PM4_MAX_BO 3
// forward defines
struct si_context;
* command buffer (AKA indirect buffer, AKA IB, AKA command stream, AKA CS).
*/
struct si_atom {
- void (*emit)(struct si_context *ctx);
+ void (*emit)(struct si_context *ctx);
};
-struct si_pm4_state
-{
- /* optional indirect buffer */
- struct si_resource *indirect_buffer;
+struct si_pm4_state {
+ /* optional indirect buffer */
+ struct si_resource *indirect_buffer;
- /* PKT3_SET_*_REG handling */
- unsigned last_opcode;
- unsigned last_reg;
- unsigned last_pm4;
+ /* PKT3_SET_*_REG handling */
+ unsigned last_opcode;
+ unsigned last_reg;
+ unsigned last_pm4;
- /* commands for the DE */
- unsigned ndw;
- uint32_t pm4[SI_PM4_MAX_DW];
+ /* commands for the DE */
+ unsigned ndw;
+ uint32_t pm4[SI_PM4_MAX_DW];
- /* BO's referenced by this state */
- unsigned nbo;
- struct si_resource *bo[SI_PM4_MAX_BO];
- enum radeon_bo_usage bo_usage[SI_PM4_MAX_BO];
- enum radeon_bo_priority bo_priority[SI_PM4_MAX_BO];
+ /* BO's referenced by this state */
+ unsigned nbo;
+ struct si_resource *bo[SI_PM4_MAX_BO];
+ enum radeon_bo_usage bo_usage[SI_PM4_MAX_BO];
+ enum radeon_bo_priority bo_priority[SI_PM4_MAX_BO];
- /* For shader states only */
- struct si_shader *shader;
- struct si_atom atom;
+ /* For shader states only */
+ struct si_shader *shader;
+ struct si_atom atom;
};
void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode);
void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate);
void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val);
-void si_pm4_add_bo(struct si_pm4_state *state,
- struct si_resource *bo,
- enum radeon_bo_usage usage,
- enum radeon_bo_priority priority);
-void si_pm4_upload_indirect_buffer(struct si_context *sctx,
- struct si_pm4_state *state);
+void si_pm4_add_bo(struct si_pm4_state *state, struct si_resource *bo, enum radeon_bo_usage usage,
+ enum radeon_bo_priority priority);
+void si_pm4_upload_indirect_buffer(struct si_context *sctx, struct si_pm4_state *state);
void si_pm4_clear_state(struct si_pm4_state *state);
-void si_pm4_free_state(struct si_context *sctx,
- struct si_pm4_state *state,
- unsigned idx);
+void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx);
void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state);
void si_pm4_reset_emitted(struct si_context *sctx);
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_pipe.h"
#include "si_query.h"
-#include "util/u_memory.h"
-#include "util/u_upload_mgr.h"
+
+#include "amd/common/sid.h"
+#include "si_pipe.h"
#include "util/os_time.h"
+#include "util/u_memory.h"
#include "util/u_suballoc.h"
-#include "amd/common/sid.h"
+#include "util/u_upload_mgr.h"
static const struct si_query_ops query_hw_ops;
struct si_hw_query_params {
- unsigned start_offset;
- unsigned end_offset;
- unsigned fence_offset;
- unsigned pair_stride;
- unsigned pair_count;
+ unsigned start_offset;
+ unsigned end_offset;
+ unsigned fence_offset;
+ unsigned pair_stride;
+ unsigned pair_count;
};
/* Queries without buffer handling or suspend/resume. */
struct si_query_sw {
- struct si_query b;
+ struct si_query b;
- uint64_t begin_result;
- uint64_t end_result;
+ uint64_t begin_result;
+ uint64_t end_result;
- uint64_t begin_time;
- uint64_t end_time;
+ uint64_t begin_time;
+ uint64_t end_time;
- /* Fence for GPU_FINISHED. */
- struct pipe_fence_handle *fence;
+ /* Fence for GPU_FINISHED. */
+ struct pipe_fence_handle *fence;
};
-static void si_query_sw_destroy(struct si_context *sctx,
- struct si_query *squery)
+static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery)
{
- struct si_query_sw *query = (struct si_query_sw *)squery;
+ struct si_query_sw *query = (struct si_query_sw *)squery;
- sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
- FREE(query);
+ sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
+ FREE(query);
}
static enum radeon_value_id winsys_id_from_type(unsigned type)
{
- switch (type) {
- case SI_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
- case SI_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
- case SI_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
- case SI_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
- case SI_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
- case SI_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
- case SI_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
- case SI_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
- case SI_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER;
- case SI_QUERY_GFX_IB_SIZE: return RADEON_GFX_IB_SIZE_COUNTER;
- case SI_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
- case SI_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
- case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
- case SI_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
- case SI_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
- case SI_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
- case SI_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
- case SI_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
- case SI_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
- case SI_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME;
- default: unreachable("query type does not correspond to winsys id");
- }
+ switch (type) {
+ case SI_QUERY_REQUESTED_VRAM:
+ return RADEON_REQUESTED_VRAM_MEMORY;
+ case SI_QUERY_REQUESTED_GTT:
+ return RADEON_REQUESTED_GTT_MEMORY;
+ case SI_QUERY_MAPPED_VRAM:
+ return RADEON_MAPPED_VRAM;
+ case SI_QUERY_MAPPED_GTT:
+ return RADEON_MAPPED_GTT;
+ case SI_QUERY_BUFFER_WAIT_TIME:
+ return RADEON_BUFFER_WAIT_TIME_NS;
+ case SI_QUERY_NUM_MAPPED_BUFFERS:
+ return RADEON_NUM_MAPPED_BUFFERS;
+ case SI_QUERY_NUM_GFX_IBS:
+ return RADEON_NUM_GFX_IBS;
+ case SI_QUERY_NUM_SDMA_IBS:
+ return RADEON_NUM_SDMA_IBS;
+ case SI_QUERY_GFX_BO_LIST_SIZE:
+ return RADEON_GFX_BO_LIST_COUNTER;
+ case SI_QUERY_GFX_IB_SIZE:
+ return RADEON_GFX_IB_SIZE_COUNTER;
+ case SI_QUERY_NUM_BYTES_MOVED:
+ return RADEON_NUM_BYTES_MOVED;
+ case SI_QUERY_NUM_EVICTIONS:
+ return RADEON_NUM_EVICTIONS;
+ case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
+ return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
+ case SI_QUERY_VRAM_USAGE:
+ return RADEON_VRAM_USAGE;
+ case SI_QUERY_VRAM_VIS_USAGE:
+ return RADEON_VRAM_VIS_USAGE;
+ case SI_QUERY_GTT_USAGE:
+ return RADEON_GTT_USAGE;
+ case SI_QUERY_GPU_TEMPERATURE:
+ return RADEON_GPU_TEMPERATURE;
+ case SI_QUERY_CURRENT_GPU_SCLK:
+ return RADEON_CURRENT_SCLK;
+ case SI_QUERY_CURRENT_GPU_MCLK:
+ return RADEON_CURRENT_MCLK;
+ case SI_QUERY_CS_THREAD_BUSY:
+ return RADEON_CS_THREAD_TIME;
+ default:
+ unreachable("query type does not correspond to winsys id");
+ }
}
static int64_t si_finish_dma_get_cpu_time(struct si_context *sctx)
{
- struct pipe_fence_handle *fence = NULL;
+ struct pipe_fence_handle *fence = NULL;
- si_flush_dma_cs(sctx, 0, &fence);
- if (fence) {
- sctx->ws->fence_wait(sctx->ws, fence, PIPE_TIMEOUT_INFINITE);
- sctx->ws->fence_reference(&fence, NULL);
- }
+ si_flush_dma_cs(sctx, 0, &fence);
+ if (fence) {
+ sctx->ws->fence_wait(sctx->ws, fence, PIPE_TIMEOUT_INFINITE);
+ sctx->ws->fence_reference(&fence, NULL);
+ }
- return os_time_get_nano();
+ return os_time_get_nano();
}
-static bool si_query_sw_begin(struct si_context *sctx,
- struct si_query *squery)
+static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
{
- struct si_query_sw *query = (struct si_query_sw *)squery;
- enum radeon_value_id ws_id;
-
- switch(query->b.type) {
- case PIPE_QUERY_TIMESTAMP_DISJOINT:
- case PIPE_QUERY_GPU_FINISHED:
- break;
- case SI_QUERY_TIME_ELAPSED_SDMA_SI:
- query->begin_result = si_finish_dma_get_cpu_time(sctx);
- break;
- case SI_QUERY_DRAW_CALLS:
- query->begin_result = sctx->num_draw_calls;
- break;
- case SI_QUERY_DECOMPRESS_CALLS:
- query->begin_result = sctx->num_decompress_calls;
- break;
- case SI_QUERY_MRT_DRAW_CALLS:
- query->begin_result = sctx->num_mrt_draw_calls;
- break;
- case SI_QUERY_PRIM_RESTART_CALLS:
- query->begin_result = sctx->num_prim_restart_calls;
- break;
- case SI_QUERY_SPILL_DRAW_CALLS:
- query->begin_result = sctx->num_spill_draw_calls;
- break;
- case SI_QUERY_COMPUTE_CALLS:
- query->begin_result = sctx->num_compute_calls;
- break;
- case SI_QUERY_SPILL_COMPUTE_CALLS:
- query->begin_result = sctx->num_spill_compute_calls;
- break;
- case SI_QUERY_DMA_CALLS:
- query->begin_result = sctx->num_dma_calls;
- break;
- case SI_QUERY_CP_DMA_CALLS:
- query->begin_result = sctx->num_cp_dma_calls;
- break;
- case SI_QUERY_NUM_VS_FLUSHES:
- query->begin_result = sctx->num_vs_flushes;
- break;
- case SI_QUERY_NUM_PS_FLUSHES:
- query->begin_result = sctx->num_ps_flushes;
- break;
- case SI_QUERY_NUM_CS_FLUSHES:
- query->begin_result = sctx->num_cs_flushes;
- break;
- case SI_QUERY_NUM_CB_CACHE_FLUSHES:
- query->begin_result = sctx->num_cb_cache_flushes;
- break;
- case SI_QUERY_NUM_DB_CACHE_FLUSHES:
- query->begin_result = sctx->num_db_cache_flushes;
- break;
- case SI_QUERY_NUM_L2_INVALIDATES:
- query->begin_result = sctx->num_L2_invalidates;
- break;
- case SI_QUERY_NUM_L2_WRITEBACKS:
- query->begin_result = sctx->num_L2_writebacks;
- break;
- case SI_QUERY_NUM_RESIDENT_HANDLES:
- query->begin_result = sctx->num_resident_handles;
- break;
- case SI_QUERY_TC_OFFLOADED_SLOTS:
- query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
- break;
- case SI_QUERY_TC_DIRECT_SLOTS:
- query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
- break;
- case SI_QUERY_TC_NUM_SYNCS:
- query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
- break;
- case SI_QUERY_REQUESTED_VRAM:
- case SI_QUERY_REQUESTED_GTT:
- case SI_QUERY_MAPPED_VRAM:
- case SI_QUERY_MAPPED_GTT:
- case SI_QUERY_VRAM_USAGE:
- case SI_QUERY_VRAM_VIS_USAGE:
- case SI_QUERY_GTT_USAGE:
- case SI_QUERY_GPU_TEMPERATURE:
- case SI_QUERY_CURRENT_GPU_SCLK:
- case SI_QUERY_CURRENT_GPU_MCLK:
- case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
- case SI_QUERY_NUM_MAPPED_BUFFERS:
- query->begin_result = 0;
- break;
- case SI_QUERY_BUFFER_WAIT_TIME:
- case SI_QUERY_GFX_IB_SIZE:
- case SI_QUERY_NUM_GFX_IBS:
- case SI_QUERY_NUM_SDMA_IBS:
- case SI_QUERY_NUM_BYTES_MOVED:
- case SI_QUERY_NUM_EVICTIONS:
- case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
- enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
- query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
- break;
- }
- case SI_QUERY_GFX_BO_LIST_SIZE:
- ws_id = winsys_id_from_type(query->b.type);
- query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
- query->begin_time = sctx->ws->query_value(sctx->ws,
- RADEON_NUM_GFX_IBS);
- break;
- case SI_QUERY_CS_THREAD_BUSY:
- ws_id = winsys_id_from_type(query->b.type);
- query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
- query->begin_time = os_time_get_nano();
- break;
- case SI_QUERY_GALLIUM_THREAD_BUSY:
- query->begin_result =
- sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
- query->begin_time = os_time_get_nano();
- break;
- case SI_QUERY_GPU_LOAD:
- case SI_QUERY_GPU_SHADERS_BUSY:
- case SI_QUERY_GPU_TA_BUSY:
- case SI_QUERY_GPU_GDS_BUSY:
- case SI_QUERY_GPU_VGT_BUSY:
- case SI_QUERY_GPU_IA_BUSY:
- case SI_QUERY_GPU_SX_BUSY:
- case SI_QUERY_GPU_WD_BUSY:
- case SI_QUERY_GPU_BCI_BUSY:
- case SI_QUERY_GPU_SC_BUSY:
- case SI_QUERY_GPU_PA_BUSY:
- case SI_QUERY_GPU_DB_BUSY:
- case SI_QUERY_GPU_CP_BUSY:
- case SI_QUERY_GPU_CB_BUSY:
- case SI_QUERY_GPU_SDMA_BUSY:
- case SI_QUERY_GPU_PFP_BUSY:
- case SI_QUERY_GPU_MEQ_BUSY:
- case SI_QUERY_GPU_ME_BUSY:
- case SI_QUERY_GPU_SURF_SYNC_BUSY:
- case SI_QUERY_GPU_CP_DMA_BUSY:
- case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
- query->begin_result = si_begin_counter(sctx->screen,
- query->b.type);
- break;
- case SI_QUERY_NUM_COMPILATIONS:
- query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
- break;
- case SI_QUERY_NUM_SHADERS_CREATED:
- query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
- break;
- case SI_QUERY_LIVE_SHADER_CACHE_HITS:
- query->begin_result = sctx->screen->live_shader_cache.hits;
- break;
- case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
- query->begin_result = sctx->screen->live_shader_cache.misses;
- break;
- case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
- query->begin_result = sctx->screen->num_memory_shader_cache_hits;
- break;
- case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
- query->begin_result = sctx->screen->num_memory_shader_cache_misses;
- break;
- case SI_QUERY_DISK_SHADER_CACHE_HITS:
- query->begin_result = sctx->screen->num_disk_shader_cache_hits;
- break;
- case SI_QUERY_DISK_SHADER_CACHE_MISSES:
- query->begin_result = sctx->screen->num_disk_shader_cache_misses;
- break;
- case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
- query->begin_result = sctx->compute_num_verts_accepted;
- break;
- case SI_QUERY_PD_NUM_PRIMS_REJECTED:
- query->begin_result = sctx->compute_num_verts_rejected;
- break;
- case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
- query->begin_result = sctx->compute_num_verts_ineligible;
- break;
- case SI_QUERY_GPIN_ASIC_ID:
- case SI_QUERY_GPIN_NUM_SIMD:
- case SI_QUERY_GPIN_NUM_RB:
- case SI_QUERY_GPIN_NUM_SPI:
- case SI_QUERY_GPIN_NUM_SE:
- break;
- default:
- unreachable("si_query_sw_begin: bad query type");
- }
-
- return true;
+ struct si_query_sw *query = (struct si_query_sw *)squery;
+ enum radeon_value_id ws_id;
+
+ switch (query->b.type) {
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ case PIPE_QUERY_GPU_FINISHED:
+ break;
+ case SI_QUERY_TIME_ELAPSED_SDMA_SI:
+ query->begin_result = si_finish_dma_get_cpu_time(sctx);
+ break;
+ case SI_QUERY_DRAW_CALLS:
+ query->begin_result = sctx->num_draw_calls;
+ break;
+ case SI_QUERY_DECOMPRESS_CALLS:
+ query->begin_result = sctx->num_decompress_calls;
+ break;
+ case SI_QUERY_MRT_DRAW_CALLS:
+ query->begin_result = sctx->num_mrt_draw_calls;
+ break;
+ case SI_QUERY_PRIM_RESTART_CALLS:
+ query->begin_result = sctx->num_prim_restart_calls;
+ break;
+ case SI_QUERY_SPILL_DRAW_CALLS:
+ query->begin_result = sctx->num_spill_draw_calls;
+ break;
+ case SI_QUERY_COMPUTE_CALLS:
+ query->begin_result = sctx->num_compute_calls;
+ break;
+ case SI_QUERY_SPILL_COMPUTE_CALLS:
+ query->begin_result = sctx->num_spill_compute_calls;
+ break;
+ case SI_QUERY_DMA_CALLS:
+ query->begin_result = sctx->num_dma_calls;
+ break;
+ case SI_QUERY_CP_DMA_CALLS:
+ query->begin_result = sctx->num_cp_dma_calls;
+ break;
+ case SI_QUERY_NUM_VS_FLUSHES:
+ query->begin_result = sctx->num_vs_flushes;
+ break;
+ case SI_QUERY_NUM_PS_FLUSHES:
+ query->begin_result = sctx->num_ps_flushes;
+ break;
+ case SI_QUERY_NUM_CS_FLUSHES:
+ query->begin_result = sctx->num_cs_flushes;
+ break;
+ case SI_QUERY_NUM_CB_CACHE_FLUSHES:
+ query->begin_result = sctx->num_cb_cache_flushes;
+ break;
+ case SI_QUERY_NUM_DB_CACHE_FLUSHES:
+ query->begin_result = sctx->num_db_cache_flushes;
+ break;
+ case SI_QUERY_NUM_L2_INVALIDATES:
+ query->begin_result = sctx->num_L2_invalidates;
+ break;
+ case SI_QUERY_NUM_L2_WRITEBACKS:
+ query->begin_result = sctx->num_L2_writebacks;
+ break;
+ case SI_QUERY_NUM_RESIDENT_HANDLES:
+ query->begin_result = sctx->num_resident_handles;
+ break;
+ case SI_QUERY_TC_OFFLOADED_SLOTS:
+ query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
+ break;
+ case SI_QUERY_TC_DIRECT_SLOTS:
+ query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
+ break;
+ case SI_QUERY_TC_NUM_SYNCS:
+ query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
+ break;
+ case SI_QUERY_REQUESTED_VRAM:
+ case SI_QUERY_REQUESTED_GTT:
+ case SI_QUERY_MAPPED_VRAM:
+ case SI_QUERY_MAPPED_GTT:
+ case SI_QUERY_VRAM_USAGE:
+ case SI_QUERY_VRAM_VIS_USAGE:
+ case SI_QUERY_GTT_USAGE:
+ case SI_QUERY_GPU_TEMPERATURE:
+ case SI_QUERY_CURRENT_GPU_SCLK:
+ case SI_QUERY_CURRENT_GPU_MCLK:
+ case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
+ case SI_QUERY_NUM_MAPPED_BUFFERS:
+ query->begin_result = 0;
+ break;
+ case SI_QUERY_BUFFER_WAIT_TIME:
+ case SI_QUERY_GFX_IB_SIZE:
+ case SI_QUERY_NUM_GFX_IBS:
+ case SI_QUERY_NUM_SDMA_IBS:
+ case SI_QUERY_NUM_BYTES_MOVED:
+ case SI_QUERY_NUM_EVICTIONS:
+ case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
+ enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
+ query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
+ break;
+ }
+ case SI_QUERY_GFX_BO_LIST_SIZE:
+ ws_id = winsys_id_from_type(query->b.type);
+ query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
+ query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
+ break;
+ case SI_QUERY_CS_THREAD_BUSY:
+ ws_id = winsys_id_from_type(query->b.type);
+ query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
+ query->begin_time = os_time_get_nano();
+ break;
+ case SI_QUERY_GALLIUM_THREAD_BUSY:
+ query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
+ query->begin_time = os_time_get_nano();
+ break;
+ case SI_QUERY_GPU_LOAD:
+ case SI_QUERY_GPU_SHADERS_BUSY:
+ case SI_QUERY_GPU_TA_BUSY:
+ case SI_QUERY_GPU_GDS_BUSY:
+ case SI_QUERY_GPU_VGT_BUSY:
+ case SI_QUERY_GPU_IA_BUSY:
+ case SI_QUERY_GPU_SX_BUSY:
+ case SI_QUERY_GPU_WD_BUSY:
+ case SI_QUERY_GPU_BCI_BUSY:
+ case SI_QUERY_GPU_SC_BUSY:
+ case SI_QUERY_GPU_PA_BUSY:
+ case SI_QUERY_GPU_DB_BUSY:
+ case SI_QUERY_GPU_CP_BUSY:
+ case SI_QUERY_GPU_CB_BUSY:
+ case SI_QUERY_GPU_SDMA_BUSY:
+ case SI_QUERY_GPU_PFP_BUSY:
+ case SI_QUERY_GPU_MEQ_BUSY:
+ case SI_QUERY_GPU_ME_BUSY:
+ case SI_QUERY_GPU_SURF_SYNC_BUSY:
+ case SI_QUERY_GPU_CP_DMA_BUSY:
+ case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
+ query->begin_result = si_begin_counter(sctx->screen, query->b.type);
+ break;
+ case SI_QUERY_NUM_COMPILATIONS:
+ query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
+ break;
+ case SI_QUERY_NUM_SHADERS_CREATED:
+ query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
+ break;
+ case SI_QUERY_LIVE_SHADER_CACHE_HITS:
+ query->begin_result = sctx->screen->live_shader_cache.hits;
+ break;
+ case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
+ query->begin_result = sctx->screen->live_shader_cache.misses;
+ break;
+ case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
+ query->begin_result = sctx->screen->num_memory_shader_cache_hits;
+ break;
+ case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
+ query->begin_result = sctx->screen->num_memory_shader_cache_misses;
+ break;
+ case SI_QUERY_DISK_SHADER_CACHE_HITS:
+ query->begin_result = sctx->screen->num_disk_shader_cache_hits;
+ break;
+ case SI_QUERY_DISK_SHADER_CACHE_MISSES:
+ query->begin_result = sctx->screen->num_disk_shader_cache_misses;
+ break;
+ case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
+ query->begin_result = sctx->compute_num_verts_accepted;
+ break;
+ case SI_QUERY_PD_NUM_PRIMS_REJECTED:
+ query->begin_result = sctx->compute_num_verts_rejected;
+ break;
+ case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
+ query->begin_result = sctx->compute_num_verts_ineligible;
+ break;
+ case SI_QUERY_GPIN_ASIC_ID:
+ case SI_QUERY_GPIN_NUM_SIMD:
+ case SI_QUERY_GPIN_NUM_RB:
+ case SI_QUERY_GPIN_NUM_SPI:
+ case SI_QUERY_GPIN_NUM_SE:
+ break;
+ default:
+ unreachable("si_query_sw_begin: bad query type");
+ }
+
+ return true;
}
-static bool si_query_sw_end(struct si_context *sctx,
- struct si_query *squery)
+static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
{
- struct si_query_sw *query = (struct si_query_sw *)squery;
- enum radeon_value_id ws_id;
-
- switch(query->b.type) {
- case PIPE_QUERY_TIMESTAMP_DISJOINT:
- break;
- case PIPE_QUERY_GPU_FINISHED:
- sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
- break;
- case SI_QUERY_TIME_ELAPSED_SDMA_SI:
- query->end_result = si_finish_dma_get_cpu_time(sctx);
- break;
- case SI_QUERY_DRAW_CALLS:
- query->end_result = sctx->num_draw_calls;
- break;
- case SI_QUERY_DECOMPRESS_CALLS:
- query->end_result = sctx->num_decompress_calls;
- break;
- case SI_QUERY_MRT_DRAW_CALLS:
- query->end_result = sctx->num_mrt_draw_calls;
- break;
- case SI_QUERY_PRIM_RESTART_CALLS:
- query->end_result = sctx->num_prim_restart_calls;
- break;
- case SI_QUERY_SPILL_DRAW_CALLS:
- query->end_result = sctx->num_spill_draw_calls;
- break;
- case SI_QUERY_COMPUTE_CALLS:
- query->end_result = sctx->num_compute_calls;
- break;
- case SI_QUERY_SPILL_COMPUTE_CALLS:
- query->end_result = sctx->num_spill_compute_calls;
- break;
- case SI_QUERY_DMA_CALLS:
- query->end_result = sctx->num_dma_calls;
- break;
- case SI_QUERY_CP_DMA_CALLS:
- query->end_result = sctx->num_cp_dma_calls;
- break;
- case SI_QUERY_NUM_VS_FLUSHES:
- query->end_result = sctx->num_vs_flushes;
- break;
- case SI_QUERY_NUM_PS_FLUSHES:
- query->end_result = sctx->num_ps_flushes;
- break;
- case SI_QUERY_NUM_CS_FLUSHES:
- query->end_result = sctx->num_cs_flushes;
- break;
- case SI_QUERY_NUM_CB_CACHE_FLUSHES:
- query->end_result = sctx->num_cb_cache_flushes;
- break;
- case SI_QUERY_NUM_DB_CACHE_FLUSHES:
- query->end_result = sctx->num_db_cache_flushes;
- break;
- case SI_QUERY_NUM_L2_INVALIDATES:
- query->end_result = sctx->num_L2_invalidates;
- break;
- case SI_QUERY_NUM_L2_WRITEBACKS:
- query->end_result = sctx->num_L2_writebacks;
- break;
- case SI_QUERY_NUM_RESIDENT_HANDLES:
- query->end_result = sctx->num_resident_handles;
- break;
- case SI_QUERY_TC_OFFLOADED_SLOTS:
- query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
- break;
- case SI_QUERY_TC_DIRECT_SLOTS:
- query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
- break;
- case SI_QUERY_TC_NUM_SYNCS:
- query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
- break;
- case SI_QUERY_REQUESTED_VRAM:
- case SI_QUERY_REQUESTED_GTT:
- case SI_QUERY_MAPPED_VRAM:
- case SI_QUERY_MAPPED_GTT:
- case SI_QUERY_VRAM_USAGE:
- case SI_QUERY_VRAM_VIS_USAGE:
- case SI_QUERY_GTT_USAGE:
- case SI_QUERY_GPU_TEMPERATURE:
- case SI_QUERY_CURRENT_GPU_SCLK:
- case SI_QUERY_CURRENT_GPU_MCLK:
- case SI_QUERY_BUFFER_WAIT_TIME:
- case SI_QUERY_GFX_IB_SIZE:
- case SI_QUERY_NUM_MAPPED_BUFFERS:
- case SI_QUERY_NUM_GFX_IBS:
- case SI_QUERY_NUM_SDMA_IBS:
- case SI_QUERY_NUM_BYTES_MOVED:
- case SI_QUERY_NUM_EVICTIONS:
- case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
- enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
- query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
- break;
- }
- case SI_QUERY_GFX_BO_LIST_SIZE:
- ws_id = winsys_id_from_type(query->b.type);
- query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
- query->end_time = sctx->ws->query_value(sctx->ws,
- RADEON_NUM_GFX_IBS);
- break;
- case SI_QUERY_CS_THREAD_BUSY:
- ws_id = winsys_id_from_type(query->b.type);
- query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
- query->end_time = os_time_get_nano();
- break;
- case SI_QUERY_GALLIUM_THREAD_BUSY:
- query->end_result =
- sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
- query->end_time = os_time_get_nano();
- break;
- case SI_QUERY_GPU_LOAD:
- case SI_QUERY_GPU_SHADERS_BUSY:
- case SI_QUERY_GPU_TA_BUSY:
- case SI_QUERY_GPU_GDS_BUSY:
- case SI_QUERY_GPU_VGT_BUSY:
- case SI_QUERY_GPU_IA_BUSY:
- case SI_QUERY_GPU_SX_BUSY:
- case SI_QUERY_GPU_WD_BUSY:
- case SI_QUERY_GPU_BCI_BUSY:
- case SI_QUERY_GPU_SC_BUSY:
- case SI_QUERY_GPU_PA_BUSY:
- case SI_QUERY_GPU_DB_BUSY:
- case SI_QUERY_GPU_CP_BUSY:
- case SI_QUERY_GPU_CB_BUSY:
- case SI_QUERY_GPU_SDMA_BUSY:
- case SI_QUERY_GPU_PFP_BUSY:
- case SI_QUERY_GPU_MEQ_BUSY:
- case SI_QUERY_GPU_ME_BUSY:
- case SI_QUERY_GPU_SURF_SYNC_BUSY:
- case SI_QUERY_GPU_CP_DMA_BUSY:
- case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
- query->end_result = si_end_counter(sctx->screen,
- query->b.type,
- query->begin_result);
- query->begin_result = 0;
- break;
- case SI_QUERY_NUM_COMPILATIONS:
- query->end_result = p_atomic_read(&sctx->screen->num_compilations);
- break;
- case SI_QUERY_NUM_SHADERS_CREATED:
- query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
- break;
- case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
- query->end_result = sctx->last_tex_ps_draw_ratio;
- break;
- case SI_QUERY_LIVE_SHADER_CACHE_HITS:
- query->end_result = sctx->screen->live_shader_cache.hits;
- break;
- case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
- query->end_result = sctx->screen->live_shader_cache.misses;
- break;
- case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
- query->end_result = sctx->screen->num_memory_shader_cache_hits;
- break;
- case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
- query->end_result = sctx->screen->num_memory_shader_cache_misses;
- break;
- case SI_QUERY_DISK_SHADER_CACHE_HITS:
- query->end_result = sctx->screen->num_disk_shader_cache_hits;
- break;
- case SI_QUERY_DISK_SHADER_CACHE_MISSES:
- query->end_result = sctx->screen->num_disk_shader_cache_misses;
- break;
- case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
- query->end_result = sctx->compute_num_verts_accepted;
- break;
- case SI_QUERY_PD_NUM_PRIMS_REJECTED:
- query->end_result = sctx->compute_num_verts_rejected;
- break;
- case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
- query->end_result = sctx->compute_num_verts_ineligible;
- break;
- case SI_QUERY_GPIN_ASIC_ID:
- case SI_QUERY_GPIN_NUM_SIMD:
- case SI_QUERY_GPIN_NUM_RB:
- case SI_QUERY_GPIN_NUM_SPI:
- case SI_QUERY_GPIN_NUM_SE:
- break;
- default:
- unreachable("si_query_sw_end: bad query type");
- }
-
- return true;
+ struct si_query_sw *query = (struct si_query_sw *)squery;
+ enum radeon_value_id ws_id;
+
+ switch (query->b.type) {
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ break;
+ case PIPE_QUERY_GPU_FINISHED:
+ sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
+ break;
+ case SI_QUERY_TIME_ELAPSED_SDMA_SI:
+ query->end_result = si_finish_dma_get_cpu_time(sctx);
+ break;
+ case SI_QUERY_DRAW_CALLS:
+ query->end_result = sctx->num_draw_calls;
+ break;
+ case SI_QUERY_DECOMPRESS_CALLS:
+ query->end_result = sctx->num_decompress_calls;
+ break;
+ case SI_QUERY_MRT_DRAW_CALLS:
+ query->end_result = sctx->num_mrt_draw_calls;
+ break;
+ case SI_QUERY_PRIM_RESTART_CALLS:
+ query->end_result = sctx->num_prim_restart_calls;
+ break;
+ case SI_QUERY_SPILL_DRAW_CALLS:
+ query->end_result = sctx->num_spill_draw_calls;
+ break;
+ case SI_QUERY_COMPUTE_CALLS:
+ query->end_result = sctx->num_compute_calls;
+ break;
+ case SI_QUERY_SPILL_COMPUTE_CALLS:
+ query->end_result = sctx->num_spill_compute_calls;
+ break;
+ case SI_QUERY_DMA_CALLS:
+ query->end_result = sctx->num_dma_calls;
+ break;
+ case SI_QUERY_CP_DMA_CALLS:
+ query->end_result = sctx->num_cp_dma_calls;
+ break;
+ case SI_QUERY_NUM_VS_FLUSHES:
+ query->end_result = sctx->num_vs_flushes;
+ break;
+ case SI_QUERY_NUM_PS_FLUSHES:
+ query->end_result = sctx->num_ps_flushes;
+ break;
+ case SI_QUERY_NUM_CS_FLUSHES:
+ query->end_result = sctx->num_cs_flushes;
+ break;
+ case SI_QUERY_NUM_CB_CACHE_FLUSHES:
+ query->end_result = sctx->num_cb_cache_flushes;
+ break;
+ case SI_QUERY_NUM_DB_CACHE_FLUSHES:
+ query->end_result = sctx->num_db_cache_flushes;
+ break;
+ case SI_QUERY_NUM_L2_INVALIDATES:
+ query->end_result = sctx->num_L2_invalidates;
+ break;
+ case SI_QUERY_NUM_L2_WRITEBACKS:
+ query->end_result = sctx->num_L2_writebacks;
+ break;
+ case SI_QUERY_NUM_RESIDENT_HANDLES:
+ query->end_result = sctx->num_resident_handles;
+ break;
+ case SI_QUERY_TC_OFFLOADED_SLOTS:
+ query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
+ break;
+ case SI_QUERY_TC_DIRECT_SLOTS:
+ query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
+ break;
+ case SI_QUERY_TC_NUM_SYNCS:
+ query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
+ break;
+ case SI_QUERY_REQUESTED_VRAM:
+ case SI_QUERY_REQUESTED_GTT:
+ case SI_QUERY_MAPPED_VRAM:
+ case SI_QUERY_MAPPED_GTT:
+ case SI_QUERY_VRAM_USAGE:
+ case SI_QUERY_VRAM_VIS_USAGE:
+ case SI_QUERY_GTT_USAGE:
+ case SI_QUERY_GPU_TEMPERATURE:
+ case SI_QUERY_CURRENT_GPU_SCLK:
+ case SI_QUERY_CURRENT_GPU_MCLK:
+ case SI_QUERY_BUFFER_WAIT_TIME:
+ case SI_QUERY_GFX_IB_SIZE:
+ case SI_QUERY_NUM_MAPPED_BUFFERS:
+ case SI_QUERY_NUM_GFX_IBS:
+ case SI_QUERY_NUM_SDMA_IBS:
+ case SI_QUERY_NUM_BYTES_MOVED:
+ case SI_QUERY_NUM_EVICTIONS:
+ case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
+ enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
+ query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
+ break;
+ }
+ case SI_QUERY_GFX_BO_LIST_SIZE:
+ ws_id = winsys_id_from_type(query->b.type);
+ query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
+ query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
+ break;
+ case SI_QUERY_CS_THREAD_BUSY:
+ ws_id = winsys_id_from_type(query->b.type);
+ query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
+ query->end_time = os_time_get_nano();
+ break;
+ case SI_QUERY_GALLIUM_THREAD_BUSY:
+ query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
+ query->end_time = os_time_get_nano();
+ break;
+ case SI_QUERY_GPU_LOAD:
+ case SI_QUERY_GPU_SHADERS_BUSY:
+ case SI_QUERY_GPU_TA_BUSY:
+ case SI_QUERY_GPU_GDS_BUSY:
+ case SI_QUERY_GPU_VGT_BUSY:
+ case SI_QUERY_GPU_IA_BUSY:
+ case SI_QUERY_GPU_SX_BUSY:
+ case SI_QUERY_GPU_WD_BUSY:
+ case SI_QUERY_GPU_BCI_BUSY:
+ case SI_QUERY_GPU_SC_BUSY:
+ case SI_QUERY_GPU_PA_BUSY:
+ case SI_QUERY_GPU_DB_BUSY:
+ case SI_QUERY_GPU_CP_BUSY:
+ case SI_QUERY_GPU_CB_BUSY:
+ case SI_QUERY_GPU_SDMA_BUSY:
+ case SI_QUERY_GPU_PFP_BUSY:
+ case SI_QUERY_GPU_MEQ_BUSY:
+ case SI_QUERY_GPU_ME_BUSY:
+ case SI_QUERY_GPU_SURF_SYNC_BUSY:
+ case SI_QUERY_GPU_CP_DMA_BUSY:
+ case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
+ query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result);
+ query->begin_result = 0;
+ break;
+ case SI_QUERY_NUM_COMPILATIONS:
+ query->end_result = p_atomic_read(&sctx->screen->num_compilations);
+ break;
+ case SI_QUERY_NUM_SHADERS_CREATED:
+ query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
+ break;
+ case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
+ query->end_result = sctx->last_tex_ps_draw_ratio;
+ break;
+ case SI_QUERY_LIVE_SHADER_CACHE_HITS:
+ query->end_result = sctx->screen->live_shader_cache.hits;
+ break;
+ case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
+ query->end_result = sctx->screen->live_shader_cache.misses;
+ break;
+ case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
+ query->end_result = sctx->screen->num_memory_shader_cache_hits;
+ break;
+ case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
+ query->end_result = sctx->screen->num_memory_shader_cache_misses;
+ break;
+ case SI_QUERY_DISK_SHADER_CACHE_HITS:
+ query->end_result = sctx->screen->num_disk_shader_cache_hits;
+ break;
+ case SI_QUERY_DISK_SHADER_CACHE_MISSES:
+ query->end_result = sctx->screen->num_disk_shader_cache_misses;
+ break;
+ case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
+ query->end_result = sctx->compute_num_verts_accepted;
+ break;
+ case SI_QUERY_PD_NUM_PRIMS_REJECTED:
+ query->end_result = sctx->compute_num_verts_rejected;
+ break;
+ case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
+ query->end_result = sctx->compute_num_verts_ineligible;
+ break;
+ case SI_QUERY_GPIN_ASIC_ID:
+ case SI_QUERY_GPIN_NUM_SIMD:
+ case SI_QUERY_GPIN_NUM_RB:
+ case SI_QUERY_GPIN_NUM_SPI:
+ case SI_QUERY_GPIN_NUM_SE:
+ break;
+ default:
+ unreachable("si_query_sw_end: bad query type");
+ }
+
+ return true;
}
-static bool si_query_sw_get_result(struct si_context *sctx,
- struct si_query *squery,
- bool wait,
- union pipe_query_result *result)
+static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
+ union pipe_query_result *result)
{
- struct si_query_sw *query = (struct si_query_sw *)squery;
-
- switch (query->b.type) {
- case PIPE_QUERY_TIMESTAMP_DISJOINT:
- /* Convert from cycles per millisecond to cycles per second (Hz). */
- result->timestamp_disjoint.frequency =
- (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
- result->timestamp_disjoint.disjoint = false;
- return true;
- case PIPE_QUERY_GPU_FINISHED: {
- struct pipe_screen *screen = sctx->b.screen;
- struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;
-
- result->b = screen->fence_finish(screen, ctx, query->fence,
- wait ? PIPE_TIMEOUT_INFINITE : 0);
- return result->b;
- }
-
- case SI_QUERY_GFX_BO_LIST_SIZE:
- result->u64 = (query->end_result - query->begin_result) /
- (query->end_time - query->begin_time);
- return true;
- case SI_QUERY_CS_THREAD_BUSY:
- case SI_QUERY_GALLIUM_THREAD_BUSY:
- result->u64 = (query->end_result - query->begin_result) * 100 /
- (query->end_time - query->begin_time);
- return true;
- case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
- case SI_QUERY_PD_NUM_PRIMS_REJECTED:
- case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
- result->u64 = ((unsigned)query->end_result -
- (unsigned)query->begin_result) / 3;
- return true;
- case SI_QUERY_GPIN_ASIC_ID:
- result->u32 = 0;
- return true;
- case SI_QUERY_GPIN_NUM_SIMD:
- result->u32 = sctx->screen->info.num_good_compute_units;
- return true;
- case SI_QUERY_GPIN_NUM_RB:
- result->u32 = sctx->screen->info.num_render_backends;
- return true;
- case SI_QUERY_GPIN_NUM_SPI:
- result->u32 = 1; /* all supported chips have one SPI per SE */
- return true;
- case SI_QUERY_GPIN_NUM_SE:
- result->u32 = sctx->screen->info.max_se;
- return true;
- }
-
- result->u64 = query->end_result - query->begin_result;
-
- switch (query->b.type) {
- case SI_QUERY_BUFFER_WAIT_TIME:
- case SI_QUERY_GPU_TEMPERATURE:
- result->u64 /= 1000;
- break;
- case SI_QUERY_CURRENT_GPU_SCLK:
- case SI_QUERY_CURRENT_GPU_MCLK:
- result->u64 *= 1000000;
- break;
- }
-
- return true;
+ struct si_query_sw *query = (struct si_query_sw *)squery;
+
+ switch (query->b.type) {
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ /* Convert from cycles per millisecond to cycles per second (Hz). */
+ result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
+ result->timestamp_disjoint.disjoint = false;
+ return true;
+ case PIPE_QUERY_GPU_FINISHED: {
+ struct pipe_screen *screen = sctx->b.screen;
+ struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;
+
+ result->b = screen->fence_finish(screen, ctx, query->fence, wait ? PIPE_TIMEOUT_INFINITE : 0);
+ return result->b;
+ }
+
+ case SI_QUERY_GFX_BO_LIST_SIZE:
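+      /* end_time counts gfx IBs here (see si_query_sw_end), so the result is
+       * the average BO list size per gfx IB. */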
+ result->u64 =
+ (query->end_result - query->begin_result) / (query->end_time - query->begin_time);
+ return true;
+ case SI_QUERY_CS_THREAD_BUSY:
+ case SI_QUERY_GALLIUM_THREAD_BUSY:
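+      /* busy thread time over elapsed wall-clock time, expressed as a percentage */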
+ result->u64 =
+ (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
+ return true;
+ case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
+ case SI_QUERY_PD_NUM_PRIMS_REJECTED:
+ case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
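+      /* These counters track vertices (compute_num_verts_*); divide by 3 to
+       * express the result in primitives. */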
+ result->u64 = ((unsigned)query->end_result - (unsigned)query->begin_result) / 3;
+ return true;
+ case SI_QUERY_GPIN_ASIC_ID:
+ result->u32 = 0;
+ return true;
+ case SI_QUERY_GPIN_NUM_SIMD:
+ result->u32 = sctx->screen->info.num_good_compute_units;
+ return true;
+ case SI_QUERY_GPIN_NUM_RB:
+ result->u32 = sctx->screen->info.num_render_backends;
+ return true;
+ case SI_QUERY_GPIN_NUM_SPI:
+ result->u32 = 1; /* all supported chips have one SPI per SE */
+ return true;
+ case SI_QUERY_GPIN_NUM_SE:
+ result->u32 = sctx->screen->info.max_se;
+ return true;
+ }
+
+ result->u64 = query->end_result - query->begin_result;
+
+ switch (query->b.type) {
+ case SI_QUERY_BUFFER_WAIT_TIME:
+ case SI_QUERY_GPU_TEMPERATURE:
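+      /* scale down: the winsys reports these in finer units (ns / millidegrees) */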
+ result->u64 /= 1000;
+ break;
+ case SI_QUERY_CURRENT_GPU_SCLK:
+ case SI_QUERY_CURRENT_GPU_MCLK:
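+      /* clocks are reported in MHz; convert to Hz */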
+ result->u64 *= 1000000;
+ break;
+ }
+
+ return true;
}
-
-static const struct si_query_ops sw_query_ops = {
- .destroy = si_query_sw_destroy,
- .begin = si_query_sw_begin,
- .end = si_query_sw_end,
- .get_result = si_query_sw_get_result,
- .get_result_resource = NULL
-};
+static const struct si_query_ops sw_query_ops = {.destroy = si_query_sw_destroy,
+ .begin = si_query_sw_begin,
+ .end = si_query_sw_end,
+ .get_result = si_query_sw_get_result,
+ .get_result_resource = NULL};
static struct pipe_query *si_query_sw_create(unsigned query_type)
{
- struct si_query_sw *query;
+ struct si_query_sw *query;
- query = CALLOC_STRUCT(si_query_sw);
- if (!query)
- return NULL;
+ query = CALLOC_STRUCT(si_query_sw);
+ if (!query)
+ return NULL;
- query->b.type = query_type;
- query->b.ops = &sw_query_ops;
+ query->b.type = query_type;
+ query->b.ops = &sw_query_ops;
- return (struct pipe_query *)query;
+ return (struct pipe_query *)query;
}
void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer)
{
- struct si_query_buffer *prev = buffer->previous;
+ struct si_query_buffer *prev = buffer->previous;
- /* Release all query buffers. */
- while (prev) {
- struct si_query_buffer *qbuf = prev;
- prev = prev->previous;
- si_resource_reference(&qbuf->buf, NULL);
- FREE(qbuf);
- }
+ /* Release all query buffers. */
+ while (prev) {
+ struct si_query_buffer *qbuf = prev;
+ prev = prev->previous;
+ si_resource_reference(&qbuf->buf, NULL);
+ FREE(qbuf);
+ }
- si_resource_reference(&buffer->buf, NULL);
+ si_resource_reference(&buffer->buf, NULL);
}
void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer)
{
- /* Discard all query buffers except for the oldest. */
- while (buffer->previous) {
- struct si_query_buffer *qbuf = buffer->previous;
- buffer->previous = qbuf->previous;
-
- si_resource_reference(&buffer->buf, NULL);
- buffer->buf = qbuf->buf; /* move ownership */
- FREE(qbuf);
- }
- buffer->results_end = 0;
-
- if (!buffer->buf)
- return;
-
- /* Discard even the oldest buffer if it can't be mapped without a stall. */
- if (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
- !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
- si_resource_reference(&buffer->buf, NULL);
- } else {
- buffer->unprepared = true;
- }
+ /* Discard all query buffers except for the oldest. */
+ while (buffer->previous) {
+ struct si_query_buffer *qbuf = buffer->previous;
+ buffer->previous = qbuf->previous;
+
+ si_resource_reference(&buffer->buf, NULL);
+ buffer->buf = qbuf->buf; /* move ownership */
+ FREE(qbuf);
+ }
+ buffer->results_end = 0;
+
+ if (!buffer->buf)
+ return;
+
+ /* Discard even the oldest buffer if it can't be mapped without a stall. */
+ if (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
+ !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
+ si_resource_reference(&buffer->buf, NULL);
+ } else {
+ buffer->unprepared = true;
+ }
}
bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
- bool (*prepare_buffer)(struct si_context *, struct si_query_buffer*),
- unsigned size)
+ bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
+ unsigned size)
{
- bool unprepared = buffer->unprepared;
- buffer->unprepared = false;
-
- if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
- if (buffer->buf) {
- struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
- memcpy(qbuf, buffer, sizeof(*qbuf));
- buffer->previous = qbuf;
- }
- buffer->results_end = 0;
-
- /* Queries are normally read by the CPU after
- * being written by the gpu, hence staging is probably a good
- * usage pattern.
- */
- struct si_screen *screen = sctx->screen;
- unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
- buffer->buf = si_resource(
- pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
- if (unlikely(!buffer->buf))
- return false;
- unprepared = true;
- }
-
- if (unprepared && prepare_buffer) {
- if (unlikely(!prepare_buffer(sctx, buffer))) {
- si_resource_reference(&buffer->buf, NULL);
- return false;
- }
- }
-
- return true;
+ bool unprepared = buffer->unprepared;
+ buffer->unprepared = false;
+
+ if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
+ if (buffer->buf) {
+ struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
+ memcpy(qbuf, buffer, sizeof(*qbuf));
+ buffer->previous = qbuf;
+ }
+ buffer->results_end = 0;
+
+ /* Queries are normally read by the CPU after
+       * being written by the GPU, hence staging is probably a good
+ * usage pattern.
+ */
+ struct si_screen *screen = sctx->screen;
+ unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
+ buffer->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
+ if (unlikely(!buffer->buf))
+ return false;
+ unprepared = true;
+ }
+
+ if (unprepared && prepare_buffer) {
+ if (unlikely(!prepare_buffer(sctx, buffer))) {
+ si_resource_reference(&buffer->buf, NULL);
+ return false;
+ }
+ }
+
+ return true;
}
-
void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery)
{
- struct si_query_hw *query = (struct si_query_hw *)squery;
+ struct si_query_hw *query = (struct si_query_hw *)squery;
- si_query_buffer_destroy(sctx->screen, &query->buffer);
- si_resource_reference(&query->workaround_buf, NULL);
- FREE(squery);
+ si_query_buffer_destroy(sctx->screen, &query->buffer);
+ si_resource_reference(&query->workaround_buf, NULL);
+ FREE(squery);
}
-static bool si_query_hw_prepare_buffer(struct si_context *sctx,
- struct si_query_buffer *qbuf)
+static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf)
{
- static const struct si_query_hw si_query_hw_s;
- struct si_query_hw *query = container_of(qbuf, &si_query_hw_s, buffer);
- struct si_screen *screen = sctx->screen;
-
- /* The caller ensures that the buffer is currently unused by the GPU. */
- uint32_t *results = screen->ws->buffer_map(qbuf->buf->buf, NULL,
- PIPE_TRANSFER_WRITE |
- PIPE_TRANSFER_UNSYNCHRONIZED);
- if (!results)
- return false;
-
- memset(results, 0, qbuf->buf->b.b.width0);
-
- if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
- query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
- query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
- unsigned max_rbs = screen->info.num_render_backends;
- unsigned enabled_rb_mask = screen->info.enabled_rb_mask;
- unsigned num_results;
- unsigned i, j;
-
- /* Set top bits for unused backends. */
- num_results = qbuf->buf->b.b.width0 / query->result_size;
- for (j = 0; j < num_results; j++) {
- for (i = 0; i < max_rbs; i++) {
- if (!(enabled_rb_mask & (1<<i))) {
- results[(i * 4)+1] = 0x80000000;
- results[(i * 4)+3] = 0x80000000;
- }
- }
- results += 4 * max_rbs;
- }
- }
-
- return true;
+ static const struct si_query_hw si_query_hw_s;
+ struct si_query_hw *query = container_of(qbuf, &si_query_hw_s, buffer);
+ struct si_screen *screen = sctx->screen;
+
+ /* The caller ensures that the buffer is currently unused by the GPU. */
+ uint32_t *results = screen->ws->buffer_map(qbuf->buf->buf, NULL,
+ PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED);
+ if (!results)
+ return false;
+
+ memset(results, 0, qbuf->buf->b.b.width0);
+
+ if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
+ query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+ query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+ unsigned max_rbs = screen->info.num_render_backends;
+ unsigned enabled_rb_mask = screen->info.enabled_rb_mask;
+ unsigned num_results;
+ unsigned i, j;
+
+ /* Set top bits for unused backends. */
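+      /* The top bit of each 64-bit counter doubles as a "written" flag (see the
+       * status-bit test in si_query_read_result), so counters of disabled RBs
+       * are never treated as pending. */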
+ num_results = qbuf->buf->b.b.width0 / query->result_size;
+ for (j = 0; j < num_results; j++) {
+ for (i = 0; i < max_rbs; i++) {
+ if (!(enabled_rb_mask & (1 << i))) {
+ results[(i * 4) + 1] = 0x80000000;
+ results[(i * 4) + 3] = 0x80000000;
+ }
+ }
+ results += 4 * max_rbs;
+ }
+ }
+
+ return true;
}
-static void si_query_hw_get_result_resource(struct si_context *sctx,
- struct si_query *squery,
- bool wait,
- enum pipe_query_value_type result_type,
- int index,
- struct pipe_resource *resource,
- unsigned offset);
-
-static void si_query_hw_do_emit_start(struct si_context *sctx,
- struct si_query_hw *query,
- struct si_resource *buffer,
- uint64_t va);
-static void si_query_hw_do_emit_stop(struct si_context *sctx,
- struct si_query_hw *query,
- struct si_resource *buffer,
- uint64_t va);
-static void si_query_hw_add_result(struct si_screen *sscreen,
- struct si_query_hw *, void *buffer,
- union pipe_query_result *result);
-static void si_query_hw_clear_result(struct si_query_hw *,
- union pipe_query_result *);
+static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
+ bool wait, enum pipe_query_value_type result_type,
+ int index, struct pipe_resource *resource,
+ unsigned offset);
+
+static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
+ struct si_resource *buffer, uint64_t va);
+static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
+ struct si_resource *buffer, uint64_t va);
+static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *, void *buffer,
+ union pipe_query_result *result);
+static void si_query_hw_clear_result(struct si_query_hw *, union pipe_query_result *);
static struct si_query_hw_ops query_hw_default_hw_ops = {
- .prepare_buffer = si_query_hw_prepare_buffer,
- .emit_start = si_query_hw_do_emit_start,
- .emit_stop = si_query_hw_do_emit_stop,
- .clear_result = si_query_hw_clear_result,
- .add_result = si_query_hw_add_result,
+ .prepare_buffer = si_query_hw_prepare_buffer,
+ .emit_start = si_query_hw_do_emit_start,
+ .emit_stop = si_query_hw_do_emit_stop,
+ .clear_result = si_query_hw_clear_result,
+ .add_result = si_query_hw_add_result,
};
-static struct pipe_query *si_query_hw_create(struct si_screen *sscreen,
- unsigned query_type,
- unsigned index)
+static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type,
+ unsigned index)
{
- struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
- if (!query)
- return NULL;
-
- query->b.type = query_type;
- query->b.ops = &query_hw_ops;
- query->ops = &query_hw_default_hw_ops;
-
- switch (query_type) {
- case PIPE_QUERY_OCCLUSION_COUNTER:
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
- query->result_size = 16 * sscreen->info.num_render_backends;
- query->result_size += 16; /* for the fence + alignment */
- query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
- break;
- case SI_QUERY_TIME_ELAPSED_SDMA:
- /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */
- query->result_size = 64;
- break;
- case PIPE_QUERY_TIME_ELAPSED:
- query->result_size = 24;
- query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
- break;
- case PIPE_QUERY_TIMESTAMP:
- query->result_size = 16;
- query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
- query->flags = SI_QUERY_HW_FLAG_NO_START;
- break;
- case PIPE_QUERY_PRIMITIVES_EMITTED:
- case PIPE_QUERY_PRIMITIVES_GENERATED:
- case PIPE_QUERY_SO_STATISTICS:
- case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
- query->result_size = 32;
- query->b.num_cs_dw_suspend = 6;
- query->stream = index;
- break;
- case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
- /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
- query->result_size = 32 * SI_MAX_STREAMS;
- query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
- break;
- case PIPE_QUERY_PIPELINE_STATISTICS:
- /* 11 values on GCN. */
- query->result_size = 11 * 16;
- query->result_size += 8; /* for the fence + alignment */
- query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
- break;
- default:
- assert(0);
- FREE(query);
- return NULL;
- }
-
- return (struct pipe_query *)query;
+ struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
+ if (!query)
+ return NULL;
+
+ query->b.type = query_type;
+ query->b.ops = &query_hw_ops;
+ query->ops = &query_hw_default_hw_ops;
+
+ switch (query_type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
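+      /* Each RB writes a begin/end pair of 64-bit ZPASS counters: 16 bytes per RB. */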
+ query->result_size = 16 * sscreen->info.num_render_backends;
+ query->result_size += 16; /* for the fence + alignment */
+ query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
+ break;
+ case SI_QUERY_TIME_ELAPSED_SDMA:
+ /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */
+ query->result_size = 64;
+ break;
+ case PIPE_QUERY_TIME_ELAPSED:
+ query->result_size = 24;
+ query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
+ break;
+ case PIPE_QUERY_TIMESTAMP:
+ query->result_size = 16;
+ query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
+ query->flags = SI_QUERY_HW_FLAG_NO_START;
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_SO_STATISTICS:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+ query->result_size = 32;
+ query->b.num_cs_dw_suspend = 6;
+ query->stream = index;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+ query->result_size = 32 * SI_MAX_STREAMS;
+ query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS:
+ /* 11 values on GCN. */
+ query->result_size = 11 * 16;
+ query->result_size += 8; /* for the fence + alignment */
+ query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
+ break;
+ default:
+ assert(0);
+ FREE(query);
+ return NULL;
+ }
+
+ return (struct pipe_query *)query;
}
-static void si_update_occlusion_query_state(struct si_context *sctx,
- unsigned type, int diff)
+static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff)
{
- if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
- type == PIPE_QUERY_OCCLUSION_PREDICATE ||
- type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
- bool old_enable = sctx->num_occlusion_queries != 0;
- bool old_perfect_enable =
- sctx->num_perfect_occlusion_queries != 0;
- bool enable, perfect_enable;
-
- sctx->num_occlusion_queries += diff;
- assert(sctx->num_occlusion_queries >= 0);
-
- if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
- sctx->num_perfect_occlusion_queries += diff;
- assert(sctx->num_perfect_occlusion_queries >= 0);
- }
-
- enable = sctx->num_occlusion_queries != 0;
- perfect_enable = sctx->num_perfect_occlusion_queries != 0;
-
- if (enable != old_enable || perfect_enable != old_perfect_enable) {
- si_set_occlusion_query_state(sctx, old_perfect_enable);
- }
- }
+ if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+ type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+ bool old_enable = sctx->num_occlusion_queries != 0;
+ bool old_perfect_enable = sctx->num_perfect_occlusion_queries != 0;
+ bool enable, perfect_enable;
+
+ sctx->num_occlusion_queries += diff;
+ assert(sctx->num_occlusion_queries >= 0);
+
+ if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
+ sctx->num_perfect_occlusion_queries += diff;
+ assert(sctx->num_perfect_occlusion_queries >= 0);
+ }
+
+ enable = sctx->num_occlusion_queries != 0;
+ perfect_enable = sctx->num_perfect_occlusion_queries != 0;
+
+ if (enable != old_enable || perfect_enable != old_perfect_enable) {
+ si_set_occlusion_query_state(sctx, old_perfect_enable);
+ }
+ }
}
static unsigned event_type_for_stream(unsigned stream)
{
- switch (stream) {
- default:
- case 0: return V_028A90_SAMPLE_STREAMOUTSTATS;
- case 1: return V_028A90_SAMPLE_STREAMOUTSTATS1;
- case 2: return V_028A90_SAMPLE_STREAMOUTSTATS2;
- case 3: return V_028A90_SAMPLE_STREAMOUTSTATS3;
- }
+ switch (stream) {
+ default:
+ case 0:
+ return V_028A90_SAMPLE_STREAMOUTSTATS;
+ case 1:
+ return V_028A90_SAMPLE_STREAMOUTSTATS1;
+ case 2:
+ return V_028A90_SAMPLE_STREAMOUTSTATS2;
+ case 3:
+ return V_028A90_SAMPLE_STREAMOUTSTATS3;
+ }
}
-static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va,
- unsigned stream)
+static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
{
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
- radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+ radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
}
-static void si_query_hw_do_emit_start(struct si_context *sctx,
- struct si_query_hw *query,
- struct si_resource *buffer,
- uint64_t va)
+static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
+ struct si_resource *buffer, uint64_t va)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
- switch (query->b.type) {
- case SI_QUERY_TIME_ELAPSED_SDMA:
- si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address);
- return;
- case PIPE_QUERY_OCCLUSION_COUNTER:
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
- break;
- case PIPE_QUERY_PRIMITIVES_EMITTED:
- case PIPE_QUERY_PRIMITIVES_GENERATED:
- case PIPE_QUERY_SO_STATISTICS:
- case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- emit_sample_streamout(cs, va, query->stream);
- break;
- case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
- for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
- emit_sample_streamout(cs, va + 32 * stream, stream);
- break;
- case PIPE_QUERY_TIME_ELAPSED:
- si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
- EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
- EOP_DATA_SEL_TIMESTAMP, NULL, va,
- 0, query->b.type);
- break;
- case PIPE_QUERY_PIPELINE_STATISTICS:
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
- break;
- default:
- assert(0);
- }
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
- RADEON_PRIO_QUERY);
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ switch (query->b.type) {
+ case SI_QUERY_TIME_ELAPSED_SDMA:
+ si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address);
+ return;
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_SO_STATISTICS:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ emit_sample_streamout(cs, va, query->stream);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
+ emit_sample_streamout(cs, va + 32 * stream, stream);
+ break;
+ case PIPE_QUERY_TIME_ELAPSED:
+ si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+ EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS:
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ break;
+ default:
+ assert(0);
+ }
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
+ RADEON_PRIO_QUERY);
}
-static void si_query_hw_emit_start(struct si_context *sctx,
- struct si_query_hw *query)
+static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query)
{
- uint64_t va;
+ uint64_t va;
- if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
- query->result_size))
- return;
+ if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size))
+ return;
- si_update_occlusion_query_state(sctx, query->b.type, 1);
- si_update_prims_generated_query_state(sctx, query->b.type, 1);
+ si_update_occlusion_query_state(sctx, query->b.type, 1);
+ si_update_prims_generated_query_state(sctx, query->b.type, 1);
- if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
- sctx->num_pipeline_stat_queries++;
+ if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
+ sctx->num_pipeline_stat_queries++;
- if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
- si_need_gfx_cs_space(sctx);
+ if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
+ si_need_gfx_cs_space(sctx);
- va = query->buffer.buf->gpu_address + query->buffer.results_end;
- query->ops->emit_start(sctx, query, query->buffer.buf, va);
+ va = query->buffer.buf->gpu_address + query->buffer.results_end;
+ query->ops->emit_start(sctx, query, query->buffer.buf, va);
}
-static void si_query_hw_do_emit_stop(struct si_context *sctx,
- struct si_query_hw *query,
- struct si_resource *buffer,
- uint64_t va)
+static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
+ struct si_resource *buffer, uint64_t va)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- uint64_t fence_va = 0;
-
- switch (query->b.type) {
- case SI_QUERY_TIME_ELAPSED_SDMA:
- si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address);
- return;
- case PIPE_QUERY_OCCLUSION_COUNTER:
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
- va += 8;
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
-
- fence_va = va + sctx->screen->info.num_render_backends * 16 - 8;
- break;
- case PIPE_QUERY_PRIMITIVES_EMITTED:
- case PIPE_QUERY_PRIMITIVES_GENERATED:
- case PIPE_QUERY_SO_STATISTICS:
- case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- va += 16;
- emit_sample_streamout(cs, va, query->stream);
- break;
- case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
- va += 16;
- for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
- emit_sample_streamout(cs, va + 32 * stream, stream);
- break;
- case PIPE_QUERY_TIME_ELAPSED:
- va += 8;
- /* fall through */
- case PIPE_QUERY_TIMESTAMP:
- si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
- EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
- EOP_DATA_SEL_TIMESTAMP, NULL, va,
- 0, query->b.type);
- fence_va = va + 8;
- break;
- case PIPE_QUERY_PIPELINE_STATISTICS: {
- unsigned sample_size = (query->result_size - 8) / 2;
-
- va += sample_size;
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
-
- fence_va = va + sample_size;
- break;
- }
- default:
- assert(0);
- }
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
- RADEON_PRIO_QUERY);
-
- if (fence_va) {
- si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
- EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
- EOP_DATA_SEL_VALUE_32BIT,
- query->buffer.buf, fence_va, 0x80000000,
- query->b.type);
- }
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ uint64_t fence_va = 0;
+
+ switch (query->b.type) {
+ case SI_QUERY_TIME_ELAPSED_SDMA:
+ si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address);
+ return;
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ va += 8;
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+
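+      /* The fence slot follows the per-RB result pairs; va was advanced by 8 above. */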
+ fence_va = va + sctx->screen->info.num_render_backends * 16 - 8;
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_SO_STATISTICS:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ va += 16;
+ emit_sample_streamout(cs, va, query->stream);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ va += 16;
+ for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
+ emit_sample_streamout(cs, va + 32 * stream, stream);
+ break;
+ case PIPE_QUERY_TIME_ELAPSED:
+ va += 8;
+ /* fall through */
+ case PIPE_QUERY_TIMESTAMP:
+ si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+ EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
+ fence_va = va + 8;
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS: {
+ unsigned sample_size = (query->result_size - 8) / 2;
+
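+      /* The buffer holds a begin and an end sample plus an 8-byte fence;
+       * advance va to the end sample. */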
+ va += sample_size;
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+
+ fence_va = va + sample_size;
+ break;
+ }
+ default:
+ assert(0);
+ }
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
+ RADEON_PRIO_QUERY);
+
+ if (fence_va) {
+ si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+ EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000,
+ query->b.type);
+ }
}
-static void si_query_hw_emit_stop(struct si_context *sctx,
- struct si_query_hw *query)
+static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query)
{
- uint64_t va;
+ uint64_t va;
- /* The queries which need begin already called this in begin_query. */
- if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
- si_need_gfx_cs_space(sctx);
- if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
- query->result_size))
- return;
- }
+   /* Queries that have a begin already did this in begin_query. */
+ if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
+ si_need_gfx_cs_space(sctx);
+ if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
+ query->result_size))
+ return;
+ }
- if (!query->buffer.buf)
- return; // previous buffer allocation failure
+ if (!query->buffer.buf)
+ return; // previous buffer allocation failure
- /* emit end query */
- va = query->buffer.buf->gpu_address + query->buffer.results_end;
+ /* emit end query */
+ va = query->buffer.buf->gpu_address + query->buffer.results_end;
- query->ops->emit_stop(sctx, query, query->buffer.buf, va);
+ query->ops->emit_stop(sctx, query, query->buffer.buf, va);
- query->buffer.results_end += query->result_size;
+ query->buffer.results_end += query->result_size;
- si_update_occlusion_query_state(sctx, query->b.type, -1);
- si_update_prims_generated_query_state(sctx, query->b.type, -1);
+ si_update_occlusion_query_state(sctx, query->b.type, -1);
+ si_update_prims_generated_query_state(sctx, query->b.type, -1);
- if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
- sctx->num_pipeline_stat_queries--;
+ if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
+ sctx->num_pipeline_stat_queries--;
}
-static void emit_set_predicate(struct si_context *ctx,
- struct si_resource *buf, uint64_t va,
- uint32_t op)
+static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va,
+ uint32_t op)
{
- struct radeon_cmdbuf *cs = ctx->gfx_cs;
-
- if (ctx->chip_class >= GFX9) {
- radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
- radeon_emit(cs, op);
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
- } else {
- radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
- radeon_emit(cs, va);
- radeon_emit(cs, op | ((va >> 32) & 0xFF));
- }
- radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_READ,
- RADEON_PRIO_QUERY);
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+
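+   /* GFX9+ uses a longer SET_PREDICATION packet with the operation in its own
+    * dword; older chips pack the op into the high VA dword. */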
+ if (ctx->chip_class >= GFX9) {
+ radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
+ radeon_emit(cs, op);
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ } else {
+ radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
+ radeon_emit(cs, va);
+ radeon_emit(cs, op | ((va >> 32) & 0xFF));
+ }
+ radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_READ, RADEON_PRIO_QUERY);
}
static void si_emit_query_predication(struct si_context *ctx)
{
- struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
- struct si_query_buffer *qbuf;
- uint32_t op;
- bool flag_wait, invert;
-
- if (!query)
- return;
-
- if (ctx->screen->use_ngg_streamout &&
- (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
- query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
- assert(!"not implemented");
- }
-
- invert = ctx->render_cond_invert;
- flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
- ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
-
- if (query->workaround_buf) {
- op = PRED_OP(PREDICATION_OP_BOOL64);
- } else {
- switch (query->b.type) {
- case PIPE_QUERY_OCCLUSION_COUNTER:
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
- op = PRED_OP(PREDICATION_OP_ZPASS);
- break;
- case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
- op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
- invert = !invert;
- break;
- default:
- assert(0);
- return;
- }
- }
-
- /* if true then invert, see GL_ARB_conditional_render_inverted */
- if (invert)
- op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
- else
- op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
-
- /* Use the value written by compute shader as a workaround. Note that
- * the wait flag does not apply in this predication mode.
- *
- * The shader outputs the result value to L2. Workarounds only affect GFX8
- * and later, where the CP reads data from L2, so we don't need an
- * additional flush.
- */
- if (query->workaround_buf) {
- uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
- emit_set_predicate(ctx, query->workaround_buf, va, op);
- return;
- }
-
- op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
-
- /* emit predicate packets for all data blocks */
- for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
- unsigned results_base = 0;
- uint64_t va_base = qbuf->buf->gpu_address;
-
- while (results_base < qbuf->results_end) {
- uint64_t va = va_base + results_base;
-
- if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
- for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
- emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
-
- /* set CONTINUE bit for all packets except the first */
- op |= PREDICATION_CONTINUE;
- }
- } else {
- emit_set_predicate(ctx, qbuf->buf, va, op);
- op |= PREDICATION_CONTINUE;
- }
-
- results_base += query->result_size;
- }
- }
+ struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
+ struct si_query_buffer *qbuf;
+ uint32_t op;
+ bool flag_wait, invert;
+
+ if (!query)
+ return;
+
+ if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
+ assert(!"not implemented");
+ }
+
+ invert = ctx->render_cond_invert;
+ flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
+ ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
+
+ if (query->workaround_buf) {
+ op = PRED_OP(PREDICATION_OP_BOOL64);
+ } else {
+ switch (query->b.type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ op = PRED_OP(PREDICATION_OP_ZPASS);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
+ invert = !invert;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+ }
+
+ /* if true then invert, see GL_ARB_conditional_render_inverted */
+ if (invert)
+ op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
+ else
+ op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
+
+   /* Use the value written by the compute shader as a workaround. Note that
+ * the wait flag does not apply in this predication mode.
+ *
+ * The shader outputs the result value to L2. Workarounds only affect GFX8
+ * and later, where the CP reads data from L2, so we don't need an
+ * additional flush.
+ */
+ if (query->workaround_buf) {
+ uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
+ emit_set_predicate(ctx, query->workaround_buf, va, op);
+ return;
+ }
+
+ op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
+
+ /* emit predicate packets for all data blocks */
+ for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+ unsigned results_base = 0;
+ uint64_t va_base = qbuf->buf->gpu_address;
+
+ while (results_base < qbuf->results_end) {
+ uint64_t va = va_base + results_base;
+
+ if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+ for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+ emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
+
+ /* set CONTINUE bit for all packets except the first */
+ op |= PREDICATION_CONTINUE;
+ }
+ } else {
+ emit_set_predicate(ctx, qbuf->buf, va, op);
+ op |= PREDICATION_CONTINUE;
+ }
+
+ results_base += query->result_size;
+ }
+ }
}
-static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
+static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type,
+ unsigned index)
{
- struct si_screen *sscreen =
- (struct si_screen *)ctx->screen;
-
- if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
- query_type == PIPE_QUERY_GPU_FINISHED ||
- (query_type >= PIPE_QUERY_DRIVER_SPECIFIC &&
- query_type != SI_QUERY_TIME_ELAPSED_SDMA))
- return si_query_sw_create(query_type);
-
- if (sscreen->use_ngg_streamout &&
- (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
- query_type == PIPE_QUERY_PRIMITIVES_GENERATED ||
- query_type == PIPE_QUERY_SO_STATISTICS ||
- query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
- query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
- return gfx10_sh_query_create(sscreen, query_type, index);
-
- return si_query_hw_create(sscreen, query_type, index);
+ struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+
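+   /* Purely CPU-side queries and most driver-specific ones take the software
+    * path, streamout queries use shader queries when NGG streamout is enabled,
+    * and everything else is a regular HW query. */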
+ if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED ||
+ (query_type >= PIPE_QUERY_DRIVER_SPECIFIC && query_type != SI_QUERY_TIME_ELAPSED_SDMA))
+ return si_query_sw_create(query_type);
+
+ if (sscreen->use_ngg_streamout &&
+ (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
+ query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS ||
+ query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
+ return gfx10_sh_query_create(sscreen, query_type, index);
+
+ return si_query_hw_create(sscreen, query_type, index);
}
static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_query *squery = (struct si_query *)query;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_query *squery = (struct si_query *)query;
- squery->ops->destroy(sctx, squery);
+ squery->ops->destroy(sctx, squery);
}
-static bool si_begin_query(struct pipe_context *ctx,
- struct pipe_query *query)
+static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_query *squery = (struct si_query *)query;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_query *squery = (struct si_query *)query;
- return squery->ops->begin(sctx, squery);
+ return squery->ops->begin(sctx, squery);
}
-bool si_query_hw_begin(struct si_context *sctx,
- struct si_query *squery)
+bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery)
{
- struct si_query_hw *query = (struct si_query_hw *)squery;
+ struct si_query_hw *query = (struct si_query_hw *)squery;
- if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
- assert(0);
- return false;
- }
+ if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
+ assert(0);
+ return false;
+ }
- if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
- si_query_buffer_reset(sctx, &query->buffer);
+ if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
+ si_query_buffer_reset(sctx, &query->buffer);
- si_resource_reference(&query->workaround_buf, NULL);
+ si_resource_reference(&query->workaround_buf, NULL);
- si_query_hw_emit_start(sctx, query);
- if (!query->buffer.buf)
- return false;
+ si_query_hw_emit_start(sctx, query);
+ if (!query->buffer.buf)
+ return false;
- list_addtail(&query->b.active_list, &sctx->active_queries);
- sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
- return true;
+ list_addtail(&query->b.active_list, &sctx->active_queries);
+ sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
+ return true;
}
static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_query *squery = (struct si_query *)query;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_query *squery = (struct si_query *)query;
- return squery->ops->end(sctx, squery);
+ return squery->ops->end(sctx, squery);
}
-bool si_query_hw_end(struct si_context *sctx,
- struct si_query *squery)
+bool si_query_hw_end(struct si_context *sctx, struct si_query *squery)
{
- struct si_query_hw *query = (struct si_query_hw *)squery;
+ struct si_query_hw *query = (struct si_query_hw *)squery;
- if (query->flags & SI_QUERY_HW_FLAG_NO_START)
- si_query_buffer_reset(sctx, &query->buffer);
+ if (query->flags & SI_QUERY_HW_FLAG_NO_START)
+ si_query_buffer_reset(sctx, &query->buffer);
- si_query_hw_emit_stop(sctx, query);
+ si_query_hw_emit_stop(sctx, query);
- if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
- list_delinit(&query->b.active_list);
- sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
- }
+ if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
+ list_delinit(&query->b.active_list);
+ sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
+ }
- if (!query->buffer.buf)
- return false;
+ if (!query->buffer.buf)
+ return false;
- return true;
+ return true;
}
-static void si_get_hw_query_params(struct si_context *sctx,
- struct si_query_hw *squery, int index,
- struct si_hw_query_params *params)
+static void si_get_hw_query_params(struct si_context *sctx, struct si_query_hw *squery, int index,
+ struct si_hw_query_params *params)
{
- unsigned max_rbs = sctx->screen->info.num_render_backends;
-
- params->pair_stride = 0;
- params->pair_count = 1;
-
- switch (squery->b.type) {
- case PIPE_QUERY_OCCLUSION_COUNTER:
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
- params->start_offset = 0;
- params->end_offset = 8;
- params->fence_offset = max_rbs * 16;
- params->pair_stride = 16;
- params->pair_count = max_rbs;
- break;
- case PIPE_QUERY_TIME_ELAPSED:
- params->start_offset = 0;
- params->end_offset = 8;
- params->fence_offset = 16;
- break;
- case PIPE_QUERY_TIMESTAMP:
- params->start_offset = 0;
- params->end_offset = 0;
- params->fence_offset = 8;
- break;
- case PIPE_QUERY_PRIMITIVES_EMITTED:
- params->start_offset = 8;
- params->end_offset = 24;
- params->fence_offset = params->end_offset + 4;
- break;
- case PIPE_QUERY_PRIMITIVES_GENERATED:
- params->start_offset = 0;
- params->end_offset = 16;
- params->fence_offset = params->end_offset + 4;
- break;
- case PIPE_QUERY_SO_STATISTICS:
- params->start_offset = 8 - index * 8;
- params->end_offset = 24 - index * 8;
- params->fence_offset = params->end_offset + 4;
- break;
- case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
- params->pair_count = SI_MAX_STREAMS;
- params->pair_stride = 32;
- case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- params->start_offset = 0;
- params->end_offset = 16;
-
- /* We can re-use the high dword of the last 64-bit value as a
- * fence: it is initialized as 0, and the high bit is set by
- * the write of the streamout stats event.
- */
- params->fence_offset = squery->result_size - 4;
- break;
- case PIPE_QUERY_PIPELINE_STATISTICS:
- {
- static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
- params->start_offset = offsets[index];
- params->end_offset = 88 + offsets[index];
- params->fence_offset = 2 * 88;
- break;
- }
- default:
- unreachable("si_get_hw_query_params unsupported");
- }
+ unsigned max_rbs = sctx->screen->info.num_render_backends;
+
+ params->pair_stride = 0;
+ params->pair_count = 1;
+
+ switch (squery->b.type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ params->start_offset = 0;
+ params->end_offset = 8;
+ params->fence_offset = max_rbs * 16;
+ params->pair_stride = 16;
+ params->pair_count = max_rbs;
+ break;
+ case PIPE_QUERY_TIME_ELAPSED:
+ params->start_offset = 0;
+ params->end_offset = 8;
+ params->fence_offset = 16;
+ break;
+ case PIPE_QUERY_TIMESTAMP:
+ params->start_offset = 0;
+ params->end_offset = 0;
+ params->fence_offset = 8;
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ params->start_offset = 8;
+ params->end_offset = 24;
+ params->fence_offset = params->end_offset + 4;
+ break;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ params->start_offset = 0;
+ params->end_offset = 16;
+ params->fence_offset = params->end_offset + 4;
+ break;
+ case PIPE_QUERY_SO_STATISTICS:
+ params->start_offset = 8 - index * 8;
+ params->end_offset = 24 - index * 8;
+ params->fence_offset = params->end_offset + 4;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ params->pair_count = SI_MAX_STREAMS;
+ params->pair_stride = 32;
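+      /* fall through */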
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ params->start_offset = 0;
+ params->end_offset = 16;
+
+ /* We can re-use the high dword of the last 64-bit value as a
+ * fence: it is initialized as 0, and the high bit is set by
+ * the write of the streamout stats event.
+ */
+ params->fence_offset = squery->result_size - 4;
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS: {
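+ /* Byte offsets of the begin values, indexed by the pipeline-statistics counter index;
+ * the matching end values start 88 bytes later. */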
+ static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
+ params->start_offset = offsets[index];
+ params->end_offset = 88 + offsets[index];
+ params->fence_offset = 2 * 88;
+ break;
+ }
+ default:
+ unreachable("si_get_hw_query_params unsupported");
+ }
}
static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index,
- bool test_status_bit)
+ bool test_status_bit)
{
- uint32_t *current_result = (uint32_t*)map;
- uint64_t start, end;
-
- start = (uint64_t)current_result[start_index] |
- (uint64_t)current_result[start_index+1] << 32;
- end = (uint64_t)current_result[end_index] |
- (uint64_t)current_result[end_index+1] << 32;
-
- if (!test_status_bit ||
- ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
- return end - start;
- }
- return 0;
+ uint32_t *current_result = (uint32_t *)map;
+ uint64_t start, end;
+
+ start = (uint64_t)current_result[start_index] | (uint64_t)current_result[start_index + 1] << 32;
+ end = (uint64_t)current_result[end_index] | (uint64_t)current_result[end_index + 1] << 32;
+
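+ /* Bit 63 of each value is only set once the GPU has written it, so with
+ * test_status_bit the difference is counted only when both samples are valid. */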
+ if (!test_status_bit || ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
+ return end - start;
+ }
+ return 0;
}
-static void si_query_hw_add_result(struct si_screen *sscreen,
- struct si_query_hw *query,
- void *buffer,
- union pipe_query_result *result)
+static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query,
+ void *buffer, union pipe_query_result *result)
{
- unsigned max_rbs = sscreen->info.num_render_backends;
-
- switch (query->b.type) {
- case PIPE_QUERY_OCCLUSION_COUNTER: {
- for (unsigned i = 0; i < max_rbs; ++i) {
- unsigned results_base = i * 16;
- result->u64 +=
- si_query_read_result(buffer + results_base, 0, 2, true);
- }
- break;
- }
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
- for (unsigned i = 0; i < max_rbs; ++i) {
- unsigned results_base = i * 16;
- result->b = result->b ||
- si_query_read_result(buffer + results_base, 0, 2, true) != 0;
- }
- break;
- }
- case PIPE_QUERY_TIME_ELAPSED:
- result->u64 += si_query_read_result(buffer, 0, 2, false);
- break;
- case SI_QUERY_TIME_ELAPSED_SDMA:
- result->u64 += si_query_read_result(buffer, 0, 32/4, false);
- break;
- case PIPE_QUERY_TIMESTAMP:
- result->u64 = *(uint64_t*)buffer;
- break;
- case PIPE_QUERY_PRIMITIVES_EMITTED:
- /* SAMPLE_STREAMOUTSTATS stores this structure:
- * {
- * u64 NumPrimitivesWritten;
- * u64 PrimitiveStorageNeeded;
- * }
- * We only need NumPrimitivesWritten here. */
- result->u64 += si_query_read_result(buffer, 2, 6, true);
- break;
- case PIPE_QUERY_PRIMITIVES_GENERATED:
- /* Here we read PrimitiveStorageNeeded. */
- result->u64 += si_query_read_result(buffer, 0, 4, true);
- break;
- case PIPE_QUERY_SO_STATISTICS:
- result->so_statistics.num_primitives_written +=
- si_query_read_result(buffer, 2, 6, true);
- result->so_statistics.primitives_storage_needed +=
- si_query_read_result(buffer, 0, 4, true);
- break;
- case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- result->b = result->b ||
- si_query_read_result(buffer, 2, 6, true) !=
- si_query_read_result(buffer, 0, 4, true);
- break;
- case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
- for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
- result->b = result->b ||
- si_query_read_result(buffer, 2, 6, true) !=
- si_query_read_result(buffer, 0, 4, true);
- buffer = (char *)buffer + 32;
- }
- break;
- case PIPE_QUERY_PIPELINE_STATISTICS:
- result->pipeline_statistics.ps_invocations +=
- si_query_read_result(buffer, 0, 22, false);
- result->pipeline_statistics.c_primitives +=
- si_query_read_result(buffer, 2, 24, false);
- result->pipeline_statistics.c_invocations +=
- si_query_read_result(buffer, 4, 26, false);
- result->pipeline_statistics.vs_invocations +=
- si_query_read_result(buffer, 6, 28, false);
- result->pipeline_statistics.gs_invocations +=
- si_query_read_result(buffer, 8, 30, false);
- result->pipeline_statistics.gs_primitives +=
- si_query_read_result(buffer, 10, 32, false);
- result->pipeline_statistics.ia_primitives +=
- si_query_read_result(buffer, 12, 34, false);
- result->pipeline_statistics.ia_vertices +=
- si_query_read_result(buffer, 14, 36, false);
- result->pipeline_statistics.hs_invocations +=
- si_query_read_result(buffer, 16, 38, false);
- result->pipeline_statistics.ds_invocations +=
- si_query_read_result(buffer, 18, 40, false);
- result->pipeline_statistics.cs_invocations +=
- si_query_read_result(buffer, 20, 42, false);
+ unsigned max_rbs = sscreen->info.num_render_backends;
+
+ switch (query->b.type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER: {
+ for (unsigned i = 0; i < max_rbs; ++i) {
+ unsigned results_base = i * 16;
+ result->u64 += si_query_read_result(buffer + results_base, 0, 2, true);
+ }
+ break;
+ }
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
+ for (unsigned i = 0; i < max_rbs; ++i) {
+ unsigned results_base = i * 16;
+ result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0;
+ }
+ break;
+ }
+ case PIPE_QUERY_TIME_ELAPSED:
+ result->u64 += si_query_read_result(buffer, 0, 2, false);
+ break;
+ case SI_QUERY_TIME_ELAPSED_SDMA:
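+ /* The SDMA begin/end timestamps are read from byte offsets 0 and 32 (dword indices 0 and 8). */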
+ result->u64 += si_query_read_result(buffer, 0, 32 / 4, false);
+ break;
+ case PIPE_QUERY_TIMESTAMP:
+ result->u64 = *(uint64_t *)buffer;
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ /* SAMPLE_STREAMOUTSTATS stores this structure:
+ * {
+ * u64 NumPrimitivesWritten;
+ * u64 PrimitiveStorageNeeded;
+ * }
+ * We only need NumPrimitivesWritten here. */
+ result->u64 += si_query_read_result(buffer, 2, 6, true);
+ break;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ /* Here we read PrimitiveStorageNeeded. */
+ result->u64 += si_query_read_result(buffer, 0, 4, true);
+ break;
+ case PIPE_QUERY_SO_STATISTICS:
+ result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true);
+ result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
+ si_query_read_result(buffer, 0, 4, true);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+ result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
+ si_query_read_result(buffer, 0, 4, true);
+ buffer = (char *)buffer + 32;
+ }
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS:
+ result->pipeline_statistics.ps_invocations += si_query_read_result(buffer, 0, 22, false);
+ result->pipeline_statistics.c_primitives += si_query_read_result(buffer, 2, 24, false);
+ result->pipeline_statistics.c_invocations += si_query_read_result(buffer, 4, 26, false);
+ result->pipeline_statistics.vs_invocations += si_query_read_result(buffer, 6, 28, false);
+ result->pipeline_statistics.gs_invocations += si_query_read_result(buffer, 8, 30, false);
+ result->pipeline_statistics.gs_primitives += si_query_read_result(buffer, 10, 32, false);
+ result->pipeline_statistics.ia_primitives += si_query_read_result(buffer, 12, 34, false);
+ result->pipeline_statistics.ia_vertices += si_query_read_result(buffer, 14, 36, false);
+ result->pipeline_statistics.hs_invocations += si_query_read_result(buffer, 16, 38, false);
+ result->pipeline_statistics.ds_invocations += si_query_read_result(buffer, 18, 40, false);
+ result->pipeline_statistics.cs_invocations += si_query_read_result(buffer, 20, 42, false);
#if 0 /* for testing */
printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
"DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
result->pipeline_statistics.ps_invocations,
result->pipeline_statistics.cs_invocations);
#endif
- break;
- default:
- assert(0);
- }
+ break;
+ default:
+ assert(0);
+ }
}
void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
{
- si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
+ si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
}
void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
{
- si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
+ si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
}
static const struct si_query_ops query_hw_ops = {
- .destroy = si_query_hw_destroy,
- .begin = si_query_hw_begin,
- .end = si_query_hw_end,
- .get_result = si_query_hw_get_result,
- .get_result_resource = si_query_hw_get_result_resource,
-
- .suspend = si_query_hw_suspend,
- .resume = si_query_hw_resume,
+ .destroy = si_query_hw_destroy,
+ .begin = si_query_hw_begin,
+ .end = si_query_hw_end,
+ .get_result = si_query_hw_get_result,
+ .get_result_resource = si_query_hw_get_result_resource,
+
+ .suspend = si_query_hw_suspend,
+ .resume = si_query_hw_resume,
};
-static bool si_get_query_result(struct pipe_context *ctx,
- struct pipe_query *query, bool wait,
- union pipe_query_result *result)
+static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait,
+ union pipe_query_result *result)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_query *squery = (struct si_query *)query;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_query *squery = (struct si_query *)query;
- return squery->ops->get_result(sctx, squery, wait, result);
+ return squery->ops->get_result(sctx, squery, wait, result);
}
-static void si_get_query_result_resource(struct pipe_context *ctx,
- struct pipe_query *query,
- bool wait,
- enum pipe_query_value_type result_type,
- int index,
- struct pipe_resource *resource,
- unsigned offset)
+static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query,
+ bool wait, enum pipe_query_value_type result_type,
+ int index, struct pipe_resource *resource, unsigned offset)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_query *squery = (struct si_query *)query;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_query *squery = (struct si_query *)query;
- squery->ops->get_result_resource(sctx, squery, wait, result_type, index,
- resource, offset);
+ squery->ops->get_result_resource(sctx, squery, wait, result_type, index, resource, offset);
}
-static void si_query_hw_clear_result(struct si_query_hw *query,
- union pipe_query_result *result)
+static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result)
{
- util_query_clear_result(result, query->b.type);
+ util_query_clear_result(result, query->b.type);
}
-bool si_query_hw_get_result(struct si_context *sctx,
- struct si_query *squery,
- bool wait, union pipe_query_result *result)
+bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
+ union pipe_query_result *result)
{
- struct si_screen *sscreen = sctx->screen;
- struct si_query_hw *query = (struct si_query_hw *)squery;
- struct si_query_buffer *qbuf;
-
- query->ops->clear_result(query, result);
-
- for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
- unsigned usage = PIPE_TRANSFER_READ |
- (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
- unsigned results_base = 0;
- void *map;
-
- if (squery->b.flushed)
- map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
- else
- map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
-
- if (!map)
- return false;
-
- while (results_base != qbuf->results_end) {
- query->ops->add_result(sscreen, query, map + results_base,
- result);
- results_base += query->result_size;
- }
- }
-
- /* Convert the time to expected units. */
- if (squery->type == PIPE_QUERY_TIME_ELAPSED ||
- squery->type == SI_QUERY_TIME_ELAPSED_SDMA ||
- squery->type == PIPE_QUERY_TIMESTAMP) {
- result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
- }
- return true;
+ struct si_screen *sscreen = sctx->screen;
+ struct si_query_hw *query = (struct si_query_hw *)squery;
+ struct si_query_buffer *qbuf;
+
+ query->ops->clear_result(query, result);
+
+ for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+ unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+ unsigned results_base = 0;
+ void *map;
+
+ if (squery->b.flushed)
+ map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+ else
+ map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+
+ if (!map)
+ return false;
+
+ while (results_base != qbuf->results_end) {
+ query->ops->add_result(sscreen, query, map + results_base, result);
+ results_base += query->result_size;
+ }
+ }
+
+ /* Convert the time to expected units. */
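+ /* clock_crystal_freq is in kHz, so this converts GPU ticks to nanoseconds. */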
+ if (squery->type == PIPE_QUERY_TIME_ELAPSED || squery->type == SI_QUERY_TIME_ELAPSED_SDMA ||
+ squery->type == PIPE_QUERY_TIMESTAMP) {
+ result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
+ }
+ return true;
}
-static void si_query_hw_get_result_resource(struct si_context *sctx,
- struct si_query *squery,
- bool wait,
- enum pipe_query_value_type result_type,
- int index,
- struct pipe_resource *resource,
- unsigned offset)
+static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
+ bool wait, enum pipe_query_value_type result_type,
+ int index, struct pipe_resource *resource,
+ unsigned offset)
{
- struct si_query_hw *query = (struct si_query_hw *)squery;
- struct si_query_buffer *qbuf;
- struct si_query_buffer *qbuf_prev;
- struct pipe_resource *tmp_buffer = NULL;
- unsigned tmp_buffer_offset = 0;
- struct si_qbo_state saved_state = {};
- struct pipe_grid_info grid = {};
- struct pipe_constant_buffer constant_buffer = {};
- struct pipe_shader_buffer ssbo[3];
- struct si_hw_query_params params;
- struct {
- uint32_t end_offset;
- uint32_t result_stride;
- uint32_t result_count;
- uint32_t config;
- uint32_t fence_offset;
- uint32_t pair_stride;
- uint32_t pair_count;
- } consts;
-
- if (!sctx->query_result_shader) {
- sctx->query_result_shader = si_create_query_result_cs(sctx);
- if (!sctx->query_result_shader)
- return;
- }
-
- if (query->buffer.previous) {
- u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
- &tmp_buffer_offset, &tmp_buffer);
- if (!tmp_buffer)
- return;
- }
-
- si_save_qbo_state(sctx, &saved_state);
-
- si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, &params);
- consts.end_offset = params.end_offset - params.start_offset;
- consts.fence_offset = params.fence_offset - params.start_offset;
- consts.result_stride = query->result_size;
- consts.pair_stride = params.pair_stride;
- consts.pair_count = params.pair_count;
-
- constant_buffer.buffer_size = sizeof(consts);
- constant_buffer.user_buffer = &consts;
-
- ssbo[1].buffer = tmp_buffer;
- ssbo[1].buffer_offset = tmp_buffer_offset;
- ssbo[1].buffer_size = 16;
-
- ssbo[2] = ssbo[1];
-
- sctx->b.bind_compute_state(&sctx->b, sctx->query_result_shader);
-
- grid.block[0] = 1;
- grid.block[1] = 1;
- grid.block[2] = 1;
- grid.grid[0] = 1;
- grid.grid[1] = 1;
- grid.grid[2] = 1;
-
- consts.config = 0;
- if (index < 0)
- consts.config |= 4;
- if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
- query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
- consts.config |= 8;
- else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
- query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
- consts.config |= 8 | 256;
- else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
- query->b.type == PIPE_QUERY_TIME_ELAPSED)
- consts.config |= 32;
-
- switch (result_type) {
- case PIPE_QUERY_TYPE_U64:
- case PIPE_QUERY_TYPE_I64:
- consts.config |= 64;
- break;
- case PIPE_QUERY_TYPE_I32:
- consts.config |= 128;
- break;
- case PIPE_QUERY_TYPE_U32:
- break;
- }
-
- sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
-
- for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
- if (query->b.type != PIPE_QUERY_TIMESTAMP) {
- qbuf_prev = qbuf->previous;
- consts.result_count = qbuf->results_end / query->result_size;
- consts.config &= ~3;
- if (qbuf != &query->buffer)
- consts.config |= 1;
- if (qbuf->previous)
- consts.config |= 2;
- } else {
- /* Only read the last timestamp. */
- qbuf_prev = NULL;
- consts.result_count = 0;
- consts.config |= 16;
- params.start_offset += qbuf->results_end - query->result_size;
- }
-
- sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
-
- ssbo[0].buffer = &qbuf->buf->b.b;
- ssbo[0].buffer_offset = params.start_offset;
- ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
-
- if (!qbuf->previous) {
- ssbo[2].buffer = resource;
- ssbo[2].buffer_offset = offset;
- ssbo[2].buffer_size = 8;
-
- si_resource(resource)->TC_L2_dirty = true;
- }
-
- sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo,
- 1 << 2);
-
- if (wait && qbuf == &query->buffer) {
- uint64_t va;
-
- /* Wait for result availability. Wait only for readiness
- * of the last entry, since the fence writes should be
- * serialized in the CP.
- */
- va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
- va += params.fence_offset;
-
- si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x80000000,
- 0x80000000, WAIT_REG_MEM_EQUAL);
- }
-
- sctx->b.launch_grid(&sctx->b, &grid);
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
- }
-
- si_restore_qbo_state(sctx, &saved_state);
- pipe_resource_reference(&tmp_buffer, NULL);
+ struct si_query_hw *query = (struct si_query_hw *)squery;
+ struct si_query_buffer *qbuf;
+ struct si_query_buffer *qbuf_prev;
+ struct pipe_resource *tmp_buffer = NULL;
+ unsigned tmp_buffer_offset = 0;
+ struct si_qbo_state saved_state = {};
+ struct pipe_grid_info grid = {};
+ struct pipe_constant_buffer constant_buffer = {};
+ struct pipe_shader_buffer ssbo[3];
+ struct si_hw_query_params params;
+ struct {
+ uint32_t end_offset;
+ uint32_t result_stride;
+ uint32_t result_count;
+ uint32_t config;
+ uint32_t fence_offset;
+ uint32_t pair_stride;
+ uint32_t pair_count;
+ } consts;
+
+ if (!sctx->query_result_shader) {
+ sctx->query_result_shader = si_create_query_result_cs(sctx);
+ if (!sctx->query_result_shader)
+ return;
+ }
+
+ if (query->buffer.previous) {
+ u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
+ if (!tmp_buffer)
+ return;
+ }
+
+ si_save_qbo_state(sctx, &saved_state);
+
+ si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, &params);
+ consts.end_offset = params.end_offset - params.start_offset;
+ consts.fence_offset = params.fence_offset - params.start_offset;
+ consts.result_stride = query->result_size;
+ consts.pair_stride = params.pair_stride;
+ consts.pair_count = params.pair_count;
+
+ constant_buffer.buffer_size = sizeof(consts);
+ constant_buffer.user_buffer = &consts;
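+ /* consts.config and consts.result_count are filled in per query buffer below,
+ * just before each set_constant_buffer call. */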
+
+ ssbo[1].buffer = tmp_buffer;
+ ssbo[1].buffer_offset = tmp_buffer_offset;
+ ssbo[1].buffer_size = 16;
+
+ ssbo[2] = ssbo[1];
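+ /* For the last chain element processed (qbuf->previous == NULL), ssbo[2] is
+ * repointed at the destination resource below. */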
+
+ sctx->b.bind_compute_state(&sctx->b, sctx->query_result_shader);
+
+ grid.block[0] = 1;
+ grid.block[1] = 1;
+ grid.block[2] = 1;
+ grid.grid[0] = 1;
+ grid.grid[1] = 1;
+ grid.grid[2] = 1;
+
+ consts.config = 0;
+ if (index < 0)
+ consts.config |= 4;
+ if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+ query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
+ consts.config |= 8;
+ else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+ consts.config |= 8 | 256;
+ else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED)
+ consts.config |= 32;
+
+ switch (result_type) {
+ case PIPE_QUERY_TYPE_U64:
+ case PIPE_QUERY_TYPE_I64:
+ consts.config |= 64;
+ break;
+ case PIPE_QUERY_TYPE_I32:
+ consts.config |= 128;
+ break;
+ case PIPE_QUERY_TYPE_U32:
+ break;
+ }
+
+ sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
+
+ for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
+ if (query->b.type != PIPE_QUERY_TIMESTAMP) {
+ qbuf_prev = qbuf->previous;
+ consts.result_count = qbuf->results_end / query->result_size;
+ consts.config &= ~3;
+ if (qbuf != &query->buffer)
+ consts.config |= 1;
+ if (qbuf->previous)
+ consts.config |= 2;
+ } else {
+ /* Only read the last timestamp. */
+ qbuf_prev = NULL;
+ consts.result_count = 0;
+ consts.config |= 16;
+ params.start_offset += qbuf->results_end - query->result_size;
+ }
+
+ sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+
+ ssbo[0].buffer = &qbuf->buf->b.b;
+ ssbo[0].buffer_offset = params.start_offset;
+ ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
+
+ if (!qbuf->previous) {
+ ssbo[2].buffer = resource;
+ ssbo[2].buffer_offset = offset;
+ ssbo[2].buffer_size = 8;
+
+ si_resource(resource)->TC_L2_dirty = true;
+ }
+
+ sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 1 << 2);
+
+ if (wait && qbuf == &query->buffer) {
+ uint64_t va;
+
+ /* Wait for result availability. Wait only for readiness
+ * of the last entry, since the fence writes should be
+ * serialized in the CP.
+ */
+ va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
+ va += params.fence_offset;
+
+ si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL);
+ }
+
+ sctx->b.launch_grid(&sctx->b, &grid);
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+ }
+
+ si_restore_qbo_state(sctx, &saved_state);
+ pipe_resource_reference(&tmp_buffer, NULL);
}
-static void si_render_condition(struct pipe_context *ctx,
- struct pipe_query *query,
- bool condition,
- enum pipe_render_cond_flag mode)
+static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition,
+ enum pipe_render_cond_flag mode)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_query_hw *squery = (struct si_query_hw *)query;
- struct si_atom *atom = &sctx->atoms.s.render_cond;
-
- if (query) {
- bool needs_workaround = false;
-
- /* There was a firmware regression in GFX8 which causes successive
- * SET_PREDICATION packets to give the wrong answer for
- * non-inverted stream overflow predication.
- */
- if (((sctx->chip_class == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
- (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
- !condition &&
- (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
- (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
- (squery->buffer.previous ||
- squery->buffer.results_end > squery->result_size)))) {
- needs_workaround = true;
- }
-
- if (needs_workaround && !squery->workaround_buf) {
- bool old_force_off = sctx->render_cond_force_off;
- sctx->render_cond_force_off = true;
-
- u_suballocator_alloc(
- sctx->allocator_zeroed_memory, 8, 8,
- &squery->workaround_offset,
- (struct pipe_resource **)&squery->workaround_buf);
-
- /* Reset to NULL to avoid a redundant SET_PREDICATION
- * from launching the compute grid.
- */
- sctx->render_cond = NULL;
-
- ctx->get_query_result_resource(
- ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
- &squery->workaround_buf->b.b, squery->workaround_offset);
-
- /* Settings this in the render cond atom is too late,
- * so set it here. */
- sctx->flags |= sctx->screen->barrier_flags.L2_to_cp |
- SI_CONTEXT_FLUSH_FOR_RENDER_COND;
-
- sctx->render_cond_force_off = old_force_off;
- }
- }
-
- sctx->render_cond = query;
- sctx->render_cond_invert = condition;
- sctx->render_cond_mode = mode;
-
- si_set_atom_dirty(sctx, atom, query != NULL);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_query_hw *squery = (struct si_query_hw *)query;
+ struct si_atom *atom = &sctx->atoms.s.render_cond;
+
+ if (query) {
+ bool needs_workaround = false;
+
+ /* There was a firmware regression in GFX8 which causes successive
+ * SET_PREDICATION packets to give the wrong answer for
+ * non-inverted stream overflow predication.
+ */
+ if (((sctx->chip_class == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
+ (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
+ !condition &&
+ (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
+ (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
+ (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) {
+ needs_workaround = true;
+ }
+
+ if (needs_workaround && !squery->workaround_buf) {
+ bool old_force_off = sctx->render_cond_force_off;
+ sctx->render_cond_force_off = true;
+
+ u_suballocator_alloc(sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset,
+ (struct pipe_resource **)&squery->workaround_buf);
+
+ /* Reset to NULL to avoid a redundant SET_PREDICATION
+ * from launching the compute grid.
+ */
+ sctx->render_cond = NULL;
+
+ ctx->get_query_result_resource(ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
+ &squery->workaround_buf->b.b, squery->workaround_offset);
+
+ /* Setting this in the render cond atom is too late,
+ * so set it here. */
+ sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND;
+
+ sctx->render_cond_force_off = old_force_off;
+ }
+ }
+
+ sctx->render_cond = query;
+ sctx->render_cond_invert = condition;
+ sctx->render_cond_mode = mode;
+
+ si_set_atom_dirty(sctx, atom, query != NULL);
}
void si_suspend_queries(struct si_context *sctx)
{
- struct si_query *query;
+ struct si_query *query;
- LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, active_list)
- query->ops->suspend(sctx, query);
+ LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
+ query->ops->suspend(sctx, query);
}
void si_resume_queries(struct si_context *sctx)
{
- struct si_query *query;
+ struct si_query *query;
- /* Check CS space here. Resuming must not be interrupted by flushes. */
- si_need_gfx_cs_space(sctx);
+ /* Check CS space here. Resuming must not be interrupted by flushes. */
+ si_need_gfx_cs_space(sctx);
- LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, active_list)
- query->ops->resume(sctx, query);
+ LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
+ query->ops->resume(sctx, query);
}
-#define XFULL(name_, query_type_, type_, result_type_, group_id_) \
- { \
- .name = name_, \
- .query_type = SI_QUERY_##query_type_, \
- .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
- .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
- .group_id = group_id_ \
- }
+#define XFULL(name_, query_type_, type_, result_type_, group_id_) \
+ { \
+ .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
+ .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_ \
+ }
-#define X(name_, query_type_, type_, result_type_) \
- XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
+#define X(name_, query_type_, type_, result_type_) \
+ XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
-#define XG(group_, name_, query_type_, type_, result_type_) \
- XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
+#define XG(group_, name_, query_type_, type_, result_type_) \
+ XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
static struct pipe_driver_query_info si_driver_query_list[] = {
- X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
- X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
- X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
- X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
- X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE),
- X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
- X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),
- X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
- X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE),
- X("dma-calls", DMA_CALLS, UINT64, AVERAGE),
- X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
- X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
- X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
- X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
- X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
- X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
- X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
- X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
- X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
- X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
- X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
- X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
- X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
- X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
- X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
- X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
- X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
- X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
- X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
- X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
- X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
- X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE),
- X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
- X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
- X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
- X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
- X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
- X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
- X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
- X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
- X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
- X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
- X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
- X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
- X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
- X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
- X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
-
- /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
- * which use it as a fallback path to detect the GPU type.
- *
- * Note: The names of these queries are significant for GPUPerfStudio
- * (and possibly their order as well). */
- XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
- XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
- XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
- XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
- XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
-
- X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
- X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
- X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
-
- /* The following queries must be at the end of the list because their
- * availability is adjusted dynamically based on the DRM version. */
- X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
- X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
- X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
- X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
- X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
- X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
- X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
- X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
- X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
- X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
- X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
- X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
- X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
- X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
-
- /* SRBM_STATUS2 */
- X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
-
- /* CP_STAT */
- X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
- X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
- X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
- X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
- X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
- X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
-
- X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE),
- X("pd-num-prims-rejected", PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE),
- X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE,UINT64, AVERAGE),
+ X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
+ X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
+ X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
+ X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
+ X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE),
+ X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
+ X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),
+ X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
+ X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE),
+ X("dma-calls", DMA_CALLS, UINT64, AVERAGE),
+ X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
+ X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
+ X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
+ X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
+ X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
+ X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
+ X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
+ X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
+ X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
+ X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
+ X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
+ X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
+ X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
+ X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
+ X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
+ X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
+ X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
+ X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
+ X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
+ X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
+ X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
+ X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE),
+ X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
+ X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
+ X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
+ X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
+ X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
+ X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
+ X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
+ X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
+ X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
+ X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
+ X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
+ X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
+ X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
+ X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
+ X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
+
+ /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
+ * which use it as a fallback path to detect the GPU type.
+ *
+ * Note: The names of these queries are significant for GPUPerfStudio
+ * (and possibly their order as well). */
+ XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
+ XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
+ XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
+ XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
+ XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
+
+ X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
+ X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
+ X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
+
+ /* The following queries must be at the end of the list because their
+ * availability is adjusted dynamically based on the DRM version. */
+ X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
+ X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
+ X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
+ X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
+ X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
+ X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
+ X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
+ X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
+ X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
+ X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
+ X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
+ X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
+ X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
+ X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
+
+ /* SRBM_STATUS2 */
+ X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
+
+ /* CP_STAT */
+ X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
+ X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
+ X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
+ X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
+ X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
+ X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
+
+ X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE),
+ X("pd-num-prims-rejected", PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE),
+ X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE, UINT64, AVERAGE),
};
#undef X
static unsigned si_get_num_queries(struct si_screen *sscreen)
{
- /* amdgpu */
- if (sscreen->info.is_amdgpu) {
- if (sscreen->info.chip_class >= GFX8)
- return ARRAY_SIZE(si_driver_query_list);
- else
- return ARRAY_SIZE(si_driver_query_list) - 7;
- }
-
- /* radeon */
- if (sscreen->info.has_read_registers_query) {
- if (sscreen->info.chip_class == GFX7)
- return ARRAY_SIZE(si_driver_query_list) - 6;
- else
- return ARRAY_SIZE(si_driver_query_list) - 7;
- }
-
- return ARRAY_SIZE(si_driver_query_list) - 21;
+ /* amdgpu */
+ if (sscreen->info.is_amdgpu) {
+ if (sscreen->info.chip_class >= GFX8)
+ return ARRAY_SIZE(si_driver_query_list);
+ else
+ return ARRAY_SIZE(si_driver_query_list) - 7;
+ }
+
+ /* radeon */
+ if (sscreen->info.has_read_registers_query) {
+ if (sscreen->info.chip_class == GFX7)
+ return ARRAY_SIZE(si_driver_query_list) - 6;
+ else
+ return ARRAY_SIZE(si_driver_query_list) - 7;
+ }
+
+ return ARRAY_SIZE(si_driver_query_list) - 21;
}
-static int si_get_driver_query_info(struct pipe_screen *screen,
- unsigned index,
- struct pipe_driver_query_info *info)
+static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index,
+ struct pipe_driver_query_info *info)
{
- struct si_screen *sscreen = (struct si_screen*)screen;
- unsigned num_queries = si_get_num_queries(sscreen);
-
- if (!info) {
- unsigned num_perfcounters =
- si_get_perfcounter_info(sscreen, 0, NULL);
-
- return num_queries + num_perfcounters;
- }
-
- if (index >= num_queries)
- return si_get_perfcounter_info(sscreen, index - num_queries, info);
-
- *info = si_driver_query_list[index];
-
- switch (info->query_type) {
- case SI_QUERY_REQUESTED_VRAM:
- case SI_QUERY_VRAM_USAGE:
- case SI_QUERY_MAPPED_VRAM:
- info->max_value.u64 = sscreen->info.vram_size;
- break;
- case SI_QUERY_REQUESTED_GTT:
- case SI_QUERY_GTT_USAGE:
- case SI_QUERY_MAPPED_GTT:
- info->max_value.u64 = sscreen->info.gart_size;
- break;
- case SI_QUERY_GPU_TEMPERATURE:
- info->max_value.u64 = 125;
- break;
- case SI_QUERY_VRAM_VIS_USAGE:
- info->max_value.u64 = sscreen->info.vram_vis_size;
- break;
- }
-
- if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
- info->group_id += sscreen->perfcounters->num_groups;
-
- return 1;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ unsigned num_queries = si_get_num_queries(sscreen);
+
+ if (!info) {
+ unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL);
+
+ return num_queries + num_perfcounters;
+ }
+
+ if (index >= num_queries)
+ return si_get_perfcounter_info(sscreen, index - num_queries, info);
+
+ *info = si_driver_query_list[index];
+
+ switch (info->query_type) {
+ case SI_QUERY_REQUESTED_VRAM:
+ case SI_QUERY_VRAM_USAGE:
+ case SI_QUERY_MAPPED_VRAM:
+ info->max_value.u64 = sscreen->info.vram_size;
+ break;
+ case SI_QUERY_REQUESTED_GTT:
+ case SI_QUERY_GTT_USAGE:
+ case SI_QUERY_MAPPED_GTT:
+ info->max_value.u64 = sscreen->info.gart_size;
+ break;
+ case SI_QUERY_GPU_TEMPERATURE:
+ info->max_value.u64 = 125;
+ break;
+ case SI_QUERY_VRAM_VIS_USAGE:
+ info->max_value.u64 = sscreen->info.vram_vis_size;
+ break;
+ }
+
+ if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
+ info->group_id += sscreen->perfcounters->num_groups;
+
+ return 1;
}
/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
* performance counter groups, so be careful when changing this and related
* functions.
*/
-static int si_get_driver_query_group_info(struct pipe_screen *screen,
- unsigned index,
- struct pipe_driver_query_group_info *info)
+static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index,
+ struct pipe_driver_query_group_info *info)
{
- struct si_screen *sscreen = (struct si_screen *)screen;
- unsigned num_pc_groups = 0;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ unsigned num_pc_groups = 0;
- if (sscreen->perfcounters)
- num_pc_groups = sscreen->perfcounters->num_groups;
+ if (sscreen->perfcounters)
+ num_pc_groups = sscreen->perfcounters->num_groups;
- if (!info)
- return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
+ if (!info)
+ return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
- if (index < num_pc_groups)
- return si_get_perfcounter_group_info(sscreen, index, info);
+ if (index < num_pc_groups)
+ return si_get_perfcounter_group_info(sscreen, index, info);
- index -= num_pc_groups;
- if (index >= SI_NUM_SW_QUERY_GROUPS)
- return 0;
+ index -= num_pc_groups;
+ if (index >= SI_NUM_SW_QUERY_GROUPS)
+ return 0;
- info->name = "GPIN";
- info->max_active_queries = 5;
- info->num_queries = 5;
- return 1;
+ info->name = "GPIN";
+ info->max_active_queries = 5;
+ info->num_queries = 5;
+ return 1;
}
void si_init_query_functions(struct si_context *sctx)
{
- sctx->b.create_query = si_create_query;
- sctx->b.create_batch_query = si_create_batch_query;
- sctx->b.destroy_query = si_destroy_query;
- sctx->b.begin_query = si_begin_query;
- sctx->b.end_query = si_end_query;
- sctx->b.get_query_result = si_get_query_result;
- sctx->b.get_query_result_resource = si_get_query_result_resource;
-
- if (sctx->has_graphics) {
- sctx->atoms.s.render_cond.emit = si_emit_query_predication;
- sctx->b.render_condition = si_render_condition;
- }
-
- list_inithead(&sctx->active_queries);
+ sctx->b.create_query = si_create_query;
+ sctx->b.create_batch_query = si_create_batch_query;
+ sctx->b.destroy_query = si_destroy_query;
+ sctx->b.begin_query = si_begin_query;
+ sctx->b.end_query = si_end_query;
+ sctx->b.get_query_result = si_get_query_result;
+ sctx->b.get_query_result_resource = si_get_query_result_resource;
+
+ if (sctx->has_graphics) {
+ sctx->atoms.s.render_cond.emit = si_emit_query_predication;
+ sctx->b.render_condition = si_render_condition;
+ }
+
+ list_inithead(&sctx->active_queries);
}
void si_init_screen_query_functions(struct si_screen *sscreen)
{
- sscreen->b.get_driver_query_info = si_get_driver_query_info;
- sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
+ sscreen->b.get_driver_query_info = si_get_driver_query_info;
+ sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
}
#define SI_MAX_STREAMS 4
-enum {
- SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
- SI_QUERY_DECOMPRESS_CALLS,
- SI_QUERY_MRT_DRAW_CALLS,
- SI_QUERY_PRIM_RESTART_CALLS,
- SI_QUERY_SPILL_DRAW_CALLS,
- SI_QUERY_COMPUTE_CALLS,
- SI_QUERY_SPILL_COMPUTE_CALLS,
- SI_QUERY_DMA_CALLS,
- SI_QUERY_CP_DMA_CALLS,
- SI_QUERY_NUM_VS_FLUSHES,
- SI_QUERY_NUM_PS_FLUSHES,
- SI_QUERY_NUM_CS_FLUSHES,
- SI_QUERY_NUM_CB_CACHE_FLUSHES,
- SI_QUERY_NUM_DB_CACHE_FLUSHES,
- SI_QUERY_NUM_L2_INVALIDATES,
- SI_QUERY_NUM_L2_WRITEBACKS,
- SI_QUERY_NUM_RESIDENT_HANDLES,
- SI_QUERY_TC_OFFLOADED_SLOTS,
- SI_QUERY_TC_DIRECT_SLOTS,
- SI_QUERY_TC_NUM_SYNCS,
- SI_QUERY_CS_THREAD_BUSY,
- SI_QUERY_GALLIUM_THREAD_BUSY,
- SI_QUERY_REQUESTED_VRAM,
- SI_QUERY_REQUESTED_GTT,
- SI_QUERY_MAPPED_VRAM,
- SI_QUERY_MAPPED_GTT,
- SI_QUERY_BUFFER_WAIT_TIME,
- SI_QUERY_NUM_MAPPED_BUFFERS,
- SI_QUERY_NUM_GFX_IBS,
- SI_QUERY_NUM_SDMA_IBS,
- SI_QUERY_GFX_BO_LIST_SIZE,
- SI_QUERY_GFX_IB_SIZE,
- SI_QUERY_NUM_BYTES_MOVED,
- SI_QUERY_NUM_EVICTIONS,
- SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS,
- SI_QUERY_VRAM_USAGE,
- SI_QUERY_VRAM_VIS_USAGE,
- SI_QUERY_GTT_USAGE,
- SI_QUERY_GPU_TEMPERATURE,
- SI_QUERY_CURRENT_GPU_SCLK,
- SI_QUERY_CURRENT_GPU_MCLK,
- SI_QUERY_GPU_LOAD,
- SI_QUERY_GPU_SHADERS_BUSY,
- SI_QUERY_GPU_TA_BUSY,
- SI_QUERY_GPU_GDS_BUSY,
- SI_QUERY_GPU_VGT_BUSY,
- SI_QUERY_GPU_IA_BUSY,
- SI_QUERY_GPU_SX_BUSY,
- SI_QUERY_GPU_WD_BUSY,
- SI_QUERY_GPU_BCI_BUSY,
- SI_QUERY_GPU_SC_BUSY,
- SI_QUERY_GPU_PA_BUSY,
- SI_QUERY_GPU_DB_BUSY,
- SI_QUERY_GPU_CP_BUSY,
- SI_QUERY_GPU_CB_BUSY,
- SI_QUERY_GPU_SDMA_BUSY,
- SI_QUERY_GPU_PFP_BUSY,
- SI_QUERY_GPU_MEQ_BUSY,
- SI_QUERY_GPU_ME_BUSY,
- SI_QUERY_GPU_SURF_SYNC_BUSY,
- SI_QUERY_GPU_CP_DMA_BUSY,
- SI_QUERY_GPU_SCRATCH_RAM_BUSY,
- SI_QUERY_NUM_COMPILATIONS,
- SI_QUERY_NUM_SHADERS_CREATED,
- SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO,
- SI_QUERY_GPIN_ASIC_ID,
- SI_QUERY_GPIN_NUM_SIMD,
- SI_QUERY_GPIN_NUM_RB,
- SI_QUERY_GPIN_NUM_SPI,
- SI_QUERY_GPIN_NUM_SE,
- SI_QUERY_TIME_ELAPSED_SDMA,
- SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */
- SI_QUERY_PD_NUM_PRIMS_ACCEPTED,
- SI_QUERY_PD_NUM_PRIMS_REJECTED,
- SI_QUERY_PD_NUM_PRIMS_INELIGIBLE,
- SI_QUERY_LIVE_SHADER_CACHE_HITS,
- SI_QUERY_LIVE_SHADER_CACHE_MISSES,
- SI_QUERY_MEMORY_SHADER_CACHE_HITS,
- SI_QUERY_MEMORY_SHADER_CACHE_MISSES,
- SI_QUERY_DISK_SHADER_CACHE_HITS,
- SI_QUERY_DISK_SHADER_CACHE_MISSES,
-
- SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100,
+enum
+{
+ SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
+ SI_QUERY_DECOMPRESS_CALLS,
+ SI_QUERY_MRT_DRAW_CALLS,
+ SI_QUERY_PRIM_RESTART_CALLS,
+ SI_QUERY_SPILL_DRAW_CALLS,
+ SI_QUERY_COMPUTE_CALLS,
+ SI_QUERY_SPILL_COMPUTE_CALLS,
+ SI_QUERY_DMA_CALLS,
+ SI_QUERY_CP_DMA_CALLS,
+ SI_QUERY_NUM_VS_FLUSHES,
+ SI_QUERY_NUM_PS_FLUSHES,
+ SI_QUERY_NUM_CS_FLUSHES,
+ SI_QUERY_NUM_CB_CACHE_FLUSHES,
+ SI_QUERY_NUM_DB_CACHE_FLUSHES,
+ SI_QUERY_NUM_L2_INVALIDATES,
+ SI_QUERY_NUM_L2_WRITEBACKS,
+ SI_QUERY_NUM_RESIDENT_HANDLES,
+ SI_QUERY_TC_OFFLOADED_SLOTS,
+ SI_QUERY_TC_DIRECT_SLOTS,
+ SI_QUERY_TC_NUM_SYNCS,
+ SI_QUERY_CS_THREAD_BUSY,
+ SI_QUERY_GALLIUM_THREAD_BUSY,
+ SI_QUERY_REQUESTED_VRAM,
+ SI_QUERY_REQUESTED_GTT,
+ SI_QUERY_MAPPED_VRAM,
+ SI_QUERY_MAPPED_GTT,
+ SI_QUERY_BUFFER_WAIT_TIME,
+ SI_QUERY_NUM_MAPPED_BUFFERS,
+ SI_QUERY_NUM_GFX_IBS,
+ SI_QUERY_NUM_SDMA_IBS,
+ SI_QUERY_GFX_BO_LIST_SIZE,
+ SI_QUERY_GFX_IB_SIZE,
+ SI_QUERY_NUM_BYTES_MOVED,
+ SI_QUERY_NUM_EVICTIONS,
+ SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS,
+ SI_QUERY_VRAM_USAGE,
+ SI_QUERY_VRAM_VIS_USAGE,
+ SI_QUERY_GTT_USAGE,
+ SI_QUERY_GPU_TEMPERATURE,
+ SI_QUERY_CURRENT_GPU_SCLK,
+ SI_QUERY_CURRENT_GPU_MCLK,
+ SI_QUERY_GPU_LOAD,
+ SI_QUERY_GPU_SHADERS_BUSY,
+ SI_QUERY_GPU_TA_BUSY,
+ SI_QUERY_GPU_GDS_BUSY,
+ SI_QUERY_GPU_VGT_BUSY,
+ SI_QUERY_GPU_IA_BUSY,
+ SI_QUERY_GPU_SX_BUSY,
+ SI_QUERY_GPU_WD_BUSY,
+ SI_QUERY_GPU_BCI_BUSY,
+ SI_QUERY_GPU_SC_BUSY,
+ SI_QUERY_GPU_PA_BUSY,
+ SI_QUERY_GPU_DB_BUSY,
+ SI_QUERY_GPU_CP_BUSY,
+ SI_QUERY_GPU_CB_BUSY,
+ SI_QUERY_GPU_SDMA_BUSY,
+ SI_QUERY_GPU_PFP_BUSY,
+ SI_QUERY_GPU_MEQ_BUSY,
+ SI_QUERY_GPU_ME_BUSY,
+ SI_QUERY_GPU_SURF_SYNC_BUSY,
+ SI_QUERY_GPU_CP_DMA_BUSY,
+ SI_QUERY_GPU_SCRATCH_RAM_BUSY,
+ SI_QUERY_NUM_COMPILATIONS,
+ SI_QUERY_NUM_SHADERS_CREATED,
+ SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO,
+ SI_QUERY_GPIN_ASIC_ID,
+ SI_QUERY_GPIN_NUM_SIMD,
+ SI_QUERY_GPIN_NUM_RB,
+ SI_QUERY_GPIN_NUM_SPI,
+ SI_QUERY_GPIN_NUM_SE,
+ SI_QUERY_TIME_ELAPSED_SDMA,
+ SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */
+ SI_QUERY_PD_NUM_PRIMS_ACCEPTED,
+ SI_QUERY_PD_NUM_PRIMS_REJECTED,
+ SI_QUERY_PD_NUM_PRIMS_INELIGIBLE,
+ SI_QUERY_LIVE_SHADER_CACHE_HITS,
+ SI_QUERY_LIVE_SHADER_CACHE_MISSES,
+ SI_QUERY_MEMORY_SHADER_CACHE_HITS,
+ SI_QUERY_MEMORY_SHADER_CACHE_MISSES,
+ SI_QUERY_DISK_SHADER_CACHE_HITS,
+ SI_QUERY_DISK_SHADER_CACHE_MISSES,
+
+ SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100,
};
-enum {
- SI_QUERY_GROUP_GPIN = 0,
- SI_NUM_SW_QUERY_GROUPS
+enum
+{
+ SI_QUERY_GROUP_GPIN = 0,
+ SI_NUM_SW_QUERY_GROUPS
};
struct si_query_ops {
- void (*destroy)(struct si_context *, struct si_query *);
- bool (*begin)(struct si_context *, struct si_query *);
- bool (*end)(struct si_context *, struct si_query *);
- bool (*get_result)(struct si_context *,
- struct si_query *, bool wait,
- union pipe_query_result *result);
- void (*get_result_resource)(struct si_context *,
- struct si_query *, bool wait,
- enum pipe_query_value_type result_type,
- int index,
- struct pipe_resource *resource,
- unsigned offset);
-
- void (*suspend)(struct si_context *, struct si_query *);
- void (*resume)(struct si_context *, struct si_query *);
+ void (*destroy)(struct si_context *, struct si_query *);
+ bool (*begin)(struct si_context *, struct si_query *);
+ bool (*end)(struct si_context *, struct si_query *);
+ bool (*get_result)(struct si_context *, struct si_query *, bool wait,
+ union pipe_query_result *result);
+ void (*get_result_resource)(struct si_context *, struct si_query *, bool wait,
+ enum pipe_query_value_type result_type, int index,
+ struct pipe_resource *resource, unsigned offset);
+
+ void (*suspend)(struct si_context *, struct si_query *);
+ void (*resume)(struct si_context *, struct si_query *);
};
struct si_query {
- struct threaded_query b;
- const struct si_query_ops *ops;
+ struct threaded_query b;
+ const struct si_query_ops *ops;
- /* The PIPE_QUERY_xxx type of query */
- unsigned type;
+ /* The PIPE_QUERY_xxx type of query */
+ unsigned type;
- /* The number of dwords for suspend. */
- unsigned num_cs_dw_suspend;
+ /* The number of dwords for suspend. */
+ unsigned num_cs_dw_suspend;
- /* Linked list of queries that must be suspended at end of CS. */
- struct list_head active_list;
+ /* Linked list of queries that must be suspended at end of CS. */
+ struct list_head active_list;
};
-enum {
- SI_QUERY_HW_FLAG_NO_START = (1 << 0),
- /* gap */
- /* whether begin_query doesn't clear the result */
- SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
+enum
+{
+ SI_QUERY_HW_FLAG_NO_START = (1 << 0),
+ /* gap */
+ /* whether begin_query doesn't clear the result */
+ SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
};
struct si_query_hw_ops {
- bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *);
- void (*emit_start)(struct si_context *,
- struct si_query_hw *,
- struct si_resource *buffer, uint64_t va);
- void (*emit_stop)(struct si_context *,
- struct si_query_hw *,
- struct si_resource *buffer, uint64_t va);
- void (*clear_result)(struct si_query_hw *, union pipe_query_result *);
- void (*add_result)(struct si_screen *screen,
- struct si_query_hw *, void *buffer,
- union pipe_query_result *result);
+ bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *);
+ void (*emit_start)(struct si_context *, struct si_query_hw *, struct si_resource *buffer,
+ uint64_t va);
+ void (*emit_stop)(struct si_context *, struct si_query_hw *, struct si_resource *buffer,
+ uint64_t va);
+ void (*clear_result)(struct si_query_hw *, union pipe_query_result *);
+ void (*add_result)(struct si_screen *screen, struct si_query_hw *, void *buffer,
+ union pipe_query_result *result);
};
struct si_query_buffer {
- /* The buffer where query results are stored. */
- struct si_resource *buf;
- /* If a query buffer is full, a new buffer is created and the old one
- * is put in here. When we calculate the result, we sum up the samples
- * from all buffers. */
- struct si_query_buffer *previous;
- /* Offset of the next free result after current query data */
- unsigned results_end;
- bool unprepared;
+ /* The buffer where query results are stored. */
+ struct si_resource *buf;
+ /* If a query buffer is full, a new buffer is created and the old one
+ * is put in here. When we calculate the result, we sum up the samples
+ * from all buffers. */
+ struct si_query_buffer *previous;
+ /* Offset of the next free result after current query data */
+ unsigned results_end;
+ bool unprepared;
};
void si_query_buffer_destroy(struct si_screen *sctx, struct si_query_buffer *buffer);
void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer);
bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
- bool (*prepare_buffer)(struct si_context *, struct si_query_buffer*),
- unsigned size);
-
+ bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
+ unsigned size);
struct si_query_hw {
- struct si_query b;
- struct si_query_hw_ops *ops;
- unsigned flags;
-
- /* The query buffer and how many results are in it. */
- struct si_query_buffer buffer;
- /* Size of the result in memory for both begin_query and end_query,
- * this can be one or two numbers, or it could even be a size of a structure. */
- unsigned result_size;
- /* For transform feedback: which stream the query is for */
- unsigned stream;
-
- /* Workaround via compute shader */
- struct si_resource *workaround_buf;
- unsigned workaround_offset;
+ struct si_query b;
+ struct si_query_hw_ops *ops;
+ unsigned flags;
+
+ /* The query buffer and how many results are in it. */
+ struct si_query_buffer buffer;
+ /* Size of the result in memory for both begin_query and end_query;
+ * this can be one or two numbers, or even the size of a structure. */
+ unsigned result_size;
+ /* For transform feedback: which stream the query is for */
+ unsigned stream;
+
+ /* Workaround via compute shader */
+ struct si_resource *workaround_buf;
+ unsigned workaround_offset;
};
-void si_query_hw_destroy(struct si_context *sctx,
- struct si_query *squery);
-bool si_query_hw_begin(struct si_context *sctx,
- struct si_query *squery);
-bool si_query_hw_end(struct si_context *sctx,
- struct si_query *squery);
-bool si_query_hw_get_result(struct si_context *sctx,
- struct si_query *squery,
- bool wait,
- union pipe_query_result *result);
+void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery);
+bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery);
+bool si_query_hw_end(struct si_context *sctx, struct si_query *squery);
+bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
+ union pipe_query_result *result);
void si_query_hw_suspend(struct si_context *sctx, struct si_query *query);
void si_query_hw_resume(struct si_context *sctx, struct si_query *query);
-
/* Shader-based queries */
-struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
- enum pipe_query_type query_type,
- unsigned index);
-
+struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
+ unsigned index);
/* Performance counters */
struct si_perfcounters {
- unsigned num_groups;
- unsigned num_blocks;
- struct si_pc_block *blocks;
+ unsigned num_groups;
+ unsigned num_blocks;
+ struct si_pc_block *blocks;
- unsigned num_stop_cs_dwords;
- unsigned num_instance_cs_dwords;
+ unsigned num_stop_cs_dwords;
+ unsigned num_instance_cs_dwords;
- bool separate_se;
- bool separate_instance;
+ bool separate_se;
+ bool separate_instance;
};
-struct pipe_query *si_create_batch_query(struct pipe_context *ctx,
- unsigned num_queries,
- unsigned *query_types);
+struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
+ unsigned *query_types);
-int si_get_perfcounter_info(struct si_screen *,
- unsigned index,
- struct pipe_driver_query_info *info);
-int si_get_perfcounter_group_info(struct si_screen *,
- unsigned index,
- struct pipe_driver_query_group_info *info);
+int si_get_perfcounter_info(struct si_screen *, unsigned index,
+ struct pipe_driver_query_info *info);
+int si_get_perfcounter_group_info(struct si_screen *, unsigned index,
+ struct pipe_driver_query_group_info *info);
struct si_qbo_state {
- void *saved_compute;
- struct pipe_constant_buffer saved_const0;
- struct pipe_shader_buffer saved_ssbo[3];
- unsigned saved_ssbo_writable_mask;
+ void *saved_compute;
+ struct pipe_constant_buffer saved_const0;
+ struct pipe_shader_buffer saved_ssbo[3];
+ unsigned saved_ssbo_writable_mask;
};
#endif /* SI_QUERY_H */
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "util/u_memory.h"
-#include "tgsi/tgsi_strings.h"
-#include "tgsi/tgsi_from_mesa.h"
-
#include "ac_exp_param.h"
#include "ac_rtld.h"
-#include "si_shader_internal.h"
-#include "si_pipe.h"
-#include "sid.h"
-
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_serialize.h"
+#include "si_pipe.h"
+#include "si_shader_internal.h"
+#include "sid.h"
+#include "tgsi/tgsi_from_mesa.h"
+#include "tgsi/tgsi_strings.h"
+#include "util/u_memory.h"
-static const char scratch_rsrc_dword0_symbol[] =
- "SCRATCH_RSRC_DWORD0";
+static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";
-static const char scratch_rsrc_dword1_symbol[] =
- "SCRATCH_RSRC_DWORD1";
+static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
/** Whether the shader runs as a combination of multiple API shaders */
bool si_is_multi_part_shader(struct si_shader *shader)
{
- if (shader->selector->screen->info.chip_class <= GFX8)
- return false;
+ if (shader->selector->screen->info.chip_class <= GFX8)
+ return false;
- return shader->key.as_ls ||
- shader->key.as_es ||
- shader->selector->type == PIPE_SHADER_TESS_CTRL ||
- shader->selector->type == PIPE_SHADER_GEOMETRY;
+ return shader->key.as_ls || shader->key.as_es ||
+ shader->selector->type == PIPE_SHADER_TESS_CTRL ||
+ shader->selector->type == PIPE_SHADER_GEOMETRY;
}
/** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
bool si_is_merged_shader(struct si_shader *shader)
{
- return shader->key.as_ngg || si_is_multi_part_shader(shader);
+ return shader->key.as_ngg || si_is_multi_part_shader(shader);
}
/**
*/
unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
{
- switch (semantic_name) {
- case TGSI_SEMANTIC_TESSOUTER:
- return 0;
- case TGSI_SEMANTIC_TESSINNER:
- return 1;
- case TGSI_SEMANTIC_PATCH:
- assert(index < 30);
- return 2 + index;
-
- default:
- assert(!"invalid semantic name");
- return 0;
- }
+ switch (semantic_name) {
+ case TGSI_SEMANTIC_TESSOUTER:
+ return 0;
+ case TGSI_SEMANTIC_TESSINNER:
+ return 1;
+ case TGSI_SEMANTIC_PATCH:
+ assert(index < 30);
+ return 2 + index;
+
+ default:
+ assert(!"invalid semantic name");
+ return 0;
+ }
}
/**
* less than 64, so that a 64-bit bitmask of used inputs or outputs can be
* calculated.
*/
-unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
- unsigned is_varying)
+unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, unsigned is_varying)
{
- switch (semantic_name) {
- case TGSI_SEMANTIC_POSITION:
- return 0;
- case TGSI_SEMANTIC_GENERIC:
- /* Since some shader stages use the the highest used IO index
- * to determine the size to allocate for inputs/outputs
- * (in LDS, tess and GS rings). GENERIC should be placed right
- * after POSITION to make that size as small as possible.
- */
- if (index < SI_MAX_IO_GENERIC)
- return 1 + index;
-
- assert(!"invalid generic index");
- return 0;
- case TGSI_SEMANTIC_FOG:
- return SI_MAX_IO_GENERIC + 1;
- case TGSI_SEMANTIC_COLOR:
- assert(index < 2);
- return SI_MAX_IO_GENERIC + 2 + index;
- case TGSI_SEMANTIC_BCOLOR:
- assert(index < 2);
- /* If it's a varying, COLOR and BCOLOR alias. */
- if (is_varying)
- return SI_MAX_IO_GENERIC + 2 + index;
- else
- return SI_MAX_IO_GENERIC + 4 + index;
- case TGSI_SEMANTIC_TEXCOORD:
- assert(index < 8);
- return SI_MAX_IO_GENERIC + 6 + index;
-
- /* These are rarely used between LS and HS or ES and GS. */
- case TGSI_SEMANTIC_CLIPDIST:
- assert(index < 2);
- return SI_MAX_IO_GENERIC + 6 + 8 + index;
- case TGSI_SEMANTIC_CLIPVERTEX:
- return SI_MAX_IO_GENERIC + 6 + 8 + 2;
- case TGSI_SEMANTIC_PSIZE:
- return SI_MAX_IO_GENERIC + 6 + 8 + 3;
-
- /* These can't be written by LS, HS, and ES. */
- case TGSI_SEMANTIC_LAYER:
- return SI_MAX_IO_GENERIC + 6 + 8 + 4;
- case TGSI_SEMANTIC_VIEWPORT_INDEX:
- return SI_MAX_IO_GENERIC + 6 + 8 + 5;
- case TGSI_SEMANTIC_PRIMID:
- STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63);
- return SI_MAX_IO_GENERIC + 6 + 8 + 6;
- default:
- fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
- assert(!"invalid semantic name");
- return 0;
- }
+ switch (semantic_name) {
+ case TGSI_SEMANTIC_POSITION:
+ return 0;
+ case TGSI_SEMANTIC_GENERIC:
+ /* Some shader stages use the highest used IO index
+ * to determine the size to allocate for inputs/outputs
+ * (in LDS, tess and GS rings), so GENERIC should be placed right
+ * after POSITION to make that size as small as possible.
+ */
+ if (index < SI_MAX_IO_GENERIC)
+ return 1 + index;
+
+ assert(!"invalid generic index");
+ return 0;
+ case TGSI_SEMANTIC_FOG:
+ return SI_MAX_IO_GENERIC + 1;
+ case TGSI_SEMANTIC_COLOR:
+ assert(index < 2);
+ return SI_MAX_IO_GENERIC + 2 + index;
+ case TGSI_SEMANTIC_BCOLOR:
+ assert(index < 2);
+ /* If it's a varying, COLOR and BCOLOR alias. */
+ if (is_varying)
+ return SI_MAX_IO_GENERIC + 2 + index;
+ else
+ return SI_MAX_IO_GENERIC + 4 + index;
+ case TGSI_SEMANTIC_TEXCOORD:
+ assert(index < 8);
+ return SI_MAX_IO_GENERIC + 6 + index;
+
+ /* These are rarely used between LS and HS or ES and GS. */
+ case TGSI_SEMANTIC_CLIPDIST:
+ assert(index < 2);
+ return SI_MAX_IO_GENERIC + 6 + 8 + index;
+ case TGSI_SEMANTIC_CLIPVERTEX:
+ return SI_MAX_IO_GENERIC + 6 + 8 + 2;
+ case TGSI_SEMANTIC_PSIZE:
+ return SI_MAX_IO_GENERIC + 6 + 8 + 3;
+
+ /* These can't be written by LS, HS, and ES. */
+ case TGSI_SEMANTIC_LAYER:
+ return SI_MAX_IO_GENERIC + 6 + 8 + 4;
+ case TGSI_SEMANTIC_VIEWPORT_INDEX:
+ return SI_MAX_IO_GENERIC + 6 + 8 + 5;
+ case TGSI_SEMANTIC_PRIMID:
+ STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63);
+ return SI_MAX_IO_GENERIC + 6 + 8 + 6;
+ default:
+ fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
+ assert(!"invalid semantic name");
+ return 0;
+ }
}
static void si_dump_streamout(struct pipe_stream_output_info *so)
{
- unsigned i;
-
- if (so->num_outputs)
- fprintf(stderr, "STREAMOUT\n");
-
- for (i = 0; i < so->num_outputs; i++) {
- unsigned mask = ((1 << so->output[i].num_components) - 1) <<
- so->output[i].start_component;
- fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
- i, so->output[i].output_buffer,
- so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
- so->output[i].register_index,
- mask & 1 ? "x" : "",
- mask & 2 ? "y" : "",
- mask & 4 ? "z" : "",
- mask & 8 ? "w" : "");
- }
+ unsigned i;
+
+ if (so->num_outputs)
+ fprintf(stderr, "STREAMOUT\n");
+
+ for (i = 0; i < so->num_outputs; i++) {
+ unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
+ fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", i, so->output[i].output_buffer,
+ so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
+ so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "",
+ mask & 4 ? "z" : "", mask & 8 ? "w" : "");
+ }
}
static void declare_streamout_params(struct si_shader_context *ctx,
- struct pipe_stream_output_info *so)
+ struct pipe_stream_output_info *so)
{
- if (ctx->screen->use_ngg_streamout) {
- if (ctx->type == PIPE_SHADER_TESS_EVAL)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- return;
- }
-
- /* Streamout SGPRs. */
- if (so->num_outputs) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index);
- } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- }
-
- /* A streamout buffer offset is loaded if the stride is non-zero. */
- for (int i = 0; i < 4; i++) {
- if (!so->stride[i])
- continue;
-
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]);
- }
+ if (ctx->screen->use_ngg_streamout) {
+ if (ctx->type == PIPE_SHADER_TESS_EVAL)
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ return;
+ }
+
+ /* Streamout SGPRs. */
+ if (so->num_outputs) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index);
+ } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ }
+
+ /* A streamout buffer offset is loaded if the stride is non-zero. */
+ for (int i = 0; i < 4; i++) {
+ if (!so->stride[i])
+ continue;
+
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]);
+ }
}
unsigned si_get_max_workgroup_size(const struct si_shader *shader)
{
- switch (shader->selector->type) {
- case PIPE_SHADER_VERTEX:
- case PIPE_SHADER_TESS_EVAL:
- return shader->key.as_ngg ? 128 : 0;
-
- case PIPE_SHADER_TESS_CTRL:
- /* Return this so that LLVM doesn't remove s_barrier
- * instructions on chips where we use s_barrier. */
- return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
-
- case PIPE_SHADER_GEOMETRY:
- return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
-
- case PIPE_SHADER_COMPUTE:
- break; /* see below */
-
- default:
- return 0;
- }
-
- const unsigned *properties = shader->selector->info.properties;
- unsigned max_work_group_size =
- properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
- properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
- properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
-
- if (!max_work_group_size) {
- /* This is a variable group size compute shader,
- * compile it for the maximum possible group size.
- */
- max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
- }
- return max_work_group_size;
+ switch (shader->selector->type) {
+ case PIPE_SHADER_VERTEX:
+ case PIPE_SHADER_TESS_EVAL:
+ return shader->key.as_ngg ? 128 : 0;
+
+ case PIPE_SHADER_TESS_CTRL:
+ /* Return this so that LLVM doesn't remove s_barrier
+ * instructions on chips where we use s_barrier. */
+ return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
+
+ case PIPE_SHADER_GEOMETRY:
+ return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
+
+ case PIPE_SHADER_COMPUTE:
+ break; /* see below */
+
+ default:
+ return 0;
+ }
+
+ const unsigned *properties = shader->selector->info.properties;
+ unsigned max_work_group_size = properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
+ properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
+ properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
+
+ if (!max_work_group_size) {
+ /* This is a variable group size compute shader,
+ * so compile it for the maximum possible group size.
+ */
+ max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
+ }
+ return max_work_group_size;
}
-static void declare_const_and_shader_buffers(struct si_shader_context *ctx,
- bool assign_params)
+static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params)
{
- enum ac_arg_type const_shader_buf_type;
+ enum ac_arg_type const_shader_buf_type;
- if (ctx->shader->selector->info.const_buffers_declared == 1 &&
- ctx->shader->selector->info.shader_buffers_declared == 0)
- const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
- else
- const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
+ if (ctx->shader->selector->info.const_buffers_declared == 1 &&
+ ctx->shader->selector->info.shader_buffers_declared == 0)
+ const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
+ else
+ const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
- assign_params ? &ctx->const_and_shader_buffers :
- &ctx->other_const_and_shader_buffers);
+ ac_add_arg(
+ &ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
+ assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers);
}
-static void declare_samplers_and_images(struct si_shader_context *ctx,
- bool assign_params)
+static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params)
{
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
- assign_params ? &ctx->samplers_and_images :
- &ctx->other_samplers_and_images);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
+ assign_params ? &ctx->samplers_and_images : &ctx->other_samplers_and_images);
}
-static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
- bool assign_params)
+static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params)
{
- declare_const_and_shader_buffers(ctx, assign_params);
- declare_samplers_and_images(ctx, assign_params);
+ declare_const_and_shader_buffers(ctx, assign_params);
+ declare_samplers_and_images(ctx, assign_params);
}
static void declare_global_desc_pointers(struct si_shader_context *ctx)
{
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
- &ctx->rw_buffers);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
- &ctx->bindless_samplers_and_images);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->rw_buffers);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
+ &ctx->bindless_samplers_and_images);
}
static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)
{
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
- if (!ctx->shader->is_gs_copy_shader) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
- }
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
+ if (!ctx->shader->is_gs_copy_shader) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
+ }
}
static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
{
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
- unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
- if (num_vbos_in_user_sgprs) {
- unsigned user_sgprs = ctx->args.num_sgprs_used;
+ unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
+ if (num_vbos_in_user_sgprs) {
+ unsigned user_sgprs = ctx->args.num_sgprs_used;
- if (si_is_merged_shader(ctx->shader))
- user_sgprs -= 8;
- assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
+ if (si_is_merged_shader(ctx->shader))
+ user_sgprs -= 8;
+ assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
- /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
- for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
+ /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
+ for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
- assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
- for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
- }
+ assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
+ for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
+ }
}
-static void declare_vs_input_vgprs(struct si_shader_context *ctx,
- unsigned *num_prolog_vgprs,
- bool ngg_cull_shader)
+static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs,
+ bool ngg_cull_shader)
{
- struct si_shader *shader = ctx->shader;
-
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
- if (shader->key.as_ls) {
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id);
- if (ctx->screen->info.chip_class >= GFX10) {
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
- } else {
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
- }
- } else if (ctx->screen->info.chip_class >= GFX10) {
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
- &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
- } else {
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
- }
-
- if (!shader->is_gs_copy_shader) {
- if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
- &ctx->ngg_old_thread_id);
- }
-
- /* Vertex load indices. */
- if (shader->selector->info.num_inputs) {
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
- &ctx->vertex_index0);
- for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
- }
- *num_prolog_vgprs += shader->selector->info.num_inputs;
- }
+ struct si_shader *shader = ctx->shader;
+
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
+ if (shader->key.as_ls) {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id);
+ if (ctx->screen->info.chip_class >= GFX10) {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
+ } else {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
+ }
+ } else if (ctx->screen->info.chip_class >= GFX10) {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
+ &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
+ } else {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
+ }
+
+ if (!shader->is_gs_copy_shader) {
+ if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
+ }
+
+ /* Vertex load indices. */
+ if (shader->selector->info.num_inputs) {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
+ for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+ }
+ *num_prolog_vgprs += shader->selector->info.num_inputs;
+ }
}
-static void declare_vs_blit_inputs(struct si_shader_context *ctx,
- unsigned vs_blit_property)
+static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property)
{
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->vs_blit_inputs); /* i16 x1, y1 */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */
-
- if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
- } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
- }
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */
+
+ if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
+ } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
+ }
}
static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader)
{
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
-
- if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
- &ctx->ngg_old_thread_id);
- }
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
+
+ if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
+ }
}
-enum {
- /* Convenient merged shader definitions. */
- SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
- SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
+enum
+{
+ /* Convenient merged shader definitions. */
+ SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
+ SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
-void si_add_arg_checked(struct ac_shader_args *args,
- enum ac_arg_regfile file,
- unsigned registers, enum ac_arg_type type,
- struct ac_arg *arg,
- unsigned idx)
+void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
+ enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
{
- assert(args->arg_count == idx);
- ac_add_arg(args, file, registers, type, arg);
+ assert(args->arg_count == idx);
+ ac_add_arg(args, file, registers, type, arg);
}
void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
{
- struct si_shader *shader = ctx->shader;
- LLVMTypeRef returns[AC_MAX_ARGS];
- unsigned i, num_return_sgprs;
- unsigned num_returns = 0;
- unsigned num_prolog_vgprs = 0;
- unsigned type = ctx->type;
- unsigned vs_blit_property =
- shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
-
- memset(&ctx->args, 0, sizeof(ctx->args));
-
- /* Set MERGED shaders. */
- if (ctx->screen->info.chip_class >= GFX9) {
- if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
- type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
- else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY)
- type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
- }
-
- switch (type) {
- case PIPE_SHADER_VERTEX:
- declare_global_desc_pointers(ctx);
-
- if (vs_blit_property) {
- declare_vs_blit_inputs(ctx, vs_blit_property);
-
- /* VGPRs */
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
- break;
- }
-
- declare_per_stage_desc_pointers(ctx, true);
- declare_vs_specific_input_sgprs(ctx);
- if (!shader->is_gs_copy_shader)
- declare_vb_descriptor_input_sgprs(ctx);
-
- if (shader->key.as_es) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->es2gs_offset);
- } else if (shader->key.as_ls) {
- /* no extra parameters */
- } else {
- /* The locations of the other parameters are assigned dynamically. */
- declare_streamout_params(ctx, &shader->selector->so);
- }
-
- /* VGPRs */
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
-
- /* Return values */
- if (shader->key.opt.vs_as_prim_discard_cs) {
- for (i = 0; i < 4; i++)
- returns[num_returns++] = ctx->ac.f32; /* VGPRs */
- }
- break;
-
- case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */
- declare_global_desc_pointers(ctx);
- declare_per_stage_desc_pointers(ctx, true);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
-
- /* VGPRs */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
-
- /* param_tcs_offchip_offset and param_tcs_factor_offset are
- * placed after the user SGPRs.
- */
- for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
- returns[num_returns++] = ctx->ac.i32; /* SGPRs */
- for (i = 0; i < 11; i++)
- returns[num_returns++] = ctx->ac.f32; /* VGPRs */
- break;
-
- case SI_SHADER_MERGED_VERTEX_TESSCTRL:
- /* Merged stages have 8 system SGPRs at the beginning. */
- /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
- declare_per_stage_desc_pointers(ctx,
- ctx->type == PIPE_SHADER_TESS_CTRL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
-
- declare_global_desc_pointers(ctx);
- declare_per_stage_desc_pointers(ctx,
- ctx->type == PIPE_SHADER_VERTEX);
- declare_vs_specific_input_sgprs(ctx);
-
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
- declare_vb_descriptor_input_sgprs(ctx);
-
- /* VGPRs (first TCS, then VS) */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
-
- if (ctx->type == PIPE_SHADER_VERTEX) {
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
-
- /* LS return values are inputs to the TCS main shader part. */
- for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
- returns[num_returns++] = ctx->ac.i32; /* SGPRs */
- for (i = 0; i < 2; i++)
- returns[num_returns++] = ctx->ac.f32; /* VGPRs */
- } else {
- /* TCS return values are inputs to the TCS epilog.
- *
- * param_tcs_offchip_offset, param_tcs_factor_offset,
- * param_tcs_offchip_layout, and param_rw_buffers
- * should be passed to the epilog.
- */
- for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
- returns[num_returns++] = ctx->ac.i32; /* SGPRs */
- for (i = 0; i < 11; i++)
- returns[num_returns++] = ctx->ac.f32; /* VGPRs */
- }
- break;
-
- case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
- /* Merged stages have 8 system SGPRs at the beginning. */
- /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
- declare_per_stage_desc_pointers(ctx,
- ctx->type == PIPE_SHADER_GEOMETRY);
-
- if (ctx->shader->key.as_ngg)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
- else
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
-
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
- &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
-
- declare_global_desc_pointers(ctx);
- if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) {
- declare_per_stage_desc_pointers(ctx,
- (ctx->type == PIPE_SHADER_VERTEX ||
- ctx->type == PIPE_SHADER_TESS_EVAL));
- }
-
- if (ctx->type == PIPE_SHADER_VERTEX) {
- if (vs_blit_property)
- declare_vs_blit_inputs(ctx, vs_blit_property);
- else
- declare_vs_specific_input_sgprs(ctx);
- } else {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
- /* Declare as many input SGPRs as the VS has. */
- }
-
- if (ctx->type == PIPE_SHADER_VERTEX)
- declare_vb_descriptor_input_sgprs(ctx);
-
- /* VGPRs (first GS, then VS/TES) */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
-
- if (ctx->type == PIPE_SHADER_VERTEX) {
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
- } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
- declare_tes_input_vgprs(ctx, ngg_cull_shader);
- }
-
- if ((ctx->shader->key.as_es || ngg_cull_shader) &&
- (ctx->type == PIPE_SHADER_VERTEX ||
- ctx->type == PIPE_SHADER_TESS_EVAL)) {
- unsigned num_user_sgprs, num_vgprs;
-
- if (ctx->type == PIPE_SHADER_VERTEX) {
- /* For the NGG cull shader, add 1 SGPR to hold
- * the vertex buffer pointer.
- */
- num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
-
- if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
- assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
- num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
- shader->selector->num_vbos_in_user_sgprs * 4;
- }
- } else {
- num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
- }
-
- /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
- *
- * The normal merged ESGS shader only has to return the 5 VGPRs
- * for the GS stage.
- */
- num_vgprs = ngg_cull_shader ? 10 : 5;
-
- /* ES return values are inputs to GS. */
- for (i = 0; i < 8 + num_user_sgprs; i++)
- returns[num_returns++] = ctx->ac.i32; /* SGPRs */
- for (i = 0; i < num_vgprs; i++)
- returns[num_returns++] = ctx->ac.f32; /* VGPRs */
- }
- break;
-
- case PIPE_SHADER_TESS_EVAL:
- declare_global_desc_pointers(ctx);
- declare_per_stage_desc_pointers(ctx, true);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
-
- if (shader->key.as_es) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
- } else {
- declare_streamout_params(ctx, &shader->selector->so);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
- }
-
- /* VGPRs */
- declare_tes_input_vgprs(ctx, ngg_cull_shader);
- break;
-
- case PIPE_SHADER_GEOMETRY:
- declare_global_desc_pointers(ctx);
- declare_per_stage_desc_pointers(ctx, true);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);
-
- /* VGPRs */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
- break;
-
- case PIPE_SHADER_FRAGMENT:
- declare_global_desc_pointers(ctx);
- declare_per_stage_desc_pointers(ctx, true);
- si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL,
- SI_PARAM_ALPHA_REF);
- si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->args.prim_mask, SI_PARAM_PRIM_MASK);
-
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
- SI_PARAM_PERSP_SAMPLE);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
- &ctx->args.persp_center, SI_PARAM_PERSP_CENTER);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
- &ctx->args.persp_centroid, SI_PARAM_PERSP_CENTROID);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
- NULL, SI_PARAM_PERSP_PULL_MODEL);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
- &ctx->args.linear_sample, SI_PARAM_LINEAR_SAMPLE);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
- &ctx->args.linear_center, SI_PARAM_LINEAR_CENTER);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
- &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT,
- NULL, SI_PARAM_LINE_STIPPLE_TEX);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
- &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
- &ctx->args.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
- &ctx->args.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
- &ctx->args.frag_pos[3], SI_PARAM_POS_W_FLOAT);
- shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
- &ctx->args.front_face, SI_PARAM_FRONT_FACE);
- shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
- &ctx->args.ancillary, SI_PARAM_ANCILLARY);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
- &ctx->args.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
- si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
- &ctx->pos_fixed_pt, SI_PARAM_POS_FIXED_PT);
-
- /* Color inputs from the prolog. */
- if (shader->selector->info.colors_read) {
- unsigned num_color_elements =
- util_bitcount(shader->selector->info.colors_read);
-
- for (i = 0; i < num_color_elements; i++)
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
-
- num_prolog_vgprs += num_color_elements;
- }
-
- /* Outputs for the epilog. */
- num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
- num_returns =
- num_return_sgprs +
- util_bitcount(shader->selector->info.colors_written) * 4 +
- shader->selector->info.writes_z +
- shader->selector->info.writes_stencil +
- shader->selector->info.writes_samplemask +
- 1 /* SampleMaskIn */;
-
- num_returns = MAX2(num_returns,
- num_return_sgprs +
- PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
-
- for (i = 0; i < num_return_sgprs; i++)
- returns[i] = ctx->ac.i32;
- for (; i < num_returns; i++)
- returns[i] = ctx->ac.f32;
- break;
-
- case PIPE_SHADER_COMPUTE:
- declare_global_desc_pointers(ctx);
- declare_per_stage_desc_pointers(ctx, true);
- if (shader->selector->info.uses_grid_size)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT,
- &ctx->args.num_work_groups);
- if (shader->selector->info.uses_block_size &&
- shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);
-
- unsigned cs_user_data_dwords =
- shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
- if (cs_user_data_dwords) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT,
- &ctx->cs_user_data);
- }
-
- /* Hardware SGPRs. */
- for (i = 0; i < 3; i++) {
- if (shader->selector->info.uses_block_id[i]) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->args.workgroup_ids[i]);
- }
- }
- if (shader->selector->info.uses_subgroup_info)
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
-
- /* Hardware VGPRs. */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
- &ctx->args.local_invocation_ids);
- break;
- default:
- assert(0 && "unimplemented shader");
- return;
- }
-
- si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main",
- returns, num_returns, si_get_max_workgroup_size(shader));
-
- /* Reserve register locations for VGPR inputs the PS prolog may need. */
- if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
- ac_llvm_add_target_dep_function_attr(ctx->main_fn,
- "InitialPSInputAddr",
- S_0286D0_PERSP_SAMPLE_ENA(1) |
- S_0286D0_PERSP_CENTER_ENA(1) |
- S_0286D0_PERSP_CENTROID_ENA(1) |
- S_0286D0_LINEAR_SAMPLE_ENA(1) |
- S_0286D0_LINEAR_CENTER_ENA(1) |
- S_0286D0_LINEAR_CENTROID_ENA(1) |
- S_0286D0_FRONT_FACE_ENA(1) |
- S_0286D0_ANCILLARY_ENA(1) |
- S_0286D0_POS_FIXED_PT_ENA(1));
- }
-
- shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
- shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
-
- assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
- shader->info.num_input_vgprs -= num_prolog_vgprs;
-
- if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) {
- if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
- /* The LSHS size is not known until draw time, so we append it
- * at the end of whatever LDS use there may be in the rest of
- * the shader (currently none, unless LLVM decides to do its
- * own LDS-based lowering).
- */
- ctx->ac.lds = LLVMAddGlobalInAddressSpace(
- ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
- "__lds_end", AC_ADDR_SPACE_LDS);
- LLVMSetAlignment(ctx->ac.lds, 256);
- } else {
- ac_declare_lds_as_pointer(&ctx->ac);
- }
- }
-
- /* Unlike radv, we override these arguments in the prolog, so to the
- * API shader they appear as normal arguments.
- */
- if (ctx->type == PIPE_SHADER_VERTEX) {
- ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
- ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
- } else if (ctx->type == PIPE_SHADER_FRAGMENT) {
- ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
- ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
- }
+ struct si_shader *shader = ctx->shader;
+ LLVMTypeRef returns[AC_MAX_ARGS];
+ unsigned i, num_return_sgprs;
+ unsigned num_returns = 0;
+ unsigned num_prolog_vgprs = 0;
+ unsigned type = ctx->type;
+ unsigned vs_blit_property = shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
+
+ memset(&ctx->args, 0, sizeof(ctx->args));
+
+ /* Set MERGED shaders. */
+ if (ctx->screen->info.chip_class >= GFX9) {
+ if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
+ type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
+ else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY)
+ type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
+ }
+
+ switch (type) {
+ case PIPE_SHADER_VERTEX:
+ declare_global_desc_pointers(ctx);
+
+ if (vs_blit_property) {
+ declare_vs_blit_inputs(ctx, vs_blit_property);
+
+ /* VGPRs */
+ declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
+ break;
+ }
+
+ declare_per_stage_desc_pointers(ctx, true);
+ declare_vs_specific_input_sgprs(ctx);
+ if (!shader->is_gs_copy_shader)
+ declare_vb_descriptor_input_sgprs(ctx);
+
+ if (shader->key.as_es) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
+ } else if (shader->key.as_ls) {
+ /* no extra parameters */
+ } else {
+ /* The locations of the other parameters are assigned dynamically. */
+ declare_streamout_params(ctx, &shader->selector->so);
+ }
+
+ /* VGPRs */
+ declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
+
+ /* Return values */
+ if (shader->key.opt.vs_as_prim_discard_cs) {
+ for (i = 0; i < 4; i++)
+ returns[num_returns++] = ctx->ac.f32; /* VGPRs */
+ }
+ break;
+
+ case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */
+ declare_global_desc_pointers(ctx);
+ declare_per_stage_desc_pointers(ctx, true);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
+
+ /* VGPRs */
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
+
+ /* param_tcs_offchip_offset and param_tcs_factor_offset are
+ * placed after the user SGPRs.
+ */
+ for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
+ returns[num_returns++] = ctx->ac.i32; /* SGPRs */
+ for (i = 0; i < 11; i++)
+ returns[num_returns++] = ctx->ac.f32; /* VGPRs */
+ break;
+
+ case SI_SHADER_MERGED_VERTEX_TESSCTRL:
+ /* Merged stages have 8 system SGPRs at the beginning. */
+ /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
+ declare_per_stage_desc_pointers(ctx, ctx->type == PIPE_SHADER_TESS_CTRL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
+
+ declare_global_desc_pointers(ctx);
+ declare_per_stage_desc_pointers(ctx, ctx->type == PIPE_SHADER_VERTEX);
+ declare_vs_specific_input_sgprs(ctx);
+
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
+ declare_vb_descriptor_input_sgprs(ctx);
+
+ /* VGPRs (first TCS, then VS) */
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
+
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
+
+ /* LS return values are inputs to the TCS main shader part. */
+ for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
+ returns[num_returns++] = ctx->ac.i32; /* SGPRs */
+ for (i = 0; i < 2; i++)
+ returns[num_returns++] = ctx->ac.f32; /* VGPRs */
+ } else {
+ /* TCS return values are inputs to the TCS epilog.
+ *
+ * param_tcs_offchip_offset, param_tcs_factor_offset,
+ * param_tcs_offchip_layout, and param_rw_buffers
+ * should be passed to the epilog.
+ */
+ for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
+ returns[num_returns++] = ctx->ac.i32; /* SGPRs */
+ for (i = 0; i < 11; i++)
+ returns[num_returns++] = ctx->ac.f32; /* VGPRs */
+ }
+ break;
+
+ case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
+ /* Merged stages have 8 system SGPRs at the beginning. */
+ /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
+ declare_per_stage_desc_pointers(ctx, ctx->type == PIPE_SHADER_GEOMETRY);
+
+ if (ctx->shader->key.as_ngg)
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
+ else
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
+
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
+ &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
+ NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
+
+ declare_global_desc_pointers(ctx);
+ if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) {
+ declare_per_stage_desc_pointers(
+ ctx, (ctx->type == PIPE_SHADER_VERTEX || ctx->type == PIPE_SHADER_TESS_EVAL));
+ }
+
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ if (vs_blit_property)
+ declare_vs_blit_inputs(ctx, vs_blit_property);
+ else
+ declare_vs_specific_input_sgprs(ctx);
+ } else {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
+ /* Declare as many input SGPRs as the VS has. */
+ }
+
+ if (ctx->type == PIPE_SHADER_VERTEX)
+ declare_vb_descriptor_input_sgprs(ctx);
+
+ /* VGPRs (first GS, then VS/TES) */
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
+
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
+ } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
+ declare_tes_input_vgprs(ctx, ngg_cull_shader);
+ }
+
+ if ((ctx->shader->key.as_es || ngg_cull_shader) &&
+ (ctx->type == PIPE_SHADER_VERTEX || ctx->type == PIPE_SHADER_TESS_EVAL)) {
+ unsigned num_user_sgprs, num_vgprs;
+
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ /* For the NGG cull shader, add 1 SGPR to hold
+ * the vertex buffer pointer.
+ */
+ num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
+
+ if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
+ assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
+ num_user_sgprs =
+ SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
+ }
+ } else {
+ num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+ }
+
+ /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
+ *
+ * The normal merged ESGS shader only has to return the 5 VGPRs
+ * for the GS stage.
+ */
+ num_vgprs = ngg_cull_shader ? 10 : 5;
+
+ /* ES return values are inputs to GS. */
+ for (i = 0; i < 8 + num_user_sgprs; i++)
+ returns[num_returns++] = ctx->ac.i32; /* SGPRs */
+ for (i = 0; i < num_vgprs; i++)
+ returns[num_returns++] = ctx->ac.f32; /* VGPRs */
+ }
+ break;
+
+ case PIPE_SHADER_TESS_EVAL:
+ declare_global_desc_pointers(ctx);
+ declare_per_stage_desc_pointers(ctx, true);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
+
+ if (shader->key.as_es) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
+ } else {
+ declare_streamout_params(ctx, &shader->selector->so);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+ }
+
+ /* VGPRs */
+ declare_tes_input_vgprs(ctx, ngg_cull_shader);
+ break;
+
+ case PIPE_SHADER_GEOMETRY:
+ declare_global_desc_pointers(ctx);
+ declare_per_stage_desc_pointers(ctx, true);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);
+
+ /* VGPRs */
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
+ break;
+
+ case PIPE_SHADER_FRAGMENT:
+ declare_global_desc_pointers(ctx);
+ declare_per_stage_desc_pointers(ctx, true);
+ si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF);
+ si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask,
+ SI_PARAM_PRIM_MASK);
+
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
+ SI_PARAM_PERSP_SAMPLE);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center,
+ SI_PARAM_PERSP_CENTER);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid,
+ SI_PARAM_PERSP_CENTROID);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample,
+ SI_PARAM_LINEAR_SAMPLE);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center,
+ SI_PARAM_LINEAR_CENTER);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid,
+ SI_PARAM_LINEAR_CENTROID);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0],
+ SI_PARAM_POS_X_FLOAT);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1],
+ SI_PARAM_POS_Y_FLOAT);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2],
+ SI_PARAM_POS_Z_FLOAT);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3],
+ SI_PARAM_POS_W_FLOAT);
+ shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face,
+ SI_PARAM_FRONT_FACE);
+ shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary,
+ SI_PARAM_ANCILLARY);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage,
+ SI_PARAM_SAMPLE_COVERAGE);
+ si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt,
+ SI_PARAM_POS_FIXED_PT);
+
+ /* Color inputs from the prolog. */
+ if (shader->selector->info.colors_read) {
+ unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);
+
+ for (i = 0; i < num_color_elements; i++)
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
+
+ num_prolog_vgprs += num_color_elements;
+ }
+
+ /* Outputs for the epilog. */
+ num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
+ num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
+ shader->selector->info.writes_z + shader->selector->info.writes_stencil +
+ shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */;
+
+ num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+ for (i = 0; i < num_return_sgprs; i++)
+ returns[i] = ctx->ac.i32;
+ for (; i < num_returns; i++)
+ returns[i] = ctx->ac.f32;
+ break;
+
+ case PIPE_SHADER_COMPUTE:
+ declare_global_desc_pointers(ctx);
+ declare_per_stage_desc_pointers(ctx, true);
+ if (shader->selector->info.uses_grid_size)
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups);
+ if (shader->selector->info.uses_block_size &&
+ shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);
+
+ unsigned cs_user_data_dwords =
+ shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
+ if (cs_user_data_dwords) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data);
+ }
+
+ /* Hardware SGPRs. */
+ for (i = 0; i < 3; i++) {
+ if (shader->selector->info.uses_block_id[i]) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]);
+ }
+ }
+ if (shader->selector->info.uses_subgroup_info)
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
+
+ /* Hardware VGPRs. */
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids);
+ break;
+ default:
+ assert(0 && "unimplemented shader");
+ return;
+ }
+
+ si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns, num_returns,
+ si_get_max_workgroup_size(shader));
+
+ /* Reserve register locations for VGPR inputs the PS prolog may need. */
+ if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
+ ac_llvm_add_target_dep_function_attr(
+ ctx->main_fn, "InitialPSInputAddr",
+ S_0286D0_PERSP_SAMPLE_ENA(1) | S_0286D0_PERSP_CENTER_ENA(1) |
+ S_0286D0_PERSP_CENTROID_ENA(1) | S_0286D0_LINEAR_SAMPLE_ENA(1) |
+ S_0286D0_LINEAR_CENTER_ENA(1) | S_0286D0_LINEAR_CENTROID_ENA(1) |
+ S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_ANCILLARY_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1));
+ }
+
+ shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
+ shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
+
+ assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
+ shader->info.num_input_vgprs -= num_prolog_vgprs;
+
+ if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) {
+ if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
+ /* The LSHS size is not known until draw time, so we append it
+ * at the end of whatever LDS use there may be in the rest of
+ * the shader (currently none, unless LLVM decides to do its
+ * own LDS-based lowering).
+ */
+ ctx->ac.lds = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
+ "__lds_end", AC_ADDR_SPACE_LDS);
+ LLVMSetAlignment(ctx->ac.lds, 256);
+ } else {
+ ac_declare_lds_as_pointer(&ctx->ac);
+ }
+ }
+
+ /* Unlike radv, we override these arguments in the prolog, so to the
+ * API shader they appear as normal arguments.
+ */
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
+ ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
+ } else if (ctx->type == PIPE_SHADER_FRAGMENT) {
+ ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
+ ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
+ }
}
/* For the UMR disassembler. */
-#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
-#define DEBUGGER_NUM_MARKERS 5
+#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
+#define DEBUGGER_NUM_MARKERS 5
-static bool si_shader_binary_open(struct si_screen *screen,
- struct si_shader *shader,
- struct ac_rtld_binary *rtld)
+static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
+ struct ac_rtld_binary *rtld)
{
- const struct si_shader_selector *sel = shader->selector;
- const char *part_elfs[5];
- size_t part_sizes[5];
- unsigned num_parts = 0;
-
-#define add_part(shader_or_part) \
- if (shader_or_part) { \
- part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
- part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
- num_parts++; \
- }
-
- add_part(shader->prolog);
- add_part(shader->previous_stage);
- add_part(shader->prolog2);
- add_part(shader);
- add_part(shader->epilog);
+ const struct si_shader_selector *sel = shader->selector;
+ const char *part_elfs[5];
+ size_t part_sizes[5];
+ unsigned num_parts = 0;
+
+#define add_part(shader_or_part) \
+ if (shader_or_part) { \
+ part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
+ part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
+ num_parts++; \
+ }
+
+ add_part(shader->prolog);
+ add_part(shader->previous_stage);
+ add_part(shader->prolog2);
+ add_part(shader);
+ add_part(shader->epilog);
#undef add_part
- struct ac_rtld_symbol lds_symbols[2];
- unsigned num_lds_symbols = 0;
-
- if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
- (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) {
- /* We add this symbol even on LLVM <= 8 to ensure that
- * shader->config.lds_size is set correctly below.
- */
- struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
- sym->name = "esgs_ring";
- sym->size = shader->gs_info.esgs_ring_size;
- sym->align = 64 * 1024;
- }
-
- if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) {
- struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
- sym->name = "ngg_emit";
- sym->size = shader->ngg.ngg_emit_size * 4;
- sym->align = 4;
- }
-
- bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){
- .info = &screen->info,
- .options = {
- .halt_at_entry = screen->options.halt_shaders,
- },
- .shader_type = tgsi_processor_to_shader_stage(sel->type),
- .wave_size = si_get_shader_wave_size(shader),
- .num_parts = num_parts,
- .elf_ptrs = part_elfs,
- .elf_sizes = part_sizes,
- .num_shared_lds_symbols = num_lds_symbols,
- .shared_lds_symbols = lds_symbols });
-
- if (rtld->lds_size > 0) {
- unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
- shader->config.lds_size =
- align(rtld->lds_size, alloc_granularity) / alloc_granularity;
- }
-
- return ok;
+ struct ac_rtld_symbol lds_symbols[2];
+ unsigned num_lds_symbols = 0;
+
+ if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
+ (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) {
+ /* We add this symbol even on LLVM <= 8 to ensure that
+ * shader->config.lds_size is set correctly below.
+ */
+ struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
+ sym->name = "esgs_ring";
+ sym->size = shader->gs_info.esgs_ring_size;
+ sym->align = 64 * 1024;
+ }
+
+ if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) {
+ struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
+ sym->name = "ngg_emit";
+ sym->size = shader->ngg.ngg_emit_size * 4;
+ sym->align = 4;
+ }
+
+ bool ok = ac_rtld_open(
+ rtld, (struct ac_rtld_open_info){.info = &screen->info,
+ .options =
+ {
+ .halt_at_entry = screen->options.halt_shaders,
+ },
+ .shader_type = tgsi_processor_to_shader_stage(sel->type),
+ .wave_size = si_get_shader_wave_size(shader),
+ .num_parts = num_parts,
+ .elf_ptrs = part_elfs,
+ .elf_sizes = part_sizes,
+ .num_shared_lds_symbols = num_lds_symbols,
+ .shared_lds_symbols = lds_symbols});
+
+ if (rtld->lds_size > 0) {
+ unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
+ shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity;
+ }
+
+ return ok;
}
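The rounding at the end of si_shader_binary_open converts the byte size reported by the rtld into the hardware's LDS allocation units (512-byte blocks on GFX7 and later, 256-byte blocks before that). A minimal standalone sketch of that conversion, using hypothetical sizes rather than real rtld output:

#include <assert.h>
#include <stdio.h>

/* Same rounding as align(lds_size, granularity) / granularity above. */
static unsigned lds_blocks(unsigned lds_size_bytes, unsigned alloc_granularity)
{
   return (lds_size_bytes + alloc_granularity - 1) / alloc_granularity;
}

int main(void)
{
   /* Hypothetical LDS sizes, not taken from a real shader. */
   assert(lds_blocks(5000, 512) == 10); /* GFX7+: 5000 bytes -> 10 blocks of 512 */
   assert(lds_blocks(5000, 256) == 20); /* GFX6:  5000 bytes -> 20 blocks of 256 */
   printf("%u %u\n", lds_blocks(5000, 512), lds_blocks(5000, 256));
   return 0;
}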
static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
{
- struct ac_rtld_binary rtld;
- si_shader_binary_open(screen, shader, &rtld);
- return rtld.exec_size;
+ struct ac_rtld_binary rtld;
+ si_shader_binary_open(screen, shader, &rtld);
+ return rtld.exec_size;
}
static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
{
- uint64_t *scratch_va = data;
-
- if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
- *value = (uint32_t)*scratch_va;
- return true;
- }
- if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
- /* Enable scratch coalescing. */
- *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) |
- S_008F04_SWIZZLE_ENABLE(1);
- return true;
- }
-
- return false;
+ uint64_t *scratch_va = data;
+
+ if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
+ *value = (uint32_t)*scratch_va;
+ return true;
+ }
+ if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
+ /* Enable scratch coalescing. */
+ *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1);
+ return true;
+ }
+
+ return false;
}
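si_get_external_symbol patches the scratch buffer descriptor at upload time: the low 32 bits of the scratch virtual address resolve the dword0 symbol, and the dword1 symbol carries the high address bits plus the swizzle-enable flag. A hedged sketch of that split with a made-up address and plain shifts standing in for the S_008F04_* packing macros (the bit positions below are illustrative only, not the real field layout):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   /* Hypothetical scratch buffer virtual address. */
   uint64_t scratch_va = 0x0000010012345000ull;

   /* dword0 symbol: low 32 bits of the address. */
   uint32_t dword0 = (uint32_t)scratch_va;

   /* dword1 symbol: high address bits plus a swizzle-enable flag. The shift
    * and bit position are placeholders for S_008F04_BASE_ADDRESS_HI() and
    * S_008F04_SWIZZLE_ENABLE(). */
   uint32_t dword1 = (uint32_t)(scratch_va >> 32) | (1u << 31);

   printf("dword0 = 0x%08" PRIx32 "  dword1 = 0x%08" PRIx32 "\n", dword0, dword1);
   return 0;
}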
bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
- uint64_t scratch_va)
+ uint64_t scratch_va)
{
- struct ac_rtld_binary binary;
- if (!si_shader_binary_open(sscreen, shader, &binary))
- return false;
-
- si_resource_reference(&shader->bo, NULL);
- shader->bo = si_aligned_buffer_create(&sscreen->b,
- sscreen->info.cpdma_prefetch_writes_memory ?
- 0 : SI_RESOURCE_FLAG_READ_ONLY,
- PIPE_USAGE_IMMUTABLE,
- align(binary.rx_size, SI_CPDMA_ALIGNMENT),
- 256);
- if (!shader->bo)
- return false;
-
- /* Upload. */
- struct ac_rtld_upload_info u = {};
- u.binary = &binary;
- u.get_external_symbol = si_get_external_symbol;
- u.cb_data = &scratch_va;
- u.rx_va = shader->bo->gpu_address;
- u.rx_ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
- PIPE_TRANSFER_READ_WRITE |
- PIPE_TRANSFER_UNSYNCHRONIZED |
- RADEON_TRANSFER_TEMPORARY);
- if (!u.rx_ptr)
- return false;
-
- bool ok = ac_rtld_upload(&u);
-
- sscreen->ws->buffer_unmap(shader->bo->buf);
- ac_rtld_close(&binary);
-
- return ok;
+ struct ac_rtld_binary binary;
+ if (!si_shader_binary_open(sscreen, shader, &binary))
+ return false;
+
+ si_resource_reference(&shader->bo, NULL);
+ shader->bo = si_aligned_buffer_create(
+ &sscreen->b, sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY,
+ PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256);
+ if (!shader->bo)
+ return false;
+
+ /* Upload. */
+ struct ac_rtld_upload_info u = {};
+ u.binary = &binary;
+ u.get_external_symbol = si_get_external_symbol;
+ u.cb_data = &scratch_va;
+ u.rx_va = shader->bo->gpu_address;
+ u.rx_ptr = sscreen->ws->buffer_map(
+ shader->bo->buf, NULL,
+ PIPE_TRANSFER_READ_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED | RADEON_TRANSFER_TEMPORARY);
+ if (!u.rx_ptr)
+ return false;
+
+ bool ok = ac_rtld_upload(&u);
+
+ sscreen->ws->buffer_unmap(shader->bo->buf);
+ ac_rtld_close(&binary);
+
+ return ok;
}
static void si_shader_dump_disassembly(struct si_screen *screen,
- const struct si_shader_binary *binary,
- enum pipe_shader_type shader_type,
- unsigned wave_size,
- struct pipe_debug_callback *debug,
- const char *name, FILE *file)
+ const struct si_shader_binary *binary,
+ enum pipe_shader_type shader_type, unsigned wave_size,
+ struct pipe_debug_callback *debug, const char *name,
+ FILE *file)
{
- struct ac_rtld_binary rtld_binary;
-
- if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
- .info = &screen->info,
- .shader_type = tgsi_processor_to_shader_stage(shader_type),
- .wave_size = wave_size,
- .num_parts = 1,
- .elf_ptrs = &binary->elf_buffer,
- .elf_sizes = &binary->elf_size }))
- return;
-
- const char *disasm;
- size_t nbytes;
-
- if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
- goto out;
-
- if (nbytes > INT_MAX)
- goto out;
-
- if (debug && debug->debug_message) {
- /* Very long debug messages are cut off, so send the
- * disassembly one line at a time. This causes more
- * overhead, but on the plus side it simplifies
- * parsing of resulting logs.
- */
- pipe_debug_message(debug, SHADER_INFO,
- "Shader Disassembly Begin");
-
- uint64_t line = 0;
- while (line < nbytes) {
- int count = nbytes - line;
- const char *nl = memchr(disasm + line, '\n', nbytes - line);
- if (nl)
- count = nl - (disasm + line);
-
- if (count) {
- pipe_debug_message(debug, SHADER_INFO,
- "%.*s", count, disasm + line);
- }
-
- line += count + 1;
- }
-
- pipe_debug_message(debug, SHADER_INFO,
- "Shader Disassembly End");
- }
-
- if (file) {
- fprintf(file, "Shader %s disassembly:\n", name);
- fprintf(file, "%*s", (int)nbytes, disasm);
- }
+ struct ac_rtld_binary rtld_binary;
+
+ if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
+ .info = &screen->info,
+ .shader_type = tgsi_processor_to_shader_stage(shader_type),
+ .wave_size = wave_size,
+ .num_parts = 1,
+ .elf_ptrs = &binary->elf_buffer,
+ .elf_sizes = &binary->elf_size}))
+ return;
+
+ const char *disasm;
+ size_t nbytes;
+
+ if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
+ goto out;
+
+ if (nbytes > INT_MAX)
+ goto out;
+
+ if (debug && debug->debug_message) {
+ /* Very long debug messages are cut off, so send the
+ * disassembly one line at a time. This causes more
+ * overhead, but on the plus side it simplifies
+ * parsing of resulting logs.
+ */
+ pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");
+
+ uint64_t line = 0;
+ while (line < nbytes) {
+ int count = nbytes - line;
+ const char *nl = memchr(disasm + line, '\n', nbytes - line);
+ if (nl)
+ count = nl - (disasm + line);
+
+ if (count) {
+ pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
+ }
+
+ line += count + 1;
+ }
+
+ pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
+ }
+
+ if (file) {
+ fprintf(file, "Shader %s disassembly:\n", name);
+ fprintf(file, "%*s", (int)nbytes, disasm);
+ }
out:
- ac_rtld_close(&rtld_binary);
+ ac_rtld_close(&rtld_binary);
}
static void si_calculate_max_simd_waves(struct si_shader *shader)
{
- struct si_screen *sscreen = shader->selector->screen;
- struct ac_shader_config *conf = &shader->config;
- unsigned num_inputs = shader->selector->info.num_inputs;
- unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
- unsigned lds_per_wave = 0;
- unsigned max_simd_waves;
-
- max_simd_waves = sscreen->info.max_wave64_per_simd;
-
- /* Compute LDS usage for PS. */
- switch (shader->selector->type) {
- case PIPE_SHADER_FRAGMENT:
- /* The minimum usage per wave is (num_inputs * 48). The maximum
- * usage is (num_inputs * 48 * 16).
- * We can get anything in between and it varies between waves.
- *
- * The 48 bytes per input for a single primitive is equal to
- * 4 bytes/component * 4 components/input * 3 points.
- *
- * Other stages don't know the size at compile time or don't
- * allocate LDS per wave, but instead they do it per thread group.
- */
- lds_per_wave = conf->lds_size * lds_increment +
- align(num_inputs * 48, lds_increment);
- break;
- case PIPE_SHADER_COMPUTE:
- if (shader->selector) {
- unsigned max_workgroup_size =
- si_get_max_workgroup_size(shader);
- lds_per_wave = (conf->lds_size * lds_increment) /
- DIV_ROUND_UP(max_workgroup_size,
- sscreen->compute_wave_size);
- }
- break;
- default:;
- }
-
- /* Compute the per-SIMD wave counts. */
- if (conf->num_sgprs) {
- max_simd_waves =
- MIN2(max_simd_waves,
- sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
- }
-
- if (conf->num_vgprs) {
- /* Always print wave limits as Wave64, so that we can compare
- * Wave32 and Wave64 with shader-db fairly. */
- unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
- max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
- }
-
- unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
- if (lds_per_wave)
- max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
-
- shader->info.max_simd_waves = max_simd_waves;
+ struct si_screen *sscreen = shader->selector->screen;
+ struct ac_shader_config *conf = &shader->config;
+ unsigned num_inputs = shader->selector->info.num_inputs;
+ unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
+ unsigned lds_per_wave = 0;
+ unsigned max_simd_waves;
+
+ max_simd_waves = sscreen->info.max_wave64_per_simd;
+
+ /* Compute LDS usage for PS. */
+ switch (shader->selector->type) {
+ case PIPE_SHADER_FRAGMENT:
+ /* The minimum usage per wave is (num_inputs * 48). The maximum
+ * usage is (num_inputs * 48 * 16).
+ * We can get anything in between and it varies between waves.
+ *
+ * The 48 bytes per input for a single primitive is equal to
+ * 4 bytes/component * 4 components/input * 3 points.
+ *
+ * Other stages don't know the size at compile time or don't
+ * allocate LDS per wave, but instead they do it per thread group.
+ */
+ lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment);
+ break;
+ case PIPE_SHADER_COMPUTE:
+ if (shader->selector) {
+ unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
+ lds_per_wave = (conf->lds_size * lds_increment) /
+ DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size);
+ }
+ break;
+ default:;
+ }
+
+ /* Compute the per-SIMD wave counts. */
+ if (conf->num_sgprs) {
+ max_simd_waves =
+ MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
+ }
+
+ if (conf->num_vgprs) {
+ /* Always print wave limits as Wave64, so that we can compare
+ * Wave32 and Wave64 with shader-db fairly. */
+ unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
+ max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
+ }
+
+ unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
+ if (lds_per_wave)
+ max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
+
+ shader->info.max_simd_waves = max_simd_waves;
}
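si_calculate_max_simd_waves clamps the theoretical wave count by three independent budgets: physical SGPRs per SIMD, physical VGPRs per SIMD (always counted as Wave64), and LDS per SIMD. A self-contained sketch with hypothetical numbers showing how the MIN2 chain settles on the tightest limit:

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
   /* Hypothetical per-SIMD budgets and shader usage; the real values come
    * from sscreen->info and ac_shader_config. */
   unsigned max_waves = 10; /* info.max_wave64_per_simd */
   unsigned physical_sgprs = 800, num_sgprs = 48;
   unsigned physical_vgprs = 256, num_vgprs = 64;
   unsigned lds_per_simd = 16384, lds_per_wave = 4096;

   max_waves = MIN2(max_waves, physical_sgprs / num_sgprs); /* 800/48 = 16, still 10 */
   max_waves = MIN2(max_waves, physical_vgprs / num_vgprs); /* 256/64 = 4            */
   if (lds_per_wave)
      max_waves = MIN2(max_waves, lds_per_simd / lds_per_wave); /* 16384/4096 = 4    */

   printf("max waves per SIMD: %u\n", max_waves); /* 4: VGPR- and LDS-limited */
   return 0;
}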
-void si_shader_dump_stats_for_shader_db(struct si_screen *screen,
- struct si_shader *shader,
- struct pipe_debug_callback *debug)
+void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
+ struct pipe_debug_callback *debug)
{
- const struct ac_shader_config *conf = &shader->config;
-
- if (screen->options.debug_disassembly)
- si_shader_dump_disassembly(screen, &shader->binary,
- shader->selector->type,
- si_get_shader_wave_size(shader),
- debug, "main", NULL);
-
- pipe_debug_message(debug, SHADER_INFO,
- "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
- "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
- "Spilled VGPRs: %d PrivMem VGPRs: %d",
- conf->num_sgprs, conf->num_vgprs,
- si_get_shader_binary_size(screen, shader),
- conf->lds_size, conf->scratch_bytes_per_wave,
- shader->info.max_simd_waves, conf->spilled_sgprs,
- conf->spilled_vgprs, shader->info.private_mem_vgprs);
+ const struct ac_shader_config *conf = &shader->config;
+
+ if (screen->options.debug_disassembly)
+ si_shader_dump_disassembly(screen, &shader->binary, shader->selector->type,
+ si_get_shader_wave_size(shader), debug, "main", NULL);
+
+ pipe_debug_message(debug, SHADER_INFO,
+ "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
+ "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
+ "Spilled VGPRs: %d PrivMem VGPRs: %d",
+ conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
+ conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
+ conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs);
}
-static void si_shader_dump_stats(struct si_screen *sscreen,
- struct si_shader *shader,
- FILE *file,
- bool check_debug_option)
+static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
+ bool check_debug_option)
{
- const struct ac_shader_config *conf = &shader->config;
-
- if (!check_debug_option ||
- si_can_dump_shader(sscreen, shader->selector->type)) {
- if (shader->selector->type == PIPE_SHADER_FRAGMENT) {
- fprintf(file, "*** SHADER CONFIG ***\n"
- "SPI_PS_INPUT_ADDR = 0x%04x\n"
- "SPI_PS_INPUT_ENA = 0x%04x\n",
- conf->spi_ps_input_addr, conf->spi_ps_input_ena);
- }
-
- fprintf(file, "*** SHADER STATS ***\n"
- "SGPRS: %d\n"
- "VGPRS: %d\n"
- "Spilled SGPRs: %d\n"
- "Spilled VGPRs: %d\n"
- "Private memory VGPRs: %d\n"
- "Code Size: %d bytes\n"
- "LDS: %d blocks\n"
- "Scratch: %d bytes per wave\n"
- "Max Waves: %d\n"
- "********************\n\n\n",
- conf->num_sgprs, conf->num_vgprs,
- conf->spilled_sgprs, conf->spilled_vgprs,
- shader->info.private_mem_vgprs,
- si_get_shader_binary_size(sscreen, shader),
- conf->lds_size, conf->scratch_bytes_per_wave,
- shader->info.max_simd_waves);
- }
+ const struct ac_shader_config *conf = &shader->config;
+
+ if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->type)) {
+ if (shader->selector->type == PIPE_SHADER_FRAGMENT) {
+ fprintf(file,
+ "*** SHADER CONFIG ***\n"
+ "SPI_PS_INPUT_ADDR = 0x%04x\n"
+ "SPI_PS_INPUT_ENA = 0x%04x\n",
+ conf->spi_ps_input_addr, conf->spi_ps_input_ena);
+ }
+
+ fprintf(file,
+ "*** SHADER STATS ***\n"
+ "SGPRS: %d\n"
+ "VGPRS: %d\n"
+ "Spilled SGPRs: %d\n"
+ "Spilled VGPRs: %d\n"
+ "Private memory VGPRs: %d\n"
+ "Code Size: %d bytes\n"
+ "LDS: %d blocks\n"
+ "Scratch: %d bytes per wave\n"
+ "Max Waves: %d\n"
+ "********************\n\n\n",
+ conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
+ shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
+ conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
+ }
}
const char *si_get_shader_name(const struct si_shader *shader)
{
- switch (shader->selector->type) {
- case PIPE_SHADER_VERTEX:
- if (shader->key.as_es)
- return "Vertex Shader as ES";
- else if (shader->key.as_ls)
- return "Vertex Shader as LS";
- else if (shader->key.opt.vs_as_prim_discard_cs)
- return "Vertex Shader as Primitive Discard CS";
- else if (shader->key.as_ngg)
- return "Vertex Shader as ESGS";
- else
- return "Vertex Shader as VS";
- case PIPE_SHADER_TESS_CTRL:
- return "Tessellation Control Shader";
- case PIPE_SHADER_TESS_EVAL:
- if (shader->key.as_es)
- return "Tessellation Evaluation Shader as ES";
- else if (shader->key.as_ngg)
- return "Tessellation Evaluation Shader as ESGS";
- else
- return "Tessellation Evaluation Shader as VS";
- case PIPE_SHADER_GEOMETRY:
- if (shader->is_gs_copy_shader)
- return "GS Copy Shader as VS";
- else
- return "Geometry Shader";
- case PIPE_SHADER_FRAGMENT:
- return "Pixel Shader";
- case PIPE_SHADER_COMPUTE:
- return "Compute Shader";
- default:
- return "Unknown Shader";
- }
+ switch (shader->selector->type) {
+ case PIPE_SHADER_VERTEX:
+ if (shader->key.as_es)
+ return "Vertex Shader as ES";
+ else if (shader->key.as_ls)
+ return "Vertex Shader as LS";
+ else if (shader->key.opt.vs_as_prim_discard_cs)
+ return "Vertex Shader as Primitive Discard CS";
+ else if (shader->key.as_ngg)
+ return "Vertex Shader as ESGS";
+ else
+ return "Vertex Shader as VS";
+ case PIPE_SHADER_TESS_CTRL:
+ return "Tessellation Control Shader";
+ case PIPE_SHADER_TESS_EVAL:
+ if (shader->key.as_es)
+ return "Tessellation Evaluation Shader as ES";
+ else if (shader->key.as_ngg)
+ return "Tessellation Evaluation Shader as ESGS";
+ else
+ return "Tessellation Evaluation Shader as VS";
+ case PIPE_SHADER_GEOMETRY:
+ if (shader->is_gs_copy_shader)
+ return "GS Copy Shader as VS";
+ else
+ return "Geometry Shader";
+ case PIPE_SHADER_FRAGMENT:
+ return "Pixel Shader";
+ case PIPE_SHADER_COMPUTE:
+ return "Compute Shader";
+ default:
+ return "Unknown Shader";
+ }
}
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
- struct pipe_debug_callback *debug,
- FILE *file, bool check_debug_option)
+ struct pipe_debug_callback *debug, FILE *file, bool check_debug_option)
{
- enum pipe_shader_type shader_type = shader->selector->type;
-
- if (!check_debug_option ||
- si_can_dump_shader(sscreen, shader_type))
- si_dump_shader_key(shader, file);
-
- if (!check_debug_option && shader->binary.llvm_ir_string) {
- if (shader->previous_stage &&
- shader->previous_stage->binary.llvm_ir_string) {
- fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
- si_get_shader_name(shader));
- fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
- }
-
- fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
- si_get_shader_name(shader));
- fprintf(file, "%s\n", shader->binary.llvm_ir_string);
- }
-
- if (!check_debug_option ||
- (si_can_dump_shader(sscreen, shader_type) &&
- !(sscreen->debug_flags & DBG(NO_ASM)))) {
- unsigned wave_size = si_get_shader_wave_size(shader);
-
- fprintf(file, "\n%s:\n", si_get_shader_name(shader));
-
- if (shader->prolog)
- si_shader_dump_disassembly(sscreen, &shader->prolog->binary,
- shader_type, wave_size, debug, "prolog", file);
- if (shader->previous_stage)
- si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary,
- shader_type, wave_size, debug, "previous stage", file);
- if (shader->prolog2)
- si_shader_dump_disassembly(sscreen, &shader->prolog2->binary,
- shader_type, wave_size, debug, "prolog2", file);
-
- si_shader_dump_disassembly(sscreen, &shader->binary, shader_type,
- wave_size, debug, "main", file);
-
- if (shader->epilog)
- si_shader_dump_disassembly(sscreen, &shader->epilog->binary,
- shader_type, wave_size, debug, "epilog", file);
- fprintf(file, "\n");
- }
-
- si_shader_dump_stats(sscreen, shader, file, check_debug_option);
+ enum pipe_shader_type shader_type = shader->selector->type;
+
+ if (!check_debug_option || si_can_dump_shader(sscreen, shader_type))
+ si_dump_shader_key(shader, file);
+
+ if (!check_debug_option && shader->binary.llvm_ir_string) {
+ if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
+ fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
+ fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
+ }
+
+ fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
+ fprintf(file, "%s\n", shader->binary.llvm_ir_string);
+ }
+
+ if (!check_debug_option ||
+ (si_can_dump_shader(sscreen, shader_type) && !(sscreen->debug_flags & DBG(NO_ASM)))) {
+ unsigned wave_size = si_get_shader_wave_size(shader);
+
+ fprintf(file, "\n%s:\n", si_get_shader_name(shader));
+
+ if (shader->prolog)
+ si_shader_dump_disassembly(sscreen, &shader->prolog->binary, shader_type, wave_size, debug,
+ "prolog", file);
+ if (shader->previous_stage)
+ si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, shader_type,
+ wave_size, debug, "previous stage", file);
+ if (shader->prolog2)
+ si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, shader_type, wave_size,
+ debug, "prolog2", file);
+
+ si_shader_dump_disassembly(sscreen, &shader->binary, shader_type, wave_size, debug, "main",
+ file);
+
+ if (shader->epilog)
+ si_shader_dump_disassembly(sscreen, &shader->epilog->binary, shader_type, wave_size, debug,
+ "epilog", file);
+ fprintf(file, "\n");
+ }
+
+ si_shader_dump_stats(sscreen, shader, file, check_debug_option);
}
static void si_dump_shader_key_vs(const struct si_shader_key *key,
- const struct si_vs_prolog_bits *prolog,
- const char *prefix, FILE *f)
+ const struct si_vs_prolog_bits *prolog, const char *prefix,
+ FILE *f)
{
- fprintf(f, " %s.instance_divisor_is_one = %u\n",
- prefix, prolog->instance_divisor_is_one);
- fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
- prefix, prolog->instance_divisor_is_fetched);
- fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n",
- prefix, prolog->unpack_instance_id_from_vertex_id);
- fprintf(f, " %s.ls_vgpr_fix = %u\n",
- prefix, prolog->ls_vgpr_fix);
-
- fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
- fprintf(f, " mono.vs.fix_fetch = {");
- for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
- union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
- if (i)
- fprintf(f, ", ");
- if (!fix.bits)
- fprintf(f, "0");
- else
- fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size,
- fix.u.num_channels_m1, fix.u.format);
- }
- fprintf(f, "}\n");
+ fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one);
+ fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix,
+ prolog->instance_divisor_is_fetched);
+ fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", prefix,
+ prolog->unpack_instance_id_from_vertex_id);
+ fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix);
+
+ fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
+ fprintf(f, " mono.vs.fix_fetch = {");
+ for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
+ union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
+ if (i)
+ fprintf(f, ", ");
+ if (!fix.bits)
+ fprintf(f, "0");
+ else
+ fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
+ fix.u.format);
+ }
+ fprintf(f, "}\n");
}
static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
- const struct si_shader_key *key = &shader->key;
- enum pipe_shader_type shader_type = shader->selector->type;
-
- fprintf(f, "SHADER KEY\n");
-
- switch (shader_type) {
- case PIPE_SHADER_VERTEX:
- si_dump_shader_key_vs(key, &key->part.vs.prolog,
- "part.vs.prolog", f);
- fprintf(f, " as_es = %u\n", key->as_es);
- fprintf(f, " as_ls = %u\n", key->as_ls);
- fprintf(f, " as_ngg = %u\n", key->as_ngg);
- fprintf(f, " mono.u.vs_export_prim_id = %u\n",
- key->mono.u.vs_export_prim_id);
- fprintf(f, " opt.vs_as_prim_discard_cs = %u\n",
- key->opt.vs_as_prim_discard_cs);
- fprintf(f, " opt.cs_prim_type = %s\n",
- tgsi_primitive_names[key->opt.cs_prim_type]);
- fprintf(f, " opt.cs_indexed = %u\n",
- key->opt.cs_indexed);
- fprintf(f, " opt.cs_instancing = %u\n",
- key->opt.cs_instancing);
- fprintf(f, " opt.cs_primitive_restart = %u\n",
- key->opt.cs_primitive_restart);
- fprintf(f, " opt.cs_provoking_vertex_first = %u\n",
- key->opt.cs_provoking_vertex_first);
- fprintf(f, " opt.cs_need_correct_orientation = %u\n",
- key->opt.cs_need_correct_orientation);
- fprintf(f, " opt.cs_cull_front = %u\n",
- key->opt.cs_cull_front);
- fprintf(f, " opt.cs_cull_back = %u\n",
- key->opt.cs_cull_back);
- fprintf(f, " opt.cs_cull_z = %u\n",
- key->opt.cs_cull_z);
- fprintf(f, " opt.cs_halfz_clip_space = %u\n",
- key->opt.cs_halfz_clip_space);
- break;
-
- case PIPE_SHADER_TESS_CTRL:
- if (shader->selector->screen->info.chip_class >= GFX9) {
- si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
- "part.tcs.ls_prolog", f);
- }
- fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
- fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
- break;
-
- case PIPE_SHADER_TESS_EVAL:
- fprintf(f, " as_es = %u\n", key->as_es);
- fprintf(f, " as_ngg = %u\n", key->as_ngg);
- fprintf(f, " mono.u.vs_export_prim_id = %u\n",
- key->mono.u.vs_export_prim_id);
- break;
-
- case PIPE_SHADER_GEOMETRY:
- if (shader->is_gs_copy_shader)
- break;
-
- if (shader->selector->screen->info.chip_class >= GFX9 &&
- key->part.gs.es->type == PIPE_SHADER_VERTEX) {
- si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
- "part.gs.vs_prolog", f);
- }
- fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
- fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
- fprintf(f, " as_ngg = %u\n", key->as_ngg);
- break;
-
- case PIPE_SHADER_COMPUTE:
- break;
-
- case PIPE_SHADER_FRAGMENT:
- fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
- fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
- fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
- fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
- fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
- fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
- fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
- fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
- fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
- fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", key->part.ps.prolog.samplemask_log_ps_iter);
- fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
- fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
- fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
- fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
- fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
- fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
- fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
- fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
- fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", key->mono.u.ps.interpolate_at_sample_force_center);
- fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
- fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
- fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
- break;
-
- default:
- assert(0);
- }
-
- if ((shader_type == PIPE_SHADER_GEOMETRY ||
- shader_type == PIPE_SHADER_TESS_EVAL ||
- shader_type == PIPE_SHADER_VERTEX) &&
- !key->as_es && !key->as_ls) {
- fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
- fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
- if (shader_type != PIPE_SHADER_GEOMETRY)
- fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
- }
+ const struct si_shader_key *key = &shader->key;
+ enum pipe_shader_type shader_type = shader->selector->type;
+
+ fprintf(f, "SHADER KEY\n");
+
+ switch (shader_type) {
+ case PIPE_SHADER_VERTEX:
+ si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f);
+ fprintf(f, " as_es = %u\n", key->as_es);
+ fprintf(f, " as_ls = %u\n", key->as_ls);
+ fprintf(f, " as_ngg = %u\n", key->as_ngg);
+ fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
+ fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs);
+ fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]);
+ fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed);
+ fprintf(f, " opt.cs_instancing = %u\n", key->opt.cs_instancing);
+ fprintf(f, " opt.cs_primitive_restart = %u\n", key->opt.cs_primitive_restart);
+ fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first);
+ fprintf(f, " opt.cs_need_correct_orientation = %u\n", key->opt.cs_need_correct_orientation);
+ fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front);
+ fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back);
+ fprintf(f, " opt.cs_cull_z = %u\n", key->opt.cs_cull_z);
+ fprintf(f, " opt.cs_halfz_clip_space = %u\n", key->opt.cs_halfz_clip_space);
+ break;
+
+ case PIPE_SHADER_TESS_CTRL:
+ if (shader->selector->screen->info.chip_class >= GFX9) {
+ si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, "part.tcs.ls_prolog", f);
+ }
+ fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
+ fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n",
+ key->mono.u.ff_tcs_inputs_to_copy);
+ break;
+
+ case PIPE_SHADER_TESS_EVAL:
+ fprintf(f, " as_es = %u\n", key->as_es);
+ fprintf(f, " as_ngg = %u\n", key->as_ngg);
+ fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
+ break;
+
+ case PIPE_SHADER_GEOMETRY:
+ if (shader->is_gs_copy_shader)
+ break;
+
+ if (shader->selector->screen->info.chip_class >= GFX9 &&
+ key->part.gs.es->type == PIPE_SHADER_VERTEX) {
+ si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f);
+ }
+ fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n",
+ key->part.gs.prolog.tri_strip_adj_fix);
+ fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
+ fprintf(f, " as_ngg = %u\n", key->as_ngg);
+ break;
+
+ case PIPE_SHADER_COMPUTE:
+ break;
+
+ case PIPE_SHADER_FRAGMENT:
+ fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
+ fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
+ fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
+ fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n",
+ key->part.ps.prolog.force_persp_sample_interp);
+ fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n",
+ key->part.ps.prolog.force_linear_sample_interp);
+ fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n",
+ key->part.ps.prolog.force_persp_center_interp);
+ fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n",
+ key->part.ps.prolog.force_linear_center_interp);
+ fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n",
+ key->part.ps.prolog.bc_optimize_for_persp);
+ fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n",
+ key->part.ps.prolog.bc_optimize_for_linear);
+ fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n",
+ key->part.ps.prolog.samplemask_log_ps_iter);
+ fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n",
+ key->part.ps.epilog.spi_shader_col_format);
+ fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
+ fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
+ fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
+ fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
+ fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
+ fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n",
+ key->part.ps.epilog.poly_line_smoothing);
+ fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
+ fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n",
+ key->mono.u.ps.interpolate_at_sample_force_center);
+ fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
+ fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
+ fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
+ break;
+
+ default:
+ assert(0);
+ }
+
+ if ((shader_type == PIPE_SHADER_GEOMETRY || shader_type == PIPE_SHADER_TESS_EVAL ||
+ shader_type == PIPE_SHADER_VERTEX) &&
+ !key->as_es && !key->as_ls) {
+ fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs);
+ fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
+ if (shader_type != PIPE_SHADER_GEOMETRY)
+ fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
+ }
}
static void si_optimize_vs_outputs(struct si_shader_context *ctx)
{
- struct si_shader *shader = ctx->shader;
- struct si_shader_info *info = &shader->selector->info;
-
- if ((ctx->type != PIPE_SHADER_VERTEX &&
- ctx->type != PIPE_SHADER_TESS_EVAL) ||
- shader->key.as_ls ||
- shader->key.as_es)
- return;
-
- ac_optimize_vs_outputs(&ctx->ac,
- ctx->main_fn,
- shader->info.vs_output_param_offset,
- info->num_outputs,
- &shader->info.nr_param_exports);
+ struct si_shader *shader = ctx->shader;
+ struct si_shader_info *info = &shader->selector->info;
+
+ if ((ctx->type != PIPE_SHADER_VERTEX && ctx->type != PIPE_SHADER_TESS_EVAL) ||
+ shader->key.as_ls || shader->key.as_es)
+ return;
+
+ ac_optimize_vs_outputs(&ctx->ac, ctx->main_fn, shader->info.vs_output_param_offset,
+ info->num_outputs, &shader->info.nr_param_exports);
}
static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
- const struct si_vs_prolog_bits *prolog_key,
- const struct si_shader_key *key,
- bool ngg_cull_shader)
+ const struct si_vs_prolog_bits *prolog_key,
+ const struct si_shader_key *key, bool ngg_cull_shader)
{
- /* VGPR initialization fixup for Vega10 and Raven is always done in the
- * VS prolog. */
- return sel->vs_needs_prolog ||
- prolog_key->ls_vgpr_fix ||
- prolog_key->unpack_instance_id_from_vertex_id ||
- (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
+ /* VGPR initialization fixup for Vega10 and Raven is always done in the
+ * VS prolog. */
+ return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix ||
+ prolog_key->unpack_instance_id_from_vertex_id ||
+ (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
}
-static bool si_build_main_function(struct si_shader_context *ctx,
- struct si_shader *shader,
- struct nir_shader *nir, bool free_nir,
- bool ngg_cull_shader)
+static bool si_build_main_function(struct si_shader_context *ctx, struct si_shader *shader,
+ struct nir_shader *nir, bool free_nir, bool ngg_cull_shader)
{
- struct si_shader_selector *sel = shader->selector;
- const struct si_shader_info *info = &sel->info;
-
- ctx->shader = shader;
- ctx->type = sel->type;
-
- ctx->num_const_buffers = util_last_bit(info->const_buffers_declared);
- ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared);
-
- ctx->num_samplers = util_last_bit(info->samplers_declared);
- ctx->num_images = util_last_bit(info->images_declared);
-
- si_llvm_init_resource_callbacks(ctx);
-
- switch (ctx->type) {
- case PIPE_SHADER_VERTEX:
- si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
- break;
- case PIPE_SHADER_TESS_CTRL:
- si_llvm_init_tcs_callbacks(ctx);
- break;
- case PIPE_SHADER_TESS_EVAL:
- si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
- break;
- case PIPE_SHADER_GEOMETRY:
- si_llvm_init_gs_callbacks(ctx);
- break;
- case PIPE_SHADER_FRAGMENT:
- si_llvm_init_ps_callbacks(ctx);
- break;
- case PIPE_SHADER_COMPUTE:
- ctx->abi.load_local_group_size = si_llvm_get_block_size;
- break;
- default:
- assert(!"Unsupported shader type");
- return false;
- }
-
- si_create_function(ctx, ngg_cull_shader);
-
- if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)
- si_preload_esgs_ring(ctx);
-
- if (ctx->type == PIPE_SHADER_GEOMETRY)
- si_preload_gs_rings(ctx);
- else if (ctx->type == PIPE_SHADER_TESS_EVAL)
- si_llvm_preload_tes_rings(ctx);
-
- if (ctx->type == PIPE_SHADER_TESS_CTRL &&
- sel->info.tessfactors_are_def_in_all_invocs) {
- for (unsigned i = 0; i < 6; i++) {
- ctx->invoc0_tess_factors[i] =
- ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
- }
- }
-
- if (ctx->type == PIPE_SHADER_GEOMETRY) {
- for (unsigned i = 0; i < 4; i++) {
- ctx->gs_next_vertex[i] =
- ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
- }
- if (shader->key.as_ngg) {
- for (unsigned i = 0; i < 4; ++i) {
- ctx->gs_curprim_verts[i] =
- ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
- ctx->gs_generated_prims[i] =
- ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
- }
-
- unsigned scratch_size = 8;
- if (sel->so.num_outputs)
- scratch_size = 44;
-
- assert(!ctx->gs_ngg_scratch);
- LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, scratch_size);
- ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
- ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
- LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
- LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
-
- ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module,
- LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
- LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
- LLVMSetAlignment(ctx->gs_ngg_emit, 4);
- }
- }
-
- if (ctx->type != PIPE_SHADER_GEOMETRY &&
- (shader->key.as_ngg && !shader->key.as_es)) {
- /* Unconditionally declare scratch space base for streamout and
- * vertex compaction. Whether space is actually allocated is
- * determined during linking / PM4 creation.
- *
- * Add an extra dword per vertex to ensure an odd stride, which
- * avoids bank conflicts for SoA accesses.
- */
- if (!gfx10_is_ngg_passthrough(shader))
- si_llvm_declare_esgs_ring(ctx);
-
- /* This is really only needed when streamout and / or vertex
- * compaction is enabled.
- */
- if (!ctx->gs_ngg_scratch &&
- (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
- LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, 8);
- ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
- asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
- LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
- LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
- }
- }
-
- /* For GFX9 merged shaders:
- * - Set EXEC for the first shader. If the prolog is present, set
- * EXEC there instead.
- * - Add a barrier before the second shader.
- * - In the second shader, reset EXEC to ~0 and wrap the main part in
- * an if-statement. This is required for correctness in geometry
- * shaders, to ensure that empty GS waves do not send GS_EMIT and
- * GS_CUT messages.
- *
- * For monolithic merged shaders, the first shader is wrapped in an
- * if-block together with its prolog in si_build_wrapper_function.
- *
- * NGG vertex and tess eval shaders running as the last
- * vertex/geometry stage handle execution explicitly using
- * if-statements.
- */
- if (ctx->screen->info.chip_class >= GFX9) {
- if (!shader->is_monolithic &&
- (shader->key.as_es || shader->key.as_ls) &&
- (ctx->type == PIPE_SHADER_TESS_EVAL ||
- (ctx->type == PIPE_SHADER_VERTEX &&
- !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
- &shader->key, ngg_cull_shader)))) {
- si_init_exec_from_input(ctx,
- ctx->merged_wave_info, 0);
- } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
- ctx->type == PIPE_SHADER_GEOMETRY ||
- (shader->key.as_ngg && !shader->key.as_es)) {
- LLVMValueRef thread_enabled;
- bool nested_barrier;
-
- if (!shader->is_monolithic ||
- (ctx->type == PIPE_SHADER_TESS_EVAL &&
- shader->key.as_ngg && !shader->key.as_es &&
- !shader->key.opt.ngg_culling))
- ac_init_exec_full_mask(&ctx->ac);
-
- if ((ctx->type == PIPE_SHADER_VERTEX ||
- ctx->type == PIPE_SHADER_TESS_EVAL) &&
- shader->key.as_ngg && !shader->key.as_es &&
- !shader->key.opt.ngg_culling) {
- gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
-
- /* Build the primitive export at the beginning
- * of the shader if possible.
- */
- if (gfx10_ngg_export_prim_early(shader))
- gfx10_ngg_build_export_prim(ctx, NULL, NULL);
- }
-
- if (ctx->type == PIPE_SHADER_TESS_CTRL ||
- ctx->type == PIPE_SHADER_GEOMETRY) {
- if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
- gfx10_ngg_gs_emit_prologue(ctx);
- nested_barrier = false;
- } else {
- nested_barrier = true;
- }
-
- thread_enabled = si_is_gs_thread(ctx);
- } else {
- thread_enabled = si_is_es_thread(ctx);
- nested_barrier = false;
- }
-
- ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
- ctx->merged_wrap_if_label = 11500;
- ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
-
- if (nested_barrier) {
- /* Execute a barrier before the second shader in
- * a merged shader.
- *
- * Execute the barrier inside the conditional block,
- * so that empty waves can jump directly to s_endpgm,
- * which will also signal the barrier.
- *
- * This is possible in gfx9, because an empty wave
- * for the second shader does not participate in
- * the epilogue. With NGG, empty waves may still
- * be required to export data (e.g. GS output vertices),
- * so we cannot let them exit early.
- *
- * If the shader is TCS and the TCS epilog is present
- * and contains a barrier, it will wait there and then
- * reach s_endpgm.
- */
- si_llvm_emit_barrier(ctx);
- }
- }
- }
-
- bool success = si_nir_build_llvm(ctx, nir);
- if (free_nir)
- ralloc_free(nir);
- if (!success) {
- fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
- return false;
- }
-
- si_llvm_build_ret(ctx, ctx->return_value);
- return true;
+ struct si_shader_selector *sel = shader->selector;
+ const struct si_shader_info *info = &sel->info;
+
+ ctx->shader = shader;
+ ctx->type = sel->type;
+
+ ctx->num_const_buffers = util_last_bit(info->const_buffers_declared);
+ ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared);
+
+ ctx->num_samplers = util_last_bit(info->samplers_declared);
+ ctx->num_images = util_last_bit(info->images_declared);
+
+ si_llvm_init_resource_callbacks(ctx);
+
+ switch (ctx->type) {
+ case PIPE_SHADER_VERTEX:
+ si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ si_llvm_init_tcs_callbacks(ctx);
+ break;
+ case PIPE_SHADER_TESS_EVAL:
+ si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ si_llvm_init_gs_callbacks(ctx);
+ break;
+ case PIPE_SHADER_FRAGMENT:
+ si_llvm_init_ps_callbacks(ctx);
+ break;
+ case PIPE_SHADER_COMPUTE:
+ ctx->abi.load_local_group_size = si_llvm_get_block_size;
+ break;
+ default:
+ assert(!"Unsupported shader type");
+ return false;
+ }
+
+ si_create_function(ctx, ngg_cull_shader);
+
+ if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)
+ si_preload_esgs_ring(ctx);
+
+ if (ctx->type == PIPE_SHADER_GEOMETRY)
+ si_preload_gs_rings(ctx);
+ else if (ctx->type == PIPE_SHADER_TESS_EVAL)
+ si_llvm_preload_tes_rings(ctx);
+
+ if (ctx->type == PIPE_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) {
+ for (unsigned i = 0; i < 6; i++) {
+ ctx->invoc0_tess_factors[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
+ }
+ }
+
+ if (ctx->type == PIPE_SHADER_GEOMETRY) {
+ for (unsigned i = 0; i < 4; i++) {
+ ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
+ }
+ if (shader->key.as_ngg) {
+ for (unsigned i = 0; i < 4; ++i) {
+ ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
+ ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
+ }
+
+ unsigned scratch_size = 8;
+ if (sel->so.num_outputs)
+ scratch_size = 44;
+
+ assert(!ctx->gs_ngg_scratch);
+ LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, scratch_size);
+ ctx->gs_ngg_scratch =
+ LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
+ LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
+ LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
+
+ ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(
+ ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
+ LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
+ LLVMSetAlignment(ctx->gs_ngg_emit, 4);
+ }
+ }
+
+ if (ctx->type != PIPE_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) {
+ /* Unconditionally declare scratch space base for streamout and
+ * vertex compaction. Whether space is actually allocated is
+ * determined during linking / PM4 creation.
+ *
+ * Add an extra dword per vertex to ensure an odd stride, which
+ * avoids bank conflicts for SoA accesses.
+ */
+ if (!gfx10_is_ngg_passthrough(shader))
+ si_llvm_declare_esgs_ring(ctx);
+
+ /* This is really only needed when streamout and / or vertex
+ * compaction is enabled.
+ */
+ if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
+ LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, 8);
+ ctx->gs_ngg_scratch =
+ LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
+ LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
+ LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
+ }
+ }
+
+ /* For GFX9 merged shaders:
+ * - Set EXEC for the first shader. If the prolog is present, set
+ * EXEC there instead.
+ * - Add a barrier before the second shader.
+ * - In the second shader, reset EXEC to ~0 and wrap the main part in
+ * an if-statement. This is required for correctness in geometry
+ * shaders, to ensure that empty GS waves do not send GS_EMIT and
+ * GS_CUT messages.
+ *
+ * For monolithic merged shaders, the first shader is wrapped in an
+ * if-block together with its prolog in si_build_wrapper_function.
+ *
+ * NGG vertex and tess eval shaders running as the last
+ * vertex/geometry stage handle execution explicitly using
+ * if-statements.
+ */
+ if (ctx->screen->info.chip_class >= GFX9) {
+ if (!shader->is_monolithic && (shader->key.as_es || shader->key.as_ls) &&
+ (ctx->type == PIPE_SHADER_TESS_EVAL ||
+ (ctx->type == PIPE_SHADER_VERTEX &&
+ !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) {
+ si_init_exec_from_input(ctx, ctx->merged_wave_info, 0);
+ } else if (ctx->type == PIPE_SHADER_TESS_CTRL || ctx->type == PIPE_SHADER_GEOMETRY ||
+ (shader->key.as_ngg && !shader->key.as_es)) {
+ LLVMValueRef thread_enabled;
+ bool nested_barrier;
+
+ if (!shader->is_monolithic || (ctx->type == PIPE_SHADER_TESS_EVAL && shader->key.as_ngg &&
+ !shader->key.as_es && !shader->key.opt.ngg_culling))
+ ac_init_exec_full_mask(&ctx->ac);
+
+ if ((ctx->type == PIPE_SHADER_VERTEX || ctx->type == PIPE_SHADER_TESS_EVAL) &&
+ shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
+ gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
+
+ /* Build the primitive export at the beginning
+ * of the shader if possible.
+ */
+ if (gfx10_ngg_export_prim_early(shader))
+ gfx10_ngg_build_export_prim(ctx, NULL, NULL);
+ }
+
+ if (ctx->type == PIPE_SHADER_TESS_CTRL || ctx->type == PIPE_SHADER_GEOMETRY) {
+ if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
+ gfx10_ngg_gs_emit_prologue(ctx);
+ nested_barrier = false;
+ } else {
+ nested_barrier = true;
+ }
+
+ thread_enabled = si_is_gs_thread(ctx);
+ } else {
+ thread_enabled = si_is_es_thread(ctx);
+ nested_barrier = false;
+ }
+
+ ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
+ ctx->merged_wrap_if_label = 11500;
+ ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
+
+ if (nested_barrier) {
+ /* Execute a barrier before the second shader in
+ * a merged shader.
+ *
+ * Execute the barrier inside the conditional block,
+ * so that empty waves can jump directly to s_endpgm,
+ * which will also signal the barrier.
+ *
+ * This is possible in gfx9, because an empty wave
+ * for the second shader does not participate in
+ * the epilogue. With NGG, empty waves may still
+ * be required to export data (e.g. GS output vertices),
+ * so we cannot let them exit early.
+ *
+ * If the shader is TCS and the TCS epilog is present
+ * and contains a barrier, it will wait there and then
+ * reach s_endpgm.
+ */
+ si_llvm_emit_barrier(ctx);
+ }
+ }
+ }
+
+ bool success = si_nir_build_llvm(ctx, nir);
+ if (free_nir)
+ ralloc_free(nir);
+ if (!success) {
+ fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
+ return false;
+ }
+
+ si_llvm_build_ret(ctx, ctx->return_value);
+ return true;
}
/**
* \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
* \param key Output shader part key.
*/
-static void si_get_vs_prolog_key(const struct si_shader_info *info,
- unsigned num_input_sgprs,
- bool ngg_cull_shader,
- const struct si_vs_prolog_bits *prolog_key,
- struct si_shader *shader_out,
- union si_shader_part_key *key)
+static void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
+ bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
+ struct si_shader *shader_out, union si_shader_part_key *key)
{
- memset(key, 0, sizeof(*key));
- key->vs_prolog.states = *prolog_key;
- key->vs_prolog.num_input_sgprs = num_input_sgprs;
- key->vs_prolog.num_inputs = info->num_inputs;
- key->vs_prolog.as_ls = shader_out->key.as_ls;
- key->vs_prolog.as_es = shader_out->key.as_es;
- key->vs_prolog.as_ngg = shader_out->key.as_ngg;
- key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs;
-
- if (ngg_cull_shader) {
- key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling &
- SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
- key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling &
- SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
- } else {
- key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
- }
-
- if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
- key->vs_prolog.as_ls = 1;
- key->vs_prolog.num_merged_next_stage_vgprs = 2;
- } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
- key->vs_prolog.as_es = 1;
- key->vs_prolog.num_merged_next_stage_vgprs = 5;
- } else if (shader_out->key.as_ngg) {
- key->vs_prolog.num_merged_next_stage_vgprs = 5;
- }
-
- /* Only one of these combinations can be set. as_ngg can be set with as_es. */
- assert(key->vs_prolog.as_ls +
- key->vs_prolog.as_ngg +
- (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) +
- key->vs_prolog.as_prim_discard_cs <= 1);
-
- /* Enable loading the InstanceID VGPR. */
- uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
-
- if ((key->vs_prolog.states.instance_divisor_is_one |
- key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
- shader_out->info.uses_instanceid = true;
+ memset(key, 0, sizeof(*key));
+ key->vs_prolog.states = *prolog_key;
+ key->vs_prolog.num_input_sgprs = num_input_sgprs;
+ key->vs_prolog.num_inputs = info->num_inputs;
+ key->vs_prolog.as_ls = shader_out->key.as_ls;
+ key->vs_prolog.as_es = shader_out->key.as_es;
+ key->vs_prolog.as_ngg = shader_out->key.as_ngg;
+ key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs;
+
+ if (ngg_cull_shader) {
+ key->vs_prolog.gs_fast_launch_tri_list =
+ !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
+ key->vs_prolog.gs_fast_launch_tri_strip =
+ !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
+ } else {
+ key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
+ }
+
+ if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
+ key->vs_prolog.as_ls = 1;
+ key->vs_prolog.num_merged_next_stage_vgprs = 2;
+ } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
+ key->vs_prolog.as_es = 1;
+ key->vs_prolog.num_merged_next_stage_vgprs = 5;
+ } else if (shader_out->key.as_ngg) {
+ key->vs_prolog.num_merged_next_stage_vgprs = 5;
+ }
+
+ /* Only one of these combinations can be set. as_ngg can be set with as_es. */
+ assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
+ (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <=
+ 1);
+
+ /* Enable loading the InstanceID VGPR. */
+ uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
+
+ if ((key->vs_prolog.states.instance_divisor_is_one |
+ key->vs_prolog.states.instance_divisor_is_fetched) &
+ input_mask)
+ shader_out->info.uses_instanceid = true;
}
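The last check in si_get_vs_prolog_key only sets uses_instanceid when an attribute that is actually declared has an instance divisor, by ANDing the divisor bitmasks with a mask of the shader's inputs. A small sketch with hypothetical masks (low_bits stands in for u_bit_consecutive):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same idea as u_bit_consecutive(0, n): a mask with the n lowest bits set. */
static uint16_t low_bits(unsigned n)
{
   return n >= 16 ? 0xffff : (uint16_t)((1u << n) - 1);
}

int main(void)
{
   /* Hypothetical shader: 4 vertex inputs, attribute 2 has an instance
    * divisor of one, nothing fetched from a constant buffer. */
   unsigned num_inputs = 4;
   uint16_t divisor_is_one = 1u << 2;
   uint16_t divisor_is_fetched = 0;

   uint16_t input_mask = low_bits(num_inputs); /* 0x000f */
   bool uses_instanceid = (divisor_is_one | divisor_is_fetched) & input_mask;

   printf("uses_instanceid = %d\n", uses_instanceid); /* 1 */
   return 0;
}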
static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
- struct si_shader_selector *sel)
+ struct si_shader_selector *sel)
{
- if (!compiler->low_opt_passes)
- return false;
+ if (!compiler->low_opt_passes)
+ return false;
- /* Assume a slow CPU. */
- assert(!sel->screen->info.has_dedicated_vram &&
- sel->screen->info.chip_class <= GFX8);
+ /* Assume a slow CPU. */
+ assert(!sel->screen->info.has_dedicated_vram && sel->screen->info.chip_class <= GFX8);
- /* For a crazy dEQP test containing 2597 memory opcodes, mostly
- * buffer stores. */
- return sel->type == PIPE_SHADER_COMPUTE &&
- sel->info.num_memory_instructions > 1000;
+ /* For a crazy dEQP test containing 2597 memory opcodes, mostly
+ * buffer stores. */
+ return sel->type == PIPE_SHADER_COMPUTE && sel->info.num_memory_instructions > 1000;
}
-static struct nir_shader *get_nir_shader(struct si_shader_selector *sel,
- bool *free_nir)
+static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, bool *free_nir)
{
- *free_nir = false;
-
- if (sel->nir) {
- return sel->nir;
- } else if (sel->nir_binary) {
- struct pipe_screen *screen = &sel->screen->b;
- const void *options =
- screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
- sel->type);
-
- struct blob_reader blob_reader;
- blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
- *free_nir = true;
- return nir_deserialize(NULL, options, &blob_reader);
- }
- return NULL;
+ *free_nir = false;
+
+ if (sel->nir) {
+ return sel->nir;
+ } else if (sel->nir_binary) {
+ struct pipe_screen *screen = &sel->screen->b;
+ const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, sel->type);
+
+ struct blob_reader blob_reader;
+ blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
+ *free_nir = true;
+ return nir_deserialize(NULL, options, &blob_reader);
+ }
+ return NULL;
}
-static bool si_llvm_compile_shader(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader *shader,
- struct pipe_debug_callback *debug,
- struct nir_shader *nir,
- bool free_nir)
+static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+ struct si_shader *shader, struct pipe_debug_callback *debug,
+ struct nir_shader *nir, bool free_nir)
{
- struct si_shader_selector *sel = shader->selector;
- struct si_shader_context ctx;
-
- si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
-
- LLVMValueRef ngg_cull_main_fn = NULL;
- if (shader->key.opt.ngg_culling) {
- if (!si_build_main_function(&ctx, shader, nir, false, true)) {
- si_llvm_dispose(&ctx);
- return false;
- }
- ngg_cull_main_fn = ctx.main_fn;
- ctx.main_fn = NULL;
- }
-
- if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) {
- si_llvm_dispose(&ctx);
- return false;
- }
-
- if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
- LLVMValueRef parts[4];
- unsigned num_parts = 0;
- bool has_prolog = false;
- LLVMValueRef main_fn = ctx.main_fn;
-
- if (ngg_cull_main_fn) {
- if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
- &shader->key, true)) {
- union si_shader_part_key prolog_key;
- si_get_vs_prolog_key(&sel->info,
- shader->info.num_input_sgprs,
- true,
- &shader->key.part.vs.prolog,
- shader, &prolog_key);
- prolog_key.vs_prolog.is_monolithic = true;
- si_llvm_build_vs_prolog(&ctx, &prolog_key);
- parts[num_parts++] = ctx.main_fn;
- has_prolog = true;
- }
- parts[num_parts++] = ngg_cull_main_fn;
- }
-
- if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
- &shader->key, false)) {
- union si_shader_part_key prolog_key;
- si_get_vs_prolog_key(&sel->info,
- shader->info.num_input_sgprs,
- false,
- &shader->key.part.vs.prolog,
- shader, &prolog_key);
- prolog_key.vs_prolog.is_monolithic = true;
- si_llvm_build_vs_prolog(&ctx, &prolog_key);
- parts[num_parts++] = ctx.main_fn;
- has_prolog = true;
- }
- parts[num_parts++] = main_fn;
-
- si_build_wrapper_function(&ctx, parts, num_parts,
- has_prolog ? 1 : 0, 0);
-
- if (ctx.shader->key.opt.vs_as_prim_discard_cs)
- si_build_prim_discard_compute_shader(&ctx);
- } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
- ngg_cull_main_fn) {
- LLVMValueRef parts[2];
-
- parts[0] = ngg_cull_main_fn;
- parts[1] = ctx.main_fn;
-
- si_build_wrapper_function(&ctx, parts, 2, 0, 0);
- } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
- if (sscreen->info.chip_class >= GFX9) {
- struct si_shader_selector *ls = shader->key.part.tcs.ls;
- LLVMValueRef parts[4];
- bool vs_needs_prolog =
- si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog,
- &shader->key, false);
-
- /* TCS main part */
- parts[2] = ctx.main_fn;
-
- /* TCS epilog */
- union si_shader_part_key tcs_epilog_key;
- memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
- tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
- si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
- parts[3] = ctx.main_fn;
-
- /* VS as LS main part */
- nir = get_nir_shader(ls, &free_nir);
- struct si_shader shader_ls = {};
- shader_ls.selector = ls;
- shader_ls.key.as_ls = 1;
- shader_ls.key.mono = shader->key.mono;
- shader_ls.key.opt = shader->key.opt;
- shader_ls.is_monolithic = true;
-
- if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) {
- si_llvm_dispose(&ctx);
- return false;
- }
- shader->info.uses_instanceid |= ls->info.uses_instanceid;
- parts[1] = ctx.main_fn;
-
- /* LS prolog */
- if (vs_needs_prolog) {
- union si_shader_part_key vs_prolog_key;
- si_get_vs_prolog_key(&ls->info,
- shader_ls.info.num_input_sgprs,
- false,
- &shader->key.part.tcs.ls_prolog,
- shader, &vs_prolog_key);
- vs_prolog_key.vs_prolog.is_monolithic = true;
- si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
- parts[0] = ctx.main_fn;
- }
-
- /* Reset the shader context. */
- ctx.shader = shader;
- ctx.type = PIPE_SHADER_TESS_CTRL;
-
- si_build_wrapper_function(&ctx,
- parts + !vs_needs_prolog,
- 4 - !vs_needs_prolog, vs_needs_prolog,
- vs_needs_prolog ? 2 : 1);
- } else {
- LLVMValueRef parts[2];
- union si_shader_part_key epilog_key;
-
- parts[0] = ctx.main_fn;
-
- memset(&epilog_key, 0, sizeof(epilog_key));
- epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
- si_llvm_build_tcs_epilog(&ctx, &epilog_key);
- parts[1] = ctx.main_fn;
-
- si_build_wrapper_function(&ctx, parts, 2, 0, 0);
- }
- } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
- if (ctx.screen->info.chip_class >= GFX9) {
- struct si_shader_selector *es = shader->key.part.gs.es;
- LLVMValueRef es_prolog = NULL;
- LLVMValueRef es_main = NULL;
- LLVMValueRef gs_prolog = NULL;
- LLVMValueRef gs_main = ctx.main_fn;
-
- /* GS prolog */
- union si_shader_part_key gs_prolog_key;
- memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
- gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
- gs_prolog_key.gs_prolog.is_monolithic = true;
- gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
- si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
- gs_prolog = ctx.main_fn;
-
- /* ES main part */
- nir = get_nir_shader(es, &free_nir);
- struct si_shader shader_es = {};
- shader_es.selector = es;
- shader_es.key.as_es = 1;
- shader_es.key.as_ngg = shader->key.as_ngg;
- shader_es.key.mono = shader->key.mono;
- shader_es.key.opt = shader->key.opt;
- shader_es.is_monolithic = true;
-
- if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) {
- si_llvm_dispose(&ctx);
- return false;
- }
- shader->info.uses_instanceid |= es->info.uses_instanceid;
- es_main = ctx.main_fn;
-
- /* ES prolog */
- if (es->type == PIPE_SHADER_VERTEX &&
- si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog,
- &shader->key, false)) {
- union si_shader_part_key vs_prolog_key;
- si_get_vs_prolog_key(&es->info,
- shader_es.info.num_input_sgprs,
- false,
- &shader->key.part.gs.vs_prolog,
- shader, &vs_prolog_key);
- vs_prolog_key.vs_prolog.is_monolithic = true;
- si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
- es_prolog = ctx.main_fn;
- }
-
- /* Reset the shader context. */
- ctx.shader = shader;
- ctx.type = PIPE_SHADER_GEOMETRY;
-
- /* Prepare the array of shader parts. */
- LLVMValueRef parts[4];
- unsigned num_parts = 0, main_part, next_first_part;
-
- if (es_prolog)
- parts[num_parts++] = es_prolog;
-
- parts[main_part = num_parts++] = es_main;
- parts[next_first_part = num_parts++] = gs_prolog;
- parts[num_parts++] = gs_main;
-
- si_build_wrapper_function(&ctx, parts, num_parts,
- main_part, next_first_part);
- } else {
- LLVMValueRef parts[2];
- union si_shader_part_key prolog_key;
-
- parts[1] = ctx.main_fn;
-
- memset(&prolog_key, 0, sizeof(prolog_key));
- prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
- si_llvm_build_gs_prolog(&ctx, &prolog_key);
- parts[0] = ctx.main_fn;
-
- si_build_wrapper_function(&ctx, parts, 2, 1, 0);
- }
- } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
- si_llvm_build_monolithic_ps(&ctx, shader);
- }
-
- si_llvm_optimize_module(&ctx);
-
- /* Post-optimization transformations and analysis. */
- si_optimize_vs_outputs(&ctx);
-
- if ((debug && debug->debug_message) ||
- si_can_dump_shader(sscreen, ctx.type)) {
- ctx.shader->info.private_mem_vgprs =
- ac_count_scratch_private_memory(ctx.main_fn);
- }
-
- /* Make sure the input is a pointer and not integer followed by inttoptr. */
- assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) ==
- LLVMPointerTypeKind);
-
- /* Compile to bytecode. */
- if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
- &ctx.ac, debug, ctx.type, si_get_shader_name(shader),
- si_should_optimize_less(compiler, shader->selector))) {
- si_llvm_dispose(&ctx);
- fprintf(stderr, "LLVM failed to compile shader\n");
- return false;
- }
-
- si_llvm_dispose(&ctx);
- return true;
+ struct si_shader_selector *sel = shader->selector;
+ struct si_shader_context ctx;
+
+ si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
+
+ LLVMValueRef ngg_cull_main_fn = NULL;
+ if (shader->key.opt.ngg_culling) {
+ if (!si_build_main_function(&ctx, shader, nir, false, true)) {
+ si_llvm_dispose(&ctx);
+ return false;
+ }
+ ngg_cull_main_fn = ctx.main_fn;
+ ctx.main_fn = NULL;
+ }
+
+ if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) {
+ si_llvm_dispose(&ctx);
+ return false;
+ }
+
+ if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
+ LLVMValueRef parts[4];
+ unsigned num_parts = 0;
+ bool has_prolog = false;
+ LLVMValueRef main_fn = ctx.main_fn;
+
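+ /* Part order: optional prolog for the culling variant, the NGG-culling main part,
+  * an optional regular prolog, then the main part. */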
+ if (ngg_cull_main_fn) {
+ if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, true)) {
+ union si_shader_part_key prolog_key;
+ si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true,
+ &shader->key.part.vs.prolog, shader, &prolog_key);
+ prolog_key.vs_prolog.is_monolithic = true;
+ si_llvm_build_vs_prolog(&ctx, &prolog_key);
+ parts[num_parts++] = ctx.main_fn;
+ has_prolog = true;
+ }
+ parts[num_parts++] = ngg_cull_main_fn;
+ }
+
+ if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, false)) {
+ union si_shader_part_key prolog_key;
+ si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false,
+ &shader->key.part.vs.prolog, shader, &prolog_key);
+ prolog_key.vs_prolog.is_monolithic = true;
+ si_llvm_build_vs_prolog(&ctx, &prolog_key);
+ parts[num_parts++] = ctx.main_fn;
+ has_prolog = true;
+ }
+ parts[num_parts++] = main_fn;
+
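+ /* Build one LLVM function that runs the collected parts in sequence. */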
+ si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0);
+
+ if (ctx.shader->key.opt.vs_as_prim_discard_cs)
+ si_build_prim_discard_compute_shader(&ctx);
+ } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL && ngg_cull_main_fn) {
+ LLVMValueRef parts[2];
+
+ parts[0] = ngg_cull_main_fn;
+ parts[1] = ctx.main_fn;
+
+ si_build_wrapper_function(&ctx, parts, 2, 0, 0);
+ } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
+ if (sscreen->info.chip_class >= GFX9) {
+ struct si_shader_selector *ls = shader->key.part.tcs.ls;
+ LLVMValueRef parts[4];
+ bool vs_needs_prolog =
+ si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, &shader->key, false);
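+ /* Part layout: parts[0] = LS (VS) prolog if needed, parts[1] = LS main,
+  * parts[2] = TCS main, parts[3] = TCS epilog. */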
+
+ /* TCS main part */
+ parts[2] = ctx.main_fn;
+
+ /* TCS epilog */
+ union si_shader_part_key tcs_epilog_key;
+ memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
+ tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+ si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
+ parts[3] = ctx.main_fn;
+
+ /* VS as LS main part */
+ nir = get_nir_shader(ls, &free_nir);
+ struct si_shader shader_ls = {};
+ shader_ls.selector = ls;
+ shader_ls.key.as_ls = 1;
+ shader_ls.key.mono = shader->key.mono;
+ shader_ls.key.opt = shader->key.opt;
+ shader_ls.is_monolithic = true;
+
+ if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) {
+ si_llvm_dispose(&ctx);
+ return false;
+ }
+ shader->info.uses_instanceid |= ls->info.uses_instanceid;
+ parts[1] = ctx.main_fn;
+
+ /* LS prolog */
+ if (vs_needs_prolog) {
+ union si_shader_part_key vs_prolog_key;
+ si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false,
+ &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key);
+ vs_prolog_key.vs_prolog.is_monolithic = true;
+ si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
+ parts[0] = ctx.main_fn;
+ }
+
+ /* Reset the shader context. */
+ ctx.shader = shader;
+ ctx.type = PIPE_SHADER_TESS_CTRL;
+
+ si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog,
+ vs_needs_prolog, vs_needs_prolog ? 2 : 1);
+ } else {
+ LLVMValueRef parts[2];
+ union si_shader_part_key epilog_key;
+
+ parts[0] = ctx.main_fn;
+
+ memset(&epilog_key, 0, sizeof(epilog_key));
+ epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+ si_llvm_build_tcs_epilog(&ctx, &epilog_key);
+ parts[1] = ctx.main_fn;
+
+ si_build_wrapper_function(&ctx, parts, 2, 0, 0);
+ }
+ } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
+ if (ctx.screen->info.chip_class >= GFX9) {
+ struct si_shader_selector *es = shader->key.part.gs.es;
+ LLVMValueRef es_prolog = NULL;
+ LLVMValueRef es_main = NULL;
+ LLVMValueRef gs_prolog = NULL;
+ LLVMValueRef gs_main = ctx.main_fn;
+
+ /* GS prolog */
+ union si_shader_part_key gs_prolog_key;
+ memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
+ gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
+ gs_prolog_key.gs_prolog.is_monolithic = true;
+ gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
+ si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
+ gs_prolog = ctx.main_fn;
+
+ /* ES main part */
+ nir = get_nir_shader(es, &free_nir);
+ struct si_shader shader_es = {};
+ shader_es.selector = es;
+ shader_es.key.as_es = 1;
+ shader_es.key.as_ngg = shader->key.as_ngg;
+ shader_es.key.mono = shader->key.mono;
+ shader_es.key.opt = shader->key.opt;
+ shader_es.is_monolithic = true;
+
+ if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) {
+ si_llvm_dispose(&ctx);
+ return false;
+ }
+ shader->info.uses_instanceid |= es->info.uses_instanceid;
+ es_main = ctx.main_fn;
+
+ /* ES prolog */
+ if (es->type == PIPE_SHADER_VERTEX &&
+ si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, &shader->key, false)) {
+ union si_shader_part_key vs_prolog_key;
+ si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false,
+ &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key);
+ vs_prolog_key.vs_prolog.is_monolithic = true;
+ si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
+ es_prolog = ctx.main_fn;
+ }
+
+ /* Reset the shader context. */
+ ctx.shader = shader;
+ ctx.type = PIPE_SHADER_GEOMETRY;
+
+ /* Prepare the array of shader parts. */
+ LLVMValueRef parts[4];
+ unsigned num_parts = 0, main_part, next_first_part;
+
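+ /* Order: ES prolog (if any), ES main, GS prolog, GS main. */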
+ if (es_prolog)
+ parts[num_parts++] = es_prolog;
+
+ parts[main_part = num_parts++] = es_main;
+ parts[next_first_part = num_parts++] = gs_prolog;
+ parts[num_parts++] = gs_main;
+
+ si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part);
+ } else {
+ LLVMValueRef parts[2];
+ union si_shader_part_key prolog_key;
+
+ parts[1] = ctx.main_fn;
+
+ memset(&prolog_key, 0, sizeof(prolog_key));
+ prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
+ si_llvm_build_gs_prolog(&ctx, &prolog_key);
+ parts[0] = ctx.main_fn;
+
+ si_build_wrapper_function(&ctx, parts, 2, 1, 0);
+ }
+ } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
+ si_llvm_build_monolithic_ps(&ctx, shader);
+ }
+
+ si_llvm_optimize_module(&ctx);
+
+ /* Post-optimization transformations and analysis. */
+ si_optimize_vs_outputs(&ctx);
+
+ if ((debug && debug->debug_message) || si_can_dump_shader(sscreen, ctx.type)) {
+ ctx.shader->info.private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_fn);
+ }
+
+ /* Make sure the input is a pointer and not an integer followed by inttoptr. */
+ assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
+
+ /* Compile to bytecode. */
+ if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,
+ ctx.type, si_get_shader_name(shader),
+ si_should_optimize_less(compiler, shader->selector))) {
+ si_llvm_dispose(&ctx);
+ fprintf(stderr, "LLVM failed to compile shader\n");
+ return false;
+ }
+
+ si_llvm_dispose(&ctx);
+ return true;
}
-bool si_compile_shader(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader *shader,
- struct pipe_debug_callback *debug)
+bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+ struct si_shader *shader, struct pipe_debug_callback *debug)
{
- struct si_shader_selector *sel = shader->selector;
- bool free_nir;
- struct nir_shader *nir = get_nir_shader(sel, &free_nir);
-
- /* Dump NIR before doing NIR->LLVM conversion in case the
- * conversion fails. */
- if (si_can_dump_shader(sscreen, sel->type) &&
- !(sscreen->debug_flags & DBG(NO_NIR))) {
- nir_print_shader(nir, stderr);
- si_dump_streamout(&sel->so);
- }
-
- memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
- sizeof(shader->info.vs_output_param_offset));
-
- shader->info.uses_instanceid = sel->info.uses_instanceid;
-
- /* TODO: ACO could compile non-monolithic shaders here (starting
- * with PS and NGG VS), but monolithic shaders should be compiled
- * by LLVM due to more complicated compilation.
- */
- if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))
- return false;
-
- /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
- * LLVM 3.9svn has this bug.
- */
- if (sel->type == PIPE_SHADER_COMPUTE) {
- unsigned wave_size = sscreen->compute_wave_size;
- unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd *
- (wave_size == 32 ? 2 : 1);
- unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
- unsigned max_sgprs_per_wave = 128;
- unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
- unsigned threads_per_tg = si_get_max_workgroup_size(shader);
- unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
- unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
-
- max_vgprs = max_vgprs / waves_per_simd;
- max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
-
- if (shader->config.num_sgprs > max_sgprs ||
- shader->config.num_vgprs > max_vgprs) {
- fprintf(stderr, "LLVM failed to compile a shader correctly: "
- "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
- shader->config.num_sgprs, shader->config.num_vgprs,
- max_sgprs, max_vgprs);
-
- /* Just terminate the process, because dependent
- * shaders can hang due to bad input data, but use
- * the env var to allow shader-db to work.
- */
- if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
- abort();
- }
- }
-
- /* Add the scratch offset to input SGPRs. */
- if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))
- shader->info.num_input_sgprs += 1; /* scratch byte offset */
-
- /* Calculate the number of fragment input VGPRs. */
- if (sel->type == PIPE_SHADER_FRAGMENT) {
- shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(&shader->config,
- &shader->info.face_vgpr_index,
- &shader->info.ancillary_vgpr_index);
- }
-
- si_calculate_max_simd_waves(shader);
- si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
- return true;
+ struct si_shader_selector *sel = shader->selector;
+ bool free_nir;
+ struct nir_shader *nir = get_nir_shader(sel, &free_nir);
+
+ /* Dump NIR before doing NIR->LLVM conversion in case the
+ * conversion fails. */
+ if (si_can_dump_shader(sscreen, sel->type) && !(sscreen->debug_flags & DBG(NO_NIR))) {
+ nir_print_shader(nir, stderr);
+ si_dump_streamout(&sel->so);
+ }
+
+ memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
+ sizeof(shader->info.vs_output_param_offset));
+
+ shader->info.uses_instanceid = sel->info.uses_instanceid;
+
+ /* TODO: ACO could compile non-monolithic shaders here (starting
+ * with PS and NGG VS), but monolithic shaders should be compiled
+ * by LLVM due to more complicated compilation.
+ */
+ if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))
+ return false;
+
+ /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
+ * LLVM 3.9svn has this bug.
+ */
+ if (sel->type == PIPE_SHADER_COMPUTE) {
+ unsigned wave_size = sscreen->compute_wave_size;
+ unsigned max_vgprs =
+ sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1);
+ unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
+ unsigned max_sgprs_per_wave = 128;
+ unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
+ unsigned threads_per_tg = si_get_max_workgroup_size(shader);
+ unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
+ unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
+
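+ /* All waves of a workgroup that run on the same SIMD share its register file,
+  * so the per-wave limits shrink with the number of waves per SIMD. */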
+ max_vgprs = max_vgprs / waves_per_simd;
+ max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
+
+ if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
+ fprintf(stderr,
+ "LLVM failed to compile a shader correctly: "
+ "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
+ shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);
+
+ /* Just terminate the process, because dependent
+ * shaders can hang due to bad input data, but use
+ * the env var to allow shader-db to work.
+ */
+ if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
+ abort();
+ }
+ }
+
+ /* Add the scratch offset to input SGPRs. */
+ if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))
+ shader->info.num_input_sgprs += 1; /* scratch byte offset */
+
+ /* Calculate the number of fragment input VGPRs. */
+ if (sel->type == PIPE_SHADER_FRAGMENT) {
+ shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
+ &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index);
+ }
+
+ si_calculate_max_simd_waves(shader);
+ si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
+ return true;
}
/**
* \return non-NULL on success
*/
static struct si_shader_part *
-si_get_shader_part(struct si_screen *sscreen,
- struct si_shader_part **list,
- enum pipe_shader_type type,
- bool prolog,
- union si_shader_part_key *key,
- struct ac_llvm_compiler *compiler,
- struct pipe_debug_callback *debug,
- void (*build)(struct si_shader_context *,
- union si_shader_part_key *),
- const char *name)
+si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
+ enum pipe_shader_type type, bool prolog, union si_shader_part_key *key,
+ struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug,
+ void (*build)(struct si_shader_context *, union si_shader_part_key *),
+ const char *name)
{
- struct si_shader_part *result;
-
- simple_mtx_lock(&sscreen->shader_parts_mutex);
-
- /* Find existing. */
- for (result = *list; result; result = result->next) {
- if (memcmp(&result->key, key, sizeof(*key)) == 0) {
- simple_mtx_unlock(&sscreen->shader_parts_mutex);
- return result;
- }
- }
-
- /* Compile a new one. */
- result = CALLOC_STRUCT(si_shader_part);
- result->key = *key;
-
- struct si_shader_selector sel = {};
- sel.screen = sscreen;
-
- struct si_shader shader = {};
- shader.selector = &sel;
-
- switch (type) {
- case PIPE_SHADER_VERTEX:
- shader.key.as_ls = key->vs_prolog.as_ls;
- shader.key.as_es = key->vs_prolog.as_es;
- shader.key.as_ngg = key->vs_prolog.as_ngg;
- shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
- break;
- case PIPE_SHADER_TESS_CTRL:
- assert(!prolog);
- shader.key.part.tcs.epilog = key->tcs_epilog.states;
- break;
- case PIPE_SHADER_GEOMETRY:
- assert(prolog);
- shader.key.as_ngg = key->gs_prolog.as_ngg;
- break;
- case PIPE_SHADER_FRAGMENT:
- if (prolog)
- shader.key.part.ps.prolog = key->ps_prolog.states;
- else
- shader.key.part.ps.epilog = key->ps_epilog.states;
- break;
- default:
- unreachable("bad shader part");
- }
-
- struct si_shader_context ctx;
- si_llvm_context_init(&ctx, sscreen, compiler,
- si_get_wave_size(sscreen, type, shader.key.as_ngg,
- shader.key.as_es,
- shader.key.opt.vs_as_prim_discard_cs));
- ctx.shader = &shader;
- ctx.type = type;
-
- build(&ctx, key);
-
- /* Compile. */
- si_llvm_optimize_module(&ctx);
-
- if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
- &ctx.ac, debug, ctx.type, name, false)) {
- FREE(result);
- result = NULL;
- goto out;
- }
-
- result->next = *list;
- *list = result;
+ struct si_shader_part *result;
+
+ simple_mtx_lock(&sscreen->shader_parts_mutex);
+
+ /* Find existing. */
+ for (result = *list; result; result = result->next) {
+ if (memcmp(&result->key, key, sizeof(*key)) == 0) {
+ simple_mtx_unlock(&sscreen->shader_parts_mutex);
+ return result;
+ }
+ }
+
+ /* Compile a new one. */
+ result = CALLOC_STRUCT(si_shader_part);
+ result->key = *key;
+
+ struct si_shader_selector sel = {};
+ sel.screen = sscreen;
+
+ struct si_shader shader = {};
+ shader.selector = &sel;
+
+ switch (type) {
+ case PIPE_SHADER_VERTEX:
+ shader.key.as_ls = key->vs_prolog.as_ls;
+ shader.key.as_es = key->vs_prolog.as_es;
+ shader.key.as_ngg = key->vs_prolog.as_ngg;
+ shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ assert(!prolog);
+ shader.key.part.tcs.epilog = key->tcs_epilog.states;
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ assert(prolog);
+ shader.key.as_ngg = key->gs_prolog.as_ngg;
+ break;
+ case PIPE_SHADER_FRAGMENT:
+ if (prolog)
+ shader.key.part.ps.prolog = key->ps_prolog.states;
+ else
+ shader.key.part.ps.epilog = key->ps_epilog.states;
+ break;
+ default:
+ unreachable("bad shader part");
+ }
+
+ struct si_shader_context ctx;
+ si_llvm_context_init(&ctx, sscreen, compiler,
+ si_get_wave_size(sscreen, type, shader.key.as_ngg, shader.key.as_es,
+ shader.key.opt.vs_as_prim_discard_cs));
+ ctx.shader = &shader;
+ ctx.type = type;
+
+ build(&ctx, key);
+
+ /* Compile. */
+ si_llvm_optimize_module(&ctx);
+
+ if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug,
+ ctx.type, name, false)) {
+ FREE(result);
+ result = NULL;
+ goto out;
+ }
+
+ result->next = *list;
+ *list = result;
out:
- si_llvm_dispose(&ctx);
- simple_mtx_unlock(&sscreen->shader_parts_mutex);
- return result;
+ si_llvm_dispose(&ctx);
+ simple_mtx_unlock(&sscreen->shader_parts_mutex);
+ return result;
}
-static bool si_get_vs_prolog(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader *shader,
- struct pipe_debug_callback *debug,
- struct si_shader *main_part,
- const struct si_vs_prolog_bits *key)
+static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+ struct si_shader *shader, struct pipe_debug_callback *debug,
+ struct si_shader *main_part, const struct si_vs_prolog_bits *key)
{
- struct si_shader_selector *vs = main_part->selector;
-
- if (!si_vs_needs_prolog(vs, key, &shader->key, false))
- return true;
-
- /* Get the prolog. */
- union si_shader_part_key prolog_key;
- si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false,
- key, shader, &prolog_key);
-
- shader->prolog =
- si_get_shader_part(sscreen, &sscreen->vs_prologs,
- PIPE_SHADER_VERTEX, true, &prolog_key, compiler,
- debug, si_llvm_build_vs_prolog,
- "Vertex Shader Prolog");
- return shader->prolog != NULL;
+ struct si_shader_selector *vs = main_part->selector;
+
+ if (!si_vs_needs_prolog(vs, key, &shader->key, false))
+ return true;
+
+ /* Get the prolog. */
+ union si_shader_part_key prolog_key;
+ si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
+ &prolog_key);
+
+ shader->prolog =
+ si_get_shader_part(sscreen, &sscreen->vs_prologs, PIPE_SHADER_VERTEX, true, &prolog_key,
+ compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog");
+ return shader->prolog != NULL;
}
/**
* Select and compile (or reuse) vertex shader parts (prolog & epilog).
*/
-static bool si_shader_select_vs_parts(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader *shader,
- struct pipe_debug_callback *debug)
+static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+ struct si_shader *shader, struct pipe_debug_callback *debug)
{
- return si_get_vs_prolog(sscreen, compiler, shader, debug, shader,
- &shader->key.part.vs.prolog);
+ return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog);
}
/**
* Select and compile (or reuse) TCS parts (epilog).
*/
-static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader *shader,
- struct pipe_debug_callback *debug)
+static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+ struct si_shader *shader, struct pipe_debug_callback *debug)
{
- if (sscreen->info.chip_class >= GFX9) {
- struct si_shader *ls_main_part =
- shader->key.part.tcs.ls->main_shader_part_ls;
-
- if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
- &shader->key.part.tcs.ls_prolog))
- return false;
-
- shader->previous_stage = ls_main_part;
- }
-
- /* Get the epilog. */
- union si_shader_part_key epilog_key;
- memset(&epilog_key, 0, sizeof(epilog_key));
- epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
-
- shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
- PIPE_SHADER_TESS_CTRL, false,
- &epilog_key, compiler, debug,
- si_llvm_build_tcs_epilog,
- "Tessellation Control Shader Epilog");
- return shader->epilog != NULL;
+ if (sscreen->info.chip_class >= GFX9) {
+ struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls;
+
+ if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
+ &shader->key.part.tcs.ls_prolog))
+ return false;
+
+ shader->previous_stage = ls_main_part;
+ }
+
+ /* Get the epilog. */
+ union si_shader_part_key epilog_key;
+ memset(&epilog_key, 0, sizeof(epilog_key));
+ epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+
+ shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, PIPE_SHADER_TESS_CTRL, false,
+ &epilog_key, compiler, debug, si_llvm_build_tcs_epilog,
+ "Tessellation Control Shader Epilog");
+ return shader->epilog != NULL;
}
/**
* Select and compile (or reuse) GS parts (prolog).
*/
-static bool si_shader_select_gs_parts(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader *shader,
- struct pipe_debug_callback *debug)
+static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+ struct si_shader *shader, struct pipe_debug_callback *debug)
{
- if (sscreen->info.chip_class >= GFX9) {
- struct si_shader *es_main_part;
- enum pipe_shader_type es_type = shader->key.part.gs.es->type;
-
- if (shader->key.as_ngg)
- es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
- else
- es_main_part = shader->key.part.gs.es->main_shader_part_es;
-
- if (es_type == PIPE_SHADER_VERTEX &&
- !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
- &shader->key.part.gs.vs_prolog))
- return false;
-
- shader->previous_stage = es_main_part;
- }
-
- if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
- return true;
-
- union si_shader_part_key prolog_key;
- memset(&prolog_key, 0, sizeof(prolog_key));
- prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
- prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
-
- shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
- PIPE_SHADER_GEOMETRY, true,
- &prolog_key, compiler, debug,
- si_llvm_build_gs_prolog,
- "Geometry Shader Prolog");
- return shader->prolog2 != NULL;
+ if (sscreen->info.chip_class >= GFX9) {
+ struct si_shader *es_main_part;
+ enum pipe_shader_type es_type = shader->key.part.gs.es->type;
+
+ if (shader->key.as_ngg)
+ es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
+ else
+ es_main_part = shader->key.part.gs.es->main_shader_part_es;
+
+ if (es_type == PIPE_SHADER_VERTEX &&
+ !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
+ &shader->key.part.gs.vs_prolog))
+ return false;
+
+ shader->previous_stage = es_main_part;
+ }
+
+ if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
+ return true;
+
+ union si_shader_part_key prolog_key;
+ memset(&prolog_key, 0, sizeof(prolog_key));
+ prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
+ prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
+
+ shader->prolog2 =
+ si_get_shader_part(sscreen, &sscreen->gs_prologs, PIPE_SHADER_GEOMETRY, true, &prolog_key,
+ compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog");
+ return shader->prolog2 != NULL;
}
/**
* Compute the PS prolog key, which contains all the information needed to
* build the PS prolog function, and set related bits in shader->config.
*/
-void si_get_ps_prolog_key(struct si_shader *shader,
- union si_shader_part_key *key,
- bool separate_prolog)
+void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
+ bool separate_prolog)
{
- struct si_shader_info *info = &shader->selector->info;
-
- memset(key, 0, sizeof(*key));
- key->ps_prolog.states = shader->key.part.ps.prolog;
- key->ps_prolog.colors_read = info->colors_read;
- key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
- key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
- key->ps_prolog.wqm = info->uses_derivatives &&
- (key->ps_prolog.colors_read ||
- key->ps_prolog.states.force_persp_sample_interp ||
- key->ps_prolog.states.force_linear_sample_interp ||
- key->ps_prolog.states.force_persp_center_interp ||
- key->ps_prolog.states.force_linear_center_interp ||
- key->ps_prolog.states.bc_optimize_for_persp ||
- key->ps_prolog.states.bc_optimize_for_linear);
- key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
-
- if (info->colors_read) {
- unsigned *color = shader->selector->color_attr_index;
-
- if (shader->key.part.ps.prolog.color_two_side) {
- /* BCOLORs are stored after the last input. */
- key->ps_prolog.num_interp_inputs = info->num_inputs;
- key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
- if (separate_prolog)
- shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
- }
-
- for (unsigned i = 0; i < 2; i++) {
- unsigned interp = info->input_interpolate[color[i]];
- unsigned location = info->input_interpolate_loc[color[i]];
-
- if (!(info->colors_read & (0xf << i*4)))
- continue;
-
- key->ps_prolog.color_attr_index[i] = color[i];
-
- if (shader->key.part.ps.prolog.flatshade_colors &&
- interp == TGSI_INTERPOLATE_COLOR)
- interp = TGSI_INTERPOLATE_CONSTANT;
-
- switch (interp) {
- case TGSI_INTERPOLATE_CONSTANT:
- key->ps_prolog.color_interp_vgpr_index[i] = -1;
- break;
- case TGSI_INTERPOLATE_PERSPECTIVE:
- case TGSI_INTERPOLATE_COLOR:
- /* Force the interpolation location for colors here. */
- if (shader->key.part.ps.prolog.force_persp_sample_interp)
- location = TGSI_INTERPOLATE_LOC_SAMPLE;
- if (shader->key.part.ps.prolog.force_persp_center_interp)
- location = TGSI_INTERPOLATE_LOC_CENTER;
-
- switch (location) {
- case TGSI_INTERPOLATE_LOC_SAMPLE:
- key->ps_prolog.color_interp_vgpr_index[i] = 0;
- if (separate_prolog) {
- shader->config.spi_ps_input_ena |=
- S_0286CC_PERSP_SAMPLE_ENA(1);
- }
- break;
- case TGSI_INTERPOLATE_LOC_CENTER:
- key->ps_prolog.color_interp_vgpr_index[i] = 2;
- if (separate_prolog) {
- shader->config.spi_ps_input_ena |=
- S_0286CC_PERSP_CENTER_ENA(1);
- }
- break;
- case TGSI_INTERPOLATE_LOC_CENTROID:
- key->ps_prolog.color_interp_vgpr_index[i] = 4;
- if (separate_prolog) {
- shader->config.spi_ps_input_ena |=
- S_0286CC_PERSP_CENTROID_ENA(1);
- }
- break;
- default:
- assert(0);
- }
- break;
- case TGSI_INTERPOLATE_LINEAR:
- /* Force the interpolation location for colors here. */
- if (shader->key.part.ps.prolog.force_linear_sample_interp)
- location = TGSI_INTERPOLATE_LOC_SAMPLE;
- if (shader->key.part.ps.prolog.force_linear_center_interp)
- location = TGSI_INTERPOLATE_LOC_CENTER;
-
- /* The VGPR assignment for non-monolithic shaders
- * works because InitialPSInputAddr is set on the
- * main shader and PERSP_PULL_MODEL is never used.
- */
- switch (location) {
- case TGSI_INTERPOLATE_LOC_SAMPLE:
- key->ps_prolog.color_interp_vgpr_index[i] =
- separate_prolog ? 6 : 9;
- if (separate_prolog) {
- shader->config.spi_ps_input_ena |=
- S_0286CC_LINEAR_SAMPLE_ENA(1);
- }
- break;
- case TGSI_INTERPOLATE_LOC_CENTER:
- key->ps_prolog.color_interp_vgpr_index[i] =
- separate_prolog ? 8 : 11;
- if (separate_prolog) {
- shader->config.spi_ps_input_ena |=
- S_0286CC_LINEAR_CENTER_ENA(1);
- }
- break;
- case TGSI_INTERPOLATE_LOC_CENTROID:
- key->ps_prolog.color_interp_vgpr_index[i] =
- separate_prolog ? 10 : 13;
- if (separate_prolog) {
- shader->config.spi_ps_input_ena |=
- S_0286CC_LINEAR_CENTROID_ENA(1);
- }
- break;
- default:
- assert(0);
- }
- break;
- default:
- assert(0);
- }
- }
- }
+ struct si_shader_info *info = &shader->selector->info;
+
+ memset(key, 0, sizeof(*key));
+ key->ps_prolog.states = shader->key.part.ps.prolog;
+ key->ps_prolog.colors_read = info->colors_read;
+ key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
+ key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
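+ /* The prolog must run in whole-quad mode (WQM) when the main shader uses
+  * derivatives and the prolog itself produces interpolated values. */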
+ key->ps_prolog.wqm =
+ info->uses_derivatives &&
+ (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
+ key->ps_prolog.states.force_linear_sample_interp ||
+ key->ps_prolog.states.force_persp_center_interp ||
+ key->ps_prolog.states.force_linear_center_interp ||
+ key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
+ key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
+
+ if (info->colors_read) {
+ unsigned *color = shader->selector->color_attr_index;
+
+ if (shader->key.part.ps.prolog.color_two_side) {
+ /* BCOLORs are stored after the last input. */
+ key->ps_prolog.num_interp_inputs = info->num_inputs;
+ key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
+ if (separate_prolog)
+ shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
+ }
+
+ for (unsigned i = 0; i < 2; i++) {
+ unsigned interp = info->input_interpolate[color[i]];
+ unsigned location = info->input_interpolate_loc[color[i]];
+
+ if (!(info->colors_read & (0xf << i * 4)))
+ continue;
+
+ key->ps_prolog.color_attr_index[i] = color[i];
+
+ if (shader->key.part.ps.prolog.flatshade_colors && interp == TGSI_INTERPOLATE_COLOR)
+ interp = TGSI_INTERPOLATE_CONSTANT;
+
+ switch (interp) {
+ case TGSI_INTERPOLATE_CONSTANT:
+ key->ps_prolog.color_interp_vgpr_index[i] = -1;
+ break;
+ case TGSI_INTERPOLATE_PERSPECTIVE:
+ case TGSI_INTERPOLATE_COLOR:
+ /* Force the interpolation location for colors here. */
+ if (shader->key.part.ps.prolog.force_persp_sample_interp)
+ location = TGSI_INTERPOLATE_LOC_SAMPLE;
+ if (shader->key.part.ps.prolog.force_persp_center_interp)
+ location = TGSI_INTERPOLATE_LOC_CENTER;
+
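+ /* PERSP_SAMPLE, PERSP_CENTER and PERSP_CENTROID occupy input VGPR pairs 0-1, 2-3 and 4-5. */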
+ switch (location) {
+ case TGSI_INTERPOLATE_LOC_SAMPLE:
+ key->ps_prolog.color_interp_vgpr_index[i] = 0;
+ if (separate_prolog) {
+ shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
+ }
+ break;
+ case TGSI_INTERPOLATE_LOC_CENTER:
+ key->ps_prolog.color_interp_vgpr_index[i] = 2;
+ if (separate_prolog) {
+ shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
+ }
+ break;
+ case TGSI_INTERPOLATE_LOC_CENTROID:
+ key->ps_prolog.color_interp_vgpr_index[i] = 4;
+ if (separate_prolog) {
+ shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
+ }
+ break;
+ default:
+ assert(0);
+ }
+ break;
+ case TGSI_INTERPOLATE_LINEAR:
+ /* Force the interpolation location for colors here. */
+ if (shader->key.part.ps.prolog.force_linear_sample_interp)
+ location = TGSI_INTERPOLATE_LOC_SAMPLE;
+ if (shader->key.part.ps.prolog.force_linear_center_interp)
+ location = TGSI_INTERPOLATE_LOC_CENTER;
+
+ /* The VGPR assignment for non-monolithic shaders
+ * works because InitialPSInputAddr is set on the
+ * main shader and PERSP_PULL_MODEL is never used.
+ */
+ switch (location) {
+ case TGSI_INTERPOLATE_LOC_SAMPLE:
+ key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9;
+ if (separate_prolog) {
+ shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
+ }
+ break;
+ case TGSI_INTERPOLATE_LOC_CENTER:
+ key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11;
+ if (separate_prolog) {
+ shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
+ }
+ break;
+ case TGSI_INTERPOLATE_LOC_CENTROID:
+ key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13;
+ if (separate_prolog) {
+ shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
+ }
+ break;
+ default:
+ assert(0);
+ }
+ break;
+ default:
+ assert(0);
+ }
+ }
+ }
}
/**
 * Check whether a PS prolog is required based on the key.
 */
bool si_need_ps_prolog(const union si_shader_part_key *key)
{
- return key->ps_prolog.colors_read ||
- key->ps_prolog.states.force_persp_sample_interp ||
- key->ps_prolog.states.force_linear_sample_interp ||
- key->ps_prolog.states.force_persp_center_interp ||
- key->ps_prolog.states.force_linear_center_interp ||
- key->ps_prolog.states.bc_optimize_for_persp ||
- key->ps_prolog.states.bc_optimize_for_linear ||
- key->ps_prolog.states.poly_stipple ||
- key->ps_prolog.states.samplemask_log_ps_iter;
+ return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
+ key->ps_prolog.states.force_linear_sample_interp ||
+ key->ps_prolog.states.force_persp_center_interp ||
+ key->ps_prolog.states.force_linear_center_interp ||
+ key->ps_prolog.states.bc_optimize_for_persp ||
+ key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
+ key->ps_prolog.states.samplemask_log_ps_iter;
}
/**
* Compute the PS epilog key, which contains all the information needed to
* build the PS epilog function.
*/
-void si_get_ps_epilog_key(struct si_shader *shader,
- union si_shader_part_key *key)
+void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
{
- struct si_shader_info *info = &shader->selector->info;
- memset(key, 0, sizeof(*key));
- key->ps_epilog.colors_written = info->colors_written;
- key->ps_epilog.writes_z = info->writes_z;
- key->ps_epilog.writes_stencil = info->writes_stencil;
- key->ps_epilog.writes_samplemask = info->writes_samplemask;
- key->ps_epilog.states = shader->key.part.ps.epilog;
+ struct si_shader_info *info = &shader->selector->info;
+ memset(key, 0, sizeof(*key));
+ key->ps_epilog.colors_written = info->colors_written;
+ key->ps_epilog.writes_z = info->writes_z;
+ key->ps_epilog.writes_stencil = info->writes_stencil;
+ key->ps_epilog.writes_samplemask = info->writes_samplemask;
+ key->ps_epilog.states = shader->key.part.ps.epilog;
}
/**
* Select and compile (or reuse) pixel shader parts (prolog & epilog).
*/
-static bool si_shader_select_ps_parts(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader *shader,
- struct pipe_debug_callback *debug)
+static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+ struct si_shader *shader, struct pipe_debug_callback *debug)
{
- union si_shader_part_key prolog_key;
- union si_shader_part_key epilog_key;
-
- /* Get the prolog. */
- si_get_ps_prolog_key(shader, &prolog_key, true);
-
- /* The prolog is a no-op if these aren't set. */
- if (si_need_ps_prolog(&prolog_key)) {
- shader->prolog =
- si_get_shader_part(sscreen, &sscreen->ps_prologs,
- PIPE_SHADER_FRAGMENT, true,
- &prolog_key, compiler, debug,
- si_llvm_build_ps_prolog,
- "Fragment Shader Prolog");
- if (!shader->prolog)
- return false;
- }
-
- /* Get the epilog. */
- si_get_ps_epilog_key(shader, &epilog_key);
-
- shader->epilog =
- si_get_shader_part(sscreen, &sscreen->ps_epilogs,
- PIPE_SHADER_FRAGMENT, false,
- &epilog_key, compiler, debug,
- si_llvm_build_ps_epilog,
- "Fragment Shader Epilog");
- if (!shader->epilog)
- return false;
-
- /* Enable POS_FIXED_PT if polygon stippling is enabled. */
- if (shader->key.part.ps.prolog.poly_stipple) {
- shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
- assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
- }
-
- /* Set up the enable bits for per-sample shading if needed. */
- if (shader->key.part.ps.prolog.force_persp_sample_interp &&
- (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
- G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
- shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
- shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
- shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
- }
- if (shader->key.part.ps.prolog.force_linear_sample_interp &&
- (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
- G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
- shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
- shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
- shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
- }
- if (shader->key.part.ps.prolog.force_persp_center_interp &&
- (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
- G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
- shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
- shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
- shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
- }
- if (shader->key.part.ps.prolog.force_linear_center_interp &&
- (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
- G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
- shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
- shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
- shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
- }
-
- /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
- if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
- !(shader->config.spi_ps_input_ena & 0xf)) {
- shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
- assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
- }
-
- /* At least one pair of interpolation weights must be enabled. */
- if (!(shader->config.spi_ps_input_ena & 0x7f)) {
- shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
- assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
- }
-
- /* Samplemask fixup requires the sample ID. */
- if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
- shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
- assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
- }
-
- /* The sample mask input is always enabled, because the API shader always
- * passes it through to the epilog. Disable it here if it's unused.
- */
- if (!shader->key.part.ps.epilog.poly_line_smoothing &&
- !shader->selector->info.reads_samplemask)
- shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
-
- return true;
+ union si_shader_part_key prolog_key;
+ union si_shader_part_key epilog_key;
+
+ /* Get the prolog. */
+ si_get_ps_prolog_key(shader, &prolog_key, true);
+
+ /* The prolog is a no-op if these aren't set. */
+ if (si_need_ps_prolog(&prolog_key)) {
+ shader->prolog =
+ si_get_shader_part(sscreen, &sscreen->ps_prologs, PIPE_SHADER_FRAGMENT, true, &prolog_key,
+ compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog");
+ if (!shader->prolog)
+ return false;
+ }
+
+ /* Get the epilog. */
+ si_get_ps_epilog_key(shader, &epilog_key);
+
+ shader->epilog =
+ si_get_shader_part(sscreen, &sscreen->ps_epilogs, PIPE_SHADER_FRAGMENT, false, &epilog_key,
+ compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog");
+ if (!shader->epilog)
+ return false;
+
+ /* Enable POS_FIXED_PT if polygon stippling is enabled. */
+ if (shader->key.part.ps.prolog.poly_stipple) {
+ shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
+ assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
+ }
+
+ /* Set up the enable bits for per-sample shading if needed. */
+ if (shader->key.part.ps.prolog.force_persp_sample_interp &&
+ (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
+ G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
+ shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
+ shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
+ shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
+ }
+ if (shader->key.part.ps.prolog.force_linear_sample_interp &&
+ (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
+ G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
+ shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
+ shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
+ shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
+ }
+ if (shader->key.part.ps.prolog.force_persp_center_interp &&
+ (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
+ G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
+ shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
+ shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
+ shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
+ }
+ if (shader->key.part.ps.prolog.force_linear_center_interp &&
+ (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
+ G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
+ shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
+ shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
+ shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
+ }
+
+ /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
+ if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
+ !(shader->config.spi_ps_input_ena & 0xf)) {
+ shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
+ assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
+ }
+
+ /* At least one pair of interpolation weights must be enabled. */
+ if (!(shader->config.spi_ps_input_ena & 0x7f)) {
+ shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
+ assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
+ }
+
+ /* Samplemask fixup requires the sample ID. */
+ if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
+ shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
+ assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
+ }
+
+ /* The sample mask input is always enabled, because the API shader always
+ * passes it through to the epilog. Disable it here if it's unused.
+ */
+ if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask)
+ shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
+
+ return true;
}
-void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
- unsigned *lds_size)
+void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
{
- /* If tessellation is all offchip and on-chip GS isn't used, this
- * workaround is not needed.
- */
- return;
-
- /* SPI barrier management bug:
- * Make sure we have at least 4k of LDS in use to avoid the bug.
- * It applies to workgroup sizes of more than one wavefront.
- */
- if (sscreen->info.family == CHIP_BONAIRE ||
- sscreen->info.family == CHIP_KABINI)
- *lds_size = MAX2(*lds_size, 8);
+ /* If tessellation is all offchip and on-chip GS isn't used, this
+ * workaround is not needed.
+ */
+ return;
+
+ /* SPI barrier management bug:
+ * Make sure we have at least 4k of LDS in use to avoid the bug.
+ * It applies to workgroup sizes of more than one wavefront.
+ */
+ if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
+ *lds_size = MAX2(*lds_size, 8);
}
void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
{
- unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
+ unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
- shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
+ shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
- if (shader->selector->type == PIPE_SHADER_COMPUTE &&
- si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
- si_multiwave_lds_size_workaround(sscreen,
- &shader->config.lds_size);
- }
+ if (shader->selector->type == PIPE_SHADER_COMPUTE &&
+ si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
+ si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
+ }
}
-bool si_create_shader_variant(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader *shader,
- struct pipe_debug_callback *debug)
+bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+ struct si_shader *shader, struct pipe_debug_callback *debug)
{
- struct si_shader_selector *sel = shader->selector;
- struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
-
- /* LS, ES, VS are compiled on demand if the main part hasn't been
- * compiled for that stage.
- *
- * GS are compiled on demand if the main part hasn't been compiled
- * for the chosen NGG-ness.
- *
- * Vertex shaders are compiled on demand when a vertex fetch
- * workaround must be applied.
- */
- if (shader->is_monolithic) {
- /* Monolithic shader (compiled as a whole, has many variants,
- * may take a long time to compile).
- */
- if (!si_compile_shader(sscreen, compiler, shader, debug))
- return false;
- } else {
- /* The shader consists of several parts:
- *
- * - the middle part is the user shader, it has 1 variant only
- * and it was compiled during the creation of the shader
- * selector
- * - the prolog part is inserted at the beginning
- * - the epilog part is inserted at the end
- *
- * The prolog and epilog have many (but simple) variants.
- *
- * Starting with gfx9, geometry and tessellation control
- * shaders also contain the prolog and user shader parts of
- * the previous shader stage.
- */
-
- if (!mainp)
- return false;
-
- /* Copy the compiled shader data over. */
- shader->is_binary_shared = true;
- shader->binary = mainp->binary;
- shader->config = mainp->config;
- shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
- shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
- shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
- shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
- memcpy(shader->info.vs_output_param_offset,
- mainp->info.vs_output_param_offset,
- sizeof(mainp->info.vs_output_param_offset));
- shader->info.uses_instanceid = mainp->info.uses_instanceid;
- shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
- shader->info.nr_param_exports = mainp->info.nr_param_exports;
-
- /* Select prologs and/or epilogs. */
- switch (sel->type) {
- case PIPE_SHADER_VERTEX:
- if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
- return false;
- break;
- case PIPE_SHADER_TESS_CTRL:
- if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
- return false;
- break;
- case PIPE_SHADER_TESS_EVAL:
- break;
- case PIPE_SHADER_GEOMETRY:
- if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
- return false;
- break;
- case PIPE_SHADER_FRAGMENT:
- if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
- return false;
-
- /* Make sure we have at least as many VGPRs as there
- * are allocated inputs.
- */
- shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
- shader->info.num_input_vgprs);
- break;
- default:;
- }
-
- /* Update SGPR and VGPR counts. */
- if (shader->prolog) {
- shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
- shader->prolog->config.num_sgprs);
- shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
- shader->prolog->config.num_vgprs);
- }
- if (shader->previous_stage) {
- shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
- shader->previous_stage->config.num_sgprs);
- shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
- shader->previous_stage->config.num_vgprs);
- shader->config.spilled_sgprs =
- MAX2(shader->config.spilled_sgprs,
- shader->previous_stage->config.spilled_sgprs);
- shader->config.spilled_vgprs =
- MAX2(shader->config.spilled_vgprs,
- shader->previous_stage->config.spilled_vgprs);
- shader->info.private_mem_vgprs =
- MAX2(shader->info.private_mem_vgprs,
- shader->previous_stage->info.private_mem_vgprs);
- shader->config.scratch_bytes_per_wave =
- MAX2(shader->config.scratch_bytes_per_wave,
- shader->previous_stage->config.scratch_bytes_per_wave);
- shader->info.uses_instanceid |=
- shader->previous_stage->info.uses_instanceid;
- }
- if (shader->prolog2) {
- shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
- shader->prolog2->config.num_sgprs);
- shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
- shader->prolog2->config.num_vgprs);
- }
- if (shader->epilog) {
- shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
- shader->epilog->config.num_sgprs);
- shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
- shader->epilog->config.num_vgprs);
- }
- si_calculate_max_simd_waves(shader);
- }
-
- if (shader->key.as_ngg) {
- assert(!shader->key.as_es && !shader->key.as_ls);
- gfx10_ngg_calculate_subgroup_info(shader);
- } else if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) {
- gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
- }
-
- si_fix_resource_usage(sscreen, shader);
- si_shader_dump(sscreen, shader, debug, stderr, true);
-
- /* Upload. */
- if (!si_shader_binary_upload(sscreen, shader, 0)) {
- fprintf(stderr, "LLVM failed to upload shader\n");
- return false;
- }
-
- return true;
+ struct si_shader_selector *sel = shader->selector;
+ struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
+
+ /* LS, ES, VS are compiled on demand if the main part hasn't been
+ * compiled for that stage.
+ *
+ * GS is compiled on demand if the main part hasn't been compiled
+ * for the chosen NGG-ness.
+ *
+ * Vertex shaders are compiled on demand when a vertex fetch
+ * workaround must be applied.
+ */
+ if (shader->is_monolithic) {
+ /* Monolithic shader (compiled as a whole, has many variants,
+ * may take a long time to compile).
+ */
+ if (!si_compile_shader(sscreen, compiler, shader, debug))
+ return false;
+ } else {
+ /* The shader consists of several parts:
+ *
+ * - the middle part is the user shader; it has only one variant
+ *   and was compiled during the creation of the shader
+ * selector
+ * - the prolog part is inserted at the beginning
+ * - the epilog part is inserted at the end
+ *
+ * The prolog and epilog have many (but simple) variants.
+ *
+ * Starting with gfx9, geometry and tessellation control
+ * shaders also contain the prolog and user shader parts of
+ * the previous shader stage.
+ */
+
+ if (!mainp)
+ return false;
+
+ /* Copy the compiled shader data over. */
+ shader->is_binary_shared = true;
+ shader->binary = mainp->binary;
+ shader->config = mainp->config;
+ shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
+ shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
+ shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
+ shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
+ memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset,
+ sizeof(mainp->info.vs_output_param_offset));
+ shader->info.uses_instanceid = mainp->info.uses_instanceid;
+ shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
+ shader->info.nr_param_exports = mainp->info.nr_param_exports;
+
+ /* Select prologs and/or epilogs. */
+ switch (sel->type) {
+ case PIPE_SHADER_VERTEX:
+ if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
+ return false;
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
+ return false;
+ break;
+ case PIPE_SHADER_TESS_EVAL:
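+ /* TES has no prolog or epilog parts of its own. */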
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
+ return false;
+ break;
+ case PIPE_SHADER_FRAGMENT:
+ if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
+ return false;
+
+ /* Make sure we have at least as many VGPRs as there
+ * are allocated inputs.
+ */
+ shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
+ break;
+ default:;
+ }
+
+ /* Update SGPR and VGPR counts. */
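+ /* The prolog, previous stage, prolog2, and epilog are linked into one
+ * program with the main part, so the final config must cover the
+ * maximum requirement of every attached part. */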
+ if (shader->prolog) {
+ shader->config.num_sgprs =
+ MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
+ shader->config.num_vgprs =
+ MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
+ }
+ if (shader->previous_stage) {
+ shader->config.num_sgprs =
+ MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
+ shader->config.num_vgprs =
+ MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
+ shader->config.spilled_sgprs =
+ MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
+ shader->config.spilled_vgprs =
+ MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
+ shader->info.private_mem_vgprs =
+ MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
+ shader->config.scratch_bytes_per_wave =
+ MAX2(shader->config.scratch_bytes_per_wave,
+ shader->previous_stage->config.scratch_bytes_per_wave);
+ shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
+ }
+ if (shader->prolog2) {
+ shader->config.num_sgprs =
+ MAX2(shader->config.num_sgprs, shader->prolog2->config.num_sgprs);
+ shader->config.num_vgprs =
+ MAX2(shader->config.num_vgprs, shader->prolog2->config.num_vgprs);
+ }
+ if (shader->epilog) {
+ shader->config.num_sgprs =
+ MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
+ shader->config.num_vgprs =
+ MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
+ }
+ si_calculate_max_simd_waves(shader);
+ }
+
+ if (shader->key.as_ngg) {
+ assert(!shader->key.as_es && !shader->key.as_ls);
+ gfx10_ngg_calculate_subgroup_info(shader);
+ } else if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) {
+ gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
+ }
+
+ si_fix_resource_usage(sscreen, shader);
+ si_shader_dump(sscreen, shader, debug, stderr, true);
+
+ /* Upload. */
+ if (!si_shader_binary_upload(sscreen, shader, 0)) {
+ fprintf(stderr, "LLVM failed to upload shader\n");
+ return false;
+ }
+
+ return true;
}
void si_shader_binary_clean(struct si_shader_binary *binary)
{
- free((void *)binary->elf_buffer);
- binary->elf_buffer = NULL;
+ free((void *)binary->elf_buffer);
+ binary->elf_buffer = NULL;
- free(binary->llvm_ir_string);
- binary->llvm_ir_string = NULL;
+ free(binary->llvm_ir_string);
+ binary->llvm_ir_string = NULL;
}
void si_shader_destroy(struct si_shader *shader)
{
- if (shader->scratch_bo)
- si_resource_reference(&shader->scratch_bo, NULL);
+ if (shader->scratch_bo)
+ si_resource_reference(&shader->scratch_bo, NULL);
- si_resource_reference(&shader->bo, NULL);
+ si_resource_reference(&shader->bo, NULL);
- if (!shader->is_binary_shared)
- si_shader_binary_clean(&shader->binary);
+ if (!shader->is_binary_shared)
+ si_shader_binary_clean(&shader->binary);
- free(shader->shader_log);
+ free(shader->shader_log);
}
#ifndef SI_SHADER_H
#define SI_SHADER_H
-#include "util/u_inlines.h"
-#include "util/u_live_shader_cache.h"
-#include "util/u_queue.h"
-#include "util/simple_mtx.h"
-
#include "ac_binary.h"
#include "ac_llvm_build.h"
#include "ac_llvm_util.h"
+#include "util/simple_mtx.h"
+#include "util/u_inlines.h"
+#include "util/u_live_shader_cache.h"
+#include "util/u_queue.h"
#include <stdio.h>
struct si_shader;
struct si_context;
-#define SI_MAX_ATTRIBS 16
-#define SI_MAX_VS_OUTPUTS 40
+#define SI_MAX_ATTRIBS 16
+#define SI_MAX_VS_OUTPUTS 40
/* Shader IO unique indices are supported for TGSI_SEMANTIC_GENERIC with an
* index smaller than this.
*/
-#define SI_MAX_IO_GENERIC 32
+#define SI_MAX_IO_GENERIC 32
#define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29))
/* SGPR user data indices */
-enum {
- SI_SGPR_RW_BUFFERS, /* rings (& stream-out, VS only) */
- SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
- SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
- SI_SGPR_SAMPLERS_AND_IMAGES,
- SI_NUM_RESOURCE_SGPRS,
-
- /* API VS, TES without GS, GS copy shader */
- SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS,
- SI_NUM_VS_STATE_RESOURCE_SGPRS,
-
- /* all VS variants */
- SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS,
- SI_SGPR_START_INSTANCE,
- SI_SGPR_DRAWID,
- SI_VS_NUM_USER_SGPR,
-
- SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS,
-
- /* TES */
- SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS,
- SI_SGPR_TES_OFFCHIP_ADDR,
- SI_TES_NUM_USER_SGPR,
-
- /* GFX6-8: TCS only */
- GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
- GFX6_SGPR_TCS_OUT_OFFSETS,
- GFX6_SGPR_TCS_OUT_LAYOUT,
- GFX6_SGPR_TCS_IN_LAYOUT,
- GFX6_TCS_NUM_USER_SGPR,
-
- /* GFX9: Merged shaders. */
- /* 2ND_CONST_AND_SHADER_BUFFERS is set in USER_DATA_ADDR_LO (SGPR0). */
- /* 2ND_SAMPLERS_AND_IMAGES is set in USER_DATA_ADDR_HI (SGPR1). */
- GFX9_MERGED_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR,
-
- /* GFX9: Merged LS-HS (VS-TCS) only. */
- GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR,
- GFX9_SGPR_TCS_OUT_OFFSETS,
- GFX9_SGPR_TCS_OUT_LAYOUT,
- GFX9_TCS_NUM_USER_SGPR,
-
- /* GS limits */
- GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
- GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR,
- GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR,
- SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,
-
- /* PS only */
- SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
- SI_PS_NUM_USER_SGPR,
-
- /* The value has to be 12, because the hw requires that descriptors
- * are aligned to 4 SGPRs.
- */
- SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
+enum
+{
+ SI_SGPR_RW_BUFFERS, /* rings (& stream-out, VS only) */
+ SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
+ SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
+ SI_SGPR_SAMPLERS_AND_IMAGES,
+ SI_NUM_RESOURCE_SGPRS,
+
+ /* API VS, TES without GS, GS copy shader */
+ SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS,
+ SI_NUM_VS_STATE_RESOURCE_SGPRS,
+
+ /* all VS variants */
+ SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS,
+ SI_SGPR_START_INSTANCE,
+ SI_SGPR_DRAWID,
+ SI_VS_NUM_USER_SGPR,
+
+ SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS,
+
+ /* TES */
+ SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS,
+ SI_SGPR_TES_OFFCHIP_ADDR,
+ SI_TES_NUM_USER_SGPR,
+
+ /* GFX6-8: TCS only */
+ GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
+ GFX6_SGPR_TCS_OUT_OFFSETS,
+ GFX6_SGPR_TCS_OUT_LAYOUT,
+ GFX6_SGPR_TCS_IN_LAYOUT,
+ GFX6_TCS_NUM_USER_SGPR,
+
+ /* GFX9: Merged shaders. */
+ /* 2ND_CONST_AND_SHADER_BUFFERS is set in USER_DATA_ADDR_LO (SGPR0). */
+ /* 2ND_SAMPLERS_AND_IMAGES is set in USER_DATA_ADDR_HI (SGPR1). */
+ GFX9_MERGED_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR,
+
+ /* GFX9: Merged LS-HS (VS-TCS) only. */
+ GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR,
+ GFX9_SGPR_TCS_OUT_OFFSETS,
+ GFX9_SGPR_TCS_OUT_LAYOUT,
+ GFX9_TCS_NUM_USER_SGPR,
+
+ /* GS limits */
+ GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
+ GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR,
+ GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR,
+ SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,
+
+ /* PS only */
+ SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
+ SI_PS_NUM_USER_SGPR,
+
+ /* The value has to be 12, because the hw requires that descriptors
+ * are aligned to 4 SGPRs.
+ */
+ SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
};
/* LLVM function parameter indices */
-enum {
- SI_NUM_RESOURCE_PARAMS = 4,
-
- /* PS only parameters */
- SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
- SI_PARAM_PRIM_MASK,
- SI_PARAM_PERSP_SAMPLE,
- SI_PARAM_PERSP_CENTER,
- SI_PARAM_PERSP_CENTROID,
- SI_PARAM_PERSP_PULL_MODEL,
- SI_PARAM_LINEAR_SAMPLE,
- SI_PARAM_LINEAR_CENTER,
- SI_PARAM_LINEAR_CENTROID,
- SI_PARAM_LINE_STIPPLE_TEX,
- SI_PARAM_POS_X_FLOAT,
- SI_PARAM_POS_Y_FLOAT,
- SI_PARAM_POS_Z_FLOAT,
- SI_PARAM_POS_W_FLOAT,
- SI_PARAM_FRONT_FACE,
- SI_PARAM_ANCILLARY,
- SI_PARAM_SAMPLE_COVERAGE,
- SI_PARAM_POS_FIXED_PT,
-
- SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */
+enum
+{
+ SI_NUM_RESOURCE_PARAMS = 4,
+
+ /* PS only parameters */
+ SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
+ SI_PARAM_PRIM_MASK,
+ SI_PARAM_PERSP_SAMPLE,
+ SI_PARAM_PERSP_CENTER,
+ SI_PARAM_PERSP_CENTROID,
+ SI_PARAM_PERSP_PULL_MODEL,
+ SI_PARAM_LINEAR_SAMPLE,
+ SI_PARAM_LINEAR_CENTER,
+ SI_PARAM_LINEAR_CENTROID,
+ SI_PARAM_LINE_STIPPLE_TEX,
+ SI_PARAM_POS_X_FLOAT,
+ SI_PARAM_POS_Y_FLOAT,
+ SI_PARAM_POS_Z_FLOAT,
+ SI_PARAM_POS_W_FLOAT,
+ SI_PARAM_FRONT_FACE,
+ SI_PARAM_ANCILLARY,
+ SI_PARAM_SAMPLE_COVERAGE,
+ SI_PARAM_POS_FIXED_PT,
+
+ SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */
};
/* Fields of driver-defined VS state SGPR. */
-#define S_VS_STATE_CLAMP_VERTEX_COLOR(x) (((unsigned)(x) & 0x1) << 0)
-#define C_VS_STATE_CLAMP_VERTEX_COLOR 0xFFFFFFFE
-#define S_VS_STATE_INDEXED(x) (((unsigned)(x) & 0x1) << 1)
-#define C_VS_STATE_INDEXED 0xFFFFFFFD
-#define S_VS_STATE_OUTPRIM(x) (((unsigned)(x) & 0x3) << 2)
-#define C_VS_STATE_OUTPRIM 0xFFFFFFF3
-#define S_VS_STATE_PROVOKING_VTX_INDEX(x) (((unsigned)(x) & 0x3) << 4)
-#define C_VS_STATE_PROVOKING_VTX_INDEX 0xFFFFFFCF
-#define S_VS_STATE_STREAMOUT_QUERY_ENABLED(x) (((unsigned)(x) & 0x1) << 6)
-#define C_VS_STATE_STREAMOUT_QUERY_ENABLED 0xFFFFFFBF
-#define S_VS_STATE_SMALL_PRIM_PRECISION(x) (((unsigned)(x) & 0xF) << 7)
-#define C_VS_STATE_SMALL_PRIM_PRECISION 0xFFFFF87F
-#define S_VS_STATE_LS_OUT_PATCH_SIZE(x) (((unsigned)(x) & 0x1FFF) << 11)
-#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFF0007FF
-#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x) & 0xFF) << 24)
-#define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF
-
-enum {
- /* Use a property enum that CS wouldn't use. */
- TGSI_PROPERTY_CS_LOCAL_SIZE = TGSI_PROPERTY_FS_COORD_ORIGIN,
-
- /* These represent the number of SGPRs the shader uses. */
- SI_VS_BLIT_SGPRS_POS = 3,
- SI_VS_BLIT_SGPRS_POS_COLOR = 7,
- SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
+#define S_VS_STATE_CLAMP_VERTEX_COLOR(x) (((unsigned)(x)&0x1) << 0)
+#define C_VS_STATE_CLAMP_VERTEX_COLOR 0xFFFFFFFE
+#define S_VS_STATE_INDEXED(x) (((unsigned)(x)&0x1) << 1)
+#define C_VS_STATE_INDEXED 0xFFFFFFFD
+#define S_VS_STATE_OUTPRIM(x) (((unsigned)(x)&0x3) << 2)
+#define C_VS_STATE_OUTPRIM 0xFFFFFFF3
+#define S_VS_STATE_PROVOKING_VTX_INDEX(x) (((unsigned)(x)&0x3) << 4)
+#define C_VS_STATE_PROVOKING_VTX_INDEX 0xFFFFFFCF
+#define S_VS_STATE_STREAMOUT_QUERY_ENABLED(x) (((unsigned)(x)&0x1) << 6)
+#define C_VS_STATE_STREAMOUT_QUERY_ENABLED 0xFFFFFFBF
+#define S_VS_STATE_SMALL_PRIM_PRECISION(x) (((unsigned)(x)&0xF) << 7)
+#define C_VS_STATE_SMALL_PRIM_PRECISION 0xFFFFF87F
+#define S_VS_STATE_LS_OUT_PATCH_SIZE(x) (((unsigned)(x)&0x1FFF) << 11)
+#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFF0007FF
+#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x)&0xFF) << 24)
+#define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF
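A minimal usage sketch for the field macros above (prim_type and num_inputs are placeholder variables): each S_* helper packs one field of the 32-bit VS state word, and the matching C_* mask clears that field so it can be rewritten without disturbing the rest.

   unsigned vs_state = S_VS_STATE_INDEXED(1) |                        /* indexed draw */
                       S_VS_STATE_OUTPRIM(prim_type) |                /* NGG output primitive type */
                       S_VS_STATE_LS_OUT_VERTEX_SIZE(num_inputs * 4); /* LS vertex stride in dwords */

   /* Rewriting a single field: clear it with C_*, then OR in the new value. */
   vs_state = (vs_state & C_VS_STATE_INDEXED) | S_VS_STATE_INDEXED(0);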
+
+enum
+{
+ /* Use a property enum that CS wouldn't use. */
+ TGSI_PROPERTY_CS_LOCAL_SIZE = TGSI_PROPERTY_FS_COORD_ORIGIN,
+
+ /* These represent the number of SGPRs the shader uses. */
+ SI_VS_BLIT_SGPRS_POS = 3,
+ SI_VS_BLIT_SGPRS_POS_COLOR = 7,
+ SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
};
-#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */
-#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */
-#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0x3 << 3) /* GS fast launch (both prim types) */
+#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */
+#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */
+#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
+#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */
+#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */
+#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0x3 << 3) /* GS fast launch (both prim types) */
/**
* For VS shader keys, describe any fixups required for vertex fetch.
* buffer_load_format_xyzw).
*/
union si_vs_fix_fetch {
- struct {
- uint8_t log_size : 2; /* 1, 2, 4, 8 or bytes per channel */
- uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
- uint8_t format : 3; /* AC_FETCH_FORMAT_xxx */
- uint8_t reverse : 1; /* reverse XYZ channels */
- } u;
- uint8_t bits;
+ struct {
+ uint8_t log_size : 2; /* 1, 2, 4, or 8 bytes per channel (log2 encoded) */
+ uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
+ uint8_t format : 3; /* AC_FETCH_FORMAT_xxx */
+ uint8_t reverse : 1; /* reverse XYZ channels */
+ } u;
+ uint8_t bits;
};
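For orientation, a hedged sketch of how one of these descriptors might be filled in; the attribute described (four 8-bit channels whose XYZ order must be flipped) and the zero format value are illustrative only, since the real format comes from the AC_FETCH_FORMAT_* enum chosen by the vertex-buffer translation code.

   union si_vs_fix_fetch fix;
   fix.u.log_size = 0;        /* log2(1): one byte per channel */
   fix.u.num_channels_m1 = 3; /* four channels */
   fix.u.format = 0;          /* stands in for an AC_FETCH_FORMAT_* value */
   fix.u.reverse = 1;         /* swap the XYZ channel order */
   /* fix.bits aliases the same data as one packed byte; the union itself is
    * what si_shader_key::mono.vs_fix_fetch[] stores per vertex attribute. */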
struct si_shader;
/* State of the context creating the shader object. */
struct si_compiler_ctx_state {
- /* Should only be used by si_init_shader_selector_async and
- * si_build_shader_variant if thread_index == -1 (non-threaded). */
- struct ac_llvm_compiler *compiler;
+ /* Should only be used by si_init_shader_selector_async and
+ * si_build_shader_variant if thread_index == -1 (non-threaded). */
+ struct ac_llvm_compiler *compiler;
- /* Used if thread_index == -1 or if debug.async is true. */
- struct pipe_debug_callback debug;
+ /* Used if thread_index == -1 or if debug.async is true. */
+ struct pipe_debug_callback debug;
- /* Used for creating the log string for gallium/ddebug. */
- bool is_debug_context;
+ /* Used for creating the log string for gallium/ddebug. */
+ bool is_debug_context;
};
struct si_shader_info {
- ubyte num_inputs;
- ubyte num_outputs;
- ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
- ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
- ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
- ubyte input_interpolate_loc[PIPE_MAX_SHADER_INPUTS];
- ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS];
- ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
- ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
- ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
- ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS];
-
- ubyte processor;
-
- int constbuf0_num_slots;
- unsigned const_buffers_declared; /**< bitmask of declared const buffers */
- unsigned samplers_declared; /**< bitmask of declared samplers */
- ubyte num_stream_output_components[4];
-
- uint num_memory_instructions; /**< sampler, buffer, and image instructions */
-
- /**
- * If a tessellation control shader reads outputs, this describes which ones.
- */
- bool reads_pervertex_outputs;
- bool reads_perpatch_outputs;
- bool reads_tessfactor_outputs;
-
- ubyte colors_read; /**< which color components are read by the FS */
- ubyte colors_written;
- bool reads_samplemask; /**< does fragment shader read sample mask? */
- bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
- bool writes_z; /**< does fragment shader write Z value? */
- bool writes_stencil; /**< does fragment shader write stencil value? */
- bool writes_samplemask; /**< does fragment shader write sample mask? */
- bool writes_edgeflag; /**< vertex shader outputs edgeflag */
- bool uses_kill; /**< KILL or KILL_IF instruction used? */
- bool uses_persp_center;
- bool uses_persp_centroid;
- bool uses_persp_sample;
- bool uses_linear_center;
- bool uses_linear_centroid;
- bool uses_linear_sample;
- bool uses_persp_opcode_interp_sample;
- bool uses_linear_opcode_interp_sample;
- bool uses_instanceid;
- bool uses_vertexid;
- bool uses_vertexid_nobase;
- bool uses_basevertex;
- bool uses_drawid;
- bool uses_primid;
- bool uses_frontface;
- bool uses_invocationid;
- bool uses_thread_id[3];
- bool uses_block_id[3];
- bool uses_block_size;
- bool uses_grid_size;
- bool uses_subgroup_info;
- bool writes_position;
- bool writes_psize;
- bool writes_clipvertex;
- bool writes_primid;
- bool writes_viewport_index;
- bool writes_layer;
- bool writes_memory; /**< contains stores or atomics to buffers or images */
- bool uses_derivatives;
- bool uses_bindless_samplers;
- bool uses_bindless_images;
- bool uses_fbfetch;
- unsigned clipdist_writemask;
- unsigned culldist_writemask;
- unsigned num_written_culldistance;
- unsigned num_written_clipdistance;
-
- unsigned images_declared; /**< bitmask of declared images */
- unsigned msaa_images_declared; /**< bitmask of declared MSAA images */
- unsigned shader_buffers_declared; /**< bitmask of declared shader buffers */
-
- unsigned properties[TGSI_PROPERTY_COUNT]; /* index with TGSI_PROPERTY_ */
-
- /** Whether all codepaths write tess factors in all invocations. */
- bool tessfactors_are_def_in_all_invocs;
+ ubyte num_inputs;
+ ubyte num_outputs;
+ ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
+ ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
+ ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
+ ubyte input_interpolate_loc[PIPE_MAX_SHADER_INPUTS];
+ ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS];
+ ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
+ ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
+ ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
+ ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS];
+
+ ubyte processor;
+
+ int constbuf0_num_slots;
+ unsigned const_buffers_declared; /**< bitmask of declared const buffers */
+ unsigned samplers_declared; /**< bitmask of declared samplers */
+ ubyte num_stream_output_components[4];
+
+ uint num_memory_instructions; /**< sampler, buffer, and image instructions */
+
+ /**
+ * If a tessellation control shader reads outputs, this describes which ones.
+ */
+ bool reads_pervertex_outputs;
+ bool reads_perpatch_outputs;
+ bool reads_tessfactor_outputs;
+
+ ubyte colors_read; /**< which color components are read by the FS */
+ ubyte colors_written;
+ bool reads_samplemask; /**< does fragment shader read sample mask? */
+ bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
+ bool writes_z; /**< does fragment shader write Z value? */
+ bool writes_stencil; /**< does fragment shader write stencil value? */
+ bool writes_samplemask; /**< does fragment shader write sample mask? */
+ bool writes_edgeflag; /**< vertex shader outputs edgeflag */
+ bool uses_kill; /**< KILL or KILL_IF instruction used? */
+ bool uses_persp_center;
+ bool uses_persp_centroid;
+ bool uses_persp_sample;
+ bool uses_linear_center;
+ bool uses_linear_centroid;
+ bool uses_linear_sample;
+ bool uses_persp_opcode_interp_sample;
+ bool uses_linear_opcode_interp_sample;
+ bool uses_instanceid;
+ bool uses_vertexid;
+ bool uses_vertexid_nobase;
+ bool uses_basevertex;
+ bool uses_drawid;
+ bool uses_primid;
+ bool uses_frontface;
+ bool uses_invocationid;
+ bool uses_thread_id[3];
+ bool uses_block_id[3];
+ bool uses_block_size;
+ bool uses_grid_size;
+ bool uses_subgroup_info;
+ bool writes_position;
+ bool writes_psize;
+ bool writes_clipvertex;
+ bool writes_primid;
+ bool writes_viewport_index;
+ bool writes_layer;
+ bool writes_memory; /**< contains stores or atomics to buffers or images */
+ bool uses_derivatives;
+ bool uses_bindless_samplers;
+ bool uses_bindless_images;
+ bool uses_fbfetch;
+ unsigned clipdist_writemask;
+ unsigned culldist_writemask;
+ unsigned num_written_culldistance;
+ unsigned num_written_clipdistance;
+
+ unsigned images_declared; /**< bitmask of declared images */
+ unsigned msaa_images_declared; /**< bitmask of declared MSAA images */
+ unsigned shader_buffers_declared; /**< bitmask of declared shader buffers */
+
+ unsigned properties[TGSI_PROPERTY_COUNT]; /* index with TGSI_PROPERTY_ */
+
+ /** Whether all codepaths write tess factors in all invocations. */
+ bool tessfactors_are_def_in_all_invocs;
};
/* A shader selector is a gallium CSO and contains shader variants and
* binaries for one NIR program. This can be shared by multiple contexts.
*/
struct si_shader_selector {
- struct util_live_shader base;
- struct si_screen *screen;
- struct util_queue_fence ready;
- struct si_compiler_ctx_state compiler_ctx_state;
-
- simple_mtx_t mutex;
- struct si_shader *first_variant; /* immutable after the first variant */
- struct si_shader *last_variant; /* mutable */
-
- /* The compiled NIR shader without a prolog and/or epilog (not
- * uploaded to a buffer object).
- */
- struct si_shader *main_shader_part;
- struct si_shader *main_shader_part_ls; /* as_ls is set in the key */
- struct si_shader *main_shader_part_es; /* as_es is set in the key */
- struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */
- struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */
-
- struct si_shader *gs_copy_shader;
-
- struct nir_shader *nir;
- void *nir_binary;
- unsigned nir_size;
-
- struct pipe_stream_output_info so;
- struct si_shader_info info;
-
- /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
- enum pipe_shader_type type;
- bool vs_needs_prolog;
- bool prim_discard_cs_allowed;
- bool ngg_culling_allowed;
- unsigned num_vs_inputs;
- unsigned num_vbos_in_user_sgprs;
- unsigned pa_cl_vs_out_cntl;
- ubyte clipdist_mask;
- ubyte culldist_mask;
- unsigned rast_prim;
-
- /* ES parameters. */
- unsigned esgs_itemsize; /* vertex stride */
- unsigned lshs_vertex_stride;
-
- /* GS parameters. */
- unsigned gs_input_verts_per_prim;
- unsigned gs_output_prim;
- unsigned gs_max_out_vertices;
- unsigned gs_num_invocations;
- unsigned max_gs_stream; /* count - 1 */
- unsigned gsvs_vertex_size;
- unsigned max_gsvs_emit_size;
- unsigned enabled_streamout_buffer_mask;
- bool tess_turns_off_ngg;
-
- /* PS parameters. */
- unsigned color_attr_index[2];
- unsigned db_shader_control;
- /* Set 0xf or 0x0 (4 bits) per each written output.
- * ANDed with spi_shader_col_format.
- */
- unsigned colors_written_4bit;
-
- uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
- uint64_t outputs_written; /* "get_unique_index" bits */
- uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */
-
- uint64_t inputs_read; /* "get_unique_index" bits */
-
- /* bitmasks of used descriptor slots */
- uint32_t active_const_and_shader_buffers;
- uint64_t active_samplers_and_images;
+ struct util_live_shader base;
+ struct si_screen *screen;
+ struct util_queue_fence ready;
+ struct si_compiler_ctx_state compiler_ctx_state;
+
+ simple_mtx_t mutex;
+ struct si_shader *first_variant; /* immutable after the first variant */
+ struct si_shader *last_variant; /* mutable */
+
+ /* The compiled NIR shader without a prolog and/or epilog (not
+ * uploaded to a buffer object).
+ */
+ struct si_shader *main_shader_part;
+ struct si_shader *main_shader_part_ls; /* as_ls is set in the key */
+ struct si_shader *main_shader_part_es; /* as_es is set in the key */
+ struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */
+ struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */
+
+ struct si_shader *gs_copy_shader;
+
+ struct nir_shader *nir;
+ void *nir_binary;
+ unsigned nir_size;
+
+ struct pipe_stream_output_info so;
+ struct si_shader_info info;
+
+ /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
+ enum pipe_shader_type type;
+ bool vs_needs_prolog;
+ bool prim_discard_cs_allowed;
+ bool ngg_culling_allowed;
+ unsigned num_vs_inputs;
+ unsigned num_vbos_in_user_sgprs;
+ unsigned pa_cl_vs_out_cntl;
+ ubyte clipdist_mask;
+ ubyte culldist_mask;
+ unsigned rast_prim;
+
+ /* ES parameters. */
+ unsigned esgs_itemsize; /* vertex stride */
+ unsigned lshs_vertex_stride;
+
+ /* GS parameters. */
+ unsigned gs_input_verts_per_prim;
+ unsigned gs_output_prim;
+ unsigned gs_max_out_vertices;
+ unsigned gs_num_invocations;
+ unsigned max_gs_stream; /* count - 1 */
+ unsigned gsvs_vertex_size;
+ unsigned max_gsvs_emit_size;
+ unsigned enabled_streamout_buffer_mask;
+ bool tess_turns_off_ngg;
+
+ /* PS parameters. */
+ unsigned color_attr_index[2];
+ unsigned db_shader_control;
+ /* Set 0xf or 0x0 (4 bits) for each written output.
+ * ANDed with spi_shader_col_format.
+ */
+ unsigned colors_written_4bit;
+
+ uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
+ uint64_t outputs_written; /* "get_unique_index" bits */
+ uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */
+
+ uint64_t inputs_read; /* "get_unique_index" bits */
+
+ /* bitmasks of used descriptor slots */
+ uint32_t active_const_and_shader_buffers;
+ uint64_t active_samplers_and_images;
};
/* Valid shader configurations:
/* Common VS bits between the shader key and the prolog key. */
struct si_vs_prolog_bits {
- /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
- * divisor is 0.
- * - If "is_one" has a bit set, the instance divisor is 1.
- * - If "is_fetched" has a bit set, the instance divisor will be loaded
- * from the constant buffer.
- */
- uint16_t instance_divisor_is_one; /* bitmask of inputs */
- uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
- unsigned ls_vgpr_fix:1;
- unsigned unpack_instance_id_from_vertex_id:1;
+ /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
+ * divisor is 0.
+ * - If "is_one" has a bit set, the instance divisor is 1.
+ * - If "is_fetched" has a bit set, the instance divisor will be loaded
+ * from the constant buffer.
+ */
+ uint16_t instance_divisor_is_one; /* bitmask of inputs */
+ uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
+ unsigned ls_vgpr_fix : 1;
+ unsigned unpack_instance_id_from_vertex_id : 1;
};
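A small sketch of the instance-divisor encoding the comment above describes, using placeholder attribute indices: divisor 1 sets the attribute's bit in instance_divisor_is_one, any other non-zero divisor sets its bit in instance_divisor_is_fetched (the prolog then loads the actual divisor from a constant buffer), and an attribute with neither bit set keeps divisor 0.

   struct si_vs_prolog_bits bits = {0};
   bits.instance_divisor_is_one = 1u << 2;     /* attribute 2: divisor == 1 */
   bits.instance_divisor_is_fetched = 1u << 5; /* attribute 5: divisor fetched from the constant buffer */
   /* Attributes with neither bit set keep an instance divisor of 0. */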
/* Common TCS bits between the shader key and the epilog key. */
struct si_tcs_epilog_bits {
- unsigned prim_mode:3;
- unsigned invoc0_tess_factors_are_def:1;
- unsigned tes_reads_tess_factors:1;
+ unsigned prim_mode : 3;
+ unsigned invoc0_tess_factors_are_def : 1;
+ unsigned tes_reads_tess_factors : 1;
};
struct si_gs_prolog_bits {
- unsigned tri_strip_adj_fix:1;
- unsigned gfx9_prev_is_vs:1;
+ unsigned tri_strip_adj_fix : 1;
+ unsigned gfx9_prev_is_vs : 1;
};
/* Common PS bits between the shader key and the prolog key. */
struct si_ps_prolog_bits {
- unsigned color_two_side:1;
- unsigned flatshade_colors:1;
- unsigned poly_stipple:1;
- unsigned force_persp_sample_interp:1;
- unsigned force_linear_sample_interp:1;
- unsigned force_persp_center_interp:1;
- unsigned force_linear_center_interp:1;
- unsigned bc_optimize_for_persp:1;
- unsigned bc_optimize_for_linear:1;
- unsigned samplemask_log_ps_iter:3;
+ unsigned color_two_side : 1;
+ unsigned flatshade_colors : 1;
+ unsigned poly_stipple : 1;
+ unsigned force_persp_sample_interp : 1;
+ unsigned force_linear_sample_interp : 1;
+ unsigned force_persp_center_interp : 1;
+ unsigned force_linear_center_interp : 1;
+ unsigned bc_optimize_for_persp : 1;
+ unsigned bc_optimize_for_linear : 1;
+ unsigned samplemask_log_ps_iter : 3;
};
/* Common PS bits between the shader key and the epilog key. */
struct si_ps_epilog_bits {
- unsigned spi_shader_col_format;
- unsigned color_is_int8:8;
- unsigned color_is_int10:8;
- unsigned last_cbuf:3;
- unsigned alpha_func:3;
- unsigned alpha_to_one:1;
- unsigned poly_line_smoothing:1;
- unsigned clamp_color:1;
+ unsigned spi_shader_col_format;
+ unsigned color_is_int8 : 8;
+ unsigned color_is_int10 : 8;
+ unsigned last_cbuf : 3;
+ unsigned alpha_func : 3;
+ unsigned alpha_to_one : 1;
+ unsigned poly_line_smoothing : 1;
+ unsigned clamp_color : 1;
};
union si_shader_part_key {
- struct {
- struct si_vs_prolog_bits states;
- unsigned num_input_sgprs:6;
- /* For merged stages such as LS-HS, HS input VGPRs are first. */
- unsigned num_merged_next_stage_vgprs:3;
- unsigned num_inputs:5;
- unsigned as_ls:1;
- unsigned as_es:1;
- unsigned as_ngg:1;
- unsigned as_prim_discard_cs:1;
- unsigned has_ngg_cull_inputs:1; /* from the NGG cull shader */
- unsigned gs_fast_launch_tri_list:1; /* for NGG culling */
- unsigned gs_fast_launch_tri_strip:1; /* for NGG culling */
- /* Prologs for monolithic shaders shouldn't set EXEC. */
- unsigned is_monolithic:1;
- } vs_prolog;
- struct {
- struct si_tcs_epilog_bits states;
- } tcs_epilog;
- struct {
- struct si_gs_prolog_bits states;
- /* Prologs of monolithic shaders shouldn't set EXEC. */
- unsigned is_monolithic:1;
- unsigned as_ngg:1;
- } gs_prolog;
- struct {
- struct si_ps_prolog_bits states;
- unsigned num_input_sgprs:6;
- unsigned num_input_vgprs:5;
- /* Color interpolation and two-side color selection. */
- unsigned colors_read:8; /* color input components read */
- unsigned num_interp_inputs:5; /* BCOLOR is at this location */
- unsigned face_vgpr_index:5;
- unsigned ancillary_vgpr_index:5;
- unsigned wqm:1;
- char color_attr_index[2];
- signed char color_interp_vgpr_index[2]; /* -1 == constant */
- } ps_prolog;
- struct {
- struct si_ps_epilog_bits states;
- unsigned colors_written:8;
- unsigned writes_z:1;
- unsigned writes_stencil:1;
- unsigned writes_samplemask:1;
- } ps_epilog;
+ struct {
+ struct si_vs_prolog_bits states;
+ unsigned num_input_sgprs : 6;
+ /* For merged stages such as LS-HS, HS input VGPRs are first. */
+ unsigned num_merged_next_stage_vgprs : 3;
+ unsigned num_inputs : 5;
+ unsigned as_ls : 1;
+ unsigned as_es : 1;
+ unsigned as_ngg : 1;
+ unsigned as_prim_discard_cs : 1;
+ unsigned has_ngg_cull_inputs : 1; /* from the NGG cull shader */
+ unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */
+ unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */
+ /* Prologs for monolithic shaders shouldn't set EXEC. */
+ unsigned is_monolithic : 1;
+ } vs_prolog;
+ struct {
+ struct si_tcs_epilog_bits states;
+ } tcs_epilog;
+ struct {
+ struct si_gs_prolog_bits states;
+ /* Prologs of monolithic shaders shouldn't set EXEC. */
+ unsigned is_monolithic : 1;
+ unsigned as_ngg : 1;
+ } gs_prolog;
+ struct {
+ struct si_ps_prolog_bits states;
+ unsigned num_input_sgprs : 6;
+ unsigned num_input_vgprs : 5;
+ /* Color interpolation and two-side color selection. */
+ unsigned colors_read : 8; /* color input components read */
+ unsigned num_interp_inputs : 5; /* BCOLOR is at this location */
+ unsigned face_vgpr_index : 5;
+ unsigned ancillary_vgpr_index : 5;
+ unsigned wqm : 1;
+ char color_attr_index[2];
+ signed char color_interp_vgpr_index[2]; /* -1 == constant */
+ } ps_prolog;
+ struct {
+ struct si_ps_epilog_bits states;
+ unsigned colors_written : 8;
+ unsigned writes_z : 1;
+ unsigned writes_stencil : 1;
+ unsigned writes_samplemask : 1;
+ } ps_epilog;
};
struct si_shader_key {
- /* Prolog and epilog flags. */
- union {
- struct {
- struct si_vs_prolog_bits prolog;
- } vs;
- struct {
- struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */
- struct si_shader_selector *ls; /* for merged LS-HS */
- struct si_tcs_epilog_bits epilog;
- } tcs; /* tessellation control shader */
- struct {
- struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */
- struct si_shader_selector *es; /* for merged ES-GS */
- struct si_gs_prolog_bits prolog;
- } gs;
- struct {
- struct si_ps_prolog_bits prolog;
- struct si_ps_epilog_bits epilog;
- } ps;
- } part;
-
- /* These three are initially set according to the NEXT_SHADER property,
- * or guessed if the property doesn't seem correct.
- */
- unsigned as_es:1; /* export shader, which precedes GS */
- unsigned as_ls:1; /* local shader, which precedes TCS */
- unsigned as_ngg:1; /* VS, TES, or GS compiled as NGG primitive shader */
-
- /* Flags for monolithic compilation only. */
- struct {
- /* Whether fetch should be opencoded according to vs_fix_fetch.
- * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
- * with minimal fixups is used. */
- uint16_t vs_fetch_opencode;
- union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
-
- union {
- uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */
- /* When PS needs PrimID and GS is disabled. */
- unsigned vs_export_prim_id:1;
- struct {
- unsigned interpolate_at_sample_force_center:1;
- unsigned fbfetch_msaa:1;
- unsigned fbfetch_is_1D:1;
- unsigned fbfetch_layered:1;
- } ps;
- } u;
- } mono;
-
- /* Optimization flags for asynchronous compilation only. */
- struct {
- /* For HW VS (it can be VS, TES, GS) */
- uint64_t kill_outputs; /* "get_unique_index" bits */
- unsigned clip_disable:1;
-
- /* For NGG VS and TES. */
- unsigned ngg_culling:5; /* SI_NGG_CULL_* */
-
- /* For shaders where monolithic variants have better code.
- *
- * This is a flag that has no effect on code generation,
- * but forces monolithic shaders to be used as soon as
- * possible, because it's in the "opt" group.
- */
- unsigned prefer_mono:1;
-
- /* Primitive discard compute shader. */
- unsigned vs_as_prim_discard_cs:1;
- unsigned cs_prim_type:4;
- unsigned cs_indexed:1;
- unsigned cs_instancing:1;
- unsigned cs_primitive_restart:1;
- unsigned cs_provoking_vertex_first:1;
- unsigned cs_need_correct_orientation:1;
- unsigned cs_cull_front:1;
- unsigned cs_cull_back:1;
- unsigned cs_cull_z:1;
- unsigned cs_halfz_clip_space:1;
- } opt;
+ /* Prolog and epilog flags. */
+ union {
+ struct {
+ struct si_vs_prolog_bits prolog;
+ } vs;
+ struct {
+ struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */
+ struct si_shader_selector *ls; /* for merged LS-HS */
+ struct si_tcs_epilog_bits epilog;
+ } tcs; /* tessellation control shader */
+ struct {
+ struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */
+ struct si_shader_selector *es; /* for merged ES-GS */
+ struct si_gs_prolog_bits prolog;
+ } gs;
+ struct {
+ struct si_ps_prolog_bits prolog;
+ struct si_ps_epilog_bits epilog;
+ } ps;
+ } part;
+
+ /* These three are initially set according to the NEXT_SHADER property,
+ * or guessed if the property doesn't seem correct.
+ */
+ unsigned as_es : 1; /* export shader, which precedes GS */
+ unsigned as_ls : 1; /* local shader, which precedes TCS */
+ unsigned as_ngg : 1; /* VS, TES, or GS compiled as NGG primitive shader */
+
+ /* Flags for monolithic compilation only. */
+ struct {
+ /* Whether fetch should be opencoded according to vs_fix_fetch.
+ * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
+ * with minimal fixups is used. */
+ uint16_t vs_fetch_opencode;
+ union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
+
+ union {
+ uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */
+ /* When PS needs PrimID and GS is disabled. */
+ unsigned vs_export_prim_id : 1;
+ struct {
+ unsigned interpolate_at_sample_force_center : 1;
+ unsigned fbfetch_msaa : 1;
+ unsigned fbfetch_is_1D : 1;
+ unsigned fbfetch_layered : 1;
+ } ps;
+ } u;
+ } mono;
+
+ /* Optimization flags for asynchronous compilation only. */
+ struct {
+ /* For HW VS (it can be VS, TES, GS) */
+ uint64_t kill_outputs; /* "get_unique_index" bits */
+ unsigned clip_disable : 1;
+
+ /* For NGG VS and TES. */
+ unsigned ngg_culling : 5; /* SI_NGG_CULL_* */
+
+ /* For shaders where monolithic variants have better code.
+ *
+ * This is a flag that has no effect on code generation,
+ * but forces monolithic shaders to be used as soon as
+ * possible, because it's in the "opt" group.
+ */
+ unsigned prefer_mono : 1;
+
+ /* Primitive discard compute shader. */
+ unsigned vs_as_prim_discard_cs : 1;
+ unsigned cs_prim_type : 4;
+ unsigned cs_indexed : 1;
+ unsigned cs_instancing : 1;
+ unsigned cs_primitive_restart : 1;
+ unsigned cs_provoking_vertex_first : 1;
+ unsigned cs_need_correct_orientation : 1;
+ unsigned cs_cull_front : 1;
+ unsigned cs_cull_back : 1;
+ unsigned cs_cull_z : 1;
+ unsigned cs_halfz_clip_space : 1;
+ } opt;
};
/* Restore the pack alignment to default. */
/* GCN-specific shader info. */
struct si_shader_binary_info {
- ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS];
- ubyte num_input_sgprs;
- ubyte num_input_vgprs;
- signed char face_vgpr_index;
- signed char ancillary_vgpr_index;
- bool uses_instanceid;
- ubyte nr_pos_exports;
- ubyte nr_param_exports;
- unsigned private_mem_vgprs;
- unsigned max_simd_waves;
+ ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS];
+ ubyte num_input_sgprs;
+ ubyte num_input_vgprs;
+ signed char face_vgpr_index;
+ signed char ancillary_vgpr_index;
+ bool uses_instanceid;
+ ubyte nr_pos_exports;
+ ubyte nr_param_exports;
+ unsigned private_mem_vgprs;
+ unsigned max_simd_waves;
};
struct si_shader_binary {
- const char *elf_buffer;
- size_t elf_size;
+ const char *elf_buffer;
+ size_t elf_size;
- char *llvm_ir_string;
+ char *llvm_ir_string;
};
struct gfx9_gs_info {
- unsigned es_verts_per_subgroup;
- unsigned gs_prims_per_subgroup;
- unsigned gs_inst_prims_in_subgroup;
- unsigned max_prims_per_subgroup;
- unsigned esgs_ring_size; /* in bytes */
+ unsigned es_verts_per_subgroup;
+ unsigned gs_prims_per_subgroup;
+ unsigned gs_inst_prims_in_subgroup;
+ unsigned max_prims_per_subgroup;
+ unsigned esgs_ring_size; /* in bytes */
};
struct si_shader {
- struct si_compiler_ctx_state compiler_ctx_state;
-
- struct si_shader_selector *selector;
- struct si_shader_selector *previous_stage_sel; /* for refcounting */
- struct si_shader *next_variant;
-
- struct si_shader_part *prolog;
- struct si_shader *previous_stage; /* for GFX9 */
- struct si_shader_part *prolog2;
- struct si_shader_part *epilog;
-
- struct si_pm4_state *pm4;
- struct si_resource *bo;
- struct si_resource *scratch_bo;
- struct si_shader_key key;
- struct util_queue_fence ready;
- bool compilation_failed;
- bool is_monolithic;
- bool is_optimized;
- bool is_binary_shared;
- bool is_gs_copy_shader;
-
- /* The following data is all that's needed for binary shaders. */
- struct si_shader_binary binary;
- struct ac_shader_config config;
- struct si_shader_binary_info info;
-
- struct {
- uint16_t ngg_emit_size; /* in dwords */
- uint16_t hw_max_esverts;
- uint16_t max_gsprims;
- uint16_t max_out_verts;
- uint16_t prim_amp_factor;
- bool max_vert_out_per_gs_instance;
- } ngg;
-
- /* Shader key + LLVM IR + disassembly + statistics.
- * Generated for debug contexts only.
- */
- char *shader_log;
- size_t shader_log_size;
-
- struct gfx9_gs_info gs_info;
-
- /* For save precompute context registers values. */
- union {
- struct {
- unsigned vgt_gsvs_ring_offset_1;
- unsigned vgt_gsvs_ring_offset_2;
- unsigned vgt_gsvs_ring_offset_3;
- unsigned vgt_gsvs_ring_itemsize;
- unsigned vgt_gs_max_vert_out;
- unsigned vgt_gs_vert_itemsize;
- unsigned vgt_gs_vert_itemsize_1;
- unsigned vgt_gs_vert_itemsize_2;
- unsigned vgt_gs_vert_itemsize_3;
- unsigned vgt_gs_instance_cnt;
- unsigned vgt_gs_onchip_cntl;
- unsigned vgt_gs_max_prims_per_subgroup;
- unsigned vgt_esgs_ring_itemsize;
- } gs;
-
- struct {
- unsigned ge_max_output_per_subgroup;
- unsigned ge_ngg_subgrp_cntl;
- unsigned vgt_primitiveid_en;
- unsigned vgt_gs_onchip_cntl;
- unsigned vgt_gs_instance_cnt;
- unsigned vgt_esgs_ring_itemsize;
- unsigned spi_vs_out_config;
- unsigned spi_shader_idx_format;
- unsigned spi_shader_pos_format;
- unsigned pa_cl_vte_cntl;
- unsigned pa_cl_ngg_cntl;
- unsigned vgt_gs_max_vert_out; /* for API GS */
- unsigned ge_pc_alloc; /* uconfig register */
- } ngg;
-
- struct {
- unsigned vgt_gs_mode;
- unsigned vgt_primitiveid_en;
- unsigned vgt_reuse_off;
- unsigned spi_vs_out_config;
- unsigned spi_shader_pos_format;
- unsigned pa_cl_vte_cntl;
- unsigned ge_pc_alloc; /* uconfig register */
- } vs;
-
- struct {
- unsigned spi_ps_input_ena;
- unsigned spi_ps_input_addr;
- unsigned spi_baryc_cntl;
- unsigned spi_ps_in_control;
- unsigned spi_shader_z_format;
- unsigned spi_shader_col_format;
- unsigned cb_shader_mask;
- } ps;
- } ctx_reg;
-
- /*For save precompute registers value */
- unsigned vgt_tf_param; /* VGT_TF_PARAM */
- unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
- unsigned pa_cl_vs_out_cntl;
- unsigned ge_cntl;
+ struct si_compiler_ctx_state compiler_ctx_state;
+
+ struct si_shader_selector *selector;
+ struct si_shader_selector *previous_stage_sel; /* for refcounting */
+ struct si_shader *next_variant;
+
+ struct si_shader_part *prolog;
+ struct si_shader *previous_stage; /* for GFX9 */
+ struct si_shader_part *prolog2;
+ struct si_shader_part *epilog;
+
+ struct si_pm4_state *pm4;
+ struct si_resource *bo;
+ struct si_resource *scratch_bo;
+ struct si_shader_key key;
+ struct util_queue_fence ready;
+ bool compilation_failed;
+ bool is_monolithic;
+ bool is_optimized;
+ bool is_binary_shared;
+ bool is_gs_copy_shader;
+
+ /* The following data is all that's needed for binary shaders. */
+ struct si_shader_binary binary;
+ struct ac_shader_config config;
+ struct si_shader_binary_info info;
+
+ struct {
+ uint16_t ngg_emit_size; /* in dwords */
+ uint16_t hw_max_esverts;
+ uint16_t max_gsprims;
+ uint16_t max_out_verts;
+ uint16_t prim_amp_factor;
+ bool max_vert_out_per_gs_instance;
+ } ngg;
+
+ /* Shader key + LLVM IR + disassembly + statistics.
+ * Generated for debug contexts only.
+ */
+ char *shader_log;
+ size_t shader_log_size;
+
+ struct gfx9_gs_info gs_info;
+
+ /* For saving precomputed context register values. */
+ union {
+ struct {
+ unsigned vgt_gsvs_ring_offset_1;
+ unsigned vgt_gsvs_ring_offset_2;
+ unsigned vgt_gsvs_ring_offset_3;
+ unsigned vgt_gsvs_ring_itemsize;
+ unsigned vgt_gs_max_vert_out;
+ unsigned vgt_gs_vert_itemsize;
+ unsigned vgt_gs_vert_itemsize_1;
+ unsigned vgt_gs_vert_itemsize_2;
+ unsigned vgt_gs_vert_itemsize_3;
+ unsigned vgt_gs_instance_cnt;
+ unsigned vgt_gs_onchip_cntl;
+ unsigned vgt_gs_max_prims_per_subgroup;
+ unsigned vgt_esgs_ring_itemsize;
+ } gs;
+
+ struct {
+ unsigned ge_max_output_per_subgroup;
+ unsigned ge_ngg_subgrp_cntl;
+ unsigned vgt_primitiveid_en;
+ unsigned vgt_gs_onchip_cntl;
+ unsigned vgt_gs_instance_cnt;
+ unsigned vgt_esgs_ring_itemsize;
+ unsigned spi_vs_out_config;
+ unsigned spi_shader_idx_format;
+ unsigned spi_shader_pos_format;
+ unsigned pa_cl_vte_cntl;
+ unsigned pa_cl_ngg_cntl;
+ unsigned vgt_gs_max_vert_out; /* for API GS */
+ unsigned ge_pc_alloc; /* uconfig register */
+ } ngg;
+
+ struct {
+ unsigned vgt_gs_mode;
+ unsigned vgt_primitiveid_en;
+ unsigned vgt_reuse_off;
+ unsigned spi_vs_out_config;
+ unsigned spi_shader_pos_format;
+ unsigned pa_cl_vte_cntl;
+ unsigned ge_pc_alloc; /* uconfig register */
+ } vs;
+
+ struct {
+ unsigned spi_ps_input_ena;
+ unsigned spi_ps_input_addr;
+ unsigned spi_baryc_cntl;
+ unsigned spi_ps_in_control;
+ unsigned spi_shader_z_format;
+ unsigned spi_shader_col_format;
+ unsigned cb_shader_mask;
+ } ps;
+ } ctx_reg;
+
+ /* For saving precomputed register values. */
+ unsigned vgt_tf_param; /* VGT_TF_PARAM */
+ unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
+ unsigned pa_cl_vs_out_cntl;
+ unsigned ge_cntl;
};
struct si_shader_part {
- struct si_shader_part *next;
- union si_shader_part_key key;
- struct si_shader_binary binary;
- struct ac_shader_config config;
+ struct si_shader_part *next;
+ union si_shader_part_key key;
+ struct si_shader_binary binary;
+ struct ac_shader_config config;
};
/* si_shader.c */
-bool si_compile_shader(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader *shader,
- struct pipe_debug_callback *debug);
-bool si_create_shader_variant(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader *shader,
- struct pipe_debug_callback *debug);
+bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+ struct si_shader *shader, struct pipe_debug_callback *debug);
+bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
+ struct si_shader *shader, struct pipe_debug_callback *debug);
void si_shader_destroy(struct si_shader *shader);
unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index);
-unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
- unsigned is_varying);
+unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, unsigned is_varying);
bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
- uint64_t scratch_va);
+ uint64_t scratch_va);
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
- struct pipe_debug_callback *debug,
- FILE *f, bool check_debug_option);
-void si_shader_dump_stats_for_shader_db(struct si_screen *screen,
- struct si_shader *shader,
- struct pipe_debug_callback *debug);
-void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
- unsigned *lds_size);
+ struct pipe_debug_callback *debug, FILE *f, bool check_debug_option);
+void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
+ struct pipe_debug_callback *debug);
+void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size);
const char *si_get_shader_name(const struct si_shader *shader);
void si_shader_binary_clean(struct si_shader_binary *binary);
/* si_shader_llvm_gs.c */
-struct si_shader *
-si_generate_gs_copy_shader(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader_selector *gs_selector,
- struct pipe_debug_callback *debug);
+struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
+ struct ac_llvm_compiler *compiler,
+ struct si_shader_selector *gs_selector,
+ struct pipe_debug_callback *debug);
/* si_shader_nir.c */
-void si_nir_scan_shader(const struct nir_shader *nir,
- struct si_shader_info *info);
+void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info);
void si_nir_adjust_driver_locations(struct nir_shader *nir);
void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize);
/* si_state_shaders.c */
-void gfx9_get_gs_info(struct si_shader_selector *es,
- struct si_shader_selector *gs,
- struct gfx9_gs_info *out);
+void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
+ struct gfx9_gs_info *out);
/* Inline helpers. */
/* Return the pointer to the main shader part's pointer. */
-static inline struct si_shader **
-si_get_main_shader_part(struct si_shader_selector *sel,
- struct si_shader_key *key)
+static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
+ struct si_shader_key *key)
{
- if (key->as_ls)
- return &sel->main_shader_part_ls;
- if (key->as_es && key->as_ngg)
- return &sel->main_shader_part_ngg_es;
- if (key->as_es)
- return &sel->main_shader_part_es;
- if (key->as_ngg)
- return &sel->main_shader_part_ngg;
- return &sel->main_shader_part;
+ if (key->as_ls)
+ return &sel->main_shader_part_ls;
+ if (key->as_es && key->as_ngg)
+ return &sel->main_shader_part_ngg_es;
+ if (key->as_es)
+ return &sel->main_shader_part_es;
+ if (key->as_ngg)
+ return &sel->main_shader_part_ngg;
+ return &sel->main_shader_part;
}
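si_get_main_shader_part returns a pointer to the selector's cache slot rather than the shader itself, so callers can both look up and lazily install the main part for a given key. A hedged sketch of that pattern, with compile_main_part standing in for whatever compilation path the caller actually uses:

   struct si_shader **mainp = si_get_main_shader_part(sel, &shader->key);

   if (!*mainp)
      *mainp = compile_main_part(sel, &shader->key); /* fill the cache slot once */

   shader->is_binary_shared = true;   /* then share the cached binary */
   shader->binary = (*mainp)->binary;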
-static inline bool
-gfx10_is_ngg_passthrough(struct si_shader *shader)
+static inline bool gfx10_is_ngg_passthrough(struct si_shader *shader)
{
- struct si_shader_selector *sel = shader->selector;
-
- return sel->type != PIPE_SHADER_GEOMETRY &&
- !sel->so.num_outputs &&
- !sel->info.writes_edgeflag &&
- !shader->key.opt.ngg_culling &&
- (sel->type != PIPE_SHADER_VERTEX ||
- !shader->key.mono.u.vs_export_prim_id);
+ struct si_shader_selector *sel = shader->selector;
+
+ return sel->type != PIPE_SHADER_GEOMETRY && !sel->so.num_outputs && !sel->info.writes_edgeflag &&
+ !shader->key.opt.ngg_culling &&
+ (sel->type != PIPE_SHADER_VERTEX || !shader->key.mono.u.vs_export_prim_id);
}
-static inline bool
-si_shader_uses_bindless_samplers(struct si_shader_selector *selector)
+static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector)
{
- return selector ? selector->info.uses_bindless_samplers : false;
+ return selector ? selector->info.uses_bindless_samplers : false;
}
-static inline bool
-si_shader_uses_bindless_images(struct si_shader_selector *selector)
+static inline bool si_shader_uses_bindless_images(struct si_shader_selector *selector)
{
- return selector ? selector->info.uses_bindless_images : false;
+ return selector ? selector->info.uses_bindless_images : false;
}
#endif
#ifndef SI_SHADER_PRIVATE_H
#define SI_SHADER_PRIVATE_H
-#include "si_shader.h"
#include "ac_shader_abi.h"
+#include "si_shader.h"
struct pipe_debug_callback;
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
struct si_shader_output_values {
- LLVMValueRef values[4];
- unsigned semantic_name;
- unsigned semantic_index;
- ubyte vertex_stream[4];
+ LLVMValueRef values[4];
+ unsigned semantic_name;
+ unsigned semantic_index;
+ ubyte vertex_stream[4];
};
struct si_shader_context {
- struct ac_llvm_context ac;
- struct si_shader *shader;
- struct si_screen *screen;
-
- unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
-
- /* For clamping the non-constant index in resource indexing: */
- unsigned num_const_buffers;
- unsigned num_shader_buffers;
- unsigned num_images;
- unsigned num_samplers;
-
- struct ac_shader_args args;
- struct ac_shader_abi abi;
-
- LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
-
- LLVMBasicBlockRef merged_wrap_if_entry_block;
- int merged_wrap_if_label;
-
- LLVMValueRef main_fn;
- LLVMTypeRef return_type;
-
- struct ac_arg const_and_shader_buffers;
- struct ac_arg samplers_and_images;
-
- /* For merged shaders, the per-stage descriptors for the stage other
- * than the one we're processing, used to pass them through from the
- * first stage to the second.
- */
- struct ac_arg other_const_and_shader_buffers;
- struct ac_arg other_samplers_and_images;
-
- struct ac_arg rw_buffers;
- struct ac_arg bindless_samplers_and_images;
- /* Common inputs for merged shaders. */
- struct ac_arg merged_wave_info;
- struct ac_arg merged_scratch_offset;
- struct ac_arg small_prim_cull_info;
- /* API VS */
- struct ac_arg vertex_buffers;
- struct ac_arg vb_descriptors[5];
- struct ac_arg rel_auto_id;
- struct ac_arg vs_prim_id;
- struct ac_arg vertex_index0;
- /* VS states and layout of LS outputs / TCS inputs at the end
- * [0] = clamp vertex color
- * [1] = indexed
- * [2:3] = NGG: output primitive type
- * [4:5] = NGG: provoking vertex index
- * [6] = NGG: streamout queries enabled
- * [7:10] = NGG: small prim filter precision = num_samples / quant_mode,
- * but in reality it's: 1/2^n, from 1/16 to 1/4096 = 1/2^4 to 1/2^12
- * Only the first 4 bits of the exponent are stored.
- * Set it like this: (fui(num_samples / quant_mode) >> 23)
- * Expand to FP32 like this: ((0x70 | value) << 23);
- * With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15)
- * = 1/2^(15 - value) in FP32
- * [11:23] = stride between patches in DW = num_inputs * num_vertices * 4
- * max = 32*32*4 + 32*4
- * [24:31] = stride between vertices in DW = num_inputs * 4
- * max = 32*4
- */
- struct ac_arg vs_state_bits;
- struct ac_arg vs_blit_inputs;
- struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */
- /* HW VS */
- struct ac_arg streamout_config;
- struct ac_arg streamout_write_index;
- struct ac_arg streamout_offset[4];
-
- /* API TCS & TES */
- /* Layout of TCS outputs in the offchip buffer
- * # 6 bits
- * [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40)
- * # 6 bits
- * [6:11] = the number of output vertices per patch, max = 32
- * # 20 bits
- * [12:31] = the offset of per patch attributes in the buffer in bytes.
- * max = NUM_PATCHES*32*32*16
- */
- struct ac_arg tcs_offchip_layout;
-
- /* API TCS */
- /* Offsets where TCS outputs and TCS patch outputs live in LDS:
- * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
- * [16:31] = TCS output patch0 offset for per-patch / 16
- * max = (NUM_PATCHES + 1) * 32*32
- */
- struct ac_arg tcs_out_lds_offsets;
- /* Layout of TCS outputs / TES inputs:
- * [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4
- * max = 32*32*4 + 32*4
- * [13:18] = gl_PatchVerticesIn, max = 32
- * [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers
- */
- struct ac_arg tcs_out_lds_layout;
- struct ac_arg tcs_offchip_offset;
- struct ac_arg tcs_factor_offset;
-
- /* API TES */
- struct ac_arg tes_offchip_addr;
- struct ac_arg tes_u;
- struct ac_arg tes_v;
- struct ac_arg tes_rel_patch_id;
- /* HW ES */
- struct ac_arg es2gs_offset;
- /* HW GS */
- /* On gfx10:
- * - bits 0..11: ordered_wave_id
- * - bits 12..20: number of vertices in group
- * - bits 22..30: number of primitives in group
- */
- struct ac_arg gs_tg_info;
- /* API GS */
- struct ac_arg gs2vs_offset;
- struct ac_arg gs_wave_id; /* GFX6 */
- struct ac_arg gs_vtx_offset[6]; /* in dwords (GFX6) */
- struct ac_arg gs_vtx01_offset; /* in dwords (GFX9) */
- struct ac_arg gs_vtx23_offset; /* in dwords (GFX9) */
- struct ac_arg gs_vtx45_offset; /* in dwords (GFX9) */
- /* PS */
- struct ac_arg pos_fixed_pt;
- /* CS */
- struct ac_arg block_size;
- struct ac_arg cs_user_data;
-
- struct ac_llvm_compiler *compiler;
-
- /* Preloaded descriptors. */
- LLVMValueRef esgs_ring;
- LLVMValueRef gsvs_ring[4];
- LLVMValueRef tess_offchip_ring;
-
- LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */
- LLVMValueRef gs_next_vertex[4];
- LLVMValueRef gs_curprim_verts[4];
- LLVMValueRef gs_generated_prims[4];
- LLVMValueRef gs_ngg_emit;
- LLVMValueRef gs_ngg_scratch;
- LLVMValueRef postponed_kill;
- LLVMValueRef return_value;
+ struct ac_llvm_context ac;
+ struct si_shader *shader;
+ struct si_screen *screen;
+
+ unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
+
+ /* For clamping the non-constant index in resource indexing: */
+ unsigned num_const_buffers;
+ unsigned num_shader_buffers;
+ unsigned num_images;
+ unsigned num_samplers;
+
+ struct ac_shader_args args;
+ struct ac_shader_abi abi;
+
+ LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
+
+ LLVMBasicBlockRef merged_wrap_if_entry_block;
+ int merged_wrap_if_label;
+
+ LLVMValueRef main_fn;
+ LLVMTypeRef return_type;
+
+ struct ac_arg const_and_shader_buffers;
+ struct ac_arg samplers_and_images;
+
+ /* For merged shaders, the per-stage descriptors for the stage other
+ * than the one we're processing, used to pass them through from the
+ * first stage to the second.
+ */
+ struct ac_arg other_const_and_shader_buffers;
+ struct ac_arg other_samplers_and_images;
+
+ struct ac_arg rw_buffers;
+ struct ac_arg bindless_samplers_and_images;
+ /* Common inputs for merged shaders. */
+ struct ac_arg merged_wave_info;
+ struct ac_arg merged_scratch_offset;
+ struct ac_arg small_prim_cull_info;
+ /* API VS */
+ struct ac_arg vertex_buffers;
+ struct ac_arg vb_descriptors[5];
+ struct ac_arg rel_auto_id;
+ struct ac_arg vs_prim_id;
+ struct ac_arg vertex_index0;
+ /* VS states and layout of LS outputs / TCS inputs at the end
+ * [0] = clamp vertex color
+ * [1] = indexed
+ * [2:3] = NGG: output primitive type
+ * [4:5] = NGG: provoking vertex index
+ * [6] = NGG: streamout queries enabled
+ * [7:10] = NGG: small prim filter precision = num_samples / quant_mode,
+ * but in reality it's: 1/2^n, from 1/16 to 1/4096 = 1/2^4 to 1/2^12
+ * Only the first 4 bits of the exponent are stored.
+ * Set it like this: (fui(num_samples / quant_mode) >> 23)
+ * Expand to FP32 like this: ((0x70 | value) << 23);
+ * With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15)
+ * = 1/2^(15 - value) in FP32
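+ * Worked example for a precision of 1/256 = 2^-8:
+ * fui(1.0/256) >> 23 = 0x77, low 4 bits stored = 0x7, and
+ * ((0x70 | 0x7) << 23) reinterpreted as float = 2^(7 - 15) = 1/256.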
+ * [11:23] = stride between patches in DW = num_inputs * num_vertices * 4
+ * max = 32*32*4 + 32*4
+ * [24:31] = stride between vertices in DW = num_inputs * 4
+ * max = 32*4
+ */
+ struct ac_arg vs_state_bits;
+ struct ac_arg vs_blit_inputs;
+ struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */
+ /* HW VS */
+ struct ac_arg streamout_config;
+ struct ac_arg streamout_write_index;
+ struct ac_arg streamout_offset[4];
+
+ /* API TCS & TES */
+ /* Layout of TCS outputs in the offchip buffer
+ * # 6 bits
+ * [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40)
+ * # 6 bits
+ * [6:11] = the number of output vertices per patch, max = 32
+ * # 20 bits
+ * [12:31] = the offset of per patch attributes in the buffer in bytes.
+ * max = NUM_PATCHES*32*32*16
+ */
+ struct ac_arg tcs_offchip_layout;
+
+ /* API TCS */
+ /* Offsets where TCS outputs and TCS patch outputs live in LDS:
+ * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
+ * [16:31] = TCS output patch0 offset for per-patch / 16
+ * max = (NUM_PATCHES + 1) * 32*32
+ */
+ struct ac_arg tcs_out_lds_offsets;
+ /* Layout of TCS outputs / TES inputs:
+ * [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4
+ * max = 32*32*4 + 32*4
+ * [13:18] = gl_PatchVerticesIn, max = 32
+ * [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers
+ */
+ struct ac_arg tcs_out_lds_layout;
+ struct ac_arg tcs_offchip_offset;
+ struct ac_arg tcs_factor_offset;
+
+ /* API TES */
+ struct ac_arg tes_offchip_addr;
+ struct ac_arg tes_u;
+ struct ac_arg tes_v;
+ struct ac_arg tes_rel_patch_id;
+ /* HW ES */
+ struct ac_arg es2gs_offset;
+ /* HW GS */
+ /* On gfx10:
+ * - bits 0..11: ordered_wave_id
+ * - bits 12..20: number of vertices in group
+ * - bits 22..30: number of primitives in group
+ */
+ struct ac_arg gs_tg_info;
+ /* API GS */
+ struct ac_arg gs2vs_offset;
+ struct ac_arg gs_wave_id; /* GFX6 */
+ struct ac_arg gs_vtx_offset[6]; /* in dwords (GFX6) */
+ struct ac_arg gs_vtx01_offset; /* in dwords (GFX9) */
+ struct ac_arg gs_vtx23_offset; /* in dwords (GFX9) */
+ struct ac_arg gs_vtx45_offset; /* in dwords (GFX9) */
+ /* PS */
+ struct ac_arg pos_fixed_pt;
+ /* CS */
+ struct ac_arg block_size;
+ struct ac_arg cs_user_data;
+
+ struct ac_llvm_compiler *compiler;
+
+ /* Preloaded descriptors. */
+ LLVMValueRef esgs_ring;
+ LLVMValueRef gsvs_ring[4];
+ LLVMValueRef tess_offchip_ring;
+
+ LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */
+ LLVMValueRef gs_next_vertex[4];
+ LLVMValueRef gs_curprim_verts[4];
+ LLVMValueRef gs_generated_prims[4];
+ LLVMValueRef gs_ngg_emit;
+ LLVMValueRef gs_ngg_scratch;
+ LLVMValueRef postponed_kill;
+ LLVMValueRef return_value;
};
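
The vs_state_bits comment above stores the NGG small-prim-filter precision as four exponent bits ([7:10]). Below is a minimal standalone sketch of that arithmetic, assuming local fui()/uif() bit-cast helpers in place of the driver's utilities; it only checks that the encode/decode described in the comment round-trips over the documented 1/16..1/4096 range and does not reflect any real register definition.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Reinterpret a float's bits as a uint32_t (stand-in for util's fui()). */
static uint32_t fui(float f)
{
   uint32_t u;
   memcpy(&u, &f, sizeof(u));
   return u;
}

/* Reinterpret uint32_t bits as a float. */
static float uif(uint32_t u)
{
   float f;
   memcpy(&f, &u, sizeof(f));
   return f;
}

int main(void)
{
   /* precision = num_samples / quant_mode is always 1/2^n, n = 4..12 */
   for (int n = 4; n <= 12; n++) {
      float precision = 1.0f / (float)(1 << n);

      /* Encode: take the FP32 exponent and keep only its low 4 bits,
       * which is what fits into vs_state_bits[7:10]. */
      unsigned value = (fui(precision) >> 23) & 0xf;

      /* Decode: rebuild the full exponent with the implied high bits
       * (0x70), which gives 2^(value - 15). */
      float decoded = uif((0x70 | value) << 23);

      printf("1/2^%-2d  stored=0x%x  decoded=%.8f  ok=%d\n",
             n, value, decoded, decoded == precision);
   }
   return 0;
}
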
-static inline struct si_shader_context *
-si_shader_context_from_abi(struct ac_shader_abi *abi)
+static inline struct si_shader_context *si_shader_context_from_abi(struct ac_shader_abi *abi)
{
- struct si_shader_context *ctx = NULL;
- return container_of(abi, ctx, abi);
+ struct si_shader_context *ctx = NULL;
+ return container_of(abi, ctx, abi);
}
bool si_is_multi_part_shader(struct si_shader *shader);
bool si_is_merged_shader(struct si_shader *shader);
-void si_add_arg_checked(struct ac_shader_args *args,
- enum ac_arg_regfile file,
- unsigned registers, enum ac_arg_type type,
- struct ac_arg *arg,
- unsigned idx);
+void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
+ enum ac_arg_type type, struct ac_arg *arg, unsigned idx);
unsigned si_get_max_workgroup_size(const struct si_shader *shader);
bool si_need_ps_prolog(const union si_shader_part_key *key);
-void si_get_ps_prolog_key(struct si_shader *shader,
- union si_shader_part_key *key,
- bool separate_prolog);
-void si_get_ps_epilog_key(struct si_shader *shader,
- union si_shader_part_key *key);
+void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
+ bool separate_prolog);
+void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key);
void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader);
bool gfx10_ngg_export_prim_early(struct si_shader *shader);
void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx);
-void gfx10_ngg_build_export_prim(struct si_shader_context *ctx,
- LLVMValueRef user_edgeflags[3],
- LLVMValueRef prim_passthrough);
-void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi,
- unsigned max_outputs,
- LLVMValueRef *addrs);
-void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
- unsigned max_outputs,
- LLVMValueRef *addrs);
-void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
- unsigned stream,
- LLVMValueRef *addrs);
+void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
+ LLVMValueRef prim_passthrough);
+void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs,
+ LLVMValueRef *addrs);
+void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
+void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs);
void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx);
void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx);
void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader);
/* si_shader_llvm.c */
-bool si_compile_llvm(struct si_screen *sscreen,
- struct si_shader_binary *binary,
- struct ac_shader_config *conf,
- struct ac_llvm_compiler *compiler,
- struct ac_llvm_context *ac,
- struct pipe_debug_callback *debug,
- enum pipe_shader_type shader_type,
- const char *name,
- bool less_optimized);
-void si_llvm_context_init(struct si_shader_context *ctx,
- struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- unsigned wave_size);
-void si_llvm_create_func(struct si_shader_context *ctx, const char *name,
- LLVMTypeRef *return_types, unsigned num_return_elems,
- unsigned max_workgroup_size);
+bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary,
+ struct ac_shader_config *conf, struct ac_llvm_compiler *compiler,
+ struct ac_llvm_context *ac, struct pipe_debug_callback *debug,
+ enum pipe_shader_type shader_type, const char *name, bool less_optimized);
+void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen,
+ struct ac_llvm_compiler *compiler, unsigned wave_size);
+void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types,
+ unsigned num_return_elems, unsigned max_workgroup_size);
void si_llvm_optimize_module(struct si_shader_context *ctx);
void si_llvm_dispose(struct si_shader_context *ctx);
-LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx,
- LLVMValueRef resource, LLVMValueRef offset);
+LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource,
+ LLVMValueRef offset);
void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret);
LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
- struct ac_arg param, unsigned return_index);
+ struct ac_arg param, unsigned return_index);
LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
- struct ac_arg param, unsigned return_index);
+ struct ac_arg param, unsigned return_index);
LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
- struct ac_arg param, unsigned return_index);
+ struct ac_arg param, unsigned return_index);
LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx);
-LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx,
- LLVMTypeRef type, LLVMValueRef val1,
- LLVMValueRef val2);
+LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type,
+ LLVMValueRef val1, LLVMValueRef val2);
void si_llvm_emit_barrier(struct si_shader_context *ctx);
void si_llvm_declare_esgs_ring(struct si_shader_context *ctx);
void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
- unsigned bitoffset);
-LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
- struct ac_arg param, unsigned rshift,
- unsigned bitwidth);
-LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
- unsigned swizzle);
+ unsigned bitoffset);
+LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift,
+ unsigned bitwidth);
+LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle);
LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi);
void si_llvm_declare_compute_memory(struct si_shader_context *ctx);
bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir);
void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
- unsigned num_parts, unsigned main_part,
- unsigned next_shader_first_part);
+ unsigned num_parts, unsigned main_part,
+ unsigned next_shader_first_part);
/* si_shader_llvm_gs.c */
LLVMValueRef si_is_es_thread(struct si_shader_context *ctx);
LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx);
-void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
- LLVMValueRef *addrs);
+void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
void si_preload_esgs_ring(struct si_shader_context *ctx);
void si_preload_gs_rings(struct si_shader_context *ctx);
-void si_llvm_build_gs_prolog(struct si_shader_context *ctx,
- union si_shader_part_key *key);
+void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_init_gs_callbacks(struct si_shader_context *ctx);
/* si_shader_llvm_tess.c */
void si_llvm_preload_tes_rings(struct si_shader_context *ctx);
-void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
- LLVMValueRef *addrs);
-void si_llvm_build_tcs_epilog(struct si_shader_context *ctx,
- union si_shader_part_key *key);
+void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
+void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx);
void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);
/* si_shader_llvm_ps.c */
LLVMValueRef si_get_sample_id(struct si_shader_context *ctx);
-void si_llvm_build_ps_prolog(struct si_shader_context *ctx,
- union si_shader_part_key *key);
-void si_llvm_build_ps_epilog(struct si_shader_context *ctx,
- union si_shader_part_key *key);
-void si_llvm_build_monolithic_ps(struct si_shader_context *ctx,
- struct si_shader *shader);
+void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
+void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key);
+void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader *shader);
void si_llvm_init_ps_callbacks(struct si_shader_context *ctx);
/* si_shader_llvm_resources.c */
/* si_shader_llvm_vs.c */
void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir);
-void si_llvm_streamout_store_output(struct si_shader_context *ctx,
- LLVMValueRef const *so_buffers,
- LLVMValueRef const *so_write_offsets,
- struct pipe_stream_output *stream_out,
- struct si_shader_output_values *shader_out);
-void si_llvm_emit_streamout(struct si_shader_context *ctx,
- struct si_shader_output_values *outputs,
- unsigned noutput, unsigned stream);
+void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
+ LLVMValueRef const *so_write_offsets,
+ struct pipe_stream_output *stream_out,
+ struct si_shader_output_values *shader_out);
+void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
+ unsigned noutput, unsigned stream);
void si_llvm_build_vs_exports(struct si_shader_context *ctx,
- struct si_shader_output_values *outputs,
- unsigned noutput);
-void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
- LLVMValueRef *addrs);
-void si_llvm_build_vs_prolog(struct si_shader_context *ctx,
- union si_shader_part_key *key);
+ struct si_shader_output_values *outputs, unsigned noutput);
+void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
+void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);
#endif
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_shader_internal.h"
-#include "si_pipe.h"
-#include "ac_rtld.h"
#include "ac_nir_to_llvm.h"
+#include "ac_rtld.h"
+#include "si_pipe.h"
+#include "si_shader_internal.h"
#include "sid.h"
-
#include "tgsi/tgsi_from_mesa.h"
#include "util/u_memory.h"
struct si_llvm_diagnostics {
- struct pipe_debug_callback *debug;
- unsigned retval;
+ struct pipe_debug_callback *debug;
+ unsigned retval;
};
static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
{
- struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context;
- LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
- const char *severity_str = NULL;
-
- switch (severity) {
- case LLVMDSError:
- severity_str = "error";
- break;
- case LLVMDSWarning:
- severity_str = "warning";
- break;
- case LLVMDSRemark:
- case LLVMDSNote:
- default:
- return;
- }
-
- char *description = LLVMGetDiagInfoDescription(di);
-
- pipe_debug_message(diag->debug, SHADER_INFO,
- "LLVM diagnostic (%s): %s", severity_str, description);
-
- if (severity == LLVMDSError) {
- diag->retval = 1;
- fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", description);
- }
-
- LLVMDisposeMessage(description);
+ struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context;
+ LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
+ const char *severity_str = NULL;
+
+ switch (severity) {
+ case LLVMDSError:
+ severity_str = "error";
+ break;
+ case LLVMDSWarning:
+ severity_str = "warning";
+ break;
+ case LLVMDSRemark:
+ case LLVMDSNote:
+ default:
+ return;
+ }
+
+ char *description = LLVMGetDiagInfoDescription(di);
+
+ pipe_debug_message(diag->debug, SHADER_INFO, "LLVM diagnostic (%s): %s", severity_str,
+ description);
+
+ if (severity == LLVMDSError) {
+ diag->retval = 1;
+ fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description);
+ }
+
+ LLVMDisposeMessage(description);
}
-bool si_compile_llvm(struct si_screen *sscreen,
- struct si_shader_binary *binary,
- struct ac_shader_config *conf,
- struct ac_llvm_compiler *compiler,
- struct ac_llvm_context *ac,
- struct pipe_debug_callback *debug,
- enum pipe_shader_type shader_type,
- const char *name,
- bool less_optimized)
+bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary,
+ struct ac_shader_config *conf, struct ac_llvm_compiler *compiler,
+ struct ac_llvm_context *ac, struct pipe_debug_callback *debug,
+ enum pipe_shader_type shader_type, const char *name, bool less_optimized)
{
- unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
-
- if (si_can_dump_shader(sscreen, shader_type)) {
- fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
-
- if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
- fprintf(stderr, "%s LLVM IR:\n\n", name);
- ac_dump_module(ac->module);
- fprintf(stderr, "\n");
- }
- }
-
- if (sscreen->record_llvm_ir) {
- char *ir = LLVMPrintModuleToString(ac->module);
- binary->llvm_ir_string = strdup(ir);
- LLVMDisposeMessage(ir);
- }
-
- if (!si_replace_shader(count, binary)) {
- struct ac_compiler_passes *passes = compiler->passes;
-
- if (ac->wave_size == 32)
- passes = compiler->passes_wave32;
- else if (less_optimized && compiler->low_opt_passes)
- passes = compiler->low_opt_passes;
-
- struct si_llvm_diagnostics diag = {debug};
- LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag);
-
- if (!ac_compile_module_to_elf(passes, ac->module,
- (char **)&binary->elf_buffer,
- &binary->elf_size))
- diag.retval = 1;
-
- if (diag.retval != 0) {
- pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed");
- return false;
- }
- }
-
- struct ac_rtld_binary rtld;
- if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
- .info = &sscreen->info,
- .shader_type = tgsi_processor_to_shader_stage(shader_type),
- .wave_size = ac->wave_size,
- .num_parts = 1,
- .elf_ptrs = &binary->elf_buffer,
- .elf_sizes = &binary->elf_size }))
- return false;
-
- bool ok = ac_rtld_read_config(&rtld, conf);
- ac_rtld_close(&rtld);
- return ok;
+ unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
+
+ if (si_can_dump_shader(sscreen, shader_type)) {
+ fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
+
+ if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
+ fprintf(stderr, "%s LLVM IR:\n\n", name);
+ ac_dump_module(ac->module);
+ fprintf(stderr, "\n");
+ }
+ }
+
+ if (sscreen->record_llvm_ir) {
+ char *ir = LLVMPrintModuleToString(ac->module);
+ binary->llvm_ir_string = strdup(ir);
+ LLVMDisposeMessage(ir);
+ }
+
+ if (!si_replace_shader(count, binary)) {
+ struct ac_compiler_passes *passes = compiler->passes;
+
+ if (ac->wave_size == 32)
+ passes = compiler->passes_wave32;
+ else if (less_optimized && compiler->low_opt_passes)
+ passes = compiler->low_opt_passes;
+
+ struct si_llvm_diagnostics diag = {debug};
+ LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag);
+
+ if (!ac_compile_module_to_elf(passes, ac->module, (char **)&binary->elf_buffer,
+ &binary->elf_size))
+ diag.retval = 1;
+
+ if (diag.retval != 0) {
+ pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed");
+ return false;
+ }
+ }
+
+ struct ac_rtld_binary rtld;
+ if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
+ .info = &sscreen->info,
+ .shader_type = tgsi_processor_to_shader_stage(shader_type),
+ .wave_size = ac->wave_size,
+ .num_parts = 1,
+ .elf_ptrs = &binary->elf_buffer,
+ .elf_sizes = &binary->elf_size}))
+ return false;
+
+ bool ok = ac_rtld_read_config(&rtld, conf);
+ ac_rtld_close(&rtld);
+ return ok;
}
-void si_llvm_context_init(struct si_shader_context *ctx,
- struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- unsigned wave_size)
+void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen,
+ struct ac_llvm_compiler *compiler, unsigned wave_size)
{
- memset(ctx, 0, sizeof(*ctx));
- ctx->screen = sscreen;
- ctx->compiler = compiler;
-
- ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class,
- sscreen->info.family,
- AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH,
- wave_size, 64);
+ memset(ctx, 0, sizeof(*ctx));
+ ctx->screen = sscreen;
+ ctx->compiler = compiler;
+
+ ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, sscreen->info.family,
+ AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, wave_size, 64);
}
-void si_llvm_create_func(struct si_shader_context *ctx, const char *name,
- LLVMTypeRef *return_types, unsigned num_return_elems,
- unsigned max_workgroup_size)
+void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types,
+ unsigned num_return_elems, unsigned max_workgroup_size)
{
- LLVMTypeRef ret_type;
- enum ac_llvm_calling_convention call_conv;
- enum pipe_shader_type real_shader_type;
-
- if (num_return_elems)
- ret_type = LLVMStructTypeInContext(ctx->ac.context,
- return_types,
- num_return_elems, true);
- else
- ret_type = ctx->ac.voidt;
-
- real_shader_type = ctx->type;
-
- /* LS is merged into HS (TCS), and ES is merged into GS. */
- if (ctx->screen->info.chip_class >= GFX9) {
- if (ctx->shader->key.as_ls)
- real_shader_type = PIPE_SHADER_TESS_CTRL;
- else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg)
- real_shader_type = PIPE_SHADER_GEOMETRY;
- }
-
- switch (real_shader_type) {
- case PIPE_SHADER_VERTEX:
- case PIPE_SHADER_TESS_EVAL:
- call_conv = AC_LLVM_AMDGPU_VS;
- break;
- case PIPE_SHADER_TESS_CTRL:
- call_conv = AC_LLVM_AMDGPU_HS;
- break;
- case PIPE_SHADER_GEOMETRY:
- call_conv = AC_LLVM_AMDGPU_GS;
- break;
- case PIPE_SHADER_FRAGMENT:
- call_conv = AC_LLVM_AMDGPU_PS;
- break;
- case PIPE_SHADER_COMPUTE:
- call_conv = AC_LLVM_AMDGPU_CS;
- break;
- default:
- unreachable("Unhandle shader type");
- }
-
- /* Setup the function */
- ctx->return_type = ret_type;
- ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name,
- ret_type, ctx->ac.module);
- ctx->return_value = LLVMGetUndef(ctx->return_type);
-
- if (ctx->screen->info.address32_hi) {
- ac_llvm_add_target_dep_function_attr(ctx->main_fn,
- "amdgpu-32bit-address-high-bits",
- ctx->screen->info.address32_hi);
- }
-
- LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
- "no-signed-zeros-fp-math",
- "true");
-
- ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size);
+ LLVMTypeRef ret_type;
+ enum ac_llvm_calling_convention call_conv;
+ enum pipe_shader_type real_shader_type;
+
+ if (num_return_elems)
+ ret_type = LLVMStructTypeInContext(ctx->ac.context, return_types, num_return_elems, true);
+ else
+ ret_type = ctx->ac.voidt;
+
+ real_shader_type = ctx->type;
+
+ /* LS is merged into HS (TCS), and ES is merged into GS. */
+ if (ctx->screen->info.chip_class >= GFX9) {
+ if (ctx->shader->key.as_ls)
+ real_shader_type = PIPE_SHADER_TESS_CTRL;
+ else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg)
+ real_shader_type = PIPE_SHADER_GEOMETRY;
+ }
+
+ switch (real_shader_type) {
+ case PIPE_SHADER_VERTEX:
+ case PIPE_SHADER_TESS_EVAL:
+ call_conv = AC_LLVM_AMDGPU_VS;
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ call_conv = AC_LLVM_AMDGPU_HS;
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ call_conv = AC_LLVM_AMDGPU_GS;
+ break;
+ case PIPE_SHADER_FRAGMENT:
+ call_conv = AC_LLVM_AMDGPU_PS;
+ break;
+ case PIPE_SHADER_COMPUTE:
+ call_conv = AC_LLVM_AMDGPU_CS;
+ break;
+ default:
+ unreachable("Unhandle shader type");
+ }
+
+ /* Setup the function */
+ ctx->return_type = ret_type;
+ ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, ret_type, ctx->ac.module);
+ ctx->return_value = LLVMGetUndef(ctx->return_type);
+
+ if (ctx->screen->info.address32_hi) {
+ ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-32bit-address-high-bits",
+ ctx->screen->info.address32_hi);
+ }
+
+ LLVMAddTargetDependentFunctionAttr(ctx->main_fn, "no-signed-zeros-fp-math", "true");
+
+ ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size);
}
void si_llvm_optimize_module(struct si_shader_context *ctx)
{
- /* Dump LLVM IR before any optimization passes */
- if (ctx->screen->debug_flags & DBG(PREOPT_IR) &&
- si_can_dump_shader(ctx->screen, ctx->type))
- LLVMDumpModule(ctx->ac.module);
-
- /* Run the pass */
- LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module);
- LLVMDisposeBuilder(ctx->ac.builder);
+ /* Dump LLVM IR before any optimization passes */
+ if (ctx->screen->debug_flags & DBG(PREOPT_IR) && si_can_dump_shader(ctx->screen, ctx->type))
+ LLVMDumpModule(ctx->ac.module);
+
+ /* Run the pass */
+ LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module);
+ LLVMDisposeBuilder(ctx->ac.builder);
}
void si_llvm_dispose(struct si_shader_context *ctx)
{
- LLVMDisposeModule(ctx->ac.module);
- LLVMContextDispose(ctx->ac.context);
- ac_llvm_context_dispose(&ctx->ac);
+ LLVMDisposeModule(ctx->ac.module);
+ LLVMContextDispose(ctx->ac.context);
+ ac_llvm_context_dispose(&ctx->ac);
}
/**
* Load a dword from a constant buffer.
*/
-LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx,
- LLVMValueRef resource, LLVMValueRef offset)
+LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource,
+ LLVMValueRef offset)
{
- return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
- 0, 0, true, true);
+ return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, 0, 0, true, true);
}
void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
{
- if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
- LLVMBuildRetVoid(ctx->ac.builder);
- else
- LLVMBuildRet(ctx->ac.builder, ret);
+ if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
+ LLVMBuildRetVoid(ctx->ac.builder);
+ else
+ LLVMBuildRet(ctx->ac.builder, ret);
}
LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
- struct ac_arg param, unsigned return_index)
+ struct ac_arg param, unsigned return_index)
{
- return LLVMBuildInsertValue(ctx->ac.builder, ret,
- ac_get_arg(&ctx->ac, param),
- return_index, "");
+ return LLVMBuildInsertValue(ctx->ac.builder, ret, ac_get_arg(&ctx->ac, param), return_index, "");
}
LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
- struct ac_arg param, unsigned return_index)
+ struct ac_arg param, unsigned return_index)
{
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef p = ac_get_arg(&ctx->ac, param);
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef p = ac_get_arg(&ctx->ac, param);
- return LLVMBuildInsertValue(builder, ret,
- ac_to_float(&ctx->ac, p),
- return_index, "");
+ return LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, p), return_index, "");
}
LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
- struct ac_arg param, unsigned return_index)
+ struct ac_arg param, unsigned return_index)
{
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef ptr = ac_get_arg(&ctx->ac, param);
- ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, "");
- return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef ptr = ac_get_arg(&ctx->ac, param);
+ ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, "");
+ return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
}
LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
{
- LLVMValueRef ptr[2], list;
- bool merged_shader = si_is_merged_shader(ctx->shader);
+ LLVMValueRef ptr[2], list;
+ bool merged_shader = si_is_merged_shader(ctx->shader);
- ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
- list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0],
- ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
- return list;
+ ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
+ list =
+ LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
+ return list;
}
-LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx,
- LLVMTypeRef type, LLVMValueRef val1,
- LLVMValueRef val2)
+LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type,
+ LLVMValueRef val1, LLVMValueRef val2)
{
- LLVMValueRef values[2] = {
- ac_to_integer(&ctx->ac, val1),
- ac_to_integer(&ctx->ac, val2),
- };
- LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2);
- return LLVMBuildBitCast(ctx->ac.builder, result, type, "");
+ LLVMValueRef values[2] = {
+ ac_to_integer(&ctx->ac, val1),
+ ac_to_integer(&ctx->ac, val2),
+ };
+ LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2);
+ return LLVMBuildBitCast(ctx->ac.builder, result, type, "");
}
void si_llvm_emit_barrier(struct si_shader_context *ctx)
{
- /* GFX6 only (thanks to a hw bug workaround):
- * The real barrier instruction isn't needed, because an entire patch
- * always fits into a single wave.
- */
- if (ctx->screen->info.chip_class == GFX6 &&
- ctx->type == PIPE_SHADER_TESS_CTRL) {
- ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE);
- return;
- }
-
- ac_build_s_barrier(&ctx->ac);
+ /* GFX6 only (thanks to a hw bug workaround):
+ * The real barrier instruction isn't needed, because an entire patch
+ * always fits into a single wave.
+ */
+ if (ctx->screen->info.chip_class == GFX6 && ctx->type == PIPE_SHADER_TESS_CTRL) {
+ ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE);
+ return;
+ }
+
+ ac_build_s_barrier(&ctx->ac);
}
/* Ensure that the esgs ring is declared.
*/
void si_llvm_declare_esgs_ring(struct si_shader_context *ctx)
{
- if (ctx->esgs_ring)
- return;
+ if (ctx->esgs_ring)
+ return;
- assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
+ assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
- ctx->esgs_ring = LLVMAddGlobalInAddressSpace(
- ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
- "esgs_ring",
- AC_ADDR_SPACE_LDS);
- LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
- LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
+ ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
+ "esgs_ring", AC_ADDR_SPACE_LDS);
+ LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
+ LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
}
-void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
- unsigned bitoffset)
+void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, unsigned bitoffset)
{
- LLVMValueRef args[] = {
- ac_get_arg(&ctx->ac, param),
- LLVMConstInt(ctx->ac.i32, bitoffset, 0),
- };
- ac_build_intrinsic(&ctx->ac,
- "llvm.amdgcn.init.exec.from.input",
- ctx->ac.voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
+ LLVMValueRef args[] = {
+ ac_get_arg(&ctx->ac, param),
+ LLVMConstInt(ctx->ac.i32, bitoffset, 0),
+ };
+ ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.init.exec.from.input", ctx->ac.voidt, args, 2,
+ AC_FUNC_ATTR_CONVERGENT);
}
/**
* Get the value of a shader input parameter and extract a bitfield.
*/
-static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx,
- LLVMValueRef value, unsigned rshift,
- unsigned bitwidth)
+static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, LLVMValueRef value,
+ unsigned rshift, unsigned bitwidth)
{
- if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
- value = ac_to_integer(&ctx->ac, value);
+ if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
+ value = ac_to_integer(&ctx->ac, value);
- if (rshift)
- value = LLVMBuildLShr(ctx->ac.builder, value,
- LLVMConstInt(ctx->ac.i32, rshift, 0), "");
+ if (rshift)
+ value = LLVMBuildLShr(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, rshift, 0), "");
- if (rshift + bitwidth < 32) {
- unsigned mask = (1 << bitwidth) - 1;
- value = LLVMBuildAnd(ctx->ac.builder, value,
- LLVMConstInt(ctx->ac.i32, mask, 0), "");
- }
+ if (rshift + bitwidth < 32) {
+ unsigned mask = (1 << bitwidth) - 1;
+ value = LLVMBuildAnd(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, mask, 0), "");
+ }
- return value;
+ return value;
}
-LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
- struct ac_arg param, unsigned rshift,
- unsigned bitwidth)
+LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift,
+ unsigned bitwidth)
{
- LLVMValueRef value = ac_get_arg(&ctx->ac, param);
+ LLVMValueRef value = ac_get_arg(&ctx->ac, param);
- return unpack_llvm_param(ctx, value, rshift, bitwidth);
+ return unpack_llvm_param(ctx, value, rshift, bitwidth);
}
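
si_unpack_param() emits the shift-and-mask sequence above as LLVM IR. Here is a minimal plain-C sketch of the same extraction, assuming the merged_wave_info layout used by si_is_es_thread()/si_is_gs_thread() further down (ES thread count in bits [0:7], GS thread count in bits [8:15]); the input value is made up for illustration.

#include <stdint.h>
#include <stdio.h>

static uint32_t unpack_param(uint32_t value, unsigned rshift, unsigned bitwidth)
{
   if (rshift)
      value >>= rshift;

   /* Matches unpack_llvm_param(): no mask is needed when the field
    * reaches bit 31. */
   if (rshift + bitwidth < 32)
      value &= (1u << bitwidth) - 1;

   return value;
}

int main(void)
{
   uint32_t merged_wave_info = 0x00200510; /* illustrative only */

   printf("ES threads: %u\n", (unsigned)unpack_param(merged_wave_info, 0, 8)); /* 16 */
   printf("GS threads: %u\n", (unsigned)unpack_param(merged_wave_info, 8, 8)); /* 5 */
   return 0;
}
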
-LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
- unsigned swizzle)
+LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle)
{
- if (swizzle > 0)
- return ctx->ac.i32_0;
-
- switch (ctx->type) {
- case PIPE_SHADER_VERTEX:
- return ac_get_arg(&ctx->ac, ctx->vs_prim_id);
- case PIPE_SHADER_TESS_CTRL:
- return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id);
- case PIPE_SHADER_TESS_EVAL:
- return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id);
- case PIPE_SHADER_GEOMETRY:
- return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
- default:
- assert(0);
- return ctx->ac.i32_0;
- }
+ if (swizzle > 0)
+ return ctx->ac.i32_0;
+
+ switch (ctx->type) {
+ case PIPE_SHADER_VERTEX:
+ return ac_get_arg(&ctx->ac, ctx->vs_prim_id);
+ case PIPE_SHADER_TESS_CTRL:
+ return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id);
+ case PIPE_SHADER_TESS_EVAL:
+ return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id);
+ case PIPE_SHADER_GEOMETRY:
+ return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
+ default:
+ assert(0);
+ return ctx->ac.i32_0;
+ }
}
LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- LLVMValueRef values[3];
- LLVMValueRef result;
- unsigned i;
- unsigned *properties = ctx->shader->selector->info.properties;
+ LLVMValueRef values[3];
+ LLVMValueRef result;
+ unsigned i;
+ unsigned *properties = ctx->shader->selector->info.properties;
- if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
- unsigned sizes[3] = {
- properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
- properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
- properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
- };
+ if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
+ unsigned sizes[3] = {properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
+ properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
+ properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]};
- for (i = 0; i < 3; ++i)
- values[i] = LLVMConstInt(ctx->ac.i32, sizes[i], 0);
+ for (i = 0; i < 3; ++i)
+ values[i] = LLVMConstInt(ctx->ac.i32, sizes[i], 0);
- result = ac_build_gather_values(&ctx->ac, values, 3);
- } else {
- result = ac_get_arg(&ctx->ac, ctx->block_size);
- }
+ result = ac_build_gather_values(&ctx->ac, values, 3);
+ } else {
+ result = ac_get_arg(&ctx->ac, ctx->block_size);
+ }
- return result;
+ return result;
}
void si_llvm_declare_compute_memory(struct si_shader_context *ctx)
{
- struct si_shader_selector *sel = ctx->shader->selector;
- unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
+ struct si_shader_selector *sel = ctx->shader->selector;
+ unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
- LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
- LLVMValueRef var;
+ LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
+ LLVMValueRef var;
- assert(!ctx->ac.lds);
+ assert(!ctx->ac.lds);
- var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
- LLVMArrayType(ctx->ac.i8, lds_size),
- "compute_lds",
- AC_ADDR_SPACE_LDS);
- LLVMSetAlignment(var, 64 * 1024);
+ var = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i8, lds_size),
+ "compute_lds", AC_ADDR_SPACE_LDS);
+ LLVMSetAlignment(var, 64 * 1024);
- ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
+ ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
}
bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
{
- if (nir->info.stage == MESA_SHADER_VERTEX) {
- si_llvm_load_vs_inputs(ctx, nir);
- } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
- unsigned colors_read =
- ctx->shader->selector->info.colors_read;
- LLVMValueRef main_fn = ctx->main_fn;
-
- LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32);
-
- unsigned offset = SI_PARAM_POS_FIXED_PT + 1;
-
- if (colors_read & 0x0f) {
- unsigned mask = colors_read & 0x0f;
- LLVMValueRef values[4];
- values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
- values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
- values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
- values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
- ctx->abi.color0 =
- ac_to_integer(&ctx->ac,
- ac_build_gather_values(&ctx->ac, values, 4));
- }
- if (colors_read & 0xf0) {
- unsigned mask = (colors_read & 0xf0) >> 4;
- LLVMValueRef values[4];
- values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
- values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
- values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
- values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
- ctx->abi.color1 =
- ac_to_integer(&ctx->ac,
- ac_build_gather_values(&ctx->ac, values, 4));
- }
-
- ctx->abi.interp_at_sample_force_center =
- ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center;
- } else if (nir->info.stage == MESA_SHADER_COMPUTE) {
- if (nir->info.cs.user_data_components_amd) {
- ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data);
- ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data,
- nir->info.cs.user_data_components_amd);
- }
- }
-
- ctx->abi.inputs = &ctx->inputs[0];
- ctx->abi.clamp_shadow_reference = true;
- ctx->abi.robust_buffer_access = true;
-
- if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) {
- assert(gl_shader_stage_is_compute(nir->info.stage));
- si_llvm_declare_compute_memory(ctx);
- }
- ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir);
-
- return true;
+ if (nir->info.stage == MESA_SHADER_VERTEX) {
+ si_llvm_load_vs_inputs(ctx, nir);
+ } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+ unsigned colors_read = ctx->shader->selector->info.colors_read;
+ LLVMValueRef main_fn = ctx->main_fn;
+
+ LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32);
+
+ unsigned offset = SI_PARAM_POS_FIXED_PT + 1;
+
+ if (colors_read & 0x0f) {
+ unsigned mask = colors_read & 0x0f;
+ LLVMValueRef values[4];
+ values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
+ values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
+ values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
+ values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
+ ctx->abi.color0 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4));
+ }
+ if (colors_read & 0xf0) {
+ unsigned mask = (colors_read & 0xf0) >> 4;
+ LLVMValueRef values[4];
+ values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
+ values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
+ values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
+ values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
+ ctx->abi.color1 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4));
+ }
+
+ ctx->abi.interp_at_sample_force_center =
+ ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center;
+ } else if (nir->info.stage == MESA_SHADER_COMPUTE) {
+ if (nir->info.cs.user_data_components_amd) {
+ ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data);
+ ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data,
+ nir->info.cs.user_data_components_amd);
+ }
+ }
+
+ ctx->abi.inputs = &ctx->inputs[0];
+ ctx->abi.clamp_shadow_reference = true;
+ ctx->abi.robust_buffer_access = true;
+
+ if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) {
+ assert(gl_shader_stage_is_compute(nir->info.stage));
+ si_llvm_declare_compute_memory(ctx);
+ }
+ ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir);
+
+ return true;
}
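
A minimal sketch of the colors_read handling in si_nir_build_llvm() above, written as plain C: the low nibble selects COLOR0 components, the high nibble COLOR1, and a fragment-shader parameter slot is consumed only for set bits, starting right after the fixed-point position input. The mask value and the SI_PARAM_POS_FIXED_PT stand-in are illustrative assumptions.

#include <stdio.h>

#define SI_PARAM_POS_FIXED_PT 10 /* stand-in index for this example */

int main(void)
{
   unsigned colors_read = 0xb3; /* COLOR0.xy + COLOR1.xyw, illustrative */
   unsigned offset = SI_PARAM_POS_FIXED_PT + 1;

   for (unsigned input = 0; input < 2; input++) {
      unsigned mask = (colors_read >> (input * 4)) & 0xf;

      for (unsigned chan = 0; chan < 4; chan++) {
         /* Mirrors values[chan] = mask & bit ? LLVMGetParam(main_fn, offset++) : undef */
         if (mask & (1u << chan))
            printf("COLOR%u.%c <- param %u\n", input, "xyzw"[chan], offset++);
         else
            printf("COLOR%u.%c <- undef\n", input, "xyzw"[chan]);
      }
   }
   return 0;
}
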
/**
* Given a list of shader part functions, build a wrapper function that
* runs them in sequence to form a monolithic shader.
*/
void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
- unsigned num_parts, unsigned main_part,
- unsigned next_shader_first_part)
+ unsigned num_parts, unsigned main_part,
+ unsigned next_shader_first_part)
{
- LLVMBuilderRef builder = ctx->ac.builder;
- /* PS epilog has one arg per color component; gfx9 merged shader
- * prologs need to forward 40 SGPRs.
- */
- LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS];
- LLVMTypeRef function_type;
- unsigned num_first_params;
- unsigned num_out, initial_num_out;
- ASSERTED unsigned num_out_sgpr; /* used in debug checks */
- ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */
- unsigned num_sgprs, num_vgprs;
- unsigned gprs;
-
- memset(&ctx->args, 0, sizeof(ctx->args));
-
- for (unsigned i = 0; i < num_parts; ++i) {
- ac_add_function_attr(ctx->ac.context, parts[i], -1,
- AC_FUNC_ATTR_ALWAYSINLINE);
- LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
- }
-
- /* The parameters of the wrapper function correspond to those of the
- * first part in terms of SGPRs and VGPRs, but we use the types of the
- * main part to get the right types. This is relevant for the
- * dereferenceable attribute on descriptor table pointers.
- */
- num_sgprs = 0;
- num_vgprs = 0;
-
- function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
- num_first_params = LLVMCountParamTypes(function_type);
-
- for (unsigned i = 0; i < num_first_params; ++i) {
- LLVMValueRef param = LLVMGetParam(parts[0], i);
-
- if (ac_is_sgpr_param(param)) {
- assert(num_vgprs == 0);
- num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
- } else {
- num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
- }
- }
-
- gprs = 0;
- while (gprs < num_sgprs + num_vgprs) {
- LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count);
- LLVMTypeRef type = LLVMTypeOf(param);
- unsigned size = ac_get_type_size(type) / 4;
-
- /* This is going to get casted anyways, so we don't have to
- * have the exact same type. But we do have to preserve the
- * pointer-ness so that LLVM knows about it.
- */
- enum ac_arg_type arg_type = AC_ARG_INT;
- if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
- type = LLVMGetElementType(type);
-
- if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
- if (LLVMGetVectorSize(type) == 4)
- arg_type = AC_ARG_CONST_DESC_PTR;
- else if (LLVMGetVectorSize(type) == 8)
- arg_type = AC_ARG_CONST_IMAGE_PTR;
- else
- assert(0);
- } else if (type == ctx->ac.f32) {
- arg_type = AC_ARG_CONST_FLOAT_PTR;
- } else {
- assert(0);
- }
- }
-
- ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR,
- size, arg_type, NULL);
-
- assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
- assert(gprs + size <= num_sgprs + num_vgprs &&
- (gprs >= num_sgprs || gprs + size <= num_sgprs));
-
- gprs += size;
- }
-
- /* Prepare the return type. */
- unsigned num_returns = 0;
- LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type;
-
- last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1]));
- return_type = LLVMGetReturnType(last_func_type);
-
- switch (LLVMGetTypeKind(return_type)) {
- case LLVMStructTypeKind:
- num_returns = LLVMCountStructElementTypes(return_type);
- assert(num_returns <= ARRAY_SIZE(returns));
- LLVMGetStructElementTypes(return_type, returns);
- break;
- case LLVMVoidTypeKind:
- break;
- default:
- unreachable("unexpected type");
- }
-
- si_llvm_create_func(ctx, "wrapper", returns, num_returns,
- si_get_max_workgroup_size(ctx->shader));
-
- if (si_is_merged_shader(ctx->shader))
- ac_init_exec_full_mask(&ctx->ac);
-
- /* Record the arguments of the function as if they were an output of
- * a previous part.
- */
- num_out = 0;
- num_out_sgpr = 0;
-
- for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
- LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
- LLVMTypeRef param_type = LLVMTypeOf(param);
- LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32;
- unsigned size = ac_get_type_size(param_type) / 4;
-
- if (size == 1) {
- if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
- param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, "");
- param_type = ctx->ac.i32;
- }
-
- if (param_type != out_type)
- param = LLVMBuildBitCast(builder, param, out_type, "");
- out[num_out++] = param;
- } else {
- LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
-
- if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
- param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, "");
- param_type = ctx->ac.i64;
- }
-
- if (param_type != vector_type)
- param = LLVMBuildBitCast(builder, param, vector_type, "");
-
- for (unsigned j = 0; j < size; ++j)
- out[num_out++] = LLVMBuildExtractElement(
- builder, param, LLVMConstInt(ctx->ac.i32, j, 0), "");
- }
-
- if (ctx->args.args[i].file == AC_ARG_SGPR)
- num_out_sgpr = num_out;
- }
-
- memcpy(initial, out, sizeof(out));
- initial_num_out = num_out;
- initial_num_out_sgpr = num_out_sgpr;
-
- /* Now chain the parts. */
- LLVMValueRef ret = NULL;
- for (unsigned part = 0; part < num_parts; ++part) {
- LLVMValueRef in[AC_MAX_ARGS];
- LLVMTypeRef ret_type;
- unsigned out_idx = 0;
- unsigned num_params = LLVMCountParams(parts[part]);
-
- /* Merged shaders are executed conditionally depending
- * on the number of enabled threads passed in the input SGPRs. */
- if (si_is_multi_part_shader(ctx->shader) && part == 0) {
- LLVMValueRef ena, count = initial[3];
-
- count = LLVMBuildAnd(builder, count,
- LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
- ena = LLVMBuildICmp(builder, LLVMIntULT,
- ac_get_thread_id(&ctx->ac), count, "");
- ac_build_ifcc(&ctx->ac, ena, 6506);
- }
-
- /* Derive arguments for the next part from outputs of the
- * previous one.
- */
- for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
- LLVMValueRef param;
- LLVMTypeRef param_type;
- bool is_sgpr;
- unsigned param_size;
- LLVMValueRef arg = NULL;
-
- param = LLVMGetParam(parts[part], param_idx);
- param_type = LLVMTypeOf(param);
- param_size = ac_get_type_size(param_type) / 4;
- is_sgpr = ac_is_sgpr_param(param);
-
- if (is_sgpr) {
- ac_add_function_attr(ctx->ac.context, parts[part],
- param_idx + 1, AC_FUNC_ATTR_INREG);
- } else if (out_idx < num_out_sgpr) {
- /* Skip returned SGPRs the current part doesn't
- * declare on the input. */
- out_idx = num_out_sgpr;
- }
-
- assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
-
- if (param_size == 1)
- arg = out[out_idx];
- else
- arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);
-
- if (LLVMTypeOf(arg) != param_type) {
- if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
- if (LLVMGetPointerAddressSpace(param_type) ==
- AC_ADDR_SPACE_CONST_32BIT) {
- arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, "");
- arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
- } else {
- arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, "");
- arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
- }
- } else {
- arg = LLVMBuildBitCast(builder, arg, param_type, "");
- }
- }
-
- in[param_idx] = arg;
- out_idx += param_size;
- }
-
- ret = ac_build_call(&ctx->ac, parts[part], in, num_params);
-
- if (si_is_multi_part_shader(ctx->shader) &&
- part + 1 == next_shader_first_part) {
- ac_build_endif(&ctx->ac, 6506);
-
- /* The second half of the merged shader should use
- * the inputs from the toplevel (wrapper) function,
- * not the return value from the last call.
- *
- * That's because the last call was executed condi-
- * tionally, so we can't consume it in the main
- * block.
- */
- memcpy(out, initial, sizeof(initial));
- num_out = initial_num_out;
- num_out_sgpr = initial_num_out_sgpr;
- continue;
- }
-
- /* Extract the returned GPRs. */
- ret_type = LLVMTypeOf(ret);
- num_out = 0;
- num_out_sgpr = 0;
-
- if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
- assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
-
- unsigned ret_size = LLVMCountStructElementTypes(ret_type);
-
- for (unsigned i = 0; i < ret_size; ++i) {
- LLVMValueRef val =
- LLVMBuildExtractValue(builder, ret, i, "");
-
- assert(num_out < ARRAY_SIZE(out));
- out[num_out++] = val;
-
- if (LLVMTypeOf(val) == ctx->ac.i32) {
- assert(num_out_sgpr + 1 == num_out);
- num_out_sgpr = num_out;
- }
- }
- }
- }
-
- /* Return the value from the last part. */
- if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
- LLVMBuildRetVoid(builder);
- else
- LLVMBuildRet(builder, ret);
+ LLVMBuilderRef builder = ctx->ac.builder;
+ /* PS epilog has one arg per color component; gfx9 merged shader
+ * prologs need to forward 40 SGPRs.
+ */
+ LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS];
+ LLVMTypeRef function_type;
+ unsigned num_first_params;
+ unsigned num_out, initial_num_out;
+ ASSERTED unsigned num_out_sgpr; /* used in debug checks */
+ ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */
+ unsigned num_sgprs, num_vgprs;
+ unsigned gprs;
+
+ memset(&ctx->args, 0, sizeof(ctx->args));
+
+ for (unsigned i = 0; i < num_parts; ++i) {
+ ac_add_function_attr(ctx->ac.context, parts[i], -1, AC_FUNC_ATTR_ALWAYSINLINE);
+ LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
+ }
+
+ /* The parameters of the wrapper function correspond to those of the
+ * first part in terms of SGPRs and VGPRs, but we use the types of the
+ * main part to get the right types. This is relevant for the
+ * dereferenceable attribute on descriptor table pointers.
+ */
+ num_sgprs = 0;
+ num_vgprs = 0;
+
+ function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
+ num_first_params = LLVMCountParamTypes(function_type);
+
+ for (unsigned i = 0; i < num_first_params; ++i) {
+ LLVMValueRef param = LLVMGetParam(parts[0], i);
+
+ if (ac_is_sgpr_param(param)) {
+ assert(num_vgprs == 0);
+ num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
+ } else {
+ num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
+ }
+ }
+
+ gprs = 0;
+ while (gprs < num_sgprs + num_vgprs) {
+ LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count);
+ LLVMTypeRef type = LLVMTypeOf(param);
+ unsigned size = ac_get_type_size(type) / 4;
+
+ /* This is going to get casted anyways, so we don't have to
+ * have the exact same type. But we do have to preserve the
+ * pointer-ness so that LLVM knows about it.
+ */
+ enum ac_arg_type arg_type = AC_ARG_INT;
+ if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
+ type = LLVMGetElementType(type);
+
+ if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
+ if (LLVMGetVectorSize(type) == 4)
+ arg_type = AC_ARG_CONST_DESC_PTR;
+ else if (LLVMGetVectorSize(type) == 8)
+ arg_type = AC_ARG_CONST_IMAGE_PTR;
+ else
+ assert(0);
+ } else if (type == ctx->ac.f32) {
+ arg_type = AC_ARG_CONST_FLOAT_PTR;
+ } else {
+ assert(0);
+ }
+ }
+
+ ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR, size, arg_type, NULL);
+
+ assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
+ assert(gprs + size <= num_sgprs + num_vgprs &&
+ (gprs >= num_sgprs || gprs + size <= num_sgprs));
+
+ gprs += size;
+ }
+
+ /* Prepare the return type. */
+ unsigned num_returns = 0;
+ LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type;
+
+ last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1]));
+ return_type = LLVMGetReturnType(last_func_type);
+
+ switch (LLVMGetTypeKind(return_type)) {
+ case LLVMStructTypeKind:
+ num_returns = LLVMCountStructElementTypes(return_type);
+ assert(num_returns <= ARRAY_SIZE(returns));
+ LLVMGetStructElementTypes(return_type, returns);
+ break;
+ case LLVMVoidTypeKind:
+ break;
+ default:
+ unreachable("unexpected type");
+ }
+
+ si_llvm_create_func(ctx, "wrapper", returns, num_returns,
+ si_get_max_workgroup_size(ctx->shader));
+
+ if (si_is_merged_shader(ctx->shader))
+ ac_init_exec_full_mask(&ctx->ac);
+
+ /* Record the arguments of the function as if they were an output of
+ * a previous part.
+ */
+ num_out = 0;
+ num_out_sgpr = 0;
+
+ for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
+ LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
+ LLVMTypeRef param_type = LLVMTypeOf(param);
+ LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32;
+ unsigned size = ac_get_type_size(param_type) / 4;
+
+ if (size == 1) {
+ if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
+ param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, "");
+ param_type = ctx->ac.i32;
+ }
+
+ if (param_type != out_type)
+ param = LLVMBuildBitCast(builder, param, out_type, "");
+ out[num_out++] = param;
+ } else {
+ LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
+
+ if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
+ param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, "");
+ param_type = ctx->ac.i64;
+ }
+
+ if (param_type != vector_type)
+ param = LLVMBuildBitCast(builder, param, vector_type, "");
+
+ for (unsigned j = 0; j < size; ++j)
+ out[num_out++] =
+ LLVMBuildExtractElement(builder, param, LLVMConstInt(ctx->ac.i32, j, 0), "");
+ }
+
+ if (ctx->args.args[i].file == AC_ARG_SGPR)
+ num_out_sgpr = num_out;
+ }
+
+ memcpy(initial, out, sizeof(out));
+ initial_num_out = num_out;
+ initial_num_out_sgpr = num_out_sgpr;
+
+ /* Now chain the parts. */
+ LLVMValueRef ret = NULL;
+ for (unsigned part = 0; part < num_parts; ++part) {
+ LLVMValueRef in[AC_MAX_ARGS];
+ LLVMTypeRef ret_type;
+ unsigned out_idx = 0;
+ unsigned num_params = LLVMCountParams(parts[part]);
+
+ /* Merged shaders are executed conditionally depending
+ * on the number of enabled threads passed in the input SGPRs. */
+ if (si_is_multi_part_shader(ctx->shader) && part == 0) {
+ LLVMValueRef ena, count = initial[3];
+
+ count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
+ ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
+ ac_build_ifcc(&ctx->ac, ena, 6506);
+ }
+
+ /* Derive arguments for the next part from outputs of the
+ * previous one.
+ */
+ for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
+ LLVMValueRef param;
+ LLVMTypeRef param_type;
+ bool is_sgpr;
+ unsigned param_size;
+ LLVMValueRef arg = NULL;
+
+ param = LLVMGetParam(parts[part], param_idx);
+ param_type = LLVMTypeOf(param);
+ param_size = ac_get_type_size(param_type) / 4;
+ is_sgpr = ac_is_sgpr_param(param);
+
+ if (is_sgpr) {
+ ac_add_function_attr(ctx->ac.context, parts[part], param_idx + 1, AC_FUNC_ATTR_INREG);
+ } else if (out_idx < num_out_sgpr) {
+ /* Skip returned SGPRs the current part doesn't
+ * declare on the input. */
+ out_idx = num_out_sgpr;
+ }
+
+ assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
+
+ if (param_size == 1)
+ arg = out[out_idx];
+ else
+ arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);
+
+ if (LLVMTypeOf(arg) != param_type) {
+ if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
+ if (LLVMGetPointerAddressSpace(param_type) == AC_ADDR_SPACE_CONST_32BIT) {
+ arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, "");
+ arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
+ } else {
+ arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, "");
+ arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
+ }
+ } else {
+ arg = LLVMBuildBitCast(builder, arg, param_type, "");
+ }
+ }
+
+ in[param_idx] = arg;
+ out_idx += param_size;
+ }
+
+ ret = ac_build_call(&ctx->ac, parts[part], in, num_params);
+
+ if (si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) {
+ ac_build_endif(&ctx->ac, 6506);
+
+ /* The second half of the merged shader should use
+ * the inputs from the toplevel (wrapper) function,
+ * not the return value from the last call.
+ *
+ * That's because the last call was executed condi-
+ * tionally, so we can't consume it in the main
+ * block.
+ */
+ memcpy(out, initial, sizeof(initial));
+ num_out = initial_num_out;
+ num_out_sgpr = initial_num_out_sgpr;
+ continue;
+ }
+
+ /* Extract the returned GPRs. */
+ ret_type = LLVMTypeOf(ret);
+ num_out = 0;
+ num_out_sgpr = 0;
+
+ if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
+ assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
+
+ unsigned ret_size = LLVMCountStructElementTypes(ret_type);
+
+ for (unsigned i = 0; i < ret_size; ++i) {
+ LLVMValueRef val = LLVMBuildExtractValue(builder, ret, i, "");
+
+ assert(num_out < ARRAY_SIZE(out));
+ out[num_out++] = val;
+
+ if (LLVMTypeOf(val) == ctx->ac.i32) {
+ assert(num_out_sgpr + 1 == num_out);
+ num_out_sgpr = num_out;
+ }
+ }
+ }
+ }
+
+ /* Return the value from the last part. */
+ if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
+ LLVMBuildRetVoid(builder);
+ else
+ LLVMBuildRet(builder, ret);
}
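
A CPU-side model of the part-chaining loop in si_build_wrapper_function(), not the LLVM IR construction itself: outputs of one part feed the next, the first half of a merged shader runs only for threads below the count in the low 7 bits of an input SGPR, and at the merge boundary the wrapper restores its own inputs instead of consuming the conditional return value. The toy part functions and GPR values are assumptions for illustration only.

#include <stdio.h>
#include <string.h>

#define MAX_GPRS 16

struct part_result {
   int values[MAX_GPRS];
   unsigned count;
};

typedef struct part_result (*shader_part)(const int *in, unsigned num_in);

/* Two toy "parts": the first transforms its inputs, the second consumes
 * whatever values are live at its entry. */
static struct part_result part_a(const int *in, unsigned num_in)
{
   struct part_result r = {.count = num_in};
   for (unsigned i = 0; i < num_in; i++)
      r.values[i] = in[i] + 100; /* pretend to do work */
   return r;
}

static struct part_result part_b(const int *in, unsigned num_in)
{
   struct part_result r = {.count = 1};
   r.values[0] = 0;
   for (unsigned i = 0; i < num_in; i++)
      r.values[0] += in[i];
   return r;
}

int main(void)
{
   shader_part parts[] = {part_a, part_b};
   const unsigned num_parts = 2;
   const unsigned next_shader_first_part = 1; /* part_b starts the 2nd half */

   /* Wrapper inputs play the role of "outputs of a previous part". */
   int initial[MAX_GPRS] = {1, 2, 3, 0x45 /* wave info: count in bits [0:6] */};
   unsigned initial_num = 4;

   int out[MAX_GPRS];
   unsigned num_out = initial_num;
   memcpy(out, initial, sizeof(out));

   unsigned thread_id = 70; /* pretend lane id */

   for (unsigned part = 0; part < num_parts; part++) {
      struct part_result ret = {0};
      int executed = 1;

      /* The first half runs only for enabled threads. */
      if (part == 0) {
         unsigned count = initial[3] & 0x7f;
         executed = thread_id < count;
      }

      if (executed)
         ret = parts[part](out, num_out);

      if (part + 1 == next_shader_first_part) {
         /* Boundary: the call above was conditional, so restore the
          * wrapper's own inputs instead of consuming its return value. */
         memcpy(out, initial, sizeof(initial));
         num_out = initial_num;
         continue;
      }

      memcpy(out, ret.values, sizeof(out));
      num_out = ret.count;
   }

   printf("final value: %d\n", out[0]);
   return 0;
}
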
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_shader_internal.h"
#include "si_pipe.h"
+#include "si_shader_internal.h"
#include "sid.h"
#include "util/u_memory.h"
LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
{
- /* Return true if the current thread should execute an ES thread. */
- return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
- ac_get_thread_id(&ctx->ac),
- si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
+ /* Return true if the current thread should execute an ES thread. */
+ return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
+ si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
}
LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
{
- /* Return true if the current thread should execute a GS thread. */
- return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
- ac_get_thread_id(&ctx->ac),
- si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
+ /* Return true if the current thread should execute a GS thread. */
+ return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
+ si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
}
-static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
- unsigned input_index,
- unsigned vtx_offset_param,
- LLVMTypeRef type,
- unsigned swizzle)
+static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned input_index,
+ unsigned vtx_offset_param, LLVMTypeRef type,
+ unsigned swizzle)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader *shader = ctx->shader;
- LLVMValueRef vtx_offset, soffset;
- struct si_shader_info *info = &shader->selector->info;
- unsigned semantic_name = info->input_semantic_name[input_index];
- unsigned semantic_index = info->input_semantic_index[input_index];
- unsigned param;
- LLVMValueRef value;
-
- param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);
-
- /* GFX9 has the ESGS ring in LDS. */
- if (ctx->screen->info.chip_class >= GFX9) {
- unsigned index = vtx_offset_param;
-
- switch (index / 2) {
- case 0:
- vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset,
- index % 2 ? 16 : 0, 16);
- break;
- case 1:
- vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset,
- index % 2 ? 16 : 0, 16);
- break;
- case 2:
- vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset,
- index % 2 ? 16 : 0, 16);
- break;
- default:
- assert(0);
- return NULL;
- }
-
- unsigned offset = param * 4 + swizzle;
- vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
- LLVMConstInt(ctx->ac.i32, offset, false), "");
-
- LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
- LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
- if (ac_get_type_size(type) == 8) {
- ptr = LLVMBuildGEP(ctx->ac.builder, ptr,
- &ctx->ac.i32_1, 1, "");
- LLVMValueRef values[2] = {
- value,
- LLVMBuildLoad(ctx->ac.builder, ptr, "")
- };
- value = ac_build_gather_values(&ctx->ac, values, 2);
- }
- return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
- }
-
- /* GFX6: input load from the ESGS ring in memory. */
- if (swizzle == ~0) {
- LLVMValueRef values[4];
- unsigned chan;
- for (chan = 0; chan < 4; chan++) {
- values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
- type, chan);
- }
- return ac_build_gather_values(&ctx->ac, values, 4);
- }
-
- /* Get the vertex offset parameter on GFX6. */
- LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac,
- ctx->gs_vtx_offset[vtx_offset_param]);
-
- vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset,
- LLVMConstInt(ctx->ac.i32, 4, 0), "");
-
- soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);
-
- value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0,
- vtx_offset, soffset, 0, ac_glc, true, false);
- if (ac_get_type_size(type) == 8) {
- LLVMValueRef value2;
- soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0);
-
- value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
- ctx->ac.i32_0, vtx_offset, soffset,
- 0, ac_glc, true, false);
- return si_build_gather_64bit(ctx, type, value, value2);
- }
- return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader *shader = ctx->shader;
+ LLVMValueRef vtx_offset, soffset;
+ struct si_shader_info *info = &shader->selector->info;
+ unsigned semantic_name = info->input_semantic_name[input_index];
+ unsigned semantic_index = info->input_semantic_index[input_index];
+ unsigned param;
+ LLVMValueRef value;
+
+ param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);
+
+ /* GFX9 has the ESGS ring in LDS. */
+ if (ctx->screen->info.chip_class >= GFX9) {
+ unsigned index = vtx_offset_param;
+
+ switch (index / 2) {
+ case 0:
+ vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, index % 2 ? 16 : 0, 16);
+ break;
+ case 1:
+ vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, index % 2 ? 16 : 0, 16);
+ break;
+ case 2:
+ vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, index % 2 ? 16 : 0, 16);
+ break;
+ default:
+ assert(0);
+ return NULL;
+ }
+
+ unsigned offset = param * 4 + swizzle;
+ vtx_offset =
+ LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), "");
+
+ LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
+ LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
+ if (ac_get_type_size(type) == 8) {
+ ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &ctx->ac.i32_1, 1, "");
+ LLVMValueRef values[2] = {value, LLVMBuildLoad(ctx->ac.builder, ptr, "")};
+ value = ac_build_gather_values(&ctx->ac, values, 2);
+ }
+ return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
+ }
+
+ /* GFX6: input load from the ESGS ring in memory. */
+ if (swizzle == ~0) {
+ LLVMValueRef values[4];
+ unsigned chan;
+ for (chan = 0; chan < 4; chan++) {
+ values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, type, chan);
+ }
+ return ac_build_gather_values(&ctx->ac, values, 4);
+ }
+
+ /* Get the vertex offset parameter on GFX6. */
+ LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->gs_vtx_offset[vtx_offset_param]);
+
+ vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
+
+ soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);
+
+ value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0,
+ ac_glc, true, false);
+ if (ac_get_type_size(type) == 8) {
+ LLVMValueRef value2;
+ soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0);
+
+ value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset,
+ 0, ac_glc, true, false);
+ return si_build_gather_64bit(ctx, type, value, value2);
+ }
+ return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
}
-static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
- unsigned location,
- unsigned driver_location,
- unsigned component,
- unsigned num_components,
- unsigned vertex_index,
- unsigned const_index,
- LLVMTypeRef type)
+static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, unsigned location,
+ unsigned driver_location, unsigned component,
+ unsigned num_components, unsigned vertex_index,
+ unsigned const_index, LLVMTypeRef type)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- LLVMValueRef value[4];
- for (unsigned i = 0; i < num_components; i++) {
- unsigned offset = i;
- if (ac_get_type_size(type) == 8)
- offset *= 2;
+ LLVMValueRef value[4];
+ for (unsigned i = 0; i < num_components; i++) {
+ unsigned offset = i;
+ if (ac_get_type_size(type) == 8)
+ offset *= 2;
- offset += component;
- value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index,
- vertex_index, type, offset);
- }
+ offset += component;
+ value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index,
+ vertex_index, type, offset);
+ }
- return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
+ return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
}
/* Pass GS inputs from ES to GS on GFX9. */
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
- LLVMValueRef ret = ctx->return_value;
-
- ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
- ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
- if (ctx->shader->key.as_ngg)
- ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
- else
- ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
- ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
- ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
-
- ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
- 8 + SI_SGPR_RW_BUFFERS);
- ret = si_insert_input_ptr(ctx, ret,
- ctx->bindless_samplers_and_images,
- 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
- if (ctx->screen->use_ngg) {
- ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits,
- 8 + SI_SGPR_VS_STATE_BITS);
- }
-
- unsigned vgpr;
- if (ctx->type == PIPE_SHADER_VERTEX)
- vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
- else
- vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
-
- ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
- ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
- ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
- ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
- ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
- ctx->return_value = ret;
+ LLVMValueRef ret = ctx->return_value;
+
+ ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
+ ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
+ if (ctx->shader->key.as_ngg)
+ ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
+ else
+ ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
+ ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
+ ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
+
+ ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
+ ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
+ 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
+ if (ctx->screen->use_ngg) {
+ ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
+ }
+
+ unsigned vgpr;
+ if (ctx->type == PIPE_SHADER_VERTEX)
+ vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
+ else
+ vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
+
+ ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
+ ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
+ ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
+ ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
+ ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
+ ctx->return_value = ret;
}
-void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
- LLVMValueRef *addrs)
+void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader *es = ctx->shader;
- struct si_shader_info *info = &es->selector->info;
- LLVMValueRef lds_base = NULL;
- unsigned chan;
- int i;
-
- if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
- unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
- LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
- LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
- vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
- LLVMBuildMul(ctx->ac.builder, wave_idx,
- LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""), "");
- lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
- LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
- }
-
- for (i = 0; i < info->num_outputs; i++) {
- int param;
-
- if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
- info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
- continue;
-
- param = si_shader_io_get_unique_index(info->output_semantic_name[i],
- info->output_semantic_index[i], false);
-
- for (chan = 0; chan < 4; chan++) {
- if (!(info->output_usagemask[i] & (1 << chan)))
- continue;
-
- LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
- out_val = ac_to_integer(&ctx->ac, out_val);
-
- /* GFX9 has the ESGS ring in LDS. */
- if (ctx->screen->info.chip_class >= GFX9) {
- LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
- idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
- ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
- continue;
- }
-
- ac_build_buffer_store_dword(&ctx->ac,
- ctx->esgs_ring,
- out_val, 1, NULL,
- ac_get_arg(&ctx->ac, ctx->es2gs_offset),
- (4 * param + chan) * 4,
- ac_glc | ac_slc | ac_swizzled);
- }
- }
-
- if (ctx->screen->info.chip_class >= GFX9)
- si_set_es_return_value_for_gs(ctx);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader *es = ctx->shader;
+ struct si_shader_info *info = &es->selector->info;
+ LLVMValueRef lds_base = NULL;
+ unsigned chan;
+ int i;
+
+ if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
+ unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
+ LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
+ LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
+ vertex_idx =
+ LLVMBuildOr(ctx->ac.builder, vertex_idx,
+ LLVMBuildMul(ctx->ac.builder, wave_idx,
+ LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""),
+ "");
+ lds_base =
+ LLVMBuildMul(ctx->ac.builder, vertex_idx, LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
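+ /* Each ES vertex owns esgs_itemsize/4 dwords of LDS; individual outputs
+ * are stored at lds_base + param * 4 + chan in the loop below. */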
+ }
+
+ for (i = 0; i < info->num_outputs; i++) {
+ int param;
+
+ if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
+ info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
+ continue;
+
+ param = si_shader_io_get_unique_index(info->output_semantic_name[i],
+ info->output_semantic_index[i], false);
+
+ for (chan = 0; chan < 4; chan++) {
+ if (!(info->output_usagemask[i] & (1 << chan)))
+ continue;
+
+ LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
+ out_val = ac_to_integer(&ctx->ac, out_val);
+
+ /* GFX9 has the ESGS ring in LDS. */
+ if (ctx->screen->info.chip_class >= GFX9) {
+ LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
+ idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
+ ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
+ continue;
+ }
+
+ ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL,
+ ac_get_arg(&ctx->ac, ctx->es2gs_offset),
+ (4 * param + chan) * 4, ac_glc | ac_slc | ac_swizzled);
+ }
+ }
+
+ if (ctx->screen->info.chip_class >= GFX9)
+ si_set_es_return_value_for_gs(ctx);
}
static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
{
- if (ctx->screen->info.chip_class >= GFX9)
- return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
- else
- return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
+ if (ctx->screen->info.chip_class >= GFX9)
+ return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
+ else
+ return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
}
static void emit_gs_epilogue(struct si_shader_context *ctx)
{
- if (ctx->shader->key.as_ngg) {
- gfx10_ngg_gs_emit_epilogue(ctx);
- return;
- }
+ if (ctx->shader->key.as_ngg) {
+ gfx10_ngg_gs_emit_epilogue(ctx);
+ return;
+ }
- if (ctx->screen->info.chip_class >= GFX10)
- LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
+ if (ctx->screen->info.chip_class >= GFX10)
+ LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
- ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
- si_get_gs_wave_id(ctx));
+ ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx));
- if (ctx->screen->info.chip_class >= GFX9)
- ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+ if (ctx->screen->info.chip_class >= GFX9)
+ ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
}
-static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
- unsigned max_outputs,
- LLVMValueRef *addrs)
+static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+ LLVMValueRef *addrs)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader_info UNUSED *info = &ctx->shader->selector->info;
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader_info UNUSED *info = &ctx->shader->selector->info;
- assert(info->num_outputs <= max_outputs);
+ assert(info->num_outputs <= max_outputs);
- emit_gs_epilogue(ctx);
+ emit_gs_epilogue(ctx);
}
/* Emit one vertex from the geometry shader */
-static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
- unsigned stream,
- LLVMValueRef *addrs)
+static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-
- if (ctx->shader->key.as_ngg) {
- gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
- return;
- }
-
- struct si_shader_info *info = &ctx->shader->selector->info;
- struct si_shader *shader = ctx->shader;
- LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
- LLVMValueRef gs_next_vertex;
- LLVMValueRef can_emit;
- unsigned chan, offset;
- int i;
-
- /* Write vertex attribute values to GSVS ring */
- gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
- ctx->gs_next_vertex[stream],
- "");
-
- /* If this thread has already emitted the declared maximum number of
- * vertices, skip the write: excessive vertex emissions are not
- * supposed to have any effect.
- *
- * If the shader has no writes to memory, kill it instead. This skips
- * further memory loads and may allow LLVM to skip to the end
- * altogether.
- */
- can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
- LLVMConstInt(ctx->ac.i32,
- shader->selector->gs_max_out_vertices, 0), "");
-
- bool use_kill = !info->writes_memory;
- if (use_kill) {
- ac_build_kill_if_false(&ctx->ac, can_emit);
- } else {
- ac_build_ifcc(&ctx->ac, can_emit, 6505);
- }
-
- offset = 0;
- for (i = 0; i < info->num_outputs; i++) {
- for (chan = 0; chan < 4; chan++) {
- if (!(info->output_usagemask[i] & (1 << chan)) ||
- ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
- continue;
-
- LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
- LLVMValueRef voffset =
- LLVMConstInt(ctx->ac.i32, offset *
- shader->selector->gs_max_out_vertices, 0);
- offset++;
-
- voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
- voffset = LLVMBuildMul(ctx->ac.builder, voffset,
- LLVMConstInt(ctx->ac.i32, 4, 0), "");
-
- out_val = ac_to_integer(&ctx->ac, out_val);
-
- ac_build_buffer_store_dword(&ctx->ac,
- ctx->gsvs_ring[stream],
- out_val, 1,
- voffset, soffset, 0,
- ac_glc | ac_slc | ac_swizzled);
- }
- }
-
- gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
- LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
-
- /* Signal vertex emission if vertex data was written. */
- if (offset) {
- ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
- si_get_gs_wave_id(ctx));
- }
-
- if (!use_kill)
- ac_build_endif(&ctx->ac, 6505);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+
+ if (ctx->shader->key.as_ngg) {
+ gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
+ return;
+ }
+
+ struct si_shader_info *info = &ctx->shader->selector->info;
+ struct si_shader *shader = ctx->shader;
+ LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
+ LLVMValueRef gs_next_vertex;
+ LLVMValueRef can_emit;
+ unsigned chan, offset;
+ int i;
+
+ /* Write vertex attribute values to GSVS ring */
+ gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], "");
+
+ /* If this thread has already emitted the declared maximum number of
+ * vertices, skip the write: excessive vertex emissions are not
+ * supposed to have any effect.
+ *
+ * If the shader has no writes to memory, kill it instead. This skips
+ * further memory loads and may allow LLVM to skip to the end
+ * altogether.
+ */
+ can_emit =
+ LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
+ LLVMConstInt(ctx->ac.i32, shader->selector->gs_max_out_vertices, 0), "");
+
+ bool use_kill = !info->writes_memory;
+ if (use_kill) {
+ ac_build_kill_if_false(&ctx->ac, can_emit);
+ } else {
+ ac_build_ifcc(&ctx->ac, can_emit, 6505);
+ }
+
+ offset = 0;
+ for (i = 0; i < info->num_outputs; i++) {
+ for (chan = 0; chan < 4; chan++) {
+ if (!(info->output_usagemask[i] & (1 << chan)) ||
+ ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
+ continue;
+
+ LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
+ LLVMValueRef voffset =
+ LLVMConstInt(ctx->ac.i32, offset * shader->selector->gs_max_out_vertices, 0);
+ offset++;
+
+ voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
+ voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
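+ /* voffset is the dword index (offset * gs_max_out_vertices + gs_next_vertex)
+ * within this thread's slice, times 4 to get a byte offset. */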
+
+ out_val = ac_to_integer(&ctx->ac, out_val);
+
+ ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, soffset,
+ 0, ac_glc | ac_slc | ac_swizzled);
+ }
+ }
+
+ gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
+ LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
+
+ /* Signal vertex emission if vertex data was written. */
+ if (offset) {
+ ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
+ si_get_gs_wave_id(ctx));
+ }
+
+ if (!use_kill)
+ ac_build_endif(&ctx->ac, 6505);
}
/* Cut one primitive from the geometry shader */
-static void si_llvm_emit_primitive(struct ac_shader_abi *abi,
- unsigned stream)
+static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- if (ctx->shader->key.as_ngg) {
- LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
- return;
- }
+ if (ctx->shader->key.as_ngg) {
+ LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
+ return;
+ }
- /* Signal primitive cut */
- ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
- si_get_gs_wave_id(ctx));
+ /* Signal primitive cut */
+ ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
+ si_get_gs_wave_id(ctx));
}
void si_preload_esgs_ring(struct si_shader_context *ctx)
{
- if (ctx->screen->info.chip_class <= GFX8) {
- unsigned ring =
- ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
- : SI_ES_RING_ESGS;
- LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
- LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-
- ctx->esgs_ring =
- ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
- } else {
- if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
- /* Declare the ESGS ring as an explicit LDS symbol. */
- si_llvm_declare_esgs_ring(ctx);
- } else {
- ac_declare_lds_as_pointer(&ctx->ac);
- ctx->esgs_ring = ctx->ac.lds;
- }
- }
+ if (ctx->screen->info.chip_class <= GFX8) {
+ unsigned ring = ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS : SI_ES_RING_ESGS;
+ LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
+ LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+
+ ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
+ } else {
+ if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
+ /* Declare the ESGS ring as an explicit LDS symbol. */
+ si_llvm_declare_esgs_ring(ctx);
+ } else {
+ ac_declare_lds_as_pointer(&ctx->ac);
+ ctx->esgs_ring = ctx->ac.lds;
+ }
+ }
}
void si_preload_gs_rings(struct si_shader_context *ctx)
{
- const struct si_shader_selector *sel = ctx->shader->selector;
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
- LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
- LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
-
- /* The conceptual layout of the GSVS ring is
- * v0c0 .. vLv0 v0c1 .. vLc1 ..
- * but the real memory layout is swizzled across
- * threads:
- * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
- * t16v0c0 ..
- * Override the buffer descriptor accordingly.
- */
- LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
- uint64_t stream_offset = 0;
-
- for (unsigned stream = 0; stream < 4; ++stream) {
- unsigned num_components;
- unsigned stride;
- unsigned num_records;
- LLVMValueRef ring, tmp;
-
- num_components = sel->info.num_stream_output_components[stream];
- if (!num_components)
- continue;
-
- stride = 4 * num_components * sel->gs_max_out_vertices;
-
- /* Limit on the stride field for <= GFX7. */
- assert(stride < (1 << 14));
-
- num_records = ctx->ac.wave_size;
-
- ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
- tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
- tmp = LLVMBuildAdd(builder, tmp,
- LLVMConstInt(ctx->ac.i64,
- stream_offset, 0), "");
- stream_offset += stride * ctx->ac.wave_size;
-
- ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
- ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
- tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
- tmp = LLVMBuildOr(builder, tmp,
- LLVMConstInt(ctx->ac.i32,
- S_008F04_STRIDE(stride) |
- S_008F04_SWIZZLE_ENABLE(1), 0), "");
- ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
- ring = LLVMBuildInsertElement(builder, ring,
- LLVMConstInt(ctx->ac.i32, num_records, 0),
- LLVMConstInt(ctx->ac.i32, 2, 0), "");
-
- uint32_t rsrc3 =
- S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
- S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
- S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
- S_008F0C_ADD_TID_ENABLE(1);
-
- if (ctx->ac.chip_class >= GFX10) {
- rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) |
- S_008F0C_RESOURCE_LEVEL(1);
- } else {
- rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
- S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
- }
-
- ring = LLVMBuildInsertElement(builder, ring,
- LLVMConstInt(ctx->ac.i32, rsrc3, false),
- LLVMConstInt(ctx->ac.i32, 3, 0), "");
-
- ctx->gsvs_ring[stream] = ring;
- }
+ const struct si_shader_selector *sel = ctx->shader->selector;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
+ LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+ LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
+
+ /* The conceptual layout of the GSVS ring is
+ * v0c0 .. vLc0 v0c1 .. vLc1 ..
+ * but the real memory layout is swizzled across
+ * threads:
+ * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
+ * t16v0c0 ..
+ * Override the buffer descriptor accordingly.
+ */
+ LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
+ uint64_t stream_offset = 0;
+
+ for (unsigned stream = 0; stream < 4; ++stream) {
+ unsigned num_components;
+ unsigned stride;
+ unsigned num_records;
+ LLVMValueRef ring, tmp;
+
+ num_components = sel->info.num_stream_output_components[stream];
+ if (!num_components)
+ continue;
+
+ stride = 4 * num_components * sel->gs_max_out_vertices;
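+ /* e.g. 2 components and 16 max output vertices (illustrative numbers)
+ * give a stride of 4 * 2 * 16 = 128 bytes. */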
+
+ /* Limit on the stride field for <= GFX7. */
+ assert(stride < (1 << 14));
+
+ num_records = ctx->ac.wave_size;
+
+ ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
+ tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
+ tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
+ stream_offset += stride * ctx->ac.wave_size;
+
+ ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
+ ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
+ tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
+ tmp = LLVMBuildOr(
+ builder, tmp,
+ LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), "");
+ ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
+ ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, num_records, 0),
+ LLVMConstInt(ctx->ac.i32, 2, 0), "");
+
+ uint32_t rsrc3 =
+ S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+ S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
+ S_008F0C_ADD_TID_ENABLE(1);
+
+ if (ctx->ac.chip_class >= GFX10) {
+ rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
+ } else {
+ rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+ S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
+ }
+
+ ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, rsrc3, false),
+ LLVMConstInt(ctx->ac.i32, 3, 0), "");
+
+ ctx->gsvs_ring[stream] = ring;
+ }
}
/* Generate code for the hardware VS shader stage to go with a geometry shader */
-struct si_shader *
-si_generate_gs_copy_shader(struct si_screen *sscreen,
- struct ac_llvm_compiler *compiler,
- struct si_shader_selector *gs_selector,
- struct pipe_debug_callback *debug)
+struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
+ struct ac_llvm_compiler *compiler,
+ struct si_shader_selector *gs_selector,
+ struct pipe_debug_callback *debug)
{
- struct si_shader_context ctx;
- struct si_shader *shader;
- LLVMBuilderRef builder;
- struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
- struct si_shader_info *gsinfo = &gs_selector->info;
- int i;
-
-
- shader = CALLOC_STRUCT(si_shader);
- if (!shader)
- return NULL;
-
- /* We can leave the fence as permanently signaled because the GS copy
- * shader only becomes visible globally after it has been compiled. */
- util_queue_fence_init(&shader->ready);
-
- shader->selector = gs_selector;
- shader->is_gs_copy_shader = true;
+ struct si_shader_context ctx;
+ struct si_shader *shader;
+ LLVMBuilderRef builder;
+ struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
+ struct si_shader_info *gsinfo = &gs_selector->info;
+ int i;
+
+ shader = CALLOC_STRUCT(si_shader);
+ if (!shader)
+ return NULL;
+
+ /* We can leave the fence as permanently signaled because the GS copy
+ * shader only becomes visible globally after it has been compiled. */
+ util_queue_fence_init(&shader->ready);
+
+ shader->selector = gs_selector;
+ shader->is_gs_copy_shader = true;
+
+ si_llvm_context_init(&ctx, sscreen, compiler,
+ si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false, false));
+ ctx.shader = shader;
+ ctx.type = PIPE_SHADER_VERTEX;
+
+ builder = ctx.ac.builder;
+
+ si_create_function(&ctx, false);
+
+ LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers);
+ ctx.gsvs_ring[0] =
+ ac_build_load_to_sgpr(&ctx.ac, buf_ptr, LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));
+
+ LLVMValueRef voffset =
+ LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, LLVMConstInt(ctx.ac.i32, 4, 0), "");
+
+ /* Fetch the vertex stream ID. */
+ LLVMValueRef stream_id;
+
+ if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
+ stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
+ else
+ stream_id = ctx.ac.i32_0;
+
+ /* Fill in output information. */
+ for (i = 0; i < gsinfo->num_outputs; ++i) {
+ outputs[i].semantic_name = gsinfo->output_semantic_name[i];
+ outputs[i].semantic_index = gsinfo->output_semantic_index[i];
+
+ for (int chan = 0; chan < 4; chan++) {
+ outputs[i].vertex_stream[chan] = (gsinfo->output_streams[i] >> (2 * chan)) & 3;
+ }
+ }
+
+ LLVMBasicBlockRef end_bb;
+ LLVMValueRef switch_inst;
+
+ end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
+ switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
+
+ for (int stream = 0; stream < 4; stream++) {
+ LLVMBasicBlockRef bb;
+ unsigned offset;
+
+ if (!gsinfo->num_stream_output_components[stream])
+ continue;
+
+ if (stream > 0 && !gs_selector->so.num_outputs)
+ continue;
+
+ bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
+ LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
+ LLVMPositionBuilderAtEnd(builder, bb);
+
+ /* Fetch vertex data from GSVS ring */
+ offset = 0;
+ for (i = 0; i < gsinfo->num_outputs; ++i) {
+ for (unsigned chan = 0; chan < 4; chan++) {
+ if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
+ outputs[i].vertex_stream[chan] != stream) {
+ outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
+ continue;
+ }
+
+ LLVMValueRef soffset =
+ LLVMConstInt(ctx.ac.i32, offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
+ offset++;
+
+ outputs[i].values[chan] =
+ ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, ctx.ac.i32_0, voffset, soffset, 0,
+ ac_glc | ac_slc, true, false);
+ }
+ }
+
+ /* Streamout and exports. */
+ if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
+ si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream);
+ }
+
+ if (stream == 0)
+ si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);
+
+ LLVMBuildBr(builder, end_bb);
+ }
+
+ LLVMPositionBuilderAtEnd(builder, end_bb);
+
+ LLVMBuildRetVoid(ctx.ac.builder);
+
+ ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
+ si_llvm_optimize_module(&ctx);
+
+ bool ok = false;
+ if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, &ctx.ac,
+ debug, PIPE_SHADER_GEOMETRY, "GS Copy Shader", false)) {
+ if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
+ fprintf(stderr, "GS Copy Shader:\n");
+ si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
+
+ if (!ctx.shader->config.scratch_bytes_per_wave)
+ ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
+ else
+ ok = true;
+ }
- si_llvm_context_init(&ctx, sscreen, compiler,
- si_get_wave_size(sscreen, PIPE_SHADER_VERTEX,
- false, false, false));
- ctx.shader = shader;
- ctx.type = PIPE_SHADER_VERTEX;
-
- builder = ctx.ac.builder;
-
- si_create_function(&ctx, false);
-
- LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers);
- ctx.gsvs_ring[0] = ac_build_load_to_sgpr(&ctx.ac, buf_ptr,
- LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));
-
- LLVMValueRef voffset =
- LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id,
- LLVMConstInt(ctx.ac.i32, 4, 0), "");
-
- /* Fetch the vertex stream ID.*/
- LLVMValueRef stream_id;
-
- if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
- stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
- else
- stream_id = ctx.ac.i32_0;
-
- /* Fill in output information. */
- for (i = 0; i < gsinfo->num_outputs; ++i) {
- outputs[i].semantic_name = gsinfo->output_semantic_name[i];
- outputs[i].semantic_index = gsinfo->output_semantic_index[i];
-
- for (int chan = 0; chan < 4; chan++) {
- outputs[i].vertex_stream[chan] =
- (gsinfo->output_streams[i] >> (2 * chan)) & 3;
- }
- }
-
- LLVMBasicBlockRef end_bb;
- LLVMValueRef switch_inst;
-
- end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
- switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
-
- for (int stream = 0; stream < 4; stream++) {
- LLVMBasicBlockRef bb;
- unsigned offset;
-
- if (!gsinfo->num_stream_output_components[stream])
- continue;
-
- if (stream > 0 && !gs_selector->so.num_outputs)
- continue;
-
- bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
- LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
- LLVMPositionBuilderAtEnd(builder, bb);
-
- /* Fetch vertex data from GSVS ring */
- offset = 0;
- for (i = 0; i < gsinfo->num_outputs; ++i) {
- for (unsigned chan = 0; chan < 4; chan++) {
- if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
- outputs[i].vertex_stream[chan] != stream) {
- outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
- continue;
- }
-
- LLVMValueRef soffset = LLVMConstInt(ctx.ac.i32,
- offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
- offset++;
-
- outputs[i].values[chan] =
- ac_build_buffer_load(&ctx.ac,
- ctx.gsvs_ring[0], 1,
- ctx.ac.i32_0, voffset,
- soffset, 0, ac_glc | ac_slc,
- true, false);
- }
- }
-
- /* Streamout and exports. */
- if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
- si_llvm_emit_streamout(&ctx, outputs,
- gsinfo->num_outputs,
- stream);
- }
-
- if (stream == 0)
- si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);
-
- LLVMBuildBr(builder, end_bb);
- }
-
- LLVMPositionBuilderAtEnd(builder, end_bb);
-
- LLVMBuildRetVoid(ctx.ac.builder);
-
- ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
- si_llvm_optimize_module(&ctx);
-
- bool ok = false;
- if (si_compile_llvm(sscreen, &ctx.shader->binary,
- &ctx.shader->config, ctx.compiler, &ctx.ac,
- debug, PIPE_SHADER_GEOMETRY,
- "GS Copy Shader", false)) {
- if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
- fprintf(stderr, "GS Copy Shader:\n");
- si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
-
- if (!ctx.shader->config.scratch_bytes_per_wave)
- ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
- else
- ok = true;
- }
-
- si_llvm_dispose(&ctx);
-
- if (!ok) {
- FREE(shader);
- shader = NULL;
- } else {
- si_fix_resource_usage(sscreen, shader);
- }
- return shader;
+ si_llvm_dispose(&ctx);
+
+ if (!ok) {
+ FREE(shader);
+ shader = NULL;
+ } else {
+ si_fix_resource_usage(sscreen, shader);
+ }
+ return shader;
}
/**
* Build the GS prolog function. Rotate the input vertices for triangle strips
* with adjacency.
*/
-void si_llvm_build_gs_prolog(struct si_shader_context *ctx,
- union si_shader_part_key *key)
+void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
- unsigned num_sgprs, num_vgprs;
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMTypeRef returns[AC_MAX_ARGS];
- LLVMValueRef func, ret;
-
- memset(&ctx->args, 0, sizeof(ctx->args));
-
- if (ctx->screen->info.chip_class >= GFX9) {
- if (key->gs_prolog.states.gfx9_prev_is_vs)
- num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
- else
- num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
- num_vgprs = 5; /* ES inputs are not needed by GS */
- } else {
- num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
- num_vgprs = 8;
- }
-
- for (unsigned i = 0; i < num_sgprs; ++i) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- returns[i] = ctx->ac.i32;
- }
-
- for (unsigned i = 0; i < num_vgprs; ++i) {
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
- returns[num_sgprs + i] = ctx->ac.f32;
- }
-
- /* Create the function. */
- si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
- func = ctx->main_fn;
-
- /* Set the full EXEC mask for the prolog, because we are only fiddling
- * with registers here. The main shader part will set the correct EXEC
- * mask.
- */
- if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
- ac_init_exec_full_mask(&ctx->ac);
-
- /* Copy inputs to outputs. This should be no-op, as the registers match,
- * but it will prevent the compiler from overwriting them unintentionally.
- */
- ret = ctx->return_value;
- for (unsigned i = 0; i < num_sgprs; i++) {
- LLVMValueRef p = LLVMGetParam(func, i);
- ret = LLVMBuildInsertValue(builder, ret, p, i, "");
- }
- for (unsigned i = 0; i < num_vgprs; i++) {
- LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
- p = ac_to_float(&ctx->ac, p);
- ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
- }
-
- if (key->gs_prolog.states.tri_strip_adj_fix) {
- /* Remap the input vertices for every other primitive. */
- const struct ac_arg gfx6_vtx_params[6] = {
- { .used = true, .arg_index = num_sgprs },
- { .used = true, .arg_index = num_sgprs + 1 },
- { .used = true, .arg_index = num_sgprs + 3 },
- { .used = true, .arg_index = num_sgprs + 4 },
- { .used = true, .arg_index = num_sgprs + 5 },
- { .used = true, .arg_index = num_sgprs + 6 },
- };
- const struct ac_arg gfx9_vtx_params[3] = {
- { .used = true, .arg_index = num_sgprs },
- { .used = true, .arg_index = num_sgprs + 1 },
- { .used = true, .arg_index = num_sgprs + 4 },
- };
- LLVMValueRef vtx_in[6], vtx_out[6];
- LLVMValueRef prim_id, rotate;
-
- if (ctx->screen->info.chip_class >= GFX9) {
- for (unsigned i = 0; i < 3; i++) {
- vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
- vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
- }
- } else {
- for (unsigned i = 0; i < 6; i++)
- vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
- }
-
- prim_id = LLVMGetParam(func, num_sgprs + 2);
- rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");
-
- for (unsigned i = 0; i < 6; ++i) {
- LLVMValueRef base, rotated;
- base = vtx_in[i];
- rotated = vtx_in[(i + 4) % 6];
- vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
- }
-
- if (ctx->screen->info.chip_class >= GFX9) {
- for (unsigned i = 0; i < 3; i++) {
- LLVMValueRef hi, out;
-
- hi = LLVMBuildShl(builder, vtx_out[i*2+1],
- LLVMConstInt(ctx->ac.i32, 16, 0), "");
- out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
- out = ac_to_float(&ctx->ac, out);
- ret = LLVMBuildInsertValue(builder, ret, out,
- gfx9_vtx_params[i].arg_index, "");
- }
- } else {
- for (unsigned i = 0; i < 6; i++) {
- LLVMValueRef out;
-
- out = ac_to_float(&ctx->ac, vtx_out[i]);
- ret = LLVMBuildInsertValue(builder, ret, out,
- gfx6_vtx_params[i].arg_index, "");
- }
- }
- }
-
- LLVMBuildRet(builder, ret);
+ unsigned num_sgprs, num_vgprs;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMTypeRef returns[AC_MAX_ARGS];
+ LLVMValueRef func, ret;
+
+ memset(&ctx->args, 0, sizeof(ctx->args));
+
+ if (ctx->screen->info.chip_class >= GFX9) {
+ if (key->gs_prolog.states.gfx9_prev_is_vs)
+ num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
+ else
+ num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
+ num_vgprs = 5; /* ES inputs are not needed by GS */
+ } else {
+ num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
+ num_vgprs = 8;
+ }
+
+ for (unsigned i = 0; i < num_sgprs; ++i) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ returns[i] = ctx->ac.i32;
+ }
+
+ for (unsigned i = 0; i < num_vgprs; ++i) {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+ returns[num_sgprs + i] = ctx->ac.f32;
+ }
+
+ /* Create the function. */
+ si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
+ func = ctx->main_fn;
+
+ /* Set the full EXEC mask for the prolog, because we are only fiddling
+ * with registers here. The main shader part will set the correct EXEC
+ * mask.
+ */
+ if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
+ ac_init_exec_full_mask(&ctx->ac);
+
+ /* Copy inputs to outputs. This should be no-op, as the registers match,
+ * but it will prevent the compiler from overwriting them unintentionally.
+ */
+ ret = ctx->return_value;
+ for (unsigned i = 0; i < num_sgprs; i++) {
+ LLVMValueRef p = LLVMGetParam(func, i);
+ ret = LLVMBuildInsertValue(builder, ret, p, i, "");
+ }
+ for (unsigned i = 0; i < num_vgprs; i++) {
+ LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
+ p = ac_to_float(&ctx->ac, p);
+ ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
+ }
+
+ if (key->gs_prolog.states.tri_strip_adj_fix) {
+ /* Remap the input vertices for every other primitive. */
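+ /* The select below picks the rotated set when the primitive ID is odd:
+ * (v0, v1, v2, v3, v4, v5) becomes (v4, v5, v0, v1, v2, v3). */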
+ const struct ac_arg gfx6_vtx_params[6] = {
+ {.used = true, .arg_index = num_sgprs}, {.used = true, .arg_index = num_sgprs + 1},
+ {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4},
+ {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6},
+ };
+ const struct ac_arg gfx9_vtx_params[3] = {
+ {.used = true, .arg_index = num_sgprs},
+ {.used = true, .arg_index = num_sgprs + 1},
+ {.used = true, .arg_index = num_sgprs + 4},
+ };
+ LLVMValueRef vtx_in[6], vtx_out[6];
+ LLVMValueRef prim_id, rotate;
+
+ if (ctx->screen->info.chip_class >= GFX9) {
+ for (unsigned i = 0; i < 3; i++) {
+ vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
+ vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
+ }
+ } else {
+ for (unsigned i = 0; i < 6; i++)
+ vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
+ }
+
+ prim_id = LLVMGetParam(func, num_sgprs + 2);
+ rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");
+
+ for (unsigned i = 0; i < 6; ++i) {
+ LLVMValueRef base, rotated;
+ base = vtx_in[i];
+ rotated = vtx_in[(i + 4) % 6];
+ vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
+ }
+
+ if (ctx->screen->info.chip_class >= GFX9) {
+ for (unsigned i = 0; i < 3; i++) {
+ LLVMValueRef hi, out;
+
+ hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), "");
+ out = LLVMBuildOr(builder, vtx_out[i * 2], hi, "");
+ out = ac_to_float(&ctx->ac, out);
+ ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, "");
+ }
+ } else {
+ for (unsigned i = 0; i < 6; i++) {
+ LLVMValueRef out;
+
+ out = ac_to_float(&ctx->ac, vtx_out[i]);
+ ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, "");
+ }
+ }
+ }
+
+ LLVMBuildRet(builder, ret);
}
void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
{
- ctx->abi.load_inputs = si_nir_load_input_gs;
- ctx->abi.emit_vertex = si_llvm_emit_vertex;
- ctx->abi.emit_primitive = si_llvm_emit_primitive;
- ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
+ ctx->abi.load_inputs = si_nir_load_input_gs;
+ ctx->abi.emit_vertex = si_llvm_emit_vertex;
+ ctx->abi.emit_primitive = si_llvm_emit_primitive;
+ ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
}
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_shader_internal.h"
#include "si_pipe.h"
+#include "si_shader_internal.h"
#include "sid.h"
LLVMValueRef si_get_sample_id(struct si_shader_context *ctx)
{
- return si_unpack_param(ctx, ctx->args.ancillary, 8, 4);
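+ /* The sample index lives in bits [11:8] of the ancillary VGPR,
+ * hence offset 8 and width 4. */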
+ return si_unpack_param(ctx, ctx->args.ancillary, 8, 4);
}
static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- return ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.sample_coverage));
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ return ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.sample_coverage));
}
static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- LLVMValueRef desc = ac_get_arg(&ctx->ac, ctx->rw_buffers);
- LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
- LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index);
-
- /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
- LLVMValueRef offset0 = LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->ac.i32, 8, 0), "");
- LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->ac.i32, 4, 0), "");
-
- LLVMValueRef pos[4] = {
- si_buffer_load_const(ctx, resource, offset0),
- si_buffer_load_const(ctx, resource, offset1),
- LLVMConstReal(ctx->ac.f32, 0),
- LLVMConstReal(ctx->ac.f32, 0)
- };
-
- return ac_build_gather_values(&ctx->ac, pos, 4);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ LLVMValueRef desc = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+ LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
+ LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index);
+
+ /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
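+ /* e.g. sample_id = 2 reads samplepos.x at byte 16 and samplepos.y at byte 20. */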
+ LLVMValueRef offset0 =
+ LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->ac.i32, 8, 0), "");
+ LLVMValueRef offset1 =
+ LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->ac.i32, 4, 0), "");
+
+ LLVMValueRef pos[4] = {si_buffer_load_const(ctx, resource, offset0),
+ si_buffer_load_const(ctx, resource, offset1),
+ LLVMConstReal(ctx->ac.f32, 0), LLVMConstReal(ctx->ac.f32, 0)};
+
+ return ac_build_gather_values(&ctx->ac, pos, 4);
}
static LLVMValueRef si_nir_emit_fbfetch(struct ac_shader_abi *abi)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct ac_image_args args = {};
- LLVMValueRef ptr, image, fmask;
-
- /* Ignore src0, because KHR_blend_func_extended disallows multiple render
- * targets.
- */
-
- /* Load the image descriptor. */
- STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
- ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
- ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr,
- ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
- image = ac_build_load_to_sgpr(&ctx->ac, ptr,
- LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0 / 2, 0));
-
- unsigned chan = 0;
-
- args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 0, 16);
-
- if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D)
- args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 16, 16);
-
- /* Get the current render target layer index. */
- if (ctx->shader->key.mono.u.ps.fbfetch_layered)
- args.coords[chan++] = si_unpack_param(ctx, ctx->args.ancillary, 16, 11);
-
- if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
- args.coords[chan++] = si_get_sample_id(ctx);
-
- if (ctx->shader->key.mono.u.ps.fbfetch_msaa &&
- !(ctx->screen->debug_flags & DBG(NO_FMASK))) {
- fmask = ac_build_load_to_sgpr(&ctx->ac, ptr,
- LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0));
-
- ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords,
- ctx->shader->key.mono.u.ps.fbfetch_layered);
- }
-
- args.opcode = ac_image_load;
- args.resource = image;
- args.dmask = 0xf;
- args.attributes = AC_FUNC_ATTR_READNONE;
-
- if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
- args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
- ac_image_2darraymsaa : ac_image_2dmsaa;
- else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D)
- args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
- ac_image_1darray : ac_image_1d;
- else
- args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
- ac_image_2darray : ac_image_2d;
-
- return ac_build_image_opcode(&ctx->ac, &args);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct ac_image_args args = {};
+ LLVMValueRef ptr, image, fmask;
+
+ /* Ignore src0, because KHR_blend_func_extended disallows multiple render
+ * targets.
+ */
+
+ /* Load the image descriptor. */
+ STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
+ ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+ ptr =
+ LLVMBuildPointerCast(ctx->ac.builder, ptr, ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
+ image =
+ ac_build_load_to_sgpr(&ctx->ac, ptr, LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0 / 2, 0));
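+ /* The pointer was recast from 4-dword to 8-dword (v8i32) elements above,
+ * so the slot index is halved, hence the /2 and the even-index STATIC_ASSERT. */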
+
+ unsigned chan = 0;
+
+ args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 0, 16);
+
+ if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D)
+ args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 16, 16);
+
+ /* Get the current render target layer index. */
+ if (ctx->shader->key.mono.u.ps.fbfetch_layered)
+ args.coords[chan++] = si_unpack_param(ctx, ctx->args.ancillary, 16, 11);
+
+ if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
+ args.coords[chan++] = si_get_sample_id(ctx);
+
+ if (ctx->shader->key.mono.u.ps.fbfetch_msaa && !(ctx->screen->debug_flags & DBG(NO_FMASK))) {
+ fmask = ac_build_load_to_sgpr(&ctx->ac, ptr,
+ LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0));
+
+ ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords,
+ ctx->shader->key.mono.u.ps.fbfetch_layered);
+ }
+
+ args.opcode = ac_image_load;
+ args.resource = image;
+ args.dmask = 0xf;
+ args.attributes = AC_FUNC_ATTR_READNONE;
+
+ if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
+ args.dim =
+ ctx->shader->key.mono.u.ps.fbfetch_layered ? ac_image_2darraymsaa : ac_image_2dmsaa;
+ else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D)
+ args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? ac_image_1darray : ac_image_1d;
+ else
+ args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? ac_image_2darray : ac_image_2d;
+
+ return ac_build_image_opcode(&ctx->ac, &args);
}
-static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
- unsigned attr_index, unsigned chan,
- LLVMValueRef prim_mask,
- LLVMValueRef i, LLVMValueRef j)
+static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, unsigned attr_index,
+ unsigned chan, LLVMValueRef prim_mask, LLVMValueRef i,
+ LLVMValueRef j)
{
- if (i || j) {
- return ac_build_fs_interp(&ctx->ac,
- LLVMConstInt(ctx->ac.i32, chan, 0),
- LLVMConstInt(ctx->ac.i32, attr_index, 0),
- prim_mask, i, j);
- }
- return ac_build_fs_interp_mov(&ctx->ac,
- LLVMConstInt(ctx->ac.i32, 2, 0), /* P0 */
- LLVMConstInt(ctx->ac.i32, chan, 0),
- LLVMConstInt(ctx->ac.i32, attr_index, 0),
- prim_mask);
+ if (i || j) {
+ return ac_build_fs_interp(&ctx->ac, LLVMConstInt(ctx->ac.i32, chan, 0),
+ LLVMConstInt(ctx->ac.i32, attr_index, 0), prim_mask, i, j);
+ }
+ return ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, 2, 0), /* P0 */
+ LLVMConstInt(ctx->ac.i32, chan, 0),
+ LLVMConstInt(ctx->ac.i32, attr_index, 0), prim_mask);
}
/**
* @param face SI_PARAM_FRONT_FACE
* @param result the return value (4 components)
*/
-static void interp_fs_color(struct si_shader_context *ctx,
- unsigned input_index,
- unsigned semantic_index,
- unsigned num_interp_inputs,
- unsigned colors_read_mask,
- LLVMValueRef interp_param,
- LLVMValueRef prim_mask,
- LLVMValueRef face,
- LLVMValueRef result[4])
+static void interp_fs_color(struct si_shader_context *ctx, unsigned input_index,
+ unsigned semantic_index, unsigned num_interp_inputs,
+ unsigned colors_read_mask, LLVMValueRef interp_param,
+ LLVMValueRef prim_mask, LLVMValueRef face, LLVMValueRef result[4])
{
- LLVMValueRef i = NULL, j = NULL;
- unsigned chan;
-
- /* fs.constant returns the param from the middle vertex, so it's not
- * really useful for flat shading. It's meant to be used for custom
- * interpolation (but the intrinsic can't fetch from the other two
- * vertices).
- *
- * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
- * to do the right thing. The only reason we use fs.constant is that
- * fs.interp cannot be used on integers, because they can be equal
- * to NaN.
- *
- * When interp is false we will use fs.constant or for newer llvm,
- * amdgcn.interp.mov.
- */
- bool interp = interp_param != NULL;
-
- if (interp) {
- interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param,
- LLVMVectorType(ctx->ac.f32, 2), "");
-
- i = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
- ctx->ac.i32_0, "");
- j = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
- ctx->ac.i32_1, "");
- }
-
- if (ctx->shader->key.part.ps.prolog.color_two_side) {
- LLVMValueRef is_face_positive;
-
- /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
- * otherwise it's at offset "num_inputs".
- */
- unsigned back_attr_offset = num_interp_inputs;
- if (semantic_index == 1 && colors_read_mask & 0xf)
- back_attr_offset += 1;
-
- is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
- face, ctx->ac.i32_0, "");
-
- for (chan = 0; chan < 4; chan++) {
- LLVMValueRef front, back;
-
- front = si_build_fs_interp(ctx,
- input_index, chan,
- prim_mask, i, j);
- back = si_build_fs_interp(ctx,
- back_attr_offset, chan,
- prim_mask, i, j);
-
- result[chan] = LLVMBuildSelect(ctx->ac.builder,
- is_face_positive,
- front,
- back,
- "");
- }
- } else {
- for (chan = 0; chan < 4; chan++) {
- result[chan] = si_build_fs_interp(ctx,
- input_index, chan,
- prim_mask, i, j);
- }
- }
+ LLVMValueRef i = NULL, j = NULL;
+ unsigned chan;
+
+ /* fs.constant returns the param from the middle vertex, so it's not
+ * really useful for flat shading. It's meant to be used for custom
+ * interpolation (but the intrinsic can't fetch from the other two
+ * vertices).
+ *
+ * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
+ * to do the right thing. The only reason we use fs.constant is that
+ * fs.interp cannot be used on integers, because they can be equal
+ * to NaN.
+ *
+ * When interp is false we will use fs.constant or, for newer LLVM,
+ * amdgcn.interp.mov.
+ */
+ bool interp = interp_param != NULL;
+
+ if (interp) {
+ interp_param =
+ LLVMBuildBitCast(ctx->ac.builder, interp_param, LLVMVectorType(ctx->ac.f32, 2), "");
+
+ i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_0, "");
+ j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_1, "");
+ }
+
+ if (ctx->shader->key.part.ps.prolog.color_two_side) {
+ LLVMValueRef is_face_positive;
+
+ /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
+ * otherwise it's at offset "num_inputs".
+ */
+ unsigned back_attr_offset = num_interp_inputs;
+ if (semantic_index == 1 && colors_read_mask & 0xf)
+ back_attr_offset += 1;
+
+ is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, face, ctx->ac.i32_0, "");
+
+ for (chan = 0; chan < 4; chan++) {
+ LLVMValueRef front, back;
+
+ front = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
+ back = si_build_fs_interp(ctx, back_attr_offset, chan, prim_mask, i, j);
+
+ result[chan] = LLVMBuildSelect(ctx->ac.builder, is_face_positive, front, back, "");
+ }
+ } else {
+ for (chan = 0; chan < 4; chan++) {
+ result[chan] = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
+ }
+ }
}
static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha)
{
- if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
- static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
- [PIPE_FUNC_LESS] = LLVMRealOLT,
- [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
- [PIPE_FUNC_LEQUAL] = LLVMRealOLE,
- [PIPE_FUNC_GREATER] = LLVMRealOGT,
- [PIPE_FUNC_NOTEQUAL] = LLVMRealONE,
- [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
- };
- LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func];
- assert(cond);
-
- LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
- SI_PARAM_ALPHA_REF);
- LLVMValueRef alpha_pass =
- LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
- ac_build_kill_if_false(&ctx->ac, alpha_pass);
- } else {
- ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false);
- }
+ if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
+ static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
+ [PIPE_FUNC_LESS] = LLVMRealOLT, [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
+ [PIPE_FUNC_LEQUAL] = LLVMRealOLE, [PIPE_FUNC_GREATER] = LLVMRealOGT,
+ [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
+ };
+ LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func];
+ assert(cond);
+
+ LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF);
+ LLVMValueRef alpha_pass = LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
+ ac_build_kill_if_false(&ctx->ac, alpha_pass);
+ } else {
+ ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false);
+ }
}
-static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx,
- LLVMValueRef alpha,
- unsigned samplemask_param)
+static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx, LLVMValueRef alpha,
+ unsigned samplemask_param)
{
- LLVMValueRef coverage;
+ LLVMValueRef coverage;
- /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
- coverage = LLVMGetParam(ctx->main_fn,
- samplemask_param);
- coverage = ac_to_integer(&ctx->ac, coverage);
+ /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
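+ /* e.g. if only half of the smooth-AA samples are covered, alpha is scaled by 0.5. */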
+ coverage = LLVMGetParam(ctx->main_fn, samplemask_param);
+ coverage = ac_to_integer(&ctx->ac, coverage);
- coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32",
- ctx->ac.i32,
- &coverage, 1, AC_FUNC_ATTR_READNONE);
+ coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32", ctx->ac.i32, &coverage, 1,
+ AC_FUNC_ATTR_READNONE);
- coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage,
- ctx->ac.f32, "");
+ coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage, ctx->ac.f32, "");
- coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
- LLVMConstReal(ctx->ac.f32,
- 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
+ coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
+ LLVMConstReal(ctx->ac.f32, 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
- return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
+ return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
}
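In plain arithmetic the helper above computes alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES, so fragments covered by fewer samples fade out. A CPU-side sketch of that scaling (illustrative only; the value 8 for SI_NUM_SMOOTH_AA_SAMPLES is an assumption taken from si_pipe.h, not something this file defines):

#include <stdint.h>

#define NUM_SMOOTH_AA_SAMPLES 8 /* stand-in for SI_NUM_SMOOTH_AA_SAMPLES */

static float scale_alpha_by_coverage(float alpha, uint32_t coverage)
{
   /* More coverage bits set => alpha stays closer to its original value. */
   return alpha * (float)__builtin_popcount(coverage) * (1.0f / NUM_SMOOTH_AA_SAMPLES);
}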
struct si_ps_exports {
- unsigned num;
- struct ac_export_args args[10];
+ unsigned num;
+ struct ac_export_args args[10];
};
-static void si_export_mrt_z(struct si_shader_context *ctx,
- LLVMValueRef depth, LLVMValueRef stencil,
- LLVMValueRef samplemask, struct si_ps_exports *exp)
+static void si_export_mrt_z(struct si_shader_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
+ LLVMValueRef samplemask, struct si_ps_exports *exp)
{
- struct ac_export_args args;
+ struct ac_export_args args;
- ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
+ ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
- memcpy(&exp->args[exp->num++], &args, sizeof(args));
+ memcpy(&exp->args[exp->num++], &args, sizeof(args));
}
/* Initialize arguments for the shader export intrinsic */
-static void si_llvm_init_ps_export_args(struct si_shader_context *ctx,
- LLVMValueRef *values,
- unsigned target,
- struct ac_export_args *args)
+static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValueRef *values,
+ unsigned target, struct ac_export_args *args)
{
- const struct si_shader_key *key = &ctx->shader->key;
- unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
- LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
- unsigned spi_shader_col_format;
- unsigned chan;
- bool is_int8, is_int10;
- int cbuf = target - V_008DFC_SQ_EXP_MRT;
-
- assert(cbuf >= 0 && cbuf < 8);
-
- spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
- is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
- is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
-
- /* Default is 0xf. Adjusted below depending on the format. */
- args->enabled_channels = 0xf; /* writemask */
-
- /* Specify whether the EXEC mask represents the valid mask */
- args->valid_mask = 0;
-
- /* Specify whether this is the last export */
- args->done = 0;
-
- /* Specify the target we are exporting */
- args->target = target;
-
- args->compr = false;
- args->out[0] = f32undef;
- args->out[1] = f32undef;
- args->out[2] = f32undef;
- args->out[3] = f32undef;
-
- LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL;
- LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2],
- unsigned bits, bool hi) = NULL;
-
- switch (spi_shader_col_format) {
- case V_028714_SPI_SHADER_ZERO:
- args->enabled_channels = 0; /* writemask */
- args->target = V_008DFC_SQ_EXP_NULL;
- break;
-
- case V_028714_SPI_SHADER_32_R:
- args->enabled_channels = 1; /* writemask */
- args->out[0] = values[0];
- break;
-
- case V_028714_SPI_SHADER_32_GR:
- args->enabled_channels = 0x3; /* writemask */
- args->out[0] = values[0];
- args->out[1] = values[1];
- break;
-
- case V_028714_SPI_SHADER_32_AR:
- if (ctx->screen->info.chip_class >= GFX10) {
- args->enabled_channels = 0x3; /* writemask */
- args->out[0] = values[0];
- args->out[1] = values[3];
- } else {
- args->enabled_channels = 0x9; /* writemask */
- args->out[0] = values[0];
- args->out[3] = values[3];
- }
- break;
-
- case V_028714_SPI_SHADER_FP16_ABGR:
- packf = ac_build_cvt_pkrtz_f16;
- break;
-
- case V_028714_SPI_SHADER_UNORM16_ABGR:
- packf = ac_build_cvt_pknorm_u16;
- break;
-
- case V_028714_SPI_SHADER_SNORM16_ABGR:
- packf = ac_build_cvt_pknorm_i16;
- break;
-
- case V_028714_SPI_SHADER_UINT16_ABGR:
- packi = ac_build_cvt_pk_u16;
- break;
-
- case V_028714_SPI_SHADER_SINT16_ABGR:
- packi = ac_build_cvt_pk_i16;
- break;
-
- case V_028714_SPI_SHADER_32_ABGR:
- memcpy(&args->out[0], values, sizeof(values[0]) * 4);
- break;
- }
-
- /* Pack f16 or norm_i16/u16. */
- if (packf) {
- for (chan = 0; chan < 2; chan++) {
- LLVMValueRef pack_args[2] = {
- values[2 * chan],
- values[2 * chan + 1]
- };
- LLVMValueRef packed;
-
- packed = packf(&ctx->ac, pack_args);
- args->out[chan] = ac_to_float(&ctx->ac, packed);
- }
- args->compr = 1; /* COMPR flag */
- }
- /* Pack i16/u16. */
- if (packi) {
- for (chan = 0; chan < 2; chan++) {
- LLVMValueRef pack_args[2] = {
- ac_to_integer(&ctx->ac, values[2 * chan]),
- ac_to_integer(&ctx->ac, values[2 * chan + 1])
- };
- LLVMValueRef packed;
-
- packed = packi(&ctx->ac, pack_args,
- is_int8 ? 8 : is_int10 ? 10 : 16,
- chan == 1);
- args->out[chan] = ac_to_float(&ctx->ac, packed);
- }
- args->compr = 1; /* COMPR flag */
- }
+ const struct si_shader_key *key = &ctx->shader->key;
+ unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
+ LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
+ unsigned spi_shader_col_format;
+ unsigned chan;
+ bool is_int8, is_int10;
+ int cbuf = target - V_008DFC_SQ_EXP_MRT;
+
+ assert(cbuf >= 0 && cbuf < 8);
+
+ spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
+ is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
+ is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
+
+ /* Default is 0xf. Adjusted below depending on the format. */
+ args->enabled_channels = 0xf; /* writemask */
+
+ /* Specify whether the EXEC mask represents the valid mask */
+ args->valid_mask = 0;
+
+ /* Specify whether this is the last export */
+ args->done = 0;
+
+ /* Specify the target we are exporting */
+ args->target = target;
+
+ args->compr = false;
+ args->out[0] = f32undef;
+ args->out[1] = f32undef;
+ args->out[2] = f32undef;
+ args->out[3] = f32undef;
+
+ LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;
+ LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,
+ bool hi) = NULL;
+
+ switch (spi_shader_col_format) {
+ case V_028714_SPI_SHADER_ZERO:
+ args->enabled_channels = 0; /* writemask */
+ args->target = V_008DFC_SQ_EXP_NULL;
+ break;
+
+ case V_028714_SPI_SHADER_32_R:
+ args->enabled_channels = 1; /* writemask */
+ args->out[0] = values[0];
+ break;
+
+ case V_028714_SPI_SHADER_32_GR:
+ args->enabled_channels = 0x3; /* writemask */
+ args->out[0] = values[0];
+ args->out[1] = values[1];
+ break;
+
+ case V_028714_SPI_SHADER_32_AR:
+ if (ctx->screen->info.chip_class >= GFX10) {
+ args->enabled_channels = 0x3; /* writemask */
+ args->out[0] = values[0];
+ args->out[1] = values[3];
+ } else {
+ args->enabled_channels = 0x9; /* writemask */
+ args->out[0] = values[0];
+ args->out[3] = values[3];
+ }
+ break;
+
+ case V_028714_SPI_SHADER_FP16_ABGR:
+ packf = ac_build_cvt_pkrtz_f16;
+ break;
+
+ case V_028714_SPI_SHADER_UNORM16_ABGR:
+ packf = ac_build_cvt_pknorm_u16;
+ break;
+
+ case V_028714_SPI_SHADER_SNORM16_ABGR:
+ packf = ac_build_cvt_pknorm_i16;
+ break;
+
+ case V_028714_SPI_SHADER_UINT16_ABGR:
+ packi = ac_build_cvt_pk_u16;
+ break;
+
+ case V_028714_SPI_SHADER_SINT16_ABGR:
+ packi = ac_build_cvt_pk_i16;
+ break;
+
+ case V_028714_SPI_SHADER_32_ABGR:
+ memcpy(&args->out[0], values, sizeof(values[0]) * 4);
+ break;
+ }
+
+ /* Pack f16 or norm_i16/u16. */
+ if (packf) {
+ for (chan = 0; chan < 2; chan++) {
+ LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};
+ LLVMValueRef packed;
+
+ packed = packf(&ctx->ac, pack_args);
+ args->out[chan] = ac_to_float(&ctx->ac, packed);
+ }
+ args->compr = 1; /* COMPR flag */
+ }
+ /* Pack i16/u16. */
+ if (packi) {
+ for (chan = 0; chan < 2; chan++) {
+ LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),
+ ac_to_integer(&ctx->ac, values[2 * chan + 1])};
+ LLVMValueRef packed;
+
+ packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);
+ args->out[chan] = ac_to_float(&ctx->ac, packed);
+ }
+ args->compr = 1; /* COMPR flag */
+ }
}
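Most of the branching above is driven by a simple decode: each color buffer owns one 4-bit SPI_SHADER_COL_FORMAT nibble plus one is_int8 and one is_int10 bit in the shader key. A small sketch of just that decode (illustrative only; the struct and function names are invented):

#include <stdbool.h>
#include <stdint.h>

struct mrt_export_format {
   unsigned spi_format; /* one of the V_028714_SPI_SHADER_* values, 0..15 */
   bool is_int8;        /* pack UINT16/SINT16 exports with an 8-bit range  */
   bool is_int10;       /* pack UINT16/SINT16 exports with a 10-bit range  */
};

static struct mrt_export_format decode_mrt_format(uint32_t spi_shader_col_format,
                                                  uint32_t color_is_int8,
                                                  uint32_t color_is_int10, unsigned cbuf)
{
   struct mrt_export_format fmt;
   fmt.spi_format = (spi_shader_col_format >> (cbuf * 4)) & 0xf;
   fmt.is_int8 = (color_is_int8 >> cbuf) & 0x1;
   fmt.is_int10 = (color_is_int10 >> cbuf) & 0x1;
   return fmt;
}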
-static void si_export_mrt_color(struct si_shader_context *ctx,
- LLVMValueRef *color, unsigned index,
- unsigned samplemask_param,
- bool is_last, struct si_ps_exports *exp)
+static void si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *color, unsigned index,
+ unsigned samplemask_param, bool is_last, struct si_ps_exports *exp)
{
- int i;
-
- /* Clamp color */
- if (ctx->shader->key.part.ps.epilog.clamp_color)
- for (i = 0; i < 4; i++)
- color[i] = ac_build_clamp(&ctx->ac, color[i]);
-
- /* Alpha to one */
- if (ctx->shader->key.part.ps.epilog.alpha_to_one)
- color[3] = ctx->ac.f32_1;
-
- /* Alpha test */
- if (index == 0 &&
- ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
- si_alpha_test(ctx, color[3]);
-
- /* Line & polygon smoothing */
- if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
- color[3] = si_scale_alpha_by_sample_mask(ctx, color[3],
- samplemask_param);
-
- /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
- if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
- struct ac_export_args args[8];
- int c, last = -1;
-
- /* Get the export arguments, also find out what the last one is. */
- for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
- si_llvm_init_ps_export_args(ctx, color,
- V_008DFC_SQ_EXP_MRT + c, &args[c]);
- if (args[c].enabled_channels)
- last = c;
- }
-
- /* Emit all exports. */
- for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
- if (is_last && last == c) {
- args[c].valid_mask = 1; /* whether the EXEC mask is valid */
- args[c].done = 1; /* DONE bit */
- } else if (!args[c].enabled_channels)
- continue; /* unnecessary NULL export */
-
- memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
- }
- } else {
- struct ac_export_args args;
-
- /* Export */
- si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index,
- &args);
- if (is_last) {
- args.valid_mask = 1; /* whether the EXEC mask is valid */
- args.done = 1; /* DONE bit */
- } else if (!args.enabled_channels)
- return; /* unnecessary NULL export */
-
- memcpy(&exp->args[exp->num++], &args, sizeof(args));
- }
+ int i;
+
+ /* Clamp color */
+ if (ctx->shader->key.part.ps.epilog.clamp_color)
+ for (i = 0; i < 4; i++)
+ color[i] = ac_build_clamp(&ctx->ac, color[i]);
+
+ /* Alpha to one */
+ if (ctx->shader->key.part.ps.epilog.alpha_to_one)
+ color[3] = ctx->ac.f32_1;
+
+ /* Alpha test */
+ if (index == 0 && ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
+ si_alpha_test(ctx, color[3]);
+
+ /* Line & polygon smoothing */
+ if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
+ color[3] = si_scale_alpha_by_sample_mask(ctx, color[3], samplemask_param);
+
+ /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+ if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
+ struct ac_export_args args[8];
+ int c, last = -1;
+
+ /* Get the export arguments, also find out what the last one is. */
+ for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
+ si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + c, &args[c]);
+ if (args[c].enabled_channels)
+ last = c;
+ }
+
+ /* Emit all exports. */
+ for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
+ if (is_last && last == c) {
+ args[c].valid_mask = 1; /* whether the EXEC mask is valid */
+ args[c].done = 1; /* DONE bit */
+ } else if (!args[c].enabled_channels)
+ continue; /* unnecessary NULL export */
+
+ memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
+ }
+ } else {
+ struct ac_export_args args;
+
+ /* Export */
+ si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index, &args);
+ if (is_last) {
+ args.valid_mask = 1; /* whether the EXEC mask is valid */
+ args.done = 1; /* DONE bit */
+ } else if (!args.enabled_channels)
+ return; /* unnecessary NULL export */
+
+ memcpy(&exp->args[exp->num++], &args, sizeof(args));
+ }
}
-static void si_emit_ps_exports(struct si_shader_context *ctx,
- struct si_ps_exports *exp)
+static void si_emit_ps_exports(struct si_shader_context *ctx, struct si_ps_exports *exp)
{
- for (unsigned i = 0; i < exp->num; i++)
- ac_build_export(&ctx->ac, &exp->args[i]);
+ for (unsigned i = 0; i < exp->num; i++)
+ ac_build_export(&ctx->ac, &exp->args[i]);
}
/**
*
* The alpha-ref SGPR is returned via its original location.
*/
-static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
- unsigned max_outputs,
- LLVMValueRef *addrs)
+static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, unsigned max_outputs,
+ LLVMValueRef *addrs)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader *shader = ctx->shader;
- struct si_shader_info *info = &shader->selector->info;
- LLVMBuilderRef builder = ctx->ac.builder;
- unsigned i, j, first_vgpr, vgpr;
-
- LLVMValueRef color[8][4] = {};
- LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
- LLVMValueRef ret;
-
- if (ctx->postponed_kill)
- ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
-
- /* Read the output values. */
- for (i = 0; i < info->num_outputs; i++) {
- unsigned semantic_name = info->output_semantic_name[i];
- unsigned semantic_index = info->output_semantic_index[i];
-
- switch (semantic_name) {
- case TGSI_SEMANTIC_COLOR:
- assert(semantic_index < 8);
- for (j = 0; j < 4; j++) {
- LLVMValueRef ptr = addrs[4 * i + j];
- LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
- color[semantic_index][j] = result;
- }
- break;
- case TGSI_SEMANTIC_POSITION:
- depth = LLVMBuildLoad(builder,
- addrs[4 * i + 0], "");
- break;
- case TGSI_SEMANTIC_STENCIL:
- stencil = LLVMBuildLoad(builder,
- addrs[4 * i + 0], "");
- break;
- case TGSI_SEMANTIC_SAMPLEMASK:
- samplemask = LLVMBuildLoad(builder,
- addrs[4 * i + 0], "");
- break;
- default:
- fprintf(stderr, "Warning: GFX6 unhandled fs output type:%d\n",
- semantic_name);
- }
- }
-
- /* Fill the return structure. */
- ret = ctx->return_value;
-
- /* Set SGPRs. */
- ret = LLVMBuildInsertValue(builder, ret,
- ac_to_integer(&ctx->ac,
- LLVMGetParam(ctx->main_fn,
- SI_PARAM_ALPHA_REF)),
- SI_SGPR_ALPHA_REF, "");
-
- /* Set VGPRs */
- first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
- for (i = 0; i < ARRAY_SIZE(color); i++) {
- if (!color[i][0])
- continue;
-
- for (j = 0; j < 4; j++)
- ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
- }
- if (depth)
- ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
- if (stencil)
- ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
- if (samplemask)
- ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
-
- /* Add the input sample mask for smoothing at the end. */
- if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
- vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
- ret = LLVMBuildInsertValue(builder, ret,
- LLVMGetParam(ctx->main_fn,
- SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
-
- ctx->return_value = ret;
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader *shader = ctx->shader;
+ struct si_shader_info *info = &shader->selector->info;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ unsigned i, j, first_vgpr, vgpr;
+
+ LLVMValueRef color[8][4] = {};
+ LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+ LLVMValueRef ret;
+
+ if (ctx->postponed_kill)
+ ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
+
+ /* Read the output values. */
+ for (i = 0; i < info->num_outputs; i++) {
+ unsigned semantic_name = info->output_semantic_name[i];
+ unsigned semantic_index = info->output_semantic_index[i];
+
+ switch (semantic_name) {
+ case TGSI_SEMANTIC_COLOR:
+ assert(semantic_index < 8);
+ for (j = 0; j < 4; j++) {
+ LLVMValueRef ptr = addrs[4 * i + j];
+ LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
+ color[semantic_index][j] = result;
+ }
+ break;
+ case TGSI_SEMANTIC_POSITION:
+ depth = LLVMBuildLoad(builder, addrs[4 * i + 0], "");
+ break;
+ case TGSI_SEMANTIC_STENCIL:
+ stencil = LLVMBuildLoad(builder, addrs[4 * i + 0], "");
+ break;
+ case TGSI_SEMANTIC_SAMPLEMASK:
+ samplemask = LLVMBuildLoad(builder, addrs[4 * i + 0], "");
+ break;
+ default:
+ fprintf(stderr, "Warning: GFX6 unhandled fs output type:%d\n", semantic_name);
+ }
+ }
+
+ /* Fill the return structure. */
+ ret = ctx->return_value;
+
+ /* Set SGPRs. */
+ ret = LLVMBuildInsertValue(
+ builder, ret, ac_to_integer(&ctx->ac, LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF)),
+ SI_SGPR_ALPHA_REF, "");
+
+ /* Set VGPRs */
+ first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
+ for (i = 0; i < ARRAY_SIZE(color); i++) {
+ if (!color[i][0])
+ continue;
+
+ for (j = 0; j < 4; j++)
+ ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
+ }
+ if (depth)
+ ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
+ if (stencil)
+ ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
+ if (samplemask)
+ ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
+
+ /* Add the input sample mask for smoothing at the end. */
+ if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
+ vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
+ ret = LLVMBuildInsertValue(builder, ret, LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE),
+ vgpr++, "");
+
+ ctx->return_value = ret;
}
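The return value above packs four VGPRs per written color, then the optional depth, stencil and sample mask, pads up to PS_EPILOG_SAMPLEMASK_MIN_LOC, and finally appends the input sample coverage used for smoothing. A sketch of the resulting VGPR count under those rules (illustrative only; PS_EPILOG_SAMPLEMASK_MIN_LOC is treated as an opaque constant passed in):

#include <stdbool.h>

static unsigned ps_return_vgpr_count(unsigned num_written_colors, bool writes_z,
                                     bool writes_stencil, bool writes_samplemask,
                                     unsigned samplemask_min_loc)
{
   unsigned vgpr = num_written_colors * 4;
   vgpr += writes_z + writes_stencil + writes_samplemask;
   if (vgpr < samplemask_min_loc)
      vgpr = samplemask_min_loc; /* keep the smoothing sample mask at a fixed minimum slot */
   return vgpr + 1;              /* + SI_PARAM_SAMPLE_COVERAGE */
}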
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
- LLVMValueRef param_rw_buffers,
- struct ac_arg param_pos_fixed_pt)
+ LLVMValueRef param_rw_buffers,
+ struct ac_arg param_pos_fixed_pt)
{
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef slot, desc, offset, row, bit, address[2];
-
- /* Use the fixed-point gl_FragCoord input.
- * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
- * per coordinate to get the repeating effect.
- */
- address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5);
- address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5);
-
- /* Load the buffer descriptor. */
- slot = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_POLY_STIPPLE, 0);
- desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot);
-
- /* The stipple pattern is 32x32, each row has 32 bits. */
- offset = LLVMBuildMul(builder, address[1],
- LLVMConstInt(ctx->ac.i32, 4, 0), "");
- row = si_buffer_load_const(ctx, desc, offset);
- row = ac_to_integer(&ctx->ac, row);
- bit = LLVMBuildLShr(builder, row, address[0], "");
- bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, "");
- ac_build_kill_if_false(&ctx->ac, bit);
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef slot, desc, offset, row, bit, address[2];
+
+ /* Use the fixed-point gl_FragCoord input.
+ * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
+ * per coordinate to get the repeating effect.
+ */
+ address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5);
+ address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5);
+
+ /* Load the buffer descriptor. */
+ slot = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_POLY_STIPPLE, 0);
+ desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot);
+
+ /* The stipple pattern is 32x32, each row has 32 bits. */
+ offset = LLVMBuildMul(builder, address[1], LLVMConstInt(ctx->ac.i32, 4, 0), "");
+ row = si_buffer_load_const(ctx, desc, offset);
+ row = ac_to_integer(&ctx->ac, row);
+ bit = LLVMBuildLShr(builder, row, address[0], "");
+ bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, "");
+ ac_build_kill_if_false(&ctx->ac, bit);
}
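Descriptor plumbing aside, the stipple test is a 32x32 bit-pattern lookup keyed by the low five bits of each fixed-point fragment coordinate, killing the fragment when its bit is zero. A CPU-side sketch of the same lookup (illustrative only, not driver code):

#include <stdbool.h>
#include <stdint.h>

static bool stipple_keeps_fragment(const uint32_t pattern[32], unsigned frag_x, unsigned frag_y)
{
   uint32_t row = pattern[frag_y & 31]; /* 5 bits of y select one 32-bit row */
   return (row >> (frag_x & 31)) & 1;   /* 5 bits of x select the bit within the row */
}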
/**
* overridden by other states. (e.g. per-sample interpolation)
* Interpolated colors are stored after the preloaded VGPRs.
*/
-void si_llvm_build_ps_prolog(struct si_shader_context *ctx,
- union si_shader_part_key *key)
+void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
- LLVMValueRef ret, func;
- int num_returns, i, num_color_channels;
-
- memset(&ctx->args, 0, sizeof(ctx->args));
-
- /* Declare inputs. */
- LLVMTypeRef return_types[AC_MAX_ARGS];
- num_returns = 0;
- num_color_channels = util_bitcount(key->ps_prolog.colors_read);
- assert(key->ps_prolog.num_input_sgprs +
- key->ps_prolog.num_input_vgprs +
- num_color_channels <= AC_MAX_ARGS);
- for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- return_types[num_returns++] = ctx->ac.i32;
-
- }
-
- struct ac_arg pos_fixed_pt;
- struct ac_arg ancillary;
- struct ac_arg param_sample_mask;
- for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) {
- struct ac_arg *arg = NULL;
- if (i == key->ps_prolog.ancillary_vgpr_index) {
- arg = &ancillary;
- } else if (i == key->ps_prolog.ancillary_vgpr_index + 1) {
- arg = &param_sample_mask;
- } else if (i == key->ps_prolog.num_input_vgprs - 1) {
- /* POS_FIXED_PT is always last. */
- arg = &pos_fixed_pt;
- }
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, arg);
- return_types[num_returns++] = ctx->ac.f32;
- }
-
- /* Declare outputs (same as inputs + add colors if needed) */
- for (i = 0; i < num_color_channels; i++)
- return_types[num_returns++] = ctx->ac.f32;
-
- /* Create the function. */
- si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0);
- func = ctx->main_fn;
-
- /* Copy inputs to outputs. This should be a no-op, as the registers match,
- * but it will prevent the compiler from overwriting them unintentionally.
- */
- ret = ctx->return_value;
- for (i = 0; i < ctx->args.arg_count; i++) {
- LLVMValueRef p = LLVMGetParam(func, i);
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
- }
-
- /* Polygon stippling. */
- if (key->ps_prolog.states.poly_stipple) {
- LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
-
- si_llvm_emit_polygon_stipple(ctx, list, pos_fixed_pt);
- }
-
- if (key->ps_prolog.states.bc_optimize_for_persp ||
- key->ps_prolog.states.bc_optimize_for_linear) {
- unsigned i, base = key->ps_prolog.num_input_sgprs;
- LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
-
- /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
- * The hw doesn't compute CENTROID if the whole wave only
- * contains fully-covered quads.
- *
- * PRIM_MASK is after user SGPRs.
- */
- bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
- bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize,
- LLVMConstInt(ctx->ac.i32, 31, 0), "");
- bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize,
- ctx->ac.i1, "");
-
- if (key->ps_prolog.states.bc_optimize_for_persp) {
- /* Read PERSP_CENTER. */
- for (i = 0; i < 2; i++)
- center[i] = LLVMGetParam(func, base + 2 + i);
- /* Read PERSP_CENTROID. */
- for (i = 0; i < 2; i++)
- centroid[i] = LLVMGetParam(func, base + 4 + i);
- /* Select PERSP_CENTROID. */
- for (i = 0; i < 2; i++) {
- tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
- center[i], centroid[i], "");
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
- tmp, base + 4 + i, "");
- }
- }
- if (key->ps_prolog.states.bc_optimize_for_linear) {
- /* Read LINEAR_CENTER. */
- for (i = 0; i < 2; i++)
- center[i] = LLVMGetParam(func, base + 8 + i);
- /* Read LINEAR_CENTROID. */
- for (i = 0; i < 2; i++)
- centroid[i] = LLVMGetParam(func, base + 10 + i);
- /* Select LINEAR_CENTROID. */
- for (i = 0; i < 2; i++) {
- tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
- center[i], centroid[i], "");
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
- tmp, base + 10 + i, "");
- }
- }
- }
-
- /* Force per-sample interpolation. */
- if (key->ps_prolog.states.force_persp_sample_interp) {
- unsigned i, base = key->ps_prolog.num_input_sgprs;
- LLVMValueRef persp_sample[2];
-
- /* Read PERSP_SAMPLE. */
- for (i = 0; i < 2; i++)
- persp_sample[i] = LLVMGetParam(func, base + i);
- /* Overwrite PERSP_CENTER. */
- for (i = 0; i < 2; i++)
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
- persp_sample[i], base + 2 + i, "");
- /* Overwrite PERSP_CENTROID. */
- for (i = 0; i < 2; i++)
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
- persp_sample[i], base + 4 + i, "");
- }
- if (key->ps_prolog.states.force_linear_sample_interp) {
- unsigned i, base = key->ps_prolog.num_input_sgprs;
- LLVMValueRef linear_sample[2];
-
- /* Read LINEAR_SAMPLE. */
- for (i = 0; i < 2; i++)
- linear_sample[i] = LLVMGetParam(func, base + 6 + i);
- /* Overwrite LINEAR_CENTER. */
- for (i = 0; i < 2; i++)
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
- linear_sample[i], base + 8 + i, "");
- /* Overwrite LINEAR_CENTROID. */
- for (i = 0; i < 2; i++)
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
- linear_sample[i], base + 10 + i, "");
- }
-
- /* Force center interpolation. */
- if (key->ps_prolog.states.force_persp_center_interp) {
- unsigned i, base = key->ps_prolog.num_input_sgprs;
- LLVMValueRef persp_center[2];
-
- /* Read PERSP_CENTER. */
- for (i = 0; i < 2; i++)
- persp_center[i] = LLVMGetParam(func, base + 2 + i);
- /* Overwrite PERSP_SAMPLE. */
- for (i = 0; i < 2; i++)
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
- persp_center[i], base + i, "");
- /* Overwrite PERSP_CENTROID. */
- for (i = 0; i < 2; i++)
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
- persp_center[i], base + 4 + i, "");
- }
- if (key->ps_prolog.states.force_linear_center_interp) {
- unsigned i, base = key->ps_prolog.num_input_sgprs;
- LLVMValueRef linear_center[2];
-
- /* Read LINEAR_CENTER. */
- for (i = 0; i < 2; i++)
- linear_center[i] = LLVMGetParam(func, base + 8 + i);
- /* Overwrite LINEAR_SAMPLE. */
- for (i = 0; i < 2; i++)
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
- linear_center[i], base + 6 + i, "");
- /* Overwrite LINEAR_CENTROID. */
- for (i = 0; i < 2; i++)
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
- linear_center[i], base + 10 + i, "");
- }
-
- /* Interpolate colors. */
- unsigned color_out_idx = 0;
- for (i = 0; i < 2; i++) {
- unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
- unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
- key->ps_prolog.face_vgpr_index;
- LLVMValueRef interp[2], color[4];
- LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
-
- if (!writemask)
- continue;
-
- /* If the interpolation qualifier is not CONSTANT (-1). */
- if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
- unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
- key->ps_prolog.color_interp_vgpr_index[i];
-
- /* Get the (i,j) updated by bc_optimize handling. */
- interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret,
- interp_vgpr, "");
- interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret,
- interp_vgpr + 1, "");
- interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
- }
-
- /* Use the absolute location of the input. */
- prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
-
- if (key->ps_prolog.states.color_two_side) {
- face = LLVMGetParam(func, face_vgpr);
- face = ac_to_integer(&ctx->ac, face);
- }
-
- interp_fs_color(ctx,
- key->ps_prolog.color_attr_index[i], i,
- key->ps_prolog.num_interp_inputs,
- key->ps_prolog.colors_read, interp_ij,
- prim_mask, face, color);
-
- while (writemask) {
- unsigned chan = u_bit_scan(&writemask);
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
- ctx->args.arg_count + color_out_idx++, "");
- }
- }
-
- /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
- * says:
- *
- * "When per-sample shading is active due to the use of a fragment
- * input qualified by sample or due to the use of the gl_SampleID
- * or gl_SamplePosition variables, only the bit for the current
- * sample is set in gl_SampleMaskIn. When state specifies multiple
- * fragment shader invocations for a given fragment, the sample
- * mask for any single fragment shader invocation may specify a
- * subset of the covered samples for the fragment. In this case,
- * the bit corresponding to each covered sample will be set in
- * exactly one fragment shader invocation."
- *
- * The samplemask loaded by hardware is always the coverage of the
- * entire pixel/fragment, so mask bits out based on the sample ID.
- */
- if (key->ps_prolog.states.samplemask_log_ps_iter) {
- /* The bit pattern matches that used by fixed function fragment
- * processing. */
- static const uint16_t ps_iter_masks[] = {
- 0xffff, /* not used */
- 0x5555,
- 0x1111,
- 0x0101,
- 0x0001,
- };
- assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks));
-
- uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter];
- LLVMValueRef sampleid = si_unpack_param(ctx, ancillary, 8, 4);
- LLVMValueRef samplemask = ac_get_arg(&ctx->ac, param_sample_mask);
-
- samplemask = ac_to_integer(&ctx->ac, samplemask);
- samplemask = LLVMBuildAnd(
- ctx->ac.builder,
- samplemask,
- LLVMBuildShl(ctx->ac.builder,
- LLVMConstInt(ctx->ac.i32, ps_iter_mask, false),
- sampleid, ""),
- "");
- samplemask = ac_to_float(&ctx->ac, samplemask);
-
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask,
- param_sample_mask.arg_index, "");
- }
-
- /* Tell LLVM to insert WQM instruction sequence when needed. */
- if (key->ps_prolog.wqm) {
- LLVMAddTargetDependentFunctionAttr(func,
- "amdgpu-ps-wqm-outputs", "");
- }
-
- si_llvm_build_ret(ctx, ret);
+ LLVMValueRef ret, func;
+ int num_returns, i, num_color_channels;
+
+ memset(&ctx->args, 0, sizeof(ctx->args));
+
+ /* Declare inputs. */
+ LLVMTypeRef return_types[AC_MAX_ARGS];
+ num_returns = 0;
+ num_color_channels = util_bitcount(key->ps_prolog.colors_read);
+ assert(key->ps_prolog.num_input_sgprs + key->ps_prolog.num_input_vgprs + num_color_channels <=
+ AC_MAX_ARGS);
+ for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ return_types[num_returns++] = ctx->ac.i32;
+ }
+
+ struct ac_arg pos_fixed_pt;
+ struct ac_arg ancillary;
+ struct ac_arg param_sample_mask;
+ for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) {
+ struct ac_arg *arg = NULL;
+ if (i == key->ps_prolog.ancillary_vgpr_index) {
+ arg = &ancillary;
+ } else if (i == key->ps_prolog.ancillary_vgpr_index + 1) {
+ arg = &param_sample_mask;
+ } else if (i == key->ps_prolog.num_input_vgprs - 1) {
+ /* POS_FIXED_PT is always last. */
+ arg = &pos_fixed_pt;
+ }
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, arg);
+ return_types[num_returns++] = ctx->ac.f32;
+ }
+
+ /* Declare outputs (same as inputs + add colors if needed) */
+ for (i = 0; i < num_color_channels; i++)
+ return_types[num_returns++] = ctx->ac.f32;
+
+ /* Create the function. */
+ si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0);
+ func = ctx->main_fn;
+
+ /* Copy inputs to outputs. This should be no-op, as the registers match,
+ * but it will prevent the compiler from overwriting them unintentionally.
+ */
+ ret = ctx->return_value;
+ for (i = 0; i < ctx->args.arg_count; i++) {
+ LLVMValueRef p = LLVMGetParam(func, i);
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
+ }
+
+ /* Polygon stippling. */
+ if (key->ps_prolog.states.poly_stipple) {
+ LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
+
+ si_llvm_emit_polygon_stipple(ctx, list, pos_fixed_pt);
+ }
+
+ if (key->ps_prolog.states.bc_optimize_for_persp ||
+ key->ps_prolog.states.bc_optimize_for_linear) {
+ unsigned i, base = key->ps_prolog.num_input_sgprs;
+ LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
+
+ /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
+ * The hw doesn't compute CENTROID if the whole wave only
+ * contains fully-covered quads.
+ *
+ * PRIM_MASK is after user SGPRs.
+ */
+ bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
+ bc_optimize =
+ LLVMBuildLShr(ctx->ac.builder, bc_optimize, LLVMConstInt(ctx->ac.i32, 31, 0), "");
+ bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, ctx->ac.i1, "");
+
+ if (key->ps_prolog.states.bc_optimize_for_persp) {
+ /* Read PERSP_CENTER. */
+ for (i = 0; i < 2; i++)
+ center[i] = LLVMGetParam(func, base + 2 + i);
+ /* Read PERSP_CENTROID. */
+ for (i = 0; i < 2; i++)
+ centroid[i] = LLVMGetParam(func, base + 4 + i);
+ /* Select PERSP_CENTROID. */
+ for (i = 0; i < 2; i++) {
+ tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center[i], centroid[i], "");
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, tmp, base + 4 + i, "");
+ }
+ }
+ if (key->ps_prolog.states.bc_optimize_for_linear) {
+ /* Read LINEAR_CENTER. */
+ for (i = 0; i < 2; i++)
+ center[i] = LLVMGetParam(func, base + 8 + i);
+ /* Read LINEAR_CENTROID. */
+ for (i = 0; i < 2; i++)
+ centroid[i] = LLVMGetParam(func, base + 10 + i);
+ /* Select LINEAR_CENTROID. */
+ for (i = 0; i < 2; i++) {
+ tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center[i], centroid[i], "");
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, tmp, base + 10 + i, "");
+ }
+ }
+ }
+
+ /* Force per-sample interpolation. */
+ if (key->ps_prolog.states.force_persp_sample_interp) {
+ unsigned i, base = key->ps_prolog.num_input_sgprs;
+ LLVMValueRef persp_sample[2];
+
+ /* Read PERSP_SAMPLE. */
+ for (i = 0; i < 2; i++)
+ persp_sample[i] = LLVMGetParam(func, base + i);
+ /* Overwrite PERSP_CENTER. */
+ for (i = 0; i < 2; i++)
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_sample[i], base + 2 + i, "");
+ /* Overwrite PERSP_CENTROID. */
+ for (i = 0; i < 2; i++)
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_sample[i], base + 4 + i, "");
+ }
+ if (key->ps_prolog.states.force_linear_sample_interp) {
+ unsigned i, base = key->ps_prolog.num_input_sgprs;
+ LLVMValueRef linear_sample[2];
+
+ /* Read LINEAR_SAMPLE. */
+ for (i = 0; i < 2; i++)
+ linear_sample[i] = LLVMGetParam(func, base + 6 + i);
+ /* Overwrite LINEAR_CENTER. */
+ for (i = 0; i < 2; i++)
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_sample[i], base + 8 + i, "");
+ /* Overwrite LINEAR_CENTROID. */
+ for (i = 0; i < 2; i++)
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_sample[i], base + 10 + i, "");
+ }
+
+ /* Force center interpolation. */
+ if (key->ps_prolog.states.force_persp_center_interp) {
+ unsigned i, base = key->ps_prolog.num_input_sgprs;
+ LLVMValueRef persp_center[2];
+
+ /* Read PERSP_CENTER. */
+ for (i = 0; i < 2; i++)
+ persp_center[i] = LLVMGetParam(func, base + 2 + i);
+ /* Overwrite PERSP_SAMPLE. */
+ for (i = 0; i < 2; i++)
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_center[i], base + i, "");
+ /* Overwrite PERSP_CENTROID. */
+ for (i = 0; i < 2; i++)
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_center[i], base + 4 + i, "");
+ }
+ if (key->ps_prolog.states.force_linear_center_interp) {
+ unsigned i, base = key->ps_prolog.num_input_sgprs;
+ LLVMValueRef linear_center[2];
+
+ /* Read LINEAR_CENTER. */
+ for (i = 0; i < 2; i++)
+ linear_center[i] = LLVMGetParam(func, base + 8 + i);
+ /* Overwrite LINEAR_SAMPLE. */
+ for (i = 0; i < 2; i++)
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_center[i], base + 6 + i, "");
+ /* Overwrite LINEAR_CENTROID. */
+ for (i = 0; i < 2; i++)
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_center[i], base + 10 + i, "");
+ }
+
+ /* Interpolate colors. */
+ unsigned color_out_idx = 0;
+ for (i = 0; i < 2; i++) {
+ unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
+ unsigned face_vgpr = key->ps_prolog.num_input_sgprs + key->ps_prolog.face_vgpr_index;
+ LLVMValueRef interp[2], color[4];
+ LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
+
+ if (!writemask)
+ continue;
+
+ /* If the interpolation qualifier is not CONSTANT (-1). */
+ if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
+ unsigned interp_vgpr =
+ key->ps_prolog.num_input_sgprs + key->ps_prolog.color_interp_vgpr_index[i];
+
+ /* Get the (i,j) updated by bc_optimize handling. */
+ interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret, interp_vgpr, "");
+ interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret, interp_vgpr + 1, "");
+ interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
+ }
+
+ /* Use the absolute location of the input. */
+ prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
+
+ if (key->ps_prolog.states.color_two_side) {
+ face = LLVMGetParam(func, face_vgpr);
+ face = ac_to_integer(&ctx->ac, face);
+ }
+
+ interp_fs_color(ctx, key->ps_prolog.color_attr_index[i], i, key->ps_prolog.num_interp_inputs,
+ key->ps_prolog.colors_read, interp_ij, prim_mask, face, color);
+
+ while (writemask) {
+ unsigned chan = u_bit_scan(&writemask);
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
+ ctx->args.arg_count + color_out_idx++, "");
+ }
+ }
+
+ /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
+ * says:
+ *
+ * "When per-sample shading is active due to the use of a fragment
+ * input qualified by sample or due to the use of the gl_SampleID
+ * or gl_SamplePosition variables, only the bit for the current
+ * sample is set in gl_SampleMaskIn. When state specifies multiple
+ * fragment shader invocations for a given fragment, the sample
+ * mask for any single fragment shader invocation may specify a
+ * subset of the covered samples for the fragment. In this case,
+ * the bit corresponding to each covered sample will be set in
+ * exactly one fragment shader invocation."
+ *
+ * The samplemask loaded by hardware is always the coverage of the
+ * entire pixel/fragment, so mask bits out based on the sample ID.
+ */
+ if (key->ps_prolog.states.samplemask_log_ps_iter) {
+ /* The bit pattern matches that used by fixed function fragment
+ * processing. */
+ static const uint16_t ps_iter_masks[] = {
+ 0xffff, /* not used */
+ 0x5555, 0x1111, 0x0101, 0x0001,
+ };
+ assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks));
+
+ uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter];
+ LLVMValueRef sampleid = si_unpack_param(ctx, ancillary, 8, 4);
+ LLVMValueRef samplemask = ac_get_arg(&ctx->ac, param_sample_mask);
+
+ samplemask = ac_to_integer(&ctx->ac, samplemask);
+ samplemask =
+ LLVMBuildAnd(ctx->ac.builder, samplemask,
+ LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false),
+ sampleid, ""),
+ "");
+ samplemask = ac_to_float(&ctx->ac, samplemask);
+
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask, param_sample_mask.arg_index, "");
+ }
+
+ /* Tell LLVM to insert WQM instruction sequence when needed. */
+ if (key->ps_prolog.wqm) {
+ LLVMAddTargetDependentFunctionAttr(func, "amdgpu-ps-wqm-outputs", "");
+ }
+
+ si_llvm_build_ret(ctx, ret);
}
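The samplemask_log_ps_iter block above reduces the full-pixel coverage supplied by hardware to the samples owned by the current invocation: the table entry for log2(ps_iter_samples) is shifted left by the sample ID and ANDed into the mask. The same arithmetic as a CPU-side sketch (illustrative only):

#include <stdint.h>

static uint32_t trim_samplemask_to_invocation(uint32_t hw_coverage, unsigned log_ps_iter_samples,
                                              unsigned sample_id)
{
   static const uint16_t ps_iter_masks[] = {
      0xffff, /* index 0: not used, the prolog skips this path entirely */
      0x5555, /* 2x: every 2nd sample belongs to this invocation */
      0x1111, /* 4x: every 4th sample */
      0x0101, /* 8x: every 8th sample */
      0x0001, /* 16x: every 16th sample */
   };
   return hw_coverage & ((uint32_t)ps_iter_masks[log_ps_iter_samples] << sample_id);
}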
/**
* Build the pixel shader epilog function. This handles everything that must be
* emulated for pixel shader exports. (alpha-test, format conversions, etc)
*/
-void si_llvm_build_ps_epilog(struct si_shader_context *ctx,
- union si_shader_part_key *key)
+void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
- LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
- int i;
- struct si_ps_exports exp = {};
-
- memset(&ctx->args, 0, sizeof(ctx->args));
-
- /* Declare input SGPRs. */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->rw_buffers);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->bindless_samplers_and_images);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->const_and_shader_buffers);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->samplers_and_images);
- si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT,
- NULL, SI_PARAM_ALPHA_REF);
-
- /* Declare input VGPRs. */
- unsigned required_num_params =
- ctx->args.num_sgprs_used +
- util_bitcount(key->ps_epilog.colors_written) * 4 +
- key->ps_epilog.writes_z +
- key->ps_epilog.writes_stencil +
- key->ps_epilog.writes_samplemask;
-
- required_num_params = MAX2(required_num_params,
- ctx->args.num_sgprs_used + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
-
- while (ctx->args.arg_count < required_num_params)
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
-
- /* Create the function. */
- si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0);
- /* Disable elimination of unused inputs. */
- ac_llvm_add_target_dep_function_attr(ctx->main_fn,
- "InitialPSInputAddr", 0xffffff);
-
- /* Process colors. */
- unsigned vgpr = ctx->args.num_sgprs_used;
- unsigned colors_written = key->ps_epilog.colors_written;
- int last_color_export = -1;
-
- /* Find the last color export. */
- if (!key->ps_epilog.writes_z &&
- !key->ps_epilog.writes_stencil &&
- !key->ps_epilog.writes_samplemask) {
- unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
-
- /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
- if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
- /* Just set this if any of the colorbuffers are enabled. */
- if (spi_format &
- ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
- last_color_export = 0;
- } else {
- for (i = 0; i < 8; i++)
- if (colors_written & (1 << i) &&
- (spi_format >> (i * 4)) & 0xf)
- last_color_export = i;
- }
- }
-
- while (colors_written) {
- LLVMValueRef color[4];
- int mrt = u_bit_scan(&colors_written);
-
- for (i = 0; i < 4; i++)
- color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
-
- si_export_mrt_color(ctx, color, mrt,
- ctx->args.arg_count - 1,
- mrt == last_color_export, &exp);
- }
-
- /* Process depth, stencil, samplemask. */
- if (key->ps_epilog.writes_z)
- depth = LLVMGetParam(ctx->main_fn, vgpr++);
- if (key->ps_epilog.writes_stencil)
- stencil = LLVMGetParam(ctx->main_fn, vgpr++);
- if (key->ps_epilog.writes_samplemask)
- samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
-
- if (depth || stencil || samplemask)
- si_export_mrt_z(ctx, depth, stencil, samplemask, &exp);
- else if (last_color_export == -1)
- ac_build_export_null(&ctx->ac);
-
- if (exp.num)
- si_emit_ps_exports(ctx, &exp);
-
- /* Compile. */
- LLVMBuildRetVoid(ctx->ac.builder);
+ LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+ int i;
+ struct si_ps_exports exp = {};
+
+ memset(&ctx->args, 0, sizeof(ctx->args));
+
+ /* Declare input SGPRs. */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->rw_buffers);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->bindless_samplers_and_images);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->const_and_shader_buffers);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->samplers_and_images);
+ si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_ALPHA_REF);
+
+ /* Declare input VGPRs. */
+ unsigned required_num_params =
+ ctx->args.num_sgprs_used + util_bitcount(key->ps_epilog.colors_written) * 4 +
+ key->ps_epilog.writes_z + key->ps_epilog.writes_stencil + key->ps_epilog.writes_samplemask;
+
+ required_num_params =
+ MAX2(required_num_params, ctx->args.num_sgprs_used + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
+
+ while (ctx->args.arg_count < required_num_params)
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
+
+ /* Create the function. */
+ si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0);
+ /* Disable elimination of unused inputs. */
+ ac_llvm_add_target_dep_function_attr(ctx->main_fn, "InitialPSInputAddr", 0xffffff);
+
+ /* Process colors. */
+ unsigned vgpr = ctx->args.num_sgprs_used;
+ unsigned colors_written = key->ps_epilog.colors_written;
+ int last_color_export = -1;
+
+ /* Find the last color export. */
+ if (!key->ps_epilog.writes_z && !key->ps_epilog.writes_stencil &&
+ !key->ps_epilog.writes_samplemask) {
+ unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
+
+ /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
+ if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
+ /* Just set this if any of the colorbuffers are enabled. */
+ if (spi_format & ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
+ last_color_export = 0;
+ } else {
+ for (i = 0; i < 8; i++)
+ if (colors_written & (1 << i) && (spi_format >> (i * 4)) & 0xf)
+ last_color_export = i;
+ }
+ }
+
+ while (colors_written) {
+ LLVMValueRef color[4];
+ int mrt = u_bit_scan(&colors_written);
+
+ for (i = 0; i < 4; i++)
+ color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
+
+ si_export_mrt_color(ctx, color, mrt, ctx->args.arg_count - 1, mrt == last_color_export, &exp);
+ }
+
+ /* Process depth, stencil, samplemask. */
+ if (key->ps_epilog.writes_z)
+ depth = LLVMGetParam(ctx->main_fn, vgpr++);
+ if (key->ps_epilog.writes_stencil)
+ stencil = LLVMGetParam(ctx->main_fn, vgpr++);
+ if (key->ps_epilog.writes_samplemask)
+ samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
+
+ if (depth || stencil || samplemask)
+ si_export_mrt_z(ctx, depth, stencil, samplemask, &exp);
+ else if (last_color_export == -1)
+ ac_build_export_null(&ctx->ac);
+
+ if (exp.num)
+ si_emit_ps_exports(ctx, &exp);
+
+ /* Compile. */
+ LLVMBuildRetVoid(ctx->ac.builder);
}
-void si_llvm_build_monolithic_ps(struct si_shader_context *ctx,
- struct si_shader *shader)
+void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader *shader)
{
- LLVMValueRef parts[3];
- unsigned num_parts = 0, main_index;
+ LLVMValueRef parts[3];
+ unsigned num_parts = 0, main_index;
- union si_shader_part_key prolog_key;
- si_get_ps_prolog_key(shader, &prolog_key, false);
+ union si_shader_part_key prolog_key;
+ si_get_ps_prolog_key(shader, &prolog_key, false);
- if (si_need_ps_prolog(&prolog_key)) {
- si_llvm_build_ps_prolog(ctx, &prolog_key);
- parts[num_parts++] = ctx->main_fn;
- }
+ if (si_need_ps_prolog(&prolog_key)) {
+ si_llvm_build_ps_prolog(ctx, &prolog_key);
+ parts[num_parts++] = ctx->main_fn;
+ }
- main_index = num_parts;
- parts[num_parts++] = ctx->main_fn;
+ main_index = num_parts;
+ parts[num_parts++] = ctx->main_fn;
- union si_shader_part_key epilog_key;
- si_get_ps_epilog_key(shader, &epilog_key);
- si_llvm_build_ps_epilog(ctx, &epilog_key);
- parts[num_parts++] = ctx->main_fn;
+ union si_shader_part_key epilog_key;
+ si_get_ps_epilog_key(shader, &epilog_key);
+ si_llvm_build_ps_epilog(ctx, &epilog_key);
+ parts[num_parts++] = ctx->main_fn;
- si_build_wrapper_function(ctx, parts, num_parts, main_index, 0);
+ si_build_wrapper_function(ctx, parts, num_parts, main_index, 0);
}
void si_llvm_init_ps_callbacks(struct si_shader_context *ctx)
{
- ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
- ctx->abi.load_sample_position = load_sample_position;
- ctx->abi.load_sample_mask_in = load_sample_mask_in;
- ctx->abi.emit_fbfetch = si_nir_emit_fbfetch;
+ ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
+ ctx->abi.load_sample_position = load_sample_position;
+ ctx->abi.load_sample_mask_in = load_sample_mask_in;
+ ctx->abi.emit_fbfetch = si_nir_emit_fbfetch;
}
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_shader_internal.h"
#include "si_pipe.h"
+#include "si_shader_internal.h"
#include "sid.h"
/**
* Return a value that is equal to the given i32 \p index if it lies in [0,num)
* or an undefined value in the same interval otherwise.
*/
-static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx,
- LLVMValueRef index,
- unsigned num)
+static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, LLVMValueRef index,
+ unsigned num)
{
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0);
- LLVMValueRef cc;
-
- if (util_is_power_of_two_or_zero(num)) {
- index = LLVMBuildAnd(builder, index, c_max, "");
- } else {
- /* In theory, this MAX pattern should result in code that is
- * as good as the bit-wise AND above.
- *
- * In practice, LLVM generates worse code (at the time of
- * writing), because its value tracking is not strong enough.
- */
- cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, "");
- index = LLVMBuildSelect(builder, cc, index, c_max, "");
- }
-
- return index;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0);
+ LLVMValueRef cc;
+
+ if (util_is_power_of_two_or_zero(num)) {
+ index = LLVMBuildAnd(builder, index, c_max, "");
+ } else {
+ /* In theory, this MAX pattern should result in code that is
+ * as good as the bit-wise AND above.
+ *
+ * In practice, LLVM generates worse code (at the time of
+ * writing), because its value tracking is not strong enough.
+ */
+ cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, "");
+ index = LLVMBuildSelect(builder, cc, index, c_max, "");
+ }
+
+ return index;
}
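The same bounding trick in plain C, for reference (illustrative only): a power-of-two num collapses to a single AND with num - 1, anything else clamps so that an out-of-range index still produces some value inside [0, num).

static unsigned bound_index(unsigned index, unsigned num)
{
   unsigned max = num - 1;

   if ((num & (num - 1)) == 0)         /* power of two (or zero) */
      return index & max;              /* a single AND instruction */
   return index <= max ? index : max;  /* MIN pattern; in-range but otherwise undefined */
}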
static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx)
{
- LLVMValueRef ptr =
- ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
- struct si_shader_selector *sel = ctx->shader->selector;
-
- /* Do the bounds checking with a descriptor, because
- * doing computation and manual bounds checking of 64-bit
- * addresses generates horrible VALU code with very high
- * VGPR usage and very low SIMD occupancy.
- */
- ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");
-
- LLVMValueRef desc0, desc1;
- desc0 = ptr;
- desc1 = LLVMConstInt(ctx->ac.i32,
- S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
-
- uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
- S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
-
- if (ctx->screen->info.chip_class >= GFX10)
- rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
- S_008F0C_RESOURCE_LEVEL(1);
- else
- rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
-
- LLVMValueRef desc_elems[] = {
- desc0,
- desc1,
- LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0),
- LLVMConstInt(ctx->ac.i32, rsrc3, false)
- };
-
- return ac_build_gather_values(&ctx->ac, desc_elems, 4);
+ LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
+ struct si_shader_selector *sel = ctx->shader->selector;
+
+ /* Do the bounds checking with a descriptor, because
+ * doing computation and manual bounds checking of 64-bit
+ * addresses generates horrible VALU code with very high
+ * VGPR usage and very low SIMD occupancy.
+ */
+ ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");
+
+ LLVMValueRef desc0, desc1;
+ desc0 = ptr;
+ desc1 = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
+
+ uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+ if (ctx->screen->info.chip_class >= GFX10)
+ rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
+ else
+ rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
+ LLVMValueRef desc_elems[] = {desc0, desc1,
+ LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0),
+ LLVMConstInt(ctx->ac.i32, rsrc3, false)};
+
+ return ac_build_gather_values(&ctx->ac, desc_elems, 4);
}
static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader_selector *sel = ctx->shader->selector;
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader_selector *sel = ctx->shader->selector;
- LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
+ LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
- if (sel->info.const_buffers_declared == 1 &&
- sel->info.shader_buffers_declared == 0) {
- return load_const_buffer_desc_fast_path(ctx);
- }
+ if (sel->info.const_buffers_declared == 1 && sel->info.shader_buffers_declared == 0) {
+ return load_const_buffer_desc_fast_path(ctx);
+ }
- index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
- index = LLVMBuildAdd(ctx->ac.builder, index,
- LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), "");
+ index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
+ index =
+ LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), "");
- return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
+ return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
}
-static LLVMValueRef
-load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
+static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac,
- ctx->const_and_shader_buffers);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
- index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
- index = LLVMBuildSub(ctx->ac.builder,
- LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0),
- index, "");
+ index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
+ index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0),
+ index, "");
- return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
+ return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
}
/**
* nicer: disabling DCC in the shader still leads to undefined results but
* avoids the lockup.
*/
-static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
- LLVMValueRef rsrc)
+static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rsrc)
{
- if (ctx->screen->info.chip_class <= GFX7) {
- return rsrc;
- } else {
- LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0);
- LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0);
- LLVMValueRef tmp;
-
- tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
- tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
- return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
- }
+ if (ctx->screen->info.chip_class <= GFX7) {
+ return rsrc;
+ } else {
+ LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0);
+ LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0);
+ LLVMValueRef tmp;
+
+ tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
+ tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
+ return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
+ }
}
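On GFX8 and newer the DCC enable field lives in dword 6 of the eight-dword image descriptor, so disabling compression is a single AND with the C_008F28_COMPRESSION_EN clear mask. A CPU-side sketch (illustrative only; the actual mask value comes from sid.h and is not spelled out here):

#include <stdint.h>

static void force_dcc_off_in_descriptor(uint32_t desc[8], uint32_t compression_en_clear_mask)
{
   /* compression_en_clear_mask stands in for C_008F28_COMPRESSION_EN: all bits
    * set except the COMPRESSION_EN field, so the rest of dword 6 is preserved. */
   desc[6] &= compression_en_clear_mask;
}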
/* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should
* adjust "index" to point to FMASK. */
-static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx,
- LLVMValueRef list, LLVMValueRef index,
- enum ac_descriptor_type desc_type,
- bool uses_store, bool bindless)
+static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list,
+ LLVMValueRef index, enum ac_descriptor_type desc_type,
+ bool uses_store, bool bindless)
{
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef rsrc;
-
- if (desc_type == AC_DESC_BUFFER) {
- index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0),
- ctx->ac.i32_1);
- list = LLVMBuildPointerCast(builder, list,
- ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
- } else {
- assert(desc_type == AC_DESC_IMAGE ||
- desc_type == AC_DESC_FMASK);
- }
-
- if (bindless)
- rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);
- else
- rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
-
- if (desc_type == AC_DESC_IMAGE && uses_store)
- rsrc = force_dcc_off(ctx, rsrc);
- return rsrc;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef rsrc;
+
+ if (desc_type == AC_DESC_BUFFER) {
+ index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
+ list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
+ } else {
+ assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK);
+ }
+
+ if (bindless)
+ rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);
+ else
+ rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
+
+ if (desc_type == AC_DESC_IMAGE && uses_store)
+ rsrc = force_dcc_off(ctx, rsrc);
+ return rsrc;
}
/**
* Load an image view, fmask view, or sampler state descriptor.
*/
-static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx,
- LLVMValueRef list, LLVMValueRef index,
- enum ac_descriptor_type type)
+static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, LLVMValueRef list,
+ LLVMValueRef index, enum ac_descriptor_type type)
{
- LLVMBuilderRef builder = ctx->ac.builder;
-
- switch (type) {
- case AC_DESC_IMAGE:
- /* The image is at [0:7]. */
- index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), "");
- break;
- case AC_DESC_BUFFER:
- /* The buffer is in [4:7]. */
- index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),
- ctx->ac.i32_1);
- list = LLVMBuildPointerCast(builder, list,
- ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
- break;
- case AC_DESC_FMASK:
- /* The FMASK is at [8:15]. */
- index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0),
- ctx->ac.i32_1);
- break;
- case AC_DESC_SAMPLER:
- /* The sampler state is at [12:15]. */
- index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),
- LLVMConstInt(ctx->ac.i32, 3, 0));
- list = LLVMBuildPointerCast(builder, list,
- ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
- break;
- case AC_DESC_PLANE_0:
- case AC_DESC_PLANE_1:
- case AC_DESC_PLANE_2:
- /* Only used for the multiplane image support for Vulkan. Should
- * never be reached in radeonsi.
- */
- unreachable("Plane descriptor requested in radeonsi.");
- }
-
- return ac_build_load_to_sgpr(&ctx->ac, list, index);
+ LLVMBuilderRef builder = ctx->ac.builder;
+
+ switch (type) {
+ case AC_DESC_IMAGE:
+ /* The image is at [0:7]. */
+ index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), "");
+ break;
+ case AC_DESC_BUFFER:
+ /* The buffer is in [4:7]. */
+ index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1);
+ list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
+ break;
+ case AC_DESC_FMASK:
+ /* The FMASK is at [8:15]. */
+ index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
+ break;
+ case AC_DESC_SAMPLER:
+ /* The sampler state is at [12:15]. */
+ index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),
+ LLVMConstInt(ctx->ac.i32, 3, 0));
+ list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
+ break;
+ case AC_DESC_PLANE_0:
+ case AC_DESC_PLANE_1:
+ case AC_DESC_PLANE_2:
+ /* Only used for multi-plane image support in Vulkan. Should
+ * never be reached in radeonsi.
+ */
+ unreachable("Plane descriptor requested in radeonsi.");
+ }
+
+ return ac_build_load_to_sgpr(&ctx->ac, list, index);
}
-static LLVMValueRef
-si_nir_load_sampler_desc(struct ac_shader_abi *abi,
- unsigned descriptor_set, unsigned base_index,
- unsigned constant_index, LLVMValueRef dynamic_index,
- enum ac_descriptor_type desc_type, bool image,
- bool write, bool bindless)
+static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set,
+ unsigned base_index, unsigned constant_index,
+ LLVMValueRef dynamic_index,
+ enum ac_descriptor_type desc_type, bool image,
+ bool write, bool bindless)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- LLVMBuilderRef builder = ctx->ac.builder;
- unsigned const_index = base_index + constant_index;
-
- assert(!descriptor_set);
- assert(desc_type <= AC_DESC_BUFFER);
-
- if (bindless) {
- LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images);
-
- /* dynamic_index is the bindless handle */
- if (image) {
- /* Bindless image descriptors use 16-dword slots. */
- dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index,
- LLVMConstInt(ctx->ac.i64, 2, 0), "");
- /* FMASK is right after the image. */
- if (desc_type == AC_DESC_FMASK) {
- dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index,
- ctx->ac.i32_1, "");
- }
-
- return si_load_image_desc(ctx, list, dynamic_index, desc_type,
- write, true);
- }
-
- /* Since bindless handle arithmetic can contain an unsigned integer
- * wraparound and si_load_sampler_desc assumes there isn't any,
- * use GEP without "inbounds" (inside ac_build_pointer_add)
- * to prevent incorrect code generation and hangs.
- */
- dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index,
- LLVMConstInt(ctx->ac.i64, 2, 0), "");
- list = ac_build_pointer_add(&ctx->ac, list, dynamic_index);
- return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type);
- }
-
- unsigned num_slots = image ? ctx->num_images : ctx->num_samplers;
- assert(const_index < num_slots || dynamic_index);
-
- LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images);
- LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false);
-
- if (dynamic_index) {
- index = LLVMBuildAdd(builder, index, dynamic_index, "");
-
- /* From the GL_ARB_shader_image_load_store extension spec:
- *
- * If a shader performs an image load, store, or atomic
- * operation using an image variable declared as an array,
- * and if the index used to select an individual element is
- * negative or greater than or equal to the size of the
- * array, the results of the operation are undefined but may
- * not lead to termination.
- */
- index = si_llvm_bound_index(ctx, index, num_slots);
- }
-
- if (image) {
- /* FMASKs are separate from images. */
- if (desc_type == AC_DESC_FMASK) {
- index = LLVMBuildAdd(ctx->ac.builder, index,
- LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), "");
- }
- index = LLVMBuildSub(ctx->ac.builder,
- LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0),
- index, "");
- return si_load_image_desc(ctx, list, index, desc_type, write, false);
- }
-
- index = LLVMBuildAdd(ctx->ac.builder, index,
- LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), "");
- return si_load_sampler_desc(ctx, list, index, desc_type);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ LLVMBuilderRef builder = ctx->ac.builder;
+ unsigned const_index = base_index + constant_index;
+
+ assert(!descriptor_set);
+ assert(desc_type <= AC_DESC_BUFFER);
+
+ if (bindless) {
+ LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images);
+
+ /* dynamic_index is the bindless handle */
+ if (image) {
+ /* Bindless image descriptors use 16-dword slots. */
+ dynamic_index =
+ LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");
+ /* FMASK is right after the image. */
+ if (desc_type == AC_DESC_FMASK) {
+ dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, ctx->ac.i32_1, "");
+ }
+
+ return si_load_image_desc(ctx, list, dynamic_index, desc_type, write, true);
+ }
+
+ /* Since bindless handle arithmetic can contain an unsigned integer
+ * wraparound and si_load_sampler_desc assumes there isn't any,
+ * use GEP without "inbounds" (inside ac_build_pointer_add)
+ * to prevent incorrect code generation and hangs.
+ */
+ dynamic_index =
+ LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");
+ list = ac_build_pointer_add(&ctx->ac, list, dynamic_index);
+ return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type);
+ }
+
+ unsigned num_slots = image ? ctx->num_images : ctx->num_samplers;
+ assert(const_index < num_slots || dynamic_index);
+
+ LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images);
+ LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false);
+
+ if (dynamic_index) {
+ index = LLVMBuildAdd(builder, index, dynamic_index, "");
+
+ /* From the GL_ARB_shader_image_load_store extension spec:
+ *
+ * If a shader performs an image load, store, or atomic
+ * operation using an image variable declared as an array,
+ * and if the index used to select an individual element is
+ * negative or greater than or equal to the size of the
+ * array, the results of the operation are undefined but may
+ * not lead to termination.
+ */
+ index = si_llvm_bound_index(ctx, index, num_slots);
+ }
+
+ if (image) {
+ /* FMASKs are separate from images. */
+ if (desc_type == AC_DESC_FMASK) {
+ index =
+ LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), "");
+ }
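+ /* Image slots are assigned in reverse order within the descriptor array,
+ * hence the flip relative to SI_NUM_IMAGE_SLOTS - 1. */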
+ index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0),
+ index, "");
+ return si_load_image_desc(ctx, list, index, desc_type, write, false);
+ }
+
+ index = LLVMBuildAdd(ctx->ac.builder, index,
+ LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), "");
+ return si_load_sampler_desc(ctx, list, index, desc_type);
}
void si_llvm_init_resource_callbacks(struct si_shader_context *ctx)
{
- ctx->abi.load_ubo = load_ubo;
- ctx->abi.load_ssbo = load_ssbo;
- ctx->abi.load_sampler_desc = si_nir_load_sampler_desc;
+ ctx->abi.load_ubo = load_ubo;
+ ctx->abi.load_ssbo = load_ssbo;
+ ctx->abi.load_sampler_desc = si_nir_load_sampler_desc;
}
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_shader_internal.h"
#include "si_pipe.h"
+#include "si_shader_internal.h"
#include "sid.h"
static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
{
- switch (ctx->type) {
- case PIPE_SHADER_TESS_CTRL:
- return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8);
+ switch (ctx->type) {
+ case PIPE_SHADER_TESS_CTRL:
+ return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8);
- case PIPE_SHADER_TESS_EVAL:
- return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id);
+ case PIPE_SHADER_TESS_EVAL:
+ return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id);
- default:
- assert(0);
- return NULL;
- }
+ default:
+ assert(0);
+ return NULL;
+ }
}
/* Tessellation shaders pass outputs to the next shader using LDS.
* All three shaders VS(LS), TCS, TES share the same LDS space.
*/
-static LLVMValueRef
-get_tcs_in_patch_stride(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_in_patch_stride(struct si_shader_context *ctx)
{
- return si_unpack_param(ctx, ctx->vs_state_bits, 11, 13);
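+ /* The TCS input patch stride (in dwords) is packed into bits [23:11] of
+ * vs_state_bits. */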
+ return si_unpack_param(ctx, ctx->vs_state_bits, 11, 13);
}
static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
{
- assert(ctx->type == PIPE_SHADER_TESS_CTRL);
+ assert(ctx->type == PIPE_SHADER_TESS_CTRL);
- if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
- return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
+ if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
+ return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
- return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
+ return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
}
static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
{
- unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
+ unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
- return LLVMConstInt(ctx->ac.i32, stride, 0);
+ return LLVMConstInt(ctx->ac.i32, stride, 0);
}
static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
- if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
- return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13);
-
- const struct si_shader_info *info = &ctx->shader->selector->info;
- unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
- unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
- unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
- unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride +
- num_patch_outputs * 4;
- return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0);
+ if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
+ return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13);
+
+ const struct si_shader_info *info = &ctx->shader->selector->info;
+ unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+ unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
+ unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
+ unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + num_patch_outputs * 4;
+ return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0);
}
-static LLVMValueRef
-get_tcs_out_patch0_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
- return LLVMBuildMul(ctx->ac.builder,
- si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16),
- LLVMConstInt(ctx->ac.i32, 4, 0), "");
+ return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16),
+ LLVMConstInt(ctx->ac.i32, 4, 0), "");
}
-static LLVMValueRef
-get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
- return LLVMBuildMul(ctx->ac.builder,
- si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16),
- LLVMConstInt(ctx->ac.i32, 4, 0), "");
+ return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16),
+ LLVMConstInt(ctx->ac.i32, 4, 0), "");
}
-static LLVMValueRef
-get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
{
- LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
- LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
+ LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
+ LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
- return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
+ return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
}
-static LLVMValueRef
-get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
{
- LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
- LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
- LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
+ LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
+ LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
+ LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
- return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
+ return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
}
-static LLVMValueRef
-get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
{
- LLVMValueRef patch0_patch_data_offset =
- get_tcs_out_patch0_patch_data_offset(ctx);
- LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
- LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
+ LLVMValueRef patch0_patch_data_offset = get_tcs_out_patch0_patch_data_offset(ctx);
+ LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
+ LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
- return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
+ return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
}
static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
{
- unsigned tcs_out_vertices =
- ctx->shader->selector ?
- ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;
+ unsigned tcs_out_vertices =
+ ctx->shader->selector ? ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]
+ : 0;
- /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
- if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
- return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0);
+ /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
+ if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
+ return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0);
- return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6);
+ return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6);
}
static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
{
- unsigned stride;
-
- switch (ctx->type) {
- case PIPE_SHADER_VERTEX:
- stride = ctx->shader->selector->lshs_vertex_stride / 4;
- return LLVMConstInt(ctx->ac.i32, stride, 0);
-
- case PIPE_SHADER_TESS_CTRL:
- if (ctx->screen->info.chip_class >= GFX9 &&
- ctx->shader->is_monolithic) {
- stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
- return LLVMConstInt(ctx->ac.i32, stride, 0);
- }
- return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8);
-
- default:
- assert(0);
- return NULL;
- }
+ unsigned stride;
+
+ switch (ctx->type) {
+ case PIPE_SHADER_VERTEX:
+ stride = ctx->shader->selector->lshs_vertex_stride / 4;
+ return LLVMConstInt(ctx->ac.i32, stride, 0);
+
+ case PIPE_SHADER_TESS_CTRL:
+ if (ctx->screen->info.chip_class >= GFX9 && ctx->shader->is_monolithic) {
+ stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
+ return LLVMConstInt(ctx->ac.i32, stride, 0);
+ }
+ return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8);
+
+ default:
+ assert(0);
+ return NULL;
+ }
}
-static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx,
- LLVMValueRef vertex_dw_stride,
- LLVMValueRef base_addr,
- LLVMValueRef vertex_index,
- LLVMValueRef param_index,
- ubyte name, ubyte index)
+static LLVMValueRef
+get_dw_address_from_generic_indices(struct si_shader_context *ctx, LLVMValueRef vertex_dw_stride,
+ LLVMValueRef base_addr, LLVMValueRef vertex_index,
+ LLVMValueRef param_index, ubyte name, ubyte index)
{
- if (vertex_dw_stride) {
- base_addr = ac_build_imad(&ctx->ac, vertex_index,
- vertex_dw_stride, base_addr);
- }
-
- if (param_index) {
- base_addr = ac_build_imad(&ctx->ac, param_index,
- LLVMConstInt(ctx->ac.i32, 4, 0), base_addr);
- }
-
- int param = name == TGSI_SEMANTIC_PATCH ||
- name == TGSI_SEMANTIC_TESSINNER ||
- name == TGSI_SEMANTIC_TESSOUTER ?
- si_shader_io_get_unique_index_patch(name, index) :
- si_shader_io_get_unique_index(name, index, false);
-
- /* Add the base address of the element. */
- return LLVMBuildAdd(ctx->ac.builder, base_addr,
- LLVMConstInt(ctx->ac.i32, param * 4, 0), "");
+ if (vertex_dw_stride) {
+ base_addr = ac_build_imad(&ctx->ac, vertex_index, vertex_dw_stride, base_addr);
+ }
+
+ if (param_index) {
+ base_addr = ac_build_imad(&ctx->ac, param_index, LLVMConstInt(ctx->ac.i32, 4, 0), base_addr);
+ }
+
+ int param = name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+ name == TGSI_SEMANTIC_TESSOUTER
+ ? si_shader_io_get_unique_index_patch(name, index)
+ : si_shader_io_get_unique_index(name, index, false);
+
+ /* Add the base address of the element. */
+ return LLVMBuildAdd(ctx->ac.builder, base_addr, LLVMConstInt(ctx->ac.i32, param * 4, 0), "");
}
/* The offchip buffer layout for TCS->TES is attribute-major: for each
 * per-vertex attribute, all vertices of all patches are stored consecutively,
 * followed by the per-patch attributes starting at the patch data offset.
 * Note that every attribute has 4 components.
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
- LLVMValueRef rel_patch_id,
- LLVMValueRef vertex_index,
+ LLVMValueRef rel_patch_id, LLVMValueRef vertex_index,
LLVMValueRef param_index)
{
- LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
- LLVMValueRef param_stride, constant16;
-
- vertices_per_patch = get_num_tcs_out_vertices(ctx);
- num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
- total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch,
- num_patches, "");
-
- constant16 = LLVMConstInt(ctx->ac.i32, 16, 0);
- if (vertex_index) {
- base_addr = ac_build_imad(&ctx->ac, rel_patch_id,
- vertices_per_patch, vertex_index);
- param_stride = total_vertices;
- } else {
- base_addr = rel_patch_id;
- param_stride = num_patches;
- }
-
- base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr);
- base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
-
- if (!vertex_index) {
- LLVMValueRef patch_data_offset =
- si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20);
-
- base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
- patch_data_offset, "");
- }
- return base_addr;
+ LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
+ LLVMValueRef param_stride, constant16;
+
+ vertices_per_patch = get_num_tcs_out_vertices(ctx);
+ num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
+ total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, num_patches, "");
+
+ constant16 = LLVMConstInt(ctx->ac.i32, 16, 0);
+ if (vertex_index) {
+ base_addr = ac_build_imad(&ctx->ac, rel_patch_id, vertices_per_patch, vertex_index);
+ param_stride = total_vertices;
+ } else {
+ base_addr = rel_patch_id;
+ param_stride = num_patches;
+ }
+
+ base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr);
+ base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
+
+ if (!vertex_index) {
+ LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20);
+
+ base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, patch_data_offset, "");
+ }
+ return base_addr;
}
-static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(
- struct si_shader_context *ctx,
- LLVMValueRef vertex_index,
- LLVMValueRef param_index,
- ubyte name, ubyte index)
+static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(struct si_shader_context *ctx,
+ LLVMValueRef vertex_index,
+ LLVMValueRef param_index,
+ ubyte name, ubyte index)
{
- unsigned param_index_base;
-
- param_index_base = name == TGSI_SEMANTIC_PATCH ||
- name == TGSI_SEMANTIC_TESSINNER ||
- name == TGSI_SEMANTIC_TESSOUTER ?
- si_shader_io_get_unique_index_patch(name, index) :
- si_shader_io_get_unique_index(name, index, false);
-
- if (param_index) {
- param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
- LLVMConstInt(ctx->ac.i32, param_index_base, 0),
- "");
- } else {
- param_index = LLVMConstInt(ctx->ac.i32, param_index_base, 0);
- }
-
- return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
- vertex_index, param_index);
+ unsigned param_index_base;
+
+ param_index_base = name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+ name == TGSI_SEMANTIC_TESSOUTER
+ ? si_shader_io_get_unique_index_patch(name, index)
+ : si_shader_io_get_unique_index(name, index, false);
+
+ if (param_index) {
+ param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
+ LLVMConstInt(ctx->ac.i32, param_index_base, 0), "");
+ } else {
+ param_index = LLVMConstInt(ctx->ac.i32, param_index_base, 0);
+ }
+
+ return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), vertex_index, param_index);
}
-static LLVMValueRef buffer_load(struct si_shader_context *ctx,
- LLVMTypeRef type, unsigned swizzle,
- LLVMValueRef buffer, LLVMValueRef offset,
- LLVMValueRef base, bool can_speculate)
+static LLVMValueRef buffer_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle,
+ LLVMValueRef buffer, LLVMValueRef offset, LLVMValueRef base,
+ bool can_speculate)
{
- LLVMValueRef value, value2;
- LLVMTypeRef vec_type = LLVMVectorType(type, 4);
+ LLVMValueRef value, value2;
+ LLVMTypeRef vec_type = LLVMVectorType(type, 4);
- if (swizzle == ~0) {
- value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
- 0, ac_glc, can_speculate, false);
+ if (swizzle == ~0) {
+ value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, ac_glc,
+ can_speculate, false);
- return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
- }
+ return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
+ }
- if (ac_get_type_size(type) != 8) {
- value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
- 0, ac_glc, can_speculate, false);
+ if (ac_get_type_size(type) != 8) {
+ value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, ac_glc,
+ can_speculate, false);
- value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
- return LLVMBuildExtractElement(ctx->ac.builder, value,
- LLVMConstInt(ctx->ac.i32, swizzle, 0), "");
- }
+ value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
+ return LLVMBuildExtractElement(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, swizzle, 0),
+ "");
+ }
- value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
- swizzle * 4, ac_glc, can_speculate, false);
+ value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4, ac_glc,
+ can_speculate, false);
- value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
- swizzle * 4 + 4, ac_glc, can_speculate, false);
+ value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4 + 4, ac_glc,
+ can_speculate, false);
- return si_build_gather_64bit(ctx, type, value, value2);
+ return si_build_gather_64bit(ctx, type, value, value2);
}
/**
* \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
* \param dw_addr address in dwords
*/
-static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx,
- LLVMTypeRef type, unsigned swizzle,
- LLVMValueRef dw_addr)
+static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle,
+ LLVMValueRef dw_addr)
{
- LLVMValueRef value;
+ LLVMValueRef value;
- if (swizzle == ~0) {
- LLVMValueRef values[4];
+ if (swizzle == ~0) {
+ LLVMValueRef values[4];
- for (unsigned chan = 0; chan < 4; chan++)
- values[chan] = lshs_lds_load(ctx, type, chan, dw_addr);
+ for (unsigned chan = 0; chan < 4; chan++)
+ values[chan] = lshs_lds_load(ctx, type, chan, dw_addr);
- return ac_build_gather_values(&ctx->ac, values, 4);
- }
+ return ac_build_gather_values(&ctx->ac, values, 4);
+ }
- /* Split 64-bit loads. */
- if (ac_get_type_size(type) == 8) {
- LLVMValueRef lo, hi;
+ /* Split 64-bit loads. */
+ if (ac_get_type_size(type) == 8) {
+ LLVMValueRef lo, hi;
- lo = lshs_lds_load(ctx, ctx->ac.i32, swizzle, dw_addr);
- hi = lshs_lds_load(ctx, ctx->ac.i32, swizzle + 1, dw_addr);
- return si_build_gather_64bit(ctx, type, lo, hi);
- }
+ lo = lshs_lds_load(ctx, ctx->ac.i32, swizzle, dw_addr);
+ hi = lshs_lds_load(ctx, ctx->ac.i32, swizzle + 1, dw_addr);
+ return si_build_gather_64bit(ctx, type, lo, hi);
+ }
- dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
- LLVMConstInt(ctx->ac.i32, swizzle, 0), "");
+ dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, swizzle, 0), "");
- value = ac_lds_load(&ctx->ac, dw_addr);
+ value = ac_lds_load(&ctx->ac, dw_addr);
- return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
+ return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
}
/**
* \param dw_addr address in dwords
* \param value value to store
*/
-static void lshs_lds_store(struct si_shader_context *ctx,
- unsigned dw_offset_imm, LLVMValueRef dw_addr,
- LLVMValueRef value)
+static void lshs_lds_store(struct si_shader_context *ctx, unsigned dw_offset_imm,
+ LLVMValueRef dw_addr, LLVMValueRef value)
{
- dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
- LLVMConstInt(ctx->ac.i32, dw_offset_imm, 0), "");
+ dw_addr =
+ LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, dw_offset_imm, 0), "");
- ac_lds_store(&ctx->ac, dw_addr, value);
+ ac_lds_store(&ctx->ac, dw_addr, value);
}
-enum si_tess_ring {
- TCS_FACTOR_RING,
- TESS_OFFCHIP_RING_TCS,
- TESS_OFFCHIP_RING_TES,
+enum si_tess_ring
+{
+ TCS_FACTOR_RING,
+ TESS_OFFCHIP_RING_TCS,
+ TESS_OFFCHIP_RING_TES,
};
-static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx,
- enum si_tess_ring ring)
+static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, enum si_tess_ring ring)
{
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef addr = ac_get_arg(&ctx->ac,
- ring == TESS_OFFCHIP_RING_TES ?
- ctx->tes_offchip_addr :
- ctx->tcs_out_lds_layout);
-
- /* TCS only receives high 13 bits of the address. */
- if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
- addr = LLVMBuildAnd(builder, addr,
- LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), "");
- }
-
- if (ring == TCS_FACTOR_RING) {
- unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
- addr = LLVMBuildAdd(builder, addr,
- LLVMConstInt(ctx->ac.i32, tf_offset, 0), "");
- }
-
- uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
- S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
- S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
-
- if (ctx->screen->info.chip_class >= GFX10)
- rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
- S_008F0C_RESOURCE_LEVEL(1);
- else
- rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
-
- LLVMValueRef desc[4];
- desc[0] = addr;
- desc[1] = LLVMConstInt(ctx->ac.i32,
- S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
- desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0);
- desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false);
-
- return ac_build_gather_values(&ctx->ac, desc, 4);
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef addr = ac_get_arg(
+ &ctx->ac, ring == TESS_OFFCHIP_RING_TES ? ctx->tes_offchip_addr : ctx->tcs_out_lds_layout);
+
+ /* TCS only receives high 13 bits of the address. */
+ if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
+ addr = LLVMBuildAnd(builder, addr, LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), "");
+ }
+
+ if (ring == TCS_FACTOR_RING) {
+ unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
+ addr = LLVMBuildAdd(builder, addr, LLVMConstInt(ctx->ac.i32, tf_offset, 0), "");
+ }
+
+ uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+ if (ctx->screen->info.chip_class >= GFX10)
+ rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
+ else
+ rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
+ LLVMValueRef desc[4];
+ desc[0] = addr;
+ desc[1] = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
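+ /* dword 2 is NUM_RECORDS; 0xffffffff makes the ring effectively unbounded. */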
+ desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0);
+ desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false);
+
+ return ac_build_gather_values(&ctx->ac, desc, 4);
}
void si_llvm_preload_tes_rings(struct si_shader_context *ctx)
{
- ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
+ ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
}
-static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
- LLVMTypeRef type,
- LLVMValueRef vertex_index,
- LLVMValueRef param_index,
- unsigned const_index,
- unsigned location,
- unsigned driver_location,
- unsigned component,
- unsigned num_components,
- bool is_patch,
- bool is_compact,
- bool load_input)
+static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMTypeRef type,
+ LLVMValueRef vertex_index, LLVMValueRef param_index,
+ unsigned const_index, unsigned location,
+ unsigned driver_location, unsigned component,
+ unsigned num_components, bool is_patch,
+ bool is_compact, bool load_input)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader_info *info = &ctx->shader->selector->info;
- LLVMValueRef dw_addr, stride;
- ubyte name, index;
-
- driver_location = driver_location / 4;
-
- if (load_input) {
- name = info->input_semantic_name[driver_location];
- index = info->input_semantic_index[driver_location];
- } else {
- name = info->output_semantic_name[driver_location];
- index = info->output_semantic_index[driver_location];
- }
-
- assert((name == TGSI_SEMANTIC_PATCH ||
- name == TGSI_SEMANTIC_TESSINNER ||
- name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
-
- if (load_input) {
- stride = get_tcs_in_vertex_dw_stride(ctx);
- dw_addr = get_tcs_in_current_patch_offset(ctx);
- } else {
- if (is_patch) {
- stride = NULL;
- dw_addr = get_tcs_out_current_patch_data_offset(ctx);
- } else {
- stride = get_tcs_out_vertex_dw_stride(ctx);
- dw_addr = get_tcs_out_current_patch_offset(ctx);
- }
- }
-
- if (!param_index) {
- param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
- }
-
- dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
- vertex_index, param_index,
- name, index);
-
- LLVMValueRef value[4];
- for (unsigned i = 0; i < num_components; i++) {
- unsigned offset = i;
- if (ac_get_type_size(type) == 8)
- offset *= 2;
-
- offset += component;
- value[i + component] = lshs_lds_load(ctx, type, offset, dw_addr);
- }
-
- return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader_info *info = &ctx->shader->selector->info;
+ LLVMValueRef dw_addr, stride;
+ ubyte name, index;
+
+ driver_location = driver_location / 4;
+
+ if (load_input) {
+ name = info->input_semantic_name[driver_location];
+ index = info->input_semantic_index[driver_location];
+ } else {
+ name = info->output_semantic_name[driver_location];
+ index = info->output_semantic_index[driver_location];
+ }
+
+ assert((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+ name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
+ if (load_input) {
+ stride = get_tcs_in_vertex_dw_stride(ctx);
+ dw_addr = get_tcs_in_current_patch_offset(ctx);
+ } else {
+ if (is_patch) {
+ stride = NULL;
+ dw_addr = get_tcs_out_current_patch_data_offset(ctx);
+ } else {
+ stride = get_tcs_out_vertex_dw_stride(ctx);
+ dw_addr = get_tcs_out_current_patch_offset(ctx);
+ }
+ }
+
+ if (!param_index) {
+ param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
+ }
+
+ dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index,
+ name, index);
+
+ LLVMValueRef value[4];
+ for (unsigned i = 0; i < num_components; i++) {
+ unsigned offset = i;
+ if (ac_get_type_size(type) == 8)
+ offset *= 2;
+
+ offset += component;
+ value[i + component] = lshs_lds_load(ctx, type, offset, dw_addr);
+ }
+
+ return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
}
-static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
- LLVMTypeRef type,
- LLVMValueRef vertex_index,
- LLVMValueRef param_index,
- unsigned const_index,
- unsigned location,
- unsigned driver_location,
- unsigned component,
- unsigned num_components,
- bool is_patch,
- bool is_compact,
- bool load_input)
+static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, LLVMTypeRef type,
+ LLVMValueRef vertex_index, LLVMValueRef param_index,
+ unsigned const_index, unsigned location,
+ unsigned driver_location, unsigned component,
+ unsigned num_components, bool is_patch, bool is_compact,
+ bool load_input)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader_info *info = &ctx->shader->selector->info;
- LLVMValueRef base, addr;
-
- driver_location = driver_location / 4;
- ubyte name = info->input_semantic_name[driver_location];
- ubyte index = info->input_semantic_index[driver_location];
-
- assert((name == TGSI_SEMANTIC_PATCH ||
- name == TGSI_SEMANTIC_TESSINNER ||
- name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
-
- base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
-
- if (!param_index) {
- param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
- }
-
- addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
- param_index,
- name, index);
-
- /* TODO: This will generate rather ordinary llvm code, although it
- * should be easy for the optimiser to fix up. In future we might want
- * to refactor buffer_load().
- */
- LLVMValueRef value[4];
- for (unsigned i = 0; i < num_components; i++) {
- unsigned offset = i;
- if (ac_get_type_size(type) == 8) {
- offset *= 2;
- if (offset == 4) {
- ubyte name = info->input_semantic_name[driver_location + 1];
- ubyte index = info->input_semantic_index[driver_location + 1];
- addr = get_tcs_tes_buffer_address_from_generic_indices(ctx,
- vertex_index,
- param_index,
- name, index);
- }
-
- offset = offset % 4;
- }
-
- offset += component;
- value[i + component] = buffer_load(ctx, type, offset,
- ctx->tess_offchip_ring, base, addr, true);
- }
-
- return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader_info *info = &ctx->shader->selector->info;
+ LLVMValueRef base, addr;
+
+ driver_location = driver_location / 4;
+ ubyte name = info->input_semantic_name[driver_location];
+ ubyte index = info->input_semantic_index[driver_location];
+
+ assert((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+ name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
+ base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+
+ if (!param_index) {
+ param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
+ }
+
+ addr =
+ get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, name, index);
+
+ /* TODO: This will generate rather ordinary llvm code, although it
+ * should be easy for the optimiser to fix up. In future we might want
+ * to refactor buffer_load().
+ */
+ LLVMValueRef value[4];
+ for (unsigned i = 0; i < num_components; i++) {
+ unsigned offset = i;
+ if (ac_get_type_size(type) == 8) {
+ offset *= 2;
+ if (offset == 4) {
+ ubyte name = info->input_semantic_name[driver_location + 1];
+ ubyte index = info->input_semantic_index[driver_location + 1];
+ addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index,
+ name, index);
+ }
+
+ offset = offset % 4;
+ }
+
+ offset += component;
+ value[i + component] =
+ buffer_load(ctx, type, offset, ctx->tess_offchip_ring, base, addr, true);
+ }
+
+ return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
}
-static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
- const struct nir_variable *var,
- LLVMValueRef vertex_index,
- LLVMValueRef param_index,
- unsigned const_index,
- LLVMValueRef src,
- unsigned writemask)
+static void si_nir_store_output_tcs(struct ac_shader_abi *abi, const struct nir_variable *var,
+ LLVMValueRef vertex_index, LLVMValueRef param_index,
+ unsigned const_index, LLVMValueRef src, unsigned writemask)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader_info *info = &ctx->shader->selector->info;
- const unsigned component = var->data.location_frac;
- unsigned driver_location = var->data.driver_location;
- LLVMValueRef dw_addr, stride;
- LLVMValueRef buffer, base, addr;
- LLVMValueRef values[8];
- bool skip_lds_store;
- bool is_tess_factor = false, is_tess_inner = false;
-
- driver_location = driver_location / 4;
- ubyte name = info->output_semantic_name[driver_location];
- ubyte index = info->output_semantic_index[driver_location];
-
- bool is_const = !param_index;
- if (!param_index)
- param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
-
- const bool is_patch = var->data.patch ||
- var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
- var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
-
- /* Invalid SPIR-V can cause this. */
- if ((name == TGSI_SEMANTIC_PATCH ||
- name == TGSI_SEMANTIC_TESSINNER ||
- name == TGSI_SEMANTIC_TESSOUTER) != is_patch)
- return;
-
- if (!is_patch) {
- stride = get_tcs_out_vertex_dw_stride(ctx);
- dw_addr = get_tcs_out_current_patch_offset(ctx);
- dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
- vertex_index, param_index,
- name, index);
-
- skip_lds_store = !info->reads_pervertex_outputs;
- } else {
- dw_addr = get_tcs_out_current_patch_data_offset(ctx);
- dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr,
- vertex_index, param_index,
- name, index);
-
- skip_lds_store = !info->reads_perpatch_outputs;
-
- if (is_const && const_index == 0) {
- int name = info->output_semantic_name[driver_location];
-
- /* Always write tess factors into LDS for the TCS epilog. */
- if (name == TGSI_SEMANTIC_TESSINNER ||
- name == TGSI_SEMANTIC_TESSOUTER) {
- /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
- skip_lds_store = !info->reads_tessfactor_outputs &&
- ctx->shader->selector->info.tessfactors_are_def_in_all_invocs;
- is_tess_factor = true;
- is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
- }
- }
- }
-
- buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
-
- base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
-
- addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
- param_index, name, index);
-
- for (unsigned chan = component; chan < 8; chan++) {
- if (!(writemask & (1 << chan)))
- continue;
- LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
-
- unsigned buffer_store_offset = chan % 4;
- if (chan == 4) {
- ubyte name = info->output_semantic_name[driver_location + 1];
- ubyte index = info->output_semantic_index[driver_location + 1];
- addr = get_tcs_tes_buffer_address_from_generic_indices(ctx,
- vertex_index,
- param_index,
- name, index);
- }
-
- /* Skip LDS stores if there is no LDS read of this output. */
- if (!skip_lds_store)
- lshs_lds_store(ctx, chan, dw_addr, value);
-
- value = ac_to_integer(&ctx->ac, value);
- values[chan] = value;
-
- if (writemask != 0xF && !is_tess_factor) {
- ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
- addr, base,
- 4 * buffer_store_offset,
- ac_glc);
- }
-
- /* Write tess factors into VGPRs for the epilog. */
- if (is_tess_factor &&
- ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
- if (!is_tess_inner) {
- LLVMBuildStore(ctx->ac.builder, value, /* outer */
- ctx->invoc0_tess_factors[chan]);
- } else if (chan < 2) {
- LLVMBuildStore(ctx->ac.builder, value, /* inner */
- ctx->invoc0_tess_factors[4 + chan]);
- }
- }
- }
-
- if (writemask == 0xF && !is_tess_factor) {
- LLVMValueRef value = ac_build_gather_values(&ctx->ac,
- values, 4);
- ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr,
- base, 0, ac_glc);
- }
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader_info *info = &ctx->shader->selector->info;
+ const unsigned component = var->data.location_frac;
+ unsigned driver_location = var->data.driver_location;
+ LLVMValueRef dw_addr, stride;
+ LLVMValueRef buffer, base, addr;
+ LLVMValueRef values[8];
+ bool skip_lds_store;
+ bool is_tess_factor = false, is_tess_inner = false;
+
+ driver_location = driver_location / 4;
+ ubyte name = info->output_semantic_name[driver_location];
+ ubyte index = info->output_semantic_index[driver_location];
+
+ bool is_const = !param_index;
+ if (!param_index)
+ param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
+
+ const bool is_patch = var->data.patch || var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
+ var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
+
+ /* Invalid SPIR-V can cause this. */
+ if ((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+ name == TGSI_SEMANTIC_TESSOUTER) != is_patch)
+ return;
+
+ if (!is_patch) {
+ stride = get_tcs_out_vertex_dw_stride(ctx);
+ dw_addr = get_tcs_out_current_patch_offset(ctx);
+ dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index,
+ name, index);
+
+ skip_lds_store = !info->reads_pervertex_outputs;
+ } else {
+ dw_addr = get_tcs_out_current_patch_data_offset(ctx);
+ dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, vertex_index, param_index,
+ name, index);
+
+ skip_lds_store = !info->reads_perpatch_outputs;
+
+ if (is_const && const_index == 0) {
+ int name = info->output_semantic_name[driver_location];
+
+ /* Always write tess factors into LDS for the TCS epilog. */
+ if (name == TGSI_SEMANTIC_TESSINNER || name == TGSI_SEMANTIC_TESSOUTER) {
+ /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
+ skip_lds_store = !info->reads_tessfactor_outputs &&
+ ctx->shader->selector->info.tessfactors_are_def_in_all_invocs;
+ is_tess_factor = true;
+ is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
+ }
+ }
+ }
+
+ buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
+
+ base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+
+ addr =
+ get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, name, index);
+
+ for (unsigned chan = component; chan < 8; chan++) {
+ if (!(writemask & (1 << chan)))
+ continue;
+ LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
+
+ unsigned buffer_store_offset = chan % 4;
+ if (chan == 4) {
+ ubyte name = info->output_semantic_name[driver_location + 1];
+ ubyte index = info->output_semantic_index[driver_location + 1];
+ addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index,
+ name, index);
+ }
+
+ /* Skip LDS stores if there is no LDS read of this output. */
+ if (!skip_lds_store)
+ lshs_lds_store(ctx, chan, dw_addr, value);
+
+ value = ac_to_integer(&ctx->ac, value);
+ values[chan] = value;
+
+ if (writemask != 0xF && !is_tess_factor) {
+ ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, addr, base,
+ 4 * buffer_store_offset, ac_glc);
+ }
+
+ /* Write tess factors into VGPRs for the epilog. */
+ if (is_tess_factor && ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
+ if (!is_tess_inner) {
+ LLVMBuildStore(ctx->ac.builder, value, /* outer */
+ ctx->invoc0_tess_factors[chan]);
+ } else if (chan < 2) {
+ LLVMBuildStore(ctx->ac.builder, value, /* inner */
+ ctx->invoc0_tess_factors[4 + chan]);
+ }
+ }
+ }
+
+ if (writemask == 0xF && !is_tess_factor) {
+ LLVMValueRef value = ac_build_gather_values(&ctx->ac, values, 4);
+ ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr, base, 0, ac_glc);
+ }
}
static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- LLVMValueRef coord[4] = {
- ac_get_arg(&ctx->ac, ctx->tes_u),
- ac_get_arg(&ctx->ac, ctx->tes_v),
- ctx->ac.f32_0,
- ctx->ac.f32_0
- };
-
- /* For triangles, the vector should be (u, v, 1-u-v). */
- if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
- PIPE_PRIM_TRIANGLES) {
- coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
- LLVMBuildFAdd(ctx->ac.builder,
- coord[0], coord[1], ""), "");
- }
- return ac_build_gather_values(&ctx->ac, coord, 4);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ LLVMValueRef coord[4] = {ac_get_arg(&ctx->ac, ctx->tes_u), ac_get_arg(&ctx->ac, ctx->tes_v),
+ ctx->ac.f32_0, ctx->ac.f32_0};
+
+ /* For triangles, the vector should be (u, v, 1-u-v). */
+ if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_TRIANGLES) {
+ coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
+ LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), "");
+ }
+ return ac_build_gather_values(&ctx->ac, coord, 4);
}
-static LLVMValueRef load_tess_level(struct si_shader_context *ctx,
- unsigned semantic_name)
+static LLVMValueRef load_tess_level(struct si_shader_context *ctx, unsigned semantic_name)
{
- LLVMValueRef base, addr;
-
- int param = si_shader_io_get_unique_index_patch(semantic_name, 0);
+ LLVMValueRef base, addr;
- base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
- addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
- LLVMConstInt(ctx->ac.i32, param, 0));
+ int param = si_shader_io_get_unique_index_patch(semantic_name, 0);
- return buffer_load(ctx, ctx->ac.f32,
- ~0, ctx->tess_offchip_ring, base, addr, true);
+ base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+ addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
+ LLVMConstInt(ctx->ac.i32, param, 0));
+ return buffer_load(ctx, ctx->ac.f32, ~0, ctx->tess_offchip_ring, base, addr, true);
}
-static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx,
- unsigned semantic_name)
+static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx, unsigned semantic_name)
{
- LLVMValueRef buf, slot, val[4];
- int i, offset;
-
- slot = LLVMConstInt(ctx->ac.i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
- buf = ac_get_arg(&ctx->ac, ctx->rw_buffers);
- buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
- offset = semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 4 : 0;
-
- for (i = 0; i < 4; i++)
- val[i] = si_buffer_load_const(ctx, buf,
- LLVMConstInt(ctx->ac.i32, (offset + i) * 4, 0));
- return ac_build_gather_values(&ctx->ac, val, 4);
+ LLVMValueRef buf, slot, val[4];
+ int i, offset;
+
+ slot = LLVMConstInt(ctx->ac.i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
+ buf = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+ buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
+ offset = semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 4 : 0;
+
+ for (i = 0; i < 4; i++)
+ val[i] = si_buffer_load_const(ctx, buf, LLVMConstInt(ctx->ac.i32, (offset + i) * 4, 0));
+ return ac_build_gather_values(&ctx->ac, val, 4);
}
-static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi,
- unsigned varying_id,
- bool load_default_state)
+static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, unsigned varying_id,
+ bool load_default_state)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- unsigned semantic_name;
-
- if (load_default_state) {
- switch (varying_id) {
- case VARYING_SLOT_TESS_LEVEL_INNER:
- semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL;
- break;
- case VARYING_SLOT_TESS_LEVEL_OUTER:
- semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL;
- break;
- default:
- unreachable("unknown tess level");
- }
- return load_tess_level_default(ctx, semantic_name);
- }
-
- switch (varying_id) {
- case VARYING_SLOT_TESS_LEVEL_INNER:
- semantic_name = TGSI_SEMANTIC_TESSINNER;
- break;
- case VARYING_SLOT_TESS_LEVEL_OUTER:
- semantic_name = TGSI_SEMANTIC_TESSOUTER;
- break;
- default:
- unreachable("unknown tess level");
- }
-
- return load_tess_level(ctx, semantic_name);
-
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ unsigned semantic_name;
+
+ if (load_default_state) {
+ switch (varying_id) {
+ case VARYING_SLOT_TESS_LEVEL_INNER:
+ semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL;
+ break;
+ case VARYING_SLOT_TESS_LEVEL_OUTER:
+ semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL;
+ break;
+ default:
+ unreachable("unknown tess level");
+ }
+ return load_tess_level_default(ctx, semantic_name);
+ }
+
+ switch (varying_id) {
+ case VARYING_SLOT_TESS_LEVEL_INNER:
+ semantic_name = TGSI_SEMANTIC_TESSINNER;
+ break;
+ case VARYING_SLOT_TESS_LEVEL_OUTER:
+ semantic_name = TGSI_SEMANTIC_TESSOUTER;
+ break;
+ default:
+ unreachable("unknown tess level");
+ }
+
+ return load_tess_level(ctx, semantic_name);
}
static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- if (ctx->type == PIPE_SHADER_TESS_CTRL)
- return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6);
- else if (ctx->type == PIPE_SHADER_TESS_EVAL)
- return get_num_tcs_out_vertices(ctx);
- else
- unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ if (ctx->type == PIPE_SHADER_TESS_CTRL)
+ return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6);
+ else if (ctx->type == PIPE_SHADER_TESS_EVAL)
+ return get_num_tcs_out_vertices(ctx);
+ else
+ unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
}
/**
*/
static void si_copy_tcs_inputs(struct si_shader_context *ctx)
{
- LLVMValueRef invocation_id, buffer, buffer_offset;
- LLVMValueRef lds_vertex_stride, lds_base;
- uint64_t inputs;
+ LLVMValueRef invocation_id, buffer, buffer_offset;
+ LLVMValueRef lds_vertex_stride, lds_base;
+ uint64_t inputs;
- invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
- buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
- buffer_offset = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+ invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
+ buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
+ buffer_offset = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
- lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
- lds_base = get_tcs_in_current_patch_offset(ctx);
- lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride,
- lds_base);
+ lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
+ lds_base = get_tcs_in_current_patch_offset(ctx);
+ lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, lds_base);
- inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
- while (inputs) {
- unsigned i = u_bit_scan64(&inputs);
+ inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
+ while (inputs) {
+ unsigned i = u_bit_scan64(&inputs);
- LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base,
- LLVMConstInt(ctx->ac.i32, 4 * i, 0),
- "");
+ LLVMValueRef lds_ptr =
+ LLVMBuildAdd(ctx->ac.builder, lds_base, LLVMConstInt(ctx->ac.i32, 4 * i, 0), "");
- LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
- get_rel_patch_id(ctx),
- invocation_id,
- LLVMConstInt(ctx->ac.i32, i, 0));
+ LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(
+ ctx, get_rel_patch_id(ctx), invocation_id, LLVMConstInt(ctx->ac.i32, i, 0));
- LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr);
+ LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr);
- ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
- buffer_offset, 0, ac_glc);
- }
+ ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, buffer_offset, 0,
+ ac_glc);
+ }
}
-static void si_write_tess_factors(struct si_shader_context *ctx,
- LLVMValueRef rel_patch_id,
- LLVMValueRef invocation_id,
- LLVMValueRef tcs_out_current_patch_data_offset,
- LLVMValueRef invoc0_tf_outer[4],
- LLVMValueRef invoc0_tf_inner[2])
+static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef rel_patch_id,
+ LLVMValueRef invocation_id,
+ LLVMValueRef tcs_out_current_patch_data_offset,
+ LLVMValueRef invoc0_tf_outer[4], LLVMValueRef invoc0_tf_inner[2])
{
- struct si_shader *shader = ctx->shader;
- unsigned tess_inner_index, tess_outer_index;
- LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
- LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
- unsigned stride, outer_comps, inner_comps, i, offset;
-
- /* Add a barrier before loading tess factors from LDS. */
- if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
- si_llvm_emit_barrier(ctx);
-
- /* Do this only for invocation 0, because the tess levels are per-patch,
- * not per-vertex.
- *
- * This can't jump, because invocation 0 executes this. It should
- * at least mask out the loads and stores for other invocations.
- */
- ac_build_ifcc(&ctx->ac,
- LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
- invocation_id, ctx->ac.i32_0, ""), 6503);
-
- /* Determine the layout of one tess factor element in the buffer. */
- switch (shader->key.part.tcs.epilog.prim_mode) {
- case PIPE_PRIM_LINES:
- stride = 2; /* 2 dwords, 1 vec2 store */
- outer_comps = 2;
- inner_comps = 0;
- break;
- case PIPE_PRIM_TRIANGLES:
- stride = 4; /* 4 dwords, 1 vec4 store */
- outer_comps = 3;
- inner_comps = 1;
- break;
- case PIPE_PRIM_QUADS:
- stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
- outer_comps = 4;
- inner_comps = 2;
- break;
- default:
- assert(0);
- return;
- }
-
- for (i = 0; i < 4; i++) {
- inner[i] = LLVMGetUndef(ctx->ac.i32);
- outer[i] = LLVMGetUndef(ctx->ac.i32);
- }
-
- if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
- /* Tess factors are in VGPRs. */
- for (i = 0; i < outer_comps; i++)
- outer[i] = out[i] = invoc0_tf_outer[i];
- for (i = 0; i < inner_comps; i++)
- inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
- } else {
- /* Load tess_inner and tess_outer from LDS.
- * Any invocation can write them, so we can't get them from a temporary.
- */
- tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
- tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
-
- lds_base = tcs_out_current_patch_data_offset;
- lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
- LLVMConstInt(ctx->ac.i32,
- tess_inner_index * 4, 0), "");
- lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
- LLVMConstInt(ctx->ac.i32,
- tess_outer_index * 4, 0), "");
-
- for (i = 0; i < outer_comps; i++) {
- outer[i] = out[i] =
- lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer);
- }
- for (i = 0; i < inner_comps; i++) {
- inner[i] = out[outer_comps+i] =
- lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner);
- }
- }
-
- if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
- /* For isolines, the hardware expects tess factors in the
- * reverse order from what NIR specifies.
- */
- LLVMValueRef tmp = out[0];
- out[0] = out[1];
- out[1] = tmp;
- }
-
- /* Convert the outputs to vectors for stores. */
- vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
- vec1 = NULL;
-
- if (stride > 4)
- vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4);
-
- /* Get the buffer. */
- buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);
-
- /* Get the offset. */
- tf_base = ac_get_arg(&ctx->ac,
- ctx->tcs_factor_offset);
- byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
- LLVMConstInt(ctx->ac.i32, 4 * stride, 0), "");
-
- ac_build_ifcc(&ctx->ac,
- LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
- rel_patch_id, ctx->ac.i32_0, ""), 6504);
-
- /* Store the dynamic HS control word. */
- offset = 0;
- if (ctx->screen->info.chip_class <= GFX8) {
- ac_build_buffer_store_dword(&ctx->ac, buffer,
- LLVMConstInt(ctx->ac.i32, 0x80000000, 0),
- 1, ctx->ac.i32_0, tf_base,
- offset, ac_glc);
- offset += 4;
- }
-
- ac_build_endif(&ctx->ac, 6504);
-
- /* Store the tessellation factors. */
- ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
- MIN2(stride, 4), byteoffset, tf_base,
- offset, ac_glc);
- offset += 16;
- if (vec1)
- ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
- stride - 4, byteoffset, tf_base,
- offset, ac_glc);
-
- /* Store the tess factors into the offchip buffer if TES reads them. */
- if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
- LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
- LLVMValueRef tf_inner_offset;
- unsigned param_outer, param_inner;
-
- buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
- base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
-
- param_outer = si_shader_io_get_unique_index_patch(
- TGSI_SEMANTIC_TESSOUTER, 0);
- tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
- LLVMConstInt(ctx->ac.i32, param_outer, 0));
-
- unsigned outer_vec_size =
- ac_has_vec3_support(ctx->screen->info.chip_class, false) ?
- outer_comps : util_next_power_of_two(outer_comps);
- outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size);
-
- ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
- outer_comps, tf_outer_offset,
- base, 0, ac_glc);
- if (inner_comps) {
- param_inner = si_shader_io_get_unique_index_patch(
- TGSI_SEMANTIC_TESSINNER, 0);
- tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
- LLVMConstInt(ctx->ac.i32, param_inner, 0));
-
- inner_vec = inner_comps == 1 ? inner[0] :
- ac_build_gather_values(&ctx->ac, inner, inner_comps);
- ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
- inner_comps, tf_inner_offset,
- base, 0, ac_glc);
- }
- }
-
- ac_build_endif(&ctx->ac, 6503);
+ struct si_shader *shader = ctx->shader;
+ unsigned tess_inner_index, tess_outer_index;
+ LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
+ LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
+ unsigned stride, outer_comps, inner_comps, i, offset;
+
+ /* Add a barrier before loading tess factors from LDS. */
+ if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
+ si_llvm_emit_barrier(ctx);
+
+ /* Do this only for invocation 0, because the tess levels are per-patch,
+ * not per-vertex.
+ *
+ * This can't jump, because invocation 0 executes this. It should
+ * at least mask out the loads and stores for other invocations.
+ */
+ ac_build_ifcc(&ctx->ac,
+ LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, invocation_id, ctx->ac.i32_0, ""), 6503);
+
+ /* Determine the layout of one tess factor element in the buffer. */
+ switch (shader->key.part.tcs.epilog.prim_mode) {
+ case PIPE_PRIM_LINES:
+ stride = 2; /* 2 dwords, 1 vec2 store */
+ outer_comps = 2;
+ inner_comps = 0;
+ break;
+ case PIPE_PRIM_TRIANGLES:
+ stride = 4; /* 4 dwords, 1 vec4 store */
+ outer_comps = 3;
+ inner_comps = 1;
+ break;
+ case PIPE_PRIM_QUADS:
+ stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
+ outer_comps = 4;
+ inner_comps = 2;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ for (i = 0; i < 4; i++) {
+ inner[i] = LLVMGetUndef(ctx->ac.i32);
+ outer[i] = LLVMGetUndef(ctx->ac.i32);
+ }
+
+ if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
+ /* Tess factors are in VGPRs. */
+ for (i = 0; i < outer_comps; i++)
+ outer[i] = out[i] = invoc0_tf_outer[i];
+ for (i = 0; i < inner_comps; i++)
+ inner[i] = out[outer_comps + i] = invoc0_tf_inner[i];
+ } else {
+ /* Load tess_inner and tess_outer from LDS.
+ * Any invocation can write them, so we can't get them from a temporary.
+ */
+ tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
+ tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
+
+ lds_base = tcs_out_current_patch_data_offset;
+ lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
+ LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, 0), "");
+ lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
+ LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, 0), "");
+
+ for (i = 0; i < outer_comps; i++) {
+ outer[i] = out[i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer);
+ }
+ for (i = 0; i < inner_comps; i++) {
+ inner[i] = out[outer_comps + i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner);
+ }
+ }
+
+ if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
+ /* For isolines, the hardware expects tess factors in the
+ * reverse order from what NIR specifies.
+ */
+ LLVMValueRef tmp = out[0];
+ out[0] = out[1];
+ out[1] = tmp;
+ }
+
+ /* Convert the outputs to vectors for stores. */
+ vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
+ vec1 = NULL;
+
+ if (stride > 4)
+ vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4);
+
+ /* Get the buffer. */
+ buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);
+
+ /* Get the offset. */
+ tf_base = ac_get_arg(&ctx->ac, ctx->tcs_factor_offset);
+ byteoffset =
+ LLVMBuildMul(ctx->ac.builder, rel_patch_id, LLVMConstInt(ctx->ac.i32, 4 * stride, 0), "");
+
+ ac_build_ifcc(&ctx->ac,
+ LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, ""), 6504);
+
+ /* Store the dynamic HS control word. */
+ offset = 0;
+ if (ctx->screen->info.chip_class <= GFX8) {
+ ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, 0), 1,
+ ctx->ac.i32_0, tf_base, offset, ac_glc);
+ offset += 4;
+ }
+
+ ac_build_endif(&ctx->ac, 6504);
+
+ /* Store the tessellation factors. */
+ ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, offset,
+ ac_glc);
+ offset += 16;
+ if (vec1)
+ ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, stride - 4, byteoffset, tf_base, offset,
+ ac_glc);
+
+ /* Store the tess factors into the offchip buffer if TES reads them. */
+ if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
+ LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
+ LLVMValueRef tf_inner_offset;
+ unsigned param_outer, param_inner;
+
+ buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
+ base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+
+ param_outer = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
+ tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
+ LLVMConstInt(ctx->ac.i32, param_outer, 0));
+
+ unsigned outer_vec_size = ac_has_vec3_support(ctx->screen->info.chip_class, false)
+ ? outer_comps
+ : util_next_power_of_two(outer_comps);
+ outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size);
+
+ ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, outer_comps, tf_outer_offset, base, 0,
+ ac_glc);
+ if (inner_comps) {
+ param_inner = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
+ tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
+ LLVMConstInt(ctx->ac.i32, param_inner, 0));
+
+ inner_vec =
+ inner_comps == 1 ? inner[0] : ac_build_gather_values(&ctx->ac, inner, inner_comps);
+ ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, inner_comps, tf_inner_offset, base,
+ 0, ac_glc);
+ }
+ }
+
+ ac_build_endif(&ctx->ac, 6503);
}
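/* Editor's note: the sketch below is illustrative only and is not part of
 * this change or of the driver. It restates the tess-factor ring layout
 * used above in plain C: each patch writes `stride` dwords (2 for isolines,
 * 4 for triangles, 6 for quads; outer factors first, then inner ones), so a
 * patch's byte offset inside the ring is rel_patch_id * 4 * stride on top
 * of the tf_base SGPR. The function name is hypothetical.
 */
static unsigned example_tess_factor_ring_byte_offset(unsigned rel_patch_id,
                                                     unsigned stride_dwords)
{
   /* stride_dwords is 2, 4 or 6 depending on the tessellator prim mode. */
   return rel_patch_id * 4 * stride_dwords;
}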
/* This only writes the tessellation factor levels. */
-static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi,
- unsigned max_outputs,
- LLVMValueRef *addrs)
+static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+ LLVMValueRef *addrs)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
-
- si_copy_tcs_inputs(ctx);
-
- rel_patch_id = get_rel_patch_id(ctx);
- invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
- tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
-
- if (ctx->screen->info.chip_class >= GFX9) {
- LLVMBasicBlockRef blocks[2] = {
- LLVMGetInsertBlock(builder),
- ctx->merged_wrap_if_entry_block
- };
- LLVMValueRef values[2];
-
- ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
-
- values[0] = rel_patch_id;
- values[1] = LLVMGetUndef(ctx->ac.i32);
- rel_patch_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
-
- values[0] = tf_lds_offset;
- values[1] = LLVMGetUndef(ctx->ac.i32);
- tf_lds_offset = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
-
- values[0] = invocation_id;
- values[1] = ctx->ac.i32_1; /* cause the epilog to skip threads */
- invocation_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
- }
-
- /* Return epilog parameters from this function. */
- LLVMValueRef ret = ctx->return_value;
- unsigned vgpr;
-
- if (ctx->screen->info.chip_class >= GFX9) {
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout,
- 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout,
- 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
- /* Tess offchip and tess factor offsets are at the beginning. */
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
- vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
- } else {
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout,
- GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout,
- GFX6_SGPR_TCS_OUT_LAYOUT);
- /* Tess offchip and tess factor offsets are after user SGPRs. */
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset,
- GFX6_TCS_NUM_USER_SGPR);
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset,
- GFX6_TCS_NUM_USER_SGPR + 1);
- vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
- }
-
- /* VGPRs */
- rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
- invocation_id = ac_to_float(&ctx->ac, invocation_id);
- tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
-
- /* Leave a hole corresponding to the two input VGPRs. This ensures that
- * the invocation_id output does not alias the tcs_rel_ids input,
- * which saves a V_MOV on gfx9.
- */
- vgpr += 2;
-
- ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
- ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
-
- if (ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
- vgpr++; /* skip the tess factor LDS offset */
- for (unsigned i = 0; i < 6; i++) {
- LLVMValueRef value =
- LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
- value = ac_to_float(&ctx->ac, value);
- ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
- }
- } else {
- ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
- }
- ctx->return_value = ret;
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ LLVMBuilderRef builder = ctx->ac.builder;
+ LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
+
+ si_copy_tcs_inputs(ctx);
+
+ rel_patch_id = get_rel_patch_id(ctx);
+ invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
+ tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
+
+ if (ctx->screen->info.chip_class >= GFX9) {
+ LLVMBasicBlockRef blocks[2] = {LLVMGetInsertBlock(builder), ctx->merged_wrap_if_entry_block};
+ LLVMValueRef values[2];
+
+ ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+
+ values[0] = rel_patch_id;
+ values[1] = LLVMGetUndef(ctx->ac.i32);
+ rel_patch_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
+
+ values[0] = tf_lds_offset;
+ values[1] = LLVMGetUndef(ctx->ac.i32);
+ tf_lds_offset = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
+
+ values[0] = invocation_id;
+ values[1] = ctx->ac.i32_1; /* cause the epilog to skip threads */
+ invocation_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
+ }
+
+ /* Return epilog parameters from this function. */
+ LLVMValueRef ret = ctx->return_value;
+ unsigned vgpr;
+
+ if (ctx->screen->info.chip_class >= GFX9) {
+ ret =
+ si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
+ ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
+ /* Tess offchip and tess factor offsets are at the beginning. */
+ ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
+ ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
+ vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
+ } else {
+ ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
+ ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, GFX6_SGPR_TCS_OUT_LAYOUT);
+ /* Tess offchip and tess factor offsets are after user SGPRs. */
+ ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, GFX6_TCS_NUM_USER_SGPR);
+ ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, GFX6_TCS_NUM_USER_SGPR + 1);
+ vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
+ }
+
+ /* VGPRs */
+ rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
+ invocation_id = ac_to_float(&ctx->ac, invocation_id);
+ tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
+
+ /* Leave a hole corresponding to the two input VGPRs. This ensures that
+ * the invocation_id output does not alias the tcs_rel_ids input,
+ * which saves a V_MOV on gfx9.
+ */
+ vgpr += 2;
+
+ ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
+ ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
+
+ if (ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
+ vgpr++; /* skip the tess factor LDS offset */
+ for (unsigned i = 0; i < 6; i++) {
+ LLVMValueRef value = LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
+ value = ac_to_float(&ctx->ac, value);
+ ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
+ }
+ } else {
+ ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
+ }
+ ctx->return_value = ret;
}
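/* Editor's note: illustrative sketch, not part of this change. A scalar
 * analogy for the GFX9 phi construction above: once the merged-shader
 * conditional is closed, each value needs a merge that keeps the real value
 * on threads that ran the TCS part and a harmless placeholder on the
 * others; invocation_id uses 1 as the placeholder so the epilog's
 * invocation_id == 0 check skips those threads. Names are hypothetical.
 */
static unsigned example_merge_after_endif(int ran_tcs_part, unsigned value_inside,
                                          unsigned placeholder)
{
   return ran_tcs_part ? value_inside : placeholder;
}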
/* Pass TCS inputs from LS to TCS on GFX9. */
static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
{
- LLVMValueRef ret = ctx->return_value;
-
- ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
- ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
- ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
- ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
-
- ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
- 8 + SI_SGPR_RW_BUFFERS);
- ret = si_insert_input_ptr(ctx, ret,
- ctx->bindless_samplers_and_images,
- 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
-
- ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits,
- 8 + SI_SGPR_VS_STATE_BITS);
-
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout,
- 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets,
- 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
- ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout,
- 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
-
- unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
- ac_to_float(&ctx->ac,
- ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)),
- vgpr++, "");
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
- ac_to_float(&ctx->ac,
- ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)),
- vgpr++, "");
- ctx->return_value = ret;
+ LLVMValueRef ret = ctx->return_value;
+
+ ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
+ ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
+ ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
+ ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
+ ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
+ ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
+
+ ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
+ ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
+ 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
+
+ ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
+
+ ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
+ ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets, 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
+ ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
+
+ unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
+ ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)),
+ vgpr++, "");
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
+ ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)),
+ vgpr++, "");
+ ctx->return_value = ret;
}
-void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
- LLVMValueRef *addrs)
+void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader *shader = ctx->shader;
- struct si_shader_info *info = &shader->selector->info;
- unsigned i, chan;
- LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id);
- LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
- LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
- vertex_dw_stride, "");
-
- /* Write outputs to LDS. The next shader (TCS aka HS) will read
- * its inputs from it. */
- for (i = 0; i < info->num_outputs; i++) {
- unsigned name = info->output_semantic_name[i];
- unsigned index = info->output_semantic_index[i];
-
- /* The ARB_shader_viewport_layer_array spec contains the
- * following issue:
- *
- * 2) What happens if gl_ViewportIndex or gl_Layer is
- * written in the vertex shader and a geometry shader is
- * present?
- *
- * RESOLVED: The value written by the last vertex processing
- * stage is used. If the last vertex processing stage
- * (vertex, tessellation evaluation or geometry) does not
- * statically assign to gl_ViewportIndex or gl_Layer, index
- * or layer zero is assumed.
- *
- * So writes to those outputs in VS-as-LS are simply ignored.
- */
- if (name == TGSI_SEMANTIC_LAYER ||
- name == TGSI_SEMANTIC_VIEWPORT_INDEX)
- continue;
-
- int param = si_shader_io_get_unique_index(name, index, false);
- LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
- LLVMConstInt(ctx->ac.i32, param * 4, 0), "");
-
- for (chan = 0; chan < 4; chan++) {
- if (!(info->output_usagemask[i] & (1 << chan)))
- continue;
-
- lshs_lds_store(ctx, chan, dw_addr,
- LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
- }
- }
-
- if (ctx->screen->info.chip_class >= GFX9)
- si_set_ls_return_value_for_tcs(ctx);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader *shader = ctx->shader;
+ struct si_shader_info *info = &shader->selector->info;
+ unsigned i, chan;
+ LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id);
+ LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
+ LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, "");
+
+ /* Write outputs to LDS. The next shader (TCS aka HS) will read
+ * its inputs from it. */
+ for (i = 0; i < info->num_outputs; i++) {
+ unsigned name = info->output_semantic_name[i];
+ unsigned index = info->output_semantic_index[i];
+
+ /* The ARB_shader_viewport_layer_array spec contains the
+ * following issue:
+ *
+ * 2) What happens if gl_ViewportIndex or gl_Layer is
+ * written in the vertex shader and a geometry shader is
+ * present?
+ *
+ * RESOLVED: The value written by the last vertex processing
+ * stage is used. If the last vertex processing stage
+ * (vertex, tessellation evaluation or geometry) does not
+ * statically assign to gl_ViewportIndex or gl_Layer, index
+ * or layer zero is assumed.
+ *
+ * So writes to those outputs in VS-as-LS are simply ignored.
+ */
+ if (name == TGSI_SEMANTIC_LAYER || name == TGSI_SEMANTIC_VIEWPORT_INDEX)
+ continue;
+
+ int param = si_shader_io_get_unique_index(name, index, false);
+ LLVMValueRef dw_addr =
+ LLVMBuildAdd(ctx->ac.builder, base_dw_addr, LLVMConstInt(ctx->ac.i32, param * 4, 0), "");
+
+ for (chan = 0; chan < 4; chan++) {
+ if (!(info->output_usagemask[i] & (1 << chan)))
+ continue;
+
+ lshs_lds_store(ctx, chan, dw_addr,
+ LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
+ }
+ }
+
+ if (ctx->screen->info.chip_class >= GFX9)
+ si_set_ls_return_value_for_tcs(ctx);
}
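/* Editor's note: illustrative sketch, not part of this change. The LS->HS
 * LDS addressing used above, written out in plain C and in dword units:
 * each output slot takes 4 dwords per vertex, and (assuming lshs_lds_store
 * adds the channel index, as the per-channel calls above suggest) a single
 * component ends up at the address computed here. Names are hypothetical.
 */
static unsigned example_lshs_lds_dword_address(unsigned vertex_id, unsigned vertex_dw_stride,
                                               unsigned param, unsigned chan)
{
   return vertex_id * vertex_dw_stride + param * 4 + chan;
}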
/**
* Compile the TCS epilog function. This writes tessellation factors to memory
* based on the output primitive type of the tessellator (determined by TES).
*/
-void si_llvm_build_tcs_epilog(struct si_shader_context *ctx,
- union si_shader_part_key *key)
+void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
- memset(&ctx->args, 0, sizeof(ctx->args));
-
- if (ctx->screen->info.chip_class >= GFX9) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->tcs_offchip_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->tcs_factor_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->tcs_offchip_layout);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->tcs_out_lds_layout);
- } else {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->tcs_offchip_layout);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->tcs_out_lds_layout);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->tcs_offchip_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &ctx->tcs_factor_offset);
- }
-
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
- struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id);
- struct ac_arg invocation_id; /* invocation ID within the patch */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id);
- struct ac_arg tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
- &tcs_out_current_patch_data_offset);
-
- struct ac_arg tess_factors[6];
- for (unsigned i = 0; i < 6; i++)
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]);
-
- /* Create the function. */
- si_llvm_create_func(ctx, "tcs_epilog", NULL, 0,
- ctx->screen->info.chip_class >= GFX7 ? 128 : 0);
- ac_declare_lds_as_pointer(&ctx->ac);
-
- LLVMValueRef invoc0_tess_factors[6];
- for (unsigned i = 0; i < 6; i++)
- invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);
-
- si_write_tess_factors(ctx,
- ac_get_arg(&ctx->ac, rel_patch_id),
- ac_get_arg(&ctx->ac, invocation_id),
- ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
- invoc0_tess_factors, invoc0_tess_factors + 4);
-
- LLVMBuildRetVoid(ctx->ac.builder);
+ memset(&ctx->args, 0, sizeof(ctx->args));
+
+ if (ctx->screen->info.chip_class >= GFX9) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
+ } else {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
+ }
+
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
+ struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id);
+ struct ac_arg invocation_id; /* invocation ID within the patch */
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id);
+ struct ac_arg
+ tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tcs_out_current_patch_data_offset);
+
+ struct ac_arg tess_factors[6];
+ for (unsigned i = 0; i < 6; i++)
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]);
+
+ /* Create the function. */
+ si_llvm_create_func(ctx, "tcs_epilog", NULL, 0, ctx->screen->info.chip_class >= GFX7 ? 128 : 0);
+ ac_declare_lds_as_pointer(&ctx->ac);
+
+ LLVMValueRef invoc0_tess_factors[6];
+ for (unsigned i = 0; i < 6; i++)
+ invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);
+
+ si_write_tess_factors(ctx, ac_get_arg(&ctx->ac, rel_patch_id),
+ ac_get_arg(&ctx->ac, invocation_id),
+ ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
+ invoc0_tess_factors, invoc0_tess_factors + 4);
+
+ LLVMBuildRetVoid(ctx->ac.builder);
}
void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx)
{
- ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
- ctx->abi.load_tess_level = si_load_tess_level;
- ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
- ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
- ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
+ ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
+ ctx->abi.load_tess_level = si_load_tess_level;
+ ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
+ ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
+ ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
}
void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
{
- ctx->abi.load_tess_varyings = si_nir_load_input_tes;
- ctx->abi.load_tess_coord = si_load_tess_coord;
- ctx->abi.load_tess_level = si_load_tess_level;
- ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
-
- if (ctx->shader->key.as_es)
- ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
- else if (ngg_cull_shader)
- ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
- else if (ctx->shader->key.as_ngg)
- ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
- else
- ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
+ ctx->abi.load_tess_varyings = si_nir_load_input_tes;
+ ctx->abi.load_tess_coord = si_load_tess_coord;
+ ctx->abi.load_tess_level = si_load_tess_level;
+ ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
+
+ if (ctx->shader->key.as_es)
+ ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
+ else if (ngg_cull_shader)
+ ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
+ else if (ctx->shader->key.as_ngg)
+ ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
+ else
+ ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
}
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_shader_internal.h"
#include "si_pipe.h"
+#include "si_shader_internal.h"
#include "sid.h"
#include "util/u_memory.h"
-static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
- LLVMValueRef i32, unsigned index)
+static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index)
{
- assert(index <= 1);
+ assert(index <= 1);
- if (index == 1)
- return LLVMBuildAShr(ctx->ac.builder, i32,
- LLVMConstInt(ctx->ac.i32, 16, 0), "");
+ if (index == 1)
+ return LLVMBuildAShr(ctx->ac.builder, i32, LLVMConstInt(ctx->ac.i32, 16, 0), "");
- return LLVMBuildSExt(ctx->ac.builder,
- LLVMBuildTrunc(ctx->ac.builder, i32,
- ctx->ac.i16, ""),
- ctx->ac.i32, "");
+ return LLVMBuildSExt(ctx->ac.builder, LLVMBuildTrunc(ctx->ac.builder, i32, ctx->ac.i16, ""),
+ ctx->ac.i32, "");
}
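/* Editor's note: illustrative sketch, not part of this change. The same
 * unpacking as unpack_sint16() above, in plain C: index 0 sign-extends the
 * low 16 bits of the packed dword, index 1 arithmetic-shifts the high 16
 * bits down. The function name is hypothetical.
 */
#include <stdint.h>

static int32_t example_unpack_sint16(uint32_t packed, unsigned index)
{
   if (index == 1)
      return (int32_t)packed >> 16;             /* high half, sign-extended */
   return (int32_t)(int16_t)(packed & 0xffff);  /* low half, sign-extended */
}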
-static void load_input_vs(struct si_shader_context *ctx, unsigned input_index,
- LLVMValueRef out[4])
+static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4])
{
- const struct si_shader_info *info = &ctx->shader->selector->info;
- unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
-
- if (vs_blit_property) {
- LLVMValueRef vertex_id = ctx->abi.vertex_id;
- LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
- LLVMIntULE, vertex_id,
- ctx->ac.i32_1, "");
- /* Use LLVMIntNE, because we have 3 vertices and only
- * the middle one should use y2.
- */
- LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
- LLVMIntNE, vertex_id,
- ctx->ac.i32_1, "");
-
- unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
- if (input_index == 0) {
- /* Position: */
- LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs);
- LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 1);
-
- LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
- LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
- LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
- LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);
-
- LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
- x1, x2, "");
- LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
- y1, y2, "");
-
- out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
- out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
- out[2] = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 2);
- out[3] = ctx->ac.f32_1;
- return;
- }
-
- /* Color or texture coordinates: */
- assert(input_index == 1);
-
- if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
- for (int i = 0; i < 4; i++) {
- out[i] = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 3 + i);
- }
- } else {
- assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
- LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 3);
- LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 4);
- LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 5);
- LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 6);
-
- out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
- x1, x2, "");
- out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
- y1, y2, "");
- out[2] = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 7);
- out[3] = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 8);
- }
- return;
- }
-
- unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
- union si_vs_fix_fetch fix_fetch;
- LLVMValueRef vb_desc;
- LLVMValueRef vertex_index;
- LLVMValueRef tmp;
-
- if (input_index < num_vbos_in_user_sgprs) {
- vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
- } else {
- unsigned index= input_index - num_vbos_in_user_sgprs;
- vb_desc = ac_build_load_to_sgpr(&ctx->ac,
- ac_get_arg(&ctx->ac, ctx->vertex_buffers),
- LLVMConstInt(ctx->ac.i32, index, 0));
- }
-
- vertex_index = LLVMGetParam(ctx->main_fn,
- ctx->vertex_index0.arg_index +
- input_index);
-
- /* Use the open-coded implementation for all loads of doubles and
- * of dword-sized data that needs fixups. We need to insert conversion
- * code anyway, and the amd/common code does it for us.
- *
- * Note: On LLVM <= 8, we can only open-code formats with
- * channel size >= 4 bytes.
- */
- bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
- fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
- if (opencode ||
- (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
- (fix_fetch.u.log_size == 2)) {
- tmp = ac_build_opencoded_load_format(
- &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
- fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
- vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
- for (unsigned i = 0; i < 4; ++i)
- out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");
- return;
- }
-
- /* Do multiple loads for special formats. */
- unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
- LLVMValueRef fetches[4];
- unsigned num_fetches;
- unsigned fetch_stride;
- unsigned channels_per_fetch;
-
- if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
- num_fetches = MIN2(required_channels, 3);
- fetch_stride = 1 << fix_fetch.u.log_size;
- channels_per_fetch = 1;
- } else {
- num_fetches = 1;
- fetch_stride = 0;
- channels_per_fetch = required_channels;
- }
-
- for (unsigned i = 0; i < num_fetches; ++i) {
- LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
- fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
- channels_per_fetch, 0, true);
- }
-
- if (num_fetches == 1 && channels_per_fetch > 1) {
- LLVMValueRef fetch = fetches[0];
- for (unsigned i = 0; i < channels_per_fetch; ++i) {
- tmp = LLVMConstInt(ctx->ac.i32, i, false);
- fetches[i] = LLVMBuildExtractElement(
- ctx->ac.builder, fetch, tmp, "");
- }
- num_fetches = channels_per_fetch;
- channels_per_fetch = 1;
- }
-
- for (unsigned i = num_fetches; i < 4; ++i)
- fetches[i] = LLVMGetUndef(ctx->ac.f32);
-
- if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
- required_channels == 4) {
- if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
- fetches[3] = ctx->ac.i32_1;
- else
- fetches[3] = ctx->ac.f32_1;
- } else if (fix_fetch.u.log_size == 3 &&
- (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
- fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
- fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
- required_channels == 4) {
- /* For 2_10_10_10, the hardware returns an unsigned value;
- * convert it to a signed one.
- */
- LLVMValueRef tmp = fetches[3];
- LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
-
- /* First, recover the sign-extended signed integer value. */
- if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
- tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, "");
- else
- tmp = ac_to_integer(&ctx->ac, tmp);
-
- /* For the integer-like cases, do a natural sign extension.
- *
- * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
- * and happen to contain 0, 1, 2, 3 as the two LSBs of the
- * exponent.
- */
- tmp = LLVMBuildShl(ctx->ac.builder, tmp,
- fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
- LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
- tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
-
- /* Convert back to the right type. */
- if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
- LLVMValueRef clamp;
- LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
- tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
- clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
- tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
- } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
- tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
- }
-
- fetches[3] = tmp;
- }
-
- for (unsigned i = 0; i < 4; ++i)
- out[i] = ac_to_float(&ctx->ac, fetches[i]);
+ const struct si_shader_info *info = &ctx->shader->selector->info;
+ unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
+
+ if (vs_blit_property) {
+ LLVMValueRef vertex_id = ctx->abi.vertex_id;
+ LLVMValueRef sel_x1 =
+ LLVMBuildICmp(ctx->ac.builder, LLVMIntULE, vertex_id, ctx->ac.i32_1, "");
+ /* Use LLVMIntNE, because we have 3 vertices and only
+ * the middle one should use y2.
+ */
+ LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, vertex_id, ctx->ac.i32_1, "");
+
+ unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
+ if (input_index == 0) {
+ /* Position: */
+ LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs);
+ LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 1);
+
+ LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
+ LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
+ LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
+ LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);
+
+ LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
+ LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");
+
+ out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
+ out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
+ out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 2);
+ out[3] = ctx->ac.f32_1;
+ return;
+ }
+
+ /* Color or texture coordinates: */
+ assert(input_index == 1);
+
+ if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
+ for (int i = 0; i < 4; i++) {
+ out[i] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3 + i);
+ }
+ } else {
+ assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
+ LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3);
+ LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 4);
+ LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 5);
+ LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 6);
+
+ out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
+ out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");
+ out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 7);
+ out[3] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 8);
+ }
+ return;
+ }
+
+ unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
+ union si_vs_fix_fetch fix_fetch;
+ LLVMValueRef vb_desc;
+ LLVMValueRef vertex_index;
+ LLVMValueRef tmp;
+
+ if (input_index < num_vbos_in_user_sgprs) {
+ vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
+ } else {
+ unsigned index = input_index - num_vbos_in_user_sgprs;
+ vb_desc = ac_build_load_to_sgpr(&ctx->ac, ac_get_arg(&ctx->ac, ctx->vertex_buffers),
+ LLVMConstInt(ctx->ac.i32, index, 0));
+ }
+
+ vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index);
+
+ /* Use the open-coded implementation for all loads of doubles and
+ * of dword-sized data that needs fixups. We need to insert conversion
+ * code anyway, and the amd/common code does it for us.
+ *
+ * Note: On LLVM <= 8, we can only open-code formats with
+ * channel size >= 4 bytes.
+ */
+ bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
+ fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
+ if (opencode || (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
+ (fix_fetch.u.log_size == 2)) {
+ tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size,
+ fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format,
+ fix_fetch.u.reverse, !opencode, vb_desc, vertex_index,
+ ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
+ for (unsigned i = 0; i < 4; ++i)
+ out[i] =
+ LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");
+ return;
+ }
+
+ /* Do multiple loads for special formats. */
+ unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
+ LLVMValueRef fetches[4];
+ unsigned num_fetches;
+ unsigned fetch_stride;
+ unsigned channels_per_fetch;
+
+ if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
+ num_fetches = MIN2(required_channels, 3);
+ fetch_stride = 1 << fix_fetch.u.log_size;
+ channels_per_fetch = 1;
+ } else {
+ num_fetches = 1;
+ fetch_stride = 0;
+ channels_per_fetch = required_channels;
+ }
+
+ for (unsigned i = 0; i < num_fetches; ++i) {
+ LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
+ fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
+ channels_per_fetch, 0, true);
+ }
+
+ if (num_fetches == 1 && channels_per_fetch > 1) {
+ LLVMValueRef fetch = fetches[0];
+ for (unsigned i = 0; i < channels_per_fetch; ++i) {
+ tmp = LLVMConstInt(ctx->ac.i32, i, false);
+ fetches[i] = LLVMBuildExtractElement(ctx->ac.builder, fetch, tmp, "");
+ }
+ num_fetches = channels_per_fetch;
+ channels_per_fetch = 1;
+ }
+
+ for (unsigned i = num_fetches; i < 4; ++i)
+ fetches[i] = LLVMGetUndef(ctx->ac.f32);
+
+ if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && required_channels == 4) {
+ if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
+ fetches[3] = ctx->ac.i32_1;
+ else
+ fetches[3] = ctx->ac.f32_1;
+ } else if (fix_fetch.u.log_size == 3 &&
+ (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
+ fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
+ fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
+ required_channels == 4) {
+ /* For 2_10_10_10, the hardware returns an unsigned value;
+ * convert it to a signed one.
+ */
+ LLVMValueRef tmp = fetches[3];
+ LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
+
+ /* First, recover the sign-extended signed integer value. */
+ if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
+ tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, "");
+ else
+ tmp = ac_to_integer(&ctx->ac, tmp);
+
+ /* For the integer-like cases, do a natural sign extension.
+ *
+ * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
+ * and happen to contain 0, 1, 2, 3 as the two LSBs of the
+ * exponent.
+ */
+ tmp = LLVMBuildShl(
+ ctx->ac.builder, tmp,
+ fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
+ tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
+
+ /* Convert back to the right type. */
+ if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
+ LLVMValueRef clamp;
+ LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
+ tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
+ clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
+ tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
+ } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
+ tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
+ }
+
+ fetches[3] = tmp;
+ }
+
+ for (unsigned i = 0; i < 4; ++i)
+ out[i] = ac_to_float(&ctx->ac, fetches[i]);
}
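/* Editor's note: illustrative sketch, not part of this change. It shows
 * the 2-bit sign extension that the shl/ashr pair above performs for the
 * alpha channel of 2_10_10_10 vertex formats: the field is shifted to the
 * top of a 32-bit word and arithmetic-shifted back, so the raw values
 * 0, 1, 2, 3 become 0, 1, -2, -1. (For SNORM the left shift is 7 instead
 * of 30, because the field sits in the two LSBs of the float exponent,
 * bits 23-24.) The function name is hypothetical.
 */
#include <stdint.h>

static int32_t example_sign_extend_2bit(uint32_t two_bit_field)
{
   return (int32_t)(two_bit_field << 30) >> 30;
}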
static void declare_input_vs(struct si_shader_context *ctx, unsigned input_index)
{
- LLVMValueRef input[4];
+ LLVMValueRef input[4];
- load_input_vs(ctx, input_index / 4, input);
+ load_input_vs(ctx, input_index / 4, input);
- for (unsigned chan = 0; chan < 4; chan++) {
- ctx->inputs[input_index + chan] =
- LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, "");
- }
+ for (unsigned chan = 0; chan < 4; chan++) {
+ ctx->inputs[input_index + chan] =
+ LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, "");
+ }
}
void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir)
{
- uint64_t processed_inputs = 0;
-
- nir_foreach_variable(variable, &nir->inputs) {
- unsigned attrib_count = glsl_count_attribute_slots(variable->type,
- true);
- unsigned input_idx = variable->data.driver_location;
- unsigned loc = variable->data.location;
-
- for (unsigned i = 0; i < attrib_count; i++) {
- /* Packed components share the same location so skip
- * them if we have already processed the location.
- */
- if (processed_inputs & ((uint64_t)1 << (loc + i))) {
- input_idx += 4;
- continue;
- }
-
- declare_input_vs(ctx, input_idx);
- if (glsl_type_is_dual_slot(variable->type)) {
- input_idx += 4;
- declare_input_vs(ctx, input_idx);
- }
-
- processed_inputs |= ((uint64_t)1 << (loc + i));
- input_idx += 4;
- }
- }
+ uint64_t processed_inputs = 0;
+
+ nir_foreach_variable (variable, &nir->inputs) {
+ unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);
+ unsigned input_idx = variable->data.driver_location;
+ unsigned loc = variable->data.location;
+
+ for (unsigned i = 0; i < attrib_count; i++) {
+ /* Packed components share the same location so skip
+ * them if we have already processed the location.
+ */
+ if (processed_inputs & ((uint64_t)1 << (loc + i))) {
+ input_idx += 4;
+ continue;
+ }
+
+ declare_input_vs(ctx, input_idx);
+ if (glsl_type_is_dual_slot(variable->type)) {
+ input_idx += 4;
+ declare_input_vs(ctx, input_idx);
+ }
+
+ processed_inputs |= ((uint64_t)1 << (loc + i));
+ input_idx += 4;
+ }
+ }
}
-void si_llvm_streamout_store_output(struct si_shader_context *ctx,
- LLVMValueRef const *so_buffers,
- LLVMValueRef const *so_write_offsets,
- struct pipe_stream_output *stream_out,
- struct si_shader_output_values *shader_out)
+void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
+ LLVMValueRef const *so_write_offsets,
+ struct pipe_stream_output *stream_out,
+ struct si_shader_output_values *shader_out)
{
- unsigned buf_idx = stream_out->output_buffer;
- unsigned start = stream_out->start_component;
- unsigned num_comps = stream_out->num_components;
- LLVMValueRef out[4];
-
- assert(num_comps && num_comps <= 4);
- if (!num_comps || num_comps > 4)
- return;
-
- /* Load the output as int. */
- for (int j = 0; j < num_comps; j++) {
- assert(stream_out->stream == shader_out->vertex_stream[start + j]);
-
- out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
- }
-
- /* Pack the output. */
- LLVMValueRef vdata = NULL;
-
- switch (num_comps) {
- case 1: /* as i32 */
- vdata = out[0];
- break;
- case 2: /* as v2i32 */
- case 3: /* as v3i32 */
- if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
- vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
- break;
- }
- /* as v4i32 (aligned to 4) */
- out[3] = LLVMGetUndef(ctx->ac.i32);
- /* fall through */
- case 4: /* as v4i32 */
- vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
- break;
- }
-
- ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
- vdata, num_comps,
- so_write_offsets[buf_idx],
- ctx->ac.i32_0,
- stream_out->dst_offset * 4, ac_glc | ac_slc);
+ unsigned buf_idx = stream_out->output_buffer;
+ unsigned start = stream_out->start_component;
+ unsigned num_comps = stream_out->num_components;
+ LLVMValueRef out[4];
+
+ assert(num_comps && num_comps <= 4);
+ if (!num_comps || num_comps > 4)
+ return;
+
+ /* Load the output as int. */
+ for (int j = 0; j < num_comps; j++) {
+ assert(stream_out->stream == shader_out->vertex_stream[start + j]);
+
+ out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
+ }
+
+ /* Pack the output. */
+ LLVMValueRef vdata = NULL;
+
+ switch (num_comps) {
+ case 1: /* as i32 */
+ vdata = out[0];
+ break;
+ case 2: /* as v2i32 */
+ case 3: /* as v3i32 */
+ if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
+ vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
+ break;
+ }
+ /* as v4i32 (aligned to 4) */
+ out[3] = LLVMGetUndef(ctx->ac.i32);
+ /* fall through */
+ case 4: /* as v4i32 */
+ vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
+ break;
+ }
+
+ ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], vdata, num_comps,
+ so_write_offsets[buf_idx], ctx->ac.i32_0, stream_out->dst_offset * 4,
+ ac_glc | ac_slc);
}
/**
* Write streamout data to buffers for vertex stream @p stream (different
* vertex streams can occur for GS copy shaders).
*/
-void si_llvm_emit_streamout(struct si_shader_context *ctx,
- struct si_shader_output_values *outputs,
- unsigned noutput, unsigned stream)
+void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
+ unsigned noutput, unsigned stream)
{
- struct si_shader_selector *sel = ctx->shader->selector;
- struct pipe_stream_output_info *so = &sel->so;
- LLVMBuilderRef builder = ctx->ac.builder;
- int i;
-
- /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
- LLVMValueRef so_vtx_count =
- si_unpack_param(ctx, ctx->streamout_config, 16, 7);
-
- LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
-
- /* can_emit = tid < so_vtx_count; */
- LLVMValueRef can_emit =
- LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
-
- /* Emit the streamout code conditionally. This actually avoids
- * out-of-bounds buffer access. The hw tells us via the SGPR
- * (so_vtx_count) which threads are allowed to emit streamout data. */
- ac_build_ifcc(&ctx->ac, can_emit, 6501);
- {
- /* The buffer offset is computed as follows:
- * ByteOffset = streamout_offset[buffer_id]*4 +
- * (streamout_write_index + thread_id)*stride[buffer_id] +
- * attrib_offset
- */
-
- LLVMValueRef so_write_index =
- ac_get_arg(&ctx->ac,
- ctx->streamout_write_index);
-
- /* Compute (streamout_write_index + thread_id). */
- so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
-
- /* Load the descriptor and compute the write offset for each
- * enabled buffer. */
- LLVMValueRef so_write_offset[4] = {};
- LLVMValueRef so_buffers[4];
- LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac,
- ctx->rw_buffers);
-
- for (i = 0; i < 4; i++) {
- if (!so->stride[i])
- continue;
-
- LLVMValueRef offset = LLVMConstInt(ctx->ac.i32,
- SI_VS_STREAMOUT_BUF0 + i, 0);
-
- so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
-
- LLVMValueRef so_offset = ac_get_arg(&ctx->ac,
- ctx->streamout_offset[i]);
- so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
-
- so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
- LLVMConstInt(ctx->ac.i32, so->stride[i]*4, 0),
- so_offset);
- }
-
- /* Write streamout data. */
- for (i = 0; i < so->num_outputs; i++) {
- unsigned reg = so->output[i].register_index;
-
- if (reg >= noutput)
- continue;
-
- if (stream != so->output[i].stream)
- continue;
-
- si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset,
- &so->output[i], &outputs[reg]);
- }
- }
- ac_build_endif(&ctx->ac, 6501);
+ struct si_shader_selector *sel = ctx->shader->selector;
+ struct pipe_stream_output_info *so = &sel->so;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ int i;
+
+ /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
+ LLVMValueRef so_vtx_count = si_unpack_param(ctx, ctx->streamout_config, 16, 7);
+
+ LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
+
+ /* can_emit = tid < so_vtx_count; */
+ LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
+
+ /* Emit the streamout code conditionally. This actually avoids
+ * out-of-bounds buffer access. The hw tells us via the SGPR
+ * (so_vtx_count) which threads are allowed to emit streamout data. */
+ ac_build_ifcc(&ctx->ac, can_emit, 6501);
+ {
+ /* The buffer offset is computed as follows:
+ * ByteOffset = streamout_offset[buffer_id]*4 +
+ * (streamout_write_index + thread_id)*stride[buffer_id] +
+ * attrib_offset
+ */
+
+ LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->streamout_write_index);
+
+ /* Compute (streamout_write_index + thread_id). */
+ so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
+
+ /* Load the descriptor and compute the write offset for each
+ * enabled buffer. */
+ LLVMValueRef so_write_offset[4] = {};
+ LLVMValueRef so_buffers[4];
+ LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+
+ for (i = 0; i < 4; i++) {
+ if (!so->stride[i])
+ continue;
+
+ LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + i, 0);
+
+ so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
+
+ LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->streamout_offset[i]);
+ so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
+
+ so_write_offset[i] = ac_build_imad(
+ &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, so->stride[i] * 4, 0), so_offset);
+ }
+
+ /* Write streamout data. */
+ for (i = 0; i < so->num_outputs; i++) {
+ unsigned reg = so->output[i].register_index;
+
+ if (reg >= noutput)
+ continue;
+
+ if (stream != so->output[i].stream)
+ continue;
+
+ si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, &so->output[i],
+ &outputs[reg]);
+ }
+ }
+ ac_build_endif(&ctx->ac, 6501);
}
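/* Editor's note: illustrative sketch, not part of this change. It spells
 * out the streamout addressing from the comment above in plain C. Strides
 * and dst_offset come from pipe_stream_output_info and are in dwords; the
 * attrib part (dst_offset * 4) is added later, in
 * si_llvm_streamout_store_output(). The function name is hypothetical.
 */
static unsigned example_streamout_byte_offset(unsigned buffer_offset_dw, unsigned write_index,
                                              unsigned thread_id, unsigned stride_dw,
                                              unsigned dst_offset_dw)
{
   return buffer_offset_dw * 4 +                     /* streamout_offset[buffer_id] * 4 */
          (write_index + thread_id) * stride_dw * 4 +
          dst_offset_dw * 4;                         /* attrib_offset */
}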
-static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
- struct ac_export_args *pos, LLVMValueRef *out_elts)
+static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, struct ac_export_args *pos,
+ LLVMValueRef *out_elts)
{
- unsigned reg_index;
- unsigned chan;
- unsigned const_chan;
- LLVMValueRef base_elt;
- LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
- LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32,
- SI_VS_CONST_CLIP_PLANES, 0);
- LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
-
- for (reg_index = 0; reg_index < 2; reg_index ++) {
- struct ac_export_args *args = &pos[2 + reg_index];
-
- args->out[0] =
- args->out[1] =
- args->out[2] =
- args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f);
-
- /* Compute dot products of position and user clip plane vectors */
- for (chan = 0; chan < 4; chan++) {
- for (const_chan = 0; const_chan < 4; const_chan++) {
- LLVMValueRef addr =
- LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 +
- const_chan) * 4, 0);
- base_elt = si_buffer_load_const(ctx, const_resource,
- addr);
- args->out[chan] = ac_build_fmad(&ctx->ac, base_elt,
- out_elts[const_chan], args->out[chan]);
- }
- }
-
- args->enabled_channels = 0xf;
- args->valid_mask = 0;
- args->done = 0;
- args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
- args->compr = 0;
- }
+ unsigned reg_index;
+ unsigned chan;
+ unsigned const_chan;
+ LLVMValueRef base_elt;
+ LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+ LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0);
+ LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
+
+ for (reg_index = 0; reg_index < 2; reg_index++) {
+ struct ac_export_args *args = &pos[2 + reg_index];
+
+ args->out[0] = args->out[1] = args->out[2] = args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f);
+
+ /* Compute dot products of position and user clip plane vectors */
+ for (chan = 0; chan < 4; chan++) {
+ for (const_chan = 0; const_chan < 4; const_chan++) {
+ LLVMValueRef addr =
+ LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + const_chan) * 4, 0);
+ base_elt = si_buffer_load_const(ctx, const_resource, addr);
+ args->out[chan] =
+ ac_build_fmad(&ctx->ac, base_elt, out_elts[const_chan], args->out[chan]);
+ }
+ }
+
+ args->enabled_channels = 0xf;
+ args->valid_mask = 0;
+ args->done = 0;
+ args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
+ args->compr = 0;
+ }
}
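/* Editor's note: illustrative sketch, not part of this change. What the
 * nested loops above compute, in plain C: each of the up to eight user
 * clip distances is the dot product of the vertex position with one
 * clip-plane vector read from the SI_VS_CONST_CLIP_PLANES constant buffer
 * (plane index = reg_index * 4 + chan). Names are hypothetical.
 */
static void example_user_clip_distances(const float planes[8][4], const float position[4],
                                        float clip_dist[8])
{
   for (int i = 0; i < 8; i++) {
      clip_dist[i] = 0.0f;
      for (int c = 0; c < 4; c++)
         clip_dist[i] += planes[i][c] * position[c];
   }
}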
/* Initialize arguments for the shader export intrinsic */
-static void si_llvm_init_vs_export_args(struct si_shader_context *ctx,
- LLVMValueRef *values,
- unsigned target,
- struct ac_export_args *args)
+static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, LLVMValueRef *values,
+ unsigned target, struct ac_export_args *args)
{
- args->enabled_channels = 0xf; /* writemask - default is 0xf */
- args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */
- args->done = 0; /* Specify whether this is the last export */
- args->target = target; /* Specify the target we are exporting */
- args->compr = false;
+ args->enabled_channels = 0xf; /* writemask - default is 0xf */
+ args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */
+ args->done = 0; /* Specify whether this is the last export */
+ args->target = target; /* Specify the target we are exporting */
+ args->compr = false;
- memcpy(&args->out[0], values, sizeof(values[0]) * 4);
+ memcpy(&args->out[0], values, sizeof(values[0]) * 4);
}
-static void si_export_param(struct si_shader_context *ctx, unsigned index,
- LLVMValueRef *values)
+static void si_export_param(struct si_shader_context *ctx, unsigned index, LLVMValueRef *values)
{
- struct ac_export_args args;
+ struct ac_export_args args;
- si_llvm_init_vs_export_args(ctx, values,
- V_008DFC_SQ_EXP_PARAM + index, &args);
- ac_build_export(&ctx->ac, &args);
+ si_llvm_init_vs_export_args(ctx, values, V_008DFC_SQ_EXP_PARAM + index, &args);
+ ac_build_export(&ctx->ac, &args);
}
static void si_build_param_exports(struct si_shader_context *ctx,
- struct si_shader_output_values *outputs,
- unsigned noutput)
+ struct si_shader_output_values *outputs, unsigned noutput)
{
- struct si_shader *shader = ctx->shader;
- unsigned param_count = 0;
-
- for (unsigned i = 0; i < noutput; i++) {
- unsigned semantic_name = outputs[i].semantic_name;
- unsigned semantic_index = outputs[i].semantic_index;
-
- if (outputs[i].vertex_stream[0] != 0 &&
- outputs[i].vertex_stream[1] != 0 &&
- outputs[i].vertex_stream[2] != 0 &&
- outputs[i].vertex_stream[3] != 0)
- continue;
-
- switch (semantic_name) {
- case TGSI_SEMANTIC_LAYER:
- case TGSI_SEMANTIC_VIEWPORT_INDEX:
- case TGSI_SEMANTIC_CLIPDIST:
- case TGSI_SEMANTIC_COLOR:
- case TGSI_SEMANTIC_BCOLOR:
- case TGSI_SEMANTIC_PRIMID:
- case TGSI_SEMANTIC_FOG:
- case TGSI_SEMANTIC_TEXCOORD:
- case TGSI_SEMANTIC_GENERIC:
- break;
- default:
- continue;
- }
-
- if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
- semantic_index < SI_MAX_IO_GENERIC) &&
- shader->key.opt.kill_outputs &
- (1ull << si_shader_io_get_unique_index(semantic_name,
- semantic_index, true)))
- continue;
-
- si_export_param(ctx, param_count, outputs[i].values);
-
- assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
- shader->info.vs_output_param_offset[i] = param_count++;
- }
-
- shader->info.nr_param_exports = param_count;
+ struct si_shader *shader = ctx->shader;
+ unsigned param_count = 0;
+
+ for (unsigned i = 0; i < noutput; i++) {
+ unsigned semantic_name = outputs[i].semantic_name;
+ unsigned semantic_index = outputs[i].semantic_index;
+
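+ /* Skip outputs written only to non-zero vertex streams; only stream 0 reaches the
+ * rasterizer, so they need no parameter export. */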
+ if (outputs[i].vertex_stream[0] != 0 && outputs[i].vertex_stream[1] != 0 &&
+ outputs[i].vertex_stream[2] != 0 && outputs[i].vertex_stream[3] != 0)
+ continue;
+
+ switch (semantic_name) {
+ case TGSI_SEMANTIC_LAYER:
+ case TGSI_SEMANTIC_VIEWPORT_INDEX:
+ case TGSI_SEMANTIC_CLIPDIST:
+ case TGSI_SEMANTIC_COLOR:
+ case TGSI_SEMANTIC_BCOLOR:
+ case TGSI_SEMANTIC_PRIMID:
+ case TGSI_SEMANTIC_FOG:
+ case TGSI_SEMANTIC_TEXCOORD:
+ case TGSI_SEMANTIC_GENERIC:
+ break;
+ default:
+ continue;
+ }
+
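+ /* Also skip outputs that the shader key marks as killed because the bound
+ * fragment shader never reads them. */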
+ if ((semantic_name != TGSI_SEMANTIC_GENERIC || semantic_index < SI_MAX_IO_GENERIC) &&
+ shader->key.opt.kill_outputs &
+ (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index, true)))
+ continue;
+
+ si_export_param(ctx, param_count, outputs[i].values);
+
+ assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
+ shader->info.vs_output_param_offset[i] = param_count++;
+ }
+
+ shader->info.nr_param_exports = param_count;
}
/**
* is true.
*/
static void si_vertex_color_clamping(struct si_shader_context *ctx,
- struct si_shader_output_values *outputs,
- unsigned noutput)
+ struct si_shader_output_values *outputs, unsigned noutput)
{
- LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
- bool has_colors = false;
-
- /* Store original colors to alloca variables. */
- for (unsigned i = 0; i < noutput; i++) {
- if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
- outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
- continue;
-
- for (unsigned j = 0; j < 4; j++) {
- addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
- LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
- }
- has_colors = true;
- }
-
- if (!has_colors)
- return;
-
- /* The state is in the first bit of the user SGPR. */
- LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
- cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
-
- ac_build_ifcc(&ctx->ac, cond, 6502);
-
- /* Store clamped colors to alloca variables within the conditional block. */
- for (unsigned i = 0; i < noutput; i++) {
- if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
- outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
- continue;
-
- for (unsigned j = 0; j < 4; j++) {
- LLVMBuildStore(ctx->ac.builder,
- ac_build_clamp(&ctx->ac, outputs[i].values[j]),
- addr[i][j]);
- }
- }
- ac_build_endif(&ctx->ac, 6502);
-
- /* Load clamped colors */
- for (unsigned i = 0; i < noutput; i++) {
- if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
- outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
- continue;
-
- for (unsigned j = 0; j < 4; j++) {
- outputs[i].values[j] =
- LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
- }
- }
+ LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
+ bool has_colors = false;
+
+ /* Store original colors to alloca variables. */
+ for (unsigned i = 0; i < noutput; i++) {
+ if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
+ outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
+ continue;
+
+ for (unsigned j = 0; j < 4; j++) {
+ addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
+ LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
+ }
+ has_colors = true;
+ }
+
+ if (!has_colors)
+ return;
+
+ /* The state is in the first bit of the user SGPR. */
+ LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
+ cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
+
+ ac_build_ifcc(&ctx->ac, cond, 6502);
+
+ /* Store clamped colors to alloca variables within the conditional block. */
+ for (unsigned i = 0; i < noutput; i++) {
+ if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
+ outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
+ continue;
+
+ for (unsigned j = 0; j < 4; j++) {
+ LLVMBuildStore(ctx->ac.builder, ac_build_clamp(&ctx->ac, outputs[i].values[j]),
+ addr[i][j]);
+ }
+ }
+ ac_build_endif(&ctx->ac, 6502);
+
+ /* Load clamped colors */
+ for (unsigned i = 0; i < noutput; i++) {
+ if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
+ outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
+ continue;
+
+ for (unsigned j = 0; j < 4; j++) {
+ outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
+ }
+ }
}
/* Generate export instructions for hardware VS shader stage or NGG GS stage
* (position and parameter data only).
*/
void si_llvm_build_vs_exports(struct si_shader_context *ctx,
- struct si_shader_output_values *outputs,
- unsigned noutput)
+ struct si_shader_output_values *outputs, unsigned noutput)
{
- struct si_shader *shader = ctx->shader;
- struct ac_export_args pos_args[4] = {};
- LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
- unsigned pos_idx;
- int i;
-
- si_vertex_color_clamping(ctx, outputs, noutput);
-
- /* Build position exports. */
- for (i = 0; i < noutput; i++) {
- switch (outputs[i].semantic_name) {
- case TGSI_SEMANTIC_POSITION:
- si_llvm_init_vs_export_args(ctx, outputs[i].values,
- V_008DFC_SQ_EXP_POS, &pos_args[0]);
- break;
- case TGSI_SEMANTIC_PSIZE:
- psize_value = outputs[i].values[0];
- break;
- case TGSI_SEMANTIC_LAYER:
- layer_value = outputs[i].values[0];
- break;
- case TGSI_SEMANTIC_VIEWPORT_INDEX:
- viewport_index_value = outputs[i].values[0];
- break;
- case TGSI_SEMANTIC_EDGEFLAG:
- edgeflag_value = outputs[i].values[0];
- break;
- case TGSI_SEMANTIC_CLIPDIST:
- if (!shader->key.opt.clip_disable) {
- unsigned index = 2 + outputs[i].semantic_index;
- si_llvm_init_vs_export_args(ctx, outputs[i].values,
- V_008DFC_SQ_EXP_POS + index,
- &pos_args[index]);
- }
- break;
- case TGSI_SEMANTIC_CLIPVERTEX:
- if (!shader->key.opt.clip_disable) {
- si_llvm_emit_clipvertex(ctx, pos_args,
- outputs[i].values);
- }
- break;
- }
- }
-
- /* We need to add the position output manually if it's missing. */
- if (!pos_args[0].out[0]) {
- pos_args[0].enabled_channels = 0xf; /* writemask */
- pos_args[0].valid_mask = 0; /* EXEC mask */
- pos_args[0].done = 0; /* last export? */
- pos_args[0].target = V_008DFC_SQ_EXP_POS;
- pos_args[0].compr = 0; /* COMPR flag */
- pos_args[0].out[0] = ctx->ac.f32_0; /* X */
- pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
- pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
- pos_args[0].out[3] = ctx->ac.f32_1; /* W */
- }
-
- bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag &&
- !shader->key.as_ngg;
-
- /* Write the misc vector (point size, edgeflag, layer, viewport). */
- if (shader->selector->info.writes_psize ||
- pos_writes_edgeflag ||
- shader->selector->info.writes_viewport_index ||
- shader->selector->info.writes_layer) {
- pos_args[1].enabled_channels = shader->selector->info.writes_psize |
- (pos_writes_edgeflag << 1) |
- (shader->selector->info.writes_layer << 2);
-
- pos_args[1].valid_mask = 0; /* EXEC mask */
- pos_args[1].done = 0; /* last export? */
- pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
- pos_args[1].compr = 0; /* COMPR flag */
- pos_args[1].out[0] = ctx->ac.f32_0; /* X */
- pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
- pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
- pos_args[1].out[3] = ctx->ac.f32_0; /* W */
-
- if (shader->selector->info.writes_psize)
- pos_args[1].out[0] = psize_value;
-
- if (pos_writes_edgeflag) {
- /* The output is a float, but the hw expects an integer
- * with the first bit containing the edge flag. */
- edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
- edgeflag_value,
- ctx->ac.i32, "");
- edgeflag_value = ac_build_umin(&ctx->ac,
- edgeflag_value,
- ctx->ac.i32_1);
-
- /* The LLVM intrinsic expects a float. */
- pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
- }
-
- if (ctx->screen->info.chip_class >= GFX9) {
- /* GFX9 has the layer in out.z[10:0] and the viewport
- * index in out.z[19:16].
- */
- if (shader->selector->info.writes_layer)
- pos_args[1].out[2] = layer_value;
-
- if (shader->selector->info.writes_viewport_index) {
- LLVMValueRef v = viewport_index_value;
-
- v = ac_to_integer(&ctx->ac, v);
- v = LLVMBuildShl(ctx->ac.builder, v,
- LLVMConstInt(ctx->ac.i32, 16, 0), "");
- v = LLVMBuildOr(ctx->ac.builder, v,
- ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
- pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
- pos_args[1].enabled_channels |= 1 << 2;
- }
- } else {
- if (shader->selector->info.writes_layer)
- pos_args[1].out[2] = layer_value;
-
- if (shader->selector->info.writes_viewport_index) {
- pos_args[1].out[3] = viewport_index_value;
- pos_args[1].enabled_channels |= 1 << 3;
- }
- }
- }
-
- for (i = 0; i < 4; i++)
- if (pos_args[i].out[0])
- shader->info.nr_pos_exports++;
-
- /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
- * Setting valid_mask=1 prevents it and has no other effect.
- */
- if (ctx->screen->info.family == CHIP_NAVI10 ||
- ctx->screen->info.family == CHIP_NAVI12 ||
- ctx->screen->info.family == CHIP_NAVI14)
- pos_args[0].valid_mask = 1;
-
- pos_idx = 0;
- for (i = 0; i < 4; i++) {
- if (!pos_args[i].out[0])
- continue;
-
- /* Specify the target we are exporting */
- pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
-
- if (pos_idx == shader->info.nr_pos_exports)
- /* Specify that this is the last export */
- pos_args[i].done = 1;
-
- ac_build_export(&ctx->ac, &pos_args[i]);
- }
-
- /* Build parameter exports. */
- si_build_param_exports(ctx, outputs, noutput);
+ struct si_shader *shader = ctx->shader;
+ struct ac_export_args pos_args[4] = {};
+ LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL,
+ viewport_index_value = NULL;
+ unsigned pos_idx;
+ int i;
+
+ si_vertex_color_clamping(ctx, outputs, noutput);
+
+ /* Build position exports. */
+ for (i = 0; i < noutput; i++) {
+ switch (outputs[i].semantic_name) {
+ case TGSI_SEMANTIC_POSITION:
+ si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS, &pos_args[0]);
+ break;
+ case TGSI_SEMANTIC_PSIZE:
+ psize_value = outputs[i].values[0];
+ break;
+ case TGSI_SEMANTIC_LAYER:
+ layer_value = outputs[i].values[0];
+ break;
+ case TGSI_SEMANTIC_VIEWPORT_INDEX:
+ viewport_index_value = outputs[i].values[0];
+ break;
+ case TGSI_SEMANTIC_EDGEFLAG:
+ edgeflag_value = outputs[i].values[0];
+ break;
+ case TGSI_SEMANTIC_CLIPDIST:
+ if (!shader->key.opt.clip_disable) {
+ unsigned index = 2 + outputs[i].semantic_index;
+ si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS + index,
+ &pos_args[index]);
+ }
+ break;
+ case TGSI_SEMANTIC_CLIPVERTEX:
+ if (!shader->key.opt.clip_disable) {
+ si_llvm_emit_clipvertex(ctx, pos_args, outputs[i].values);
+ }
+ break;
+ }
+ }
+
+ /* We need to add the position output manually if it's missing. */
+ if (!pos_args[0].out[0]) {
+ pos_args[0].enabled_channels = 0xf; /* writemask */
+ pos_args[0].valid_mask = 0; /* EXEC mask */
+ pos_args[0].done = 0; /* last export? */
+ pos_args[0].target = V_008DFC_SQ_EXP_POS;
+ pos_args[0].compr = 0; /* COMPR flag */
+ pos_args[0].out[0] = ctx->ac.f32_0; /* X */
+ pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
+ pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
+ pos_args[0].out[3] = ctx->ac.f32_1; /* W */
+ }
+
+ bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && !shader->key.as_ngg;
+
+ /* Write the misc vector (point size, edgeflag, layer, viewport). */
+ if (shader->selector->info.writes_psize || pos_writes_edgeflag ||
+ shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) {
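+ /* The misc export packs X = point size, Y = edge flag, Z = layer (with the viewport
+ * index in bits 19:16 on GFX9+), W = viewport index on older chips. */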
+ pos_args[1].enabled_channels = shader->selector->info.writes_psize |
+ (pos_writes_edgeflag << 1) |
+ (shader->selector->info.writes_layer << 2);
+
+ pos_args[1].valid_mask = 0; /* EXEC mask */
+ pos_args[1].done = 0; /* last export? */
+ pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
+ pos_args[1].compr = 0; /* COMPR flag */
+ pos_args[1].out[0] = ctx->ac.f32_0; /* X */
+ pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
+ pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
+ pos_args[1].out[3] = ctx->ac.f32_0; /* W */
+
+ if (shader->selector->info.writes_psize)
+ pos_args[1].out[0] = psize_value;
+
+ if (pos_writes_edgeflag) {
+ /* The output is a float, but the hw expects an integer
+ * with the first bit containing the edge flag. */
+ edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, edgeflag_value, ctx->ac.i32, "");
+ edgeflag_value = ac_build_umin(&ctx->ac, edgeflag_value, ctx->ac.i32_1);
+
+ /* The LLVM intrinsic expects a float. */
+ pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
+ }
+
+ if (ctx->screen->info.chip_class >= GFX9) {
+ /* GFX9 has the layer in out.z[10:0] and the viewport
+ * index in out.z[19:16].
+ */
+ if (shader->selector->info.writes_layer)
+ pos_args[1].out[2] = layer_value;
+
+ if (shader->selector->info.writes_viewport_index) {
+ LLVMValueRef v = viewport_index_value;
+
+ v = ac_to_integer(&ctx->ac, v);
+ v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, 0), "");
+ v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
+ pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
+ pos_args[1].enabled_channels |= 1 << 2;
+ }
+ } else {
+ if (shader->selector->info.writes_layer)
+ pos_args[1].out[2] = layer_value;
+
+ if (shader->selector->info.writes_viewport_index) {
+ pos_args[1].out[3] = viewport_index_value;
+ pos_args[1].enabled_channels |= 1 << 3;
+ }
+ }
+ }
+
+ for (i = 0; i < 4; i++)
+ if (pos_args[i].out[0])
+ shader->info.nr_pos_exports++;
+
+ /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
+ * Setting valid_mask=1 prevents it and has no other effect.
+ */
+ if (ctx->screen->info.family == CHIP_NAVI10 || ctx->screen->info.family == CHIP_NAVI12 ||
+ ctx->screen->info.family == CHIP_NAVI14)
+ pos_args[0].valid_mask = 1;
+
+ pos_idx = 0;
+ for (i = 0; i < 4; i++) {
+ if (!pos_args[i].out[0])
+ continue;
+
+ /* Specify the target we are exporting */
+ pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
+
+ if (pos_idx == shader->info.nr_pos_exports)
+ /* Specify that this is the last export */
+ pos_args[i].done = 1;
+
+ ac_build_export(&ctx->ac, &pos_args[i]);
+ }
+
+ /* Build parameter exports. */
+ si_build_param_exports(ctx, outputs, noutput);
}
-void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
- LLVMValueRef *addrs)
+void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader_info *info = &ctx->shader->selector->info;
- struct si_shader_output_values *outputs = NULL;
- int i,j;
-
- assert(!ctx->shader->is_gs_copy_shader);
- assert(info->num_outputs <= max_outputs);
-
- outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
-
- for (i = 0; i < info->num_outputs; i++) {
- outputs[i].semantic_name = info->output_semantic_name[i];
- outputs[i].semantic_index = info->output_semantic_index[i];
-
- for (j = 0; j < 4; j++) {
- outputs[i].values[j] =
- LLVMBuildLoad(ctx->ac.builder,
- addrs[4 * i + j],
- "");
- outputs[i].vertex_stream[j] =
- (info->output_streams[i] >> (2 * j)) & 3;
- }
- }
-
- if (!ctx->screen->use_ngg_streamout &&
- ctx->shader->selector->so.num_outputs)
- si_llvm_emit_streamout(ctx, outputs, i, 0);
-
- /* Export PrimitiveID. */
- if (ctx->shader->key.mono.u.vs_export_prim_id) {
- outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
- outputs[i].semantic_index = 0;
- outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
- for (j = 1; j < 4; j++)
- outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);
-
- memset(outputs[i].vertex_stream, 0,
- sizeof(outputs[i].vertex_stream));
- i++;
- }
-
- si_llvm_build_vs_exports(ctx, outputs, i);
- FREE(outputs);
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader_info *info = &ctx->shader->selector->info;
+ struct si_shader_output_values *outputs = NULL;
+ int i, j;
+
+ assert(!ctx->shader->is_gs_copy_shader);
+ assert(info->num_outputs <= max_outputs);
+
+ outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
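+ /* The extra slot leaves room for the PrimitiveID output that may be appended below. */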
+
+ for (i = 0; i < info->num_outputs; i++) {
+ outputs[i].semantic_name = info->output_semantic_name[i];
+ outputs[i].semantic_index = info->output_semantic_index[i];
+
+ for (j = 0; j < 4; j++) {
+ outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
+ outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
+ }
+ }
+
+ if (!ctx->screen->use_ngg_streamout && ctx->shader->selector->so.num_outputs)
+ si_llvm_emit_streamout(ctx, outputs, i, 0);
+
+ /* Export PrimitiveID. */
+ if (ctx->shader->key.mono.u.vs_export_prim_id) {
+ outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
+ outputs[i].semantic_index = 0;
+ outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
+ for (j = 1; j < 4; j++)
+ outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);
+
+ memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream));
+ i++;
+ }
+
+ si_llvm_build_vs_exports(ctx, outputs, i);
+ FREE(outputs);
}
-static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
- unsigned max_outputs,
- LLVMValueRef *addrs)
+static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+ LLVMValueRef *addrs)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader_info *info = &ctx->shader->selector->info;
- LLVMValueRef pos[4] = {};
-
- assert(info->num_outputs <= max_outputs);
-
- for (unsigned i = 0; i < info->num_outputs; i++) {
- if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
- continue;
-
- for (unsigned chan = 0; chan < 4; chan++)
- pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
- break;
- }
- assert(pos[0] != NULL);
-
- /* Return the position output. */
- LLVMValueRef ret = ctx->return_value;
- for (unsigned chan = 0; chan < 4; chan++)
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
- ctx->return_value = ret;
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader_info *info = &ctx->shader->selector->info;
+ LLVMValueRef pos[4] = {};
+
+ assert(info->num_outputs <= max_outputs);
+
+ for (unsigned i = 0; i < info->num_outputs; i++) {
+ if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
+ continue;
+
+ for (unsigned chan = 0; chan < 4; chan++)
+ pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
+ break;
+ }
+ assert(pos[0] != NULL);
+
+ /* Return the position output. */
+ LLVMValueRef ret = ctx->return_value;
+ for (unsigned chan = 0; chan < 4; chan++)
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
+ ctx->return_value = ret;
}
/**
* (InstanceID + StartInstance),
* (InstanceID / 2 + StartInstance)
*/
-void si_llvm_build_vs_prolog(struct si_shader_context *ctx,
- union si_shader_part_key *key)
+void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
- LLVMTypeRef *returns;
- LLVMValueRef ret, func;
- int num_returns, i;
- unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
- unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4 +
- (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0);
- struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
- struct ac_arg input_vgpr_param[10];
- LLVMValueRef input_vgprs[10];
- unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
- num_input_vgprs;
- unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
-
- memset(&ctx->args, 0, sizeof(ctx->args));
-
- /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
- returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) *
- sizeof(LLVMTypeRef));
- num_returns = 0;
-
- /* Declare input and output SGPRs. */
- for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &input_sgpr_param[i]);
- returns[num_returns++] = ctx->ac.i32;
- }
-
- struct ac_arg merged_wave_info = input_sgpr_param[3];
-
- /* Preloaded VGPRs (outputs must be floats) */
- for (i = 0; i < num_input_vgprs; i++) {
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
- returns[num_returns++] = ctx->ac.f32;
- }
-
- /* Vertex load indices. */
- for (i = 0; i < key->vs_prolog.num_inputs; i++)
- returns[num_returns++] = ctx->ac.f32;
-
- /* Create the function. */
- si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
- func = ctx->main_fn;
-
- for (i = 0; i < num_input_vgprs; i++) {
- input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
- }
-
- if (key->vs_prolog.num_merged_next_stage_vgprs) {
- if (!key->vs_prolog.is_monolithic)
- si_init_exec_from_input(ctx, merged_wave_info, 0);
-
- if (key->vs_prolog.as_ls &&
- ctx->screen->info.has_ls_vgpr_init_bug) {
- /* If there are no HS threads, SPI loads the LS VGPRs
- * starting at VGPR 0. Shift them back to where they
- * belong.
- */
- LLVMValueRef has_hs_threads =
- LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
- si_unpack_param(ctx, input_sgpr_param[3], 8, 8),
- ctx->ac.i32_0, "");
-
- for (i = 4; i > 0; --i) {
- input_vgprs[i + 1] =
- LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
- input_vgprs[i + 1],
- input_vgprs[i - 1], "");
- }
- }
- }
-
- if (key->vs_prolog.gs_fast_launch_tri_list ||
- key->vs_prolog.gs_fast_launch_tri_strip) {
- LLVMValueRef wave_id, thread_id_in_tg;
-
- wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
- thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id,
- LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
- ac_get_thread_id(&ctx->ac));
-
- /* The GS fast launch initializes all VGPRs to the value of
- * the first thread, so we have to add the thread ID.
- *
- * Only these are initialized by the hw:
- * VGPR2: Base Primitive ID
- * VGPR5: Base Vertex ID
- * VGPR6: Instance ID
- */
-
- /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
- * The NGG cull shader will read them from there.
- */
- if (key->vs_prolog.gs_fast_launch_tri_list) {
- input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
- LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
- LLVMConstInt(ctx->ac.i32, 0, 0));
- input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
- LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
- LLVMConstInt(ctx->ac.i32, 1, 0));
- input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
- LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
- LLVMConstInt(ctx->ac.i32, 2, 0));
- } else {
- assert(key->vs_prolog.gs_fast_launch_tri_strip);
- LLVMBuilderRef builder = ctx->ac.builder;
- /* Triangle indices: */
- LLVMValueRef index[3] = {
- thread_id_in_tg,
- LLVMBuildAdd(builder, thread_id_in_tg,
- LLVMConstInt(ctx->ac.i32, 1, 0), ""),
- LLVMBuildAdd(builder, thread_id_in_tg,
- LLVMConstInt(ctx->ac.i32, 2, 0), ""),
- };
- LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder,
- thread_id_in_tg, ctx->ac.i1, "");
- LLVMValueRef flatshade_first =
- LLVMBuildICmp(builder, LLVMIntEQ,
- si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
- ctx->ac.i32_0, "");
-
- ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
- flatshade_first, index);
- input_vgprs[0] = index[0];
- input_vgprs[1] = index[1];
- input_vgprs[4] = index[2];
- }
-
- /* Triangles always have all edge flags set initially. */
- input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);
-
- input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2],
- thread_id_in_tg, ""); /* PrimID */
- input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5],
- thread_id_in_tg, ""); /* VertexID */
- input_vgprs[8] = input_vgprs[6]; /* InstanceID */
- }
-
- unsigned vertex_id_vgpr = first_vs_vgpr;
- unsigned instance_id_vgpr =
- ctx->screen->info.chip_class >= GFX10 ?
- first_vs_vgpr + 3 :
- first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
-
- ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
- ctx->abi.instance_id = input_vgprs[instance_id_vgpr];
-
- /* InstanceID = VertexID >> 16;
- * VertexID = VertexID & 0xffff;
- */
- if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
- ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id,
- LLVMConstInt(ctx->ac.i32, 16, 0), "");
- ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
- LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
- }
-
- /* Copy inputs to outputs. This should be no-op, as the registers match,
- * but it will prevent the compiler from overwriting them unintentionally.
- */
- ret = ctx->return_value;
- for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
- LLVMValueRef p = LLVMGetParam(func, i);
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
- }
- for (i = 0; i < num_input_vgprs; i++) {
- LLVMValueRef p = input_vgprs[i];
-
- if (i == vertex_id_vgpr)
- p = ctx->abi.vertex_id;
- else if (i == instance_id_vgpr)
- p = ctx->abi.instance_id;
-
- p = ac_to_float(&ctx->ac, p);
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
- key->vs_prolog.num_input_sgprs + i, "");
- }
-
- /* Compute vertex load indices from instance divisors. */
- LLVMValueRef instance_divisor_constbuf = NULL;
-
- if (key->vs_prolog.states.instance_divisor_is_fetched) {
- LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
- LLVMValueRef buf_index =
- LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
- instance_divisor_constbuf =
- ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
- }
-
- for (i = 0; i < key->vs_prolog.num_inputs; i++) {
- bool divisor_is_one =
- key->vs_prolog.states.instance_divisor_is_one & (1u << i);
- bool divisor_is_fetched =
- key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
- LLVMValueRef index = NULL;
-
- if (divisor_is_one) {
- index = ctx->abi.instance_id;
- } else if (divisor_is_fetched) {
- LLVMValueRef udiv_factors[4];
-
- for (unsigned j = 0; j < 4; j++) {
- udiv_factors[j] =
- si_buffer_load_const(ctx, instance_divisor_constbuf,
- LLVMConstInt(ctx->ac.i32, i*16 + j*4, 0));
- udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
- }
- /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
- * Such InstanceID might not be achievable in a reasonable time though.
- */
- index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
- udiv_factors[0], udiv_factors[1],
- udiv_factors[2], udiv_factors[3]);
- }
-
- if (divisor_is_one || divisor_is_fetched) {
- /* Add StartInstance. */
- index = LLVMBuildAdd(ctx->ac.builder, index,
- LLVMGetParam(ctx->main_fn, user_sgpr_base +
- SI_SGPR_START_INSTANCE), "");
- } else {
- /* VertexID + BaseVertex */
- index = LLVMBuildAdd(ctx->ac.builder,
- ctx->abi.vertex_id,
- LLVMGetParam(func, user_sgpr_base +
- SI_SGPR_BASE_VERTEX), "");
- }
-
- index = ac_to_float(&ctx->ac, index);
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
- ctx->args.arg_count + i, "");
- }
-
- si_llvm_build_ret(ctx, ret);
+ LLVMTypeRef *returns;
+ LLVMValueRef ret, func;
+ int num_returns, i;
+ unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
+ unsigned num_input_vgprs =
+ key->vs_prolog.num_merged_next_stage_vgprs + 4 + (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0);
+ struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
+ struct ac_arg input_vgpr_param[10];
+ LLVMValueRef input_vgprs[10];
+ unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + num_input_vgprs;
+ unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
+
+ memset(&ctx->args, 0, sizeof(ctx->args));
+
+ /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
+ returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * sizeof(LLVMTypeRef));
+ num_returns = 0;
+
+ /* Declare input and output SGPRs. */
+ for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &input_sgpr_param[i]);
+ returns[num_returns++] = ctx->ac.i32;
+ }
+
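+ /* On merged shader stages, input SGPR 3 carries the merged wave info; it is used below
+ * to initialize EXEC and to detect whether HS threads are present. */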
+ struct ac_arg merged_wave_info = input_sgpr_param[3];
+
+ /* Preloaded VGPRs (outputs must be floats) */
+ for (i = 0; i < num_input_vgprs; i++) {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
+ returns[num_returns++] = ctx->ac.f32;
+ }
+
+ /* Vertex load indices. */
+ for (i = 0; i < key->vs_prolog.num_inputs; i++)
+ returns[num_returns++] = ctx->ac.f32;
+
+ /* Create the function. */
+ si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
+ func = ctx->main_fn;
+
+ for (i = 0; i < num_input_vgprs; i++) {
+ input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
+ }
+
+ if (key->vs_prolog.num_merged_next_stage_vgprs) {
+ if (!key->vs_prolog.is_monolithic)
+ si_init_exec_from_input(ctx, merged_wave_info, 0);
+
+ if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) {
+ /* If there are no HS threads, SPI loads the LS VGPRs
+ * starting at VGPR 0. Shift them back to where they
+ * belong.
+ */
+ LLVMValueRef has_hs_threads =
+ LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
+ si_unpack_param(ctx, input_sgpr_param[3], 8, 8), ctx->ac.i32_0, "");
+
+ for (i = 4; i > 0; --i) {
+ input_vgprs[i + 1] = LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
+ input_vgprs[i + 1], input_vgprs[i - 1], "");
+ }
+ }
+ }
+
+ if (key->vs_prolog.gs_fast_launch_tri_list || key->vs_prolog.gs_fast_launch_tri_strip) {
+ LLVMValueRef wave_id, thread_id_in_tg;
+
+ wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
+ thread_id_in_tg =
+ ac_build_imad(&ctx->ac, wave_id, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
+ ac_get_thread_id(&ctx->ac));
+
+ /* The GS fast launch initializes all VGPRs to the value of
+ * the first thread, so we have to add the thread ID.
+ *
+ * Only these are initialized by the hw:
+ * VGPR2: Base Primitive ID
+ * VGPR5: Base Vertex ID
+ * VGPR6: Instance ID
+ */
+
+ /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
+ * The NGG cull shader will read them from there.
+ */
+ if (key->vs_prolog.gs_fast_launch_tri_list) {
+ input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
+ LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
+ LLVMConstInt(ctx->ac.i32, 0, 0));
+ input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
+ LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
+ LLVMConstInt(ctx->ac.i32, 1, 0));
+ input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
+ LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
+ LLVMConstInt(ctx->ac.i32, 2, 0));
+ } else {
+ assert(key->vs_prolog.gs_fast_launch_tri_strip);
+ LLVMBuilderRef builder = ctx->ac.builder;
+ /* Triangle indices: */
+ LLVMValueRef index[3] = {
+ thread_id_in_tg,
+ LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 1, 0), ""),
+ LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 2, 0), ""),
+ };
+ LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, thread_id_in_tg, ctx->ac.i1, "");
+ LLVMValueRef flatshade_first = LLVMBuildICmp(
+ builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, "");
+
+ ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, index);
+ input_vgprs[0] = index[0];
+ input_vgprs[1] = index[1];
+ input_vgprs[4] = index[2];
+ }
+
+ /* Triangles always have all edge flags set initially. */
+ input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);
+
+ input_vgprs[2] =
+ LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], thread_id_in_tg, ""); /* PrimID */
+ input_vgprs[5] =
+ LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */
+ input_vgprs[8] = input_vgprs[6]; /* InstanceID */
+ }
+
+ unsigned vertex_id_vgpr = first_vs_vgpr;
+ unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10
+ ? first_vs_vgpr + 3
+ : first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
+
+ ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
+ ctx->abi.instance_id = input_vgprs[instance_id_vgpr];
+
+ /* InstanceID = VertexID >> 16;
+ * VertexID = VertexID & 0xffff;
+ */
+ if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
+ ctx->abi.instance_id =
+ LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");
+ ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
+ LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
+ }
+
+ /* Copy inputs to outputs. This should be a no-op, as the registers match,
+ * but it prevents the compiler from overwriting them unintentionally.
+ */
+ ret = ctx->return_value;
+ for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+ LLVMValueRef p = LLVMGetParam(func, i);
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
+ }
+ for (i = 0; i < num_input_vgprs; i++) {
+ LLVMValueRef p = input_vgprs[i];
+
+ if (i == vertex_id_vgpr)
+ p = ctx->abi.vertex_id;
+ else if (i == instance_id_vgpr)
+ p = ctx->abi.instance_id;
+
+ p = ac_to_float(&ctx->ac, p);
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, key->vs_prolog.num_input_sgprs + i, "");
+ }
+
+ /* Compute vertex load indices from instance divisors. */
+ LLVMValueRef instance_divisor_constbuf = NULL;
+
+ if (key->vs_prolog.states.instance_divisor_is_fetched) {
+ LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
+ LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
+ instance_divisor_constbuf = ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
+ }
+
+ for (i = 0; i < key->vs_prolog.num_inputs; i++) {
+ bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i);
+ bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
+ LLVMValueRef index = NULL;
+
+ if (divisor_is_one) {
+ index = ctx->abi.instance_id;
+ } else if (divisor_is_fetched) {
+ LLVMValueRef udiv_factors[4];
+
+ for (unsigned j = 0; j < 4; j++) {
+ udiv_factors[j] = si_buffer_load_const(ctx, instance_divisor_constbuf,
+ LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0));
+ udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
+ }
+ /* The faster NUW version doesn't work when InstanceID == UINT_MAX,
+ * though such an InstanceID is unlikely to be reached in practice.
+ */
+ index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, udiv_factors[0],
+ udiv_factors[1], udiv_factors[2], udiv_factors[3]);
+ }
+
+ if (divisor_is_one || divisor_is_fetched) {
+ /* Add StartInstance. */
+ index =
+ LLVMBuildAdd(ctx->ac.builder, index,
+ LLVMGetParam(ctx->main_fn, user_sgpr_base + SI_SGPR_START_INSTANCE), "");
+ } else {
+ /* VertexID + BaseVertex */
+ index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
+ LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), "");
+ }
+
+ index = ac_to_float(&ctx->ac, index);
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, ctx->args.arg_count + i, "");
+ }
+
+ si_llvm_build_ret(ctx, ret);
}
static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-
- /* For non-indexed draws, the base vertex set by the driver
- * (for direct draws) or the CP (for indirect draws) is the
- * first vertex ID, but GLSL expects 0 to be returned.
- */
- LLVMValueRef vs_state = ac_get_arg(&ctx->ac,
- ctx->vs_state_bits);
- LLVMValueRef indexed;
-
- indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, "");
- indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");
-
- return LLVMBuildSelect(ctx->ac.builder, indexed,
- ac_get_arg(&ctx->ac, ctx->args.base_vertex),
- ctx->ac.i32_0, "");
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+
+ /* For non-indexed draws, the base vertex set by the driver
+ * (for direct draws) or the CP (for indirect draws) is the
+ * first vertex ID, but GLSL expects 0 to be returned.
+ */
+ LLVMValueRef vs_state = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
+ LLVMValueRef indexed;
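+ /* Bit 1 of vs_state_bits indicates an indexed draw. */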
+
+ indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, "");
+ indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");
+
+ return LLVMBuildSelect(ctx->ac.builder, indexed, ac_get_arg(&ctx->ac, ctx->args.base_vertex),
+ ctx->ac.i32_0, "");
}
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
{
- struct si_shader *shader = ctx->shader;
-
- if (shader->key.as_ls)
- ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
- else if (shader->key.as_es)
- ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
- else if (shader->key.opt.vs_as_prim_discard_cs)
- ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
- else if (ngg_cull_shader)
- ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
- else if (shader->key.as_ngg)
- ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
- else
- ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
-
- ctx->abi.load_base_vertex = get_base_vertex;
+ struct si_shader *shader = ctx->shader;
+
+ if (shader->key.as_ls)
+ ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
+ else if (shader->key.as_es)
+ ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
+ else if (shader->key.opt.vs_as_prim_discard_cs)
+ ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
+ else if (ngg_cull_shader)
+ ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
+ else if (shader->key.as_ngg)
+ ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
+ else
+ ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
+
+ ctx->abi.load_base_vertex = get_base_vertex;
}
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_shader_internal.h"
-#include "si_pipe.h"
-
#include "ac_nir_to_llvm.h"
-
-#include "tgsi/tgsi_from_mesa.h"
-
#include "compiler/nir/nir.h"
-#include "compiler/nir_types.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_deref.h"
+#include "compiler/nir_types.h"
+#include "si_pipe.h"
+#include "si_shader_internal.h"
+#include "tgsi/tgsi_from_mesa.h"
static const nir_deref_instr *tex_get_texture_deref(nir_tex_instr *instr)
{
- for (unsigned i = 0; i < instr->num_srcs; i++) {
- switch (instr->src[i].src_type) {
- case nir_tex_src_texture_deref:
- return nir_src_as_deref(instr->src[i].src);
- default:
- break;
- }
- }
-
- return NULL;
+ for (unsigned i = 0; i < instr->num_srcs; i++) {
+ switch (instr->src[i].src_type) {
+ case nir_tex_src_texture_deref:
+ return nir_src_as_deref(instr->src[i].src);
+ default:
+ break;
+ }
+ }
+
+ return NULL;
}
-static nir_variable* intrinsic_get_var(nir_intrinsic_instr *instr)
+static nir_variable *intrinsic_get_var(nir_intrinsic_instr *instr)
{
- return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[0]));
+ return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[0]));
}
-static void gather_usage_helper(const nir_deref_instr **deref_ptr,
- unsigned location,
- uint8_t mask,
- uint8_t *usage_mask)
+static void gather_usage_helper(const nir_deref_instr **deref_ptr, unsigned location, uint8_t mask,
+ uint8_t *usage_mask)
{
- for (; *deref_ptr; deref_ptr++) {
- const nir_deref_instr *deref = *deref_ptr;
- switch (deref->deref_type) {
- case nir_deref_type_array: {
- unsigned elem_size =
- glsl_count_attribute_slots(deref->type, false);
- if (nir_src_is_const(deref->arr.index)) {
- location += elem_size * nir_src_as_uint(deref->arr.index);
- } else {
- unsigned array_elems =
- glsl_get_length(deref_ptr[-1]->type);
- for (unsigned i = 0; i < array_elems; i++) {
- gather_usage_helper(deref_ptr + 1,
- location + elem_size * i,
- mask, usage_mask);
- }
- return;
- }
- break;
- }
- case nir_deref_type_struct: {
- const struct glsl_type *parent_type =
- deref_ptr[-1]->type;
- unsigned index = deref->strct.index;
- for (unsigned i = 0; i < index; i++) {
- const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
- location += glsl_count_attribute_slots(ft, false);
- }
- break;
- }
- default:
- unreachable("Unhandled deref type in gather_components_used_helper");
- }
- }
-
- usage_mask[location] |= mask & 0xf;
- if (mask & 0xf0)
- usage_mask[location + 1] |= (mask >> 4) & 0xf;
+ for (; *deref_ptr; deref_ptr++) {
+ const nir_deref_instr *deref = *deref_ptr;
+ switch (deref->deref_type) {
+ case nir_deref_type_array: {
+ unsigned elem_size = glsl_count_attribute_slots(deref->type, false);
+ if (nir_src_is_const(deref->arr.index)) {
+ location += elem_size * nir_src_as_uint(deref->arr.index);
+ } else {
+ unsigned array_elems = glsl_get_length(deref_ptr[-1]->type);
+ for (unsigned i = 0; i < array_elems; i++) {
+ gather_usage_helper(deref_ptr + 1, location + elem_size * i, mask, usage_mask);
+ }
+ return;
+ }
+ break;
+ }
+ case nir_deref_type_struct: {
+ const struct glsl_type *parent_type = deref_ptr[-1]->type;
+ unsigned index = deref->strct.index;
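+ /* Advance past the attribute slots used by the struct fields that precede the indexed one. */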
+ for (unsigned i = 0; i < index; i++) {
+ const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
+ location += glsl_count_attribute_slots(ft, false);
+ }
+ break;
+ }
+ default:
+ unreachable("Unhandled deref type in gather_components_used_helper");
+ }
+ }
+
+ usage_mask[location] |= mask & 0xf;
+ if (mask & 0xf0)
+ usage_mask[location + 1] |= (mask >> 4) & 0xf;
}
-static void gather_usage(const nir_deref_instr *deref,
- uint8_t mask,
- uint8_t *usage_mask)
+static void gather_usage(const nir_deref_instr *deref, uint8_t mask, uint8_t *usage_mask)
{
- nir_deref_path path;
- nir_deref_path_init(&path, (nir_deref_instr *)deref, NULL);
-
- unsigned location_frac = path.path[0]->var->data.location_frac;
- if (glsl_type_is_64bit(deref->type)) {
- uint8_t new_mask = 0;
- for (unsigned i = 0; i < 4; i++) {
- if (mask & (1 << i))
- new_mask |= 0x3 << (2 * i);
- }
- mask = new_mask << location_frac;
- } else {
- mask <<= location_frac;
- mask &= 0xf;
- }
-
- gather_usage_helper((const nir_deref_instr **)&path.path[1],
- path.path[0]->var->data.driver_location,
- mask, usage_mask);
-
- nir_deref_path_finish(&path);
+ nir_deref_path path;
+ nir_deref_path_init(&path, (nir_deref_instr *)deref, NULL);
+
+ unsigned location_frac = path.path[0]->var->data.location_frac;
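+ /* A 64-bit component occupies two 32-bit slots, so widen each bit of the mask into a
+ * pair before shifting by location_frac. */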
+ if (glsl_type_is_64bit(deref->type)) {
+ uint8_t new_mask = 0;
+ for (unsigned i = 0; i < 4; i++) {
+ if (mask & (1 << i))
+ new_mask |= 0x3 << (2 * i);
+ }
+ mask = new_mask << location_frac;
+ } else {
+ mask <<= location_frac;
+ mask &= 0xf;
+ }
+
+ gather_usage_helper((const nir_deref_instr **)&path.path[1],
+ path.path[0]->var->data.driver_location, mask, usage_mask);
+
+ nir_deref_path_finish(&path);
}
static void gather_intrinsic_load_deref_input_info(const nir_shader *nir,
- const nir_intrinsic_instr *instr,
- const nir_deref_instr *deref,
- struct si_shader_info *info)
+ const nir_intrinsic_instr *instr,
+ const nir_deref_instr *deref,
+ struct si_shader_info *info)
{
- switch (nir->info.stage) {
- case MESA_SHADER_VERTEX:
- gather_usage(deref, nir_ssa_def_components_read(&instr->dest.ssa),
- info->input_usage_mask);
- default:;
- }
+ switch (nir->info.stage) {
+ case MESA_SHADER_VERTEX:
+ gather_usage(deref, nir_ssa_def_components_read(&instr->dest.ssa), info->input_usage_mask);
+ default:;
+ }
}
static void gather_intrinsic_load_deref_output_info(const nir_shader *nir,
- const nir_intrinsic_instr *instr,
- nir_variable *var,
- struct si_shader_info *info)
+ const nir_intrinsic_instr *instr,
+ nir_variable *var, struct si_shader_info *info)
{
- assert(var && var->data.mode == nir_var_shader_out);
-
- switch (nir->info.stage) {
- case MESA_SHADER_TESS_CTRL:
- if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
- var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)
- info->reads_tessfactor_outputs = true;
- else if (var->data.patch)
- info->reads_perpatch_outputs = true;
- else
- info->reads_pervertex_outputs = true;
- break;
-
- case MESA_SHADER_FRAGMENT:
- if (var->data.fb_fetch_output)
- info->uses_fbfetch = true;
- break;
- default:;
- }
+ assert(var && var->data.mode == nir_var_shader_out);
+
+ switch (nir->info.stage) {
+ case MESA_SHADER_TESS_CTRL:
+ if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
+ var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)
+ info->reads_tessfactor_outputs = true;
+ else if (var->data.patch)
+ info->reads_perpatch_outputs = true;
+ else
+ info->reads_pervertex_outputs = true;
+ break;
+
+ case MESA_SHADER_FRAGMENT:
+ if (var->data.fb_fetch_output)
+ info->uses_fbfetch = true;
+ break;
+ default:;
+ }
}
static void gather_intrinsic_store_deref_output_info(const nir_shader *nir,
- const nir_intrinsic_instr *instr,
- const nir_deref_instr *deref,
- struct si_shader_info *info)
+ const nir_intrinsic_instr *instr,
+ const nir_deref_instr *deref,
+ struct si_shader_info *info)
{
- switch (nir->info.stage) {
- case MESA_SHADER_VERTEX: /* needed by LS, ES */
- case MESA_SHADER_TESS_EVAL: /* needed by ES */
- case MESA_SHADER_GEOMETRY:
- gather_usage(deref, nir_intrinsic_write_mask(instr),
- info->output_usagemask);
- break;
- default:;
- }
+ switch (nir->info.stage) {
+ case MESA_SHADER_VERTEX: /* needed by LS, ES */
+ case MESA_SHADER_TESS_EVAL: /* needed by ES */
+ case MESA_SHADER_GEOMETRY:
+ gather_usage(deref, nir_intrinsic_write_mask(instr), info->output_usagemask);
+ break;
+ default:;
+ }
}
-static void scan_instruction(const struct nir_shader *nir,
- struct si_shader_info *info,
- nir_instr *instr)
+static void scan_instruction(const struct nir_shader *nir, struct si_shader_info *info,
+ nir_instr *instr)
{
- if (instr->type == nir_instr_type_alu) {
- nir_alu_instr *alu = nir_instr_as_alu(instr);
-
- switch (alu->op) {
- case nir_op_fddx:
- case nir_op_fddy:
- case nir_op_fddx_fine:
- case nir_op_fddy_fine:
- case nir_op_fddx_coarse:
- case nir_op_fddy_coarse:
- info->uses_derivatives = true;
- break;
- default:
- break;
- }
- } else if (instr->type == nir_instr_type_tex) {
- nir_tex_instr *tex = nir_instr_as_tex(instr);
- const nir_deref_instr *deref = tex_get_texture_deref(tex);
- nir_variable *var = deref ? nir_deref_instr_get_variable(deref) : NULL;
-
- if (!var) {
- info->samplers_declared |=
- u_bit_consecutive(tex->sampler_index, 1);
- } else {
- if (deref->mode != nir_var_uniform || var->data.bindless)
- info->uses_bindless_samplers = true;
- }
-
- switch (tex->op) {
- case nir_texop_tex:
- case nir_texop_txb:
- case nir_texop_lod:
- info->uses_derivatives = true;
- break;
- default:
- break;
- }
- } else if (instr->type == nir_instr_type_intrinsic) {
- nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
- switch (intr->intrinsic) {
- case nir_intrinsic_load_front_face:
- info->uses_frontface = 1;
- break;
- case nir_intrinsic_load_instance_id:
- info->uses_instanceid = 1;
- break;
- case nir_intrinsic_load_invocation_id:
- info->uses_invocationid = true;
- break;
- case nir_intrinsic_load_num_work_groups:
- info->uses_grid_size = true;
- break;
- case nir_intrinsic_load_local_invocation_index:
- case nir_intrinsic_load_subgroup_id:
- case nir_intrinsic_load_num_subgroups:
- info->uses_subgroup_info = true;
- break;
- case nir_intrinsic_load_local_group_size:
- /* The block size is translated to IMM with a fixed block size. */
- if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
- info->uses_block_size = true;
- break;
- case nir_intrinsic_load_local_invocation_id:
- case nir_intrinsic_load_work_group_id: {
- unsigned mask = nir_ssa_def_components_read(&intr->dest.ssa);
- while (mask) {
- unsigned i = u_bit_scan(&mask);
-
- if (intr->intrinsic == nir_intrinsic_load_work_group_id)
- info->uses_block_id[i] = true;
- else
- info->uses_thread_id[i] = true;
- }
- break;
- }
- case nir_intrinsic_load_vertex_id:
- info->uses_vertexid = 1;
- break;
- case nir_intrinsic_load_vertex_id_zero_base:
- info->uses_vertexid_nobase = 1;
- break;
- case nir_intrinsic_load_base_vertex:
- info->uses_basevertex = 1;
- break;
- case nir_intrinsic_load_draw_id:
- info->uses_drawid = 1;
- break;
- case nir_intrinsic_load_primitive_id:
- info->uses_primid = 1;
- break;
- case nir_intrinsic_load_sample_mask_in:
- info->reads_samplemask = true;
- break;
- case nir_intrinsic_load_tess_level_inner:
- case nir_intrinsic_load_tess_level_outer:
- info->reads_tess_factors = true;
- break;
- case nir_intrinsic_bindless_image_load:
- case nir_intrinsic_bindless_image_size:
- case nir_intrinsic_bindless_image_samples:
- info->uses_bindless_images = true;
- break;
- case nir_intrinsic_bindless_image_store:
- info->uses_bindless_images = true;
- info->writes_memory = true;
- info->num_memory_instructions++; /* we only care about stores */
- break;
- case nir_intrinsic_image_deref_store:
- info->writes_memory = true;
- info->num_memory_instructions++; /* we only care about stores */
- break;
- case nir_intrinsic_bindless_image_atomic_add:
- case nir_intrinsic_bindless_image_atomic_imin:
- case nir_intrinsic_bindless_image_atomic_umin:
- case nir_intrinsic_bindless_image_atomic_imax:
- case nir_intrinsic_bindless_image_atomic_umax:
- case nir_intrinsic_bindless_image_atomic_and:
- case nir_intrinsic_bindless_image_atomic_or:
- case nir_intrinsic_bindless_image_atomic_xor:
- case nir_intrinsic_bindless_image_atomic_exchange:
- case nir_intrinsic_bindless_image_atomic_comp_swap:
- info->uses_bindless_images = true;
- info->writes_memory = true;
- info->num_memory_instructions++; /* we only care about stores */
- break;
- case nir_intrinsic_image_deref_atomic_add:
- case nir_intrinsic_image_deref_atomic_imin:
- case nir_intrinsic_image_deref_atomic_umin:
- case nir_intrinsic_image_deref_atomic_imax:
- case nir_intrinsic_image_deref_atomic_umax:
- case nir_intrinsic_image_deref_atomic_and:
- case nir_intrinsic_image_deref_atomic_or:
- case nir_intrinsic_image_deref_atomic_xor:
- case nir_intrinsic_image_deref_atomic_exchange:
- case nir_intrinsic_image_deref_atomic_comp_swap:
- case nir_intrinsic_image_deref_atomic_inc_wrap:
- case nir_intrinsic_image_deref_atomic_dec_wrap:
- info->writes_memory = true;
- info->num_memory_instructions++; /* we only care about stores */
- break;
- case nir_intrinsic_store_ssbo:
- case nir_intrinsic_ssbo_atomic_add:
- case nir_intrinsic_ssbo_atomic_imin:
- case nir_intrinsic_ssbo_atomic_umin:
- case nir_intrinsic_ssbo_atomic_imax:
- case nir_intrinsic_ssbo_atomic_umax:
- case nir_intrinsic_ssbo_atomic_and:
- case nir_intrinsic_ssbo_atomic_or:
- case nir_intrinsic_ssbo_atomic_xor:
- case nir_intrinsic_ssbo_atomic_exchange:
- case nir_intrinsic_ssbo_atomic_comp_swap:
- info->writes_memory = true;
- info->num_memory_instructions++; /* we only care about stores */
- break;
- case nir_intrinsic_load_color0:
- case nir_intrinsic_load_color1: {
- unsigned index = intr->intrinsic == nir_intrinsic_load_color1;
- uint8_t mask = nir_ssa_def_components_read(&intr->dest.ssa);
- info->colors_read |= mask << (index * 4);
- break;
- }
- case nir_intrinsic_load_barycentric_pixel:
- case nir_intrinsic_load_barycentric_centroid:
- case nir_intrinsic_load_barycentric_sample:
- case nir_intrinsic_load_barycentric_at_offset: /* uses center */
- case nir_intrinsic_load_barycentric_at_sample: { /* uses center */
- unsigned mode = nir_intrinsic_interp_mode(intr);
-
- if (mode == INTERP_MODE_FLAT)
- break;
-
- if (mode == INTERP_MODE_NOPERSPECTIVE) {
- if (intr->intrinsic == nir_intrinsic_load_barycentric_sample)
- info->uses_linear_sample = true;
- else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid)
- info->uses_linear_centroid = true;
- else
- info->uses_linear_center = true;
-
- if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample)
- info->uses_linear_opcode_interp_sample = true;
- } else {
- if (intr->intrinsic == nir_intrinsic_load_barycentric_sample)
- info->uses_persp_sample = true;
- else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid)
- info->uses_persp_centroid = true;
- else
- info->uses_persp_center = true;
-
- if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample)
- info->uses_persp_opcode_interp_sample = true;
- }
- break;
- }
- case nir_intrinsic_load_deref: {
- nir_variable *var = intrinsic_get_var(intr);
- nir_variable_mode mode = var->data.mode;
-
- if (mode == nir_var_shader_in) {
- /* PS inputs use the interpolated load intrinsics. */
- assert(nir->info.stage != MESA_SHADER_FRAGMENT);
- gather_intrinsic_load_deref_input_info(nir, intr,
- nir_src_as_deref(intr->src[0]), info);
- } else if (mode == nir_var_shader_out) {
- gather_intrinsic_load_deref_output_info(nir, intr, var, info);
- }
- break;
- }
- case nir_intrinsic_store_deref: {
- nir_variable *var = intrinsic_get_var(intr);
-
- if (var->data.mode == nir_var_shader_out)
- gather_intrinsic_store_deref_output_info(nir, intr,
- nir_src_as_deref(intr->src[0]), info);
- break;
- }
- case nir_intrinsic_interp_deref_at_centroid:
- case nir_intrinsic_interp_deref_at_sample:
- case nir_intrinsic_interp_deref_at_offset:
- unreachable("interp opcodes should have been lowered");
- break;
- default:
- break;
- }
- }
+ if (instr->type == nir_instr_type_alu) {
+ nir_alu_instr *alu = nir_instr_as_alu(instr);
+
+ switch (alu->op) {
+ case nir_op_fddx:
+ case nir_op_fddy:
+ case nir_op_fddx_fine:
+ case nir_op_fddy_fine:
+ case nir_op_fddx_coarse:
+ case nir_op_fddy_coarse:
+ info->uses_derivatives = true;
+ break;
+ default:
+ break;
+ }
+ } else if (instr->type == nir_instr_type_tex) {
+ nir_tex_instr *tex = nir_instr_as_tex(instr);
+ const nir_deref_instr *deref = tex_get_texture_deref(tex);
+ nir_variable *var = deref ? nir_deref_instr_get_variable(deref) : NULL;
+
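+ /* Without a variable deref this is a plain sampler index; otherwise a deref that is not
+ * nir_var_uniform, or a variable marked bindless, indicates bindless sampler use. */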
+ if (!var) {
+ info->samplers_declared |= u_bit_consecutive(tex->sampler_index, 1);
+ } else {
+ if (deref->mode != nir_var_uniform || var->data.bindless)
+ info->uses_bindless_samplers = true;
+ }
+
+ switch (tex->op) {
+ case nir_texop_tex:
+ case nir_texop_txb:
+ case nir_texop_lod:
+ info->uses_derivatives = true;
+ break;
+ default:
+ break;
+ }
+ } else if (instr->type == nir_instr_type_intrinsic) {
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_front_face:
+ info->uses_frontface = 1;
+ break;
+ case nir_intrinsic_load_instance_id:
+ info->uses_instanceid = 1;
+ break;
+ case nir_intrinsic_load_invocation_id:
+ info->uses_invocationid = true;
+ break;
+ case nir_intrinsic_load_num_work_groups:
+ info->uses_grid_size = true;
+ break;
+ case nir_intrinsic_load_local_invocation_index:
+ case nir_intrinsic_load_subgroup_id:
+ case nir_intrinsic_load_num_subgroups:
+ info->uses_subgroup_info = true;
+ break;
+ case nir_intrinsic_load_local_group_size:
+ /* The block size is translated to IMM with a fixed block size. */
+ if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
+ info->uses_block_size = true;
+ break;
+ case nir_intrinsic_load_local_invocation_id:
+ case nir_intrinsic_load_work_group_id: {
+ unsigned mask = nir_ssa_def_components_read(&intr->dest.ssa);
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+
+ if (intr->intrinsic == nir_intrinsic_load_work_group_id)
+ info->uses_block_id[i] = true;
+ else
+ info->uses_thread_id[i] = true;
+ }
+ break;
+ }
+ case nir_intrinsic_load_vertex_id:
+ info->uses_vertexid = 1;
+ break;
+ case nir_intrinsic_load_vertex_id_zero_base:
+ info->uses_vertexid_nobase = 1;
+ break;
+ case nir_intrinsic_load_base_vertex:
+ info->uses_basevertex = 1;
+ break;
+ case nir_intrinsic_load_draw_id:
+ info->uses_drawid = 1;
+ break;
+ case nir_intrinsic_load_primitive_id:
+ info->uses_primid = 1;
+ break;
+ case nir_intrinsic_load_sample_mask_in:
+ info->reads_samplemask = true;
+ break;
+ case nir_intrinsic_load_tess_level_inner:
+ case nir_intrinsic_load_tess_level_outer:
+ info->reads_tess_factors = true;
+ break;
+ case nir_intrinsic_bindless_image_load:
+ case nir_intrinsic_bindless_image_size:
+ case nir_intrinsic_bindless_image_samples:
+ info->uses_bindless_images = true;
+ break;
+ case nir_intrinsic_bindless_image_store:
+ info->uses_bindless_images = true;
+ info->writes_memory = true;
+ info->num_memory_instructions++; /* we only care about stores */
+ break;
+ case nir_intrinsic_image_deref_store:
+ info->writes_memory = true;
+ info->num_memory_instructions++; /* we only care about stores */
+ break;
+ case nir_intrinsic_bindless_image_atomic_add:
+ case nir_intrinsic_bindless_image_atomic_imin:
+ case nir_intrinsic_bindless_image_atomic_umin:
+ case nir_intrinsic_bindless_image_atomic_imax:
+ case nir_intrinsic_bindless_image_atomic_umax:
+ case nir_intrinsic_bindless_image_atomic_and:
+ case nir_intrinsic_bindless_image_atomic_or:
+ case nir_intrinsic_bindless_image_atomic_xor:
+ case nir_intrinsic_bindless_image_atomic_exchange:
+ case nir_intrinsic_bindless_image_atomic_comp_swap:
+ info->uses_bindless_images = true;
+ info->writes_memory = true;
+ info->num_memory_instructions++; /* we only care about stores */
+ break;
+ case nir_intrinsic_image_deref_atomic_add:
+ case nir_intrinsic_image_deref_atomic_imin:
+ case nir_intrinsic_image_deref_atomic_umin:
+ case nir_intrinsic_image_deref_atomic_imax:
+ case nir_intrinsic_image_deref_atomic_umax:
+ case nir_intrinsic_image_deref_atomic_and:
+ case nir_intrinsic_image_deref_atomic_or:
+ case nir_intrinsic_image_deref_atomic_xor:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ case nir_intrinsic_image_deref_atomic_inc_wrap:
+ case nir_intrinsic_image_deref_atomic_dec_wrap:
+ info->writes_memory = true;
+ info->num_memory_instructions++; /* we only care about stores */
+ break;
+ case nir_intrinsic_store_ssbo:
+ case nir_intrinsic_ssbo_atomic_add:
+ case nir_intrinsic_ssbo_atomic_imin:
+ case nir_intrinsic_ssbo_atomic_umin:
+ case nir_intrinsic_ssbo_atomic_imax:
+ case nir_intrinsic_ssbo_atomic_umax:
+ case nir_intrinsic_ssbo_atomic_and:
+ case nir_intrinsic_ssbo_atomic_or:
+ case nir_intrinsic_ssbo_atomic_xor:
+ case nir_intrinsic_ssbo_atomic_exchange:
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ info->writes_memory = true;
+ info->num_memory_instructions++; /* we only care about stores */
+ break;
+ case nir_intrinsic_load_color0:
+ case nir_intrinsic_load_color1: {
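+ /* colors_read holds a 4-bit component read mask per color input: bits 0-3 for COLOR0, bits 4-7 for COLOR1. */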
+ unsigned index = intr->intrinsic == nir_intrinsic_load_color1;
+ uint8_t mask = nir_ssa_def_components_read(&intr->dest.ssa);
+ info->colors_read |= mask << (index * 4);
+ break;
+ }
+ case nir_intrinsic_load_barycentric_pixel:
+ case nir_intrinsic_load_barycentric_centroid:
+ case nir_intrinsic_load_barycentric_sample:
+ case nir_intrinsic_load_barycentric_at_offset: /* uses center */
+ case nir_intrinsic_load_barycentric_at_sample: { /* uses center */
+ unsigned mode = nir_intrinsic_interp_mode(intr);
+
+ if (mode == INTERP_MODE_FLAT)
+ break;
+
+ if (mode == INTERP_MODE_NOPERSPECTIVE) {
+ if (intr->intrinsic == nir_intrinsic_load_barycentric_sample)
+ info->uses_linear_sample = true;
+ else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid)
+ info->uses_linear_centroid = true;
+ else
+ info->uses_linear_center = true;
+
+ if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample)
+ info->uses_linear_opcode_interp_sample = true;
+ } else {
+ if (intr->intrinsic == nir_intrinsic_load_barycentric_sample)
+ info->uses_persp_sample = true;
+ else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid)
+ info->uses_persp_centroid = true;
+ else
+ info->uses_persp_center = true;
+
+ if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample)
+ info->uses_persp_opcode_interp_sample = true;
+ }
+ break;
+ }
+ case nir_intrinsic_load_deref: {
+ nir_variable *var = intrinsic_get_var(intr);
+ nir_variable_mode mode = var->data.mode;
+
+ if (mode == nir_var_shader_in) {
+ /* PS inputs use the interpolated load intrinsics. */
+ assert(nir->info.stage != MESA_SHADER_FRAGMENT);
+ gather_intrinsic_load_deref_input_info(nir, intr, nir_src_as_deref(intr->src[0]), info);
+ } else if (mode == nir_var_shader_out) {
+ gather_intrinsic_load_deref_output_info(nir, intr, var, info);
+ }
+ break;
+ }
+ case nir_intrinsic_store_deref: {
+ nir_variable *var = intrinsic_get_var(intr);
+
+ if (var->data.mode == nir_var_shader_out)
+ gather_intrinsic_store_deref_output_info(nir, intr, nir_src_as_deref(intr->src[0]),
+ info);
+ break;
+ }
+ case nir_intrinsic_interp_deref_at_centroid:
+ case nir_intrinsic_interp_deref_at_sample:
+ case nir_intrinsic_interp_deref_at_offset:
+ unreachable("interp opcodes should have been lowered");
+ break;
+ default:
+ break;
+ }
+ }
}
-static void scan_output_slot(const nir_variable *var,
- unsigned var_idx,
- unsigned component, unsigned num_components,
- struct si_shader_info *info)
+static void scan_output_slot(const nir_variable *var, unsigned var_idx, unsigned component,
+ unsigned num_components, struct si_shader_info *info)
{
- assert(component + num_components <= 4);
- assert(component < 4);
-
- unsigned semantic_name, semantic_index;
-
- unsigned location = var->data.location + var_idx;
- unsigned drv_location = var->data.driver_location + var_idx;
-
- if (info->processor == PIPE_SHADER_FRAGMENT) {
- tgsi_get_gl_frag_result_semantic(location,
- &semantic_name, &semantic_index);
-
- /* Adjust for dual source blending */
- if (var->data.index > 0) {
- semantic_index++;
- }
- } else {
- tgsi_get_gl_varying_semantic(location, true,
- &semantic_name, &semantic_index);
- }
-
- ubyte usagemask = ((1 << num_components) - 1) << component;
-
- unsigned gs_out_streams;
- if (var->data.stream & NIR_STREAM_PACKED) {
- gs_out_streams = var->data.stream & ~NIR_STREAM_PACKED;
- } else {
- assert(var->data.stream < 4);
- gs_out_streams = 0;
- for (unsigned j = 0; j < num_components; ++j)
- gs_out_streams |= var->data.stream << (2 * (component + j));
- }
-
- unsigned streamx = gs_out_streams & 3;
- unsigned streamy = (gs_out_streams >> 2) & 3;
- unsigned streamz = (gs_out_streams >> 4) & 3;
- unsigned streamw = (gs_out_streams >> 6) & 3;
-
- if (usagemask & TGSI_WRITEMASK_X) {
- info->output_streams[drv_location] |= streamx;
- info->num_stream_output_components[streamx]++;
- }
- if (usagemask & TGSI_WRITEMASK_Y) {
- info->output_streams[drv_location] |= streamy << 2;
- info->num_stream_output_components[streamy]++;
- }
- if (usagemask & TGSI_WRITEMASK_Z) {
- info->output_streams[drv_location] |= streamz << 4;
- info->num_stream_output_components[streamz]++;
- }
- if (usagemask & TGSI_WRITEMASK_W) {
- info->output_streams[drv_location] |= streamw << 6;
- info->num_stream_output_components[streamw]++;
- }
-
- info->output_semantic_name[drv_location] = semantic_name;
- info->output_semantic_index[drv_location] = semantic_index;
-
- switch (semantic_name) {
- case TGSI_SEMANTIC_PRIMID:
- info->writes_primid = true;
- break;
- case TGSI_SEMANTIC_VIEWPORT_INDEX:
- info->writes_viewport_index = true;
- break;
- case TGSI_SEMANTIC_LAYER:
- info->writes_layer = true;
- break;
- case TGSI_SEMANTIC_PSIZE:
- info->writes_psize = true;
- break;
- case TGSI_SEMANTIC_CLIPVERTEX:
- info->writes_clipvertex = true;
- break;
- case TGSI_SEMANTIC_COLOR:
- info->colors_written |= 1 << semantic_index;
- break;
- case TGSI_SEMANTIC_STENCIL:
- info->writes_stencil = true;
- break;
- case TGSI_SEMANTIC_SAMPLEMASK:
- info->writes_samplemask = true;
- break;
- case TGSI_SEMANTIC_EDGEFLAG:
- info->writes_edgeflag = true;
- break;
- case TGSI_SEMANTIC_POSITION:
- if (info->processor == PIPE_SHADER_FRAGMENT)
- info->writes_z = true;
- else
- info->writes_position = true;
- break;
- }
+ assert(component + num_components <= 4);
+ assert(component < 4);
+
+ unsigned semantic_name, semantic_index;
+
+ unsigned location = var->data.location + var_idx;
+ unsigned drv_location = var->data.driver_location + var_idx;
+
+ if (info->processor == PIPE_SHADER_FRAGMENT) {
+ tgsi_get_gl_frag_result_semantic(location, &semantic_name, &semantic_index);
+
+ /* Adjust for dual source blending */
+ if (var->data.index > 0) {
+ semantic_index++;
+ }
+ } else {
+ tgsi_get_gl_varying_semantic(location, true, &semantic_name, &semantic_index);
+ }
+
+ ubyte usagemask = ((1 << num_components) - 1) << component;
+
+ unsigned gs_out_streams;
+ if (var->data.stream & NIR_STREAM_PACKED) {
+ gs_out_streams = var->data.stream & ~NIR_STREAM_PACKED;
+ } else {
+ assert(var->data.stream < 4);
+ gs_out_streams = 0;
+ for (unsigned j = 0; j < num_components; ++j)
+ gs_out_streams |= var->data.stream << (2 * (component + j));
+ }
+
+ unsigned streamx = gs_out_streams & 3;
+ unsigned streamy = (gs_out_streams >> 2) & 3;
+ unsigned streamz = (gs_out_streams >> 4) & 3;
+ unsigned streamw = (gs_out_streams >> 6) & 3;
+
+ if (usagemask & TGSI_WRITEMASK_X) {
+ info->output_streams[drv_location] |= streamx;
+ info->num_stream_output_components[streamx]++;
+ }
+ if (usagemask & TGSI_WRITEMASK_Y) {
+ info->output_streams[drv_location] |= streamy << 2;
+ info->num_stream_output_components[streamy]++;
+ }
+ if (usagemask & TGSI_WRITEMASK_Z) {
+ info->output_streams[drv_location] |= streamz << 4;
+ info->num_stream_output_components[streamz]++;
+ }
+ if (usagemask & TGSI_WRITEMASK_W) {
+ info->output_streams[drv_location] |= streamw << 6;
+ info->num_stream_output_components[streamw]++;
+ }
+
+ info->output_semantic_name[drv_location] = semantic_name;
+ info->output_semantic_index[drv_location] = semantic_index;
+
+ switch (semantic_name) {
+ case TGSI_SEMANTIC_PRIMID:
+ info->writes_primid = true;
+ break;
+ case TGSI_SEMANTIC_VIEWPORT_INDEX:
+ info->writes_viewport_index = true;
+ break;
+ case TGSI_SEMANTIC_LAYER:
+ info->writes_layer = true;
+ break;
+ case TGSI_SEMANTIC_PSIZE:
+ info->writes_psize = true;
+ break;
+ case TGSI_SEMANTIC_CLIPVERTEX:
+ info->writes_clipvertex = true;
+ break;
+ case TGSI_SEMANTIC_COLOR:
+ info->colors_written |= 1 << semantic_index;
+ break;
+ case TGSI_SEMANTIC_STENCIL:
+ info->writes_stencil = true;
+ break;
+ case TGSI_SEMANTIC_SAMPLEMASK:
+ info->writes_samplemask = true;
+ break;
+ case TGSI_SEMANTIC_EDGEFLAG:
+ info->writes_edgeflag = true;
+ break;
+ case TGSI_SEMANTIC_POSITION:
+ if (info->processor == PIPE_SHADER_FRAGMENT)
+ info->writes_z = true;
+ else
+ info->writes_position = true;
+ break;
+ }
}
-static void scan_output_helper(const nir_variable *var,
- unsigned location,
- const struct glsl_type *type,
- struct si_shader_info *info)
+static void scan_output_helper(const nir_variable *var, unsigned location,
+ const struct glsl_type *type, struct si_shader_info *info)
{
- if (glsl_type_is_struct(type) || glsl_type_is_interface(type)) {
- for (unsigned i = 0; i < glsl_get_length(type); i++) {
- const struct glsl_type *ft = glsl_get_struct_field(type, i);
- scan_output_helper(var, location, ft, info);
- location += glsl_count_attribute_slots(ft, false);
- }
- } else if (glsl_type_is_array_or_matrix(type)) {
- const struct glsl_type *elem_type =
- glsl_get_array_element(type);
- unsigned num_elems = glsl_get_length(type);
- if (var->data.compact) {
- assert(glsl_type_is_scalar(elem_type));
- assert(glsl_get_bit_size(elem_type) == 32);
- unsigned component = var->data.location_frac;
- scan_output_slot(var, location, component,
- MIN2(num_elems, 4 - component), info);
- if (component + num_elems > 4) {
- scan_output_slot(var, location + 1, 0,
- component + num_elems - 4, info);
- }
-
- } else {
- unsigned elem_count = glsl_count_attribute_slots(elem_type, false);
- for (unsigned i = 0; i < num_elems; i++) {
- scan_output_helper(var, location, elem_type, info);
- location += elem_count;
- }
- }
- } else if (glsl_type_is_dual_slot(type)) {
- unsigned component = var->data.location_frac;
- scan_output_slot(var, location, component, 4 - component, info);
- scan_output_slot(var, location + 1, 0, component + 2 * glsl_get_components(type) - 4,
- info);
- } else {
- unsigned component = var->data.location_frac;
- assert(glsl_type_is_vector_or_scalar(type));
- unsigned num_components = glsl_get_components(type);
- if (glsl_type_is_64bit(type))
- num_components *= 2;
- scan_output_slot(var, location, component, num_components, info);
- }
+ if (glsl_type_is_struct(type) || glsl_type_is_interface(type)) {
+ for (unsigned i = 0; i < glsl_get_length(type); i++) {
+ const struct glsl_type *ft = glsl_get_struct_field(type, i);
+ scan_output_helper(var, location, ft, info);
+ location += glsl_count_attribute_slots(ft, false);
+ }
+ } else if (glsl_type_is_array_or_matrix(type)) {
+ const struct glsl_type *elem_type = glsl_get_array_element(type);
+ unsigned num_elems = glsl_get_length(type);
+ if (var->data.compact) {
+ assert(glsl_type_is_scalar(elem_type));
+ assert(glsl_get_bit_size(elem_type) == 32);
+ unsigned component = var->data.location_frac;
+ scan_output_slot(var, location, component, MIN2(num_elems, 4 - component), info);
+ if (component + num_elems > 4) {
+ scan_output_slot(var, location + 1, 0, component + num_elems - 4, info);
+ }
+
+ } else {
+ unsigned elem_count = glsl_count_attribute_slots(elem_type, false);
+ for (unsigned i = 0; i < num_elems; i++) {
+ scan_output_helper(var, location, elem_type, info);
+ location += elem_count;
+ }
+ }
+ } else if (glsl_type_is_dual_slot(type)) {
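+ /* A dual-slot (64-bit) type uses two components per element, so whatever doesn't fit in the first vec4 slot spills into the next one. */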
+ unsigned component = var->data.location_frac;
+ scan_output_slot(var, location, component, 4 - component, info);
+ scan_output_slot(var, location + 1, 0, component + 2 * glsl_get_components(type) - 4, info);
+ } else {
+ unsigned component = var->data.location_frac;
+ assert(glsl_type_is_vector_or_scalar(type));
+ unsigned num_components = glsl_get_components(type);
+ if (glsl_type_is_64bit(type))
+ num_components *= 2;
+ scan_output_slot(var, location, component, num_components, info);
+ }
}
-void si_nir_scan_shader(const struct nir_shader *nir,
- struct si_shader_info *info)
+void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info)
{
- nir_function *func;
- unsigned i;
-
- info->processor = pipe_shader_type_from_mesa(nir->info.stage);
-
- info->properties[TGSI_PROPERTY_NEXT_SHADER] =
- pipe_shader_type_from_mesa(nir->info.next_stage);
-
- if (nir->info.stage == MESA_SHADER_VERTEX) {
- info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] =
- nir->info.vs.window_space_position;
- info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] =
- nir->info.vs.blit_sgprs_amd;
- }
-
- if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
- info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT] =
- nir->info.tess.tcs_vertices_out;
- }
-
- if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
- if (nir->info.tess.primitive_mode == GL_ISOLINES)
- info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = PIPE_PRIM_LINES;
- else
- info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = nir->info.tess.primitive_mode;
-
- STATIC_ASSERT((TESS_SPACING_EQUAL + 1) % 3 == PIPE_TESS_SPACING_EQUAL);
- STATIC_ASSERT((TESS_SPACING_FRACTIONAL_ODD + 1) % 3 ==
- PIPE_TESS_SPACING_FRACTIONAL_ODD);
- STATIC_ASSERT((TESS_SPACING_FRACTIONAL_EVEN + 1) % 3 ==
- PIPE_TESS_SPACING_FRACTIONAL_EVEN);
-
- info->properties[TGSI_PROPERTY_TES_SPACING] = (nir->info.tess.spacing + 1) % 3;
- info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW] = !nir->info.tess.ccw;
- info->properties[TGSI_PROPERTY_TES_POINT_MODE] = nir->info.tess.point_mode;
- }
-
- if (nir->info.stage == MESA_SHADER_GEOMETRY) {
- info->properties[TGSI_PROPERTY_GS_INPUT_PRIM] = nir->info.gs.input_primitive;
- info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM] = nir->info.gs.output_primitive;
- info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] = nir->info.gs.vertices_out;
- info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = nir->info.gs.invocations;
- }
-
- if (nir->info.stage == MESA_SHADER_FRAGMENT) {
- info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] =
- nir->info.fs.early_fragment_tests | nir->info.fs.post_depth_coverage;
- info->properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE] = nir->info.fs.post_depth_coverage;
-
- if (nir->info.fs.pixel_center_integer) {
- info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] =
- TGSI_FS_COORD_PIXEL_CENTER_INTEGER;
- }
-
- if (nir->info.fs.depth_layout != FRAG_DEPTH_LAYOUT_NONE) {
- switch (nir->info.fs.depth_layout) {
- case FRAG_DEPTH_LAYOUT_ANY:
- info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_ANY;
- break;
- case FRAG_DEPTH_LAYOUT_GREATER:
- info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_GREATER;
- break;
- case FRAG_DEPTH_LAYOUT_LESS:
- info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_LESS;
- break;
- case FRAG_DEPTH_LAYOUT_UNCHANGED:
- info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_UNCHANGED;
- break;
- default:
- unreachable("Unknow depth layout");
- }
- }
- }
-
- if (gl_shader_stage_is_compute(nir->info.stage)) {
- info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] = nir->info.cs.local_size[0];
- info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] = nir->info.cs.local_size[1];
- info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] = nir->info.cs.local_size[2];
- info->properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD] = nir->info.cs.user_data_components_amd;
- }
-
- i = 0;
- uint64_t processed_inputs = 0;
- nir_foreach_variable(variable, &nir->inputs) {
- unsigned semantic_name, semantic_index;
-
- const struct glsl_type *type = variable->type;
- if (nir_is_per_vertex_io(variable, nir->info.stage)) {
- assert(glsl_type_is_array(type));
- type = glsl_get_array_element(type);
- }
-
- unsigned attrib_count = glsl_count_attribute_slots(type,
- nir->info.stage == MESA_SHADER_VERTEX);
-
- i = variable->data.driver_location;
-
- /* Vertex shader inputs don't have semantics. The state
- * tracker has already mapped them to attributes via
- * variable->data.driver_location.
- */
- if (nir->info.stage == MESA_SHADER_VERTEX)
- continue;
-
- for (unsigned j = 0; j < attrib_count; j++, i++) {
-
- if (processed_inputs & ((uint64_t)1 << i))
- continue;
-
- processed_inputs |= ((uint64_t)1 << i);
-
- tgsi_get_gl_varying_semantic(variable->data.location + j, true,
- &semantic_name, &semantic_index);
-
- info->input_semantic_name[i] = semantic_name;
- info->input_semantic_index[i] = semantic_index;
-
- if (semantic_name == TGSI_SEMANTIC_PRIMID)
- info->uses_primid = true;
-
- if (semantic_name == TGSI_SEMANTIC_COLOR) {
- /* We only need this for color inputs. */
- if (variable->data.sample)
- info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_SAMPLE;
- else if (variable->data.centroid)
- info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTROID;
- else
- info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTER;
- }
-
- enum glsl_base_type base_type =
- glsl_get_base_type(glsl_without_array(variable->type));
-
- switch (variable->data.interpolation) {
- case INTERP_MODE_NONE:
- if (glsl_base_type_is_integer(base_type)) {
- info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
- break;
- }
-
- if (semantic_name == TGSI_SEMANTIC_COLOR) {
- info->input_interpolate[i] = TGSI_INTERPOLATE_COLOR;
- break;
- }
- /* fall-through */
-
- case INTERP_MODE_SMOOTH:
- assert(!glsl_base_type_is_integer(base_type));
-
- info->input_interpolate[i] = TGSI_INTERPOLATE_PERSPECTIVE;
- break;
-
- case INTERP_MODE_NOPERSPECTIVE:
- assert(!glsl_base_type_is_integer(base_type));
-
- info->input_interpolate[i] = TGSI_INTERPOLATE_LINEAR;
- break;
-
- case INTERP_MODE_FLAT:
- info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
- break;
- }
- }
- }
-
- nir_foreach_variable(variable, &nir->outputs) {
- const struct glsl_type *type = variable->type;
- if (nir_is_per_vertex_io(variable, nir->info.stage)) {
- assert(glsl_type_is_array(type));
- type = glsl_get_array_element(type);
- }
-
- ASSERTED unsigned attrib_count = glsl_count_attribute_slots(type, false);
- scan_output_helper(variable, 0, type, info);
-
- unsigned loc = variable->data.location;
- if (nir->info.stage == MESA_SHADER_FRAGMENT &&
- loc == FRAG_RESULT_COLOR &&
- nir->info.outputs_written & (1ull << loc)) {
- assert(attrib_count == 1);
- info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] = true;
- }
- }
-
- info->num_inputs = nir->num_inputs;
- info->num_outputs = nir->num_outputs;
-
- info->constbuf0_num_slots = nir->num_uniforms;
- info->shader_buffers_declared = u_bit_consecutive(0, nir->info.num_ssbos);
- info->const_buffers_declared = u_bit_consecutive(1, nir->info.num_ubos);
- if (nir->num_uniforms > 0)
- info->const_buffers_declared |= 1;
- info->images_declared = u_bit_consecutive(0, nir->info.num_images);
- info->msaa_images_declared = u_bit_consecutive(0, nir->info.last_msaa_image + 1);
- info->samplers_declared = nir->info.textures_used;
-
- info->num_written_clipdistance = nir->info.clip_distance_array_size;
- info->num_written_culldistance = nir->info.cull_distance_array_size;
- info->clipdist_writemask = u_bit_consecutive(0, info->num_written_clipdistance);
- info->culldist_writemask = u_bit_consecutive(0, info->num_written_culldistance);
-
- if (info->processor == PIPE_SHADER_FRAGMENT)
- info->uses_kill = nir->info.fs.uses_discard;
-
- if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
- info->tessfactors_are_def_in_all_invocs =
- ac_are_tessfactors_def_in_all_invocs(nir);
- }
-
- func = (struct nir_function *)exec_list_get_head_const(&nir->functions);
- nir_foreach_block(block, func->impl) {
- nir_foreach_instr(instr, block)
- scan_instruction(nir, info, instr);
- }
+ nir_function *func;
+ unsigned i;
+
+ info->processor = pipe_shader_type_from_mesa(nir->info.stage);
+
+ info->properties[TGSI_PROPERTY_NEXT_SHADER] = pipe_shader_type_from_mesa(nir->info.next_stage);
+
+ if (nir->info.stage == MESA_SHADER_VERTEX) {
+ info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] = nir->info.vs.window_space_position;
+ info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] = nir->info.vs.blit_sgprs_amd;
+ }
+
+ if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
+ info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT] = nir->info.tess.tcs_vertices_out;
+ }
+
+ if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
+ if (nir->info.tess.primitive_mode == GL_ISOLINES)
+ info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = PIPE_PRIM_LINES;
+ else
+ info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = nir->info.tess.primitive_mode;
+
+ STATIC_ASSERT((TESS_SPACING_EQUAL + 1) % 3 == PIPE_TESS_SPACING_EQUAL);
+ STATIC_ASSERT((TESS_SPACING_FRACTIONAL_ODD + 1) % 3 == PIPE_TESS_SPACING_FRACTIONAL_ODD);
+ STATIC_ASSERT((TESS_SPACING_FRACTIONAL_EVEN + 1) % 3 == PIPE_TESS_SPACING_FRACTIONAL_EVEN);
+
+ info->properties[TGSI_PROPERTY_TES_SPACING] = (nir->info.tess.spacing + 1) % 3;
+ info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW] = !nir->info.tess.ccw;
+ info->properties[TGSI_PROPERTY_TES_POINT_MODE] = nir->info.tess.point_mode;
+ }
+
+ if (nir->info.stage == MESA_SHADER_GEOMETRY) {
+ info->properties[TGSI_PROPERTY_GS_INPUT_PRIM] = nir->info.gs.input_primitive;
+ info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM] = nir->info.gs.output_primitive;
+ info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] = nir->info.gs.vertices_out;
+ info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = nir->info.gs.invocations;
+ }
+
+ if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+ info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] =
+ nir->info.fs.early_fragment_tests | nir->info.fs.post_depth_coverage;
+ info->properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE] = nir->info.fs.post_depth_coverage;
+
+ if (nir->info.fs.pixel_center_integer) {
+ info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] = TGSI_FS_COORD_PIXEL_CENTER_INTEGER;
+ }
+
+ if (nir->info.fs.depth_layout != FRAG_DEPTH_LAYOUT_NONE) {
+ switch (nir->info.fs.depth_layout) {
+ case FRAG_DEPTH_LAYOUT_ANY:
+ info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_ANY;
+ break;
+ case FRAG_DEPTH_LAYOUT_GREATER:
+ info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_GREATER;
+ break;
+ case FRAG_DEPTH_LAYOUT_LESS:
+ info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_LESS;
+ break;
+ case FRAG_DEPTH_LAYOUT_UNCHANGED:
+ info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_UNCHANGED;
+ break;
+ default:
+ unreachable("Unknow depth layout");
+ }
+ }
+ }
+
+ if (gl_shader_stage_is_compute(nir->info.stage)) {
+ info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] = nir->info.cs.local_size[0];
+ info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] = nir->info.cs.local_size[1];
+ info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] = nir->info.cs.local_size[2];
+ info->properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD] =
+ nir->info.cs.user_data_components_amd;
+ }
+
+ i = 0;
+ uint64_t processed_inputs = 0;
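+ /* Bitmask of driver locations already processed, so semantics are assigned to each input slot only once. */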
+ nir_foreach_variable (variable, &nir->inputs) {
+ unsigned semantic_name, semantic_index;
+
+ const struct glsl_type *type = variable->type;
+ if (nir_is_per_vertex_io(variable, nir->info.stage)) {
+ assert(glsl_type_is_array(type));
+ type = glsl_get_array_element(type);
+ }
+
+ unsigned attrib_count =
+ glsl_count_attribute_slots(type, nir->info.stage == MESA_SHADER_VERTEX);
+
+ i = variable->data.driver_location;
+
+ /* Vertex shader inputs don't have semantics. The state
+ * tracker has already mapped them to attributes via
+ * variable->data.driver_location.
+ */
+ if (nir->info.stage == MESA_SHADER_VERTEX)
+ continue;
+
+ for (unsigned j = 0; j < attrib_count; j++, i++) {
+
+ if (processed_inputs & ((uint64_t)1 << i))
+ continue;
+
+ processed_inputs |= ((uint64_t)1 << i);
+
+ tgsi_get_gl_varying_semantic(variable->data.location + j, true, &semantic_name,
+ &semantic_index);
+
+ info->input_semantic_name[i] = semantic_name;
+ info->input_semantic_index[i] = semantic_index;
+
+ if (semantic_name == TGSI_SEMANTIC_PRIMID)
+ info->uses_primid = true;
+
+ if (semantic_name == TGSI_SEMANTIC_COLOR) {
+ /* We only need this for color inputs. */
+ if (variable->data.sample)
+ info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_SAMPLE;
+ else if (variable->data.centroid)
+ info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTROID;
+ else
+ info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTER;
+ }
+
+ enum glsl_base_type base_type = glsl_get_base_type(glsl_without_array(variable->type));
+
+ switch (variable->data.interpolation) {
+ case INTERP_MODE_NONE:
+ if (glsl_base_type_is_integer(base_type)) {
+ info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
+ break;
+ }
+
+ if (semantic_name == TGSI_SEMANTIC_COLOR) {
+ info->input_interpolate[i] = TGSI_INTERPOLATE_COLOR;
+ break;
+ }
+ /* fall-through */
+
+ case INTERP_MODE_SMOOTH:
+ assert(!glsl_base_type_is_integer(base_type));
+
+ info->input_interpolate[i] = TGSI_INTERPOLATE_PERSPECTIVE;
+ break;
+
+ case INTERP_MODE_NOPERSPECTIVE:
+ assert(!glsl_base_type_is_integer(base_type));
+
+ info->input_interpolate[i] = TGSI_INTERPOLATE_LINEAR;
+ break;
+
+ case INTERP_MODE_FLAT:
+ info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT;
+ break;
+ }
+ }
+ }
+
+ nir_foreach_variable (variable, &nir->outputs) {
+ const struct glsl_type *type = variable->type;
+ if (nir_is_per_vertex_io(variable, nir->info.stage)) {
+ assert(glsl_type_is_array(type));
+ type = glsl_get_array_element(type);
+ }
+
+ ASSERTED unsigned attrib_count = glsl_count_attribute_slots(type, false);
+ scan_output_helper(variable, 0, type, info);
+
+ unsigned loc = variable->data.location;
+ if (nir->info.stage == MESA_SHADER_FRAGMENT && loc == FRAG_RESULT_COLOR &&
+ nir->info.outputs_written & (1ull << loc)) {
+ assert(attrib_count == 1);
+ info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] = true;
+ }
+ }
+
+ info->num_inputs = nir->num_inputs;
+ info->num_outputs = nir->num_outputs;
+
+ info->constbuf0_num_slots = nir->num_uniforms;
+ info->shader_buffers_declared = u_bit_consecutive(0, nir->info.num_ssbos);
+ info->const_buffers_declared = u_bit_consecutive(1, nir->info.num_ubos);
+ if (nir->num_uniforms > 0)
+ info->const_buffers_declared |= 1;
+ info->images_declared = u_bit_consecutive(0, nir->info.num_images);
+ info->msaa_images_declared = u_bit_consecutive(0, nir->info.last_msaa_image + 1);
+ info->samplers_declared = nir->info.textures_used;
+
+ info->num_written_clipdistance = nir->info.clip_distance_array_size;
+ info->num_written_culldistance = nir->info.cull_distance_array_size;
+ info->clipdist_writemask = u_bit_consecutive(0, info->num_written_clipdistance);
+ info->culldist_writemask = u_bit_consecutive(0, info->num_written_culldistance);
+
+ if (info->processor == PIPE_SHADER_FRAGMENT)
+ info->uses_kill = nir->info.fs.uses_discard;
+
+ if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
+ info->tessfactors_are_def_in_all_invocs = ac_are_tessfactors_def_in_all_invocs(nir);
+ }
+
+ func = (struct nir_function *)exec_list_get_head_const(&nir->functions);
+ nir_foreach_block (block, func->impl) {
+ nir_foreach_instr (instr, block)
+ scan_instruction(nir, info, instr);
+ }
}
-static void
-si_nir_opts(struct nir_shader *nir)
+static void si_nir_opts(struct nir_shader *nir)
{
- bool progress;
-
- do {
- progress = false;
-
- NIR_PASS_V(nir, nir_lower_vars_to_ssa);
-
- NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
- NIR_PASS(progress, nir, nir_opt_dead_write_vars);
-
- NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
- NIR_PASS_V(nir, nir_lower_phis_to_scalar);
-
- /* (Constant) copy propagation is needed for txf with offsets. */
- NIR_PASS(progress, nir, nir_copy_prop);
- NIR_PASS(progress, nir, nir_opt_remove_phis);
- NIR_PASS(progress, nir, nir_opt_dce);
- if (nir_opt_trivial_continues(nir)) {
- progress = true;
- NIR_PASS(progress, nir, nir_copy_prop);
- NIR_PASS(progress, nir, nir_opt_dce);
- }
- NIR_PASS(progress, nir, nir_opt_if, true);
- NIR_PASS(progress, nir, nir_opt_dead_cf);
- NIR_PASS(progress, nir, nir_opt_cse);
- NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
-
- /* Needed for algebraic lowering */
- NIR_PASS(progress, nir, nir_opt_algebraic);
- NIR_PASS(progress, nir, nir_opt_constant_folding);
-
- if (!nir->info.flrp_lowered) {
- unsigned lower_flrp =
- (nir->options->lower_flrp16 ? 16 : 0) |
- (nir->options->lower_flrp32 ? 32 : 0) |
- (nir->options->lower_flrp64 ? 64 : 0);
- assert(lower_flrp);
- bool lower_flrp_progress = false;
-
- NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp,
- lower_flrp,
- false /* always_precise */,
- nir->options->lower_ffma);
- if (lower_flrp_progress) {
- NIR_PASS(progress, nir,
- nir_opt_constant_folding);
- progress = true;
- }
-
- /* Nothing should rematerialize any flrps, so we only
- * need to do this lowering once.
- */
- nir->info.flrp_lowered = true;
- }
-
- NIR_PASS(progress, nir, nir_opt_undef);
- NIR_PASS(progress, nir, nir_opt_conditional_discard);
- if (nir->options->max_unroll_iterations) {
- NIR_PASS(progress, nir, nir_opt_loop_unroll, 0);
- }
- } while (progress);
+ bool progress;
+
+ do {
+ progress = false;
+
+ NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+
+ NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
+ NIR_PASS(progress, nir, nir_opt_dead_write_vars);
+
+ NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
+ NIR_PASS_V(nir, nir_lower_phis_to_scalar);
+
+ /* (Constant) copy propagation is needed for txf with offsets. */
+ NIR_PASS(progress, nir, nir_copy_prop);
+ NIR_PASS(progress, nir, nir_opt_remove_phis);
+ NIR_PASS(progress, nir, nir_opt_dce);
+ if (nir_opt_trivial_continues(nir)) {
+ progress = true;
+ NIR_PASS(progress, nir, nir_copy_prop);
+ NIR_PASS(progress, nir, nir_opt_dce);
+ }
+ NIR_PASS(progress, nir, nir_opt_if, true);
+ NIR_PASS(progress, nir, nir_opt_dead_cf);
+ NIR_PASS(progress, nir, nir_opt_cse);
+ NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);
+
+ /* Needed for algebraic lowering */
+ NIR_PASS(progress, nir, nir_opt_algebraic);
+ NIR_PASS(progress, nir, nir_opt_constant_folding);
+
+ if (!nir->info.flrp_lowered) {
+ unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) |
+ (nir->options->lower_flrp32 ? 32 : 0) |
+ (nir->options->lower_flrp64 ? 64 : 0);
+ assert(lower_flrp);
+ bool lower_flrp_progress = false;
+
+ NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */,
+ nir->options->lower_ffma);
+ if (lower_flrp_progress) {
+ NIR_PASS(progress, nir, nir_opt_constant_folding);
+ progress = true;
+ }
+
+ /* Nothing should rematerialize any flrps, so we only
+ * need to do this lowering once.
+ */
+ nir->info.flrp_lowered = true;
+ }
+
+ NIR_PASS(progress, nir, nir_opt_undef);
+ NIR_PASS(progress, nir, nir_opt_conditional_discard);
+ if (nir->options->max_unroll_iterations) {
+ NIR_PASS(progress, nir, nir_opt_loop_unroll, 0);
+ }
+ } while (progress);
}
-static int
-type_size_vec4(const struct glsl_type *type, bool bindless)
+static int type_size_vec4(const struct glsl_type *type, bool bindless)
{
- return glsl_count_attribute_slots(type, false);
+ return glsl_count_attribute_slots(type, false);
}
-static void
-si_nir_lower_color(nir_shader *nir)
+static void si_nir_lower_color(nir_shader *nir)
{
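+ /* Rewrite load_deref of the COL0/COL1 fragment inputs into the dedicated load_color0/load_color1 intrinsics and remove the original loads. */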
- nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
-
- nir_builder b;
- nir_builder_init(&b, entrypoint);
-
- nir_foreach_block(block, entrypoint) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_intrinsic)
- continue;
-
- nir_intrinsic_instr *intrin =
- nir_instr_as_intrinsic(instr);
-
- if (intrin->intrinsic != nir_intrinsic_load_deref)
- continue;
-
- nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
- if (deref->mode != nir_var_shader_in)
- continue;
-
- b.cursor = nir_before_instr(instr);
- nir_variable *var = nir_deref_instr_get_variable(deref);
- nir_ssa_def *def;
-
- if (var->data.location == VARYING_SLOT_COL0) {
- def = nir_load_color0(&b);
- } else if (var->data.location == VARYING_SLOT_COL1) {
- def = nir_load_color1(&b);
- } else {
- continue;
- }
-
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(def));
- nir_instr_remove(instr);
- }
- }
+ nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
+
+ nir_builder b;
+ nir_builder_init(&b, entrypoint);
+
+ nir_foreach_block (block, entrypoint) {
+ nir_foreach_instr_safe (instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+ if (intrin->intrinsic != nir_intrinsic_load_deref)
+ continue;
+
+ nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+ if (deref->mode != nir_var_shader_in)
+ continue;
+
+ b.cursor = nir_before_instr(instr);
+ nir_variable *var = nir_deref_instr_get_variable(deref);
+ nir_ssa_def *def;
+
+ if (var->data.location == VARYING_SLOT_COL0) {
+ def = nir_load_color0(&b);
+ } else if (var->data.location == VARYING_SLOT_COL1) {
+ def = nir_load_color1(&b);
+ } else {
+ continue;
+ }
+
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(def));
+ nir_instr_remove(instr);
+ }
+ }
}
static void si_nir_lower_ps_inputs(struct nir_shader *nir)
{
- if (nir->info.stage != MESA_SHADER_FRAGMENT)
- return;
-
- NIR_PASS_V(nir, nir_lower_io_to_temporaries,
- nir_shader_get_entrypoint(nir), false, true);
-
- /* Since we're doing nir_lower_io_to_temporaries late, we need
- * to lower all the copy_deref's introduced by
- * lower_io_to_temporaries before calling nir_lower_io.
- */
- NIR_PASS_V(nir, nir_split_var_copies);
- NIR_PASS_V(nir, nir_lower_var_copies);
- NIR_PASS_V(nir, nir_lower_global_vars_to_local);
-
- si_nir_lower_color(nir);
- NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0);
-
- /* This pass needs actual constants */
- NIR_PASS_V(nir, nir_opt_constant_folding);
- NIR_PASS_V(nir, nir_io_add_const_offset_to_base,
- nir_var_shader_in);
+ if (nir->info.stage != MESA_SHADER_FRAGMENT)
+ return;
+
+ NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), false, true);
+
+ /* Since we're doing nir_lower_io_to_temporaries late, we need
+ * to lower all the copy_deref's introduced by
+ * lower_io_to_temporaries before calling nir_lower_io.
+ */
+ NIR_PASS_V(nir, nir_split_var_copies);
+ NIR_PASS_V(nir, nir_lower_var_copies);
+ NIR_PASS_V(nir, nir_lower_global_vars_to_local);
+
+ si_nir_lower_color(nir);
+ NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0);
+
+ /* This pass needs actual constants */
+ NIR_PASS_V(nir, nir_opt_constant_folding);
+ NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in);
}
void si_nir_adjust_driver_locations(struct nir_shader *nir)
{
- /* Adjust the driver location of inputs and outputs. The state tracker
- * interprets them as slots, while the ac/nir backend interprets them
- * as individual components.
- */
- if (nir->info.stage != MESA_SHADER_FRAGMENT) {
- nir_foreach_variable(variable, &nir->inputs)
- variable->data.driver_location *= 4;
- }
-
- nir_foreach_variable(variable, &nir->outputs)
- variable->data.driver_location *= 4;
+ /* Adjust the driver location of inputs and outputs. The state tracker
+ * interprets them as slots, while the ac/nir backend interprets them
+ * as individual components.
+ */
+ if (nir->info.stage != MESA_SHADER_FRAGMENT) {
+ nir_foreach_variable (variable, &nir->inputs)
+ variable->data.driver_location *= 4;
+ }
+
+ nir_foreach_variable (variable, &nir->outputs)
+ variable->data.driver_location *= 4;
}
/**
*/
static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
{
- /* Perform lowerings (and optimizations) of code.
- *
- * Performance considerations aside, we must:
- * - lower certain ALU operations
- * - ensure constant offsets for texture instructions are folded
- * and copy-propagated
- */
-
- static const struct nir_lower_tex_options lower_tex_options = {
- .lower_txp = ~0u,
- };
- NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
-
- const nir_lower_subgroups_options subgroups_options = {
- .subgroup_size = 64,
- .ballot_bit_size = 64,
- .lower_to_scalar = true,
- .lower_subgroup_masks = true,
- .lower_vote_trivial = false,
- .lower_vote_eq_to_ballot = true,
- };
- NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options);
-
- /* Lower load constants to scalar and then clean up the mess */
- NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
- NIR_PASS_V(nir, nir_lower_var_copies);
- NIR_PASS_V(nir, nir_lower_pack);
- NIR_PASS_V(nir, nir_opt_access);
- si_nir_opts(nir);
-
- /* Lower large variables that are always constant with load_constant
- * intrinsics, which get turned into PC-relative loads from a data
- * section next to the shader.
- *
- * st/mesa calls finalize_nir twice, but we can't call this pass twice.
- */
- bool changed = false;
- if (!nir->constant_data) {
- NIR_PASS(changed, nir, nir_opt_large_constants,
- glsl_get_natural_size_align_bytes, 16);
- }
-
- changed |= ac_lower_indirect_derefs(nir, sscreen->info.chip_class);
- if (changed)
- si_nir_opts(nir);
-
- NIR_PASS_V(nir, nir_lower_bool_to_int32);
- NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp);
-
- if (sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
- NIR_PASS_V(nir, nir_lower_discard_to_demote);
+ /* Perform lowerings (and optimizations) of code.
+ *
+ * Performance considerations aside, we must:
+ * - lower certain ALU operations
+ * - ensure constant offsets for texture instructions are folded
+ * and copy-propagated
+ */
+
+ static const struct nir_lower_tex_options lower_tex_options = {
+ .lower_txp = ~0u,
+ };
+ NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
+
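+ /* Note: the subgroup/ballot lowering below assumes a 64-lane (Wave64) subgroup size. */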
+ const nir_lower_subgroups_options subgroups_options = {
+ .subgroup_size = 64,
+ .ballot_bit_size = 64,
+ .lower_to_scalar = true,
+ .lower_subgroup_masks = true,
+ .lower_vote_trivial = false,
+ .lower_vote_eq_to_ballot = true,
+ };
+ NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options);
+
+ /* Lower load constants to scalar and then clean up the mess */
+ NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
+ NIR_PASS_V(nir, nir_lower_var_copies);
+ NIR_PASS_V(nir, nir_lower_pack);
+ NIR_PASS_V(nir, nir_opt_access);
+ si_nir_opts(nir);
+
+ /* Lower large variables that are always constant with load_constant
+ * intrinsics, which get turned into PC-relative loads from a data
+ * section next to the shader.
+ *
+ * st/mesa calls finalize_nir twice, but we can't call this pass twice.
+ */
+ bool changed = false;
+ if (!nir->constant_data) {
+ NIR_PASS(changed, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
+ }
+
+ changed |= ac_lower_indirect_derefs(nir, sscreen->info.chip_class);
+ if (changed)
+ si_nir_opts(nir);
+
+ NIR_PASS_V(nir, nir_lower_bool_to_int32);
+ NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp);
+
+ if (sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
+ NIR_PASS_V(nir, nir_lower_discard_to_demote);
}
void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize)
{
- struct si_screen *sscreen = (struct si_screen *)screen;
- struct nir_shader *nir = (struct nir_shader *)nirptr;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct nir_shader *nir = (struct nir_shader *)nirptr;
- nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
- si_nir_lower_ps_inputs(nir);
- si_lower_nir(sscreen, nir);
+ nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
+ si_nir_lower_ps_inputs(nir);
+ si_lower_nir(sscreen, nir);
}
#include "tgsi/tgsi_text.h"
#include "tgsi/tgsi_ureg.h"
-void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
- unsigned num_layers)
+void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers)
{
- unsigned vs_blit_property;
- void **vs;
-
- switch (type) {
- case UTIL_BLITTER_ATTRIB_NONE:
- vs = num_layers > 1 ? &sctx->vs_blit_pos_layered :
- &sctx->vs_blit_pos;
- vs_blit_property = SI_VS_BLIT_SGPRS_POS;
- break;
- case UTIL_BLITTER_ATTRIB_COLOR:
- vs = num_layers > 1 ? &sctx->vs_blit_color_layered :
- &sctx->vs_blit_color;
- vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR;
- break;
- case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
- case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
- assert(num_layers == 1);
- vs = &sctx->vs_blit_texcoord;
- vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD;
- break;
- default:
- assert(0);
- return NULL;
- }
- if (*vs)
- return *vs;
-
- struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
- if (!ureg)
- return NULL;
-
- /* Tell the shader to load VS inputs from SGPRs: */
- ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS_AMD, vs_blit_property);
- ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true);
-
- /* This is just a pass-through shader with 1-3 MOV instructions. */
- ureg_MOV(ureg,
- ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0),
- ureg_DECL_vs_input(ureg, 0));
-
- if (type != UTIL_BLITTER_ATTRIB_NONE) {
- ureg_MOV(ureg,
- ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0),
- ureg_DECL_vs_input(ureg, 1));
- }
-
- if (num_layers > 1) {
- struct ureg_src instance_id =
- ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0);
- struct ureg_dst layer =
- ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0);
-
- ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X),
- ureg_scalar(instance_id, TGSI_SWIZZLE_X));
- }
- ureg_END(ureg);
-
- *vs = ureg_create_shader_and_destroy(ureg, &sctx->b);
- return *vs;
+ unsigned vs_blit_property;
+ void **vs;
+
+ switch (type) {
+ case UTIL_BLITTER_ATTRIB_NONE:
+ vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : &sctx->vs_blit_pos;
+ vs_blit_property = SI_VS_BLIT_SGPRS_POS;
+ break;
+ case UTIL_BLITTER_ATTRIB_COLOR:
+ vs = num_layers > 1 ? &sctx->vs_blit_color_layered : &sctx->vs_blit_color;
+ vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR;
+ break;
+ case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
+ case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
+ assert(num_layers == 1);
+ vs = &sctx->vs_blit_texcoord;
+ vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD;
+ break;
+ default:
+ assert(0);
+ return NULL;
+ }
+ if (*vs)
+ return *vs;
+
+ struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
+ if (!ureg)
+ return NULL;
+
+ /* Tell the shader to load VS inputs from SGPRs: */
+ ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS_AMD, vs_blit_property);
+ ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true);
+
+ /* This is just a pass-through shader with 1-3 MOV instructions. */
+ ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), ureg_DECL_vs_input(ureg, 0));
+
+ if (type != UTIL_BLITTER_ATTRIB_NONE) {
+ ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0), ureg_DECL_vs_input(ureg, 1));
+ }
+
+ if (num_layers > 1) {
+ struct ureg_src instance_id = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0);
+ struct ureg_dst layer = ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0);
+
+ ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X),
+ ureg_scalar(instance_id, TGSI_SWIZZLE_X));
+ }
+ ureg_END(ureg);
+
+ *vs = ureg_create_shader_and_destroy(ureg, &sctx->b);
+ return *vs;
}
/**
*/
void *si_create_fixed_func_tcs(struct si_context *sctx)
{
- struct ureg_src outer, inner;
- struct ureg_dst tessouter, tessinner;
- struct ureg_program *ureg = ureg_create(PIPE_SHADER_TESS_CTRL);
+ struct ureg_src outer, inner;
+ struct ureg_dst tessouter, tessinner;
+ struct ureg_program *ureg = ureg_create(PIPE_SHADER_TESS_CTRL);
- if (!ureg)
- return NULL;
+ if (!ureg)
+ return NULL;
- outer = ureg_DECL_system_value(ureg,
- TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL, 0);
- inner = ureg_DECL_system_value(ureg,
- TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL, 0);
+ outer = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL, 0);
+ inner = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL, 0);
- tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0);
- tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0);
+ tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0);
+ tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0);
- ureg_MOV(ureg, tessouter, outer);
- ureg_MOV(ureg, tessinner, inner);
- ureg_END(ureg);
+ ureg_MOV(ureg, tessouter, outer);
+ ureg_MOV(ureg, tessinner, inner);
+ ureg_END(ureg);
- return ureg_create_shader_and_destroy(ureg, &sctx->b);
+ return ureg_create_shader_and_destroy(ureg, &sctx->b);
}
/* Create a compute shader implementing clear_buffer or copy_buffer. */
-void *si_create_dma_compute_shader(struct pipe_context *ctx,
- unsigned num_dwords_per_thread,
- bool dst_stream_cache_policy, bool is_copy)
+void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
+ bool dst_stream_cache_policy, bool is_copy)
{
- struct si_screen *sscreen = (struct si_screen *)ctx->screen;
- assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
-
- unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
- if (dst_stream_cache_policy)
- store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
-
- /* Don't cache loads, because there is no reuse. */
- unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;
-
- unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
- unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
-
- for (unsigned i = 0; i < num_mem_ops; i++) {
- if (i*4 < num_dwords_per_thread)
- inst_dwords[i] = MIN2(4, num_dwords_per_thread - i*4);
- }
-
- struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
- if (!ureg)
- return NULL;
-
- ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, sscreen->compute_wave_size);
- ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
- ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
-
- struct ureg_src value;
- if (!is_copy) {
- ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]);
- value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0);
- }
-
- struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
- struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
- struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
- struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
- struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
- struct ureg_src srcbuf;
- struct ureg_src *values = NULL;
-
- if (is_copy) {
- srcbuf = ureg_DECL_buffer(ureg, 1, false);
- values = malloc(num_mem_ops * sizeof(struct ureg_src));
- }
-
- /* If there are multiple stores, the first store writes into 0*wavesize+tid,
- * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc.
- */
- ureg_UMAD(ureg, store_addr, blk,
- ureg_imm1u(ureg, sscreen->compute_wave_size * num_mem_ops), tid);
- /* Convert from a "store size unit" into bytes. */
- ureg_UMUL(ureg, store_addr, ureg_src(store_addr),
- ureg_imm1u(ureg, 4 * inst_dwords[0]));
- ureg_MOV(ureg, load_addr, ureg_src(store_addr));
-
- /* Distance between a load and a store for latency hiding. */
- unsigned load_store_distance = is_copy ? 8 : 0;
-
- for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
- int d = i - load_store_distance;
-
- if (is_copy && i < num_mem_ops) {
- if (i) {
- ureg_UADD(ureg, load_addr, ureg_src(load_addr),
- ureg_imm1u(ureg, 4 * inst_dwords[i] *
- sscreen->compute_wave_size));
- }
-
- values[i] = ureg_src(ureg_DECL_temporary(ureg));
- struct ureg_dst dst =
- ureg_writemask(ureg_dst(values[i]),
- u_bit_consecutive(0, inst_dwords[i]));
- struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
- ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2,
- load_qualifier, TGSI_TEXTURE_BUFFER, 0);
- }
-
- if (d >= 0) {
- if (d) {
- ureg_UADD(ureg, store_addr, ureg_src(store_addr),
- ureg_imm1u(ureg, 4 * inst_dwords[d] *
- sscreen->compute_wave_size));
- }
-
- struct ureg_dst dst =
- ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
- struct ureg_src srcs[] =
- {ureg_src(store_addr), is_copy ? values[d] : value};
- ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2,
- store_qualifier, TGSI_TEXTURE_BUFFER, 0);
- }
- }
- ureg_END(ureg);
-
- struct pipe_compute_state state = {};
- state.ir_type = PIPE_SHADER_IR_TGSI;
- state.prog = ureg_get_tokens(ureg, NULL);
-
- void *cs = ctx->create_compute_state(ctx, &state);
- ureg_destroy(ureg);
- ureg_free_tokens(state.prog);
-
- free(values);
- return cs;
+ struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+ assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
+
+ unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
+ if (dst_stream_cache_policy)
+ store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
+
+ /* Don't cache loads, because there is no reuse. */
+ unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;
+
+ unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
+ unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
+
+ for (unsigned i = 0; i < num_mem_ops; i++) {
+ if (i * 4 < num_dwords_per_thread)
+ inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
+ }
+
+ struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+ if (!ureg)
+ return NULL;
+
+ ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, sscreen->compute_wave_size);
+ ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
+ ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+
+ struct ureg_src value;
+ if (!is_copy) {
+ ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]);
+ value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0);
+ }
+
+ struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+ struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+ struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+ struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+ struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
+ struct ureg_src srcbuf;
+ struct ureg_src *values = NULL;
+
+ if (is_copy) {
+ srcbuf = ureg_DECL_buffer(ureg, 1, false);
+ values = malloc(num_mem_ops * sizeof(struct ureg_src));
+ }
+
+ /* If there are multiple stores, the first store writes into 0*wavesize+tid,
+ * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc.
+ */
+ ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, sscreen->compute_wave_size * num_mem_ops),
+ tid);
+ /* Convert from a "store size unit" into bytes. */
+ ureg_UMUL(ureg, store_addr, ureg_src(store_addr), ureg_imm1u(ureg, 4 * inst_dwords[0]));
+ ureg_MOV(ureg, load_addr, ureg_src(store_addr));
+
+ /* Distance between a load and a store for latency hiding. */
+ unsigned load_store_distance = is_copy ? 8 : 0;
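+ /* The loop below is software-pipelined: for copies, each load is issued load_store_distance iterations ahead of its matching store to help hide memory latency. */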
+
+ for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
+ int d = i - load_store_distance;
+
+ if (is_copy && i < num_mem_ops) {
+ if (i) {
+ ureg_UADD(ureg, load_addr, ureg_src(load_addr),
+ ureg_imm1u(ureg, 4 * inst_dwords[i] * sscreen->compute_wave_size));
+ }
+
+ values[i] = ureg_src(ureg_DECL_temporary(ureg));
+ struct ureg_dst dst =
+ ureg_writemask(ureg_dst(values[i]), u_bit_consecutive(0, inst_dwords[i]));
+ struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
+ ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, load_qualifier,
+ TGSI_TEXTURE_BUFFER, 0);
+ }
+
+ if (d >= 0) {
+ if (d) {
+ ureg_UADD(ureg, store_addr, ureg_src(store_addr),
+ ureg_imm1u(ureg, 4 * inst_dwords[d] * sscreen->compute_wave_size));
+ }
+
+ struct ureg_dst dst = ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
+ struct ureg_src srcs[] = {ureg_src(store_addr), is_copy ? values[d] : value};
+ ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, store_qualifier,
+ TGSI_TEXTURE_BUFFER, 0);
+ }
+ }
+ ureg_END(ureg);
+
+ struct pipe_compute_state state = {};
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = ureg_get_tokens(ureg, NULL);
+
+ void *cs = ctx->create_compute_state(ctx, &state);
+ ureg_destroy(ureg);
+ ureg_free_tokens(state.prog);
+
+ free(values);
+ return cs;
}
/* Create a compute shader that copies DCC from one buffer to another
*/
void *si_create_dcc_retile_cs(struct pipe_context *ctx)
{
- struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
- if (!ureg)
- return NULL;
-
- ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64);
- ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
- ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
-
- /* Compute the global thread ID (in idx). */
- struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
- struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
- struct ureg_dst idx = ureg_writemask(ureg_DECL_temporary(ureg),
- TGSI_WRITEMASK_X);
- ureg_UMAD(ureg, idx, blk, ureg_imm1u(ureg, 64), tid);
-
- /* Load 2 pairs of offsets for DCC load & store. */
- struct ureg_src map = ureg_DECL_image(ureg, 0, TGSI_TEXTURE_BUFFER, 0, false, false);
- struct ureg_dst offsets = ureg_DECL_temporary(ureg);
- struct ureg_src map_load_args[] = {map, ureg_src(idx)};
-
- ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &offsets, 1, map_load_args, 2,
- TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0);
-
- struct ureg_src dcc_src = ureg_DECL_image(ureg, 1, TGSI_TEXTURE_BUFFER,
- 0, false, false);
- struct ureg_dst dcc_dst = ureg_dst(ureg_DECL_image(ureg, 2, TGSI_TEXTURE_BUFFER,
- 0, true, false));
- struct ureg_dst dcc_value[2];
-
- /* Copy DCC values:
- * dst[offsets.y] = src[offsets.x];
- * dst[offsets.w] = src[offsets.z];
- */
- for (unsigned i = 0; i < 2; i++) {
- dcc_value[i] = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
-
- struct ureg_src load_args[] =
- {dcc_src, ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_X + i*2)};
- ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dcc_value[i], 1, load_args, 2,
- TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0);
- }
-
- dcc_dst = ureg_writemask(dcc_dst, TGSI_WRITEMASK_X);
-
- for (unsigned i = 0; i < 2; i++) {
- struct ureg_src store_args[] = {
- ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_Y + i*2),
- ureg_src(dcc_value[i])
- };
- ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dcc_dst, 1, store_args, 2,
- TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0);
- }
- ureg_END(ureg);
-
- struct pipe_compute_state state = {};
- state.ir_type = PIPE_SHADER_IR_TGSI;
- state.prog = ureg_get_tokens(ureg, NULL);
-
- void *cs = ctx->create_compute_state(ctx, &state);
- ureg_destroy(ureg);
- return cs;
+ struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+ if (!ureg)
+ return NULL;
+
+ ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64);
+ ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
+ ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+
+ /* Compute the global thread ID (in idx). */
+ struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+ struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+ struct ureg_dst idx = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+ ureg_UMAD(ureg, idx, blk, ureg_imm1u(ureg, 64), tid);
+
+ /* Load 2 pairs of offsets for DCC load & store. */
+ struct ureg_src map = ureg_DECL_image(ureg, 0, TGSI_TEXTURE_BUFFER, 0, false, false);
+ struct ureg_dst offsets = ureg_DECL_temporary(ureg);
+ struct ureg_src map_load_args[] = {map, ureg_src(idx)};
+
+ ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &offsets, 1, map_load_args, 2, TGSI_MEMORY_RESTRICT,
+ TGSI_TEXTURE_BUFFER, 0);
+
+ struct ureg_src dcc_src = ureg_DECL_image(ureg, 1, TGSI_TEXTURE_BUFFER, 0, false, false);
+ struct ureg_dst dcc_dst =
+ ureg_dst(ureg_DECL_image(ureg, 2, TGSI_TEXTURE_BUFFER, 0, true, false));
+ struct ureg_dst dcc_value[2];
+
+ /* Copy DCC values:
+ * dst[offsets.y] = src[offsets.x];
+ * dst[offsets.w] = src[offsets.z];
+ */
+ for (unsigned i = 0; i < 2; i++) {
+ dcc_value[i] = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+
+ struct ureg_src load_args[] = {dcc_src,
+ ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_X + i * 2)};
+ ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dcc_value[i], 1, load_args, 2, TGSI_MEMORY_RESTRICT,
+ TGSI_TEXTURE_BUFFER, 0);
+ }
+
+ dcc_dst = ureg_writemask(dcc_dst, TGSI_WRITEMASK_X);
+
+ for (unsigned i = 0; i < 2; i++) {
+ struct ureg_src store_args[] = {ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_Y + i * 2),
+ ureg_src(dcc_value[i])};
+ ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dcc_dst, 1, store_args, 2, TGSI_MEMORY_RESTRICT,
+ TGSI_TEXTURE_BUFFER, 0);
+ }
+ ureg_END(ureg);
+
+ struct pipe_compute_state state = {};
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = ureg_get_tokens(ureg, NULL);
+
+ void *cs = ctx->create_compute_state(ctx, &state);
+ ureg_destroy(ureg);
+ return cs;
}
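
For readers less comfortable with ureg/TGSI, the shader body above reduces to the scalar sketch below. The 64-thread block width and the offsets.xy/zw copy pattern come from the code; the element types and the helper name are illustrative only, since the real sizes depend on the buffer views bound at dispatch time.

#include <stdint.h>

/* One thread of the DCC retile shader, in scalar form (illustrative types). */
static void dcc_retile_thread(const uint32_t (*offsets_map)[4], const uint8_t *dcc_src,
                              uint8_t *dcc_dst, unsigned block_id, unsigned thread_id)
{
   unsigned idx = block_id * 64 + thread_id; /* CS_FIXED_BLOCK_WIDTH is 64 */
   const uint32_t *off = offsets_map[idx];   /* one xyzw element: two (load, store) pairs */

   dcc_dst[off[1]] = dcc_src[off[0]];        /* dst[offsets.y] = src[offsets.x] */
   dcc_dst[off[3]] = dcc_src[off[2]];        /* dst[offsets.w] = src[offsets.z] */
}
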
/* Create the compute shader that is used to collect the results.
*/
void *si_create_query_result_cs(struct si_context *sctx)
{
- /* TEMP[0].xy = accumulated result so far
- * TEMP[0].z = result not available
- *
- * TEMP[1].x = current result index
- * TEMP[1].y = current pair index
- */
- static const char text_tmpl[] =
- "COMP\n"
- "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
- "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
- "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
- "DCL BUFFER[0]\n"
- "DCL BUFFER[1]\n"
- "DCL BUFFER[2]\n"
- "DCL CONST[0][0..1]\n"
- "DCL TEMP[0..5]\n"
- "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
- "IMM[1] UINT32 {1, 2, 4, 8}\n"
- "IMM[2] UINT32 {16, 32, 64, 128}\n"
- "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
- "IMM[4] UINT32 {256, 0, 0, 0}\n"
-
- "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
- "UIF TEMP[5]\n"
- /* Check result availability. */
- "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
- "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
- "MOV TEMP[1], TEMP[0].zzzz\n"
- "NOT TEMP[0].z, TEMP[0].zzzz\n"
-
- /* Load result if available. */
- "UIF TEMP[1]\n"
- "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
- "ENDIF\n"
- "ELSE\n"
- /* Load previously accumulated result if requested. */
- "MOV TEMP[0], IMM[0].xxxx\n"
- "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
- "UIF TEMP[4]\n"
- "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
- "ENDIF\n"
-
- "MOV TEMP[1].x, IMM[0].xxxx\n"
- "BGNLOOP\n"
- /* Break if accumulated result so far is not available. */
- "UIF TEMP[0].zzzz\n"
- "BRK\n"
- "ENDIF\n"
-
- /* Break if result_index >= result_count. */
- "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
- "UIF TEMP[5]\n"
- "BRK\n"
- "ENDIF\n"
-
- /* Load fence and check result availability */
- "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
- "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
- "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
- "NOT TEMP[0].z, TEMP[0].zzzz\n"
- "UIF TEMP[0].zzzz\n"
- "BRK\n"
- "ENDIF\n"
-
- "MOV TEMP[1].y, IMM[0].xxxx\n"
- "BGNLOOP\n"
- /* Load start and end. */
- "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
- "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
- "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
-
- "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
- "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
-
- "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"
-
- "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
- "UIF TEMP[5].zzzz\n"
- /* Load second start/end half-pair and
- * take the difference
- */
- "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
- "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
- "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
-
- "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
- "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
- "ENDIF\n"
-
- "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"
-
- /* Increment pair index */
- "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
- "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
- "UIF TEMP[5]\n"
- "BRK\n"
- "ENDIF\n"
- "ENDLOOP\n"
-
- /* Increment result index */
- "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
- "ENDLOOP\n"
- "ENDIF\n"
-
- "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
- "UIF TEMP[4]\n"
- /* Store accumulated data for chaining. */
- "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
- "ELSE\n"
- "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
- "UIF TEMP[4]\n"
- /* Store result availability. */
- "NOT TEMP[0].z, TEMP[0]\n"
- "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
- "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
-
- "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
- "UIF TEMP[4]\n"
- "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
- "ENDIF\n"
- "ELSE\n"
- /* Store result if it is available. */
- "NOT TEMP[4], TEMP[0].zzzz\n"
- "UIF TEMP[4]\n"
- /* Apply timestamp conversion */
- "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
- "UIF TEMP[4]\n"
- "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
- "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
- "ENDIF\n"
-
- /* Convert to boolean */
- "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
- "UIF TEMP[4]\n"
- "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
- "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
- "MOV TEMP[0].y, IMM[0].xxxx\n"
- "ENDIF\n"
-
- "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
- "UIF TEMP[4]\n"
- "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
- "ELSE\n"
- /* Clamping */
- "UIF TEMP[0].yyyy\n"
- "MOV TEMP[0].x, IMM[0].wwww\n"
- "ENDIF\n"
-
- "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
- "UIF TEMP[4]\n"
- "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
- "ENDIF\n"
-
- "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
- "ENDIF\n"
- "ENDIF\n"
- "ENDIF\n"
- "ENDIF\n"
-
- "END\n";
-
- char text[sizeof(text_tmpl) + 32];
- struct tgsi_token tokens[1024];
- struct pipe_compute_state state = {};
-
- /* Hard code the frequency into the shader so that the backend can
- * use the full range of optimizations for divide-by-constant.
- */
- snprintf(text, sizeof(text), text_tmpl,
- sctx->screen->info.clock_crystal_freq);
-
- if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
- assert(false);
- return NULL;
- }
-
- state.ir_type = PIPE_SHADER_IR_TGSI;
- state.prog = tokens;
-
- return sctx->b.create_compute_state(&sctx->b, &state);
+ /* TEMP[0].xy = accumulated result so far
+ * TEMP[0].z = result not available
+ *
+ * TEMP[1].x = current result index
+ * TEMP[1].y = current pair index
+ */
+ static const char text_tmpl[] =
+ "COMP\n"
+ "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
+ "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+ "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+ "DCL BUFFER[0]\n"
+ "DCL BUFFER[1]\n"
+ "DCL BUFFER[2]\n"
+ "DCL CONST[0][0..1]\n"
+ "DCL TEMP[0..5]\n"
+ "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
+ "IMM[1] UINT32 {1, 2, 4, 8}\n"
+ "IMM[2] UINT32 {16, 32, 64, 128}\n"
+ "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
+ "IMM[4] UINT32 {256, 0, 0, 0}\n"
+
+ "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
+ "UIF TEMP[5]\n"
+ /* Check result availability. */
+ "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
+ "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
+ "MOV TEMP[1], TEMP[0].zzzz\n"
+ "NOT TEMP[0].z, TEMP[0].zzzz\n"
+
+ /* Load result if available. */
+ "UIF TEMP[1]\n"
+ "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
+ "ENDIF\n"
+ "ELSE\n"
+ /* Load previously accumulated result if requested. */
+ "MOV TEMP[0], IMM[0].xxxx\n"
+ "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
+ "UIF TEMP[4]\n"
+ "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
+ "ENDIF\n"
+
+ "MOV TEMP[1].x, IMM[0].xxxx\n"
+ "BGNLOOP\n"
+ /* Break if accumulated result so far is not available. */
+ "UIF TEMP[0].zzzz\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ /* Break if result_index >= result_count. */
+ "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
+ "UIF TEMP[5]\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ /* Load fence and check result availability */
+ "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
+ "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
+ "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
+ "NOT TEMP[0].z, TEMP[0].zzzz\n"
+ "UIF TEMP[0].zzzz\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ "MOV TEMP[1].y, IMM[0].xxxx\n"
+ "BGNLOOP\n"
+ /* Load start and end. */
+ "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
+ "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
+ "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
+
+ "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
+ "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
+
+ "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"
+
+ "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
+ "UIF TEMP[5].zzzz\n"
+ /* Load second start/end half-pair and
+ * take the difference
+ */
+ "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
+ "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
+ "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
+
+ "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
+ "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
+ "ENDIF\n"
+
+ "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"
+
+ /* Increment pair index */
+ "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
+ "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
+ "UIF TEMP[5]\n"
+ "BRK\n"
+ "ENDIF\n"
+ "ENDLOOP\n"
+
+ /* Increment result index */
+ "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
+ "ENDLOOP\n"
+ "ENDIF\n"
+
+ "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
+ "UIF TEMP[4]\n"
+ /* Store accumulated data for chaining. */
+ "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
+ "ELSE\n"
+ "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
+ "UIF TEMP[4]\n"
+ /* Store result availability. */
+ "NOT TEMP[0].z, TEMP[0]\n"
+ "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
+ "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
+
+ "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
+ "UIF TEMP[4]\n"
+ "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
+ "ENDIF\n"
+ "ELSE\n"
+ /* Store result if it is available. */
+ "NOT TEMP[4], TEMP[0].zzzz\n"
+ "UIF TEMP[4]\n"
+ /* Apply timestamp conversion */
+ "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
+ "UIF TEMP[4]\n"
+ "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
+ "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
+ "ENDIF\n"
+
+ /* Convert to boolean */
+ "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
+ "UIF TEMP[4]\n"
+ "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
+ "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
+ "MOV TEMP[0].y, IMM[0].xxxx\n"
+ "ENDIF\n"
+
+ "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
+ "UIF TEMP[4]\n"
+ "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
+ "ELSE\n"
+ /* Clamping */
+ "UIF TEMP[0].yyyy\n"
+ "MOV TEMP[0].x, IMM[0].wwww\n"
+ "ENDIF\n"
+
+ "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
+ "UIF TEMP[4]\n"
+ "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
+ "ENDIF\n"
+
+ "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
+ "ENDIF\n"
+ "ENDIF\n"
+ "ENDIF\n"
+ "ENDIF\n"
+
+ "END\n";
+
+ char text[sizeof(text_tmpl) + 32];
+ struct tgsi_token tokens[1024];
+ struct pipe_compute_state state = {};
+
+ /* Hard code the frequency into the shader so that the backend can
+ * use the full range of optimizations for divide-by-constant.
+ */
+ snprintf(text, sizeof(text), text_tmpl, sctx->screen->info.clock_crystal_freq);
+
+ if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+ assert(false);
+ return NULL;
+ }
+
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = tokens;
+
+ return sctx->b.create_compute_state(&sctx->b, &state);
}
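
The %u in IMM[3] is filled with sctx->screen->info.clock_crystal_freq, so the U64MUL/U64DIV pair above is the usual ticks-to-nanoseconds conversion with a compile-time divisor, which is what enables the divide-by-constant optimization mentioned in the comment. A host-side sketch of the same formula, assuming the frequency is reported in kHz as in radeon_info:

#include <stdint.h>

/* ticks * 1000000 / freq_khz == ticks * 1e9 / freq_hz, i.e. nanoseconds. */
static uint64_t ticks_to_ns(uint64_t ticks, uint32_t clock_crystal_freq_khz)
{
   return ticks * 1000000ull / clock_crystal_freq_khz;
}
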
/* Create a compute shader implementing copy_image.
*/
void *si_create_copy_image_compute_shader(struct pipe_context *ctx)
{
- static const char text[] =
- "COMP\n"
- "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
- "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
- "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
- "DCL SV[0], THREAD_ID\n"
- "DCL SV[1], BLOCK_ID\n"
- "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
- "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
- "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
- "DCL TEMP[0..4], LOCAL\n"
- "IMM[0] UINT32 {8, 1, 0, 0}\n"
- "MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
- "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
- "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
- "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
- "MOV TEMP[4].xyz, CONST[0][1].xyzw\n"
- "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n"
- "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
- "END\n";
-
- struct tgsi_token tokens[1024];
- struct pipe_compute_state state = {0};
-
- if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
- assert(false);
- return NULL;
- }
-
- state.ir_type = PIPE_SHADER_IR_TGSI;
- state.prog = tokens;
-
- return ctx->create_compute_state(ctx, &state);
+ static const char text[] =
+ "COMP\n"
+ "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
+ "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
+ "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+ "DCL SV[0], THREAD_ID\n"
+ "DCL SV[1], BLOCK_ID\n"
+ "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+ "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+ "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
+ "DCL TEMP[0..4], LOCAL\n"
+ "IMM[0] UINT32 {8, 1, 0, 0}\n"
+ "MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
+ "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
+ "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
+ "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+ "MOV TEMP[4].xyz, CONST[0][1].xyzw\n"
+ "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n"
+ "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+ "END\n";
+
+ struct tgsi_token tokens[1024];
+ struct pipe_compute_state state = {0};
+
+ if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+ assert(false);
+ return NULL;
+ }
+
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = tokens;
+
+ return ctx->create_compute_state(ctx, &state);
}
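
Written out in scalar C, the address math of the copy_image shader looks like the sketch below. The 8x8x1 block size and the CONST[0] offsets are taken from the TGSI; the function and parameter names are illustrative.

/* Each thread of an 8x8x1 block copies one texel from IMAGE[0] to IMAGE[1].
 * src_off/dst_off correspond to CONST[0][0].xyz and CONST[0][1].xyz. */
static void copy_image_coords(const unsigned tid[3], const unsigned blk[3],
                              const unsigned src_off[3], const unsigned dst_off[3],
                              unsigned src_coord[3], unsigned dst_coord[3])
{
   static const unsigned block_size[3] = {8, 8, 1};

   for (unsigned i = 0; i < 3; i++) {
      unsigned global = blk[i] * block_size[i] + tid[i]; /* UMAD TEMP[1] */
      src_coord[i] = global + src_off[i];                /* UADD, then LOAD  */
      dst_coord[i] = global + dst_off[i];                /* UADD, then STORE */
   }
}
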
void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx)
{
- static const char text[] =
- "COMP\n"
- "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
- "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
- "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
- "DCL SV[0], THREAD_ID\n"
- "DCL SV[1], BLOCK_ID\n"
- "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
- "DCL IMAGE[1], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
- "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
- "DCL TEMP[0..4], LOCAL\n"
- "IMM[0] UINT32 {64, 1, 0, 0}\n"
- "MOV TEMP[0].xy, CONST[0][0].xzzw\n"
- "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
- "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
- "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
- "MOV TEMP[4].xy, CONST[0][1].xzzw\n"
- "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[4].xyzx\n"
- "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
- "END\n";
-
- struct tgsi_token tokens[1024];
- struct pipe_compute_state state = {0};
-
- if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
- assert(false);
- return NULL;
- }
-
- state.ir_type = PIPE_SHADER_IR_TGSI;
- state.prog = tokens;
-
- return ctx->create_compute_state(ctx, &state);
+ static const char text[] =
+ "COMP\n"
+ "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
+ "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+ "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+ "DCL SV[0], THREAD_ID\n"
+ "DCL SV[1], BLOCK_ID\n"
+ "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+ "DCL IMAGE[1], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+ "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
+ "DCL TEMP[0..4], LOCAL\n"
+ "IMM[0] UINT32 {64, 1, 0, 0}\n"
+ "MOV TEMP[0].xy, CONST[0][0].xzzw\n"
+ "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
+ "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
+ "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+ "MOV TEMP[4].xy, CONST[0][1].xzzw\n"
+ "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[4].xyzx\n"
+ "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+ "END\n";
+
+ struct tgsi_token tokens[1024];
+ struct pipe_compute_state state = {0};
+
+ if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+ assert(false);
+ return NULL;
+ }
+
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = tokens;
+
+ return ctx->create_compute_state(ctx, &state);
}
void *si_clear_render_target_shader(struct pipe_context *ctx)
{
- static const char text[] =
- "COMP\n"
- "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
- "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
- "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
- "DCL SV[0], THREAD_ID\n"
- "DCL SV[1], BLOCK_ID\n"
- "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
- "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
- "DCL TEMP[0..3], LOCAL\n"
- "IMM[0] UINT32 {8, 1, 0, 0}\n"
- "MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
- "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
- "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
- "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
- "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
- "END\n";
-
- struct tgsi_token tokens[1024];
- struct pipe_compute_state state = {0};
-
- if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
- assert(false);
- return NULL;
- }
-
- state.ir_type = PIPE_SHADER_IR_TGSI;
- state.prog = tokens;
-
- return ctx->create_compute_state(ctx, &state);
+ static const char text[] =
+ "COMP\n"
+ "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
+ "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
+ "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+ "DCL SV[0], THREAD_ID\n"
+ "DCL SV[1], BLOCK_ID\n"
+ "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+ "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
+ "DCL TEMP[0..3], LOCAL\n"
+ "IMM[0] UINT32 {8, 1, 0, 0}\n"
+ "MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
+ "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
+ "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
+ "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
+ "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+ "END\n";
+
+ struct tgsi_token tokens[1024];
+ struct pipe_compute_state state = {0};
+
+ if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+ assert(false);
+ return NULL;
+ }
+
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = tokens;
+
+ return ctx->create_compute_state(ctx, &state);
}
/* TODO: Didn't really test 1D_ARRAY */
void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx)
{
- static const char text[] =
- "COMP\n"
- "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
- "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
- "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
- "DCL SV[0], THREAD_ID\n"
- "DCL SV[1], BLOCK_ID\n"
- "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
- "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
- "DCL TEMP[0..3], LOCAL\n"
- "IMM[0] UINT32 {64, 1, 0, 0}\n"
- "MOV TEMP[0].xy, CONST[0][0].xzzw\n"
- "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
- "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
- "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
- "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
- "END\n";
-
- struct tgsi_token tokens[1024];
- struct pipe_compute_state state = {0};
-
- if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
- assert(false);
- return NULL;
- }
-
- state.ir_type = PIPE_SHADER_IR_TGSI;
- state.prog = tokens;
-
- return ctx->create_compute_state(ctx, &state);
+ static const char text[] =
+ "COMP\n"
+ "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
+ "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+ "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+ "DCL SV[0], THREAD_ID\n"
+ "DCL SV[1], BLOCK_ID\n"
+ "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+ "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
+ "DCL TEMP[0..3], LOCAL\n"
+ "IMM[0] UINT32 {64, 1, 0, 0}\n"
+ "MOV TEMP[0].xy, CONST[0][0].xzzw\n"
+ "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
+ "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
+ "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
+ "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+ "END\n";
+
+ struct tgsi_token tokens[1024];
+ struct pipe_compute_state state = {0};
+
+ if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+ assert(false);
+ return NULL;
+ }
+
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = tokens;
+
+ return ctx->create_compute_state(ctx, &state);
}
void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx)
{
- static const char text[] =
- "COMP\n"
- "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
- "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
- "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
- "DCL SV[0], THREAD_ID\n"
- "DCL SV[1], BLOCK_ID\n"
- "DCL BUFFER[0]\n"
- "DCL CONST[0][0..0]\n" // 0:xyzw
- "DCL TEMP[0..0]\n"
- "IMM[0] UINT32 {64, 1, 12, 0}\n"
- "UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
- "UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" //12 bytes
- "STORE BUFFER[0].xyz, TEMP[0].xxxx, CONST[0][0].xyzw\n"
- "END\n";
-
- struct tgsi_token tokens[1024];
- struct pipe_compute_state state = {0};
-
- if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
- assert(false);
- return NULL;
- }
-
- state.ir_type = PIPE_SHADER_IR_TGSI;
- state.prog = tokens;
-
- return ctx->create_compute_state(ctx, &state);
+ static const char text[] = "COMP\n"
+ "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
+ "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+ "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+ "DCL SV[0], THREAD_ID\n"
+ "DCL SV[1], BLOCK_ID\n"
+ "DCL BUFFER[0]\n"
+ "DCL CONST[0][0..0]\n" // 0:xyzw
+ "DCL TEMP[0..0]\n"
+ "IMM[0] UINT32 {64, 1, 12, 0}\n"
+ "UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
+ "UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" // 12 bytes
+ "STORE BUFFER[0].xyz, TEMP[0].xxxx, CONST[0][0].xyzw\n"
+ "END\n";
+
+ struct tgsi_token tokens[1024];
+ struct pipe_compute_state state = {0};
+
+ if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+ assert(false);
+ return NULL;
+ }
+
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = tokens;
+
+ return ctx->create_compute_state(ctx, &state);
}
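
The 12-byte clear reduces to the following per-thread sketch, assuming BUFFER[0] is addressed in raw bytes as TGSI shader buffers are; the names are illustrative.

#include <stdint.h>
#include <string.h>

/* Each thread stores CONST[0][0].xyz at byte offset thread_index * 12. */
static void clear_12bytes_thread(void *buf, const uint32_t value[3],
                                 unsigned block_id, unsigned thread_id)
{
   unsigned byte_offset = (block_id * 64 + thread_id) * 12; /* UMAD, then UMUL by 12 */

   memcpy((char *)buf + byte_offset, value, 12);            /* STORE BUFFER[0].xyz */
}
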
-
/* Load samples from the image, and copy them to the same image. This looks like
* a no-op, but it's not. Loads use FMASK, while stores don't, so samples are
* reordered to match expanded FMASK.
*
* After the shader finishes, FMASK should be cleared to identity.
*/
-void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples,
- bool is_array)
+void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, bool is_array)
{
- enum tgsi_texture_type target = is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA :
- TGSI_TEXTURE_2D_MSAA;
- struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
- if (!ureg)
- return NULL;
-
- ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 8);
- ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 8);
- ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
-
- /* Compute the image coordinates. */
- struct ureg_src image = ureg_DECL_image(ureg, 0, target, 0, true, false);
- struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
- struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
- struct ureg_dst coord = ureg_writemask(ureg_DECL_temporary(ureg),
- TGSI_WRITEMASK_XYZW);
- ureg_UMAD(ureg, ureg_writemask(coord, TGSI_WRITEMASK_XY),
- ureg_swizzle(blk, 0, 1, 1, 1), ureg_imm2u(ureg, 8, 8),
- ureg_swizzle(tid, 0, 1, 1, 1));
- if (is_array) {
- ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_Z),
- ureg_scalar(blk, TGSI_SWIZZLE_Z));
- }
-
- /* Load samples, resolving FMASK. */
- struct ureg_dst sample[8];
- assert(num_samples <= ARRAY_SIZE(sample));
-
- for (unsigned i = 0; i < num_samples; i++) {
- sample[i] = ureg_DECL_temporary(ureg);
-
- ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W),
- ureg_imm1u(ureg, i));
-
- struct ureg_src srcs[] = {image, ureg_src(coord)};
- ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &sample[i], 1, srcs, 2,
- TGSI_MEMORY_RESTRICT, target, 0);
- }
-
- /* Store samples, ignoring FMASK. */
- for (unsigned i = 0; i < num_samples; i++) {
- ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W),
- ureg_imm1u(ureg, i));
-
- struct ureg_dst dst_image = ureg_dst(image);
- struct ureg_src srcs[] = {ureg_src(coord), ureg_src(sample[i])};
- ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst_image, 1, srcs, 2,
- TGSI_MEMORY_RESTRICT, target, 0);
- }
- ureg_END(ureg);
-
- struct pipe_compute_state state = {};
- state.ir_type = PIPE_SHADER_IR_TGSI;
- state.prog = ureg_get_tokens(ureg, NULL);
-
- void *cs = ctx->create_compute_state(ctx, &state);
- ureg_destroy(ureg);
- return cs;
+ enum tgsi_texture_type target = is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
+ struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+ if (!ureg)
+ return NULL;
+
+ ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 8);
+ ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 8);
+ ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+
+ /* Compute the image coordinates. */
+ struct ureg_src image = ureg_DECL_image(ureg, 0, target, 0, true, false);
+ struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+ struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+ struct ureg_dst coord = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZW);
+ ureg_UMAD(ureg, ureg_writemask(coord, TGSI_WRITEMASK_XY), ureg_swizzle(blk, 0, 1, 1, 1),
+ ureg_imm2u(ureg, 8, 8), ureg_swizzle(tid, 0, 1, 1, 1));
+ if (is_array) {
+ ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_Z), ureg_scalar(blk, TGSI_SWIZZLE_Z));
+ }
+
+ /* Load samples, resolving FMASK. */
+ struct ureg_dst sample[8];
+ assert(num_samples <= ARRAY_SIZE(sample));
+
+ for (unsigned i = 0; i < num_samples; i++) {
+ sample[i] = ureg_DECL_temporary(ureg);
+
+ ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i));
+
+ struct ureg_src srcs[] = {image, ureg_src(coord)};
+ ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &sample[i], 1, srcs, 2, TGSI_MEMORY_RESTRICT, target,
+ 0);
+ }
+
+ /* Store samples, ignoring FMASK. */
+ for (unsigned i = 0; i < num_samples; i++) {
+ ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i));
+
+ struct ureg_dst dst_image = ureg_dst(image);
+ struct ureg_src srcs[] = {ureg_src(coord), ureg_src(sample[i])};
+ ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst_image, 1, srcs, 2, TGSI_MEMORY_RESTRICT,
+ target, 0);
+ }
+ ureg_END(ureg);
+
+ struct pipe_compute_state state = {};
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = ureg_get_tokens(ureg, NULL);
+
+ void *cs = ctx->create_compute_state(ctx, &state);
+ ureg_destroy(ureg);
+ return cs;
}
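
A per-thread sketch of the expand pass described in the comment above; the two helpers are hypothetical stand-ins for the image LOAD (FMASK-aware) and STORE (FMASK-ignoring) instructions, not driver functions.

#include <stdint.h>

void load_sample_with_fmask(unsigned x, unsigned y, unsigned layer, unsigned s, uint32_t color[4]);
void store_sample_identity(unsigned x, unsigned y, unsigned layer, unsigned s,
                           const uint32_t color[4]);

/* x, y come from block.xy * 8 + tid.xy; layer is block.z for array targets. */
static void fmask_expand_thread(unsigned x, unsigned y, unsigned layer, unsigned num_samples)
{
   uint32_t sample[8][4]; /* the shader supports at most 8 samples */

   /* Read every sample through FMASK, so each slot holds its real color... */
   for (unsigned i = 0; i < num_samples; i++)
      load_sample_with_fmask(x, y, layer, i, sample[i]);

   /* ...then write the colors back without FMASK, i.e. into identity slots. */
   for (unsigned i = 0; i < num_samples; i++)
      store_sample_identity(x, y, layer, i, sample[i]);
}
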
/* Create the compute shader that is used to collect the results of gfx10+
 * shader queries.
 */
void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
{
- /* TEMP[0].x = accumulated result so far
- * TEMP[0].y = result missing
- * TEMP[0].z = whether we're in overflow mode
- */
- static const char text_tmpl[] =
- "COMP\n"
- "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
- "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
- "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
- "DCL BUFFER[0]\n"
- "DCL BUFFER[1]\n"
- "DCL BUFFER[2]\n"
- "DCL CONST[0][0..0]\n"
- "DCL TEMP[0..5]\n"
- "IMM[0] UINT32 {0, 7, 0, 4294967295}\n"
- "IMM[1] UINT32 {1, 2, 4, 8}\n"
- "IMM[2] UINT32 {16, 32, 64, 128}\n"
-
- /*
- acc_result = 0;
- acc_missing = 0;
- if (chain & 1) {
- acc_result = buffer[1][0];
- acc_missing = buffer[1][1];
- }
- */
- "MOV TEMP[0].xy, IMM[0].xxxx\n"
- "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n"
- "UIF TEMP[5]\n"
- "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n"
- "ENDIF\n"
-
- /*
- is_overflow (TEMP[0].z) = (config & 7) >= 2;
- result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 : result_count;
- base_offset (TEMP[1].y) = 0;
- for (;;) {
- if (!result_remaining)
- break;
- result_remaining--;
- */
- "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
- "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n"
-
- "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n"
- "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n"
- "MOV TEMP[1].y, IMM[0].xxxx\n"
-
- "BGNLOOP\n"
- "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n"
- "UIF TEMP[5]\n"
- "BRK\n"
- "ENDIF\n"
- "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"
-
- /*
- fence = buffer[0]@(base_offset + 32);
- if (!fence) {
- acc_missing = ~0u;
- break;
- }
- */
- "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n"
- "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
- "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
- "UIF TEMP[5]\n"
- "MOV TEMP[0].y, TEMP[5].xxxx\n"
- "BRK\n"
- "ENDIF\n"
-
- /*
- stream_offset (TEMP[2].x) = base_offset + offset;
-
- if (!(config & 7)) {
- acc_result += buffer[0]@stream_offset;
- }
- */
- "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n"
-
- "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
- "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
- "UIF TEMP[5]\n"
- "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n"
- "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n"
- "ENDIF\n"
-
- /*
- if ((config & 7) >= 2) {
- count (TEMP[2].y) = (config & 1) ? 4 : 1;
- */
- "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
- "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n"
- "UIF TEMP[5]\n"
- "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n"
- "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n"
-
- /*
- do {
- generated = buffer[0]@stream_offset;
- emitted = buffer[0]@(stream_offset + 16);
- if (generated != emitted) {
- acc_result = 1;
- result_remaining = 0;
- break;
- }
-
- stream_offset += 4;
- } while (--count);
- */
- "BGNLOOP\n"
- "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n"
- "LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n"
- "LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n"
- "USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n"
- "UIF TEMP[5]\n"
- "MOV TEMP[0].x, IMM[1].xxxx\n"
- "MOV TEMP[1].y, IMM[0].xxxx\n"
- "BRK\n"
- "ENDIF\n"
-
- "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n"
- "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n"
- "UIF TEMP[5]\n"
- "BRK\n"
- "ENDIF\n"
- "UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n"
- "ENDLOOP\n"
- "ENDIF\n"
-
- /*
- base_offset += 64;
- } // end outer loop
- */
- "UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n"
- "ENDLOOP\n"
-
- /*
- if (chain & 2) {
- buffer[2][0] = acc_result;
- buffer[2][1] = acc_missing;
- } else {
- */
- "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n"
- "UIF TEMP[5]\n"
- "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n"
- "ELSE\n"
-
- /*
- if ((config & 7) == 1) {
- acc_result = acc_missing ? 0 : 1;
- acc_missing = 0;
- }
- */
- "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n"
- "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n"
- "UIF TEMP[5]\n"
- "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n"
- "MOV TEMP[0].y, IMM[0].xxxx\n"
- "ENDIF\n"
-
- /*
- if (!acc_missing) {
- buffer[2][0] = acc_result;
- if (config & 8)
- buffer[2][1] = 0;
- }
- */
- "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n"
- "UIF TEMP[5]\n"
- "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
-
- "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n"
- "UIF TEMP[5]\n"
- "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n"
- "ENDIF\n"
- "ENDIF\n"
- "ENDIF\n"
-
- "END\n";
-
- struct tgsi_token tokens[1024];
- struct pipe_compute_state state = {};
-
- if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) {
- assert(false);
- return NULL;
- }
-
- state.ir_type = PIPE_SHADER_IR_TGSI;
- state.prog = tokens;
-
- return sctx->b.create_compute_state(&sctx->b, &state);
+ /* TEMP[0].x = accumulated result so far
+ * TEMP[0].y = result missing
+ * TEMP[0].z = whether we're in overflow mode
+ */
+ static const char text_tmpl[] = "COMP\n"
+ "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
+ "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+ "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+ "DCL BUFFER[0]\n"
+ "DCL BUFFER[1]\n"
+ "DCL BUFFER[2]\n"
+ "DCL CONST[0][0..0]\n"
+ "DCL TEMP[0..5]\n"
+ "IMM[0] UINT32 {0, 7, 0, 4294967295}\n"
+ "IMM[1] UINT32 {1, 2, 4, 8}\n"
+ "IMM[2] UINT32 {16, 32, 64, 128}\n"
+
+ /*
+ acc_result = 0;
+ acc_missing = 0;
+ if (chain & 1) {
+ acc_result = buffer[1][0];
+ acc_missing = buffer[1][1];
+ }
+ */
+ "MOV TEMP[0].xy, IMM[0].xxxx\n"
+ "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n"
+ "UIF TEMP[5]\n"
+ "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n"
+ "ENDIF\n"
+
+ /*
+ is_overflow (TEMP[0].z) = (config & 7) >= 2;
+ result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 : result_count;
+ base_offset (TEMP[1].y) = 0;
+ for (;;) {
+    if (!result_remaining)
+       break;
+    result_remaining--;
+ */
+ "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+ "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n"
+
+ "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n"
+ "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n"
+ "MOV TEMP[1].y, IMM[0].xxxx\n"
+
+ "BGNLOOP\n"
+ "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "BRK\n"
+ "ENDIF\n"
+ "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"
+
+ /*
+ fence = buffer[0]@(base_offset + 32);
+ if (!fence) {
+ acc_missing = ~0u;
+ break;
+ }
+ */
+ "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n"
+ "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
+ "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "MOV TEMP[0].y, TEMP[5].xxxx\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ /*
+ stream_offset (TEMP[2].x) = base_offset + offset;
+
+ if (!(config & 7)) {
+ acc_result += buffer[0]@stream_offset;
+ }
+ */
+ "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n"
+
+ "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+ "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n"
+ "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n"
+ "ENDIF\n"
+
+ /*
+ if ((config & 7) >= 2) {
+ count (TEMP[2].y) = (config & 1) ? 4 : 1;
+ */
+ "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+ "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n"
+ "UIF TEMP[5]\n"
+ "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n"
+ "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n"
+
+ /*
+ do {
+ generated = buffer[0]@stream_offset;
+ emitted = buffer[0]@(stream_offset + 16);
+ if (generated != emitted) {
+ acc_result = 1;
+ result_remaining = 0;
+ break;
+ }
+
+ stream_offset += 4;
+ } while (--count);
+ */
+ "BGNLOOP\n"
+ "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n"
+ "LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n"
+ "LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n"
+ "USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n"
+ "UIF TEMP[5]\n"
+ "MOV TEMP[0].x, IMM[1].xxxx\n"
+ "MOV TEMP[1].y, IMM[0].xxxx\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n"
+ "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "BRK\n"
+ "ENDIF\n"
+ "UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n"
+ "ENDLOOP\n"
+ "ENDIF\n"
+
+ /*
+ base_offset += 64;
+ } // end outer loop
+ */
+ "UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n"
+ "ENDLOOP\n"
+
+ /*
+ if (chain & 2) {
+ buffer[2][0] = acc_result;
+ buffer[2][1] = acc_missing;
+ } else {
+ */
+ "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n"
+ "UIF TEMP[5]\n"
+ "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n"
+ "ELSE\n"
+
+ /*
+ if ((config & 7) == 1) {
+ acc_result = acc_missing ? 0 : 1;
+ acc_missing = 0;
+ }
+ */
+ "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n"
+ "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n"
+ "UIF TEMP[5]\n"
+ "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n"
+ "MOV TEMP[0].y, IMM[0].xxxx\n"
+ "ENDIF\n"
+
+ /*
+ if (!acc_missing) {
+ buffer[2][0] = acc_result;
+ if (config & 8)
+ buffer[2][1] = 0;
+ }
+ */
+ "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n"
+ "UIF TEMP[5]\n"
+ "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
+
+ "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n"
+ "UIF TEMP[5]\n"
+ "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n"
+ "ENDIF\n"
+ "ENDIF\n"
+ "ENDIF\n"
+
+ "END\n";
+
+ struct tgsi_token tokens[1024];
+ struct pipe_compute_state state = {};
+
+ if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) {
+ assert(false);
+ return NULL;
+ }
+
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = tokens;
+
+ return sctx->b.create_compute_state(&sctx->b, &state);
}
*/
#include "si_build_pm4.h"
-#include "sid.h"
#include "si_query.h"
-
-#include "util/u_dual_blend.h"
+#include "sid.h"
+#include "util/fast_idiv_by_const.h"
#include "util/format/u_format.h"
#include "util/format/u_format_s3tc.h"
+#include "util/u_dual_blend.h"
#include "util/u_memory.h"
#include "util/u_resource.h"
#include "util/u_upload_mgr.h"
-#include "util/fast_idiv_by_const.h"
struct gfx10_format {
- unsigned img_format:9;
+ unsigned img_format : 9;
- /* Various formats are only supported with workarounds for vertex fetch,
- * and some 32_32_32 formats are supported natively, but only for buffers
- * (possibly with some image support, actually, but no filtering). */
- bool buffers_only:1;
+ /* Various formats are only supported with workarounds for vertex fetch,
+ * and some 32_32_32 formats are supported natively, but only for buffers
+ * (possibly with some image support, actually, but no filtering). */
+ bool buffers_only : 1;
};
#include "gfx10_format_table.h"
static unsigned si_map_swizzle(unsigned swizzle)
{
- switch (swizzle) {
- case PIPE_SWIZZLE_Y:
- return V_008F0C_SQ_SEL_Y;
- case PIPE_SWIZZLE_Z:
- return V_008F0C_SQ_SEL_Z;
- case PIPE_SWIZZLE_W:
- return V_008F0C_SQ_SEL_W;
- case PIPE_SWIZZLE_0:
- return V_008F0C_SQ_SEL_0;
- case PIPE_SWIZZLE_1:
- return V_008F0C_SQ_SEL_1;
- default: /* PIPE_SWIZZLE_X */
- return V_008F0C_SQ_SEL_X;
- }
+ switch (swizzle) {
+ case PIPE_SWIZZLE_Y:
+ return V_008F0C_SQ_SEL_Y;
+ case PIPE_SWIZZLE_Z:
+ return V_008F0C_SQ_SEL_Z;
+ case PIPE_SWIZZLE_W:
+ return V_008F0C_SQ_SEL_W;
+ case PIPE_SWIZZLE_0:
+ return V_008F0C_SQ_SEL_0;
+ case PIPE_SWIZZLE_1:
+ return V_008F0C_SQ_SEL_1;
+ default: /* PIPE_SWIZZLE_X */
+ return V_008F0C_SQ_SEL_X;
+ }
}
/* 12.4 fixed-point */
static unsigned si_pack_float_12p4(float x)
{
- return x <= 0 ? 0 :
- x >= 4096 ? 0xffff : x * 16;
+ return x <= 0 ? 0 : x >= 4096 ? 0xffff : x * 16;
}
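
12.4 fixed point means 12 integer and 4 fractional bits, so the packing above is a multiply by 16 clamped to [0, 0xffff]. A few worked values:

#include <assert.h>

static void si_pack_float_12p4_examples(void)
{
   assert(si_pack_float_12p4(0.0f) == 0);         /* clamped at zero */
   assert(si_pack_float_12p4(2.5f) == 40);        /* 2.5 * 16 */
   assert(si_pack_float_12p4(4096.0f) == 0xffff); /* x >= 4096 saturates */
}
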
/*
*/
static void si_emit_cb_render_state(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- struct si_state_blend *blend = sctx->queued.named.blend;
- /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers,
- * but you never know. */
- uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit &
- blend->cb_target_mask;
- unsigned i;
-
- /* Avoid a hang that happens when dual source blending is enabled
- * but there is not enough color outputs. This is undefined behavior,
- * so disable color writes completely.
- *
- * Reproducible with Unigine Heaven 4.0 and drirc missing.
- */
- if (blend->dual_src_blend &&
- sctx->ps_shader.cso &&
- (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3)
- cb_target_mask = 0;
-
- /* GFX9: Flush DFSM when CB_TARGET_MASK changes.
- * I think we don't have to do anything between IBs.
- */
- if (sctx->screen->dpbb_allowed &&
- sctx->last_cb_target_mask != cb_target_mask) {
- sctx->last_cb_target_mask = cb_target_mask;
-
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
- }
-
- unsigned initial_cdw = cs->current.cdw;
- radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK,
- SI_TRACKED_CB_TARGET_MASK, cb_target_mask);
-
- if (sctx->chip_class >= GFX8) {
- /* DCC MSAA workaround.
- * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_-
- * COMBINER_DISABLE, but that would be more complicated.
- */
- bool oc_disable = blend->dcc_msaa_corruption_4bit & cb_target_mask &&
- sctx->framebuffer.nr_samples >= 2;
- unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark;
-
- radeon_opt_set_context_reg(
- sctx, R_028424_CB_DCC_CONTROL,
- SI_TRACKED_CB_DCC_CONTROL,
- S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) |
- S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
- S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) |
- S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode));
- }
-
- /* RB+ register settings. */
- if (sctx->screen->info.rbplus_allowed) {
- unsigned spi_shader_col_format =
- sctx->ps_shader.cso ?
- sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format : 0;
- unsigned sx_ps_downconvert = 0;
- unsigned sx_blend_opt_epsilon = 0;
- unsigned sx_blend_opt_control = 0;
-
- for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
- struct si_surface *surf =
- (struct si_surface*)sctx->framebuffer.state.cbufs[i];
- unsigned format, swap, spi_format, colormask;
- bool has_alpha, has_rgb;
-
- if (!surf) {
- /* If the color buffer is not set, the driver sets 32_R
- * as the SPI color format, because the hw doesn't allow
- * holes between color outputs, so also set this to
- * enable RB+.
- */
- sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
- continue;
- }
-
- format = G_028C70_FORMAT(surf->cb_color_info);
- swap = G_028C70_COMP_SWAP(surf->cb_color_info);
- spi_format = (spi_shader_col_format >> (i * 4)) & 0xf;
- colormask = (cb_target_mask >> (i * 4)) & 0xf;
-
- /* Set if RGB and A are present. */
- has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib);
-
- if (format == V_028C70_COLOR_8 ||
- format == V_028C70_COLOR_16 ||
- format == V_028C70_COLOR_32)
- has_rgb = !has_alpha;
- else
- has_rgb = true;
-
- /* Check the colormask and export format. */
- if (!(colormask & (PIPE_MASK_RGBA & ~PIPE_MASK_A)))
- has_rgb = false;
- if (!(colormask & PIPE_MASK_A))
- has_alpha = false;
-
- if (spi_format == V_028714_SPI_SHADER_ZERO) {
- has_rgb = false;
- has_alpha = false;
- }
-
- /* Disable value checking for disabled channels. */
- if (!has_rgb)
- sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
- if (!has_alpha)
- sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
-
- /* Enable down-conversion for 32bpp and smaller formats. */
- switch (format) {
- case V_028C70_COLOR_8:
- case V_028C70_COLOR_8_8:
- case V_028C70_COLOR_8_8_8_8:
- /* For 1 and 2-channel formats, use the superset thereof. */
- if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
- spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
- spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
- sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
- sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
- }
- break;
-
- case V_028C70_COLOR_5_6_5:
- if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
- sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
- sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
- }
- break;
-
- case V_028C70_COLOR_1_5_5_5:
- if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
- sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
- sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
- }
- break;
-
- case V_028C70_COLOR_4_4_4_4:
- if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
- sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
- sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
- }
- break;
-
- case V_028C70_COLOR_32:
- if (swap == V_028C70_SWAP_STD &&
- spi_format == V_028714_SPI_SHADER_32_R)
- sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
- else if (swap == V_028C70_SWAP_ALT_REV &&
- spi_format == V_028714_SPI_SHADER_32_AR)
- sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
- break;
-
- case V_028C70_COLOR_16:
- case V_028C70_COLOR_16_16:
- /* For 1-channel formats, use the superset thereof. */
- if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
- spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
- spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
- spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
- if (swap == V_028C70_SWAP_STD ||
- swap == V_028C70_SWAP_STD_REV)
- sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
- else
- sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
- }
- break;
-
- case V_028C70_COLOR_10_11_11:
- if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
- sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
- break;
-
- case V_028C70_COLOR_2_10_10_10:
- if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
- sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
- sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
- }
- break;
- }
- }
-
- /* If there are no color outputs, the first color export is
- * always enabled as 32_R, so also set this to enable RB+.
- */
- if (!sx_ps_downconvert)
- sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R;
-
- /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */
- radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT,
- SI_TRACKED_SX_PS_DOWNCONVERT,
- sx_ps_downconvert, sx_blend_opt_epsilon,
- sx_blend_opt_control);
- }
- if (initial_cdw != cs->current.cdw)
- sctx->context_roll = true;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct si_state_blend *blend = sctx->queued.named.blend;
+ /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers,
+ * but you never know. */
+ uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_mask;
+ unsigned i;
+
+ /* Avoid a hang that happens when dual source blending is enabled
+ * but there are not enough color outputs. This is undefined behavior,
+ * so disable color writes completely.
+ *
+ * Reproducible with Unigine Heaven 4.0 and drirc missing.
+ */
+ if (blend->dual_src_blend && sctx->ps_shader.cso &&
+ (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3)
+ cb_target_mask = 0;
+
+ /* GFX9: Flush DFSM when CB_TARGET_MASK changes.
+ * I think we don't have to do anything between IBs.
+ */
+ if (sctx->screen->dpbb_allowed && sctx->last_cb_target_mask != cb_target_mask) {
+ sctx->last_cb_target_mask = cb_target_mask;
+
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
+ }
+
+ unsigned initial_cdw = cs->current.cdw;
+ radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK,
+ cb_target_mask);
+
+ if (sctx->chip_class >= GFX8) {
+ /* DCC MSAA workaround.
+ * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_COMBINER_DISABLE,
+ * but that would be more complicated.
+ */
+ bool oc_disable =
+ blend->dcc_msaa_corruption_4bit & cb_target_mask && sctx->framebuffer.nr_samples >= 2;
+ unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark;
+
+ radeon_opt_set_context_reg(
+ sctx, R_028424_CB_DCC_CONTROL, SI_TRACKED_CB_DCC_CONTROL,
+ S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) |
+ S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
+ S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) |
+ S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode));
+ }
+
+ /* RB+ register settings. */
+ if (sctx->screen->info.rbplus_allowed) {
+ unsigned spi_shader_col_format =
+ sctx->ps_shader.cso ? sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format
+ : 0;
+ unsigned sx_ps_downconvert = 0;
+ unsigned sx_blend_opt_epsilon = 0;
+ unsigned sx_blend_opt_control = 0;
+
+ for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+ struct si_surface *surf = (struct si_surface *)sctx->framebuffer.state.cbufs[i];
+ unsigned format, swap, spi_format, colormask;
+ bool has_alpha, has_rgb;
+
+ if (!surf) {
+ /* If the color buffer is not set, the driver sets 32_R
+ * as the SPI color format, because the hw doesn't allow
+ * holes between color outputs, so also set this to
+ * enable RB+.
+ */
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
+ continue;
+ }
+
+ format = G_028C70_FORMAT(surf->cb_color_info);
+ swap = G_028C70_COMP_SWAP(surf->cb_color_info);
+ spi_format = (spi_shader_col_format >> (i * 4)) & 0xf;
+ colormask = (cb_target_mask >> (i * 4)) & 0xf;
+
+ /* Set if RGB and A are present. */
+ has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib);
+
+ if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 ||
+ format == V_028C70_COLOR_32)
+ has_rgb = !has_alpha;
+ else
+ has_rgb = true;
+
+ /* Check the colormask and export format. */
+ if (!(colormask & (PIPE_MASK_RGBA & ~PIPE_MASK_A)))
+ has_rgb = false;
+ if (!(colormask & PIPE_MASK_A))
+ has_alpha = false;
+
+ if (spi_format == V_028714_SPI_SHADER_ZERO) {
+ has_rgb = false;
+ has_alpha = false;
+ }
+
+ /* Disable value checking for disabled channels. */
+ if (!has_rgb)
+ sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
+ if (!has_alpha)
+ sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
+
+ /* Enable down-conversion for 32bpp and smaller formats. */
+ switch (format) {
+ case V_028C70_COLOR_8:
+ case V_028C70_COLOR_8_8:
+ case V_028C70_COLOR_8_8_8_8:
+ /* For 1 and 2-channel formats, use the superset thereof. */
+ if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
+ spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
+ spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
+ sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
+ }
+ break;
+
+ case V_028C70_COLOR_5_6_5:
+ if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
+ sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
+ }
+ break;
+
+ case V_028C70_COLOR_1_5_5_5:
+ if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
+ sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
+ }
+ break;
+
+ case V_028C70_COLOR_4_4_4_4:
+ if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
+ sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
+ }
+ break;
+
+ case V_028C70_COLOR_32:
+ if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
+ else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
+ break;
+
+ case V_028C70_COLOR_16:
+ case V_028C70_COLOR_16_16:
+ /* For 1-channel formats, use the superset thereof. */
+ if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
+ spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
+ spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
+ spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
+ if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
+ else
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
+ }
+ break;
+
+ case V_028C70_COLOR_10_11_11:
+ if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
+ break;
+
+ case V_028C70_COLOR_2_10_10_10:
+ if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
+ sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
+ sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
+ }
+ break;
+ }
+ }
+
+ /* If there are no color outputs, the first color export is
+ * always enabled as 32_R, so also set this to enable RB+.
+ */
+ if (!sx_ps_downconvert)
+ sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R;
+
+ /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */
+ radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT,
+ sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control);
+ }
+ if (initial_cdw != cs->current.cdw)
+ sctx->context_roll = true;
}
/*
static uint32_t si_translate_blend_function(int blend_func)
{
- switch (blend_func) {
- case PIPE_BLEND_ADD:
- return V_028780_COMB_DST_PLUS_SRC;
- case PIPE_BLEND_SUBTRACT:
- return V_028780_COMB_SRC_MINUS_DST;
- case PIPE_BLEND_REVERSE_SUBTRACT:
- return V_028780_COMB_DST_MINUS_SRC;
- case PIPE_BLEND_MIN:
- return V_028780_COMB_MIN_DST_SRC;
- case PIPE_BLEND_MAX:
- return V_028780_COMB_MAX_DST_SRC;
- default:
- PRINT_ERR("Unknown blend function %d\n", blend_func);
- assert(0);
- break;
- }
- return 0;
+ switch (blend_func) {
+ case PIPE_BLEND_ADD:
+ return V_028780_COMB_DST_PLUS_SRC;
+ case PIPE_BLEND_SUBTRACT:
+ return V_028780_COMB_SRC_MINUS_DST;
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ return V_028780_COMB_DST_MINUS_SRC;
+ case PIPE_BLEND_MIN:
+ return V_028780_COMB_MIN_DST_SRC;
+ case PIPE_BLEND_MAX:
+ return V_028780_COMB_MAX_DST_SRC;
+ default:
+ PRINT_ERR("Unknown blend function %d\n", blend_func);
+ assert(0);
+ break;
+ }
+ return 0;
}
static uint32_t si_translate_blend_factor(int blend_fact)
{
- switch (blend_fact) {
- case PIPE_BLENDFACTOR_ONE:
- return V_028780_BLEND_ONE;
- case PIPE_BLENDFACTOR_SRC_COLOR:
- return V_028780_BLEND_SRC_COLOR;
- case PIPE_BLENDFACTOR_SRC_ALPHA:
- return V_028780_BLEND_SRC_ALPHA;
- case PIPE_BLENDFACTOR_DST_ALPHA:
- return V_028780_BLEND_DST_ALPHA;
- case PIPE_BLENDFACTOR_DST_COLOR:
- return V_028780_BLEND_DST_COLOR;
- case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
- return V_028780_BLEND_SRC_ALPHA_SATURATE;
- case PIPE_BLENDFACTOR_CONST_COLOR:
- return V_028780_BLEND_CONSTANT_COLOR;
- case PIPE_BLENDFACTOR_CONST_ALPHA:
- return V_028780_BLEND_CONSTANT_ALPHA;
- case PIPE_BLENDFACTOR_ZERO:
- return V_028780_BLEND_ZERO;
- case PIPE_BLENDFACTOR_INV_SRC_COLOR:
- return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
- case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
- return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
- case PIPE_BLENDFACTOR_INV_DST_ALPHA:
- return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
- case PIPE_BLENDFACTOR_INV_DST_COLOR:
- return V_028780_BLEND_ONE_MINUS_DST_COLOR;
- case PIPE_BLENDFACTOR_INV_CONST_COLOR:
- return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR;
- case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
- return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA;
- case PIPE_BLENDFACTOR_SRC1_COLOR:
- return V_028780_BLEND_SRC1_COLOR;
- case PIPE_BLENDFACTOR_SRC1_ALPHA:
- return V_028780_BLEND_SRC1_ALPHA;
- case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
- return V_028780_BLEND_INV_SRC1_COLOR;
- case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
- return V_028780_BLEND_INV_SRC1_ALPHA;
- default:
- PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact);
- assert(0);
- break;
- }
- return 0;
+ switch (blend_fact) {
+ case PIPE_BLENDFACTOR_ONE:
+ return V_028780_BLEND_ONE;
+ case PIPE_BLENDFACTOR_SRC_COLOR:
+ return V_028780_BLEND_SRC_COLOR;
+ case PIPE_BLENDFACTOR_SRC_ALPHA:
+ return V_028780_BLEND_SRC_ALPHA;
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ return V_028780_BLEND_DST_ALPHA;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ return V_028780_BLEND_DST_COLOR;
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+ return V_028780_BLEND_SRC_ALPHA_SATURATE;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ return V_028780_BLEND_CONSTANT_COLOR;
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ return V_028780_BLEND_CONSTANT_ALPHA;
+ case PIPE_BLENDFACTOR_ZERO:
+ return V_028780_BLEND_ZERO;
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+ return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+ return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ return V_028780_BLEND_ONE_MINUS_DST_COLOR;
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR;
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+ return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA;
+ case PIPE_BLENDFACTOR_SRC1_COLOR:
+ return V_028780_BLEND_SRC1_COLOR;
+ case PIPE_BLENDFACTOR_SRC1_ALPHA:
+ return V_028780_BLEND_SRC1_ALPHA;
+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+ return V_028780_BLEND_INV_SRC1_COLOR;
+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+ return V_028780_BLEND_INV_SRC1_ALPHA;
+ default:
+ PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact);
+ assert(0);
+ break;
+ }
+ return 0;
}
static uint32_t si_translate_blend_opt_function(int blend_func)
{
- switch (blend_func) {
- case PIPE_BLEND_ADD:
- return V_028760_OPT_COMB_ADD;
- case PIPE_BLEND_SUBTRACT:
- return V_028760_OPT_COMB_SUBTRACT;
- case PIPE_BLEND_REVERSE_SUBTRACT:
- return V_028760_OPT_COMB_REVSUBTRACT;
- case PIPE_BLEND_MIN:
- return V_028760_OPT_COMB_MIN;
- case PIPE_BLEND_MAX:
- return V_028760_OPT_COMB_MAX;
- default:
- return V_028760_OPT_COMB_BLEND_DISABLED;
- }
+ switch (blend_func) {
+ case PIPE_BLEND_ADD:
+ return V_028760_OPT_COMB_ADD;
+ case PIPE_BLEND_SUBTRACT:
+ return V_028760_OPT_COMB_SUBTRACT;
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ return V_028760_OPT_COMB_REVSUBTRACT;
+ case PIPE_BLEND_MIN:
+ return V_028760_OPT_COMB_MIN;
+ case PIPE_BLEND_MAX:
+ return V_028760_OPT_COMB_MAX;
+ default:
+ return V_028760_OPT_COMB_BLEND_DISABLED;
+ }
}
static uint32_t si_translate_blend_opt_factor(int blend_fact, bool is_alpha)
{
- switch (blend_fact) {
- case PIPE_BLENDFACTOR_ZERO:
- return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
- case PIPE_BLENDFACTOR_ONE:
- return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
- case PIPE_BLENDFACTOR_SRC_COLOR:
- return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
- : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
- case PIPE_BLENDFACTOR_INV_SRC_COLOR:
- return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
- : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
- case PIPE_BLENDFACTOR_SRC_ALPHA:
- return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
- case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
- return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
- case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
- return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
- : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
- default:
- return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
- }
+ switch (blend_fact) {
+ case PIPE_BLENDFACTOR_ZERO:
+ return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
+ case PIPE_BLENDFACTOR_ONE:
+ return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
+ case PIPE_BLENDFACTOR_SRC_COLOR:
+ return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
+ : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
+ : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
+ case PIPE_BLENDFACTOR_SRC_ALPHA:
+ return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+ return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+ return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
+ : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
+ default:
+ return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
+ }
}
-static void si_blend_check_commutativity(struct si_screen *sscreen,
- struct si_state_blend *blend,
- enum pipe_blend_func func,
- enum pipe_blendfactor src,
- enum pipe_blendfactor dst,
- unsigned chanmask)
+static void si_blend_check_commutativity(struct si_screen *sscreen, struct si_state_blend *blend,
+ enum pipe_blend_func func, enum pipe_blendfactor src,
+ enum pipe_blendfactor dst, unsigned chanmask)
{
- /* Src factor is allowed when it does not depend on Dst */
- static const uint32_t src_allowed =
- (1u << PIPE_BLENDFACTOR_ONE) |
- (1u << PIPE_BLENDFACTOR_SRC_COLOR) |
- (1u << PIPE_BLENDFACTOR_SRC_ALPHA) |
- (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) |
- (1u << PIPE_BLENDFACTOR_CONST_COLOR) |
- (1u << PIPE_BLENDFACTOR_CONST_ALPHA) |
- (1u << PIPE_BLENDFACTOR_SRC1_COLOR) |
- (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) |
- (1u << PIPE_BLENDFACTOR_ZERO) |
- (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) |
- (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) |
- (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) |
- (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) |
- (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) |
- (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA);
-
- if (dst == PIPE_BLENDFACTOR_ONE &&
- (src_allowed & (1u << src))) {
- /* Addition is commutative, but floating point addition isn't
- * associative: subtle changes can be introduced via different
- * rounding.
- *
- * Out-of-order is also non-deterministic, which means that
- * this breaks OpenGL invariance requirements. So only enable
- * out-of-order additive blending if explicitly allowed by a
- * setting.
- */
- if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN ||
- (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add))
- blend->commutative_4bit |= chanmask;
- }
+ /* Src factor is allowed when it does not depend on Dst */
+ static const uint32_t src_allowed =
+ (1u << PIPE_BLENDFACTOR_ONE) | (1u << PIPE_BLENDFACTOR_SRC_COLOR) |
+ (1u << PIPE_BLENDFACTOR_SRC_ALPHA) | (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) |
+ (1u << PIPE_BLENDFACTOR_CONST_COLOR) | (1u << PIPE_BLENDFACTOR_CONST_ALPHA) |
+ (1u << PIPE_BLENDFACTOR_SRC1_COLOR) | (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) |
+ (1u << PIPE_BLENDFACTOR_ZERO) | (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) |
+ (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) | (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) |
+ (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) | (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) |
+ (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA);
+
+ if (dst == PIPE_BLENDFACTOR_ONE && (src_allowed & (1u << src))) {
+ /* Addition is commutative, but floating point addition isn't
+ * associative: subtle changes can be introduced via different
+ * rounding.
+ *
+ * Out-of-order is also non-deterministic, which means that
+ * this breaks OpenGL invariance requirements. So only enable
+ * out-of-order additive blending if explicitly allowed by a
+ * setting.
+ */
+ if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN ||
+ (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add))
+ blend->commutative_4bit |= chanmask;
+ }
}
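The comment above leans on the fact that float addition, while commutative, is not associative. A standalone sketch, independent of the driver, of how reordering changes the rounded result (and hence why out-of-order additive blending is opt-in via commutative_blend_add):
#include <stdio.h>

int main(void)
{
   float a = 1e20f, b = -1e20f, c = 1.0f;

   float in_order  = (a + b) + c; /* (1e20 - 1e20) + 1 = 1.0 */
   float reordered = a + (b + c); /* 1e20 + (-1e20) = 0.0; the 1.0 is absorbed */

   printf("%.1f vs %.1f\n", in_order, reordered); /* prints 1.0 vs 0.0 */
   return 0;
}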
/**
* Get rid of DST in the blend factors by commuting the operands:
* func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
*/
-static void si_blend_remove_dst(unsigned *func, unsigned *src_factor,
- unsigned *dst_factor, unsigned expected_dst,
- unsigned replacement_src)
+static void si_blend_remove_dst(unsigned *func, unsigned *src_factor, unsigned *dst_factor,
+ unsigned expected_dst, unsigned replacement_src)
{
- if (*src_factor == expected_dst &&
- *dst_factor == PIPE_BLENDFACTOR_ZERO) {
- *src_factor = PIPE_BLENDFACTOR_ZERO;
- *dst_factor = replacement_src;
-
- /* Commuting the operands requires reversing subtractions. */
- if (*func == PIPE_BLEND_SUBTRACT)
- *func = PIPE_BLEND_REVERSE_SUBTRACT;
- else if (*func == PIPE_BLEND_REVERSE_SUBTRACT)
- *func = PIPE_BLEND_SUBTRACT;
- }
+ if (*src_factor == expected_dst && *dst_factor == PIPE_BLENDFACTOR_ZERO) {
+ *src_factor = PIPE_BLENDFACTOR_ZERO;
+ *dst_factor = replacement_src;
+
+ /* Commuting the operands requires reversing subtractions. */
+ if (*func == PIPE_BLEND_SUBTRACT)
+ *func = PIPE_BLEND_REVERSE_SUBTRACT;
+ else if (*func == PIPE_BLEND_REVERSE_SUBTRACT)
+ *func = PIPE_BLEND_SUBTRACT;
+ }
}
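A worked instance of the transformation handled above, written out as plain algebra rather than driver code, for func = SUBTRACT with src factor DST_COLOR and dst factor ZERO:
/* before: RGB = src * DST - dst * 0  = src * dst     (SUBTRACT)
 * after:  RGB = dst * SRC - src * 0  = dst * src     (REVERSE_SUBTRACT)
 * Same result, but DST no longer appears as a blend factor, which is what
 * the RB+ optimization path below wants.
 */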
static bool si_blend_factor_uses_dst(unsigned factor)
{
- return factor == PIPE_BLENDFACTOR_DST_COLOR ||
- factor == PIPE_BLENDFACTOR_DST_ALPHA ||
- factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
- factor == PIPE_BLENDFACTOR_INV_DST_ALPHA ||
- factor == PIPE_BLENDFACTOR_INV_DST_COLOR;
+ return factor == PIPE_BLENDFACTOR_DST_COLOR || factor == PIPE_BLENDFACTOR_DST_ALPHA ||
+ factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+ factor == PIPE_BLENDFACTOR_INV_DST_ALPHA || factor == PIPE_BLENDFACTOR_INV_DST_COLOR;
}
static void *si_create_blend_state_mode(struct pipe_context *ctx,
- const struct pipe_blend_state *state,
- unsigned mode)
+ const struct pipe_blend_state *state, unsigned mode)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend);
- struct si_pm4_state *pm4 = &blend->pm4;
- uint32_t sx_mrt_blend_opt[8] = {0};
- uint32_t color_control = 0;
- bool logicop_enable = state->logicop_enable &&
- state->logicop_func != PIPE_LOGICOP_COPY;
-
- if (!blend)
- return NULL;
-
- blend->alpha_to_coverage = state->alpha_to_coverage;
- blend->alpha_to_one = state->alpha_to_one;
- blend->dual_src_blend = util_blend_state_is_dual(state, 0);
- blend->logicop_enable = logicop_enable;
-
- if (logicop_enable) {
- color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4));
- } else {
- color_control |= S_028808_ROP3(0xcc);
- }
-
- si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK,
- S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) |
- S_028B70_ALPHA_TO_MASK_OFFSET0(3) |
- S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
- S_028B70_ALPHA_TO_MASK_OFFSET2(0) |
- S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
- S_028B70_OFFSET_ROUND(1));
-
- if (state->alpha_to_coverage)
- blend->need_src_alpha_4bit |= 0xf;
-
- blend->cb_target_mask = 0;
- blend->cb_target_enabled_4bit = 0;
-
- for (int i = 0; i < 8; i++) {
- /* state->rt entries > 0 only written if independent blending */
- const int j = state->independent_blend_enable ? i : 0;
-
- unsigned eqRGB = state->rt[j].rgb_func;
- unsigned srcRGB = state->rt[j].rgb_src_factor;
- unsigned dstRGB = state->rt[j].rgb_dst_factor;
- unsigned eqA = state->rt[j].alpha_func;
- unsigned srcA = state->rt[j].alpha_src_factor;
- unsigned dstA = state->rt[j].alpha_dst_factor;
-
- unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
- unsigned blend_cntl = 0;
-
- sx_mrt_blend_opt[i] =
- S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
- S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
-
- /* Only set dual source blending for MRT0 to avoid a hang. */
- if (i >= 1 && blend->dual_src_blend) {
- /* Vulkan does this for dual source blending. */
- if (i == 1)
- blend_cntl |= S_028780_ENABLE(1);
-
- si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
- continue;
- }
-
- /* Only addition and subtraction equations are supported with
- * dual source blending.
- */
- if (blend->dual_src_blend &&
- (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX ||
- eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX)) {
- assert(!"Unsupported equation for dual source blending");
- si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
- continue;
- }
-
- /* cb_render_state will disable unused ones */
- blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i);
- if (state->rt[j].colormask)
- blend->cb_target_enabled_4bit |= 0xf << (4 * i);
-
- if (!state->rt[j].colormask || !state->rt[j].blend_enable) {
- si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
- continue;
- }
-
- si_blend_check_commutativity(sctx->screen, blend,
- eqRGB, srcRGB, dstRGB, 0x7 << (4 * i));
- si_blend_check_commutativity(sctx->screen, blend,
- eqA, srcA, dstA, 0x8 << (4 * i));
-
- /* Blending optimizations for RB+.
- * These transformations don't change the behavior.
- *
- * First, get rid of DST in the blend factors:
- * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
- */
- si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB,
- PIPE_BLENDFACTOR_DST_COLOR,
- PIPE_BLENDFACTOR_SRC_COLOR);
- si_blend_remove_dst(&eqA, &srcA, &dstA,
- PIPE_BLENDFACTOR_DST_COLOR,
- PIPE_BLENDFACTOR_SRC_COLOR);
- si_blend_remove_dst(&eqA, &srcA, &dstA,
- PIPE_BLENDFACTOR_DST_ALPHA,
- PIPE_BLENDFACTOR_SRC_ALPHA);
-
- /* Look up the ideal settings from tables. */
- srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
- dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
- srcA_opt = si_translate_blend_opt_factor(srcA, true);
- dstA_opt = si_translate_blend_opt_factor(dstA, true);
-
- /* Handle interdependencies. */
- if (si_blend_factor_uses_dst(srcRGB))
- dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
- if (si_blend_factor_uses_dst(srcA))
- dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
-
- if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE &&
- (dstRGB == PIPE_BLENDFACTOR_ZERO ||
- dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
- dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE))
- dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
-
- /* Set the final value. */
- sx_mrt_blend_opt[i] =
- S_028760_COLOR_SRC_OPT(srcRGB_opt) |
- S_028760_COLOR_DST_OPT(dstRGB_opt) |
- S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
- S_028760_ALPHA_SRC_OPT(srcA_opt) |
- S_028760_ALPHA_DST_OPT(dstA_opt) |
- S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
-
- /* Set blend state. */
- blend_cntl |= S_028780_ENABLE(1);
- blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
- blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB));
- blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB));
-
- if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
- blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
- blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA));
- blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA));
- blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA));
- }
- si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
-
- blend->blend_enable_4bit |= 0xfu << (i * 4);
-
- if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14)
- blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4);
-
- /* This is only important for formats without alpha. */
- if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
- dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
- srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
- dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
- srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
- dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
- blend->need_src_alpha_4bit |= 0xfu << (i * 4);
- }
-
- if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14 && logicop_enable)
- blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit;
-
- if (blend->cb_target_mask) {
- color_control |= S_028808_MODE(mode);
- } else {
- color_control |= S_028808_MODE(V_028808_CB_DISABLE);
- }
-
- if (sctx->screen->info.rbplus_allowed) {
- /* Disable RB+ blend optimizations for dual source blending.
- * Vulkan does this.
- */
- if (blend->dual_src_blend) {
- for (int i = 0; i < 8; i++) {
- sx_mrt_blend_opt[i] =
- S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
- S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
- }
- }
-
- for (int i = 0; i < 8; i++)
- si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4,
- sx_mrt_blend_opt[i]);
-
- /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */
- if (blend->dual_src_blend || logicop_enable ||
- mode == V_028808_CB_RESOLVE)
- color_control |= S_028808_DISABLE_DUAL_QUAD(1);
- }
-
- si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control);
- return blend;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend);
+ struct si_pm4_state *pm4 = &blend->pm4;
+ uint32_t sx_mrt_blend_opt[8] = {0};
+ uint32_t color_control = 0;
+ bool logicop_enable = state->logicop_enable && state->logicop_func != PIPE_LOGICOP_COPY;
+
+ if (!blend)
+ return NULL;
+
+ blend->alpha_to_coverage = state->alpha_to_coverage;
+ blend->alpha_to_one = state->alpha_to_one;
+ blend->dual_src_blend = util_blend_state_is_dual(state, 0);
+ blend->logicop_enable = logicop_enable;
+
+ if (logicop_enable) {
+ color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4));
+ } else {
+ color_control |= S_028808_ROP3(0xcc);
+ }
+
+ si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK,
+ S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) |
+ S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
+ S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
+ S_028B70_OFFSET_ROUND(1));
+
+ if (state->alpha_to_coverage)
+ blend->need_src_alpha_4bit |= 0xf;
+
+ blend->cb_target_mask = 0;
+ blend->cb_target_enabled_4bit = 0;
+
+ for (int i = 0; i < 8; i++) {
+ /* state->rt entries > 0 only written if independent blending */
+ const int j = state->independent_blend_enable ? i : 0;
+
+ unsigned eqRGB = state->rt[j].rgb_func;
+ unsigned srcRGB = state->rt[j].rgb_src_factor;
+ unsigned dstRGB = state->rt[j].rgb_dst_factor;
+ unsigned eqA = state->rt[j].alpha_func;
+ unsigned srcA = state->rt[j].alpha_src_factor;
+ unsigned dstA = state->rt[j].alpha_dst_factor;
+
+ unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
+ unsigned blend_cntl = 0;
+
+ sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
+ S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
+
+ /* Only set dual source blending for MRT0 to avoid a hang. */
+ if (i >= 1 && blend->dual_src_blend) {
+ /* Vulkan does this for dual source blending. */
+ if (i == 1)
+ blend_cntl |= S_028780_ENABLE(1);
+
+ si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
+ continue;
+ }
+
+ /* Only addition and subtraction equations are supported with
+ * dual source blending.
+ */
+ if (blend->dual_src_blend && (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX ||
+ eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX)) {
+ assert(!"Unsupported equation for dual source blending");
+ si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
+ continue;
+ }
+
+ /* cb_render_state will disable unused ones */
+ blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i);
+ if (state->rt[j].colormask)
+ blend->cb_target_enabled_4bit |= 0xf << (4 * i);
+
+ if (!state->rt[j].colormask || !state->rt[j].blend_enable) {
+ si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
+ continue;
+ }
+
+ si_blend_check_commutativity(sctx->screen, blend, eqRGB, srcRGB, dstRGB, 0x7 << (4 * i));
+ si_blend_check_commutativity(sctx->screen, blend, eqA, srcA, dstA, 0x8 << (4 * i));
+
+ /* Blending optimizations for RB+.
+ * These transformations don't change the behavior.
+ *
+ * First, get rid of DST in the blend factors:
+ * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
+ */
+ si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, PIPE_BLENDFACTOR_DST_COLOR,
+ PIPE_BLENDFACTOR_SRC_COLOR);
+ si_blend_remove_dst(&eqA, &srcA, &dstA, PIPE_BLENDFACTOR_DST_COLOR,
+ PIPE_BLENDFACTOR_SRC_COLOR);
+ si_blend_remove_dst(&eqA, &srcA, &dstA, PIPE_BLENDFACTOR_DST_ALPHA,
+ PIPE_BLENDFACTOR_SRC_ALPHA);
+
+ /* Look up the ideal settings from tables. */
+ srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
+ dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
+ srcA_opt = si_translate_blend_opt_factor(srcA, true);
+ dstA_opt = si_translate_blend_opt_factor(dstA, true);
+
+ /* Handle interdependencies. */
+ if (si_blend_factor_uses_dst(srcRGB))
+ dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
+ if (si_blend_factor_uses_dst(srcA))
+ dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
+
+ if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE &&
+ (dstRGB == PIPE_BLENDFACTOR_ZERO || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+ dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE))
+ dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
+
+ /* Set the final value. */
+ sx_mrt_blend_opt[i] = S_028760_COLOR_SRC_OPT(srcRGB_opt) |
+ S_028760_COLOR_DST_OPT(dstRGB_opt) |
+ S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
+ S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
+ S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
+
+ /* Set blend state. */
+ blend_cntl |= S_028780_ENABLE(1);
+ blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
+ blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB));
+ blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB));
+
+ if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
+ blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
+ blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA));
+ blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA));
+ blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA));
+ }
+ si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
+
+ blend->blend_enable_4bit |= 0xfu << (i * 4);
+
+ if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14)
+ blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4);
+
+ /* This is only important for formats without alpha. */
+ if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+ srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+ dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+ srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
+ blend->need_src_alpha_4bit |= 0xfu << (i * 4);
+ }
+
+ if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14 && logicop_enable)
+ blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit;
+
+ if (blend->cb_target_mask) {
+ color_control |= S_028808_MODE(mode);
+ } else {
+ color_control |= S_028808_MODE(V_028808_CB_DISABLE);
+ }
+
+ if (sctx->screen->info.rbplus_allowed) {
+ /* Disable RB+ blend optimizations for dual source blending.
+ * Vulkan does this.
+ */
+ if (blend->dual_src_blend) {
+ for (int i = 0; i < 8; i++) {
+ sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
+ S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
+ }
+ }
+
+ for (int i = 0; i < 8; i++)
+ si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, sx_mrt_blend_opt[i]);
+
+ /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */
+ if (blend->dual_src_blend || logicop_enable || mode == V_028808_CB_RESOLVE)
+ color_control |= S_028808_DISABLE_DUAL_QUAD(1);
+ }
+
+ si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control);
+ return blend;
}
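One detail in the function above worth spelling out is the ROP3 packing: the gallium logicop code is the 4-bit truth table of the op over (src, dst), and duplicating it into both nibbles appears to produce the 8-bit three-input ROP3 table with the third input ignored; PIPE_LOGICOP_COPY is 0xC, hence the 0xcc ("copy source") default. A minimal sketch of that packing, using a hypothetical helper name:
/* Hypothetical helper mirroring the S_028808_ROP3() packing above. */
static unsigned logicop_to_rop3(unsigned pipe_logicop_func)
{
   /* e.g. PIPE_LOGICOP_COPY = 0xC -> 0xCC, PIPE_LOGICOP_XOR = 0x6 -> 0x66 */
   return pipe_logicop_func | (pipe_logicop_func << 4);
}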
-static void *si_create_blend_state(struct pipe_context *ctx,
- const struct pipe_blend_state *state)
+static void *si_create_blend_state(struct pipe_context *ctx, const struct pipe_blend_state *state)
{
- return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL);
+ return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL);
}
static void si_bind_blend_state(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_state_blend *old_blend = sctx->queued.named.blend;
- struct si_state_blend *blend = (struct si_state_blend *)state;
-
- if (!blend)
- blend = (struct si_state_blend *)sctx->noop_blend;
-
- si_pm4_bind_state(sctx, blend, blend);
-
- if (old_blend->cb_target_mask != blend->cb_target_mask ||
- old_blend->dual_src_blend != blend->dual_src_blend ||
- (old_blend->dcc_msaa_corruption_4bit != blend->dcc_msaa_corruption_4bit &&
- sctx->framebuffer.nr_samples >= 2 &&
- sctx->screen->dcc_msaa_allowed))
- si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
-
- if (old_blend->cb_target_mask != blend->cb_target_mask ||
- old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
- old_blend->alpha_to_one != blend->alpha_to_one ||
- old_blend->dual_src_blend != blend->dual_src_blend ||
- old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
- old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit)
- sctx->do_update_shaders = true;
-
- if (sctx->screen->dpbb_allowed &&
- (old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
- old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
- old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit))
- si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
-
- if (sctx->screen->has_out_of_order_rast &&
- ((old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
- old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit ||
- old_blend->commutative_4bit != blend->commutative_4bit ||
- old_blend->logicop_enable != blend->logicop_enable)))
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_state_blend *old_blend = sctx->queued.named.blend;
+ struct si_state_blend *blend = (struct si_state_blend *)state;
+
+ if (!blend)
+ blend = (struct si_state_blend *)sctx->noop_blend;
+
+ si_pm4_bind_state(sctx, blend, blend);
+
+ if (old_blend->cb_target_mask != blend->cb_target_mask ||
+ old_blend->dual_src_blend != blend->dual_src_blend ||
+ (old_blend->dcc_msaa_corruption_4bit != blend->dcc_msaa_corruption_4bit &&
+ sctx->framebuffer.nr_samples >= 2 && sctx->screen->dcc_msaa_allowed))
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
+
+ if (old_blend->cb_target_mask != blend->cb_target_mask ||
+ old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
+ old_blend->alpha_to_one != blend->alpha_to_one ||
+ old_blend->dual_src_blend != blend->dual_src_blend ||
+ old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
+ old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit)
+ sctx->do_update_shaders = true;
+
+ if (sctx->screen->dpbb_allowed &&
+ (old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
+ old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
+ old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit))
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+
+ if (sctx->screen->has_out_of_order_rast &&
+ ((old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
+ old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit ||
+ old_blend->commutative_4bit != blend->commutative_4bit ||
+ old_blend->logicop_enable != blend->logicop_enable)))
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
}
static void si_delete_blend_state(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
+ struct si_context *sctx = (struct si_context *)ctx;
- if (sctx->queued.named.blend == state)
- si_bind_blend_state(ctx, sctx->noop_blend);
+ if (sctx->queued.named.blend == state)
+ si_bind_blend_state(ctx, sctx->noop_blend);
- si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state);
+ si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state);
}
-static void si_set_blend_color(struct pipe_context *ctx,
- const struct pipe_blend_color *state)
+static void si_set_blend_color(struct pipe_context *ctx, const struct pipe_blend_color *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- static const struct pipe_blend_color zeros;
+ struct si_context *sctx = (struct si_context *)ctx;
+ static const struct pipe_blend_color zeros;
- sctx->blend_color.state = *state;
- sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color);
+ sctx->blend_color.state = *state;
+ sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color);
}
static void si_emit_blend_color(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
- radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
- radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4);
+ radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
+ radeon_emit_array(cs, (uint32_t *)sctx->blend_color.state.color, 4);
}
/*
* Clipping
*/
-static void si_set_clip_state(struct pipe_context *ctx,
- const struct pipe_clip_state *state)
+static void si_set_clip_state(struct pipe_context *ctx, const struct pipe_clip_state *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct pipe_constant_buffer cb;
- static const struct pipe_clip_state zeros;
-
- if (memcmp(&sctx->clip_state.state, state, sizeof(*state)) == 0)
- return;
-
- sctx->clip_state.state = *state;
- sctx->clip_state.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state);
-
- cb.buffer = NULL;
- cb.user_buffer = state->ucp;
- cb.buffer_offset = 0;
- cb.buffer_size = 4*4*8;
- si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb);
- pipe_resource_reference(&cb.buffer, NULL);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct pipe_constant_buffer cb;
+ static const struct pipe_clip_state zeros;
+
+ if (memcmp(&sctx->clip_state.state, state, sizeof(*state)) == 0)
+ return;
+
+ sctx->clip_state.state = *state;
+ sctx->clip_state.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state);
+
+ cb.buffer = NULL;
+ cb.user_buffer = state->ucp;
+ cb.buffer_offset = 0;
+ cb.buffer_size = 4 * 4 * 8;
+ si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb);
+ pipe_resource_reference(&cb.buffer, NULL);
}
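The buffer_size above is just the clip-plane array laid out as constants:
/* cb.buffer_size = 4 bytes * 4 components * 8 planes = 128,
 * matching float ucp[PIPE_MAX_CLIP_PLANES][4] with PIPE_MAX_CLIP_PLANES = 8. */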
static void si_emit_clip_state(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
- radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4);
- radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4);
+ radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6 * 4);
+ radeon_emit_array(cs, (uint32_t *)sctx->clip_state.state.ucp, 6 * 4);
}
static void si_emit_clip_regs(struct si_context *sctx)
{
- struct si_shader *vs = si_get_vs_state(sctx);
- struct si_shader_selector *vs_sel = vs->selector;
- struct si_shader_info *info = &vs_sel->info;
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- unsigned window_space =
- info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
- unsigned clipdist_mask = vs_sel->clipdist_mask;
- unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS;
- unsigned culldist_mask = vs_sel->culldist_mask;
- unsigned total_mask;
-
- if (vs->key.opt.clip_disable) {
- assert(!info->culldist_writemask);
- clipdist_mask = 0;
- culldist_mask = 0;
- }
- total_mask = clipdist_mask | culldist_mask;
-
- /* Clip distances on points have no effect, so need to be implemented
- * as cull distances. This applies for the clipvertex case as well.
- *
- * Setting this for primitives other than points should have no adverse
- * effects.
- */
- clipdist_mask &= rs->clip_plane_enable;
- culldist_mask |= clipdist_mask;
-
- unsigned initial_cdw = sctx->gfx_cs->current.cdw;
- unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
- S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |
- clipdist_mask | (culldist_mask << 8);
-
- if (sctx->chip_class >= GFX10) {
- radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
- SI_TRACKED_PA_CL_VS_OUT_CNTL__CL,
- pa_cl_cntl,
- ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
- } else {
- radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
- SI_TRACKED_PA_CL_VS_OUT_CNTL__CL,
- vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl);
- }
- radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL,
- SI_TRACKED_PA_CL_CLIP_CNTL,
- rs->pa_cl_clip_cntl |
- ucp_mask |
- S_028810_CLIP_DISABLE(window_space));
-
- if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll = true;
+ struct si_shader *vs = si_get_vs_state(sctx);
+ struct si_shader_selector *vs_sel = vs->selector;
+ struct si_shader_info *info = &vs_sel->info;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+ unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+ unsigned clipdist_mask = vs_sel->clipdist_mask;
+ unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS;
+ unsigned culldist_mask = vs_sel->culldist_mask;
+ unsigned total_mask;
+
+ if (vs->key.opt.clip_disable) {
+ assert(!info->culldist_writemask);
+ clipdist_mask = 0;
+ culldist_mask = 0;
+ }
+ total_mask = clipdist_mask | culldist_mask;
+
+ /* Clip distances on points have no effect, so need to be implemented
+ * as cull distances. This applies for the clipvertex case as well.
+ *
+ * Setting this for primitives other than points should have no adverse
+ * effects.
+ */
+ clipdist_mask &= rs->clip_plane_enable;
+ culldist_mask |= clipdist_mask;
+
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+ unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
+ S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | clipdist_mask |
+ (culldist_mask << 8);
+
+ if (sctx->chip_class >= GFX10) {
+ radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
+ SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, pa_cl_cntl,
+ ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
+ } else {
+ radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL__CL,
+ vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl);
+ }
+ radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL,
+ rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space));
+
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll = true;
}
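A concrete reading of the vector enables computed above (clip/cull distances are exported as two vec4s):
/* Example: a VS writing clip distances 0 and 1 only:
 *   total_mask = 0x03
 *   CCDIST0_VEC_ENA = (0x03 & 0x0F) != 0 -> 1   (distances 0..3, first vec4)
 *   CCDIST1_VEC_ENA = (0x03 & 0xF0) != 0 -> 0   (distances 4..7, second vec4)
 */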
/*
 * inferred state between framebuffer and rasterizer
 */
static void si_update_poly_offset_state(struct si_context *sctx)
{
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-
- if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {
- si_pm4_bind_state(sctx, poly_offset, NULL);
- return;
- }
-
- /* Use the user format, not db_render_format, so that the polygon
- * offset behaves as expected by applications.
- */
- switch (sctx->framebuffer.state.zsbuf->texture->format) {
- case PIPE_FORMAT_Z16_UNORM:
- si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]);
- break;
- default: /* 24-bit */
- si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]);
- break;
- case PIPE_FORMAT_Z32_FLOAT:
- case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
- si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]);
- break;
- }
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+
+ if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {
+ si_pm4_bind_state(sctx, poly_offset, NULL);
+ return;
+ }
+
+ /* Use the user format, not db_render_format, so that the polygon
+ * offset behaves as expected by applications.
+ */
+ switch (sctx->framebuffer.state.zsbuf->texture->format) {
+ case PIPE_FORMAT_Z16_UNORM:
+ si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]);
+ break;
+ default: /* 24-bit */
+ si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]);
+ break;
+ case PIPE_FORMAT_Z32_FLOAT:
+ case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+ si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]);
+ break;
+ }
}
/*
 * Rasterizer
 */
static uint32_t si_translate_fill(uint32_t func)
{
- switch(func) {
- case PIPE_POLYGON_MODE_FILL:
- return V_028814_X_DRAW_TRIANGLES;
- case PIPE_POLYGON_MODE_LINE:
- return V_028814_X_DRAW_LINES;
- case PIPE_POLYGON_MODE_POINT:
- return V_028814_X_DRAW_POINTS;
- default:
- assert(0);
- return V_028814_X_DRAW_POINTS;
- }
+ switch (func) {
+ case PIPE_POLYGON_MODE_FILL:
+ return V_028814_X_DRAW_TRIANGLES;
+ case PIPE_POLYGON_MODE_LINE:
+ return V_028814_X_DRAW_LINES;
+ case PIPE_POLYGON_MODE_POINT:
+ return V_028814_X_DRAW_POINTS;
+ default:
+ assert(0);
+ return V_028814_X_DRAW_POINTS;
+ }
}
-static void *si_create_rs_state(struct pipe_context *ctx,
- const struct pipe_rasterizer_state *state)
+static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rasterizer_state *state)
{
- struct si_screen *sscreen = ((struct si_context *)ctx)->screen;
- struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer);
- struct si_pm4_state *pm4 = &rs->pm4;
- unsigned tmp, i;
- float psize_min, psize_max;
-
- if (!rs) {
- return NULL;
- }
-
- if (!state->front_ccw) {
- rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT);
- rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK);
- } else {
- rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT);
- rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK);
- }
- rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far;
- rs->provoking_vertex_first = state->flatshade_first;
- rs->scissor_enable = state->scissor;
- rs->clip_halfz = state->clip_halfz;
- rs->two_side = state->light_twoside;
- rs->multisample_enable = state->multisample;
- rs->force_persample_interp = state->force_persample_interp;
- rs->clip_plane_enable = state->clip_plane_enable;
- rs->half_pixel_center = state->half_pixel_center;
- rs->line_stipple_enable = state->line_stipple_enable;
- rs->poly_stipple_enable = state->poly_stipple_enable;
- rs->line_smooth = state->line_smooth;
- rs->line_width = state->line_width;
- rs->poly_smooth = state->poly_smooth;
- rs->uses_poly_offset = state->offset_point || state->offset_line ||
- state->offset_tri;
- rs->clamp_fragment_color = state->clamp_fragment_color;
- rs->clamp_vertex_color = state->clamp_vertex_color;
- rs->flatshade = state->flatshade;
- rs->flatshade_first = state->flatshade_first;
- rs->sprite_coord_enable = state->sprite_coord_enable;
- rs->rasterizer_discard = state->rasterizer_discard;
- rs->polygon_mode_enabled = (state->fill_front != PIPE_POLYGON_MODE_FILL &&
- !(state->cull_face & PIPE_FACE_FRONT)) ||
- (state->fill_back != PIPE_POLYGON_MODE_FILL &&
- !(state->cull_face & PIPE_FACE_BACK));
- rs->polygon_mode_is_lines = (state->fill_front == PIPE_POLYGON_MODE_LINE &&
- !(state->cull_face & PIPE_FACE_FRONT)) ||
- (state->fill_back == PIPE_POLYGON_MODE_LINE &&
- !(state->cull_face & PIPE_FACE_BACK));
- rs->pa_sc_line_stipple = state->line_stipple_enable ?
- S_028A0C_LINE_PATTERN(state->line_stipple_pattern) |
- S_028A0C_REPEAT_COUNT(state->line_stipple_factor) : 0;
- rs->pa_cl_clip_cntl =
- S_028810_DX_CLIP_SPACE_DEF(state->clip_halfz) |
- S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip_near) |
- S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip_far) |
- S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) |
- S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
-
- si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0,
- S_0286D4_FLAT_SHADE_ENA(1) |
- S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) |
- S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
- S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
- S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
- S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) |
- S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT));
-
- /* point size 12.4 fixed point */
- tmp = (unsigned)(state->point_size * 8.0);
- si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp));
-
- if (state->point_size_per_vertex) {
- psize_min = util_get_min_point_size(state);
- psize_max = SI_MAX_POINT_SIZE;
- } else {
- /* Force the point size to be as if the vertex output was disabled. */
- psize_min = state->point_size;
- psize_max = state->point_size;
- }
- rs->max_point_size = psize_max;
-
- /* Divide by two, because 0.5 = 1 pixel. */
- si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX,
- S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min/2)) |
- S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max/2)));
-
- si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL,
- S_028A08_WIDTH(si_pack_float_12p4(state->line_width/2)));
- si_pm4_set_reg(pm4, R_028A48_PA_SC_MODE_CNTL_0,
- S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) |
- S_028A48_MSAA_ENABLE(state->multisample ||
- state->poly_smooth ||
- state->line_smooth) |
- S_028A48_VPORT_SCISSOR_ENABLE(1) |
- S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9));
-
- si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp));
- si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL,
- S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) |
- S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
- S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
- S_028814_FACE(!state->front_ccw) |
- S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) |
- S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) |
- S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) |
- S_028814_POLY_MODE(rs->polygon_mode_enabled) |
- S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
- S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)));
-
- if (!rs->uses_poly_offset)
- return rs;
-
- rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state));
- if (!rs->pm4_poly_offset) {
- FREE(rs);
- return NULL;
- }
-
- /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */
- for (i = 0; i < 3; i++) {
- struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i];
- float offset_units = state->offset_units;
- float offset_scale = state->offset_scale * 16.0f;
- uint32_t pa_su_poly_offset_db_fmt_cntl = 0;
-
- if (!state->offset_units_unscaled) {
- switch (i) {
- case 0: /* 16-bit zbuffer */
- offset_units *= 4.0f;
- pa_su_poly_offset_db_fmt_cntl =
- S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16);
- break;
- case 1: /* 24-bit zbuffer */
- offset_units *= 2.0f;
- pa_su_poly_offset_db_fmt_cntl =
- S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24);
- break;
- case 2: /* 32-bit zbuffer */
- offset_units *= 1.0f;
- pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) |
- S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
- break;
- }
- }
-
- si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE,
- fui(offset_scale));
- si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET,
- fui(offset_units));
- si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE,
- fui(offset_scale));
- si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET,
- fui(offset_units));
- si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
- pa_su_poly_offset_db_fmt_cntl);
- }
-
- return rs;
+ struct si_screen *sscreen = ((struct si_context *)ctx)->screen;
+ struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer);
+ struct si_pm4_state *pm4 = &rs->pm4;
+ unsigned tmp, i;
+ float psize_min, psize_max;
+
+ if (!rs) {
+ return NULL;
+ }
+
+ if (!state->front_ccw) {
+ rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT);
+ rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK);
+ } else {
+ rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT);
+ rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK);
+ }
+ rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far;
+ rs->provoking_vertex_first = state->flatshade_first;
+ rs->scissor_enable = state->scissor;
+ rs->clip_halfz = state->clip_halfz;
+ rs->two_side = state->light_twoside;
+ rs->multisample_enable = state->multisample;
+ rs->force_persample_interp = state->force_persample_interp;
+ rs->clip_plane_enable = state->clip_plane_enable;
+ rs->half_pixel_center = state->half_pixel_center;
+ rs->line_stipple_enable = state->line_stipple_enable;
+ rs->poly_stipple_enable = state->poly_stipple_enable;
+ rs->line_smooth = state->line_smooth;
+ rs->line_width = state->line_width;
+ rs->poly_smooth = state->poly_smooth;
+ rs->uses_poly_offset = state->offset_point || state->offset_line || state->offset_tri;
+ rs->clamp_fragment_color = state->clamp_fragment_color;
+ rs->clamp_vertex_color = state->clamp_vertex_color;
+ rs->flatshade = state->flatshade;
+ rs->flatshade_first = state->flatshade_first;
+ rs->sprite_coord_enable = state->sprite_coord_enable;
+ rs->rasterizer_discard = state->rasterizer_discard;
+ rs->polygon_mode_enabled =
+ (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) ||
+ (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK));
+ rs->polygon_mode_is_lines =
+ (state->fill_front == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_FRONT)) ||
+ (state->fill_back == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_BACK));
+ rs->pa_sc_line_stipple = state->line_stipple_enable
+ ? S_028A0C_LINE_PATTERN(state->line_stipple_pattern) |
+ S_028A0C_REPEAT_COUNT(state->line_stipple_factor)
+ : 0;
+ rs->pa_cl_clip_cntl = S_028810_DX_CLIP_SPACE_DEF(state->clip_halfz) |
+ S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip_near) |
+ S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip_far) |
+ S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) |
+ S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
+
+ si_pm4_set_reg(
+ pm4, R_0286D4_SPI_INTERP_CONTROL_0,
+ S_0286D4_FLAT_SHADE_ENA(1) | S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) |
+ S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
+ S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
+ S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
+ S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) |
+ S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT));
+
+ /* point size 12.4 fixed point */
+ tmp = (unsigned)(state->point_size * 8.0);
+ si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp));
+
+ if (state->point_size_per_vertex) {
+ psize_min = util_get_min_point_size(state);
+ psize_max = SI_MAX_POINT_SIZE;
+ } else {
+ /* Force the point size to be as if the vertex output was disabled. */
+ psize_min = state->point_size;
+ psize_max = state->point_size;
+ }
+ rs->max_point_size = psize_max;
+
+ /* Divide by two, because 0.5 = 1 pixel. */
+ si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX,
+ S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min / 2)) |
+ S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max / 2)));
+
+ si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL,
+ S_028A08_WIDTH(si_pack_float_12p4(state->line_width / 2)));
+ si_pm4_set_reg(
+ pm4, R_028A48_PA_SC_MODE_CNTL_0,
+ S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) |
+ S_028A48_MSAA_ENABLE(state->multisample || state->poly_smooth || state->line_smooth) |
+ S_028A48_VPORT_SCISSOR_ENABLE(1) |
+ S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9));
+
+ si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp));
+ si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL,
+ S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) |
+ S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
+ S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
+ S_028814_FACE(!state->front_ccw) |
+ S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) |
+ S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) |
+ S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) |
+ S_028814_POLY_MODE(rs->polygon_mode_enabled) |
+ S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
+ S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)));
+
+ if (!rs->uses_poly_offset)
+ return rs;
+
+ rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state));
+ if (!rs->pm4_poly_offset) {
+ FREE(rs);
+ return NULL;
+ }
+
+ /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */
+ for (i = 0; i < 3; i++) {
+ struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i];
+ float offset_units = state->offset_units;
+ float offset_scale = state->offset_scale * 16.0f;
+ uint32_t pa_su_poly_offset_db_fmt_cntl = 0;
+
+ if (!state->offset_units_unscaled) {
+ switch (i) {
+ case 0: /* 16-bit zbuffer */
+ offset_units *= 4.0f;
+ pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16);
+ break;
+ case 1: /* 24-bit zbuffer */
+ offset_units *= 2.0f;
+ pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24);
+ break;
+ case 2: /* 32-bit zbuffer */
+ offset_units *= 1.0f;
+ pa_su_poly_offset_db_fmt_cntl =
+ S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
+ break;
+ }
+ }
+
+ si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, fui(offset_scale));
+ si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units));
+ si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale));
+ si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units));
+ si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl);
+ }
+
+ return rs;
}
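The point-size packing in the function above is easier to read as arithmetic: the size registers take the half-size in 12.4 fixed point (units of 1/16 pixel), and size/2 * 16 simplifies to size * 8, which is the state->point_size * 8.0 written to PA_SU_POINT_SIZE. A standalone sketch with a hypothetical helper name:
/* Half-size in 12.4 fixed point; 0.5 = one pixel, 1/16 pixel granularity. */
static unsigned pack_half_size_12p4(float size_in_pixels)
{
   float half = size_in_pixels * 0.5f;
   return (unsigned)(half * 16.0f); /* == size_in_pixels * 8 */
}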
static void si_bind_rs_state(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_state_rasterizer *old_rs =
- (struct si_state_rasterizer*)sctx->queued.named.rasterizer;
- struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;
-
- if (!rs)
- rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state;
-
- if (old_rs->multisample_enable != rs->multisample_enable) {
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-
- /* Update the small primitive filter workaround if necessary. */
- if (sctx->screen->info.has_msaa_sample_loc_bug &&
- sctx->framebuffer.nr_samples > 1)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
- }
-
- sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
- sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color);
-
- si_pm4_bind_state(sctx, rasterizer, rs);
- si_update_poly_offset_state(sctx);
-
- if (old_rs->scissor_enable != rs->scissor_enable)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);
-
- if (old_rs->line_width != rs->line_width ||
- old_rs->max_point_size != rs->max_point_size ||
- old_rs->half_pixel_center != rs->half_pixel_center)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
-
- if (old_rs->clip_halfz != rs->clip_halfz)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports);
-
- if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
- old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
-
- if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
- old_rs->rasterizer_discard != rs->rasterizer_discard ||
- old_rs->sprite_coord_enable != rs->sprite_coord_enable ||
- old_rs->flatshade != rs->flatshade ||
- old_rs->two_side != rs->two_side ||
- old_rs->multisample_enable != rs->multisample_enable ||
- old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
- old_rs->poly_smooth != rs->poly_smooth ||
- old_rs->line_smooth != rs->line_smooth ||
- old_rs->clamp_fragment_color != rs->clamp_fragment_color ||
- old_rs->force_persample_interp != rs->force_persample_interp)
- sctx->do_update_shaders = true;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_state_rasterizer *old_rs = (struct si_state_rasterizer *)sctx->queued.named.rasterizer;
+ struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;
+
+ if (!rs)
+ rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state;
+
+ if (old_rs->multisample_enable != rs->multisample_enable) {
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+
+ /* Update the small primitive filter workaround if necessary. */
+ if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
+ }
+
+ sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
+ sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color);
+
+ si_pm4_bind_state(sctx, rasterizer, rs);
+ si_update_poly_offset_state(sctx);
+
+ if (old_rs->scissor_enable != rs->scissor_enable)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);
+
+ if (old_rs->line_width != rs->line_width || old_rs->max_point_size != rs->max_point_size ||
+ old_rs->half_pixel_center != rs->half_pixel_center)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
+
+ if (old_rs->clip_halfz != rs->clip_halfz)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports);
+
+ if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
+ old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
+
+ if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
+ old_rs->rasterizer_discard != rs->rasterizer_discard ||
+ old_rs->sprite_coord_enable != rs->sprite_coord_enable ||
+ old_rs->flatshade != rs->flatshade || old_rs->two_side != rs->two_side ||
+ old_rs->multisample_enable != rs->multisample_enable ||
+ old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
+ old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth ||
+ old_rs->clamp_fragment_color != rs->clamp_fragment_color ||
+ old_rs->force_persample_interp != rs->force_persample_interp)
+ sctx->do_update_shaders = true;
}
static void si_delete_rs_state(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;
- if (sctx->queued.named.rasterizer == state)
- si_bind_rs_state(ctx, sctx->discard_rasterizer_state);
+ if (sctx->queued.named.rasterizer == state)
+ si_bind_rs_state(ctx, sctx->discard_rasterizer_state);
- FREE(rs->pm4_poly_offset);
- si_pm4_delete_state(sctx, rasterizer, rs);
+ FREE(rs->pm4_poly_offset);
+ si_pm4_delete_state(sctx, rasterizer, rs);
}
/*
 * inferred state between dsa and stencil ref
 */
static void si_emit_stencil_ref(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- struct pipe_stencil_ref *ref = &sctx->stencil_ref.state;
- struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part;
-
- radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2);
- radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) |
- S_028430_STENCILMASK(dsa->valuemask[0]) |
- S_028430_STENCILWRITEMASK(dsa->writemask[0]) |
- S_028430_STENCILOPVAL(1));
- radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) |
- S_028434_STENCILMASK_BF(dsa->valuemask[1]) |
- S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) |
- S_028434_STENCILOPVAL_BF(1));
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct pipe_stencil_ref *ref = &sctx->stencil_ref.state;
+ struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part;
+
+ radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2);
+ radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) |
+ S_028430_STENCILMASK(dsa->valuemask[0]) |
+ S_028430_STENCILWRITEMASK(dsa->writemask[0]) | S_028430_STENCILOPVAL(1));
+ radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) |
+ S_028434_STENCILMASK_BF(dsa->valuemask[1]) |
+ S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) |
+ S_028434_STENCILOPVAL_BF(1));
}
-static void si_set_stencil_ref(struct pipe_context *ctx,
- const struct pipe_stencil_ref *state)
+static void si_set_stencil_ref(struct pipe_context *ctx, const struct pipe_stencil_ref *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
+ struct si_context *sctx = (struct si_context *)ctx;
- if (memcmp(&sctx->stencil_ref.state, state, sizeof(*state)) == 0)
- return;
+ if (memcmp(&sctx->stencil_ref.state, state, sizeof(*state)) == 0)
+ return;
- sctx->stencil_ref.state = *state;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
+ sctx->stencil_ref.state = *state;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
}
-
/*
* DSA
*/
static uint32_t si_translate_stencil_op(int s_op)
{
- switch (s_op) {
- case PIPE_STENCIL_OP_KEEP:
- return V_02842C_STENCIL_KEEP;
- case PIPE_STENCIL_OP_ZERO:
- return V_02842C_STENCIL_ZERO;
- case PIPE_STENCIL_OP_REPLACE:
- return V_02842C_STENCIL_REPLACE_TEST;
- case PIPE_STENCIL_OP_INCR:
- return V_02842C_STENCIL_ADD_CLAMP;
- case PIPE_STENCIL_OP_DECR:
- return V_02842C_STENCIL_SUB_CLAMP;
- case PIPE_STENCIL_OP_INCR_WRAP:
- return V_02842C_STENCIL_ADD_WRAP;
- case PIPE_STENCIL_OP_DECR_WRAP:
- return V_02842C_STENCIL_SUB_WRAP;
- case PIPE_STENCIL_OP_INVERT:
- return V_02842C_STENCIL_INVERT;
- default:
- PRINT_ERR("Unknown stencil op %d", s_op);
- assert(0);
- break;
- }
- return 0;
+ switch (s_op) {
+ case PIPE_STENCIL_OP_KEEP:
+ return V_02842C_STENCIL_KEEP;
+ case PIPE_STENCIL_OP_ZERO:
+ return V_02842C_STENCIL_ZERO;
+ case PIPE_STENCIL_OP_REPLACE:
+ return V_02842C_STENCIL_REPLACE_TEST;
+ case PIPE_STENCIL_OP_INCR:
+ return V_02842C_STENCIL_ADD_CLAMP;
+ case PIPE_STENCIL_OP_DECR:
+ return V_02842C_STENCIL_SUB_CLAMP;
+ case PIPE_STENCIL_OP_INCR_WRAP:
+ return V_02842C_STENCIL_ADD_WRAP;
+ case PIPE_STENCIL_OP_DECR_WRAP:
+ return V_02842C_STENCIL_SUB_WRAP;
+ case PIPE_STENCIL_OP_INVERT:
+ return V_02842C_STENCIL_INVERT;
+ default:
+ PRINT_ERR("Unknown stencil op %d", s_op);
+ assert(0);
+ break;
+ }
+ return 0;
}
static bool si_dsa_writes_stencil(const struct pipe_stencil_state *s)
{
- return s->enabled && s->writemask &&
- (s->fail_op != PIPE_STENCIL_OP_KEEP ||
- s->zfail_op != PIPE_STENCIL_OP_KEEP ||
- s->zpass_op != PIPE_STENCIL_OP_KEEP);
+ return s->enabled && s->writemask &&
+ (s->fail_op != PIPE_STENCIL_OP_KEEP || s->zfail_op != PIPE_STENCIL_OP_KEEP ||
+ s->zpass_op != PIPE_STENCIL_OP_KEEP);
}
static bool si_order_invariant_stencil_op(enum pipe_stencil_op op)
{
- /* REPLACE is normally order invariant, except when the stencil
- * reference value is written by the fragment shader. Tracking this
- * interaction does not seem worth the effort, so be conservative. */
- return op != PIPE_STENCIL_OP_INCR &&
- op != PIPE_STENCIL_OP_DECR &&
- op != PIPE_STENCIL_OP_REPLACE;
+ /* REPLACE is normally order invariant, except when the stencil
+ * reference value is written by the fragment shader. Tracking this
+ * interaction does not seem worth the effort, so be conservative. */
+ return op != PIPE_STENCIL_OP_INCR && op != PIPE_STENCIL_OP_DECR && op != PIPE_STENCIL_OP_REPLACE;
}
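One way to see why the clamped INCR/DECR ops are excluded above (besides REPLACE with a shader-written reference): clamping makes mixed increments and decrements order dependent. A two-fragment worked example, plain arithmetic rather than driver code:
/* stencil value starts at 0; fragment A applies DECR (clamped),
 * fragment B applies INCR (clamped):
 *   order A then B: 0 -> 0 (clamped) -> 1
 *   order B then A: 0 -> 1           -> 0
 * The wrapping variants add/subtract mod 256, which commutes, so they
 * remain order invariant.
 */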
/* Compute whether, assuming Z writes are disabled, this stencil state is order
 * invariant in the sense that the set of passing fragments as well as the
 * final stencil buffer result does not depend on the order of fragments. */
static bool si_order_invariant_stencil_state(const struct pipe_stencil_state *state)
{
- return !state->enabled || !state->writemask ||
- /* The following assumes that Z writes are disabled. */
- (state->func == PIPE_FUNC_ALWAYS &&
- si_order_invariant_stencil_op(state->zpass_op) &&
- si_order_invariant_stencil_op(state->zfail_op)) ||
- (state->func == PIPE_FUNC_NEVER &&
- si_order_invariant_stencil_op(state->fail_op));
+ return !state->enabled || !state->writemask ||
+ /* The following assumes that Z writes are disabled. */
+ (state->func == PIPE_FUNC_ALWAYS && si_order_invariant_stencil_op(state->zpass_op) &&
+ si_order_invariant_stencil_op(state->zfail_op)) ||
+ (state->func == PIPE_FUNC_NEVER && si_order_invariant_stencil_op(state->fail_op));
}
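
Read in isolation, the predicate above is just a small pure function of the stencil state. Below is a minimal standalone sketch of the same logic, using hypothetical stand-in enums and a trimmed struct rather than the real pipe_stencil_state, so it can be compiled and tested on its own:

#include <stdbool.h>
#include <stdio.h>

enum op { OP_KEEP, OP_ZERO, OP_REPLACE, OP_INCR, OP_DECR, OP_INVERT };
enum func { FUNC_NEVER, FUNC_LESS, FUNC_ALWAYS };

struct stencil_state {
   bool enabled;
   unsigned writemask;
   enum func func;
   enum op fail_op, zfail_op, zpass_op;
};

/* Same rule as above: INCR, DECR and REPLACE are the order-dependent ops. */
static bool order_invariant_op(enum op op)
{
   return op != OP_INCR && op != OP_DECR && op != OP_REPLACE;
}

static bool order_invariant_state(const struct stencil_state *s)
{
   return !s->enabled || !s->writemask ||
          (s->func == FUNC_ALWAYS && order_invariant_op(s->zpass_op) &&
           order_invariant_op(s->zfail_op)) ||
          (s->func == FUNC_NEVER && order_invariant_op(s->fail_op));
}

int main(void)
{
   struct stencil_state invert_on_pass = {true, 0xff, FUNC_ALWAYS, OP_KEEP, OP_KEEP, OP_INVERT};
   struct stencil_state incr_on_pass = {true, 0xff, FUNC_ALWAYS, OP_KEEP, OP_KEEP, OP_INCR};

   printf("ALWAYS + INVERT zpass: %d\n", order_invariant_state(&invert_on_pass)); /* prints 1 */
   printf("ALWAYS + INCR zpass:   %d\n", order_invariant_state(&incr_on_pass));   /* prints 0 */
   return 0;
}
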
static void *si_create_dsa_state(struct pipe_context *ctx,
- const struct pipe_depth_stencil_alpha_state *state)
+ const struct pipe_depth_stencil_alpha_state *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa);
- struct si_pm4_state *pm4 = &dsa->pm4;
- unsigned db_depth_control;
- uint32_t db_stencil_control = 0;
-
- if (!dsa) {
- return NULL;
- }
-
- dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask;
- dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask;
- dsa->stencil_ref.writemask[0] = state->stencil[0].writemask;
- dsa->stencil_ref.writemask[1] = state->stencil[1].writemask;
-
- db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) |
- S_028800_Z_WRITE_ENABLE(state->depth.writemask) |
- S_028800_ZFUNC(state->depth.func) |
- S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test);
-
- /* stencil */
- if (state->stencil[0].enabled) {
- db_depth_control |= S_028800_STENCIL_ENABLE(1);
- db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func);
- db_stencil_control |= S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op));
- db_stencil_control |= S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op));
- db_stencil_control |= S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op));
-
- if (state->stencil[1].enabled) {
- db_depth_control |= S_028800_BACKFACE_ENABLE(1);
- db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func);
- db_stencil_control |= S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op));
- db_stencil_control |= S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op));
- db_stencil_control |= S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op));
- }
- }
-
- /* alpha */
- if (state->alpha.enabled) {
- dsa->alpha_func = state->alpha.func;
-
- si_pm4_set_reg(pm4, R_00B030_SPI_SHADER_USER_DATA_PS_0 +
- SI_SGPR_ALPHA_REF * 4, fui(state->alpha.ref_value));
- } else {
- dsa->alpha_func = PIPE_FUNC_ALWAYS;
- }
-
- si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control);
- if (state->stencil[0].enabled)
- si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
- if (state->depth.bounds_test) {
- si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min));
- si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max));
- }
-
- dsa->depth_enabled = state->depth.enabled;
- dsa->depth_write_enabled = state->depth.enabled &&
- state->depth.writemask;
- dsa->stencil_enabled = state->stencil[0].enabled;
- dsa->stencil_write_enabled = state->stencil[0].enabled &&
- (si_dsa_writes_stencil(&state->stencil[0]) ||
- si_dsa_writes_stencil(&state->stencil[1]));
- dsa->db_can_write = dsa->depth_write_enabled ||
- dsa->stencil_write_enabled;
-
- bool zfunc_is_ordered =
- state->depth.func == PIPE_FUNC_NEVER ||
- state->depth.func == PIPE_FUNC_LESS ||
- state->depth.func == PIPE_FUNC_LEQUAL ||
- state->depth.func == PIPE_FUNC_GREATER ||
- state->depth.func == PIPE_FUNC_GEQUAL;
-
- bool nozwrite_and_order_invariant_stencil =
- !dsa->db_can_write ||
- (!dsa->depth_write_enabled &&
- si_order_invariant_stencil_state(&state->stencil[0]) &&
- si_order_invariant_stencil_state(&state->stencil[1]));
-
- dsa->order_invariance[1].zs =
- nozwrite_and_order_invariant_stencil ||
- (!dsa->stencil_write_enabled && zfunc_is_ordered);
- dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered;
-
- dsa->order_invariance[1].pass_set =
- nozwrite_and_order_invariant_stencil ||
- (!dsa->stencil_write_enabled &&
- (state->depth.func == PIPE_FUNC_ALWAYS ||
- state->depth.func == PIPE_FUNC_NEVER));
- dsa->order_invariance[0].pass_set =
- !dsa->depth_write_enabled ||
- (state->depth.func == PIPE_FUNC_ALWAYS ||
- state->depth.func == PIPE_FUNC_NEVER);
-
- dsa->order_invariance[1].pass_last =
- sctx->screen->assume_no_z_fights &&
- !dsa->stencil_write_enabled &&
- dsa->depth_write_enabled && zfunc_is_ordered;
- dsa->order_invariance[0].pass_last =
- sctx->screen->assume_no_z_fights &&
- dsa->depth_write_enabled && zfunc_is_ordered;
-
- return dsa;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa);
+ struct si_pm4_state *pm4 = &dsa->pm4;
+ unsigned db_depth_control;
+ uint32_t db_stencil_control = 0;
+
+ if (!dsa) {
+ return NULL;
+ }
+
+ dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask;
+ dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask;
+ dsa->stencil_ref.writemask[0] = state->stencil[0].writemask;
+ dsa->stencil_ref.writemask[1] = state->stencil[1].writemask;
+
+ db_depth_control =
+ S_028800_Z_ENABLE(state->depth.enabled) | S_028800_Z_WRITE_ENABLE(state->depth.writemask) |
+ S_028800_ZFUNC(state->depth.func) | S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test);
+
+ /* stencil */
+ if (state->stencil[0].enabled) {
+ db_depth_control |= S_028800_STENCIL_ENABLE(1);
+ db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func);
+ db_stencil_control |=
+ S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op));
+ db_stencil_control |=
+ S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op));
+ db_stencil_control |=
+ S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op));
+
+ if (state->stencil[1].enabled) {
+ db_depth_control |= S_028800_BACKFACE_ENABLE(1);
+ db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func);
+ db_stencil_control |=
+ S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op));
+ db_stencil_control |=
+ S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op));
+ db_stencil_control |=
+ S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op));
+ }
+ }
+
+ /* alpha */
+ if (state->alpha.enabled) {
+ dsa->alpha_func = state->alpha.func;
+
+ si_pm4_set_reg(pm4, R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4,
+ fui(state->alpha.ref_value));
+ } else {
+ dsa->alpha_func = PIPE_FUNC_ALWAYS;
+ }
+
+ si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control);
+ if (state->stencil[0].enabled)
+ si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
+ if (state->depth.bounds_test) {
+ si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min));
+ si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max));
+ }
+
+ dsa->depth_enabled = state->depth.enabled;
+ dsa->depth_write_enabled = state->depth.enabled && state->depth.writemask;
+ dsa->stencil_enabled = state->stencil[0].enabled;
+ dsa->stencil_write_enabled =
+ state->stencil[0].enabled &&
+ (si_dsa_writes_stencil(&state->stencil[0]) || si_dsa_writes_stencil(&state->stencil[1]));
+ dsa->db_can_write = dsa->depth_write_enabled || dsa->stencil_write_enabled;
+
+ bool zfunc_is_ordered =
+ state->depth.func == PIPE_FUNC_NEVER || state->depth.func == PIPE_FUNC_LESS ||
+ state->depth.func == PIPE_FUNC_LEQUAL || state->depth.func == PIPE_FUNC_GREATER ||
+ state->depth.func == PIPE_FUNC_GEQUAL;
+
+ bool nozwrite_and_order_invariant_stencil =
+ !dsa->db_can_write ||
+ (!dsa->depth_write_enabled && si_order_invariant_stencil_state(&state->stencil[0]) &&
+ si_order_invariant_stencil_state(&state->stencil[1]));
+
+ dsa->order_invariance[1].zs =
+ nozwrite_and_order_invariant_stencil || (!dsa->stencil_write_enabled && zfunc_is_ordered);
+ dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered;
+
+ dsa->order_invariance[1].pass_set =
+ nozwrite_and_order_invariant_stencil ||
+ (!dsa->stencil_write_enabled &&
+ (state->depth.func == PIPE_FUNC_ALWAYS || state->depth.func == PIPE_FUNC_NEVER));
+ dsa->order_invariance[0].pass_set =
+ !dsa->depth_write_enabled ||
+ (state->depth.func == PIPE_FUNC_ALWAYS || state->depth.func == PIPE_FUNC_NEVER);
+
+ dsa->order_invariance[1].pass_last = sctx->screen->assume_no_z_fights &&
+ !dsa->stencil_write_enabled && dsa->depth_write_enabled &&
+ zfunc_is_ordered;
+ dsa->order_invariance[0].pass_last =
+ sctx->screen->assume_no_z_fights && dsa->depth_write_enabled && zfunc_is_ordered;
+
+ return dsa;
}
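
All of the register values built above use the same generated field-macro pattern: S_xxx() shifts a value into its bit range, G_xxx() extracts it, and C_xxx is the clear mask. A self-contained illustration of that pattern follows; the field names and bit offsets are made up for the example and do not correspond to the real DB_DEPTH_CONTROL layout:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical field layout: bit 1 = Z_ENABLE, bit 2 = Z_WRITE, bits 4..6 = ZFUNC. */
#define S_FIELD_Z_ENABLE(x) (((unsigned)(x) & 0x1) << 1)
#define S_FIELD_Z_WRITE(x)  (((unsigned)(x) & 0x1) << 2)
#define S_FIELD_ZFUNC(x)    (((unsigned)(x) & 0x7) << 4)
#define G_FIELD_ZFUNC(x)    (((x) >> 4) & 0x7)
#define C_FIELD_ZFUNC       0xFFFFFF8F /* clear mask: everything except bits 4..6 */

int main(void)
{
   uint32_t reg = S_FIELD_Z_ENABLE(1) | S_FIELD_Z_WRITE(1) | S_FIELD_ZFUNC(3);

   printf("reg = 0x%08x, zfunc = %u\n", (unsigned)reg, (unsigned)G_FIELD_ZFUNC(reg));

   /* Rewriting one field: clear it with the C_ mask, then OR in the new S_ value. */
   reg = (reg & C_FIELD_ZFUNC) | S_FIELD_ZFUNC(6);
   printf("reg = 0x%08x, zfunc = %u\n", (unsigned)reg, (unsigned)G_FIELD_ZFUNC(reg));
   return 0;
}
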
static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_state_dsa *old_dsa = sctx->queued.named.dsa;
- struct si_state_dsa *dsa = state;
-
- if (!dsa)
- dsa = (struct si_state_dsa *)sctx->noop_dsa;
-
- si_pm4_bind_state(sctx, dsa, dsa);
-
- if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part,
- sizeof(struct si_dsa_stencil_ref_part)) != 0) {
- sctx->stencil_ref.dsa_part = dsa->stencil_ref;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
- }
-
- if (old_dsa->alpha_func != dsa->alpha_func)
- sctx->do_update_shaders = true;
-
- if (sctx->screen->dpbb_allowed &&
- ((old_dsa->depth_enabled != dsa->depth_enabled ||
- old_dsa->stencil_enabled != dsa->stencil_enabled ||
- old_dsa->db_can_write != dsa->db_can_write)))
- si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
-
- if (sctx->screen->has_out_of_order_rast &&
- (memcmp(old_dsa->order_invariance, dsa->order_invariance,
- sizeof(old_dsa->order_invariance))))
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_state_dsa *old_dsa = sctx->queued.named.dsa;
+ struct si_state_dsa *dsa = state;
+
+ if (!dsa)
+ dsa = (struct si_state_dsa *)sctx->noop_dsa;
+
+ si_pm4_bind_state(sctx, dsa, dsa);
+
+ if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part,
+ sizeof(struct si_dsa_stencil_ref_part)) != 0) {
+ sctx->stencil_ref.dsa_part = dsa->stencil_ref;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
+ }
+
+ if (old_dsa->alpha_func != dsa->alpha_func)
+ sctx->do_update_shaders = true;
+
+ if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled ||
+ old_dsa->stencil_enabled != dsa->stencil_enabled ||
+ old_dsa->db_can_write != dsa->db_can_write)))
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+
+ if (sctx->screen->has_out_of_order_rast &&
+ (memcmp(old_dsa->order_invariance, dsa->order_invariance,
+ sizeof(old_dsa->order_invariance))))
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
}
static void si_delete_dsa_state(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
+ struct si_context *sctx = (struct si_context *)ctx;
- if (sctx->queued.named.dsa == state)
- si_bind_dsa_state(ctx, sctx->noop_dsa);
+ if (sctx->queued.named.dsa == state)
+ si_bind_dsa_state(ctx, sctx->noop_dsa);
- si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state);
+ si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state);
}
static void *si_create_db_flush_dsa(struct si_context *sctx)
{
- struct pipe_depth_stencil_alpha_state dsa = {};
+ struct pipe_depth_stencil_alpha_state dsa = {};
- return sctx->b.create_depth_stencil_alpha_state(&sctx->b, &dsa);
+ return sctx->b.create_depth_stencil_alpha_state(&sctx->b, &dsa);
}
/* DB RENDER STATE */
static void si_set_active_query_state(struct pipe_context *ctx, bool enable)
{
- struct si_context *sctx = (struct si_context*)ctx;
-
- /* Pipeline stat & streamout queries. */
- if (enable) {
- sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
- sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
- } else {
- sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
- sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
- }
-
- /* Occlusion queries. */
- if (sctx->occlusion_queries_disabled != !enable) {
- sctx->occlusion_queries_disabled = !enable;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
- }
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ /* Pipeline stat & streamout queries. */
+ if (enable) {
+ sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
+ sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+ } else {
+ sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
+ sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+ }
+
+ /* Occlusion queries. */
+ if (sctx->occlusion_queries_disabled != !enable) {
+ sctx->occlusion_queries_disabled = !enable;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ }
}
-void si_set_occlusion_query_state(struct si_context *sctx,
- bool old_perfect_enable)
+void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enable)
{
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
- bool perfect_enable = sctx->num_perfect_occlusion_queries != 0;
+ bool perfect_enable = sctx->num_perfect_occlusion_queries != 0;
- if (perfect_enable != old_perfect_enable)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+ if (perfect_enable != old_perfect_enable)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
}
void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
{
- st->saved_compute = sctx->cs_shader_state.program;
+ st->saved_compute = sctx->cs_shader_state.program;
- si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
- si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
+ si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
+ si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
- st->saved_ssbo_writable_mask = 0;
+ st->saved_ssbo_writable_mask = 0;
- for (unsigned i = 0; i < 3; i++) {
- if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
- (1u << si_get_shaderbuf_slot(i)))
- st->saved_ssbo_writable_mask |= 1 << i;
- }
+ for (unsigned i = 0; i < 3; i++) {
+ if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+ (1u << si_get_shaderbuf_slot(i)))
+ st->saved_ssbo_writable_mask |= 1 << i;
+ }
}
void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
{
- sctx->b.bind_compute_state(&sctx->b, st->saved_compute);
+ sctx->b.bind_compute_state(&sctx->b, st->saved_compute);
- sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
- pipe_resource_reference(&st->saved_const0.buffer, NULL);
+ sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
+ pipe_resource_reference(&st->saved_const0.buffer, NULL);
- sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo,
- st->saved_ssbo_writable_mask);
- for (unsigned i = 0; i < 3; ++i)
- pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
+ sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo,
+ st->saved_ssbo_writable_mask);
+ for (unsigned i = 0; i < 3; ++i)
+ pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
}
static void si_emit_db_render_state(struct si_context *sctx)
{
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- unsigned db_shader_control, db_render_control, db_count_control;
- unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-
- /* DB_RENDER_CONTROL */
- if (sctx->dbcb_depth_copy_enabled ||
- sctx->dbcb_stencil_copy_enabled) {
- db_render_control =
- S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
- S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
- S_028000_COPY_CENTROID(1) |
- S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample);
- } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) {
- db_render_control =
- S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |
- S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace);
- } else {
- db_render_control =
- S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |
- S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear);
- }
-
- /* DB_COUNT_CONTROL (occlusion queries) */
- if (sctx->num_occlusion_queries > 0 &&
- !sctx->occlusion_queries_disabled) {
- bool perfect = sctx->num_perfect_occlusion_queries > 0;
- bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect;
-
- if (sctx->chip_class >= GFX7) {
- unsigned log_sample_rate = sctx->framebuffer.log_samples;
-
- /* Stoney doesn't increment occlusion query counters
- * if the sample rate is 16x. Use 8x sample rate instead.
- */
- if (sctx->family == CHIP_STONEY)
- log_sample_rate = MIN2(log_sample_rate, 3);
-
- db_count_control =
- S_028004_PERFECT_ZPASS_COUNTS(perfect) |
- S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
- S_028004_SAMPLE_RATE(log_sample_rate) |
- S_028004_ZPASS_ENABLE(1) |
- S_028004_SLICE_EVEN_ENABLE(1) |
- S_028004_SLICE_ODD_ENABLE(1);
- } else {
- db_count_control =
- S_028004_PERFECT_ZPASS_COUNTS(perfect) |
- S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples);
- }
- } else {
- /* Disable occlusion queries. */
- if (sctx->chip_class >= GFX7) {
- db_count_control = 0;
- } else {
- db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
- }
- }
-
- radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL,
- SI_TRACKED_DB_RENDER_CONTROL, db_render_control,
- db_count_control);
-
- /* DB_RENDER_OVERRIDE2 */
- radeon_opt_set_context_reg(sctx, R_028010_DB_RENDER_OVERRIDE2,
- SI_TRACKED_DB_RENDER_OVERRIDE2,
- S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) |
- S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |
- S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4));
-
- db_shader_control = sctx->ps_db_shader_control;
-
- /* Bug workaround for smoothing (overrasterization) on GFX6. */
- if (sctx->chip_class == GFX6 && sctx->smoothing_enabled) {
- db_shader_control &= C_02880C_Z_ORDER;
- db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z);
- }
-
- /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. */
- if (!rs->multisample_enable)
- db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;
-
- if (sctx->screen->info.has_rbplus &&
- !sctx->screen->info.rbplus_allowed)
- db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
-
- radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL,
- SI_TRACKED_DB_SHADER_CONTROL, db_shader_control);
-
- if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll = true;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+ unsigned db_shader_control, db_render_control, db_count_control;
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
+ /* DB_RENDER_CONTROL */
+ if (sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled) {
+ db_render_control = S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
+ S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
+ S_028000_COPY_CENTROID(1) | S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample);
+ } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) {
+ db_render_control = S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |
+ S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace);
+ } else {
+ db_render_control = S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |
+ S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear);
+ }
+
+ /* DB_COUNT_CONTROL (occlusion queries) */
+ if (sctx->num_occlusion_queries > 0 && !sctx->occlusion_queries_disabled) {
+ bool perfect = sctx->num_perfect_occlusion_queries > 0;
+ bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect;
+
+ if (sctx->chip_class >= GFX7) {
+ unsigned log_sample_rate = sctx->framebuffer.log_samples;
+
+ /* Stoney doesn't increment occlusion query counters
+ * if the sample rate is 16x. Use 8x sample rate instead.
+ */
+ if (sctx->family == CHIP_STONEY)
+ log_sample_rate = MIN2(log_sample_rate, 3);
+
+ db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) |
+ S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
+ S_028004_SAMPLE_RATE(log_sample_rate) | S_028004_ZPASS_ENABLE(1) |
+ S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
+ } else {
+ db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) |
+ S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples);
+ }
+ } else {
+ /* Disable occlusion queries. */
+ if (sctx->chip_class >= GFX7) {
+ db_count_control = 0;
+ } else {
+ db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
+ }
+ }
+
+ radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL,
+ db_render_control, db_count_control);
+
+ /* DB_RENDER_OVERRIDE2 */
+ radeon_opt_set_context_reg(
+ sctx, R_028010_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_RENDER_OVERRIDE2,
+ S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) |
+ S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |
+ S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4));
+
+ db_shader_control = sctx->ps_db_shader_control;
+
+ /* Bug workaround for smoothing (overrasterization) on GFX6. */
+ if (sctx->chip_class == GFX6 && sctx->smoothing_enabled) {
+ db_shader_control &= C_02880C_Z_ORDER;
+ db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z);
+ }
+
+ /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. */
+ if (!rs->multisample_enable)
+ db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;
+
+ if (sctx->screen->info.has_rbplus && !sctx->screen->info.rbplus_allowed)
+ db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
+
+ radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL,
+ db_shader_control);
+
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll = true;
}
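
The SAMPLE_RATE field written above is a log2 value, which is why the workaround clamps it to 3 (8x) rather than 4 (16x). A small standalone sketch of that computation; the clamp flag is a stand-in for the affected-chip check, not the real family enum:

#include <stdbool.h>
#include <stdio.h>

/* log2 of a power-of-two sample count (1, 2, 4, 8, 16). */
static unsigned log2_samples(unsigned n)
{
   unsigned r = 0;
   while (n > 1) {
      n >>= 1;
      r++;
   }
   return r;
}

static unsigned db_sample_rate(unsigned nr_samples, bool clamp_to_8x)
{
   unsigned log_rate = log2_samples(nr_samples);

   if (clamp_to_8x && log_rate > 3)
      log_rate = 3; /* report 8x even for a 16x framebuffer */
   return log_rate;
}

int main(void)
{
   printf("16 samples, no clamp: SAMPLE_RATE = %u\n", db_sample_rate(16, false)); /* 4 */
   printf("16 samples, clamped:  SAMPLE_RATE = %u\n", db_sample_rate(16, true));  /* 3 */
   return 0;
}
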
/*
 * format translation
 */
static uint32_t si_translate_colorformat(enum pipe_format format)
{
- const struct util_format_description *desc = util_format_description(format);
- if (!desc)
- return V_028C70_COLOR_INVALID;
-
-#define HAS_SIZE(x,y,z,w) \
- (desc->channel[0].size == (x) && desc->channel[1].size == (y) && \
- desc->channel[2].size == (z) && desc->channel[3].size == (w))
-
- if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
- return V_028C70_COLOR_10_11_11;
-
- if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
- return V_028C70_COLOR_INVALID;
-
- /* hw cannot support mixed formats (except depth/stencil, since
- * stencil is not written to). */
- if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
- return V_028C70_COLOR_INVALID;
-
- switch (desc->nr_channels) {
- case 1:
- switch (desc->channel[0].size) {
- case 8:
- return V_028C70_COLOR_8;
- case 16:
- return V_028C70_COLOR_16;
- case 32:
- return V_028C70_COLOR_32;
- }
- break;
- case 2:
- if (desc->channel[0].size == desc->channel[1].size) {
- switch (desc->channel[0].size) {
- case 8:
- return V_028C70_COLOR_8_8;
- case 16:
- return V_028C70_COLOR_16_16;
- case 32:
- return V_028C70_COLOR_32_32;
- }
- } else if (HAS_SIZE(8,24,0,0)) {
- return V_028C70_COLOR_24_8;
- } else if (HAS_SIZE(24,8,0,0)) {
- return V_028C70_COLOR_8_24;
- }
- break;
- case 3:
- if (HAS_SIZE(5,6,5,0)) {
- return V_028C70_COLOR_5_6_5;
- } else if (HAS_SIZE(32,8,24,0)) {
- return V_028C70_COLOR_X24_8_32_FLOAT;
- }
- break;
- case 4:
- if (desc->channel[0].size == desc->channel[1].size &&
- desc->channel[0].size == desc->channel[2].size &&
- desc->channel[0].size == desc->channel[3].size) {
- switch (desc->channel[0].size) {
- case 4:
- return V_028C70_COLOR_4_4_4_4;
- case 8:
- return V_028C70_COLOR_8_8_8_8;
- case 16:
- return V_028C70_COLOR_16_16_16_16;
- case 32:
- return V_028C70_COLOR_32_32_32_32;
- }
- } else if (HAS_SIZE(5,5,5,1)) {
- return V_028C70_COLOR_1_5_5_5;
- } else if (HAS_SIZE(1,5,5,5)) {
- return V_028C70_COLOR_5_5_5_1;
- } else if (HAS_SIZE(10,10,10,2)) {
- return V_028C70_COLOR_2_10_10_10;
- }
- break;
- }
- return V_028C70_COLOR_INVALID;
+ const struct util_format_description *desc = util_format_description(format);
+ if (!desc)
+ return V_028C70_COLOR_INVALID;
+
+#define HAS_SIZE(x, y, z, w) \
+ (desc->channel[0].size == (x) && desc->channel[1].size == (y) && \
+ desc->channel[2].size == (z) && desc->channel[3].size == (w))
+
+ if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
+ return V_028C70_COLOR_10_11_11;
+
+ if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+ return V_028C70_COLOR_INVALID;
+
+ /* hw cannot support mixed formats (except depth/stencil, since
+ * stencil is not written to). */
+ if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
+ return V_028C70_COLOR_INVALID;
+
+ switch (desc->nr_channels) {
+ case 1:
+ switch (desc->channel[0].size) {
+ case 8:
+ return V_028C70_COLOR_8;
+ case 16:
+ return V_028C70_COLOR_16;
+ case 32:
+ return V_028C70_COLOR_32;
+ }
+ break;
+ case 2:
+ if (desc->channel[0].size == desc->channel[1].size) {
+ switch (desc->channel[0].size) {
+ case 8:
+ return V_028C70_COLOR_8_8;
+ case 16:
+ return V_028C70_COLOR_16_16;
+ case 32:
+ return V_028C70_COLOR_32_32;
+ }
+ } else if (HAS_SIZE(8, 24, 0, 0)) {
+ return V_028C70_COLOR_24_8;
+ } else if (HAS_SIZE(24, 8, 0, 0)) {
+ return V_028C70_COLOR_8_24;
+ }
+ break;
+ case 3:
+ if (HAS_SIZE(5, 6, 5, 0)) {
+ return V_028C70_COLOR_5_6_5;
+ } else if (HAS_SIZE(32, 8, 24, 0)) {
+ return V_028C70_COLOR_X24_8_32_FLOAT;
+ }
+ break;
+ case 4:
+ if (desc->channel[0].size == desc->channel[1].size &&
+ desc->channel[0].size == desc->channel[2].size &&
+ desc->channel[0].size == desc->channel[3].size) {
+ switch (desc->channel[0].size) {
+ case 4:
+ return V_028C70_COLOR_4_4_4_4;
+ case 8:
+ return V_028C70_COLOR_8_8_8_8;
+ case 16:
+ return V_028C70_COLOR_16_16_16_16;
+ case 32:
+ return V_028C70_COLOR_32_32_32_32;
+ }
+ } else if (HAS_SIZE(5, 5, 5, 1)) {
+ return V_028C70_COLOR_1_5_5_5;
+ } else if (HAS_SIZE(1, 5, 5, 5)) {
+ return V_028C70_COLOR_5_5_5_1;
+ } else if (HAS_SIZE(10, 10, 10, 2)) {
+ return V_028C70_COLOR_2_10_10_10;
+ }
+ break;
+ }
+ return V_028C70_COLOR_INVALID;
}
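
The HAS_SIZE() helper above only compares the four per-channel bit sizes from the format description. A tiny standalone sketch of the same matching idea, using a hypothetical trimmed-down descriptor instead of util_format_description():

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical mini descriptor: per-channel bit sizes only. */
struct fmt_desc {
   const char *name;
   unsigned channel_size[4];
};

static bool has_size(const struct fmt_desc *d, unsigned x, unsigned y, unsigned z, unsigned w)
{
   return d->channel_size[0] == x && d->channel_size[1] == y &&
          d->channel_size[2] == z && d->channel_size[3] == w;
}

int main(void)
{
   struct fmt_desc b5g6r5 = {"B5G6R5", {5, 6, 5, 0}};
   struct fmt_desc r10g10b10a2 = {"R10G10B10A2", {10, 10, 10, 2}};

   printf("%s matches 5-6-5:        %d\n", b5g6r5.name, has_size(&b5g6r5, 5, 6, 5, 0));
   printf("%s matches 10-10-10-2: %d\n", r10g10b10a2.name, has_size(&r10g10b10a2, 10, 10, 10, 2));
   return 0;
}
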
static uint32_t si_colorformat_endian_swap(uint32_t colorformat)
{
- if (SI_BIG_ENDIAN) {
- switch(colorformat) {
- /* 8-bit buffers. */
- case V_028C70_COLOR_8:
- return V_028C70_ENDIAN_NONE;
-
- /* 16-bit buffers. */
- case V_028C70_COLOR_5_6_5:
- case V_028C70_COLOR_1_5_5_5:
- case V_028C70_COLOR_4_4_4_4:
- case V_028C70_COLOR_16:
- case V_028C70_COLOR_8_8:
- return V_028C70_ENDIAN_8IN16;
-
- /* 32-bit buffers. */
- case V_028C70_COLOR_8_8_8_8:
- case V_028C70_COLOR_2_10_10_10:
- case V_028C70_COLOR_8_24:
- case V_028C70_COLOR_24_8:
- case V_028C70_COLOR_16_16:
- return V_028C70_ENDIAN_8IN32;
-
- /* 64-bit buffers. */
- case V_028C70_COLOR_16_16_16_16:
- return V_028C70_ENDIAN_8IN16;
-
- case V_028C70_COLOR_32_32:
- return V_028C70_ENDIAN_8IN32;
-
- /* 128-bit buffers. */
- case V_028C70_COLOR_32_32_32_32:
- return V_028C70_ENDIAN_8IN32;
- default:
- return V_028C70_ENDIAN_NONE; /* Unsupported. */
- }
- } else {
- return V_028C70_ENDIAN_NONE;
- }
+ if (SI_BIG_ENDIAN) {
+ switch (colorformat) {
+ /* 8-bit buffers. */
+ case V_028C70_COLOR_8:
+ return V_028C70_ENDIAN_NONE;
+
+ /* 16-bit buffers. */
+ case V_028C70_COLOR_5_6_5:
+ case V_028C70_COLOR_1_5_5_5:
+ case V_028C70_COLOR_4_4_4_4:
+ case V_028C70_COLOR_16:
+ case V_028C70_COLOR_8_8:
+ return V_028C70_ENDIAN_8IN16;
+
+ /* 32-bit buffers. */
+ case V_028C70_COLOR_8_8_8_8:
+ case V_028C70_COLOR_2_10_10_10:
+ case V_028C70_COLOR_8_24:
+ case V_028C70_COLOR_24_8:
+ case V_028C70_COLOR_16_16:
+ return V_028C70_ENDIAN_8IN32;
+
+ /* 64-bit buffers. */
+ case V_028C70_COLOR_16_16_16_16:
+ return V_028C70_ENDIAN_8IN16;
+
+ case V_028C70_COLOR_32_32:
+ return V_028C70_ENDIAN_8IN32;
+
+ /* 128-bit buffers. */
+ case V_028C70_COLOR_32_32_32_32:
+ return V_028C70_ENDIAN_8IN32;
+ default:
+ return V_028C70_ENDIAN_NONE; /* Unsupported. */
+ }
+ } else {
+ return V_028C70_ENDIAN_NONE;
+ }
}
static uint32_t si_translate_dbformat(enum pipe_format format)
{
- switch (format) {
- case PIPE_FORMAT_Z16_UNORM:
- return V_028040_Z_16;
- case PIPE_FORMAT_S8_UINT_Z24_UNORM:
- case PIPE_FORMAT_X8Z24_UNORM:
- case PIPE_FORMAT_Z24X8_UNORM:
- case PIPE_FORMAT_Z24_UNORM_S8_UINT:
- return V_028040_Z_24; /* deprecated on AMD GCN */
- case PIPE_FORMAT_Z32_FLOAT:
- case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
- return V_028040_Z_32_FLOAT;
- default:
- return V_028040_Z_INVALID;
- }
+ switch (format) {
+ case PIPE_FORMAT_Z16_UNORM:
+ return V_028040_Z_16;
+ case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+ case PIPE_FORMAT_X8Z24_UNORM:
+ case PIPE_FORMAT_Z24X8_UNORM:
+ case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+ return V_028040_Z_24; /* deprecated on AMD GCN */
+ case PIPE_FORMAT_Z32_FLOAT:
+ case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+ return V_028040_Z_32_FLOAT;
+ default:
+ return V_028040_Z_INVALID;
+ }
}
/*
* Texture translation
*/
-static uint32_t si_translate_texformat(struct pipe_screen *screen,
- enum pipe_format format,
- const struct util_format_description *desc,
- int first_non_void)
+static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_format format,
+ const struct util_format_description *desc,
+ int first_non_void)
{
- struct si_screen *sscreen = (struct si_screen*)screen;
- bool uniform = true;
- int i;
-
- assert(sscreen->info.chip_class <= GFX9);
-
- /* Colorspace (return non-RGB formats directly). */
- switch (desc->colorspace) {
- /* Depth stencil formats */
- case UTIL_FORMAT_COLORSPACE_ZS:
- switch (format) {
- case PIPE_FORMAT_Z16_UNORM:
- return V_008F14_IMG_DATA_FORMAT_16;
- case PIPE_FORMAT_X24S8_UINT:
- case PIPE_FORMAT_S8X24_UINT:
- /*
- * Implemented as an 8_8_8_8 data format to fix texture
- * gathers in stencil sampling. This affects at least
- * GL45-CTS.texture_cube_map_array.sampling on GFX8.
- */
- if (sscreen->info.chip_class <= GFX8)
- return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
-
- if (format == PIPE_FORMAT_X24S8_UINT)
- return V_008F14_IMG_DATA_FORMAT_8_24;
- else
- return V_008F14_IMG_DATA_FORMAT_24_8;
- case PIPE_FORMAT_Z24X8_UNORM:
- case PIPE_FORMAT_Z24_UNORM_S8_UINT:
- return V_008F14_IMG_DATA_FORMAT_8_24;
- case PIPE_FORMAT_X8Z24_UNORM:
- case PIPE_FORMAT_S8_UINT_Z24_UNORM:
- return V_008F14_IMG_DATA_FORMAT_24_8;
- case PIPE_FORMAT_S8_UINT:
- return V_008F14_IMG_DATA_FORMAT_8;
- case PIPE_FORMAT_Z32_FLOAT:
- return V_008F14_IMG_DATA_FORMAT_32;
- case PIPE_FORMAT_X32_S8X24_UINT:
- case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
- return V_008F14_IMG_DATA_FORMAT_X24_8_32;
- default:
- goto out_unknown;
- }
-
- case UTIL_FORMAT_COLORSPACE_YUV:
- goto out_unknown; /* TODO */
-
- case UTIL_FORMAT_COLORSPACE_SRGB:
- if (desc->nr_channels != 4 && desc->nr_channels != 1)
- goto out_unknown;
- break;
-
- default:
- break;
- }
-
- if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
- if (!sscreen->info.has_format_bc1_through_bc7)
- goto out_unknown;
-
- switch (format) {
- case PIPE_FORMAT_RGTC1_SNORM:
- case PIPE_FORMAT_LATC1_SNORM:
- case PIPE_FORMAT_RGTC1_UNORM:
- case PIPE_FORMAT_LATC1_UNORM:
- return V_008F14_IMG_DATA_FORMAT_BC4;
- case PIPE_FORMAT_RGTC2_SNORM:
- case PIPE_FORMAT_LATC2_SNORM:
- case PIPE_FORMAT_RGTC2_UNORM:
- case PIPE_FORMAT_LATC2_UNORM:
- return V_008F14_IMG_DATA_FORMAT_BC5;
- default:
- goto out_unknown;
- }
- }
-
- if (desc->layout == UTIL_FORMAT_LAYOUT_ETC &&
- (sscreen->info.family == CHIP_STONEY ||
- sscreen->info.family == CHIP_VEGA10 ||
- sscreen->info.family == CHIP_RAVEN)) {
- switch (format) {
- case PIPE_FORMAT_ETC1_RGB8:
- case PIPE_FORMAT_ETC2_RGB8:
- case PIPE_FORMAT_ETC2_SRGB8:
- return V_008F14_IMG_DATA_FORMAT_ETC2_RGB;
- case PIPE_FORMAT_ETC2_RGB8A1:
- case PIPE_FORMAT_ETC2_SRGB8A1:
- return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1;
- case PIPE_FORMAT_ETC2_RGBA8:
- case PIPE_FORMAT_ETC2_SRGBA8:
- return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA;
- case PIPE_FORMAT_ETC2_R11_UNORM:
- case PIPE_FORMAT_ETC2_R11_SNORM:
- return V_008F14_IMG_DATA_FORMAT_ETC2_R;
- case PIPE_FORMAT_ETC2_RG11_UNORM:
- case PIPE_FORMAT_ETC2_RG11_SNORM:
- return V_008F14_IMG_DATA_FORMAT_ETC2_RG;
- default:
- goto out_unknown;
- }
- }
-
- if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
- if (!sscreen->info.has_format_bc1_through_bc7)
- goto out_unknown;
-
- switch (format) {
- case PIPE_FORMAT_BPTC_RGBA_UNORM:
- case PIPE_FORMAT_BPTC_SRGBA:
- return V_008F14_IMG_DATA_FORMAT_BC7;
- case PIPE_FORMAT_BPTC_RGB_FLOAT:
- case PIPE_FORMAT_BPTC_RGB_UFLOAT:
- return V_008F14_IMG_DATA_FORMAT_BC6;
- default:
- goto out_unknown;
- }
- }
-
- if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
- switch (format) {
- case PIPE_FORMAT_R8G8_B8G8_UNORM:
- case PIPE_FORMAT_G8R8_B8R8_UNORM:
- return V_008F14_IMG_DATA_FORMAT_GB_GR;
- case PIPE_FORMAT_G8R8_G8B8_UNORM:
- case PIPE_FORMAT_R8G8_R8B8_UNORM:
- return V_008F14_IMG_DATA_FORMAT_BG_RG;
- default:
- goto out_unknown;
- }
- }
-
- if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
- if (!sscreen->info.has_format_bc1_through_bc7)
- goto out_unknown;
-
- switch (format) {
- case PIPE_FORMAT_DXT1_RGB:
- case PIPE_FORMAT_DXT1_RGBA:
- case PIPE_FORMAT_DXT1_SRGB:
- case PIPE_FORMAT_DXT1_SRGBA:
- return V_008F14_IMG_DATA_FORMAT_BC1;
- case PIPE_FORMAT_DXT3_RGBA:
- case PIPE_FORMAT_DXT3_SRGBA:
- return V_008F14_IMG_DATA_FORMAT_BC2;
- case PIPE_FORMAT_DXT5_RGBA:
- case PIPE_FORMAT_DXT5_SRGBA:
- return V_008F14_IMG_DATA_FORMAT_BC3;
- default:
- goto out_unknown;
- }
- }
-
- if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
- return V_008F14_IMG_DATA_FORMAT_5_9_9_9;
- } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
- return V_008F14_IMG_DATA_FORMAT_10_11_11;
- }
-
- /* R8G8Bx_SNORM - TODO CxV8U8 */
-
- /* hw cannot support mixed formats (except depth/stencil, since only
- * depth is read).*/
- if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
- goto out_unknown;
-
- /* See whether the components are of the same size. */
- for (i = 1; i < desc->nr_channels; i++) {
- uniform = uniform && desc->channel[0].size == desc->channel[i].size;
- }
-
- /* Non-uniform formats. */
- if (!uniform) {
- switch(desc->nr_channels) {
- case 3:
- if (desc->channel[0].size == 5 &&
- desc->channel[1].size == 6 &&
- desc->channel[2].size == 5) {
- return V_008F14_IMG_DATA_FORMAT_5_6_5;
- }
- goto out_unknown;
- case 4:
- if (desc->channel[0].size == 5 &&
- desc->channel[1].size == 5 &&
- desc->channel[2].size == 5 &&
- desc->channel[3].size == 1) {
- return V_008F14_IMG_DATA_FORMAT_1_5_5_5;
- }
- if (desc->channel[0].size == 1 &&
- desc->channel[1].size == 5 &&
- desc->channel[2].size == 5 &&
- desc->channel[3].size == 5) {
- return V_008F14_IMG_DATA_FORMAT_5_5_5_1;
- }
- if (desc->channel[0].size == 10 &&
- desc->channel[1].size == 10 &&
- desc->channel[2].size == 10 &&
- desc->channel[3].size == 2) {
- return V_008F14_IMG_DATA_FORMAT_2_10_10_10;
- }
- goto out_unknown;
- }
- goto out_unknown;
- }
-
- if (first_non_void < 0 || first_non_void > 3)
- goto out_unknown;
-
- /* uniform formats */
- switch (desc->channel[first_non_void].size) {
- case 4:
- switch (desc->nr_channels) {
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ bool uniform = true;
+ int i;
+
+ assert(sscreen->info.chip_class <= GFX9);
+
+ /* Colorspace (return non-RGB formats directly). */
+ switch (desc->colorspace) {
+ /* Depth stencil formats */
+ case UTIL_FORMAT_COLORSPACE_ZS:
+ switch (format) {
+ case PIPE_FORMAT_Z16_UNORM:
+ return V_008F14_IMG_DATA_FORMAT_16;
+ case PIPE_FORMAT_X24S8_UINT:
+ case PIPE_FORMAT_S8X24_UINT:
+ /*
+ * Implemented as an 8_8_8_8 data format to fix texture
+ * gathers in stencil sampling. This affects at least
+ * GL45-CTS.texture_cube_map_array.sampling on GFX8.
+ */
+ if (sscreen->info.chip_class <= GFX8)
+ return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
+
+ if (format == PIPE_FORMAT_X24S8_UINT)
+ return V_008F14_IMG_DATA_FORMAT_8_24;
+ else
+ return V_008F14_IMG_DATA_FORMAT_24_8;
+ case PIPE_FORMAT_Z24X8_UNORM:
+ case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+ return V_008F14_IMG_DATA_FORMAT_8_24;
+ case PIPE_FORMAT_X8Z24_UNORM:
+ case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+ return V_008F14_IMG_DATA_FORMAT_24_8;
+ case PIPE_FORMAT_S8_UINT:
+ return V_008F14_IMG_DATA_FORMAT_8;
+ case PIPE_FORMAT_Z32_FLOAT:
+ return V_008F14_IMG_DATA_FORMAT_32;
+ case PIPE_FORMAT_X32_S8X24_UINT:
+ case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+ return V_008F14_IMG_DATA_FORMAT_X24_8_32;
+ default:
+ goto out_unknown;
+ }
+
+ case UTIL_FORMAT_COLORSPACE_YUV:
+ goto out_unknown; /* TODO */
+
+ case UTIL_FORMAT_COLORSPACE_SRGB:
+ if (desc->nr_channels != 4 && desc->nr_channels != 1)
+ goto out_unknown;
+ break;
+
+ default:
+ break;
+ }
+
+ if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
+ if (!sscreen->info.has_format_bc1_through_bc7)
+ goto out_unknown;
+
+ switch (format) {
+ case PIPE_FORMAT_RGTC1_SNORM:
+ case PIPE_FORMAT_LATC1_SNORM:
+ case PIPE_FORMAT_RGTC1_UNORM:
+ case PIPE_FORMAT_LATC1_UNORM:
+ return V_008F14_IMG_DATA_FORMAT_BC4;
+ case PIPE_FORMAT_RGTC2_SNORM:
+ case PIPE_FORMAT_LATC2_SNORM:
+ case PIPE_FORMAT_RGTC2_UNORM:
+ case PIPE_FORMAT_LATC2_UNORM:
+ return V_008F14_IMG_DATA_FORMAT_BC5;
+ default:
+ goto out_unknown;
+ }
+ }
+
+ if (desc->layout == UTIL_FORMAT_LAYOUT_ETC &&
+ (sscreen->info.family == CHIP_STONEY || sscreen->info.family == CHIP_VEGA10 ||
+ sscreen->info.family == CHIP_RAVEN)) {
+ switch (format) {
+ case PIPE_FORMAT_ETC1_RGB8:
+ case PIPE_FORMAT_ETC2_RGB8:
+ case PIPE_FORMAT_ETC2_SRGB8:
+ return V_008F14_IMG_DATA_FORMAT_ETC2_RGB;
+ case PIPE_FORMAT_ETC2_RGB8A1:
+ case PIPE_FORMAT_ETC2_SRGB8A1:
+ return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1;
+ case PIPE_FORMAT_ETC2_RGBA8:
+ case PIPE_FORMAT_ETC2_SRGBA8:
+ return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA;
+ case PIPE_FORMAT_ETC2_R11_UNORM:
+ case PIPE_FORMAT_ETC2_R11_SNORM:
+ return V_008F14_IMG_DATA_FORMAT_ETC2_R;
+ case PIPE_FORMAT_ETC2_RG11_UNORM:
+ case PIPE_FORMAT_ETC2_RG11_SNORM:
+ return V_008F14_IMG_DATA_FORMAT_ETC2_RG;
+ default:
+ goto out_unknown;
+ }
+ }
+
+ if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
+ if (!sscreen->info.has_format_bc1_through_bc7)
+ goto out_unknown;
+
+ switch (format) {
+ case PIPE_FORMAT_BPTC_RGBA_UNORM:
+ case PIPE_FORMAT_BPTC_SRGBA:
+ return V_008F14_IMG_DATA_FORMAT_BC7;
+ case PIPE_FORMAT_BPTC_RGB_FLOAT:
+ case PIPE_FORMAT_BPTC_RGB_UFLOAT:
+ return V_008F14_IMG_DATA_FORMAT_BC6;
+ default:
+ goto out_unknown;
+ }
+ }
+
+ if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
+ switch (format) {
+ case PIPE_FORMAT_R8G8_B8G8_UNORM:
+ case PIPE_FORMAT_G8R8_B8R8_UNORM:
+ return V_008F14_IMG_DATA_FORMAT_GB_GR;
+ case PIPE_FORMAT_G8R8_G8B8_UNORM:
+ case PIPE_FORMAT_R8G8_R8B8_UNORM:
+ return V_008F14_IMG_DATA_FORMAT_BG_RG;
+ default:
+ goto out_unknown;
+ }
+ }
+
+ if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+ if (!sscreen->info.has_format_bc1_through_bc7)
+ goto out_unknown;
+
+ switch (format) {
+ case PIPE_FORMAT_DXT1_RGB:
+ case PIPE_FORMAT_DXT1_RGBA:
+ case PIPE_FORMAT_DXT1_SRGB:
+ case PIPE_FORMAT_DXT1_SRGBA:
+ return V_008F14_IMG_DATA_FORMAT_BC1;
+ case PIPE_FORMAT_DXT3_RGBA:
+ case PIPE_FORMAT_DXT3_SRGBA:
+ return V_008F14_IMG_DATA_FORMAT_BC2;
+ case PIPE_FORMAT_DXT5_RGBA:
+ case PIPE_FORMAT_DXT5_SRGBA:
+ return V_008F14_IMG_DATA_FORMAT_BC3;
+ default:
+ goto out_unknown;
+ }
+ }
+
+ if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+ return V_008F14_IMG_DATA_FORMAT_5_9_9_9;
+ } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
+ return V_008F14_IMG_DATA_FORMAT_10_11_11;
+ }
+
+ /* R8G8Bx_SNORM - TODO CxV8U8 */
+
+ /* hw cannot support mixed formats (except depth/stencil, since only
+ * depth is read).*/
+ if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
+ goto out_unknown;
+
+ /* See whether the components are of the same size. */
+ for (i = 1; i < desc->nr_channels; i++) {
+ uniform = uniform && desc->channel[0].size == desc->channel[i].size;
+ }
+
+ /* Non-uniform formats. */
+ if (!uniform) {
+ switch (desc->nr_channels) {
+ case 3:
+ if (desc->channel[0].size == 5 && desc->channel[1].size == 6 &&
+ desc->channel[2].size == 5) {
+ return V_008F14_IMG_DATA_FORMAT_5_6_5;
+ }
+ goto out_unknown;
+ case 4:
+ if (desc->channel[0].size == 5 && desc->channel[1].size == 5 &&
+ desc->channel[2].size == 5 && desc->channel[3].size == 1) {
+ return V_008F14_IMG_DATA_FORMAT_1_5_5_5;
+ }
+ if (desc->channel[0].size == 1 && desc->channel[1].size == 5 &&
+ desc->channel[2].size == 5 && desc->channel[3].size == 5) {
+ return V_008F14_IMG_DATA_FORMAT_5_5_5_1;
+ }
+ if (desc->channel[0].size == 10 && desc->channel[1].size == 10 &&
+ desc->channel[2].size == 10 && desc->channel[3].size == 2) {
+ return V_008F14_IMG_DATA_FORMAT_2_10_10_10;
+ }
+ goto out_unknown;
+ }
+ goto out_unknown;
+ }
+
+ if (first_non_void < 0 || first_non_void > 3)
+ goto out_unknown;
+
+ /* uniform formats */
+ switch (desc->channel[first_non_void].size) {
+ case 4:
+ switch (desc->nr_channels) {
#if 0 /* Not supported for render targets */
case 2:
return V_008F14_IMG_DATA_FORMAT_4_4;
#endif
- case 4:
- return V_008F14_IMG_DATA_FORMAT_4_4_4_4;
- }
- break;
- case 8:
- switch (desc->nr_channels) {
- case 1:
- return V_008F14_IMG_DATA_FORMAT_8;
- case 2:
- return V_008F14_IMG_DATA_FORMAT_8_8;
- case 4:
- return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
- }
- break;
- case 16:
- switch (desc->nr_channels) {
- case 1:
- return V_008F14_IMG_DATA_FORMAT_16;
- case 2:
- return V_008F14_IMG_DATA_FORMAT_16_16;
- case 4:
- return V_008F14_IMG_DATA_FORMAT_16_16_16_16;
- }
- break;
- case 32:
- switch (desc->nr_channels) {
- case 1:
- return V_008F14_IMG_DATA_FORMAT_32;
- case 2:
- return V_008F14_IMG_DATA_FORMAT_32_32;
+ case 4:
+ return V_008F14_IMG_DATA_FORMAT_4_4_4_4;
+ }
+ break;
+ case 8:
+ switch (desc->nr_channels) {
+ case 1:
+ return V_008F14_IMG_DATA_FORMAT_8;
+ case 2:
+ return V_008F14_IMG_DATA_FORMAT_8_8;
+ case 4:
+ return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
+ }
+ break;
+ case 16:
+ switch (desc->nr_channels) {
+ case 1:
+ return V_008F14_IMG_DATA_FORMAT_16;
+ case 2:
+ return V_008F14_IMG_DATA_FORMAT_16_16;
+ case 4:
+ return V_008F14_IMG_DATA_FORMAT_16_16_16_16;
+ }
+ break;
+ case 32:
+ switch (desc->nr_channels) {
+ case 1:
+ return V_008F14_IMG_DATA_FORMAT_32;
+ case 2:
+ return V_008F14_IMG_DATA_FORMAT_32_32;
#if 0 /* Not supported for render targets */
case 3:
return V_008F14_IMG_DATA_FORMAT_32_32_32;
#endif
- case 4:
- return V_008F14_IMG_DATA_FORMAT_32_32_32_32;
- }
- }
+ case 4:
+ return V_008F14_IMG_DATA_FORMAT_32_32_32_32;
+ }
+ }
out_unknown:
- return ~0;
+ return ~0;
}
static unsigned si_tex_wrap(unsigned wrap)
{
- switch (wrap) {
- default:
- case PIPE_TEX_WRAP_REPEAT:
- return V_008F30_SQ_TEX_WRAP;
- case PIPE_TEX_WRAP_CLAMP:
- return V_008F30_SQ_TEX_CLAMP_HALF_BORDER;
- case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
- return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL;
- case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
- return V_008F30_SQ_TEX_CLAMP_BORDER;
- case PIPE_TEX_WRAP_MIRROR_REPEAT:
- return V_008F30_SQ_TEX_MIRROR;
- case PIPE_TEX_WRAP_MIRROR_CLAMP:
- return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER;
- case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
- return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL;
- case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
- return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER;
- }
+ switch (wrap) {
+ default:
+ case PIPE_TEX_WRAP_REPEAT:
+ return V_008F30_SQ_TEX_WRAP;
+ case PIPE_TEX_WRAP_CLAMP:
+ return V_008F30_SQ_TEX_CLAMP_HALF_BORDER;
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL;
+ case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+ return V_008F30_SQ_TEX_CLAMP_BORDER;
+ case PIPE_TEX_WRAP_MIRROR_REPEAT:
+ return V_008F30_SQ_TEX_MIRROR;
+ case PIPE_TEX_WRAP_MIRROR_CLAMP:
+ return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER;
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+ return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL;
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+ return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER;
+ }
}
static unsigned si_tex_mipfilter(unsigned filter)
{
- switch (filter) {
- case PIPE_TEX_MIPFILTER_NEAREST:
- return V_008F38_SQ_TEX_Z_FILTER_POINT;
- case PIPE_TEX_MIPFILTER_LINEAR:
- return V_008F38_SQ_TEX_Z_FILTER_LINEAR;
- default:
- case PIPE_TEX_MIPFILTER_NONE:
- return V_008F38_SQ_TEX_Z_FILTER_NONE;
- }
+ switch (filter) {
+ case PIPE_TEX_MIPFILTER_NEAREST:
+ return V_008F38_SQ_TEX_Z_FILTER_POINT;
+ case PIPE_TEX_MIPFILTER_LINEAR:
+ return V_008F38_SQ_TEX_Z_FILTER_LINEAR;
+ default:
+ case PIPE_TEX_MIPFILTER_NONE:
+ return V_008F38_SQ_TEX_Z_FILTER_NONE;
+ }
}
static unsigned si_tex_compare(unsigned compare)
{
- switch (compare) {
- default:
- case PIPE_FUNC_NEVER:
- return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER;
- case PIPE_FUNC_LESS:
- return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS;
- case PIPE_FUNC_EQUAL:
- return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL;
- case PIPE_FUNC_LEQUAL:
- return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL;
- case PIPE_FUNC_GREATER:
- return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER;
- case PIPE_FUNC_NOTEQUAL:
- return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL;
- case PIPE_FUNC_GEQUAL:
- return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL;
- case PIPE_FUNC_ALWAYS:
- return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS;
- }
+ switch (compare) {
+ default:
+ case PIPE_FUNC_NEVER:
+ return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER;
+ case PIPE_FUNC_LESS:
+ return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS;
+ case PIPE_FUNC_EQUAL:
+ return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL;
+ case PIPE_FUNC_LEQUAL:
+ return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL;
+ case PIPE_FUNC_GREATER:
+ return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER;
+ case PIPE_FUNC_NOTEQUAL:
+ return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL;
+ case PIPE_FUNC_GEQUAL:
+ return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL;
+ case PIPE_FUNC_ALWAYS:
+ return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS;
+ }
}
-static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex,
- unsigned view_target, unsigned nr_samples)
+static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, unsigned view_target,
+ unsigned nr_samples)
{
- unsigned res_target = tex->buffer.b.b.target;
-
- if (view_target == PIPE_TEXTURE_CUBE ||
- view_target == PIPE_TEXTURE_CUBE_ARRAY)
- res_target = view_target;
- /* If interpreting cubemaps as something else, set 2D_ARRAY. */
- else if (res_target == PIPE_TEXTURE_CUBE ||
- res_target == PIPE_TEXTURE_CUBE_ARRAY)
- res_target = PIPE_TEXTURE_2D_ARRAY;
-
- /* GFX9 allocates 1D textures as 2D. */
- if ((res_target == PIPE_TEXTURE_1D ||
- res_target == PIPE_TEXTURE_1D_ARRAY) &&
- sscreen->info.chip_class == GFX9 &&
- tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) {
- if (res_target == PIPE_TEXTURE_1D)
- res_target = PIPE_TEXTURE_2D;
- else
- res_target = PIPE_TEXTURE_2D_ARRAY;
- }
-
- switch (res_target) {
- default:
- case PIPE_TEXTURE_1D:
- return V_008F1C_SQ_RSRC_IMG_1D;
- case PIPE_TEXTURE_1D_ARRAY:
- return V_008F1C_SQ_RSRC_IMG_1D_ARRAY;
- case PIPE_TEXTURE_2D:
- case PIPE_TEXTURE_RECT:
- return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA :
- V_008F1C_SQ_RSRC_IMG_2D;
- case PIPE_TEXTURE_2D_ARRAY:
- return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY :
- V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
- case PIPE_TEXTURE_3D:
- return V_008F1C_SQ_RSRC_IMG_3D;
- case PIPE_TEXTURE_CUBE:
- case PIPE_TEXTURE_CUBE_ARRAY:
- return V_008F1C_SQ_RSRC_IMG_CUBE;
- }
+ unsigned res_target = tex->buffer.b.b.target;
+
+ if (view_target == PIPE_TEXTURE_CUBE || view_target == PIPE_TEXTURE_CUBE_ARRAY)
+ res_target = view_target;
+ /* If interpreting cubemaps as something else, set 2D_ARRAY. */
+ else if (res_target == PIPE_TEXTURE_CUBE || res_target == PIPE_TEXTURE_CUBE_ARRAY)
+ res_target = PIPE_TEXTURE_2D_ARRAY;
+
+ /* GFX9 allocates 1D textures as 2D. */
+ if ((res_target == PIPE_TEXTURE_1D || res_target == PIPE_TEXTURE_1D_ARRAY) &&
+ sscreen->info.chip_class == GFX9 &&
+ tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) {
+ if (res_target == PIPE_TEXTURE_1D)
+ res_target = PIPE_TEXTURE_2D;
+ else
+ res_target = PIPE_TEXTURE_2D_ARRAY;
+ }
+
+ switch (res_target) {
+ default:
+ case PIPE_TEXTURE_1D:
+ return V_008F1C_SQ_RSRC_IMG_1D;
+ case PIPE_TEXTURE_1D_ARRAY:
+ return V_008F1C_SQ_RSRC_IMG_1D_ARRAY;
+ case PIPE_TEXTURE_2D:
+ case PIPE_TEXTURE_RECT:
+ return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA : V_008F1C_SQ_RSRC_IMG_2D;
+ case PIPE_TEXTURE_2D_ARRAY:
+ return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY : V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
+ case PIPE_TEXTURE_3D:
+ return V_008F1C_SQ_RSRC_IMG_3D;
+ case PIPE_TEXTURE_CUBE:
+ case PIPE_TEXTURE_CUBE_ARRAY:
+ return V_008F1C_SQ_RSRC_IMG_CUBE;
+ }
}
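
The target remapping above (cube resources viewed as non-cube become 2D arrays, and 1D textures are promoted to 2D on hardware that allocates them as 2D) is easy to lose in diff form. A compact standalone sketch with stand-in enums; the names and the single bool flag are simplifications, not the gallium or hardware definitions:

#include <stdbool.h>
#include <stdio.h>

enum target { TEX_1D, TEX_1D_ARRAY, TEX_2D, TEX_2D_ARRAY, TEX_3D, TEX_CUBE, TEX_CUBE_ARRAY };

static enum target resolve_target(enum target resource, enum target view, bool alloc_1d_as_2d)
{
   enum target res = resource;

   /* Cube views win; cube resources viewed as anything else become 2D arrays. */
   if (view == TEX_CUBE || view == TEX_CUBE_ARRAY)
      res = view;
   else if (res == TEX_CUBE || res == TEX_CUBE_ARRAY)
      res = TEX_2D_ARRAY;

   /* Hardware that allocates 1D textures as 2D also samples them as 2D. */
   if (alloc_1d_as_2d) {
      if (res == TEX_1D)
         res = TEX_2D;
      else if (res == TEX_1D_ARRAY)
         res = TEX_2D_ARRAY;
   }
   return res;
}

int main(void)
{
   printf("cube viewed as 2D array -> %d (expect %d)\n",
          resolve_target(TEX_CUBE, TEX_2D_ARRAY, false), TEX_2D_ARRAY);
   printf("1D on 1D-as-2D hardware -> %d (expect %d)\n",
          resolve_target(TEX_1D, TEX_1D, true), TEX_2D);
   return 0;
}
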
/*
 * Format support testing
 */
static bool si_is_sampler_format_supported(struct pipe_screen *screen, enum pipe_format format)
{
- struct si_screen *sscreen = (struct si_screen *)screen;
+ struct si_screen *sscreen = (struct si_screen *)screen;
- if (sscreen->info.chip_class >= GFX10) {
- const struct gfx10_format *fmt = &gfx10_format_table[format];
- if (!fmt->img_format || fmt->buffers_only)
- return false;
- return true;
- }
+ if (sscreen->info.chip_class >= GFX10) {
+ const struct gfx10_format *fmt = &gfx10_format_table[format];
+ if (!fmt->img_format || fmt->buffers_only)
+ return false;
+ return true;
+ }
- const struct util_format_description *desc = util_format_description(format);
- if (!desc)
- return false;
+ const struct util_format_description *desc = util_format_description(format);
+ if (!desc)
+ return false;
- return si_translate_texformat(screen, format, desc,
- util_format_get_first_non_void_channel(format)) != ~0U;
+ return si_translate_texformat(screen, format, desc,
+ util_format_get_first_non_void_channel(format)) != ~0U;
}
static uint32_t si_translate_buffer_dataformat(struct pipe_screen *screen,
- const struct util_format_description *desc,
- int first_non_void)
+ const struct util_format_description *desc,
+ int first_non_void)
{
- int i;
-
- assert(((struct si_screen *)screen)->info.chip_class <= GFX9);
-
- if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT)
- return V_008F0C_BUF_DATA_FORMAT_10_11_11;
-
- assert(first_non_void >= 0);
-
- if (desc->nr_channels == 4 &&
- desc->channel[0].size == 10 &&
- desc->channel[1].size == 10 &&
- desc->channel[2].size == 10 &&
- desc->channel[3].size == 2)
- return V_008F0C_BUF_DATA_FORMAT_2_10_10_10;
-
- /* See whether the components are of the same size. */
- for (i = 0; i < desc->nr_channels; i++) {
- if (desc->channel[first_non_void].size != desc->channel[i].size)
- return V_008F0C_BUF_DATA_FORMAT_INVALID;
- }
-
- switch (desc->channel[first_non_void].size) {
- case 8:
- switch (desc->nr_channels) {
- case 1:
- case 3: /* 3 loads */
- return V_008F0C_BUF_DATA_FORMAT_8;
- case 2:
- return V_008F0C_BUF_DATA_FORMAT_8_8;
- case 4:
- return V_008F0C_BUF_DATA_FORMAT_8_8_8_8;
- }
- break;
- case 16:
- switch (desc->nr_channels) {
- case 1:
- case 3: /* 3 loads */
- return V_008F0C_BUF_DATA_FORMAT_16;
- case 2:
- return V_008F0C_BUF_DATA_FORMAT_16_16;
- case 4:
- return V_008F0C_BUF_DATA_FORMAT_16_16_16_16;
- }
- break;
- case 32:
- switch (desc->nr_channels) {
- case 1:
- return V_008F0C_BUF_DATA_FORMAT_32;
- case 2:
- return V_008F0C_BUF_DATA_FORMAT_32_32;
- case 3:
- return V_008F0C_BUF_DATA_FORMAT_32_32_32;
- case 4:
- return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
- }
- break;
- case 64:
- /* Legacy double formats. */
- switch (desc->nr_channels) {
- case 1: /* 1 load */
- return V_008F0C_BUF_DATA_FORMAT_32_32;
- case 2: /* 1 load */
- return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
- case 3: /* 3 loads */
- return V_008F0C_BUF_DATA_FORMAT_32_32;
- case 4: /* 2 loads */
- return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
- }
- break;
- }
-
- return V_008F0C_BUF_DATA_FORMAT_INVALID;
+ int i;
+
+ assert(((struct si_screen *)screen)->info.chip_class <= GFX9);
+
+ if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT)
+ return V_008F0C_BUF_DATA_FORMAT_10_11_11;
+
+ assert(first_non_void >= 0);
+
+ if (desc->nr_channels == 4 && desc->channel[0].size == 10 && desc->channel[1].size == 10 &&
+ desc->channel[2].size == 10 && desc->channel[3].size == 2)
+ return V_008F0C_BUF_DATA_FORMAT_2_10_10_10;
+
+ /* See whether the components are of the same size. */
+ for (i = 0; i < desc->nr_channels; i++) {
+ if (desc->channel[first_non_void].size != desc->channel[i].size)
+ return V_008F0C_BUF_DATA_FORMAT_INVALID;
+ }
+
+ switch (desc->channel[first_non_void].size) {
+ case 8:
+ switch (desc->nr_channels) {
+ case 1:
+ case 3: /* 3 loads */
+ return V_008F0C_BUF_DATA_FORMAT_8;
+ case 2:
+ return V_008F0C_BUF_DATA_FORMAT_8_8;
+ case 4:
+ return V_008F0C_BUF_DATA_FORMAT_8_8_8_8;
+ }
+ break;
+ case 16:
+ switch (desc->nr_channels) {
+ case 1:
+ case 3: /* 3 loads */
+ return V_008F0C_BUF_DATA_FORMAT_16;
+ case 2:
+ return V_008F0C_BUF_DATA_FORMAT_16_16;
+ case 4:
+ return V_008F0C_BUF_DATA_FORMAT_16_16_16_16;
+ }
+ break;
+ case 32:
+ switch (desc->nr_channels) {
+ case 1:
+ return V_008F0C_BUF_DATA_FORMAT_32;
+ case 2:
+ return V_008F0C_BUF_DATA_FORMAT_32_32;
+ case 3:
+ return V_008F0C_BUF_DATA_FORMAT_32_32_32;
+ case 4:
+ return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
+ }
+ break;
+ case 64:
+ /* Legacy double formats. */
+ switch (desc->nr_channels) {
+ case 1: /* 1 load */
+ return V_008F0C_BUF_DATA_FORMAT_32_32;
+ case 2: /* 1 load */
+ return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
+ case 3: /* 3 loads */
+ return V_008F0C_BUF_DATA_FORMAT_32_32;
+ case 4: /* 2 loads */
+ return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
+ }
+ break;
+ }
+
+ return V_008F0C_BUF_DATA_FORMAT_INVALID;
}
static uint32_t si_translate_buffer_numformat(struct pipe_screen *screen,
- const struct util_format_description *desc,
- int first_non_void)
+ const struct util_format_description *desc,
+ int first_non_void)
{
- assert(((struct si_screen *)screen)->info.chip_class <= GFX9);
-
- if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT)
- return V_008F0C_BUF_NUM_FORMAT_FLOAT;
-
- assert(first_non_void >= 0);
-
- switch (desc->channel[first_non_void].type) {
- case UTIL_FORMAT_TYPE_SIGNED:
- case UTIL_FORMAT_TYPE_FIXED:
- if (desc->channel[first_non_void].size >= 32 ||
- desc->channel[first_non_void].pure_integer)
- return V_008F0C_BUF_NUM_FORMAT_SINT;
- else if (desc->channel[first_non_void].normalized)
- return V_008F0C_BUF_NUM_FORMAT_SNORM;
- else
- return V_008F0C_BUF_NUM_FORMAT_SSCALED;
- break;
- case UTIL_FORMAT_TYPE_UNSIGNED:
- if (desc->channel[first_non_void].size >= 32 ||
- desc->channel[first_non_void].pure_integer)
- return V_008F0C_BUF_NUM_FORMAT_UINT;
- else if (desc->channel[first_non_void].normalized)
- return V_008F0C_BUF_NUM_FORMAT_UNORM;
- else
- return V_008F0C_BUF_NUM_FORMAT_USCALED;
- break;
- case UTIL_FORMAT_TYPE_FLOAT:
- default:
- return V_008F0C_BUF_NUM_FORMAT_FLOAT;
- }
+ assert(((struct si_screen *)screen)->info.chip_class <= GFX9);
+
+ if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT)
+ return V_008F0C_BUF_NUM_FORMAT_FLOAT;
+
+ assert(first_non_void >= 0);
+
+ switch (desc->channel[first_non_void].type) {
+ case UTIL_FORMAT_TYPE_SIGNED:
+ case UTIL_FORMAT_TYPE_FIXED:
+ if (desc->channel[first_non_void].size >= 32 || desc->channel[first_non_void].pure_integer)
+ return V_008F0C_BUF_NUM_FORMAT_SINT;
+ else if (desc->channel[first_non_void].normalized)
+ return V_008F0C_BUF_NUM_FORMAT_SNORM;
+ else
+ return V_008F0C_BUF_NUM_FORMAT_SSCALED;
+ break;
+ case UTIL_FORMAT_TYPE_UNSIGNED:
+ if (desc->channel[first_non_void].size >= 32 || desc->channel[first_non_void].pure_integer)
+ return V_008F0C_BUF_NUM_FORMAT_UINT;
+ else if (desc->channel[first_non_void].normalized)
+ return V_008F0C_BUF_NUM_FORMAT_UNORM;
+ else
+ return V_008F0C_BUF_NUM_FORMAT_USCALED;
+ break;
+ case UTIL_FORMAT_TYPE_FLOAT:
+ default:
+ return V_008F0C_BUF_NUM_FORMAT_FLOAT;
+ }
}
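
The number-format choice above depends only on the channel type, bit size, and the normalized/pure_integer flags. Here is a minimal standalone version of that decision tree; the enum names are illustrative stand-ins for the V_008F0C_* values:

#include <stdbool.h>
#include <stdio.h>

enum chan_type { CHAN_SIGNED, CHAN_UNSIGNED, CHAN_FLOAT };
enum num_format { NUM_UNORM, NUM_SNORM, NUM_USCALED, NUM_SSCALED, NUM_UINT, NUM_SINT, NUM_FLOAT };

static enum num_format classify(enum chan_type type, unsigned size, bool normalized,
                                bool pure_integer)
{
   switch (type) {
   case CHAN_SIGNED:
      if (size >= 32 || pure_integer)
         return NUM_SINT;
      return normalized ? NUM_SNORM : NUM_SSCALED;
   case CHAN_UNSIGNED:
      if (size >= 32 || pure_integer)
         return NUM_UINT;
      return normalized ? NUM_UNORM : NUM_USCALED;
   default:
      return NUM_FLOAT;
   }
}

int main(void)
{
   /* R8G8B8A8_UNORM-style channel: unsigned, 8 bits, normalized. */
   printf("unorm8 -> %d (expect %d)\n", classify(CHAN_UNSIGNED, 8, true, false), NUM_UNORM);
   /* R32_UINT-style channel: 32-bit integers always use the *INT formats. */
   printf("uint32 -> %d (expect %d)\n", classify(CHAN_UNSIGNED, 32, false, true), NUM_UINT);
   return 0;
}
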
-static unsigned si_is_vertex_format_supported(struct pipe_screen *screen,
- enum pipe_format format,
- unsigned usage)
+static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, enum pipe_format format,
+ unsigned usage)
{
- struct si_screen *sscreen = (struct si_screen *)screen;
- const struct util_format_description *desc;
- int first_non_void;
- unsigned data_format;
-
- assert((usage & ~(PIPE_BIND_SHADER_IMAGE |
- PIPE_BIND_SAMPLER_VIEW |
- PIPE_BIND_VERTEX_BUFFER)) == 0);
-
- desc = util_format_description(format);
- if (!desc)
- return 0;
-
- /* There are no native 8_8_8 or 16_16_16 data formats, and we currently
- * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well
- * for read-only access (with caveats surrounding bounds checks), but
- * obviously fails for write access which we have to implement for
- * shader images. Luckily, OpenGL doesn't expect this to be supported
- * anyway, and so the only impact is on PBO uploads / downloads, which
- * shouldn't be expected to be fast for GL_RGB anyway.
- */
- if (desc->block.bits == 3 * 8 ||
- desc->block.bits == 3 * 16) {
- if (usage & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW)) {
- usage &= ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW);
- if (!usage)
- return 0;
- }
- }
-
- if (sscreen->info.chip_class >= GFX10) {
- const struct gfx10_format *fmt = &gfx10_format_table[format];
- if (!fmt->img_format || fmt->img_format >= 128)
- return 0;
- return usage;
- }
-
- first_non_void = util_format_get_first_non_void_channel(format);
- data_format = si_translate_buffer_dataformat(screen, desc, first_non_void);
- if (data_format == V_008F0C_BUF_DATA_FORMAT_INVALID)
- return 0;
-
- return usage;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ const struct util_format_description *desc;
+ int first_non_void;
+ unsigned data_format;
+
+ assert((usage & ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_VERTEX_BUFFER)) ==
+ 0);
+
+ desc = util_format_description(format);
+ if (!desc)
+ return 0;
+
+ /* There are no native 8_8_8 or 16_16_16 data formats, and we currently
+ * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well
+ * for read-only access (with caveats surrounding bounds checks), but
+ * obviously fails for write access which we have to implement for
+ * shader images. Luckily, OpenGL doesn't expect this to be supported
+ * anyway, and so the only impact is on PBO uploads / downloads, which
+ * shouldn't be expected to be fast for GL_RGB anyway.
+ */
+ if (desc->block.bits == 3 * 8 || desc->block.bits == 3 * 16) {
+ if (usage & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW)) {
+ usage &= ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW);
+ if (!usage)
+ return 0;
+ }
+ }
+
+ if (sscreen->info.chip_class >= GFX10) {
+ const struct gfx10_format *fmt = &gfx10_format_table[format];
+ if (!fmt->img_format || fmt->img_format >= 128)
+ return 0;
+ return usage;
+ }
+
+ first_non_void = util_format_get_first_non_void_channel(format);
+ data_format = si_translate_buffer_dataformat(screen, desc, first_non_void);
+ if (data_format == V_008F0C_BUF_DATA_FORMAT_INVALID)
+ return 0;
+
+ return usage;
}
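+/* Editor's note (illustrative sketch, not part of the original change):
+ * behaviour of the 3-channel caveat above for a 24-bit block such as
+ * PIPE_FORMAT_R8G8B8_UNORM on the pre-GFX10 path:
+ *
+ *   si_is_vertex_format_supported(screen, PIPE_FORMAT_R8G8B8_UNORM,
+ *                                 PIPE_BIND_SHADER_IMAGE)    == 0
+ *   si_is_vertex_format_supported(screen, PIPE_FORMAT_R8G8B8_UNORM,
+ *                                 PIPE_BIND_VERTEX_BUFFER)   == PIPE_BIND_VERTEX_BUFFER
+ *
+ * Sampler-view/image bits are stripped because writes through the 8_8_8_8
+ * replacement format would be wrong, while vertex fetch still works. On GFX10
+ * the gfx10_format_table entry replaces the dataformat translation entirely.
+ */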
static bool si_is_colorbuffer_format_supported(enum pipe_format format)
{
- return si_translate_colorformat(format) != V_028C70_COLOR_INVALID &&
- si_translate_colorswap(format, false) != ~0U;
+ return si_translate_colorformat(format) != V_028C70_COLOR_INVALID &&
+ si_translate_colorswap(format, false) != ~0U;
}
static bool si_is_zs_format_supported(enum pipe_format format)
{
- return si_translate_dbformat(format) != V_028040_Z_INVALID;
+ return si_translate_dbformat(format) != V_028040_Z_INVALID;
}
-static bool si_is_format_supported(struct pipe_screen *screen,
- enum pipe_format format,
- enum pipe_texture_target target,
- unsigned sample_count,
- unsigned storage_sample_count,
- unsigned usage)
+static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format format,
+ enum pipe_texture_target target, unsigned sample_count,
+ unsigned storage_sample_count, unsigned usage)
{
- struct si_screen *sscreen = (struct si_screen *)screen;
- unsigned retval = 0;
-
- if (target >= PIPE_MAX_TEXTURE_TYPES) {
- PRINT_ERR("radeonsi: unsupported texture type %d\n", target);
- return false;
- }
-
- if (MAX2(1, sample_count) < MAX2(1, storage_sample_count))
- return false;
-
- if (sample_count > 1) {
- if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE))
- return false;
-
- /* Only power-of-two sample counts are supported. */
- if (!util_is_power_of_two_or_zero(sample_count) ||
- !util_is_power_of_two_or_zero(storage_sample_count))
- return false;
-
- /* MSAA support without framebuffer attachments. */
- if (format == PIPE_FORMAT_NONE && sample_count <= 16)
- return true;
-
- if (!sscreen->info.has_eqaa_surface_allocator ||
- util_format_is_depth_or_stencil(format)) {
- /* Color without EQAA or depth/stencil. */
- if (sample_count > 8 ||
- sample_count != storage_sample_count)
- return false;
- } else {
- /* Color with EQAA. */
- if (sample_count > 16 ||
- storage_sample_count > 8)
- return false;
- }
- }
-
- if (usage & (PIPE_BIND_SAMPLER_VIEW |
- PIPE_BIND_SHADER_IMAGE)) {
- if (target == PIPE_BUFFER) {
- retval |= si_is_vertex_format_supported(
- screen, format, usage & (PIPE_BIND_SAMPLER_VIEW |
- PIPE_BIND_SHADER_IMAGE));
- } else {
- if (si_is_sampler_format_supported(screen, format))
- retval |= usage & (PIPE_BIND_SAMPLER_VIEW |
- PIPE_BIND_SHADER_IMAGE);
- }
- }
-
- if ((usage & (PIPE_BIND_RENDER_TARGET |
- PIPE_BIND_DISPLAY_TARGET |
- PIPE_BIND_SCANOUT |
- PIPE_BIND_SHARED |
- PIPE_BIND_BLENDABLE)) &&
- si_is_colorbuffer_format_supported(format)) {
- retval |= usage &
- (PIPE_BIND_RENDER_TARGET |
- PIPE_BIND_DISPLAY_TARGET |
- PIPE_BIND_SCANOUT |
- PIPE_BIND_SHARED);
- if (!util_format_is_pure_integer(format) &&
- !util_format_is_depth_or_stencil(format))
- retval |= usage & PIPE_BIND_BLENDABLE;
- }
-
- if ((usage & PIPE_BIND_DEPTH_STENCIL) &&
- si_is_zs_format_supported(format)) {
- retval |= PIPE_BIND_DEPTH_STENCIL;
- }
-
- if (usage & PIPE_BIND_VERTEX_BUFFER) {
- retval |= si_is_vertex_format_supported(screen, format,
- PIPE_BIND_VERTEX_BUFFER);
- }
-
- if ((usage & PIPE_BIND_LINEAR) &&
- !util_format_is_compressed(format) &&
- !(usage & PIPE_BIND_DEPTH_STENCIL))
- retval |= PIPE_BIND_LINEAR;
-
- return retval == usage;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ unsigned retval = 0;
+
+ if (target >= PIPE_MAX_TEXTURE_TYPES) {
+ PRINT_ERR("radeonsi: unsupported texture type %d\n", target);
+ return false;
+ }
+
+ if (MAX2(1, sample_count) < MAX2(1, storage_sample_count))
+ return false;
+
+ if (sample_count > 1) {
+ if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE))
+ return false;
+
+ /* Only power-of-two sample counts are supported. */
+ if (!util_is_power_of_two_or_zero(sample_count) ||
+ !util_is_power_of_two_or_zero(storage_sample_count))
+ return false;
+
+ /* MSAA support without framebuffer attachments. */
+ if (format == PIPE_FORMAT_NONE && sample_count <= 16)
+ return true;
+
+ if (!sscreen->info.has_eqaa_surface_allocator || util_format_is_depth_or_stencil(format)) {
+ /* Color without EQAA or depth/stencil. */
+ if (sample_count > 8 || sample_count != storage_sample_count)
+ return false;
+ } else {
+ /* Color with EQAA. */
+ if (sample_count > 16 || storage_sample_count > 8)
+ return false;
+ }
+ }
+
+ if (usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) {
+ if (target == PIPE_BUFFER) {
+ retval |= si_is_vertex_format_supported(
+ screen, format, usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE));
+ } else {
+ if (si_is_sampler_format_supported(screen, format))
+ retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE);
+ }
+ }
+
+ if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
+ PIPE_BIND_SHARED | PIPE_BIND_BLENDABLE)) &&
+ si_is_colorbuffer_format_supported(format)) {
+ retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
+ PIPE_BIND_SHARED);
+ if (!util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format))
+ retval |= usage & PIPE_BIND_BLENDABLE;
+ }
+
+ if ((usage & PIPE_BIND_DEPTH_STENCIL) && si_is_zs_format_supported(format)) {
+ retval |= PIPE_BIND_DEPTH_STENCIL;
+ }
+
+ if (usage & PIPE_BIND_VERTEX_BUFFER) {
+ retval |= si_is_vertex_format_supported(screen, format, PIPE_BIND_VERTEX_BUFFER);
+ }
+
+ if ((usage & PIPE_BIND_LINEAR) && !util_format_is_compressed(format) &&
+ !(usage & PIPE_BIND_DEPTH_STENCIL))
+ retval |= PIPE_BIND_LINEAR;
+
+ return retval == usage;
}
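+/* Editor's note (illustrative sketch, not part of the original change):
+ * examples of the MSAA checks above, assuming PIPE_CAP_TEXTURE_MULTISAMPLE is
+ * exposed and the sample counts are powers of two:
+ *
+ *   - color without the EQAA allocator, and all depth/stencil: sample_count
+ *     must equal storage_sample_count and be <= 8 (8x MSAA passes, 16x or
+ *     8/4 EQAA is rejected);
+ *   - color with the EQAA allocator: up to 16 samples with up to 8 stored
+ *     fragments (e.g. 16x/8x EQAA passes);
+ *   - PIPE_FORMAT_NONE with <= 16 samples is accepted for attachment-less
+ *     framebuffers before any per-usage checks run.
+ *
+ * The final "retval == usage" means every requested bind flag must be
+ * individually supported for the query to succeed.
+ */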
/*
* framebuffer handling
*/
-static void si_choose_spi_color_formats(struct si_surface *surf,
- unsigned format, unsigned swap,
- unsigned ntype, bool is_depth)
+static void si_choose_spi_color_formats(struct si_surface *surf, unsigned format, unsigned swap,
+ unsigned ntype, bool is_depth)
{
- /* Alpha is needed for alpha-to-coverage.
- * Blending may be with or without alpha.
- */
- unsigned normal = 0; /* most optimal, may not support blending or export alpha */
- unsigned alpha = 0; /* exports alpha, but may not support blending */
- unsigned blend = 0; /* supports blending, but may not export alpha */
- unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */
-
- /* Choose the SPI color formats. These are required values for RB+.
- * Other chips have multiple choices, though they are not necessarily better.
- */
- switch (format) {
- case V_028C70_COLOR_5_6_5:
- case V_028C70_COLOR_1_5_5_5:
- case V_028C70_COLOR_5_5_5_1:
- case V_028C70_COLOR_4_4_4_4:
- case V_028C70_COLOR_10_11_11:
- case V_028C70_COLOR_11_11_10:
- case V_028C70_COLOR_8:
- case V_028C70_COLOR_8_8:
- case V_028C70_COLOR_8_8_8_8:
- case V_028C70_COLOR_10_10_10_2:
- case V_028C70_COLOR_2_10_10_10:
- if (ntype == V_028C70_NUMBER_UINT)
- alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
- else if (ntype == V_028C70_NUMBER_SINT)
- alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
- else
- alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
- break;
-
- case V_028C70_COLOR_16:
- case V_028C70_COLOR_16_16:
- case V_028C70_COLOR_16_16_16_16:
- if (ntype == V_028C70_NUMBER_UNORM ||
- ntype == V_028C70_NUMBER_SNORM) {
- /* UNORM16 and SNORM16 don't support blending */
- if (ntype == V_028C70_NUMBER_UNORM)
- normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR;
- else
- normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR;
-
- /* Use 32 bits per channel for blending. */
- if (format == V_028C70_COLOR_16) {
- if (swap == V_028C70_SWAP_STD) { /* R */
- blend = V_028714_SPI_SHADER_32_R;
- blend_alpha = V_028714_SPI_SHADER_32_AR;
- } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
- blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
- else
- assert(0);
- } else if (format == V_028C70_COLOR_16_16) {
- if (swap == V_028C70_SWAP_STD) { /* RG */
- blend = V_028714_SPI_SHADER_32_GR;
- blend_alpha = V_028714_SPI_SHADER_32_ABGR;
- } else if (swap == V_028C70_SWAP_ALT) /* RA */
- blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
- else
- assert(0);
- } else /* 16_16_16_16 */
- blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
- } else if (ntype == V_028C70_NUMBER_UINT)
- alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
- else if (ntype == V_028C70_NUMBER_SINT)
- alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
- else if (ntype == V_028C70_NUMBER_FLOAT)
- alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
- else
- assert(0);
- break;
-
- case V_028C70_COLOR_32:
- if (swap == V_028C70_SWAP_STD) { /* R */
- blend = normal = V_028714_SPI_SHADER_32_R;
- alpha = blend_alpha = V_028714_SPI_SHADER_32_AR;
- } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
- alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
- else
- assert(0);
- break;
-
- case V_028C70_COLOR_32_32:
- if (swap == V_028C70_SWAP_STD) { /* RG */
- blend = normal = V_028714_SPI_SHADER_32_GR;
- alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
- } else if (swap == V_028C70_SWAP_ALT) /* RA */
- alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
- else
- assert(0);
- break;
-
- case V_028C70_COLOR_32_32_32_32:
- case V_028C70_COLOR_8_24:
- case V_028C70_COLOR_24_8:
- case V_028C70_COLOR_X24_8_32_FLOAT:
- alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
- break;
-
- default:
- assert(0);
- return;
- }
-
- /* The DB->CB copy needs 32_ABGR. */
- if (is_depth)
- alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
-
- surf->spi_shader_col_format = normal;
- surf->spi_shader_col_format_alpha = alpha;
- surf->spi_shader_col_format_blend = blend;
- surf->spi_shader_col_format_blend_alpha = blend_alpha;
+ /* Alpha is needed for alpha-to-coverage.
+ * Blending may be with or without alpha.
+ */
+ unsigned normal = 0; /* most optimal, may not support blending or export alpha */
+ unsigned alpha = 0; /* exports alpha, but may not support blending */
+ unsigned blend = 0; /* supports blending, but may not export alpha */
+ unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */
+
+ /* Choose the SPI color formats. These are required values for RB+.
+ * Other chips have multiple choices, though they are not necessarily better.
+ */
+ switch (format) {
+ case V_028C70_COLOR_5_6_5:
+ case V_028C70_COLOR_1_5_5_5:
+ case V_028C70_COLOR_5_5_5_1:
+ case V_028C70_COLOR_4_4_4_4:
+ case V_028C70_COLOR_10_11_11:
+ case V_028C70_COLOR_11_11_10:
+ case V_028C70_COLOR_8:
+ case V_028C70_COLOR_8_8:
+ case V_028C70_COLOR_8_8_8_8:
+ case V_028C70_COLOR_10_10_10_2:
+ case V_028C70_COLOR_2_10_10_10:
+ if (ntype == V_028C70_NUMBER_UINT)
+ alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
+ else if (ntype == V_028C70_NUMBER_SINT)
+ alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
+ else
+ alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
+ break;
+
+ case V_028C70_COLOR_16:
+ case V_028C70_COLOR_16_16:
+ case V_028C70_COLOR_16_16_16_16:
+ if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) {
+ /* UNORM16 and SNORM16 don't support blending */
+ if (ntype == V_028C70_NUMBER_UNORM)
+ normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR;
+ else
+ normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR;
+
+ /* Use 32 bits per channel for blending. */
+ if (format == V_028C70_COLOR_16) {
+ if (swap == V_028C70_SWAP_STD) { /* R */
+ blend = V_028714_SPI_SHADER_32_R;
+ blend_alpha = V_028714_SPI_SHADER_32_AR;
+ } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
+ blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
+ else
+ assert(0);
+ } else if (format == V_028C70_COLOR_16_16) {
+ if (swap == V_028C70_SWAP_STD) { /* RG */
+ blend = V_028714_SPI_SHADER_32_GR;
+ blend_alpha = V_028714_SPI_SHADER_32_ABGR;
+ } else if (swap == V_028C70_SWAP_ALT) /* RA */
+ blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
+ else
+ assert(0);
+ } else /* 16_16_16_16 */
+ blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
+ } else if (ntype == V_028C70_NUMBER_UINT)
+ alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
+ else if (ntype == V_028C70_NUMBER_SINT)
+ alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
+ else if (ntype == V_028C70_NUMBER_FLOAT)
+ alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
+ else
+ assert(0);
+ break;
+
+ case V_028C70_COLOR_32:
+ if (swap == V_028C70_SWAP_STD) { /* R */
+ blend = normal = V_028714_SPI_SHADER_32_R;
+ alpha = blend_alpha = V_028714_SPI_SHADER_32_AR;
+ } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
+ alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
+ else
+ assert(0);
+ break;
+
+ case V_028C70_COLOR_32_32:
+ if (swap == V_028C70_SWAP_STD) { /* RG */
+ blend = normal = V_028714_SPI_SHADER_32_GR;
+ alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
+ } else if (swap == V_028C70_SWAP_ALT) /* RA */
+ alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
+ else
+ assert(0);
+ break;
+
+ case V_028C70_COLOR_32_32_32_32:
+ case V_028C70_COLOR_8_24:
+ case V_028C70_COLOR_24_8:
+ case V_028C70_COLOR_X24_8_32_FLOAT:
+ alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
+ break;
+
+ default:
+ assert(0);
+ return;
+ }
+
+ /* The DB->CB copy needs 32_ABGR. */
+ if (is_depth)
+ alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
+
+ surf->spi_shader_col_format = normal;
+ surf->spi_shader_col_format_alpha = alpha;
+ surf->spi_shader_col_format_blend = blend;
+ surf->spi_shader_col_format_blend_alpha = blend_alpha;
}
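+/* Editor's note (illustrative sketch, not part of the original change): a
+ * worked example of the selection above for a V_028C70_COLOR_16_16 UNORM
+ * surface with the standard RG component swap:
+ *
+ *   normal      = SPI_SHADER_UNORM16_ABGR   (no blending, no alpha needed)
+ *   alpha       = SPI_SHADER_UNORM16_ABGR
+ *   blend       = SPI_SHADER_32_GR          (UNORM16 cannot be blended)
+ *   blend_alpha = SPI_SHADER_32_ABGR
+ *
+ * A depth surface overrides all four with SPI_SHADER_32_ABGR because the
+ * DB->CB copy path requires it.
+ */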
-static void si_initialize_color_surface(struct si_context *sctx,
- struct si_surface *surf)
+static void si_initialize_color_surface(struct si_context *sctx, struct si_surface *surf)
{
- struct si_texture *tex = (struct si_texture*)surf->base.texture;
- unsigned color_info, color_attrib;
- unsigned format, swap, ntype, endian;
- const struct util_format_description *desc;
- int firstchan;
- unsigned blend_clamp = 0, blend_bypass = 0;
-
- desc = util_format_description(surf->base.format);
- for (firstchan = 0; firstchan < 4; firstchan++) {
- if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) {
- break;
- }
- }
- if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) {
- ntype = V_028C70_NUMBER_FLOAT;
- } else {
- ntype = V_028C70_NUMBER_UNORM;
- if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
- ntype = V_028C70_NUMBER_SRGB;
- else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) {
- if (desc->channel[firstchan].pure_integer) {
- ntype = V_028C70_NUMBER_SINT;
- } else {
- assert(desc->channel[firstchan].normalized);
- ntype = V_028C70_NUMBER_SNORM;
- }
- } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) {
- if (desc->channel[firstchan].pure_integer) {
- ntype = V_028C70_NUMBER_UINT;
- } else {
- assert(desc->channel[firstchan].normalized);
- ntype = V_028C70_NUMBER_UNORM;
- }
- }
- }
-
- format = si_translate_colorformat(surf->base.format);
- if (format == V_028C70_COLOR_INVALID) {
- PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format);
- }
- assert(format != V_028C70_COLOR_INVALID);
- swap = si_translate_colorswap(surf->base.format, false);
- endian = si_colorformat_endian_swap(format);
-
- /* blend clamp should be set for all NORM/SRGB types */
- if (ntype == V_028C70_NUMBER_UNORM ||
- ntype == V_028C70_NUMBER_SNORM ||
- ntype == V_028C70_NUMBER_SRGB)
- blend_clamp = 1;
-
- /* set blend bypass according to docs if SINT/UINT or
- 8/24 COLOR variants */
- if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT ||
- format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 ||
- format == V_028C70_COLOR_X24_8_32_FLOAT) {
- blend_clamp = 0;
- blend_bypass = 1;
- }
-
- if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) {
- if (format == V_028C70_COLOR_8 ||
- format == V_028C70_COLOR_8_8 ||
- format == V_028C70_COLOR_8_8_8_8)
- surf->color_is_int8 = true;
- else if (format == V_028C70_COLOR_10_10_10_2 ||
- format == V_028C70_COLOR_2_10_10_10)
- surf->color_is_int10 = true;
- }
-
- color_info = S_028C70_FORMAT(format) |
- S_028C70_COMP_SWAP(swap) |
- S_028C70_BLEND_CLAMP(blend_clamp) |
- S_028C70_BLEND_BYPASS(blend_bypass) |
- S_028C70_SIMPLE_FLOAT(1) |
- S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM &&
- ntype != V_028C70_NUMBER_SNORM &&
- ntype != V_028C70_NUMBER_SRGB &&
- format != V_028C70_COLOR_8_24 &&
- format != V_028C70_COLOR_24_8) |
- S_028C70_NUMBER_TYPE(ntype) |
- S_028C70_ENDIAN(endian);
-
- /* Intensity is implemented as Red, so treat it that way. */
- color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 ||
- util_format_is_intensity(surf->base.format));
-
- if (tex->buffer.b.b.nr_samples > 1) {
- unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples);
- unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples);
-
- color_attrib |= S_028C74_NUM_SAMPLES(log_samples) |
- S_028C74_NUM_FRAGMENTS(log_fragments);
-
- if (tex->surface.fmask_offset) {
- color_info |= S_028C70_COMPRESSION(1);
- unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh);
-
- if (sctx->chip_class == GFX6) {
- /* due to a hw bug, FMASK_BANK_HEIGHT must be set on GFX6 too */
- color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh);
- }
- }
- }
-
- if (sctx->chip_class >= GFX10) {
- unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
-
- /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
- 64 for APU because all of our APUs to date use DIMMs which have
- a request granularity size of 64B while all other chips have a
- 32B request size */
- if (!sctx->screen->info.has_dedicated_vram)
- min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
-
- surf->cb_dcc_control =
- S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
- S_028C78_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) |
- S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
- S_028C78_INDEPENDENT_64B_BLOCKS(0) |
- S_028C78_INDEPENDENT_128B_BLOCKS(1);
- } else if (sctx->chip_class >= GFX8) {
- unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B;
- unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
-
- /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
- 64 for APU because all of our APUs to date use DIMMs which have
- a request granularity size of 64B while all other chips have a
- 32B request size */
- if (!sctx->screen->info.has_dedicated_vram)
- min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
-
- if (tex->buffer.b.b.nr_storage_samples > 1) {
- if (tex->surface.bpe == 1)
- max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
- else if (tex->surface.bpe == 2)
- max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
- }
-
- surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) |
- S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
- S_028C78_INDEPENDENT_64B_BLOCKS(1);
- }
-
- /* This must be set for fast clear to work without FMASK. */
- if (!tex->surface.fmask_size && sctx->chip_class == GFX6) {
- unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh);
- color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh);
- }
-
- /* GFX10 field has the same base shift as the GFX6 field */
- unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) |
- S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer);
- unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0);
-
- if (sctx->chip_class >= GFX10) {
- color_view |= S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level);
-
- surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) |
- S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) |
- S_028EE0_RESOURCE_LEVEL(1);
- } else if (sctx->chip_class == GFX9) {
- color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level);
- color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) |
- S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type);
- }
-
- if (sctx->chip_class >= GFX9) {
- surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) |
- S_028C68_MIP0_HEIGHT(surf->height0 - 1) |
- S_028C68_MAX_MIP(tex->buffer.b.b.last_level);
- }
-
- surf->cb_color_view = color_view;
- surf->cb_color_info = color_info;
- surf->cb_color_attrib = color_attrib;
-
- /* Determine pixel shader export format */
- si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth);
-
- surf->color_initialized = true;
+ struct si_texture *tex = (struct si_texture *)surf->base.texture;
+ unsigned color_info, color_attrib;
+ unsigned format, swap, ntype, endian;
+ const struct util_format_description *desc;
+ int firstchan;
+ unsigned blend_clamp = 0, blend_bypass = 0;
+
+ desc = util_format_description(surf->base.format);
+ for (firstchan = 0; firstchan < 4; firstchan++) {
+ if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) {
+ break;
+ }
+ }
+ if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) {
+ ntype = V_028C70_NUMBER_FLOAT;
+ } else {
+ ntype = V_028C70_NUMBER_UNORM;
+ if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
+ ntype = V_028C70_NUMBER_SRGB;
+ else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) {
+ if (desc->channel[firstchan].pure_integer) {
+ ntype = V_028C70_NUMBER_SINT;
+ } else {
+ assert(desc->channel[firstchan].normalized);
+ ntype = V_028C70_NUMBER_SNORM;
+ }
+ } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+ if (desc->channel[firstchan].pure_integer) {
+ ntype = V_028C70_NUMBER_UINT;
+ } else {
+ assert(desc->channel[firstchan].normalized);
+ ntype = V_028C70_NUMBER_UNORM;
+ }
+ }
+ }
+
+ format = si_translate_colorformat(surf->base.format);
+ if (format == V_028C70_COLOR_INVALID) {
+ PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format);
+ }
+ assert(format != V_028C70_COLOR_INVALID);
+ swap = si_translate_colorswap(surf->base.format, false);
+ endian = si_colorformat_endian_swap(format);
+
+ /* blend clamp should be set for all NORM/SRGB types */
+ if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM ||
+ ntype == V_028C70_NUMBER_SRGB)
+ blend_clamp = 1;
+
+ /* set blend bypass according to docs if SINT/UINT or
+ 8/24 COLOR variants */
+ if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT ||
+ format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 ||
+ format == V_028C70_COLOR_X24_8_32_FLOAT) {
+ blend_clamp = 0;
+ blend_bypass = 1;
+ }
+
+ if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) {
+ if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_8_8 ||
+ format == V_028C70_COLOR_8_8_8_8)
+ surf->color_is_int8 = true;
+ else if (format == V_028C70_COLOR_10_10_10_2 || format == V_028C70_COLOR_2_10_10_10)
+ surf->color_is_int10 = true;
+ }
+
+ color_info =
+ S_028C70_FORMAT(format) | S_028C70_COMP_SWAP(swap) | S_028C70_BLEND_CLAMP(blend_clamp) |
+ S_028C70_BLEND_BYPASS(blend_bypass) | S_028C70_SIMPLE_FLOAT(1) |
+ S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM && ntype != V_028C70_NUMBER_SNORM &&
+ ntype != V_028C70_NUMBER_SRGB && format != V_028C70_COLOR_8_24 &&
+ format != V_028C70_COLOR_24_8) |
+ S_028C70_NUMBER_TYPE(ntype) | S_028C70_ENDIAN(endian);
+
+ /* Intensity is implemented as Red, so treat it that way. */
+ color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 ||
+ util_format_is_intensity(surf->base.format));
+
+ if (tex->buffer.b.b.nr_samples > 1) {
+ unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples);
+ unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples);
+
+ color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | S_028C74_NUM_FRAGMENTS(log_fragments);
+
+ if (tex->surface.fmask_offset) {
+ color_info |= S_028C70_COMPRESSION(1);
+ unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh);
+
+ if (sctx->chip_class == GFX6) {
+ /* due to a hw bug, FMASK_BANK_HEIGHT must be set on GFX6 too */
+ color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh);
+ }
+ }
+ }
+
+ if (sctx->chip_class >= GFX10) {
+ unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
+
+ /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
+ 64 for APU because all of our APUs to date use DIMMs which have
+ a request granularity size of 64B while all other chips have a
+ 32B request size */
+ if (!sctx->screen->info.has_dedicated_vram)
+ min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
+
+ surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
+ S_028C78_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) |
+ S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
+ S_028C78_INDEPENDENT_64B_BLOCKS(0) |
+ S_028C78_INDEPENDENT_128B_BLOCKS(1);
+ } else if (sctx->chip_class >= GFX8) {
+ unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B;
+ unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
+
+ /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
+ 64 for APU because all of our APUs to date use DIMMs which have
+ a request granularity size of 64B while all other chips have a
+ 32B request size */
+ if (!sctx->screen->info.has_dedicated_vram)
+ min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
+
+ if (tex->buffer.b.b.nr_storage_samples > 1) {
+ if (tex->surface.bpe == 1)
+ max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
+ else if (tex->surface.bpe == 2)
+ max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
+ }
+
+ surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) |
+ S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
+ S_028C78_INDEPENDENT_64B_BLOCKS(1);
+ }
+
+ /* This must be set for fast clear to work without FMASK. */
+ if (!tex->surface.fmask_size && sctx->chip_class == GFX6) {
+ unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh);
+ color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh);
+ }
+
+ /* GFX10 field has the same base shift as the GFX6 field */
+ unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) |
+ S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer);
+ unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0);
+
+ if (sctx->chip_class >= GFX10) {
+ color_view |= S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level);
+
+ surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) |
+ S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) |
+ S_028EE0_RESOURCE_LEVEL(1);
+ } else if (sctx->chip_class == GFX9) {
+ color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level);
+ color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) |
+ S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type);
+ }
+
+ if (sctx->chip_class >= GFX9) {
+ surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) |
+ S_028C68_MIP0_HEIGHT(surf->height0 - 1) |
+ S_028C68_MAX_MIP(tex->buffer.b.b.last_level);
+ }
+
+ surf->cb_color_view = color_view;
+ surf->cb_color_info = color_info;
+ surf->cb_color_attrib = color_attrib;
+
+ /* Determine pixel shader export format */
+ si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth);
+
+ surf->color_initialized = true;
}
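+/* Editor's note (illustrative sketch, not part of the original change): the
+ * CB DCC control chosen above depends on the memory type. On a GFX10 dGPU it
+ * resolves to:
+ *
+ *   MAX_UNCOMPRESSED_BLOCK_SIZE = 256B, MAX_COMPRESSED_BLOCK_SIZE = 128B,
+ *   MIN_COMPRESSED_BLOCK_SIZE   = 32B,  INDEPENDENT_128B_BLOCKS   = 1
+ *
+ * while an APU raises MIN_COMPRESSED_BLOCK_SIZE to 64B to match the 64B DIMM
+ * request granularity. On GFX8-GFX9 the maximum uncompressed block size is
+ * additionally lowered to 64B/128B for multisampled surfaces with 1 or 2
+ * bytes per element.
+ */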
-static void si_init_depth_surface(struct si_context *sctx,
- struct si_surface *surf)
+static void si_init_depth_surface(struct si_context *sctx, struct si_surface *surf)
{
- struct si_texture *tex = (struct si_texture*)surf->base.texture;
- unsigned level = surf->base.u.tex.level;
- unsigned format, stencil_format;
- uint32_t z_info, s_info;
-
- format = si_translate_dbformat(tex->db_render_format);
- stencil_format = tex->surface.has_stencil ?
- V_028044_STENCIL_8 : V_028044_STENCIL_INVALID;
-
- assert(format != V_028040_Z_INVALID);
- if (format == V_028040_Z_INVALID)
- PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format);
-
- surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) |
- S_028008_SLICE_MAX(surf->base.u.tex.last_layer);
- surf->db_htile_data_base = 0;
- surf->db_htile_surface = 0;
-
- if (sctx->chip_class >= GFX10) {
- surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) |
- S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11);
- }
-
- if (sctx->chip_class >= GFX9) {
- assert(tex->surface.u.gfx9.surf_offset == 0);
- surf->db_depth_base = tex->buffer.gpu_address >> 8;
- surf->db_stencil_base = (tex->buffer.gpu_address +
- tex->surface.u.gfx9.stencil_offset) >> 8;
- z_info = S_028038_FORMAT(format) |
- S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) |
- S_028038_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
- S_028038_MAXMIP(tex->buffer.b.b.last_level);
- s_info = S_02803C_FORMAT(stencil_format) |
- S_02803C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
-
- if (sctx->chip_class == GFX9) {
- surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.surf.epitch);
- surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.stencil.epitch);
- }
- surf->db_depth_view |= S_028008_MIPID(level);
- surf->db_depth_size = S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) |
- S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1);
-
- if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) {
- z_info |= S_028038_TILE_SURFACE_ENABLE(1) |
- S_028038_ALLOW_EXPCLEAR(1);
-
- if (tex->tc_compatible_htile) {
- unsigned max_zplanes = 4;
-
- if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM &&
- tex->buffer.b.b.nr_samples > 1)
- max_zplanes = 2;
-
- z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1);
-
- if (sctx->chip_class >= GFX10) {
- z_info |= S_028040_ITERATE_FLUSH(1);
- s_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled);
- } else {
- z_info |= S_028038_ITERATE_FLUSH(1);
- s_info |= S_02803C_ITERATE_FLUSH(1);
- }
- }
-
- if (tex->surface.has_stencil && !tex->htile_stencil_disabled) {
- /* Stencil buffer workaround ported from the GFX6-GFX8 code.
- * See that for explanation.
- */
- s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1);
- } else {
- /* Use all HTILE for depth if there's no stencil. */
- s_info |= S_02803C_TILE_STENCIL_DISABLE(1);
- }
-
- surf->db_htile_data_base = (tex->buffer.gpu_address +
- tex->surface.htile_offset) >> 8;
- surf->db_htile_surface = S_028ABC_FULL_CACHE(1) |
- S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned);
- if (sctx->chip_class == GFX9) {
- surf->db_htile_surface |=
- S_028ABC_RB_ALIGNED(tex->surface.u.gfx9.htile.rb_aligned);
- }
- }
- } else {
- /* GFX6-GFX8 */
- struct legacy_surf_level *levelinfo = &tex->surface.u.legacy.level[level];
-
- assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0);
-
- surf->db_depth_base = (tex->buffer.gpu_address +
- tex->surface.u.legacy.level[level].offset) >> 8;
- surf->db_stencil_base = (tex->buffer.gpu_address +
- tex->surface.u.legacy.stencil_level[level].offset) >> 8;
-
- z_info = S_028040_FORMAT(format) |
- S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples));
- s_info = S_028044_FORMAT(stencil_format);
- surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile);
-
- if (sctx->chip_class >= GFX7) {
- struct radeon_info *info = &sctx->screen->info;
- unsigned index = tex->surface.u.legacy.tiling_index[level];
- unsigned stencil_index = tex->surface.u.legacy.stencil_tiling_index[level];
- unsigned macro_index = tex->surface.u.legacy.macro_tile_index;
- unsigned tile_mode = info->si_tile_mode_array[index];
- unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index];
- unsigned macro_mode = info->cik_macrotile_mode_array[macro_index];
-
- surf->db_depth_info |=
- S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) |
- S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) |
- S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) |
- S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) |
- S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) |
- S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode));
- z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode));
- s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode));
- } else {
- unsigned tile_mode_index = si_tile_mode_index(tex, level, false);
- z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index);
- tile_mode_index = si_tile_mode_index(tex, level, true);
- s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index);
- }
-
- surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) |
- S_028058_HEIGHT_TILE_MAX((levelinfo->nblk_y / 8) - 1);
- surf->db_depth_slice = S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x *
- levelinfo->nblk_y) / 64 - 1);
-
- if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) {
- z_info |= S_028040_TILE_SURFACE_ENABLE(1) |
- S_028040_ALLOW_EXPCLEAR(1);
-
- if (tex->surface.has_stencil) {
- /* Workaround: For a not yet understood reason, the
- * combination of MSAA, fast stencil clear and stencil
- * decompress messes with subsequent stencil buffer
- * uses. Problem was reproduced on Verde, Bonaire,
- * Tonga, and Carrizo.
- *
- * Disabling EXPCLEAR works around the problem.
- *
- * Check piglit's arb_texture_multisample-stencil-clear
- * test if you want to try changing this.
- */
- if (tex->buffer.b.b.nr_samples <= 1)
- s_info |= S_028044_ALLOW_EXPCLEAR(1);
- } else if (!tex->tc_compatible_htile) {
- /* Use all of the htile_buffer for depth if there's no stencil.
- * This must not be set when TC-compatible HTILE is enabled
- * due to a hw bug.
- */
- s_info |= S_028044_TILE_STENCIL_DISABLE(1);
- }
-
- surf->db_htile_data_base = (tex->buffer.gpu_address +
- tex->surface.htile_offset) >> 8;
- surf->db_htile_surface = S_028ABC_FULL_CACHE(1);
-
- if (tex->tc_compatible_htile) {
- surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
-
- /* 0 = full compression. N = only compress up to N-1 Z planes. */
- if (tex->buffer.b.b.nr_samples <= 1)
- z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
- else if (tex->buffer.b.b.nr_samples <= 4)
- z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
- else
- z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
- }
- }
- }
-
- surf->db_z_info = z_info;
- surf->db_stencil_info = s_info;
-
- surf->depth_initialized = true;
+ struct si_texture *tex = (struct si_texture *)surf->base.texture;
+ unsigned level = surf->base.u.tex.level;
+ unsigned format, stencil_format;
+ uint32_t z_info, s_info;
+
+ format = si_translate_dbformat(tex->db_render_format);
+ stencil_format = tex->surface.has_stencil ? V_028044_STENCIL_8 : V_028044_STENCIL_INVALID;
+
+ assert(format != V_028040_Z_INVALID);
+ if (format == V_028040_Z_INVALID)
+ PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format);
+
+ surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) |
+ S_028008_SLICE_MAX(surf->base.u.tex.last_layer);
+ surf->db_htile_data_base = 0;
+ surf->db_htile_surface = 0;
+
+ if (sctx->chip_class >= GFX10) {
+ surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) |
+ S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11);
+ }
+
+ if (sctx->chip_class >= GFX9) {
+ assert(tex->surface.u.gfx9.surf_offset == 0);
+ surf->db_depth_base = tex->buffer.gpu_address >> 8;
+ surf->db_stencil_base = (tex->buffer.gpu_address + tex->surface.u.gfx9.stencil_offset) >> 8;
+ z_info = S_028038_FORMAT(format) |
+ S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) |
+ S_028038_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
+ S_028038_MAXMIP(tex->buffer.b.b.last_level);
+ s_info = S_02803C_FORMAT(stencil_format) |
+ S_02803C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
+
+ if (sctx->chip_class == GFX9) {
+ surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.surf.epitch);
+ surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.stencil.epitch);
+ }
+ surf->db_depth_view |= S_028008_MIPID(level);
+ surf->db_depth_size =
+ S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1);
+
+ if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) {
+ z_info |= S_028038_TILE_SURFACE_ENABLE(1) | S_028038_ALLOW_EXPCLEAR(1);
+
+ if (tex->tc_compatible_htile) {
+ unsigned max_zplanes = 4;
+
+ if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && tex->buffer.b.b.nr_samples > 1)
+ max_zplanes = 2;
+
+ z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1);
+
+ if (sctx->chip_class >= GFX10) {
+ z_info |= S_028040_ITERATE_FLUSH(1);
+ s_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled);
+ } else {
+ z_info |= S_028038_ITERATE_FLUSH(1);
+ s_info |= S_02803C_ITERATE_FLUSH(1);
+ }
+ }
+
+ if (tex->surface.has_stencil && !tex->htile_stencil_disabled) {
+ /* Stencil buffer workaround ported from the GFX6-GFX8 code.
+ * See that for explanation.
+ */
+ s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1);
+ } else {
+ /* Use all HTILE for depth if there's no stencil. */
+ s_info |= S_02803C_TILE_STENCIL_DISABLE(1);
+ }
+
+ surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8;
+ surf->db_htile_surface =
+ S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned);
+ if (sctx->chip_class == GFX9) {
+ surf->db_htile_surface |= S_028ABC_RB_ALIGNED(tex->surface.u.gfx9.htile.rb_aligned);
+ }
+ }
+ } else {
+ /* GFX6-GFX8 */
+ struct legacy_surf_level *levelinfo = &tex->surface.u.legacy.level[level];
+
+ assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0);
+
+ surf->db_depth_base =
+ (tex->buffer.gpu_address + tex->surface.u.legacy.level[level].offset) >> 8;
+ surf->db_stencil_base =
+ (tex->buffer.gpu_address + tex->surface.u.legacy.stencil_level[level].offset) >> 8;
+
+ z_info =
+ S_028040_FORMAT(format) | S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples));
+ s_info = S_028044_FORMAT(stencil_format);
+ surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile);
+
+ if (sctx->chip_class >= GFX7) {
+ struct radeon_info *info = &sctx->screen->info;
+ unsigned index = tex->surface.u.legacy.tiling_index[level];
+ unsigned stencil_index = tex->surface.u.legacy.stencil_tiling_index[level];
+ unsigned macro_index = tex->surface.u.legacy.macro_tile_index;
+ unsigned tile_mode = info->si_tile_mode_array[index];
+ unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index];
+ unsigned macro_mode = info->cik_macrotile_mode_array[macro_index];
+
+ surf->db_depth_info |= S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) |
+ S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) |
+ S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) |
+ S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) |
+ S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) |
+ S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode));
+ z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode));
+ s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode));
+ } else {
+ unsigned tile_mode_index = si_tile_mode_index(tex, level, false);
+ z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index);
+ tile_mode_index = si_tile_mode_index(tex, level, true);
+ s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index);
+ }
+
+ surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) |
+ S_028058_HEIGHT_TILE_MAX((levelinfo->nblk_y / 8) - 1);
+ surf->db_depth_slice =
+ S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x * levelinfo->nblk_y) / 64 - 1);
+
+ if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) {
+ z_info |= S_028040_TILE_SURFACE_ENABLE(1) | S_028040_ALLOW_EXPCLEAR(1);
+
+ if (tex->surface.has_stencil) {
+ /* Workaround: For a not yet understood reason, the
+ * combination of MSAA, fast stencil clear and stencil
+ * decompress messes with subsequent stencil buffer
+ * uses. Problem was reproduced on Verde, Bonaire,
+ * Tonga, and Carrizo.
+ *
+ * Disabling EXPCLEAR works around the problem.
+ *
+ * Check piglit's arb_texture_multisample-stencil-clear
+ * test if you want to try changing this.
+ */
+ if (tex->buffer.b.b.nr_samples <= 1)
+ s_info |= S_028044_ALLOW_EXPCLEAR(1);
+ } else if (!tex->tc_compatible_htile) {
+ /* Use all of the htile_buffer for depth if there's no stencil.
+ * This must not be set when TC-compatible HTILE is enabled
+ * due to a hw bug.
+ */
+ s_info |= S_028044_TILE_STENCIL_DISABLE(1);
+ }
+
+ surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8;
+ surf->db_htile_surface = S_028ABC_FULL_CACHE(1);
+
+ if (tex->tc_compatible_htile) {
+ surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
+
+ /* 0 = full compression. N = only compress up to N-1 Z planes. */
+ if (tex->buffer.b.b.nr_samples <= 1)
+ z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
+ else if (tex->buffer.b.b.nr_samples <= 4)
+ z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
+ else
+ z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
+ }
+ }
+ }
+
+ surf->db_z_info = z_info;
+ surf->db_stencil_info = s_info;
+
+ surf->depth_initialized = true;
}
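+/* Editor's note (illustrative sketch, not part of the original change):
+ * DECOMPRESS_ON_N_ZPLANES values chosen above for TC-compatible HTILE, using
+ * the encoding "0 = full compression, N = compress up to N-1 Z planes":
+ *
+ *   GFX6-GFX8:  1 sample -> 5, 2-4 samples -> 3, more samples -> 2
+ *   GFX9+:      4 + 1 = 5 normally, 2 + 1 = 3 for multisampled Z16
+ *
+ * When the surface has no stencil (or HTILE stencil is disabled), the whole
+ * HTILE buffer is given to depth via TILE_STENCIL_DISABLE.
+ */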
void si_update_fb_dirtiness_after_rendering(struct si_context *sctx)
{
- if (sctx->decompression_enabled)
- return;
-
- if (sctx->framebuffer.state.zsbuf) {
- struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
- struct si_texture *tex = (struct si_texture *)surf->texture;
-
- tex->dirty_level_mask |= 1 << surf->u.tex.level;
-
- if (tex->surface.has_stencil)
- tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
- }
-
- unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask;
- while (compressed_cb_mask) {
- unsigned i = u_bit_scan(&compressed_cb_mask);
- struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
- struct si_texture *tex = (struct si_texture*)surf->texture;
-
- if (tex->surface.fmask_offset) {
- tex->dirty_level_mask |= 1 << surf->u.tex.level;
- tex->fmask_is_identity = false;
- }
- if (tex->dcc_gather_statistics)
- tex->separate_dcc_dirty = true;
- }
+ if (sctx->decompression_enabled)
+ return;
+
+ if (sctx->framebuffer.state.zsbuf) {
+ struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
+ struct si_texture *tex = (struct si_texture *)surf->texture;
+
+ tex->dirty_level_mask |= 1 << surf->u.tex.level;
+
+ if (tex->surface.has_stencil)
+ tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
+ }
+
+ unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask;
+ while (compressed_cb_mask) {
+ unsigned i = u_bit_scan(&compressed_cb_mask);
+ struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
+ struct si_texture *tex = (struct si_texture *)surf->texture;
+
+ if (tex->surface.fmask_offset) {
+ tex->dirty_level_mask |= 1 << surf->u.tex.level;
+ tex->fmask_is_identity = false;
+ }
+ if (tex->dcc_gather_statistics)
+ tex->separate_dcc_dirty = true;
+ }
}
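+/* Editor's note (illustrative sketch, not part of the original change):
+ * dirtiness is tracked as one bit per mip level, so rendering to level 2 of a
+ * bound zsbuf effectively does:
+ *
+ *   tex->dirty_level_mask |= 1 << 2;   // plus stencil_dirty_level_mask
+ *
+ * The compressed_cb_mask walk does the same for every bound color buffer that
+ * still has FMASK metadata, and flags separate-DCC statistics as dirty.
+ */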
static void si_dec_framebuffer_counters(const struct pipe_framebuffer_state *state)
{
- for (int i = 0; i < state->nr_cbufs; ++i) {
- struct si_surface *surf = NULL;
- struct si_texture *tex;
+ for (int i = 0; i < state->nr_cbufs; ++i) {
+ struct si_surface *surf = NULL;
+ struct si_texture *tex;
- if (!state->cbufs[i])
- continue;
- surf = (struct si_surface*)state->cbufs[i];
- tex = (struct si_texture*)surf->base.texture;
+ if (!state->cbufs[i])
+ continue;
+ surf = (struct si_surface *)state->cbufs[i];
+ tex = (struct si_texture *)surf->base.texture;
- p_atomic_dec(&tex->framebuffers_bound);
- }
+ p_atomic_dec(&tex->framebuffers_bound);
+ }
}
static void si_set_framebuffer_state(struct pipe_context *ctx,
- const struct pipe_framebuffer_state *state)
+ const struct pipe_framebuffer_state *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_surface *surf = NULL;
- struct si_texture *tex;
- bool old_any_dst_linear = sctx->framebuffer.any_dst_linear;
- unsigned old_nr_samples = sctx->framebuffer.nr_samples;
- unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit;
- bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf;
- bool old_has_stencil =
- old_has_zsbuf &&
- ((struct si_texture*)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil;
- bool unbound = false;
- int i;
-
- /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs
- * when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
- * We could implement the full workaround here, but it's a useless case.
- */
- if ((!state->width || !state->height) && (state->nr_cbufs || state->zsbuf)) {
- unreachable("the framebuffer shouldn't have zero area");
- return;
- }
-
- si_update_fb_dirtiness_after_rendering(sctx);
-
- for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
- if (!sctx->framebuffer.state.cbufs[i])
- continue;
-
- tex = (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture;
- if (tex->dcc_gather_statistics)
- vi_separate_dcc_stop_query(sctx, tex);
- }
-
- /* Disable DCC if the formats are incompatible. */
- for (i = 0; i < state->nr_cbufs; i++) {
- if (!state->cbufs[i])
- continue;
-
- surf = (struct si_surface*)state->cbufs[i];
- tex = (struct si_texture*)surf->base.texture;
-
- if (!surf->dcc_incompatible)
- continue;
-
- /* Since the DCC decompression calls back into set_framebuffer-
- * _state, we need to unbind the framebuffer, so that
- * vi_separate_dcc_stop_query isn't called twice with the same
- * color buffer.
- */
- if (!unbound) {
- util_copy_framebuffer_state(&sctx->framebuffer.state, NULL);
- unbound = true;
- }
-
- if (vi_dcc_enabled(tex, surf->base.u.tex.level))
- if (!si_texture_disable_dcc(sctx, tex))
- si_decompress_dcc(sctx, tex);
-
- surf->dcc_incompatible = false;
- }
-
- /* Only flush TC when changing the framebuffer state, because
- * the only client not using TC that can change textures is
- * the framebuffer.
- *
- * Wait for compute shaders because of possible transitions:
- * - FB write -> shader read
- * - shader write -> FB read
- *
- * DB caches are flushed on demand (using si_decompress_textures).
- *
- * When MSAA is enabled, CB and TC caches are flushed on demand
- * (after FMASK decompression). Shader write -> FB read transitions
- * cannot happen for MSAA textures, because MSAA shader images are
- * not supported.
- *
- * Only flush and wait for CB if there is actually a bound color buffer.
- */
- if (sctx->framebuffer.uncompressed_cb_mask) {
- si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
- sctx->framebuffer.CB_has_shader_readable_metadata,
- sctx->framebuffer.all_DCC_pipe_aligned);
- }
-
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
-
- /* u_blitter doesn't invoke depth decompression when it does multiple
- * blits in a row, but the only case when it matters for DB is when
- * doing generate_mipmap. So here we flush DB manually between
- * individual generate_mipmap blits.
- * Note that lower mipmap levels aren't compressed.
- */
- if (sctx->generate_mipmap_for_depth) {
- si_make_DB_shader_coherent(sctx, 1, false,
- sctx->framebuffer.DB_has_shader_readable_metadata);
- } else if (sctx->chip_class == GFX9) {
- /* It appears that DB metadata "leaks" in a sequence of:
- * - depth clear
- * - DCC decompress for shader image writes (with DB disabled)
- * - render with DEPTH_BEFORE_SHADER=1
- * Flushing DB metadata works around the problem.
- */
- sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META;
- }
-
- /* Take the maximum of the old and new count. If the new count is lower,
- * dirtying is needed to disable the unbound colorbuffers.
- */
- sctx->framebuffer.dirty_cbufs |=
- (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1;
- sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf;
-
- si_dec_framebuffer_counters(&sctx->framebuffer.state);
- util_copy_framebuffer_state(&sctx->framebuffer.state, state);
-
- sctx->framebuffer.colorbuf_enabled_4bit = 0;
- sctx->framebuffer.spi_shader_col_format = 0;
- sctx->framebuffer.spi_shader_col_format_alpha = 0;
- sctx->framebuffer.spi_shader_col_format_blend = 0;
- sctx->framebuffer.spi_shader_col_format_blend_alpha = 0;
- sctx->framebuffer.color_is_int8 = 0;
- sctx->framebuffer.color_is_int10 = 0;
-
- sctx->framebuffer.compressed_cb_mask = 0;
- sctx->framebuffer.uncompressed_cb_mask = 0;
- sctx->framebuffer.displayable_dcc_cb_mask = 0;
- sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state);
- sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples;
- sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples);
- sctx->framebuffer.any_dst_linear = false;
- sctx->framebuffer.CB_has_shader_readable_metadata = false;
- sctx->framebuffer.DB_has_shader_readable_metadata = false;
- sctx->framebuffer.all_DCC_pipe_aligned = true;
- sctx->framebuffer.min_bytes_per_pixel = 0;
-
- for (i = 0; i < state->nr_cbufs; i++) {
- if (!state->cbufs[i])
- continue;
-
- surf = (struct si_surface*)state->cbufs[i];
- tex = (struct si_texture*)surf->base.texture;
-
- if (!surf->color_initialized) {
- si_initialize_color_surface(sctx, surf);
- }
-
- sctx->framebuffer.colorbuf_enabled_4bit |= 0xf << (i * 4);
- sctx->framebuffer.spi_shader_col_format |=
- surf->spi_shader_col_format << (i * 4);
- sctx->framebuffer.spi_shader_col_format_alpha |=
- surf->spi_shader_col_format_alpha << (i * 4);
- sctx->framebuffer.spi_shader_col_format_blend |=
- surf->spi_shader_col_format_blend << (i * 4);
- sctx->framebuffer.spi_shader_col_format_blend_alpha |=
- surf->spi_shader_col_format_blend_alpha << (i * 4);
-
- if (surf->color_is_int8)
- sctx->framebuffer.color_is_int8 |= 1 << i;
- if (surf->color_is_int10)
- sctx->framebuffer.color_is_int10 |= 1 << i;
-
- if (tex->surface.fmask_offset)
- sctx->framebuffer.compressed_cb_mask |= 1 << i;
- else
- sctx->framebuffer.uncompressed_cb_mask |= 1 << i;
-
- if (tex->surface.dcc_offset)
- sctx->framebuffer.displayable_dcc_cb_mask |= 1 << i;
-
- /* Don't update nr_color_samples for non-AA buffers.
- * (e.g. destination of MSAA resolve)
- */
- if (tex->buffer.b.b.nr_samples >= 2 &&
- tex->buffer.b.b.nr_storage_samples < tex->buffer.b.b.nr_samples) {
- sctx->framebuffer.nr_color_samples =
- MIN2(sctx->framebuffer.nr_color_samples,
- tex->buffer.b.b.nr_storage_samples);
- sctx->framebuffer.nr_color_samples =
- MAX2(1, sctx->framebuffer.nr_color_samples);
- }
-
- if (tex->surface.is_linear)
- sctx->framebuffer.any_dst_linear = true;
-
- if (vi_dcc_enabled(tex, surf->base.u.tex.level)) {
- sctx->framebuffer.CB_has_shader_readable_metadata = true;
-
- if (sctx->chip_class >= GFX9 &&
- !tex->surface.u.gfx9.dcc.pipe_aligned)
- sctx->framebuffer.all_DCC_pipe_aligned = false;
- }
-
- si_context_add_resource_size(sctx, surf->base.texture);
-
- p_atomic_inc(&tex->framebuffers_bound);
-
- if (tex->dcc_gather_statistics) {
- /* Dirty tracking must be enabled for DCC usage analysis. */
- sctx->framebuffer.compressed_cb_mask |= 1 << i;
- vi_separate_dcc_start_query(sctx, tex);
- }
-
- /* Update the minimum but don't keep 0. */
- if (!sctx->framebuffer.min_bytes_per_pixel ||
- tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
- sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe;
- }
-
- /* For optimal DCC performance. */
- if (sctx->chip_class >= GFX10)
- sctx->framebuffer.dcc_overwrite_combiner_watermark = 6;
- else
- sctx->framebuffer.dcc_overwrite_combiner_watermark = 4;
-
- struct si_texture *zstex = NULL;
-
- if (state->zsbuf) {
- surf = (struct si_surface*)state->zsbuf;
- zstex = (struct si_texture*)surf->base.texture;
-
- if (!surf->depth_initialized) {
- si_init_depth_surface(sctx, surf);
- }
-
- if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level,
- PIPE_MASK_ZS))
- sctx->framebuffer.DB_has_shader_readable_metadata = true;
-
- si_context_add_resource_size(sctx, surf->base.texture);
-
- /* Update the minimum but don't keep 0. */
- if (!sctx->framebuffer.min_bytes_per_pixel ||
- zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
- sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe;
- }
-
- si_update_ps_colorbuf0_slot(sctx);
- si_update_poly_offset_state(sctx);
- si_update_ngg_small_prim_precision(sctx);
- si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
- si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
-
- if (sctx->screen->dpbb_allowed)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
-
- if (sctx->framebuffer.any_dst_linear != old_any_dst_linear)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
-
- if (sctx->screen->has_out_of_order_rast &&
- (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit ||
- !!sctx->framebuffer.state.zsbuf != old_has_zsbuf ||
- (zstex && zstex->surface.has_stencil != old_has_stencil)))
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
-
- if (sctx->framebuffer.nr_samples != old_nr_samples) {
- struct pipe_constant_buffer constbuf = {0};
-
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-
- constbuf.buffer = sctx->sample_pos_buffer;
-
- /* Set sample locations as fragment shader constants. */
- switch (sctx->framebuffer.nr_samples) {
- case 1:
- constbuf.buffer_offset = 0;
- break;
- case 2:
- constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x2 -
- (ubyte*)sctx->sample_positions.x1;
- break;
- case 4:
- constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x4 -
- (ubyte*)sctx->sample_positions.x1;
- break;
- case 8:
- constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x8 -
- (ubyte*)sctx->sample_positions.x1;
- break;
- case 16:
- constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x16 -
- (ubyte*)sctx->sample_positions.x1;
- break;
- default:
- PRINT_ERR("Requested an invalid number of samples %i.\n",
- sctx->framebuffer.nr_samples);
- assert(0);
- }
- constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4;
- si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf);
-
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
- }
-
- sctx->do_update_shaders = true;
-
- if (!sctx->decompression_enabled) {
- /* Prevent textures decompression when the framebuffer state
- * changes come from the decompression passes themselves.
- */
- sctx->need_check_render_feedback = true;
- }
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_surface *surf = NULL;
+ struct si_texture *tex;
+ bool old_any_dst_linear = sctx->framebuffer.any_dst_linear;
+ unsigned old_nr_samples = sctx->framebuffer.nr_samples;
+ unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit;
+ bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf;
+ bool old_has_stencil =
+ old_has_zsbuf &&
+ ((struct si_texture *)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil;
+ bool unbound = false;
+ int i;
+
+ /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs
+ * when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
+ * We could implement the full workaround here, but it's a useless case.
+ */
+ if ((!state->width || !state->height) && (state->nr_cbufs || state->zsbuf)) {
+ unreachable("the framebuffer shouldn't have zero area");
+ return;
+ }
+
+ si_update_fb_dirtiness_after_rendering(sctx);
+
+ for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+ if (!sctx->framebuffer.state.cbufs[i])
+ continue;
+
+ tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture;
+ if (tex->dcc_gather_statistics)
+ vi_separate_dcc_stop_query(sctx, tex);
+ }
+
+ /* Disable DCC if the formats are incompatible. */
+ for (i = 0; i < state->nr_cbufs; i++) {
+ if (!state->cbufs[i])
+ continue;
+
+ surf = (struct si_surface *)state->cbufs[i];
+ tex = (struct si_texture *)surf->base.texture;
+
+ if (!surf->dcc_incompatible)
+ continue;
+
+ /* Since the DCC decompression calls back into
+ * set_framebuffer_state, we need to unbind the framebuffer, so that
+ * vi_separate_dcc_stop_query isn't called twice with the same
+ * color buffer.
+ */
+ if (!unbound) {
+ util_copy_framebuffer_state(&sctx->framebuffer.state, NULL);
+ unbound = true;
+ }
+
+ if (vi_dcc_enabled(tex, surf->base.u.tex.level))
+ if (!si_texture_disable_dcc(sctx, tex))
+ si_decompress_dcc(sctx, tex);
+
+ surf->dcc_incompatible = false;
+ }
+
+ /* Only flush TC when changing the framebuffer state, because
+ * the only client not using TC that can change textures is
+ * the framebuffer.
+ *
+ * Wait for compute shaders because of possible transitions:
+ * - FB write -> shader read
+ * - shader write -> FB read
+ *
+ * DB caches are flushed on demand (using si_decompress_textures).
+ *
+ * When MSAA is enabled, CB and TC caches are flushed on demand
+ * (after FMASK decompression). Shader write -> FB read transitions
+ * cannot happen for MSAA textures, because MSAA shader images are
+ * not supported.
+ *
+ * Only flush and wait for CB if there is actually a bound color buffer.
+ */
+ if (sctx->framebuffer.uncompressed_cb_mask) {
+ si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
+ sctx->framebuffer.CB_has_shader_readable_metadata,
+ sctx->framebuffer.all_DCC_pipe_aligned);
+ }
+
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+ /* u_blitter doesn't invoke depth decompression when it does multiple
+ * blits in a row, but the only case when it matters for DB is when
+ * doing generate_mipmap. So here we flush DB manually between
+ * individual generate_mipmap blits.
+ * Note that lower mipmap levels aren't compressed.
+ */
+ if (sctx->generate_mipmap_for_depth) {
+ si_make_DB_shader_coherent(sctx, 1, false, sctx->framebuffer.DB_has_shader_readable_metadata);
+ } else if (sctx->chip_class == GFX9) {
+ /* It appears that DB metadata "leaks" in a sequence of:
+ * - depth clear
+ * - DCC decompress for shader image writes (with DB disabled)
+ * - render with DEPTH_BEFORE_SHADER=1
+ * Flushing DB metadata works around the problem.
+ */
+ sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META;
+ }
+
+ /* Take the maximum of the old and new count. If the new count is lower,
+ * dirtying is needed to disable the unbound colorbuffers.
+ */
+ sctx->framebuffer.dirty_cbufs |=
+ (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1;
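+ /* e.g. going from 2 bound colorbuffers to 1 gives (1 << 2) - 1 = 0x3,
+ * so CB1 is re-emitted as disabled by si_emit_framebuffer_state. */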
+ sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf;
+
+ si_dec_framebuffer_counters(&sctx->framebuffer.state);
+ util_copy_framebuffer_state(&sctx->framebuffer.state, state);
+
+ sctx->framebuffer.colorbuf_enabled_4bit = 0;
+ sctx->framebuffer.spi_shader_col_format = 0;
+ sctx->framebuffer.spi_shader_col_format_alpha = 0;
+ sctx->framebuffer.spi_shader_col_format_blend = 0;
+ sctx->framebuffer.spi_shader_col_format_blend_alpha = 0;
+ sctx->framebuffer.color_is_int8 = 0;
+ sctx->framebuffer.color_is_int10 = 0;
+
+ sctx->framebuffer.compressed_cb_mask = 0;
+ sctx->framebuffer.uncompressed_cb_mask = 0;
+ sctx->framebuffer.displayable_dcc_cb_mask = 0;
+ sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state);
+ sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples;
+ sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples);
+ sctx->framebuffer.any_dst_linear = false;
+ sctx->framebuffer.CB_has_shader_readable_metadata = false;
+ sctx->framebuffer.DB_has_shader_readable_metadata = false;
+ sctx->framebuffer.all_DCC_pipe_aligned = true;
+ sctx->framebuffer.min_bytes_per_pixel = 0;
+
+ for (i = 0; i < state->nr_cbufs; i++) {
+ if (!state->cbufs[i])
+ continue;
+
+ surf = (struct si_surface *)state->cbufs[i];
+ tex = (struct si_texture *)surf->base.texture;
+
+ if (!surf->color_initialized) {
+ si_initialize_color_surface(sctx, surf);
+ }
+
+ sctx->framebuffer.colorbuf_enabled_4bit |= 0xf << (i * 4);
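+ /* Each bound colorbuffer occupies one 4-bit group (0xf = all four
+ * channels), so this mask can be ANDed with per-channel masks such as
+ * blend->cb_target_enabled_4bit (see si_out_of_order_rasterization). */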
+ sctx->framebuffer.spi_shader_col_format |= surf->spi_shader_col_format << (i * 4);
+ sctx->framebuffer.spi_shader_col_format_alpha |= surf->spi_shader_col_format_alpha << (i * 4);
+ sctx->framebuffer.spi_shader_col_format_blend |= surf->spi_shader_col_format_blend << (i * 4);
+ sctx->framebuffer.spi_shader_col_format_blend_alpha |= surf->spi_shader_col_format_blend_alpha
+ << (i * 4);
+
+ if (surf->color_is_int8)
+ sctx->framebuffer.color_is_int8 |= 1 << i;
+ if (surf->color_is_int10)
+ sctx->framebuffer.color_is_int10 |= 1 << i;
+
+ if (tex->surface.fmask_offset)
+ sctx->framebuffer.compressed_cb_mask |= 1 << i;
+ else
+ sctx->framebuffer.uncompressed_cb_mask |= 1 << i;
+
+ if (tex->surface.dcc_offset)
+ sctx->framebuffer.displayable_dcc_cb_mask |= 1 << i;
+
+ /* Don't update nr_color_samples for non-AA buffers.
+ * (e.g. destination of MSAA resolve)
+ */
+ if (tex->buffer.b.b.nr_samples >= 2 &&
+ tex->buffer.b.b.nr_storage_samples < tex->buffer.b.b.nr_samples) {
+ sctx->framebuffer.nr_color_samples =
+ MIN2(sctx->framebuffer.nr_color_samples, tex->buffer.b.b.nr_storage_samples);
+ sctx->framebuffer.nr_color_samples = MAX2(1, sctx->framebuffer.nr_color_samples);
+ }
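+ /* e.g. an EQAA surface with 8 coverage samples but only 4 stored
+ * color samples lowers nr_color_samples to 4. */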
+
+ if (tex->surface.is_linear)
+ sctx->framebuffer.any_dst_linear = true;
+
+ if (vi_dcc_enabled(tex, surf->base.u.tex.level)) {
+ sctx->framebuffer.CB_has_shader_readable_metadata = true;
+
+ if (sctx->chip_class >= GFX9 && !tex->surface.u.gfx9.dcc.pipe_aligned)
+ sctx->framebuffer.all_DCC_pipe_aligned = false;
+ }
+
+ si_context_add_resource_size(sctx, surf->base.texture);
+
+ p_atomic_inc(&tex->framebuffers_bound);
+
+ if (tex->dcc_gather_statistics) {
+ /* Dirty tracking must be enabled for DCC usage analysis. */
+ sctx->framebuffer.compressed_cb_mask |= 1 << i;
+ vi_separate_dcc_start_query(sctx, tex);
+ }
+
+ /* Update the minimum but don't keep 0. */
+ if (!sctx->framebuffer.min_bytes_per_pixel ||
+ tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
+ sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe;
+ }
+
+ /* For optimal DCC performance. */
+ if (sctx->chip_class >= GFX10)
+ sctx->framebuffer.dcc_overwrite_combiner_watermark = 6;
+ else
+ sctx->framebuffer.dcc_overwrite_combiner_watermark = 4;
+
+ struct si_texture *zstex = NULL;
+
+ if (state->zsbuf) {
+ surf = (struct si_surface *)state->zsbuf;
+ zstex = (struct si_texture *)surf->base.texture;
+
+ if (!surf->depth_initialized) {
+ si_init_depth_surface(sctx, surf);
+ }
+
+ if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, PIPE_MASK_ZS))
+ sctx->framebuffer.DB_has_shader_readable_metadata = true;
+
+ si_context_add_resource_size(sctx, surf->base.texture);
+
+ /* Update the minimum but don't keep 0. */
+ if (!sctx->framebuffer.min_bytes_per_pixel ||
+ zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
+ sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe;
+ }
+
+ si_update_ps_colorbuf0_slot(sctx);
+ si_update_poly_offset_state(sctx);
+ si_update_ngg_small_prim_precision(sctx);
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+
+ if (sctx->screen->dpbb_allowed)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+
+ if (sctx->framebuffer.any_dst_linear != old_any_dst_linear)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+
+ if (sctx->screen->has_out_of_order_rast &&
+ (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit ||
+ !!sctx->framebuffer.state.zsbuf != old_has_zsbuf ||
+ (zstex && zstex->surface.has_stencil != old_has_stencil)))
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+
+ if (sctx->framebuffer.nr_samples != old_nr_samples) {
+ struct pipe_constant_buffer constbuf = {0};
+
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+
+ constbuf.buffer = sctx->sample_pos_buffer;
+
+ /* Set sample locations as fragment shader constants. */
+ switch (sctx->framebuffer.nr_samples) {
+ case 1:
+ constbuf.buffer_offset = 0;
+ break;
+ case 2:
+ constbuf.buffer_offset =
+ (ubyte *)sctx->sample_positions.x2 - (ubyte *)sctx->sample_positions.x1;
+ break;
+ case 4:
+ constbuf.buffer_offset =
+ (ubyte *)sctx->sample_positions.x4 - (ubyte *)sctx->sample_positions.x1;
+ break;
+ case 8:
+ constbuf.buffer_offset =
+ (ubyte *)sctx->sample_positions.x8 - (ubyte *)sctx->sample_positions.x1;
+ break;
+ case 16:
+ constbuf.buffer_offset =
+ (ubyte *)sctx->sample_positions.x16 - (ubyte *)sctx->sample_positions.x1;
+ break;
+ default:
+ PRINT_ERR("Requested an invalid number of samples %i.\n", sctx->framebuffer.nr_samples);
+ assert(0);
+ }
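+ /* Each sample location is an (x, y) pair of 4-byte floats, hence
+ * nr_samples * 2 * 4 bytes. */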
+ constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4;
+ si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf);
+
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
+ }
+
+ sctx->do_update_shaders = true;
+
+ if (!sctx->decompression_enabled) {
+ /* Prevent texture decompression when the framebuffer state
+ * changes come from the decompression passes themselves.
+ */
+ sctx->need_check_render_feedback = true;
+ }
}
static void si_emit_framebuffer_state(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
- unsigned i, nr_cbufs = state->nr_cbufs;
- struct si_texture *tex = NULL;
- struct si_surface *cb = NULL;
- unsigned cb_color_info = 0;
-
- /* Colorbuffers. */
- for (i = 0; i < nr_cbufs; i++) {
- uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base;
- unsigned cb_color_attrib;
-
- if (!(sctx->framebuffer.dirty_cbufs & (1 << i)))
- continue;
-
- cb = (struct si_surface*)state->cbufs[i];
- if (!cb) {
- radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
- S_028C70_FORMAT(V_028C70_COLOR_INVALID));
- continue;
- }
-
- tex = (struct si_texture *)cb->base.texture;
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- &tex->buffer, RADEON_USAGE_READWRITE,
- tex->buffer.b.b.nr_samples > 1 ?
- RADEON_PRIO_COLOR_BUFFER_MSAA :
- RADEON_PRIO_COLOR_BUFFER);
-
- if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) {
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- tex->cmask_buffer, RADEON_USAGE_READWRITE,
- RADEON_PRIO_SEPARATE_META);
- }
-
- if (tex->dcc_separate_buffer)
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- tex->dcc_separate_buffer,
- RADEON_USAGE_READWRITE,
- RADEON_PRIO_SEPARATE_META);
-
- /* Compute mutable surface parameters. */
- cb_color_base = tex->buffer.gpu_address >> 8;
- cb_color_fmask = 0;
- cb_color_cmask = tex->cmask_base_address_reg;
- cb_dcc_base = 0;
- cb_color_info = cb->cb_color_info | tex->cb_color_info;
- cb_color_attrib = cb->cb_color_attrib;
-
- if (cb->base.u.tex.level > 0)
- cb_color_info &= C_028C70_FAST_CLEAR;
-
- if (tex->surface.fmask_offset) {
- cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8;
- cb_color_fmask |= tex->surface.fmask_tile_swizzle;
- }
-
- /* Set up DCC. */
- if (vi_dcc_enabled(tex, cb->base.u.tex.level)) {
- bool is_msaa_resolve_dst = state->cbufs[0] &&
- state->cbufs[0]->texture->nr_samples > 1 &&
- state->cbufs[1] == &cb->base &&
- state->cbufs[1]->texture->nr_samples <= 1;
-
- if (!is_msaa_resolve_dst)
- cb_color_info |= S_028C70_DCC_ENABLE(1);
-
- cb_dcc_base = ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) +
- tex->surface.dcc_offset) >> 8;
-
- unsigned dcc_tile_swizzle = tex->surface.tile_swizzle;
- dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8;
- cb_dcc_base |= dcc_tile_swizzle;
- }
-
- if (sctx->chip_class >= GFX10) {
- unsigned cb_color_attrib3;
-
- /* Set mutable surface parameters. */
- cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
- cb_color_base |= tex->surface.tile_swizzle;
- if (!tex->surface.fmask_offset)
- cb_color_fmask = cb_color_base;
- if (cb->base.u.tex.level > 0)
- cb_color_cmask = cb_color_base;
-
- cb_color_attrib3 = cb->cb_color_attrib3 |
- S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
- S_028EE0_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
- S_028EE0_CMASK_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
- S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.dcc.pipe_aligned);
-
- radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14);
- radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */
- radeon_emit(cs, 0); /* hole */
- radeon_emit(cs, 0); /* hole */
- radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */
- radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */
- radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */
- radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
- radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */
- radeon_emit(cs, 0); /* hole */
- radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */
- radeon_emit(cs, 0); /* hole */
- radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
- radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
- radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */
-
- radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4,
- cb_color_base >> 32);
- radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4,
- cb_color_cmask >> 32);
- radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4,
- cb_color_fmask >> 32);
- radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4,
- cb_dcc_base >> 32);
- radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4,
- cb->cb_color_attrib2);
- radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4,
- cb_color_attrib3);
- } else if (sctx->chip_class == GFX9) {
- struct gfx9_surf_meta_flags meta;
-
- if (tex->surface.dcc_offset)
- meta = tex->surface.u.gfx9.dcc;
- else
- meta = tex->surface.u.gfx9.cmask;
-
- /* Set mutable surface parameters. */
- cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
- cb_color_base |= tex->surface.tile_swizzle;
- if (!tex->surface.fmask_offset)
- cb_color_fmask = cb_color_base;
- if (cb->base.u.tex.level > 0)
- cb_color_cmask = cb_color_base;
- cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
- S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
- S_028C74_RB_ALIGNED(meta.rb_aligned) |
- S_028C74_PIPE_ALIGNED(meta.pipe_aligned);
-
- radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15);
- radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */
- radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */
- radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */
- radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */
- radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */
- radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */
- radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
- radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */
- radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */
- radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */
- radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */
- radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
- radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
- radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */
- radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */
-
- radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4,
- S_0287A0_EPITCH(tex->surface.u.gfx9.surf.epitch));
- } else {
- /* Compute mutable surface parameters (GFX6-GFX8). */
- const struct legacy_surf_level *level_info =
- &tex->surface.u.legacy.level[cb->base.u.tex.level];
- unsigned pitch_tile_max, slice_tile_max, tile_mode_index;
- unsigned cb_color_pitch, cb_color_slice, cb_color_fmask_slice;
-
- cb_color_base += level_info->offset >> 8;
- /* Only macrotiled modes can set tile swizzle. */
- if (level_info->mode == RADEON_SURF_MODE_2D)
- cb_color_base |= tex->surface.tile_swizzle;
-
- if (!tex->surface.fmask_offset)
- cb_color_fmask = cb_color_base;
- if (cb->base.u.tex.level > 0)
- cb_color_cmask = cb_color_base;
- if (cb_dcc_base)
- cb_dcc_base += level_info->dcc_offset >> 8;
-
- pitch_tile_max = level_info->nblk_x / 8 - 1;
- slice_tile_max = level_info->nblk_x *
- level_info->nblk_y / 64 - 1;
- tile_mode_index = si_tile_mode_index(tex, cb->base.u.tex.level, false);
-
- cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index);
- cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max);
- cb_color_slice = S_028C68_TILE_MAX(slice_tile_max);
-
- if (tex->surface.fmask_offset) {
- if (sctx->chip_class >= GFX7)
- cb_color_pitch |= S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1);
- cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index);
- cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max);
- } else {
- /* This must be set for fast clear to work without FMASK. */
- if (sctx->chip_class >= GFX7)
- cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max);
- cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index);
- cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max);
- }
-
- radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C,
- sctx->chip_class >= GFX8 ? 14 : 13);
- radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */
- radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */
- radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */
- radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */
- radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */
- radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */
- radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
- radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */
- radeon_emit(cs, tex->surface.u.legacy.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */
- radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */
- radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */
- radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
- radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
-
- if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */
- radeon_emit(cs, cb_dcc_base);
- }
- }
- for (; i < 8 ; i++)
- if (sctx->framebuffer.dirty_cbufs & (1 << i))
- radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0);
-
- /* ZS buffer. */
- if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) {
- struct si_surface *zb = (struct si_surface*)state->zsbuf;
- struct si_texture *tex = (struct si_texture*)zb->base.texture;
-
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- &tex->buffer, RADEON_USAGE_READWRITE,
- zb->base.texture->nr_samples > 1 ?
- RADEON_PRIO_DEPTH_BUFFER_MSAA :
- RADEON_PRIO_DEPTH_BUFFER);
-
- if (sctx->chip_class >= GFX10) {
- radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
- radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size);
-
- radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7);
- radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */
- radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */
- S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
- radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */
- radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */
- radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
- radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */
- radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
-
- radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5);
- radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */
- radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */
- radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */
- radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */
- radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */
- } else if (sctx->chip_class == GFX9) {
- radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3);
- radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */
- radeon_emit(cs, S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */
- radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */
-
- radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10);
- radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */
- S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
- radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */
- radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */
- radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */
- radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
- radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
- radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */
- radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */
- radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
- radeon_emit(cs, S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
-
- radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2);
- radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */
- radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */
- } else {
- radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
-
- radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9);
- radeon_emit(cs, zb->db_depth_info); /* DB_DEPTH_INFO */
- radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */
- S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0));
- radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */
- radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */
- radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
- radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */
- radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
- radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */
- radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */
- }
-
- radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
- radeon_emit(cs, tex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */
- radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */
-
- radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view);
- radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface);
- } else if (sctx->framebuffer.dirty_zsbuf) {
- if (sctx->chip_class == GFX9)
- radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2);
- else
- radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2);
-
- radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
- radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
- }
-
- /* Framebuffer dimensions. */
- /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_config() */
- radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
- S_028208_BR_X(state->width) | S_028208_BR_Y(state->height));
-
- if (sctx->screen->dfsm_allowed) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
- }
-
- sctx->framebuffer.dirty_cbufs = 0;
- sctx->framebuffer.dirty_zsbuf = false;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
+ unsigned i, nr_cbufs = state->nr_cbufs;
+ struct si_texture *tex = NULL;
+ struct si_surface *cb = NULL;
+ unsigned cb_color_info = 0;
+
+ /* Colorbuffers. */
+ for (i = 0; i < nr_cbufs; i++) {
+ uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base;
+ unsigned cb_color_attrib;
+
+ if (!(sctx->framebuffer.dirty_cbufs & (1 << i)))
+ continue;
+
+ cb = (struct si_surface *)state->cbufs[i];
+ if (!cb) {
+ radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
+ S_028C70_FORMAT(V_028C70_COLOR_INVALID));
+ continue;
+ }
+
+ tex = (struct si_texture *)cb->base.texture;
+ radeon_add_to_buffer_list(
+ sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE,
+ tex->buffer.b.b.nr_samples > 1 ? RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER);
+
+ if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) {
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, tex->cmask_buffer, RADEON_USAGE_READWRITE,
+ RADEON_PRIO_SEPARATE_META);
+ }
+
+ if (tex->dcc_separate_buffer)
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, tex->dcc_separate_buffer,
+ RADEON_USAGE_READWRITE, RADEON_PRIO_SEPARATE_META);
+
+ /* Compute mutable surface parameters. */
+ cb_color_base = tex->buffer.gpu_address >> 8;
+ cb_color_fmask = 0;
+ cb_color_cmask = tex->cmask_base_address_reg;
+ cb_dcc_base = 0;
+ cb_color_info = cb->cb_color_info | tex->cb_color_info;
+ cb_color_attrib = cb->cb_color_attrib;
+
+ if (cb->base.u.tex.level > 0)
+ cb_color_info &= C_028C70_FAST_CLEAR;
+
+ if (tex->surface.fmask_offset) {
+ cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8;
+ cb_color_fmask |= tex->surface.fmask_tile_swizzle;
+ }
+
+ /* Set up DCC. */
+ if (vi_dcc_enabled(tex, cb->base.u.tex.level)) {
+ bool is_msaa_resolve_dst = state->cbufs[0] && state->cbufs[0]->texture->nr_samples > 1 &&
+ state->cbufs[1] == &cb->base &&
+ state->cbufs[1]->texture->nr_samples <= 1;
+
+ if (!is_msaa_resolve_dst)
+ cb_color_info |= S_028C70_DCC_ENABLE(1);
+
+ cb_dcc_base =
+ ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.dcc_offset) >>
+ 8;
+
+ unsigned dcc_tile_swizzle = tex->surface.tile_swizzle;
+ dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8;
+ cb_dcc_base |= dcc_tile_swizzle;
+ }
+
+ if (sctx->chip_class >= GFX10) {
+ unsigned cb_color_attrib3;
+
+ /* Set mutable surface parameters. */
+ cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
+ cb_color_base |= tex->surface.tile_swizzle;
+ if (!tex->surface.fmask_offset)
+ cb_color_fmask = cb_color_base;
+ if (cb->base.u.tex.level > 0)
+ cb_color_cmask = cb_color_base;
+
+ cb_color_attrib3 = cb->cb_color_attrib3 |
+ S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
+ S_028EE0_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
+ S_028EE0_CMASK_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
+ S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.dcc.pipe_aligned);
+
+ radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14);
+ radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */
+ radeon_emit(cs, 0); /* hole */
+ radeon_emit(cs, 0); /* hole */
+ radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */
+ radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */
+ radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */
+ radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
+ radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */
+ radeon_emit(cs, 0); /* hole */
+ radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */
+ radeon_emit(cs, 0); /* hole */
+ radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
+ radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
+ radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */
+
+ radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32);
+ radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4,
+ cb_color_cmask >> 32);
+ radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4,
+ cb_color_fmask >> 32);
+ radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32);
+ radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2);
+ radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3);
+ } else if (sctx->chip_class == GFX9) {
+ struct gfx9_surf_meta_flags meta;
+
+ if (tex->surface.dcc_offset)
+ meta = tex->surface.u.gfx9.dcc;
+ else
+ meta = tex->surface.u.gfx9.cmask;
+
+ /* Set mutable surface parameters. */
+ cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
+ cb_color_base |= tex->surface.tile_swizzle;
+ if (!tex->surface.fmask_offset)
+ cb_color_fmask = cb_color_base;
+ if (cb->base.u.tex.level > 0)
+ cb_color_cmask = cb_color_base;
+ cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
+ S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
+ S_028C74_RB_ALIGNED(meta.rb_aligned) |
+ S_028C74_PIPE_ALIGNED(meta.pipe_aligned);
+
+ radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15);
+ radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */
+ radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */
+ radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */
+ radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */
+ radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */
+ radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */
+ radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
+ radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */
+ radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */
+ radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */
+ radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */
+ radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
+ radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
+ radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */
+ radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */
+
+ radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4,
+ S_0287A0_EPITCH(tex->surface.u.gfx9.surf.epitch));
+ } else {
+ /* Compute mutable surface parameters (GFX6-GFX8). */
+ const struct legacy_surf_level *level_info =
+ &tex->surface.u.legacy.level[cb->base.u.tex.level];
+ unsigned pitch_tile_max, slice_tile_max, tile_mode_index;
+ unsigned cb_color_pitch, cb_color_slice, cb_color_fmask_slice;
+
+ cb_color_base += level_info->offset >> 8;
+ /* Only macrotiled modes can set tile swizzle. */
+ if (level_info->mode == RADEON_SURF_MODE_2D)
+ cb_color_base |= tex->surface.tile_swizzle;
+
+ if (!tex->surface.fmask_offset)
+ cb_color_fmask = cb_color_base;
+ if (cb->base.u.tex.level > 0)
+ cb_color_cmask = cb_color_base;
+ if (cb_dcc_base)
+ cb_dcc_base += level_info->dcc_offset >> 8;
+
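+ /* TILE_MAX fields count 8-pixel units for the pitch and 8x8-pixel
+ * tiles for the slice, hence the / 8 and / 64 below. */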
+ pitch_tile_max = level_info->nblk_x / 8 - 1;
+ slice_tile_max = level_info->nblk_x * level_info->nblk_y / 64 - 1;
+ tile_mode_index = si_tile_mode_index(tex, cb->base.u.tex.level, false);
+
+ cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index);
+ cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max);
+ cb_color_slice = S_028C68_TILE_MAX(slice_tile_max);
+
+ if (tex->surface.fmask_offset) {
+ if (sctx->chip_class >= GFX7)
+ cb_color_pitch |=
+ S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1);
+ cb_color_attrib |=
+ S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index);
+ cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max);
+ } else {
+ /* This must be set for fast clear to work without FMASK. */
+ if (sctx->chip_class >= GFX7)
+ cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max);
+ cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index);
+ cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max);
+ }
+
+ radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C,
+ sctx->chip_class >= GFX8 ? 14 : 13);
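+ /* GFX8+ also emits CB_COLOR0_DCC_BASE at the end, hence 14 dwords
+ * instead of 13. */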
+ radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */
+ radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */
+ radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */
+ radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */
+ radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */
+ radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */
+ radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
+ radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */
+ radeon_emit(cs, tex->surface.u.legacy.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */
+ radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */
+ radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */
+ radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
+ radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
+
+ if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */
+ radeon_emit(cs, cb_dcc_base);
+ }
+ }
+ for (; i < 8; i++)
+ if (sctx->framebuffer.dirty_cbufs & (1 << i))
+ radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0);
+
+ /* ZS buffer. */
+ if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) {
+ struct si_surface *zb = (struct si_surface *)state->zsbuf;
+ struct si_texture *tex = (struct si_texture *)zb->base.texture;
+
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE,
+ zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA
+ : RADEON_PRIO_DEPTH_BUFFER);
+
+ if (sctx->chip_class >= GFX10) {
+ radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
+ radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size);
+
+ radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7);
+ radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */
+ radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */
+ S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
+ radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */
+ radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */
+ radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
+ radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */
+ radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
+
+ radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5);
+ radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */
+ radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */
+ radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */
+ radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */
+ radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */
+ } else if (sctx->chip_class == GFX9) {
+ radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3);
+ radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */
+ radeon_emit(cs,
+ S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */
+ radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */
+
+ radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10);
+ radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */
+ S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
+ radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */
+ radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */
+ radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */
+ radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
+ radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
+ radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */
+ radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */
+ radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
+ radeon_emit(cs,
+ S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
+
+ radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2);
+ radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */
+ radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */
+ } else {
+ radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
+
+ radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9);
+ radeon_emit(cs, zb->db_depth_info); /* DB_DEPTH_INFO */
+ radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */
+ S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0));
+ radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */
+ radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */
+ radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
+ radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */
+ radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
+ radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */
+ radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */
+ }
+
+ radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
+ radeon_emit(cs, tex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */
+ radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */
+
+ radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view);
+ radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface);
+ } else if (sctx->framebuffer.dirty_zsbuf) {
+ if (sctx->chip_class == GFX9)
+ radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2);
+ else
+ radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2);
+
+ radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
+ radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
+ }
+
+ /* Framebuffer dimensions. */
+ /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_config() */
+ radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
+ S_028208_BR_X(state->width) | S_028208_BR_Y(state->height));
+
+ if (sctx->screen->dfsm_allowed) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
+ }
+
+ sctx->framebuffer.dirty_cbufs = 0;
+ sctx->framebuffer.dirty_zsbuf = false;
}
static void si_emit_msaa_sample_locs(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- unsigned nr_samples = sctx->framebuffer.nr_samples;
- bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug;
-
- /* Smoothing (only possible with nr_samples == 1) uses the same
- * sample locations as the MSAA it simulates.
- */
- if (nr_samples <= 1 && sctx->smoothing_enabled)
- nr_samples = SI_NUM_SMOOTH_AA_SAMPLES;
-
- /* On Polaris, the small primitive filter uses the sample locations
- * even when MSAA is off, so we need to make sure they're set to 0.
- *
- * GFX10 uses sample locations unconditionally, so they always need
- * to be set up.
- */
- if ((nr_samples >= 2 || has_msaa_sample_loc_bug ||
- sctx->chip_class >= GFX10) &&
- nr_samples != sctx->sample_locs_num_samples) {
- sctx->sample_locs_num_samples = nr_samples;
- si_emit_sample_locations(cs, nr_samples);
- }
-
- if (sctx->family >= CHIP_POLARIS10) {
- unsigned small_prim_filter_cntl =
- S_028830_SMALL_PRIM_FILTER_ENABLE(1) |
- /* line bug */
- S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12);
-
- /* The alternative of setting sample locations to 0 would
- * require a DB flush to avoid Z errors, see
- * https://bugs.freedesktop.org/show_bug.cgi?id=96908
- */
- if (has_msaa_sample_loc_bug &&
- sctx->framebuffer.nr_samples > 1 &&
- !rs->multisample_enable)
- small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE;
-
- radeon_opt_set_context_reg(sctx,
- R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
- SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL,
- small_prim_filter_cntl);
- }
-
- /* The exclusion bits can be set to improve rasterization efficiency
- * if no sample lies on the pixel boundary (-8 sample offset).
- */
- bool exclusion = sctx->chip_class >= GFX7 &&
- (!rs->multisample_enable || nr_samples != 16);
- radeon_opt_set_context_reg(sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL,
- SI_TRACKED_PA_SU_PRIM_FILTER_CNTL,
- S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) |
- S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+ unsigned nr_samples = sctx->framebuffer.nr_samples;
+ bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug;
+
+ /* Smoothing (only possible with nr_samples == 1) uses the same
+ * sample locations as the MSAA it simulates.
+ */
+ if (nr_samples <= 1 && sctx->smoothing_enabled)
+ nr_samples = SI_NUM_SMOOTH_AA_SAMPLES;
+
+ /* On Polaris, the small primitive filter uses the sample locations
+ * even when MSAA is off, so we need to make sure they're set to 0.
+ *
+ * GFX10 uses sample locations unconditionally, so they always need
+ * to be set up.
+ */
+ if ((nr_samples >= 2 || has_msaa_sample_loc_bug || sctx->chip_class >= GFX10) &&
+ nr_samples != sctx->sample_locs_num_samples) {
+ sctx->sample_locs_num_samples = nr_samples;
+ si_emit_sample_locations(cs, nr_samples);
+ }
+
+ if (sctx->family >= CHIP_POLARIS10) {
+ unsigned small_prim_filter_cntl =
+ S_028830_SMALL_PRIM_FILTER_ENABLE(1) |
+ /* line bug */
+ S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12);
+
+ /* The alternative of setting sample locations to 0 would
+ * require a DB flush to avoid Z errors, see
+ * https://bugs.freedesktop.org/show_bug.cgi?id=96908
+ */
+ if (has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1 && !rs->multisample_enable)
+ small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE;
+
+ radeon_opt_set_context_reg(sctx, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
+ SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, small_prim_filter_cntl);
+ }
+
+ /* The exclusion bits can be set to improve rasterization efficiency
+ * if no sample lies on the pixel boundary (-8 sample offset).
+ */
+ bool exclusion = sctx->chip_class >= GFX7 && (!rs->multisample_enable || nr_samples != 16);
+ radeon_opt_set_context_reg(
+ sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL, SI_TRACKED_PA_SU_PRIM_FILTER_CNTL,
+ S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
}
static bool si_out_of_order_rasterization(struct si_context *sctx)
{
- struct si_state_blend *blend = sctx->queued.named.blend;
- struct si_state_dsa *dsa = sctx->queued.named.dsa;
+ struct si_state_blend *blend = sctx->queued.named.blend;
+ struct si_state_dsa *dsa = sctx->queued.named.dsa;
- if (!sctx->screen->has_out_of_order_rast)
- return false;
+ if (!sctx->screen->has_out_of_order_rast)
+ return false;
- unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit;
+ unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit;
- colormask &= blend->cb_target_enabled_4bit;
+ colormask &= blend->cb_target_enabled_4bit;
- /* Conservative: No logic op. */
- if (colormask && blend->logicop_enable)
- return false;
+ /* Conservative: No logic op. */
+ if (colormask && blend->logicop_enable)
+ return false;
- struct si_dsa_order_invariance dsa_order_invariant = {
- .zs = true, .pass_set = true, .pass_last = false
- };
+ struct si_dsa_order_invariance dsa_order_invariant = {.zs = true,
+ .pass_set = true,
+ .pass_last = false};
- if (sctx->framebuffer.state.zsbuf) {
- struct si_texture *zstex =
- (struct si_texture*)sctx->framebuffer.state.zsbuf->texture;
- bool has_stencil = zstex->surface.has_stencil;
- dsa_order_invariant = dsa->order_invariance[has_stencil];
- if (!dsa_order_invariant.zs)
- return false;
+ if (sctx->framebuffer.state.zsbuf) {
+ struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
+ bool has_stencil = zstex->surface.has_stencil;
+ dsa_order_invariant = dsa->order_invariance[has_stencil];
+ if (!dsa_order_invariant.zs)
+ return false;
- /* The set of PS invocations is always order invariant,
- * except when early Z/S tests are requested. */
- if (sctx->ps_shader.cso &&
- sctx->ps_shader.cso->info.writes_memory &&
- sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] &&
- !dsa_order_invariant.pass_set)
- return false;
+ /* The set of PS invocations is always order invariant,
+ * except when early Z/S tests are requested. */
+ if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.writes_memory &&
+ sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] &&
+ !dsa_order_invariant.pass_set)
+ return false;
- if (sctx->num_perfect_occlusion_queries != 0 &&
- !dsa_order_invariant.pass_set)
- return false;
- }
+ if (sctx->num_perfect_occlusion_queries != 0 && !dsa_order_invariant.pass_set)
+ return false;
+ }
- if (!colormask)
- return true;
+ if (!colormask)
+ return true;
- unsigned blendmask = colormask & blend->blend_enable_4bit;
+ unsigned blendmask = colormask & blend->blend_enable_4bit;
- if (blendmask) {
- /* Only commutative blending. */
- if (blendmask & ~blend->commutative_4bit)
- return false;
+ if (blendmask) {
+ /* Only commutative blending. */
+ if (blendmask & ~blend->commutative_4bit)
+ return false;
- if (!dsa_order_invariant.pass_set)
- return false;
- }
+ if (!dsa_order_invariant.pass_set)
+ return false;
+ }
- if (colormask & ~blendmask) {
- if (!dsa_order_invariant.pass_last)
- return false;
- }
+ if (colormask & ~blendmask) {
+ if (!dsa_order_invariant.pass_last)
+ return false;
+ }
- return true;
+ return true;
}
static void si_emit_msaa_config(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes;
- /* 33% faster rendering to linear color buffers */
- bool dst_is_linear = sctx->framebuffer.any_dst_linear;
- bool out_of_order_rast = si_out_of_order_rasterization(sctx);
- unsigned sc_mode_cntl_1 =
- S_028A4C_WALK_SIZE(dst_is_linear) |
- S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) |
- S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
- S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
- S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
- /* always 1: */
- S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) |
- S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
- S_028A4C_TILE_WALK_ORDER_ENABLE(1) |
- S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
- S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
- S_028A4C_FORCE_EOV_REZ_ENABLE(1);
- unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
- S_028804_INCOHERENT_EQAA_READS(1) |
- S_028804_INTERPOLATE_COMP_Z(1) |
- S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
- unsigned coverage_samples, color_samples, z_samples;
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-
- /* S: Coverage samples (up to 16x):
- * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES)
- * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES)
- *
- * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples):
- * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES)
- * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES)
- * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or
- * # from the closest defined sample if Z is uncompressed (same quality as the number of
- * # Z samples).
- *
- * F: Color samples (up to 8x, must be <= coverage samples):
- * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS)
- * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES)
- *
- * Can be anything between coverage and color samples:
- * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES)
- * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES)
- * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES)
- * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE)
- * # All are currently set the same as coverage samples.
- *
- * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown"
- * flag for undefined color samples. A shader-based resolve must handle unknowns
- * or mask them out with AND. Unknowns can also be guessed from neighbors via
- * an edge-detect shader-based resolve, which is required to make "color samples = 1"
- * useful. The CB resolve always drops unknowns.
- *
- * Sensible AA configurations:
- * EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed
- * EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed
- * EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed
- * EQAA 8s 8z 8f = 8x MSAA
- * EQAA 8s 8z 4f - might look the same as 8x MSAA
- * EQAA 8s 8z 2f - might look the same as 8x MSAA with low-density geometry
- * EQAA 8s 4z 4f - might look the same as 8x MSAA if Z is compressed
- * EQAA 8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed
- * EQAA 4s 4z 4f = 4x MSAA
- * EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry
- * EQAA 2s 2z 2f = 2x MSAA
- */
- if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) {
- coverage_samples = sctx->framebuffer.nr_samples;
- color_samples = sctx->framebuffer.nr_color_samples;
-
- if (sctx->framebuffer.state.zsbuf) {
- z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples;
- z_samples = MAX2(1, z_samples);
- } else {
- z_samples = coverage_samples;
- }
- } else if (sctx->smoothing_enabled) {
- coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES;
- } else {
- coverage_samples = color_samples = z_samples = 1;
- }
-
- /* Required by OpenGL line rasterization.
- *
- * TODO: We should also enable perpendicular endcaps for AA lines,
- * but that requires implementing line stippling in the pixel
- * shader. SC can only do line stippling with axis-aligned
- * endcaps.
- */
- unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1);
- unsigned sc_aa_config = 0;
-
- if (coverage_samples > 1) {
- /* distance from the pixel center, indexed by log2(nr_samples) */
- static unsigned max_dist[] = {
- 0, /* unused */
- 4, /* 2x MSAA */
- 6, /* 4x MSAA */
- 7, /* 8x MSAA */
- 8, /* 16x MSAA */
- };
- unsigned log_samples = util_logbase2(coverage_samples);
- unsigned log_z_samples = util_logbase2(z_samples);
- unsigned ps_iter_samples = si_get_ps_iter_samples(sctx);
- unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
-
- sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1);
- sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
- S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
- S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples);
-
- if (sctx->framebuffer.nr_samples > 1) {
- db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
- S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
- S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
- S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
- sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
- } else if (sctx->smoothing_enabled) {
- db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
- }
- }
-
- unsigned initial_cdw = cs->current.cdw;
-
- /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */
- radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL,
- SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl,
- sc_aa_config);
- /* R_028804_DB_EQAA */
- radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA,
- db_eqaa);
- /* R_028A4C_PA_SC_MODE_CNTL_1 */
- radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1,
- SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1);
-
- if (initial_cdw != cs->current.cdw) {
- sctx->context_roll = true;
-
- /* GFX9: Flush DFSM when the AA mode changes. */
- if (sctx->screen->dfsm_allowed) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
- }
- }
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes;
+ /* 33% faster rendering to linear color buffers */
+ bool dst_is_linear = sctx->framebuffer.any_dst_linear;
+ bool out_of_order_rast = si_out_of_order_rasterization(sctx);
+ unsigned sc_mode_cntl_1 =
+ S_028A4C_WALK_SIZE(dst_is_linear) | S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) |
+ S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
+ S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
+ S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
+ /* always 1: */
+ S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
+ S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
+ S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1);
+ unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) |
+ S_028804_INTERPOLATE_COMP_Z(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
+ unsigned coverage_samples, color_samples, z_samples;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+
+ /* S: Coverage samples (up to 16x):
+ * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES)
+ * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES)
+ *
+ * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples):
+ * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES)
+ * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES)
+ * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or
+ * # from the closest defined sample if Z is uncompressed (same quality as the number of
+ * # Z samples).
+ *
+ * F: Color samples (up to 8x, must be <= coverage samples):
+ * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS)
+ * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES)
+ *
+ * Can be anything between coverage and color samples:
+ * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES)
+ * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES)
+ * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES)
+ * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE)
+ * # All are currently set the same as coverage samples.
+ *
+ * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown"
+ * flag for undefined color samples. A shader-based resolve must handle unknowns
+ * or mask them out with AND. Unknowns can also be guessed from neighbors via
+ * an edge-detect shader-based resolve, which is required to make "color samples = 1"
+ * useful. The CB resolve always drops unknowns.
+ *
+ * Sensible AA configurations:
+ * EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed
+ * EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed
+ * EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed
+ * EQAA 8s 8z 8f = 8x MSAA
+ * EQAA 8s 8z 4f - might look the same as 8x MSAA
+ * EQAA 8s 8z 2f - might look the same as 8x MSAA with low-density geometry
+ * EQAA 8s 4z 4f - might look the same as 8x MSAA if Z is compressed
+ * EQAA 8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed
+ * EQAA 4s 4z 4f = 4x MSAA
+ * EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry
+ * EQAA 2s 2z 2f = 2x MSAA
+ */
+ if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) {
+ coverage_samples = sctx->framebuffer.nr_samples;
+ color_samples = sctx->framebuffer.nr_color_samples;
+
+ if (sctx->framebuffer.state.zsbuf) {
+ z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples;
+ z_samples = MAX2(1, z_samples);
+ } else {
+ z_samples = coverage_samples;
+ }
+ } else if (sctx->smoothing_enabled) {
+ coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES;
+ } else {
+ coverage_samples = color_samples = z_samples = 1;
+ }
+
+ /* Required by OpenGL line rasterization.
+ *
+ * TODO: We should also enable perpendicular endcaps for AA lines,
+ * but that requires implementing line stippling in the pixel
+ * shader. SC can only do line stippling with axis-aligned
+ * endcaps.
+ */
+ unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1);
+ unsigned sc_aa_config = 0;
+
+ if (coverage_samples > 1) {
+ /* distance from the pixel center, indexed by log2(nr_samples) */
+ static unsigned max_dist[] = {
+ 0, /* unused */
+ 4, /* 2x MSAA */
+ 6, /* 4x MSAA */
+ 7, /* 8x MSAA */
+ 8, /* 16x MSAA */
+ };
+ unsigned log_samples = util_logbase2(coverage_samples);
+ unsigned log_z_samples = util_logbase2(z_samples);
+ unsigned ps_iter_samples = si_get_ps_iter_samples(sctx);
+ unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
+
+ sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1);
+ sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
+ S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
+ S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples);
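+ /* e.g. 8x MSAA: log_samples = 3, so MAX_SAMPLE_DIST = max_dist[3] = 7. */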
+
+ if (sctx->framebuffer.nr_samples > 1) {
+ db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
+ S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
+ S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
+ S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
+ sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
+ } else if (sctx->smoothing_enabled) {
+ db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
+ }
+ }
+
+ unsigned initial_cdw = cs->current.cdw;
+
+ /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */
+ radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL,
+ sc_line_cntl, sc_aa_config);
+ /* R_028804_DB_EQAA */
+ radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, db_eqaa);
+ /* R_028A4C_PA_SC_MODE_CNTL_1 */
+ radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1,
+ sc_mode_cntl_1);
+
+ if (initial_cdw != cs->current.cdw) {
+ sctx->context_roll = true;
+
+ /* GFX9: Flush DFSM when the AA mode changes. */
+ if (sctx->screen->dfsm_allowed) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
+ }
+ }
}
void si_update_ps_iter_samples(struct si_context *sctx)
{
- if (sctx->framebuffer.nr_samples > 1)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
- if (sctx->screen->dpbb_allowed)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+ if (sctx->framebuffer.nr_samples > 1)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+ if (sctx->screen->dpbb_allowed)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
}
static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
{
- struct si_context *sctx = (struct si_context *)ctx;
+ struct si_context *sctx = (struct si_context *)ctx;
- /* The hardware can only do sample shading with 2^n samples. */
- min_samples = util_next_power_of_two(min_samples);
+ /* The hardware can only do sample shading with 2^n samples. */
+ min_samples = util_next_power_of_two(min_samples);
- if (sctx->ps_iter_samples == min_samples)
- return;
+ if (sctx->ps_iter_samples == min_samples)
+ return;
- sctx->ps_iter_samples = min_samples;
- sctx->do_update_shaders = true;
+ sctx->ps_iter_samples = min_samples;
+ sctx->do_update_shaders = true;
- si_update_ps_iter_samples(sctx);
+ si_update_ps_iter_samples(sctx);
}
/*
* Build the sampler view descriptor for a buffer texture.
* @param state 256-bit descriptor; only the high 128 bits are filled in
*/
-void
-si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf,
- enum pipe_format format,
- unsigned offset, unsigned size,
- uint32_t *state)
+void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf,
+ enum pipe_format format, unsigned offset, unsigned size,
+ uint32_t *state)
{
- const struct util_format_description *desc;
- unsigned stride;
- unsigned num_records;
-
- desc = util_format_description(format);
- stride = desc->block.bits / 8;
-
- num_records = size / stride;
- num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride);
-
- /* The NUM_RECORDS field has a different meaning depending on the chip,
- * instruction type, STRIDE, and SWIZZLE_ENABLE.
- *
- * GFX6-7,10:
- * - If STRIDE == 0, it's in byte units.
- * - If STRIDE != 0, it's in units of STRIDE, used with inst.IDXEN.
- *
- * GFX8:
- * - For SMEM and STRIDE == 0, it's in byte units.
- * - For SMEM and STRIDE != 0, it's in units of STRIDE.
- * - For VMEM and STRIDE == 0 or SWIZZLE_ENABLE == 0, it's in byte units.
- * - For VMEM and STRIDE != 0 and SWIZZLE_ENABLE == 1, it's in units of STRIDE.
- * NOTE: There is incompatibility between VMEM and SMEM opcodes due to SWIZZLE_-
- * ENABLE. The workaround is to set STRIDE = 0 if SWIZZLE_ENABLE == 0 when
- * using SMEM. This can be done in the shader by clearing STRIDE with s_and.
- * That way the same descriptor can be used by both SMEM and VMEM.
- *
- * GFX9:
- * - For SMEM and STRIDE == 0, it's in byte units.
- * - For SMEM and STRIDE != 0, it's in units of STRIDE.
- * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units.
- * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE.
- */
- if (screen->info.chip_class == GFX8)
- num_records *= stride;
-
- state[4] = 0;
- state[5] = S_008F04_STRIDE(stride);
- state[6] = num_records;
- state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
- S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
- S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
- S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
-
- if (screen->info.chip_class >= GFX10) {
- const struct gfx10_format *fmt = &gfx10_format_table[format];
-
- /* OOB_SELECT chooses the out-of-bounds check:
- * - 0: (index >= NUM_RECORDS) || (offset >= STRIDE)
- * - 1: index >= NUM_RECORDS
- * - 2: NUM_RECORDS == 0
- * - 3: if SWIZZLE_ENABLE == 0: offset >= NUM_RECORDS
- * else: swizzle_address >= NUM_RECORDS
- */
- state[7] |= S_008F0C_FORMAT(fmt->img_format) |
- S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
- S_008F0C_RESOURCE_LEVEL(1);
- } else {
- int first_non_void;
- unsigned num_format, data_format;
-
- first_non_void = util_format_get_first_non_void_channel(format);
- num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void);
- data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void);
-
- state[7] |= S_008F0C_NUM_FORMAT(num_format) |
- S_008F0C_DATA_FORMAT(data_format);
- }
+ const struct util_format_description *desc;
+ unsigned stride;
+ unsigned num_records;
+
+ desc = util_format_description(format);
+ stride = desc->block.bits / 8;
+
+ num_records = size / stride;
+ num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride);
+
+ /* The NUM_RECORDS field has a different meaning depending on the chip,
+ * instruction type, STRIDE, and SWIZZLE_ENABLE.
+ *
+ * GFX6-7,10:
+ * - If STRIDE == 0, it's in byte units.
+ * - If STRIDE != 0, it's in units of STRIDE, used with inst.IDXEN.
+ *
+ * GFX8:
+ * - For SMEM and STRIDE == 0, it's in byte units.
+ * - For SMEM and STRIDE != 0, it's in units of STRIDE.
+ * - For VMEM and STRIDE == 0 or SWIZZLE_ENABLE == 0, it's in byte units.
+ * - For VMEM and STRIDE != 0 and SWIZZLE_ENABLE == 1, it's in units of STRIDE.
+ * NOTE: There is incompatibility between VMEM and SMEM opcodes due to SWIZZLE_-
+ * ENABLE. The workaround is to set STRIDE = 0 if SWIZZLE_ENABLE == 0 when
+ * using SMEM. This can be done in the shader by clearing STRIDE with s_and.
+ * That way the same descriptor can be used by both SMEM and VMEM.
+ *
+ * GFX9:
+ * - For SMEM and STRIDE == 0, it's in byte units.
+ * - For SMEM and STRIDE != 0, it's in units of STRIDE.
+ * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units.
+ * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE.
+ */
+ if (screen->info.chip_class == GFX8)
+ num_records *= stride;
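+
+ /* Worked example (the format and sizes are only illustrative): a view of
+ * PIPE_FORMAT_R32G32B32A32_FLOAT has stride = 16, so offset = 0 and
+ * size = 4096 yield num_records = 256 records; on GFX8 the multiplication
+ * above converts that back to 4096, matching the byte-unit cases in the
+ * table above.
+ */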
+
+ state[4] = 0;
+ state[5] = S_008F04_STRIDE(stride);
+ state[6] = num_records;
+ state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
+ S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
+ S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
+ S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
+
+ if (screen->info.chip_class >= GFX10) {
+ const struct gfx10_format *fmt = &gfx10_format_table[format];
+
+ /* OOB_SELECT chooses the out-of-bounds check:
+ * - 0: (index >= NUM_RECORDS) || (offset >= STRIDE)
+ * - 1: index >= NUM_RECORDS
+ * - 2: NUM_RECORDS == 0
+ * - 3: if SWIZZLE_ENABLE == 0: offset >= NUM_RECORDS
+ * else: swizzle_address >= NUM_RECORDS
+ */
+ state[7] |= S_008F0C_FORMAT(fmt->img_format) |
+ S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
+ S_008F0C_RESOURCE_LEVEL(1);
+ } else {
+ int first_non_void;
+ unsigned num_format, data_format;
+
+ first_non_void = util_format_get_first_non_void_channel(format);
+ num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void);
+ data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void);
+
+ state[7] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format);
+ }
}
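/* A minimal usage sketch (sscreen, buf and desc are hypothetical names, not
 * taken from this code), building the descriptor for a 4 KiB RGBA32F view:
 *
 *    uint32_t desc[8] = {0};
 *    si_make_buffer_descriptor(sscreen, si_resource(buf),
 *                              PIPE_FORMAT_R32G32B32A32_FLOAT, 0, 4096, desc);
 *
 * Only desc[4..7] (the high 128 bits, as noted above) are written;
 * si_create_sampler_view_custom below calls this the same way for PIPE_BUFFER
 * resources.
 */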
static unsigned gfx9_border_color_swizzle(const unsigned char swizzle[4])
{
- unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
-
- if (swizzle[3] == PIPE_SWIZZLE_X) {
- /* For the pre-defined border color values (white, opaque
- * black, transparent black), the only thing that matters is
- * that the alpha channel winds up in the correct place
- * (because the RGB channels are all the same) so either of
- * these enumerations will work.
- */
- if (swizzle[2] == PIPE_SWIZZLE_Y)
- bc_swizzle = V_008F20_BC_SWIZZLE_WZYX;
- else
- bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ;
- } else if (swizzle[0] == PIPE_SWIZZLE_X) {
- if (swizzle[1] == PIPE_SWIZZLE_Y)
- bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
- else
- bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ;
- } else if (swizzle[1] == PIPE_SWIZZLE_X) {
- bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ;
- } else if (swizzle[2] == PIPE_SWIZZLE_X) {
- bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW;
- }
-
- return bc_swizzle;
+ unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
+
+ if (swizzle[3] == PIPE_SWIZZLE_X) {
+ /* For the pre-defined border color values (white, opaque
+ * black, transparent black), the only thing that matters is
+ * that the alpha channel winds up in the correct place
+ * (because the RGB channels are all the same) so either of
+ * these enumerations will work.
+ */
+ if (swizzle[2] == PIPE_SWIZZLE_Y)
+ bc_swizzle = V_008F20_BC_SWIZZLE_WZYX;
+ else
+ bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ;
+ } else if (swizzle[0] == PIPE_SWIZZLE_X) {
+ if (swizzle[1] == PIPE_SWIZZLE_Y)
+ bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
+ else
+ bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ;
+ } else if (swizzle[1] == PIPE_SWIZZLE_X) {
+ bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ;
+ } else if (swizzle[2] == PIPE_SWIZZLE_X) {
+ bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW;
+ }
+
+ return bc_swizzle;
}
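/* For example, a BGRA view has desc->swizzle = {Z, Y, X, W}: the first three
 * branches above don't match and the last one does (swizzle[2] == X), so
 * BC_SWIZZLE_ZYXW is selected.
 */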
/**
* Build the sampler view descriptor for a texture.
*/
-static void
-gfx10_make_texture_descriptor(struct si_screen *screen,
- struct si_texture *tex,
- bool sampler,
- enum pipe_texture_target target,
- enum pipe_format pipe_format,
- const unsigned char state_swizzle[4],
- unsigned first_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer,
- unsigned width, unsigned height, unsigned depth,
- uint32_t *state,
- uint32_t *fmask_state)
+static void gfx10_make_texture_descriptor(
+ struct si_screen *screen, struct si_texture *tex, bool sampler, enum pipe_texture_target target,
+ enum pipe_format pipe_format, const unsigned char state_swizzle[4], unsigned first_level,
+ unsigned last_level, unsigned first_layer, unsigned last_layer, unsigned width, unsigned height,
+ unsigned depth, uint32_t *state, uint32_t *fmask_state)
{
- struct pipe_resource *res = &tex->buffer.b.b;
- const struct util_format_description *desc;
- unsigned img_format;
- unsigned char swizzle[4];
- unsigned type;
- uint64_t va;
-
- desc = util_format_description(pipe_format);
- img_format = gfx10_format_table[pipe_format].img_format;
-
- if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
- const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
- const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
- const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};
- bool is_stencil = false;
-
- switch (pipe_format) {
- case PIPE_FORMAT_S8_UINT_Z24_UNORM:
- case PIPE_FORMAT_X32_S8X24_UINT:
- case PIPE_FORMAT_X8Z24_UNORM:
- util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
- is_stencil = true;
- break;
- case PIPE_FORMAT_X24S8_UINT:
- /*
- * X24S8 is implemented as an 8_8_8_8 data format, to
- * fix texture gathers. This affects at least
- * GL45-CTS.texture_cube_map_array.sampling on GFX8.
- */
- util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
- is_stencil = true;
- break;
- default:
- util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
- is_stencil = pipe_format == PIPE_FORMAT_S8_UINT;
- }
-
- if (tex->upgraded_depth && !is_stencil) {
- assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT);
- img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP;
- }
- } else {
- util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
- }
-
- if (!sampler &&
- (res->target == PIPE_TEXTURE_CUBE ||
- res->target == PIPE_TEXTURE_CUBE_ARRAY)) {
- /* For the purpose of shader images, treat cube maps as 2D
- * arrays.
- */
- type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
- } else {
- type = si_tex_dim(screen, tex, target, res->nr_samples);
- }
-
- if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
- height = 1;
- depth = res->array_size;
- } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY ||
- type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
- if (sampler || res->target != PIPE_TEXTURE_3D)
- depth = res->array_size;
- } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
- depth = res->array_size / 6;
-
- state[0] = 0;
- state[1] = S_00A004_FORMAT(img_format) |
- S_00A004_WIDTH_LO(width - 1);
- state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) |
- S_00A008_HEIGHT(height - 1) |
- S_00A008_RESOURCE_LEVEL(1);
- state[3] = S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
- S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
- S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
- S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
- S_00A00C_BASE_LEVEL(res->nr_samples > 1 ?
- 0 : first_level) |
- S_00A00C_LAST_LEVEL(res->nr_samples > 1 ?
- util_logbase2(res->nr_samples) :
- last_level) |
- S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) |
- S_00A00C_TYPE(type);
- /* Depth is the the last accessible layer on gfx9+. The hw doesn't need
- * to know the total number of layers.
- */
- state[4] = S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler)
- ? depth - 1 : last_layer) |
- S_00A010_BASE_ARRAY(first_layer);
- state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) |
- S_00A014_MAX_MIP(res->nr_samples > 1 ?
- util_logbase2(res->nr_samples) :
- tex->buffer.b.b.last_level) |
- S_00A014_PERF_MOD(4);
- state[6] = 0;
- state[7] = 0;
-
- if (tex->surface.dcc_offset) {
- state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
- S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) |
- S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
- }
-
- /* Initialize the sampler view for FMASK. */
- if (tex->surface.fmask_offset) {
- uint32_t format;
-
- va = tex->buffer.gpu_address + tex->surface.fmask_offset;
-
-#define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
- switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
- case FMASK(2,1):
- format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1;
- break;
- case FMASK(2,2):
- format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2;
- break;
- case FMASK(4,1):
- format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1;
- break;
- case FMASK(4,2):
- format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2;
- break;
- case FMASK(4,4):
- format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4;
- break;
- case FMASK(8,1):
- format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1;
- break;
- case FMASK(8,2):
- format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2;
- break;
- case FMASK(8,4):
- format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4;
- break;
- case FMASK(8,8):
- format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8;
- break;
- case FMASK(16,1):
- format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1;
- break;
- case FMASK(16,2):
- format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2;
- break;
- case FMASK(16,4):
- format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4;
- break;
- case FMASK(16,8):
- format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8;
- break;
- default:
- unreachable("invalid nr_samples");
- }
+ struct pipe_resource *res = &tex->buffer.b.b;
+ const struct util_format_description *desc;
+ unsigned img_format;
+ unsigned char swizzle[4];
+ unsigned type;
+ uint64_t va;
+
+ desc = util_format_description(pipe_format);
+ img_format = gfx10_format_table[pipe_format].img_format;
+
+ if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+ const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
+ const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
+ const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};
+ bool is_stencil = false;
+
+ switch (pipe_format) {
+ case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+ case PIPE_FORMAT_X32_S8X24_UINT:
+ case PIPE_FORMAT_X8Z24_UNORM:
+ util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
+ is_stencil = true;
+ break;
+ case PIPE_FORMAT_X24S8_UINT:
+ /*
+ * X24S8 is implemented as an 8_8_8_8 data format, to
+ * fix texture gathers. This affects at least
+ * GL45-CTS.texture_cube_map_array.sampling on GFX8.
+ */
+ util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
+ is_stencil = true;
+ break;
+ default:
+ util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
+ is_stencil = pipe_format == PIPE_FORMAT_S8_UINT;
+ }
+
+ if (tex->upgraded_depth && !is_stencil) {
+ assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT);
+ img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP;
+ }
+ } else {
+ util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
+ }
+
+ if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY)) {
+ /* For the purpose of shader images, treat cube maps as 2D
+ * arrays.
+ */
+ type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
+ } else {
+ type = si_tex_dim(screen, tex, target, res->nr_samples);
+ }
+
+ if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
+ height = 1;
+ depth = res->array_size;
+ } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
+ if (sampler || res->target != PIPE_TEXTURE_3D)
+ depth = res->array_size;
+ } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
+ depth = res->array_size / 6;
+
+ state[0] = 0;
+ state[1] = S_00A004_FORMAT(img_format) | S_00A004_WIDTH_LO(width - 1);
+ state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) |
+ S_00A008_RESOURCE_LEVEL(1);
+ state[3] =
+ S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
+ S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
+ S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
+ S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
+ S_00A00C_BASE_LEVEL(res->nr_samples > 1 ? 0 : first_level) |
+ S_00A00C_LAST_LEVEL(res->nr_samples > 1 ? util_logbase2(res->nr_samples) : last_level) |
+ S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | S_00A00C_TYPE(type);
+ * Depth is the last accessible layer on gfx9+. The hw doesn't need
+ * to know the total number of layers.
+ */
+ state[4] =
+ S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) ? depth - 1 : last_layer) |
+ S_00A010_BASE_ARRAY(first_layer);
+ state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) |
+ S_00A014_MAX_MIP(res->nr_samples > 1 ? util_logbase2(res->nr_samples)
+ : tex->buffer.b.b.last_level) |
+ S_00A014_PERF_MOD(4);
+ state[6] = 0;
+ state[7] = 0;
+
+ if (tex->surface.dcc_offset) {
+ state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
+ S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) |
+ S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
+ }
+
+ /* Initialize the sampler view for FMASK. */
+ if (tex->surface.fmask_offset) {
+ uint32_t format;
+
+ va = tex->buffer.gpu_address + tex->surface.fmask_offset;
+
+#define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
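+ /* The macro packs (coverage samples, FMASK fragments) into a single switch
+ * key, e.g. FMASK(8, 4) == 8 * 16 + 4 == 132. */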
+ switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
+ case FMASK(2, 1):
+ format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1;
+ break;
+ case FMASK(2, 2):
+ format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2;
+ break;
+ case FMASK(4, 1):
+ format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1;
+ break;
+ case FMASK(4, 2):
+ format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2;
+ break;
+ case FMASK(4, 4):
+ format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4;
+ break;
+ case FMASK(8, 1):
+ format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1;
+ break;
+ case FMASK(8, 2):
+ format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2;
+ break;
+ case FMASK(8, 4):
+ format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4;
+ break;
+ case FMASK(8, 8):
+ format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8;
+ break;
+ case FMASK(16, 1):
+ format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1;
+ break;
+ case FMASK(16, 2):
+ format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2;
+ break;
+ case FMASK(16, 4):
+ format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4;
+ break;
+ case FMASK(16, 8):
+ format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8;
+ break;
+ default:
+ unreachable("invalid nr_samples");
+ }
#undef FMASK
- fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
- fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) |
- S_00A004_FORMAT(format) |
- S_00A004_WIDTH_LO(width - 1);
- fmask_state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) |
- S_00A008_HEIGHT(height - 1) |
- S_00A008_RESOURCE_LEVEL(1);
- fmask_state[3] = S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) |
- S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
- S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) |
- S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
- S_00A00C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
- S_00A00C_TYPE(si_tex_dim(screen, tex, target, 0));
- fmask_state[4] = S_00A010_DEPTH(last_layer) |
- S_00A010_BASE_ARRAY(first_layer);
- fmask_state[5] = 0;
- fmask_state[6] = S_00A018_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned);
- fmask_state[7] = 0;
- }
+ fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
+ fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) | S_00A004_FORMAT(format) |
+ S_00A004_WIDTH_LO(width - 1);
+ fmask_state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) |
+ S_00A008_RESOURCE_LEVEL(1);
+ fmask_state[3] =
+ S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
+ S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
+ S_00A00C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
+ S_00A00C_TYPE(si_tex_dim(screen, tex, target, 0));
+ fmask_state[4] = S_00A010_DEPTH(last_layer) | S_00A010_BASE_ARRAY(first_layer);
+ fmask_state[5] = 0;
+ fmask_state[6] = S_00A018_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned);
+ fmask_state[7] = 0;
+ }
}
/**
* Build the sampler view descriptor for a texture (SI-GFX9).
*/
-static void
-si_make_texture_descriptor(struct si_screen *screen,
- struct si_texture *tex,
- bool sampler,
- enum pipe_texture_target target,
- enum pipe_format pipe_format,
- const unsigned char state_swizzle[4],
- unsigned first_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer,
- unsigned width, unsigned height, unsigned depth,
- uint32_t *state,
- uint32_t *fmask_state)
+static void si_make_texture_descriptor(struct si_screen *screen, struct si_texture *tex,
+ bool sampler, enum pipe_texture_target target,
+ enum pipe_format pipe_format,
+ const unsigned char state_swizzle[4], unsigned first_level,
+ unsigned last_level, unsigned first_layer,
+ unsigned last_layer, unsigned width, unsigned height,
+ unsigned depth, uint32_t *state, uint32_t *fmask_state)
{
- struct pipe_resource *res = &tex->buffer.b.b;
- const struct util_format_description *desc;
- unsigned char swizzle[4];
- int first_non_void;
- unsigned num_format, data_format, type, num_samples;
- uint64_t va;
-
- desc = util_format_description(pipe_format);
-
- num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ?
- MAX2(1, res->nr_samples) :
- MAX2(1, res->nr_storage_samples);
-
- if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
- const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
- const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
- const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};
-
- switch (pipe_format) {
- case PIPE_FORMAT_S8_UINT_Z24_UNORM:
- case PIPE_FORMAT_X32_S8X24_UINT:
- case PIPE_FORMAT_X8Z24_UNORM:
- util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
- break;
- case PIPE_FORMAT_X24S8_UINT:
- /*
- * X24S8 is implemented as an 8_8_8_8 data format, to
- * fix texture gathers. This affects at least
- * GL45-CTS.texture_cube_map_array.sampling on GFX8.
- */
- if (screen->info.chip_class <= GFX8)
- util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
- else
- util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
- break;
- default:
- util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
- }
- } else {
- util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
- }
-
- first_non_void = util_format_get_first_non_void_channel(pipe_format);
-
- switch (pipe_format) {
- case PIPE_FORMAT_S8_UINT_Z24_UNORM:
- num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
- break;
- default:
- if (first_non_void < 0) {
- if (util_format_is_compressed(pipe_format)) {
- switch (pipe_format) {
- case PIPE_FORMAT_DXT1_SRGB:
- case PIPE_FORMAT_DXT1_SRGBA:
- case PIPE_FORMAT_DXT3_SRGBA:
- case PIPE_FORMAT_DXT5_SRGBA:
- case PIPE_FORMAT_BPTC_SRGBA:
- case PIPE_FORMAT_ETC2_SRGB8:
- case PIPE_FORMAT_ETC2_SRGB8A1:
- case PIPE_FORMAT_ETC2_SRGBA8:
- num_format = V_008F14_IMG_NUM_FORMAT_SRGB;
- break;
- case PIPE_FORMAT_RGTC1_SNORM:
- case PIPE_FORMAT_LATC1_SNORM:
- case PIPE_FORMAT_RGTC2_SNORM:
- case PIPE_FORMAT_LATC2_SNORM:
- case PIPE_FORMAT_ETC2_R11_SNORM:
- case PIPE_FORMAT_ETC2_RG11_SNORM:
- /* implies float, so use SNORM/UNORM to determine
- whether data is signed or not */
- case PIPE_FORMAT_BPTC_RGB_FLOAT:
- num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
- break;
- default:
- num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
- break;
- }
- } else if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
- num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
- } else {
- num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
- }
- } else if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
- num_format = V_008F14_IMG_NUM_FORMAT_SRGB;
- } else {
- num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
-
- switch (desc->channel[first_non_void].type) {
- case UTIL_FORMAT_TYPE_FLOAT:
- num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
- break;
- case UTIL_FORMAT_TYPE_SIGNED:
- if (desc->channel[first_non_void].normalized)
- num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
- else if (desc->channel[first_non_void].pure_integer)
- num_format = V_008F14_IMG_NUM_FORMAT_SINT;
- else
- num_format = V_008F14_IMG_NUM_FORMAT_SSCALED;
- break;
- case UTIL_FORMAT_TYPE_UNSIGNED:
- if (desc->channel[first_non_void].normalized)
- num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
- else if (desc->channel[first_non_void].pure_integer)
- num_format = V_008F14_IMG_NUM_FORMAT_UINT;
- else
- num_format = V_008F14_IMG_NUM_FORMAT_USCALED;
- }
- }
- }
-
- data_format = si_translate_texformat(&screen->b, pipe_format, desc, first_non_void);
- if (data_format == ~0) {
- data_format = 0;
- }
-
- /* S8 with Z32 HTILE needs a special format. */
- if (screen->info.chip_class == GFX9 &&
- pipe_format == PIPE_FORMAT_S8_UINT &&
- tex->tc_compatible_htile)
- data_format = V_008F14_IMG_DATA_FORMAT_S8_32;
-
- if (!sampler &&
- (res->target == PIPE_TEXTURE_CUBE ||
- res->target == PIPE_TEXTURE_CUBE_ARRAY ||
- (screen->info.chip_class <= GFX8 &&
- res->target == PIPE_TEXTURE_3D))) {
- /* For the purpose of shader images, treat cube maps and 3D
- * textures as 2D arrays. For 3D textures, the address
- * calculations for mipmaps are different, so we rely on the
- * caller to effectively disable mipmaps.
- */
- type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
-
- assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0));
- } else {
- type = si_tex_dim(screen, tex, target, num_samples);
- }
-
- if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
- height = 1;
- depth = res->array_size;
- } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY ||
- type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
- if (sampler || res->target != PIPE_TEXTURE_3D)
- depth = res->array_size;
- } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
- depth = res->array_size / 6;
-
- state[0] = 0;
- state[1] = (S_008F14_DATA_FORMAT(data_format) |
- S_008F14_NUM_FORMAT(num_format));
- state[2] = (S_008F18_WIDTH(width - 1) |
- S_008F18_HEIGHT(height - 1) |
- S_008F18_PERF_MOD(4));
- state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
- S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
- S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
- S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
- S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) |
- S_008F1C_LAST_LEVEL(num_samples > 1 ?
- util_logbase2(num_samples) :
- last_level) |
- S_008F1C_TYPE(type));
- state[4] = 0;
- state[5] = S_008F24_BASE_ARRAY(first_layer);
- state[6] = 0;
- state[7] = 0;
-
- if (screen->info.chip_class == GFX9) {
- unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle);
-
- /* Depth is the the last accessible layer on Gfx9.
- * The hw doesn't need to know the total number of layers.
- */
- if (type == V_008F1C_SQ_RSRC_IMG_3D)
- state[4] |= S_008F20_DEPTH(depth - 1);
- else
- state[4] |= S_008F20_DEPTH(last_layer);
-
- state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle);
- state[5] |= S_008F24_MAX_MIP(num_samples > 1 ?
- util_logbase2(num_samples) :
- tex->buffer.b.b.last_level);
- } else {
- state[3] |= S_008F1C_POW2_PAD(res->last_level > 0);
- state[4] |= S_008F20_DEPTH(depth - 1);
- state[5] |= S_008F24_LAST_ARRAY(last_layer);
- }
-
- if (tex->surface.dcc_offset) {
- state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
- } else {
- /* The last dword is unused by hw. The shader uses it to clear
- * bits in the first dword of sampler state.
- */
- if (screen->info.chip_class <= GFX7 && res->nr_samples <= 1) {
- if (first_level == last_level)
- state[7] = C_008F30_MAX_ANISO_RATIO;
- else
- state[7] = 0xffffffff;
- }
- }
-
- /* Initialize the sampler view for FMASK. */
- if (tex->surface.fmask_offset) {
- uint32_t data_format, num_format;
-
- va = tex->buffer.gpu_address + tex->surface.fmask_offset;
-
-#define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
- if (screen->info.chip_class == GFX9) {
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK;
- switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
- case FMASK(2,1):
- num_format = V_008F14_IMG_FMASK_8_2_1;
- break;
- case FMASK(2,2):
- num_format = V_008F14_IMG_FMASK_8_2_2;
- break;
- case FMASK(4,1):
- num_format = V_008F14_IMG_FMASK_8_4_1;
- break;
- case FMASK(4,2):
- num_format = V_008F14_IMG_FMASK_8_4_2;
- break;
- case FMASK(4,4):
- num_format = V_008F14_IMG_FMASK_8_4_4;
- break;
- case FMASK(8,1):
- num_format = V_008F14_IMG_FMASK_8_8_1;
- break;
- case FMASK(8,2):
- num_format = V_008F14_IMG_FMASK_16_8_2;
- break;
- case FMASK(8,4):
- num_format = V_008F14_IMG_FMASK_32_8_4;
- break;
- case FMASK(8,8):
- num_format = V_008F14_IMG_FMASK_32_8_8;
- break;
- case FMASK(16,1):
- num_format = V_008F14_IMG_FMASK_16_16_1;
- break;
- case FMASK(16,2):
- num_format = V_008F14_IMG_FMASK_32_16_2;
- break;
- case FMASK(16,4):
- num_format = V_008F14_IMG_FMASK_64_16_4;
- break;
- case FMASK(16,8):
- num_format = V_008F14_IMG_FMASK_64_16_8;
- break;
- default:
- unreachable("invalid nr_samples");
- }
- } else {
- switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
- case FMASK(2,1):
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1;
- break;
- case FMASK(2,2):
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2;
- break;
- case FMASK(4,1):
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1;
- break;
- case FMASK(4,2):
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2;
- break;
- case FMASK(4,4):
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4;
- break;
- case FMASK(8,1):
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1;
- break;
- case FMASK(8,2):
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2;
- break;
- case FMASK(8,4):
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4;
- break;
- case FMASK(8,8):
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8;
- break;
- case FMASK(16,1):
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1;
- break;
- case FMASK(16,2):
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2;
- break;
- case FMASK(16,4):
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4;
- break;
- case FMASK(16,8):
- data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8;
- break;
- default:
- unreachable("invalid nr_samples");
- }
- num_format = V_008F14_IMG_NUM_FORMAT_UINT;
- }
+ struct pipe_resource *res = &tex->buffer.b.b;
+ const struct util_format_description *desc;
+ unsigned char swizzle[4];
+ int first_non_void;
+ unsigned num_format, data_format, type, num_samples;
+ uint64_t va;
+
+ desc = util_format_description(pipe_format);
+
+ num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ? MAX2(1, res->nr_samples)
+ : MAX2(1, res->nr_storage_samples);
+
+ if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+ const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
+ const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
+ const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};
+
+ switch (pipe_format) {
+ case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+ case PIPE_FORMAT_X32_S8X24_UINT:
+ case PIPE_FORMAT_X8Z24_UNORM:
+ util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
+ break;
+ case PIPE_FORMAT_X24S8_UINT:
+ /*
+ * X24S8 is implemented as an 8_8_8_8 data format, to
+ * fix texture gathers. This affects at least
+ * GL45-CTS.texture_cube_map_array.sampling on GFX8.
+ */
+ if (screen->info.chip_class <= GFX8)
+ util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
+ else
+ util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
+ break;
+ default:
+ util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
+ }
+ } else {
+ util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
+ }
+
+ first_non_void = util_format_get_first_non_void_channel(pipe_format);
+
+ switch (pipe_format) {
+ case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+ num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+ break;
+ default:
+ if (first_non_void < 0) {
+ if (util_format_is_compressed(pipe_format)) {
+ switch (pipe_format) {
+ case PIPE_FORMAT_DXT1_SRGB:
+ case PIPE_FORMAT_DXT1_SRGBA:
+ case PIPE_FORMAT_DXT3_SRGBA:
+ case PIPE_FORMAT_DXT5_SRGBA:
+ case PIPE_FORMAT_BPTC_SRGBA:
+ case PIPE_FORMAT_ETC2_SRGB8:
+ case PIPE_FORMAT_ETC2_SRGB8A1:
+ case PIPE_FORMAT_ETC2_SRGBA8:
+ num_format = V_008F14_IMG_NUM_FORMAT_SRGB;
+ break;
+ case PIPE_FORMAT_RGTC1_SNORM:
+ case PIPE_FORMAT_LATC1_SNORM:
+ case PIPE_FORMAT_RGTC2_SNORM:
+ case PIPE_FORMAT_LATC2_SNORM:
+ case PIPE_FORMAT_ETC2_R11_SNORM:
+ case PIPE_FORMAT_ETC2_RG11_SNORM:
+ /* implies float, so use SNORM/UNORM to determine
+ whether data is signed or not */
+ case PIPE_FORMAT_BPTC_RGB_FLOAT:
+ num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
+ break;
+ default:
+ num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+ break;
+ }
+ } else if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
+ num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+ } else {
+ num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
+ }
+ } else if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
+ num_format = V_008F14_IMG_NUM_FORMAT_SRGB;
+ } else {
+ num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+
+ switch (desc->channel[first_non_void].type) {
+ case UTIL_FORMAT_TYPE_FLOAT:
+ num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
+ break;
+ case UTIL_FORMAT_TYPE_SIGNED:
+ if (desc->channel[first_non_void].normalized)
+ num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
+ else if (desc->channel[first_non_void].pure_integer)
+ num_format = V_008F14_IMG_NUM_FORMAT_SINT;
+ else
+ num_format = V_008F14_IMG_NUM_FORMAT_SSCALED;
+ break;
+ case UTIL_FORMAT_TYPE_UNSIGNED:
+ if (desc->channel[first_non_void].normalized)
+ num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+ else if (desc->channel[first_non_void].pure_integer)
+ num_format = V_008F14_IMG_NUM_FORMAT_UINT;
+ else
+ num_format = V_008F14_IMG_NUM_FORMAT_USCALED;
+ }
+ }
+ }
+
+ data_format = si_translate_texformat(&screen->b, pipe_format, desc, first_non_void);
+ if (data_format == ~0) {
+ data_format = 0;
+ }
+
+ /* S8 with Z32 HTILE needs a special format. */
+ if (screen->info.chip_class == GFX9 && pipe_format == PIPE_FORMAT_S8_UINT &&
+ tex->tc_compatible_htile)
+ data_format = V_008F14_IMG_DATA_FORMAT_S8_32;
+
+ if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY ||
+ (screen->info.chip_class <= GFX8 && res->target == PIPE_TEXTURE_3D))) {
+ /* For the purpose of shader images, treat cube maps and 3D
+ * textures as 2D arrays. For 3D textures, the address
+ * calculations for mipmaps are different, so we rely on the
+ * caller to effectively disable mipmaps.
+ */
+ type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
+
+ assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0));
+ } else {
+ type = si_tex_dim(screen, tex, target, num_samples);
+ }
+
+ if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
+ height = 1;
+ depth = res->array_size;
+ } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
+ if (sampler || res->target != PIPE_TEXTURE_3D)
+ depth = res->array_size;
+ } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
+ depth = res->array_size / 6;
+
+ state[0] = 0;
+ state[1] = (S_008F14_DATA_FORMAT(data_format) | S_008F14_NUM_FORMAT(num_format));
+ state[2] = (S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1) | S_008F18_PERF_MOD(4));
+ state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
+ S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
+ S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
+ S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
+ S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) |
+ S_008F1C_LAST_LEVEL(num_samples > 1 ? util_logbase2(num_samples) : last_level) |
+ S_008F1C_TYPE(type));
+ state[4] = 0;
+ state[5] = S_008F24_BASE_ARRAY(first_layer);
+ state[6] = 0;
+ state[7] = 0;
+
+ if (screen->info.chip_class == GFX9) {
+ unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle);
+
+ * Depth is the last accessible layer on Gfx9.
+ * The hw doesn't need to know the total number of layers.
+ */
+ if (type == V_008F1C_SQ_RSRC_IMG_3D)
+ state[4] |= S_008F20_DEPTH(depth - 1);
+ else
+ state[4] |= S_008F20_DEPTH(last_layer);
+
+ state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle);
+ state[5] |= S_008F24_MAX_MIP(num_samples > 1 ? util_logbase2(num_samples)
+ : tex->buffer.b.b.last_level);
+ } else {
+ state[3] |= S_008F1C_POW2_PAD(res->last_level > 0);
+ state[4] |= S_008F20_DEPTH(depth - 1);
+ state[5] |= S_008F24_LAST_ARRAY(last_layer);
+ }
+
+ if (tex->surface.dcc_offset) {
+ state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
+ } else {
+ /* The last dword is unused by hw. The shader uses it to clear
+ * bits in the first dword of sampler state.
+ */
+ if (screen->info.chip_class <= GFX7 && res->nr_samples <= 1) {
+ if (first_level == last_level)
+ state[7] = C_008F30_MAX_ANISO_RATIO;
+ else
+ state[7] = 0xffffffff;
+ }
+ }
+
+ /* Initialize the sampler view for FMASK. */
+ if (tex->surface.fmask_offset) {
+ uint32_t data_format, num_format;
+
+ va = tex->buffer.gpu_address + tex->surface.fmask_offset;
+
+#define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
+ if (screen->info.chip_class == GFX9) {
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK;
+ switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
+ case FMASK(2, 1):
+ num_format = V_008F14_IMG_FMASK_8_2_1;
+ break;
+ case FMASK(2, 2):
+ num_format = V_008F14_IMG_FMASK_8_2_2;
+ break;
+ case FMASK(4, 1):
+ num_format = V_008F14_IMG_FMASK_8_4_1;
+ break;
+ case FMASK(4, 2):
+ num_format = V_008F14_IMG_FMASK_8_4_2;
+ break;
+ case FMASK(4, 4):
+ num_format = V_008F14_IMG_FMASK_8_4_4;
+ break;
+ case FMASK(8, 1):
+ num_format = V_008F14_IMG_FMASK_8_8_1;
+ break;
+ case FMASK(8, 2):
+ num_format = V_008F14_IMG_FMASK_16_8_2;
+ break;
+ case FMASK(8, 4):
+ num_format = V_008F14_IMG_FMASK_32_8_4;
+ break;
+ case FMASK(8, 8):
+ num_format = V_008F14_IMG_FMASK_32_8_8;
+ break;
+ case FMASK(16, 1):
+ num_format = V_008F14_IMG_FMASK_16_16_1;
+ break;
+ case FMASK(16, 2):
+ num_format = V_008F14_IMG_FMASK_32_16_2;
+ break;
+ case FMASK(16, 4):
+ num_format = V_008F14_IMG_FMASK_64_16_4;
+ break;
+ case FMASK(16, 8):
+ num_format = V_008F14_IMG_FMASK_64_16_8;
+ break;
+ default:
+ unreachable("invalid nr_samples");
+ }
+ } else {
+ switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
+ case FMASK(2, 1):
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1;
+ break;
+ case FMASK(2, 2):
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2;
+ break;
+ case FMASK(4, 1):
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1;
+ break;
+ case FMASK(4, 2):
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2;
+ break;
+ case FMASK(4, 4):
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4;
+ break;
+ case FMASK(8, 1):
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1;
+ break;
+ case FMASK(8, 2):
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2;
+ break;
+ case FMASK(8, 4):
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4;
+ break;
+ case FMASK(8, 8):
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8;
+ break;
+ case FMASK(16, 1):
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1;
+ break;
+ case FMASK(16, 2):
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2;
+ break;
+ case FMASK(16, 4):
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4;
+ break;
+ case FMASK(16, 8):
+ data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8;
+ break;
+ default:
+ unreachable("invalid nr_samples");
+ }
+ num_format = V_008F14_IMG_NUM_FORMAT_UINT;
+ }
#undef FMASK
- fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
- fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) |
- S_008F14_DATA_FORMAT(data_format) |
- S_008F14_NUM_FORMAT(num_format);
- fmask_state[2] = S_008F18_WIDTH(width - 1) |
- S_008F18_HEIGHT(height - 1);
- fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) |
- S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
- S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) |
- S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
- S_008F1C_TYPE(si_tex_dim(screen, tex, target, 0));
- fmask_state[4] = 0;
- fmask_state[5] = S_008F24_BASE_ARRAY(first_layer);
- fmask_state[6] = 0;
- fmask_state[7] = 0;
-
- if (screen->info.chip_class == GFX9) {
- fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode);
- fmask_state[4] |= S_008F20_DEPTH(last_layer) |
- S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch);
- fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
- S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned);
- } else {
- fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index);
- fmask_state[4] |= S_008F20_DEPTH(depth - 1) |
- S_008F20_PITCH(tex->surface.u.legacy.fmask.pitch_in_pixels - 1);
- fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer);
- }
- }
+ fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
+ fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | S_008F14_DATA_FORMAT(data_format) |
+ S_008F14_NUM_FORMAT(num_format);
+ fmask_state[2] = S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1);
+ fmask_state[3] =
+ S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
+ S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
+ S_008F1C_TYPE(si_tex_dim(screen, tex, target, 0));
+ fmask_state[4] = 0;
+ fmask_state[5] = S_008F24_BASE_ARRAY(first_layer);
+ fmask_state[6] = 0;
+ fmask_state[7] = 0;
+
+ if (screen->info.chip_class == GFX9) {
+ fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode);
+ fmask_state[4] |=
+ S_008F20_DEPTH(last_layer) | S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch);
+ fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
+ S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned);
+ } else {
+ fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index);
+ fmask_state[4] |= S_008F20_DEPTH(depth - 1) |
+ S_008F20_PITCH(tex->surface.u.legacy.fmask.pitch_in_pixels - 1);
+ fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer);
+ }
+ }
}
/**
* @param height0 height0 override (for compressed textures as int)
* @param force_level set the base address to the level (for compressed textures)
*/
-struct pipe_sampler_view *
-si_create_sampler_view_custom(struct pipe_context *ctx,
- struct pipe_resource *texture,
- const struct pipe_sampler_view *state,
- unsigned width0, unsigned height0,
- unsigned force_level)
+struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx,
+ struct pipe_resource *texture,
+ const struct pipe_sampler_view *state,
+ unsigned width0, unsigned height0,
+ unsigned force_level)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view);
- struct si_texture *tex = (struct si_texture*)texture;
- unsigned base_level, first_level, last_level;
- unsigned char state_swizzle[4];
- unsigned height, depth, width;
- unsigned last_layer = state->u.tex.last_layer;
- enum pipe_format pipe_format;
- const struct legacy_surf_level *surflevel;
-
- if (!view)
- return NULL;
-
- /* initialize base object */
- view->base = *state;
- view->base.texture = NULL;
- view->base.reference.count = 1;
- view->base.context = ctx;
-
- assert(texture);
- pipe_resource_reference(&view->base.texture, texture);
-
- if (state->format == PIPE_FORMAT_X24S8_UINT ||
- state->format == PIPE_FORMAT_S8X24_UINT ||
- state->format == PIPE_FORMAT_X32_S8X24_UINT ||
- state->format == PIPE_FORMAT_S8_UINT)
- view->is_stencil_sampler = true;
-
- /* Buffer resource. */
- if (texture->target == PIPE_BUFFER) {
- si_make_buffer_descriptor(sctx->screen,
- si_resource(texture),
- state->format,
- state->u.buf.offset,
- state->u.buf.size,
- view->state);
- return &view->base;
- }
-
- state_swizzle[0] = state->swizzle_r;
- state_swizzle[1] = state->swizzle_g;
- state_swizzle[2] = state->swizzle_b;
- state_swizzle[3] = state->swizzle_a;
-
- base_level = 0;
- first_level = state->u.tex.first_level;
- last_level = state->u.tex.last_level;
- width = width0;
- height = height0;
- depth = texture->depth0;
-
- if (sctx->chip_class <= GFX8 && force_level) {
- assert(force_level == first_level &&
- force_level == last_level);
- base_level = force_level;
- first_level = 0;
- last_level = 0;
- width = u_minify(width, force_level);
- height = u_minify(height, force_level);
- depth = u_minify(depth, force_level);
- }
-
- /* This is not needed if state trackers set last_layer correctly. */
- if (state->target == PIPE_TEXTURE_1D ||
- state->target == PIPE_TEXTURE_2D ||
- state->target == PIPE_TEXTURE_RECT ||
- state->target == PIPE_TEXTURE_CUBE)
- last_layer = state->u.tex.first_layer;
-
- /* Texturing with separate depth and stencil. */
- pipe_format = state->format;
-
- /* Depth/stencil texturing sometimes needs separate texture. */
- if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) {
- if (!tex->flushed_depth_texture &&
- !si_init_flushed_depth_texture(ctx, texture)) {
- pipe_resource_reference(&view->base.texture, NULL);
- FREE(view);
- return NULL;
- }
-
- assert(tex->flushed_depth_texture);
-
- /* Override format for the case where the flushed texture
- * contains only Z or only S.
- */
- if (tex->flushed_depth_texture->buffer.b.b.format != tex->buffer.b.b.format)
- pipe_format = tex->flushed_depth_texture->buffer.b.b.format;
-
- tex = tex->flushed_depth_texture;
- }
-
- surflevel = tex->surface.u.legacy.level;
-
- if (tex->db_compatible) {
- if (!view->is_stencil_sampler)
- pipe_format = tex->db_render_format;
-
- switch (pipe_format) {
- case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
- pipe_format = PIPE_FORMAT_Z32_FLOAT;
- break;
- case PIPE_FORMAT_X8Z24_UNORM:
- case PIPE_FORMAT_S8_UINT_Z24_UNORM:
- /* Z24 is always stored like this for DB
- * compatibility.
- */
- pipe_format = PIPE_FORMAT_Z24X8_UNORM;
- break;
- case PIPE_FORMAT_X24S8_UINT:
- case PIPE_FORMAT_S8X24_UINT:
- case PIPE_FORMAT_X32_S8X24_UINT:
- pipe_format = PIPE_FORMAT_S8_UINT;
- surflevel = tex->surface.u.legacy.stencil_level;
- break;
- default:;
- }
- }
-
- view->dcc_incompatible =
- vi_dcc_formats_are_incompatible(texture,
- state->u.tex.first_level,
- state->format);
-
- sctx->screen->make_texture_descriptor(sctx->screen, tex, true,
- state->target, pipe_format, state_swizzle,
- first_level, last_level,
- state->u.tex.first_layer, last_layer,
- width, height, depth,
- view->state, view->fmask_state);
-
- const struct util_format_description *desc = util_format_description(pipe_format);
- view->is_integer = false;
-
- for (unsigned i = 0; i < desc->nr_channels; ++i) {
- if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID)
- continue;
-
- /* Whether the number format is {U,S}{SCALED,INT} */
- view->is_integer =
- (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
- desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) &&
- (desc->channel[i].pure_integer || !desc->channel[i].normalized);
- break;
- }
-
- view->base_level_info = &surflevel[base_level];
- view->base_level = base_level;
- view->block_width = util_format_get_blockwidth(pipe_format);
- return &view->base;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view);
+ struct si_texture *tex = (struct si_texture *)texture;
+ unsigned base_level, first_level, last_level;
+ unsigned char state_swizzle[4];
+ unsigned height, depth, width;
+ unsigned last_layer = state->u.tex.last_layer;
+ enum pipe_format pipe_format;
+ const struct legacy_surf_level *surflevel;
+
+ if (!view)
+ return NULL;
+
+ /* initialize base object */
+ view->base = *state;
+ view->base.texture = NULL;
+ view->base.reference.count = 1;
+ view->base.context = ctx;
+
+ assert(texture);
+ pipe_resource_reference(&view->base.texture, texture);
+
+ if (state->format == PIPE_FORMAT_X24S8_UINT || state->format == PIPE_FORMAT_S8X24_UINT ||
+ state->format == PIPE_FORMAT_X32_S8X24_UINT || state->format == PIPE_FORMAT_S8_UINT)
+ view->is_stencil_sampler = true;
+
+ /* Buffer resource. */
+ if (texture->target == PIPE_BUFFER) {
+ si_make_buffer_descriptor(sctx->screen, si_resource(texture), state->format,
+ state->u.buf.offset, state->u.buf.size, view->state);
+ return &view->base;
+ }
+
+ state_swizzle[0] = state->swizzle_r;
+ state_swizzle[1] = state->swizzle_g;
+ state_swizzle[2] = state->swizzle_b;
+ state_swizzle[3] = state->swizzle_a;
+
+ base_level = 0;
+ first_level = state->u.tex.first_level;
+ last_level = state->u.tex.last_level;
+ width = width0;
+ height = height0;
+ depth = texture->depth0;
+
+ if (sctx->chip_class <= GFX8 && force_level) {
+ assert(force_level == first_level && force_level == last_level);
+ base_level = force_level;
+ first_level = 0;
+ last_level = 0;
+ width = u_minify(width, force_level);
+ height = u_minify(height, force_level);
+ depth = u_minify(depth, force_level);
+ }
+
+ /* This is not needed if state trackers set last_layer correctly. */
+ if (state->target == PIPE_TEXTURE_1D || state->target == PIPE_TEXTURE_2D ||
+ state->target == PIPE_TEXTURE_RECT || state->target == PIPE_TEXTURE_CUBE)
+ last_layer = state->u.tex.first_layer;
+
+ /* Texturing with separate depth and stencil. */
+ pipe_format = state->format;
+
+ /* Depth/stencil texturing sometimes needs a separate texture. */
+ if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) {
+ if (!tex->flushed_depth_texture && !si_init_flushed_depth_texture(ctx, texture)) {
+ pipe_resource_reference(&view->base.texture, NULL);
+ FREE(view);
+ return NULL;
+ }
+
+ assert(tex->flushed_depth_texture);
+
+ /* Override format for the case where the flushed texture
+ * contains only Z or only S.
+ */
+ if (tex->flushed_depth_texture->buffer.b.b.format != tex->buffer.b.b.format)
+ pipe_format = tex->flushed_depth_texture->buffer.b.b.format;
+
+ tex = tex->flushed_depth_texture;
+ }
+
+ surflevel = tex->surface.u.legacy.level;
+
+ if (tex->db_compatible) {
+ if (!view->is_stencil_sampler)
+ pipe_format = tex->db_render_format;
+
+ switch (pipe_format) {
+ case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+ pipe_format = PIPE_FORMAT_Z32_FLOAT;
+ break;
+ case PIPE_FORMAT_X8Z24_UNORM:
+ case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+ /* Z24 is always stored like this for DB
+ * compatibility.
+ */
+ pipe_format = PIPE_FORMAT_Z24X8_UNORM;
+ break;
+ case PIPE_FORMAT_X24S8_UINT:
+ case PIPE_FORMAT_S8X24_UINT:
+ case PIPE_FORMAT_X32_S8X24_UINT:
+ pipe_format = PIPE_FORMAT_S8_UINT;
+ surflevel = tex->surface.u.legacy.stencil_level;
+ break;
+ default:;
+ }
+ }
+
+ view->dcc_incompatible =
+ vi_dcc_formats_are_incompatible(texture, state->u.tex.first_level, state->format);
+
+ sctx->screen->make_texture_descriptor(
+ sctx->screen, tex, true, state->target, pipe_format, state_swizzle, first_level, last_level,
+ state->u.tex.first_layer, last_layer, width, height, depth, view->state, view->fmask_state);
+
+ const struct util_format_description *desc = util_format_description(pipe_format);
+ view->is_integer = false;
+
+ for (unsigned i = 0; i < desc->nr_channels; ++i) {
+ if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID)
+ continue;
+
+ /* Whether the number format is {U,S}{SCALED,INT} */
+ view->is_integer = (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
+ desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) &&
+ (desc->channel[i].pure_integer || !desc->channel[i].normalized);
+ break;
+ }
+
+ view->base_level_info = &surflevel[base_level];
+ view->base_level = base_level;
+ view->block_width = util_format_get_blockwidth(pipe_format);
+ return &view->base;
}
-static struct pipe_sampler_view *
-si_create_sampler_view(struct pipe_context *ctx,
- struct pipe_resource *texture,
- const struct pipe_sampler_view *state)
+static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx,
+ struct pipe_resource *texture,
+ const struct pipe_sampler_view *state)
{
- return si_create_sampler_view_custom(ctx, texture, state,
- texture ? texture->width0 : 0,
- texture ? texture->height0 : 0, 0);
+ return si_create_sampler_view_custom(ctx, texture, state, texture ? texture->width0 : 0,
+ texture ? texture->height0 : 0, 0);
}
-static void si_sampler_view_destroy(struct pipe_context *ctx,
- struct pipe_sampler_view *state)
+static void si_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sampler_view *state)
{
- struct si_sampler_view *view = (struct si_sampler_view *)state;
+ struct si_sampler_view *view = (struct si_sampler_view *)state;
- pipe_resource_reference(&state->texture, NULL);
- FREE(view);
+ pipe_resource_reference(&state->texture, NULL);
+ FREE(view);
}
static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter)
{
- return wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
- wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER ||
- (linear_filter &&
- (wrap == PIPE_TEX_WRAP_CLAMP ||
- wrap == PIPE_TEX_WRAP_MIRROR_CLAMP));
+ return wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER || wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER ||
+ (linear_filter && (wrap == PIPE_TEX_WRAP_CLAMP || wrap == PIPE_TEX_WRAP_MIRROR_CLAMP));
}
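/* For example, in the check above, PIPE_TEX_WRAP_CLAMP reads the border color
 * only with a linear filter; with nearest filtering the clamped coordinate
 * always lands on the edge texel, so the border color can't affect the result.
 */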
static uint32_t si_translate_border_color(struct si_context *sctx,
- const struct pipe_sampler_state *state,
- const union pipe_color_union *color,
- bool is_integer)
+ const struct pipe_sampler_state *state,
+ const union pipe_color_union *color, bool is_integer)
{
- bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
- state->mag_img_filter != PIPE_TEX_FILTER_NEAREST;
-
- if (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) &&
- !wrap_mode_uses_border_color(state->wrap_t, linear_filter) &&
- !wrap_mode_uses_border_color(state->wrap_r, linear_filter))
- return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
-
-#define simple_border_types(elt) \
-do { \
- if (color->elt[0] == 0 && color->elt[1] == 0 && \
- color->elt[2] == 0 && color->elt[3] == 0) \
- return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); \
- if (color->elt[0] == 0 && color->elt[1] == 0 && \
- color->elt[2] == 0 && color->elt[3] == 1) \
- return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK); \
- if (color->elt[0] == 1 && color->elt[1] == 1 && \
- color->elt[2] == 1 && color->elt[3] == 1) \
- return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE); \
-} while (false)
-
- if (is_integer)
- simple_border_types(ui);
- else
- simple_border_types(f);
+ bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
+ state->mag_img_filter != PIPE_TEX_FILTER_NEAREST;
+
+ if (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) &&
+ !wrap_mode_uses_border_color(state->wrap_t, linear_filter) &&
+ !wrap_mode_uses_border_color(state->wrap_r, linear_filter))
+ return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
+
+#define simple_border_types(elt) \
+ do { \
+ if (color->elt[0] == 0 && color->elt[1] == 0 && color->elt[2] == 0 && color->elt[3] == 0) \
+ return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); \
+ if (color->elt[0] == 0 && color->elt[1] == 0 && color->elt[2] == 0 && color->elt[3] == 1) \
+ return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK); \
+ if (color->elt[0] == 1 && color->elt[1] == 1 && color->elt[2] == 1 && color->elt[3] == 1) \
+ return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE); \
+ } while (false)
+
+ if (is_integer)
+ simple_border_types(ui);
+ else
+ simple_border_types(f);
#undef simple_border_types
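/* Any color other than the three predefined values above (e.g.
 * {0.5, 0.5, 0.5, 1.0}) falls through to the table lookup and upload below.
 */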
- int i;
-
- /* Check if the border has been uploaded already. */
- for (i = 0; i < sctx->border_color_count; i++)
- if (memcmp(&sctx->border_color_table[i], color,
- sizeof(*color)) == 0)
- break;
-
- if (i >= SI_MAX_BORDER_COLORS) {
- /* Getting 4096 unique border colors is very unlikely. */
- fprintf(stderr, "radeonsi: The border color table is full. "
- "Any new border colors will be just black. "
- "Please file a bug.\n");
- return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
- }
-
- if (i == sctx->border_color_count) {
- /* Upload a new border color. */
- memcpy(&sctx->border_color_table[i], color,
- sizeof(*color));
- util_memcpy_cpu_to_le32(&sctx->border_color_map[i],
- color, sizeof(*color));
- sctx->border_color_count++;
- }
-
- return S_008F3C_BORDER_COLOR_PTR(i) |
- S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER);
+ int i;
+
+ /* Check if the border has been uploaded already. */
+ for (i = 0; i < sctx->border_color_count; i++)
+ if (memcmp(&sctx->border_color_table[i], color, sizeof(*color)) == 0)
+ break;
+
+ if (i >= SI_MAX_BORDER_COLORS) {
+ /* Getting 4096 unique border colors is very unlikely. */
+ fprintf(stderr, "radeonsi: The border color table is full. "
+ "Any new border colors will be just black. "
+ "Please file a bug.\n");
+ return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
+ }
+
+ if (i == sctx->border_color_count) {
+ /* Upload a new border color. */
+ memcpy(&sctx->border_color_table[i], color, sizeof(*color));
+ util_memcpy_cpu_to_le32(&sctx->border_color_map[i], color, sizeof(*color));
+ sctx->border_color_count++;
+ }
+
+ return S_008F3C_BORDER_COLOR_PTR(i) |
+ S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER);
}
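
Editor's note on the logic above: si_translate_border_color first tries the three border colors the hardware knows natively (transparent black, opaque black, opaque white) and only then falls back to the per-context table, which is a plain find-or-append cache whose slot index ends up in BORDER_COLOR_PTR. The standalone sketch below restates just that find-or-append step; the function name and the 4096-entry limit are illustrative stand-ins (mirroring SI_MAX_BORDER_COLORS), not driver code.

#include <assert.h>
#include <string.h>

#define MAX_BORDER_COLORS 4096 /* illustrative stand-in for SI_MAX_BORDER_COLORS */

/* Return the table slot holding `color`, appending it if it is new.
 * Returns -1 when the table is full (the driver then falls back to
 * transparent black and prints a warning). */
static int border_color_slot(float (*table)[4], int *count, const float color[4])
{
   for (int i = 0; i < *count; i++)
      if (memcmp(table[i], color, 4 * sizeof(float)) == 0)
         return i;

   if (*count >= MAX_BORDER_COLORS)
      return -1;

   memcpy(table[*count], color, 4 * sizeof(float));
   return (*count)++;
}

int main(void)
{
   static float table[MAX_BORDER_COLORS][4];
   int count = 0;
   const float red[4] = {1, 0, 0, 1};
   const float blue[4] = {0, 0, 1, 1};

   assert(border_color_slot(table, &count, red) == 0);  /* appended */
   assert(border_color_slot(table, &count, blue) == 1); /* appended */
   assert(border_color_slot(table, &count, red) == 0);  /* deduplicated */
   return 0;
}

The real function additionally copies each new entry into the GPU-mapped table through util_memcpy_cpu_to_le32, as seen above, so the descriptor always reads little-endian data.
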
static inline int S_FIXED(float value, unsigned frac_bits)
{
- return value * (1 << frac_bits);
+ return value * (1 << frac_bits);
}
static inline unsigned si_tex_filter(unsigned filter, unsigned max_aniso)
{
- if (filter == PIPE_TEX_FILTER_LINEAR)
- return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR
- : V_008F38_SQ_TEX_XY_FILTER_BILINEAR;
- else
- return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT
- : V_008F38_SQ_TEX_XY_FILTER_POINT;
+ if (filter == PIPE_TEX_FILTER_LINEAR)
+ return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR
+ : V_008F38_SQ_TEX_XY_FILTER_BILINEAR;
+ else
+ return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT
+ : V_008F38_SQ_TEX_XY_FILTER_POINT;
}
static inline unsigned si_tex_aniso_filter(unsigned filter)
{
- if (filter < 2)
- return 0;
- if (filter < 4)
- return 1;
- if (filter < 8)
- return 2;
- if (filter < 16)
- return 3;
- return 4;
+ if (filter < 2)
+ return 0;
+ if (filter < 4)
+ return 1;
+ if (filter < 8)
+ return 2;
+ if (filter < 16)
+ return 3;
+ return 4;
}
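
Two small helpers above do the unit conversions for the sampler words: S_FIXED turns a float into fixed point by scaling with 2^frac_bits, and si_tex_aniso_filter maps the requested max anisotropy onto the log2 ratio field (1x..16x -> 0..4). A minimal standalone restatement of both mappings, with illustrative names only:

#include <assert.h>

/* Fixed-point conversion: value * 2^frac_bits, truncated to int. */
static int fixed_from_float(float value, unsigned frac_bits)
{
   return value * (1 << frac_bits); /* e.g. 1.5 with 8 fractional bits -> 384 */
}

/* Max anisotropy -> log2 of the anisotropy ratio. */
static unsigned aniso_log2_ratio(unsigned max_aniso)
{
   if (max_aniso < 2)  return 0; /* 1x  */
   if (max_aniso < 4)  return 1; /* 2x  */
   if (max_aniso < 8)  return 2; /* 4x  */
   if (max_aniso < 16) return 3; /* 8x  */
   return 4;                     /* 16x */
}

int main(void)
{
   assert(fixed_from_float(1.5f, 8) == 384);
   assert(fixed_from_float(-2.0f, 8) == -512); /* lod_bias may be negative */
   assert(aniso_log2_ratio(1) == 0 && aniso_log2_ratio(16) == 4);
   return 0;
}
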
static void *si_create_sampler_state(struct pipe_context *ctx,
- const struct pipe_sampler_state *state)
+ const struct pipe_sampler_state *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_screen *sscreen = sctx->screen;
- struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state);
- unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso
- : state->max_anisotropy;
- unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso);
- union pipe_color_union clamped_border_color;
-
- if (!rstate) {
- return NULL;
- }
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_screen *sscreen = sctx->screen;
+ struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state);
+ unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso : state->max_anisotropy;
+ unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso);
+ union pipe_color_union clamped_border_color;
+
+ if (!rstate) {
+ return NULL;
+ }
#ifndef NDEBUG
- rstate->magic = SI_SAMPLER_STATE_MAGIC;
+ rstate->magic = SI_SAMPLER_STATE_MAGIC;
#endif
- rstate->val[0] = (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) |
- S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) |
- S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) |
- S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) |
- S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) |
- S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) |
- S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) |
- S_008F30_ANISO_BIAS(max_aniso_ratio) |
- S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) |
- S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9));
- rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) |
- S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) |
- S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0));
- rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) |
- S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) |
- S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) |
- S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) |
- S_008F38_MIP_POINT_PRECLAMP(0));
- rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, false);
-
- if (sscreen->info.chip_class >= GFX10) {
- rstate->val[2] |= S_008F38_ANISO_OVERRIDE_GFX10(1);
- } else {
- rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) |
- S_008F38_FILTER_PREC_FIX(1) |
- S_008F38_ANISO_OVERRIDE_GFX6(sctx->chip_class >= GFX8);
- }
-
- /* Create sampler resource for integer textures. */
- memcpy(rstate->integer_val, rstate->val, sizeof(rstate->val));
- rstate->integer_val[3] = si_translate_border_color(sctx, state, &state->border_color, true);
-
- /* Create sampler resource for upgraded depth textures. */
- memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val));
-
- for (unsigned i = 0; i < 4; ++i) {
- /* Use channel 0 on purpose, so that we can use OPAQUE_WHITE
- * when the border color is 1.0. */
- clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1);
- }
-
- if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0) {
- if (sscreen->info.chip_class <= GFX9)
- rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1);
- } else {
- rstate->upgraded_depth_val[3] =
- si_translate_border_color(sctx, state, &clamped_border_color, false);
- }
-
- return rstate;
+ rstate->val[0] =
+ (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) | S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) |
+ S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) | S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) |
+ S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) |
+ S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) |
+ S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | S_008F30_ANISO_BIAS(max_aniso_ratio) |
+ S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) |
+ S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9));
+ rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) |
+ S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) |
+ S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0));
+ rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) |
+ S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) |
+ S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) |
+ S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) |
+ S_008F38_MIP_POINT_PRECLAMP(0));
+ rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, false);
+
+ if (sscreen->info.chip_class >= GFX10) {
+ rstate->val[2] |= S_008F38_ANISO_OVERRIDE_GFX10(1);
+ } else {
+ rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) |
+ S_008F38_FILTER_PREC_FIX(1) |
+ S_008F38_ANISO_OVERRIDE_GFX6(sctx->chip_class >= GFX8);
+ }
+
+ /* Create sampler resource for integer textures. */
+ memcpy(rstate->integer_val, rstate->val, sizeof(rstate->val));
+ rstate->integer_val[3] = si_translate_border_color(sctx, state, &state->border_color, true);
+
+ /* Create sampler resource for upgraded depth textures. */
+ memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val));
+
+ for (unsigned i = 0; i < 4; ++i) {
+ /* Use channel 0 on purpose, so that we can use OPAQUE_WHITE
+ * when the border color is 1.0. */
+ clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1);
+ }
+
+ if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0) {
+ if (sscreen->info.chip_class <= GFX9)
+ rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1);
+ } else {
+ rstate->upgraded_depth_val[3] =
+ si_translate_border_color(sctx, state, &clamped_border_color, false);
+ }
+
+ return rstate;
}
static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
{
- struct si_context *sctx = (struct si_context *)ctx;
+ struct si_context *sctx = (struct si_context *)ctx;
- if (sctx->sample_mask == (uint16_t)sample_mask)
- return;
+ if (sctx->sample_mask == (uint16_t)sample_mask)
+ return;
- sctx->sample_mask = sample_mask;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask);
+ sctx->sample_mask = sample_mask;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask);
}
static void si_emit_sample_mask(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned mask = sctx->sample_mask;
-
- /* Needed for line and polygon smoothing as well as for the Polaris
- * small primitive filter. We expect the state tracker to take care of
- * this for us.
- */
- assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 ||
- (mask & 1 && sctx->blitter->running));
-
- radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
- radeon_emit(cs, mask | (mask << 16));
- radeon_emit(cs, mask | (mask << 16));
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned mask = sctx->sample_mask;
+
+ /* Needed for line and polygon smoothing as well as for the Polaris
+ * small primitive filter. We expect the state tracker to take care of
+ * this for us.
+ */
+ assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 ||
+ (mask & 1 && sctx->blitter->running));
+
+ radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
+ radeon_emit(cs, mask | (mask << 16));
+ radeon_emit(cs, mask | (mask << 16));
}
static void si_delete_sampler_state(struct pipe_context *ctx, void *state)
{
#ifndef NDEBUG
- struct si_sampler_state *s = state;
+ struct si_sampler_state *s = state;
- assert(s->magic == SI_SAMPLER_STATE_MAGIC);
- s->magic = 0;
+ assert(s->magic == SI_SAMPLER_STATE_MAGIC);
+ s->magic = 0;
#endif
- free(state);
+ free(state);
}
/*
* Vertex elements & buffers
*/
-struct si_fast_udiv_info32
-si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits)
+struct si_fast_udiv_info32 si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits)
{
- struct util_fast_udiv_info info =
- util_compute_fast_udiv_info(D, num_bits, 32);
-
- struct si_fast_udiv_info32 result = {
- info.multiplier,
- info.pre_shift,
- info.post_shift,
- info.increment,
- };
- return result;
+ struct util_fast_udiv_info info = util_compute_fast_udiv_info(D, num_bits, 32);
+
+ struct si_fast_udiv_info32 result = {
+ info.multiplier,
+ info.pre_shift,
+ info.post_shift,
+ info.increment,
+ };
+ return result;
}
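
si_compute_fast_udiv_info32 packs the multiplier/shift factors that let a vertex shader divide the instance index by an arbitrary instance divisor using only a 32x32->64 multiply and shifts. The exact factors come from util_compute_fast_udiv_info; the sketch below does not reproduce that routine, it only demonstrates the underlying multiply-high trick with the well-known magic constant for division by 3.

#include <assert.h>
#include <stdint.h>

/* 0xAAAAAAAB == ceil(2^33 / 3), so (n * 0xAAAAAAAB) >> 33 == n / 3 for any
 * 32-bit n: one multiply and one shift instead of an integer divide. */
static uint32_t div3_fast(uint32_t n)
{
   return (uint32_t)(((uint64_t)n * 0xAAAAAAABu) >> 33);
}

int main(void)
{
   for (uint32_t n = 0; n < 1000000; n++)
      assert(div3_fast(n) == n / 3);
   assert(div3_fast(UINT32_MAX) == UINT32_MAX / 3);
   return 0;
}
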
-static void *si_create_vertex_elements(struct pipe_context *ctx,
- unsigned count,
- const struct pipe_vertex_element *elements)
+static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count,
+ const struct pipe_vertex_element *elements)
{
- struct si_screen *sscreen = (struct si_screen*)ctx->screen;
- struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
- bool used[SI_NUM_VERTEX_BUFFERS] = {};
- struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {};
- STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16);
- STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4);
- STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4);
- STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4);
- STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4);
- int i;
-
- assert(count <= SI_MAX_ATTRIBS);
- if (!v)
- return NULL;
-
- v->count = count;
-
- unsigned alloc_count = count > sscreen->num_vbos_in_user_sgprs ?
- count - sscreen->num_vbos_in_user_sgprs : 0;
- v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT);
-
- for (i = 0; i < count; ++i) {
- const struct util_format_description *desc;
- const struct util_format_channel_description *channel;
- int first_non_void;
- unsigned vbo_index = elements[i].vertex_buffer_index;
-
- if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
- FREE(v);
- return NULL;
- }
-
- unsigned instance_divisor = elements[i].instance_divisor;
- if (instance_divisor) {
- v->uses_instance_divisors = true;
-
- if (instance_divisor == 1) {
- v->instance_divisor_is_one |= 1u << i;
- } else {
- v->instance_divisor_is_fetched |= 1u << i;
- divisor_factors[i] =
- si_compute_fast_udiv_info32(instance_divisor, 32);
- }
- }
-
- if (!used[vbo_index]) {
- v->first_vb_use_mask |= 1 << i;
- used[vbo_index] = true;
- }
-
- desc = util_format_description(elements[i].src_format);
- first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
- channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;
-
- v->format_size[i] = desc->block.bits / 8;
- v->src_offset[i] = elements[i].src_offset;
- v->vertex_buffer_index[i] = vbo_index;
-
- bool always_fix = false;
- union si_vs_fix_fetch fix_fetch;
- unsigned log_hw_load_size; /* the load element size as seen by the hardware */
-
- fix_fetch.bits = 0;
- log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3);
-
- if (channel) {
- switch (channel->type) {
- case UTIL_FORMAT_TYPE_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break;
- case UTIL_FORMAT_TYPE_FIXED: fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; break;
- case UTIL_FORMAT_TYPE_SIGNED: {
- if (channel->pure_integer)
- fix_fetch.u.format = AC_FETCH_FORMAT_SINT;
- else if (channel->normalized)
- fix_fetch.u.format = AC_FETCH_FORMAT_SNORM;
- else
- fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED;
- break;
- }
- case UTIL_FORMAT_TYPE_UNSIGNED: {
- if (channel->pure_integer)
- fix_fetch.u.format = AC_FETCH_FORMAT_UINT;
- else if (channel->normalized)
- fix_fetch.u.format = AC_FETCH_FORMAT_UNORM;
- else
- fix_fetch.u.format = AC_FETCH_FORMAT_USCALED;
- break;
- }
- default: unreachable("bad format type");
- }
- } else {
- switch (elements[i].src_format) {
- case PIPE_FORMAT_R11G11B10_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break;
- default: unreachable("bad other format");
- }
- }
-
- if (desc->channel[0].size == 10) {
- fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */
- log_hw_load_size = 2;
-
- /* The hardware always treats the 2-bit alpha channel as
- * unsigned, so a shader workaround is needed. The affected
- * chips are GFX8 and older except Stoney (GFX8.1).
- */
- always_fix = sscreen->info.chip_class <= GFX8 &&
- sscreen->info.family != CHIP_STONEY &&
- channel->type == UTIL_FORMAT_TYPE_SIGNED;
- } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) {
- fix_fetch.u.log_size = 3; /* special encoding */
- fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
- log_hw_load_size = 2;
- } else {
- fix_fetch.u.log_size = util_logbase2(channel->size) - 3;
- fix_fetch.u.num_channels_m1 = desc->nr_channels - 1;
-
- /* Always fix up:
- * - doubles (multiple loads + truncate to float)
- * - 32-bit requiring a conversion
- */
- always_fix =
- (fix_fetch.u.log_size == 3) ||
- (fix_fetch.u.log_size == 2 &&
- fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT &&
- fix_fetch.u.format != AC_FETCH_FORMAT_UINT &&
- fix_fetch.u.format != AC_FETCH_FORMAT_SINT);
-
- /* Also fixup 8_8_8 and 16_16_16. */
- if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) {
- always_fix = true;
- log_hw_load_size = fix_fetch.u.log_size;
- }
- }
-
- if (desc->swizzle[0] != PIPE_SWIZZLE_X) {
- assert(desc->swizzle[0] == PIPE_SWIZZLE_Z &&
- (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0));
- fix_fetch.u.reverse = 1;
- }
-
- /* Force the workaround for unaligned access here already if the
- * offset relative to the vertex buffer base is unaligned.
- *
- * There is a theoretical case in which this is too conservative:
- * if the vertex buffer's offset is also unaligned in just the
- * right way, we end up with an aligned address after all.
- * However, this case should be extremely rare in practice (it
- * won't happen in well-behaved applications), and taking it
- * into account would complicate the fast path (where everything
- * is nicely aligned).
- */
- bool check_alignment =
- log_hw_load_size >= 1 &&
- (sscreen->info.chip_class == GFX6 || sscreen->info.chip_class == GFX10);
- bool opencode = sscreen->options.vs_fetch_always_opencode;
-
- if (check_alignment &&
- (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0)
- opencode = true;
-
- if (always_fix || check_alignment || opencode)
- v->fix_fetch[i] = fix_fetch.bits;
-
- if (opencode)
- v->fix_fetch_opencode |= 1 << i;
- if (opencode || always_fix)
- v->fix_fetch_always |= 1 << i;
-
- if (check_alignment && !opencode) {
- assert(log_hw_load_size == 1 || log_hw_load_size == 2);
-
- v->fix_fetch_unaligned |= 1 << i;
- v->hw_load_is_dword |= (log_hw_load_size - 1) << i;
- v->vb_alignment_check_mask |= 1 << vbo_index;
- }
-
- v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
- S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
- S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
- S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
-
- if (sscreen->info.chip_class >= GFX10) {
- const struct gfx10_format *fmt =
- &gfx10_format_table[elements[i].src_format];
- assert(fmt->img_format != 0 && fmt->img_format < 128);
- v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) |
- S_008F0C_RESOURCE_LEVEL(1);
- } else {
- unsigned data_format, num_format;
- data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
- num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
- v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) |
- S_008F0C_DATA_FORMAT(data_format);
- }
- }
-
- if (v->instance_divisor_is_fetched) {
- unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched);
-
- v->instance_divisor_factor_buffer =
- (struct si_resource*)
- pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
- num_divisors * sizeof(divisor_factors[0]));
- if (!v->instance_divisor_factor_buffer) {
- FREE(v);
- return NULL;
- }
- void *map = sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf,
- NULL, PIPE_TRANSFER_WRITE);
- memcpy(map , divisor_factors, num_divisors * sizeof(divisor_factors[0]));
- }
- return v;
+ struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+ struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
+ bool used[SI_NUM_VERTEX_BUFFERS] = {};
+ struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {};
+ STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16);
+ STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4);
+ STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4);
+ STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4);
+ STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4);
+ int i;
+
+ assert(count <= SI_MAX_ATTRIBS);
+ if (!v)
+ return NULL;
+
+ v->count = count;
+
+ unsigned alloc_count =
+ count > sscreen->num_vbos_in_user_sgprs ? count - sscreen->num_vbos_in_user_sgprs : 0;
+ v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT);
+
+ for (i = 0; i < count; ++i) {
+ const struct util_format_description *desc;
+ const struct util_format_channel_description *channel;
+ int first_non_void;
+ unsigned vbo_index = elements[i].vertex_buffer_index;
+
+ if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
+ FREE(v);
+ return NULL;
+ }
+
+ unsigned instance_divisor = elements[i].instance_divisor;
+ if (instance_divisor) {
+ v->uses_instance_divisors = true;
+
+ if (instance_divisor == 1) {
+ v->instance_divisor_is_one |= 1u << i;
+ } else {
+ v->instance_divisor_is_fetched |= 1u << i;
+ divisor_factors[i] = si_compute_fast_udiv_info32(instance_divisor, 32);
+ }
+ }
+
+ if (!used[vbo_index]) {
+ v->first_vb_use_mask |= 1 << i;
+ used[vbo_index] = true;
+ }
+
+ desc = util_format_description(elements[i].src_format);
+ first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
+ channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;
+
+ v->format_size[i] = desc->block.bits / 8;
+ v->src_offset[i] = elements[i].src_offset;
+ v->vertex_buffer_index[i] = vbo_index;
+
+ bool always_fix = false;
+ union si_vs_fix_fetch fix_fetch;
+ unsigned log_hw_load_size; /* the load element size as seen by the hardware */
+
+ fix_fetch.bits = 0;
+ log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3);
+
+ if (channel) {
+ switch (channel->type) {
+ case UTIL_FORMAT_TYPE_FLOAT:
+ fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT;
+ break;
+ case UTIL_FORMAT_TYPE_FIXED:
+ fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
+ break;
+ case UTIL_FORMAT_TYPE_SIGNED: {
+ if (channel->pure_integer)
+ fix_fetch.u.format = AC_FETCH_FORMAT_SINT;
+ else if (channel->normalized)
+ fix_fetch.u.format = AC_FETCH_FORMAT_SNORM;
+ else
+ fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED;
+ break;
+ }
+ case UTIL_FORMAT_TYPE_UNSIGNED: {
+ if (channel->pure_integer)
+ fix_fetch.u.format = AC_FETCH_FORMAT_UINT;
+ else if (channel->normalized)
+ fix_fetch.u.format = AC_FETCH_FORMAT_UNORM;
+ else
+ fix_fetch.u.format = AC_FETCH_FORMAT_USCALED;
+ break;
+ }
+ default:
+ unreachable("bad format type");
+ }
+ } else {
+ switch (elements[i].src_format) {
+ case PIPE_FORMAT_R11G11B10_FLOAT:
+ fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT;
+ break;
+ default:
+ unreachable("bad other format");
+ }
+ }
+
+ if (desc->channel[0].size == 10) {
+ fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */
+ log_hw_load_size = 2;
+
+ /* The hardware always treats the 2-bit alpha channel as
+ * unsigned, so a shader workaround is needed. The affected
+ * chips are GFX8 and older except Stoney (GFX8.1).
+ */
+ always_fix = sscreen->info.chip_class <= GFX8 && sscreen->info.family != CHIP_STONEY &&
+ channel->type == UTIL_FORMAT_TYPE_SIGNED;
+ } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) {
+ fix_fetch.u.log_size = 3; /* special encoding */
+ fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
+ log_hw_load_size = 2;
+ } else {
+ fix_fetch.u.log_size = util_logbase2(channel->size) - 3;
+ fix_fetch.u.num_channels_m1 = desc->nr_channels - 1;
+
+ /* Always fix up:
+ * - doubles (multiple loads + truncate to float)
+ * - 32-bit requiring a conversion
+ */
+ always_fix = (fix_fetch.u.log_size == 3) ||
+ (fix_fetch.u.log_size == 2 && fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT &&
+ fix_fetch.u.format != AC_FETCH_FORMAT_UINT &&
+ fix_fetch.u.format != AC_FETCH_FORMAT_SINT);
+
+ /* Also fixup 8_8_8 and 16_16_16. */
+ if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) {
+ always_fix = true;
+ log_hw_load_size = fix_fetch.u.log_size;
+ }
+ }
+
+ if (desc->swizzle[0] != PIPE_SWIZZLE_X) {
+ assert(desc->swizzle[0] == PIPE_SWIZZLE_Z &&
+ (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0));
+ fix_fetch.u.reverse = 1;
+ }
+
+ /* Force the workaround for unaligned access here already if the
+ * offset relative to the vertex buffer base is unaligned.
+ *
+ * There is a theoretical case in which this is too conservative:
+ * if the vertex buffer's offset is also unaligned in just the
+ * right way, we end up with an aligned address after all.
+ * However, this case should be extremely rare in practice (it
+ * won't happen in well-behaved applications), and taking it
+ * into account would complicate the fast path (where everything
+ * is nicely aligned).
+ */
+ bool check_alignment = log_hw_load_size >= 1 && (sscreen->info.chip_class == GFX6 ||
+ sscreen->info.chip_class == GFX10);
+ bool opencode = sscreen->options.vs_fetch_always_opencode;
+
+ if (check_alignment && (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0)
+ opencode = true;
+
+ if (always_fix || check_alignment || opencode)
+ v->fix_fetch[i] = fix_fetch.bits;
+
+ if (opencode)
+ v->fix_fetch_opencode |= 1 << i;
+ if (opencode || always_fix)
+ v->fix_fetch_always |= 1 << i;
+
+ if (check_alignment && !opencode) {
+ assert(log_hw_load_size == 1 || log_hw_load_size == 2);
+
+ v->fix_fetch_unaligned |= 1 << i;
+ v->hw_load_is_dword |= (log_hw_load_size - 1) << i;
+ v->vb_alignment_check_mask |= 1 << vbo_index;
+ }
+
+ v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
+ S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
+ S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
+ S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
+
+ if (sscreen->info.chip_class >= GFX10) {
+ const struct gfx10_format *fmt = &gfx10_format_table[elements[i].src_format];
+ assert(fmt->img_format != 0 && fmt->img_format < 128);
+ v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | S_008F0C_RESOURCE_LEVEL(1);
+ } else {
+ unsigned data_format, num_format;
+ data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
+ num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
+ v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format);
+ }
+ }
+
+ if (v->instance_divisor_is_fetched) {
+ unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched);
+
+ v->instance_divisor_factor_buffer = (struct si_resource *)pipe_buffer_create(
+ &sscreen->b, 0, PIPE_USAGE_DEFAULT, num_divisors * sizeof(divisor_factors[0]));
+ if (!v->instance_divisor_factor_buffer) {
+ FREE(v);
+ return NULL;
+ }
+ void *map =
+ sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf, NULL, PIPE_TRANSFER_WRITE);
+ memcpy(map, divisor_factors, num_divisors * sizeof(divisor_factors[0]));
+ }
+ return v;
}
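
One detail in the loop above that is easy to miss: on GFX6 and GFX10 the fetch is switched to the open-coded path whenever an attribute's src_offset is not aligned to the hardware load size, via the test (src_offset & ((1 << log_hw_load_size) - 1)) != 0. A tiny standalone illustration of that predicate (plain C, illustrative names):

#include <assert.h>
#include <stdbool.h>

/* True when `offset` is misaligned for a load of 2^log_size bytes. */
static bool offset_is_misaligned(unsigned offset, unsigned log_size)
{
   return (offset & ((1u << log_size) - 1)) != 0;
}

int main(void)
{
   /* log_size == 2 -> 4-byte (dword) loads */
   assert(!offset_is_misaligned(0, 2));
   assert(!offset_is_misaligned(8, 2));
   assert(offset_is_misaligned(2, 2)); /* would force the open-coded fetch */
   /* log_size == 1 -> 2-byte loads */
   assert(!offset_is_misaligned(6, 1));
   assert(offset_is_misaligned(3, 1));
   return 0;
}
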
static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_vertex_elements *old = sctx->vertex_elements;
- struct si_vertex_elements *v = (struct si_vertex_elements*)state;
-
- sctx->vertex_elements = v;
- sctx->num_vertex_elements = v ? v->count : 0;
-
- if (sctx->num_vertex_elements) {
- sctx->vertex_buffers_dirty = true;
- } else {
- sctx->vertex_buffer_pointer_dirty = false;
- sctx->vertex_buffer_user_sgprs_dirty = false;
- }
-
- if (v &&
- (!old ||
- old->count != v->count ||
- old->uses_instance_divisors != v->uses_instance_divisors ||
- /* we don't check which divisors changed */
- v->uses_instance_divisors ||
- (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned ||
- ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) &&
- memcmp(old->vertex_buffer_index, v->vertex_buffer_index,
- sizeof(v->vertex_buffer_index[0]) * v->count)) ||
- /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are
- * functions of fix_fetch and the src_offset alignment.
- * If they change and fix_fetch doesn't, it must be due to different
- * src_offset alignment, which is reflected in fix_fetch_opencode. */
- old->fix_fetch_opencode != v->fix_fetch_opencode ||
- memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
- sctx->do_update_shaders = true;
-
- if (v && v->instance_divisor_is_fetched) {
- struct pipe_constant_buffer cb;
-
- cb.buffer = &v->instance_divisor_factor_buffer->b.b;
- cb.user_buffer = NULL;
- cb.buffer_offset = 0;
- cb.buffer_size = 0xffffffff;
- si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
- }
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_vertex_elements *old = sctx->vertex_elements;
+ struct si_vertex_elements *v = (struct si_vertex_elements *)state;
+
+ sctx->vertex_elements = v;
+ sctx->num_vertex_elements = v ? v->count : 0;
+
+ if (sctx->num_vertex_elements) {
+ sctx->vertex_buffers_dirty = true;
+ } else {
+ sctx->vertex_buffer_pointer_dirty = false;
+ sctx->vertex_buffer_user_sgprs_dirty = false;
+ }
+
+ if (v && (!old || old->count != v->count ||
+ old->uses_instance_divisors != v->uses_instance_divisors ||
+ /* we don't check which divisors changed */
+ v->uses_instance_divisors ||
+ (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) &
+ sctx->vertex_buffer_unaligned ||
+ ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) &&
+ memcmp(old->vertex_buffer_index, v->vertex_buffer_index,
+ sizeof(v->vertex_buffer_index[0]) * v->count)) ||
+ /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are
+ * functions of fix_fetch and the src_offset alignment.
+ * If they change and fix_fetch doesn't, it must be due to different
+ * src_offset alignment, which is reflected in fix_fetch_opencode. */
+ old->fix_fetch_opencode != v->fix_fetch_opencode ||
+ memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
+ sctx->do_update_shaders = true;
+
+ if (v && v->instance_divisor_is_fetched) {
+ struct pipe_constant_buffer cb;
+
+ cb.buffer = &v->instance_divisor_factor_buffer->b.b;
+ cb.user_buffer = NULL;
+ cb.buffer_offset = 0;
+ cb.buffer_size = 0xffffffff;
+ si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
+ }
}
static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_vertex_elements *v = (struct si_vertex_elements*)state;
-
- if (sctx->vertex_elements == state) {
- sctx->vertex_elements = NULL;
- sctx->num_vertex_elements = 0;
- }
- si_resource_reference(&v->instance_divisor_factor_buffer, NULL);
- FREE(state);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_vertex_elements *v = (struct si_vertex_elements *)state;
+
+ if (sctx->vertex_elements == state) {
+ sctx->vertex_elements = NULL;
+ sctx->num_vertex_elements = 0;
+ }
+ si_resource_reference(&v->instance_divisor_factor_buffer, NULL);
+ FREE(state);
}
-static void si_set_vertex_buffers(struct pipe_context *ctx,
- unsigned start_slot, unsigned count,
- const struct pipe_vertex_buffer *buffers)
+static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, unsigned count,
+ const struct pipe_vertex_buffer *buffers)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
- unsigned updated_mask = u_bit_consecutive(start_slot, count);
- uint32_t orig_unaligned = sctx->vertex_buffer_unaligned;
- uint32_t unaligned = 0;
- int i;
-
- assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer));
-
- if (buffers) {
- for (i = 0; i < count; i++) {
- const struct pipe_vertex_buffer *src = buffers + i;
- struct pipe_vertex_buffer *dsti = dst + i;
- struct pipe_resource *buf = src->buffer.resource;
- unsigned slot_bit = 1 << (start_slot + i);
-
- pipe_resource_reference(&dsti->buffer.resource, buf);
- dsti->buffer_offset = src->buffer_offset;
- dsti->stride = src->stride;
-
- if (dsti->buffer_offset & 3 || dsti->stride & 3)
- unaligned |= slot_bit;
-
- si_context_add_resource_size(sctx, buf);
- if (buf)
- si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
- }
- } else {
- for (i = 0; i < count; i++) {
- pipe_resource_reference(&dst[i].buffer.resource, NULL);
- }
- unaligned &= ~updated_mask;
- }
- sctx->vertex_buffers_dirty = true;
- sctx->vertex_buffer_unaligned = (orig_unaligned & ~updated_mask) | unaligned;
-
- /* Check whether alignment may have changed in a way that requires
- * shader changes. This check is conservative: a vertex buffer can only
- * trigger a shader change if the misalignment amount changes (e.g.
- * from byte-aligned to short-aligned), but we only keep track of
- * whether buffers are at least dword-aligned, since that should always
- * be the case in well-behaved applications anyway.
- */
- if (sctx->vertex_elements &&
- (sctx->vertex_elements->vb_alignment_check_mask &
- (unaligned | orig_unaligned) & updated_mask))
- sctx->do_update_shaders = true;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
+ unsigned updated_mask = u_bit_consecutive(start_slot, count);
+ uint32_t orig_unaligned = sctx->vertex_buffer_unaligned;
+ uint32_t unaligned = 0;
+ int i;
+
+ assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer));
+
+ if (buffers) {
+ for (i = 0; i < count; i++) {
+ const struct pipe_vertex_buffer *src = buffers + i;
+ struct pipe_vertex_buffer *dsti = dst + i;
+ struct pipe_resource *buf = src->buffer.resource;
+ unsigned slot_bit = 1 << (start_slot + i);
+
+ pipe_resource_reference(&dsti->buffer.resource, buf);
+ dsti->buffer_offset = src->buffer_offset;
+ dsti->stride = src->stride;
+
+ if (dsti->buffer_offset & 3 || dsti->stride & 3)
+ unaligned |= slot_bit;
+
+ si_context_add_resource_size(sctx, buf);
+ if (buf)
+ si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
+ }
+ } else {
+ for (i = 0; i < count; i++) {
+ pipe_resource_reference(&dst[i].buffer.resource, NULL);
+ }
+ unaligned &= ~updated_mask;
+ }
+ sctx->vertex_buffers_dirty = true;
+ sctx->vertex_buffer_unaligned = (orig_unaligned & ~updated_mask) | unaligned;
+
+ /* Check whether alignment may have changed in a way that requires
+ * shader changes. This check is conservative: a vertex buffer can only
+ * trigger a shader change if the misalignment amount changes (e.g.
+ * from byte-aligned to short-aligned), but we only keep track of
+ * whether buffers are at least dword-aligned, since that should always
+ * be the case in well-behaved applications anyway.
+ */
+ if (sctx->vertex_elements && (sctx->vertex_elements->vb_alignment_check_mask &
+ (unaligned | orig_unaligned) & updated_mask))
+ sctx->do_update_shaders = true;
}
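
si_set_vertex_buffers tracks per-slot dword alignment with plain bit masks: u_bit_consecutive builds the mask of rebound slots, a bit is set for every buffer whose offset or stride is not a multiple of 4, and the result is merged with the previous state. The sketch below reproduces only that bookkeeping; consecutive_bits is a local stand-in for u_bit_consecutive and the slot numbers are made up for the example.

#include <assert.h>
#include <stdint.h>

/* Mask of `count` consecutive bits starting at bit `start`
 * (local stand-in for u_bit_consecutive). */
static uint32_t consecutive_bits(unsigned start, unsigned count)
{
   uint32_t mask = count >= 32 ? 0xFFFFFFFFu : (1u << count) - 1;
   return mask << start;
}

int main(void)
{
   uint32_t prev_unaligned = (1u << 0) | (1u << 2);  /* slots 0 and 2 were unaligned */
   unsigned start_slot = 2, count = 2;               /* rebinding slots 2 and 3 */
   uint32_t updated = consecutive_bits(start_slot, count);

   /* Suppose only the new buffer in slot 3 has a misaligned offset or stride. */
   uint32_t new_unaligned = 1u << 3;

   uint32_t merged = (prev_unaligned & ~updated) | new_unaligned;
   assert(merged == ((1u << 0) | (1u << 3)));
   return 0;
}
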
/*
* Misc
*/
-static void si_set_tess_state(struct pipe_context *ctx,
- const float default_outer_level[4],
- const float default_inner_level[2])
+static void si_set_tess_state(struct pipe_context *ctx, const float default_outer_level[4],
+ const float default_inner_level[2])
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct pipe_constant_buffer cb;
- float array[8];
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct pipe_constant_buffer cb;
+ float array[8];
- memcpy(array, default_outer_level, sizeof(float) * 4);
- memcpy(array+4, default_inner_level, sizeof(float) * 2);
+ memcpy(array, default_outer_level, sizeof(float) * 4);
+ memcpy(array + 4, default_inner_level, sizeof(float) * 2);
- cb.buffer = NULL;
- cb.user_buffer = NULL;
- cb.buffer_size = sizeof(array);
+ cb.buffer = NULL;
+ cb.user_buffer = NULL;
+ cb.buffer_size = sizeof(array);
- si_upload_const_buffer(sctx, (struct si_resource**)&cb.buffer,
- (void*)array, sizeof(array),
- &cb.buffer_offset);
+ si_upload_const_buffer(sctx, (struct si_resource **)&cb.buffer, (void *)array, sizeof(array),
+ &cb.buffer_offset);
- si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb);
- pipe_resource_reference(&cb.buffer, NULL);
+ si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb);
+ pipe_resource_reference(&cb.buffer, NULL);
}
static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
{
- struct si_context *sctx = (struct si_context *)ctx;
+ struct si_context *sctx = (struct si_context *)ctx;
- si_update_fb_dirtiness_after_rendering(sctx);
+ si_update_fb_dirtiness_after_rendering(sctx);
- /* Multisample surfaces are flushed in si_decompress_textures. */
- if (sctx->framebuffer.uncompressed_cb_mask) {
- si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
- sctx->framebuffer.CB_has_shader_readable_metadata,
- sctx->framebuffer.all_DCC_pipe_aligned);
- }
+ /* Multisample surfaces are flushed in si_decompress_textures. */
+ if (sctx->framebuffer.uncompressed_cb_mask) {
+ si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
+ sctx->framebuffer.CB_has_shader_readable_metadata,
+ sctx->framebuffer.all_DCC_pipe_aligned);
+ }
}
/* This only ensures coherency for shader image/buffer stores. */
static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
{
- struct si_context *sctx = (struct si_context *)ctx;
-
- if (!(flags & ~PIPE_BARRIER_UPDATE))
- return;
-
- /* Subsequent commands must wait for all shader invocations to
- * complete. */
- sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH;
-
- if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
- sctx->flags |= SI_CONTEXT_INV_SCACHE |
- SI_CONTEXT_INV_VCACHE;
-
- if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
- PIPE_BARRIER_SHADER_BUFFER |
- PIPE_BARRIER_TEXTURE |
- PIPE_BARRIER_IMAGE |
- PIPE_BARRIER_STREAMOUT_BUFFER |
- PIPE_BARRIER_GLOBAL_BUFFER)) {
- /* As far as I can tell, L1 contents are written back to L2
- * automatically at end of shader, but the contents of other
- * L1 caches might still be stale. */
- sctx->flags |= SI_CONTEXT_INV_VCACHE;
- }
-
- if (flags & PIPE_BARRIER_INDEX_BUFFER) {
- /* Indices are read through TC L2 since GFX8.
- * L1 isn't used.
- */
- if (sctx->screen->info.chip_class <= GFX7)
- sctx->flags |= SI_CONTEXT_WB_L2;
- }
-
- /* MSAA color, any depth and any stencil are flushed in
- * si_decompress_textures when needed.
- */
- if (flags & PIPE_BARRIER_FRAMEBUFFER &&
- sctx->framebuffer.uncompressed_cb_mask) {
- sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
-
- if (sctx->chip_class <= GFX8)
- sctx->flags |= SI_CONTEXT_WB_L2;
- }
-
- /* Indirect buffers use TC L2 on GFX9, but not older hw. */
- if (sctx->screen->info.chip_class <= GFX8 &&
- flags & PIPE_BARRIER_INDIRECT_BUFFER)
- sctx->flags |= SI_CONTEXT_WB_L2;
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ if (!(flags & ~PIPE_BARRIER_UPDATE))
+ return;
+
+ /* Subsequent commands must wait for all shader invocations to
+ * complete. */
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+ if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
+ sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
+
+ if (flags & (PIPE_BARRIER_VERTEX_BUFFER | PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_TEXTURE |
+ PIPE_BARRIER_IMAGE | PIPE_BARRIER_STREAMOUT_BUFFER | PIPE_BARRIER_GLOBAL_BUFFER)) {
+ /* As far as I can tell, L1 contents are written back to L2
+ * automatically at end of shader, but the contents of other
+ * L1 caches might still be stale. */
+ sctx->flags |= SI_CONTEXT_INV_VCACHE;
+ }
+
+ if (flags & PIPE_BARRIER_INDEX_BUFFER) {
+ /* Indices are read through TC L2 since GFX8.
+ * L1 isn't used.
+ */
+ if (sctx->screen->info.chip_class <= GFX7)
+ sctx->flags |= SI_CONTEXT_WB_L2;
+ }
+
+ /* MSAA color, any depth and any stencil are flushed in
+ * si_decompress_textures when needed.
+ */
+ if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.uncompressed_cb_mask) {
+ sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+
+ if (sctx->chip_class <= GFX8)
+ sctx->flags |= SI_CONTEXT_WB_L2;
+ }
+
+ /* Indirect buffers use TC L2 on GFX9, but not older hw. */
+ if (sctx->screen->info.chip_class <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER)
+ sctx->flags |= SI_CONTEXT_WB_L2;
}
static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
{
- struct pipe_blend_state blend;
+ struct pipe_blend_state blend;
- memset(&blend, 0, sizeof(blend));
- blend.independent_blend_enable = true;
- blend.rt[0].colormask = 0xf;
- return si_create_blend_state_mode(&sctx->b, &blend, mode);
+ memset(&blend, 0, sizeof(blend));
+ blend.independent_blend_enable = true;
+ blend.rt[0].colormask = 0xf;
+ return si_create_blend_state_mode(&sctx->b, &blend, mode);
}
static void si_init_config(struct si_context *sctx);
void si_init_state_compute_functions(struct si_context *sctx)
{
- sctx->b.create_sampler_state = si_create_sampler_state;
- sctx->b.delete_sampler_state = si_delete_sampler_state;
- sctx->b.create_sampler_view = si_create_sampler_view;
- sctx->b.sampler_view_destroy = si_sampler_view_destroy;
- sctx->b.memory_barrier = si_memory_barrier;
+ sctx->b.create_sampler_state = si_create_sampler_state;
+ sctx->b.delete_sampler_state = si_delete_sampler_state;
+ sctx->b.create_sampler_view = si_create_sampler_view;
+ sctx->b.sampler_view_destroy = si_sampler_view_destroy;
+ sctx->b.memory_barrier = si_memory_barrier;
}
void si_init_state_functions(struct si_context *sctx)
{
- sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state;
- sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs;
- sctx->atoms.s.db_render_state.emit = si_emit_db_render_state;
- sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state;
- sctx->atoms.s.msaa_config.emit = si_emit_msaa_config;
- sctx->atoms.s.sample_mask.emit = si_emit_sample_mask;
- sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state;
- sctx->atoms.s.blend_color.emit = si_emit_blend_color;
- sctx->atoms.s.clip_regs.emit = si_emit_clip_regs;
- sctx->atoms.s.clip_state.emit = si_emit_clip_state;
- sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref;
-
- sctx->b.create_blend_state = si_create_blend_state;
- sctx->b.bind_blend_state = si_bind_blend_state;
- sctx->b.delete_blend_state = si_delete_blend_state;
- sctx->b.set_blend_color = si_set_blend_color;
-
- sctx->b.create_rasterizer_state = si_create_rs_state;
- sctx->b.bind_rasterizer_state = si_bind_rs_state;
- sctx->b.delete_rasterizer_state = si_delete_rs_state;
-
- sctx->b.create_depth_stencil_alpha_state = si_create_dsa_state;
- sctx->b.bind_depth_stencil_alpha_state = si_bind_dsa_state;
- sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state;
-
- sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx);
- sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE);
- sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS);
- sctx->custom_blend_eliminate_fastclear = si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR);
- sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS);
-
- sctx->b.set_clip_state = si_set_clip_state;
- sctx->b.set_stencil_ref = si_set_stencil_ref;
-
- sctx->b.set_framebuffer_state = si_set_framebuffer_state;
-
- sctx->b.set_sample_mask = si_set_sample_mask;
-
- sctx->b.create_vertex_elements_state = si_create_vertex_elements;
- sctx->b.bind_vertex_elements_state = si_bind_vertex_elements;
- sctx->b.delete_vertex_elements_state = si_delete_vertex_element;
- sctx->b.set_vertex_buffers = si_set_vertex_buffers;
-
- sctx->b.texture_barrier = si_texture_barrier;
- sctx->b.set_min_samples = si_set_min_samples;
- sctx->b.set_tess_state = si_set_tess_state;
-
- sctx->b.set_active_query_state = si_set_active_query_state;
-
- si_init_config(sctx);
+ sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state;
+ sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs;
+ sctx->atoms.s.db_render_state.emit = si_emit_db_render_state;
+ sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state;
+ sctx->atoms.s.msaa_config.emit = si_emit_msaa_config;
+ sctx->atoms.s.sample_mask.emit = si_emit_sample_mask;
+ sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state;
+ sctx->atoms.s.blend_color.emit = si_emit_blend_color;
+ sctx->atoms.s.clip_regs.emit = si_emit_clip_regs;
+ sctx->atoms.s.clip_state.emit = si_emit_clip_state;
+ sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref;
+
+ sctx->b.create_blend_state = si_create_blend_state;
+ sctx->b.bind_blend_state = si_bind_blend_state;
+ sctx->b.delete_blend_state = si_delete_blend_state;
+ sctx->b.set_blend_color = si_set_blend_color;
+
+ sctx->b.create_rasterizer_state = si_create_rs_state;
+ sctx->b.bind_rasterizer_state = si_bind_rs_state;
+ sctx->b.delete_rasterizer_state = si_delete_rs_state;
+
+ sctx->b.create_depth_stencil_alpha_state = si_create_dsa_state;
+ sctx->b.bind_depth_stencil_alpha_state = si_bind_dsa_state;
+ sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state;
+
+ sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx);
+ sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE);
+ sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS);
+ sctx->custom_blend_eliminate_fastclear =
+ si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR);
+ sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS);
+
+ sctx->b.set_clip_state = si_set_clip_state;
+ sctx->b.set_stencil_ref = si_set_stencil_ref;
+
+ sctx->b.set_framebuffer_state = si_set_framebuffer_state;
+
+ sctx->b.set_sample_mask = si_set_sample_mask;
+
+ sctx->b.create_vertex_elements_state = si_create_vertex_elements;
+ sctx->b.bind_vertex_elements_state = si_bind_vertex_elements;
+ sctx->b.delete_vertex_elements_state = si_delete_vertex_element;
+ sctx->b.set_vertex_buffers = si_set_vertex_buffers;
+
+ sctx->b.texture_barrier = si_texture_barrier;
+ sctx->b.set_min_samples = si_set_min_samples;
+ sctx->b.set_tess_state = si_set_tess_state;
+
+ sctx->b.set_active_query_state = si_set_active_query_state;
+
+ si_init_config(sctx);
}
void si_init_screen_state_functions(struct si_screen *sscreen)
{
- sscreen->b.is_format_supported = si_is_format_supported;
+ sscreen->b.is_format_supported = si_is_format_supported;
- if (sscreen->info.chip_class >= GFX10) {
- sscreen->make_texture_descriptor = gfx10_make_texture_descriptor;
- } else {
- sscreen->make_texture_descriptor = si_make_texture_descriptor;
- }
+ if (sscreen->info.chip_class >= GFX10) {
+ sscreen->make_texture_descriptor = gfx10_make_texture_descriptor;
+ } else {
+ sscreen->make_texture_descriptor = si_make_texture_descriptor;
+ }
}
-static void si_set_grbm_gfx_index(struct si_context *sctx,
- struct si_pm4_state *pm4, unsigned value)
+static void si_set_grbm_gfx_index(struct si_context *sctx, struct si_pm4_state *pm4, unsigned value)
{
- unsigned reg = sctx->chip_class >= GFX7 ? R_030800_GRBM_GFX_INDEX :
- R_00802C_GRBM_GFX_INDEX;
- si_pm4_set_reg(pm4, reg, value);
+ unsigned reg = sctx->chip_class >= GFX7 ? R_030800_GRBM_GFX_INDEX : R_00802C_GRBM_GFX_INDEX;
+ si_pm4_set_reg(pm4, reg, value);
}
-static void si_set_grbm_gfx_index_se(struct si_context *sctx,
- struct si_pm4_state *pm4, unsigned se)
+static void si_set_grbm_gfx_index_se(struct si_context *sctx, struct si_pm4_state *pm4, unsigned se)
{
- assert(se == ~0 || se < sctx->screen->info.max_se);
- si_set_grbm_gfx_index(sctx, pm4,
- (se == ~0 ? S_030800_SE_BROADCAST_WRITES(1) :
- S_030800_SE_INDEX(se)) |
- S_030800_SH_BROADCAST_WRITES(1) |
- S_030800_INSTANCE_BROADCAST_WRITES(1));
+ assert(se == ~0 || se < sctx->screen->info.max_se);
+ si_set_grbm_gfx_index(sctx, pm4,
+ (se == ~0 ? S_030800_SE_BROADCAST_WRITES(1) : S_030800_SE_INDEX(se)) |
+ S_030800_SH_BROADCAST_WRITES(1) |
+ S_030800_INSTANCE_BROADCAST_WRITES(1));
}
-static void
-si_write_harvested_raster_configs(struct si_context *sctx,
- struct si_pm4_state *pm4,
- unsigned raster_config,
- unsigned raster_config_1)
+static void si_write_harvested_raster_configs(struct si_context *sctx, struct si_pm4_state *pm4,
+ unsigned raster_config, unsigned raster_config_1)
{
- unsigned num_se = MAX2(sctx->screen->info.max_se, 1);
- unsigned raster_config_se[4];
- unsigned se;
-
- ac_get_harvested_configs(&sctx->screen->info,
- raster_config,
- &raster_config_1,
- raster_config_se);
-
- for (se = 0; se < num_se; se++) {
- si_set_grbm_gfx_index_se(sctx, pm4, se);
- si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]);
- }
- si_set_grbm_gfx_index(sctx, pm4, ~0);
-
- if (sctx->chip_class >= GFX7) {
- si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
- }
+ unsigned num_se = MAX2(sctx->screen->info.max_se, 1);
+ unsigned raster_config_se[4];
+ unsigned se;
+
+ ac_get_harvested_configs(&sctx->screen->info, raster_config, &raster_config_1, raster_config_se);
+
+ for (se = 0; se < num_se; se++) {
+ si_set_grbm_gfx_index_se(sctx, pm4, se);
+ si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]);
+ }
+ si_set_grbm_gfx_index(sctx, pm4, ~0);
+
+ if (sctx->chip_class >= GFX7) {
+ si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
+ }
}
static void si_set_raster_config(struct si_context *sctx, struct si_pm4_state *pm4)
{
- struct si_screen *sscreen = sctx->screen;
- unsigned num_rb = MIN2(sscreen->info.num_render_backends, 16);
- unsigned rb_mask = sscreen->info.enabled_rb_mask;
- unsigned raster_config = sscreen->pa_sc_raster_config;
- unsigned raster_config_1 = sscreen->pa_sc_raster_config_1;
-
- if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
- /* Always use the default config when all backends are enabled
- * (or when we failed to determine the enabled backends).
- */
- si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG,
- raster_config);
- if (sctx->chip_class >= GFX7)
- si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1,
- raster_config_1);
- } else {
- si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1);
- }
+ struct si_screen *sscreen = sctx->screen;
+ unsigned num_rb = MIN2(sscreen->info.num_render_backends, 16);
+ unsigned rb_mask = sscreen->info.enabled_rb_mask;
+ unsigned raster_config = sscreen->pa_sc_raster_config;
+ unsigned raster_config_1 = sscreen->pa_sc_raster_config_1;
+
+ if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
+ /* Always use the default config when all backends are enabled
+ * (or when we failed to determine the enabled backends).
+ */
+ si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config);
+ if (sctx->chip_class >= GFX7)
+ si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
+ } else {
+ si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1);
+ }
}
static void si_init_config(struct si_context *sctx)
{
- struct si_screen *sscreen = sctx->screen;
- uint64_t border_color_va = sctx->border_color_buffer->gpu_address;
- bool has_clear_state = sscreen->info.has_clear_state;
- struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
-
- if (!pm4)
- return;
-
- si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL);
- si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1));
- si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1));
- si_pm4_cmd_end(pm4, false);
-
- if (has_clear_state) {
- si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE);
- si_pm4_cmd_add(pm4, 0);
- si_pm4_cmd_end(pm4, false);
- }
-
- if (sctx->chip_class <= GFX8)
- si_set_raster_config(sctx, pm4);
-
- si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
- if (!has_clear_state)
- si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
-
- /* FIXME calculate these values somehow ??? */
- if (sctx->chip_class <= GFX8) {
- si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
- si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
- }
-
- if (!has_clear_state) {
- si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
- si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
- si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
- }
-
- if (sscreen->info.chip_class <= GFX9)
- si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
- if (!has_clear_state)
- si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
- if (sctx->chip_class < GFX7)
- si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) |
- S_008A14_CLIP_VTX_REORDER_ENA(1));
-
- /* CLEAR_STATE doesn't restore these correctly. */
- si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1));
- si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR,
- S_028244_BR_X(16384) | S_028244_BR_Y(16384));
-
- /* CLEAR_STATE doesn't clear these correctly on certain generations.
- * I don't know why. Deduced by trial and error.
- */
- if (sctx->chip_class <= GFX7 || !has_clear_state) {
- si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
- si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
- si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
- si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR,
- S_028034_BR_X(16384) | S_028034_BR_Y(16384));
- }
-
- if (!has_clear_state) {
- si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE,
- S_028230_ER_TRI(0xA) |
- S_028230_ER_POINT(0xA) |
- S_028230_ER_RECT(0xA) |
- /* Required by DX10_DIAMOND_TEST_ENA: */
- S_028230_ER_LINE_LR(0x1A) |
- S_028230_ER_LINE_RL(0x26) |
- S_028230_ER_LINE_TB(0xA) |
- S_028230_ER_LINE_BT(0xA));
- si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0);
- si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
- si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
- si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
- si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0);
- }
-
- if (sctx->chip_class >= GFX10) {
- si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0);
- si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0);
- si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0);
- si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0);
- si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0);
- si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0);
- } else if (sctx->chip_class == GFX9) {
- si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0);
- si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0);
- si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0);
- } else {
- /* These registers, when written, also overwrite the CLEAR_STATE
- * context, so we can't rely on CLEAR_STATE setting them.
- * It would be an issue if there was another UMD changing them.
- */
- si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0);
- si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0);
- si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
- }
-
- if (sctx->chip_class >= GFX7) {
- if (sctx->chip_class >= GFX10) {
- /* Logical CUs 16 - 31 */
- si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
- S_00B404_CU_EN(0xffff));
- si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS,
- S_00B104_CU_EN(0xffff));
- si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
- S_00B004_CU_EN(0xffff));
- }
-
- if (sctx->chip_class >= GFX9) {
- si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
- S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
- } else {
- si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
- S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
- si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
- S_00B41C_WAVE_LIMIT(0x3F));
- si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
- S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
-
- /* If this is 0, Bonaire can hang even if GS isn't being used.
- * Other chips are unaffected. These are suboptimal values,
- * but we don't use on-chip GS.
- */
- si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
- S_028A44_ES_VERTS_PER_SUBGRP(64) |
- S_028A44_GS_PRIMS_PER_SUBGRP(4));
- }
-
- /* Compute LATE_ALLOC_VS.LIMIT. */
- unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
- unsigned late_alloc_wave64 = 0; /* The limit is per SH. */
- unsigned cu_mask_vs = 0xffff;
- unsigned cu_mask_gs = 0xffff;
-
- if (sctx->chip_class >= GFX10) {
- /* For Wave32, the hw will launch twice the number of late
- * alloc waves, so 1 == 2x wave32.
- */
- if (!sscreen->info.use_late_alloc) {
- late_alloc_wave64 = 0;
- } else if (num_cu_per_sh <= 6) {
- late_alloc_wave64 = num_cu_per_sh - 2;
- } else {
- late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
-
- /* CU2 & CU3 disabled because of the dual CU design */
- /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
- cu_mask_vs = 0xfff3;
- cu_mask_gs = sscreen->use_ngg &&
- sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff;
- }
- } else {
- if (!sscreen->info.use_late_alloc) {
- late_alloc_wave64 = 0;
- } else if (num_cu_per_sh <= 4) {
- /* Too few available compute units per SH. Disallowing
- * VS to run on one CU could hurt us more than late VS
- * allocation would help.
- *
- * 2 is the highest safe number that allows us to keep
- * all CUs enabled.
- */
- late_alloc_wave64 = 2;
- } else {
- /* This is a good initial value, allowing 1 late_alloc
- * wave per SIMD on num_cu - 2.
- */
- late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
- }
-
- if (late_alloc_wave64 > 2)
- cu_mask_vs = 0xfffe; /* 1 CU disabled */
- }
-
- /* VS can't execute on one CU if the limit is > 2. */
- si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
- S_00B118_CU_EN(cu_mask_vs) |
- S_00B118_WAVE_LIMIT(0x3F));
- si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS,
- S_00B11C_LIMIT(late_alloc_wave64));
-
- si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
- S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
-
- si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
- S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
- }
-
- if (sctx->chip_class >= GFX10) {
- /* Break up a pixel wave if it contains deallocs for more than
- * half the parameter cache.
- *
- * To avoid a deadlock where pixel waves aren't launched
- * because they're waiting for more pixels while the frontend
- * is stuck waiting for PC space, the maximum allowed value is
- * the size of the PC minus the largest possible allocation for
- * a single primitive shader subgroup.
- */
- si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL,
- S_028C50_MAX_DEALLOCS_IN_WAVE(512));
- si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
-
- if (!has_clear_state) {
- si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
- sscreen->info.pa_sc_tile_steering_override);
- }
-
- /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. */
- unsigned meta_write_policy, meta_read_policy;
- /* TODO: investigate whether LRU improves performance on other chips too */
- if (sscreen->info.num_render_backends <= 4) {
- meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */
- meta_read_policy = V_02807C_CACHE_LRU_RD; /* cache reads */
- } else {
- meta_write_policy = V_02807C_CACHE_STREAM_WR; /* write combine */
- meta_read_policy = V_02807C_CACHE_NOA_RD; /* don't cache reads */
- }
-
- si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL,
- S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
- S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
- S_02807C_HTILE_WR_POLICY(meta_write_policy) |
- S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
- S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) |
- S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) |
- S_02807C_HTILE_RD_POLICY(meta_read_policy));
-
- si_pm4_set_reg(pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL,
- S_028410_CMASK_WR_POLICY(meta_write_policy) |
- S_028410_FMASK_WR_POLICY(meta_write_policy) |
- S_028410_DCC_WR_POLICY(meta_write_policy) |
- S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) |
- S_028410_CMASK_RD_POLICY(meta_read_policy) |
- S_028410_FMASK_RD_POLICY(meta_read_policy) |
- S_028410_DCC_RD_POLICY(meta_read_policy) |
- S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD));
- si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0);
-
- si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS,
- S_00B0C0_SOFT_GROUPING_EN(1) |
- S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
- si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
- }
-
- if (sctx->chip_class >= GFX9) {
- si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION,
- S_028B50_ACCUM_ISOLINE(40) |
- S_028B50_ACCUM_TRI(30) |
- S_028B50_ACCUM_QUAD(24) |
- S_028B50_DONUT_SPLIT(24) |
- S_028B50_TRAP_SPLIT(6));
- } else if (sctx->chip_class >= GFX8) {
- unsigned vgt_tess_distribution;
-
- vgt_tess_distribution =
- S_028B50_ACCUM_ISOLINE(32) |
- S_028B50_ACCUM_TRI(11) |
- S_028B50_ACCUM_QUAD(11) |
- S_028B50_DONUT_SPLIT(16);
-
- /* Testing with Unigine Heaven extreme tesselation yielded best results
- * with TRAP_SPLIT = 3.
- */
- if (sctx->family == CHIP_FIJI ||
- sctx->family >= CHIP_POLARIS10)
- vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3);
-
- si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution);
- } else if (!has_clear_state) {
- si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
- si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
- }
-
- si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
- if (sctx->chip_class >= GFX7) {
- si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI,
- S_028084_ADDRESS(border_color_va >> 40));
- }
- si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ,
- RADEON_PRIO_BORDER_COLORS);
-
- if (sctx->chip_class >= GFX9) {
- si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1,
- S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) |
- S_028C48_MAX_PRIM_PER_BATCH(1023));
- si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
- S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));
- si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0);
- }
-
- si_pm4_upload_indirect_buffer(sctx, pm4);
- sctx->init_config = pm4;
+ struct si_screen *sscreen = sctx->screen;
+ uint64_t border_color_va = sctx->border_color_buffer->gpu_address;
+ bool has_clear_state = sscreen->info.has_clear_state;
+ struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
+
+ if (!pm4)
+ return;
+
+ si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL);
+ si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1));
+ si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1));
+ si_pm4_cmd_end(pm4, false);
+
+ if (has_clear_state) {
+ si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE);
+ si_pm4_cmd_add(pm4, 0);
+ si_pm4_cmd_end(pm4, false);
+ }
+
+ if (sctx->chip_class <= GFX8)
+ si_set_raster_config(sctx, pm4);
+
+ si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
+ if (!has_clear_state)
+ si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
+
+ /* FIXME calculate these values somehow ??? */
+ if (sctx->chip_class <= GFX8) {
+ si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
+ si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
+ }
+
+ if (!has_clear_state) {
+ si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
+ si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
+ si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
+ }
+
+ if (sscreen->info.chip_class <= GFX9)
+ si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
+ if (!has_clear_state)
+ si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
+ if (sctx->chip_class < GFX7)
+ si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE,
+ S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1));
+
+ /* CLEAR_STATE doesn't restore these correctly. */
+ si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1));
+ si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR,
+ S_028244_BR_X(16384) | S_028244_BR_Y(16384));
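+ /* Editor's note: 16384 x 16384 is large enough to cover any supported
+ * framebuffer, i.e. the generic scissor is effectively left wide open here.
+ */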
+
+ /* CLEAR_STATE doesn't clear these correctly on certain generations.
+ * I don't know why. Deduced by trial and error.
+ */
+ if (sctx->chip_class <= GFX7 || !has_clear_state) {
+ si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
+ si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
+ si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
+ si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR,
+ S_028034_BR_X(16384) | S_028034_BR_Y(16384));
+ }
+
+ if (!has_clear_state) {
+ si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE,
+ S_028230_ER_TRI(0xA) | S_028230_ER_POINT(0xA) | S_028230_ER_RECT(0xA) |
+ /* Required by DX10_DIAMOND_TEST_ENA: */
+ S_028230_ER_LINE_LR(0x1A) | S_028230_ER_LINE_RL(0x26) |
+ S_028230_ER_LINE_TB(0xA) | S_028230_ER_LINE_BT(0xA));
+ si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0);
+ si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
+ si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
+ si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
+ si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0);
+ }
+
+ if (sctx->chip_class >= GFX10) {
+ si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0);
+ si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0);
+ si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0);
+ si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0);
+ si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0);
+ si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0);
+ } else if (sctx->chip_class == GFX9) {
+ si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0);
+ si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0);
+ si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0);
+ } else {
+ /* These registers, when written, also overwrite the CLEAR_STATE
+ * context, so we can't rely on CLEAR_STATE setting them.
+ * It would be an issue if there was another UMD changing them.
+ */
+ si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0);
+ si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0);
+ si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
+ }
+
+ if (sctx->chip_class >= GFX7) {
+ if (sctx->chip_class >= GFX10) {
+ /* Logical CUs 16 - 31 */
+ si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff));
+ si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff));
+ si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(0xffff));
+ }
+
+ if (sctx->chip_class >= GFX9) {
+ si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
+ S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
+ } else {
+ si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
+ S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
+ si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F));
+ si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
+ S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
+
+ /* If this is 0, Bonaire can hang even if GS isn't being used.
+ * Other chips are unaffected. These are suboptimal values,
+ * but we don't use on-chip GS.
+ */
+ si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
+ S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4));
+ }
+
+ /* Compute LATE_ALLOC_VS.LIMIT. */
+ unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
+ unsigned late_alloc_wave64 = 0; /* The limit is per SH. */
+ unsigned cu_mask_vs = 0xffff;
+ unsigned cu_mask_gs = 0xffff;
+
+ if (sctx->chip_class >= GFX10) {
+ /* For Wave32, the hw will launch twice the number of late
+ * alloc waves, so 1 == 2x wave32.
+ */
+ if (!sscreen->info.use_late_alloc) {
+ late_alloc_wave64 = 0;
+ } else if (num_cu_per_sh <= 6) {
+ late_alloc_wave64 = num_cu_per_sh - 2;
+ } else {
+ late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
+
+ /* CU2 & CU3 disabled because of the dual CU design */
+ /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
+ cu_mask_vs = 0xfff3;
+ cu_mask_gs = sscreen->use_ngg && sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff;
+ }
+ } else {
+ if (!sscreen->info.use_late_alloc) {
+ late_alloc_wave64 = 0;
+ } else if (num_cu_per_sh <= 4) {
+ /* Too few available compute units per SH. Disallowing
+ * VS to run on one CU could hurt us more than late VS
+ * allocation would help.
+ *
+ * 2 is the highest safe number that allows us to keep
+ * all CUs enabled.
+ */
+ late_alloc_wave64 = 2;
+ } else {
+ /* This is a good initial value, allowing 1 late_alloc
+ * wave per SIMD on num_cu - 2.
+ */
+ late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
+ }
+
+ if (late_alloc_wave64 > 2)
+ cu_mask_vs = 0xfffe; /* 1 CU disabled */
+ }
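+
+ /* e.g. with late alloc enabled: a GFX10 part with 10 CUs per SH gets
+ * late_alloc_wave64 = (10 - 2) * 4 = 32 and cu_mask_vs = 0xfff3, while a
+ * GFX8 part with 8 CUs per SH gets late_alloc_wave64 = 24 and
+ * cu_mask_vs = 0xfffe (one CU disabled).
+ */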
+
+ /* VS can't execute on one CU if the limit is > 2. */
+ si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
+ S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F));
+ si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
+
+ si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+ S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
+
+ si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
+ S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
+ }
+
+ if (sctx->chip_class >= GFX10) {
+ /* Break up a pixel wave if it contains deallocs for more than
+ * half the parameter cache.
+ *
+ * To avoid a deadlock where pixel waves aren't launched
+ * because they're waiting for more pixels while the frontend
+ * is stuck waiting for PC space, the maximum allowed value is
+ * the size of the PC minus the largest possible allocation for
+ * a single primitive shader subgroup.
+ */
+ si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512));
+ si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
+
+ if (!has_clear_state) {
+ si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
+ sscreen->info.pa_sc_tile_steering_override);
+ }
+
+ /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. */
+ unsigned meta_write_policy, meta_read_policy;
+ /* TODO: investigate whether LRU improves performance on other chips too */
+ if (sscreen->info.num_render_backends <= 4) {
+ meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */
+ meta_read_policy = V_02807C_CACHE_LRU_RD; /* cache reads */
+ } else {
+ meta_write_policy = V_02807C_CACHE_STREAM_WR; /* write combine */
+ meta_read_policy = V_02807C_CACHE_NOA_RD; /* don't cache reads */
+ }
+
+ si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL,
+ S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
+ S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
+ S_02807C_HTILE_WR_POLICY(meta_write_policy) |
+ S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
+ S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) |
+ S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) |
+ S_02807C_HTILE_RD_POLICY(meta_read_policy));
+
+ si_pm4_set_reg(
+ pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL,
+ S_028410_CMASK_WR_POLICY(meta_write_policy) | S_028410_FMASK_WR_POLICY(meta_write_policy) |
+ S_028410_DCC_WR_POLICY(meta_write_policy) |
+ S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) |
+ S_028410_CMASK_RD_POLICY(meta_read_policy) |
+ S_028410_FMASK_RD_POLICY(meta_read_policy) | S_028410_DCC_RD_POLICY(meta_read_policy) |
+ S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD));
+ si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0);
+
+ si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS,
+ S_00B0C0_SOFT_GROUPING_EN(1) | S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
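+ /* The NUMBER_OF_REQUESTS_PER_CU field above is written as 4 - 1, i.e. it is
+ * encoded as N - 1 and requests 4 per CU. */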
+ si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
+ }
+
+ if (sctx->chip_class >= GFX9) {
+ si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION,
+ S_028B50_ACCUM_ISOLINE(40) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) |
+ S_028B50_DONUT_SPLIT(24) | S_028B50_TRAP_SPLIT(6));
+ } else if (sctx->chip_class >= GFX8) {
+ unsigned vgt_tess_distribution;
+
+ vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) | S_028B50_ACCUM_TRI(11) |
+ S_028B50_ACCUM_QUAD(11) | S_028B50_DONUT_SPLIT(16);
+
+ /* Testing with Unigine Heaven extreme tessellation yielded best results
+ * with TRAP_SPLIT = 3.
+ */
+ if (sctx->family == CHIP_FIJI || sctx->family >= CHIP_POLARIS10)
+ vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3);
+
+ si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution);
+ } else if (!has_clear_state) {
+ si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
+ si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
+ }
+
+ si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
+ if (sctx->chip_class >= GFX7) {
+ si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40));
+ }
+ si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ, RADEON_PRIO_BORDER_COLORS);
+
+ if (sctx->chip_class >= GFX9) {
+ si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1,
+ S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) |
+ S_028C48_MAX_PRIM_PER_BATCH(1023));
+ si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
+ S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));
+ si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0);
+ }
+
+ si_pm4_upload_indirect_buffer(sctx, pm4);
+ sctx->init_config = pm4;
}
#ifndef SI_STATE_H
#define SI_STATE_H
-#include "si_pm4.h"
-
#include "pipebuffer/pb_slab.h"
+#include "si_pm4.h"
#include "util/u_blitter.h"
-#define SI_NUM_GRAPHICS_SHADERS (PIPE_SHADER_TESS_EVAL+1)
-#define SI_NUM_SHADERS (PIPE_SHADER_COMPUTE+1)
+#define SI_NUM_GRAPHICS_SHADERS (PIPE_SHADER_TESS_EVAL + 1)
+#define SI_NUM_SHADERS (PIPE_SHADER_COMPUTE + 1)
-#define SI_NUM_VERTEX_BUFFERS SI_MAX_ATTRIBS
-#define SI_NUM_SAMPLERS 32 /* OpenGL textures units per shader */
-#define SI_NUM_CONST_BUFFERS 16
-#define SI_NUM_IMAGES 16
-#define SI_NUM_IMAGE_SLOTS (SI_NUM_IMAGES * 2) /* the second half are FMASK slots */
-#define SI_NUM_SHADER_BUFFERS 16
+#define SI_NUM_VERTEX_BUFFERS SI_MAX_ATTRIBS
+#define SI_NUM_SAMPLERS 32 /* OpenGL texture units per shader */
+#define SI_NUM_CONST_BUFFERS 16
+#define SI_NUM_IMAGES 16
+#define SI_NUM_IMAGE_SLOTS (SI_NUM_IMAGES * 2) /* the second half are FMASK slots */
+#define SI_NUM_SHADER_BUFFERS 16
struct si_screen;
struct si_shader;
struct si_qbo_state;
struct si_state_blend {
- struct si_pm4_state pm4;
- uint32_t cb_target_mask;
- /* Set 0xf or 0x0 (4 bits) per render target if the following is
- * true. ANDed with spi_shader_col_format.
- */
- unsigned cb_target_enabled_4bit;
- unsigned blend_enable_4bit;
- unsigned need_src_alpha_4bit;
- unsigned commutative_4bit;
- unsigned dcc_msaa_corruption_4bit;
- bool alpha_to_coverage:1;
- bool alpha_to_one:1;
- bool dual_src_blend:1;
- bool logicop_enable:1;
+ struct si_pm4_state pm4;
+ uint32_t cb_target_mask;
+ /* Set 0xf or 0x0 (4 bits) per render target if the following is
+ * true. ANDed with spi_shader_col_format.
+ */
+ unsigned cb_target_enabled_4bit;
+ unsigned blend_enable_4bit;
+ unsigned need_src_alpha_4bit;
+ unsigned commutative_4bit;
+ unsigned dcc_msaa_corruption_4bit;
+ bool alpha_to_coverage : 1;
+ bool alpha_to_one : 1;
+ bool dual_src_blend : 1;
+ bool logicop_enable : 1;
};
struct si_state_rasterizer {
- struct si_pm4_state pm4;
- /* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */
- struct si_pm4_state *pm4_poly_offset;
- unsigned pa_sc_line_stipple;
- unsigned pa_cl_clip_cntl;
- float line_width;
- float max_point_size;
- unsigned sprite_coord_enable:8;
- unsigned clip_plane_enable:8;
- unsigned half_pixel_center:1;
- unsigned flatshade:1;
- unsigned flatshade_first:1;
- unsigned two_side:1;
- unsigned multisample_enable:1;
- unsigned force_persample_interp:1;
- unsigned line_stipple_enable:1;
- unsigned poly_stipple_enable:1;
- unsigned line_smooth:1;
- unsigned poly_smooth:1;
- unsigned uses_poly_offset:1;
- unsigned clamp_fragment_color:1;
- unsigned clamp_vertex_color:1;
- unsigned rasterizer_discard:1;
- unsigned scissor_enable:1;
- unsigned clip_halfz:1;
- unsigned cull_front:1;
- unsigned cull_back:1;
- unsigned depth_clamp_any:1;
- unsigned provoking_vertex_first:1;
- unsigned polygon_mode_enabled:1;
- unsigned polygon_mode_is_lines:1;
+ struct si_pm4_state pm4;
+ /* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */
+ struct si_pm4_state *pm4_poly_offset;
+ unsigned pa_sc_line_stipple;
+ unsigned pa_cl_clip_cntl;
+ float line_width;
+ float max_point_size;
+ unsigned sprite_coord_enable : 8;
+ unsigned clip_plane_enable : 8;
+ unsigned half_pixel_center : 1;
+ unsigned flatshade : 1;
+ unsigned flatshade_first : 1;
+ unsigned two_side : 1;
+ unsigned multisample_enable : 1;
+ unsigned force_persample_interp : 1;
+ unsigned line_stipple_enable : 1;
+ unsigned poly_stipple_enable : 1;
+ unsigned line_smooth : 1;
+ unsigned poly_smooth : 1;
+ unsigned uses_poly_offset : 1;
+ unsigned clamp_fragment_color : 1;
+ unsigned clamp_vertex_color : 1;
+ unsigned rasterizer_discard : 1;
+ unsigned scissor_enable : 1;
+ unsigned clip_halfz : 1;
+ unsigned cull_front : 1;
+ unsigned cull_back : 1;
+ unsigned depth_clamp_any : 1;
+ unsigned provoking_vertex_first : 1;
+ unsigned polygon_mode_enabled : 1;
+ unsigned polygon_mode_is_lines : 1;
};
struct si_dsa_stencil_ref_part {
- uint8_t valuemask[2];
- uint8_t writemask[2];
+ uint8_t valuemask[2];
+ uint8_t writemask[2];
};
struct si_dsa_order_invariance {
- /** Whether the final result in Z/S buffers is guaranteed to be
- * invariant under changes to the order in which fragments arrive. */
- bool zs:1;
-
- /** Whether the set of fragments that pass the combined Z/S test is
- * guaranteed to be invariant under changes to the order in which
- * fragments arrive. */
- bool pass_set:1;
-
- /** Whether the last fragment that passes the combined Z/S test at each
- * sample is guaranteed to be invariant under changes to the order in
- * which fragments arrive. */
- bool pass_last:1;
+ /** Whether the final result in Z/S buffers is guaranteed to be
+ * invariant under changes to the order in which fragments arrive. */
+ bool zs : 1;
+
+ /** Whether the set of fragments that pass the combined Z/S test is
+ * guaranteed to be invariant under changes to the order in which
+ * fragments arrive. */
+ bool pass_set : 1;
+
+ /** Whether the last fragment that passes the combined Z/S test at each
+ * sample is guaranteed to be invariant under changes to the order in
+ * which fragments arrive. */
+ bool pass_last : 1;
};
struct si_state_dsa {
- struct si_pm4_state pm4;
- struct si_dsa_stencil_ref_part stencil_ref;
-
- /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */
- struct si_dsa_order_invariance order_invariance[2];
-
- ubyte alpha_func:3;
- bool depth_enabled:1;
- bool depth_write_enabled:1;
- bool stencil_enabled:1;
- bool stencil_write_enabled:1;
- bool db_can_write:1;
-
+ struct si_pm4_state pm4;
+ struct si_dsa_stencil_ref_part stencil_ref;
+
+ /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */
+ struct si_dsa_order_invariance order_invariance[2];
+
+ ubyte alpha_func : 3;
+ bool depth_enabled : 1;
+ bool depth_write_enabled : 1;
+ bool stencil_enabled : 1;
+ bool stencil_write_enabled : 1;
+ bool db_can_write : 1;
};
struct si_stencil_ref {
- struct pipe_stencil_ref state;
- struct si_dsa_stencil_ref_part dsa_part;
+ struct pipe_stencil_ref state;
+ struct si_dsa_stencil_ref_part dsa_part;
};
-struct si_vertex_elements
-{
- struct si_resource *instance_divisor_factor_buffer;
- uint32_t rsrc_word3[SI_MAX_ATTRIBS];
- uint16_t src_offset[SI_MAX_ATTRIBS];
- uint8_t fix_fetch[SI_MAX_ATTRIBS];
- uint8_t format_size[SI_MAX_ATTRIBS];
- uint8_t vertex_buffer_index[SI_MAX_ATTRIBS];
-
- /* Bitmask of elements that always need a fixup to be applied. */
- uint16_t fix_fetch_always;
-
- /* Bitmask of elements whose fetch should always be opencoded. */
- uint16_t fix_fetch_opencode;
-
- /* Bitmask of elements which need to be opencoded if the vertex buffer
- * is unaligned. */
- uint16_t fix_fetch_unaligned;
-
- /* For elements in fix_fetch_unaligned: whether the effective
- * element load size as seen by the hardware is a dword (as opposed
- * to a short).
- */
- uint16_t hw_load_is_dword;
-
- /* Bitmask of vertex buffers requiring alignment check */
- uint16_t vb_alignment_check_mask;
-
- uint8_t count;
- bool uses_instance_divisors;
-
- uint16_t first_vb_use_mask;
- /* Vertex buffer descriptor list size aligned for optimal prefetch. */
- uint16_t vb_desc_list_alloc_size;
- uint16_t instance_divisor_is_one; /* bitmask of inputs */
- uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
+struct si_vertex_elements {
+ struct si_resource *instance_divisor_factor_buffer;
+ uint32_t rsrc_word3[SI_MAX_ATTRIBS];
+ uint16_t src_offset[SI_MAX_ATTRIBS];
+ uint8_t fix_fetch[SI_MAX_ATTRIBS];
+ uint8_t format_size[SI_MAX_ATTRIBS];
+ uint8_t vertex_buffer_index[SI_MAX_ATTRIBS];
+
+ /* Bitmask of elements that always need a fixup to be applied. */
+ uint16_t fix_fetch_always;
+
+ /* Bitmask of elements whose fetch should always be opencoded. */
+ uint16_t fix_fetch_opencode;
+
+ /* Bitmask of elements which need to be opencoded if the vertex buffer
+ * is unaligned. */
+ uint16_t fix_fetch_unaligned;
+
+ /* For elements in fix_fetch_unaligned: whether the effective
+ * element load size as seen by the hardware is a dword (as opposed
+ * to a short).
+ */
+ uint16_t hw_load_is_dword;
+
+ /* Bitmask of vertex buffers requiring alignment check */
+ uint16_t vb_alignment_check_mask;
+
+ uint8_t count;
+ bool uses_instance_divisors;
+
+ uint16_t first_vb_use_mask;
+ /* Vertex buffer descriptor list size aligned for optimal prefetch. */
+ uint16_t vb_desc_list_alloc_size;
+ uint16_t instance_divisor_is_one; /* bitmask of inputs */
+ uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
};
union si_state {
- struct {
- struct si_state_blend *blend;
- struct si_state_rasterizer *rasterizer;
- struct si_state_dsa *dsa;
- struct si_pm4_state *poly_offset;
- struct si_pm4_state *ls;
- struct si_pm4_state *hs;
- struct si_pm4_state *es;
- struct si_pm4_state *gs;
- struct si_pm4_state *vgt_shader_config;
- struct si_pm4_state *vs;
- struct si_pm4_state *ps;
- } named;
- struct si_pm4_state *array[0];
+ struct {
+ struct si_state_blend *blend;
+ struct si_state_rasterizer *rasterizer;
+ struct si_state_dsa *dsa;
+ struct si_pm4_state *poly_offset;
+ struct si_pm4_state *ls;
+ struct si_pm4_state *hs;
+ struct si_pm4_state *es;
+ struct si_pm4_state *gs;
+ struct si_pm4_state *vgt_shader_config;
+ struct si_pm4_state *vs;
+ struct si_pm4_state *ps;
+ } named;
+ struct si_pm4_state *array[0];
};
-#define SI_STATE_IDX(name) \
- (offsetof(union si_state, named.name) / sizeof(struct si_pm4_state *))
+#define SI_STATE_IDX(name) (offsetof(union si_state, named.name) / sizeof(struct si_pm4_state *))
#define SI_STATE_BIT(name) (1 << SI_STATE_IDX(name))
-#define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *))
+#define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *))
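+/* e.g. SI_STATE_IDX(blend) == 0 and SI_STATE_BIT(blend) == 1; with the 11 named
+ * states above, SI_NUM_STATES == 11. */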
static inline unsigned si_states_that_always_roll_context(void)
{
- return (SI_STATE_BIT(blend) |
- SI_STATE_BIT(rasterizer) |
- SI_STATE_BIT(dsa) |
- SI_STATE_BIT(poly_offset) |
- SI_STATE_BIT(vgt_shader_config));
+ return (SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(dsa) |
+ SI_STATE_BIT(poly_offset) | SI_STATE_BIT(vgt_shader_config));
}
union si_state_atoms {
- struct {
- /* The order matters. */
- struct si_atom render_cond;
- struct si_atom streamout_begin;
- struct si_atom streamout_enable; /* must be after streamout_begin */
- struct si_atom framebuffer;
- struct si_atom msaa_sample_locs;
- struct si_atom db_render_state;
- struct si_atom dpbb_state;
- struct si_atom msaa_config;
- struct si_atom sample_mask;
- struct si_atom cb_render_state;
- struct si_atom blend_color;
- struct si_atom clip_regs;
- struct si_atom clip_state;
- struct si_atom shader_pointers;
- struct si_atom guardband;
- struct si_atom scissors;
- struct si_atom viewports;
- struct si_atom stencil_ref;
- struct si_atom spi_map;
- struct si_atom scratch_state;
- struct si_atom window_rectangles;
- struct si_atom shader_query;
- } s;
- struct si_atom array[0];
+ struct {
+ /* The order matters. */
+ struct si_atom render_cond;
+ struct si_atom streamout_begin;
+ struct si_atom streamout_enable; /* must be after streamout_begin */
+ struct si_atom framebuffer;
+ struct si_atom msaa_sample_locs;
+ struct si_atom db_render_state;
+ struct si_atom dpbb_state;
+ struct si_atom msaa_config;
+ struct si_atom sample_mask;
+ struct si_atom cb_render_state;
+ struct si_atom blend_color;
+ struct si_atom clip_regs;
+ struct si_atom clip_state;
+ struct si_atom shader_pointers;
+ struct si_atom guardband;
+ struct si_atom scissors;
+ struct si_atom viewports;
+ struct si_atom stencil_ref;
+ struct si_atom spi_map;
+ struct si_atom scratch_state;
+ struct si_atom window_rectangles;
+ struct si_atom shader_query;
+ } s;
+ struct si_atom array[0];
};
-#define SI_ATOM_BIT(name) (1 << (offsetof(union si_state_atoms, s.name) / \
- sizeof(struct si_atom)))
-#define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct si_atom*))
+#define SI_ATOM_BIT(name) (1 << (offsetof(union si_state_atoms, s.name) / sizeof(struct si_atom)))
+#define SI_NUM_ATOMS (sizeof(union si_state_atoms) / sizeof(struct si_atom *))
static inline unsigned si_atoms_that_always_roll_context(void)
{
- return (SI_ATOM_BIT(streamout_begin) |
- SI_ATOM_BIT(streamout_enable) |
- SI_ATOM_BIT(framebuffer) |
- SI_ATOM_BIT(msaa_sample_locs) |
- SI_ATOM_BIT(sample_mask) |
- SI_ATOM_BIT(blend_color) |
- SI_ATOM_BIT(clip_state) |
- SI_ATOM_BIT(scissors) |
- SI_ATOM_BIT(viewports) |
- SI_ATOM_BIT(stencil_ref) |
- SI_ATOM_BIT(scratch_state) |
- SI_ATOM_BIT(window_rectangles));
+ return (SI_ATOM_BIT(streamout_begin) | SI_ATOM_BIT(streamout_enable) | SI_ATOM_BIT(framebuffer) |
+ SI_ATOM_BIT(msaa_sample_locs) | SI_ATOM_BIT(sample_mask) | SI_ATOM_BIT(blend_color) |
+ SI_ATOM_BIT(clip_state) | SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports) |
+ SI_ATOM_BIT(stencil_ref) | SI_ATOM_BIT(scratch_state) | SI_ATOM_BIT(window_rectangles));
}
struct si_shader_data {
- uint32_t sh_base[SI_NUM_SHADERS];
+ uint32_t sh_base[SI_NUM_SHADERS];
};
-#define SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK \
- (S_02881C_USE_VTX_POINT_SIZE(1) | \
- S_02881C_USE_VTX_EDGE_FLAG(1) | \
- S_02881C_USE_VTX_RENDER_TARGET_INDX(1) | \
- S_02881C_USE_VTX_VIEWPORT_INDX(1) | \
- S_02881C_VS_OUT_MISC_VEC_ENA(1) | \
- S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1))
+#define SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK \
+ (S_02881C_USE_VTX_POINT_SIZE(1) | S_02881C_USE_VTX_EDGE_FLAG(1) | \
+ S_02881C_USE_VTX_RENDER_TARGET_INDX(1) | S_02881C_USE_VTX_VIEWPORT_INDX(1) | \
+ S_02881C_VS_OUT_MISC_VEC_ENA(1) | S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1))
/* The list of registers whose emitted values are remembered by si_context. */
-enum si_tracked_reg {
- SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */
- SI_TRACKED_DB_COUNT_CONTROL,
+enum si_tracked_reg
+{
+ SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */
+ SI_TRACKED_DB_COUNT_CONTROL,
- SI_TRACKED_DB_RENDER_OVERRIDE2,
- SI_TRACKED_DB_SHADER_CONTROL,
+ SI_TRACKED_DB_RENDER_OVERRIDE2,
+ SI_TRACKED_DB_SHADER_CONTROL,
- SI_TRACKED_CB_TARGET_MASK,
- SI_TRACKED_CB_DCC_CONTROL,
+ SI_TRACKED_CB_TARGET_MASK,
+ SI_TRACKED_CB_DCC_CONTROL,
- SI_TRACKED_SX_PS_DOWNCONVERT, /* 3 consecutive registers */
- SI_TRACKED_SX_BLEND_OPT_EPSILON,
- SI_TRACKED_SX_BLEND_OPT_CONTROL,
+ SI_TRACKED_SX_PS_DOWNCONVERT, /* 3 consecutive registers */
+ SI_TRACKED_SX_BLEND_OPT_EPSILON,
+ SI_TRACKED_SX_BLEND_OPT_CONTROL,
- SI_TRACKED_PA_SC_LINE_CNTL, /* 2 consecutive registers */
- SI_TRACKED_PA_SC_AA_CONFIG,
+ SI_TRACKED_PA_SC_LINE_CNTL, /* 2 consecutive registers */
+ SI_TRACKED_PA_SC_AA_CONFIG,
- SI_TRACKED_DB_EQAA,
- SI_TRACKED_PA_SC_MODE_CNTL_1,
+ SI_TRACKED_DB_EQAA,
+ SI_TRACKED_PA_SC_MODE_CNTL_1,
- SI_TRACKED_PA_SU_PRIM_FILTER_CNTL,
- SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL,
+ SI_TRACKED_PA_SU_PRIM_FILTER_CNTL,
+ SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL,
- SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, /* set with SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK*/
- SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, /* set with ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */
- SI_TRACKED_PA_CL_CLIP_CNTL,
+ SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, /* set with SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */
+ SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, /* set with ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */
+ SI_TRACKED_PA_CL_CLIP_CNTL,
- SI_TRACKED_PA_SC_BINNER_CNTL_0,
- SI_TRACKED_DB_DFSM_CONTROL,
+ SI_TRACKED_PA_SC_BINNER_CNTL_0,
+ SI_TRACKED_DB_DFSM_CONTROL,
- SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, /* 4 consecutive registers */
- SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ,
- SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ,
- SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ,
+ SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, /* 4 consecutive registers */
+ SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ,
+ SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ,
+ SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ,
- SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
- SI_TRACKED_PA_SU_VTX_CNTL,
+ SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
+ SI_TRACKED_PA_SU_VTX_CNTL,
- SI_TRACKED_PA_SC_CLIPRECT_RULE,
+ SI_TRACKED_PA_SC_CLIPRECT_RULE,
- SI_TRACKED_PA_SC_LINE_STIPPLE,
+ SI_TRACKED_PA_SC_LINE_STIPPLE,
- SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
+ SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
- SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 3 consecutive registers */
- SI_TRACKED_VGT_GSVS_RING_OFFSET_2,
- SI_TRACKED_VGT_GSVS_RING_OFFSET_3,
+ SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 3 consecutive registers */
+ SI_TRACKED_VGT_GSVS_RING_OFFSET_2,
+ SI_TRACKED_VGT_GSVS_RING_OFFSET_3,
- SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
- SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+ SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
+ SI_TRACKED_VGT_GS_MAX_VERT_OUT,
- SI_TRACKED_VGT_GS_VERT_ITEMSIZE, /* 4 consecutive registers */
- SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1,
- SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2,
- SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3,
+ SI_TRACKED_VGT_GS_VERT_ITEMSIZE, /* 4 consecutive registers */
+ SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1,
+ SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2,
+ SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3,
- SI_TRACKED_VGT_GS_INSTANCE_CNT,
- SI_TRACKED_VGT_GS_ONCHIP_CNTL,
- SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
- SI_TRACKED_VGT_GS_MODE,
- SI_TRACKED_VGT_PRIMITIVEID_EN,
- SI_TRACKED_VGT_REUSE_OFF,
- SI_TRACKED_SPI_VS_OUT_CONFIG,
- SI_TRACKED_PA_CL_VTE_CNTL,
- SI_TRACKED_PA_CL_NGG_CNTL,
- SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
- SI_TRACKED_GE_NGG_SUBGRP_CNTL,
+ SI_TRACKED_VGT_GS_INSTANCE_CNT,
+ SI_TRACKED_VGT_GS_ONCHIP_CNTL,
+ SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+ SI_TRACKED_VGT_GS_MODE,
+ SI_TRACKED_VGT_PRIMITIVEID_EN,
+ SI_TRACKED_VGT_REUSE_OFF,
+ SI_TRACKED_SPI_VS_OUT_CONFIG,
+ SI_TRACKED_PA_CL_VTE_CNTL,
+ SI_TRACKED_PA_CL_NGG_CNTL,
+ SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
+ SI_TRACKED_GE_NGG_SUBGRP_CNTL,
- SI_TRACKED_SPI_SHADER_IDX_FORMAT, /* 2 consecutive registers */
- SI_TRACKED_SPI_SHADER_POS_FORMAT,
+ SI_TRACKED_SPI_SHADER_IDX_FORMAT, /* 2 consecutive registers */
+ SI_TRACKED_SPI_SHADER_POS_FORMAT,
- SI_TRACKED_SPI_PS_INPUT_ENA, /* 2 consecutive registers */
- SI_TRACKED_SPI_PS_INPUT_ADDR,
+ SI_TRACKED_SPI_PS_INPUT_ENA, /* 2 consecutive registers */
+ SI_TRACKED_SPI_PS_INPUT_ADDR,
- SI_TRACKED_SPI_BARYC_CNTL,
- SI_TRACKED_SPI_PS_IN_CONTROL,
+ SI_TRACKED_SPI_BARYC_CNTL,
+ SI_TRACKED_SPI_PS_IN_CONTROL,
- SI_TRACKED_SPI_SHADER_Z_FORMAT, /* 2 consecutive registers */
- SI_TRACKED_SPI_SHADER_COL_FORMAT,
+ SI_TRACKED_SPI_SHADER_Z_FORMAT, /* 2 consecutive registers */
+ SI_TRACKED_SPI_SHADER_COL_FORMAT,
- SI_TRACKED_CB_SHADER_MASK,
- SI_TRACKED_VGT_TF_PARAM,
- SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
+ SI_TRACKED_CB_SHADER_MASK,
+ SI_TRACKED_VGT_TF_PARAM,
+ SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
- SI_TRACKED_GE_PC_ALLOC,
+ SI_TRACKED_GE_PC_ALLOC,
- SI_NUM_TRACKED_REGS,
+ SI_NUM_TRACKED_REGS,
};
struct si_tracked_regs {
- uint64_t reg_saved;
- uint32_t reg_value[SI_NUM_TRACKED_REGS];
- uint32_t spi_ps_input_cntl[32];
+ uint64_t reg_saved;
+ uint32_t reg_value[SI_NUM_TRACKED_REGS];
+ uint32_t spi_ps_input_cntl[32];
};
/* Private read-write buffer slots. */
-enum {
- SI_ES_RING_ESGS,
- SI_GS_RING_ESGS,
+enum
+{
+ SI_ES_RING_ESGS,
+ SI_GS_RING_ESGS,
- SI_RING_GSVS,
+ SI_RING_GSVS,
- SI_VS_STREAMOUT_BUF0,
- SI_VS_STREAMOUT_BUF1,
- SI_VS_STREAMOUT_BUF2,
- SI_VS_STREAMOUT_BUF3,
+ SI_VS_STREAMOUT_BUF0,
+ SI_VS_STREAMOUT_BUF1,
+ SI_VS_STREAMOUT_BUF2,
+ SI_VS_STREAMOUT_BUF3,
- SI_HS_CONST_DEFAULT_TESS_LEVELS,
- SI_VS_CONST_INSTANCE_DIVISORS,
- SI_VS_CONST_CLIP_PLANES,
- SI_PS_CONST_POLY_STIPPLE,
- SI_PS_CONST_SAMPLE_POSITIONS,
+ SI_HS_CONST_DEFAULT_TESS_LEVELS,
+ SI_VS_CONST_INSTANCE_DIVISORS,
+ SI_VS_CONST_CLIP_PLANES,
+ SI_PS_CONST_POLY_STIPPLE,
+ SI_PS_CONST_SAMPLE_POSITIONS,
- /* Image descriptor of color buffer 0 for KHR_blend_equation_advanced. */
- SI_PS_IMAGE_COLORBUF0,
- SI_PS_IMAGE_COLORBUF0_HI,
- SI_PS_IMAGE_COLORBUF0_FMASK,
- SI_PS_IMAGE_COLORBUF0_FMASK_HI,
+ /* Image descriptor of color buffer 0 for KHR_blend_equation_advanced. */
+ SI_PS_IMAGE_COLORBUF0,
+ SI_PS_IMAGE_COLORBUF0_HI,
+ SI_PS_IMAGE_COLORBUF0_FMASK,
+ SI_PS_IMAGE_COLORBUF0_FMASK_HI,
- GFX10_GS_QUERY_BUF,
+ GFX10_GS_QUERY_BUF,
- SI_NUM_RW_BUFFERS,
+ SI_NUM_RW_BUFFERS,
};
/* Indices into sctx->descriptors, laid out so that gfx and compute pipelines
* 11 - compute const and shader buffers
* 12 - compute samplers and images
*/
-enum {
- SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
- SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
- SI_NUM_SHADER_DESCS,
+enum
+{
+ SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+ SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+ SI_NUM_SHADER_DESCS,
};
-#define SI_DESCS_RW_BUFFERS 0
-#define SI_DESCS_FIRST_SHADER 1
-#define SI_DESCS_FIRST_COMPUTE (SI_DESCS_FIRST_SHADER + \
- PIPE_SHADER_COMPUTE * SI_NUM_SHADER_DESCS)
-#define SI_NUM_DESCS (SI_DESCS_FIRST_SHADER + \
- SI_NUM_SHADERS * SI_NUM_SHADER_DESCS)
+#define SI_DESCS_RW_BUFFERS 0
+#define SI_DESCS_FIRST_SHADER 1
+#define SI_DESCS_FIRST_COMPUTE (SI_DESCS_FIRST_SHADER + PIPE_SHADER_COMPUTE * SI_NUM_SHADER_DESCS)
+#define SI_NUM_DESCS (SI_DESCS_FIRST_SHADER + SI_NUM_SHADERS * SI_NUM_SHADER_DESCS)
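+/* With PIPE_SHADER_COMPUTE == 5 and SI_NUM_SHADER_DESCS == 2, this yields
+ * SI_DESCS_FIRST_COMPUTE == 11 and SI_NUM_DESCS == 13, matching the
+ * "11/12 - compute" slots in the descriptor layout comment above. */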
-#define SI_DESCS_SHADER_MASK(name) \
- u_bit_consecutive(SI_DESCS_FIRST_SHADER + \
- PIPE_SHADER_##name * SI_NUM_SHADER_DESCS, \
- SI_NUM_SHADER_DESCS)
+#define SI_DESCS_SHADER_MASK(name) \
+ u_bit_consecutive(SI_DESCS_FIRST_SHADER + PIPE_SHADER_##name * SI_NUM_SHADER_DESCS, \
+ SI_NUM_SHADER_DESCS)
-static inline unsigned
-si_const_and_shader_buffer_descriptors_idx(unsigned shader)
+static inline unsigned si_const_and_shader_buffer_descriptors_idx(unsigned shader)
{
- return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
- SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS;
+ return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
+ SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS;
}
-static inline unsigned
-si_sampler_and_image_descriptors_idx(unsigned shader)
+static inline unsigned si_sampler_and_image_descriptors_idx(unsigned shader)
{
- return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
- SI_SHADER_DESCS_SAMPLERS_AND_IMAGES;
+ return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
+ SI_SHADER_DESCS_SAMPLERS_AND_IMAGES;
}
/* This represents descriptors in memory, such as buffer resources,
* image resources, and sampler states.
*/
struct si_descriptors {
- /* The list of descriptors in malloc'd memory. */
- uint32_t *list;
- /* The list in mapped GPU memory. */
- uint32_t *gpu_list;
-
- /* The buffer where the descriptors have been uploaded. */
- struct si_resource *buffer;
- uint64_t gpu_address;
-
- /* The maximum number of descriptors. */
- uint32_t num_elements;
-
- /* Slots that are used by currently-bound shaders.
- * It determines which slots are uploaded.
- */
- uint32_t first_active_slot;
- uint32_t num_active_slots;
-
- /* The SH register offset relative to USER_DATA*_0 where the pointer
- * to the descriptor array will be stored. */
- short shader_userdata_offset;
- /* The size of one descriptor. */
- ubyte element_dw_size;
- /* If there is only one slot enabled, bind it directly instead of
- * uploading descriptors. -1 if disabled. */
- signed char slot_index_to_bind_directly;
+ /* The list of descriptors in malloc'd memory. */
+ uint32_t *list;
+ /* The list in mapped GPU memory. */
+ uint32_t *gpu_list;
+
+ /* The buffer where the descriptors have been uploaded. */
+ struct si_resource *buffer;
+ uint64_t gpu_address;
+
+ /* The maximum number of descriptors. */
+ uint32_t num_elements;
+
+ /* Slots that are used by currently-bound shaders.
+ * It determines which slots are uploaded.
+ */
+ uint32_t first_active_slot;
+ uint32_t num_active_slots;
+
+ /* The SH register offset relative to USER_DATA*_0 where the pointer
+ * to the descriptor array will be stored. */
+ short shader_userdata_offset;
+ /* The size of one descriptor. */
+ ubyte element_dw_size;
+ /* If there is only one slot enabled, bind it directly instead of
+ * uploading descriptors. -1 if disabled. */
+ signed char slot_index_to_bind_directly;
};
struct si_buffer_resources {
- struct pipe_resource **buffers; /* this has num_buffers elements */
- unsigned *offsets; /* this has num_buffers elements */
+ struct pipe_resource **buffers; /* this has num_buffers elements */
+ unsigned *offsets; /* this has num_buffers elements */
- enum radeon_bo_priority priority:6;
- enum radeon_bo_priority priority_constbuf:6;
+ enum radeon_bo_priority priority : 6;
+ enum radeon_bo_priority priority_constbuf : 6;
- /* The i-th bit is set if that element is enabled (non-NULL resource). */
- unsigned enabled_mask;
- unsigned writable_mask;
+ /* The i-th bit is set if that element is enabled (non-NULL resource). */
+ unsigned enabled_mask;
+ unsigned writable_mask;
};
-#define si_pm4_state_changed(sctx, member) \
- ((sctx)->queued.named.member != (sctx)->emitted.named.member)
+#define si_pm4_state_changed(sctx, member) \
+ ((sctx)->queued.named.member != (sctx)->emitted.named.member)
-#define si_pm4_state_enabled_and_changed(sctx, member) \
- ((sctx)->queued.named.member && si_pm4_state_changed(sctx, member))
+#define si_pm4_state_enabled_and_changed(sctx, member) \
+ ((sctx)->queued.named.member && si_pm4_state_changed(sctx, member))
-#define si_pm4_bind_state(sctx, member, value) \
- do { \
- (sctx)->queued.named.member = (value); \
- (sctx)->dirty_states |= SI_STATE_BIT(member); \
- } while(0)
+#define si_pm4_bind_state(sctx, member, value) \
+ do { \
+ (sctx)->queued.named.member = (value); \
+ (sctx)->dirty_states |= SI_STATE_BIT(member); \
+ } while (0)
-#define si_pm4_delete_state(sctx, member, value) \
- do { \
- if ((sctx)->queued.named.member == (value)) { \
- (sctx)->queued.named.member = NULL; \
- } \
- si_pm4_free_state(sctx, (struct si_pm4_state *)(value), \
- SI_STATE_IDX(member)); \
- } while(0)
+#define si_pm4_delete_state(sctx, member, value) \
+ do { \
+ if ((sctx)->queued.named.member == (value)) { \
+ (sctx)->queued.named.member = NULL; \
+ } \
+ si_pm4_free_state(sctx, (struct si_pm4_state *)(value), SI_STATE_IDX(member)); \
+ } while (0)
/* si_descriptors.c */
-void si_set_mutable_tex_desc_fields(struct si_screen *sscreen,
- struct si_texture *tex,
- const struct legacy_surf_level *base_level_info,
- unsigned base_level, unsigned first_level,
- unsigned block_width, bool is_stencil,
- uint32_t *state);
+void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex,
+ const struct legacy_surf_level *base_level_info,
+ unsigned base_level, unsigned first_level, unsigned block_width,
+ bool is_stencil, uint32_t *state);
void si_update_ps_colorbuf0_slot(struct si_context *sctx);
-void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader,
- uint slot, struct pipe_constant_buffer *cbuf);
-void si_get_shader_buffers(struct si_context *sctx,
- enum pipe_shader_type shader,
- uint start_slot, uint count,
- struct pipe_shader_buffer *sbuf);
-void si_set_ring_buffer(struct si_context *sctx, uint slot,
- struct pipe_resource *buffer,
- unsigned stride, unsigned num_records,
- bool add_tid, bool swizzle,
- unsigned element_size, unsigned index_stride, uint64_t offset);
+void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot,
+ struct pipe_constant_buffer *cbuf);
+void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot,
+ uint count, struct pipe_shader_buffer *sbuf);
+void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource *buffer,
+ unsigned stride, unsigned num_records, bool add_tid, bool swizzle,
+ unsigned element_size, unsigned index_stride, uint64_t offset);
void si_init_all_descriptors(struct si_context *sctx);
bool si_upload_vertex_buffer_descriptors(struct si_context *sctx);
bool si_upload_graphics_shader_descriptors(struct si_context *sctx);
void si_gfx_resources_add_all_to_bo_list(struct si_context *sctx);
void si_compute_resources_add_all_to_bo_list(struct si_context *sctx);
void si_all_descriptors_begin_new_cs(struct si_context *sctx);
-void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf,
- const uint8_t *ptr, unsigned size, uint32_t *const_offset);
+void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf, const uint8_t *ptr,
+ unsigned size, uint32_t *const_offset);
void si_update_all_texture_descriptors(struct si_context *sctx);
void si_shader_change_notify(struct si_context *sctx);
void si_update_needs_color_decompress_masks(struct si_context *sctx);
void si_emit_graphics_shader_pointers(struct si_context *sctx);
void si_emit_compute_shader_pointers(struct si_context *sctx);
-void si_set_rw_buffer(struct si_context *sctx,
- uint slot, const struct pipe_constant_buffer *input);
+void si_set_rw_buffer(struct si_context *sctx, uint slot, const struct pipe_constant_buffer *input);
void si_set_rw_shader_buffer(struct si_context *sctx, uint slot,
- const struct pipe_shader_buffer *sbuffer);
+ const struct pipe_shader_buffer *sbuffer);
void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
- uint64_t new_active_mask);
-void si_set_active_descriptors_for_shader(struct si_context *sctx,
- struct si_shader_selector *sel);
-bool si_bindless_descriptor_can_reclaim_slab(void *priv,
- struct pb_slab_entry *entry);
-struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap,
- unsigned entry_size,
- unsigned group_index);
+ uint64_t new_active_mask);
+void si_set_active_descriptors_for_shader(struct si_context *sctx, struct si_shader_selector *sel);
+bool si_bindless_descriptor_can_reclaim_slab(void *priv, struct pb_slab_entry *entry);
+struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap, unsigned entry_size,
+ unsigned group_index);
void si_bindless_descriptor_slab_free(void *priv, struct pb_slab *pslab);
void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf);
/* si_state.c */
void si_init_state_compute_functions(struct si_context *sctx);
void si_init_state_functions(struct si_context *sctx);
void si_init_screen_state_functions(struct si_screen *sscreen);
-void
-si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf,
- enum pipe_format format,
- unsigned offset, unsigned size,
- uint32_t *state);
-struct pipe_sampler_view *
-si_create_sampler_view_custom(struct pipe_context *ctx,
- struct pipe_resource *texture,
- const struct pipe_sampler_view *state,
- unsigned width0, unsigned height0,
- unsigned force_level);
+void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf,
+ enum pipe_format format, unsigned offset, unsigned size,
+ uint32_t *state);
+struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx,
+ struct pipe_resource *texture,
+ const struct pipe_sampler_view *state,
+ unsigned width0, unsigned height0,
+ unsigned force_level);
void si_update_fb_dirtiness_after_rendering(struct si_context *sctx);
void si_update_ps_iter_samples(struct si_context *sctx);
void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st);
void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st);
-void si_set_occlusion_query_state(struct si_context *sctx,
- bool old_perfect_enable);
+void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enable);
struct si_fast_udiv_info32 {
unsigned multiplier; /* the "magic number" multiplier */
- unsigned pre_shift; /* shift for the dividend before multiplying */
+ unsigned pre_shift; /* shift for the dividend before multiplying */
unsigned post_shift; /* shift for the dividend after multiplying */
- int increment; /* 0 or 1; if set then increment the numerator, using one of
- the two strategies */
+ int increment; /* 0 or 1; if set then increment the numerator, using one of
+ the two strategies */
};
-struct si_fast_udiv_info32
-si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits);
+struct si_fast_udiv_info32 si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits);
/* si_state_binning.c */
void si_emit_dpbb_state(struct si_context *sctx);
/* si_state_shaders.c */
void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
- unsigned char ir_sha1_cache_key[20]);
-bool si_shader_cache_load_shader(struct si_screen *sscreen,
- unsigned char ir_sha1_cache_key[20],
- struct si_shader *shader);
-void si_shader_cache_insert_shader(struct si_screen *sscreen,
- unsigned char ir_sha1_cache_key[20],
- struct si_shader *shader,
- bool insert_into_disk_cache);
+ unsigned char ir_sha1_cache_key[20]);
+bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
+ struct si_shader *shader);
+void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
+ struct si_shader *shader, bool insert_into_disk_cache);
bool si_update_shaders(struct si_context *sctx);
void si_init_screen_live_shader_cache(struct si_screen *sscreen);
void si_init_shader_functions(struct si_context *sctx);
bool si_init_shader_cache(struct si_screen *sscreen);
void si_destroy_shader_cache(struct si_screen *sscreen);
void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
- struct util_queue_fence *ready_fence,
- struct si_compiler_ctx_state *compiler_ctx_state,
- void *job, util_queue_execute_func execute);
-void si_get_active_slot_masks(const struct si_shader_info *info,
- uint32_t *const_and_shader_buffers,
- uint64_t *samplers_and_images);
-int si_shader_select_with_key(struct si_screen *sscreen,
- struct si_shader_ctx_state *state,
- struct si_compiler_ctx_state *compiler_state,
- struct si_shader_key *key,
- int thread_index,
- bool optimized_or_none);
-void si_shader_selector_key_vs(struct si_context *sctx,
- struct si_shader_selector *vs,
- struct si_shader_key *key,
- struct si_vs_prolog_bits *prolog_key);
+ struct util_queue_fence *ready_fence,
+ struct si_compiler_ctx_state *compiler_ctx_state, void *job,
+ util_queue_execute_func execute);
+void si_get_active_slot_masks(const struct si_shader_info *info, uint32_t *const_and_shader_buffers,
+ uint64_t *samplers_and_images);
+int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state,
+ struct si_compiler_ctx_state *compiler_state,
+ struct si_shader_key *key, int thread_index, bool optimized_or_none);
+void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs,
+ struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key);
unsigned si_get_input_prim(const struct si_shader_selector *gs);
bool si_update_ngg(struct si_context *sctx);
/* si_state_draw.c */
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
- unsigned cp_coher_cntl);
+ unsigned cp_coher_cntl);
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
void gfx10_emit_cache_flush(struct si_context *sctx);
void si_emit_cache_flush(struct si_context *sctx);
/* si_state_streamout.c */
void si_streamout_buffers_dirty(struct si_context *sctx);
void si_emit_streamout_end(struct si_context *sctx);
-void si_update_prims_generated_query_state(struct si_context *sctx,
- unsigned type, int diff);
+void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff);
void si_init_streamout_functions(struct si_context *sctx);
-
static inline unsigned si_get_constbuf_slot(unsigned slot)
{
- /* Constant buffers are in slots [16..31], ascending */
- return SI_NUM_SHADER_BUFFERS + slot;
+ /* Constant buffers are in slots [16..31], ascending */
+ return SI_NUM_SHADER_BUFFERS + slot;
}
static inline unsigned si_get_shaderbuf_slot(unsigned slot)
{
- /* shader buffers are in slots [15..0], descending */
- return SI_NUM_SHADER_BUFFERS - 1 - slot;
+ /* shader buffers are in slots [15..0], descending */
+ return SI_NUM_SHADER_BUFFERS - 1 - slot;
}
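+/* e.g. with SI_NUM_SHADER_BUFFERS == 16: si_get_constbuf_slot(0) == 16 and
+ * si_get_shaderbuf_slot(0) == 15, so constant buffers fill [16..31] upwards
+ * while shader buffers fill [15..0] downwards in the same descriptor array. */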
static inline unsigned si_get_sampler_slot(unsigned slot)
{
- /* 32 samplers are in sampler slots [16..47], 16 dw per slot, ascending */
- /* those are equivalent to image slots [32..95], 8 dw per slot, ascending */
- return SI_NUM_IMAGE_SLOTS / 2 + slot;
+ /* 32 samplers are in sampler slots [16..47], 16 dw per slot, ascending */
+ /* those are equivalent to image slots [32..95], 8 dw per slot, ascending */
+ return SI_NUM_IMAGE_SLOTS / 2 + slot;
}
static inline unsigned si_get_image_slot(unsigned slot)
{
- /* image slots are in [31..0] (sampler slots [15..0]), descending */
- /* images are in slots [31..16], while FMASKs are in slots [15..0] */
- return SI_NUM_IMAGE_SLOTS - 1 - slot;
+ /* image slots are in [31..0] (sampler slots [15..0]), descending */
+ /* images are in slots [31..16], while FMASKs are in slots [15..0] */
+ return SI_NUM_IMAGE_SLOTS - 1 - slot;
}
#endif
#include "sid.h"
struct uvec2 {
- unsigned x, y;
+ unsigned x, y;
};
struct si_bin_size_map {
- unsigned start;
- unsigned bin_size_x;
- unsigned bin_size_y;
+ unsigned start;
+ unsigned bin_size_x;
+ unsigned bin_size_y;
};
typedef struct si_bin_size_map si_bin_size_subtable[3][10];
/* Find the bin size where sum is >= table[i].start and < table[i + 1].start. */
-static struct uvec2 si_find_bin_size(struct si_screen *sscreen,
- const si_bin_size_subtable table[],
- unsigned sum)
+static struct uvec2 si_find_bin_size(struct si_screen *sscreen, const si_bin_size_subtable table[],
+ unsigned sum)
{
- unsigned log_num_rb_per_se =
- util_logbase2_ceil(sscreen->info.num_render_backends /
- sscreen->info.max_se);
- unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se);
- unsigned i;
-
- /* Get the chip-specific subtable. */
- const struct si_bin_size_map *subtable =
- &table[log_num_rb_per_se][log_num_se][0];
-
- for (i = 0; subtable[i].bin_size_x != 0; i++) {
- if (sum >= subtable[i].start && sum < subtable[i + 1].start)
- break;
- }
-
- struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y};
- return size;
+ unsigned log_num_rb_per_se =
+ util_logbase2_ceil(sscreen->info.num_render_backends / sscreen->info.max_se);
+ unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se);
+ unsigned i;
+
+ /* Get the chip-specific subtable. */
+ const struct si_bin_size_map *subtable = &table[log_num_rb_per_se][log_num_se][0];
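+ /* e.g. a chip with 4 RBs per SE and 2 SEs indexes table[2][1]. */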
+
+ for (i = 0; subtable[i].bin_size_x != 0; i++) {
+ if (sum >= subtable[i].start && sum < subtable[i + 1].start)
+ break;
+ }
+
+ struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y};
+ return size;
}
-static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
- unsigned cb_target_enabled_4bit)
+static struct uvec2 si_get_color_bin_size(struct si_context *sctx, unsigned cb_target_enabled_4bit)
{
- unsigned num_fragments = sctx->framebuffer.nr_color_samples;
- unsigned sum = 0;
-
- /* Compute the sum of all Bpp. */
- for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
- if (!(cb_target_enabled_4bit & (0xf << (i * 4))))
- continue;
-
- struct si_texture *tex =
- (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture;
- sum += tex->surface.bpe;
- }
-
- /* Multiply the sum by some function of the number of samples. */
- if (num_fragments >= 2) {
- if (si_get_ps_iter_samples(sctx) >= 2)
- sum *= num_fragments;
- else
- sum *= 2;
- }
-
- static const si_bin_size_subtable table[] = {
- {
- /* One RB / SE */
- {
- /* One shader engine */
- { 0, 128, 128 },
- { 1, 64, 128 },
- { 2, 32, 128 },
- { 3, 16, 128 },
- { 17, 0, 0 },
- },
- {
- /* Two shader engines */
- { 0, 128, 128 },
- { 2, 64, 128 },
- { 3, 32, 128 },
- { 5, 16, 128 },
- { 17, 0, 0 },
- },
- {
- /* Four shader engines */
- { 0, 128, 128 },
- { 3, 64, 128 },
- { 5, 16, 128 },
- { 17, 0, 0 },
- },
- },
- {
- /* Two RB / SE */
- {
- /* One shader engine */
- { 0, 128, 128 },
- { 2, 64, 128 },
- { 3, 32, 128 },
- { 9, 16, 128 },
- { 33, 0, 0 },
- },
- {
- /* Two shader engines */
- { 0, 128, 128 },
- { 3, 64, 128 },
- { 5, 32, 128 },
- { 9, 16, 128 },
- { 33, 0, 0 },
- },
- {
- /* Four shader engines */
- { 0, 256, 256 },
- { 2, 128, 256 },
- { 3, 128, 128 },
- { 5, 64, 128 },
- { 9, 16, 128 },
- { 33, 0, 0 },
- },
- },
- {
- /* Four RB / SE */
- {
- /* One shader engine */
- { 0, 128, 256 },
- { 2, 128, 128 },
- { 3, 64, 128 },
- { 5, 32, 128 },
- { 9, 16, 128 },
- { 17, 0, 0 },
- },
- {
- /* Two shader engines */
- { 0, 256, 256 },
- { 2, 128, 256 },
- { 3, 128, 128 },
- { 5, 64, 128 },
- { 9, 32, 128 },
- { 17, 16, 128 },
- { 33, 0, 0 },
- },
- {
- /* Four shader engines */
- { 0, 256, 512 },
- { 2, 128, 512 },
- { 3, 64, 512 },
- { 5, 32, 512 },
- { 9, 32, 256 },
- { 17, 32, 128 },
- { 33, 0, 0 },
- },
- },
- };
-
- return si_find_bin_size(sctx->screen, table, sum);
+ unsigned num_fragments = sctx->framebuffer.nr_color_samples;
+ unsigned sum = 0;
+
+ /* Compute the sum of all Bpp. */
+ for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+ if (!(cb_target_enabled_4bit & (0xf << (i * 4))))
+ continue;
+
+ struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture;
+ sum += tex->surface.bpe;
+ }
+
+ /* Multiply the sum by some function of the number of samples. */
+ if (num_fragments >= 2) {
+ if (si_get_ps_iter_samples(sctx) >= 2)
+ sum *= num_fragments;
+ else
+ sum *= 2;
+ }
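+
+ /* e.g. two RGBA8 colorbuffers (bpe = 4 each) at 4x MSAA with one PS iteration
+ * sample: sum = (4 + 4) * 2 = 16.
+ */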
+
+ static const si_bin_size_subtable table[] = {
+ {
+ /* One RB / SE */
+ {
+ /* One shader engine */
+ {0, 128, 128},
+ {1, 64, 128},
+ {2, 32, 128},
+ {3, 16, 128},
+ {17, 0, 0},
+ },
+ {
+ /* Two shader engines */
+ {0, 128, 128},
+ {2, 64, 128},
+ {3, 32, 128},
+ {5, 16, 128},
+ {17, 0, 0},
+ },
+ {
+ /* Four shader engines */
+ {0, 128, 128},
+ {3, 64, 128},
+ {5, 16, 128},
+ {17, 0, 0},
+ },
+ },
+ {
+ /* Two RB / SE */
+ {
+ /* One shader engine */
+ {0, 128, 128},
+ {2, 64, 128},
+ {3, 32, 128},
+ {9, 16, 128},
+ {33, 0, 0},
+ },
+ {
+ /* Two shader engines */
+ {0, 128, 128},
+ {3, 64, 128},
+ {5, 32, 128},
+ {9, 16, 128},
+ {33, 0, 0},
+ },
+ {
+ /* Four shader engines */
+ {0, 256, 256},
+ {2, 128, 256},
+ {3, 128, 128},
+ {5, 64, 128},
+ {9, 16, 128},
+ {33, 0, 0},
+ },
+ },
+ {
+ /* Four RB / SE */
+ {
+ /* One shader engine */
+ {0, 128, 256},
+ {2, 128, 128},
+ {3, 64, 128},
+ {5, 32, 128},
+ {9, 16, 128},
+ {17, 0, 0},
+ },
+ {
+ /* Two shader engines */
+ {0, 256, 256},
+ {2, 128, 256},
+ {3, 128, 128},
+ {5, 64, 128},
+ {9, 32, 128},
+ {17, 16, 128},
+ {33, 0, 0},
+ },
+ {
+ /* Four shader engines */
+ {0, 256, 512},
+ {2, 128, 512},
+ {3, 64, 512},
+ {5, 32, 512},
+ {9, 32, 256},
+ {17, 32, 128},
+ {33, 0, 0},
+ },
+ },
+ };
+
+ return si_find_bin_size(sctx->screen, table, sum);
}
static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
{
- struct si_state_dsa *dsa = sctx->queued.named.dsa;
-
- if (!sctx->framebuffer.state.zsbuf ||
- (!dsa->depth_enabled && !dsa->stencil_enabled)) {
- /* Return the max size. */
- struct uvec2 size = {512, 512};
- return size;
- }
-
- struct si_texture *tex =
- (struct si_texture*)sctx->framebuffer.state.zsbuf->texture;
- unsigned depth_coeff = dsa->depth_enabled ? 5 : 0;
- unsigned stencil_coeff = tex->surface.has_stencil &&
- dsa->stencil_enabled ? 1 : 0;
- unsigned sum = 4 * (depth_coeff + stencil_coeff) *
- MAX2(tex->buffer.b.b.nr_samples, 1);
-
- static const si_bin_size_subtable table[] = {
- {
- // One RB / SE
- {
- // One shader engine
- { 0, 64, 512 },
- { 2, 64, 256 },
- { 4, 64, 128 },
- { 7, 32, 128 },
- { 13, 16, 128 },
- { 49, 0, 0 },
- },
- {
- // Two shader engines
- { 0, 128, 512 },
- { 2, 64, 512 },
- { 4, 64, 256 },
- { 7, 64, 128 },
- { 13, 32, 128 },
- { 25, 16, 128 },
- { 49, 0, 0 },
- },
- {
- // Four shader engines
- { 0, 256, 512 },
- { 2, 128, 512 },
- { 4, 64, 512 },
- { 7, 64, 256 },
- { 13, 64, 128 },
- { 25, 16, 128 },
- { 49, 0, 0 },
- },
- },
- {
- // Two RB / SE
- {
- // One shader engine
- { 0, 128, 512 },
- { 2, 64, 512 },
- { 4, 64, 256 },
- { 7, 64, 128 },
- { 13, 32, 128 },
- { 25, 16, 128 },
- { 97, 0, 0 },
- },
- {
- // Two shader engines
- { 0, 256, 512 },
- { 2, 128, 512 },
- { 4, 64, 512 },
- { 7, 64, 256 },
- { 13, 64, 128 },
- { 25, 32, 128 },
- { 49, 16, 128 },
- { 97, 0, 0 },
- },
- {
- // Four shader engines
- { 0, 512, 512 },
- { 2, 256, 512 },
- { 4, 128, 512 },
- { 7, 64, 512 },
- { 13, 64, 256 },
- { 25, 64, 128 },
- { 49, 16, 128 },
- { 97, 0, 0 },
- },
- },
- {
- // Four RB / SE
- {
- // One shader engine
- { 0, 256, 512 },
- { 2, 128, 512 },
- { 4, 64, 512 },
- { 7, 64, 256 },
- { 13, 64, 128 },
- { 25, 32, 128 },
- { 49, 16, 128 },
- { 193, 0, 0 },
- },
- {
- // Two shader engines
- { 0, 512, 512 },
- { 2, 256, 512 },
- { 4, 128, 512 },
- { 7, 64, 512 },
- { 13, 64, 256 },
- { 25, 64, 128 },
- { 49, 32, 128 },
- { 97, 16, 128 },
- { 193, 0, 0 },
- },
- {
- // Four shader engines
- { 0, 512, 512 },
- { 4, 256, 512 },
- { 7, 128, 512 },
- { 13, 64, 512 },
- { 25, 32, 512 },
- { 49, 32, 256 },
- { 97, 16, 128 },
- { 193, 0, 0 },
- },
- },
- };
-
- return si_find_bin_size(sctx->screen, table, sum);
+ struct si_state_dsa *dsa = sctx->queued.named.dsa;
+
+ if (!sctx->framebuffer.state.zsbuf || (!dsa->depth_enabled && !dsa->stencil_enabled)) {
+ /* Return the max size. */
+ struct uvec2 size = {512, 512};
+ return size;
+ }
+
+ struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
+ unsigned depth_coeff = dsa->depth_enabled ? 5 : 0;
+ unsigned stencil_coeff = tex->surface.has_stencil && dsa->stencil_enabled ? 1 : 0;
+ unsigned sum = 4 * (depth_coeff + stencil_coeff) * MAX2(tex->buffer.b.b.nr_samples, 1);
+
+ static const si_bin_size_subtable table[] = {
+ {
+ // One RB / SE
+ {
+ // One shader engine
+ {0, 64, 512},
+ {2, 64, 256},
+ {4, 64, 128},
+ {7, 32, 128},
+ {13, 16, 128},
+ {49, 0, 0},
+ },
+ {
+ // Two shader engines
+ {0, 128, 512},
+ {2, 64, 512},
+ {4, 64, 256},
+ {7, 64, 128},
+ {13, 32, 128},
+ {25, 16, 128},
+ {49, 0, 0},
+ },
+ {
+ // Four shader engines
+ {0, 256, 512},
+ {2, 128, 512},
+ {4, 64, 512},
+ {7, 64, 256},
+ {13, 64, 128},
+ {25, 16, 128},
+ {49, 0, 0},
+ },
+ },
+ {
+ // Two RB / SE
+ {
+ // One shader engine
+ {0, 128, 512},
+ {2, 64, 512},
+ {4, 64, 256},
+ {7, 64, 128},
+ {13, 32, 128},
+ {25, 16, 128},
+ {97, 0, 0},
+ },
+ {
+ // Two shader engines
+ {0, 256, 512},
+ {2, 128, 512},
+ {4, 64, 512},
+ {7, 64, 256},
+ {13, 64, 128},
+ {25, 32, 128},
+ {49, 16, 128},
+ {97, 0, 0},
+ },
+ {
+ // Four shader engines
+ {0, 512, 512},
+ {2, 256, 512},
+ {4, 128, 512},
+ {7, 64, 512},
+ {13, 64, 256},
+ {25, 64, 128},
+ {49, 16, 128},
+ {97, 0, 0},
+ },
+ },
+ {
+ // Four RB / SE
+ {
+ // One shader engine
+ {0, 256, 512},
+ {2, 128, 512},
+ {4, 64, 512},
+ {7, 64, 256},
+ {13, 64, 128},
+ {25, 32, 128},
+ {49, 16, 128},
+ {193, 0, 0},
+ },
+ {
+ // Two shader engines
+ {0, 512, 512},
+ {2, 256, 512},
+ {4, 128, 512},
+ {7, 64, 512},
+ {13, 64, 256},
+ {25, 64, 128},
+ {49, 32, 128},
+ {97, 16, 128},
+ {193, 0, 0},
+ },
+ {
+ // Four shader engines
+ {0, 512, 512},
+ {4, 256, 512},
+ {7, 128, 512},
+ {13, 64, 512},
+ {25, 32, 512},
+ {49, 32, 256},
+ {97, 16, 128},
+ {193, 0, 0},
+ },
+ },
+ };
+
+ return si_find_bin_size(sctx->screen, table, sum);
}
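Likewise, the depth/stencil cost fed to si_find_bin_size() is just the 4 * (depth_coeff + stencil_coeff) * samples product computed above. A small self-contained check of that formula; the function name and sample counts are illustrative assumptions:

#include <assert.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

/* Depth/stencil cost, mirroring the sum computed in si_get_depth_bin_size(). */
static unsigned depth_bin_sum(int depth_enabled, int stencil_enabled, int has_stencil,
                              unsigned nr_samples)
{
   unsigned depth_coeff = depth_enabled ? 5 : 0;
   unsigned stencil_coeff = has_stencil && stencil_enabled ? 1 : 0;

   return 4 * (depth_coeff + stencil_coeff) * MAX2(nr_samples, 1);
}

int main(void)
{
   assert(depth_bin_sum(1, 1, 1, 4) == 96); /* Z + stencil, 4x MSAA */
   assert(depth_bin_sum(1, 0, 1, 1) == 20); /* depth only, single-sampled */
   return 0;
}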
-static void gfx10_get_bin_sizes(struct si_context *sctx,
- unsigned cb_target_enabled_4bit,
- struct uvec2 *color_bin_size,
- struct uvec2 *depth_bin_size)
+static void gfx10_get_bin_sizes(struct si_context *sctx, unsigned cb_target_enabled_4bit,
+ struct uvec2 *color_bin_size, struct uvec2 *depth_bin_size)
{
- const unsigned ZsTagSize = 64;
- const unsigned ZsNumTags = 312;
- const unsigned CcTagSize = 1024;
- const unsigned CcReadTags = 31;
- const unsigned FcTagSize = 256;
- const unsigned FcReadTags = 44;
-
- const unsigned num_rbs = sctx->screen->info.num_render_backends;
- const unsigned num_pipes = MAX2(num_rbs, sctx->screen->info.num_sdp_interfaces);
-
- const unsigned depthBinSizeTagPart = ((ZsNumTags * num_rbs / num_pipes) * (ZsTagSize * num_pipes));
- const unsigned colorBinSizeTagPart = ((CcReadTags * num_rbs / num_pipes) * (CcTagSize * num_pipes));
- const unsigned fmaskBinSizeTagPart = ((FcReadTags * num_rbs / num_pipes) * (FcTagSize * num_pipes));
-
- const unsigned minBinSizeX = 128;
- const unsigned minBinSizeY = 64;
-
- const unsigned num_fragments = sctx->framebuffer.nr_color_samples;
- const unsigned num_samples = sctx->framebuffer.nr_samples;
- const bool ps_iter_sample = si_get_ps_iter_samples(sctx) >= 2;
-
- /* Calculate cColor and cFmask(if applicable) */
- unsigned cColor = 0;
- unsigned cFmask = 0;
- bool has_fmask = false;
-
- for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
- if (!sctx->framebuffer.state.cbufs[i])
- continue;
-
- struct si_texture *tex =
- (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture;
- const unsigned mmrt =
- num_fragments == 1 ? 1 : (ps_iter_sample ? num_fragments : 2);
-
- cColor += tex->surface.bpe * mmrt;
- if (num_samples >= 2 /* if FMASK is bound */) {
- const unsigned fragmentsLog2 = util_logbase2(num_fragments);
- const unsigned samplesLog2 = util_logbase2(num_samples);
-
- static const unsigned cFmaskMrt[4 /* fragments */][5 /* samples */] = {
- { 0, 1, 1, 1, 2 }, /* fragments = 1 */
- { 0, 1, 1, 2, 4 }, /* fragments = 2 */
- { 0, 1, 1, 4, 8 }, /* fragments = 4 */
- { 0, 1, 2, 4, 8 } /* fragments = 8 */
- };
- cFmask += cFmaskMrt[fragmentsLog2][samplesLog2];
- has_fmask = true;
- }
- }
- cColor = MAX2(cColor, 1u);
-
- const unsigned colorLog2Pixels = util_logbase2(colorBinSizeTagPart / cColor);
- const unsigned colorBinSizeX = 1 << ((colorLog2Pixels + 1) / 2); /* round up width */
- const unsigned colorBinSizeY = 1 << (colorLog2Pixels / 2); /* round down height */
-
- unsigned binSizeX = colorBinSizeX;
- unsigned binSizeY = colorBinSizeY;
-
- if (has_fmask) {
- cFmask = MAX2(cFmask, 1u);
-
- const unsigned fmaskLog2Pixels = util_logbase2(fmaskBinSizeTagPart / cFmask);
- const unsigned fmaskBinSizeX = 1 << ((fmaskLog2Pixels + 1) / 2); /* round up width */
- const unsigned fmaskBinSizeY = 1 << (fmaskLog2Pixels / 2); /* round down height */
-
- /* use the smaller of the Color vs. Fmask bin sizes */
- if (fmaskLog2Pixels < colorLog2Pixels) {
- binSizeX = fmaskBinSizeX;
- binSizeY = fmaskBinSizeY;
- }
- }
-
- /* Return size adjusted for minimum bin size */
- color_bin_size->x = MAX2(binSizeX, minBinSizeX);
- color_bin_size->y = MAX2(binSizeY, minBinSizeY);
-
- if (!sctx->framebuffer.state.zsbuf) {
- /* Set to max sizes when no depth buffer is bound. */
- depth_bin_size->x = 512;
- depth_bin_size->y = 512;
- } else {
- struct si_texture *zstex = (struct si_texture*)sctx->framebuffer.state.zsbuf->texture;
- struct si_state_dsa *dsa = sctx->queued.named.dsa;
-
- const unsigned cPerDepthSample = dsa->depth_enabled ? 5 : 0;
- const unsigned cPerStencilSample = dsa->stencil_enabled ? 1 : 0;
- const unsigned cDepth = (cPerDepthSample + cPerStencilSample) *
- MAX2(zstex->buffer.b.b.nr_samples, 1);
-
- const unsigned depthLog2Pixels = util_logbase2(depthBinSizeTagPart / MAX2(cDepth, 1u));
- unsigned depthBinSizeX = 1 << ((depthLog2Pixels + 1) / 2);
- unsigned depthBinSizeY = 1 << (depthLog2Pixels / 2);
-
- depth_bin_size->x = MAX2(depthBinSizeX, minBinSizeX);
- depth_bin_size->y = MAX2(depthBinSizeY, minBinSizeY);
- }
+ const unsigned ZsTagSize = 64;
+ const unsigned ZsNumTags = 312;
+ const unsigned CcTagSize = 1024;
+ const unsigned CcReadTags = 31;
+ const unsigned FcTagSize = 256;
+ const unsigned FcReadTags = 44;
+
+ const unsigned num_rbs = sctx->screen->info.num_render_backends;
+ const unsigned num_pipes = MAX2(num_rbs, sctx->screen->info.num_sdp_interfaces);
+
+ const unsigned depthBinSizeTagPart =
+ ((ZsNumTags * num_rbs / num_pipes) * (ZsTagSize * num_pipes));
+ const unsigned colorBinSizeTagPart =
+ ((CcReadTags * num_rbs / num_pipes) * (CcTagSize * num_pipes));
+ const unsigned fmaskBinSizeTagPart =
+ ((FcReadTags * num_rbs / num_pipes) * (FcTagSize * num_pipes));
+
+ const unsigned minBinSizeX = 128;
+ const unsigned minBinSizeY = 64;
+
+ const unsigned num_fragments = sctx->framebuffer.nr_color_samples;
+ const unsigned num_samples = sctx->framebuffer.nr_samples;
+ const bool ps_iter_sample = si_get_ps_iter_samples(sctx) >= 2;
+
+ /* Calculate cColor and cFmask (if applicable) */
+ unsigned cColor = 0;
+ unsigned cFmask = 0;
+ bool has_fmask = false;
+
+ for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+ if (!sctx->framebuffer.state.cbufs[i])
+ continue;
+
+ struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture;
+ const unsigned mmrt = num_fragments == 1 ? 1 : (ps_iter_sample ? num_fragments : 2);
+
+ cColor += tex->surface.bpe * mmrt;
+ if (num_samples >= 2 /* if FMASK is bound */) {
+ const unsigned fragmentsLog2 = util_logbase2(num_fragments);
+ const unsigned samplesLog2 = util_logbase2(num_samples);
+
+ static const unsigned cFmaskMrt[4 /* fragments */][5 /* samples */] = {
+ {0, 1, 1, 1, 2}, /* fragments = 1 */
+ {0, 1, 1, 2, 4}, /* fragments = 2 */
+ {0, 1, 1, 4, 8}, /* fragments = 4 */
+ {0, 1, 2, 4, 8} /* fragments = 8 */
+ };
+ cFmask += cFmaskMrt[fragmentsLog2][samplesLog2];
+ has_fmask = true;
+ }
+ }
+ cColor = MAX2(cColor, 1u);
+
+ const unsigned colorLog2Pixels = util_logbase2(colorBinSizeTagPart / cColor);
+ const unsigned colorBinSizeX = 1 << ((colorLog2Pixels + 1) / 2); /* round up width */
+ const unsigned colorBinSizeY = 1 << (colorLog2Pixels / 2); /* round down height */
+
+ unsigned binSizeX = colorBinSizeX;
+ unsigned binSizeY = colorBinSizeY;
+
+ if (has_fmask) {
+ cFmask = MAX2(cFmask, 1u);
+
+ const unsigned fmaskLog2Pixels = util_logbase2(fmaskBinSizeTagPart / cFmask);
+ const unsigned fmaskBinSizeX = 1 << ((fmaskLog2Pixels + 1) / 2); /* round up width */
+ const unsigned fmaskBinSizeY = 1 << (fmaskLog2Pixels / 2); /* round down height */
+
+ /* use the smaller of the Color vs. Fmask bin sizes */
+ if (fmaskLog2Pixels < colorLog2Pixels) {
+ binSizeX = fmaskBinSizeX;
+ binSizeY = fmaskBinSizeY;
+ }
+ }
+
+ /* Return size adjusted for minimum bin size */
+ color_bin_size->x = MAX2(binSizeX, minBinSizeX);
+ color_bin_size->y = MAX2(binSizeY, minBinSizeY);
+
+ if (!sctx->framebuffer.state.zsbuf) {
+ /* Set to max sizes when no depth buffer is bound. */
+ depth_bin_size->x = 512;
+ depth_bin_size->y = 512;
+ } else {
+ struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
+ struct si_state_dsa *dsa = sctx->queued.named.dsa;
+
+ const unsigned cPerDepthSample = dsa->depth_enabled ? 5 : 0;
+ const unsigned cPerStencilSample = dsa->stencil_enabled ? 1 : 0;
+ const unsigned cDepth =
+ (cPerDepthSample + cPerStencilSample) * MAX2(zstex->buffer.b.b.nr_samples, 1);
+
+ const unsigned depthLog2Pixels = util_logbase2(depthBinSizeTagPart / MAX2(cDepth, 1u));
+ unsigned depthBinSizeX = 1 << ((depthLog2Pixels + 1) / 2);
+ unsigned depthBinSizeY = 1 << (depthLog2Pixels / 2);
+
+ depth_bin_size->x = MAX2(depthBinSizeX, minBinSizeX);
+ depth_bin_size->y = MAX2(depthBinSizeY, minBinSizeY);
+ }
}
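The gfx10 path above turns a per-pixel byte cost into a bin size by splitting the floor-log2 of "pixels per bin" between width (rounded up) and height (rounded down). A standalone sketch of just that split, with a local floor_log2() standing in for util_logbase2():

#include <assert.h>

/* Floor log2 of a nonzero value; stands in for util_logbase2(). */
static unsigned floor_log2(unsigned x)
{
   unsigned l = 0;
   while (x >>= 1)
      l++;
   return l;
}

/* Split "pixels per bin" into a bin width/height the way gfx10_get_bin_sizes()
 * derives colorBinSizeX/Y, fmaskBinSizeX/Y and depthBinSizeX/Y. */
static void split_bin(unsigned pixels, unsigned *x, unsigned *y)
{
   unsigned log2_pixels = floor_log2(pixels);

   *x = 1u << ((log2_pixels + 1) / 2); /* round the width up */
   *y = 1u << (log2_pixels / 2);       /* round the height down */
}

int main(void)
{
   unsigned x, y;

   split_bin(1u << 16, &x, &y); /* even exponent -> square bin */
   assert(x == 256 && y == 256);

   split_bin(1u << 15, &x, &y); /* odd exponent -> width gets the extra bit */
   assert(x == 256 && y == 128);
   return 0;
}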
static void si_emit_dpbb_disable(struct si_context *sctx)
{
- unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-
- if (sctx->chip_class >= GFX10) {
- struct uvec2 bin_size = {};
- struct uvec2 bin_size_extend = {};
-
- bin_size.x = 128;
- bin_size.y = sctx->framebuffer.min_bytes_per_pixel <= 4 ? 128 : 64;
-
- if (bin_size.x >= 32)
- bin_size_extend.x = util_logbase2(bin_size.x) - 5;
- if (bin_size.y >= 32)
- bin_size_extend.y = util_logbase2(bin_size.y) - 5;
-
- radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,
- SI_TRACKED_PA_SC_BINNER_CNTL_0,
- S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) |
- S_028C44_BIN_SIZE_X(bin_size.x == 16) |
- S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
- S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
- S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
- S_028C44_DISABLE_START_OF_PRIM(1) |
- S_028C44_FLUSH_ON_BINNING_TRANSITION(sctx->last_binning_enabled != 0));
- } else {
- radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,
- SI_TRACKED_PA_SC_BINNER_CNTL_0,
- S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
- S_028C44_DISABLE_START_OF_PRIM(1) |
- S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 ||
- sctx->family == CHIP_VEGA20 ||
- sctx->family >= CHIP_RAVEN2) &&
- sctx->last_binning_enabled != 0));
- }
-
- unsigned db_dfsm_control = sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL
- : R_028060_DB_DFSM_CONTROL;
- radeon_opt_set_context_reg(sctx, db_dfsm_control,
- SI_TRACKED_DB_DFSM_CONTROL,
- S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
- S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
- if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll = true;
-
- sctx->last_binning_enabled = false;
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
+ if (sctx->chip_class >= GFX10) {
+ struct uvec2 bin_size = {};
+ struct uvec2 bin_size_extend = {};
+
+ bin_size.x = 128;
+ bin_size.y = sctx->framebuffer.min_bytes_per_pixel <= 4 ? 128 : 64;
+
+ if (bin_size.x >= 32)
+ bin_size_extend.x = util_logbase2(bin_size.x) - 5;
+ if (bin_size.y >= 32)
+ bin_size_extend.y = util_logbase2(bin_size.y) - 5;
+
+ radeon_opt_set_context_reg(
+ sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0,
+ S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) |
+ S_028C44_BIN_SIZE_X(bin_size.x == 16) | S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
+ S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
+ S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) | S_028C44_DISABLE_START_OF_PRIM(1) |
+ S_028C44_FLUSH_ON_BINNING_TRANSITION(sctx->last_binning_enabled != 0));
+ } else {
+ radeon_opt_set_context_reg(
+ sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0,
+ S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
+ S_028C44_DISABLE_START_OF_PRIM(1) |
+ S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 ||
+ sctx->family == CHIP_VEGA20 ||
+ sctx->family >= CHIP_RAVEN2) &&
+ sctx->last_binning_enabled != 0));
+ }
+
+ unsigned db_dfsm_control =
+ sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL : R_028060_DB_DFSM_CONTROL;
+ radeon_opt_set_context_reg(
+ sctx, db_dfsm_control, SI_TRACKED_DB_DFSM_CONTROL,
+ S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll = true;
+
+ sctx->last_binning_enabled = false;
}
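PA_SC_BINNER_CNTL_0 encodes each bin dimension the same way in both the disable and enable paths above: a size of 16 is signalled through the BIN_SIZE_X/Y flag, while sizes of 32 and up go into the *_EXTEND fields as log2(size) - 5. A minimal sketch of that encoding; the helper names are local to the example:

#include <assert.h>

/* Floor log2 of a nonzero value; stands in for util_logbase2(). */
static unsigned floor_log2(unsigned x)
{
   unsigned l = 0;
   while (x >>= 1)
      l++;
   return l;
}

/* One bin dimension as split between the BIN_SIZE_* flag (size == 16) and the
 * BIN_SIZE_*_EXTEND field (log2(size) - 5 for sizes >= 32). */
static void encode_bin_dim(unsigned size, unsigned *is_16, unsigned *extend)
{
   *is_16 = size == 16;
   *extend = size >= 32 ? floor_log2(size) - 5 : 0;
}

int main(void)
{
   unsigned is_16, extend;

   encode_bin_dim(16, &is_16, &extend);
   assert(is_16 == 1 && extend == 0);

   encode_bin_dim(128, &is_16, &extend); /* the disable path's default width */
   assert(is_16 == 0 && extend == 2);

   encode_bin_dim(512, &is_16, &extend); /* the largest size in the tables */
   assert(is_16 == 0 && extend == 4);
   return 0;
}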
void si_emit_dpbb_state(struct si_context *sctx)
{
- struct si_screen *sscreen = sctx->screen;
- struct si_state_blend *blend = sctx->queued.named.blend;
- struct si_state_dsa *dsa = sctx->queued.named.dsa;
- unsigned db_shader_control = sctx->ps_db_shader_control;
-
- assert(sctx->chip_class >= GFX9);
-
- if (!sscreen->dpbb_allowed || sctx->dpbb_force_off) {
- si_emit_dpbb_disable(sctx);
- return;
- }
-
- bool ps_can_kill = G_02880C_KILL_ENABLE(db_shader_control) ||
- G_02880C_MASK_EXPORT_ENABLE(db_shader_control) ||
- G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) ||
- blend->alpha_to_coverage;
-
- bool db_can_reject_z_trivially =
- !G_02880C_Z_EXPORT_ENABLE(db_shader_control) ||
- G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) ||
- G_02880C_DEPTH_BEFORE_SHADER(db_shader_control);
-
- /* Disable DPBB when it's believed to be inefficient. */
- if (sscreen->info.num_render_backends > 4 &&
- ps_can_kill &&
- db_can_reject_z_trivially &&
- sctx->framebuffer.state.zsbuf &&
- dsa->db_can_write) {
- si_emit_dpbb_disable(sctx);
- return;
- }
-
- /* Compute the bin size. */
- /* TODO: We could also look at enabled pixel shader outputs. */
- unsigned cb_target_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit &
- blend->cb_target_enabled_4bit;
- struct uvec2 color_bin_size, depth_bin_size;
-
- if (sctx->chip_class >= GFX10) {
- gfx10_get_bin_sizes(sctx, cb_target_enabled_4bit,
- &color_bin_size, &depth_bin_size);
- } else {
- color_bin_size = si_get_color_bin_size(sctx, cb_target_enabled_4bit);
- depth_bin_size = si_get_depth_bin_size(sctx);
- }
-
- unsigned color_area = color_bin_size.x * color_bin_size.y;
- unsigned depth_area = depth_bin_size.x * depth_bin_size.y;
-
- struct uvec2 bin_size = color_area < depth_area ? color_bin_size
- : depth_bin_size;
-
- if (!bin_size.x || !bin_size.y) {
- si_emit_dpbb_disable(sctx);
- return;
- }
-
- /* Enable DFSM if it's preferred. */
- unsigned punchout_mode = V_028060_FORCE_OFF;
- bool disable_start_of_prim = true;
- bool zs_eqaa_dfsm_bug = sctx->chip_class == GFX9 &&
- sctx->framebuffer.state.zsbuf &&
- sctx->framebuffer.nr_samples !=
- MAX2(1, sctx->framebuffer.state.zsbuf->texture->nr_samples);
-
- if (sscreen->dfsm_allowed &&
- !zs_eqaa_dfsm_bug &&
- cb_target_enabled_4bit &&
- !G_02880C_KILL_ENABLE(db_shader_control) &&
- /* These two also imply that DFSM is disabled when PS writes to memory. */
- !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control) &&
- !G_02880C_EXEC_ON_NOOP(db_shader_control) &&
- G_02880C_Z_ORDER(db_shader_control) == V_02880C_EARLY_Z_THEN_LATE_Z) {
- punchout_mode = V_028060_AUTO;
- disable_start_of_prim = (cb_target_enabled_4bit &
- blend->blend_enable_4bit) != 0;
- }
-
- /* Tunable parameters. Also test with DFSM enabled/disabled. */
- unsigned context_states_per_bin; /* allowed range: [1, 6] */
- unsigned persistent_states_per_bin; /* allowed range: [1, 32] */
- unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */
-
- /* Tuned for Raven. Vega might need different values. */
- if (sscreen->info.has_dedicated_vram) {
- if (sscreen->info.num_render_backends > 4) {
- context_states_per_bin = 1;
- persistent_states_per_bin = 1;
- } else {
- context_states_per_bin = 3;
- persistent_states_per_bin = 8;
- }
- } else {
- /* This is a workaround for:
- * https://bugs.freedesktop.org/show_bug.cgi?id=110214
- * (an alternative is to insert manual BATCH_BREAK event when
- * a context_roll is detected). */
- context_states_per_bin = sctx->screen->info.has_gfx9_scissor_bug ? 1 : 6;
- /* Using 32 here can cause GPU hangs on RAVEN1 */
- persistent_states_per_bin = 16;
- }
- fpovs_per_batch = 63;
-
- /* Emit registers. */
- struct uvec2 bin_size_extend = {};
- if (bin_size.x >= 32)
- bin_size_extend.x = util_logbase2(bin_size.x) - 5;
- if (bin_size.y >= 32)
- bin_size_extend.y = util_logbase2(bin_size.y) - 5;
-
- unsigned initial_cdw = sctx->gfx_cs->current.cdw;
- radeon_opt_set_context_reg(
- sctx, R_028C44_PA_SC_BINNER_CNTL_0,
- SI_TRACKED_PA_SC_BINNER_CNTL_0,
- S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
- S_028C44_BIN_SIZE_X(bin_size.x == 16) |
- S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
- S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
- S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
- S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin - 1) |
- S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin - 1) |
- S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
- S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
- S_028C44_OPTIMAL_BIN_SELECTION(1) |
- S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 ||
- sctx->family == CHIP_VEGA20 ||
- sctx->family >= CHIP_RAVEN2) &&
- sctx->last_binning_enabled != 1));
-
- unsigned db_dfsm_control = sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL
- : R_028060_DB_DFSM_CONTROL;
- radeon_opt_set_context_reg(sctx, db_dfsm_control,
- SI_TRACKED_DB_DFSM_CONTROL,
- S_028060_PUNCHOUT_MODE(punchout_mode) |
- S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
- if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll = true;
-
- sctx->last_binning_enabled = true;
+ struct si_screen *sscreen = sctx->screen;
+ struct si_state_blend *blend = sctx->queued.named.blend;
+ struct si_state_dsa *dsa = sctx->queued.named.dsa;
+ unsigned db_shader_control = sctx->ps_db_shader_control;
+
+ assert(sctx->chip_class >= GFX9);
+
+ if (!sscreen->dpbb_allowed || sctx->dpbb_force_off) {
+ si_emit_dpbb_disable(sctx);
+ return;
+ }
+
+ bool ps_can_kill =
+ G_02880C_KILL_ENABLE(db_shader_control) || G_02880C_MASK_EXPORT_ENABLE(db_shader_control) ||
+ G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) || blend->alpha_to_coverage;
+
+ bool db_can_reject_z_trivially = !G_02880C_Z_EXPORT_ENABLE(db_shader_control) ||
+ G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) ||
+ G_02880C_DEPTH_BEFORE_SHADER(db_shader_control);
+
+ /* Disable DPBB when it's believed to be inefficient. */
+ if (sscreen->info.num_render_backends > 4 && ps_can_kill && db_can_reject_z_trivially &&
+ sctx->framebuffer.state.zsbuf && dsa->db_can_write) {
+ si_emit_dpbb_disable(sctx);
+ return;
+ }
+
+ /* Compute the bin size. */
+ /* TODO: We could also look at enabled pixel shader outputs. */
+ unsigned cb_target_enabled_4bit =
+ sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit;
+ struct uvec2 color_bin_size, depth_bin_size;
+
+ if (sctx->chip_class >= GFX10) {
+ gfx10_get_bin_sizes(sctx, cb_target_enabled_4bit, &color_bin_size, &depth_bin_size);
+ } else {
+ color_bin_size = si_get_color_bin_size(sctx, cb_target_enabled_4bit);
+ depth_bin_size = si_get_depth_bin_size(sctx);
+ }
+
+ unsigned color_area = color_bin_size.x * color_bin_size.y;
+ unsigned depth_area = depth_bin_size.x * depth_bin_size.y;
+
+ struct uvec2 bin_size = color_area < depth_area ? color_bin_size : depth_bin_size;
+
+ if (!bin_size.x || !bin_size.y) {
+ si_emit_dpbb_disable(sctx);
+ return;
+ }
+
+ /* Enable DFSM if it's preferred. */
+ unsigned punchout_mode = V_028060_FORCE_OFF;
+ bool disable_start_of_prim = true;
+ bool zs_eqaa_dfsm_bug =
+ sctx->chip_class == GFX9 && sctx->framebuffer.state.zsbuf &&
+ sctx->framebuffer.nr_samples != MAX2(1, sctx->framebuffer.state.zsbuf->texture->nr_samples);
+
+ if (sscreen->dfsm_allowed && !zs_eqaa_dfsm_bug && cb_target_enabled_4bit &&
+ !G_02880C_KILL_ENABLE(db_shader_control) &&
+ /* These two also imply that DFSM is disabled when PS writes to memory. */
+ !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control) &&
+ !G_02880C_EXEC_ON_NOOP(db_shader_control) &&
+ G_02880C_Z_ORDER(db_shader_control) == V_02880C_EARLY_Z_THEN_LATE_Z) {
+ punchout_mode = V_028060_AUTO;
+ disable_start_of_prim = (cb_target_enabled_4bit & blend->blend_enable_4bit) != 0;
+ }
+
+ /* Tunable parameters. Also test with DFSM enabled/disabled. */
+ unsigned context_states_per_bin; /* allowed range: [1, 6] */
+ unsigned persistent_states_per_bin; /* allowed range: [1, 32] */
+ unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */
+
+ /* Tuned for Raven. Vega might need different values. */
+ if (sscreen->info.has_dedicated_vram) {
+ if (sscreen->info.num_render_backends > 4) {
+ context_states_per_bin = 1;
+ persistent_states_per_bin = 1;
+ } else {
+ context_states_per_bin = 3;
+ persistent_states_per_bin = 8;
+ }
+ } else {
+ /* This is a workaround for:
+ * https://bugs.freedesktop.org/show_bug.cgi?id=110214
+ * (an alternative is to insert a manual BATCH_BREAK event when
+ * a context_roll is detected). */
+ context_states_per_bin = sctx->screen->info.has_gfx9_scissor_bug ? 1 : 6;
+ /* Using 32 here can cause GPU hangs on RAVEN1 */
+ persistent_states_per_bin = 16;
+ }
+ fpovs_per_batch = 63;
+
+ /* Emit registers. */
+ struct uvec2 bin_size_extend = {};
+ if (bin_size.x >= 32)
+ bin_size_extend.x = util_logbase2(bin_size.x) - 5;
+ if (bin_size.y >= 32)
+ bin_size_extend.y = util_logbase2(bin_size.y) - 5;
+
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+ radeon_opt_set_context_reg(
+ sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0,
+ S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | S_028C44_BIN_SIZE_X(bin_size.x == 16) |
+ S_028C44_BIN_SIZE_Y(bin_size.y == 16) | S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
+ S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
+ S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin - 1) |
+ S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin - 1) |
+ S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
+ S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) | S_028C44_OPTIMAL_BIN_SELECTION(1) |
+ S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 ||
+ sctx->family == CHIP_VEGA20 ||
+ sctx->family >= CHIP_RAVEN2) &&
+ sctx->last_binning_enabled != 1));
+
+ unsigned db_dfsm_control =
+ sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL : R_028060_DB_DFSM_CONTROL;
+ radeon_opt_set_context_reg(
+ sctx, db_dfsm_control, SI_TRACKED_DB_DFSM_CONTROL,
+ S_028060_PUNCHOUT_MODE(punchout_mode) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll = true;
+
+ sctx->last_binning_enabled = true;
}
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
+#include "ac_debug.h"
#include "si_build_pm4.h"
#include "sid.h"
-
#include "util/u_index_modify.h"
#include "util/u_log.h"
-#include "util/u_upload_mgr.h"
#include "util/u_prim.h"
#include "util/u_suballoc.h"
-
-#include "ac_debug.h"
+#include "util/u_upload_mgr.h"
/* special primitive types */
-#define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX
+#define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX
static unsigned si_conv_pipe_prim(unsigned mode)
{
- static const unsigned prim_conv[] = {
- [PIPE_PRIM_POINTS] = V_008958_DI_PT_POINTLIST,
- [PIPE_PRIM_LINES] = V_008958_DI_PT_LINELIST,
- [PIPE_PRIM_LINE_LOOP] = V_008958_DI_PT_LINELOOP,
- [PIPE_PRIM_LINE_STRIP] = V_008958_DI_PT_LINESTRIP,
- [PIPE_PRIM_TRIANGLES] = V_008958_DI_PT_TRILIST,
- [PIPE_PRIM_TRIANGLE_STRIP] = V_008958_DI_PT_TRISTRIP,
- [PIPE_PRIM_TRIANGLE_FAN] = V_008958_DI_PT_TRIFAN,
- [PIPE_PRIM_QUADS] = V_008958_DI_PT_QUADLIST,
- [PIPE_PRIM_QUAD_STRIP] = V_008958_DI_PT_QUADSTRIP,
- [PIPE_PRIM_POLYGON] = V_008958_DI_PT_POLYGON,
- [PIPE_PRIM_LINES_ADJACENCY] = V_008958_DI_PT_LINELIST_ADJ,
- [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_008958_DI_PT_LINESTRIP_ADJ,
- [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_008958_DI_PT_TRILIST_ADJ,
- [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_008958_DI_PT_TRISTRIP_ADJ,
- [PIPE_PRIM_PATCHES] = V_008958_DI_PT_PATCH,
- [SI_PRIM_RECTANGLE_LIST] = V_008958_DI_PT_RECTLIST
- };
- assert(mode < ARRAY_SIZE(prim_conv));
- return prim_conv[mode];
+ static const unsigned prim_conv[] = {
+ [PIPE_PRIM_POINTS] = V_008958_DI_PT_POINTLIST,
+ [PIPE_PRIM_LINES] = V_008958_DI_PT_LINELIST,
+ [PIPE_PRIM_LINE_LOOP] = V_008958_DI_PT_LINELOOP,
+ [PIPE_PRIM_LINE_STRIP] = V_008958_DI_PT_LINESTRIP,
+ [PIPE_PRIM_TRIANGLES] = V_008958_DI_PT_TRILIST,
+ [PIPE_PRIM_TRIANGLE_STRIP] = V_008958_DI_PT_TRISTRIP,
+ [PIPE_PRIM_TRIANGLE_FAN] = V_008958_DI_PT_TRIFAN,
+ [PIPE_PRIM_QUADS] = V_008958_DI_PT_QUADLIST,
+ [PIPE_PRIM_QUAD_STRIP] = V_008958_DI_PT_QUADSTRIP,
+ [PIPE_PRIM_POLYGON] = V_008958_DI_PT_POLYGON,
+ [PIPE_PRIM_LINES_ADJACENCY] = V_008958_DI_PT_LINELIST_ADJ,
+ [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_008958_DI_PT_LINESTRIP_ADJ,
+ [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_008958_DI_PT_TRILIST_ADJ,
+ [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_008958_DI_PT_TRISTRIP_ADJ,
+ [PIPE_PRIM_PATCHES] = V_008958_DI_PT_PATCH,
+ [SI_PRIM_RECTANGLE_LIST] = V_008958_DI_PT_RECTLIST};
+ assert(mode < ARRAY_SIZE(prim_conv));
+ return prim_conv[mode];
}
/**
* The information about LDS and other non-compile-time parameters is then
* written to userdata SGPRs.
*/
-static void si_emit_derived_tess_state(struct si_context *sctx,
- const struct pipe_draw_info *info,
- unsigned *num_patches)
+static void si_emit_derived_tess_state(struct si_context *sctx, const struct pipe_draw_info *info,
+ unsigned *num_patches)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- struct si_shader *ls_current;
- struct si_shader_selector *ls;
- /* The TES pointer will only be used for sctx->last_tcs.
- * It would be wrong to think that TCS = TES. */
- struct si_shader_selector *tcs =
- sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
- unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
- bool has_primid_instancing_bug = sctx->chip_class == GFX6 &&
- sctx->screen->info.max_se == 1;
- unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
- unsigned num_tcs_input_cp = info->vertices_per_patch;
- unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
- unsigned num_tcs_patch_outputs;
- unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
- unsigned input_patch_size, output_patch_size, output_patch0_offset;
- unsigned perpatch_output_offset, lds_size;
- unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
- unsigned offchip_layout, hardware_lds_size, ls_hs_config;
-
- /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */
- if (sctx->chip_class >= GFX9) {
- if (sctx->tcs_shader.cso)
- ls_current = sctx->tcs_shader.current;
- else
- ls_current = sctx->fixed_func_tcs_shader.current;
-
- ls = ls_current->key.part.tcs.ls;
- } else {
- ls_current = sctx->vs_shader.current;
- ls = sctx->vs_shader.cso;
- }
-
- if (sctx->last_ls == ls_current &&
- sctx->last_tcs == tcs &&
- sctx->last_tes_sh_base == tes_sh_base &&
- sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
- (!has_primid_instancing_bug ||
- (sctx->last_tess_uses_primid == tess_uses_primid))) {
- *num_patches = sctx->last_num_patches;
- return;
- }
-
- sctx->last_ls = ls_current;
- sctx->last_tcs = tcs;
- sctx->last_tes_sh_base = tes_sh_base;
- sctx->last_num_tcs_input_cp = num_tcs_input_cp;
- sctx->last_tess_uses_primid = tess_uses_primid;
-
- /* This calculates how shader inputs and outputs among VS, TCS, and TES
- * are laid out in LDS. */
- num_tcs_inputs = util_last_bit64(ls->outputs_written);
-
- if (sctx->tcs_shader.cso) {
- num_tcs_outputs = util_last_bit64(tcs->outputs_written);
- num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
- num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);
- } else {
- /* No TCS. Route varyings from LS to TES. */
- num_tcs_outputs = num_tcs_inputs;
- num_tcs_output_cp = num_tcs_input_cp;
- num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */
- }
-
- input_vertex_size = ls->lshs_vertex_stride;
- output_vertex_size = num_tcs_outputs * 16;
-
- input_patch_size = num_tcs_input_cp * input_vertex_size;
-
- pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
- output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
-
- /* Ensure that we only need one wave per SIMD so we don't need to check
- * resource usage. Also ensures that the number of tcs in and out
- * vertices per threadgroup are at most 256.
- */
- unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
- *num_patches = 256 / max_verts_per_patch;
-
- /* Make sure that the data fits in LDS. This assumes the shaders only
- * use LDS for the inputs and outputs.
- *
- * While GFX7 can use 64K per threadgroup, there is a hang on Stoney
- * with 2 CUs if we use more than 32K. The closed Vulkan driver also
- * uses 32K at most on all GCN chips.
- */
- hardware_lds_size = 32768;
- *num_patches = MIN2(*num_patches, hardware_lds_size / (input_patch_size +
- output_patch_size));
-
- /* Make sure the output data fits in the offchip buffer */
- *num_patches = MIN2(*num_patches,
- (sctx->screen->tess_offchip_block_dw_size * 4) /
- output_patch_size);
-
- /* Not necessary for correctness, but improves performance.
- * The hardware can do more, but the radeonsi shader constant is
- * limited to 6 bits.
- */
- *num_patches = MIN2(*num_patches, 63); /* triangles: 3 full waves except 3 lanes */
-
- /* When distributed tessellation is unsupported, switch between SEs
- * at a higher frequency to compensate for it.
- */
- if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1)
- *num_patches = MIN2(*num_patches, 16); /* recommended */
-
- /* Make sure that vector lanes are reasonably occupied. It probably
- * doesn't matter much because this is LS-HS, and TES is likely to
- * occupy significantly more CUs.
- */
- unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch;
- unsigned wave_size = sctx->screen->ge_wave_size;
-
- if (temp_verts_per_tg > wave_size && temp_verts_per_tg % wave_size < wave_size*3/4)
- *num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;
-
- if (sctx->chip_class == GFX6) {
- /* GFX6 bug workaround, related to power management. Limit LS-HS
- * threadgroups to only one wave.
- */
- unsigned one_wave = wave_size / max_verts_per_patch;
- *num_patches = MIN2(*num_patches, one_wave);
- }
-
- /* The VGT HS block increments the patch ID unconditionally
- * within a single threadgroup. This results in incorrect
- * patch IDs when instanced draws are used.
- *
- * The intended solution is to restrict threadgroups to
- * a single instance by setting SWITCH_ON_EOI, which
- * should cause IA to split instances up. However, this
- * doesn't work correctly on GFX6 when there is no other
- * SE to switch to.
- */
- if (has_primid_instancing_bug && tess_uses_primid)
- *num_patches = 1;
-
- sctx->last_num_patches = *num_patches;
-
- output_patch0_offset = input_patch_size * *num_patches;
- perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
-
- /* Compute userdata SGPRs. */
- assert(((input_vertex_size / 4) & ~0xff) == 0);
- assert(((output_vertex_size / 4) & ~0xff) == 0);
- assert(((input_patch_size / 4) & ~0x1fff) == 0);
- assert(((output_patch_size / 4) & ~0x1fff) == 0);
- assert(((output_patch0_offset / 16) & ~0xffff) == 0);
- assert(((perpatch_output_offset / 16) & ~0xffff) == 0);
- assert(num_tcs_input_cp <= 32);
- assert(num_tcs_output_cp <= 32);
-
- uint64_t ring_va = si_resource(sctx->tess_rings)->gpu_address;
- assert((ring_va & u_bit_consecutive(0, 19)) == 0);
-
- tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) |
- S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4);
- tcs_out_layout = (output_patch_size / 4) |
- (num_tcs_input_cp << 13) |
- ring_va;
- tcs_out_offsets = (output_patch0_offset / 16) |
- ((perpatch_output_offset / 16) << 16);
- offchip_layout = *num_patches |
- (num_tcs_output_cp << 6) |
- (pervertex_output_patch_size * *num_patches << 12);
-
- /* Compute the LDS size. */
- lds_size = output_patch0_offset + output_patch_size * *num_patches;
-
- if (sctx->chip_class >= GFX7) {
- assert(lds_size <= 65536);
- lds_size = align(lds_size, 512) / 512;
- } else {
- assert(lds_size <= 32768);
- lds_size = align(lds_size, 256) / 256;
- }
-
- /* Set SI_SGPR_VS_STATE_BITS. */
- sctx->current_vs_state &= C_VS_STATE_LS_OUT_PATCH_SIZE &
- C_VS_STATE_LS_OUT_VERTEX_SIZE;
- sctx->current_vs_state |= tcs_in_layout;
-
- /* We should be able to support in-shader LDS use with LLVM >= 9
- * by just adding the lds_sizes together, but it has never
- * been tested. */
- assert(ls_current->config.lds_size == 0);
-
- if (sctx->chip_class >= GFX9) {
- unsigned hs_rsrc2 = ls_current->config.rsrc2;
-
- if (sctx->chip_class >= GFX10)
- hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size);
- else
- hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size);
-
- radeon_set_sh_reg(cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2);
-
- /* Set userdata SGPRs for merged LS-HS. */
- radeon_set_sh_reg_seq(cs,
- R_00B430_SPI_SHADER_USER_DATA_LS_0 +
- GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3);
- radeon_emit(cs, offchip_layout);
- radeon_emit(cs, tcs_out_offsets);
- radeon_emit(cs, tcs_out_layout);
- } else {
- unsigned ls_rsrc2 = ls_current->config.rsrc2;
-
- si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
- ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
-
- /* Due to a hw bug, RSRC2_LS must be written twice with another
- * LS register written in between. */
- if (sctx->chip_class == GFX7 && sctx->family != CHIP_HAWAII)
- radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
- radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
- radeon_emit(cs, ls_current->config.rsrc1);
- radeon_emit(cs, ls_rsrc2);
-
- /* Set userdata SGPRs for TCS. */
- radeon_set_sh_reg_seq(cs,
- R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);
- radeon_emit(cs, offchip_layout);
- radeon_emit(cs, tcs_out_offsets);
- radeon_emit(cs, tcs_out_layout);
- radeon_emit(cs, tcs_in_layout);
- }
-
- /* Set userdata SGPRs for TES. */
- radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2);
- radeon_emit(cs, offchip_layout);
- radeon_emit(cs, ring_va);
-
- ls_hs_config = S_028B58_NUM_PATCHES(*num_patches) |
- S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
- S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
-
- if (sctx->last_ls_hs_config != ls_hs_config) {
- if (sctx->chip_class >= GFX7) {
- radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2,
- ls_hs_config);
- } else {
- radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG,
- ls_hs_config);
- }
- sctx->last_ls_hs_config = ls_hs_config;
- sctx->context_roll = true;
- }
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct si_shader *ls_current;
+ struct si_shader_selector *ls;
+ /* The TES pointer will only be used for sctx->last_tcs.
+ * It would be wrong to think that TCS = TES. */
+ struct si_shader_selector *tcs =
+ sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
+ unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
+ bool has_primid_instancing_bug = sctx->chip_class == GFX6 && sctx->screen->info.max_se == 1;
+ unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
+ unsigned num_tcs_input_cp = info->vertices_per_patch;
+ unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
+ unsigned num_tcs_patch_outputs;
+ unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
+ unsigned input_patch_size, output_patch_size, output_patch0_offset;
+ unsigned perpatch_output_offset, lds_size;
+ unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
+ unsigned offchip_layout, hardware_lds_size, ls_hs_config;
+
+ /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */
+ if (sctx->chip_class >= GFX9) {
+ if (sctx->tcs_shader.cso)
+ ls_current = sctx->tcs_shader.current;
+ else
+ ls_current = sctx->fixed_func_tcs_shader.current;
+
+ ls = ls_current->key.part.tcs.ls;
+ } else {
+ ls_current = sctx->vs_shader.current;
+ ls = sctx->vs_shader.cso;
+ }
+
+ if (sctx->last_ls == ls_current && sctx->last_tcs == tcs &&
+ sctx->last_tes_sh_base == tes_sh_base && sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
+ (!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid))) {
+ *num_patches = sctx->last_num_patches;
+ return;
+ }
+
+ sctx->last_ls = ls_current;
+ sctx->last_tcs = tcs;
+ sctx->last_tes_sh_base = tes_sh_base;
+ sctx->last_num_tcs_input_cp = num_tcs_input_cp;
+ sctx->last_tess_uses_primid = tess_uses_primid;
+
+ /* This calculates how shader inputs and outputs among VS, TCS, and TES
+ * are laid out in LDS. */
+ num_tcs_inputs = util_last_bit64(ls->outputs_written);
+
+ if (sctx->tcs_shader.cso) {
+ num_tcs_outputs = util_last_bit64(tcs->outputs_written);
+ num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+ num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);
+ } else {
+ /* No TCS. Route varyings from LS to TES. */
+ num_tcs_outputs = num_tcs_inputs;
+ num_tcs_output_cp = num_tcs_input_cp;
+ num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */
+ }
+
+ input_vertex_size = ls->lshs_vertex_stride;
+ output_vertex_size = num_tcs_outputs * 16;
+
+ input_patch_size = num_tcs_input_cp * input_vertex_size;
+
+ pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
+ output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+
+ /* Ensure that we only need one wave per SIMD so we don't need to check
+ * resource usage. Also ensures that the number of tcs in and out
+ * vertices per threadgroup is at most 256.
+ */
+ unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
+ *num_patches = 256 / max_verts_per_patch;
+
+ /* Make sure that the data fits in LDS. This assumes the shaders only
+ * use LDS for the inputs and outputs.
+ *
+ * While GFX7 can use 64K per threadgroup, there is a hang on Stoney
+ * with 2 CUs if we use more than 32K. The closed Vulkan driver also
+ * uses 32K at most on all GCN chips.
+ */
+ hardware_lds_size = 32768;
+ *num_patches = MIN2(*num_patches, hardware_lds_size / (input_patch_size + output_patch_size));
+
+ /* Make sure the output data fits in the offchip buffer */
+ *num_patches =
+ MIN2(*num_patches, (sctx->screen->tess_offchip_block_dw_size * 4) / output_patch_size);
+
+ /* Not necessary for correctness, but improves performance.
+ * The hardware can do more, but the radeonsi shader constant is
+ * limited to 6 bits.
+ */
+ *num_patches = MIN2(*num_patches, 63); /* triangles: 3 full waves except 3 lanes */
+
+ /* When distributed tessellation is unsupported, switch between SEs
+ * at a higher frequency to compensate for it.
+ */
+ if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1)
+ *num_patches = MIN2(*num_patches, 16); /* recommended */
+
+ /* Make sure that vector lanes are reasonably occupied. It probably
+ * doesn't matter much because this is LS-HS, and TES is likely to
+ * occupy significantly more CUs.
+ */
+ unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch;
+ unsigned wave_size = sctx->screen->ge_wave_size;
+
+ if (temp_verts_per_tg > wave_size && temp_verts_per_tg % wave_size < wave_size * 3 / 4)
+ *num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;
+
+ if (sctx->chip_class == GFX6) {
+ /* GFX6 bug workaround, related to power management. Limit LS-HS
+ * threadgroups to only one wave.
+ */
+ unsigned one_wave = wave_size / max_verts_per_patch;
+ *num_patches = MIN2(*num_patches, one_wave);
+ }
+
+ /* The VGT HS block increments the patch ID unconditionally
+ * within a single threadgroup. This results in incorrect
+ * patch IDs when instanced draws are used.
+ *
+ * The intended solution is to restrict threadgroups to
+ * a single instance by setting SWITCH_ON_EOI, which
+ * should cause IA to split instances up. However, this
+ * doesn't work correctly on GFX6 when there is no other
+ * SE to switch to.
+ */
+ if (has_primid_instancing_bug && tess_uses_primid)
+ *num_patches = 1;
+
+ sctx->last_num_patches = *num_patches;
+
+ output_patch0_offset = input_patch_size * *num_patches;
+ perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
+
+ /* Compute userdata SGPRs. */
+ assert(((input_vertex_size / 4) & ~0xff) == 0);
+ assert(((output_vertex_size / 4) & ~0xff) == 0);
+ assert(((input_patch_size / 4) & ~0x1fff) == 0);
+ assert(((output_patch_size / 4) & ~0x1fff) == 0);
+ assert(((output_patch0_offset / 16) & ~0xffff) == 0);
+ assert(((perpatch_output_offset / 16) & ~0xffff) == 0);
+ assert(num_tcs_input_cp <= 32);
+ assert(num_tcs_output_cp <= 32);
+
+ uint64_t ring_va = si_resource(sctx->tess_rings)->gpu_address;
+ assert((ring_va & u_bit_consecutive(0, 19)) == 0);
+
+ tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) |
+ S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4);
+ tcs_out_layout = (output_patch_size / 4) | (num_tcs_input_cp << 13) | ring_va;
+ tcs_out_offsets = (output_patch0_offset / 16) | ((perpatch_output_offset / 16) << 16);
+ offchip_layout =
+ *num_patches | (num_tcs_output_cp << 6) | (pervertex_output_patch_size * *num_patches << 12);
+
+ /* Compute the LDS size. */
+ lds_size = output_patch0_offset + output_patch_size * *num_patches;
+
+ if (sctx->chip_class >= GFX7) {
+ assert(lds_size <= 65536);
+ lds_size = align(lds_size, 512) / 512;
+ } else {
+ assert(lds_size <= 32768);
+ lds_size = align(lds_size, 256) / 256;
+ }
+
+ /* Set SI_SGPR_VS_STATE_BITS. */
+ sctx->current_vs_state &= C_VS_STATE_LS_OUT_PATCH_SIZE & C_VS_STATE_LS_OUT_VERTEX_SIZE;
+ sctx->current_vs_state |= tcs_in_layout;
+
+ /* We should be able to support in-shader LDS use with LLVM >= 9
+ * by just adding the lds_sizes together, but it has never
+ * been tested. */
+ assert(ls_current->config.lds_size == 0);
+
+ if (sctx->chip_class >= GFX9) {
+ unsigned hs_rsrc2 = ls_current->config.rsrc2;
+
+ if (sctx->chip_class >= GFX10)
+ hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size);
+ else
+ hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size);
+
+ radeon_set_sh_reg(cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2);
+
+ /* Set userdata SGPRs for merged LS-HS. */
+ radeon_set_sh_reg_seq(
+ cs, R_00B430_SPI_SHADER_USER_DATA_LS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3);
+ radeon_emit(cs, offchip_layout);
+ radeon_emit(cs, tcs_out_offsets);
+ radeon_emit(cs, tcs_out_layout);
+ } else {
+ unsigned ls_rsrc2 = ls_current->config.rsrc2;
+
+ si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
+ ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
+
+ /* Due to a hw bug, RSRC2_LS must be written twice with another
+ * LS register written in between. */
+ if (sctx->chip_class == GFX7 && sctx->family != CHIP_HAWAII)
+ radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
+ radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
+ radeon_emit(cs, ls_current->config.rsrc1);
+ radeon_emit(cs, ls_rsrc2);
+
+ /* Set userdata SGPRs for TCS. */
+ radeon_set_sh_reg_seq(
+ cs, R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);
+ radeon_emit(cs, offchip_layout);
+ radeon_emit(cs, tcs_out_offsets);
+ radeon_emit(cs, tcs_out_layout);
+ radeon_emit(cs, tcs_in_layout);
+ }
+
+ /* Set userdata SGPRs for TES. */
+ radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2);
+ radeon_emit(cs, offchip_layout);
+ radeon_emit(cs, ring_va);
+
+ ls_hs_config = S_028B58_NUM_PATCHES(*num_patches) | S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
+ S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
+
+ if (sctx->last_ls_hs_config != ls_hs_config) {
+ if (sctx->chip_class >= GFX7) {
+ radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);
+ } else {
+ radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
+ }
+ sctx->last_ls_hs_config = ls_hs_config;
+ sctx->context_roll = true;
+ }
}
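The *num_patches value derived above is the result of a chain of clamps: the 256-vertex threadgroup limit, the 32K LDS budget, the offchip ring size, the 6-bit shader constant, and a whole-wave rounding step. A self-contained walk-through of that chain under assumed sizes (64-byte LS vertices, triangle patches, 2 TCS per-vertex outputs, an 8192-dword offchip block, wave64); the constants are illustrative, not queried from hardware, and the GFX6/distributed-tessellation special cases are omitted:

#include <assert.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
   /* Assumed inputs for the example. */
   const unsigned num_tcs_input_cp = 3, num_tcs_output_cp = 3;
   const unsigned input_vertex_size = 64;
   const unsigned output_vertex_size = 2 * 16;  /* num_tcs_outputs * 16 */
   const unsigned num_tcs_patch_outputs = 2;    /* TESSINNER + TESSOUTER */
   const unsigned hardware_lds_size = 32768;
   const unsigned offchip_block_dw_size = 8192; /* assumed tess_offchip_block_dw_size */
   const unsigned wave_size = 64;

   unsigned input_patch_size = num_tcs_input_cp * input_vertex_size;              /* 192 */
   unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; /* 96 */
   unsigned output_patch_size =
      pervertex_output_patch_size + num_tcs_patch_outputs * 16;                   /* 128 */

   unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
   unsigned num_patches = 256 / max_verts_per_patch;                              /* 85 */

   /* LDS budget, offchip ring, then the 6-bit shader-constant limit. */
   num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size));
   num_patches = MIN2(num_patches, (offchip_block_dw_size * 4) / output_patch_size);
   num_patches = MIN2(num_patches, 63);

   /* Round the threadgroup down to whole waves when the tail wave is mostly idle. */
   unsigned verts_per_tg = num_patches * max_verts_per_patch;
   if (verts_per_tg > wave_size && verts_per_tg % wave_size < wave_size * 3 / 4)
      num_patches = (verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;

   assert(num_patches == 63);
   return 0;
}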
static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info,
- enum pipe_prim_type prim)
+ enum pipe_prim_type prim)
{
- switch (prim) {
- case PIPE_PRIM_PATCHES:
- return info->count / info->vertices_per_patch;
- case PIPE_PRIM_POLYGON:
- return info->count >= 3;
- case SI_PRIM_RECTANGLE_LIST:
- return info->count / 3;
- default:
- return u_decomposed_prims_for_vertices(prim, info->count);
- }
+ switch (prim) {
+ case PIPE_PRIM_PATCHES:
+ return info->count / info->vertices_per_patch;
+ case PIPE_PRIM_POLYGON:
+ return info->count >= 3;
+ case SI_PRIM_RECTANGLE_LIST:
+ return info->count / 3;
+ default:
+ return u_decomposed_prims_for_vertices(prim, info->count);
+ }
}
-static unsigned
-si_get_init_multi_vgt_param(struct si_screen *sscreen,
- union si_vgt_param_key *key)
+static unsigned si_get_init_multi_vgt_param(struct si_screen *sscreen, union si_vgt_param_key *key)
{
- STATIC_ASSERT(sizeof(union si_vgt_param_key) == 4);
- unsigned max_primgroup_in_wave = 2;
-
- /* SWITCH_ON_EOP(0) is always preferable. */
- bool wd_switch_on_eop = false;
- bool ia_switch_on_eop = false;
- bool ia_switch_on_eoi = false;
- bool partial_vs_wave = false;
- bool partial_es_wave = false;
-
- if (key->u.uses_tess) {
- /* SWITCH_ON_EOI must be set if PrimID is used. */
- if (key->u.tess_uses_prim_id)
- ia_switch_on_eoi = true;
-
- /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
- if ((sscreen->info.family == CHIP_TAHITI ||
- sscreen->info.family == CHIP_PITCAIRN ||
- sscreen->info.family == CHIP_BONAIRE) &&
- key->u.uses_gs)
- partial_vs_wave = true;
-
- /* Needed for 028B6C_DISTRIBUTION_MODE != 0. (implies >= GFX8) */
- if (sscreen->info.has_distributed_tess) {
- if (key->u.uses_gs) {
- if (sscreen->info.chip_class == GFX8)
- partial_es_wave = true;
- } else {
- partial_vs_wave = true;
- }
- }
- }
-
- /* This is a hardware requirement. */
- if (key->u.line_stipple_enabled ||
- (sscreen->debug_flags & DBG(SWITCH_ON_EOP))) {
- ia_switch_on_eop = true;
- wd_switch_on_eop = true;
- }
-
- if (sscreen->info.chip_class >= GFX7) {
- /* WD_SWITCH_ON_EOP has no effect on GPUs with less than
- * 4 shader engines. Set 1 to pass the assertion below.
- * The other cases are hardware requirements.
- *
- * Polaris supports primitive restart with WD_SWITCH_ON_EOP=0
- * for points, line strips, and tri strips.
- */
- if (sscreen->info.max_se <= 2 ||
- key->u.prim == PIPE_PRIM_POLYGON ||
- key->u.prim == PIPE_PRIM_LINE_LOOP ||
- key->u.prim == PIPE_PRIM_TRIANGLE_FAN ||
- key->u.prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY ||
- (key->u.primitive_restart &&
- (sscreen->info.family < CHIP_POLARIS10 ||
- (key->u.prim != PIPE_PRIM_POINTS &&
- key->u.prim != PIPE_PRIM_LINE_STRIP &&
- key->u.prim != PIPE_PRIM_TRIANGLE_STRIP))) ||
- key->u.count_from_stream_output)
- wd_switch_on_eop = true;
-
- /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
- * We don't know that for indirect drawing, so treat it as
- * always problematic. */
- if (sscreen->info.family == CHIP_HAWAII &&
- key->u.uses_instancing)
- wd_switch_on_eop = true;
-
- /* Performance recommendation for 4 SE Gfx7-8 parts if
- * instances are smaller than a primgroup.
- * Assume indirect draws always use small instances.
- * This is needed for good VS wave utilization.
- */
- if (sscreen->info.chip_class <= GFX8 &&
- sscreen->info.max_se == 4 &&
- key->u.multi_instances_smaller_than_primgroup)
- wd_switch_on_eop = true;
-
- /* Required on GFX7 and later. */
- if (sscreen->info.max_se == 4 && !wd_switch_on_eop)
- ia_switch_on_eoi = true;
-
- /* HW engineers suggested that PARTIAL_VS_WAVE_ON should be set
- * to work around a GS hang.
- */
- if (key->u.uses_gs &&
- (sscreen->info.family == CHIP_TONGA ||
- sscreen->info.family == CHIP_FIJI ||
- sscreen->info.family == CHIP_POLARIS10 ||
- sscreen->info.family == CHIP_POLARIS11 ||
- sscreen->info.family == CHIP_POLARIS12 ||
- sscreen->info.family == CHIP_VEGAM))
- partial_vs_wave = true;
-
- /* Required by Hawaii and, for some special cases, by GFX8. */
- if (ia_switch_on_eoi &&
- (sscreen->info.family == CHIP_HAWAII ||
- (sscreen->info.chip_class == GFX8 &&
- (key->u.uses_gs || max_primgroup_in_wave != 2))))
- partial_vs_wave = true;
-
- /* Instancing bug on Bonaire. */
- if (sscreen->info.family == CHIP_BONAIRE && ia_switch_on_eoi &&
- key->u.uses_instancing)
- partial_vs_wave = true;
-
- /* This only applies to Polaris10 and later 4 SE chips.
- * wd_switch_on_eop is already true on all other chips.
- */
- if (!wd_switch_on_eop && key->u.primitive_restart)
- partial_vs_wave = true;
-
- /* If the WD switch is false, the IA switch must be false too. */
- assert(wd_switch_on_eop || !ia_switch_on_eop);
- }
-
- /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
- if (sscreen->info.chip_class <= GFX8 && ia_switch_on_eoi)
- partial_es_wave = true;
-
- return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
- S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
- S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
- S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
- S_028AA8_WD_SWITCH_ON_EOP(sscreen->info.chip_class >= GFX7 ? wd_switch_on_eop : 0) |
- /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
- S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->info.chip_class == GFX8 ?
- max_primgroup_in_wave : 0) |
- S_030960_EN_INST_OPT_BASIC(sscreen->info.chip_class >= GFX9) |
- S_030960_EN_INST_OPT_ADV(sscreen->info.chip_class >= GFX9);
+ STATIC_ASSERT(sizeof(union si_vgt_param_key) == 4);
+ unsigned max_primgroup_in_wave = 2;
+
+ /* SWITCH_ON_EOP(0) is always preferable. */
+ bool wd_switch_on_eop = false;
+ bool ia_switch_on_eop = false;
+ bool ia_switch_on_eoi = false;
+ bool partial_vs_wave = false;
+ bool partial_es_wave = false;
+
+ if (key->u.uses_tess) {
+ /* SWITCH_ON_EOI must be set if PrimID is used. */
+ if (key->u.tess_uses_prim_id)
+ ia_switch_on_eoi = true;
+
+ /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
+ if ((sscreen->info.family == CHIP_TAHITI || sscreen->info.family == CHIP_PITCAIRN ||
+ sscreen->info.family == CHIP_BONAIRE) &&
+ key->u.uses_gs)
+ partial_vs_wave = true;
+
+ /* Needed for 028B6C_DISTRIBUTION_MODE != 0. (implies >= GFX8) */
+ if (sscreen->info.has_distributed_tess) {
+ if (key->u.uses_gs) {
+ if (sscreen->info.chip_class == GFX8)
+ partial_es_wave = true;
+ } else {
+ partial_vs_wave = true;
+ }
+ }
+ }
+
+ /* This is a hardware requirement. */
+ if (key->u.line_stipple_enabled || (sscreen->debug_flags & DBG(SWITCH_ON_EOP))) {
+ ia_switch_on_eop = true;
+ wd_switch_on_eop = true;
+ }
+
+ if (sscreen->info.chip_class >= GFX7) {
+ /* WD_SWITCH_ON_EOP has no effect on GPUs with less than
+ * 4 shader engines. Set 1 to pass the assertion below.
+ * The other cases are hardware requirements.
+ *
+ * Polaris supports primitive restart with WD_SWITCH_ON_EOP=0
+ * for points, line strips, and tri strips.
+ */
+ if (sscreen->info.max_se <= 2 || key->u.prim == PIPE_PRIM_POLYGON ||
+ key->u.prim == PIPE_PRIM_LINE_LOOP || key->u.prim == PIPE_PRIM_TRIANGLE_FAN ||
+ key->u.prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY ||
+ (key->u.primitive_restart &&
+ (sscreen->info.family < CHIP_POLARIS10 ||
+ (key->u.prim != PIPE_PRIM_POINTS && key->u.prim != PIPE_PRIM_LINE_STRIP &&
+ key->u.prim != PIPE_PRIM_TRIANGLE_STRIP))) ||
+ key->u.count_from_stream_output)
+ wd_switch_on_eop = true;
+
+ /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
+ * We don't know that for indirect drawing, so treat it as
+ * always problematic. */
+ if (sscreen->info.family == CHIP_HAWAII && key->u.uses_instancing)
+ wd_switch_on_eop = true;
+
+ /* Performance recommendation for 4 SE Gfx7-8 parts if
+ * instances are smaller than a primgroup.
+ * Assume indirect draws always use small instances.
+ * This is needed for good VS wave utilization.
+ */
+ if (sscreen->info.chip_class <= GFX8 && sscreen->info.max_se == 4 &&
+ key->u.multi_instances_smaller_than_primgroup)
+ wd_switch_on_eop = true;
+
+ /* Required on GFX7 and later. */
+ if (sscreen->info.max_se == 4 && !wd_switch_on_eop)
+ ia_switch_on_eoi = true;
+
+ /* HW engineers suggested that PARTIAL_VS_WAVE_ON should be set
+ * to work around a GS hang.
+ */
+ if (key->u.uses_gs &&
+ (sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI ||
+ sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 ||
+ sscreen->info.family == CHIP_POLARIS12 || sscreen->info.family == CHIP_VEGAM))
+ partial_vs_wave = true;
+
+ /* Required by Hawaii and, for some special cases, by GFX8. */
+ if (ia_switch_on_eoi &&
+ (sscreen->info.family == CHIP_HAWAII ||
+ (sscreen->info.chip_class == GFX8 && (key->u.uses_gs || max_primgroup_in_wave != 2))))
+ partial_vs_wave = true;
+
+ /* Instancing bug on Bonaire. */
+ if (sscreen->info.family == CHIP_BONAIRE && ia_switch_on_eoi && key->u.uses_instancing)
+ partial_vs_wave = true;
+
+ /* This only applies to Polaris10 and later 4 SE chips.
+ * wd_switch_on_eop is already true on all other chips.
+ */
+ if (!wd_switch_on_eop && key->u.primitive_restart)
+ partial_vs_wave = true;
+
+ /* If the WD switch is false, the IA switch must be false too. */
+ assert(wd_switch_on_eop || !ia_switch_on_eop);
+ }
+
+ /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
+ if (sscreen->info.chip_class <= GFX8 && ia_switch_on_eoi)
+ partial_es_wave = true;
+
+ return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
+ S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
+ S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
+ S_028AA8_WD_SWITCH_ON_EOP(sscreen->info.chip_class >= GFX7 ? wd_switch_on_eop : 0) |
+ /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
+ S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->info.chip_class == GFX8 ? max_primgroup_in_wave
+ : 0) |
+ S_030960_EN_INST_OPT_BASIC(sscreen->info.chip_class >= GFX9) |
+ S_030960_EN_INST_OPT_ADV(sscreen->info.chip_class >= GFX9);
}
static void si_init_ia_multi_vgt_param_table(struct si_context *sctx)
{
- for (int prim = 0; prim <= SI_PRIM_RECTANGLE_LIST; prim++)
- for (int uses_instancing = 0; uses_instancing < 2; uses_instancing++)
- for (int multi_instances = 0; multi_instances < 2; multi_instances++)
- for (int primitive_restart = 0; primitive_restart < 2; primitive_restart++)
- for (int count_from_so = 0; count_from_so < 2; count_from_so++)
- for (int line_stipple = 0; line_stipple < 2; line_stipple++)
- for (int uses_tess = 0; uses_tess < 2; uses_tess++)
- for (int tess_uses_primid = 0; tess_uses_primid < 2; tess_uses_primid++)
- for (int uses_gs = 0; uses_gs < 2; uses_gs++) {
- union si_vgt_param_key key;
-
- key.index = 0;
- key.u.prim = prim;
- key.u.uses_instancing = uses_instancing;
- key.u.multi_instances_smaller_than_primgroup = multi_instances;
- key.u.primitive_restart = primitive_restart;
- key.u.count_from_stream_output = count_from_so;
- key.u.line_stipple_enabled = line_stipple;
- key.u.uses_tess = uses_tess;
- key.u.tess_uses_prim_id = tess_uses_primid;
- key.u.uses_gs = uses_gs;
-
- sctx->ia_multi_vgt_param[key.index] =
- si_get_init_multi_vgt_param(sctx->screen, &key);
- }
+ for (int prim = 0; prim <= SI_PRIM_RECTANGLE_LIST; prim++)
+ for (int uses_instancing = 0; uses_instancing < 2; uses_instancing++)
+ for (int multi_instances = 0; multi_instances < 2; multi_instances++)
+ for (int primitive_restart = 0; primitive_restart < 2; primitive_restart++)
+ for (int count_from_so = 0; count_from_so < 2; count_from_so++)
+ for (int line_stipple = 0; line_stipple < 2; line_stipple++)
+ for (int uses_tess = 0; uses_tess < 2; uses_tess++)
+ for (int tess_uses_primid = 0; tess_uses_primid < 2; tess_uses_primid++)
+ for (int uses_gs = 0; uses_gs < 2; uses_gs++) {
+ union si_vgt_param_key key;
+
+ key.index = 0;
+ key.u.prim = prim;
+ key.u.uses_instancing = uses_instancing;
+ key.u.multi_instances_smaller_than_primgroup = multi_instances;
+ key.u.primitive_restart = primitive_restart;
+ key.u.count_from_stream_output = count_from_so;
+ key.u.line_stipple_enabled = line_stipple;
+ key.u.uses_tess = uses_tess;
+ key.u.tess_uses_prim_id = tess_uses_primid;
+ key.u.uses_gs = uses_gs;
+
+ sctx->ia_multi_vgt_param[key.index] =
+ si_get_init_multi_vgt_param(sctx->screen, &key);
+ }
}
static bool si_is_line_stipple_enabled(struct si_context *sctx)
{
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- return rs->line_stipple_enable &&
- sctx->current_rast_prim != PIPE_PRIM_POINTS &&
- (rs->polygon_mode_is_lines ||
- util_prim_is_lines(sctx->current_rast_prim));
+ return rs->line_stipple_enable && sctx->current_rast_prim != PIPE_PRIM_POINTS &&
+ (rs->polygon_mode_is_lines || util_prim_is_lines(sctx->current_rast_prim));
}
static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
- const struct pipe_draw_info *info,
- enum pipe_prim_type prim,
- unsigned num_patches,
- unsigned instance_count,
- bool primitive_restart)
+ const struct pipe_draw_info *info,
+ enum pipe_prim_type prim, unsigned num_patches,
+ unsigned instance_count, bool primitive_restart)
{
- union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
- unsigned primgroup_size;
- unsigned ia_multi_vgt_param;
-
- if (sctx->tes_shader.cso) {
- primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
- } else if (sctx->gs_shader.cso) {
- primgroup_size = 64; /* recommended with a GS */
- } else {
- primgroup_size = 128; /* recommended without a GS and tess */
- }
-
- key.u.prim = prim;
- key.u.uses_instancing = info->indirect || instance_count > 1;
- key.u.multi_instances_smaller_than_primgroup =
- info->indirect ||
- (instance_count > 1 &&
- (info->count_from_stream_output ||
- si_num_prims_for_vertices(info, prim) < primgroup_size));
- key.u.primitive_restart = primitive_restart;
- key.u.count_from_stream_output = info->count_from_stream_output != NULL;
- key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx);
-
- ia_multi_vgt_param = sctx->ia_multi_vgt_param[key.index] |
- S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1);
-
- if (sctx->gs_shader.cso) {
- /* GS requirement. */
- if (sctx->chip_class <= GFX8 &&
- SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
- ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1);
-
- /* GS hw bug with single-primitive instances and SWITCH_ON_EOI.
- * The hw doc says all multi-SE chips are affected, but Vulkan
- * only applies it to Hawaii. Do what Vulkan does.
- */
- if (sctx->family == CHIP_HAWAII &&
- G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
- (info->indirect ||
- (instance_count > 1 &&
- (info->count_from_stream_output ||
- si_num_prims_for_vertices(info, prim) <= 1))))
- sctx->flags |= SI_CONTEXT_VGT_FLUSH;
- }
-
- return ia_multi_vgt_param;
+ union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
+ unsigned primgroup_size;
+ unsigned ia_multi_vgt_param;
+
+ if (sctx->tes_shader.cso) {
+ primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
+ } else if (sctx->gs_shader.cso) {
+ primgroup_size = 64; /* recommended with a GS */
+ } else {
+ primgroup_size = 128; /* recommended without a GS and tess */
+ }
+
+ key.u.prim = prim;
+ key.u.uses_instancing = info->indirect || instance_count > 1;
+ key.u.multi_instances_smaller_than_primgroup =
+ info->indirect ||
+ (instance_count > 1 &&
+ (info->count_from_stream_output || si_num_prims_for_vertices(info, prim) < primgroup_size));
+ key.u.primitive_restart = primitive_restart;
+ key.u.count_from_stream_output = info->count_from_stream_output != NULL;
+ key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx);
+
+ ia_multi_vgt_param =
+ sctx->ia_multi_vgt_param[key.index] | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1);
+
+ if (sctx->gs_shader.cso) {
+ /* GS requirement. */
+ if (sctx->chip_class <= GFX8 &&
+ SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
+ ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1);
+
+ /* GS hw bug with single-primitive instances and SWITCH_ON_EOI.
+ * The hw doc says all multi-SE chips are affected, but Vulkan
+ * only applies it to Hawaii. Do what Vulkan does.
+ */
+ if (sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
+ (info->indirect || (instance_count > 1 && (info->count_from_stream_output ||
+ si_num_prims_for_vertices(info, prim) <= 1))))
+ sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+ }
+
+ return ia_multi_vgt_param;
}
static unsigned si_conv_prim_to_gs_out(unsigned mode)
{
- static const int prim_conv[] = {
- [PIPE_PRIM_POINTS] = V_028A6C_OUTPRIM_TYPE_POINTLIST,
- [PIPE_PRIM_LINES] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
- [PIPE_PRIM_LINE_LOOP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
- [PIPE_PRIM_LINE_STRIP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
- [PIPE_PRIM_TRIANGLES] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
- [PIPE_PRIM_TRIANGLE_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
- [PIPE_PRIM_TRIANGLE_FAN] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
- [PIPE_PRIM_QUADS] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
- [PIPE_PRIM_QUAD_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
- [PIPE_PRIM_POLYGON] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
- [PIPE_PRIM_LINES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
- [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
- [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
- [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
- [PIPE_PRIM_PATCHES] = V_028A6C_OUTPRIM_TYPE_POINTLIST,
- [SI_PRIM_RECTANGLE_LIST] = V_028A6C_VGT_OUT_RECT_V0,
- };
- assert(mode < ARRAY_SIZE(prim_conv));
-
- return prim_conv[mode];
+ static const int prim_conv[] = {
+ [PIPE_PRIM_POINTS] = V_028A6C_OUTPRIM_TYPE_POINTLIST,
+ [PIPE_PRIM_LINES] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+ [PIPE_PRIM_LINE_LOOP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+ [PIPE_PRIM_LINE_STRIP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+ [PIPE_PRIM_TRIANGLES] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_TRIANGLE_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_TRIANGLE_FAN] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_QUADS] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_QUAD_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_POLYGON] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_LINES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+ [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+ [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_PATCHES] = V_028A6C_OUTPRIM_TYPE_POINTLIST,
+ [SI_PRIM_RECTANGLE_LIST] = V_028A6C_VGT_OUT_RECT_V0,
+ };
+ assert(mode < ARRAY_SIZE(prim_conv));
+
+ return prim_conv[mode];
}
/* rast_prim is the primitive type after GS. */
static void si_emit_rasterizer_prim_state(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- enum pipe_prim_type rast_prim = sctx->current_rast_prim;
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- unsigned initial_cdw = cs->current.cdw;
-
- if (unlikely(si_is_line_stipple_enabled(sctx))) {
- /* For lines, reset the stipple pattern at each primitive. Otherwise,
- * reset the stipple pattern at each packet (line strips, line loops).
- */
- unsigned value = rs->pa_sc_line_stipple |
- S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : 2);
-
- radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE,
- SI_TRACKED_PA_SC_LINE_STIPPLE, value);
- }
-
- unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim);
- if (unlikely(gs_out_prim != sctx->last_gs_out_prim &&
- (sctx->ngg || sctx->gs_shader.cso))) {
- radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
- sctx->last_gs_out_prim = gs_out_prim;
- }
-
- if (initial_cdw != cs->current.cdw)
- sctx->context_roll = true;
-
- if (sctx->ngg) {
- unsigned vtx_index = rs->flatshade_first ? 0 : gs_out_prim;
-
- sctx->current_vs_state &= C_VS_STATE_OUTPRIM &
- C_VS_STATE_PROVOKING_VTX_INDEX;
- sctx->current_vs_state |= S_VS_STATE_OUTPRIM(gs_out_prim) |
- S_VS_STATE_PROVOKING_VTX_INDEX(vtx_index);
- }
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ enum pipe_prim_type rast_prim = sctx->current_rast_prim;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+ unsigned initial_cdw = cs->current.cdw;
+
+ if (unlikely(si_is_line_stipple_enabled(sctx))) {
+ /* For lines, reset the stipple pattern at each primitive. Otherwise,
+ * reset the stipple pattern at each packet (line strips, line loops).
+ */
+ unsigned value =
+ rs->pa_sc_line_stipple | S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : 2);
+
+ radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE, SI_TRACKED_PA_SC_LINE_STIPPLE,
+ value);
+ }
+
+ unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim);
+ if (unlikely(gs_out_prim != sctx->last_gs_out_prim && (sctx->ngg || sctx->gs_shader.cso))) {
+ radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
+ sctx->last_gs_out_prim = gs_out_prim;
+ }
+
+ if (initial_cdw != cs->current.cdw)
+ sctx->context_roll = true;
+
+ if (sctx->ngg) {
+ unsigned vtx_index = rs->flatshade_first ? 0 : gs_out_prim;
+
+ sctx->current_vs_state &= C_VS_STATE_OUTPRIM & C_VS_STATE_PROVOKING_VTX_INDEX;
+ sctx->current_vs_state |=
+ S_VS_STATE_OUTPRIM(gs_out_prim) | S_VS_STATE_PROVOKING_VTX_INDEX(vtx_index);
+ }
}
-static void si_emit_vs_state(struct si_context *sctx,
- const struct pipe_draw_info *info)
+static void si_emit_vs_state(struct si_context *sctx, const struct pipe_draw_info *info)
{
- sctx->current_vs_state &= C_VS_STATE_INDEXED;
- sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size);
-
- if (sctx->num_vs_blit_sgprs) {
- /* Re-emit the state after we leave u_blitter. */
- sctx->last_vs_state = ~0;
- return;
- }
-
- if (sctx->current_vs_state != sctx->last_vs_state) {
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
- /* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */
- radeon_set_sh_reg(cs,
- sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] +
- SI_SGPR_VS_STATE_BITS * 4,
- sctx->current_vs_state);
-
- /* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage
- * before the rasterizer.
- *
- * For TES or the GS copy shader without NGG:
- */
- if (sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] !=
- R_00B130_SPI_SHADER_USER_DATA_VS_0) {
- radeon_set_sh_reg(cs,
- R_00B130_SPI_SHADER_USER_DATA_VS_0 +
- SI_SGPR_VS_STATE_BITS * 4,
- sctx->current_vs_state);
- }
-
- /* For NGG: */
- if (sctx->screen->use_ngg &&
- sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] !=
- R_00B230_SPI_SHADER_USER_DATA_GS_0) {
- radeon_set_sh_reg(cs,
- R_00B230_SPI_SHADER_USER_DATA_GS_0 +
- SI_SGPR_VS_STATE_BITS * 4,
- sctx->current_vs_state);
- }
-
- sctx->last_vs_state = sctx->current_vs_state;
- }
+ sctx->current_vs_state &= C_VS_STATE_INDEXED;
+ sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size);
+
+ if (sctx->num_vs_blit_sgprs) {
+ /* Re-emit the state after we leave u_blitter. */
+ sctx->last_vs_state = ~0;
+ return;
+ }
+
+ if (sctx->current_vs_state != sctx->last_vs_state) {
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ /* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */
+ radeon_set_sh_reg(
+ cs, sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_STATE_BITS * 4,
+ sctx->current_vs_state);
+
+ /* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage
+ * before the rasterizer.
+ *
+ * For TES or the GS copy shader without NGG:
+ */
+ if (sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] != R_00B130_SPI_SHADER_USER_DATA_VS_0) {
+ radeon_set_sh_reg(cs, R_00B130_SPI_SHADER_USER_DATA_VS_0 + SI_SGPR_VS_STATE_BITS * 4,
+ sctx->current_vs_state);
+ }
+
+ /* For NGG: */
+ if (sctx->screen->use_ngg &&
+ sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] != R_00B230_SPI_SHADER_USER_DATA_GS_0) {
+ radeon_set_sh_reg(cs, R_00B230_SPI_SHADER_USER_DATA_GS_0 + SI_SGPR_VS_STATE_BITS * 4,
+ sctx->current_vs_state);
+ }
+
+ sctx->last_vs_state = sctx->current_vs_state;
+ }
}
-static inline bool si_prim_restart_index_changed(struct si_context *sctx,
- bool primitive_restart,
- unsigned restart_index)
+static inline bool si_prim_restart_index_changed(struct si_context *sctx, bool primitive_restart,
+ unsigned restart_index)
{
- return primitive_restart &&
- (restart_index != sctx->last_restart_index ||
- sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN);
+ return primitive_restart && (restart_index != sctx->last_restart_index ||
+ sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN);
}
-static void si_emit_ia_multi_vgt_param(struct si_context *sctx,
- const struct pipe_draw_info *info,
- enum pipe_prim_type prim,
- unsigned num_patches,
- unsigned instance_count,
- bool primitive_restart)
+static void si_emit_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_info *info,
+ enum pipe_prim_type prim, unsigned num_patches,
+ unsigned instance_count, bool primitive_restart)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned ia_multi_vgt_param;
-
- ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, prim, num_patches,
- instance_count, primitive_restart);
-
- /* Draw state. */
- if (ia_multi_vgt_param != sctx->last_multi_vgt_param) {
- if (sctx->chip_class == GFX9)
- radeon_set_uconfig_reg_idx(cs, sctx->screen,
- R_030960_IA_MULTI_VGT_PARAM, 4,
- ia_multi_vgt_param);
- else if (sctx->chip_class >= GFX7)
- radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
- else
- radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
-
- sctx->last_multi_vgt_param = ia_multi_vgt_param;
- }
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned ia_multi_vgt_param;
+
+ ia_multi_vgt_param =
+ si_get_ia_multi_vgt_param(sctx, info, prim, num_patches, instance_count, primitive_restart);
+
+ /* Draw state. */
+ if (ia_multi_vgt_param != sctx->last_multi_vgt_param) {
+ if (sctx->chip_class == GFX9)
+ radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030960_IA_MULTI_VGT_PARAM, 4,
+ ia_multi_vgt_param);
+ else if (sctx->chip_class >= GFX7)
+ radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
+ else
+ radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
+
+ sctx->last_multi_vgt_param = ia_multi_vgt_param;
+ }
}
/* GFX10 removed IA_MULTI_VGT_PARAM in exchange for GE_CNTL.
*/
static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches)
{
- union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
- unsigned ge_cntl;
-
- if (sctx->ngg) {
- if (sctx->tes_shader.cso) {
- ge_cntl = S_03096C_PRIM_GRP_SIZE(num_patches) |
- S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
- S_03096C_BREAK_WAVE_AT_EOI(key.u.tess_uses_prim_id);
- } else {
- ge_cntl = si_get_vs_state(sctx)->ge_cntl;
- }
- } else {
- unsigned primgroup_size;
- unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */;
-
- if (sctx->tes_shader.cso) {
- primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
- } else if (sctx->gs_shader.cso) {
- unsigned vgt_gs_onchip_cntl = sctx->gs_shader.current->ctx_reg.gs.vgt_gs_onchip_cntl;
- primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);
- } else {
- primgroup_size = 128; /* recommended without a GS and tess */
- }
-
- ge_cntl = S_03096C_PRIM_GRP_SIZE(primgroup_size) |
- S_03096C_VERT_GRP_SIZE(vertgroup_size) |
- S_03096C_BREAK_WAVE_AT_EOI(key.u.uses_tess && key.u.tess_uses_prim_id);
- }
-
- ge_cntl |= S_03096C_PACKET_TO_ONE_PA(si_is_line_stipple_enabled(sctx));
-
- if (ge_cntl != sctx->last_multi_vgt_param) {
- radeon_set_uconfig_reg(sctx->gfx_cs, R_03096C_GE_CNTL, ge_cntl);
- sctx->last_multi_vgt_param = ge_cntl;
- }
+ union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
+ unsigned ge_cntl;
+
+ if (sctx->ngg) {
+ if (sctx->tes_shader.cso) {
+ ge_cntl = S_03096C_PRIM_GRP_SIZE(num_patches) |
+ S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
+ S_03096C_BREAK_WAVE_AT_EOI(key.u.tess_uses_prim_id);
+ } else {
+ ge_cntl = si_get_vs_state(sctx)->ge_cntl;
+ }
+ } else {
+ unsigned primgroup_size;
+ unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */
+
+ if (sctx->tes_shader.cso) {
+ primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
+ } else if (sctx->gs_shader.cso) {
+ unsigned vgt_gs_onchip_cntl = sctx->gs_shader.current->ctx_reg.gs.vgt_gs_onchip_cntl;
+ primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);
+ } else {
+ primgroup_size = 128; /* recommended without a GS and tess */
+ }
+
+ ge_cntl = S_03096C_PRIM_GRP_SIZE(primgroup_size) | S_03096C_VERT_GRP_SIZE(vertgroup_size) |
+ S_03096C_BREAK_WAVE_AT_EOI(key.u.uses_tess && key.u.tess_uses_prim_id);
+ }
+
+ ge_cntl |= S_03096C_PACKET_TO_ONE_PA(si_is_line_stipple_enabled(sctx));
+
+ if (ge_cntl != sctx->last_multi_vgt_param) {
+ radeon_set_uconfig_reg(sctx->gfx_cs, R_03096C_GE_CNTL, ge_cntl);
+ sctx->last_multi_vgt_param = ge_cntl;
+ }
}
-static void si_emit_draw_registers(struct si_context *sctx,
- const struct pipe_draw_info *info,
- enum pipe_prim_type prim,
- unsigned num_patches,
- unsigned instance_count,
- bool primitive_restart)
+static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_draw_info *info,
+ enum pipe_prim_type prim, unsigned num_patches,
+ unsigned instance_count, bool primitive_restart)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned vgt_prim = si_conv_pipe_prim(prim);
-
- if (sctx->chip_class >= GFX10)
- gfx10_emit_ge_cntl(sctx, num_patches);
- else
- si_emit_ia_multi_vgt_param(sctx, info, prim, num_patches,
- instance_count, primitive_restart);
-
- if (vgt_prim != sctx->last_prim) {
- if (sctx->chip_class >= GFX10)
- radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim);
- else if (sctx->chip_class >= GFX7)
- radeon_set_uconfig_reg_idx(cs, sctx->screen,
- R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim);
- else
- radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim);
-
- sctx->last_prim = vgt_prim;
- }
-
- /* Primitive restart. */
- if (primitive_restart != sctx->last_primitive_restart_en) {
- if (sctx->chip_class >= GFX9)
- radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
- primitive_restart);
- else
- radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
- primitive_restart);
-
- sctx->last_primitive_restart_en = primitive_restart;
-
- }
- if (si_prim_restart_index_changed(sctx, primitive_restart, info->restart_index)) {
- radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
- info->restart_index);
- sctx->last_restart_index = info->restart_index;
- sctx->context_roll = true;
- }
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned vgt_prim = si_conv_pipe_prim(prim);
+
+ if (sctx->chip_class >= GFX10)
+ gfx10_emit_ge_cntl(sctx, num_patches);
+ else
+ si_emit_ia_multi_vgt_param(sctx, info, prim, num_patches, instance_count, primitive_restart);
+
+ if (vgt_prim != sctx->last_prim) {
+ if (sctx->chip_class >= GFX10)
+ radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim);
+ else if (sctx->chip_class >= GFX7)
+ radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim);
+ else
+ radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim);
+
+ sctx->last_prim = vgt_prim;
+ }
+
+ /* Primitive restart. */
+ if (primitive_restart != sctx->last_primitive_restart_en) {
+ if (sctx->chip_class >= GFX9)
+ radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart);
+ else
+ radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart);
+
+ sctx->last_primitive_restart_en = primitive_restart;
+ }
+ if (si_prim_restart_index_changed(sctx, primitive_restart, info->restart_index)) {
+ radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, info->restart_index);
+ sctx->last_restart_index = info->restart_index;
+ sctx->context_roll = true;
+ }
}
-static void si_emit_draw_packets(struct si_context *sctx,
- const struct pipe_draw_info *info,
- struct pipe_resource *indexbuf,
- unsigned index_size,
- unsigned index_offset,
- unsigned instance_count,
- bool dispatch_prim_discard_cs,
- unsigned original_index_size)
+static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info,
+ struct pipe_resource *indexbuf, unsigned index_size,
+ unsigned index_offset, unsigned instance_count,
+ bool dispatch_prim_discard_cs, unsigned original_index_size)
{
- struct pipe_draw_indirect_info *indirect = info->indirect;
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX];
- bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off;
- uint32_t index_max_size = 0;
- uint64_t index_va = 0;
-
- if (info->count_from_stream_output) {
- struct si_streamout_target *t =
- (struct si_streamout_target*)info->count_from_stream_output;
-
- radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
- t->stride_in_dw);
- si_cp_copy_data(sctx, sctx->gfx_cs,
- COPY_DATA_REG, NULL,
- R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2,
- COPY_DATA_SRC_MEM, t->buf_filled_size,
- t->buf_filled_size_offset);
- }
-
- /* draw packet */
- if (index_size) {
- if (index_size != sctx->last_index_size) {
- unsigned index_type;
-
- /* index type */
- switch (index_size) {
- case 1:
- index_type = V_028A7C_VGT_INDEX_8;
- break;
- case 2:
- index_type = V_028A7C_VGT_INDEX_16 |
- (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ?
- V_028A7C_VGT_DMA_SWAP_16_BIT : 0);
- break;
- case 4:
- index_type = V_028A7C_VGT_INDEX_32 |
- (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ?
- V_028A7C_VGT_DMA_SWAP_32_BIT : 0);
- break;
- default:
- assert(!"unreachable");
- return;
- }
-
- if (sctx->chip_class >= GFX9) {
- radeon_set_uconfig_reg_idx(cs, sctx->screen,
- R_03090C_VGT_INDEX_TYPE, 2,
- index_type);
- } else {
- radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
- radeon_emit(cs, index_type);
- }
-
- sctx->last_index_size = index_size;
- }
-
- if (original_index_size) {
- index_max_size = (indexbuf->width0 - index_offset) /
- original_index_size;
- /* Skip draw calls with 0-sized index buffers.
- * They cause a hang on some chips, like Navi10-14.
- */
- if (!index_max_size)
- return;
-
- index_va = si_resource(indexbuf)->gpu_address + index_offset;
-
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- si_resource(indexbuf),
- RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
- }
- } else {
- /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
- * so the state must be re-emitted before the next indexed draw.
- */
- if (sctx->chip_class >= GFX7)
- sctx->last_index_size = -1;
- }
-
- if (indirect) {
- uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address;
-
- assert(indirect_va % 8 == 0);
-
- si_invalidate_draw_sh_constants(sctx);
-
- radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
- radeon_emit(cs, 1);
- radeon_emit(cs, indirect_va);
- radeon_emit(cs, indirect_va >> 32);
-
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- si_resource(indirect->buffer),
- RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
-
- unsigned di_src_sel = index_size ? V_0287F0_DI_SRC_SEL_DMA
- : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
-
- assert(indirect->offset % 4 == 0);
-
- if (index_size) {
- radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
- radeon_emit(cs, index_va);
- radeon_emit(cs, index_va >> 32);
-
- radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
- radeon_emit(cs, index_max_size);
- }
-
- if (!sctx->screen->has_draw_indirect_multi) {
- radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT
- : PKT3_DRAW_INDIRECT,
- 3, render_cond_bit));
- radeon_emit(cs, indirect->offset);
- radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
- radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
- radeon_emit(cs, di_src_sel);
- } else {
- uint64_t count_va = 0;
-
- if (indirect->indirect_draw_count) {
- struct si_resource *params_buf =
- si_resource(indirect->indirect_draw_count);
-
- radeon_add_to_buffer_list(
- sctx, sctx->gfx_cs, params_buf,
- RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
-
- count_va = params_buf->gpu_address + indirect->indirect_draw_count_offset;
- }
-
- radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
- PKT3_DRAW_INDIRECT_MULTI,
- 8, render_cond_bit));
- radeon_emit(cs, indirect->offset);
- radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
- radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
- radeon_emit(cs, ((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) |
- S_2C3_DRAW_INDEX_ENABLE(1) |
- S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count));
- radeon_emit(cs, indirect->draw_count);
- radeon_emit(cs, count_va);
- radeon_emit(cs, count_va >> 32);
- radeon_emit(cs, indirect->stride);
- radeon_emit(cs, di_src_sel);
- }
- } else {
- int base_vertex;
-
- if (sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN ||
- sctx->last_instance_count != instance_count) {
- radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
- radeon_emit(cs, instance_count);
- sctx->last_instance_count = instance_count;
- }
-
- /* Base vertex and start instance. */
- base_vertex = original_index_size ? info->index_bias : info->start;
-
- if (sctx->num_vs_blit_sgprs) {
- /* Re-emit draw constants after we leave u_blitter. */
- si_invalidate_draw_sh_constants(sctx);
-
- /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */
- radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4,
- sctx->num_vs_blit_sgprs);
- radeon_emit_array(cs, sctx->vs_blit_sh_data,
- sctx->num_vs_blit_sgprs);
- } else if (base_vertex != sctx->last_base_vertex ||
- sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
- info->start_instance != sctx->last_start_instance ||
- info->drawid != sctx->last_drawid ||
- sh_base_reg != sctx->last_sh_base_reg) {
- radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3);
- radeon_emit(cs, base_vertex);
- radeon_emit(cs, info->start_instance);
- radeon_emit(cs, info->drawid);
-
- sctx->last_base_vertex = base_vertex;
- sctx->last_start_instance = info->start_instance;
- sctx->last_drawid = info->drawid;
- sctx->last_sh_base_reg = sh_base_reg;
- }
-
- if (index_size) {
- if (dispatch_prim_discard_cs) {
- index_va += info->start * original_index_size;
- index_max_size = MIN2(index_max_size, info->count);
-
- si_dispatch_prim_discard_cs_and_draw(sctx, info,
- original_index_size,
- base_vertex,
- index_va, index_max_size);
- return;
- }
-
- index_va += info->start * index_size;
-
- radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
- radeon_emit(cs, index_max_size);
- radeon_emit(cs, index_va);
- radeon_emit(cs, index_va >> 32);
- radeon_emit(cs, info->count);
- radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
- } else {
- radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
- radeon_emit(cs, info->count);
- radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
- S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
- }
- }
+ struct pipe_draw_indirect_info *indirect = info->indirect;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX];
+ bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off;
+ uint32_t index_max_size = 0;
+ uint64_t index_va = 0;
+
+ if (info->count_from_stream_output) {
+ struct si_streamout_target *t = (struct si_streamout_target *)info->count_from_stream_output;
+
+ radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, t->stride_in_dw);
+ si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_REG, NULL,
+ R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, COPY_DATA_SRC_MEM,
+ t->buf_filled_size, t->buf_filled_size_offset);
+ }
+
+ /* draw packet */
+ if (index_size) {
+ if (index_size != sctx->last_index_size) {
+ unsigned index_type;
+
+ /* index type */
+ switch (index_size) {
+ case 1:
+ index_type = V_028A7C_VGT_INDEX_8;
+ break;
+ case 2:
+ index_type =
+ V_028A7C_VGT_INDEX_16 |
+ (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? V_028A7C_VGT_DMA_SWAP_16_BIT : 0);
+ break;
+ case 4:
+ index_type =
+ V_028A7C_VGT_INDEX_32 |
+ (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? V_028A7C_VGT_DMA_SWAP_32_BIT : 0);
+ break;
+ default:
+ assert(!"unreachable");
+ return;
+ }
+
+ if (sctx->chip_class >= GFX9) {
+ radeon_set_uconfig_reg_idx(cs, sctx->screen, R_03090C_VGT_INDEX_TYPE, 2, index_type);
+ } else {
+ radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
+ radeon_emit(cs, index_type);
+ }
+
+ sctx->last_index_size = index_size;
+ }
+
+ if (original_index_size) {
+ index_max_size = (indexbuf->width0 - index_offset) / original_index_size;
+ /* Skip draw calls with 0-sized index buffers.
+ * They cause a hang on some chips, like Navi10-14.
+ */
+ if (!index_max_size)
+ return;
+
+ index_va = si_resource(indexbuf)->gpu_address + index_offset;
+
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ,
+ RADEON_PRIO_INDEX_BUFFER);
+ }
+ } else {
+ /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
+ * so the state must be re-emitted before the next indexed draw.
+ */
+ if (sctx->chip_class >= GFX7)
+ sctx->last_index_size = -1;
+ }
+
+ if (indirect) {
+ uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address;
+
+ assert(indirect_va % 8 == 0);
+
+ si_invalidate_draw_sh_constants(sctx);
+
+ radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
+ radeon_emit(cs, 1);
+ radeon_emit(cs, indirect_va);
+ radeon_emit(cs, indirect_va >> 32);
+
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(indirect->buffer),
+ RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
+
+ unsigned di_src_sel = index_size ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
+
+ assert(indirect->offset % 4 == 0);
+
+ if (index_size) {
+ radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
+ radeon_emit(cs, index_va);
+ radeon_emit(cs, index_va >> 32);
+
+ radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
+ radeon_emit(cs, index_max_size);
+ }
+
+ if (!sctx->screen->has_draw_indirect_multi) {
+ radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3,
+ render_cond_bit));
+ radeon_emit(cs, indirect->offset);
+ radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
+ radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
+ radeon_emit(cs, di_src_sel);
+ } else {
+ uint64_t count_va = 0;
+
+ if (indirect->indirect_draw_count) {
+ struct si_resource *params_buf = si_resource(indirect->indirect_draw_count);
+
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, params_buf, RADEON_USAGE_READ,
+ RADEON_PRIO_DRAW_INDIRECT);
+
+ count_va = params_buf->gpu_address + indirect->indirect_draw_count_offset;
+ }
+
+ radeon_emit(cs,
+ PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8,
+ render_cond_bit));
+ radeon_emit(cs, indirect->offset);
+ radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
+ radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
+ radeon_emit(cs, ((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) |
+ S_2C3_DRAW_INDEX_ENABLE(1) |
+ S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count));
+ radeon_emit(cs, indirect->draw_count);
+ radeon_emit(cs, count_va);
+ radeon_emit(cs, count_va >> 32);
+ radeon_emit(cs, indirect->stride);
+ radeon_emit(cs, di_src_sel);
+ }
+ } else {
+ int base_vertex;
+
+ if (sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN ||
+ sctx->last_instance_count != instance_count) {
+ radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
+ radeon_emit(cs, instance_count);
+ sctx->last_instance_count = instance_count;
+ }
+
+ /* Base vertex and start instance. */
+ base_vertex = original_index_size ? info->index_bias : info->start;
+
+ if (sctx->num_vs_blit_sgprs) {
+ /* Re-emit draw constants after we leave u_blitter. */
+ si_invalidate_draw_sh_constants(sctx);
+
+ /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */
+ radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, sctx->num_vs_blit_sgprs);
+ radeon_emit_array(cs, sctx->vs_blit_sh_data, sctx->num_vs_blit_sgprs);
+ } else if (base_vertex != sctx->last_base_vertex ||
+ sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
+ info->start_instance != sctx->last_start_instance ||
+ info->drawid != sctx->last_drawid || sh_base_reg != sctx->last_sh_base_reg) {
+ radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3);
+ radeon_emit(cs, base_vertex);
+ radeon_emit(cs, info->start_instance);
+ radeon_emit(cs, info->drawid);
+
+ sctx->last_base_vertex = base_vertex;
+ sctx->last_start_instance = info->start_instance;
+ sctx->last_drawid = info->drawid;
+ sctx->last_sh_base_reg = sh_base_reg;
+ }
+
+ if (index_size) {
+ if (dispatch_prim_discard_cs) {
+ index_va += info->start * original_index_size;
+ index_max_size = MIN2(index_max_size, info->count);
+
+ si_dispatch_prim_discard_cs_and_draw(sctx, info, original_index_size, base_vertex,
+ index_va, index_max_size);
+ return;
+ }
+
+ index_va += info->start * index_size;
+
+ radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
+ radeon_emit(cs, index_max_size);
+ radeon_emit(cs, index_va);
+ radeon_emit(cs, index_va >> 32);
+ radeon_emit(cs, info->count);
+ radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
+ } else {
+ radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
+ radeon_emit(cs, info->count);
+ radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
+ S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
+ }
+ }
}
-void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
- unsigned cp_coher_cntl)
+void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
{
- bool compute_ib = !sctx->has_graphics ||
- cs == sctx->prim_discard_compute_cs;
-
- assert(sctx->chip_class <= GFX9);
-
- if (sctx->chip_class == GFX9 || compute_ib) {
- /* Flush caches and wait for the caches to assert idle. */
- radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0));
- radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
- radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
- radeon_emit(cs, 0); /* CP_COHER_BASE */
- radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
- radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
- } else {
- /* ACQUIRE_MEM is only required on a compute ring. */
- radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
- radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
- radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cs, 0); /* CP_COHER_BASE */
- radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
- }
-
- /* ACQUIRE_MEM has an implicit context roll if the current context
- * is busy. */
- if (!compute_ib)
- sctx->context_roll = true;
+ bool compute_ib = !sctx->has_graphics || cs == sctx->prim_discard_compute_cs;
+
+ assert(sctx->chip_class <= GFX9);
+
+ if (sctx->chip_class == GFX9 || compute_ib) {
+ /* Flush caches and wait for the caches to assert idle. */
+ radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0));
+ radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
+ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+ radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
+ radeon_emit(cs, 0); /* CP_COHER_BASE */
+ radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
+ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+ } else {
+ /* ACQUIRE_MEM is only required on a compute ring. */
+ radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
+ radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
+ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+ radeon_emit(cs, 0); /* CP_COHER_BASE */
+ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+ }
+
+ /* ACQUIRE_MEM has an implicit context roll if the current context
+ * is busy. */
+ if (!compute_ib)
+ sctx->context_roll = true;
}
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
{
- if (!si_compute_prim_discard_enabled(sctx))
- return;
-
- if (!sctx->barrier_buf) {
- u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4,
- &sctx->barrier_buf_offset,
- (struct pipe_resource**)&sctx->barrier_buf);
- }
-
- /* Emit a placeholder to signal the next compute IB to start.
- * See si_compute_prim_discard.c for explanation.
- */
- uint32_t signal = 1;
- si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset,
- 4, V_370_MEM, V_370_ME, &signal);
-
- sctx->last_pkt3_write_data =
- &sctx->gfx_cs->current.buf[sctx->gfx_cs->current.cdw - 5];
-
- /* Only the last occurence of WRITE_DATA will be executed.
- * The packet will be enabled in si_flush_gfx_cs.
- */
- *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
+ if (!si_compute_prim_discard_enabled(sctx))
+ return;
+
+ if (!sctx->barrier_buf) {
+ u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4, &sctx->barrier_buf_offset,
+ (struct pipe_resource **)&sctx->barrier_buf);
+ }
+
+ /* Emit a placeholder to signal the next compute IB to start.
+ * See si_compute_prim_discard.c for explanation.
+ */
+ uint32_t signal = 1;
+ si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, 4, V_370_MEM, V_370_ME,
+ &signal);
+
+ sctx->last_pkt3_write_data = &sctx->gfx_cs->current.buf[sctx->gfx_cs->current.cdw - 5];
+
+ /* Only the last occurrence of WRITE_DATA will be executed.
+ * The packet will be enabled in si_flush_gfx_cs.
+ */
+ *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
}
void gfx10_emit_cache_flush(struct si_context *ctx)
{
- struct radeon_cmdbuf *cs = ctx->gfx_cs;
- uint32_t gcr_cntl = 0;
- unsigned cb_db_event = 0;
- unsigned flags = ctx->flags;
-
- if (!ctx->has_graphics) {
- /* Only process compute flags. */
- flags &= SI_CONTEXT_INV_ICACHE |
- SI_CONTEXT_INV_SCACHE |
- SI_CONTEXT_INV_VCACHE |
- SI_CONTEXT_INV_L2 |
- SI_CONTEXT_WB_L2 |
- SI_CONTEXT_INV_L2_METADATA |
- SI_CONTEXT_CS_PARTIAL_FLUSH;
- }
-
- /* We don't need these. */
- assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC |
- SI_CONTEXT_FLUSH_AND_INV_DB_META)));
-
- if (flags & SI_CONTEXT_VGT_FLUSH) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
- }
-
- if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
- ctx->num_cb_cache_flushes++;
- if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
- ctx->num_db_cache_flushes++;
-
- if (flags & SI_CONTEXT_INV_ICACHE)
- gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
- if (flags & SI_CONTEXT_INV_SCACHE) {
- /* TODO: When writing to the SMEM L1 cache, we need to set SEQ
- * to FORWARD when both L1 and L2 are written out (WB or INV).
- */
- gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
- }
- if (flags & SI_CONTEXT_INV_VCACHE)
- gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
-
- /* The L2 cache ops are:
- * - INV: - invalidate lines that reflect memory (were loaded from memory)
- * - don't touch lines that were overwritten (were stored by gfx clients)
- * - WB: - don't touch lines that reflect memory
- * - write back lines that were overwritten
- * - WB | INV: - invalidate lines that reflect memory
- * - write back lines that were overwritten
- *
- * GLM doesn't support WB alone. If WB is set, INV must be set too.
- */
- if (flags & SI_CONTEXT_INV_L2) {
- /* Writeback and invalidate everything in L2. */
- gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) |
- S_586_GLM_INV(1) | S_586_GLM_WB(1);
- ctx->num_L2_invalidates++;
- } else if (flags & SI_CONTEXT_WB_L2) {
- gcr_cntl |= S_586_GL2_WB(1) |
- S_586_GLM_WB(1) | S_586_GLM_INV(1);
- } else if (flags & SI_CONTEXT_INV_L2_METADATA) {
- gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);
- }
-
- if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
- if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
- /* Flush CMASK/FMASK/DCC. Will wait for idle later. */
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) |
- EVENT_INDEX(0));
- }
- if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
- /* Flush HTILE. Will wait for idle later. */
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) |
- EVENT_INDEX(0));
- }
-
- /* First flush CB/DB, then L1/L2. */
- gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);
-
- if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) ==
- (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
- cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
- } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
- cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
- } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
- cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
- } else {
- assert(0);
- }
- } else {
- /* Wait for graphics shaders to go idle if requested. */
- if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
- /* Only count explicit shader flushes, not implicit ones. */
- ctx->num_vs_flushes++;
- ctx->num_ps_flushes++;
- } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
- ctx->num_vs_flushes++;
- }
- }
-
- if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));
- ctx->num_cs_flushes++;
- ctx->compute_is_busy = false;
- }
-
- if (cb_db_event) {
- /* CB/DB flush and invalidate (or possibly just a wait for a
- * meta flush) via RELEASE_MEM.
- *
- * Combine this with other cache flushes when possible; this
- * requires affected shaders to be idle, so do it after the
- * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always
- * implied).
- */
- uint64_t va;
-
- /* Do the flush (enqueue the event and wait for it). */
- va = ctx->wait_mem_scratch->gpu_address;
- ctx->wait_mem_number++;
-
- /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
- unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
- unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
- unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
- unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
- assert(G_586_GL2_US(gcr_cntl) == 0);
- assert(G_586_GL2_RANGE(gcr_cntl) == 0);
- assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
- unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
- unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
- unsigned gcr_seq = G_586_SEQ(gcr_cntl);
-
- gcr_cntl &= C_586_GLM_WB &
- C_586_GLM_INV &
- C_586_GLV_INV &
- C_586_GL1_INV &
- C_586_GL2_INV &
- C_586_GL2_WB; /* keep SEQ */
-
- si_cp_release_mem(ctx, cs, cb_db_event,
- S_490_GLM_WB(glm_wb) |
- S_490_GLM_INV(glm_inv) |
- S_490_GLV_INV(glv_inv) |
- S_490_GL1_INV(gl1_inv) |
- S_490_GL2_INV(gl2_inv) |
- S_490_GL2_WB(gl2_wb) |
- S_490_SEQ(gcr_seq),
- EOP_DST_SEL_MEM,
- EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
- EOP_DATA_SEL_VALUE_32BIT,
- ctx->wait_mem_scratch, va,
- ctx->wait_mem_number, SI_NOT_QUERY);
- si_cp_wait_mem(ctx, ctx->gfx_cs, va, ctx->wait_mem_number, 0xffffffff,
- WAIT_REG_MEM_EQUAL);
- }
-
- /* Ignore fields that only modify the behavior of other fields. */
- if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
- /* Flush caches and wait for the caches to assert idle.
- * The cache flush is executed in the ME, but the PFP waits
- * for completion.
- */
- radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
- radeon_emit(cs, 0); /* CP_COHER_CNTL */
- radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
- radeon_emit(cs, 0); /* CP_COHER_BASE */
- radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
- radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
- radeon_emit(cs, gcr_cntl); /* GCR_CNTL */
- } else if (cb_db_event ||
- (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH |
- SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH))) {
- /* We need to ensure that PFP waits as well. */
- radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
- radeon_emit(cs, 0);
- }
-
- if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
- EVENT_INDEX(0));
- } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) |
- EVENT_INDEX(0));
- }
-
- ctx->flags = 0;
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+ uint32_t gcr_cntl = 0;
+ unsigned cb_db_event = 0;
+ unsigned flags = ctx->flags;
+
+ if (!ctx->has_graphics) {
+ /* Only process compute flags. */
+ flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
+ SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
+ SI_CONTEXT_CS_PARTIAL_FLUSH;
+ }
+
+ /* We don't need these. */
+ assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META)));
+
+ if (flags & SI_CONTEXT_VGT_FLUSH) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+ }
+
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
+ ctx->num_cb_cache_flushes++;
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
+ ctx->num_db_cache_flushes++;
+
+ if (flags & SI_CONTEXT_INV_ICACHE)
+ gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
+ if (flags & SI_CONTEXT_INV_SCACHE) {
+ /* TODO: When writing to the SMEM L1 cache, we need to set SEQ
+ * to FORWARD when both L1 and L2 are written out (WB or INV).
+ */
+ gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
+ }
+ if (flags & SI_CONTEXT_INV_VCACHE)
+ gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
+
+ /* The L2 cache ops are:
+ * - INV: - invalidate lines that reflect memory (were loaded from memory)
+ * - don't touch lines that were overwritten (were stored by gfx clients)
+ * - WB: - don't touch lines that reflect memory
+ * - write back lines that were overwritten
+ * - WB | INV: - invalidate lines that reflect memory
+ * - write back lines that were overwritten
+ *
+ * GLM doesn't support WB alone. If WB is set, INV must be set too.
+ */
+ if (flags & SI_CONTEXT_INV_L2) {
+ /* Writeback and invalidate everything in L2. */
+ gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1);
+ ctx->num_L2_invalidates++;
+ } else if (flags & SI_CONTEXT_WB_L2) {
+ gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1);
+ } else if (flags & SI_CONTEXT_INV_L2_METADATA) {
+ gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);
+ }
+
+ if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+ /* Flush CMASK/FMASK/DCC. Will wait for idle later. */
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
+ }
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
+ /* Flush HTILE. Will wait for idle later. */
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
+ }
+
+ /* First flush CB/DB, then L1/L2. */
+ gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);
+
+ if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) ==
+ (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
+ cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
+ } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+ cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+ } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
+ cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
+ } else {
+ assert(0);
+ }
+ } else {
+ /* Wait for graphics shaders to go idle if requested. */
+ if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ /* Only count explicit shader flushes, not implicit ones. */
+ ctx->num_vs_flushes++;
+ ctx->num_ps_flushes++;
+ } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ ctx->num_vs_flushes++;
+ }
+ }
+
+ if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ ctx->num_cs_flushes++;
+ ctx->compute_is_busy = false;
+ }
+
+ if (cb_db_event) {
+ /* CB/DB flush and invalidate (or possibly just a wait for a
+ * meta flush) via RELEASE_MEM.
+ *
+ * Combine this with other cache flushes when possible; this
+ * requires affected shaders to be idle, so do it after the
+ * CS_PARTIAL_FLUSH emitted above (VS/PS partial flushes are always
+ * implied).
+ */
+ uint64_t va;
+
+ /* Do the flush (enqueue the event and wait for it). */
+ va = ctx->wait_mem_scratch->gpu_address;
+ ctx->wait_mem_number++;
+
+ /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
+ unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
+ unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
+ unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
+ unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
+ assert(G_586_GL2_US(gcr_cntl) == 0);
+ assert(G_586_GL2_RANGE(gcr_cntl) == 0);
+ assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
+ unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
+ unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
+ unsigned gcr_seq = G_586_SEQ(gcr_cntl);
+
+ gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV &
+ C_586_GL2_WB; /* keep SEQ */
+
+ si_cp_release_mem(ctx, cs, cb_db_event,
+ S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
+ S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
+ S_490_SEQ(gcr_seq),
+ EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
+ EOP_DATA_SEL_VALUE_32BIT, ctx->wait_mem_scratch, va, ctx->wait_mem_number,
+ SI_NOT_QUERY);
+ si_cp_wait_mem(ctx, ctx->gfx_cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
+ }
+
+ /* Ignore fields that only modify the behavior of other fields. */
+ if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
+ /* Flush caches and wait for the caches to assert idle.
+ * The cache flush is executed in the ME, but the PFP waits
+ * for completion.
+ */
+ radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
+ radeon_emit(cs, 0); /* CP_COHER_CNTL */
+ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+ radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
+ radeon_emit(cs, 0); /* CP_COHER_BASE */
+ radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
+ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+ radeon_emit(cs, gcr_cntl); /* GCR_CNTL */
+ } else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_CS_PARTIAL_FLUSH))) {
+ /* We need to ensure that PFP waits as well. */
+ radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+ radeon_emit(cs, 0);
+ }
+
+ if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
+ } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
+ }
+
+ ctx->flags = 0;
}
void si_emit_cache_flush(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- uint32_t flags = sctx->flags;
-
- if (!sctx->has_graphics) {
- /* Only process compute flags. */
- flags &= SI_CONTEXT_INV_ICACHE |
- SI_CONTEXT_INV_SCACHE |
- SI_CONTEXT_INV_VCACHE |
- SI_CONTEXT_INV_L2 |
- SI_CONTEXT_WB_L2 |
- SI_CONTEXT_INV_L2_METADATA |
- SI_CONTEXT_CS_PARTIAL_FLUSH;
- }
-
- uint32_t cp_coher_cntl = 0;
- const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
- SI_CONTEXT_FLUSH_AND_INV_DB);
- const bool is_barrier = flush_cb_db ||
- /* INV_ICACHE == beginning of gfx IB. Checking
- * INV_ICACHE fixes corruption for DeusExMD with
- * compute-based culling, but I don't know why.
- */
- flags & (SI_CONTEXT_INV_ICACHE |
- SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_VS_PARTIAL_FLUSH) ||
- (flags & SI_CONTEXT_CS_PARTIAL_FLUSH &&
- sctx->compute_is_busy);
-
- assert(sctx->chip_class <= GFX9);
-
- if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
- sctx->num_cb_cache_flushes++;
- if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
- sctx->num_db_cache_flushes++;
-
- /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
- * bit is set. An alternative way is to write SQC_CACHES, but that
- * doesn't seem to work reliably. Since the bug doesn't affect
- * correctness (it only does more work than necessary) and
- * the performance impact is likely negligible, there is no plan
- * to add a workaround for it.
- */
-
- if (flags & SI_CONTEXT_INV_ICACHE)
- cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
- if (flags & SI_CONTEXT_INV_SCACHE)
- cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
-
- if (sctx->chip_class <= GFX8) {
- if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
- cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
- S_0085F0_CB0_DEST_BASE_ENA(1) |
- S_0085F0_CB1_DEST_BASE_ENA(1) |
- S_0085F0_CB2_DEST_BASE_ENA(1) |
- S_0085F0_CB3_DEST_BASE_ENA(1) |
- S_0085F0_CB4_DEST_BASE_ENA(1) |
- S_0085F0_CB5_DEST_BASE_ENA(1) |
- S_0085F0_CB6_DEST_BASE_ENA(1) |
- S_0085F0_CB7_DEST_BASE_ENA(1);
-
- /* Necessary for DCC */
- if (sctx->chip_class == GFX8)
- si_cp_release_mem(sctx, cs,
- V_028A90_FLUSH_AND_INV_CB_DATA_TS,
- 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
- EOP_DATA_SEL_DISCARD, NULL,
- 0, 0, SI_NOT_QUERY);
- }
- if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
- cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
- S_0085F0_DB_DEST_BASE_ENA(1);
- }
-
- if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
- /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
- }
- if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB |
- SI_CONTEXT_FLUSH_AND_INV_DB_META)) {
- /* Flush HTILE. SURFACE_SYNC will wait for idle. */
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
- }
-
- /* Wait for shader engines to go idle.
- * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
- * for everything including CB/DB cache flushes.
- */
- if (!flush_cb_db) {
- if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
- /* Only count explicit shader flushes, not implicit ones
- * done by SURFACE_SYNC.
- */
- sctx->num_vs_flushes++;
- sctx->num_ps_flushes++;
- } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
- sctx->num_vs_flushes++;
- }
- }
-
- if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH &&
- sctx->compute_is_busy) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
- sctx->num_cs_flushes++;
- sctx->compute_is_busy = false;
- }
-
- /* VGT state synchronization. */
- if (flags & SI_CONTEXT_VGT_FLUSH) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
- }
- if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
- }
-
- /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
- * wait for idle on GFX9. We have to use a TS event.
- */
- if (sctx->chip_class == GFX9 && flush_cb_db) {
- uint64_t va;
- unsigned tc_flags, cb_db_event;
-
- /* Set the CB/DB flush event. */
- switch (flush_cb_db) {
- case SI_CONTEXT_FLUSH_AND_INV_CB:
- cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
- break;
- case SI_CONTEXT_FLUSH_AND_INV_DB:
- cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
- break;
- default:
- /* both CB & DB */
- cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
- }
-
- /* These are the only allowed combinations. If you need to
- * do multiple operations at once, do them separately.
- * All operations that invalidate L2 also seem to invalidate
- * metadata. Volatile (VOL) and WC flushes are not listed here.
- *
- * TC | TC_WB = writeback & invalidate L2 & L1
- * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
- * TC_WB | TC_NC = writeback L2 for MTYPE == NC
- * TC | TC_NC = invalidate L2 for MTYPE == NC
- * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.)
- * TCL1 = invalidate L1
- */
- tc_flags = 0;
-
- if (flags & SI_CONTEXT_INV_L2_METADATA) {
- tc_flags = EVENT_TC_ACTION_ENA |
- EVENT_TC_MD_ACTION_ENA;
- }
-
- /* Ideally flush TC together with CB/DB. */
- if (flags & SI_CONTEXT_INV_L2) {
- /* Writeback and invalidate everything in L2 & L1. */
- tc_flags = EVENT_TC_ACTION_ENA |
- EVENT_TC_WB_ACTION_ENA;
-
- /* Clear the flags. */
- flags &= ~(SI_CONTEXT_INV_L2 |
- SI_CONTEXT_WB_L2 |
- SI_CONTEXT_INV_VCACHE);
- sctx->num_L2_invalidates++;
- }
-
- /* Do the flush (enqueue the event and wait for it). */
- va = sctx->wait_mem_scratch->gpu_address;
- sctx->wait_mem_number++;
-
- si_cp_release_mem(sctx, cs, cb_db_event, tc_flags,
- EOP_DST_SEL_MEM,
- EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
- EOP_DATA_SEL_VALUE_32BIT,
- sctx->wait_mem_scratch, va,
- sctx->wait_mem_number, SI_NOT_QUERY);
- si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff,
- WAIT_REG_MEM_EQUAL);
- }
-
- /* Make sure ME is idle (it executes most packets) before continuing.
- * This prevents read-after-write hazards between PFP and ME.
- */
- if (sctx->has_graphics &&
- (cp_coher_cntl ||
- (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH |
- SI_CONTEXT_INV_VCACHE |
- SI_CONTEXT_INV_L2 |
- SI_CONTEXT_WB_L2)))) {
- radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
- radeon_emit(cs, 0);
- }
-
- /* GFX6-GFX8 only:
- * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
- * waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
- *
- * cp_coher_cntl should contain all necessary flags except TC flags
- * at this point.
- *
- * GFX6-GFX7 don't support L2 write-back.
- */
- if (flags & SI_CONTEXT_INV_L2 ||
- (sctx->chip_class <= GFX7 &&
- (flags & SI_CONTEXT_WB_L2))) {
- /* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
- * WB must be set on GFX8+ when TC_ACTION is set.
- */
- si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
- S_0085F0_TC_ACTION_ENA(1) |
- S_0085F0_TCL1_ACTION_ENA(1) |
- S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8));
- cp_coher_cntl = 0;
- sctx->num_L2_invalidates++;
- } else {
- /* L1 invalidation and L2 writeback must be done separately,
- * because both operations can't be done together.
- */
- if (flags & SI_CONTEXT_WB_L2) {
- /* WB = write-back
- * NC = apply to non-coherent MTYPEs
- * (i.e. MTYPE <= 1, which is what we use everywhere)
- *
- * WB doesn't work without NC.
- */
- si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
- S_0301F0_TC_WB_ACTION_ENA(1) |
- S_0301F0_TC_NC_ACTION_ENA(1));
- cp_coher_cntl = 0;
- sctx->num_L2_writebacks++;
- }
- if (flags & SI_CONTEXT_INV_VCACHE) {
- /* Invalidate per-CU VMEM L1. */
- si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
- S_0085F0_TCL1_ACTION_ENA(1));
- cp_coher_cntl = 0;
- }
- }
-
- /* If TC flushes haven't cleared this... */
- if (cp_coher_cntl)
- si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl);
-
- if (is_barrier)
- si_prim_discard_signal_next_compute_ib_start(sctx);
-
- if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
- EVENT_INDEX(0));
- } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) |
- EVENT_INDEX(0));
- }
-
- sctx->flags = 0;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ uint32_t flags = sctx->flags;
+
+ if (!sctx->has_graphics) {
+ /* Only process compute flags. */
+ flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
+ SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
+ SI_CONTEXT_CS_PARTIAL_FLUSH;
+ }
+
+ uint32_t cp_coher_cntl = 0;
+ const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB);
+ const bool is_barrier =
+ flush_cb_db ||
+ /* INV_ICACHE == beginning of gfx IB. Checking
+ * INV_ICACHE fixes corruption for DeusExMD with
+ * compute-based culling, but I don't know why.
+ */
+ flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) ||
+ (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy);
+
+ assert(sctx->chip_class <= GFX9);
+
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
+ sctx->num_cb_cache_flushes++;
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
+ sctx->num_db_cache_flushes++;
+
+ /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
+ * bit is set. An alternative way is to write SQC_CACHES, but that
+ * doesn't seem to work reliably. Since the bug doesn't affect
+ * correctness (it only does more work than necessary) and
+ * the performance impact is likely negligible, there is no plan
+ * to add a workaround for it.
+ */
+
+ if (flags & SI_CONTEXT_INV_ICACHE)
+ cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
+ if (flags & SI_CONTEXT_INV_SCACHE)
+ cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
+
+ if (sctx->chip_class <= GFX8) {
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+ cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) |
+ S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) |
+ S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) |
+ S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) |
+ S_0085F0_CB7_DEST_BASE_ENA(1);
+
+ /* Necessary for DCC */
+ if (sctx->chip_class == GFX8)
+ si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM,
+ EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY);
+ }
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
+ cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);
+ }
+
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+ /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
+ }
+ if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) {
+ /* Flush HTILE. SURFACE_SYNC will wait for idle. */
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
+ }
+
+ /* Wait for shader engines to go idle.
+ * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
+ * for everything including CB/DB cache flushes.
+ */
+ if (!flush_cb_db) {
+ if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ /* Only count explicit shader flushes, not implicit ones
+ * done by SURFACE_SYNC.
+ */
+ sctx->num_vs_flushes++;
+ sctx->num_ps_flushes++;
+ } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ sctx->num_vs_flushes++;
+ }
+ }
+
+ if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ sctx->num_cs_flushes++;
+ sctx->compute_is_busy = false;
+ }
+
+ /* VGT state synchronization. */
+ if (flags & SI_CONTEXT_VGT_FLUSH) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+ }
+ if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
+ }
+
+ /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
+ * wait for idle on GFX9. We have to use a TS event.
+ */
+ if (sctx->chip_class == GFX9 && flush_cb_db) {
+ uint64_t va;
+ unsigned tc_flags, cb_db_event;
+
+ /* Set the CB/DB flush event. */
+ switch (flush_cb_db) {
+ case SI_CONTEXT_FLUSH_AND_INV_CB:
+ cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+ break;
+ case SI_CONTEXT_FLUSH_AND_INV_DB:
+ cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
+ break;
+ default:
+ /* both CB & DB */
+ cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
+ }
+
+ /* These are the only allowed combinations. If you need to
+ * do multiple operations at once, do them separately.
+ * All operations that invalidate L2 also seem to invalidate
+ * metadata. Volatile (VOL) and WC flushes are not listed here.
+ *
+ * TC | TC_WB = writeback & invalidate L2 & L1
+ * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
+ * TC_WB | TC_NC = writeback L2 for MTYPE == NC
+ * TC | TC_NC = invalidate L2 for MTYPE == NC
+ * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.)
+ * TCL1 = invalidate L1
+ */
+ tc_flags = 0;
+
+ if (flags & SI_CONTEXT_INV_L2_METADATA) {
+ tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA;
+ }
+
+ /* Ideally flush TC together with CB/DB. */
+ if (flags & SI_CONTEXT_INV_L2) {
+ /* Writeback and invalidate everything in L2 & L1. */
+ tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA;
+
+ /* Clear the flags. */
+ flags &= ~(SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_VCACHE);
+ sctx->num_L2_invalidates++;
+ }
+
+ /* Do the flush (enqueue the event and wait for it). */
+ va = sctx->wait_mem_scratch->gpu_address;
+ sctx->wait_mem_number++;
+
+ si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM,
+ EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT,
+ sctx->wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY);
+ si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
+ }
+
+ /* Make sure ME is idle (it executes most packets) before continuing.
+ * This prevents read-after-write hazards between PFP and ME.
+ */
+ if (sctx->has_graphics &&
+ (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE |
+ SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) {
+ radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+ radeon_emit(cs, 0);
+ }
+
+ /* GFX6-GFX8 only:
+ * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
+ * waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
+ *
+ * cp_coher_cntl should contain all necessary flags except TC flags
+ * at this point.
+ *
+ * GFX6-GFX7 don't support L2 write-back.
+ */
+ if (flags & SI_CONTEXT_INV_L2 || (sctx->chip_class <= GFX7 && (flags & SI_CONTEXT_WB_L2))) {
+ /* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
+ * WB must be set on GFX8+ when TC_ACTION is set.
+ */
+ si_emit_surface_sync(sctx, sctx->gfx_cs,
+ cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
+ S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8));
+ cp_coher_cntl = 0;
+ sctx->num_L2_invalidates++;
+ } else {
+ /* L1 invalidation and L2 writeback must be done separately,
+ * because both operations can't be done together.
+ */
+ if (flags & SI_CONTEXT_WB_L2) {
+ /* WB = write-back
+ * NC = apply to non-coherent MTYPEs
+ * (i.e. MTYPE <= 1, which is what we use everywhere)
+ *
+ * WB doesn't work without NC.
+ */
+ si_emit_surface_sync(
+ sctx, sctx->gfx_cs,
+ cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
+ cp_coher_cntl = 0;
+ sctx->num_L2_writebacks++;
+ }
+ if (flags & SI_CONTEXT_INV_VCACHE) {
+ /* Invalidate per-CU VMEM L1. */
+ si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1));
+ cp_coher_cntl = 0;
+ }
+ }
+
+ /* If TC flushes haven't cleared this... */
+ if (cp_coher_cntl)
+ si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl);
+
+ if (is_barrier)
+ si_prim_discard_signal_next_compute_ib_start(sctx);
+
+ if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
+ } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
+ }
+
+ sctx->flags = 0;
}
-static void si_get_draw_start_count(struct si_context *sctx,
- const struct pipe_draw_info *info,
- unsigned *start, unsigned *count)
+static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_draw_info *info,
+ unsigned *start, unsigned *count)
{
- struct pipe_draw_indirect_info *indirect = info->indirect;
-
- if (indirect) {
- unsigned indirect_count;
- struct pipe_transfer *transfer;
- unsigned begin, end;
- unsigned map_size;
- unsigned *data;
-
- if (indirect->indirect_draw_count) {
- data = pipe_buffer_map_range(&sctx->b,
- indirect->indirect_draw_count,
- indirect->indirect_draw_count_offset,
- sizeof(unsigned),
- PIPE_TRANSFER_READ, &transfer);
-
- indirect_count = *data;
-
- pipe_buffer_unmap(&sctx->b, transfer);
- } else {
- indirect_count = indirect->draw_count;
- }
-
- if (!indirect_count) {
- *start = *count = 0;
- return;
- }
-
- map_size = (indirect_count - 1) * indirect->stride + 3 * sizeof(unsigned);
- data = pipe_buffer_map_range(&sctx->b, indirect->buffer,
- indirect->offset, map_size,
- PIPE_TRANSFER_READ, &transfer);
-
- begin = UINT_MAX;
- end = 0;
-
- for (unsigned i = 0; i < indirect_count; ++i) {
- unsigned count = data[0];
- unsigned start = data[2];
-
- if (count > 0) {
- begin = MIN2(begin, start);
- end = MAX2(end, start + count);
- }
-
- data += indirect->stride / sizeof(unsigned);
- }
-
- pipe_buffer_unmap(&sctx->b, transfer);
-
- if (begin < end) {
- *start = begin;
- *count = end - begin;
- } else {
- *start = *count = 0;
- }
- } else {
- *start = info->start;
- *count = info->count;
- }
+ struct pipe_draw_indirect_info *indirect = info->indirect;
+
+ if (indirect) {
+ unsigned indirect_count;
+ struct pipe_transfer *transfer;
+ unsigned begin, end;
+ unsigned map_size;
+ unsigned *data;
+
+ if (indirect->indirect_draw_count) {
+ data = pipe_buffer_map_range(&sctx->b, indirect->indirect_draw_count,
+ indirect->indirect_draw_count_offset, sizeof(unsigned),
+ PIPE_TRANSFER_READ, &transfer);
+
+ indirect_count = *data;
+
+ pipe_buffer_unmap(&sctx->b, transfer);
+ } else {
+ indirect_count = indirect->draw_count;
+ }
+
+ if (!indirect_count) {
+ *start = *count = 0;
+ return;
+ }
+
+ map_size = (indirect_count - 1) * indirect->stride + 3 * sizeof(unsigned);
+ data = pipe_buffer_map_range(&sctx->b, indirect->buffer, indirect->offset, map_size,
+ PIPE_TRANSFER_READ, &transfer);
+
+ begin = UINT_MAX;
+ end = 0;
+
+ for (unsigned i = 0; i < indirect_count; ++i) {
+ unsigned count = data[0];
+ unsigned start = data[2];
+
+ if (count > 0) {
+ begin = MIN2(begin, start);
+ end = MAX2(end, start + count);
+ }
+
+ data += indirect->stride / sizeof(unsigned);
+ }
+
+ pipe_buffer_unmap(&sctx->b, transfer);
+
+ if (begin < end) {
+ *start = begin;
+ *count = end - begin;
+ } else {
+ *start = *count = 0;
+ }
+ } else {
+ *start = info->start;
+ *count = info->count;
+ }
}
static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
- enum pipe_prim_type prim, unsigned instance_count,
- bool primitive_restart, unsigned skip_atom_mask)
+ enum pipe_prim_type prim, unsigned instance_count,
+ bool primitive_restart, unsigned skip_atom_mask)
{
- unsigned num_patches = 0;
+ unsigned num_patches = 0;
- si_emit_rasterizer_prim_state(sctx);
- if (sctx->tes_shader.cso)
- si_emit_derived_tess_state(sctx, info, &num_patches);
+ si_emit_rasterizer_prim_state(sctx);
+ if (sctx->tes_shader.cso)
+ si_emit_derived_tess_state(sctx, info, &num_patches);
- /* Emit state atoms. */
- unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
- while (mask)
- sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);
+ /* Emit state atoms. */
+ unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
+ while (mask)
+ sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);
- sctx->dirty_atoms &= skip_atom_mask;
+ sctx->dirty_atoms &= skip_atom_mask;
- /* Emit states. */
- mask = sctx->dirty_states;
- while (mask) {
- unsigned i = u_bit_scan(&mask);
- struct si_pm4_state *state = sctx->queued.array[i];
+ /* Emit states. */
+ mask = sctx->dirty_states;
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+ struct si_pm4_state *state = sctx->queued.array[i];
- if (!state || sctx->emitted.array[i] == state)
- continue;
+ if (!state || sctx->emitted.array[i] == state)
+ continue;
- si_pm4_emit(sctx, state);
- sctx->emitted.array[i] = state;
- }
- sctx->dirty_states = 0;
+ si_pm4_emit(sctx, state);
+ sctx->emitted.array[i] = state;
+ }
+ sctx->dirty_states = 0;
- /* Emit draw states. */
- si_emit_vs_state(sctx, info);
- si_emit_draw_registers(sctx, info, prim, num_patches, instance_count,
- primitive_restart);
+ /* Emit draw states. */
+ si_emit_vs_state(sctx, info);
+ si_emit_draw_registers(sctx, info, prim, num_patches, instance_count, primitive_restart);
}
-static bool
-si_all_vs_resources_read_only(struct si_context *sctx,
- struct pipe_resource *indexbuf)
+static bool si_all_vs_resources_read_only(struct si_context *sctx, struct pipe_resource *indexbuf)
{
- struct radeon_winsys *ws = sctx->ws;
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
-
- /* Index buffer. */
- if (indexbuf &&
- ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf,
- RADEON_USAGE_WRITE))
- goto has_write_reference;
-
- /* Vertex buffers. */
- struct si_vertex_elements *velems = sctx->vertex_elements;
- unsigned num_velems = velems->count;
-
- for (unsigned i = 0; i < num_velems; i++) {
- if (!((1 << i) & velems->first_vb_use_mask))
- continue;
-
- unsigned vb_index = velems->vertex_buffer_index[i];
- struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource;
- if (!res)
- continue;
-
- if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
- RADEON_USAGE_WRITE))
- goto has_write_reference;
- }
-
- /* Constant and shader buffers. */
- struct si_descriptors *buffers =
- &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)];
- for (unsigned i = 0; i < buffers->num_active_slots; i++) {
- unsigned index = buffers->first_active_slot + i;
- struct pipe_resource *res =
- sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index];
- if (!res)
- continue;
-
- if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
- RADEON_USAGE_WRITE))
- goto has_write_reference;
- }
-
- /* Samplers. */
- struct si_shader_selector *vs = sctx->vs_shader.cso;
- if (vs->info.samplers_declared) {
- unsigned num_samplers = util_last_bit(vs->info.samplers_declared);
-
- for (unsigned i = 0; i < num_samplers; i++) {
- struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i];
- if (!view)
- continue;
-
- if (ws->cs_is_buffer_referenced(cs,
- si_resource(view->texture)->buf,
- RADEON_USAGE_WRITE))
- goto has_write_reference;
- }
- }
-
- /* Images. */
- if (vs->info.images_declared) {
- unsigned num_images = util_last_bit(vs->info.images_declared);
-
- for (unsigned i = 0; i < num_images; i++) {
- struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource;
- if (!res)
- continue;
-
- if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf,
- RADEON_USAGE_WRITE))
- goto has_write_reference;
- }
- }
-
- return true;
+ struct radeon_winsys *ws = sctx->ws;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ /* Index buffer. */
+ if (indexbuf && ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, RADEON_USAGE_WRITE))
+ goto has_write_reference;
+
+ /* Vertex buffers. */
+ struct si_vertex_elements *velems = sctx->vertex_elements;
+ unsigned num_velems = velems->count;
+
+ for (unsigned i = 0; i < num_velems; i++) {
+ if (!((1 << i) & velems->first_vb_use_mask))
+ continue;
+
+ unsigned vb_index = velems->vertex_buffer_index[i];
+ struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource;
+ if (!res)
+ continue;
+
+ if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
+ goto has_write_reference;
+ }
+
+ /* Constant and shader buffers. */
+ struct si_descriptors *buffers =
+ &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)];
+ for (unsigned i = 0; i < buffers->num_active_slots; i++) {
+ unsigned index = buffers->first_active_slot + i;
+ struct pipe_resource *res = sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index];
+ if (!res)
+ continue;
+
+ if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
+ goto has_write_reference;
+ }
+
+ /* Samplers. */
+ struct si_shader_selector *vs = sctx->vs_shader.cso;
+ if (vs->info.samplers_declared) {
+ unsigned num_samplers = util_last_bit(vs->info.samplers_declared);
+
+ for (unsigned i = 0; i < num_samplers; i++) {
+ struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i];
+ if (!view)
+ continue;
+
+ if (ws->cs_is_buffer_referenced(cs, si_resource(view->texture)->buf, RADEON_USAGE_WRITE))
+ goto has_write_reference;
+ }
+ }
+
+ /* Images. */
+ if (vs->info.images_declared) {
+ unsigned num_images = util_last_bit(vs->info.images_declared);
+
+ for (unsigned i = 0; i < num_images; i++) {
+ struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource;
+ if (!res)
+ continue;
+
+ if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
+ goto has_write_reference;
+ }
+ }
+
+ return true;
has_write_reference:
- /* If the current gfx IB has enough packets, flush it to remove write
- * references to buffers.
- */
- if (cs->prev_dw + cs->current.cdw > 2048) {
- si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
- assert(si_all_vs_resources_read_only(sctx, indexbuf));
- return true;
- }
- return false;
+ /* If the current gfx IB has enough packets, flush it to remove write
+ * references to buffers.
+ */
+ if (cs->prev_dw + cs->current.cdw > 2048) {
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ assert(si_all_vs_resources_read_only(sctx, indexbuf));
+ return true;
+ }
+ return false;
}
static ALWAYS_INLINE bool pd_msg(const char *s)
{
- if (SI_PRIM_DISCARD_DEBUG)
- printf("PD failed: %s\n", s);
- return false;
+ if (SI_PRIM_DISCARD_DEBUG)
+ printf("PD failed: %s\n", s);
+ return false;
}
static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- struct pipe_resource *indexbuf = info->index.resource;
- unsigned dirty_tex_counter, dirty_buf_counter;
- enum pipe_prim_type rast_prim, prim = info->mode;
- unsigned index_size = info->index_size;
- unsigned index_offset = info->indirect ? info->start * index_size : 0;
- unsigned instance_count = info->instance_count;
- bool primitive_restart = info->primitive_restart &&
- (!sctx->screen->options.prim_restart_tri_strips_only ||
- (prim != PIPE_PRIM_TRIANGLE_STRIP &&
- prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY));
-
- if (likely(!info->indirect)) {
- /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
- * no workaround for indirect draws, but we can at least skip
- * direct draws.
- */
- if (unlikely(!instance_count))
- return;
-
- /* Handle count == 0. */
- if (unlikely(!info->count &&
- (index_size || !info->count_from_stream_output)))
- return;
- }
-
- struct si_shader_selector *vs = sctx->vs_shader.cso;
- if (unlikely(!vs ||
- sctx->num_vertex_elements < vs->num_vs_inputs ||
- (!sctx->ps_shader.cso && !rs->rasterizer_discard) ||
- (!!sctx->tes_shader.cso != (prim == PIPE_PRIM_PATCHES)))) {
- assert(0);
- return;
- }
-
- /* Recompute and re-emit the texture resource states if needed. */
- dirty_tex_counter = p_atomic_read(&sctx->screen->dirty_tex_counter);
- if (unlikely(dirty_tex_counter != sctx->last_dirty_tex_counter)) {
- sctx->last_dirty_tex_counter = dirty_tex_counter;
- sctx->framebuffer.dirty_cbufs |=
- ((1 << sctx->framebuffer.state.nr_cbufs) - 1);
- sctx->framebuffer.dirty_zsbuf = true;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
- si_update_all_texture_descriptors(sctx);
- }
-
- dirty_buf_counter = p_atomic_read(&sctx->screen->dirty_buf_counter);
- if (unlikely(dirty_buf_counter != sctx->last_dirty_buf_counter)) {
- sctx->last_dirty_buf_counter = dirty_buf_counter;
- /* Rebind all buffers unconditionally. */
- si_rebind_buffer(sctx, NULL);
- }
-
- si_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS));
-
- /* Set the rasterization primitive type.
- *
- * This must be done after si_decompress_textures, which can call
- * draw_vbo recursively, and before si_update_shaders, which uses
- * current_rast_prim for this draw_vbo call. */
- if (sctx->gs_shader.cso) {
- /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
- rast_prim = sctx->gs_shader.cso->rast_prim;
- } else if (sctx->tes_shader.cso) {
- /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
- rast_prim = sctx->tes_shader.cso->rast_prim;
- } else if (util_rast_prim_is_triangles(prim)) {
- rast_prim = PIPE_PRIM_TRIANGLES;
- } else {
- /* Only possibilities, POINTS, LINE*, RECTANGLES */
- rast_prim = prim;
- }
-
- if (rast_prim != sctx->current_rast_prim) {
- if (util_prim_is_points_or_lines(sctx->current_rast_prim) !=
- util_prim_is_points_or_lines(rast_prim))
- si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
-
- sctx->current_rast_prim = rast_prim;
- sctx->do_update_shaders = true;
- }
-
- if (sctx->tes_shader.cso &&
- sctx->screen->info.has_ls_vgpr_init_bug) {
- /* Determine whether the LS VGPR fix should be applied.
- *
- * It is only required when num input CPs > num output CPs,
- * which cannot happen with the fixed function TCS. We should
- * also update this bit when switching from TCS to fixed
- * function TCS.
- */
- struct si_shader_selector *tcs = sctx->tcs_shader.cso;
- bool ls_vgpr_fix =
- tcs &&
- info->vertices_per_patch >
- tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
-
- if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
- sctx->ls_vgpr_fix = ls_vgpr_fix;
- sctx->do_update_shaders = true;
- }
- }
-
- if (sctx->chip_class <= GFX9 && sctx->gs_shader.cso) {
- /* Determine whether the GS triangle strip adjacency fix should
- * be applied. Rotate every other triangle if
- * - triangle strips with adjacency are fed to the GS and
- * - primitive restart is disabled (the rotation doesn't help
- * when the restart occurs after an odd number of triangles).
- */
- bool gs_tri_strip_adj_fix =
- !sctx->tes_shader.cso &&
- prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY &&
- !primitive_restart;
-
- if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {
- sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
- sctx->do_update_shaders = true;
- }
- }
-
- if (index_size) {
- /* Translate or upload, if needed. */
- /* 8-bit indices are supported on GFX8. */
- if (sctx->chip_class <= GFX7 && index_size == 1) {
- unsigned start, count, start_offset, size, offset;
- void *ptr;
-
- si_get_draw_start_count(sctx, info, &start, &count);
- start_offset = start * 2;
- size = count * 2;
-
- indexbuf = NULL;
- u_upload_alloc(ctx->stream_uploader, start_offset,
- size,
- si_optimal_tcc_alignment(sctx, size),
- &offset, &indexbuf, &ptr);
- if (!indexbuf)
- return;
-
- util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0,
- index_offset + start,
- count, ptr);
-
- /* info->start will be added by the drawing code */
- index_offset = offset - start_offset;
- index_size = 2;
- } else if (info->has_user_indices) {
- unsigned start_offset;
-
- assert(!info->indirect);
- start_offset = info->start * index_size;
-
- indexbuf = NULL;
- u_upload_data(ctx->stream_uploader, start_offset,
- info->count * index_size,
- sctx->screen->info.tcc_cache_line_size,
- (char*)info->index.user + start_offset,
- &index_offset, &indexbuf);
- if (!indexbuf)
- return;
-
- /* info->start will be added by the drawing code */
- index_offset -= start_offset;
- } else if (sctx->chip_class <= GFX7 &&
- si_resource(indexbuf)->TC_L2_dirty) {
- /* GFX8 reads index buffers through TC L2, so it doesn't
- * need this. */
- sctx->flags |= SI_CONTEXT_WB_L2;
- si_resource(indexbuf)->TC_L2_dirty = false;
- }
- }
-
- bool dispatch_prim_discard_cs = false;
- bool prim_discard_cs_instancing = false;
- unsigned original_index_size = index_size;
- unsigned direct_count = 0;
-
- if (info->indirect) {
- struct pipe_draw_indirect_info *indirect = info->indirect;
-
- /* Add the buffer size for memory checking in need_cs_space. */
- si_context_add_resource_size(sctx, indirect->buffer);
-
- /* Indirect buffers use TC L2 on GFX9, but not older hw. */
- if (sctx->chip_class <= GFX8) {
- if (si_resource(indirect->buffer)->TC_L2_dirty) {
- sctx->flags |= SI_CONTEXT_WB_L2;
- si_resource(indirect->buffer)->TC_L2_dirty = false;
- }
-
- if (indirect->indirect_draw_count &&
- si_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
- sctx->flags |= SI_CONTEXT_WB_L2;
- si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
- }
- }
- } else {
- /* Multiply by 3 for strips and fans to get an approximate vertex
- * count as triangles. */
- direct_count = info->count * instance_count *
- (prim == PIPE_PRIM_TRIANGLES ? 1 : 3);
- }
-
- /* Determine if we can use the primitive discard compute shader. */
- if (si_compute_prim_discard_enabled(sctx) &&
- (direct_count > sctx->prim_discard_vertex_count_threshold ?
- (sctx->compute_num_verts_rejected += direct_count, true) : /* Add, then return true. */
- (sctx->compute_num_verts_ineligible += direct_count, false)) && /* Add, then return false. */
- (!info->count_from_stream_output || pd_msg("draw_opaque")) &&
- (primitive_restart ?
- /* Supported prim types with primitive restart: */
- (prim == PIPE_PRIM_TRIANGLE_STRIP || pd_msg("bad prim type with primitive restart")) &&
- /* Disallow instancing with primitive restart: */
- (instance_count == 1 || pd_msg("instance_count > 1 with primitive restart")) :
- /* Supported prim types without primitive restart + allow instancing: */
- (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
- (1 << PIPE_PRIM_TRIANGLE_STRIP) |
- (1 << PIPE_PRIM_TRIANGLE_FAN)) &&
- /* Instancing is limited to 16-bit indices, because InstanceID is packed into VertexID. */
- /* TODO: DrawArraysInstanced doesn't sometimes work, so it's disabled. */
- (instance_count == 1 ||
- (instance_count <= USHRT_MAX && index_size && index_size <= 2) ||
- pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced"))) &&
- (info->drawid == 0 || !sctx->vs_shader.cso->info.uses_drawid || pd_msg("draw_id > 0")) &&
- (!sctx->render_cond || pd_msg("render condition")) &&
- /* Forced enablement ignores pipeline statistics queries. */
- (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) ||
- (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||
- pd_msg("pipestat or primgen query")) &&
- (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&
- (!sctx->tes_shader.cso || pd_msg("uses tess")) &&
- (!sctx->gs_shader.cso || pd_msg("uses GS")) &&
- (!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&
- !rs->polygon_mode_enabled &&
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+ struct pipe_resource *indexbuf = info->index.resource;
+ unsigned dirty_tex_counter, dirty_buf_counter;
+ enum pipe_prim_type rast_prim, prim = info->mode;
+ unsigned index_size = info->index_size;
+ unsigned index_offset = info->indirect ? info->start * index_size : 0;
+ unsigned instance_count = info->instance_count;
+ bool primitive_restart =
+ info->primitive_restart &&
+ (!sctx->screen->options.prim_restart_tri_strips_only ||
+ (prim != PIPE_PRIM_TRIANGLE_STRIP && prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY));
+
+ if (likely(!info->indirect)) {
+ /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
+ * no workaround for indirect draws, but we can at least skip
+ * direct draws.
+ */
+ if (unlikely(!instance_count))
+ return;
+
+ /* Handle count == 0. */
+ if (unlikely(!info->count && (index_size || !info->count_from_stream_output)))
+ return;
+ }
+
+ struct si_shader_selector *vs = sctx->vs_shader.cso;
+ if (unlikely(!vs || sctx->num_vertex_elements < vs->num_vs_inputs ||
+ (!sctx->ps_shader.cso && !rs->rasterizer_discard) ||
+ (!!sctx->tes_shader.cso != (prim == PIPE_PRIM_PATCHES)))) {
+ assert(0);
+ return;
+ }
+
+ /* Recompute and re-emit the texture resource states if needed. */
+ dirty_tex_counter = p_atomic_read(&sctx->screen->dirty_tex_counter);
+ if (unlikely(dirty_tex_counter != sctx->last_dirty_tex_counter)) {
+ sctx->last_dirty_tex_counter = dirty_tex_counter;
+ sctx->framebuffer.dirty_cbufs |= ((1 << sctx->framebuffer.state.nr_cbufs) - 1);
+ sctx->framebuffer.dirty_zsbuf = true;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+ si_update_all_texture_descriptors(sctx);
+ }
+
+ dirty_buf_counter = p_atomic_read(&sctx->screen->dirty_buf_counter);
+ if (unlikely(dirty_buf_counter != sctx->last_dirty_buf_counter)) {
+ sctx->last_dirty_buf_counter = dirty_buf_counter;
+ /* Rebind all buffers unconditionally. */
+ si_rebind_buffer(sctx, NULL);
+ }
+
+ si_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS));
+
+ /* Set the rasterization primitive type.
+ *
+ * This must be done after si_decompress_textures, which can call
+ * draw_vbo recursively, and before si_update_shaders, which uses
+ * current_rast_prim for this draw_vbo call. */
+ if (sctx->gs_shader.cso) {
+ /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
+ rast_prim = sctx->gs_shader.cso->rast_prim;
+ } else if (sctx->tes_shader.cso) {
+ /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
+ rast_prim = sctx->tes_shader.cso->rast_prim;
+ } else if (util_rast_prim_is_triangles(prim)) {
+ rast_prim = PIPE_PRIM_TRIANGLES;
+ } else {
+ /* Only possibilities: POINTS, LINE*, RECTANGLES */
+ rast_prim = prim;
+ }
+
+ if (rast_prim != sctx->current_rast_prim) {
+ if (util_prim_is_points_or_lines(sctx->current_rast_prim) !=
+ util_prim_is_points_or_lines(rast_prim))
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
+
+ sctx->current_rast_prim = rast_prim;
+ sctx->do_update_shaders = true;
+ }
+
+ if (sctx->tes_shader.cso && sctx->screen->info.has_ls_vgpr_init_bug) {
+ /* Determine whether the LS VGPR fix should be applied.
+ *
+ * It is only required when num input CPs > num output CPs,
+ * which cannot happen with the fixed function TCS. We should
+ * also update this bit when switching from TCS to fixed
+ * function TCS.
+ */
+ struct si_shader_selector *tcs = sctx->tcs_shader.cso;
+ bool ls_vgpr_fix =
+ tcs && info->vertices_per_patch > tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+
+ if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
+ sctx->ls_vgpr_fix = ls_vgpr_fix;
+ sctx->do_update_shaders = true;
+ }
+ }
+
+ if (sctx->chip_class <= GFX9 && sctx->gs_shader.cso) {
+ /* Determine whether the GS triangle strip adjacency fix should
+ * be applied. Rotate every other triangle if
+ * - triangle strips with adjacency are fed to the GS and
+ * - primitive restart is disabled (the rotation doesn't help
+ * when the restart occurs after an odd number of triangles).
+ */
+ bool gs_tri_strip_adj_fix =
+ !sctx->tes_shader.cso && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY && !primitive_restart;
+
+ if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {
+ sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
+ sctx->do_update_shaders = true;
+ }
+ }
+
+ if (index_size) {
+ /* Translate or upload, if needed. */
+ /* 8-bit indices are supported on GFX8. */
+ if (sctx->chip_class <= GFX7 && index_size == 1) {
+ unsigned start, count, start_offset, size, offset;
+ void *ptr;
+
+ si_get_draw_start_count(sctx, info, &start, &count);
+ start_offset = start * 2;
+ size = count * 2;
+
+ indexbuf = NULL;
+ u_upload_alloc(ctx->stream_uploader, start_offset, size,
+ si_optimal_tcc_alignment(sctx, size), &offset, &indexbuf, &ptr);
+ if (!indexbuf)
+ return;
+
+ util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0, index_offset + start, count, ptr);
+
+ /* info->start will be added by the drawing code */
+ index_offset = offset - start_offset;
+ index_size = 2;
+ } else if (info->has_user_indices) {
+ unsigned start_offset;
+
+ assert(!info->indirect);
+ start_offset = info->start * index_size;
+
+ indexbuf = NULL;
+ u_upload_data(ctx->stream_uploader, start_offset, info->count * index_size,
+ sctx->screen->info.tcc_cache_line_size,
+ (char *)info->index.user + start_offset, &index_offset, &indexbuf);
+ if (!indexbuf)
+ return;
+
+ /* info->start will be added by the drawing code */
+ index_offset -= start_offset;
+ } else if (sctx->chip_class <= GFX7 && si_resource(indexbuf)->TC_L2_dirty) {
+ /* GFX8 reads index buffers through TC L2, so it doesn't
+ * need this. */
+ sctx->flags |= SI_CONTEXT_WB_L2;
+ si_resource(indexbuf)->TC_L2_dirty = false;
+ }
+ }
+
+ bool dispatch_prim_discard_cs = false;
+ bool prim_discard_cs_instancing = false;
+ unsigned original_index_size = index_size;
+ unsigned direct_count = 0;
+
+ if (info->indirect) {
+ struct pipe_draw_indirect_info *indirect = info->indirect;
+
+ /* Add the buffer size for memory checking in need_cs_space. */
+ si_context_add_resource_size(sctx, indirect->buffer);
+
+ /* Indirect buffers use TC L2 on GFX9, but not older hw. */
+ if (sctx->chip_class <= GFX8) {
+ if (si_resource(indirect->buffer)->TC_L2_dirty) {
+ sctx->flags |= SI_CONTEXT_WB_L2;
+ si_resource(indirect->buffer)->TC_L2_dirty = false;
+ }
+
+ if (indirect->indirect_draw_count &&
+ si_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
+ sctx->flags |= SI_CONTEXT_WB_L2;
+ si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
+ }
+ }
+ } else {
+ /* Multiply by 3 for strips and fans to get an approximate vertex
+ * count as triangles. */
+ direct_count = info->count * instance_count * (prim == PIPE_PRIM_TRIANGLES ? 1 : 3);
+ }
+
+ /* Determine if we can use the primitive discard compute shader. */
+ if (si_compute_prim_discard_enabled(sctx) &&
+ (direct_count > sctx->prim_discard_vertex_count_threshold
+ ? (sctx->compute_num_verts_rejected += direct_count, true) /* Add, then return true. */
+ : (sctx->compute_num_verts_ineligible += direct_count,
+ false)) && /* Add, then return false. */
+ (!info->count_from_stream_output || pd_msg("draw_opaque")) &&
+ (primitive_restart ?
+ /* Supported prim types with primitive restart: */
+ (prim == PIPE_PRIM_TRIANGLE_STRIP || pd_msg("bad prim type with primitive restart")) &&
+ /* Disallow instancing with primitive restart: */
+ (instance_count == 1 || pd_msg("instance_count > 1 with primitive restart"))
+ :
+ /* Supported prim types without primitive restart + allow instancing: */
+ (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) |
+ (1 << PIPE_PRIM_TRIANGLE_FAN)) &&
+ /* Instancing is limited to 16-bit indices, because InstanceID is packed into
+ * VertexID. */
+ /* TODO: DrawArraysInstanced sometimes doesn't work, so it's disabled. */
+ (instance_count == 1 ||
+ (instance_count <= USHRT_MAX && index_size && index_size <= 2) ||
+ pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced"))) &&
+ (info->drawid == 0 || !sctx->vs_shader.cso->info.uses_drawid || pd_msg("draw_id > 0")) &&
+ (!sctx->render_cond || pd_msg("render condition")) &&
+ /* Forced enablement ignores pipeline statistics queries. */
+ (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) ||
+ (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||
+ pd_msg("pipestat or primgen query")) &&
+ (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&
+ (!sctx->tes_shader.cso || pd_msg("uses tess")) &&
+ (!sctx->gs_shader.cso || pd_msg("uses GS")) &&
+ (!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&
+ !rs->polygon_mode_enabled &&
#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */
- (!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) &&
- (!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) &&
- (!sctx->vs_shader.cso->info.writes_memory || pd_msg("writes memory")) &&
- (!sctx->vs_shader.cso->info.writes_viewport_index || pd_msg("writes viewport index")) &&
- !sctx->vs_shader.cso->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
- !sctx->vs_shader.cso->so.num_outputs &&
+ (!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) &&
+ (!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) &&
+ (!sctx->vs_shader.cso->info.writes_memory || pd_msg("writes memory")) &&
+ (!sctx->vs_shader.cso->info.writes_viewport_index || pd_msg("writes viewport index")) &&
+ !sctx->vs_shader.cso->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
+ !sctx->vs_shader.cso->so.num_outputs &&
#else
- (sctx->vs_shader.cso->prim_discard_cs_allowed || pd_msg("VS shader uses unsupported features")) &&
+ (sctx->vs_shader.cso->prim_discard_cs_allowed ||
+ pd_msg("VS shader uses unsupported features")) &&
#endif
- /* Check that all buffers are used for read only, because compute
- * dispatches can run ahead. */
- (si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) || pd_msg("write reference"))) {
- switch (si_prepare_prim_discard_or_split_draw(sctx, info, primitive_restart)) {
- case SI_PRIM_DISCARD_ENABLED:
- original_index_size = index_size;
- prim_discard_cs_instancing = instance_count > 1;
- dispatch_prim_discard_cs = true;
-
- /* The compute shader changes/lowers the following: */
- prim = PIPE_PRIM_TRIANGLES;
- index_size = 4;
- instance_count = 1;
- primitive_restart = false;
- sctx->compute_num_verts_rejected -= direct_count;
- sctx->compute_num_verts_accepted += direct_count;
- break;
- case SI_PRIM_DISCARD_DISABLED:
- break;
- case SI_PRIM_DISCARD_DRAW_SPLIT:
- sctx->compute_num_verts_rejected -= direct_count;
- goto return_cleanup;
- }
- }
-
- if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) {
- sctx->prim_discard_cs_instancing = prim_discard_cs_instancing;
- sctx->do_update_shaders = true;
- }
-
- /* Update NGG culling settings. */
- if (sctx->ngg &&
- !dispatch_prim_discard_cs &&
- rast_prim == PIPE_PRIM_TRIANGLES &&
- (sctx->screen->always_use_ngg_culling ||
- /* At least 1024 non-indexed vertices (8 subgroups) are needed
- * per draw call (no TES/GS) to enable NGG culling.
- */
- (!index_size && direct_count >= 1024 &&
- (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) &&
- !sctx->tes_shader.cso && !sctx->gs_shader.cso)) &&
- si_get_vs(sctx)->cso->ngg_culling_allowed) {
- unsigned ngg_culling = 0;
-
- if (rs->rasterizer_discard) {
- ngg_culling |= SI_NGG_CULL_FRONT_FACE |
- SI_NGG_CULL_BACK_FACE;
- } else {
- /* Polygon mode can't use view and small primitive culling,
- * because it draws points or lines where the culling depends
- * on the point or line width.
- */
- if (!rs->polygon_mode_enabled)
- ngg_culling |= SI_NGG_CULL_VIEW_SMALLPRIMS;
-
- if (sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front)
- ngg_culling |= SI_NGG_CULL_FRONT_FACE;
- if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back)
- ngg_culling |= SI_NGG_CULL_BACK_FACE;
- }
-
- /* Use NGG fast launch for certain non-indexed primitive types.
- * A draw must have at least 1 full primitive.
- */
- if (ngg_culling && !index_size && direct_count >= 3 &&
- !sctx->tes_shader.cso && !sctx->gs_shader.cso) {
- if (prim == PIPE_PRIM_TRIANGLES)
- ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
- else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
- ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP;
- }
-
- if (ngg_culling != sctx->ngg_culling) {
- /* Insert a VGT_FLUSH when enabling fast launch changes to prevent hangs.
- * See issues #2418, #2426, #2434
- */
- if (ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
- sctx->flags |= SI_CONTEXT_VGT_FLUSH;
- sctx->ngg_culling = ngg_culling;
- sctx->do_update_shaders = true;
- }
- } else if (sctx->ngg_culling) {
- sctx->ngg_culling = false;
- sctx->do_update_shaders = true;
- }
-
- if (sctx->do_update_shaders && !si_update_shaders(sctx))
- goto return_cleanup;
-
- si_need_gfx_cs_space(sctx);
-
- if (sctx->bo_list_add_all_gfx_resources)
- si_gfx_resources_add_all_to_bo_list(sctx);
-
- /* Since we've called si_context_add_resource_size for vertex buffers,
- * this must be called after si_need_cs_space, because we must let
- * need_cs_space flush before we add buffers to the buffer list.
- */
- if (!si_upload_vertex_buffer_descriptors(sctx))
- goto return_cleanup;
-
- /* Vega10/Raven scissor bug workaround. When any context register is
- * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
- * registers must be written too.
- */
- unsigned masked_atoms = 0;
-
- if (sctx->screen->info.has_gfx9_scissor_bug) {
- masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
-
- if (info->count_from_stream_output ||
- sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
- sctx->dirty_states & si_states_that_always_roll_context())
- sctx->context_roll = true;
- }
-
- /* Use optimal packet order based on whether we need to sync the pipeline. */
- if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
- SI_CONTEXT_FLUSH_AND_INV_DB |
- SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH))) {
- /* If we have to wait for idle, set all states first, so that all
- * SET packets are processed in parallel with previous draw calls.
- * Then draw and prefetch at the end. This ensures that the time
- * the CUs are idle is very short.
- */
- if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND))
- masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
-
- if (!si_upload_graphics_shader_descriptors(sctx))
- goto return_cleanup;
-
- /* Emit all states except possibly render condition. */
- si_emit_all_states(sctx, info, prim, instance_count,
- primitive_restart, masked_atoms);
- sctx->emit_cache_flush(sctx);
- /* <-- CUs are idle here. */
-
- if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))
- sctx->atoms.s.render_cond.emit(sctx);
-
- if (sctx->screen->info.has_gfx9_scissor_bug &&
- (sctx->context_roll ||
- si_is_atom_dirty(sctx, &sctx->atoms.s.scissors)))
- sctx->atoms.s.scissors.emit(sctx);
-
- sctx->dirty_atoms = 0;
-
- si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset,
- instance_count, dispatch_prim_discard_cs,
- original_index_size);
- /* <-- CUs are busy here. */
-
- /* Start prefetches after the draw has been started. Both will run
- * in parallel, but starting the draw first is more important.
- */
- if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
- cik_emit_prefetch_L2(sctx, false);
- } else {
- /* If we don't wait for idle, start prefetches first, then set
- * states, and draw at the end.
- */
- if (sctx->flags)
- sctx->emit_cache_flush(sctx);
-
- /* Only prefetch the API VS and VBO descriptors. */
- if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
- cik_emit_prefetch_L2(sctx, true);
-
- if (!si_upload_graphics_shader_descriptors(sctx))
- goto return_cleanup;
-
- si_emit_all_states(sctx, info, prim, instance_count,
- primitive_restart, masked_atoms);
-
- if (sctx->screen->info.has_gfx9_scissor_bug &&
- (sctx->context_roll ||
- si_is_atom_dirty(sctx, &sctx->atoms.s.scissors)))
- sctx->atoms.s.scissors.emit(sctx);
-
- sctx->dirty_atoms = 0;
-
- si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset,
- instance_count, dispatch_prim_discard_cs,
- original_index_size);
-
- /* Prefetch the remaining shaders after the draw has been
- * started. */
- if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
- cik_emit_prefetch_L2(sctx, false);
- }
-
- /* Mark the displayable dcc buffer as dirty in order to update
- * it on the next call to si_flush_resource. */
- if (sctx->screen->info.use_display_dcc_with_retile_blit) {
- /* Don't use si_update_fb_dirtiness_after_rendering because it'll
- * cause unnecessary texture decompressions on each draw. */
- unsigned displayable_dcc_cb_mask = sctx->framebuffer.displayable_dcc_cb_mask;
- while (displayable_dcc_cb_mask) {
- unsigned i = u_bit_scan(&displayable_dcc_cb_mask);
- struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
- struct si_texture *tex = (struct si_texture*) surf->texture;
- tex->displayable_dcc_dirty = true;
- }
- }
-
- /* Clear the context roll flag after the draw call. */
- sctx->context_roll = false;
-
- if (unlikely(sctx->current_saved_cs)) {
- si_trace_emit(sctx);
- si_log_draw_state(sctx, sctx->log);
- }
-
- /* Workaround for a VGT hang when streamout is enabled.
- * It must be done after drawing. */
- if ((sctx->family == CHIP_HAWAII ||
- sctx->family == CHIP_TONGA ||
- sctx->family == CHIP_FIJI) &&
- si_get_strmout_en(sctx)) {
- sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
- }
-
- if (unlikely(sctx->decompression_enabled)) {
- sctx->num_decompress_calls++;
- } else {
- sctx->num_draw_calls++;
- if (sctx->framebuffer.state.nr_cbufs > 1)
- sctx->num_mrt_draw_calls++;
- if (primitive_restart)
- sctx->num_prim_restart_calls++;
- if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
- sctx->num_spill_draw_calls++;
- }
+ /* Check that all buffers are used for read only, because compute
+ * dispatches can run ahead. */
+ (si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) ||
+ pd_msg("write reference"))) {
+ switch (si_prepare_prim_discard_or_split_draw(sctx, info, primitive_restart)) {
+ case SI_PRIM_DISCARD_ENABLED:
+ original_index_size = index_size;
+ prim_discard_cs_instancing = instance_count > 1;
+ dispatch_prim_discard_cs = true;
+
+ /* The compute shader changes/lowers the following: */
+ prim = PIPE_PRIM_TRIANGLES;
+ index_size = 4;
+ instance_count = 1;
+ primitive_restart = false;
+ sctx->compute_num_verts_rejected -= direct_count;
+ sctx->compute_num_verts_accepted += direct_count;
+ break;
+ case SI_PRIM_DISCARD_DISABLED:
+ break;
+ case SI_PRIM_DISCARD_DRAW_SPLIT:
+ sctx->compute_num_verts_rejected -= direct_count;
+ goto return_cleanup;
+ }
+ }
+
+ if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) {
+ sctx->prim_discard_cs_instancing = prim_discard_cs_instancing;
+ sctx->do_update_shaders = true;
+ }
+
+ /* Update NGG culling settings. */
+ if (sctx->ngg && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES &&
+ (sctx->screen->always_use_ngg_culling ||
+ /* At least 1024 non-indexed vertices (8 subgroups) are needed
+ * per draw call (no TES/GS) to enable NGG culling.
+ */
+ (!index_size && direct_count >= 1024 &&
+ (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) &&
+ !sctx->tes_shader.cso && !sctx->gs_shader.cso)) &&
+ si_get_vs(sctx)->cso->ngg_culling_allowed) {
+ unsigned ngg_culling = 0;
+
+ if (rs->rasterizer_discard) {
+ ngg_culling |= SI_NGG_CULL_FRONT_FACE | SI_NGG_CULL_BACK_FACE;
+ } else {
+ /* Polygon mode can't use view and small primitive culling,
+ * because it draws points or lines where the culling depends
+ * on the point or line width.
+ */
+ if (!rs->polygon_mode_enabled)
+ ngg_culling |= SI_NGG_CULL_VIEW_SMALLPRIMS;
+
+ if (sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front)
+ ngg_culling |= SI_NGG_CULL_FRONT_FACE;
+ if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back)
+ ngg_culling |= SI_NGG_CULL_BACK_FACE;
+ }
+
+ /* Use NGG fast launch for certain non-indexed primitive types.
+ * A draw must have at least 1 full primitive.
+ */
+ if (ngg_culling && !index_size && direct_count >= 3 && !sctx->tes_shader.cso &&
+ !sctx->gs_shader.cso) {
+ if (prim == PIPE_PRIM_TRIANGLES)
+ ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
+ else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
+ ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP;
+ }
+
+ if (ngg_culling != sctx->ngg_culling) {
+ /* Insert a VGT_FLUSH when enabling NGG fast launch, to prevent hangs.
+ * See issues #2418, #2426, #2434
+ */
+ if (ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
+ sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+ sctx->ngg_culling = ngg_culling;
+ sctx->do_update_shaders = true;
+ }
+ } else if (sctx->ngg_culling) {
+ sctx->ngg_culling = false;
+ sctx->do_update_shaders = true;
+ }
+
+ if (sctx->do_update_shaders && !si_update_shaders(sctx))
+ goto return_cleanup;
+
+ si_need_gfx_cs_space(sctx);
+
+ if (sctx->bo_list_add_all_gfx_resources)
+ si_gfx_resources_add_all_to_bo_list(sctx);
+
+ /* Since we've called si_context_add_resource_size for vertex buffers,
+ * this must be called after si_need_cs_space, because we must let
+ * need_cs_space flush before we add buffers to the buffer list.
+ */
+ if (!si_upload_vertex_buffer_descriptors(sctx))
+ goto return_cleanup;
+
+ /* Vega10/Raven scissor bug workaround. When any context register is
+ * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
+ * registers must be written too.
+ */
+ unsigned masked_atoms = 0;
+
+ if (sctx->screen->info.has_gfx9_scissor_bug) {
+ masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
+
+ if (info->count_from_stream_output ||
+ sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
+ sctx->dirty_states & si_states_that_always_roll_context())
+ sctx->context_roll = true;
+ }
+
+ /* Use optimal packet order based on whether we need to sync the pipeline. */
+ if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB |
+ SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH))) {
+ /* If we have to wait for idle, set all states first, so that all
+ * SET packets are processed in parallel with previous draw calls.
+ * Then draw and prefetch at the end. This ensures that the time
+ * the CUs are idle is very short.
+ */
+ if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND))
+ masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
+
+ if (!si_upload_graphics_shader_descriptors(sctx))
+ goto return_cleanup;
+
+ /* Emit all states except possibly render condition. */
+ si_emit_all_states(sctx, info, prim, instance_count, primitive_restart, masked_atoms);
+ sctx->emit_cache_flush(sctx);
+ /* <-- CUs are idle here. */
+
+ if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))
+ sctx->atoms.s.render_cond.emit(sctx);
+
+ if (sctx->screen->info.has_gfx9_scissor_bug &&
+ (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors)))
+ sctx->atoms.s.scissors.emit(sctx);
+
+ sctx->dirty_atoms = 0;
+
+ si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, instance_count,
+ dispatch_prim_discard_cs, original_index_size);
+ /* <-- CUs are busy here. */
+
+ /* Start prefetches after the draw has been started. Both will run
+ * in parallel, but starting the draw first is more important.
+ */
+ if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
+ cik_emit_prefetch_L2(sctx, false);
+ } else {
+ /* If we don't wait for idle, start prefetches first, then set
+ * states, and draw at the end.
+ */
+ if (sctx->flags)
+ sctx->emit_cache_flush(sctx);
+
+ /* Only prefetch the API VS and VBO descriptors. */
+ if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
+ cik_emit_prefetch_L2(sctx, true);
+
+ if (!si_upload_graphics_shader_descriptors(sctx))
+ goto return_cleanup;
+
+ si_emit_all_states(sctx, info, prim, instance_count, primitive_restart, masked_atoms);
+
+ if (sctx->screen->info.has_gfx9_scissor_bug &&
+ (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors)))
+ sctx->atoms.s.scissors.emit(sctx);
+
+ sctx->dirty_atoms = 0;
+
+ si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, instance_count,
+ dispatch_prim_discard_cs, original_index_size);
+
+ /* Prefetch the remaining shaders after the draw has been
+ * started. */
+ if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
+ cik_emit_prefetch_L2(sctx, false);
+ }
+
+ /* Mark the displayable dcc buffer as dirty in order to update
+ * it on the next call to si_flush_resource. */
+ if (sctx->screen->info.use_display_dcc_with_retile_blit) {
+ /* Don't use si_update_fb_dirtiness_after_rendering because it'll
+ * cause unnecessary texture decompressions on each draw. */
+ unsigned displayable_dcc_cb_mask = sctx->framebuffer.displayable_dcc_cb_mask;
+ while (displayable_dcc_cb_mask) {
+ unsigned i = u_bit_scan(&displayable_dcc_cb_mask);
+ struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
+ struct si_texture *tex = (struct si_texture *)surf->texture;
+ tex->displayable_dcc_dirty = true;
+ }
+ }
+
+ /* Clear the context roll flag after the draw call. */
+ sctx->context_roll = false;
+
+ if (unlikely(sctx->current_saved_cs)) {
+ si_trace_emit(sctx);
+ si_log_draw_state(sctx, sctx->log);
+ }
+
+ /* Workaround for a VGT hang when streamout is enabled.
+ * It must be done after drawing. */
+ if ((sctx->family == CHIP_HAWAII || sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI) &&
+ si_get_strmout_en(sctx)) {
+ sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
+ }
+
+ if (unlikely(sctx->decompression_enabled)) {
+ sctx->num_decompress_calls++;
+ } else {
+ sctx->num_draw_calls++;
+ if (sctx->framebuffer.state.nr_cbufs > 1)
+ sctx->num_mrt_draw_calls++;
+ if (primitive_restart)
+ sctx->num_prim_restart_calls++;
+ if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
+ sctx->num_spill_draw_calls++;
+ }
return_cleanup:
- if (index_size && indexbuf != info->index.resource)
- pipe_resource_reference(&indexbuf, NULL);
+ if (index_size && indexbuf != info->index.resource)
+ pipe_resource_reference(&indexbuf, NULL);
}
-static void
-si_draw_rectangle(struct blitter_context *blitter,
- void *vertex_elements_cso,
- blitter_get_vs_func get_vs,
- int x1, int y1, int x2, int y2,
- float depth, unsigned num_instances,
- enum blitter_attrib_type type,
- const union blitter_attrib *attrib)
+static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elements_cso,
+ blitter_get_vs_func get_vs, int x1, int y1, int x2, int y2,
+ float depth, unsigned num_instances, enum blitter_attrib_type type,
+ const union blitter_attrib *attrib)
{
- struct pipe_context *pipe = util_blitter_get_pipe(blitter);
- struct si_context *sctx = (struct si_context*)pipe;
-
- /* Pack position coordinates as signed int16. */
- sctx->vs_blit_sh_data[0] = (uint32_t)(x1 & 0xffff) |
- ((uint32_t)(y1 & 0xffff) << 16);
- sctx->vs_blit_sh_data[1] = (uint32_t)(x2 & 0xffff) |
- ((uint32_t)(y2 & 0xffff) << 16);
- sctx->vs_blit_sh_data[2] = fui(depth);
-
- switch (type) {
- case UTIL_BLITTER_ATTRIB_COLOR:
- memcpy(&sctx->vs_blit_sh_data[3], attrib->color,
- sizeof(float)*4);
- break;
- case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
- case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
- memcpy(&sctx->vs_blit_sh_data[3], &attrib->texcoord,
- sizeof(attrib->texcoord));
- break;
- case UTIL_BLITTER_ATTRIB_NONE:;
- }
-
- pipe->bind_vs_state(pipe, si_get_blitter_vs(sctx, type, num_instances));
-
- struct pipe_draw_info info = {};
- info.mode = SI_PRIM_RECTANGLE_LIST;
- info.count = 3;
- info.instance_count = num_instances;
-
- /* Don't set per-stage shader pointers for VS. */
- sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX);
- sctx->vertex_buffer_pointer_dirty = false;
- sctx->vertex_buffer_user_sgprs_dirty = false;
-
- si_draw_vbo(pipe, &info);
+ struct pipe_context *pipe = util_blitter_get_pipe(blitter);
+ struct si_context *sctx = (struct si_context *)pipe;
+
+ /* Pack position coordinates as signed int16. */
+ sctx->vs_blit_sh_data[0] = (uint32_t)(x1 & 0xffff) | ((uint32_t)(y1 & 0xffff) << 16);
+ sctx->vs_blit_sh_data[1] = (uint32_t)(x2 & 0xffff) | ((uint32_t)(y2 & 0xffff) << 16);
+ sctx->vs_blit_sh_data[2] = fui(depth);
+
+ switch (type) {
+ case UTIL_BLITTER_ATTRIB_COLOR:
+ memcpy(&sctx->vs_blit_sh_data[3], attrib->color, sizeof(float) * 4);
+ break;
+ case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
+ case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
+ memcpy(&sctx->vs_blit_sh_data[3], &attrib->texcoord, sizeof(attrib->texcoord));
+ break;
+ case UTIL_BLITTER_ATTRIB_NONE:;
+ }
+
+ pipe->bind_vs_state(pipe, si_get_blitter_vs(sctx, type, num_instances));
+
+ struct pipe_draw_info info = {};
+ info.mode = SI_PRIM_RECTANGLE_LIST;
+ info.count = 3;
+ info.instance_count = num_instances;
+
+ /* Don't set per-stage shader pointers for VS. */
+ sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX);
+ sctx->vertex_buffer_pointer_dirty = false;
+ sctx->vertex_buffer_user_sgprs_dirty = false;
+
+ si_draw_vbo(pipe, &info);
}
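/* Stand-alone sketch of the coordinate packing used in si_draw_rectangle
 * above: two signed 16-bit values per dword, X in the low half and Y in the
 * high half. pack_xy16 and pack_xy16_example are hypothetical names
 * introduced only for this illustration. */
#include <assert.h>
#include <stdint.h>

static uint32_t pack_xy16(int x, int y)
{
   /* Same expression as the vs_blit_sh_data[0]/[1] packing above. */
   return (uint32_t)(x & 0xffff) | ((uint32_t)(y & 0xffff) << 16);
}

static void pack_xy16_example(void)
{
   uint32_t packed = pack_xy16(-16, 32);

   assert((int16_t)(packed & 0xffff) == -16); /* X round-trips from the low half */
   assert((int16_t)(packed >> 16) == 32);     /* Y round-trips from the high half */
}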
void si_trace_emit(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- uint32_t trace_id = ++sctx->current_saved_cs->trace_id;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ uint32_t trace_id = ++sctx->current_saved_cs->trace_id;
- si_cp_write_data(sctx, sctx->current_saved_cs->trace_buf,
- 0, 4, V_370_MEM, V_370_ME, &trace_id);
+ si_cp_write_data(sctx, sctx->current_saved_cs->trace_buf, 0, 4, V_370_MEM, V_370_ME, &trace_id);
- radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
- radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id));
+ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+ radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id));
- if (sctx->log)
- u_log_flush(sctx->log);
+ if (sctx->log)
+ u_log_flush(sctx->log);
}
void si_init_draw_functions(struct si_context *sctx)
{
- sctx->b.draw_vbo = si_draw_vbo;
+ sctx->b.draw_vbo = si_draw_vbo;
- sctx->blitter->draw_rectangle = si_draw_rectangle;
+ sctx->blitter->draw_rectangle = si_draw_rectangle;
- si_init_ia_multi_vgt_param_table(sctx);
+ si_init_ia_multi_vgt_param_table(sctx);
}
#include "si_build_pm4.h"
/* For MSAA sample positions. */
-#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \
- ((((unsigned)(s0x) & 0xf) << 0) | (((unsigned)(s0y) & 0xf) << 4) | \
- (((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) | \
- (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \
- (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28))
+#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \
+ ((((unsigned)(s0x)&0xf) << 0) | (((unsigned)(s0y)&0xf) << 4) | (((unsigned)(s1x)&0xf) << 8) | \
+ (((unsigned)(s1y)&0xf) << 12) | (((unsigned)(s2x)&0xf) << 16) | \
+ (((unsigned)(s2y)&0xf) << 20) | (((unsigned)(s3x)&0xf) << 24) | (((unsigned)(s3y)&0xf) << 28))
/* For obtaining location coordinates from registers */
-#define SEXT4(x) ((int)((x) | ((x) & 0x8 ? 0xfffffff0 : 0)))
-#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index) * 4)) & 0xf)
-#define GET_SX(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
-#define GET_SY(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)
+#define SEXT4(x) ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0)))
+#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf)
+#define GET_SX(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
+#define GET_SY(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)
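/* Quick self-check of the packing and decoding macros above, as a minimal
 * sketch. It assumes the FILL_SREG/GET_SX/GET_SY/SEXT4 macros are in scope;
 * the sample values match sample_locs_2x below, everything else is
 * illustrative. */
#include <assert.h>
#include <stdint.h>

static void sample_loc_macros_selftest(void)
{
   /* -4 and 4 are stored as 4-bit two's-complement nibbles. */
   uint32_t locs = FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0);
   const uint32_t *p = &locs;

   assert(GET_SX(p, 0) == -4 && GET_SY(p, 0) == -4); /* sample 0 */
   assert(GET_SX(p, 1) == 4 && GET_SY(p, 1) == 4);   /* sample 1 */

   /* si_get_sample_position maps this signed 1/16-pixel grid to [0, 1). */
   assert((GET_SX(p, 0) + 8) / 16.0f == 0.25f);
}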
/* The following sample ordering is required by EQAA.
*
/* 1x MSAA */
static const uint32_t sample_locs_1x =
- FILL_SREG( 0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */
+ FILL_SREG(0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */
static const uint64_t centroid_priority_1x = 0x0000000000000000ull;
/* 2x MSAA (the positions are sorted for EQAA) */
static const uint32_t sample_locs_2x =
- FILL_SREG(-4,-4, 4, 4, 0, 0, 0, 0); /* S2 & S3 fields are not used by 2x MSAA */
+ FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0); /* S2 & S3 fields are not used by 2x MSAA */
static const uint64_t centroid_priority_2x = 0x1010101010101010ull;
/* 4x MSAA (the positions are sorted for EQAA) */
-static const uint32_t sample_locs_4x =
- FILL_SREG(-2,-6, 2, 6, -6, 2, 6,-2);
+static const uint32_t sample_locs_4x = FILL_SREG(-2, -6, 2, 6, -6, 2, 6, -2);
static const uint64_t centroid_priority_4x = 0x3210321032103210ull;
/* 8x MSAA (the positions are sorted for EQAA) */
static const uint32_t sample_locs_8x[] = {
- FILL_SREG(-3,-5, 5, 1, -1, 3, 7,-7),
- FILL_SREG(-7,-1, 3, 7, -5, 5, 1,-3),
- /* The following are unused by hardware, but we emit them to IBs
- * instead of multiple SET_CONTEXT_REG packets. */
- 0,
- 0,
+ FILL_SREG(-3, -5, 5, 1, -1, 3, 7, -7),
+ FILL_SREG(-7, -1, 3, 7, -5, 5, 1, -3),
+ /* The following are unused by hardware, but we emit them to IBs
+ * instead of multiple SET_CONTEXT_REG packets. */
+ 0,
+ 0,
};
static const uint64_t centroid_priority_8x = 0x3546012735460127ull;
/* 16x MSAA (the positions are sorted for EQAA) */
static const uint32_t sample_locs_16x[] = {
- FILL_SREG(-5,-2, 5, 3, -2, 6, 3,-5),
- FILL_SREG(-4,-6, 1, 1, -6, 4, 7,-4),
- FILL_SREG(-1,-3, 6, 7, -3, 2, 0,-7),
- FILL_SREG(-7,-8, 2, 5, -8, 0, 4,-1),
+ FILL_SREG(-5, -2, 5, 3, -2, 6, 3, -5),
+ FILL_SREG(-4, -6, 1, 1, -6, 4, 7, -4),
+ FILL_SREG(-1, -3, 6, 7, -3, 2, 0, -7),
+ FILL_SREG(-7, -8, 2, 5, -8, 0, 4, -1),
};
static const uint64_t centroid_priority_16x = 0xc97e64b231d0fa85ull;
static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_count,
- unsigned sample_index, float *out_value)
+ unsigned sample_index, float *out_value)
{
- const uint32_t *sample_locs;
-
- switch (sample_count) {
- case 1:
- default:
- sample_locs = &sample_locs_1x;
- break;
- case 2:
- sample_locs = &sample_locs_2x;
- break;
- case 4:
- sample_locs = &sample_locs_4x;
- break;
- case 8:
- sample_locs = sample_locs_8x;
- break;
- case 16:
- sample_locs = sample_locs_16x;
- break;
- }
-
- out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
- out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
+ const uint32_t *sample_locs;
+
+ switch (sample_count) {
+ case 1:
+ default:
+ sample_locs = &sample_locs_1x;
+ break;
+ case 2:
+ sample_locs = &sample_locs_2x;
+ break;
+ case 4:
+ sample_locs = &sample_locs_4x;
+ break;
+ case 8:
+ sample_locs = sample_locs_8x;
+ break;
+ case 16:
+ sample_locs = sample_locs_16x;
+ break;
+ }
+
+ out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
+ out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
}
-static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs,
- uint64_t centroid_priority,
- uint32_t sample_locs)
+static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority,
+ uint32_t sample_locs)
{
- radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
- radeon_emit(cs, centroid_priority);
- radeon_emit(cs, centroid_priority >> 32);
- radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
- radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
- radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
- radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
+ radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
+ radeon_emit(cs, centroid_priority);
+ radeon_emit(cs, centroid_priority >> 32);
+ radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
+ radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
+ radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
+ radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
}
-static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs,
- uint64_t centroid_priority,
- const uint32_t *sample_locs,
- unsigned num_samples)
+static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority,
+ const uint32_t *sample_locs, unsigned num_samples)
{
- radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
- radeon_emit(cs, centroid_priority);
- radeon_emit(cs, centroid_priority >> 32);
- radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
- num_samples == 8 ? 14 : 16);
- radeon_emit_array(cs, sample_locs, 4);
- radeon_emit_array(cs, sample_locs, 4);
- radeon_emit_array(cs, sample_locs, 4);
- radeon_emit_array(cs, sample_locs, num_samples == 8 ? 2 : 4);
+ radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
+ radeon_emit(cs, centroid_priority);
+ radeon_emit(cs, centroid_priority >> 32);
+ radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
+ num_samples == 8 ? 14 : 16);
+ radeon_emit_array(cs, sample_locs, 4);
+ radeon_emit_array(cs, sample_locs, 4);
+ radeon_emit_array(cs, sample_locs, 4);
+ radeon_emit_array(cs, sample_locs, num_samples == 8 ? 2 : 4);
}
void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples)
{
- switch (nr_samples) {
- default:
- case 1:
- si_emit_max_4_sample_locs(cs, centroid_priority_1x, sample_locs_1x);
- break;
- case 2:
- si_emit_max_4_sample_locs(cs, centroid_priority_2x, sample_locs_2x);
- break;
- case 4:
- si_emit_max_4_sample_locs(cs, centroid_priority_4x, sample_locs_4x);
- break;
- case 8:
- si_emit_max_16_sample_locs(cs, centroid_priority_8x, sample_locs_8x, 8);
- break;
- case 16:
- si_emit_max_16_sample_locs(cs, centroid_priority_16x, sample_locs_16x, 16);
- break;
- }
+ switch (nr_samples) {
+ default:
+ case 1:
+ si_emit_max_4_sample_locs(cs, centroid_priority_1x, sample_locs_1x);
+ break;
+ case 2:
+ si_emit_max_4_sample_locs(cs, centroid_priority_2x, sample_locs_2x);
+ break;
+ case 4:
+ si_emit_max_4_sample_locs(cs, centroid_priority_4x, sample_locs_4x);
+ break;
+ case 8:
+ si_emit_max_16_sample_locs(cs, centroid_priority_8x, sample_locs_8x, 8);
+ break;
+ case 16:
+ si_emit_max_16_sample_locs(cs, centroid_priority_16x, sample_locs_16x, 16);
+ break;
+ }
}
void si_init_msaa_functions(struct si_context *sctx)
{
- int i;
+ int i;
- sctx->b.get_sample_position = si_get_sample_position;
+ sctx->b.get_sample_position = si_get_sample_position;
- si_get_sample_position(&sctx->b, 1, 0, sctx->sample_positions.x1[0]);
+ si_get_sample_position(&sctx->b, 1, 0, sctx->sample_positions.x1[0]);
- for (i = 0; i < 2; i++)
- si_get_sample_position(&sctx->b, 2, i, sctx->sample_positions.x2[i]);
- for (i = 0; i < 4; i++)
- si_get_sample_position(&sctx->b, 4, i, sctx->sample_positions.x4[i]);
- for (i = 0; i < 8; i++)
- si_get_sample_position(&sctx->b, 8, i, sctx->sample_positions.x8[i]);
- for (i = 0; i < 16; i++)
- si_get_sample_position(&sctx->b, 16, i, sctx->sample_positions.x16[i]);
+ for (i = 0; i < 2; i++)
+ si_get_sample_position(&sctx->b, 2, i, sctx->sample_positions.x2[i]);
+ for (i = 0; i < 4; i++)
+ si_get_sample_position(&sctx->b, 4, i, sctx->sample_positions.x4[i]);
+ for (i = 0; i < 8; i++)
+ si_get_sample_position(&sctx->b, 8, i, sctx->sample_positions.x8[i]);
+ for (i = 0; i < 16; i++)
+ si_get_sample_position(&sctx->b, 16, i, sctx->sample_positions.x16[i]);
}
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "si_build_pm4.h"
-#include "sid.h"
-
+#include "ac_exp_param.h"
+#include "ac_shader_util.h"
#include "compiler/nir/nir_serialize.h"
#include "nir/tgsi_to_nir.h"
-#include "util/hash_table.h"
+#include "si_build_pm4.h"
+#include "sid.h"
#include "util/crc32.h"
+#include "util/disk_cache.h"
+#include "util/hash_table.h"
+#include "util/mesa-sha1.h"
#include "util/u_async_debug.h"
#include "util/u_memory.h"
#include "util/u_prim.h"
-#include "util/disk_cache.h"
-#include "util/mesa-sha1.h"
-#include "ac_exp_param.h"
-#include "ac_shader_util.h"
-
/* SHADER_CACHE */
/**
* Return the IR key for the shader cache.
*/
void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
- unsigned char ir_sha1_cache_key[20])
-{
- struct blob blob = {};
- unsigned ir_size;
- void *ir_binary;
-
- if (sel->nir_binary) {
- ir_binary = sel->nir_binary;
- ir_size = sel->nir_size;
- } else {
- assert(sel->nir);
-
- blob_init(&blob);
- nir_serialize(&blob, sel->nir, true);
- ir_binary = blob.data;
- ir_size = blob.size;
- }
-
- /* These settings affect the compilation, but they are not derived
- * from the input shader IR.
- */
- unsigned shader_variant_flags = 0;
-
- if (ngg)
- shader_variant_flags |= 1 << 0;
- if (sel->nir)
- shader_variant_flags |= 1 << 1;
- if (si_get_wave_size(sel->screen, sel->type, ngg, es, false) == 32)
- shader_variant_flags |= 1 << 2;
- if (sel->type == PIPE_SHADER_FRAGMENT &&
- sel->info.uses_derivatives &&
- sel->info.uses_kill &&
- sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
- shader_variant_flags |= 1 << 3;
-
- /* This varies depending on whether compute-based culling is enabled. */
- shader_variant_flags |= sel->screen->num_vbos_in_user_sgprs << 4;
-
- struct mesa_sha1 ctx;
- _mesa_sha1_init(&ctx);
- _mesa_sha1_update(&ctx, &shader_variant_flags, 4);
- _mesa_sha1_update(&ctx, ir_binary, ir_size);
- if (sel->type == PIPE_SHADER_VERTEX ||
- sel->type == PIPE_SHADER_TESS_EVAL ||
- sel->type == PIPE_SHADER_GEOMETRY)
- _mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so));
- _mesa_sha1_final(&ctx, ir_sha1_cache_key);
-
- if (ir_binary == blob.data)
- blob_finish(&blob);
+ unsigned char ir_sha1_cache_key[20])
+{
+ struct blob blob = {};
+ unsigned ir_size;
+ void *ir_binary;
+
+ if (sel->nir_binary) {
+ ir_binary = sel->nir_binary;
+ ir_size = sel->nir_size;
+ } else {
+ assert(sel->nir);
+
+ blob_init(&blob);
+ nir_serialize(&blob, sel->nir, true);
+ ir_binary = blob.data;
+ ir_size = blob.size;
+ }
+
+ /* These settings affect the compilation, but they are not derived
+ * from the input shader IR.
+ */
+ unsigned shader_variant_flags = 0;
+
+ if (ngg)
+ shader_variant_flags |= 1 << 0;
+ if (sel->nir)
+ shader_variant_flags |= 1 << 1;
+ if (si_get_wave_size(sel->screen, sel->type, ngg, es, false) == 32)
+ shader_variant_flags |= 1 << 2;
+ if (sel->type == PIPE_SHADER_FRAGMENT && sel->info.uses_derivatives && sel->info.uses_kill &&
+ sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
+ shader_variant_flags |= 1 << 3;
+
+ /* This varies depending on whether compute-based culling is enabled. */
+ shader_variant_flags |= sel->screen->num_vbos_in_user_sgprs << 4;
+
+ struct mesa_sha1 ctx;
+ _mesa_sha1_init(&ctx);
+ _mesa_sha1_update(&ctx, &shader_variant_flags, 4);
+ _mesa_sha1_update(&ctx, ir_binary, ir_size);
+ if (sel->type == PIPE_SHADER_VERTEX || sel->type == PIPE_SHADER_TESS_EVAL ||
+ sel->type == PIPE_SHADER_GEOMETRY)
+ _mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so));
+ _mesa_sha1_final(&ctx, ir_sha1_cache_key);
+
+ if (ir_binary == blob.data)
+ blob_finish(&blob);
}
/** Copy "data" to "ptr" and return the next dword following copied data. */
static uint32_t *write_data(uint32_t *ptr, const void *data, unsigned size)
{
- /* data may be NULL if size == 0 */
- if (size)
- memcpy(ptr, data, size);
- ptr += DIV_ROUND_UP(size, 4);
- return ptr;
+ /* data may be NULL if size == 0 */
+ if (size)
+ memcpy(ptr, data, size);
+ ptr += DIV_ROUND_UP(size, 4);
+ return ptr;
}
/** Read data from "ptr". Return the next dword following the data. */
static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size)
{
- memcpy(data, ptr, size);
- ptr += DIV_ROUND_UP(size, 4);
- return ptr;
+ memcpy(data, ptr, size);
+ ptr += DIV_ROUND_UP(size, 4);
+ return ptr;
}
/**
*/
static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size)
{
- *ptr++ = size;
- return write_data(ptr, data, size);
+ *ptr++ = size;
+ return write_data(ptr, data, size);
}
/**
*/
static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size)
{
- *size = *ptr++;
- assert(*data == NULL);
- if (!*size)
- return ptr;
- *data = malloc(*size);
- return read_data(ptr, *data, *size);
+ *size = *ptr++;
+ assert(*data == NULL);
+ if (!*size)
+ return ptr;
+ *data = malloc(*size);
+ return read_data(ptr, *data, *size);
}
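/* Minimal round-trip sketch for the chunk helpers above. It assumes it sits
 * in the same file as write_chunk/read_chunk and that DIV_ROUND_UP is
 * available (it is used by the helpers themselves); the payload and buffer
 * size are illustrative. */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static void chunk_roundtrip_example(void)
{
   const char payload[] = "radeonsi";
   uint32_t buf[8] = {0};

   /* A chunk is one size dword followed by the data padded to a dword. */
   uint32_t *end = write_chunk(buf, payload, sizeof(payload));
   assert(end == buf + 1 + DIV_ROUND_UP(sizeof(payload), 4));

   /* read_chunk allocates the destination and returns the following dword. */
   void *copy = NULL;
   unsigned size = 0;
   read_chunk(buf, &copy, &size);
   assert(size == sizeof(payload) && memcmp(copy, payload, size) == 0);
   free(copy);
}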
/**
*/
static void *si_get_shader_binary(struct si_shader *shader)
{
- /* There is always a size of data followed by the data itself. */
- unsigned llvm_ir_size = shader->binary.llvm_ir_string ?
- strlen(shader->binary.llvm_ir_string) + 1 : 0;
-
- /* Refuse to allocate overly large buffers and guard against integer
- * overflow. */
- if (shader->binary.elf_size > UINT_MAX / 4 ||
- llvm_ir_size > UINT_MAX / 4)
- return NULL;
-
- unsigned size =
- 4 + /* total size */
- 4 + /* CRC32 of the data below */
- align(sizeof(shader->config), 4) +
- align(sizeof(shader->info), 4) +
- 4 + align(shader->binary.elf_size, 4) +
- 4 + align(llvm_ir_size, 4);
- void *buffer = CALLOC(1, size);
- uint32_t *ptr = (uint32_t*)buffer;
-
- if (!buffer)
- return NULL;
-
- *ptr++ = size;
- ptr++; /* CRC32 is calculated at the end. */
-
- ptr = write_data(ptr, &shader->config, sizeof(shader->config));
- ptr = write_data(ptr, &shader->info, sizeof(shader->info));
- ptr = write_chunk(ptr, shader->binary.elf_buffer, shader->binary.elf_size);
- ptr = write_chunk(ptr, shader->binary.llvm_ir_string, llvm_ir_size);
- assert((char *)ptr - (char *)buffer == size);
-
- /* Compute CRC32. */
- ptr = (uint32_t*)buffer;
- ptr++;
- *ptr = util_hash_crc32(ptr + 1, size - 8);
-
- return buffer;
+ /* There is always a size of data followed by the data itself. */
+ unsigned llvm_ir_size =
+ shader->binary.llvm_ir_string ? strlen(shader->binary.llvm_ir_string) + 1 : 0;
+
+ /* Refuse to allocate overly large buffers and guard against integer
+ * overflow. */
+ if (shader->binary.elf_size > UINT_MAX / 4 || llvm_ir_size > UINT_MAX / 4)
+ return NULL;
+
+ unsigned size = 4 + /* total size */
+ 4 + /* CRC32 of the data below */
+ align(sizeof(shader->config), 4) + align(sizeof(shader->info), 4) + 4 +
+ align(shader->binary.elf_size, 4) + 4 + align(llvm_ir_size, 4);
+ void *buffer = CALLOC(1, size);
+ uint32_t *ptr = (uint32_t *)buffer;
+
+ if (!buffer)
+ return NULL;
+
+ *ptr++ = size;
+ ptr++; /* CRC32 is calculated at the end. */
+
+ ptr = write_data(ptr, &shader->config, sizeof(shader->config));
+ ptr = write_data(ptr, &shader->info, sizeof(shader->info));
+ ptr = write_chunk(ptr, shader->binary.elf_buffer, shader->binary.elf_size);
+ ptr = write_chunk(ptr, shader->binary.llvm_ir_string, llvm_ir_size);
+ assert((char *)ptr - (char *)buffer == size);
+
+ /* Compute CRC32. */
+ ptr = (uint32_t *)buffer;
+ ptr++;
+ *ptr = util_hash_crc32(ptr + 1, size - 8);
+
+ return buffer;
}
static bool si_load_shader_binary(struct si_shader *shader, void *binary)
{
- uint32_t *ptr = (uint32_t*)binary;
- uint32_t size = *ptr++;
- uint32_t crc32 = *ptr++;
- unsigned chunk_size;
- unsigned elf_size;
-
- if (util_hash_crc32(ptr, size - 8) != crc32) {
- fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n");
- return false;
- }
-
- ptr = read_data(ptr, &shader->config, sizeof(shader->config));
- ptr = read_data(ptr, &shader->info, sizeof(shader->info));
- ptr = read_chunk(ptr, (void**)&shader->binary.elf_buffer,
- &elf_size);
- shader->binary.elf_size = elf_size;
- ptr = read_chunk(ptr, (void**)&shader->binary.llvm_ir_string, &chunk_size);
-
- return true;
+ uint32_t *ptr = (uint32_t *)binary;
+ uint32_t size = *ptr++;
+ uint32_t crc32 = *ptr++;
+ unsigned chunk_size;
+ unsigned elf_size;
+
+ if (util_hash_crc32(ptr, size - 8) != crc32) {
+ fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n");
+ return false;
+ }
+
+ ptr = read_data(ptr, &shader->config, sizeof(shader->config));
+ ptr = read_data(ptr, &shader->info, sizeof(shader->info));
+ ptr = read_chunk(ptr, (void **)&shader->binary.elf_buffer, &elf_size);
+ shader->binary.elf_size = elf_size;
+ ptr = read_chunk(ptr, (void **)&shader->binary.llvm_ir_string, &chunk_size);
+
+ return true;
}
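/* Stand-alone sketch of the integrity check performed by
 * si_load_shader_binary above: the first dword is the total blob size, the
 * second is a CRC32 over the remaining size - 8 bytes. cache_blob_looks_valid
 * is a hypothetical helper for illustration; util_hash_crc32 is the
 * util/crc32.h helper already used above. */
static bool cache_blob_looks_valid(const void *binary, size_t available)
{
   const uint32_t *ptr = (const uint32_t *)binary;

   if (available < 2 * sizeof(uint32_t))
      return false;

   uint32_t size = ptr[0];  /* total size written by si_get_shader_binary */
   uint32_t crc32 = ptr[1]; /* CRC32 of everything that follows */

   if (size < 2 * sizeof(uint32_t) || size > available)
      return false;

   return util_hash_crc32(ptr + 2, size - 8) == crc32;
}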
/**
* Insert a shader into the cache. It's assumed the shader is not in the cache.
* Use si_shader_cache_load_shader before calling this.
*/
-void si_shader_cache_insert_shader(struct si_screen *sscreen,
- unsigned char ir_sha1_cache_key[20],
- struct si_shader *shader,
- bool insert_into_disk_cache)
-{
- void *hw_binary;
- struct hash_entry *entry;
- uint8_t key[CACHE_KEY_SIZE];
-
- entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
- if (entry)
- return; /* already added */
-
- hw_binary = si_get_shader_binary(shader);
- if (!hw_binary)
- return;
-
- if (_mesa_hash_table_insert(sscreen->shader_cache,
- mem_dup(ir_sha1_cache_key, 20),
- hw_binary) == NULL) {
- FREE(hw_binary);
- return;
- }
-
- if (sscreen->disk_shader_cache && insert_into_disk_cache) {
- disk_cache_compute_key(sscreen->disk_shader_cache,
- ir_sha1_cache_key, 20, key);
- disk_cache_put(sscreen->disk_shader_cache, key, hw_binary,
- *((uint32_t *) hw_binary), NULL);
- }
-}
-
-bool si_shader_cache_load_shader(struct si_screen *sscreen,
- unsigned char ir_sha1_cache_key[20],
- struct si_shader *shader)
-{
- struct hash_entry *entry =
- _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
-
- if (entry) {
- if (si_load_shader_binary(shader, entry->data)) {
- p_atomic_inc(&sscreen->num_memory_shader_cache_hits);
- return true;
- }
- }
- p_atomic_inc(&sscreen->num_memory_shader_cache_misses);
-
- if (!sscreen->disk_shader_cache)
- return false;
-
- unsigned char sha1[CACHE_KEY_SIZE];
- disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key,
- 20, sha1);
-
- size_t binary_size;
- uint8_t *buffer = disk_cache_get(sscreen->disk_shader_cache, sha1,
- &binary_size);
- if (buffer) {
- if (binary_size >= sizeof(uint32_t) &&
- *((uint32_t*)buffer) == binary_size) {
- if (si_load_shader_binary(shader, buffer)) {
- free(buffer);
- si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key,
- shader, false);
- p_atomic_inc(&sscreen->num_disk_shader_cache_hits);
- return true;
- }
- } else {
- /* Something has gone wrong discard the item from the cache and
- * rebuild/link from source.
- */
- assert(!"Invalid radeonsi shader disk cache item!");
- disk_cache_remove(sscreen->disk_shader_cache, sha1);
- }
- }
-
- free(buffer);
- p_atomic_inc(&sscreen->num_disk_shader_cache_misses);
- return false;
+void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
+ struct si_shader *shader, bool insert_into_disk_cache)
+{
+ void *hw_binary;
+ struct hash_entry *entry;
+ uint8_t key[CACHE_KEY_SIZE];
+
+ entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
+ if (entry)
+ return; /* already added */
+
+ hw_binary = si_get_shader_binary(shader);
+ if (!hw_binary)
+ return;
+
+ if (_mesa_hash_table_insert(sscreen->shader_cache, mem_dup(ir_sha1_cache_key, 20), hw_binary) ==
+ NULL) {
+ FREE(hw_binary);
+ return;
+ }
+
+ if (sscreen->disk_shader_cache && insert_into_disk_cache) {
+ disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, key);
+ disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, *((uint32_t *)hw_binary), NULL);
+ }
+}
+
+bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
+ struct si_shader *shader)
+{
+ struct hash_entry *entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
+
+ if (entry) {
+ if (si_load_shader_binary(shader, entry->data)) {
+ p_atomic_inc(&sscreen->num_memory_shader_cache_hits);
+ return true;
+ }
+ }
+ p_atomic_inc(&sscreen->num_memory_shader_cache_misses);
+
+ if (!sscreen->disk_shader_cache)
+ return false;
+
+ unsigned char sha1[CACHE_KEY_SIZE];
+ disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, sha1);
+
+ size_t binary_size;
+ uint8_t *buffer = disk_cache_get(sscreen->disk_shader_cache, sha1, &binary_size);
+ if (buffer) {
+ if (binary_size >= sizeof(uint32_t) && *((uint32_t *)buffer) == binary_size) {
+ if (si_load_shader_binary(shader, buffer)) {
+ free(buffer);
+ si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, false);
+ p_atomic_inc(&sscreen->num_disk_shader_cache_hits);
+ return true;
+ }
+ } else {
+ /* Something has gone wrong; discard the item from the cache and
+ * rebuild/link from source.
+ */
+ assert(!"Invalid radeonsi shader disk cache item!");
+ disk_cache_remove(sscreen->disk_shader_cache, sha1);
+ }
+ }
+
+ free(buffer);
+ p_atomic_inc(&sscreen->num_disk_shader_cache_misses);
+ return false;
}
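/* Minimal sketch of the intended caller flow for the two cache functions
 * above: compute the IR key, try the in-memory/disk caches, and insert only
 * after a cache miss has been compiled. The locking placement and the elided
 * compile step are illustrative assumptions, not a copy of the real call
 * site. */
static void compile_with_shader_cache(struct si_screen *sscreen, struct si_shader_selector *sel,
                                      struct si_shader *shader, bool ngg, bool es)
{
   unsigned char ir_sha1_cache_key[20];

   si_get_ir_cache_key(sel, ngg, es, ir_sha1_cache_key);

   simple_mtx_lock(&sscreen->shader_cache_mutex);
   bool hit = si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader);
   simple_mtx_unlock(&sscreen->shader_cache_mutex);

   if (!hit) {
      /* ... compile "shader" here ... */

      simple_mtx_lock(&sscreen->shader_cache_mutex);
      si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, true);
      simple_mtx_unlock(&sscreen->shader_cache_mutex);
   }
}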
static uint32_t si_shader_cache_key_hash(const void *key)
{
- /* Take the first dword of SHA1. */
- return *(uint32_t*)key;
+ /* Take the first dword of SHA1. */
+ return *(uint32_t *)key;
}
static bool si_shader_cache_key_equals(const void *a, const void *b)
{
- /* Compare SHA1s. */
- return memcmp(a, b, 20) == 0;
+ /* Compare SHA1s. */
+ return memcmp(a, b, 20) == 0;
}
static void si_destroy_shader_cache_entry(struct hash_entry *entry)
{
- FREE((void*)entry->key);
- FREE(entry->data);
+ FREE((void *)entry->key);
+ FREE(entry->data);
}
bool si_init_shader_cache(struct si_screen *sscreen)
{
- (void) simple_mtx_init(&sscreen->shader_cache_mutex, mtx_plain);
- sscreen->shader_cache =
- _mesa_hash_table_create(NULL,
- si_shader_cache_key_hash,
- si_shader_cache_key_equals);
+ (void)simple_mtx_init(&sscreen->shader_cache_mutex, mtx_plain);
+ sscreen->shader_cache =
+ _mesa_hash_table_create(NULL, si_shader_cache_key_hash, si_shader_cache_key_equals);
- return sscreen->shader_cache != NULL;
+ return sscreen->shader_cache != NULL;
}
void si_destroy_shader_cache(struct si_screen *sscreen)
{
- if (sscreen->shader_cache)
- _mesa_hash_table_destroy(sscreen->shader_cache,
- si_destroy_shader_cache_entry);
- simple_mtx_destroy(&sscreen->shader_cache_mutex);
+ if (sscreen->shader_cache)
+ _mesa_hash_table_destroy(sscreen->shader_cache, si_destroy_shader_cache_entry);
+ simple_mtx_destroy(&sscreen->shader_cache_mutex);
}
/* SHADER STATES */
-static void si_set_tesseval_regs(struct si_screen *sscreen,
- const struct si_shader_selector *tes,
- struct si_pm4_state *pm4)
-{
- const struct si_shader_info *info = &tes->info;
- unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE];
- unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING];
- bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW];
- bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE];
- unsigned type, partitioning, topology, distribution_mode;
-
- switch (tes_prim_mode) {
- case PIPE_PRIM_LINES:
- type = V_028B6C_TESS_ISOLINE;
- break;
- case PIPE_PRIM_TRIANGLES:
- type = V_028B6C_TESS_TRIANGLE;
- break;
- case PIPE_PRIM_QUADS:
- type = V_028B6C_TESS_QUAD;
- break;
- default:
- assert(0);
- return;
- }
-
- switch (tes_spacing) {
- case PIPE_TESS_SPACING_FRACTIONAL_ODD:
- partitioning = V_028B6C_PART_FRAC_ODD;
- break;
- case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
- partitioning = V_028B6C_PART_FRAC_EVEN;
- break;
- case PIPE_TESS_SPACING_EQUAL:
- partitioning = V_028B6C_PART_INTEGER;
- break;
- default:
- assert(0);
- return;
- }
-
- if (tes_point_mode)
- topology = V_028B6C_OUTPUT_POINT;
- else if (tes_prim_mode == PIPE_PRIM_LINES)
- topology = V_028B6C_OUTPUT_LINE;
- else if (tes_vertex_order_cw)
- /* for some reason, this must be the other way around */
- topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
- else
- topology = V_028B6C_OUTPUT_TRIANGLE_CW;
-
- if (sscreen->info.has_distributed_tess) {
- if (sscreen->info.family == CHIP_FIJI ||
- sscreen->info.family >= CHIP_POLARIS10)
- distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS;
- else
- distribution_mode = V_028B6C_DISTRIBUTION_MODE_DONUTS;
- } else
- distribution_mode = V_028B6C_DISTRIBUTION_MODE_NO_DIST;
-
- assert(pm4->shader);
- pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) |
- S_028B6C_PARTITIONING(partitioning) |
- S_028B6C_TOPOLOGY(topology) |
- S_028B6C_DISTRIBUTION_MODE(distribution_mode);
+static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shader_selector *tes,
+ struct si_pm4_state *pm4)
+{
+ const struct si_shader_info *info = &tes->info;
+ unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE];
+ unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING];
+ bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW];
+ bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE];
+ unsigned type, partitioning, topology, distribution_mode;
+
+ switch (tes_prim_mode) {
+ case PIPE_PRIM_LINES:
+ type = V_028B6C_TESS_ISOLINE;
+ break;
+ case PIPE_PRIM_TRIANGLES:
+ type = V_028B6C_TESS_TRIANGLE;
+ break;
+ case PIPE_PRIM_QUADS:
+ type = V_028B6C_TESS_QUAD;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ switch (tes_spacing) {
+ case PIPE_TESS_SPACING_FRACTIONAL_ODD:
+ partitioning = V_028B6C_PART_FRAC_ODD;
+ break;
+ case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
+ partitioning = V_028B6C_PART_FRAC_EVEN;
+ break;
+ case PIPE_TESS_SPACING_EQUAL:
+ partitioning = V_028B6C_PART_INTEGER;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ if (tes_point_mode)
+ topology = V_028B6C_OUTPUT_POINT;
+ else if (tes_prim_mode == PIPE_PRIM_LINES)
+ topology = V_028B6C_OUTPUT_LINE;
+ else if (tes_vertex_order_cw)
+ /* for some reason, this must be the other way around */
+ topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
+ else
+ topology = V_028B6C_OUTPUT_TRIANGLE_CW;
+
+ if (sscreen->info.has_distributed_tess) {
+ if (sscreen->info.family == CHIP_FIJI || sscreen->info.family >= CHIP_POLARIS10)
+ distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS;
+ else
+ distribution_mode = V_028B6C_DISTRIBUTION_MODE_DONUTS;
+ } else
+ distribution_mode = V_028B6C_DISTRIBUTION_MODE_NO_DIST;
+
+ assert(pm4->shader);
+ pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) |
+ S_028B6C_TOPOLOGY(topology) |
+ S_028B6C_DISTRIBUTION_MODE(distribution_mode);
}
/* Polaris needs different VTX_REUSE_DEPTH settings depending on
*
* If "shader" is NULL, it's assumed it's not LS or GS copy shader.
*/
-static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen,
- struct si_shader_selector *sel,
- struct si_shader *shader,
- struct si_pm4_state *pm4)
+static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_shader_selector *sel,
+ struct si_shader *shader, struct si_pm4_state *pm4)
{
- unsigned type = sel->type;
-
- if (sscreen->info.family < CHIP_POLARIS10 ||
- sscreen->info.chip_class >= GFX10)
- return;
-
- /* VS as VS, or VS as ES: */
- if ((type == PIPE_SHADER_VERTEX &&
- (!shader ||
- (!shader->key.as_ls && !shader->is_gs_copy_shader))) ||
- /* TES as VS, or TES as ES: */
- type == PIPE_SHADER_TESS_EVAL) {
- unsigned vtx_reuse_depth = 30;
-
- if (type == PIPE_SHADER_TESS_EVAL &&
- sel->info.properties[TGSI_PROPERTY_TES_SPACING] ==
- PIPE_TESS_SPACING_FRACTIONAL_ODD)
- vtx_reuse_depth = 14;
-
- assert(pm4->shader);
- pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth;
- }
+ unsigned type = sel->type;
+
+ if (sscreen->info.family < CHIP_POLARIS10 || sscreen->info.chip_class >= GFX10)
+ return;
+
+ /* VS as VS, or VS as ES: */
+ if ((type == PIPE_SHADER_VERTEX &&
+ (!shader || (!shader->key.as_ls && !shader->is_gs_copy_shader))) ||
+ /* TES as VS, or TES as ES: */
+ type == PIPE_SHADER_TESS_EVAL) {
+ unsigned vtx_reuse_depth = 30;
+
+ if (type == PIPE_SHADER_TESS_EVAL &&
+ sel->info.properties[TGSI_PROPERTY_TES_SPACING] == PIPE_TESS_SPACING_FRACTIONAL_ODD)
+ vtx_reuse_depth = 14;
+
+ assert(pm4->shader);
+ pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth;
+ }
}
static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader)
{
- if (shader->pm4)
- si_pm4_clear_state(shader->pm4);
- else
- shader->pm4 = CALLOC_STRUCT(si_pm4_state);
-
- if (shader->pm4) {
- shader->pm4->shader = shader;
- return shader->pm4;
- } else {
- fprintf(stderr, "radeonsi: Failed to create pm4 state.\n");
- return NULL;
- }
+ if (shader->pm4)
+ si_pm4_clear_state(shader->pm4);
+ else
+ shader->pm4 = CALLOC_STRUCT(si_pm4_state);
+
+ if (shader->pm4) {
+ shader->pm4->shader = shader;
+ return shader->pm4;
+ } else {
+ fprintf(stderr, "radeonsi: Failed to create pm4 state.\n");
+ return NULL;
+ }
}
static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader,
- unsigned num_always_on_user_sgprs)
+ unsigned num_always_on_user_sgprs)
{
- struct si_shader_selector *vs = shader->previous_stage_sel ?
- shader->previous_stage_sel : shader->selector;
- unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs;
+ struct si_shader_selector *vs =
+ shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector;
+ unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs;
- /* 1 SGPR is reserved for the vertex buffer pointer. */
- assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1);
+ /* 1 SGPR is reserved for the vertex buffer pointer. */
+ assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1);
- if (num_vbos_in_user_sgprs)
- return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4;
+ if (num_vbos_in_user_sgprs)
+ return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4;
- /* Add the pointer to VBO descriptors. */
- return num_always_on_user_sgprs + 1;
+ /* Add the pointer to VBO descriptors. */
+ return num_always_on_user_sgprs + 1;
}
/* Return VGPR_COMP_CNT for the API vertex shader. This can be hw LS, LSHS, ES, ESGS, VS. */
-static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen,
- struct si_shader *shader, bool legacy_vs_prim_id)
-{
- assert(shader->selector->type == PIPE_SHADER_VERTEX ||
- (shader->previous_stage_sel &&
- shader->previous_stage_sel->type == PIPE_SHADER_VERTEX));
-
- /* GFX6-9 LS (VertexID, RelAutoindex, InstanceID / StepRate0(==1), ...).
- * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0(==1), VSPrimID, ...)
- * GFX10 LS (VertexID, RelAutoindex, UserVGPR1, InstanceID).
- * GFX10 ES,VS (VertexID, UserVGPR0, UserVGPR1 or VSPrimID, UserVGPR2 or InstanceID)
- */
- bool is_ls = shader->selector->type == PIPE_SHADER_TESS_CTRL || shader->key.as_ls;
-
- if (sscreen->info.chip_class >= GFX10 && shader->info.uses_instanceid)
- return 3;
- else if ((is_ls && shader->info.uses_instanceid) || legacy_vs_prim_id)
- return 2;
- else if (is_ls || shader->info.uses_instanceid)
- return 1;
- else
- return 0;
+static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_shader *shader,
+ bool legacy_vs_prim_id)
+{
+ assert(shader->selector->type == PIPE_SHADER_VERTEX ||
+ (shader->previous_stage_sel && shader->previous_stage_sel->type == PIPE_SHADER_VERTEX));
+
+ /* GFX6-9 LS (VertexID, RelAutoindex, InstanceID / StepRate0(==1), ...).
+ * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0(==1), VSPrimID, ...)
+ * GFX10 LS (VertexID, RelAutoindex, UserVGPR1, InstanceID).
+ * GFX10 ES,VS (VertexID, UserVGPR0, UserVGPR1 or VSPrimID, UserVGPR2 or
+ * InstanceID)
+ */
+ bool is_ls = shader->selector->type == PIPE_SHADER_TESS_CTRL || shader->key.as_ls;
+
+ if (sscreen->info.chip_class >= GFX10 && shader->info.uses_instanceid)
+ return 3;
+ else if ((is_ls && shader->info.uses_instanceid) || legacy_vs_prim_id)
+ return 2;
+ else if (is_ls || shader->info.uses_instanceid)
+ return 1;
+ else
+ return 0;
}
static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
{
- struct si_pm4_state *pm4;
- uint64_t va;
+ struct si_pm4_state *pm4;
+ uint64_t va;
- assert(sscreen->info.chip_class <= GFX8);
+ assert(sscreen->info.chip_class <= GFX8);
- pm4 = si_get_shader_pm4_state(shader);
- if (!pm4)
- return;
+ pm4 = si_get_shader_pm4_state(shader);
+ if (!pm4)
+ return;
- va = shader->bo->gpu_address;
- si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+ va = shader->bo->gpu_address;
+ si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
- si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
- si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40));
+ si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
+ si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40));
- shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
- S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) |
- S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) |
- S_00B528_DX10_CLAMP(1) |
- S_00B528_FLOAT_MODE(shader->config.float_mode);
- shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) |
- S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+ shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
+ S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) |
+ S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) |
+ S_00B528_DX10_CLAMP(1) | S_00B528_FLOAT_MODE(shader->config.float_mode);
+ shader->config.rsrc2 =
+ S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) |
+ S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
}
static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
{
- struct si_pm4_state *pm4;
- uint64_t va;
-
- pm4 = si_get_shader_pm4_state(shader);
- if (!pm4)
- return;
-
- va = shader->bo->gpu_address;
- si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-
- if (sscreen->info.chip_class >= GFX9) {
- if (sscreen->info.chip_class >= GFX10) {
- si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
- si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40));
- } else {
- si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
- si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, S_00B414_MEM_BASE(va >> 40));
- }
-
- unsigned num_user_sgprs =
- si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR);
-
- shader->config.rsrc2 =
- S_00B42C_USER_SGPR(num_user_sgprs) |
- S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
-
- if (sscreen->info.chip_class >= GFX10)
- shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
- else
- shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
- } else {
- si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
- si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, S_00B424_MEM_BASE(va >> 40));
-
- shader->config.rsrc2 =
- S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) |
- S_00B42C_OC_LDS_EN(1) |
- S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
- }
-
- si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
- S_00B428_VGPRS((shader->config.num_vgprs - 1) /
- (sscreen->ge_wave_size == 32 ? 8 : 4)) |
- (sscreen->info.chip_class <= GFX9 ?
- S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8) : 0) |
- S_00B428_DX10_CLAMP(1) |
- S_00B428_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
- S_00B428_WGP_MODE(sscreen->info.chip_class >= GFX10) |
- S_00B428_FLOAT_MODE(shader->config.float_mode) |
- S_00B428_LS_VGPR_COMP_CNT(sscreen->info.chip_class >= GFX9 ?
- si_get_vs_vgpr_comp_cnt(sscreen, shader, false) : 0));
-
- if (sscreen->info.chip_class <= GFX8) {
- si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
- shader->config.rsrc2);
- }
+ struct si_pm4_state *pm4;
+ uint64_t va;
+
+ pm4 = si_get_shader_pm4_state(shader);
+ if (!pm4)
+ return;
+
+ va = shader->bo->gpu_address;
+ si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+
+ if (sscreen->info.chip_class >= GFX9) {
+ if (sscreen->info.chip_class >= GFX10) {
+ si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
+ si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40));
+ } else {
+ si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
+ si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, S_00B414_MEM_BASE(va >> 40));
+ }
+
+ unsigned num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR);
+
+ shader->config.rsrc2 = S_00B42C_USER_SGPR(num_user_sgprs) |
+ S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+
+ if (sscreen->info.chip_class >= GFX10)
+ shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
+ else
+ shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
+ } else {
+ si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
+ si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, S_00B424_MEM_BASE(va >> 40));
+
+ shader->config.rsrc2 = S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) | S_00B42C_OC_LDS_EN(1) |
+ S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+ }
+
+ si_pm4_set_reg(
+ pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
+ S_00B428_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
+ (sscreen->info.chip_class <= GFX9 ? S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8)
+ : 0) |
+ S_00B428_DX10_CLAMP(1) | S_00B428_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
+ S_00B428_WGP_MODE(sscreen->info.chip_class >= GFX10) |
+ S_00B428_FLOAT_MODE(shader->config.float_mode) |
+ S_00B428_LS_VGPR_COMP_CNT(sscreen->info.chip_class >= GFX9
+ ? si_get_vs_vgpr_comp_cnt(sscreen, shader, false)
+ : 0));
+
+ if (sscreen->info.chip_class <= GFX8) {
+ si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, shader->config.rsrc2);
+ }
}
static void si_emit_shader_es(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.es->shader;
- unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+ struct si_shader *shader = sctx->queued.named.es->shader;
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
- if (!shader)
- return;
+ if (!shader)
+ return;
- radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
- SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
- shader->selector->esgs_itemsize / 4);
+ radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+ SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
+ shader->selector->esgs_itemsize / 4);
- if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
- radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
- SI_TRACKED_VGT_TF_PARAM,
- shader->vgt_tf_param);
+ if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
+ radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+ shader->vgt_tf_param);
- if (shader->vgt_vertex_reuse_block_cntl)
- radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
- SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
- shader->vgt_vertex_reuse_block_cntl);
+ if (shader->vgt_vertex_reuse_block_cntl)
+ radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
+ SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
+ shader->vgt_vertex_reuse_block_cntl);
- if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll = true;
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll = true;
}
static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
{
- struct si_pm4_state *pm4;
- unsigned num_user_sgprs;
- unsigned vgpr_comp_cnt;
- uint64_t va;
- unsigned oc_lds_en;
-
- assert(sscreen->info.chip_class <= GFX8);
-
- pm4 = si_get_shader_pm4_state(shader);
- if (!pm4)
- return;
-
- pm4->atom.emit = si_emit_shader_es;
- va = shader->bo->gpu_address;
- si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-
- if (shader->selector->type == PIPE_SHADER_VERTEX) {
- vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
- num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
- } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
- vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2;
- num_user_sgprs = SI_TES_NUM_USER_SGPR;
- } else
- unreachable("invalid shader selector type");
-
- oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0;
-
- si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
- si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
- si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
- S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) |
- S_00B328_SGPRS((shader->config.num_sgprs - 1) / 8) |
- S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
- S_00B328_DX10_CLAMP(1) |
- S_00B328_FLOAT_MODE(shader->config.float_mode));
- si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
- S_00B32C_USER_SGPR(num_user_sgprs) |
- S_00B32C_OC_LDS_EN(oc_lds_en) |
- S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
-
- if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
- si_set_tesseval_regs(sscreen, shader->selector, pm4);
-
- polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
-}
-
-void gfx9_get_gs_info(struct si_shader_selector *es,
- struct si_shader_selector *gs,
- struct gfx9_gs_info *out)
-{
- unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1);
- unsigned input_prim = gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
- bool uses_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY &&
- input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
-
- /* All these are in dwords: */
- /* We can't allow using the whole LDS, because GS waves compete with
- * other shader stages for LDS space. */
- const unsigned max_lds_size = 8 * 1024;
- const unsigned esgs_itemsize = es->esgs_itemsize / 4;
- unsigned esgs_lds_size;
-
- /* All these are per subgroup: */
- const unsigned max_out_prims = 32 * 1024;
- const unsigned max_es_verts = 255;
- const unsigned ideal_gs_prims = 64;
- unsigned max_gs_prims, gs_prims;
- unsigned min_es_verts, es_verts, worst_case_es_verts;
-
- if (uses_adjacency || gs_num_invocations > 1)
- max_gs_prims = 127 / gs_num_invocations;
- else
- max_gs_prims = 255;
-
- /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
- * Make sure we don't go over the maximum value.
- */
- if (gs->gs_max_out_vertices > 0) {
- max_gs_prims = MIN2(max_gs_prims,
- max_out_prims /
- (gs->gs_max_out_vertices * gs_num_invocations));
- }
- assert(max_gs_prims > 0);
-
- /* If the primitive has adjacency, halve the number of vertices
- * that will be reused in multiple primitives.
- */
- min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : 1);
-
- gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
- worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
-
- /* Compute ESGS LDS size based on the worst case number of ES vertices
- * needed to create the target number of GS prims per subgroup.
- */
- esgs_lds_size = esgs_itemsize * worst_case_es_verts;
-
- /* If total LDS usage is too big, refactor partitions based on ratio
- * of ESGS item sizes.
- */
- if (esgs_lds_size > max_lds_size) {
- /* Our target GS Prims Per Subgroup was too large. Calculate
- * the maximum number of GS Prims Per Subgroup that will fit
- * into LDS, capped by the maximum that the hardware can support.
- */
- gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)),
- max_gs_prims);
- assert(gs_prims > 0);
- worst_case_es_verts = MIN2(min_es_verts * gs_prims,
- max_es_verts);
-
- esgs_lds_size = esgs_itemsize * worst_case_es_verts;
- assert(esgs_lds_size <= max_lds_size);
- }
-
- /* Now calculate remaining ESGS information. */
- if (esgs_lds_size)
- es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
- else
- es_verts = max_es_verts;
-
- /* Vertices for adjacency primitives are not always reused, so restore
- * it for ES_VERTS_PER_SUBGRP.
- */
- min_es_verts = gs->gs_input_verts_per_prim;
-
- /* For normal primitives, the VGT only checks if they are past the ES
- * verts per subgroup after allocating a full GS primitive and if they
- * are, kick off a new subgroup. But if those additional ES verts are
- * unique (e.g. not reused) we need to make sure there is enough LDS
- * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
- */
- es_verts -= min_es_verts - 1;
-
- out->es_verts_per_subgroup = es_verts;
- out->gs_prims_per_subgroup = gs_prims;
- out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
- out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup *
- gs->gs_max_out_vertices;
- out->esgs_ring_size = 4 * esgs_lds_size;
-
- assert(out->max_prims_per_subgroup <= max_out_prims);
+ struct si_pm4_state *pm4;
+ unsigned num_user_sgprs;
+ unsigned vgpr_comp_cnt;
+ uint64_t va;
+ unsigned oc_lds_en;
+
+ assert(sscreen->info.chip_class <= GFX8);
+
+ pm4 = si_get_shader_pm4_state(shader);
+ if (!pm4)
+ return;
+
+ pm4->atom.emit = si_emit_shader_es;
+ va = shader->bo->gpu_address;
+ si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+
+ if (shader->selector->type == PIPE_SHADER_VERTEX) {
+ vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
+ num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
+ } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
+ vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2;
+ num_user_sgprs = SI_TES_NUM_USER_SGPR;
+ } else
+ unreachable("invalid shader selector type");
+
+ oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0;
+
+ si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
+ si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
+ si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
+ S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) |
+ S_00B328_SGPRS((shader->config.num_sgprs - 1) / 8) |
+ S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B328_DX10_CLAMP(1) |
+ S_00B328_FLOAT_MODE(shader->config.float_mode));
+ si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
+ S_00B32C_USER_SGPR(num_user_sgprs) | S_00B32C_OC_LDS_EN(oc_lds_en) |
+ S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+
+ if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
+ si_set_tesseval_regs(sscreen, shader->selector, pm4);
+
+ polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
+}
+
+void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
+ struct gfx9_gs_info *out)
+{
+ unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1);
+ unsigned input_prim = gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
+ bool uses_adjacency =
+ input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
+
+ /* All these are in dwords: */
+ /* We can't allow using the whole LDS, because GS waves compete with
+ * other shader stages for LDS space. */
+ const unsigned max_lds_size = 8 * 1024;
+ const unsigned esgs_itemsize = es->esgs_itemsize / 4;
+ unsigned esgs_lds_size;
+
+ /* All these are per subgroup: */
+ const unsigned max_out_prims = 32 * 1024;
+ const unsigned max_es_verts = 255;
+ const unsigned ideal_gs_prims = 64;
+ unsigned max_gs_prims, gs_prims;
+ unsigned min_es_verts, es_verts, worst_case_es_verts;
+
+ if (uses_adjacency || gs_num_invocations > 1)
+ max_gs_prims = 127 / gs_num_invocations;
+ else
+ max_gs_prims = 255;
+
+ /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
+ * Make sure we don't go over the maximum value.
+ */
+ if (gs->gs_max_out_vertices > 0) {
+ max_gs_prims =
+ MIN2(max_gs_prims, max_out_prims / (gs->gs_max_out_vertices * gs_num_invocations));
+ }
+ assert(max_gs_prims > 0);
+
+ /* If the primitive has adjacency, halve the number of vertices
+ * that will be reused in multiple primitives.
+ */
+ min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : 1);
+
+ gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
+ worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
+
+ /* Compute ESGS LDS size based on the worst case number of ES vertices
+ * needed to create the target number of GS prims per subgroup.
+ */
+ esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+
+ /* If total LDS usage is too big, refactor partitions based on ratio
+ * of ESGS item sizes.
+ */
+ if (esgs_lds_size > max_lds_size) {
+ /* Our target GS Prims Per Subgroup was too large. Calculate
+ * the maximum number of GS Prims Per Subgroup that will fit
+ * into LDS, capped by the maximum that the hardware can support.
+ */
+ gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
+ assert(gs_prims > 0);
+ worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
+
+ esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+ assert(esgs_lds_size <= max_lds_size);
+ }
+
+ /* Now calculate remaining ESGS information. */
+ if (esgs_lds_size)
+ es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
+ else
+ es_verts = max_es_verts;
+
+ /* Vertices for adjacency primitives are not always reused, so restore
+ * it for ES_VERTS_PER_SUBGRP.
+ */
+ min_es_verts = gs->gs_input_verts_per_prim;
+
+ /* For normal primitives, the VGT only checks whether they are past the ES
+ * verts per subgroup after allocating a full GS primitive, and if they
+ * are, it kicks off a new subgroup. But if those additional ES verts are
+ * unique (i.e. not reused), we need to make sure there is enough LDS
+ * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
+ */
+ es_verts -= min_es_verts - 1;
+
+ out->es_verts_per_subgroup = es_verts;
+ out->gs_prims_per_subgroup = gs_prims;
+ out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
+ out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * gs->gs_max_out_vertices;
+ out->esgs_ring_size = 4 * esgs_lds_size;
+
+ assert(out->max_prims_per_subgroup <= max_out_prims);
}
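/* Worked numeric example of the sizing above, as a stand-alone sketch. The
 * inputs (a 16-dword ESGS item, plain triangles, 1 GS invocation, 4 GS output
 * vertices) are illustrative assumptions, not values taken from a real shader
 * pair. */
#include <assert.h>

static void gfx9_gs_sizing_example(void)
{
   const unsigned esgs_itemsize = 16; /* dwords per ES vertex */
   const unsigned min_es_verts = 3;   /* triangles, no adjacency */
   const unsigned gs_prims = 64;      /* ideal_gs_prims; under both caps here */

   /* Worst case: every GS prim consumes min_es_verts unique ES vertices. */
   unsigned worst_case_es_verts = min_es_verts * gs_prims;       /* 192 <= 255 */
   unsigned esgs_lds_size = esgs_itemsize * worst_case_es_verts; /* 3072 dwords */
   assert(esgs_lds_size <= 8 * 1024); /* fits, so no re-partitioning is needed */

   /* ES_VERTS_PER_SUBGRP after the adjustment at the end of the function. */
   unsigned es_verts = esgs_lds_size / esgs_itemsize - (min_es_verts - 1);
   assert(es_verts == 190);

   /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations. */
   assert(gs_prims * 4 * 1 == 256 && 256 <= 32 * 1024);
}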
static void si_emit_shader_gs(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.gs->shader;
- unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-
- if (!shader)
- return;
-
- /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
- * R_028A68_VGT_GSVS_RING_OFFSET_3 */
- radeon_opt_set_context_reg3(sctx, R_028A60_VGT_GSVS_RING_OFFSET_1,
- SI_TRACKED_VGT_GSVS_RING_OFFSET_1,
- shader->ctx_reg.gs.vgt_gsvs_ring_offset_1,
- shader->ctx_reg.gs.vgt_gsvs_ring_offset_2,
- shader->ctx_reg.gs.vgt_gsvs_ring_offset_3);
-
- /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */
- radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE,
- SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
- shader->ctx_reg.gs.vgt_gsvs_ring_itemsize);
-
- /* R_028B38_VGT_GS_MAX_VERT_OUT */
- radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT,
- SI_TRACKED_VGT_GS_MAX_VERT_OUT,
- shader->ctx_reg.gs.vgt_gs_max_vert_out);
-
- /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1
- * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */
- radeon_opt_set_context_reg4(sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE,
- SI_TRACKED_VGT_GS_VERT_ITEMSIZE,
- shader->ctx_reg.gs.vgt_gs_vert_itemsize,
- shader->ctx_reg.gs.vgt_gs_vert_itemsize_1,
- shader->ctx_reg.gs.vgt_gs_vert_itemsize_2,
- shader->ctx_reg.gs.vgt_gs_vert_itemsize_3);
-
- /* R_028B90_VGT_GS_INSTANCE_CNT */
- radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT,
- SI_TRACKED_VGT_GS_INSTANCE_CNT,
- shader->ctx_reg.gs.vgt_gs_instance_cnt);
-
- if (sctx->chip_class >= GFX9) {
- /* R_028A44_VGT_GS_ONCHIP_CNTL */
- radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL,
- SI_TRACKED_VGT_GS_ONCHIP_CNTL,
- shader->ctx_reg.gs.vgt_gs_onchip_cntl);
- /* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */
- radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
- SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
- shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup);
- /* R_028AAC_VGT_ESGS_RING_ITEMSIZE */
- radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
- SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
- shader->ctx_reg.gs.vgt_esgs_ring_itemsize);
-
- if (shader->key.part.gs.es->type == PIPE_SHADER_TESS_EVAL)
- radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
- SI_TRACKED_VGT_TF_PARAM,
- shader->vgt_tf_param);
- if (shader->vgt_vertex_reuse_block_cntl)
- radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
- SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
- shader->vgt_vertex_reuse_block_cntl);
- }
-
- if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll = true;
+ struct si_shader *shader = sctx->queued.named.gs->shader;
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
+ if (!shader)
+ return;
+
+ /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
+ * R_028A68_VGT_GSVS_RING_OFFSET_3 */
+ radeon_opt_set_context_reg3(
+ sctx, R_028A60_VGT_GSVS_RING_OFFSET_1, SI_TRACKED_VGT_GSVS_RING_OFFSET_1,
+ shader->ctx_reg.gs.vgt_gsvs_ring_offset_1, shader->ctx_reg.gs.vgt_gsvs_ring_offset_2,
+ shader->ctx_reg.gs.vgt_gsvs_ring_offset_3);
+
+ /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */
+ radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE,
+ SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
+ shader->ctx_reg.gs.vgt_gsvs_ring_itemsize);
+
+ /* R_028B38_VGT_GS_MAX_VERT_OUT */
+ radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+ shader->ctx_reg.gs.vgt_gs_max_vert_out);
+
+ /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1
+ * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */
+ radeon_opt_set_context_reg4(
+ sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE, SI_TRACKED_VGT_GS_VERT_ITEMSIZE,
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize, shader->ctx_reg.gs.vgt_gs_vert_itemsize_1,
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize_2, shader->ctx_reg.gs.vgt_gs_vert_itemsize_3);
+
+ /* R_028B90_VGT_GS_INSTANCE_CNT */
+ radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
+ shader->ctx_reg.gs.vgt_gs_instance_cnt);
+
+ if (sctx->chip_class >= GFX9) {
+ /* R_028A44_VGT_GS_ONCHIP_CNTL */
+ radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
+ shader->ctx_reg.gs.vgt_gs_onchip_cntl);
+ /* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */
+ radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+ SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+ shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup);
+ /* R_028AAC_VGT_ESGS_RING_ITEMSIZE */
+ radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+ SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
+ shader->ctx_reg.gs.vgt_esgs_ring_itemsize);
+
+ if (shader->key.part.gs.es->type == PIPE_SHADER_TESS_EVAL)
+ radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+ shader->vgt_tf_param);
+ if (shader->vgt_vertex_reuse_block_cntl)
+ radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
+ SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
+ shader->vgt_vertex_reuse_block_cntl);
+ }
+
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll = true;
}
static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
{
- struct si_shader_selector *sel = shader->selector;
- const ubyte *num_components = sel->info.num_stream_output_components;
- unsigned gs_num_invocations = sel->gs_num_invocations;
- struct si_pm4_state *pm4;
- uint64_t va;
- unsigned max_stream = sel->max_gs_stream;
- unsigned offset;
-
- pm4 = si_get_shader_pm4_state(shader);
- if (!pm4)
- return;
-
- pm4->atom.emit = si_emit_shader_gs;
-
- offset = num_components[0] * sel->gs_max_out_vertices;
- shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset;
-
- if (max_stream >= 1)
- offset += num_components[1] * sel->gs_max_out_vertices;
- shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset;
-
- if (max_stream >= 2)
- offset += num_components[2] * sel->gs_max_out_vertices;
- shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset;
-
- if (max_stream >= 3)
- offset += num_components[3] * sel->gs_max_out_vertices;
- shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset;
-
- /* The GSVS_RING_ITEMSIZE register takes 15 bits */
- assert(offset < (1 << 15));
-
- shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->gs_max_out_vertices;
-
- shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0];
- shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 1) ? num_components[1] : 0;
- shader->ctx_reg.gs.vgt_gs_vert_itemsize_2 = (max_stream >= 2) ? num_components[2] : 0;
- shader->ctx_reg.gs.vgt_gs_vert_itemsize_3 = (max_stream >= 3) ? num_components[3] : 0;
-
- shader->ctx_reg.gs.vgt_gs_instance_cnt = S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
- S_028B90_ENABLE(gs_num_invocations > 0);
-
- va = shader->bo->gpu_address;
- si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-
- if (sscreen->info.chip_class >= GFX9) {
- unsigned input_prim = sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
- unsigned es_type = shader->key.part.gs.es->type;
- unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
-
- if (es_type == PIPE_SHADER_VERTEX) {
- es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
- } else if (es_type == PIPE_SHADER_TESS_EVAL)
- es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2;
- else
- unreachable("invalid shader selector type");
-
- /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
- * VGPR[0:4] are always loaded.
- */
- if (sel->info.uses_invocationid)
- gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
- else if (sel->info.uses_primid)
- gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
- else if (input_prim >= PIPE_PRIM_TRIANGLES)
- gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
- else
- gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
-
- unsigned num_user_sgprs;
- if (es_type == PIPE_SHADER_VERTEX)
- num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
- else
- num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
-
- if (sscreen->info.chip_class >= GFX10) {
- si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
- si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
- } else {
- si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
- si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40));
- }
-
- uint32_t rsrc1 =
- S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
- S_00B228_DX10_CLAMP(1) |
- S_00B228_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
- S_00B228_WGP_MODE(sscreen->info.chip_class >= GFX10) |
- S_00B228_FLOAT_MODE(shader->config.float_mode) |
- S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
- uint32_t rsrc2 =
- S_00B22C_USER_SGPR(num_user_sgprs) |
- S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
- S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
- S_00B22C_LDS_SIZE(shader->config.lds_size) |
- S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
-
- if (sscreen->info.chip_class >= GFX10) {
- rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
- } else {
- rsrc1 |= S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8);
- rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
- }
-
- si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
- si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
-
- if (sscreen->info.chip_class >= GFX10) {
- si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
- S_00B204_CU_EN(0xffff) |
- S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
- }
-
- shader->ctx_reg.gs.vgt_gs_onchip_cntl =
- S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
- S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) |
- S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup);
- shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup =
- S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup);
- shader->ctx_reg.gs.vgt_esgs_ring_itemsize =
- shader->key.part.gs.es->esgs_itemsize / 4;
-
- if (es_type == PIPE_SHADER_TESS_EVAL)
- si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4);
-
- polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es,
- NULL, pm4);
- } else {
- si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
- si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40));
-
- si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
- S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
- S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
- S_00B228_DX10_CLAMP(1) |
- S_00B228_FLOAT_MODE(shader->config.float_mode));
- si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
- S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
- S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
- }
+ struct si_shader_selector *sel = shader->selector;
+ const ubyte *num_components = sel->info.num_stream_output_components;
+ unsigned gs_num_invocations = sel->gs_num_invocations;
+ struct si_pm4_state *pm4;
+ uint64_t va;
+ unsigned max_stream = sel->max_gs_stream;
+ unsigned offset;
+
+ pm4 = si_get_shader_pm4_state(shader);
+ if (!pm4)
+ return;
+
+ pm4->atom.emit = si_emit_shader_gs;
+
+ offset = num_components[0] * sel->gs_max_out_vertices;
+ shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset;
+
+ if (max_stream >= 1)
+ offset += num_components[1] * sel->gs_max_out_vertices;
+ shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset;
+
+ if (max_stream >= 2)
+ offset += num_components[2] * sel->gs_max_out_vertices;
+ shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset;
+
+ if (max_stream >= 3)
+ offset += num_components[3] * sel->gs_max_out_vertices;
+ shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset;
+
+ /* The GSVS_RING_ITEMSIZE register takes 15 bits */
+ assert(offset < (1 << 15));
+
+ shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->gs_max_out_vertices;
+
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0];
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 1) ? num_components[1] : 0;
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize_2 = (max_stream >= 2) ? num_components[2] : 0;
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize_3 = (max_stream >= 3) ? num_components[3] : 0;
+
+ shader->ctx_reg.gs.vgt_gs_instance_cnt =
+ S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0);
+
+ va = shader->bo->gpu_address;
+ si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+
+ if (sscreen->info.chip_class >= GFX9) {
+ unsigned input_prim = sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
+ unsigned es_type = shader->key.part.gs.es->type;
+ unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
+
+ if (es_type == PIPE_SHADER_VERTEX) {
+ es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
+ } else if (es_type == PIPE_SHADER_TESS_EVAL)
+ es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2;
+ else
+ unreachable("invalid shader selector type");
+
+ /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
+ * VGPR[0:4] are always loaded.
+ */
+ if (sel->info.uses_invocationid)
+ gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
+ else if (sel->info.uses_primid)
+ gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
+ else if (input_prim >= PIPE_PRIM_TRIANGLES)
+ gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
+ else
+ gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
+
+ unsigned num_user_sgprs;
+ if (es_type == PIPE_SHADER_VERTEX)
+ num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
+ else
+ num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+
+ if (sscreen->info.chip_class >= GFX10) {
+ si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
+ si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
+ } else {
+ si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
+ si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40));
+ }
+
+ uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) |
+ S_00B228_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
+ S_00B228_WGP_MODE(sscreen->info.chip_class >= GFX10) |
+ S_00B228_FLOAT_MODE(shader->config.float_mode) |
+ S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
+ uint32_t rsrc2 = S_00B22C_USER_SGPR(num_user_sgprs) |
+ S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
+ S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
+ S_00B22C_LDS_SIZE(shader->config.lds_size) |
+ S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+
+ if (sscreen->info.chip_class >= GFX10) {
+ rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
+ } else {
+ rsrc1 |= S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8);
+ rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
+ }
+
+ si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
+ si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
+
+ if (sscreen->info.chip_class >= GFX10) {
+ si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+ S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
+ }
+
+ shader->ctx_reg.gs.vgt_gs_onchip_cntl =
+ S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
+ S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) |
+ S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup);
+ shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup =
+ S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup);
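+ /* esgs_itemsize is in bytes; the ring itemsize register presumably takes
+ * dwords, hence the division by 4.
+ */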
+ shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.part.gs.es->esgs_itemsize / 4;
+
+ if (es_type == PIPE_SHADER_TESS_EVAL)
+ si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4);
+
+ polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4);
+ } else {
+ si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
+ si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40));
+
+ si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
+ S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
+ S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) |
+ S_00B228_DX10_CLAMP(1) | S_00B228_FLOAT_MODE(shader->config.float_mode));
+ si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
+ S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
+ S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+ }
}
static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value)
{
- enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC;
+ enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC;
- if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
- sctx->tracked_regs.reg_value[reg] != value) {
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
+ sctx->tracked_regs.reg_value[reg] != value) {
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
- if (sctx->family == CHIP_NAVI10 ||
- sctx->family == CHIP_NAVI12 ||
- sctx->family == CHIP_NAVI14) {
- /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
- }
+ if (sctx->family == CHIP_NAVI10 || sctx->family == CHIP_NAVI12 ||
+ sctx->family == CHIP_NAVI14) {
+ /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
+ }
- radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value);
+ radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value);
- sctx->tracked_regs.reg_saved |= 0x1ull << reg;
- sctx->tracked_regs.reg_value[reg] = value;
- }
+ sctx->tracked_regs.reg_saved |= 0x1ull << reg;
+ sctx->tracked_regs.reg_value[reg] = value;
+ }
}
/* Common tail code for NGG primitive shaders. */
-static void gfx10_emit_shader_ngg_tail(struct si_context *sctx,
- struct si_shader *shader,
- unsigned initial_cdw)
-{
- radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
- SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
- shader->ctx_reg.ngg.ge_max_output_per_subgroup);
- radeon_opt_set_context_reg(sctx, R_028B4C_GE_NGG_SUBGRP_CNTL,
- SI_TRACKED_GE_NGG_SUBGRP_CNTL,
- shader->ctx_reg.ngg.ge_ngg_subgrp_cntl);
- radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN,
- SI_TRACKED_VGT_PRIMITIVEID_EN,
- shader->ctx_reg.ngg.vgt_primitiveid_en);
- radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL,
- SI_TRACKED_VGT_GS_ONCHIP_CNTL,
- shader->ctx_reg.ngg.vgt_gs_onchip_cntl);
- radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT,
- SI_TRACKED_VGT_GS_INSTANCE_CNT,
- shader->ctx_reg.ngg.vgt_gs_instance_cnt);
- radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
- SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
- shader->ctx_reg.ngg.vgt_esgs_ring_itemsize);
- radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG,
- SI_TRACKED_SPI_VS_OUT_CONFIG,
- shader->ctx_reg.ngg.spi_vs_out_config);
- radeon_opt_set_context_reg2(sctx, R_028708_SPI_SHADER_IDX_FORMAT,
- SI_TRACKED_SPI_SHADER_IDX_FORMAT,
- shader->ctx_reg.ngg.spi_shader_idx_format,
- shader->ctx_reg.ngg.spi_shader_pos_format);
- radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL,
- SI_TRACKED_PA_CL_VTE_CNTL,
- shader->ctx_reg.ngg.pa_cl_vte_cntl);
- radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL,
- SI_TRACKED_PA_CL_NGG_CNTL,
- shader->ctx_reg.ngg.pa_cl_ngg_cntl);
-
- radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
- SI_TRACKED_PA_CL_VS_OUT_CNTL__VS,
- shader->pa_cl_vs_out_cntl,
- SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
-
- if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll = true;
-
- /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
- gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc);
+static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader,
+ unsigned initial_cdw)
+{
+ radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
+ SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
+ shader->ctx_reg.ngg.ge_max_output_per_subgroup);
+ radeon_opt_set_context_reg(sctx, R_028B4C_GE_NGG_SUBGRP_CNTL, SI_TRACKED_GE_NGG_SUBGRP_CNTL,
+ shader->ctx_reg.ngg.ge_ngg_subgrp_cntl);
+ radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN,
+ shader->ctx_reg.ngg.vgt_primitiveid_en);
+ radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
+ shader->ctx_reg.ngg.vgt_gs_onchip_cntl);
+ radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
+ shader->ctx_reg.ngg.vgt_gs_instance_cnt);
+ radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+ SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
+ shader->ctx_reg.ngg.vgt_esgs_ring_itemsize);
+ radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG,
+ shader->ctx_reg.ngg.spi_vs_out_config);
+ radeon_opt_set_context_reg2(
+ sctx, R_028708_SPI_SHADER_IDX_FORMAT, SI_TRACKED_SPI_SHADER_IDX_FORMAT,
+ shader->ctx_reg.ngg.spi_shader_idx_format, shader->ctx_reg.ngg.spi_shader_pos_format);
+ radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL,
+ shader->ctx_reg.ngg.pa_cl_vte_cntl);
+ radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL,
+ shader->ctx_reg.ngg.pa_cl_ngg_cntl);
+
+ radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
+ SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl,
+ SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
+
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll = true;
+
+ /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
+ gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc);
}
static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.gs->shader;
- unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+ struct si_shader *shader = sctx->queued.named.gs->shader;
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
- if (!shader)
- return;
+ if (!shader)
+ return;
- gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
+ gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
}
static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.gs->shader;
- unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+ struct si_shader *shader = sctx->queued.named.gs->shader;
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
- if (!shader)
- return;
+ if (!shader)
+ return;
- radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
- SI_TRACKED_VGT_TF_PARAM,
- shader->vgt_tf_param);
+ radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+ shader->vgt_tf_param);
- gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
+ gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
}
static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.gs->shader;
- unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+ struct si_shader *shader = sctx->queued.named.gs->shader;
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
- if (!shader)
- return;
+ if (!shader)
+ return;
- radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT,
- SI_TRACKED_VGT_GS_MAX_VERT_OUT,
- shader->ctx_reg.ngg.vgt_gs_max_vert_out);
+ radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+ shader->ctx_reg.ngg.vgt_gs_max_vert_out);
- gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
+ gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
}
static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.gs->shader;
- unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+ struct si_shader *shader = sctx->queued.named.gs->shader;
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
- if (!shader)
- return;
+ if (!shader)
+ return;
- radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT,
- SI_TRACKED_VGT_GS_MAX_VERT_OUT,
- shader->ctx_reg.ngg.vgt_gs_max_vert_out);
- radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
- SI_TRACKED_VGT_TF_PARAM,
- shader->vgt_tf_param);
+ radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+ shader->ctx_reg.ngg.vgt_gs_max_vert_out);
+ radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+ shader->vgt_tf_param);
- gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
+ gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
}
unsigned si_get_input_prim(const struct si_shader_selector *gs)
{
- if (gs->type == PIPE_SHADER_GEOMETRY)
- return gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
-
- if (gs->type == PIPE_SHADER_TESS_EVAL) {
- if (gs->info.properties[TGSI_PROPERTY_TES_POINT_MODE])
- return PIPE_PRIM_POINTS;
- if (gs->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
- return PIPE_PRIM_LINES;
- return PIPE_PRIM_TRIANGLES;
- }
-
- /* TODO: Set this correctly if the primitive type is set in the shader key. */
- return PIPE_PRIM_TRIANGLES; /* worst case for all callers */
+ if (gs->type == PIPE_SHADER_GEOMETRY)
+ return gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM];
+
+ if (gs->type == PIPE_SHADER_TESS_EVAL) {
+ if (gs->info.properties[TGSI_PROPERTY_TES_POINT_MODE])
+ return PIPE_PRIM_POINTS;
+ if (gs->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
+ return PIPE_PRIM_LINES;
+ return PIPE_PRIM_TRIANGLES;
+ }
+
+ /* TODO: Set this correctly if the primitive type is set in the shader key. */
+ return PIPE_PRIM_TRIANGLES; /* worst case for all callers */
}
static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, bool ngg)
{
- bool misc_vec_ena =
- sel->info.writes_psize || (sel->info.writes_edgeflag && !ngg) ||
- sel->info.writes_layer || sel->info.writes_viewport_index;
- return S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize) |
- S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) |
- S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) |
- S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) |
- S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
- S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena);
+ bool misc_vec_ena = sel->info.writes_psize || (sel->info.writes_edgeflag && !ngg) ||
+ sel->info.writes_layer || sel->info.writes_viewport_index;
+ return S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize) |
+ S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) |
+ S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) |
+ S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) |
+ S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
+ S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena);
}
/**
*/
static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader)
{
- const struct si_shader_selector *gs_sel = shader->selector;
- const struct si_shader_info *gs_info = &gs_sel->info;
- enum pipe_shader_type gs_type = shader->selector->type;
- const struct si_shader_selector *es_sel =
- shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector;
- const struct si_shader_info *es_info = &es_sel->info;
- enum pipe_shader_type es_type = es_sel->type;
- unsigned num_user_sgprs;
- unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
- uint64_t va;
- unsigned window_space =
- gs_info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
- bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid;
- unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
- unsigned input_prim = si_get_input_prim(gs_sel);
- bool break_wave_at_eoi = false;
- struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader);
- if (!pm4)
- return;
-
- if (es_type == PIPE_SHADER_TESS_EVAL) {
- pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_tess_gs
- : gfx10_emit_shader_ngg_tess_nogs;
- } else {
- pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_notess_gs
- : gfx10_emit_shader_ngg_notess_nogs;
- }
-
- va = shader->bo->gpu_address;
- si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-
- if (es_type == PIPE_SHADER_VERTEX) {
- es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
-
- if (es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
- num_user_sgprs = SI_SGPR_VS_BLIT_DATA +
- es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
- } else {
- num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
- }
- } else {
- assert(es_type == PIPE_SHADER_TESS_EVAL);
- es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2;
- num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
-
- if (es_enable_prim_id || gs_info->uses_primid)
- break_wave_at_eoi = true;
- }
-
- /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
- * VGPR[0:4] are always loaded.
- *
- * Vertex shaders always need to load VGPR3, because they need to
- * pass edge flags for decomposed primitives (such as quads) to the PA
- * for the GL_LINE polygon mode to skip rendering lines on inner edges.
- */
- if (gs_info->uses_invocationid ||
- (gs_type == PIPE_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader)))
- gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */
- else if ((gs_type == PIPE_SHADER_GEOMETRY && gs_info->uses_primid) ||
- (gs_type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
- gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
- else if (input_prim >= PIPE_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader))
- gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
- else
- gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
-
- si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
- si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
- si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
- S_00B228_VGPRS((shader->config.num_vgprs - 1) /
- (sscreen->ge_wave_size == 32 ? 8 : 4)) |
- S_00B228_FLOAT_MODE(shader->config.float_mode) |
- S_00B228_DX10_CLAMP(1) |
- S_00B228_MEM_ORDERED(1) |
- S_00B228_WGP_MODE(1) |
- S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
- si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
- S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) |
- S_00B22C_USER_SGPR(num_user_sgprs) |
- S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
- S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) |
- S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
- S_00B22C_LDS_SIZE(shader->config.lds_size));
-
- /* Determine LATE_ALLOC_GS. */
- unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
- unsigned late_alloc_wave64; /* The limit is per SH. */
-
- /* For Wave32, the hw will launch twice the number of late
- * alloc waves, so 1 == 2x wave32.
- *
- * Don't use late alloc for NGG on Navi14 due to a hw bug.
- */
- if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc)
- late_alloc_wave64 = 0;
- else if (num_cu_per_sh <= 6)
- late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
- else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
- late_alloc_wave64 = (num_cu_per_sh - 2) * 6;
- else
- late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
-
- /* Limit LATE_ALLOC_GS for prevent a hang (hw bug). */
- if (sscreen->info.family == CHIP_NAVI10 ||
- sscreen->info.family == CHIP_NAVI12 ||
- sscreen->info.family == CHIP_NAVI14)
- late_alloc_wave64 = MIN2(late_alloc_wave64, 64);
-
- si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
- S_00B204_CU_EN(0xffff) |
- S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
-
- nparams = MAX2(shader->info.nr_param_exports, 1);
- shader->ctx_reg.ngg.spi_vs_out_config =
- S_0286C4_VS_EXPORT_COUNT(nparams - 1) |
- S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0);
-
- shader->ctx_reg.ngg.spi_shader_idx_format =
- S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP);
- shader->ctx_reg.ngg.spi_shader_pos_format =
- S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
- S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ?
- V_02870C_SPI_SHADER_4COMP :
- V_02870C_SPI_SHADER_NONE) |
- S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ?
- V_02870C_SPI_SHADER_4COMP :
- V_02870C_SPI_SHADER_NONE) |
- S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ?
- V_02870C_SPI_SHADER_4COMP :
- V_02870C_SPI_SHADER_NONE);
-
- shader->ctx_reg.ngg.vgt_primitiveid_en =
- S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
- S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.mono.u.vs_export_prim_id ||
- gs_sel->info.writes_primid);
-
- if (gs_type == PIPE_SHADER_GEOMETRY) {
- shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4;
- shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->gs_max_out_vertices;
- } else {
- shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1;
- }
-
- if (es_type == PIPE_SHADER_TESS_EVAL)
- si_set_tesseval_regs(sscreen, es_sel, pm4);
-
- shader->ctx_reg.ngg.vgt_gs_onchip_cntl =
- S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
- S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) |
- S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations);
- shader->ctx_reg.ngg.ge_max_output_per_subgroup =
- S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts);
- shader->ctx_reg.ngg.ge_ngg_subgrp_cntl =
- S_028B4C_PRIM_AMP_FACTOR(shader->ngg.prim_amp_factor) |
- S_028B4C_THDS_PER_SUBGRP(0); /* for fast launch */
- shader->ctx_reg.ngg.vgt_gs_instance_cnt =
- S_028B90_CNT(gs_num_invocations) |
- S_028B90_ENABLE(gs_num_invocations > 1) |
- S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(
- shader->ngg.max_vert_out_per_gs_instance);
-
- /* Always output hw-generated edge flags and pass them via the prim
- * export to prevent drawing lines on internal edges of decomposed
- * primitives (such as quads) with polygon mode = lines. Only VS needs
- * this.
- */
- shader->ctx_reg.ngg.pa_cl_ngg_cntl =
- S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX);
- shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(gs_sel, true);
-
- /* Oversubscribe PC. This improves performance when there are too many varyings. */
- float oversub_pc_factor = 0.25;
-
- if (shader->key.opt.ngg_culling) {
- /* Be more aggressive with NGG culling. */
- if (shader->info.nr_param_exports > 4)
- oversub_pc_factor = 1;
- else if (shader->info.nr_param_exports > 2)
- oversub_pc_factor = 0.75;
- else
- oversub_pc_factor = 0.5;
- }
-
- unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor;
- shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
- S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
-
- if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
- shader->ge_cntl =
- S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
- S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3);
- } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
- shader->ge_cntl =
- S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
- S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2);
- } else {
- shader->ge_cntl =
- S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
- S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
- S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
-
- /* Bug workaround for a possible hang with non-tessellation cases.
- * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
- *
- * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
- */
- if ((sscreen->info.family == CHIP_NAVI10 ||
- sscreen->info.family == CHIP_NAVI12 ||
- sscreen->info.family == CHIP_NAVI14) &&
- (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
- shader->ngg.hw_max_esverts != 256) {
- shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
-
- if (shader->ngg.hw_max_esverts > 5) {
- shader->ge_cntl |=
- S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
- }
- }
- }
-
- if (window_space) {
- shader->ctx_reg.ngg.pa_cl_vte_cntl =
- S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1);
- } else {
- shader->ctx_reg.ngg.pa_cl_vte_cntl =
- S_028818_VTX_W0_FMT(1) |
- S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
- S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
- S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
- }
+ const struct si_shader_selector *gs_sel = shader->selector;
+ const struct si_shader_info *gs_info = &gs_sel->info;
+ enum pipe_shader_type gs_type = shader->selector->type;
+ const struct si_shader_selector *es_sel =
+ shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector;
+ const struct si_shader_info *es_info = &es_sel->info;
+ enum pipe_shader_type es_type = es_sel->type;
+ unsigned num_user_sgprs;
+ unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
+ uint64_t va;
+ unsigned window_space = gs_info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+ bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid;
+ unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
+ unsigned input_prim = si_get_input_prim(gs_sel);
+ bool break_wave_at_eoi = false;
+ struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader);
+ if (!pm4)
+ return;
+
+ if (es_type == PIPE_SHADER_TESS_EVAL) {
+ pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_tess_gs
+ : gfx10_emit_shader_ngg_tess_nogs;
+ } else {
+ pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_notess_gs
+ : gfx10_emit_shader_ngg_notess_nogs;
+ }
+
+ va = shader->bo->gpu_address;
+ si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+
+ if (es_type == PIPE_SHADER_VERTEX) {
+ es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
+
+ if (es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
+ num_user_sgprs =
+ SI_SGPR_VS_BLIT_DATA + es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
+ } else {
+ num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR);
+ }
+ } else {
+ assert(es_type == PIPE_SHADER_TESS_EVAL);
+ es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2;
+ num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+
+ if (es_enable_prim_id || gs_info->uses_primid)
+ break_wave_at_eoi = true;
+ }
+
+ /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
+ * VGPR[0:4] are always loaded.
+ *
+ * Vertex shaders always need to load VGPR3, because they need to
+ * pass edge flags for decomposed primitives (such as quads) to the PA
+ * for the GL_LINE polygon mode to skip rendering lines on inner edges.
+ */
+ if (gs_info->uses_invocationid ||
+ (gs_type == PIPE_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader)))
+ gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */
+ else if ((gs_type == PIPE_SHADER_GEOMETRY && gs_info->uses_primid) ||
+ (gs_type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
+ gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
+ else if (input_prim >= PIPE_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader))
+ gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
+ else
+ gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
+
+ si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
+ si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
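+ /* The VGPRS field counts allocation granules, presumably 8 VGPRs per
+ * granule in wave32 mode and 4 per granule in wave64 mode.
+ */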
+ si_pm4_set_reg(
+ pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
+ S_00B228_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
+ S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) |
+ S_00B228_MEM_ORDERED(1) | S_00B228_WGP_MODE(1) |
+ S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
+ si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
+ S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) |
+ S_00B22C_USER_SGPR(num_user_sgprs) |
+ S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
+ S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) |
+ S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) |
+ S_00B22C_LDS_SIZE(shader->config.lds_size));
+
+ /* Determine LATE_ALLOC_GS. */
+ unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
+ unsigned late_alloc_wave64; /* The limit is per SH. */
+
+ /* For Wave32, the hw will launch twice the number of late
+ * alloc waves, so 1 == 2x wave32.
+ *
+ * Don't use late alloc for NGG on Navi14 due to a hw bug.
+ */
+ if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc)
+ late_alloc_wave64 = 0;
+ else if (num_cu_per_sh <= 6)
+ late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
+ else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
+ late_alloc_wave64 = (num_cu_per_sh - 2) * 6;
+ else
+ late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
+
+ /* Limit LATE_ALLOC_GS to prevent a hang (hw bug). */
+ if (sscreen->info.family == CHIP_NAVI10 || sscreen->info.family == CHIP_NAVI12 ||
+ sscreen->info.family == CHIP_NAVI14)
+ late_alloc_wave64 = MIN2(late_alloc_wave64, 64);
+
+ si_pm4_set_reg(
+ pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+ S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
+
+ nparams = MAX2(shader->info.nr_param_exports, 1);
+ shader->ctx_reg.ngg.spi_vs_out_config =
+ S_0286C4_VS_EXPORT_COUNT(nparams - 1) |
+ S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0);
+
+ shader->ctx_reg.ngg.spi_shader_idx_format =
+ S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP);
+ shader->ctx_reg.ngg.spi_shader_pos_format =
+ S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
+ S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
+ : V_02870C_SPI_SHADER_NONE) |
+ S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
+ : V_02870C_SPI_SHADER_NONE) |
+ S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
+ : V_02870C_SPI_SHADER_NONE);
+
+ shader->ctx_reg.ngg.vgt_primitiveid_en =
+ S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
+ S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.mono.u.vs_export_prim_id ||
+ gs_sel->info.writes_primid);
+
+ if (gs_type == PIPE_SHADER_GEOMETRY) {
+ shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4;
+ shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->gs_max_out_vertices;
+ } else {
+ shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1;
+ }
+
+ if (es_type == PIPE_SHADER_TESS_EVAL)
+ si_set_tesseval_regs(sscreen, es_sel, pm4);
+
+ shader->ctx_reg.ngg.vgt_gs_onchip_cntl =
+ S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
+ S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) |
+ S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations);
+ shader->ctx_reg.ngg.ge_max_output_per_subgroup =
+ S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts);
+ shader->ctx_reg.ngg.ge_ngg_subgrp_cntl = S_028B4C_PRIM_AMP_FACTOR(shader->ngg.prim_amp_factor) |
+ S_028B4C_THDS_PER_SUBGRP(0); /* for fast launch */
+ shader->ctx_reg.ngg.vgt_gs_instance_cnt =
+ S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) |
+ S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.max_vert_out_per_gs_instance);
+
+ /* Always output hw-generated edge flags and pass them via the prim
+ * export to prevent drawing lines on internal edges of decomposed
+ * primitives (such as quads) with polygon mode = lines. Only VS needs
+ * this.
+ */
+ shader->ctx_reg.ngg.pa_cl_ngg_cntl =
+ S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX);
+ shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(gs_sel, true);
+
+ /* Oversubscribe PC. This improves performance when there are too many varyings. */
+ float oversub_pc_factor = 0.25;
+
+ if (shader->key.opt.ngg_culling) {
+ /* Be more aggressive with NGG culling. */
+ if (shader->info.nr_param_exports > 4)
+ oversub_pc_factor = 1;
+ else if (shader->info.nr_param_exports > 2)
+ oversub_pc_factor = 0.75;
+ else
+ oversub_pc_factor = 0.5;
+ }
+
+ unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor;
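+ /* The NUM_PC_LINES field presumably encodes the line count minus one. */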
+ shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
+ S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
+
+ if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
+ shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+ S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3);
+ } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
+ shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+ S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2);
+ } else {
+ shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+ S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
+ S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
+
+ /* Bug workaround for a possible hang with non-tessellation cases.
+ * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
+ *
+ * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
+ */
+ if ((sscreen->info.family == CHIP_NAVI10 || sscreen->info.family == CHIP_NAVI12 ||
+ sscreen->info.family == CHIP_NAVI14) &&
+ (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */
+ shader->ngg.hw_max_esverts != 256) {
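+ /* C_03096C_VERT_GRP_SIZE is the inverse field mask, so this clears
+ * VERT_GRP_SIZE; it is re-set below only when hw_max_esverts > 5.
+ */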
+ shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
+
+ if (shader->ngg.hw_max_esverts > 5) {
+ shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
+ }
+ }
+ }
+
+ if (window_space) {
+ shader->ctx_reg.ngg.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1);
+ } else {
+ shader->ctx_reg.ngg.pa_cl_vte_cntl =
+ S_028818_VTX_W0_FMT(1) | S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
+ S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
+ S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
+ }
}
static void si_emit_shader_vs(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.vs->shader;
- unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-
- if (!shader)
- return;
-
- radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE,
- SI_TRACKED_VGT_GS_MODE,
- shader->ctx_reg.vs.vgt_gs_mode);
- radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN,
- SI_TRACKED_VGT_PRIMITIVEID_EN,
- shader->ctx_reg.vs.vgt_primitiveid_en);
-
- if (sctx->chip_class <= GFX8) {
- radeon_opt_set_context_reg(sctx, R_028AB4_VGT_REUSE_OFF,
- SI_TRACKED_VGT_REUSE_OFF,
- shader->ctx_reg.vs.vgt_reuse_off);
- }
-
- radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG,
- SI_TRACKED_SPI_VS_OUT_CONFIG,
- shader->ctx_reg.vs.spi_vs_out_config);
-
- radeon_opt_set_context_reg(sctx, R_02870C_SPI_SHADER_POS_FORMAT,
- SI_TRACKED_SPI_SHADER_POS_FORMAT,
- shader->ctx_reg.vs.spi_shader_pos_format);
-
- radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL,
- SI_TRACKED_PA_CL_VTE_CNTL,
- shader->ctx_reg.vs.pa_cl_vte_cntl);
-
- if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
- radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
- SI_TRACKED_VGT_TF_PARAM,
- shader->vgt_tf_param);
-
- if (shader->vgt_vertex_reuse_block_cntl)
- radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
- SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
- shader->vgt_vertex_reuse_block_cntl);
-
- /* Required programming for tessellation. (legacy pipeline only) */
- if (sctx->chip_class == GFX10 &&
- shader->selector->type == PIPE_SHADER_TESS_EVAL) {
- radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL,
- SI_TRACKED_VGT_GS_ONCHIP_CNTL,
- S_028A44_ES_VERTS_PER_SUBGRP(250) |
- S_028A44_GS_PRIMS_PER_SUBGRP(126) |
- S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
- }
-
- if (sctx->chip_class >= GFX10) {
- radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
- SI_TRACKED_PA_CL_VS_OUT_CNTL__VS,
- shader->pa_cl_vs_out_cntl,
- SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
- }
-
- if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll = true;
-
- /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
- if (sctx->chip_class >= GFX10)
- gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc);
+ struct si_shader *shader = sctx->queued.named.vs->shader;
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
+ if (!shader)
+ return;
+
+ radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE, SI_TRACKED_VGT_GS_MODE,
+ shader->ctx_reg.vs.vgt_gs_mode);
+ radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN,
+ shader->ctx_reg.vs.vgt_primitiveid_en);
+
+ if (sctx->chip_class <= GFX8) {
+ radeon_opt_set_context_reg(sctx, R_028AB4_VGT_REUSE_OFF, SI_TRACKED_VGT_REUSE_OFF,
+ shader->ctx_reg.vs.vgt_reuse_off);
+ }
+
+ radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG,
+ shader->ctx_reg.vs.spi_vs_out_config);
+
+ radeon_opt_set_context_reg(sctx, R_02870C_SPI_SHADER_POS_FORMAT,
+ SI_TRACKED_SPI_SHADER_POS_FORMAT,
+ shader->ctx_reg.vs.spi_shader_pos_format);
+
+ radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL,
+ shader->ctx_reg.vs.pa_cl_vte_cntl);
+
+ if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
+ radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+ shader->vgt_tf_param);
+
+ if (shader->vgt_vertex_reuse_block_cntl)
+ radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
+ SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
+ shader->vgt_vertex_reuse_block_cntl);
+
+ /* Required programming for tessellation (legacy pipeline only). */
+ if (sctx->chip_class == GFX10 && shader->selector->type == PIPE_SHADER_TESS_EVAL) {
+ radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
+ S_028A44_ES_VERTS_PER_SUBGRP(250) |
+ S_028A44_GS_PRIMS_PER_SUBGRP(126) |
+ S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
+ }
+
+ if (sctx->chip_class >= GFX10) {
+ radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
+ SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl,
+ SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
+ }
+
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll = true;
+
+ /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
+ if (sctx->chip_class >= GFX10)
+ gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc);
}
/**
static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
struct si_shader_selector *gs)
{
- const struct si_shader_info *info = &shader->selector->info;
- struct si_pm4_state *pm4;
- unsigned num_user_sgprs, vgpr_comp_cnt;
- uint64_t va;
- unsigned nparams, oc_lds_en;
- unsigned window_space =
- info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
- bool enable_prim_id = shader->key.mono.u.vs_export_prim_id || info->uses_primid;
-
- pm4 = si_get_shader_pm4_state(shader);
- if (!pm4)
- return;
-
- pm4->atom.emit = si_emit_shader_vs;
-
- /* We always write VGT_GS_MODE in the VS state, because every switch
- * between different shader pipelines involving a different GS or no
- * GS at all involves a switch of the VS (different GS use different
- * copy shaders). On the other hand, when the API switches from a GS to
- * no GS and then back to the same GS used originally, the GS state is
- * not sent again.
- */
- if (!gs) {
- unsigned mode = V_028A40_GS_OFF;
-
- /* PrimID needs GS scenario A. */
- if (enable_prim_id)
- mode = V_028A40_GS_SCENARIO_A;
-
- shader->ctx_reg.vs.vgt_gs_mode = S_028A40_MODE(mode);
- shader->ctx_reg.vs.vgt_primitiveid_en = enable_prim_id;
- } else {
- shader->ctx_reg.vs.vgt_gs_mode = ac_vgt_gs_mode(gs->gs_max_out_vertices,
- sscreen->info.chip_class);
- shader->ctx_reg.vs.vgt_primitiveid_en = 0;
- }
-
- if (sscreen->info.chip_class <= GFX8) {
- /* Reuse needs to be set off if we write oViewport. */
- shader->ctx_reg.vs.vgt_reuse_off =
- S_028AB4_REUSE_OFF(info->writes_viewport_index);
- }
-
- va = shader->bo->gpu_address;
- si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
-
- if (gs) {
- vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */
- num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
- } else if (shader->selector->type == PIPE_SHADER_VERTEX) {
- vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, enable_prim_id);
-
- if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
- num_user_sgprs = SI_SGPR_VS_BLIT_DATA +
- info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
- } else {
- num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
- }
- } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
- vgpr_comp_cnt = enable_prim_id ? 3 : 2;
- num_user_sgprs = SI_TES_NUM_USER_SGPR;
- } else
- unreachable("invalid shader selector type");
-
- /* VS is required to export at least one param. */
- nparams = MAX2(shader->info.nr_param_exports, 1);
- shader->ctx_reg.vs.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1);
-
- if (sscreen->info.chip_class >= GFX10) {
- shader->ctx_reg.vs.spi_vs_out_config |=
- S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0);
- }
-
- shader->ctx_reg.vs.spi_shader_pos_format =
- S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
- S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ?
- V_02870C_SPI_SHADER_4COMP :
- V_02870C_SPI_SHADER_NONE) |
- S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ?
- V_02870C_SPI_SHADER_4COMP :
- V_02870C_SPI_SHADER_NONE) |
- S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ?
- V_02870C_SPI_SHADER_4COMP :
- V_02870C_SPI_SHADER_NONE);
- shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
- S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1);
- shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, false);
-
- oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0;
-
- si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
- si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, S_00B124_MEM_BASE(va >> 40));
-
- uint32_t rsrc1 = S_00B128_VGPRS((shader->config.num_vgprs - 1) /
- (sscreen->ge_wave_size == 32 ? 8 : 4)) |
- S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) |
- S_00B128_DX10_CLAMP(1) |
- S_00B128_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
- S_00B128_FLOAT_MODE(shader->config.float_mode);
- uint32_t rsrc2 = S_00B12C_USER_SGPR(num_user_sgprs) |
- S_00B12C_OC_LDS_EN(oc_lds_en) |
- S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
-
- if (sscreen->info.chip_class >= GFX10)
- rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
- else if (sscreen->info.chip_class == GFX9)
- rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
-
- if (sscreen->info.chip_class <= GFX9)
- rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8);
-
- if (!sscreen->use_ngg_streamout) {
- rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) |
- S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
- S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
- S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
- S_00B12C_SO_EN(!!shader->selector->so.num_outputs);
- }
-
- si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1);
- si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2);
-
- if (window_space)
- shader->ctx_reg.vs.pa_cl_vte_cntl =
- S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1);
- else
- shader->ctx_reg.vs.pa_cl_vte_cntl =
- S_028818_VTX_W0_FMT(1) |
- S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
- S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
- S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
-
- if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
- si_set_tesseval_regs(sscreen, shader->selector, pm4);
-
- polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
+ const struct si_shader_info *info = &shader->selector->info;
+ struct si_pm4_state *pm4;
+ unsigned num_user_sgprs, vgpr_comp_cnt;
+ uint64_t va;
+ unsigned nparams, oc_lds_en;
+ unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+ bool enable_prim_id = shader->key.mono.u.vs_export_prim_id || info->uses_primid;
+
+ pm4 = si_get_shader_pm4_state(shader);
+ if (!pm4)
+ return;
+
+ pm4->atom.emit = si_emit_shader_vs;
+
+ /* We always write VGT_GS_MODE in the VS state, because every switch
+ * between different shader pipelines involving a different GS or no
+ * GS at all involves a switch of the VS (different GS use different
+ * copy shaders). On the other hand, when the API switches from a GS to
+ * no GS and then back to the same GS used originally, the GS state is
+ * not sent again.
+ */
+ if (!gs) {
+ unsigned mode = V_028A40_GS_OFF;
+
+ /* PrimID needs GS scenario A. */
+ if (enable_prim_id)
+ mode = V_028A40_GS_SCENARIO_A;
+
+ shader->ctx_reg.vs.vgt_gs_mode = S_028A40_MODE(mode);
+ shader->ctx_reg.vs.vgt_primitiveid_en = enable_prim_id;
+ } else {
+ shader->ctx_reg.vs.vgt_gs_mode =
+ ac_vgt_gs_mode(gs->gs_max_out_vertices, sscreen->info.chip_class);
+ shader->ctx_reg.vs.vgt_primitiveid_en = 0;
+ }
+
+ if (sscreen->info.chip_class <= GFX8) {
+ /* Reuse needs to be set off if we write oViewport. */
+ shader->ctx_reg.vs.vgt_reuse_off = S_028AB4_REUSE_OFF(info->writes_viewport_index);
+ }
+
+ va = shader->bo->gpu_address;
+ si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+
+ if (gs) {
+ vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */
+ num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
+ } else if (shader->selector->type == PIPE_SHADER_VERTEX) {
+ vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, enable_prim_id);
+
+ if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
+ num_user_sgprs = SI_SGPR_VS_BLIT_DATA + info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
+ } else {
+ num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
+ }
+ } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
+ vgpr_comp_cnt = enable_prim_id ? 3 : 2;
+ num_user_sgprs = SI_TES_NUM_USER_SGPR;
+ } else
+ unreachable("invalid shader selector type");
+
+ /* VS is required to export at least one param. */
+ nparams = MAX2(shader->info.nr_param_exports, 1);
+ shader->ctx_reg.vs.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1);
+
+ if (sscreen->info.chip_class >= GFX10) {
+ shader->ctx_reg.vs.spi_vs_out_config |=
+ S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0);
+ }
+
+ shader->ctx_reg.vs.spi_shader_pos_format =
+ S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
+ S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
+ : V_02870C_SPI_SHADER_NONE) |
+ S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
+ : V_02870C_SPI_SHADER_NONE) |
+ S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
+ : V_02870C_SPI_SHADER_NONE);
+ shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
+ S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1);
+ shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, false);
+
+ oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0;
+
+ si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
+ si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, S_00B124_MEM_BASE(va >> 40));
+
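+ /* The VGPRS field is in units of 8 VGPRs for Wave32 and 4 for Wave64;
+ * SGPRS (GFX9 and older) is in units of 8 SGPRs; both fields store the
+ * unit count minus one, hence the (count - 1) / divisor encoding below.
+ */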
+ uint32_t rsrc1 =
+ S_00B128_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
+ S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B128_DX10_CLAMP(1) |
+ S_00B128_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
+ S_00B128_FLOAT_MODE(shader->config.float_mode);
+ uint32_t rsrc2 = S_00B12C_USER_SGPR(num_user_sgprs) | S_00B12C_OC_LDS_EN(oc_lds_en) |
+ S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+
+ if (sscreen->info.chip_class >= GFX10)
+ rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
+ else if (sscreen->info.chip_class == GFX9)
+ rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
+
+ if (sscreen->info.chip_class <= GFX9)
+ rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8);
+
+ if (!sscreen->use_ngg_streamout) {
+ rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) |
+ S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
+ S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
+ S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
+ S_00B12C_SO_EN(!!shader->selector->so.num_outputs);
+ }
+
+ si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1);
+ si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2);
+
+ if (window_space)
+ shader->ctx_reg.vs.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1);
+ else
+ shader->ctx_reg.vs.pa_cl_vte_cntl =
+ S_028818_VTX_W0_FMT(1) | S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
+ S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
+ S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
+
+ if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
+ si_set_tesseval_regs(sscreen, shader->selector, pm4);
+
+ polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
}
static unsigned si_get_ps_num_interp(struct si_shader *ps)
{
- struct si_shader_info *info = &ps->selector->info;
- unsigned num_colors = !!(info->colors_read & 0x0f) +
- !!(info->colors_read & 0xf0);
- unsigned num_interp = ps->selector->info.num_inputs +
- (ps->key.part.ps.prolog.color_two_side ? num_colors : 0);
-
- assert(num_interp <= 32);
- return MIN2(num_interp, 32);
+ struct si_shader_info *info = &ps->selector->info;
+ unsigned num_colors = !!(info->colors_read & 0x0f) + !!(info->colors_read & 0xf0);
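+ /* Two-sided lighting makes the PS prolog add one back-face color input
+ * per color read by the PS, so e.g. a shader reading COLOR0 and COLOR1
+ * needs two extra interpolants.
+ */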
+ unsigned num_interp =
+ ps->selector->info.num_inputs + (ps->key.part.ps.prolog.color_two_side ? num_colors : 0);
+
+ assert(num_interp <= 32);
+ return MIN2(num_interp, 32);
}
static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
{
- unsigned value = shader->key.part.ps.epilog.spi_shader_col_format;
- unsigned i, num_targets = (util_last_bit(value) + 3) / 4;
+ unsigned value = shader->key.part.ps.epilog.spi_shader_col_format;
+ unsigned i, num_targets = (util_last_bit(value) + 3) / 4;
- /* If the i-th target format is set, all previous target formats must
- * be non-zero to avoid hangs.
- */
- for (i = 0; i < num_targets; i++)
- if (!(value & (0xf << (i * 4))))
- value |= V_028714_SPI_SHADER_32_R << (i * 4);
+ /* If the i-th target format is set, all previous target formats must
+ * be non-zero to avoid hangs.
+ */
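+ /* E.g. if only MRT2 has a non-zero format, num_targets is 3 and the
+ * loop below fills targets 0 and 1 with SPI_SHADER_32_R.
+ */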
+ for (i = 0; i < num_targets; i++)
+ if (!(value & (0xf << (i * 4))))
+ value |= V_028714_SPI_SHADER_32_R << (i * 4);
- return value;
+ return value;
}
static void si_emit_shader_ps(struct si_context *sctx)
{
- struct si_shader *shader = sctx->queued.named.ps->shader;
- unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-
- if (!shader)
- return;
-
- /* R_0286CC_SPI_PS_INPUT_ENA, R_0286D0_SPI_PS_INPUT_ADDR*/
- radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA,
- SI_TRACKED_SPI_PS_INPUT_ENA,
- shader->ctx_reg.ps.spi_ps_input_ena,
- shader->ctx_reg.ps.spi_ps_input_addr);
-
- radeon_opt_set_context_reg(sctx, R_0286E0_SPI_BARYC_CNTL,
- SI_TRACKED_SPI_BARYC_CNTL,
- shader->ctx_reg.ps.spi_baryc_cntl);
- radeon_opt_set_context_reg(sctx, R_0286D8_SPI_PS_IN_CONTROL,
- SI_TRACKED_SPI_PS_IN_CONTROL,
- shader->ctx_reg.ps.spi_ps_in_control);
-
- /* R_028710_SPI_SHADER_Z_FORMAT, R_028714_SPI_SHADER_COL_FORMAT */
- radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT,
- SI_TRACKED_SPI_SHADER_Z_FORMAT,
- shader->ctx_reg.ps.spi_shader_z_format,
- shader->ctx_reg.ps.spi_shader_col_format);
-
- radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK,
- SI_TRACKED_CB_SHADER_MASK,
- shader->ctx_reg.ps.cb_shader_mask);
-
- if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll = true;
-}
+ struct si_shader *shader = sctx->queued.named.ps->shader;
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
-{
- struct si_shader_info *info = &shader->selector->info;
- struct si_pm4_state *pm4;
- unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask;
- unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
- uint64_t va;
- unsigned input_ena = shader->config.spi_ps_input_ena;
-
- /* we need to enable at least one of them, otherwise we hang the GPU */
- assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
- G_0286CC_PERSP_CENTER_ENA(input_ena) ||
- G_0286CC_PERSP_CENTROID_ENA(input_ena) ||
- G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) ||
- G_0286CC_LINEAR_SAMPLE_ENA(input_ena) ||
- G_0286CC_LINEAR_CENTER_ENA(input_ena) ||
- G_0286CC_LINEAR_CENTROID_ENA(input_ena) ||
- G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena));
- /* POS_W_FLOAT_ENA requires one of the perspective weights. */
- assert(!G_0286CC_POS_W_FLOAT_ENA(input_ena) ||
- G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
- G_0286CC_PERSP_CENTER_ENA(input_ena) ||
- G_0286CC_PERSP_CENTROID_ENA(input_ena) ||
- G_0286CC_PERSP_PULL_MODEL_ENA(input_ena));
-
- /* Validate interpolation optimization flags (read as implications). */
- assert(!shader->key.part.ps.prolog.bc_optimize_for_persp ||
- (G_0286CC_PERSP_CENTER_ENA(input_ena) &&
- G_0286CC_PERSP_CENTROID_ENA(input_ena)));
- assert(!shader->key.part.ps.prolog.bc_optimize_for_linear ||
- (G_0286CC_LINEAR_CENTER_ENA(input_ena) &&
- G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
- assert(!shader->key.part.ps.prolog.force_persp_center_interp ||
- (!G_0286CC_PERSP_SAMPLE_ENA(input_ena) &&
- !G_0286CC_PERSP_CENTROID_ENA(input_ena)));
- assert(!shader->key.part.ps.prolog.force_linear_center_interp ||
- (!G_0286CC_LINEAR_SAMPLE_ENA(input_ena) &&
- !G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
- assert(!shader->key.part.ps.prolog.force_persp_sample_interp ||
- (!G_0286CC_PERSP_CENTER_ENA(input_ena) &&
- !G_0286CC_PERSP_CENTROID_ENA(input_ena)));
- assert(!shader->key.part.ps.prolog.force_linear_sample_interp ||
- (!G_0286CC_LINEAR_CENTER_ENA(input_ena) &&
- !G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
-
- /* Validate cases when the optimizations are off (read as implications). */
- assert(shader->key.part.ps.prolog.bc_optimize_for_persp ||
- !G_0286CC_PERSP_CENTER_ENA(input_ena) ||
- !G_0286CC_PERSP_CENTROID_ENA(input_ena));
- assert(shader->key.part.ps.prolog.bc_optimize_for_linear ||
- !G_0286CC_LINEAR_CENTER_ENA(input_ena) ||
- !G_0286CC_LINEAR_CENTROID_ENA(input_ena));
-
- pm4 = si_get_shader_pm4_state(shader);
- if (!pm4)
- return;
-
- pm4->atom.emit = si_emit_shader_ps;
-
- /* SPI_BARYC_CNTL.POS_FLOAT_LOCATION
- * Possible vaules:
- * 0 -> Position = pixel center
- * 1 -> Position = pixel centroid
- * 2 -> Position = at sample position
- *
- * From GLSL 4.5 specification, section 7.1:
- * "The variable gl_FragCoord is available as an input variable from
- * within fragment shaders and it holds the window relative coordinates
- * (x, y, z, 1/w) values for the fragment. If multi-sampling, this
- * value can be for any location within the pixel, or one of the
- * fragment samples. The use of centroid does not further restrict
- * this value to be inside the current primitive."
- *
- * Meaning that centroid has no effect and we can return anything within
- * the pixel. Thus, return the value at sample position, because that's
- * the most accurate one shaders can get.
- */
- spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
-
- if (info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] ==
- TGSI_FS_COORD_PIXEL_CENTER_INTEGER)
- spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1);
-
- spi_shader_col_format = si_get_spi_shader_col_format(shader);
- cb_shader_mask = ac_get_cb_shader_mask(spi_shader_col_format);
-
- /* Ensure that some export memory is always allocated, for two reasons:
- *
- * 1) Correctness: The hardware ignores the EXEC mask if no export
- * memory is allocated, so KILL and alpha test do not work correctly
- * without this.
- * 2) Performance: Every shader needs at least a NULL export, even when
- * it writes no color/depth output. The NULL export instruction
- * stalls without this setting.
- *
- * Don't add this to CB_SHADER_MASK.
- *
- * GFX10 supports pixel shaders without exports by setting both
- * the color and Z formats to SPI_SHADER_ZERO. The hw will skip export
- * instructions if any are present.
- */
- if ((sscreen->info.chip_class <= GFX9 ||
- info->uses_kill ||
- shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) &&
- !spi_shader_col_format &&
- !info->writes_z && !info->writes_stencil && !info->writes_samplemask)
- spi_shader_col_format = V_028714_SPI_SHADER_32_R;
-
- shader->ctx_reg.ps.spi_ps_input_ena = input_ena;
- shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr;
-
- /* Set interpolation controls. */
- spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) |
- S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32);
-
- shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl;
- shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control;
- shader->ctx_reg.ps.spi_shader_z_format =
- ac_get_spi_shader_z_format(info->writes_z,
- info->writes_stencil,
- info->writes_samplemask);
- shader->ctx_reg.ps.spi_shader_col_format = spi_shader_col_format;
- shader->ctx_reg.ps.cb_shader_mask = cb_shader_mask;
-
- va = shader->bo->gpu_address;
- si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
- si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
- si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(va >> 40));
-
- uint32_t rsrc1 =
- S_00B028_VGPRS((shader->config.num_vgprs - 1) /
- (sscreen->ps_wave_size == 32 ? 8 : 4)) |
- S_00B028_DX10_CLAMP(1) |
- S_00B028_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
- S_00B028_FLOAT_MODE(shader->config.float_mode);
-
- if (sscreen->info.chip_class < GFX10) {
- rsrc1 |= S_00B028_SGPRS((shader->config.num_sgprs - 1) / 8);
- }
-
- si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS, rsrc1);
- si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
- S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
- S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) |
- S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+ if (!shader)
+ return;
+
+ /* R_0286CC_SPI_PS_INPUT_ENA, R_0286D0_SPI_PS_INPUT_ADDR */
+ radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA, SI_TRACKED_SPI_PS_INPUT_ENA,
+ shader->ctx_reg.ps.spi_ps_input_ena,
+ shader->ctx_reg.ps.spi_ps_input_addr);
+
+ radeon_opt_set_context_reg(sctx, R_0286E0_SPI_BARYC_CNTL, SI_TRACKED_SPI_BARYC_CNTL,
+ shader->ctx_reg.ps.spi_baryc_cntl);
+ radeon_opt_set_context_reg(sctx, R_0286D8_SPI_PS_IN_CONTROL, SI_TRACKED_SPI_PS_IN_CONTROL,
+ shader->ctx_reg.ps.spi_ps_in_control);
+
+ /* R_028710_SPI_SHADER_Z_FORMAT, R_028714_SPI_SHADER_COL_FORMAT */
+ radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT, SI_TRACKED_SPI_SHADER_Z_FORMAT,
+ shader->ctx_reg.ps.spi_shader_z_format,
+ shader->ctx_reg.ps.spi_shader_col_format);
+
+ radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK, SI_TRACKED_CB_SHADER_MASK,
+ shader->ctx_reg.ps.cb_shader_mask);
+
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll = true;
}
-static void si_shader_init_pm4_state(struct si_screen *sscreen,
- struct si_shader *shader)
+static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
{
- switch (shader->selector->type) {
- case PIPE_SHADER_VERTEX:
- if (shader->key.as_ls)
- si_shader_ls(sscreen, shader);
- else if (shader->key.as_es)
- si_shader_es(sscreen, shader);
- else if (shader->key.as_ngg)
- gfx10_shader_ngg(sscreen, shader);
- else
- si_shader_vs(sscreen, shader, NULL);
- break;
- case PIPE_SHADER_TESS_CTRL:
- si_shader_hs(sscreen, shader);
- break;
- case PIPE_SHADER_TESS_EVAL:
- if (shader->key.as_es)
- si_shader_es(sscreen, shader);
- else if (shader->key.as_ngg)
- gfx10_shader_ngg(sscreen, shader);
- else
- si_shader_vs(sscreen, shader, NULL);
- break;
- case PIPE_SHADER_GEOMETRY:
- if (shader->key.as_ngg)
- gfx10_shader_ngg(sscreen, shader);
- else
- si_shader_gs(sscreen, shader);
- break;
- case PIPE_SHADER_FRAGMENT:
- si_shader_ps(sscreen, shader);
- break;
- default:
- assert(0);
- }
+ struct si_shader_info *info = &shader->selector->info;
+ struct si_pm4_state *pm4;
+ unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask;
+ unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
+ uint64_t va;
+ unsigned input_ena = shader->config.spi_ps_input_ena;
+
+ /* we need to enable at least one of them, otherwise we hang the GPU */
+ assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) || G_0286CC_PERSP_CENTER_ENA(input_ena) ||
+ G_0286CC_PERSP_CENTROID_ENA(input_ena) || G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) ||
+ G_0286CC_LINEAR_SAMPLE_ENA(input_ena) || G_0286CC_LINEAR_CENTER_ENA(input_ena) ||
+ G_0286CC_LINEAR_CENTROID_ENA(input_ena) || G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena));
+ /* POS_W_FLOAT_ENA requires one of the perspective weights. */
+ assert(!G_0286CC_POS_W_FLOAT_ENA(input_ena) || G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
+ G_0286CC_PERSP_CENTER_ENA(input_ena) || G_0286CC_PERSP_CENTROID_ENA(input_ena) ||
+ G_0286CC_PERSP_PULL_MODEL_ENA(input_ena));
+
+ /* Validate interpolation optimization flags (read as implications). */
+ assert(!shader->key.part.ps.prolog.bc_optimize_for_persp ||
+ (G_0286CC_PERSP_CENTER_ENA(input_ena) && G_0286CC_PERSP_CENTROID_ENA(input_ena)));
+ assert(!shader->key.part.ps.prolog.bc_optimize_for_linear ||
+ (G_0286CC_LINEAR_CENTER_ENA(input_ena) && G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
+ assert(!shader->key.part.ps.prolog.force_persp_center_interp ||
+ (!G_0286CC_PERSP_SAMPLE_ENA(input_ena) && !G_0286CC_PERSP_CENTROID_ENA(input_ena)));
+ assert(!shader->key.part.ps.prolog.force_linear_center_interp ||
+ (!G_0286CC_LINEAR_SAMPLE_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
+ assert(!shader->key.part.ps.prolog.force_persp_sample_interp ||
+ (!G_0286CC_PERSP_CENTER_ENA(input_ena) && !G_0286CC_PERSP_CENTROID_ENA(input_ena)));
+ assert(!shader->key.part.ps.prolog.force_linear_sample_interp ||
+ (!G_0286CC_LINEAR_CENTER_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
+
+ /* Validate cases when the optimizations are off (read as implications). */
+ assert(shader->key.part.ps.prolog.bc_optimize_for_persp ||
+ !G_0286CC_PERSP_CENTER_ENA(input_ena) || !G_0286CC_PERSP_CENTROID_ENA(input_ena));
+ assert(shader->key.part.ps.prolog.bc_optimize_for_linear ||
+ !G_0286CC_LINEAR_CENTER_ENA(input_ena) || !G_0286CC_LINEAR_CENTROID_ENA(input_ena));
+
+ pm4 = si_get_shader_pm4_state(shader);
+ if (!pm4)
+ return;
+
+ pm4->atom.emit = si_emit_shader_ps;
+
+ /* SPI_BARYC_CNTL.POS_FLOAT_LOCATION
+ * Possible values:
+ * 0 -> Position = pixel center
+ * 1 -> Position = pixel centroid
+ * 2 -> Position = at sample position
+ *
+ * From GLSL 4.5 specification, section 7.1:
+ * "The variable gl_FragCoord is available as an input variable from
+ * within fragment shaders and it holds the window relative coordinates
+ * (x, y, z, 1/w) values for the fragment. If multi-sampling, this
+ * value can be for any location within the pixel, or one of the
+ * fragment samples. The use of centroid does not further restrict
+ * this value to be inside the current primitive."
+ *
+ * Meaning that centroid has no effect and we can return anything within
+ * the pixel. Thus, return the value at sample position, because that's
+ * the most accurate one shaders can get.
+ */
+ spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
+
+ if (info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] == TGSI_FS_COORD_PIXEL_CENTER_INTEGER)
+ spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1);
+
+ spi_shader_col_format = si_get_spi_shader_col_format(shader);
+ cb_shader_mask = ac_get_cb_shader_mask(spi_shader_col_format);
+
+ /* Ensure that some export memory is always allocated, for two reasons:
+ *
+ * 1) Correctness: The hardware ignores the EXEC mask if no export
+ * memory is allocated, so KILL and alpha test do not work correctly
+ * without this.
+ * 2) Performance: Every shader needs at least a NULL export, even when
+ * it writes no color/depth output. The NULL export instruction
+ * stalls without this setting.
+ *
+ * Don't add this to CB_SHADER_MASK.
+ *
+ * GFX10 supports pixel shaders without exports by setting both
+ * the color and Z formats to SPI_SHADER_ZERO. The hw will skip export
+ * instructions if any are present.
+ */
+ if ((sscreen->info.chip_class <= GFX9 || info->uses_kill ||
+ shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) &&
+ !spi_shader_col_format && !info->writes_z && !info->writes_stencil &&
+ !info->writes_samplemask)
+ spi_shader_col_format = V_028714_SPI_SHADER_32_R;
+
+ shader->ctx_reg.ps.spi_ps_input_ena = input_ena;
+ shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr;
+
+ /* Set interpolation controls. */
+ spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) |
+ S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32);
+
+ shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl;
+ shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control;
+ shader->ctx_reg.ps.spi_shader_z_format =
+ ac_get_spi_shader_z_format(info->writes_z, info->writes_stencil, info->writes_samplemask);
+ shader->ctx_reg.ps.spi_shader_col_format = spi_shader_col_format;
+ shader->ctx_reg.ps.cb_shader_mask = cb_shader_mask;
+
+ va = shader->bo->gpu_address;
+ si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
+ si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
+ si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(va >> 40));
+
+ uint32_t rsrc1 =
+ S_00B028_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ps_wave_size == 32 ? 8 : 4)) |
+ S_00B028_DX10_CLAMP(1) | S_00B028_MEM_ORDERED(sscreen->info.chip_class >= GFX10) |
+ S_00B028_FLOAT_MODE(shader->config.float_mode);
+
+ if (sscreen->info.chip_class < GFX10) {
+ rsrc1 |= S_00B028_SGPRS((shader->config.num_sgprs - 1) / 8);
+ }
+
+ si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS, rsrc1);
+ si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
+ S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
+ S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) |
+ S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+}
+
+static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader *shader)
+{
+ switch (shader->selector->type) {
+ case PIPE_SHADER_VERTEX:
+ if (shader->key.as_ls)
+ si_shader_ls(sscreen, shader);
+ else if (shader->key.as_es)
+ si_shader_es(sscreen, shader);
+ else if (shader->key.as_ngg)
+ gfx10_shader_ngg(sscreen, shader);
+ else
+ si_shader_vs(sscreen, shader, NULL);
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ si_shader_hs(sscreen, shader);
+ break;
+ case PIPE_SHADER_TESS_EVAL:
+ if (shader->key.as_es)
+ si_shader_es(sscreen, shader);
+ else if (shader->key.as_ngg)
+ gfx10_shader_ngg(sscreen, shader);
+ else
+ si_shader_vs(sscreen, shader, NULL);
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ if (shader->key.as_ngg)
+ gfx10_shader_ngg(sscreen, shader);
+ else
+ si_shader_gs(sscreen, shader);
+ break;
+ case PIPE_SHADER_FRAGMENT:
+ si_shader_ps(sscreen, shader);
+ break;
+ default:
+ assert(0);
+ }
}
static unsigned si_get_alpha_test_func(struct si_context *sctx)
{
- /* Alpha-test should be disabled if colorbuffer 0 is integer. */
- return sctx->queued.named.dsa->alpha_func;
+ /* Alpha-test should be disabled if colorbuffer 0 is integer. */
+ return sctx->queued.named.dsa->alpha_func;
}
-void si_shader_selector_key_vs(struct si_context *sctx,
- struct si_shader_selector *vs,
- struct si_shader_key *key,
- struct si_vs_prolog_bits *prolog_key)
+void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs,
+ struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key)
{
- if (!sctx->vertex_elements ||
- vs->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD])
- return;
-
- struct si_vertex_elements *elts = sctx->vertex_elements;
-
- prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one;
- prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched;
- prolog_key->unpack_instance_id_from_vertex_id =
- sctx->prim_discard_cs_instancing;
-
- /* Prefer a monolithic shader to allow scheduling divisions around
- * VBO loads. */
- if (prolog_key->instance_divisor_is_fetched)
- key->opt.prefer_mono = 1;
-
- unsigned count = MIN2(vs->info.num_inputs, elts->count);
- unsigned count_mask = (1 << count) - 1;
- unsigned fix = elts->fix_fetch_always & count_mask;
- unsigned opencode = elts->fix_fetch_opencode & count_mask;
-
- if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) {
- uint32_t mask = elts->fix_fetch_unaligned & count_mask;
- while (mask) {
- unsigned i = u_bit_scan(&mask);
- unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1);
- unsigned vbidx = elts->vertex_buffer_index[i];
- struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx];
- unsigned align_mask = (1 << log_hw_load_size) - 1;
- if (vb->buffer_offset & align_mask ||
- vb->stride & align_mask) {
- fix |= 1 << i;
- opencode |= 1 << i;
- }
- }
- }
-
- while (fix) {
- unsigned i = u_bit_scan(&fix);
- key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i];
- }
- key->mono.vs_fetch_opencode = opencode;
-}
+ if (!sctx->vertex_elements || vs->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD])
+ return;
-static void si_shader_selector_key_hw_vs(struct si_context *sctx,
- struct si_shader_selector *vs,
- struct si_shader_key *key)
-{
- struct si_shader_selector *ps = sctx->ps_shader.cso;
-
- key->opt.clip_disable =
- sctx->queued.named.rasterizer->clip_plane_enable == 0 &&
- (vs->info.clipdist_writemask ||
- vs->info.writes_clipvertex) &&
- !vs->info.culldist_writemask;
-
- /* Find out if PS is disabled. */
- bool ps_disabled = true;
- if (ps) {
- bool ps_modifies_zs = ps->info.uses_kill ||
- ps->info.writes_z ||
- ps->info.writes_stencil ||
- ps->info.writes_samplemask ||
- sctx->queued.named.blend->alpha_to_coverage ||
- si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS;
- unsigned ps_colormask = si_get_total_colormask(sctx);
-
- ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard ||
- (!ps_colormask &&
- !ps_modifies_zs &&
- !ps->info.writes_memory);
- }
-
- /* Find out which VS outputs aren't used by the PS. */
- uint64_t outputs_written = vs->outputs_written_before_ps;
- uint64_t inputs_read = 0;
-
- /* Ignore outputs that are not passed from VS to PS. */
- outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0, true)) |
- (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0, true)) |
- (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_CLIPVERTEX, 0, true)));
-
- if (!ps_disabled) {
- inputs_read = ps->inputs_read;
- }
-
- uint64_t linked = outputs_written & inputs_read;
-
- key->opt.kill_outputs = ~linked & outputs_written;
- key->opt.ngg_culling = sctx->ngg_culling;
+ struct si_vertex_elements *elts = sctx->vertex_elements;
+
+ prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one;
+ prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched;
+ prolog_key->unpack_instance_id_from_vertex_id = sctx->prim_discard_cs_instancing;
+
+ /* Prefer a monolithic shader to allow scheduling divisions around
+ * VBO loads. */
+ if (prolog_key->instance_divisor_is_fetched)
+ key->opt.prefer_mono = 1;
+
+ unsigned count = MIN2(vs->info.num_inputs, elts->count);
+ unsigned count_mask = (1 << count) - 1;
+ unsigned fix = elts->fix_fetch_always & count_mask;
+ unsigned opencode = elts->fix_fetch_opencode & count_mask;
+
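+ /* If a bound vertex buffer's offset or stride isn't aligned to the hw
+ * load size (2 or 4 bytes here), the fetch must be fixed up and
+ * opencoded in the shader.
+ */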
+ if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) {
+ uint32_t mask = elts->fix_fetch_unaligned & count_mask;
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+ unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1);
+ unsigned vbidx = elts->vertex_buffer_index[i];
+ struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx];
+ unsigned align_mask = (1 << log_hw_load_size) - 1;
+ if (vb->buffer_offset & align_mask || vb->stride & align_mask) {
+ fix |= 1 << i;
+ opencode |= 1 << i;
+ }
+ }
+ }
+
+ while (fix) {
+ unsigned i = u_bit_scan(&fix);
+ key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i];
+ }
+ key->mono.vs_fetch_opencode = opencode;
}
-/* Compute the key for the hw shader variant */
-static inline void si_shader_selector_key(struct pipe_context *ctx,
- struct si_shader_selector *sel,
- union si_vgt_stages_key stages_key,
- struct si_shader_key *key)
+static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shader_selector *vs,
+ struct si_shader_key *key)
{
- struct si_context *sctx = (struct si_context *)ctx;
-
- memset(key, 0, sizeof(*key));
-
- switch (sel->type) {
- case PIPE_SHADER_VERTEX:
- si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog);
-
- if (sctx->tes_shader.cso)
- key->as_ls = 1;
- else if (sctx->gs_shader.cso) {
- key->as_es = 1;
- key->as_ngg = stages_key.u.ngg;
- } else {
- key->as_ngg = stages_key.u.ngg;
- si_shader_selector_key_hw_vs(sctx, sel, key);
-
- if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
- key->mono.u.vs_export_prim_id = 1;
- }
- break;
- case PIPE_SHADER_TESS_CTRL:
- if (sctx->chip_class >= GFX9) {
- si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
- key, &key->part.tcs.ls_prolog);
- key->part.tcs.ls = sctx->vs_shader.cso;
-
- /* When the LS VGPR fix is needed, monolithic shaders
- * can:
- * - avoid initializing EXEC in both the LS prolog
- * and the LS main part when !vs_needs_prolog
- * - remove the fixup for unused input VGPRs
- */
- key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix;
-
- /* The LS output / HS input layout can be communicated
- * directly instead of via user SGPRs for merged LS-HS.
- * The LS VGPR fix prefers this too.
- */
- key->opt.prefer_mono = 1;
- }
-
- key->part.tcs.epilog.prim_mode =
- sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
- key->part.tcs.epilog.invoc0_tess_factors_are_def =
- sel->info.tessfactors_are_def_in_all_invocs;
- key->part.tcs.epilog.tes_reads_tess_factors =
- sctx->tes_shader.cso->info.reads_tess_factors;
-
- if (sel == sctx->fixed_func_tcs_shader.cso)
- key->mono.u.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written;
- break;
- case PIPE_SHADER_TESS_EVAL:
- key->as_ngg = stages_key.u.ngg;
-
- if (sctx->gs_shader.cso)
- key->as_es = 1;
- else {
- si_shader_selector_key_hw_vs(sctx, sel, key);
-
- if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
- key->mono.u.vs_export_prim_id = 1;
- }
- break;
- case PIPE_SHADER_GEOMETRY:
- if (sctx->chip_class >= GFX9) {
- if (sctx->tes_shader.cso) {
- key->part.gs.es = sctx->tes_shader.cso;
- } else {
- si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
- key, &key->part.gs.vs_prolog);
- key->part.gs.es = sctx->vs_shader.cso;
- key->part.gs.prolog.gfx9_prev_is_vs = 1;
- }
-
- key->as_ngg = stages_key.u.ngg;
-
- /* Merged ES-GS can have unbalanced wave usage.
- *
- * ES threads are per-vertex, while GS threads are
- * per-primitive. So without any amplification, there
- * are fewer GS threads than ES threads, which can result
- * in empty (no-op) GS waves. With too much amplification,
- * there are more GS threads than ES threads, which
- * can result in empty (no-op) ES waves.
- *
- * Non-monolithic shaders are implemented by setting EXEC
- * at the beginning of shader parts, and don't jump to
- * the end if EXEC is 0.
- *
- * Monolithic shaders use conditional blocks, so they can
- * jump and skip empty waves of ES or GS. So set this to
- * always use optimized variants, which are monolithic.
- */
- key->opt.prefer_mono = 1;
- }
- key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix;
- break;
- case PIPE_SHADER_FRAGMENT: {
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- struct si_state_blend *blend = sctx->queued.named.blend;
-
- if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
- sel->info.colors_written == 0x1)
- key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
-
- /* Select the shader color format based on whether
- * blending or alpha are needed.
- */
- key->part.ps.epilog.spi_shader_col_format =
- (blend->blend_enable_4bit & blend->need_src_alpha_4bit &
- sctx->framebuffer.spi_shader_col_format_blend_alpha) |
- (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
- sctx->framebuffer.spi_shader_col_format_blend) |
- (~blend->blend_enable_4bit & blend->need_src_alpha_4bit &
- sctx->framebuffer.spi_shader_col_format_alpha) |
- (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
- sctx->framebuffer.spi_shader_col_format);
- key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit;
-
- /* The output for dual source blending should have
- * the same format as the first output.
- */
- if (blend->dual_src_blend) {
- key->part.ps.epilog.spi_shader_col_format |=
- (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4;
- }
-
- /* If alpha-to-coverage is enabled, we have to export alpha
- * even if there is no color buffer.
- */
- if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) &&
- blend->alpha_to_coverage)
- key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR;
-
- /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs
- * to the range supported by the type if a channel has less
- * than 16 bits and the export format is 16_ABGR.
- */
- if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) {
- key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8;
- key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10;
- }
-
- /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */
- if (!key->part.ps.epilog.last_cbuf) {
- key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit;
- key->part.ps.epilog.color_is_int8 &= sel->info.colors_written;
- key->part.ps.epilog.color_is_int10 &= sel->info.colors_written;
- }
-
- bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim);
- bool is_line = util_prim_is_lines(sctx->current_rast_prim);
-
- key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
- key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read;
-
- key->part.ps.epilog.alpha_to_one = blend->alpha_to_one &&
- rs->multisample_enable;
-
- key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
- key->part.ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
- (is_line && rs->line_smooth)) &&
- sctx->framebuffer.nr_samples <= 1;
- key->part.ps.epilog.clamp_color = rs->clamp_fragment_color;
-
- if (sctx->ps_iter_samples > 1 &&
- sel->info.reads_samplemask) {
- key->part.ps.prolog.samplemask_log_ps_iter =
- util_logbase2(sctx->ps_iter_samples);
- }
-
- if (rs->force_persample_interp &&
- rs->multisample_enable &&
- sctx->framebuffer.nr_samples > 1 &&
- sctx->ps_iter_samples > 1) {
- key->part.ps.prolog.force_persp_sample_interp =
- sel->info.uses_persp_center ||
- sel->info.uses_persp_centroid;
-
- key->part.ps.prolog.force_linear_sample_interp =
- sel->info.uses_linear_center ||
- sel->info.uses_linear_centroid;
- } else if (rs->multisample_enable &&
- sctx->framebuffer.nr_samples > 1) {
- key->part.ps.prolog.bc_optimize_for_persp =
- sel->info.uses_persp_center &&
- sel->info.uses_persp_centroid;
- key->part.ps.prolog.bc_optimize_for_linear =
- sel->info.uses_linear_center &&
- sel->info.uses_linear_centroid;
- } else {
- /* Make sure SPI doesn't compute more than 1 pair
- * of (i,j), which is the optimization here. */
- key->part.ps.prolog.force_persp_center_interp =
- sel->info.uses_persp_center +
- sel->info.uses_persp_centroid +
- sel->info.uses_persp_sample > 1;
-
- key->part.ps.prolog.force_linear_center_interp =
- sel->info.uses_linear_center +
- sel->info.uses_linear_centroid +
- sel->info.uses_linear_sample > 1;
-
- if (sel->info.uses_persp_opcode_interp_sample ||
- sel->info.uses_linear_opcode_interp_sample)
- key->mono.u.ps.interpolate_at_sample_force_center = 1;
- }
-
- key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
-
- /* ps_uses_fbfetch is true only if the color buffer is bound. */
- if (sctx->ps_uses_fbfetch && !sctx->blitter->running) {
- struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
- struct pipe_resource *tex = cb0->texture;
-
- /* 1D textures are allocated and used as 2D on GFX9. */
- key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1;
- key->mono.u.ps.fbfetch_is_1D = sctx->chip_class != GFX9 &&
- (tex->target == PIPE_TEXTURE_1D ||
- tex->target == PIPE_TEXTURE_1D_ARRAY);
- key->mono.u.ps.fbfetch_layered = tex->target == PIPE_TEXTURE_1D_ARRAY ||
- tex->target == PIPE_TEXTURE_2D_ARRAY ||
- tex->target == PIPE_TEXTURE_CUBE ||
- tex->target == PIPE_TEXTURE_CUBE_ARRAY ||
- tex->target == PIPE_TEXTURE_3D;
- }
- break;
- }
- default:
- assert(0);
- }
-
- if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT)))
- memset(&key->opt, 0, sizeof(key->opt));
+ struct si_shader_selector *ps = sctx->ps_shader.cso;
+
+ key->opt.clip_disable = sctx->queued.named.rasterizer->clip_plane_enable == 0 &&
+ (vs->info.clipdist_writemask || vs->info.writes_clipvertex) &&
+ !vs->info.culldist_writemask;
+
+ /* Find out if PS is disabled. */
+ bool ps_disabled = true;
+ if (ps) {
+ bool ps_modifies_zs = ps->info.uses_kill || ps->info.writes_z || ps->info.writes_stencil ||
+ ps->info.writes_samplemask ||
+ sctx->queued.named.blend->alpha_to_coverage ||
+ si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS;
+ unsigned ps_colormask = si_get_total_colormask(sctx);
+
+ ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard ||
+ (!ps_colormask && !ps_modifies_zs && !ps->info.writes_memory);
+ }
+
+ /* Find out which VS outputs aren't used by the PS. */
+ uint64_t outputs_written = vs->outputs_written_before_ps;
+ uint64_t inputs_read = 0;
+
+ /* Ignore outputs that are not passed from VS to PS. */
+ outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0, true)) |
+ (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0, true)) |
+ (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_CLIPVERTEX, 0, true)));
+
+ if (!ps_disabled) {
+ inputs_read = ps->inputs_read;
+ }
+
+ uint64_t linked = outputs_written & inputs_read;
+
+ key->opt.kill_outputs = ~linked & outputs_written;
+ key->opt.ngg_culling = sctx->ngg_culling;
}
-static void si_build_shader_variant(struct si_shader *shader,
- int thread_index,
- bool low_priority)
-{
- struct si_shader_selector *sel = shader->selector;
- struct si_screen *sscreen = sel->screen;
- struct ac_llvm_compiler *compiler;
- struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug;
-
- if (thread_index >= 0) {
- if (low_priority) {
- assert(thread_index < ARRAY_SIZE(sscreen->compiler_lowp));
- compiler = &sscreen->compiler_lowp[thread_index];
- } else {
- assert(thread_index < ARRAY_SIZE(sscreen->compiler));
- compiler = &sscreen->compiler[thread_index];
- }
- if (!debug->async)
- debug = NULL;
- } else {
- assert(!low_priority);
- compiler = shader->compiler_ctx_state.compiler;
- }
-
- if (!compiler->passes)
- si_init_compiler(sscreen, compiler);
-
- if (unlikely(!si_create_shader_variant(sscreen, compiler, shader, debug))) {
- PRINT_ERR("Failed to build shader variant (type=%u)\n",
- sel->type);
- shader->compilation_failed = true;
- return;
- }
-
- if (shader->compiler_ctx_state.is_debug_context) {
- FILE *f = open_memstream(&shader->shader_log,
- &shader->shader_log_size);
- if (f) {
- si_shader_dump(sscreen, shader, NULL, f, false);
- fclose(f);
- }
- }
-
- si_shader_init_pm4_state(sscreen, shader);
+/* Compute the key for the hw shader variant */
+static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel,
+ union si_vgt_stages_key stages_key,
+ struct si_shader_key *key)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ memset(key, 0, sizeof(*key));
+
+ switch (sel->type) {
+ case PIPE_SHADER_VERTEX:
+ si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog);
+
+ if (sctx->tes_shader.cso)
+ key->as_ls = 1;
+ else if (sctx->gs_shader.cso) {
+ key->as_es = 1;
+ key->as_ngg = stages_key.u.ngg;
+ } else {
+ key->as_ngg = stages_key.u.ngg;
+ si_shader_selector_key_hw_vs(sctx, sel, key);
+
+ if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
+ key->mono.u.vs_export_prim_id = 1;
+ }
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ if (sctx->chip_class >= GFX9) {
+ si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, key, &key->part.tcs.ls_prolog);
+ key->part.tcs.ls = sctx->vs_shader.cso;
+
+ /* When the LS VGPR fix is needed, monolithic shaders
+ * can:
+ * - avoid initializing EXEC in both the LS prolog
+ * and the LS main part when !vs_needs_prolog
+ * - remove the fixup for unused input VGPRs
+ */
+ key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix;
+
+ /* The LS output / HS input layout can be communicated
+ * directly instead of via user SGPRs for merged LS-HS.
+ * The LS VGPR fix prefers this too.
+ */
+ key->opt.prefer_mono = 1;
+ }
+
+ key->part.tcs.epilog.prim_mode =
+ sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+ key->part.tcs.epilog.invoc0_tess_factors_are_def =
+ sel->info.tessfactors_are_def_in_all_invocs;
+ key->part.tcs.epilog.tes_reads_tess_factors = sctx->tes_shader.cso->info.reads_tess_factors;
+
+ if (sel == sctx->fixed_func_tcs_shader.cso)
+ key->mono.u.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written;
+ break;
+ case PIPE_SHADER_TESS_EVAL:
+ key->as_ngg = stages_key.u.ngg;
+
+ if (sctx->gs_shader.cso)
+ key->as_es = 1;
+ else {
+ si_shader_selector_key_hw_vs(sctx, sel, key);
+
+ if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
+ key->mono.u.vs_export_prim_id = 1;
+ }
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ if (sctx->chip_class >= GFX9) {
+ if (sctx->tes_shader.cso) {
+ key->part.gs.es = sctx->tes_shader.cso;
+ } else {
+ si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, key, &key->part.gs.vs_prolog);
+ key->part.gs.es = sctx->vs_shader.cso;
+ key->part.gs.prolog.gfx9_prev_is_vs = 1;
+ }
+
+ key->as_ngg = stages_key.u.ngg;
+
+ /* Merged ES-GS can have unbalanced wave usage.
+ *
+ * ES threads are per-vertex, while GS threads are
+ * per-primitive. So without any amplification, there
+ * are fewer GS threads than ES threads, which can result
+ * in empty (no-op) GS waves. With too much amplification,
+ * there are more GS threads than ES threads, which
+ * can result in empty (no-op) ES waves.
+ *
+ * Non-monolithic shaders are implemented by setting EXEC
+ * at the beginning of shader parts, and don't jump to
+ * the end if EXEC is 0.
+ *
+ * Monolithic shaders use conditional blocks, so they can
+ * jump and skip empty waves of ES or GS. So set this to
+ * always use optimized variants, which are monolithic.
+ */
+ key->opt.prefer_mono = 1;
+ }
+ key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix;
+ break;
+ case PIPE_SHADER_FRAGMENT: {
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+ struct si_state_blend *blend = sctx->queued.named.blend;
+
+ if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
+ sel->info.colors_written == 0x1)
+ key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
+
+ /* Select the shader color format based on whether
+ * blending or alpha are needed.
+ */
+ key->part.ps.epilog.spi_shader_col_format =
+ (blend->blend_enable_4bit & blend->need_src_alpha_4bit &
+ sctx->framebuffer.spi_shader_col_format_blend_alpha) |
+ (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
+ sctx->framebuffer.spi_shader_col_format_blend) |
+ (~blend->blend_enable_4bit & blend->need_src_alpha_4bit &
+ sctx->framebuffer.spi_shader_col_format_alpha) |
+ (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit &
+ sctx->framebuffer.spi_shader_col_format);
+ key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit;
+
+ /* The output for dual source blending should have
+ * the same format as the first output.
+ */
+ if (blend->dual_src_blend) {
+ key->part.ps.epilog.spi_shader_col_format |=
+ (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4;
+ }
+
+ /* If alpha-to-coverage is enabled, we have to export alpha
+ * even if there is no color buffer.
+ */
+ if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage)
+ key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR;
+
+ /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs
+ * to the range supported by the type if a channel has less
+ * than 16 bits and the export format is 16_ABGR.
+ */
+ if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) {
+ key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8;
+ key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10;
+ }
+
+ /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */
+ if (!key->part.ps.epilog.last_cbuf) {
+ key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit;
+ key->part.ps.epilog.color_is_int8 &= sel->info.colors_written;
+ key->part.ps.epilog.color_is_int10 &= sel->info.colors_written;
+ }
+
+ bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim);
+ bool is_line = util_prim_is_lines(sctx->current_rast_prim);
+
+ key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
+ key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read;
+
+ key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable;
+
+ key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
+ key->part.ps.epilog.poly_line_smoothing =
+ ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) &&
+ sctx->framebuffer.nr_samples <= 1;
+ key->part.ps.epilog.clamp_color = rs->clamp_fragment_color;
+
+ if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) {
+ key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples);
+ }
+
+ if (rs->force_persample_interp && rs->multisample_enable &&
+ sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) {
+ key->part.ps.prolog.force_persp_sample_interp =
+ sel->info.uses_persp_center || sel->info.uses_persp_centroid;
+
+ key->part.ps.prolog.force_linear_sample_interp =
+ sel->info.uses_linear_center || sel->info.uses_linear_centroid;
+ } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) {
+ key->part.ps.prolog.bc_optimize_for_persp =
+ sel->info.uses_persp_center && sel->info.uses_persp_centroid;
+ key->part.ps.prolog.bc_optimize_for_linear =
+ sel->info.uses_linear_center && sel->info.uses_linear_centroid;
+ } else {
+ /* Make sure SPI doesn't compute more than 1 pair
+ * of (i,j), which is the optimization here. */
+ key->part.ps.prolog.force_persp_center_interp =
+    sel->info.uses_persp_center + sel->info.uses_persp_centroid +
+    sel->info.uses_persp_sample > 1;
+
+ key->part.ps.prolog.force_linear_center_interp =
+    sel->info.uses_linear_center + sel->info.uses_linear_centroid +
+    sel->info.uses_linear_sample > 1;
+
+ if (sel->info.uses_persp_opcode_interp_sample ||
+ sel->info.uses_linear_opcode_interp_sample)
+ key->mono.u.ps.interpolate_at_sample_force_center = 1;
+ }
+
+ key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
+
+ /* ps_uses_fbfetch is true only if the color buffer is bound. */
+ if (sctx->ps_uses_fbfetch && !sctx->blitter->running) {
+ struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
+ struct pipe_resource *tex = cb0->texture;
+
+ /* 1D textures are allocated and used as 2D on GFX9. */
+ key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1;
+ key->mono.u.ps.fbfetch_is_1D =
+ sctx->chip_class != GFX9 &&
+ (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY);
+ key->mono.u.ps.fbfetch_layered =
+ tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY ||
+ tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY ||
+ tex->target == PIPE_TEXTURE_3D;
+ }
+ break;
+ }
+ default:
+ assert(0);
+ }
+
+ if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT)))
+ memset(&key->opt, 0, sizeof(key->opt));
+}
+
+static void si_build_shader_variant(struct si_shader *shader, int thread_index, bool low_priority)
+{
+ struct si_shader_selector *sel = shader->selector;
+ struct si_screen *sscreen = sel->screen;
+ struct ac_llvm_compiler *compiler;
+ struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug;
+
+ if (thread_index >= 0) {
+ if (low_priority) {
+ assert(thread_index < ARRAY_SIZE(sscreen->compiler_lowp));
+ compiler = &sscreen->compiler_lowp[thread_index];
+ } else {
+ assert(thread_index < ARRAY_SIZE(sscreen->compiler));
+ compiler = &sscreen->compiler[thread_index];
+ }
+ if (!debug->async)
+ debug = NULL;
+ } else {
+ assert(!low_priority);
+ compiler = shader->compiler_ctx_state.compiler;
+ }
+
+ if (!compiler->passes)
+ si_init_compiler(sscreen, compiler);
+
+ if (unlikely(!si_create_shader_variant(sscreen, compiler, shader, debug))) {
+ PRINT_ERR("Failed to build shader variant (type=%u)\n", sel->type);
+ shader->compilation_failed = true;
+ return;
+ }
+
+ if (shader->compiler_ctx_state.is_debug_context) {
+ FILE *f = open_memstream(&shader->shader_log, &shader->shader_log_size);
+ if (f) {
+ si_shader_dump(sscreen, shader, NULL, f, false);
+ fclose(f);
+ }
+ }
+
+ si_shader_init_pm4_state(sscreen, shader);
}
static void si_build_shader_variant_low_priority(void *job, int thread_index)
{
- struct si_shader *shader = (struct si_shader *)job;
+ struct si_shader *shader = (struct si_shader *)job;
- assert(thread_index >= 0);
+ assert(thread_index >= 0);
- si_build_shader_variant(shader, thread_index, true);
+ si_build_shader_variant(shader, thread_index, true);
}
static const struct si_shader_key zeroed;
-static bool si_check_missing_main_part(struct si_screen *sscreen,
- struct si_shader_selector *sel,
- struct si_compiler_ctx_state *compiler_state,
- struct si_shader_key *key)
+static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_selector *sel,
+ struct si_compiler_ctx_state *compiler_state,
+ struct si_shader_key *key)
{
- struct si_shader **mainp = si_get_main_shader_part(sel, key);
-
- if (!*mainp) {
- struct si_shader *main_part = CALLOC_STRUCT(si_shader);
-
- if (!main_part)
- return false;
-
- /* We can leave the fence as permanently signaled because the
- * main part becomes visible globally only after it has been
- * compiled. */
- util_queue_fence_init(&main_part->ready);
-
- main_part->selector = sel;
- main_part->key.as_es = key->as_es;
- main_part->key.as_ls = key->as_ls;
- main_part->key.as_ngg = key->as_ngg;
- main_part->is_monolithic = false;
-
- if (!si_compile_shader(sscreen, compiler_state->compiler,
- main_part, &compiler_state->debug)) {
- FREE(main_part);
- return false;
- }
- *mainp = main_part;
- }
- return true;
+ struct si_shader **mainp = si_get_main_shader_part(sel, key);
+
+ if (!*mainp) {
+ struct si_shader *main_part = CALLOC_STRUCT(si_shader);
+
+ if (!main_part)
+ return false;
+
+ /* We can leave the fence as permanently signaled because the
+ * main part becomes visible globally only after it has been
+ * compiled. */
+ util_queue_fence_init(&main_part->ready);
+
+ main_part->selector = sel;
+ main_part->key.as_es = key->as_es;
+ main_part->key.as_ls = key->as_ls;
+ main_part->key.as_ngg = key->as_ngg;
+ main_part->is_monolithic = false;
+
+ if (!si_compile_shader(sscreen, compiler_state->compiler, main_part,
+ &compiler_state->debug)) {
+ FREE(main_part);
+ return false;
+ }
+ *mainp = main_part;
+ }
+ return true;
}
/**
 * Select a shader variant according to the shader key.
 *
 * \param optimized_or_none  If the key describes an optimized shader variant and
 *                           the compilation isn't finished, don't select any
 *                           shader and return an error.
 */
-int si_shader_select_with_key(struct si_screen *sscreen,
- struct si_shader_ctx_state *state,
- struct si_compiler_ctx_state *compiler_state,
- struct si_shader_key *key,
- int thread_index,
- bool optimized_or_none)
+int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state,
+ struct si_compiler_ctx_state *compiler_state,
+ struct si_shader_key *key, int thread_index, bool optimized_or_none)
{
- struct si_shader_selector *sel = state->cso;
- struct si_shader_selector *previous_stage_sel = NULL;
- struct si_shader *current = state->current;
- struct si_shader *iter, *shader = NULL;
+ struct si_shader_selector *sel = state->cso;
+ struct si_shader_selector *previous_stage_sel = NULL;
+ struct si_shader *current = state->current;
+ struct si_shader *iter, *shader = NULL;
again:
- /* Check if we don't need to change anything.
- * This path is also used for most shaders that don't need multiple
- * variants, it will cost just a computation of the key and this
- * test. */
- if (likely(current &&
- memcmp(¤t->key, key, sizeof(*key)) == 0)) {
- if (unlikely(!util_queue_fence_is_signalled(¤t->ready))) {
- if (current->is_optimized) {
- if (optimized_or_none)
- return -1;
-
- memset(&key->opt, 0, sizeof(key->opt));
- goto current_not_ready;
- }
-
- util_queue_fence_wait(¤t->ready);
- }
-
- return current->compilation_failed ? -1 : 0;
- }
+ /* Check if we don't need to change anything.
+ * This path is also used for most shaders that don't need multiple
+ * variants; it will cost just a computation of the key and this
+ * test. */
+ if (likely(current && memcmp(¤t->key, key, sizeof(*key)) == 0)) {
+ if (unlikely(!util_queue_fence_is_signalled(¤t->ready))) {
+ if (current->is_optimized) {
+ if (optimized_or_none)
+ return -1;
+
+ memset(&key->opt, 0, sizeof(key->opt));
+ goto current_not_ready;
+ }
+
+ util_queue_fence_wait(¤t->ready);
+ }
+
+ return current->compilation_failed ? -1 : 0;
+ }
current_not_ready:
- /* This must be done before the mutex is locked, because async GS
- * compilation calls this function too, and therefore must enter
- * the mutex first.
- *
- * Only wait if we are in a draw call. Don't wait if we are
- * in a compiler thread.
- */
- if (thread_index < 0)
- util_queue_fence_wait(&sel->ready);
-
- simple_mtx_lock(&sel->mutex);
-
- /* Find the shader variant. */
- for (iter = sel->first_variant; iter; iter = iter->next_variant) {
- /* Don't check the "current" shader. We checked it above. */
- if (current != iter &&
- memcmp(&iter->key, key, sizeof(*key)) == 0) {
- simple_mtx_unlock(&sel->mutex);
-
- if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) {
- /* If it's an optimized shader and its compilation has
- * been started but isn't done, use the unoptimized
- * shader so as not to cause a stall due to compilation.
- */
- if (iter->is_optimized) {
- if (optimized_or_none)
- return -1;
- memset(&key->opt, 0, sizeof(key->opt));
- goto again;
- }
-
- util_queue_fence_wait(&iter->ready);
- }
-
- if (iter->compilation_failed) {
- return -1; /* skip the draw call */
- }
-
- state->current = iter;
- return 0;
- }
- }
-
- /* Build a new shader. */
- shader = CALLOC_STRUCT(si_shader);
- if (!shader) {
- simple_mtx_unlock(&sel->mutex);
- return -ENOMEM;
- }
-
- util_queue_fence_init(&shader->ready);
-
- shader->selector = sel;
- shader->key = *key;
- shader->compiler_ctx_state = *compiler_state;
-
- /* If this is a merged shader, get the first shader's selector. */
- if (sscreen->info.chip_class >= GFX9) {
- if (sel->type == PIPE_SHADER_TESS_CTRL)
- previous_stage_sel = key->part.tcs.ls;
- else if (sel->type == PIPE_SHADER_GEOMETRY)
- previous_stage_sel = key->part.gs.es;
-
- /* We need to wait for the previous shader. */
- if (previous_stage_sel && thread_index < 0)
- util_queue_fence_wait(&previous_stage_sel->ready);
- }
-
- bool is_pure_monolithic =
- sscreen->use_monolithic_shaders ||
- memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0;
-
- /* Compile the main shader part if it doesn't exist. This can happen
- * if the initial guess was wrong.
- *
- * The prim discard CS doesn't need the main shader part.
- */
- if (!is_pure_monolithic &&
- !key->opt.vs_as_prim_discard_cs) {
- bool ok = true;
-
- /* Make sure the main shader part is present. This is needed
- * for shaders that can be compiled as VS, LS, or ES, and only
- * one of them is compiled at creation.
- *
- * It is also needed for GS, which can be compiled as non-NGG
- * and NGG.
- *
- * For merged shaders, check that the starting shader's main
- * part is present.
- */
- if (previous_stage_sel) {
- struct si_shader_key shader1_key = zeroed;
-
- if (sel->type == PIPE_SHADER_TESS_CTRL) {
- shader1_key.as_ls = 1;
- } else if (sel->type == PIPE_SHADER_GEOMETRY) {
- shader1_key.as_es = 1;
- shader1_key.as_ngg = key->as_ngg; /* for Wave32 vs Wave64 */
- } else {
- assert(0);
- }
-
- simple_mtx_lock(&previous_stage_sel->mutex);
- ok = si_check_missing_main_part(sscreen,
- previous_stage_sel,
- compiler_state, &shader1_key);
- simple_mtx_unlock(&previous_stage_sel->mutex);
- }
-
- if (ok) {
- ok = si_check_missing_main_part(sscreen, sel,
- compiler_state, key);
- }
-
- if (!ok) {
- FREE(shader);
- simple_mtx_unlock(&sel->mutex);
- return -ENOMEM; /* skip the draw call */
- }
- }
-
- /* Keep the reference to the 1st shader of merged shaders, so that
- * Gallium can't destroy it before we destroy the 2nd shader.
- *
- * Set sctx = NULL, because it's unused if we're not releasing
- * the shader, and we don't have any sctx here.
- */
- si_shader_selector_reference(NULL, &shader->previous_stage_sel,
- previous_stage_sel);
-
- /* Monolithic-only shaders don't make a distinction between optimized
- * and unoptimized. */
- shader->is_monolithic =
- is_pure_monolithic ||
- memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
-
- /* The prim discard CS is always optimized. */
- shader->is_optimized =
- (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) &&
- memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
-
- /* If it's an optimized shader, compile it asynchronously. */
- if (shader->is_optimized && thread_index < 0) {
- /* Compile it asynchronously. */
- util_queue_add_job(&sscreen->shader_compiler_queue_low_priority,
- shader, &shader->ready,
- si_build_shader_variant_low_priority, NULL,
- 0);
-
- /* Add only after the ready fence was reset, to guard against a
- * race with si_bind_XX_shader. */
- if (!sel->last_variant) {
- sel->first_variant = shader;
- sel->last_variant = shader;
- } else {
- sel->last_variant->next_variant = shader;
- sel->last_variant = shader;
- }
-
- /* Use the default (unoptimized) shader for now. */
- memset(&key->opt, 0, sizeof(key->opt));
- simple_mtx_unlock(&sel->mutex);
-
- if (sscreen->options.sync_compile)
- util_queue_fence_wait(&shader->ready);
-
- if (optimized_or_none)
- return -1;
- goto again;
- }
-
- /* Reset the fence before adding to the variant list. */
- util_queue_fence_reset(&shader->ready);
-
- if (!sel->last_variant) {
- sel->first_variant = shader;
- sel->last_variant = shader;
- } else {
- sel->last_variant->next_variant = shader;
- sel->last_variant = shader;
- }
-
- simple_mtx_unlock(&sel->mutex);
-
- assert(!shader->is_optimized);
- si_build_shader_variant(shader, thread_index, false);
-
- util_queue_fence_signal(&shader->ready);
-
- if (!shader->compilation_failed)
- state->current = shader;
-
- return shader->compilation_failed ? -1 : 0;
-}
-
-static int si_shader_select(struct pipe_context *ctx,
- struct si_shader_ctx_state *state,
- union si_vgt_stages_key stages_key,
- struct si_compiler_ctx_state *compiler_state)
-{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_shader_key key;
-
- si_shader_selector_key(ctx, state->cso, stages_key, &key);
- return si_shader_select_with_key(sctx->screen, state, compiler_state,
- &key, -1, false);
-}
-
-static void si_parse_next_shader_property(const struct si_shader_info *info,
- bool streamout,
- struct si_shader_key *key)
-{
- unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER];
-
- switch (info->processor) {
- case PIPE_SHADER_VERTEX:
- switch (next_shader) {
- case PIPE_SHADER_GEOMETRY:
- key->as_es = 1;
- break;
- case PIPE_SHADER_TESS_CTRL:
- case PIPE_SHADER_TESS_EVAL:
- key->as_ls = 1;
- break;
- default:
- /* If POSITION isn't written, it can only be a HW VS
- * if streamout is used. If streamout isn't used,
- * assume that it's a HW LS. (the next shader is TCS)
- * This heuristic is needed for separate shader objects.
- */
- if (!info->writes_position && !streamout)
- key->as_ls = 1;
- }
- break;
-
- case PIPE_SHADER_TESS_EVAL:
- if (next_shader == PIPE_SHADER_GEOMETRY ||
- !info->writes_position)
- key->as_es = 1;
- break;
- }
+ /* This must be done before the mutex is locked, because async GS
+ * compilation calls this function too, and therefore must enter
+ * the mutex first.
+ *
+ * Only wait if we are in a draw call. Don't wait if we are
+ * in a compiler thread.
+ */
+ if (thread_index < 0)
+ util_queue_fence_wait(&sel->ready);
+
+ simple_mtx_lock(&sel->mutex);
+
+ /* Find the shader variant. */
+ for (iter = sel->first_variant; iter; iter = iter->next_variant) {
+ /* Don't check the "current" shader. We checked it above. */
+ if (current != iter && memcmp(&iter->key, key, sizeof(*key)) == 0) {
+ simple_mtx_unlock(&sel->mutex);
+
+ if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) {
+ /* If it's an optimized shader and its compilation has
+ * been started but isn't done, use the unoptimized
+ * shader so as not to cause a stall due to compilation.
+ */
+ if (iter->is_optimized) {
+ if (optimized_or_none)
+ return -1;
+ memset(&key->opt, 0, sizeof(key->opt));
+ goto again;
+ }
+
+ util_queue_fence_wait(&iter->ready);
+ }
+
+ if (iter->compilation_failed) {
+ return -1; /* skip the draw call */
+ }
+
+ state->current = iter;
+ return 0;
+ }
+ }
+
+ /* Build a new shader. */
+ shader = CALLOC_STRUCT(si_shader);
+ if (!shader) {
+ simple_mtx_unlock(&sel->mutex);
+ return -ENOMEM;
+ }
+
+ util_queue_fence_init(&shader->ready);
+
+ shader->selector = sel;
+ shader->key = *key;
+ shader->compiler_ctx_state = *compiler_state;
+
+ /* If this is a merged shader, get the first shader's selector. */
+ if (sscreen->info.chip_class >= GFX9) {
+ if (sel->type == PIPE_SHADER_TESS_CTRL)
+ previous_stage_sel = key->part.tcs.ls;
+ else if (sel->type == PIPE_SHADER_GEOMETRY)
+ previous_stage_sel = key->part.gs.es;
+
+ /* We need to wait for the previous shader. */
+ if (previous_stage_sel && thread_index < 0)
+ util_queue_fence_wait(&previous_stage_sel->ready);
+ }
+
+ bool is_pure_monolithic =
+ sscreen->use_monolithic_shaders || memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0;
+
+ /* Compile the main shader part if it doesn't exist. This can happen
+ * if the initial guess was wrong.
+ *
+ * The prim discard CS doesn't need the main shader part.
+ */
+ if (!is_pure_monolithic && !key->opt.vs_as_prim_discard_cs) {
+ bool ok = true;
+
+ /* Make sure the main shader part is present. This is needed
+ * for shaders that can be compiled as VS, LS, or ES, and only
+ * one of them is compiled at creation.
+ *
+ * It is also needed for GS, which can be compiled as non-NGG
+ * and NGG.
+ *
+ * For merged shaders, check that the starting shader's main
+ * part is present.
+ */
+ if (previous_stage_sel) {
+ struct si_shader_key shader1_key = zeroed;
+
+ if (sel->type == PIPE_SHADER_TESS_CTRL) {
+ shader1_key.as_ls = 1;
+ } else if (sel->type == PIPE_SHADER_GEOMETRY) {
+ shader1_key.as_es = 1;
+ shader1_key.as_ngg = key->as_ngg; /* for Wave32 vs Wave64 */
+ } else {
+ assert(0);
+ }
+
+ simple_mtx_lock(&previous_stage_sel->mutex);
+ ok = si_check_missing_main_part(sscreen, previous_stage_sel, compiler_state, &shader1_key);
+ simple_mtx_unlock(&previous_stage_sel->mutex);
+ }
+
+ if (ok) {
+ ok = si_check_missing_main_part(sscreen, sel, compiler_state, key);
+ }
+
+ if (!ok) {
+ FREE(shader);
+ simple_mtx_unlock(&sel->mutex);
+ return -ENOMEM; /* skip the draw call */
+ }
+ }
+
+ /* Keep the reference to the 1st shader of merged shaders, so that
+ * Gallium can't destroy it before we destroy the 2nd shader.
+ *
+ * Set sctx = NULL, because it's unused if we're not releasing
+ * the shader, and we don't have any sctx here.
+ */
+ si_shader_selector_reference(NULL, &shader->previous_stage_sel, previous_stage_sel);
+
+ /* Monolithic-only shaders don't make a distinction between optimized
+ * and unoptimized. */
+ shader->is_monolithic =
+ is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
+
+ /* The prim discard CS is always optimized. */
+ shader->is_optimized = (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) &&
+ memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
+
+ /* If it's an optimized shader, compile it asynchronously. */
+ if (shader->is_optimized && thread_index < 0) {
+ /* Compile it asynchronously. */
+ util_queue_add_job(&sscreen->shader_compiler_queue_low_priority, shader, &shader->ready,
+ si_build_shader_variant_low_priority, NULL, 0);
+
+ /* Add only after the ready fence was reset, to guard against a
+ * race with si_bind_XX_shader. */
+ if (!sel->last_variant) {
+ sel->first_variant = shader;
+ sel->last_variant = shader;
+ } else {
+ sel->last_variant->next_variant = shader;
+ sel->last_variant = shader;
+ }
+
+ /* Use the default (unoptimized) shader for now. */
+ memset(&key->opt, 0, sizeof(key->opt));
+ simple_mtx_unlock(&sel->mutex);
+
+ if (sscreen->options.sync_compile)
+ util_queue_fence_wait(&shader->ready);
+
+ if (optimized_or_none)
+ return -1;
+ goto again;
+ }
+
+ /* Reset the fence before adding to the variant list. */
+ util_queue_fence_reset(&shader->ready);
+
+ if (!sel->last_variant) {
+ sel->first_variant = shader;
+ sel->last_variant = shader;
+ } else {
+ sel->last_variant->next_variant = shader;
+ sel->last_variant = shader;
+ }
+
+ simple_mtx_unlock(&sel->mutex);
+
+ assert(!shader->is_optimized);
+ si_build_shader_variant(shader, thread_index, false);
+
+ util_queue_fence_signal(&shader->ready);
+
+ if (!shader->compilation_failed)
+ state->current = shader;
+
+ return shader->compilation_failed ? -1 : 0;
+}
+
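Aside for readers following the variant-selection logic above: both the is_pure_monolithic and is_optimized checks rely on memcmp'ing a sub-struct of the shader key against a zero-initialized key. Below is a minimal, self-contained sketch of that idiom only; toy_key and its fields are invented for illustration and are not the driver's real si_shader_key.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for si_shader_key; only the comparison pattern matters. */
struct toy_key {
   struct { unsigned fix_fetch; } mono;   /* fields that force a monolithic shader  */
   struct { unsigned clip_disable; } opt; /* fields only optimized variants look at */
};

/* Mirrors memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0 above:
 * "does any mono field differ from an all-zero key?" */
static bool toy_key_is_monolithic(const struct toy_key *key)
{
   static const struct toy_key zeroed; /* all zeros, like the local `zeroed` key */
   return memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0;
}

int main(void)
{
   struct toy_key key = {.mono = {.fix_fetch = 1}};
   printf("%d\n", toy_key_is_monolithic(&key)); /* prints 1 */
   return 0;
}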
+static int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state,
+ union si_vgt_stages_key stages_key,
+ struct si_compiler_ctx_state *compiler_state)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_shader_key key;
+
+ si_shader_selector_key(ctx, state->cso, stages_key, &key);
+ return si_shader_select_with_key(sctx->screen, state, compiler_state, &key, -1, false);
+}
+
+static void si_parse_next_shader_property(const struct si_shader_info *info, bool streamout,
+ struct si_shader_key *key)
+{
+ unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER];
+
+ switch (info->processor) {
+ case PIPE_SHADER_VERTEX:
+ switch (next_shader) {
+ case PIPE_SHADER_GEOMETRY:
+ key->as_es = 1;
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ case PIPE_SHADER_TESS_EVAL:
+ key->as_ls = 1;
+ break;
+ default:
+ /* If POSITION isn't written, it can only be a HW VS
+ * if streamout is used. If streamout isn't used,
+ * assume that it's a HW LS. (the next shader is TCS)
+ * This heuristic is needed for separate shader objects.
+ */
+ if (!info->writes_position && !streamout)
+ key->as_ls = 1;
+ }
+ break;
+
+ case PIPE_SHADER_TESS_EVAL:
+ if (next_shader == PIPE_SHADER_GEOMETRY || !info->writes_position)
+ key->as_es = 1;
+ break;
+ }
}
/**
*/
static void si_init_shader_selector_async(void *job, int thread_index)
{
- struct si_shader_selector *sel = (struct si_shader_selector *)job;
- struct si_screen *sscreen = sel->screen;
- struct ac_llvm_compiler *compiler;
- struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug;
-
- assert(!debug->debug_message || debug->async);
- assert(thread_index >= 0);
- assert(thread_index < ARRAY_SIZE(sscreen->compiler));
- compiler = &sscreen->compiler[thread_index];
-
- if (!compiler->passes)
- si_init_compiler(sscreen, compiler);
-
- /* Serialize NIR to save memory. Monolithic shader variants
- * have to deserialize NIR before compilation.
- */
- if (sel->nir) {
- struct blob blob;
- size_t size;
-
- blob_init(&blob);
- /* true = remove optional debugging data to increase
- * the likehood of getting more shader cache hits.
- * It also drops variable names, so we'll save more memory.
- */
- nir_serialize(&blob, sel->nir, true);
- blob_finish_get_buffer(&blob, &sel->nir_binary, &size);
- sel->nir_size = size;
- }
-
- /* Compile the main shader part for use with a prolog and/or epilog.
- * If this fails, the driver will try to compile a monolithic shader
- * on demand.
- */
- if (!sscreen->use_monolithic_shaders) {
- struct si_shader *shader = CALLOC_STRUCT(si_shader);
- unsigned char ir_sha1_cache_key[20];
-
- if (!shader) {
- fprintf(stderr, "radeonsi: can't allocate a main shader part\n");
- return;
- }
-
- /* We can leave the fence signaled because use of the default
- * main part is guarded by the selector's ready fence. */
- util_queue_fence_init(&shader->ready);
-
- shader->selector = sel;
- shader->is_monolithic = false;
- si_parse_next_shader_property(&sel->info,
- sel->so.num_outputs != 0,
- &shader->key);
-
- if (sscreen->use_ngg &&
- (!sel->so.num_outputs || sscreen->use_ngg_streamout) &&
- ((sel->type == PIPE_SHADER_VERTEX && !shader->key.as_ls) ||
- sel->type == PIPE_SHADER_TESS_EVAL ||
- sel->type == PIPE_SHADER_GEOMETRY))
- shader->key.as_ngg = 1;
-
- if (sel->nir) {
- si_get_ir_cache_key(sel, shader->key.as_ngg,
- shader->key.as_es, ir_sha1_cache_key);
- }
-
- /* Try to load the shader from the shader cache. */
- simple_mtx_lock(&sscreen->shader_cache_mutex);
-
- if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) {
- simple_mtx_unlock(&sscreen->shader_cache_mutex);
- si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
- } else {
- simple_mtx_unlock(&sscreen->shader_cache_mutex);
-
- /* Compile the shader if it hasn't been loaded from the cache. */
- if (!si_compile_shader(sscreen, compiler, shader, debug)) {
- FREE(shader);
- fprintf(stderr, "radeonsi: can't compile a main shader part\n");
- return;
- }
-
- simple_mtx_lock(&sscreen->shader_cache_mutex);
- si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key,
- shader, true);
- simple_mtx_unlock(&sscreen->shader_cache_mutex);
- }
-
- *si_get_main_shader_part(sel, &shader->key) = shader;
-
- /* Unset "outputs_written" flags for outputs converted to
- * DEFAULT_VAL, so that later inter-shader optimizations don't
- * try to eliminate outputs that don't exist in the final
- * shader.
- *
- * This is only done if non-monolithic shaders are enabled.
- */
- if ((sel->type == PIPE_SHADER_VERTEX ||
- sel->type == PIPE_SHADER_TESS_EVAL) &&
- !shader->key.as_ls &&
- !shader->key.as_es) {
- unsigned i;
-
- for (i = 0; i < sel->info.num_outputs; i++) {
- unsigned offset = shader->info.vs_output_param_offset[i];
-
- if (offset <= AC_EXP_PARAM_OFFSET_31)
- continue;
-
- unsigned name = sel->info.output_semantic_name[i];
- unsigned index = sel->info.output_semantic_index[i];
- unsigned id;
-
- switch (name) {
- case TGSI_SEMANTIC_GENERIC:
- /* don't process indices the function can't handle */
- if (index >= SI_MAX_IO_GENERIC)
- break;
- /* fall through */
- default:
- id = si_shader_io_get_unique_index(name, index, true);
- sel->outputs_written_before_ps &= ~(1ull << id);
- break;
- case TGSI_SEMANTIC_POSITION: /* ignore these */
- case TGSI_SEMANTIC_PSIZE:
- case TGSI_SEMANTIC_CLIPVERTEX:
- case TGSI_SEMANTIC_EDGEFLAG:
- break;
- }
- }
- }
- }
-
- /* The GS copy shader is always pre-compiled. */
- if (sel->type == PIPE_SHADER_GEOMETRY &&
- (!sscreen->use_ngg ||
- !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */
- sel->tess_turns_off_ngg)) {
- sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug);
- if (!sel->gs_copy_shader) {
- fprintf(stderr, "radeonsi: can't create GS copy shader\n");
- return;
- }
-
- si_shader_vs(sscreen, sel->gs_copy_shader, sel);
- }
-
- /* Free NIR. We only keep serialized NIR after this point. */
- if (sel->nir) {
- ralloc_free(sel->nir);
- sel->nir = NULL;
- }
+ struct si_shader_selector *sel = (struct si_shader_selector *)job;
+ struct si_screen *sscreen = sel->screen;
+ struct ac_llvm_compiler *compiler;
+ struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug;
+
+ assert(!debug->debug_message || debug->async);
+ assert(thread_index >= 0);
+ assert(thread_index < ARRAY_SIZE(sscreen->compiler));
+ compiler = &sscreen->compiler[thread_index];
+
+ if (!compiler->passes)
+ si_init_compiler(sscreen, compiler);
+
+ /* Serialize NIR to save memory. Monolithic shader variants
+ * have to deserialize NIR before compilation.
+ */
+ if (sel->nir) {
+ struct blob blob;
+ size_t size;
+
+ blob_init(&blob);
+ /* true = remove optional debugging data to increase
+       * the likelihood of getting more shader cache hits.
+ * It also drops variable names, so we'll save more memory.
+ */
+ nir_serialize(&blob, sel->nir, true);
+ blob_finish_get_buffer(&blob, &sel->nir_binary, &size);
+ sel->nir_size = size;
+ }
+
+ /* Compile the main shader part for use with a prolog and/or epilog.
+ * If this fails, the driver will try to compile a monolithic shader
+ * on demand.
+ */
+ if (!sscreen->use_monolithic_shaders) {
+ struct si_shader *shader = CALLOC_STRUCT(si_shader);
+ unsigned char ir_sha1_cache_key[20];
+
+ if (!shader) {
+ fprintf(stderr, "radeonsi: can't allocate a main shader part\n");
+ return;
+ }
+
+ /* We can leave the fence signaled because use of the default
+ * main part is guarded by the selector's ready fence. */
+ util_queue_fence_init(&shader->ready);
+
+ shader->selector = sel;
+ shader->is_monolithic = false;
+ si_parse_next_shader_property(&sel->info, sel->so.num_outputs != 0, &shader->key);
+
+ if (sscreen->use_ngg && (!sel->so.num_outputs || sscreen->use_ngg_streamout) &&
+ ((sel->type == PIPE_SHADER_VERTEX && !shader->key.as_ls) ||
+ sel->type == PIPE_SHADER_TESS_EVAL || sel->type == PIPE_SHADER_GEOMETRY))
+ shader->key.as_ngg = 1;
+
+ if (sel->nir) {
+ si_get_ir_cache_key(sel, shader->key.as_ngg, shader->key.as_es, ir_sha1_cache_key);
+ }
+
+ /* Try to load the shader from the shader cache. */
+ simple_mtx_lock(&sscreen->shader_cache_mutex);
+
+ if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) {
+ simple_mtx_unlock(&sscreen->shader_cache_mutex);
+ si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
+ } else {
+ simple_mtx_unlock(&sscreen->shader_cache_mutex);
+
+ /* Compile the shader if it hasn't been loaded from the cache. */
+ if (!si_compile_shader(sscreen, compiler, shader, debug)) {
+ FREE(shader);
+ fprintf(stderr, "radeonsi: can't compile a main shader part\n");
+ return;
+ }
+
+ simple_mtx_lock(&sscreen->shader_cache_mutex);
+ si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, true);
+ simple_mtx_unlock(&sscreen->shader_cache_mutex);
+ }
+
+ *si_get_main_shader_part(sel, &shader->key) = shader;
+
+ /* Unset "outputs_written" flags for outputs converted to
+ * DEFAULT_VAL, so that later inter-shader optimizations don't
+ * try to eliminate outputs that don't exist in the final
+ * shader.
+ *
+ * This is only done if non-monolithic shaders are enabled.
+ */
+ if ((sel->type == PIPE_SHADER_VERTEX || sel->type == PIPE_SHADER_TESS_EVAL) &&
+ !shader->key.as_ls && !shader->key.as_es) {
+ unsigned i;
+
+ for (i = 0; i < sel->info.num_outputs; i++) {
+ unsigned offset = shader->info.vs_output_param_offset[i];
+
+ if (offset <= AC_EXP_PARAM_OFFSET_31)
+ continue;
+
+ unsigned name = sel->info.output_semantic_name[i];
+ unsigned index = sel->info.output_semantic_index[i];
+ unsigned id;
+
+ switch (name) {
+ case TGSI_SEMANTIC_GENERIC:
+ /* don't process indices the function can't handle */
+ if (index >= SI_MAX_IO_GENERIC)
+ break;
+ /* fall through */
+ default:
+ id = si_shader_io_get_unique_index(name, index, true);
+ sel->outputs_written_before_ps &= ~(1ull << id);
+ break;
+ case TGSI_SEMANTIC_POSITION: /* ignore these */
+ case TGSI_SEMANTIC_PSIZE:
+ case TGSI_SEMANTIC_CLIPVERTEX:
+ case TGSI_SEMANTIC_EDGEFLAG:
+ break;
+ }
+ }
+ }
+ }
+
+ /* The GS copy shader is always pre-compiled. */
+ if (sel->type == PIPE_SHADER_GEOMETRY &&
+ (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */
+ sel->tess_turns_off_ngg)) {
+ sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug);
+ if (!sel->gs_copy_shader) {
+ fprintf(stderr, "radeonsi: can't create GS copy shader\n");
+ return;
+ }
+
+ si_shader_vs(sscreen, sel->gs_copy_shader, sel);
+ }
+
+ /* Free NIR. We only keep serialized NIR after this point. */
+ if (sel->nir) {
+ ralloc_free(sel->nir);
+ sel->nir = NULL;
+ }
}
void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
- struct util_queue_fence *ready_fence,
- struct si_compiler_ctx_state *compiler_ctx_state,
- void *job, util_queue_execute_func execute)
+ struct util_queue_fence *ready_fence,
+ struct si_compiler_ctx_state *compiler_ctx_state, void *job,
+ util_queue_execute_func execute)
{
- util_queue_fence_init(ready_fence);
-
- struct util_async_debug_callback async_debug;
- bool debug =
- (sctx->debug.debug_message && !sctx->debug.async) ||
- sctx->is_debug ||
- si_can_dump_shader(sctx->screen, processor);
-
- if (debug) {
- u_async_debug_init(&async_debug);
- compiler_ctx_state->debug = async_debug.base;
- }
-
- util_queue_add_job(&sctx->screen->shader_compiler_queue, job,
- ready_fence, execute, NULL, 0);
-
- if (debug) {
- util_queue_fence_wait(ready_fence);
- u_async_debug_drain(&async_debug, &sctx->debug);
- u_async_debug_cleanup(&async_debug);
- }
-
- if (sctx->screen->options.sync_compile)
- util_queue_fence_wait(ready_fence);
+ util_queue_fence_init(ready_fence);
+
+ struct util_async_debug_callback async_debug;
+ bool debug = (sctx->debug.debug_message && !sctx->debug.async) || sctx->is_debug ||
+ si_can_dump_shader(sctx->screen, processor);
+
+ if (debug) {
+ u_async_debug_init(&async_debug);
+ compiler_ctx_state->debug = async_debug.base;
+ }
+
+ util_queue_add_job(&sctx->screen->shader_compiler_queue, job, ready_fence, execute, NULL, 0);
+
+ if (debug) {
+ util_queue_fence_wait(ready_fence);
+ u_async_debug_drain(&async_debug, &sctx->debug);
+ u_async_debug_cleanup(&async_debug);
+ }
+
+ if (sctx->screen->options.sync_compile)
+ util_queue_fence_wait(ready_fence);
}
/* Return descriptor slot usage masks from the given shader info. */
-void si_get_active_slot_masks(const struct si_shader_info *info,
- uint32_t *const_and_shader_buffers,
- uint64_t *samplers_and_images)
-{
- unsigned start, num_shaderbufs, num_constbufs, num_images, num_msaa_images, num_samplers;
-
- num_shaderbufs = util_last_bit(info->shader_buffers_declared);
- num_constbufs = util_last_bit(info->const_buffers_declared);
- /* two 8-byte images share one 16-byte slot */
- num_images = align(util_last_bit(info->images_declared), 2);
- num_msaa_images = align(util_last_bit(info->msaa_images_declared), 2);
- num_samplers = util_last_bit(info->samplers_declared);
-
- /* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */
- start = si_get_shaderbuf_slot(num_shaderbufs - 1);
- *const_and_shader_buffers =
- u_bit_consecutive(start, num_shaderbufs + num_constbufs);
-
- /* The layout is:
- * - fmask[last] ... fmask[0] go to [15-last .. 15]
- * - image[last] ... image[0] go to [31-last .. 31]
- * - sampler[0] ... sampler[last] go to [32 .. 32+last*2]
- *
- * FMASKs for images are placed separately, because MSAA images are rare,
- * and so we can benefit from a better cache hit rate if we keep image
- * descriptors together.
- */
- if (num_msaa_images)
- num_images = SI_NUM_IMAGES + num_msaa_images; /* add FMASK descriptors */
-
- start = si_get_image_slot(num_images - 1) / 2;
- *samplers_and_images =
- u_bit_consecutive64(start, num_images / 2 + num_samplers);
+void si_get_active_slot_masks(const struct si_shader_info *info, uint32_t *const_and_shader_buffers,
+ uint64_t *samplers_and_images)
+{
+ unsigned start, num_shaderbufs, num_constbufs, num_images, num_msaa_images, num_samplers;
+
+ num_shaderbufs = util_last_bit(info->shader_buffers_declared);
+ num_constbufs = util_last_bit(info->const_buffers_declared);
+ /* two 8-byte images share one 16-byte slot */
+ num_images = align(util_last_bit(info->images_declared), 2);
+ num_msaa_images = align(util_last_bit(info->msaa_images_declared), 2);
+ num_samplers = util_last_bit(info->samplers_declared);
+
+ /* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */
+ start = si_get_shaderbuf_slot(num_shaderbufs - 1);
+ *const_and_shader_buffers = u_bit_consecutive(start, num_shaderbufs + num_constbufs);
+
+ /* The layout is:
+ * - fmask[last] ... fmask[0] go to [15-last .. 15]
+ * - image[last] ... image[0] go to [31-last .. 31]
+ * - sampler[0] ... sampler[last] go to [32 .. 32+last*2]
+ *
+ * FMASKs for images are placed separately, because MSAA images are rare,
+ * and so we can benefit from a better cache hit rate if we keep image
+ * descriptors together.
+ */
+ if (num_msaa_images)
+ num_images = SI_NUM_IMAGES + num_msaa_images; /* add FMASK descriptors */
+
+ start = si_get_image_slot(num_images - 1) / 2;
+ *samplers_and_images = u_bit_consecutive64(start, num_images / 2 + num_samplers);
}
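As a side note on the slot layout described above: the masks are built by placing the shader buffers in reverse order directly before the constant buffers, so one consecutive bit range covers both. The following standalone sketch shows that pattern; TOY_NUM_SHADER_BUFFERS, toy_shaderbuf_slot() and toy_bit_consecutive() are made-up stand-ins (the last one only mimics what u_bit_consecutive() is used for here), not the driver's real helpers.

#include <stdint.h>
#include <stdio.h>

#define TOY_NUM_SHADER_BUFFERS 4 /* hypothetical limit, for illustration only */

/* Shader buffer i lives at slot (TOY_NUM_SHADER_BUFFERS - 1 - i), so sb[last] .. sb[0]
 * are immediately followed by cb[0] .. cb[last], as in the layout comment above. */
static unsigned toy_shaderbuf_slot(unsigned i)
{
   return TOY_NUM_SHADER_BUFFERS - 1 - i;
}

/* Local stand-in: a mask of `count` consecutive bits starting at bit `start`. */
static uint32_t toy_bit_consecutive(unsigned start, unsigned count)
{
   return ((count < 32 ? (1u << count) : 0u) - 1u) << start;
}

int main(void)
{
   unsigned num_shaderbufs = 2, num_constbufs = 3;
   unsigned start = toy_shaderbuf_slot(num_shaderbufs - 1);
   uint32_t mask = toy_bit_consecutive(start, num_shaderbufs + num_constbufs);

   /* With the toy numbers: start = 2, so the mask covers slots 2..6 -> 0x7c. */
   printf("0x%x\n", (unsigned)mask);
   return 0;
}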
static void *si_create_shader_selector(struct pipe_context *ctx,
- const struct pipe_shader_state *state)
-{
- struct si_screen *sscreen = (struct si_screen *)ctx->screen;
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector);
- int i;
-
- if (!sel)
- return NULL;
-
- sel->screen = sscreen;
- sel->compiler_ctx_state.debug = sctx->debug;
- sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
-
- sel->so = state->stream_output;
-
- if (state->type == PIPE_SHADER_IR_TGSI) {
- sel->nir = tgsi_to_nir(state->tokens, ctx->screen);
- } else {
- assert(state->type == PIPE_SHADER_IR_NIR);
- sel->nir = state->ir.nir;
- }
-
- si_nir_scan_shader(sel->nir, &sel->info);
- si_nir_adjust_driver_locations(sel->nir);
-
- sel->type = sel->info.processor;
- p_atomic_inc(&sscreen->num_shaders_created);
- si_get_active_slot_masks(&sel->info,
- &sel->active_const_and_shader_buffers,
- &sel->active_samplers_and_images);
-
- /* Record which streamout buffers are enabled. */
- for (i = 0; i < sel->so.num_outputs; i++) {
- sel->enabled_streamout_buffer_mask |=
- (1 << sel->so.output[i].output_buffer) <<
- (sel->so.output[i].stream * 4);
- }
-
- sel->num_vs_inputs = sel->type == PIPE_SHADER_VERTEX &&
- !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] ?
- sel->info.num_inputs : 0;
- sel->num_vbos_in_user_sgprs =
- MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs);
-
- /* The prolog is a no-op if there are no inputs. */
- sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX &&
- sel->info.num_inputs &&
- !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
-
- sel->prim_discard_cs_allowed =
- sel->type == PIPE_SHADER_VERTEX &&
- !sel->info.uses_bindless_images &&
- !sel->info.uses_bindless_samplers &&
- !sel->info.writes_memory &&
- !sel->info.writes_viewport_index &&
- !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
- !sel->so.num_outputs;
-
- switch (sel->type) {
- case PIPE_SHADER_GEOMETRY:
- sel->gs_output_prim =
- sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM];
-
- /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
- sel->rast_prim = sel->gs_output_prim;
- if (util_rast_prim_is_triangles(sel->rast_prim))
- sel->rast_prim = PIPE_PRIM_TRIANGLES;
-
- sel->gs_max_out_vertices =
- sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
- sel->gs_num_invocations =
- sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
- sel->gsvs_vertex_size = sel->info.num_outputs * 16;
- sel->max_gsvs_emit_size = sel->gsvs_vertex_size *
- sel->gs_max_out_vertices;
-
- sel->max_gs_stream = 0;
- for (i = 0; i < sel->so.num_outputs; i++)
- sel->max_gs_stream = MAX2(sel->max_gs_stream,
- sel->so.output[i].stream);
-
- sel->gs_input_verts_per_prim =
- u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]);
-
- /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tesselation. */
- sel->tess_turns_off_ngg =
- sscreen->info.chip_class == GFX10 &&
- sel->gs_num_invocations * sel->gs_max_out_vertices > 256;
- break;
-
- case PIPE_SHADER_TESS_CTRL:
- /* Always reserve space for these. */
- sel->patch_outputs_written |=
- (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0)) |
- (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0));
- /* fall through */
- case PIPE_SHADER_VERTEX:
- case PIPE_SHADER_TESS_EVAL:
- for (i = 0; i < sel->info.num_outputs; i++) {
- unsigned name = sel->info.output_semantic_name[i];
- unsigned index = sel->info.output_semantic_index[i];
-
- switch (name) {
- case TGSI_SEMANTIC_TESSINNER:
- case TGSI_SEMANTIC_TESSOUTER:
- case TGSI_SEMANTIC_PATCH:
- sel->patch_outputs_written |=
- 1ull << si_shader_io_get_unique_index_patch(name, index);
- break;
-
- case TGSI_SEMANTIC_GENERIC:
- /* don't process indices the function can't handle */
- if (index >= SI_MAX_IO_GENERIC)
- break;
- /* fall through */
- default:
- sel->outputs_written |=
- 1ull << si_shader_io_get_unique_index(name, index, false);
- sel->outputs_written_before_ps |=
- 1ull << si_shader_io_get_unique_index(name, index, true);
- break;
- case TGSI_SEMANTIC_EDGEFLAG:
- break;
- }
- }
- sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
- sel->lshs_vertex_stride = sel->esgs_itemsize;
-
- /* Add 1 dword to reduce LDS bank conflicts, so that each vertex
- * will start on a different bank. (except for the maximum 32*16).
- */
- if (sel->lshs_vertex_stride < 32*16)
- sel->lshs_vertex_stride += 4;
-
- /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
- * conflicts, i.e. each vertex will start at a different bank.
- */
- if (sctx->chip_class >= GFX9)
- sel->esgs_itemsize += 4;
-
- assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0);
-
- /* Only for TES: */
- if (sel->info.properties[TGSI_PROPERTY_TES_POINT_MODE])
- sel->rast_prim = PIPE_PRIM_POINTS;
- else if (sel->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
- sel->rast_prim = PIPE_PRIM_LINE_STRIP;
- else
- sel->rast_prim = PIPE_PRIM_TRIANGLES;
- break;
-
- case PIPE_SHADER_FRAGMENT:
- for (i = 0; i < sel->info.num_inputs; i++) {
- unsigned name = sel->info.input_semantic_name[i];
- unsigned index = sel->info.input_semantic_index[i];
-
- switch (name) {
- case TGSI_SEMANTIC_GENERIC:
- /* don't process indices the function can't handle */
- if (index >= SI_MAX_IO_GENERIC)
- break;
- /* fall through */
- default:
- sel->inputs_read |=
- 1ull << si_shader_io_get_unique_index(name, index, true);
- break;
- case TGSI_SEMANTIC_PCOORD: /* ignore this */
- break;
- }
- }
-
- for (i = 0; i < 8; i++)
- if (sel->info.colors_written & (1 << i))
- sel->colors_written_4bit |= 0xf << (4 * i);
-
- for (i = 0; i < sel->info.num_inputs; i++) {
- if (sel->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR) {
- int index = sel->info.input_semantic_index[i];
- sel->color_attr_index[index] = i;
- }
- }
- break;
- default:;
- }
-
- sel->ngg_culling_allowed =
- sscreen->info.chip_class == GFX10 &&
- sscreen->info.has_dedicated_vram &&
- sscreen->use_ngg_culling &&
- /* Disallow TES by default, because TessMark results are mixed. */
- (sel->type == PIPE_SHADER_VERTEX ||
- (sscreen->always_use_ngg_culling && sel->type == PIPE_SHADER_TESS_EVAL)) &&
- sel->info.writes_position &&
- !sel->info.writes_viewport_index && /* cull only against viewport 0 */
- !sel->info.writes_memory &&
- !sel->so.num_outputs &&
- !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] &&
- !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
-
- /* PA_CL_VS_OUT_CNTL */
- if (sctx->chip_class <= GFX9)
- sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, false);
-
- sel->clipdist_mask = sel->info.writes_clipvertex ?
- SIX_BITS : sel->info.clipdist_writemask;
- sel->culldist_mask = sel->info.culldist_writemask <<
- sel->info.num_written_clipdistance;
-
- /* DB_SHADER_CONTROL */
- sel->db_shader_control =
- S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) |
- S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) |
- S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) |
- S_02880C_KILL_ENABLE(sel->info.uses_kill);
-
- switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
- case TGSI_FS_DEPTH_LAYOUT_GREATER:
- sel->db_shader_control |=
- S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
- break;
- case TGSI_FS_DEPTH_LAYOUT_LESS:
- sel->db_shader_control |=
- S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
- break;
- }
-
- /* Z_ORDER, EXEC_ON_HIER_FAIL and EXEC_ON_NOOP should be set as following:
- *
- * | early Z/S | writes_mem | allow_ReZ? | Z_ORDER | EXEC_ON_HIER_FAIL | EXEC_ON_NOOP
- * --|-----------|------------|------------|--------------------|-------------------|-------------
- * 1a| false | false | true | EarlyZ_Then_ReZ | 0 | 0
- * 1b| false | false | false | EarlyZ_Then_LateZ | 0 | 0
- * 2 | false | true | n/a | LateZ | 1 | 0
- * 3 | true | false | n/a | EarlyZ_Then_LateZ | 0 | 0
- * 4 | true | true | n/a | EarlyZ_Then_LateZ | 0 | 1
- *
- * In cases 3 and 4, HW will force Z_ORDER to EarlyZ regardless of what's set in the register.
- * In case 2, NOOP_CULL is a don't care field. In case 2, 3 and 4, ReZ doesn't make sense.
- *
- * Don't use ReZ without profiling !!!
- *
- * ReZ decreases performance by 15% in DiRT: Showdown on Ultra settings, which has pretty complex
- * shaders.
- */
- if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) {
- /* Cases 3, 4. */
- sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1) |
- S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) |
- S_02880C_EXEC_ON_NOOP(sel->info.writes_memory);
- } else if (sel->info.writes_memory) {
- /* Case 2. */
- sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z) |
- S_02880C_EXEC_ON_HIER_FAIL(1);
- } else {
- /* Case 1. */
- sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
- }
-
- if (sel->info.properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE])
- sel->db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1);
-
- (void) simple_mtx_init(&sel->mutex, mtx_plain);
-
- si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready,
- &sel->compiler_ctx_state, sel,
- si_init_shader_selector_async);
- return sel;
-}
-
-static void *si_create_shader(struct pipe_context *ctx,
- const struct pipe_shader_state *state)
-{
- struct si_screen *sscreen = (struct si_screen *)ctx->screen;
-
- return util_live_shader_cache_get(ctx, &sscreen->live_shader_cache, state);
+ const struct pipe_shader_state *state)
+{
+ struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector);
+ int i;
+
+ if (!sel)
+ return NULL;
+
+ sel->screen = sscreen;
+ sel->compiler_ctx_state.debug = sctx->debug;
+ sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
+
+ sel->so = state->stream_output;
+
+ if (state->type == PIPE_SHADER_IR_TGSI) {
+ sel->nir = tgsi_to_nir(state->tokens, ctx->screen);
+ } else {
+ assert(state->type == PIPE_SHADER_IR_NIR);
+ sel->nir = state->ir.nir;
+ }
+
+ si_nir_scan_shader(sel->nir, &sel->info);
+ si_nir_adjust_driver_locations(sel->nir);
+
+ sel->type = sel->info.processor;
+ p_atomic_inc(&sscreen->num_shaders_created);
+ si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers,
+ &sel->active_samplers_and_images);
+
+ /* Record which streamout buffers are enabled. */
+ for (i = 0; i < sel->so.num_outputs; i++) {
+ sel->enabled_streamout_buffer_mask |= (1 << sel->so.output[i].output_buffer)
+ << (sel->so.output[i].stream * 4);
+ }
+
+ sel->num_vs_inputs =
+ sel->type == PIPE_SHADER_VERTEX && !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]
+ ? sel->info.num_inputs
+ : 0;
+ sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs);
+
+ /* The prolog is a no-op if there are no inputs. */
+ sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX && sel->info.num_inputs &&
+ !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
+
+ sel->prim_discard_cs_allowed =
+ sel->type == PIPE_SHADER_VERTEX && !sel->info.uses_bindless_images &&
+ !sel->info.uses_bindless_samplers && !sel->info.writes_memory &&
+ !sel->info.writes_viewport_index &&
+ !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && !sel->so.num_outputs;
+
+ switch (sel->type) {
+ case PIPE_SHADER_GEOMETRY:
+ sel->gs_output_prim = sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM];
+
+ /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
+ sel->rast_prim = sel->gs_output_prim;
+ if (util_rast_prim_is_triangles(sel->rast_prim))
+ sel->rast_prim = PIPE_PRIM_TRIANGLES;
+
+ sel->gs_max_out_vertices = sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
+ sel->gs_num_invocations = sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
+ sel->gsvs_vertex_size = sel->info.num_outputs * 16;
+ sel->max_gsvs_emit_size = sel->gsvs_vertex_size * sel->gs_max_out_vertices;
+
+ sel->max_gs_stream = 0;
+ for (i = 0; i < sel->so.num_outputs; i++)
+ sel->max_gs_stream = MAX2(sel->max_gs_stream, sel->so.output[i].stream);
+
+ sel->gs_input_verts_per_prim =
+ u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]);
+
+      /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tessellation. */
+ sel->tess_turns_off_ngg = sscreen->info.chip_class == GFX10 &&
+ sel->gs_num_invocations * sel->gs_max_out_vertices > 256;
+ break;
+
+ case PIPE_SHADER_TESS_CTRL:
+ /* Always reserve space for these. */
+ sel->patch_outputs_written |=
+ (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0)) |
+ (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0));
+ /* fall through */
+ case PIPE_SHADER_VERTEX:
+ case PIPE_SHADER_TESS_EVAL:
+ for (i = 0; i < sel->info.num_outputs; i++) {
+ unsigned name = sel->info.output_semantic_name[i];
+ unsigned index = sel->info.output_semantic_index[i];
+
+ switch (name) {
+ case TGSI_SEMANTIC_TESSINNER:
+ case TGSI_SEMANTIC_TESSOUTER:
+ case TGSI_SEMANTIC_PATCH:
+ sel->patch_outputs_written |= 1ull << si_shader_io_get_unique_index_patch(name, index);
+ break;
+
+ case TGSI_SEMANTIC_GENERIC:
+ /* don't process indices the function can't handle */
+ if (index >= SI_MAX_IO_GENERIC)
+ break;
+ /* fall through */
+ default:
+ sel->outputs_written |= 1ull << si_shader_io_get_unique_index(name, index, false);
+ sel->outputs_written_before_ps |= 1ull
+ << si_shader_io_get_unique_index(name, index, true);
+ break;
+ case TGSI_SEMANTIC_EDGEFLAG:
+ break;
+ }
+ }
+ sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
+ sel->lshs_vertex_stride = sel->esgs_itemsize;
+
+ /* Add 1 dword to reduce LDS bank conflicts, so that each vertex
+ * will start on a different bank. (except for the maximum 32*16).
+ */
+ if (sel->lshs_vertex_stride < 32 * 16)
+ sel->lshs_vertex_stride += 4;
+
+ /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
+ * conflicts, i.e. each vertex will start at a different bank.
+ */
+ if (sctx->chip_class >= GFX9)
+ sel->esgs_itemsize += 4;
+
+ assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0);
+
+ /* Only for TES: */
+ if (sel->info.properties[TGSI_PROPERTY_TES_POINT_MODE])
+ sel->rast_prim = PIPE_PRIM_POINTS;
+ else if (sel->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
+ sel->rast_prim = PIPE_PRIM_LINE_STRIP;
+ else
+ sel->rast_prim = PIPE_PRIM_TRIANGLES;
+ break;
+
+ case PIPE_SHADER_FRAGMENT:
+ for (i = 0; i < sel->info.num_inputs; i++) {
+ unsigned name = sel->info.input_semantic_name[i];
+ unsigned index = sel->info.input_semantic_index[i];
+
+ switch (name) {
+ case TGSI_SEMANTIC_GENERIC:
+ /* don't process indices the function can't handle */
+ if (index >= SI_MAX_IO_GENERIC)
+ break;
+ /* fall through */
+ default:
+ sel->inputs_read |= 1ull << si_shader_io_get_unique_index(name, index, true);
+ break;
+ case TGSI_SEMANTIC_PCOORD: /* ignore this */
+ break;
+ }
+ }
+
+ for (i = 0; i < 8; i++)
+ if (sel->info.colors_written & (1 << i))
+ sel->colors_written_4bit |= 0xf << (4 * i);
+
+ for (i = 0; i < sel->info.num_inputs; i++) {
+ if (sel->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR) {
+ int index = sel->info.input_semantic_index[i];
+ sel->color_attr_index[index] = i;
+ }
+ }
+ break;
+ default:;
+ }
+
+ sel->ngg_culling_allowed =
+ sscreen->info.chip_class == GFX10 && sscreen->info.has_dedicated_vram &&
+ sscreen->use_ngg_culling &&
+ /* Disallow TES by default, because TessMark results are mixed. */
+ (sel->type == PIPE_SHADER_VERTEX ||
+ (sscreen->always_use_ngg_culling && sel->type == PIPE_SHADER_TESS_EVAL)) &&
+ sel->info.writes_position &&
+ !sel->info.writes_viewport_index && /* cull only against viewport 0 */
+ !sel->info.writes_memory && !sel->so.num_outputs &&
+ !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] &&
+ !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+
+ /* PA_CL_VS_OUT_CNTL */
+ if (sctx->chip_class <= GFX9)
+ sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, false);
+
+ sel->clipdist_mask = sel->info.writes_clipvertex ? SIX_BITS : sel->info.clipdist_writemask;
+ sel->culldist_mask = sel->info.culldist_writemask << sel->info.num_written_clipdistance;
+
+ /* DB_SHADER_CONTROL */
+ sel->db_shader_control = S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) |
+ S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) |
+ S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) |
+ S_02880C_KILL_ENABLE(sel->info.uses_kill);
+
+ switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
+ case TGSI_FS_DEPTH_LAYOUT_GREATER:
+ sel->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
+ break;
+ case TGSI_FS_DEPTH_LAYOUT_LESS:
+ sel->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
+ break;
+ }
+
+ /* Z_ORDER, EXEC_ON_HIER_FAIL and EXEC_ON_NOOP should be set as following:
+ *
+ * | early Z/S | writes_mem | allow_ReZ? | Z_ORDER | EXEC_ON_HIER_FAIL | EXEC_ON_NOOP
+ * --|-----------|------------|------------|--------------------|-------------------|-------------
+ * 1a| false | false | true | EarlyZ_Then_ReZ | 0 | 0
+ * 1b| false | false | false | EarlyZ_Then_LateZ | 0 | 0
+ * 2 | false | true | n/a | LateZ | 1 | 0
+ * 3 | true | false | n/a | EarlyZ_Then_LateZ | 0 | 0
+ * 4 | true | true | n/a | EarlyZ_Then_LateZ | 0 | 1
+ *
+ * In cases 3 and 4, HW will force Z_ORDER to EarlyZ regardless of what's set in the register.
+    * In case 2, NOOP_CULL is a don't-care field. In cases 2, 3 and 4, ReZ doesn't make sense.
+ *
+ * Don't use ReZ without profiling !!!
+ *
+ * ReZ decreases performance by 15% in DiRT: Showdown on Ultra settings, which has pretty complex
+ * shaders.
+ */
+ if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) {
+ /* Cases 3, 4. */
+ sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1) |
+ S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) |
+ S_02880C_EXEC_ON_NOOP(sel->info.writes_memory);
+ } else if (sel->info.writes_memory) {
+ /* Case 2. */
+ sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z) | S_02880C_EXEC_ON_HIER_FAIL(1);
+ } else {
+ /* Case 1. */
+ sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
+ }
+
+ if (sel->info.properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE])
+ sel->db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1);
+
+ (void)simple_mtx_init(&sel->mutex, mtx_plain);
+
+ si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready, &sel->compiler_ctx_state,
+ sel, si_init_shader_selector_async);
+ return sel;
+}
+
+static void *si_create_shader(struct pipe_context *ctx, const struct pipe_shader_state *state)
+{
+ struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+
+ return util_live_shader_cache_get(ctx, &sscreen->live_shader_cache, state);
}
static void si_update_streamout_state(struct si_context *sctx)
{
- struct si_shader_selector *shader_with_so = si_get_vs(sctx)->cso;
+ struct si_shader_selector *shader_with_so = si_get_vs(sctx)->cso;
- if (!shader_with_so)
- return;
+ if (!shader_with_so)
+ return;
- sctx->streamout.enabled_stream_buffers_mask =
- shader_with_so->enabled_streamout_buffer_mask;
- sctx->streamout.stride_in_dw = shader_with_so->so.stride;
+ sctx->streamout.enabled_stream_buffers_mask = shader_with_so->enabled_streamout_buffer_mask;
+ sctx->streamout.stride_in_dw = shader_with_so->so.stride;
}
-static void si_update_clip_regs(struct si_context *sctx,
- struct si_shader_selector *old_hw_vs,
- struct si_shader *old_hw_vs_variant,
- struct si_shader_selector *next_hw_vs,
- struct si_shader *next_hw_vs_variant)
+static void si_update_clip_regs(struct si_context *sctx, struct si_shader_selector *old_hw_vs,
+ struct si_shader *old_hw_vs_variant,
+ struct si_shader_selector *next_hw_vs,
+ struct si_shader *next_hw_vs_variant)
{
- if (next_hw_vs &&
- (!old_hw_vs ||
- old_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] !=
- next_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] ||
- old_hw_vs->pa_cl_vs_out_cntl != next_hw_vs->pa_cl_vs_out_cntl ||
- old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask ||
- old_hw_vs->culldist_mask != next_hw_vs->culldist_mask ||
- !old_hw_vs_variant ||
- !next_hw_vs_variant ||
- old_hw_vs_variant->key.opt.clip_disable !=
- next_hw_vs_variant->key.opt.clip_disable))
- si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
+ if (next_hw_vs &&
+ (!old_hw_vs ||
+ old_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] !=
+ next_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] ||
+ old_hw_vs->pa_cl_vs_out_cntl != next_hw_vs->pa_cl_vs_out_cntl ||
+ old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask ||
+ old_hw_vs->culldist_mask != next_hw_vs->culldist_mask || !old_hw_vs_variant ||
+ !next_hw_vs_variant ||
+ old_hw_vs_variant->key.opt.clip_disable != next_hw_vs_variant->key.opt.clip_disable))
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
}
static void si_update_common_shader_state(struct si_context *sctx)
{
- sctx->uses_bindless_samplers =
- si_shader_uses_bindless_samplers(sctx->vs_shader.cso) ||
- si_shader_uses_bindless_samplers(sctx->gs_shader.cso) ||
- si_shader_uses_bindless_samplers(sctx->ps_shader.cso) ||
- si_shader_uses_bindless_samplers(sctx->tcs_shader.cso) ||
- si_shader_uses_bindless_samplers(sctx->tes_shader.cso);
- sctx->uses_bindless_images =
- si_shader_uses_bindless_images(sctx->vs_shader.cso) ||
- si_shader_uses_bindless_images(sctx->gs_shader.cso) ||
- si_shader_uses_bindless_images(sctx->ps_shader.cso) ||
- si_shader_uses_bindless_images(sctx->tcs_shader.cso) ||
- si_shader_uses_bindless_images(sctx->tes_shader.cso);
- sctx->do_update_shaders = true;
+ sctx->uses_bindless_samplers = si_shader_uses_bindless_samplers(sctx->vs_shader.cso) ||
+ si_shader_uses_bindless_samplers(sctx->gs_shader.cso) ||
+ si_shader_uses_bindless_samplers(sctx->ps_shader.cso) ||
+ si_shader_uses_bindless_samplers(sctx->tcs_shader.cso) ||
+ si_shader_uses_bindless_samplers(sctx->tes_shader.cso);
+ sctx->uses_bindless_images = si_shader_uses_bindless_images(sctx->vs_shader.cso) ||
+ si_shader_uses_bindless_images(sctx->gs_shader.cso) ||
+ si_shader_uses_bindless_images(sctx->ps_shader.cso) ||
+ si_shader_uses_bindless_images(sctx->tcs_shader.cso) ||
+ si_shader_uses_bindless_images(sctx->tes_shader.cso);
+ sctx->do_update_shaders = true;
}
static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
- struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
- struct si_shader_selector *sel = state;
-
- if (sctx->vs_shader.cso == sel)
- return;
-
- sctx->vs_shader.cso = sel;
- sctx->vs_shader.current = sel ? sel->first_variant : NULL;
- sctx->num_vs_blit_sgprs = sel ? sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] : 0;
-
- if (si_update_ngg(sctx))
- si_shader_change_notify(sctx);
-
- si_update_common_shader_state(sctx);
- si_update_vs_viewport_state(sctx);
- si_set_active_descriptors_for_shader(sctx, sel);
- si_update_streamout_state(sctx);
- si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant,
- si_get_vs(sctx)->cso, si_get_vs_state(sctx));
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
+ struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
+ struct si_shader_selector *sel = state;
+
+ if (sctx->vs_shader.cso == sel)
+ return;
+
+ sctx->vs_shader.cso = sel;
+ sctx->vs_shader.current = sel ? sel->first_variant : NULL;
+ sctx->num_vs_blit_sgprs = sel ? sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] : 0;
+
+ if (si_update_ngg(sctx))
+ si_shader_change_notify(sctx);
+
+ si_update_common_shader_state(sctx);
+ si_update_vs_viewport_state(sctx);
+ si_set_active_descriptors_for_shader(sctx, sel);
+ si_update_streamout_state(sctx);
+ si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso,
+ si_get_vs_state(sctx));
}
static void si_update_tess_uses_prim_id(struct si_context *sctx)
{
- sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id =
- (sctx->tes_shader.cso &&
- sctx->tes_shader.cso->info.uses_primid) ||
- (sctx->tcs_shader.cso &&
- sctx->tcs_shader.cso->info.uses_primid) ||
- (sctx->gs_shader.cso &&
- sctx->gs_shader.cso->info.uses_primid) ||
- (sctx->ps_shader.cso && !sctx->gs_shader.cso &&
- sctx->ps_shader.cso->info.uses_primid);
+ sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id =
+ (sctx->tes_shader.cso && sctx->tes_shader.cso->info.uses_primid) ||
+ (sctx->tcs_shader.cso && sctx->tcs_shader.cso->info.uses_primid) ||
+ (sctx->gs_shader.cso && sctx->gs_shader.cso->info.uses_primid) ||
+ (sctx->ps_shader.cso && !sctx->gs_shader.cso && sctx->ps_shader.cso->info.uses_primid);
}
bool si_update_ngg(struct si_context *sctx)
{
- if (!sctx->screen->use_ngg) {
- assert(!sctx->ngg);
- return false;
- }
-
- bool new_ngg = true;
-
- if (sctx->gs_shader.cso && sctx->tes_shader.cso &&
- sctx->gs_shader.cso->tess_turns_off_ngg) {
- new_ngg = false;
- } else if (!sctx->screen->use_ngg_streamout) {
- struct si_shader_selector *last = si_get_vs(sctx)->cso;
-
- if ((last && last->so.num_outputs) ||
- sctx->streamout.prims_gen_query_enabled)
- new_ngg = false;
- }
-
- if (new_ngg != sctx->ngg) {
- /* Transitioning from NGG to legacy GS requires VGT_FLUSH on Navi10-14.
- * VGT_FLUSH is also emitted at the beginning of IBs when legacy GS ring
- * pointers are set.
- */
- if ((sctx->family == CHIP_NAVI10 ||
- sctx->family == CHIP_NAVI12 ||
- sctx->family == CHIP_NAVI14) &&
- !new_ngg)
- sctx->flags |= SI_CONTEXT_VGT_FLUSH;
-
- sctx->ngg = new_ngg;
- sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
- return true;
- }
- return false;
+ if (!sctx->screen->use_ngg) {
+ assert(!sctx->ngg);
+ return false;
+ }
+
+ bool new_ngg = true;
+
+ if (sctx->gs_shader.cso && sctx->tes_shader.cso && sctx->gs_shader.cso->tess_turns_off_ngg) {
+ new_ngg = false;
+ } else if (!sctx->screen->use_ngg_streamout) {
+ struct si_shader_selector *last = si_get_vs(sctx)->cso;
+
+ if ((last && last->so.num_outputs) || sctx->streamout.prims_gen_query_enabled)
+ new_ngg = false;
+ }
+
+ if (new_ngg != sctx->ngg) {
+ /* Transitioning from NGG to legacy GS requires VGT_FLUSH on Navi10-14.
+ * VGT_FLUSH is also emitted at the beginning of IBs when legacy GS ring
+ * pointers are set.
+ */
+ if ((sctx->family == CHIP_NAVI10 || sctx->family == CHIP_NAVI12 ||
+ sctx->family == CHIP_NAVI14) &&
+ !new_ngg)
+ sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+
+ sctx->ngg = new_ngg;
+ sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
+ return true;
+ }
+ return false;
}
static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
- struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
- struct si_shader_selector *sel = state;
- bool enable_changed = !!sctx->gs_shader.cso != !!sel;
- bool ngg_changed;
-
- if (sctx->gs_shader.cso == sel)
- return;
-
- sctx->gs_shader.cso = sel;
- sctx->gs_shader.current = sel ? sel->first_variant : NULL;
- sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL;
-
- si_update_common_shader_state(sctx);
- sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
-
- ngg_changed = si_update_ngg(sctx);
- if (ngg_changed || enable_changed)
- si_shader_change_notify(sctx);
- if (enable_changed) {
- if (sctx->ia_multi_vgt_param_key.u.uses_tess)
- si_update_tess_uses_prim_id(sctx);
- }
- si_update_vs_viewport_state(sctx);
- si_set_active_descriptors_for_shader(sctx, sel);
- si_update_streamout_state(sctx);
- si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant,
- si_get_vs(sctx)->cso, si_get_vs_state(sctx));
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
+ struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
+ struct si_shader_selector *sel = state;
+ bool enable_changed = !!sctx->gs_shader.cso != !!sel;
+ bool ngg_changed;
+
+ if (sctx->gs_shader.cso == sel)
+ return;
+
+ sctx->gs_shader.cso = sel;
+ sctx->gs_shader.current = sel ? sel->first_variant : NULL;
+ sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL;
+
+ si_update_common_shader_state(sctx);
+ sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
+
+ ngg_changed = si_update_ngg(sctx);
+ if (ngg_changed || enable_changed)
+ si_shader_change_notify(sctx);
+ if (enable_changed) {
+ if (sctx->ia_multi_vgt_param_key.u.uses_tess)
+ si_update_tess_uses_prim_id(sctx);
+ }
+ si_update_vs_viewport_state(sctx);
+ si_set_active_descriptors_for_shader(sctx, sel);
+ si_update_streamout_state(sctx);
+ si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso,
+ si_get_vs_state(sctx));
}
static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_shader_selector *sel = state;
- bool enable_changed = !!sctx->tcs_shader.cso != !!sel;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_shader_selector *sel = state;
+ bool enable_changed = !!sctx->tcs_shader.cso != !!sel;
- if (sctx->tcs_shader.cso == sel)
- return;
+ if (sctx->tcs_shader.cso == sel)
+ return;
- sctx->tcs_shader.cso = sel;
- sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
- si_update_tess_uses_prim_id(sctx);
+ sctx->tcs_shader.cso = sel;
+ sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
+ si_update_tess_uses_prim_id(sctx);
- si_update_common_shader_state(sctx);
+ si_update_common_shader_state(sctx);
- if (enable_changed)
- sctx->last_tcs = NULL; /* invalidate derived tess state */
+ if (enable_changed)
+ sctx->last_tcs = NULL; /* invalidate derived tess state */
- si_set_active_descriptors_for_shader(sctx, sel);
+ si_set_active_descriptors_for_shader(sctx, sel);
}
static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
- struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
- struct si_shader_selector *sel = state;
- bool enable_changed = !!sctx->tes_shader.cso != !!sel;
-
- if (sctx->tes_shader.cso == sel)
- return;
-
- sctx->tes_shader.cso = sel;
- sctx->tes_shader.current = sel ? sel->first_variant : NULL;
- sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL;
- si_update_tess_uses_prim_id(sctx);
-
- si_update_common_shader_state(sctx);
- sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
-
- bool ngg_changed = si_update_ngg(sctx);
- if (ngg_changed || enable_changed)
- si_shader_change_notify(sctx);
- if (enable_changed)
- sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
- si_update_vs_viewport_state(sctx);
- si_set_active_descriptors_for_shader(sctx, sel);
- si_update_streamout_state(sctx);
- si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant,
- si_get_vs(sctx)->cso, si_get_vs_state(sctx));
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
+ struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx);
+ struct si_shader_selector *sel = state;
+ bool enable_changed = !!sctx->tes_shader.cso != !!sel;
+
+ if (sctx->tes_shader.cso == sel)
+ return;
+
+ sctx->tes_shader.cso = sel;
+ sctx->tes_shader.current = sel ? sel->first_variant : NULL;
+ sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL;
+ si_update_tess_uses_prim_id(sctx);
+
+ si_update_common_shader_state(sctx);
+ sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
+
+ bool ngg_changed = si_update_ngg(sctx);
+ if (ngg_changed || enable_changed)
+ si_shader_change_notify(sctx);
+ if (enable_changed)
+ sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
+ si_update_vs_viewport_state(sctx);
+ si_set_active_descriptors_for_shader(sctx, sel);
+ si_update_streamout_state(sctx);
+ si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso,
+ si_get_vs_state(sctx));
}
static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_shader_selector *old_sel = sctx->ps_shader.cso;
- struct si_shader_selector *sel = state;
-
- /* skip if supplied shader is one already in use */
- if (old_sel == sel)
- return;
-
- sctx->ps_shader.cso = sel;
- sctx->ps_shader.current = sel ? sel->first_variant : NULL;
-
- si_update_common_shader_state(sctx);
- if (sel) {
- if (sctx->ia_multi_vgt_param_key.u.uses_tess)
- si_update_tess_uses_prim_id(sctx);
-
- if (!old_sel ||
- old_sel->info.colors_written != sel->info.colors_written)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
-
- if (sctx->screen->has_out_of_order_rast &&
- (!old_sel ||
- old_sel->info.writes_memory != sel->info.writes_memory ||
- old_sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] !=
- sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]))
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
- }
- si_set_active_descriptors_for_shader(sctx, sel);
- si_update_ps_colorbuf0_slot(sctx);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_shader_selector *old_sel = sctx->ps_shader.cso;
+ struct si_shader_selector *sel = state;
+
+ /* skip if supplied shader is one already in use */
+ if (old_sel == sel)
+ return;
+
+ sctx->ps_shader.cso = sel;
+ sctx->ps_shader.current = sel ? sel->first_variant : NULL;
+
+ si_update_common_shader_state(sctx);
+ if (sel) {
+ if (sctx->ia_multi_vgt_param_key.u.uses_tess)
+ si_update_tess_uses_prim_id(sctx);
+
+ if (!old_sel || old_sel->info.colors_written != sel->info.colors_written)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
+
+ if (sctx->screen->has_out_of_order_rast &&
+ (!old_sel || old_sel->info.writes_memory != sel->info.writes_memory ||
+ old_sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] !=
+ sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]))
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+ }
+ si_set_active_descriptors_for_shader(sctx, sel);
+ si_update_ps_colorbuf0_slot(sctx);
}
static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
{
- if (shader->is_optimized) {
- util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority,
- &shader->ready);
- }
-
- util_queue_fence_destroy(&shader->ready);
-
- if (shader->pm4) {
- /* If destroyed shaders were not unbound, the next compiled
- * shader variant could get the same pointer address and so
- * binding it to the same shader stage would be considered
- * a no-op, causing random behavior.
- */
- switch (shader->selector->type) {
- case PIPE_SHADER_VERTEX:
- if (shader->key.as_ls) {
- assert(sctx->chip_class <= GFX8);
- si_pm4_delete_state(sctx, ls, shader->pm4);
- } else if (shader->key.as_es) {
- assert(sctx->chip_class <= GFX8);
- si_pm4_delete_state(sctx, es, shader->pm4);
- } else if (shader->key.as_ngg) {
- si_pm4_delete_state(sctx, gs, shader->pm4);
- } else {
- si_pm4_delete_state(sctx, vs, shader->pm4);
- }
- break;
- case PIPE_SHADER_TESS_CTRL:
- si_pm4_delete_state(sctx, hs, shader->pm4);
- break;
- case PIPE_SHADER_TESS_EVAL:
- if (shader->key.as_es) {
- assert(sctx->chip_class <= GFX8);
- si_pm4_delete_state(sctx, es, shader->pm4);
- } else if (shader->key.as_ngg) {
- si_pm4_delete_state(sctx, gs, shader->pm4);
- } else {
- si_pm4_delete_state(sctx, vs, shader->pm4);
- }
- break;
- case PIPE_SHADER_GEOMETRY:
- if (shader->is_gs_copy_shader)
- si_pm4_delete_state(sctx, vs, shader->pm4);
- else
- si_pm4_delete_state(sctx, gs, shader->pm4);
- break;
- case PIPE_SHADER_FRAGMENT:
- si_pm4_delete_state(sctx, ps, shader->pm4);
- break;
- default:;
- }
- }
-
- si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL);
- si_shader_destroy(shader);
- free(shader);
+ if (shader->is_optimized) {
+ util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority, &shader->ready);
+ }
+
+ util_queue_fence_destroy(&shader->ready);
+
+ if (shader->pm4) {
+ /* If destroyed shaders were not unbound, the next compiled
+ * shader variant could get the same pointer address and so
+ * binding it to the same shader stage would be considered
+ * a no-op, causing random behavior.
+ */
+ switch (shader->selector->type) {
+ case PIPE_SHADER_VERTEX:
+ if (shader->key.as_ls) {
+ assert(sctx->chip_class <= GFX8);
+ si_pm4_delete_state(sctx, ls, shader->pm4);
+ } else if (shader->key.as_es) {
+ assert(sctx->chip_class <= GFX8);
+ si_pm4_delete_state(sctx, es, shader->pm4);
+ } else if (shader->key.as_ngg) {
+ si_pm4_delete_state(sctx, gs, shader->pm4);
+ } else {
+ si_pm4_delete_state(sctx, vs, shader->pm4);
+ }
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ si_pm4_delete_state(sctx, hs, shader->pm4);
+ break;
+ case PIPE_SHADER_TESS_EVAL:
+ if (shader->key.as_es) {
+ assert(sctx->chip_class <= GFX8);
+ si_pm4_delete_state(sctx, es, shader->pm4);
+ } else if (shader->key.as_ngg) {
+ si_pm4_delete_state(sctx, gs, shader->pm4);
+ } else {
+ si_pm4_delete_state(sctx, vs, shader->pm4);
+ }
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ if (shader->is_gs_copy_shader)
+ si_pm4_delete_state(sctx, vs, shader->pm4);
+ else
+ si_pm4_delete_state(sctx, gs, shader->pm4);
+ break;
+ case PIPE_SHADER_FRAGMENT:
+ si_pm4_delete_state(sctx, ps, shader->pm4);
+ break;
+ default:;
+ }
+ }
+
+ si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL);
+ si_shader_destroy(shader);
+ free(shader);
}
static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_shader_selector *sel = (struct si_shader_selector *)cso;
- struct si_shader *p = sel->first_variant, *c;
- struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = {
- [PIPE_SHADER_VERTEX] = &sctx->vs_shader,
- [PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader,
- [PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader,
- [PIPE_SHADER_GEOMETRY] = &sctx->gs_shader,
- [PIPE_SHADER_FRAGMENT] = &sctx->ps_shader,
- };
-
- util_queue_drop_job(&sctx->screen->shader_compiler_queue, &sel->ready);
-
- if (current_shader[sel->type]->cso == sel) {
- current_shader[sel->type]->cso = NULL;
- current_shader[sel->type]->current = NULL;
- }
-
- while (p) {
- c = p->next_variant;
- si_delete_shader(sctx, p);
- p = c;
- }
-
- if (sel->main_shader_part)
- si_delete_shader(sctx, sel->main_shader_part);
- if (sel->main_shader_part_ls)
- si_delete_shader(sctx, sel->main_shader_part_ls);
- if (sel->main_shader_part_es)
- si_delete_shader(sctx, sel->main_shader_part_es);
- if (sel->main_shader_part_ngg)
- si_delete_shader(sctx, sel->main_shader_part_ngg);
- if (sel->gs_copy_shader)
- si_delete_shader(sctx, sel->gs_copy_shader);
-
- util_queue_fence_destroy(&sel->ready);
- simple_mtx_destroy(&sel->mutex);
- ralloc_free(sel->nir);
- free(sel->nir_binary);
- free(sel);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_shader_selector *sel = (struct si_shader_selector *)cso;
+ struct si_shader *p = sel->first_variant, *c;
+ struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = {
+ [PIPE_SHADER_VERTEX] = &sctx->vs_shader, [PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader,
+ [PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader, [PIPE_SHADER_GEOMETRY] = &sctx->gs_shader,
+ [PIPE_SHADER_FRAGMENT] = &sctx->ps_shader,
+ };
+
+ util_queue_drop_job(&sctx->screen->shader_compiler_queue, &sel->ready);
+
+ if (current_shader[sel->type]->cso == sel) {
+ current_shader[sel->type]->cso = NULL;
+ current_shader[sel->type]->current = NULL;
+ }
+
+ while (p) {
+ c = p->next_variant;
+ si_delete_shader(sctx, p);
+ p = c;
+ }
+
+ if (sel->main_shader_part)
+ si_delete_shader(sctx, sel->main_shader_part);
+ if (sel->main_shader_part_ls)
+ si_delete_shader(sctx, sel->main_shader_part_ls);
+ if (sel->main_shader_part_es)
+ si_delete_shader(sctx, sel->main_shader_part_es);
+ if (sel->main_shader_part_ngg)
+ si_delete_shader(sctx, sel->main_shader_part_ngg);
+ if (sel->gs_copy_shader)
+ si_delete_shader(sctx, sel->gs_copy_shader);
+
+ util_queue_fence_destroy(&sel->ready);
+ simple_mtx_destroy(&sel->mutex);
+ ralloc_free(sel->nir);
+ free(sel->nir_binary);
+ free(sel);
}
static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_shader_selector *sel = (struct si_shader_selector *)state;
-
- si_shader_selector_reference(sctx, &sel, NULL);
-}
-
-static unsigned si_get_ps_input_cntl(struct si_context *sctx,
- struct si_shader *vs, unsigned name,
- unsigned index, unsigned interpolate)
-{
- struct si_shader_info *vsinfo = &vs->selector->info;
- unsigned j, offset, ps_input_cntl = 0;
-
- if (interpolate == TGSI_INTERPOLATE_CONSTANT ||
- (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade) ||
- name == TGSI_SEMANTIC_PRIMID)
- ps_input_cntl |= S_028644_FLAT_SHADE(1);
-
- if (name == TGSI_SEMANTIC_PCOORD ||
- (name == TGSI_SEMANTIC_TEXCOORD &&
- sctx->sprite_coord_enable & (1 << index))) {
- ps_input_cntl |= S_028644_PT_SPRITE_TEX(1);
- }
-
- for (j = 0; j < vsinfo->num_outputs; j++) {
- if (name == vsinfo->output_semantic_name[j] &&
- index == vsinfo->output_semantic_index[j]) {
- offset = vs->info.vs_output_param_offset[j];
-
- if (offset <= AC_EXP_PARAM_OFFSET_31) {
- /* The input is loaded from parameter memory. */
- ps_input_cntl |= S_028644_OFFSET(offset);
- } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
- if (offset == AC_EXP_PARAM_UNDEFINED) {
- /* This can happen with depth-only rendering. */
- offset = 0;
- } else {
- /* The input is a DEFAULT_VAL constant. */
- assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
- offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
- offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
- }
-
- ps_input_cntl = S_028644_OFFSET(0x20) |
- S_028644_DEFAULT_VAL(offset);
- }
- break;
- }
- }
-
- if (j == vsinfo->num_outputs && name == TGSI_SEMANTIC_PRIMID)
- /* PrimID is written after the last output when HW VS is used. */
- ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]);
- else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
- /* No corresponding output found, load defaults into input.
- * Don't set any other bits.
- * (FLAT_SHADE=1 completely changes behavior) */
- ps_input_cntl = S_028644_OFFSET(0x20);
- /* D3D 9 behaviour. GL is undefined */
- if (name == TGSI_SEMANTIC_COLOR && index == 0)
- ps_input_cntl |= S_028644_DEFAULT_VAL(3);
- }
- return ps_input_cntl;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_shader_selector *sel = (struct si_shader_selector *)state;
+
+ si_shader_selector_reference(sctx, &sel, NULL);
+}
+
+static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader *vs, unsigned name,
+ unsigned index, unsigned interpolate)
+{
+ struct si_shader_info *vsinfo = &vs->selector->info;
+ unsigned j, offset, ps_input_cntl = 0;
+
+ if (interpolate == TGSI_INTERPOLATE_CONSTANT ||
+ (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade) || name == TGSI_SEMANTIC_PRIMID)
+ ps_input_cntl |= S_028644_FLAT_SHADE(1);
+
+ if (name == TGSI_SEMANTIC_PCOORD ||
+ (name == TGSI_SEMANTIC_TEXCOORD && sctx->sprite_coord_enable & (1 << index))) {
+ ps_input_cntl |= S_028644_PT_SPRITE_TEX(1);
+ }
+
+ for (j = 0; j < vsinfo->num_outputs; j++) {
+ if (name == vsinfo->output_semantic_name[j] && index == vsinfo->output_semantic_index[j]) {
+ offset = vs->info.vs_output_param_offset[j];
+
+ if (offset <= AC_EXP_PARAM_OFFSET_31) {
+ /* The input is loaded from parameter memory. */
+ ps_input_cntl |= S_028644_OFFSET(offset);
+ } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
+ if (offset == AC_EXP_PARAM_UNDEFINED) {
+ /* This can happen with depth-only rendering. */
+ offset = 0;
+ } else {
+ /* The input is a DEFAULT_VAL constant. */
+ assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
+ offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
+ offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
+ }
+
+ ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset);
+ }
+ break;
+ }
+ }
+
+ if (j == vsinfo->num_outputs && name == TGSI_SEMANTIC_PRIMID)
+ /* PrimID is written after the last output when HW VS is used. */
+ ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]);
+ else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
+ /* No corresponding output found, load defaults into input.
+ * Don't set any other bits.
+ * (FLAT_SHADE=1 completely changes behavior) */
+ ps_input_cntl = S_028644_OFFSET(0x20);
+ /* D3D 9 behaviour. GL is undefined */
+ if (name == TGSI_SEMANTIC_COLOR && index == 0)
+ ps_input_cntl |= S_028644_DEFAULT_VAL(3);
+ }
+ return ps_input_cntl;
}
static void si_emit_spi_map(struct si_context *sctx)
{
- struct si_shader *ps = sctx->ps_shader.current;
- struct si_shader *vs = si_get_vs_state(sctx);
- struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL;
- unsigned i, num_interp, num_written = 0, bcol_interp[2];
- unsigned spi_ps_input_cntl[32];
-
- if (!ps || !ps->selector->info.num_inputs)
- return;
-
- num_interp = si_get_ps_num_interp(ps);
- assert(num_interp > 0);
-
- for (i = 0; i < psinfo->num_inputs; i++) {
- unsigned name = psinfo->input_semantic_name[i];
- unsigned index = psinfo->input_semantic_index[i];
- unsigned interpolate = psinfo->input_interpolate[i];
-
- spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, name,
- index, interpolate);
-
- if (name == TGSI_SEMANTIC_COLOR) {
- assert(index < ARRAY_SIZE(bcol_interp));
- bcol_interp[index] = interpolate;
- }
- }
-
- if (ps->key.part.ps.prolog.color_two_side) {
- unsigned bcol = TGSI_SEMANTIC_BCOLOR;
-
- for (i = 0; i < 2; i++) {
- if (!(psinfo->colors_read & (0xf << (i * 4))))
- continue;
-
- spi_ps_input_cntl[num_written++] =
- si_get_ps_input_cntl(sctx, vs, bcol, i, bcol_interp[i]);
-
- }
- }
- assert(num_interp == num_written);
-
- /* R_028644_SPI_PS_INPUT_CNTL_0 */
- /* Dota 2: Only ~16% of SPI map updates set different values. */
- /* Talos: Only ~9% of SPI map updates set different values. */
- unsigned initial_cdw = sctx->gfx_cs->current.cdw;
- radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0,
- spi_ps_input_cntl,
- sctx->tracked_regs.spi_ps_input_cntl, num_interp);
-
- if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll = true;
+ struct si_shader *ps = sctx->ps_shader.current;
+ struct si_shader *vs = si_get_vs_state(sctx);
+ struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL;
+ unsigned i, num_interp, num_written = 0, bcol_interp[2];
+ unsigned spi_ps_input_cntl[32];
+
+ if (!ps || !ps->selector->info.num_inputs)
+ return;
+
+ num_interp = si_get_ps_num_interp(ps);
+ assert(num_interp > 0);
+
+ for (i = 0; i < psinfo->num_inputs; i++) {
+ unsigned name = psinfo->input_semantic_name[i];
+ unsigned index = psinfo->input_semantic_index[i];
+ unsigned interpolate = psinfo->input_interpolate[i];
+
+ spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, name, index, interpolate);
+
+ if (name == TGSI_SEMANTIC_COLOR) {
+ assert(index < ARRAY_SIZE(bcol_interp));
+ bcol_interp[index] = interpolate;
+ }
+ }
+
+ if (ps->key.part.ps.prolog.color_two_side) {
+ unsigned bcol = TGSI_SEMANTIC_BCOLOR;
+
+ for (i = 0; i < 2; i++) {
+ if (!(psinfo->colors_read & (0xf << (i * 4))))
+ continue;
+
+ spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, bcol, i, bcol_interp[i]);
+ }
+ }
+ assert(num_interp == num_written);
+
+ /* R_028644_SPI_PS_INPUT_CNTL_0 */
+ /* Dota 2: Only ~16% of SPI map updates set different values. */
+ /* Talos: Only ~9% of SPI map updates set different values. */
+ unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+ radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl,
+ sctx->tracked_regs.spi_ps_input_cntl, num_interp);
+
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll = true;
}
/**
 * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that.
 */
static void si_init_config_add_vgt_flush(struct si_context *sctx)
{
- if (sctx->init_config_has_vgt_flush)
- return;
-
- /* Done by Vulkan before VGT_FLUSH. */
- si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
- si_pm4_cmd_add(sctx->init_config,
- EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
- si_pm4_cmd_end(sctx->init_config, false);
-
- /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
- si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
- si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
- si_pm4_cmd_end(sctx->init_config, false);
- sctx->init_config_has_vgt_flush = true;
+ if (sctx->init_config_has_vgt_flush)
+ return;
+
+ /* Done by Vulkan before VGT_FLUSH. */
+ si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
+ si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ si_pm4_cmd_end(sctx->init_config, false);
+
+ /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
+ si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
+ si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+ si_pm4_cmd_end(sctx->init_config, false);
+ sctx->init_config_has_vgt_flush = true;
}
/* Initialize state related to ESGS / GSVS ring buffers */
static bool si_update_gs_ring_buffers(struct si_context *sctx)
{
- struct si_shader_selector *es =
- sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso;
- struct si_shader_selector *gs = sctx->gs_shader.cso;
- struct si_pm4_state *pm4;
-
- /* Chip constants. */
- unsigned num_se = sctx->screen->info.max_se;
- unsigned wave_size = 64;
- unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
- /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16.
- * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2).
- */
- unsigned gs_vertex_reuse = (sctx->chip_class >= GFX8 ? 32 : 16) * num_se;
- unsigned alignment = 256 * num_se;
- /* The maximum size is 63.999 MB per SE. */
- unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
-
- /* Calculate the minimum size. */
- unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse *
- wave_size, alignment);
-
- /* These are recommended sizes, not minimum sizes. */
- unsigned esgs_ring_size = max_gs_waves * 2 * wave_size *
- es->esgs_itemsize * gs->gs_input_verts_per_prim;
- unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size *
- gs->max_gsvs_emit_size;
-
- min_esgs_ring_size = align(min_esgs_ring_size, alignment);
- esgs_ring_size = align(esgs_ring_size, alignment);
- gsvs_ring_size = align(gsvs_ring_size, alignment);
-
- esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
- gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
-
- /* Some rings don't have to be allocated if shaders don't use them.
- * (e.g. no varyings between ES and GS or GS and VS)
- *
- * GFX9 doesn't have the ESGS ring.
- */
- bool update_esgs = sctx->chip_class <= GFX8 &&
- esgs_ring_size &&
- (!sctx->esgs_ring ||
- sctx->esgs_ring->width0 < esgs_ring_size);
- bool update_gsvs = gsvs_ring_size &&
- (!sctx->gsvs_ring ||
- sctx->gsvs_ring->width0 < gsvs_ring_size);
-
- if (!update_esgs && !update_gsvs)
- return true;
-
- if (update_esgs) {
- pipe_resource_reference(&sctx->esgs_ring, NULL);
- sctx->esgs_ring =
- pipe_aligned_buffer_create(sctx->b.screen,
- SI_RESOURCE_FLAG_UNMAPPABLE,
- PIPE_USAGE_DEFAULT,
- esgs_ring_size,
- sctx->screen->info.pte_fragment_size);
- if (!sctx->esgs_ring)
- return false;
- }
-
- if (update_gsvs) {
- pipe_resource_reference(&sctx->gsvs_ring, NULL);
- sctx->gsvs_ring =
- pipe_aligned_buffer_create(sctx->b.screen,
- SI_RESOURCE_FLAG_UNMAPPABLE,
- PIPE_USAGE_DEFAULT,
- gsvs_ring_size,
- sctx->screen->info.pte_fragment_size);
- if (!sctx->gsvs_ring)
- return false;
- }
-
- /* Create the "init_config_gs_rings" state. */
- pm4 = CALLOC_STRUCT(si_pm4_state);
- if (!pm4)
- return false;
-
- if (sctx->chip_class >= GFX7) {
- if (sctx->esgs_ring) {
- assert(sctx->chip_class <= GFX8);
- si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE,
- sctx->esgs_ring->width0 / 256);
- }
- if (sctx->gsvs_ring)
- si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE,
- sctx->gsvs_ring->width0 / 256);
- } else {
- if (sctx->esgs_ring)
- si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE,
- sctx->esgs_ring->width0 / 256);
- if (sctx->gsvs_ring)
- si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE,
- sctx->gsvs_ring->width0 / 256);
- }
-
- /* Set the state. */
- if (sctx->init_config_gs_rings)
- si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
- sctx->init_config_gs_rings = pm4;
-
- if (!sctx->init_config_has_vgt_flush) {
- si_init_config_add_vgt_flush(sctx);
- si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
- }
-
- /* Flush the context to re-emit both init_config states. */
- sctx->initial_gfx_cs_size = 0; /* force flush */
- si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-
- /* Set ring bindings. */
- if (sctx->esgs_ring) {
- assert(sctx->chip_class <= GFX8);
- si_set_ring_buffer(sctx, SI_ES_RING_ESGS,
- sctx->esgs_ring, 0, sctx->esgs_ring->width0,
- true, true, 4, 64, 0);
- si_set_ring_buffer(sctx, SI_GS_RING_ESGS,
- sctx->esgs_ring, 0, sctx->esgs_ring->width0,
- false, false, 0, 0, 0);
- }
- if (sctx->gsvs_ring) {
- si_set_ring_buffer(sctx, SI_RING_GSVS,
- sctx->gsvs_ring, 0, sctx->gsvs_ring->width0,
- false, false, 0, 0, 0);
- }
-
- return true;
+ struct si_shader_selector *es =
+ sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso;
+ struct si_shader_selector *gs = sctx->gs_shader.cso;
+ struct si_pm4_state *pm4;
+
+ /* Chip constants. */
+ unsigned num_se = sctx->screen->info.max_se;
+ unsigned wave_size = 64;
+ unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
+ /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16.
+ * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2).
+ */
+ unsigned gs_vertex_reuse = (sctx->chip_class >= GFX8 ? 32 : 16) * num_se;
+ unsigned alignment = 256 * num_se;
+ /* The maximum size is 63.999 MB per SE. */
+ unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
+
+ /* Calculate the minimum size. */
+ unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse * wave_size, alignment);
+
+ /* These are recommended sizes, not minimum sizes. */
+ unsigned esgs_ring_size =
+ max_gs_waves * 2 * wave_size * es->esgs_itemsize * gs->gs_input_verts_per_prim;
+ unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs->max_gsvs_emit_size;
+
+ min_esgs_ring_size = align(min_esgs_ring_size, alignment);
+ esgs_ring_size = align(esgs_ring_size, alignment);
+ gsvs_ring_size = align(gsvs_ring_size, alignment);
+
+ esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
+ gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
+
+ /* Some rings don't have to be allocated if shaders don't use them.
+ * (e.g. no varyings between ES and GS or GS and VS)
+ *
+ * GFX9 doesn't have the ESGS ring.
+ */
+ bool update_esgs = sctx->chip_class <= GFX8 && esgs_ring_size &&
+ (!sctx->esgs_ring || sctx->esgs_ring->width0 < esgs_ring_size);
+ bool update_gsvs =
+ gsvs_ring_size && (!sctx->gsvs_ring || sctx->gsvs_ring->width0 < gsvs_ring_size);
+
+ if (!update_esgs && !update_gsvs)
+ return true;
+
+ if (update_esgs) {
+ pipe_resource_reference(&sctx->esgs_ring, NULL);
+ sctx->esgs_ring =
+ pipe_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
+ esgs_ring_size, sctx->screen->info.pte_fragment_size);
+ if (!sctx->esgs_ring)
+ return false;
+ }
+
+ if (update_gsvs) {
+ pipe_resource_reference(&sctx->gsvs_ring, NULL);
+ sctx->gsvs_ring =
+ pipe_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
+ gsvs_ring_size, sctx->screen->info.pte_fragment_size);
+ if (!sctx->gsvs_ring)
+ return false;
+ }
+
+ /* Create the "init_config_gs_rings" state. */
+ pm4 = CALLOC_STRUCT(si_pm4_state);
+ if (!pm4)
+ return false;
+
+ if (sctx->chip_class >= GFX7) {
+ if (sctx->esgs_ring) {
+ assert(sctx->chip_class <= GFX8);
+ si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256);
+ }
+ if (sctx->gsvs_ring)
+ si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256);
+ } else {
+ if (sctx->esgs_ring)
+ si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256);
+ if (sctx->gsvs_ring)
+ si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256);
+ }
+
+ /* Set the state. */
+ if (sctx->init_config_gs_rings)
+ si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
+ sctx->init_config_gs_rings = pm4;
+
+ if (!sctx->init_config_has_vgt_flush) {
+ si_init_config_add_vgt_flush(sctx);
+ si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+ }
+
+ /* Flush the context to re-emit both init_config states. */
+ sctx->initial_gfx_cs_size = 0; /* force flush */
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+
+ /* Set ring bindings. */
+ if (sctx->esgs_ring) {
+ assert(sctx->chip_class <= GFX8);
+ si_set_ring_buffer(sctx, SI_ES_RING_ESGS, sctx->esgs_ring, 0, sctx->esgs_ring->width0, true,
+ true, 4, 64, 0);
+ si_set_ring_buffer(sctx, SI_GS_RING_ESGS, sctx->esgs_ring, 0, sctx->esgs_ring->width0, false,
+ false, 0, 0, 0);
+ }
+ if (sctx->gsvs_ring) {
+ si_set_ring_buffer(sctx, SI_RING_GSVS, sctx->gsvs_ring, 0, sctx->gsvs_ring->width0, false,
+ false, 0, 0, 0);
+ }
+
+ return true;
}
static void si_shader_lock(struct si_shader *shader)
{
- simple_mtx_lock(&shader->selector->mutex);
- if (shader->previous_stage_sel) {
- assert(shader->previous_stage_sel != shader->selector);
- simple_mtx_lock(&shader->previous_stage_sel->mutex);
- }
+ simple_mtx_lock(&shader->selector->mutex);
+ if (shader->previous_stage_sel) {
+ assert(shader->previous_stage_sel != shader->selector);
+ simple_mtx_lock(&shader->previous_stage_sel->mutex);
+ }
}
static void si_shader_unlock(struct si_shader *shader)
{
- if (shader->previous_stage_sel)
- simple_mtx_unlock(&shader->previous_stage_sel->mutex);
- simple_mtx_unlock(&shader->selector->mutex);
+ if (shader->previous_stage_sel)
+ simple_mtx_unlock(&shader->previous_stage_sel->mutex);
+ simple_mtx_unlock(&shader->selector->mutex);
}
/**
 * @returns 1 if the shader was updated to use the current scratch buffer,
 * 0 if not
 * < 0 if there was a failure
 */
-static int si_update_scratch_buffer(struct si_context *sctx,
- struct si_shader *shader)
+static int si_update_scratch_buffer(struct si_context *sctx, struct si_shader *shader)
{
- uint64_t scratch_va = sctx->scratch_buffer->gpu_address;
+ uint64_t scratch_va = sctx->scratch_buffer->gpu_address;
- if (!shader)
- return 0;
+ if (!shader)
+ return 0;
- /* This shader doesn't need a scratch buffer */
- if (shader->config.scratch_bytes_per_wave == 0)
- return 0;
+ /* This shader doesn't need a scratch buffer */
+ if (shader->config.scratch_bytes_per_wave == 0)
+ return 0;
- /* Prevent race conditions when updating:
- * - si_shader::scratch_bo
- * - si_shader::binary::code
- * - si_shader::previous_stage::binary::code.
- */
- si_shader_lock(shader);
+ /* Prevent race conditions when updating:
+ * - si_shader::scratch_bo
+ * - si_shader::binary::code
+ * - si_shader::previous_stage::binary::code.
+ */
+ si_shader_lock(shader);
- /* This shader is already configured to use the current
- * scratch buffer. */
- if (shader->scratch_bo == sctx->scratch_buffer) {
- si_shader_unlock(shader);
- return 0;
- }
+ /* This shader is already configured to use the current
+ * scratch buffer. */
+ if (shader->scratch_bo == sctx->scratch_buffer) {
+ si_shader_unlock(shader);
+ return 0;
+ }
- assert(sctx->scratch_buffer);
+ assert(sctx->scratch_buffer);
- /* Replace the shader bo with a new bo that has the relocs applied. */
- if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) {
- si_shader_unlock(shader);
- return -1;
- }
+ /* Replace the shader bo with a new bo that has the relocs applied. */
+ if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) {
+ si_shader_unlock(shader);
+ return -1;
+ }
- /* Update the shader state to use the new shader bo. */
- si_shader_init_pm4_state(sctx->screen, shader);
+ /* Update the shader state to use the new shader bo. */
+ si_shader_init_pm4_state(sctx->screen, shader);
- si_resource_reference(&shader->scratch_bo, sctx->scratch_buffer);
+ si_resource_reference(&shader->scratch_bo, sctx->scratch_buffer);
- si_shader_unlock(shader);
- return 1;
+ si_shader_unlock(shader);
+ return 1;
}
static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader)
{
- return shader ? shader->config.scratch_bytes_per_wave : 0;
+ return shader ? shader->config.scratch_bytes_per_wave : 0;
}
static struct si_shader *si_get_tcs_current(struct si_context *sctx)
{
- if (!sctx->tes_shader.cso)
- return NULL; /* tessellation disabled */
+ if (!sctx->tes_shader.cso)
+ return NULL; /* tessellation disabled */
- return sctx->tcs_shader.cso ? sctx->tcs_shader.current :
- sctx->fixed_func_tcs_shader.current;
+ return sctx->tcs_shader.cso ? sctx->tcs_shader.current : sctx->fixed_func_tcs_shader.current;
}
static bool si_update_scratch_relocs(struct si_context *sctx)
{
- struct si_shader *tcs = si_get_tcs_current(sctx);
- int r;
-
- /* Update the shaders, so that they are using the latest scratch.
- * The scratch buffer may have been changed since these shaders were
- * last used, so we still need to try to update them, even if they
- * require scratch buffers smaller than the current size.
- */
- r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
- if (r < 0)
- return false;
- if (r == 1)
- si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
-
- r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
- if (r < 0)
- return false;
- if (r == 1)
- si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
-
- r = si_update_scratch_buffer(sctx, tcs);
- if (r < 0)
- return false;
- if (r == 1)
- si_pm4_bind_state(sctx, hs, tcs->pm4);
-
- /* VS can be bound as LS, ES, or VS. */
- r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
- if (r < 0)
- return false;
- if (r == 1) {
- if (sctx->vs_shader.current->key.as_ls)
- si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
- else if (sctx->vs_shader.current->key.as_es)
- si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
- else if (sctx->vs_shader.current->key.as_ngg)
- si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4);
- else
- si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
- }
-
- /* TES can be bound as ES or VS. */
- r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
- if (r < 0)
- return false;
- if (r == 1) {
- if (sctx->tes_shader.current->key.as_es)
- si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
- else if (sctx->tes_shader.current->key.as_ngg)
- si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4);
- else
- si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
- }
-
- return true;
+ struct si_shader *tcs = si_get_tcs_current(sctx);
+ int r;
+
+ /* Update the shaders, so that they are using the latest scratch.
+ * The scratch buffer may have been changed since these shaders were
+ * last used, so we still need to try to update them, even if they
+ * require scratch buffers smaller than the current size.
+ */
+ r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
+ if (r < 0)
+ return false;
+ if (r == 1)
+ si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
+
+ r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
+ if (r < 0)
+ return false;
+ if (r == 1)
+ si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
+
+ r = si_update_scratch_buffer(sctx, tcs);
+ if (r < 0)
+ return false;
+ if (r == 1)
+ si_pm4_bind_state(sctx, hs, tcs->pm4);
+
+ /* VS can be bound as LS, ES, or VS. */
+ r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
+ if (r < 0)
+ return false;
+ if (r == 1) {
+ if (sctx->vs_shader.current->key.as_ls)
+ si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
+ else if (sctx->vs_shader.current->key.as_es)
+ si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
+ else if (sctx->vs_shader.current->key.as_ngg)
+ si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4);
+ else
+ si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
+ }
+
+ /* TES can be bound as ES or VS. */
+ r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
+ if (r < 0)
+ return false;
+ if (r == 1) {
+ if (sctx->tes_shader.current->key.as_es)
+ si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
+ else if (sctx->tes_shader.current->key.as_ngg)
+ si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4);
+ else
+ si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
+ }
+
+ return true;
}
static bool si_update_spi_tmpring_size(struct si_context *sctx)
{
- /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer.
- * There are 2 cases to handle:
- *
- * - If the current needed size is less than the maximum seen size,
- * use the maximum seen size, so that WAVESIZE remains the same.
- *
- * - If the current needed size is greater than the maximum seen size,
- * the scratch buffer is reallocated, so we can increase WAVESIZE.
- *
- * Shaders that set SCRATCH_EN=0 don't allocate scratch space.
- * Otherwise, the number of waves that can use scratch is
- * SPI_TMPRING_SIZE.WAVES.
- */
- unsigned bytes = 0;
-
- bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current));
- bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current));
- bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current));
-
- if (sctx->tes_shader.cso) {
- bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current));
- bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx)));
- }
-
- sctx->max_seen_scratch_bytes_per_wave =
- MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes);
-
- unsigned scratch_needed_size =
- sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves;
- unsigned spi_tmpring_size;
-
- if (scratch_needed_size > 0) {
- if (!sctx->scratch_buffer ||
- scratch_needed_size > sctx->scratch_buffer->b.b.width0) {
- /* Create a bigger scratch buffer */
- si_resource_reference(&sctx->scratch_buffer, NULL);
-
- sctx->scratch_buffer =
- si_aligned_buffer_create(&sctx->screen->b,
- SI_RESOURCE_FLAG_UNMAPPABLE,
- PIPE_USAGE_DEFAULT,
- scratch_needed_size,
- sctx->screen->info.pte_fragment_size);
- if (!sctx->scratch_buffer)
- return false;
-
- si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
- si_context_add_resource_size(sctx,
- &sctx->scratch_buffer->b.b);
- }
-
- if (!si_update_scratch_relocs(sctx))
- return false;
- }
-
- /* The LLVM shader backend should be reporting aligned scratch_sizes. */
- assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
- "scratch size should already be aligned correctly.");
-
- spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
- S_0286E8_WAVESIZE(sctx->max_seen_scratch_bytes_per_wave >> 10);
- if (spi_tmpring_size != sctx->spi_tmpring_size) {
- sctx->spi_tmpring_size = spi_tmpring_size;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
- }
- return true;
+ /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer.
+ * There are 2 cases to handle:
+ *
+ * - If the current needed size is less than the maximum seen size,
+ * use the maximum seen size, so that WAVESIZE remains the same.
+ *
+ * - If the current needed size is greater than the maximum seen size,
+ * the scratch buffer is reallocated, so we can increase WAVESIZE.
+ *
+ * Shaders that set SCRATCH_EN=0 don't allocate scratch space.
+ * Otherwise, the number of waves that can use scratch is
+ * SPI_TMPRING_SIZE.WAVES.
+ */
+ unsigned bytes = 0;
+
+ bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current));
+ bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current));
+ bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current));
+
+ if (sctx->tes_shader.cso) {
+ bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current));
+ bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx)));
+ }
+
+ sctx->max_seen_scratch_bytes_per_wave = MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes);
+
+ unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves;
+ unsigned spi_tmpring_size;
+
+ if (scratch_needed_size > 0) {
+ if (!sctx->scratch_buffer || scratch_needed_size > sctx->scratch_buffer->b.b.width0) {
+ /* Create a bigger scratch buffer */
+ si_resource_reference(&sctx->scratch_buffer, NULL);
+
+ sctx->scratch_buffer = si_aligned_buffer_create(
+ &sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, scratch_needed_size,
+ sctx->screen->info.pte_fragment_size);
+ if (!sctx->scratch_buffer)
+ return false;
+
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
+ si_context_add_resource_size(sctx, &sctx->scratch_buffer->b.b);
+ }
+
+ if (!si_update_scratch_relocs(sctx))
+ return false;
+ }
+
+ /* The LLVM shader backend should be reporting aligned scratch_sizes. */
+ assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
+ "scratch size should already be aligned correctly.");
+
+ spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
+ S_0286E8_WAVESIZE(sctx->max_seen_scratch_bytes_per_wave >> 10);
+ if (spi_tmpring_size != sctx->spi_tmpring_size) {
+ sctx->spi_tmpring_size = spi_tmpring_size;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
+ }
+ return true;
}
static void si_init_tess_factor_ring(struct si_context *sctx)
{
- assert(!sctx->tess_rings);
- assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0);
-
- /* The address must be aligned to 2^19, because the shader only
- * receives the high 13 bits.
- */
- sctx->tess_rings = pipe_aligned_buffer_create(sctx->b.screen,
- SI_RESOURCE_FLAG_32BIT,
- PIPE_USAGE_DEFAULT,
- sctx->screen->tess_offchip_ring_size +
- sctx->screen->tess_factor_ring_size,
- 1 << 19);
- if (!sctx->tess_rings)
- return;
-
- si_init_config_add_vgt_flush(sctx);
-
- si_pm4_add_bo(sctx->init_config, si_resource(sctx->tess_rings),
- RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS);
-
- uint64_t factor_va = si_resource(sctx->tess_rings)->gpu_address +
- sctx->screen->tess_offchip_ring_size;
-
- /* Append these registers to the init config state. */
- if (sctx->chip_class >= GFX7) {
- si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE,
- S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4));
- si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE,
- factor_va >> 8);
- if (sctx->chip_class >= GFX10)
- si_pm4_set_reg(sctx->init_config, R_030984_VGT_TF_MEMORY_BASE_HI_UMD,
- S_030984_BASE_HI(factor_va >> 40));
- else if (sctx->chip_class == GFX9)
- si_pm4_set_reg(sctx->init_config, R_030944_VGT_TF_MEMORY_BASE_HI,
- S_030944_BASE_HI(factor_va >> 40));
- si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM,
- sctx->screen->vgt_hs_offchip_param);
- } else {
- si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE,
- S_008988_SIZE(sctx->screen->tess_factor_ring_size / 4));
- si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE,
- factor_va >> 8);
- si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM,
- sctx->screen->vgt_hs_offchip_param);
- }
-
- /* Flush the context to re-emit the init_config state.
- * This is done only once in a lifetime of a context.
- */
- si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
- sctx->initial_gfx_cs_size = 0; /* force flush */
- si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ assert(!sctx->tess_rings);
+ assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0);
+
+ /* The address must be aligned to 2^19, because the shader only
+ * receives the high 13 bits.
+ */
+ sctx->tess_rings = pipe_aligned_buffer_create(
+ sctx->b.screen, SI_RESOURCE_FLAG_32BIT, PIPE_USAGE_DEFAULT,
+ sctx->screen->tess_offchip_ring_size + sctx->screen->tess_factor_ring_size, 1 << 19);
+ if (!sctx->tess_rings)
+ return;
+
+ si_init_config_add_vgt_flush(sctx);
+
+ si_pm4_add_bo(sctx->init_config, si_resource(sctx->tess_rings), RADEON_USAGE_READWRITE,
+ RADEON_PRIO_SHADER_RINGS);
+
+ uint64_t factor_va =
+ si_resource(sctx->tess_rings)->gpu_address + sctx->screen->tess_offchip_ring_size;
+
+ /* Append these registers to the init config state. */
+ if (sctx->chip_class >= GFX7) {
+ si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE,
+ S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4));
+ si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8);
+ if (sctx->chip_class >= GFX10)
+ si_pm4_set_reg(sctx->init_config, R_030984_VGT_TF_MEMORY_BASE_HI_UMD,
+ S_030984_BASE_HI(factor_va >> 40));
+ else if (sctx->chip_class == GFX9)
+ si_pm4_set_reg(sctx->init_config, R_030944_VGT_TF_MEMORY_BASE_HI,
+ S_030944_BASE_HI(factor_va >> 40));
+ si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM,
+ sctx->screen->vgt_hs_offchip_param);
+ } else {
+ si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE,
+ S_008988_SIZE(sctx->screen->tess_factor_ring_size / 4));
+ si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE, factor_va >> 8);
+ si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM,
+ sctx->screen->vgt_hs_offchip_param);
+ }
+
+ /* Flush the context to re-emit the init_config state.
+ * This is done only once in a lifetime of a context.
+ */
+ si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+ sctx->initial_gfx_cs_size = 0; /* force flush */
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}
static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
- union si_vgt_stages_key key)
+ union si_vgt_stages_key key)
{
- struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
- uint32_t stages = 0;
-
- if (key.u.tess) {
- stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) |
- S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1);
-
- if (key.u.gs)
- stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
- S_028B54_GS_EN(1);
- else if (key.u.ngg)
- stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS);
- else
- stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
- } else if (key.u.gs) {
- stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
- S_028B54_GS_EN(1);
- } else if (key.u.ngg) {
- stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL);
- }
-
- if (key.u.ngg) {
- stages |= S_028B54_PRIMGEN_EN(1) |
- S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) |
- S_028B54_NGG_WAVE_ID_EN(key.u.streamout) |
- S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough);
- } else if (key.u.gs)
- stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
-
- if (screen->info.chip_class >= GFX9)
- stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
-
- if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) {
- stages |= S_028B54_HS_W32_EN(1) |
- S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */
- S_028B54_VS_W32_EN(1);
- }
-
- si_pm4_set_reg(pm4, R_028B54_VGT_SHADER_STAGES_EN, stages);
- return pm4;
+ struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
+ uint32_t stages = 0;
+
+ if (key.u.tess) {
+ stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1);
+
+ if (key.u.gs)
+ stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | S_028B54_GS_EN(1);
+ else if (key.u.ngg)
+ stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS);
+ else
+ stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
+ } else if (key.u.gs) {
+ stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | S_028B54_GS_EN(1);
+ } else if (key.u.ngg) {
+ stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL);
+ }
+
+ if (key.u.ngg) {
+ stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) |
+ S_028B54_NGG_WAVE_ID_EN(key.u.streamout) |
+ S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough);
+ } else if (key.u.gs)
+ stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+
+ if (screen->info.chip_class >= GFX9)
+ stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
+
+ if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) {
+ stages |= S_028B54_HS_W32_EN(1) |
+ S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */
+ S_028B54_VS_W32_EN(1);
+ }
+
+ si_pm4_set_reg(pm4, R_028B54_VGT_SHADER_STAGES_EN, stages);
+ return pm4;
}
-static void si_update_vgt_shader_config(struct si_context *sctx,
- union si_vgt_stages_key key)
+static void si_update_vgt_shader_config(struct si_context *sctx, union si_vgt_stages_key key)
{
- struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index];
+ struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index];
- if (unlikely(!*pm4))
- *pm4 = si_build_vgt_shader_config(sctx->screen, key);
- si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
+ if (unlikely(!*pm4))
+ *pm4 = si_build_vgt_shader_config(sctx->screen, key);
+ si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
}
bool si_update_shaders(struct si_context *sctx)
{
- struct pipe_context *ctx = (struct pipe_context*)sctx;
- struct si_compiler_ctx_state compiler_state;
- struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- struct si_shader *old_vs = si_get_vs_state(sctx);
- bool old_clip_disable = old_vs ? old_vs->key.opt.clip_disable : false;
- struct si_shader *old_ps = sctx->ps_shader.current;
- union si_vgt_stages_key key;
- unsigned old_spi_shader_col_format =
- old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0;
- int r;
-
- if (!sctx->compiler.passes)
- si_init_compiler(sctx->screen, &sctx->compiler);
-
- compiler_state.compiler = &sctx->compiler;
- compiler_state.debug = sctx->debug;
- compiler_state.is_debug_context = sctx->is_debug;
-
- key.index = 0;
-
- if (sctx->tes_shader.cso)
- key.u.tess = 1;
- if (sctx->gs_shader.cso)
- key.u.gs = 1;
-
- if (sctx->ngg) {
- key.u.ngg = 1;
- key.u.streamout = !!si_get_vs(sctx)->cso->so.num_outputs;
- }
-
- /* Update TCS and TES. */
- if (sctx->tes_shader.cso) {
- if (!sctx->tess_rings) {
- si_init_tess_factor_ring(sctx);
- if (!sctx->tess_rings)
- return false;
- }
-
- if (sctx->tcs_shader.cso) {
- r = si_shader_select(ctx, &sctx->tcs_shader, key,
- &compiler_state);
- if (r)
- return false;
- si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
- } else {
- if (!sctx->fixed_func_tcs_shader.cso) {
- sctx->fixed_func_tcs_shader.cso =
- si_create_fixed_func_tcs(sctx);
- if (!sctx->fixed_func_tcs_shader.cso)
- return false;
- }
-
- r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader,
- key, &compiler_state);
- if (r)
- return false;
- si_pm4_bind_state(sctx, hs,
- sctx->fixed_func_tcs_shader.current->pm4);
- }
-
- if (!sctx->gs_shader.cso || sctx->chip_class <= GFX8) {
- r = si_shader_select(ctx, &sctx->tes_shader, key, &compiler_state);
- if (r)
- return false;
-
- if (sctx->gs_shader.cso) {
- /* TES as ES */
- assert(sctx->chip_class <= GFX8);
- si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
- } else if (key.u.ngg) {
- si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4);
- } else {
- si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
- }
- }
- } else {
- if (sctx->chip_class <= GFX8)
- si_pm4_bind_state(sctx, ls, NULL);
- si_pm4_bind_state(sctx, hs, NULL);
- }
-
- /* Update GS. */
- if (sctx->gs_shader.cso) {
- r = si_shader_select(ctx, &sctx->gs_shader, key, &compiler_state);
- if (r)
- return false;
- si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
- if (!key.u.ngg) {
- si_pm4_bind_state(sctx, vs, sctx->gs_shader.cso->gs_copy_shader->pm4);
-
- if (!si_update_gs_ring_buffers(sctx))
- return false;
- } else {
- si_pm4_bind_state(sctx, vs, NULL);
- }
- } else {
- if (!key.u.ngg) {
- si_pm4_bind_state(sctx, gs, NULL);
- if (sctx->chip_class <= GFX8)
- si_pm4_bind_state(sctx, es, NULL);
- }
- }
-
- /* Update VS. */
- if ((!key.u.tess && !key.u.gs) || sctx->chip_class <= GFX8) {
- r = si_shader_select(ctx, &sctx->vs_shader, key, &compiler_state);
- if (r)
- return false;
-
- if (!key.u.tess && !key.u.gs) {
- if (key.u.ngg) {
- si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4);
- si_pm4_bind_state(sctx, vs, NULL);
- } else {
- si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
- }
- } else if (sctx->tes_shader.cso) {
- si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
- } else {
- assert(sctx->gs_shader.cso);
- si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
- }
- }
-
- /* This must be done after the shader variant is selected. */
- if (sctx->ngg) {
- struct si_shader *vs = si_get_vs(sctx)->current;
-
- key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs);
- key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling &
- SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
- }
-
- si_update_vgt_shader_config(sctx, key);
-
- if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
-
- if (sctx->ps_shader.cso) {
- unsigned db_shader_control;
-
- r = si_shader_select(ctx, &sctx->ps_shader, key, &compiler_state);
- if (r)
- return false;
- si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
-
- db_shader_control =
- sctx->ps_shader.cso->db_shader_control |
- S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS);
-
- if (si_pm4_state_changed(sctx, ps) ||
- si_pm4_state_changed(sctx, vs) ||
- (key.u.ngg && si_pm4_state_changed(sctx, gs)) ||
- sctx->sprite_coord_enable != rs->sprite_coord_enable ||
- sctx->flatshade != rs->flatshade) {
- sctx->sprite_coord_enable = rs->sprite_coord_enable;
- sctx->flatshade = rs->flatshade;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map);
- }
-
- if (sctx->screen->info.rbplus_allowed &&
- si_pm4_state_changed(sctx, ps) &&
- (!old_ps ||
- old_spi_shader_col_format !=
- sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format))
- si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
-
- if (sctx->ps_db_shader_control != db_shader_control) {
- sctx->ps_db_shader_control = db_shader_control;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
- if (sctx->screen->dpbb_allowed)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
- }
-
- if (sctx->smoothing_enabled != sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing) {
- sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
-
- if (sctx->chip_class == GFX6)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-
- if (sctx->framebuffer.nr_samples <= 1)
- si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
- }
- }
-
- if (si_pm4_state_enabled_and_changed(sctx, ls) ||
- si_pm4_state_enabled_and_changed(sctx, hs) ||
- si_pm4_state_enabled_and_changed(sctx, es) ||
- si_pm4_state_enabled_and_changed(sctx, gs) ||
- si_pm4_state_enabled_and_changed(sctx, vs) ||
- si_pm4_state_enabled_and_changed(sctx, ps)) {
- if (!si_update_spi_tmpring_size(sctx))
- return false;
- }
-
- if (sctx->chip_class >= GFX7) {
- if (si_pm4_state_enabled_and_changed(sctx, ls))
- sctx->prefetch_L2_mask |= SI_PREFETCH_LS;
- else if (!sctx->queued.named.ls)
- sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS;
-
- if (si_pm4_state_enabled_and_changed(sctx, hs))
- sctx->prefetch_L2_mask |= SI_PREFETCH_HS;
- else if (!sctx->queued.named.hs)
- sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS;
-
- if (si_pm4_state_enabled_and_changed(sctx, es))
- sctx->prefetch_L2_mask |= SI_PREFETCH_ES;
- else if (!sctx->queued.named.es)
- sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES;
-
- if (si_pm4_state_enabled_and_changed(sctx, gs))
- sctx->prefetch_L2_mask |= SI_PREFETCH_GS;
- else if (!sctx->queued.named.gs)
- sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS;
-
- if (si_pm4_state_enabled_and_changed(sctx, vs))
- sctx->prefetch_L2_mask |= SI_PREFETCH_VS;
- else if (!sctx->queued.named.vs)
- sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS;
-
- if (si_pm4_state_enabled_and_changed(sctx, ps))
- sctx->prefetch_L2_mask |= SI_PREFETCH_PS;
- else if (!sctx->queued.named.ps)
- sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS;
- }
-
- sctx->do_update_shaders = false;
- return true;
+ struct pipe_context *ctx = (struct pipe_context *)sctx;
+ struct si_compiler_ctx_state compiler_state;
+ struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+ struct si_shader *old_vs = si_get_vs_state(sctx);
+ bool old_clip_disable = old_vs ? old_vs->key.opt.clip_disable : false;
+ struct si_shader *old_ps = sctx->ps_shader.current;
+ union si_vgt_stages_key key;
+ unsigned old_spi_shader_col_format =
+ old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0;
+ int r;
+
+ if (!sctx->compiler.passes)
+ si_init_compiler(sctx->screen, &sctx->compiler);
+
+ compiler_state.compiler = &sctx->compiler;
+ compiler_state.debug = sctx->debug;
+ compiler_state.is_debug_context = sctx->is_debug;
+
+ key.index = 0;
+
+ if (sctx->tes_shader.cso)
+ key.u.tess = 1;
+ if (sctx->gs_shader.cso)
+ key.u.gs = 1;
+
+ if (sctx->ngg) {
+ key.u.ngg = 1;
+ key.u.streamout = !!si_get_vs(sctx)->cso->so.num_outputs;
+ }
+
+ /* Update TCS and TES. */
+ if (sctx->tes_shader.cso) {
+ if (!sctx->tess_rings) {
+ si_init_tess_factor_ring(sctx);
+ if (!sctx->tess_rings)
+ return false;
+ }
+
+ if (sctx->tcs_shader.cso) {
+ r = si_shader_select(ctx, &sctx->tcs_shader, key, &compiler_state);
+ if (r)
+ return false;
+ si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
+ } else {
+ if (!sctx->fixed_func_tcs_shader.cso) {
+ sctx->fixed_func_tcs_shader.cso = si_create_fixed_func_tcs(sctx);
+ if (!sctx->fixed_func_tcs_shader.cso)
+ return false;
+ }
+
+ r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader, key, &compiler_state);
+ if (r)
+ return false;
+ si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current->pm4);
+ }
+
+ if (!sctx->gs_shader.cso || sctx->chip_class <= GFX8) {
+ r = si_shader_select(ctx, &sctx->tes_shader, key, &compiler_state);
+ if (r)
+ return false;
+
+ if (sctx->gs_shader.cso) {
+ /* TES as ES */
+ assert(sctx->chip_class <= GFX8);
+ si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
+ } else if (key.u.ngg) {
+ si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4);
+ } else {
+ si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
+ }
+ }
+ } else {
+ if (sctx->chip_class <= GFX8)
+ si_pm4_bind_state(sctx, ls, NULL);
+ si_pm4_bind_state(sctx, hs, NULL);
+ }
+
+ /* Update GS. */
+ if (sctx->gs_shader.cso) {
+ r = si_shader_select(ctx, &sctx->gs_shader, key, &compiler_state);
+ if (r)
+ return false;
+ si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
+ if (!key.u.ngg) {
+ si_pm4_bind_state(sctx, vs, sctx->gs_shader.cso->gs_copy_shader->pm4);
+
+ if (!si_update_gs_ring_buffers(sctx))
+ return false;
+ } else {
+ si_pm4_bind_state(sctx, vs, NULL);
+ }
+ } else {
+ if (!key.u.ngg) {
+ si_pm4_bind_state(sctx, gs, NULL);
+ if (sctx->chip_class <= GFX8)
+ si_pm4_bind_state(sctx, es, NULL);
+ }
+ }
+
+ /* Update VS. */
+ if ((!key.u.tess && !key.u.gs) || sctx->chip_class <= GFX8) {
+ r = si_shader_select(ctx, &sctx->vs_shader, key, &compiler_state);
+ if (r)
+ return false;
+
+ if (!key.u.tess && !key.u.gs) {
+ if (key.u.ngg) {
+ si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4);
+ si_pm4_bind_state(sctx, vs, NULL);
+ } else {
+ si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
+ }
+ } else if (sctx->tes_shader.cso) {
+ si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
+ } else {
+ assert(sctx->gs_shader.cso);
+ si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
+ }
+ }
+
+ /* This must be done after the shader variant is selected. */
+ if (sctx->ngg) {
+ struct si_shader *vs = si_get_vs(sctx)->current;
+
+ key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs);
+ key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
+ }
+
+ si_update_vgt_shader_config(sctx, key);
+
+ if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
+
+ if (sctx->ps_shader.cso) {
+ unsigned db_shader_control;
+
+ r = si_shader_select(ctx, &sctx->ps_shader, key, &compiler_state);
+ if (r)
+ return false;
+ si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
+
+ db_shader_control = sctx->ps_shader.cso->db_shader_control |
+ S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS);
+
+ if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) ||
+ (key.u.ngg && si_pm4_state_changed(sctx, gs)) ||
+ sctx->sprite_coord_enable != rs->sprite_coord_enable ||
+ sctx->flatshade != rs->flatshade) {
+ sctx->sprite_coord_enable = rs->sprite_coord_enable;
+ sctx->flatshade = rs->flatshade;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map);
+ }
+
+ if (sctx->screen->info.rbplus_allowed && si_pm4_state_changed(sctx, ps) &&
+ (!old_ps || old_spi_shader_col_format !=
+ sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format))
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
+
+ if (sctx->ps_db_shader_control != db_shader_control) {
+ sctx->ps_db_shader_control = db_shader_control;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+ if (sctx->screen->dpbb_allowed)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+ }
+
+ if (sctx->smoothing_enabled !=
+ sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing) {
+ sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing;
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+
+ if (sctx->chip_class == GFX6)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+
+ if (sctx->framebuffer.nr_samples <= 1)
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
+ }
+ }
+
+ if (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, hs) ||
+ si_pm4_state_enabled_and_changed(sctx, es) || si_pm4_state_enabled_and_changed(sctx, gs) ||
+ si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) {
+ if (!si_update_spi_tmpring_size(sctx))
+ return false;
+ }
+
+ if (sctx->chip_class >= GFX7) {
+ if (si_pm4_state_enabled_and_changed(sctx, ls))
+ sctx->prefetch_L2_mask |= SI_PREFETCH_LS;
+ else if (!sctx->queued.named.ls)
+ sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS;
+
+ if (si_pm4_state_enabled_and_changed(sctx, hs))
+ sctx->prefetch_L2_mask |= SI_PREFETCH_HS;
+ else if (!sctx->queued.named.hs)
+ sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS;
+
+ if (si_pm4_state_enabled_and_changed(sctx, es))
+ sctx->prefetch_L2_mask |= SI_PREFETCH_ES;
+ else if (!sctx->queued.named.es)
+ sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES;
+
+ if (si_pm4_state_enabled_and_changed(sctx, gs))
+ sctx->prefetch_L2_mask |= SI_PREFETCH_GS;
+ else if (!sctx->queued.named.gs)
+ sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS;
+
+ if (si_pm4_state_enabled_and_changed(sctx, vs))
+ sctx->prefetch_L2_mask |= SI_PREFETCH_VS;
+ else if (!sctx->queued.named.vs)
+ sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS;
+
+ if (si_pm4_state_enabled_and_changed(sctx, ps))
+ sctx->prefetch_L2_mask |= SI_PREFETCH_PS;
+ else if (!sctx->queued.named.ps)
+ sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS;
+ }
+
+ sctx->do_update_shaders = false;
+ return true;
}
static void si_emit_scratch_state(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
- radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
- sctx->spi_tmpring_size);
+ radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size);
- if (sctx->scratch_buffer) {
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- sctx->scratch_buffer, RADEON_USAGE_READWRITE,
- RADEON_PRIO_SCRATCH_BUFFER);
- }
+ if (sctx->scratch_buffer) {
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->scratch_buffer, RADEON_USAGE_READWRITE,
+ RADEON_PRIO_SCRATCH_BUFFER);
+ }
}
void si_init_screen_live_shader_cache(struct si_screen *sscreen)
{
- util_live_shader_cache_init(&sscreen->live_shader_cache,
- si_create_shader_selector,
- si_destroy_shader_selector);
+ util_live_shader_cache_init(&sscreen->live_shader_cache, si_create_shader_selector,
+ si_destroy_shader_selector);
}
void si_init_shader_functions(struct si_context *sctx)
{
- sctx->atoms.s.spi_map.emit = si_emit_spi_map;
- sctx->atoms.s.scratch_state.emit = si_emit_scratch_state;
-
- sctx->b.create_vs_state = si_create_shader;
- sctx->b.create_tcs_state = si_create_shader;
- sctx->b.create_tes_state = si_create_shader;
- sctx->b.create_gs_state = si_create_shader;
- sctx->b.create_fs_state = si_create_shader;
-
- sctx->b.bind_vs_state = si_bind_vs_shader;
- sctx->b.bind_tcs_state = si_bind_tcs_shader;
- sctx->b.bind_tes_state = si_bind_tes_shader;
- sctx->b.bind_gs_state = si_bind_gs_shader;
- sctx->b.bind_fs_state = si_bind_ps_shader;
-
- sctx->b.delete_vs_state = si_delete_shader_selector;
- sctx->b.delete_tcs_state = si_delete_shader_selector;
- sctx->b.delete_tes_state = si_delete_shader_selector;
- sctx->b.delete_gs_state = si_delete_shader_selector;
- sctx->b.delete_fs_state = si_delete_shader_selector;
+ sctx->atoms.s.spi_map.emit = si_emit_spi_map;
+ sctx->atoms.s.scratch_state.emit = si_emit_scratch_state;
+
+ sctx->b.create_vs_state = si_create_shader;
+ sctx->b.create_tcs_state = si_create_shader;
+ sctx->b.create_tes_state = si_create_shader;
+ sctx->b.create_gs_state = si_create_shader;
+ sctx->b.create_fs_state = si_create_shader;
+
+ sctx->b.bind_vs_state = si_bind_vs_shader;
+ sctx->b.bind_tcs_state = si_bind_tcs_shader;
+ sctx->b.bind_tes_state = si_bind_tes_shader;
+ sctx->b.bind_gs_state = si_bind_gs_shader;
+ sctx->b.bind_fs_state = si_bind_ps_shader;
+
+ sctx->b.delete_vs_state = si_delete_shader_selector;
+ sctx->b.delete_tcs_state = si_delete_shader_selector;
+ sctx->b.delete_tes_state = si_delete_shader_selector;
+ sctx->b.delete_gs_state = si_delete_shader_selector;
+ sctx->b.delete_fs_state = si_delete_shader_selector;
}
*/
#include "si_build_pm4.h"
-
#include "util/u_memory.h"
#include "util/u_suballoc.h"
static void si_set_streamout_enable(struct si_context *sctx, bool enable);
static inline void si_so_target_reference(struct si_streamout_target **dst,
- struct pipe_stream_output_target *src)
+ struct pipe_stream_output_target *src)
{
- pipe_so_target_reference((struct pipe_stream_output_target**)dst, src);
+ pipe_so_target_reference((struct pipe_stream_output_target **)dst, src);
}
-static struct pipe_stream_output_target *
-si_create_so_target(struct pipe_context *ctx,
- struct pipe_resource *buffer,
- unsigned buffer_offset,
- unsigned buffer_size)
+static struct pipe_stream_output_target *si_create_so_target(struct pipe_context *ctx,
+ struct pipe_resource *buffer,
+ unsigned buffer_offset,
+ unsigned buffer_size)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_streamout_target *t;
- struct si_resource *buf = si_resource(buffer);
-
- t = CALLOC_STRUCT(si_streamout_target);
- if (!t) {
- return NULL;
- }
-
- unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
- u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
- &t->buf_filled_size_offset,
- (struct pipe_resource**)&t->buf_filled_size);
- if (!t->buf_filled_size) {
- FREE(t);
- return NULL;
- }
-
- t->b.reference.count = 1;
- t->b.context = ctx;
- pipe_resource_reference(&t->b.buffer, buffer);
- t->b.buffer_offset = buffer_offset;
- t->b.buffer_size = buffer_size;
-
- util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset,
- buffer_offset + buffer_size);
- return &t->b;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_streamout_target *t;
+ struct si_resource *buf = si_resource(buffer);
+
+ t = CALLOC_STRUCT(si_streamout_target);
+ if (!t) {
+ return NULL;
+ }
+
+ unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
+ u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
+ &t->buf_filled_size_offset, (struct pipe_resource **)&t->buf_filled_size);
+ if (!t->buf_filled_size) {
+ FREE(t);
+ return NULL;
+ }
+
+ t->b.reference.count = 1;
+ t->b.context = ctx;
+ pipe_resource_reference(&t->b.buffer, buffer);
+ t->b.buffer_offset = buffer_offset;
+ t->b.buffer_size = buffer_size;
+
+ util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, buffer_offset + buffer_size);
+ return &t->b;
}
-static void si_so_target_destroy(struct pipe_context *ctx,
- struct pipe_stream_output_target *target)
+static void si_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target)
{
- struct si_streamout_target *t = (struct si_streamout_target*)target;
- pipe_resource_reference(&t->b.buffer, NULL);
- si_resource_reference(&t->buf_filled_size, NULL);
- FREE(t);
+ struct si_streamout_target *t = (struct si_streamout_target *)target;
+ pipe_resource_reference(&t->b.buffer, NULL);
+ si_resource_reference(&t->buf_filled_size, NULL);
+ FREE(t);
}
void si_streamout_buffers_dirty(struct si_context *sctx)
{
- if (!sctx->streamout.enabled_mask)
- return;
+ if (!sctx->streamout.enabled_mask)
+ return;
- si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
- si_set_streamout_enable(sctx, true);
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
+ si_set_streamout_enable(sctx, true);
}
-static void si_set_streamout_targets(struct pipe_context *ctx,
- unsigned num_targets,
- struct pipe_stream_output_target **targets,
- const unsigned *offsets)
+static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targets,
+ struct pipe_stream_output_target **targets,
+ const unsigned *offsets)
{
- struct si_context *sctx = (struct si_context *)ctx;
- unsigned old_num_targets = sctx->streamout.num_targets;
- unsigned i;
- bool wait_now = false;
-
- /* We are going to unbind the buffers. Mark which caches need to be flushed. */
- if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
- /* Since streamout uses vector writes which go through TC L2
- * and most other clients can use TC L2 as well, we don't need
- * to flush it.
- *
- * The only cases which requires flushing it is VGT DMA index
- * fetching (on <= GFX7) and indirect draw data, which are rare
- * cases. Thus, flag the TC L2 dirtiness in the resource and
- * handle it at draw call time.
- */
- for (i = 0; i < sctx->streamout.num_targets; i++)
- if (sctx->streamout.targets[i])
- si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
-
- /* Invalidate the scalar cache in case a streamout buffer is
- * going to be used as a constant buffer.
- *
- * Invalidate vL1, because streamout bypasses it (done by
- * setting GLC=1 in the store instruction), but vL1 in other
- * CUs can contain outdated data of streamout buffers.
- *
- * VS_PARTIAL_FLUSH is required if the buffers are going to be
- * used as an input immediately.
- */
- sctx->flags |= SI_CONTEXT_INV_SCACHE |
- SI_CONTEXT_INV_VCACHE;
-
- /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
- if (sctx->screen->use_ngg_streamout) {
- sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
-
- /* Wait now. This is needed to make sure that GDS is not
- * busy at the end of IBs.
- *
- * Also, the next streamout operation will overwrite GDS,
- * so we need to make sure that it's idle.
- */
- wait_now = true;
- } else {
- sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
- }
- }
-
- /* All readers of the streamout targets need to be finished before we can
- * start writing to the targets.
- */
- if (num_targets) {
- if (sctx->screen->use_ngg_streamout)
- si_allocate_gds(sctx);
-
- sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH;
- }
-
- /* Streamout buffers must be bound in 2 places:
- * 1) in VGT by setting the VGT_STRMOUT registers
- * 2) as shader resources
- */
-
- /* Stop streamout. */
- if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
- si_emit_streamout_end(sctx);
-
- /* Set the new targets. */
- unsigned enabled_mask = 0, append_bitmask = 0;
- for (i = 0; i < num_targets; i++) {
- si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
- if (!targets[i])
- continue;
-
- si_context_add_resource_size(sctx, targets[i]->buffer);
- enabled_mask |= 1 << i;
-
- if (offsets[i] == ((unsigned)-1))
- append_bitmask |= 1 << i;
- }
-
- for (; i < sctx->streamout.num_targets; i++)
- si_so_target_reference(&sctx->streamout.targets[i], NULL);
-
- sctx->streamout.enabled_mask = enabled_mask;
- sctx->streamout.num_targets = num_targets;
- sctx->streamout.append_bitmask = append_bitmask;
-
- /* Update dirty state bits. */
- if (num_targets) {
- si_streamout_buffers_dirty(sctx);
- } else {
- si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
- si_set_streamout_enable(sctx, false);
- }
-
- /* Set the shader resources.*/
- for (i = 0; i < num_targets; i++) {
- if (targets[i]) {
- struct pipe_shader_buffer sbuf;
- sbuf.buffer = targets[i]->buffer;
-
- if (sctx->screen->use_ngg_streamout) {
- sbuf.buffer_offset = targets[i]->buffer_offset;
- sbuf.buffer_size = targets[i]->buffer_size;
- } else {
- sbuf.buffer_offset = 0;
- sbuf.buffer_size = targets[i]->buffer_offset +
- targets[i]->buffer_size;
- }
-
- si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
- si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
- } else {
- si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
- }
- }
- for (; i < old_num_targets; i++)
- si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
-
- if (wait_now)
- sctx->emit_cache_flush(sctx);
+ struct si_context *sctx = (struct si_context *)ctx;
+ unsigned old_num_targets = sctx->streamout.num_targets;
+ unsigned i;
+ bool wait_now = false;
+
+ /* We are going to unbind the buffers. Mark which caches need to be flushed. */
+ if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
+ /* Since streamout uses vector writes which go through TC L2
+ * and most other clients can use TC L2 as well, we don't need
+ * to flush it.
+ *
+ * The only cases which require flushing it are VGT DMA index
+ * fetching (on <= GFX7) and indirect draw data, which are rare
+ * cases. Thus, flag the TC L2 dirtiness in the resource and
+ * handle it at draw call time.
+ */
+ for (i = 0; i < sctx->streamout.num_targets; i++)
+ if (sctx->streamout.targets[i])
+ si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;
+
+ /* Invalidate the scalar cache in case a streamout buffer is
+ * going to be used as a constant buffer.
+ *
+ * Invalidate vL1, because streamout bypasses it (done by
+ * setting GLC=1 in the store instruction), but vL1 in other
+ * CUs can contain outdated data of streamout buffers.
+ *
+ * VS_PARTIAL_FLUSH is required if the buffers are going to be
+ * used as an input immediately.
+ */
+ sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
+
+ /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
+ if (sctx->screen->use_ngg_streamout) {
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+
+ /* Wait now. This is needed to make sure that GDS is not
+ * busy at the end of IBs.
+ *
+ * Also, the next streamout operation will overwrite GDS,
+ * so we need to make sure that it's idle.
+ */
+ wait_now = true;
+ } else {
+ sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+ }
+ }
+
+ /* All readers of the streamout targets need to be finished before we can
+ * start writing to the targets.
+ */
+ if (num_targets) {
+ if (sctx->screen->use_ngg_streamout)
+ si_allocate_gds(sctx);
+
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+ }
+
+ /* Streamout buffers must be bound in 2 places:
+ * 1) in VGT by setting the VGT_STRMOUT registers
+ * 2) as shader resources
+ */
+
+ /* Stop streamout. */
+ if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
+ si_emit_streamout_end(sctx);
+
+ /* Set the new targets. */
+ unsigned enabled_mask = 0, append_bitmask = 0;
+ for (i = 0; i < num_targets; i++) {
+ si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
+ if (!targets[i])
+ continue;
+
+ si_context_add_resource_size(sctx, targets[i]->buffer);
+ enabled_mask |= 1 << i;
+
+ if (offsets[i] == ((unsigned)-1))
+ append_bitmask |= 1 << i;
+ }
+
+ for (; i < sctx->streamout.num_targets; i++)
+ si_so_target_reference(&sctx->streamout.targets[i], NULL);
+
+ sctx->streamout.enabled_mask = enabled_mask;
+ sctx->streamout.num_targets = num_targets;
+ sctx->streamout.append_bitmask = append_bitmask;
+
+ /* Update dirty state bits. */
+ if (num_targets) {
+ si_streamout_buffers_dirty(sctx);
+ } else {
+ si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
+ si_set_streamout_enable(sctx, false);
+ }
+
+ /* Set the shader resources. */
+ for (i = 0; i < num_targets; i++) {
+ if (targets[i]) {
+ struct pipe_shader_buffer sbuf;
+ sbuf.buffer = targets[i]->buffer;
+
+ if (sctx->screen->use_ngg_streamout) {
+ sbuf.buffer_offset = targets[i]->buffer_offset;
+ sbuf.buffer_size = targets[i]->buffer_size;
+ } else {
+ sbuf.buffer_offset = 0;
+ sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size;
+ }
+
+ si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
+ si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
+ } else {
+ si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
+ }
+ }
+ for (; i < old_num_targets; i++)
+ si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
+
+ if (wait_now)
+ sctx->emit_cache_flush(sctx);
}
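
As a usage illustration, here is a minimal frontend-side sketch of driving the hooks above. This is not part of the patch; `buf` and the helper name are hypothetical, and it assumes the standard gallium headers.

#include "pipe/p_context.h"
#include "pipe/p_state.h"

static void bind_one_so_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
{
   /* Cover the whole buffer with a single streamout target. */
   struct pipe_stream_output_target *target =
      ctx->create_stream_output_target(ctx, buf, 0, buf->width0);

   /* An offset of -1 requests "append": the driver restores the write
    * position from BUFFER_FILLED_SIZE instead of using the given offset. */
   unsigned offset = (unsigned)-1;
   ctx->set_stream_output_targets(ctx, 1, &target, &offset);

   /* ... draw with streamout enabled ... */

   /* Unbind everything and release the frontend's reference. */
   ctx->set_stream_output_targets(ctx, 0, NULL, NULL);
   ctx->stream_output_target_destroy(ctx, target);
}
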
static void gfx10_emit_streamout_begin(struct si_context *sctx)
{
- struct si_streamout_target **t = sctx->streamout.targets;
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned last_target = 0;
-
- for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
- if (t[i])
- last_target = i;
- }
-
- for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
- if (!t[i])
- continue;
-
- t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
-
- bool append = sctx->streamout.append_bitmask & (1 << i);
- uint64_t va = 0;
-
- if (append) {
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- t[i]->buf_filled_size,
- RADEON_USAGE_READ,
- RADEON_PRIO_SO_FILLED_SIZE);
-
- va = t[i]->buf_filled_size->gpu_address +
- t[i]->buf_filled_size_offset;
- }
-
- radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
- radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
- S_411_DST_SEL(V_411_GDS) |
- S_411_CP_SYNC(i == last_target));
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
- radeon_emit(cs, 4 * i); /* destination in GDS */
- radeon_emit(cs, 0);
- radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) |
- S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
- }
-
- sctx->streamout.begin_emitted = true;
+ struct si_streamout_target **t = sctx->streamout.targets;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned last_target = 0;
+
+ for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
+ if (t[i])
+ last_target = i;
+ }
+
+ for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
+ if (!t[i])
+ continue;
+
+ t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
+
+ bool append = sctx->streamout.append_bitmask & (1 << i);
+ uint64_t va = 0;
+
+ if (append) {
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
+ RADEON_PRIO_SO_FILLED_SIZE);
+
+ va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
+ }
+
+ radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+ radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
+ S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
+ radeon_emit(cs, va);
+ radeon_emit(cs, va >> 32);
+ radeon_emit(cs, 4 * i); /* destination in GDS */
+ radeon_emit(cs, 0);
+ radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) | S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
+ }
+
+ sctx->streamout.begin_emitted = true;
}
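
A conceptual sketch (not driver code) of the GDS layout the DMA_DATA packets above assume: one dword per streamout buffer at byte offset 4 * i, loaded from the buffer's filled-size location in memory when appending and zero-filled otherwise.

#include <stdint.h>

struct gds_streamout_counters {
   uint32_t buffer_filled_size[4]; /* buffer i lives at GDS byte offset 4 * i */
};
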
static void gfx10_emit_streamout_end(struct si_context *sctx)
{
- struct si_streamout_target **t = sctx->streamout.targets;
+ struct si_streamout_target **t = sctx->streamout.targets;
- for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
- if (!t[i])
- continue;
+ for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
+ if (!t[i])
+ continue;
- uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
+ uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
- si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0,
- EOP_DST_SEL_TC_L2,
- EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
- EOP_DATA_SEL_GDS,
- t[i]->buf_filled_size, va,
- EOP_DATA_GDS(i, 1), 0);
+ si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
+ EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
+ t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0);
- t[i]->buf_filled_size_valid = true;
- }
+ t[i]->buf_filled_size_valid = true;
+ }
- sctx->streamout.begin_emitted = false;
+ sctx->streamout.begin_emitted = false;
}
static void si_flush_vgt_streamout(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- unsigned reg_strmout_cntl;
-
- /* The register is at different places on different ASICs. */
- if (sctx->chip_class >= GFX7) {
- reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
- radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
- } else {
- reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
- radeon_set_config_reg(cs, reg_strmout_cntl, 0);
- }
-
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
-
- radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
- radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
- radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
- radeon_emit(cs, 0);
- radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
- radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
- radeon_emit(cs, 4); /* poll interval */
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ unsigned reg_strmout_cntl;
+
+ /* The register is at different places on different ASICs. */
+ if (sctx->chip_class >= GFX7) {
+ reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
+ radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
+ } else {
+ reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
+ radeon_set_config_reg(cs, reg_strmout_cntl, 0);
+ }
+
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
+
+ radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+ radeon_emit(cs,
+ WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
+ radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
+ radeon_emit(cs, 0);
+ radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
+ radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
+ radeon_emit(cs, 4); /* poll interval */
}
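
A conceptual model (not CP microcode) of what the WAIT_REG_MEM packet emitted above asks the command processor to do:

#include <stdint.h>

/* WAIT_REG_MEM with function EQUAL: keep re-reading the register until the
 * masked value matches the reference; the real CP also honours the poll
 * interval given in the last packet dword (4 clocks here). */
static void cp_wait_reg_mem_equal(volatile const uint32_t *reg, uint32_t ref,
                                  uint32_t mask)
{
   while ((*reg & mask) != ref)
      ; /* poll */
}
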
static void si_emit_streamout_begin(struct si_context *sctx)
{
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- struct si_streamout_target **t = sctx->streamout.targets;
- uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
- unsigned i;
-
- si_flush_vgt_streamout(sctx);
-
- for (i = 0; i < sctx->streamout.num_targets; i++) {
- if (!t[i])
- continue;
-
- t[i]->stride_in_dw = stride_in_dw[i];
-
- /* AMD GCN binds streamout buffers as shader resources.
- * VGT only counts primitives and tells the shader
- * through SGPRs what to do. */
- radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
- radeon_emit(cs, (t[i]->b.buffer_offset +
- t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
- radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */
-
- if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
- uint64_t va = t[i]->buf_filled_size->gpu_address +
- t[i]->buf_filled_size_offset;
-
- /* Append. */
- radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
- radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
- STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
- radeon_emit(cs, 0); /* unused */
- radeon_emit(cs, 0); /* unused */
- radeon_emit(cs, va); /* src address lo */
- radeon_emit(cs, va >> 32); /* src address hi */
-
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- t[i]->buf_filled_size,
- RADEON_USAGE_READ,
- RADEON_PRIO_SO_FILLED_SIZE);
- } else {
- /* Start from the beginning. */
- radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
- radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
- STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
- radeon_emit(cs, 0); /* unused */
- radeon_emit(cs, 0); /* unused */
- radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
- radeon_emit(cs, 0); /* unused */
- }
- }
-
- sctx->streamout.begin_emitted = true;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct si_streamout_target **t = sctx->streamout.targets;
+ uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
+ unsigned i;
+
+ si_flush_vgt_streamout(sctx);
+
+ for (i = 0; i < sctx->streamout.num_targets; i++) {
+ if (!t[i])
+ continue;
+
+ t[i]->stride_in_dw = stride_in_dw[i];
+
+ /* AMD GCN binds streamout buffers as shader resources.
+ * VGT only counts primitives and tells the shader
+ * through SGPRs what to do. */
+ radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
+ radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
+ radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */
+
+ if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
+ uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
+
+ /* Append. */
+ radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
+ radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
+ STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
+ radeon_emit(cs, 0); /* unused */
+ radeon_emit(cs, 0); /* unused */
+ radeon_emit(cs, va); /* src address lo */
+ radeon_emit(cs, va >> 32); /* src address hi */
+
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
+ RADEON_PRIO_SO_FILLED_SIZE);
+ } else {
+ /* Start from the beginning. */
+ radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
+ radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
+ STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
+ radeon_emit(cs, 0); /* unused */
+ radeon_emit(cs, 0); /* unused */
+ radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
+ radeon_emit(cs, 0); /* unused */
+ }
+ }
+
+ sctx->streamout.begin_emitted = true;
}
void si_emit_streamout_end(struct si_context *sctx)
{
- if (sctx->screen->use_ngg_streamout) {
- gfx10_emit_streamout_end(sctx);
- return;
- }
-
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- struct si_streamout_target **t = sctx->streamout.targets;
- unsigned i;
- uint64_t va;
-
- si_flush_vgt_streamout(sctx);
-
- for (i = 0; i < sctx->streamout.num_targets; i++) {
- if (!t[i])
- continue;
-
- va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
- radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
- radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
- STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
- STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
- radeon_emit(cs, va); /* dst address lo */
- radeon_emit(cs, va >> 32); /* dst address hi */
- radeon_emit(cs, 0); /* unused */
- radeon_emit(cs, 0); /* unused */
-
- radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
- t[i]->buf_filled_size,
- RADEON_USAGE_WRITE,
- RADEON_PRIO_SO_FILLED_SIZE);
-
- /* Zero the buffer size. The counters (primitives generated,
- * primitives emitted) may be enabled even if there is not
- * buffer bound. This ensures that the primitives-emitted query
- * won't increment. */
- radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
- sctx->context_roll = true;
-
- t[i]->buf_filled_size_valid = true;
- }
-
- sctx->streamout.begin_emitted = false;
+ if (sctx->screen->use_ngg_streamout) {
+ gfx10_emit_streamout_end(sctx);
+ return;
+ }
+
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ struct si_streamout_target **t = sctx->streamout.targets;
+ unsigned i;
+ uint64_t va;
+
+ si_flush_vgt_streamout(sctx);
+
+ for (i = 0; i < sctx->streamout.num_targets; i++) {
+ if (!t[i])
+ continue;
+
+ va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
+ radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
+ radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
+ STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
+ radeon_emit(cs, va); /* dst address lo */
+ radeon_emit(cs, va >> 32); /* dst address hi */
+ radeon_emit(cs, 0); /* unused */
+ radeon_emit(cs, 0); /* unused */
+
+ radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_WRITE,
+ RADEON_PRIO_SO_FILLED_SIZE);
+
+ /* Zero the buffer size. The counters (primitives generated,
+ * primitives emitted) may be enabled even if there is no
+ * buffer bound. This ensures that the primitives-emitted query
+ * won't increment. */
+ radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
+ sctx->context_roll = true;
+
+ t[i]->buf_filled_size_valid = true;
+ }
+
+ sctx->streamout.begin_emitted = false;
}
/* STREAMOUT CONFIG DERIVED STATE
static void si_emit_streamout_enable(struct si_context *sctx)
{
- assert(!sctx->screen->use_ngg_streamout);
-
- radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
- radeon_emit(sctx->gfx_cs,
- S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
- S_028B94_RAST_STREAM(0) |
- S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
- S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
- S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
- radeon_emit(sctx->gfx_cs,
- sctx->streamout.hw_enabled_mask &
- sctx->streamout.enabled_stream_buffers_mask);
+ assert(!sctx->screen->use_ngg_streamout);
+
+ radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
+ radeon_emit(sctx->gfx_cs, S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
+ S_028B94_RAST_STREAM(0) |
+ S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
+ S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
+ S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
+ radeon_emit(sctx->gfx_cs,
+ sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
}
static void si_set_streamout_enable(struct si_context *sctx, bool enable)
{
- bool old_strmout_en = si_get_strmout_en(sctx);
- unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
+ bool old_strmout_en = si_get_strmout_en(sctx);
+ unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
- sctx->streamout.streamout_enabled = enable;
+ sctx->streamout.streamout_enabled = enable;
- sctx->streamout.hw_enabled_mask = sctx->streamout.enabled_mask |
- (sctx->streamout.enabled_mask << 4) |
- (sctx->streamout.enabled_mask << 8) |
- (sctx->streamout.enabled_mask << 12);
+ sctx->streamout.hw_enabled_mask =
+ sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
+ (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);
- if (!sctx->screen->use_ngg_streamout &&
- ((old_strmout_en != si_get_strmout_en(sctx)) ||
- (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
- si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
+ if (!sctx->screen->use_ngg_streamout &&
+ ((old_strmout_en != si_get_strmout_en(sctx)) ||
+ (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
}
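
The hw_enabled_mask computation above replicates the 4-bit buffer mask once per vertex stream; a tiny standalone example of the resulting value (the mask value is hypothetical):

#include <stdio.h>

int main(void)
{
   unsigned enabled_mask = 0x5; /* buffers 0 and 2 bound */
   unsigned hw_enabled_mask = enabled_mask | (enabled_mask << 4) |
                              (enabled_mask << 8) | (enabled_mask << 12);

   /* One nibble per stream: prints "0x5 -> 0x5555". */
   printf("0x%x -> 0x%04x\n", enabled_mask, hw_enabled_mask);
   return 0;
}
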
-void si_update_prims_generated_query_state(struct si_context *sctx,
- unsigned type, int diff)
+void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff)
{
- if (!sctx->screen->use_ngg_streamout &&
- type == PIPE_QUERY_PRIMITIVES_GENERATED) {
- bool old_strmout_en = si_get_strmout_en(sctx);
+ if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) {
+ bool old_strmout_en = si_get_strmout_en(sctx);
- sctx->streamout.num_prims_gen_queries += diff;
- assert(sctx->streamout.num_prims_gen_queries >= 0);
+ sctx->streamout.num_prims_gen_queries += diff;
+ assert(sctx->streamout.num_prims_gen_queries >= 0);
- sctx->streamout.prims_gen_query_enabled =
- sctx->streamout.num_prims_gen_queries != 0;
+ sctx->streamout.prims_gen_query_enabled = sctx->streamout.num_prims_gen_queries != 0;
- if (old_strmout_en != si_get_strmout_en(sctx))
- si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
+ if (old_strmout_en != si_get_strmout_en(sctx))
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
- if (si_update_ngg(sctx)) {
- si_shader_change_notify(sctx);
- sctx->do_update_shaders = true;
- }
- }
+ if (si_update_ngg(sctx)) {
+ si_shader_change_notify(sctx);
+ sctx->do_update_shaders = true;
+ }
+ }
}
void si_init_streamout_functions(struct si_context *sctx)
{
- sctx->b.create_stream_output_target = si_create_so_target;
- sctx->b.stream_output_target_destroy = si_so_target_destroy;
- sctx->b.set_stream_output_targets = si_set_streamout_targets;
-
- if (sctx->screen->use_ngg_streamout) {
- sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
- } else {
- sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
- sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
- }
+ sctx->b.create_stream_output_target = si_create_so_target;
+ sctx->b.stream_output_target_destroy = si_so_target_destroy;
+ sctx->b.set_stream_output_targets = si_set_streamout_targets;
+
+ if (sctx->screen->use_ngg_streamout) {
+ sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
+ } else {
+ sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
+ sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
+ }
}
void si_update_ngg_small_prim_precision(struct si_context *ctx)
{
- if (!ctx->screen->use_ngg_culling)
- return;
-
- /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. */
- unsigned num_samples = ctx->framebuffer.nr_samples;
- unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode;
- float precision;
-
- if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
- precision = num_samples / 4096.0;
- else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
- precision = num_samples / 1024.0;
- else
- precision = num_samples / 256.0;
-
- ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
- ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23);
+ if (!ctx->screen->use_ngg_culling)
+ return;
+
+ /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. */
+ unsigned num_samples = ctx->framebuffer.nr_samples;
+ unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode;
+ float precision;
+
+ if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
+ precision = num_samples / 4096.0;
+ else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
+ precision = num_samples / 1024.0;
+ else
+ precision = num_samples / 256.0;
+
+ ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
+ ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23);
}
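
The fui(precision) >> 23 trick above works because num_samples divided by 4096.0 (or the other divisors) is always an exact power of two, so the float's sign and exponent byte describe it completely. A standalone worked example, using a local reimplementation of Mesa's fui():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t fui(float f) /* reinterpret float bits, like Mesa's fui() */
{
   uint32_t u;
   memcpy(&u, &f, sizeof(u));
   return u;
}

int main(void)
{
   /* 4xMSAA with the 12.12 quantization mode (1/4096th of a pixel). */
   float precision = 4 / 4096.0f; /* = 2^-10, mantissa is all zeros */

   /* Prints 117, i.e. the IEEE-754 biased exponent 127 - 10. */
   printf("SMALL_PRIM_PRECISION field = %u\n", fui(precision) >> 23);
   return 0;
}
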
-void si_get_small_prim_cull_info(struct si_context *sctx,
- struct si_small_prim_cull_info *out)
+void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out)
{
- /* This is needed by the small primitive culling, because it's done
- * in screen space.
- */
- struct si_small_prim_cull_info info;
- unsigned num_samples = sctx->framebuffer.nr_samples;
- assert(num_samples >= 1);
-
- info.scale[0] = sctx->viewports.states[0].scale[0];
- info.scale[1] = sctx->viewports.states[0].scale[1];
- info.translate[0] = sctx->viewports.states[0].translate[0];
- info.translate[1] = sctx->viewports.states[0].translate[1];
-
- /* The viewport shouldn't flip the X axis for the small prim culling to work. */
- assert(-info.scale[0] + info.translate[0] <= info.scale[0] + info.translate[0]);
-
- /* If the Y axis is inverted (OpenGL default framebuffer), reverse it.
- * This is because the viewport transformation inverts the clip space
- * bounding box, so min becomes max, which breaks small primitive
- * culling.
- */
- if (sctx->viewports.y_inverted) {
- info.scale[1] = -info.scale[1];
- info.translate[1] = -info.translate[1];
- }
-
- /* Scale the framebuffer up, so that samples become pixels and small
- * primitive culling is the same for all sample counts.
- * This only works with the standard DX sample positions, because
- * the samples are evenly spaced on both X and Y axes.
- */
- for (unsigned i = 0; i < 2; i++) {
- info.scale[i] *= num_samples;
- info.translate[i] *= num_samples;
- }
- *out = info;
+ /* This is needed by small primitive culling, because it's done
+ * in screen space.
+ */
+ struct si_small_prim_cull_info info;
+ unsigned num_samples = sctx->framebuffer.nr_samples;
+ assert(num_samples >= 1);
+
+ info.scale[0] = sctx->viewports.states[0].scale[0];
+ info.scale[1] = sctx->viewports.states[0].scale[1];
+ info.translate[0] = sctx->viewports.states[0].translate[0];
+ info.translate[1] = sctx->viewports.states[0].translate[1];
+
+ /* The viewport shouldn't flip the X axis for the small prim culling to work. */
+ assert(-info.scale[0] + info.translate[0] <= info.scale[0] + info.translate[0]);
+
+ /* If the Y axis is inverted (OpenGL default framebuffer), reverse it.
+ * This is because the viewport transformation inverts the clip space
+ * bounding box, so min becomes max, which breaks small primitive
+ * culling.
+ */
+ if (sctx->viewports.y_inverted) {
+ info.scale[1] = -info.scale[1];
+ info.translate[1] = -info.translate[1];
+ }
+
+ /* Scale the framebuffer up, so that samples become pixels and small
+ * primitive culling is the same for all sample counts.
+ * This only works with the standard DX sample positions, because
+ * the samples are evenly spaced on both X and Y axes.
+ */
+ for (unsigned i = 0; i < 2; i++) {
+ info.scale[i] *= num_samples;
+ info.translate[i] *= num_samples;
+ }
+ *out = info;
}
-static void si_set_scissor_states(struct pipe_context *pctx,
- unsigned start_slot,
- unsigned num_scissors,
- const struct pipe_scissor_state *state)
+static void si_set_scissor_states(struct pipe_context *pctx, unsigned start_slot,
+ unsigned num_scissors, const struct pipe_scissor_state *state)
{
- struct si_context *ctx = (struct si_context *)pctx;
- int i;
+ struct si_context *ctx = (struct si_context *)pctx;
+ int i;
- for (i = 0; i < num_scissors; i++)
- ctx->scissors[start_slot + i] = state[i];
+ for (i = 0; i < num_scissors; i++)
+ ctx->scissors[start_slot + i] = state[i];
- if (!ctx->queued.named.rasterizer->scissor_enable)
- return;
+ if (!ctx->queued.named.rasterizer->scissor_enable)
+ return;
- si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
}
/* Since the guard band disables clipping, we have to clip per-pixel
* using a scissor.
*/
static void si_get_scissor_from_viewport(struct si_context *ctx,
- const struct pipe_viewport_state *vp,
- struct si_signed_scissor *scissor)
+ const struct pipe_viewport_state *vp,
+ struct si_signed_scissor *scissor)
{
- float tmp, minx, miny, maxx, maxy;
-
- /* Convert (-1, -1) and (1, 1) from clip space into window space. */
- minx = -vp->scale[0] + vp->translate[0];
- miny = -vp->scale[1] + vp->translate[1];
- maxx = vp->scale[0] + vp->translate[0];
- maxy = vp->scale[1] + vp->translate[1];
-
- /* Handle inverted viewports. */
- if (minx > maxx) {
- tmp = minx;
- minx = maxx;
- maxx = tmp;
- }
- if (miny > maxy) {
- tmp = miny;
- miny = maxy;
- maxy = tmp;
- }
-
- /* Convert to integer and round up the max bounds. */
- scissor->minx = minx;
- scissor->miny = miny;
- scissor->maxx = ceilf(maxx);
- scissor->maxy = ceilf(maxy);
+ float tmp, minx, miny, maxx, maxy;
+
+ /* Convert (-1, -1) and (1, 1) from clip space into window space. */
+ minx = -vp->scale[0] + vp->translate[0];
+ miny = -vp->scale[1] + vp->translate[1];
+ maxx = vp->scale[0] + vp->translate[0];
+ maxy = vp->scale[1] + vp->translate[1];
+
+ /* Handle inverted viewports. */
+ if (minx > maxx) {
+ tmp = minx;
+ minx = maxx;
+ maxx = tmp;
+ }
+ if (miny > maxy) {
+ tmp = miny;
+ miny = maxy;
+ maxy = tmp;
+ }
+
+ /* Convert to integer and round up the max bounds. */
+ scissor->minx = minx;
+ scissor->miny = miny;
+ scissor->maxx = ceilf(maxx);
+ scissor->maxy = ceilf(maxy);
}
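
A standalone worked example of the clip-space to window-space conversion above, assuming the usual gallium viewport setup where scale is half the extent and translate is the center (the 1920x1080 numbers are hypothetical):

#include <math.h>
#include <stdio.h>

int main(void)
{
   float scale[2]     = {1920 / 2.0f, 1080 / 2.0f};
   float translate[2] = {1920 / 2.0f, 1080 / 2.0f};

   /* Clip-space (-1,-1)..(1,1) maps to window-space (0,0)..(1920,1080). */
   float minx = -scale[0] + translate[0];
   float maxx =  scale[0] + translate[0];
   float miny = -scale[1] + translate[1];
   float maxy =  scale[1] + translate[1];

   printf("scissor = (%d,%d)..(%d,%d)\n",
          (int)minx, (int)miny, (int)ceilf(maxx), (int)ceilf(maxy));
   return 0;
}
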
-static void si_clamp_scissor(struct si_context *ctx,
- struct pipe_scissor_state *out,
- struct si_signed_scissor *scissor)
+static void si_clamp_scissor(struct si_context *ctx, struct pipe_scissor_state *out,
+ struct si_signed_scissor *scissor)
{
- out->minx = CLAMP(scissor->minx, 0, SI_MAX_SCISSOR);
- out->miny = CLAMP(scissor->miny, 0, SI_MAX_SCISSOR);
- out->maxx = CLAMP(scissor->maxx, 0, SI_MAX_SCISSOR);
- out->maxy = CLAMP(scissor->maxy, 0, SI_MAX_SCISSOR);
+ out->minx = CLAMP(scissor->minx, 0, SI_MAX_SCISSOR);
+ out->miny = CLAMP(scissor->miny, 0, SI_MAX_SCISSOR);
+ out->maxx = CLAMP(scissor->maxx, 0, SI_MAX_SCISSOR);
+ out->maxy = CLAMP(scissor->maxy, 0, SI_MAX_SCISSOR);
}
-static void si_clip_scissor(struct pipe_scissor_state *out,
- struct pipe_scissor_state *clip)
+static void si_clip_scissor(struct pipe_scissor_state *out, struct pipe_scissor_state *clip)
{
- out->minx = MAX2(out->minx, clip->minx);
- out->miny = MAX2(out->miny, clip->miny);
- out->maxx = MIN2(out->maxx, clip->maxx);
- out->maxy = MIN2(out->maxy, clip->maxy);
+ out->minx = MAX2(out->minx, clip->minx);
+ out->miny = MAX2(out->miny, clip->miny);
+ out->maxx = MIN2(out->maxx, clip->maxx);
+ out->maxy = MIN2(out->maxy, clip->maxy);
}
-static void si_scissor_make_union(struct si_signed_scissor *out,
- struct si_signed_scissor *in)
+static void si_scissor_make_union(struct si_signed_scissor *out, struct si_signed_scissor *in)
{
- out->minx = MIN2(out->minx, in->minx);
- out->miny = MIN2(out->miny, in->miny);
- out->maxx = MAX2(out->maxx, in->maxx);
- out->maxy = MAX2(out->maxy, in->maxy);
- out->quant_mode = MIN2(out->quant_mode, in->quant_mode);
+ out->minx = MIN2(out->minx, in->minx);
+ out->miny = MIN2(out->miny, in->miny);
+ out->maxx = MAX2(out->maxx, in->maxx);
+ out->maxy = MAX2(out->maxy, in->maxy);
+ out->quant_mode = MIN2(out->quant_mode, in->quant_mode);
}
-static void si_emit_one_scissor(struct si_context *ctx,
- struct radeon_cmdbuf *cs,
- struct si_signed_scissor *vp_scissor,
- struct pipe_scissor_state *scissor)
+static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs,
+ struct si_signed_scissor *vp_scissor,
+ struct pipe_scissor_state *scissor)
{
- struct pipe_scissor_state final;
-
- if (ctx->vs_disables_clipping_viewport) {
- final.minx = final.miny = 0;
- final.maxx = final.maxy = SI_MAX_SCISSOR;
- } else {
- si_clamp_scissor(ctx, &final, vp_scissor);
- }
-
- if (scissor)
- si_clip_scissor(&final, scissor);
-
- /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_-
- * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
- */
- if (ctx->chip_class == GFX6 && (final.maxx == 0 || final.maxy == 0)) {
- radeon_emit(cs, S_028250_TL_X(1) |
- S_028250_TL_Y(1) |
- S_028250_WINDOW_OFFSET_DISABLE(1));
- radeon_emit(cs, S_028254_BR_X(1) |
- S_028254_BR_Y(1));
- return;
- }
-
- radeon_emit(cs, S_028250_TL_X(final.minx) |
- S_028250_TL_Y(final.miny) |
- S_028250_WINDOW_OFFSET_DISABLE(1));
- radeon_emit(cs, S_028254_BR_X(final.maxx) |
- S_028254_BR_Y(final.maxy));
+ struct pipe_scissor_state final;
+
+ if (ctx->vs_disables_clipping_viewport) {
+ final.minx = final.miny = 0;
+ final.maxx = final.maxy = SI_MAX_SCISSOR;
+ } else {
+ si_clamp_scissor(ctx, &final, vp_scissor);
+ }
+
+ if (scissor)
+ si_clip_scissor(&final, scissor);
+
+ /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_-
+ * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
+ */
+ if (ctx->chip_class == GFX6 && (final.maxx == 0 || final.maxy == 0)) {
+ radeon_emit(cs, S_028250_TL_X(1) | S_028250_TL_Y(1) | S_028250_WINDOW_OFFSET_DISABLE(1));
+ radeon_emit(cs, S_028254_BR_X(1) | S_028254_BR_Y(1));
+ return;
+ }
+
+ radeon_emit(cs, S_028250_TL_X(final.minx) | S_028250_TL_Y(final.miny) |
+ S_028250_WINDOW_OFFSET_DISABLE(1));
+ radeon_emit(cs, S_028254_BR_X(final.maxx) | S_028254_BR_Y(final.maxy));
}
#define MAX_PA_SU_HARDWARE_SCREEN_OFFSET 8176
static void si_emit_guardband(struct si_context *ctx)
{
- const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer;
- struct si_signed_scissor vp_as_scissor;
- struct pipe_viewport_state vp;
- float left, top, right, bottom, max_range, guardband_x, guardband_y;
- float discard_x, discard_y;
-
- if (ctx->vs_writes_viewport_index) {
- /* Shaders can draw to any viewport. Make a union of all
- * viewports. */
- vp_as_scissor = ctx->viewports.as_scissor[0];
- for (unsigned i = 1; i < SI_MAX_VIEWPORTS; i++) {
- si_scissor_make_union(&vp_as_scissor,
- &ctx->viewports.as_scissor[i]);
- }
- } else {
- vp_as_scissor = ctx->viewports.as_scissor[0];
- }
-
- /* Blits don't set the viewport state. The vertex shader determines
- * the viewport size by scaling the coordinates, so we don't know
- * how large the viewport is. Assume the worst case.
- */
- if (ctx->vs_disables_clipping_viewport)
- vp_as_scissor.quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
-
- /* Determine the optimal hardware screen offset to center the viewport
- * within the viewport range in order to maximize the guardband size.
- */
- int hw_screen_offset_x = (vp_as_scissor.maxx + vp_as_scissor.minx) / 2;
- int hw_screen_offset_y = (vp_as_scissor.maxy + vp_as_scissor.miny) / 2;
-
- /* GFX6-GFX7 need to align the offset to an ubertile consisting of all SEs. */
- const unsigned hw_screen_offset_alignment =
- ctx->chip_class >= GFX8 ? 16 : MAX2(ctx->screen->se_tile_repeat, 16);
-
- /* Indexed by quantization modes */
- static int max_viewport_size[] = {65535, 16383, 4095};
-
- /* Ensure that the whole viewport stays representable in
- * absolute coordinates.
- * See comment in si_set_viewport_states.
- */
- assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] &&
- vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]);
-
- hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
- hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
-
- /* Align the screen offset by dropping the low bits. */
- hw_screen_offset_x &= ~(hw_screen_offset_alignment - 1);
- hw_screen_offset_y &= ~(hw_screen_offset_alignment - 1);
-
- /* Apply the offset to center the viewport and maximize the guardband. */
- vp_as_scissor.minx -= hw_screen_offset_x;
- vp_as_scissor.maxx -= hw_screen_offset_x;
- vp_as_scissor.miny -= hw_screen_offset_y;
- vp_as_scissor.maxy -= hw_screen_offset_y;
-
- /* Reconstruct the viewport transformation from the scissor. */
- vp.translate[0] = (vp_as_scissor.minx + vp_as_scissor.maxx) / 2.0;
- vp.translate[1] = (vp_as_scissor.miny + vp_as_scissor.maxy) / 2.0;
- vp.scale[0] = vp_as_scissor.maxx - vp.translate[0];
- vp.scale[1] = vp_as_scissor.maxy - vp.translate[1];
-
- /* Treat a 0x0 viewport as 1x1 to prevent division by zero. */
- if (vp_as_scissor.minx == vp_as_scissor.maxx)
- vp.scale[0] = 0.5;
- if (vp_as_scissor.miny == vp_as_scissor.maxy)
- vp.scale[1] = 0.5;
-
- /* Find the biggest guard band that is inside the supported viewport
- * range. The guard band is specified as a horizontal and vertical
- * distance from (0,0) in clip space.
- *
- * This is done by applying the inverse viewport transformation
- * on the viewport limits to get those limits in clip space.
- *
- * The viewport range is [-max_viewport_size/2, max_viewport_size/2].
- */
- assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size));
- max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2;
- left = (-max_range - vp.translate[0]) / vp.scale[0];
- right = ( max_range - vp.translate[0]) / vp.scale[0];
- top = (-max_range - vp.translate[1]) / vp.scale[1];
- bottom = ( max_range - vp.translate[1]) / vp.scale[1];
-
- assert(left <= -1 && top <= -1 && right >= 1 && bottom >= 1);
-
- guardband_x = MIN2(-left, right);
- guardband_y = MIN2(-top, bottom);
-
- discard_x = 1.0;
- discard_y = 1.0;
-
- if (unlikely(util_prim_is_points_or_lines(ctx->current_rast_prim))) {
- /* When rendering wide points or lines, we need to be more
- * conservative about when to discard them entirely. */
- float pixels;
-
- if (ctx->current_rast_prim == PIPE_PRIM_POINTS)
- pixels = rs->max_point_size;
- else
- pixels = rs->line_width;
-
- /* Add half the point size / line width */
- discard_x += pixels / (2.0 * vp.scale[0]);
- discard_y += pixels / (2.0 * vp.scale[1]);
-
- /* Discard primitives that would lie entirely outside the clip
- * region. */
- discard_x = MIN2(discard_x, guardband_x);
- discard_y = MIN2(discard_y, guardband_y);
- }
-
- /* If any of the GB registers is updated, all of them must be updated.
- * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ
- * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ
- */
- unsigned initial_cdw = ctx->gfx_cs->current.cdw;
- radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ,
- SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ,
- fui(guardband_y), fui(discard_y),
- fui(guardband_x), fui(discard_x));
- radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,
- SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
- S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) |
- S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4));
- radeon_opt_set_context_reg(ctx, R_028BE4_PA_SU_VTX_CNTL,
- SI_TRACKED_PA_SU_VTX_CNTL,
- S_028BE4_PIX_CENTER(rs->half_pixel_center) |
- S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH +
- vp_as_scissor.quant_mode));
- if (initial_cdw != ctx->gfx_cs->current.cdw)
- ctx->context_roll = true;
-
- si_update_ngg_small_prim_precision(ctx);
+ const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer;
+ struct si_signed_scissor vp_as_scissor;
+ struct pipe_viewport_state vp;
+ float left, top, right, bottom, max_range, guardband_x, guardband_y;
+ float discard_x, discard_y;
+
+ if (ctx->vs_writes_viewport_index) {
+ /* Shaders can draw to any viewport. Make a union of all
+ * viewports. */
+ vp_as_scissor = ctx->viewports.as_scissor[0];
+ for (unsigned i = 1; i < SI_MAX_VIEWPORTS; i++) {
+ si_scissor_make_union(&vp_as_scissor, &ctx->viewports.as_scissor[i]);
+ }
+ } else {
+ vp_as_scissor = ctx->viewports.as_scissor[0];
+ }
+
+ /* Blits don't set the viewport state. The vertex shader determines
+ * the viewport size by scaling the coordinates, so we don't know
+ * how large the viewport is. Assume the worst case.
+ */
+ if (ctx->vs_disables_clipping_viewport)
+ vp_as_scissor.quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
+
+ /* Determine the optimal hardware screen offset to center the viewport
+ * within the viewport range in order to maximize the guardband size.
+ */
+ int hw_screen_offset_x = (vp_as_scissor.maxx + vp_as_scissor.minx) / 2;
+ int hw_screen_offset_y = (vp_as_scissor.maxy + vp_as_scissor.miny) / 2;
+
+ /* GFX6-GFX7 need to align the offset to an ubertile consisting of all SEs. */
+ const unsigned hw_screen_offset_alignment =
+ ctx->chip_class >= GFX8 ? 16 : MAX2(ctx->screen->se_tile_repeat, 16);
+
+ /* Indexed by quantization modes */
+ static int max_viewport_size[] = {65535, 16383, 4095};
+
+ /* Ensure that the whole viewport stays representable in
+ * absolute coordinates.
+ * See comment in si_set_viewport_states.
+ */
+ assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] &&
+ vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]);
+
+ hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
+ hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
+
+ /* Align the screen offset by dropping the low bits. */
+ hw_screen_offset_x &= ~(hw_screen_offset_alignment - 1);
+ hw_screen_offset_y &= ~(hw_screen_offset_alignment - 1);
+
+ /* Apply the offset to center the viewport and maximize the guardband. */
+ vp_as_scissor.minx -= hw_screen_offset_x;
+ vp_as_scissor.maxx -= hw_screen_offset_x;
+ vp_as_scissor.miny -= hw_screen_offset_y;
+ vp_as_scissor.maxy -= hw_screen_offset_y;
+
+ /* Reconstruct the viewport transformation from the scissor. */
+ vp.translate[0] = (vp_as_scissor.minx + vp_as_scissor.maxx) / 2.0;
+ vp.translate[1] = (vp_as_scissor.miny + vp_as_scissor.maxy) / 2.0;
+ vp.scale[0] = vp_as_scissor.maxx - vp.translate[0];
+ vp.scale[1] = vp_as_scissor.maxy - vp.translate[1];
+
+ /* Treat a 0x0 viewport as 1x1 to prevent division by zero. */
+ if (vp_as_scissor.minx == vp_as_scissor.maxx)
+ vp.scale[0] = 0.5;
+ if (vp_as_scissor.miny == vp_as_scissor.maxy)
+ vp.scale[1] = 0.5;
+
+ /* Find the biggest guard band that is inside the supported viewport
+ * range. The guard band is specified as a horizontal and vertical
+ * distance from (0,0) in clip space.
+ *
+ * This is done by applying the inverse viewport transformation
+ * on the viewport limits to get those limits in clip space.
+ *
+ * The viewport range is [-max_viewport_size/2, max_viewport_size/2].
+ */
+ assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size));
+ max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2;
+ left = (-max_range - vp.translate[0]) / vp.scale[0];
+ right = (max_range - vp.translate[0]) / vp.scale[0];
+ top = (-max_range - vp.translate[1]) / vp.scale[1];
+ bottom = (max_range - vp.translate[1]) / vp.scale[1];
+
+ assert(left <= -1 && top <= -1 && right >= 1 && bottom >= 1);
+
+ guardband_x = MIN2(-left, right);
+ guardband_y = MIN2(-top, bottom);
+
+ discard_x = 1.0;
+ discard_y = 1.0;
+
+ if (unlikely(util_prim_is_points_or_lines(ctx->current_rast_prim))) {
+ /* When rendering wide points or lines, we need to be more
+ * conservative about when to discard them entirely. */
+ float pixels;
+
+ if (ctx->current_rast_prim == PIPE_PRIM_POINTS)
+ pixels = rs->max_point_size;
+ else
+ pixels = rs->line_width;
+
+ /* Add half the point size / line width */
+ discard_x += pixels / (2.0 * vp.scale[0]);
+ discard_y += pixels / (2.0 * vp.scale[1]);
+
+ /* Discard primitives that would lie entirely outside the clip
+ * region. */
+ discard_x = MIN2(discard_x, guardband_x);
+ discard_y = MIN2(discard_y, guardband_y);
+ }
+
+ /* If any of the GB registers is updated, all of them must be updated.
+ * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ
+ * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ
+ */
+ unsigned initial_cdw = ctx->gfx_cs->current.cdw;
+ radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ,
+ SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, fui(guardband_y), fui(discard_y),
+ fui(guardband_x), fui(discard_x));
+ radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,
+ SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
+ S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) |
+ S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4));
+ radeon_opt_set_context_reg(
+ ctx, R_028BE4_PA_SU_VTX_CNTL, SI_TRACKED_PA_SU_VTX_CNTL,
+ S_028BE4_PIX_CENTER(rs->half_pixel_center) |
+ S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode));
+ if (initial_cdw != ctx->gfx_cs->current.cdw)
+ ctx->context_roll = true;
+
+ si_update_ngg_small_prim_precision(ctx);
}
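
A rough standalone worked example of the guard-band math above, for a hypothetical 1920x1080 viewport at the origin with the 16.8 quantization mode (viewport range of roughly ±32767):

#include <stdio.h>

int main(void)
{
   /* Reconstructed viewport transform for a centered 1920-wide viewport. */
   float scale_x = 960.0f, translate_x = 960.0f;
   float max_range = 32767.0f; /* 65535 / 2 for the 16.8 mode */

   /* Inverse viewport transform of the range limits, as in si_emit_guardband(). */
   float left  = (-max_range - translate_x) / scale_x; /* about -35.1 */
   float right = ( max_range - translate_x) / scale_x; /* about +33.1 */
   float guardband_x = (-left < right) ? -left : right;

   /* Prints ~33.1: clip-space X up to about +/-33 stays inside the hardware
    * viewport range, so clipping can be left to the per-pixel scissor. */
   printf("guardband_x = %.2f\n", guardband_x);
   return 0;
}
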
static void si_emit_scissors(struct si_context *ctx)
{
- struct radeon_cmdbuf *cs = ctx->gfx_cs;
- struct pipe_scissor_state *states = ctx->scissors;
- bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
-
- /* The simple case: Only 1 viewport is active. */
- if (!ctx->vs_writes_viewport_index) {
- struct si_signed_scissor *vp = &ctx->viewports.as_scissor[0];
-
- radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2);
- si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL);
- return;
- }
-
- /* All registers in the array need to be updated if any of them is changed.
- * This is a hardware requirement.
- */
- radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL,
- SI_MAX_VIEWPORTS * 2);
- for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) {
- si_emit_one_scissor(ctx, cs, &ctx->viewports.as_scissor[i],
- scissor_enabled ? &states[i] : NULL);
- }
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+ struct pipe_scissor_state *states = ctx->scissors;
+ bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
+
+ /* The simple case: Only 1 viewport is active. */
+ if (!ctx->vs_writes_viewport_index) {
+ struct si_signed_scissor *vp = &ctx->viewports.as_scissor[0];
+
+ radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2);
+ si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL);
+ return;
+ }
+
+ /* All registers in the array need to be updated if any of them is changed.
+ * This is a hardware requirement.
+ */
+ radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, SI_MAX_VIEWPORTS * 2);
+ for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) {
+ si_emit_one_scissor(ctx, cs, &ctx->viewports.as_scissor[i],
+ scissor_enabled ? &states[i] : NULL);
+ }
}
-static void si_set_viewport_states(struct pipe_context *pctx,
- unsigned start_slot,
- unsigned num_viewports,
- const struct pipe_viewport_state *state)
+static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slot,
+ unsigned num_viewports, const struct pipe_viewport_state *state)
{
- struct si_context *ctx = (struct si_context *)pctx;
- int i;
-
- for (i = 0; i < num_viewports; i++) {
- unsigned index = start_slot + i;
- struct si_signed_scissor *scissor = &ctx->viewports.as_scissor[index];
-
- ctx->viewports.states[index] = state[i];
-
- si_get_scissor_from_viewport(ctx, &state[i], scissor);
-
- unsigned w = scissor->maxx - scissor->minx;
- unsigned h = scissor->maxy - scissor->miny;
- unsigned max_extent = MAX2(w, h);
-
- int max_corner = MAX2(scissor->maxx, scissor->maxy);
-
- unsigned center_x = (scissor->maxx + scissor->minx) / 2;
- unsigned center_y = (scissor->maxy + scissor->miny) / 2;
- unsigned max_center = MAX2(center_x, center_y);
-
- /* PA_SU_HARDWARE_SCREEN_OFFSET can't center viewports whose
- * center start farther than MAX_PA_SU_HARDWARE_SCREEN_OFFSET.
- * (for example, a 1x1 viewport in the lower right corner of
- * 16Kx16K) Such viewports need a greater guardband, so they
- * have to use a worse quantization mode.
- */
- unsigned distance_off_center =
- MAX2(0, (int)max_center - MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
- max_extent += distance_off_center;
-
- /* Determine the best quantization mode (subpixel precision),
- * but also leave enough space for the guardband.
- *
- * Note that primitive binning requires QUANT_MODE == 16_8 on Vega10
- * and Raven1 for line and rectangle primitive types to work correctly.
- * Always use 16_8 if primitive binning is possible to occur.
- */
- if ((ctx->family == CHIP_VEGA10 || ctx->family == CHIP_RAVEN) &&
- ctx->screen->dpbb_allowed)
- max_extent = 16384; /* Use QUANT_MODE == 16_8. */
-
- /* Another constraint is that all coordinates in the viewport
- * are representable in fixed point with respect to the
- * surface origin.
- *
- * It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given
- * an offset that would make the upper corner of the viewport
- * greater than the maximum representable number post
- * quantization, ie 2^quant_bits.
- *
- * This does not matter for 14.10 and 16.8 formats since the
- * offset is already limited at 8k, but it means we can't use
- * 12.12 if we are drawing to some pixels outside the lower
- * 4k x 4k of the render target.
- */
-
- if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */
- scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH;
- else if (max_extent <= 4096) /* 16K scanline area for guardband */
- scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH;
- else /* 64K scanline area for guardband */
- scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
- }
-
- if (start_slot == 0) {
- ctx->viewports.y_inverted =
- -state->scale[1] + state->translate[1] >
- state->scale[1] + state->translate[1];
- }
-
- si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
- si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
- si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+ struct si_context *ctx = (struct si_context *)pctx;
+ int i;
+
+ for (i = 0; i < num_viewports; i++) {
+ unsigned index = start_slot + i;
+ struct si_signed_scissor *scissor = &ctx->viewports.as_scissor[index];
+
+ ctx->viewports.states[index] = state[i];
+
+ si_get_scissor_from_viewport(ctx, &state[i], scissor);
+
+ unsigned w = scissor->maxx - scissor->minx;
+ unsigned h = scissor->maxy - scissor->miny;
+ unsigned max_extent = MAX2(w, h);
+
+ int max_corner = MAX2(scissor->maxx, scissor->maxy);
+
+ unsigned center_x = (scissor->maxx + scissor->minx) / 2;
+ unsigned center_y = (scissor->maxy + scissor->miny) / 2;
+ unsigned max_center = MAX2(center_x, center_y);
+
+ /* PA_SU_HARDWARE_SCREEN_OFFSET can't center viewports whose center
+ * lies farther away than MAX_PA_SU_HARDWARE_SCREEN_OFFSET (for example,
+ * a 1x1 viewport in the lower right corner of a 16Kx16K render target).
+ * Such viewports need a greater guardband, so they have to use a worse
+ * quantization mode.
+ */
+ unsigned distance_off_center = MAX2(0, (int)max_center - MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
+ max_extent += distance_off_center;
+
+ /* Determine the best quantization mode (subpixel precision),
+ * but also leave enough space for the guardband.
+ *
+ * Note that primitive binning requires QUANT_MODE == 16_8 on Vega10
+ * and Raven1 for line and rectangle primitive types to work correctly.
+ * Always use 16_8 whenever primitive binning can occur.
+ */
+ if ((ctx->family == CHIP_VEGA10 || ctx->family == CHIP_RAVEN) && ctx->screen->dpbb_allowed)
+ max_extent = 16384; /* Use QUANT_MODE == 16_8. */
+
+ /* Another constraint is that all coordinates in the viewport
+ * are representable in fixed point with respect to the
+ * surface origin.
+ *
+ * This means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given
+ * an offset that would make the upper corner of the viewport
+ * greater than the maximum number representable after
+ * quantization, i.e. 2^quant_bits.
+ *
+ * This does not matter for the 14.10 and 16.8 formats, since the
+ * offset is already limited to 8k, but it means we can't use
+ * 12.12 if we are drawing to any pixels outside the lower
+ * 4k x 4k of the render target.
+ */
+
+ if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */
+ scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH;
+ else if (max_extent <= 4096) /* 16K scanline area for guardband */
+ scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH;
+ else /* 64K scanline area for guardband */
+ scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
+ }
+
+ if (start_slot == 0) {
+ ctx->viewports.y_inverted =
+ -state->scale[1] + state->translate[1] > state->scale[1] + state->translate[1];
+ }
+
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
}
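/* Editor's note -- an illustrative sketch, not part of this patch: the
 * three-way quantization choice above, isolated so the thresholds are easy
 * to read. It reuses the driver's SI_QUANT_MODE_* values and mirrors the
 * conditions in si_set_viewport_states(). For example, an 800x600 viewport
 * whose far corner stays below 4096 gets 12.12, a 1920x1080 viewport gets
 * 14.10 (extent > 1024), and anything larger than 4096 falls back to 16.8. */
static unsigned choose_quant_mode_sketch(unsigned max_extent, int max_corner)
{
   if (max_extent <= 1024 && max_corner < 4096)
      return SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH; /* 1/4096-pixel precision */
   if (max_extent <= 4096)
      return SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH; /* 1/1024-pixel precision */
   return SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;      /* 1/256-pixel precision */
}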
-static void si_emit_one_viewport(struct si_context *ctx,
- struct pipe_viewport_state *state)
+static void si_emit_one_viewport(struct si_context *ctx, struct pipe_viewport_state *state)
{
- struct radeon_cmdbuf *cs = ctx->gfx_cs;
-
- radeon_emit(cs, fui(state->scale[0]));
- radeon_emit(cs, fui(state->translate[0]));
- radeon_emit(cs, fui(state->scale[1]));
- radeon_emit(cs, fui(state->translate[1]));
- radeon_emit(cs, fui(state->scale[2]));
- radeon_emit(cs, fui(state->translate[2]));
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+
+ radeon_emit(cs, fui(state->scale[0]));
+ radeon_emit(cs, fui(state->translate[0]));
+ radeon_emit(cs, fui(state->scale[1]));
+ radeon_emit(cs, fui(state->translate[1]));
+ radeon_emit(cs, fui(state->scale[2]));
+ radeon_emit(cs, fui(state->translate[2]));
}
static void si_emit_viewports(struct si_context *ctx)
{
- struct radeon_cmdbuf *cs = ctx->gfx_cs;
- struct pipe_viewport_state *states = ctx->viewports.states;
-
- if (ctx->screen->use_ngg_culling) {
- /* Set the viewport info for small primitive culling. */
- struct si_small_prim_cull_info info;
- si_get_small_prim_cull_info(ctx, &info);
-
- if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) {
- unsigned offset = 0;
-
- /* Align to 256, because the address is shifted by 8 bits. */
- u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256,
- &info, &offset,
- (struct pipe_resource**)&ctx->small_prim_cull_info_buf);
-
- ctx->small_prim_cull_info_address =
- ctx->small_prim_cull_info_buf->gpu_address + offset;
- ctx->last_small_prim_cull_info = info;
- ctx->small_prim_cull_info_dirty = true;
- }
-
- if (ctx->small_prim_cull_info_dirty) {
- /* This will end up in SGPR6 as (value << 8), shifted by the hw. */
- radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->small_prim_cull_info_buf,
- RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
- radeon_set_sh_reg(ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
- ctx->small_prim_cull_info_address >> 8);
- ctx->small_prim_cull_info_dirty = false;
- }
- }
-
- /* The simple case: Only 1 viewport is active. */
- if (!ctx->vs_writes_viewport_index) {
- radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6);
- si_emit_one_viewport(ctx, &states[0]);
- return;
- }
-
- /* All registers in the array need to be updated if any of them is changed.
- * This is a hardware requirement.
- */
- radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE +
- 0, SI_MAX_VIEWPORTS * 6);
- for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++)
- si_emit_one_viewport(ctx, &states[i]);
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+ struct pipe_viewport_state *states = ctx->viewports.states;
+
+ if (ctx->screen->use_ngg_culling) {
+ /* Set the viewport info for small primitive culling. */
+ struct si_small_prim_cull_info info;
+ si_get_small_prim_cull_info(ctx, &info);
+
+ if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) {
+ unsigned offset = 0;
+
+ /* Align to 256, because the address is shifted by 8 bits. */
+ u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset,
+ (struct pipe_resource **)&ctx->small_prim_cull_info_buf);
+
+ ctx->small_prim_cull_info_address = ctx->small_prim_cull_info_buf->gpu_address + offset;
+ ctx->last_small_prim_cull_info = info;
+ ctx->small_prim_cull_info_dirty = true;
+ }
+
+ if (ctx->small_prim_cull_info_dirty) {
+ /* This will end up in SGPR6 as (value << 8), shifted by the hw. */
+ radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->small_prim_cull_info_buf,
+ RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
+ radeon_set_sh_reg(ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
+ ctx->small_prim_cull_info_address >> 8);
+ ctx->small_prim_cull_info_dirty = false;
+ }
+ }
+
+ /* The simple case: Only 1 viewport is active. */
+ if (!ctx->vs_writes_viewport_index) {
+ radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6);
+ si_emit_one_viewport(ctx, &states[0]);
+ return;
+ }
+
+ /* All registers in the array need to be updated if any of them is changed.
+ * This is a hardware requirement.
+ */
+ radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + 0, SI_MAX_VIEWPORTS * 6);
+ for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++)
+ si_emit_one_viewport(ctx, &states[i]);
}
-static inline void
-si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
- bool window_space_position, float *zmin, float *zmax)
+static inline void si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
+ bool window_space_position, float *zmin, float *zmax)
{
- if (window_space_position) {
- *zmin = 0;
- *zmax = 1;
- return;
- }
- util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
+ if (window_space_position) {
+ *zmin = 0;
+ *zmax = 1;
+ return;
+ }
+ util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
}
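/* Editor's note -- an illustrative sketch, not part of this patch: what the
 * util_viewport_zmin_zmax() fallback above is expected to compute (an
 * assumption based on the gallium helper, shown here only to clarify the
 * halfz parameter). With clip_halfz (a [0,1] clip-space depth range), NDC
 * z=0 maps to translate[2] and z=1 to translate[2] + scale[2]; with the
 * GL-style [-1,1] range, the window-space depths span translate[2] +/-
 * scale[2]. The result is returned with zmin <= zmax. */
static void viewport_zmin_zmax_sketch(const struct pipe_viewport_state *vp, bool halfz,
                                      float *zmin, float *zmax)
{
   float a = halfz ? vp->translate[2] : vp->translate[2] - vp->scale[2];
   float b = vp->translate[2] + vp->scale[2];

   *zmin = MIN2(a, b);
   *zmax = MAX2(a, b);
}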
static void si_emit_depth_ranges(struct si_context *ctx)
{
- struct radeon_cmdbuf *cs = ctx->gfx_cs;
- struct pipe_viewport_state *states = ctx->viewports.states;
- bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz;
- bool window_space = ctx->vs_disables_clipping_viewport;
- float zmin, zmax;
-
- /* The simple case: Only 1 viewport is active. */
- if (!ctx->vs_writes_viewport_index) {
- si_viewport_zmin_zmax(&states[0], clip_halfz, window_space,
- &zmin, &zmax);
-
- radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2);
- radeon_emit(cs, fui(zmin));
- radeon_emit(cs, fui(zmax));
- return;
- }
-
- /* All registers in the array need to be updated if any of them is changed.
- * This is a hardware requirement.
- */
- radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0,
- SI_MAX_VIEWPORTS * 2);
- for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) {
- si_viewport_zmin_zmax(&states[i], clip_halfz, window_space,
- &zmin, &zmax);
- radeon_emit(cs, fui(zmin));
- radeon_emit(cs, fui(zmax));
- }
+ struct radeon_cmdbuf *cs = ctx->gfx_cs;
+ struct pipe_viewport_state *states = ctx->viewports.states;
+ bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz;
+ bool window_space = ctx->vs_disables_clipping_viewport;
+ float zmin, zmax;
+
+ /* The simple case: Only 1 viewport is active. */
+ if (!ctx->vs_writes_viewport_index) {
+ si_viewport_zmin_zmax(&states[0], clip_halfz, window_space, &zmin, &zmax);
+
+ radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2);
+ radeon_emit(cs, fui(zmin));
+ radeon_emit(cs, fui(zmax));
+ return;
+ }
+
+ /* All registers in the array need to be updated if any of them is changed.
+ * This is a hardware requirement.
+ */
+ radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, SI_MAX_VIEWPORTS * 2);
+ for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) {
+ si_viewport_zmin_zmax(&states[i], clip_halfz, window_space, &zmin, &zmax);
+ radeon_emit(cs, fui(zmin));
+ radeon_emit(cs, fui(zmax));
+ }
}
static void si_emit_viewport_states(struct si_context *ctx)
{
- si_emit_viewports(ctx);
- si_emit_depth_ranges(ctx);
+ si_emit_viewports(ctx);
+ si_emit_depth_ranges(ctx);
}
/**
*/
void si_update_vs_viewport_state(struct si_context *ctx)
{
- struct si_shader_info *info = si_get_vs_info(ctx);
- bool vs_window_space;
-
- if (!info)
- return;
-
- /* When the VS disables clipping and viewport transformation. */
- vs_window_space =
- info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
-
- if (ctx->vs_disables_clipping_viewport != vs_window_space) {
- ctx->vs_disables_clipping_viewport = vs_window_space;
- si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
- si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
- }
-
- /* Viewport index handling. */
- if (ctx->vs_writes_viewport_index == info->writes_viewport_index)
- return;
-
- /* This changes how the guardband is computed. */
- ctx->vs_writes_viewport_index = info->writes_viewport_index;
- si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
-
- /* Emit scissors and viewports that were enabled by having
- * the ViewportIndex output.
- */
- if (info->writes_viewport_index) {
- si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
- si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
- }
+ struct si_shader_info *info = si_get_vs_info(ctx);
+ bool vs_window_space;
+
+ if (!info)
+ return;
+
+ /* When the VS disables clipping and viewport transformation. */
+ vs_window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+
+ if (ctx->vs_disables_clipping_viewport != vs_window_space) {
+ ctx->vs_disables_clipping_viewport = vs_window_space;
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+ }
+
+ /* Viewport index handling. */
+ if (ctx->vs_writes_viewport_index == info->writes_viewport_index)
+ return;
+
+ /* This changes how the guardband is computed. */
+ ctx->vs_writes_viewport_index = info->writes_viewport_index;
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+
+ /* Emit scissors and viewports that were enabled by having
+ * the ViewportIndex output.
+ */
+ if (info->writes_viewport_index) {
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+ si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+ }
}
static void si_emit_window_rectangles(struct si_context *sctx)
{
- /* There are four clipping rectangles. Their corner coordinates are inclusive.
- * Every pixel is assigned a number from 0 and 15 by setting bits 0-3 depending
- * on whether the pixel is inside cliprects 0-3, respectively. For example,
- * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned
- * the number 3 (binary 0011).
- *
- * If CLIPRECT_RULE & (1 << number), the pixel is rasterized.
- */
- struct radeon_cmdbuf *cs = sctx->gfx_cs;
- static const unsigned outside[4] = {
- /* outside rectangle 0 */
- V_02820C_OUT |
- V_02820C_IN_1 |
- V_02820C_IN_2 |
- V_02820C_IN_21 |
- V_02820C_IN_3 |
- V_02820C_IN_31 |
- V_02820C_IN_32 |
- V_02820C_IN_321,
- /* outside rectangles 0, 1 */
- V_02820C_OUT |
- V_02820C_IN_2 |
- V_02820C_IN_3 |
- V_02820C_IN_32,
- /* outside rectangles 0, 1, 2 */
- V_02820C_OUT |
- V_02820C_IN_3,
- /* outside rectangles 0, 1, 2, 3 */
- V_02820C_OUT,
- };
- const unsigned disabled = 0xffff; /* all inside and outside cases */
- unsigned num_rectangles = sctx->num_window_rectangles;
- struct pipe_scissor_state *rects = sctx->window_rectangles;
- unsigned rule;
-
- assert(num_rectangles <= 4);
-
- if (num_rectangles == 0)
- rule = disabled;
- else if (sctx->window_rectangles_include)
- rule = ~outside[num_rectangles - 1];
- else
- rule = outside[num_rectangles - 1];
-
- radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE,
- SI_TRACKED_PA_SC_CLIPRECT_RULE, rule);
- if (num_rectangles == 0)
- return;
-
- radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL,
- num_rectangles * 2);
- for (unsigned i = 0; i < num_rectangles; i++) {
- radeon_emit(cs, S_028210_TL_X(rects[i].minx) |
- S_028210_TL_Y(rects[i].miny));
- radeon_emit(cs, S_028214_BR_X(rects[i].maxx) |
- S_028214_BR_Y(rects[i].maxy));
- }
+ /* There are four clipping rectangles. Their corner coordinates are inclusive.
+ * Every pixel is assigned a number from 0 to 15 by setting bits 0-3 depending
+ * on whether the pixel is inside cliprects 0-3, respectively. For example,
+ * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned
+ * the number 3 (binary 0011).
+ *
+ * If CLIPRECT_RULE & (1 << number), the pixel is rasterized.
+ */
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+ static const unsigned outside[4] = {
+ /* outside rectangle 0 */
+ V_02820C_OUT | V_02820C_IN_1 | V_02820C_IN_2 | V_02820C_IN_21 | V_02820C_IN_3 |
+ V_02820C_IN_31 | V_02820C_IN_32 | V_02820C_IN_321,
+ /* outside rectangles 0, 1 */
+ V_02820C_OUT | V_02820C_IN_2 | V_02820C_IN_3 | V_02820C_IN_32,
+ /* outside rectangles 0, 1, 2 */
+ V_02820C_OUT | V_02820C_IN_3,
+ /* outside rectangles 0, 1, 2, 3 */
+ V_02820C_OUT,
+ };
+ const unsigned disabled = 0xffff; /* all inside and outside cases */
+ unsigned num_rectangles = sctx->num_window_rectangles;
+ struct pipe_scissor_state *rects = sctx->window_rectangles;
+ unsigned rule;
+
+ assert(num_rectangles <= 4);
+
+ if (num_rectangles == 0)
+ rule = disabled;
+ else if (sctx->window_rectangles_include)
+ rule = ~outside[num_rectangles - 1];
+ else
+ rule = outside[num_rectangles - 1];
+
+ radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE, SI_TRACKED_PA_SC_CLIPRECT_RULE,
+ rule);
+ if (num_rectangles == 0)
+ return;
+
+ radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL, num_rectangles * 2);
+ for (unsigned i = 0; i < num_rectangles; i++) {
+ radeon_emit(cs, S_028210_TL_X(rects[i].minx) | S_028210_TL_Y(rects[i].miny));
+ radeon_emit(cs, S_028214_BR_X(rects[i].maxx) | S_028214_BR_Y(rects[i].maxy));
+ }
}
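/* Editor's note -- a self-contained example, not part of this patch: how the
 * CLIPRECT_RULE mask above can be derived from first principles. It assumes
 * the V_02820C_* defines simply map pixel code n to bit n (OUT = bit 0,
 * IN_1 = bit 2, IN_32 = bit 12, ...), which the outside[] table suggests.
 * Pixel code n has bit i set iff the pixel is inside cliprect i; "exclude"
 * keeps only pixels that are outside all of the first num_rects rectangles,
 * and "include" is its complement. */
#include <stdbool.h>
#include <stdio.h>

static unsigned cliprect_rule_example(unsigned num_rects, bool include)
{
   unsigned rule = 0;

   for (unsigned code = 0; code < 16; code++) {
      /* Outside all of the first num_rects cliprects == low num_rects bits clear. */
      bool outside_all = (code & ((1u << num_rects) - 1)) == 0;

      if (outside_all)
         rule |= 1u << code;
   }
   return include ? ~rule & 0xffff : rule;
}

int main(void)
{
   /* Two exclude rectangles: only pixel codes 0, 4, 8 and 12 pass, which
    * matches outside[1] = OUT | IN_2 | IN_3 | IN_32 = 0x1111. */
   printf("0x%04x\n", cliprect_rule_example(2, false));
   return 0;
}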
-static void si_set_window_rectangles(struct pipe_context *ctx,
- bool include,
- unsigned num_rectangles,
- const struct pipe_scissor_state *rects)
+static void si_set_window_rectangles(struct pipe_context *ctx, bool include,
+ unsigned num_rectangles,
+ const struct pipe_scissor_state *rects)
{
- struct si_context *sctx = (struct si_context *)ctx;
+ struct si_context *sctx = (struct si_context *)ctx;
- sctx->num_window_rectangles = num_rectangles;
- sctx->window_rectangles_include = include;
- if (num_rectangles) {
- memcpy(sctx->window_rectangles, rects,
- sizeof(*rects) * num_rectangles);
- }
+ sctx->num_window_rectangles = num_rectangles;
+ sctx->window_rectangles_include = include;
+ if (num_rectangles) {
+ memcpy(sctx->window_rectangles, rects, sizeof(*rects) * num_rectangles);
+ }
- si_mark_atom_dirty(sctx, &sctx->atoms.s.window_rectangles);
+ si_mark_atom_dirty(sctx, &sctx->atoms.s.window_rectangles);
}
void si_init_viewport_functions(struct si_context *ctx)
{
- ctx->atoms.s.guardband.emit = si_emit_guardband;
- ctx->atoms.s.scissors.emit = si_emit_scissors;
- ctx->atoms.s.viewports.emit = si_emit_viewport_states;
- ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles;
+ ctx->atoms.s.guardband.emit = si_emit_guardband;
+ ctx->atoms.s.scissors.emit = si_emit_scissors;
+ ctx->atoms.s.viewports.emit = si_emit_viewport_states;
+ ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles;
- ctx->b.set_scissor_states = si_set_scissor_states;
- ctx->b.set_viewport_states = si_set_viewport_states;
- ctx->b.set_window_rectangles = si_set_window_rectangles;
+ ctx->b.set_scissor_states = si_set_scissor_states;
+ ctx->b.set_viewport_states = si_set_viewport_states;
+ ctx->b.set_window_rectangles = si_set_window_rectangles;
- for (unsigned i = 0; i < 16; i++)
- ctx->viewports.as_scissor[i].quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
+ for (unsigned i = 0; i < 16; i++)
+ ctx->viewports.as_scissor[i].quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
}
/* This file implements randomized SDMA texture blit tests. */
#include "si_pipe.h"
-#include "util/u_surface.h"
#include "util/rand_xor.h"
+#include "util/u_surface.h"
static uint64_t seed_xorshift128plus[2];
/* The GPU blits are emulated on the CPU using these CPU textures. */
struct cpu_texture {
- uint8_t *ptr;
- uint64_t size;
- uint64_t layer_stride;
- unsigned stride;
+ uint8_t *ptr;
+ uint64_t size;
+ uint64_t layer_stride;
+ unsigned stride;
};
-static void alloc_cpu_texture(struct cpu_texture *tex,
- struct pipe_resource *templ)
+static void alloc_cpu_texture(struct cpu_texture *tex, struct pipe_resource *templ)
{
- tex->stride = align(util_format_get_stride(templ->format, templ->width0),
- RAND_NUM_SIZE);
- tex->layer_stride = (uint64_t)tex->stride * templ->height0;
- tex->size = tex->layer_stride * templ->array_size;
- tex->ptr = malloc(tex->size);
- assert(tex->ptr);
+ tex->stride = align(util_format_get_stride(templ->format, templ->width0), RAND_NUM_SIZE);
+ tex->layer_stride = (uint64_t)tex->stride * templ->height0;
+ tex->size = tex->layer_stride * templ->array_size;
+ tex->ptr = malloc(tex->size);
+ assert(tex->ptr);
}
-static void set_random_pixels(struct pipe_context *ctx,
- struct pipe_resource *tex,
- struct cpu_texture *cpu)
+static void set_random_pixels(struct pipe_context *ctx, struct pipe_resource *tex,
+ struct cpu_texture *cpu)
{
- struct pipe_transfer *t;
- uint8_t *map;
- int x,y,z;
-
- map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE,
- 0, 0, 0, tex->width0, tex->height0,
- tex->array_size, &t);
- assert(map);
-
- for (z = 0; z < tex->array_size; z++) {
- for (y = 0; y < tex->height0; y++) {
- uint64_t *ptr = (uint64_t*)
- (map + t->layer_stride*z + t->stride*y);
- uint64_t *ptr_cpu = (uint64_t*)
- (cpu->ptr + cpu->layer_stride*z + cpu->stride*y);
- unsigned size = cpu->stride / RAND_NUM_SIZE;
-
- assert(t->stride % RAND_NUM_SIZE == 0);
- assert(cpu->stride % RAND_NUM_SIZE == 0);
-
- for (x = 0; x < size; x++) {
- *ptr++ = *ptr_cpu++ =
- rand_xorshift128plus(seed_xorshift128plus);
- }
- }
- }
-
- pipe_transfer_unmap(ctx, t);
+ struct pipe_transfer *t;
+ uint8_t *map;
+ int x, y, z;
+
+ map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE, 0, 0, 0, tex->width0, tex->height0,
+ tex->array_size, &t);
+ assert(map);
+
+ for (z = 0; z < tex->array_size; z++) {
+ for (y = 0; y < tex->height0; y++) {
+ uint64_t *ptr = (uint64_t *)(map + t->layer_stride * z + t->stride * y);
+ uint64_t *ptr_cpu = (uint64_t *)(cpu->ptr + cpu->layer_stride * z + cpu->stride * y);
+ unsigned size = cpu->stride / RAND_NUM_SIZE;
+
+ assert(t->stride % RAND_NUM_SIZE == 0);
+ assert(cpu->stride % RAND_NUM_SIZE == 0);
+
+ for (x = 0; x < size; x++) {
+ *ptr++ = *ptr_cpu++ = rand_xorshift128plus(seed_xorshift128plus);
+ }
+ }
+ }
+
+ pipe_transfer_unmap(ctx, t);
}
-static bool compare_textures(struct pipe_context *ctx,
- struct pipe_resource *tex,
- struct cpu_texture *cpu)
+static bool compare_textures(struct pipe_context *ctx, struct pipe_resource *tex,
+ struct cpu_texture *cpu)
{
- struct pipe_transfer *t;
- uint8_t *map;
- int y,z;
- bool pass = true;
- unsigned stride = util_format_get_stride(tex->format, tex->width0);
-
- map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_READ,
- 0, 0, 0, tex->width0, tex->height0,
- tex->array_size, &t);
- assert(map);
-
- for (z = 0; z < tex->array_size; z++) {
- for (y = 0; y < tex->height0; y++) {
- uint8_t *ptr = map + t->layer_stride*z + t->stride*y;
- uint8_t *cpu_ptr = cpu->ptr +
- cpu->layer_stride*z + cpu->stride*y;
-
- if (memcmp(ptr, cpu_ptr, stride)) {
- pass = false;
- goto done;
- }
- }
- }
+ struct pipe_transfer *t;
+ uint8_t *map;
+ int y, z;
+ bool pass = true;
+ unsigned stride = util_format_get_stride(tex->format, tex->width0);
+
+ map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_READ, 0, 0, 0, tex->width0, tex->height0,
+ tex->array_size, &t);
+ assert(map);
+
+ for (z = 0; z < tex->array_size; z++) {
+ for (y = 0; y < tex->height0; y++) {
+ uint8_t *ptr = map + t->layer_stride * z + t->stride * y;
+ uint8_t *cpu_ptr = cpu->ptr + cpu->layer_stride * z + cpu->stride * y;
+
+ if (memcmp(ptr, cpu_ptr, stride)) {
+ pass = false;
+ goto done;
+ }
+ }
+ }
done:
- pipe_transfer_unmap(ctx, t);
- return pass;
+ pipe_transfer_unmap(ctx, t);
+ return pass;
}
static enum pipe_format choose_format()
{
- enum pipe_format formats[] = {
- PIPE_FORMAT_R8_UINT,
- PIPE_FORMAT_R16_UINT,
- PIPE_FORMAT_R32_UINT,
- PIPE_FORMAT_R32G32_UINT,
- PIPE_FORMAT_R32G32B32A32_UINT,
- PIPE_FORMAT_G8R8_B8R8_UNORM,
- };
- return formats[rand() % ARRAY_SIZE(formats)];
+ enum pipe_format formats[] = {
+ PIPE_FORMAT_R8_UINT, PIPE_FORMAT_R16_UINT, PIPE_FORMAT_R32_UINT,
+ PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32B32A32_UINT, PIPE_FORMAT_G8R8_B8R8_UNORM,
+ };
+ return formats[rand() % ARRAY_SIZE(formats)];
}
-static const char *array_mode_to_string(struct si_screen *sscreen,
- struct radeon_surf *surf)
+static const char *array_mode_to_string(struct si_screen *sscreen, struct radeon_surf *surf)
{
- if (sscreen->info.chip_class >= GFX9) {
- switch (surf->u.gfx9.surf.swizzle_mode) {
- case 0:
- return " LINEAR";
- case 21:
- return " 4KB_S_X";
- case 22:
- return " 4KB_D_X";
- case 25:
- return "64KB_S_X";
- case 26:
- return "64KB_D_X";
- default:
- printf("Unhandled swizzle mode = %u\n",
- surf->u.gfx9.surf.swizzle_mode);
- return " UNKNOWN";
- }
- } else {
- switch (surf->u.legacy.level[0].mode) {
- case RADEON_SURF_MODE_LINEAR_ALIGNED:
- return "LINEAR_ALIGNED";
- case RADEON_SURF_MODE_1D:
- return "1D_TILED_THIN1";
- case RADEON_SURF_MODE_2D:
- return "2D_TILED_THIN1";
- default:
- assert(0);
- return " UNKNOWN";
- }
- }
+ if (sscreen->info.chip_class >= GFX9) {
+ switch (surf->u.gfx9.surf.swizzle_mode) {
+ case 0:
+ return " LINEAR";
+ case 21:
+ return " 4KB_S_X";
+ case 22:
+ return " 4KB_D_X";
+ case 25:
+ return "64KB_S_X";
+ case 26:
+ return "64KB_D_X";
+ default:
+ printf("Unhandled swizzle mode = %u\n", surf->u.gfx9.surf.swizzle_mode);
+ return " UNKNOWN";
+ }
+ } else {
+ switch (surf->u.legacy.level[0].mode) {
+ case RADEON_SURF_MODE_LINEAR_ALIGNED:
+ return "LINEAR_ALIGNED";
+ case RADEON_SURF_MODE_1D:
+ return "1D_TILED_THIN1";
+ case RADEON_SURF_MODE_2D:
+ return "2D_TILED_THIN1";
+ default:
+ assert(0);
+ return " UNKNOWN";
+ }
+ }
}
static unsigned generate_max_tex_side(unsigned max_tex_side)
{
- switch (rand() % 4) {
- case 0:
- /* Try to hit large sizes in 1/4 of the cases. */
- return max_tex_side;
- case 1:
- /* Try to hit 1D tiling in 1/4 of the cases. */
- return 128;
- default:
- /* Try to hit common sizes in 2/4 of the cases. */
- return 2048;
- }
+ switch (rand() % 4) {
+ case 0:
+ /* Try to hit large sizes in 1/4 of the cases. */
+ return max_tex_side;
+ case 1:
+ /* Try to hit 1D tiling in 1/4 of the cases. */
+ return 128;
+ default:
+ /* Try to hit common sizes in 2/4 of the cases. */
+ return 2048;
+ }
}
void si_test_dma(struct si_screen *sscreen)
{
- struct pipe_screen *screen = &sscreen->b;
- struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
- struct si_context *sctx = (struct si_context*)ctx;
- uint64_t max_alloc_size;
- unsigned i, iterations, num_partial_copies, max_tex_side;
- unsigned num_pass = 0, num_fail = 0;
-
- max_tex_side = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_SIZE);
-
- /* Max 128 MB allowed for both textures. */
- max_alloc_size = 128 * 1024 * 1024;
-
- /* the seed for random test parameters */
- srand(0x9b47d95b);
- /* the seed for random pixel data */
- s_rand_xorshift128plus(seed_xorshift128plus, false);
-
- iterations = 1000000000; /* just kill it when you are bored */
- num_partial_copies = 30;
-
- /* These parameters are randomly generated per test:
- * - whether to do one whole-surface copy or N partial copies per test
- * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D)
- * - which texture dimensions to use
- * - whether to use VRAM (all tiling modes) and GTT (staging, linear
- * only) allocations
- * - random initial pixels in src
- * - generate random subrectangle copies for partial blits
- */
- for (i = 0; i < iterations; i++) {
- struct pipe_resource tsrc = {}, tdst = {}, *src, *dst;
- struct si_texture *sdst;
- struct si_texture *ssrc;
- struct cpu_texture src_cpu, dst_cpu;
- unsigned max_width, max_height, max_depth, j, num;
- unsigned gfx_blits = 0, dma_blits = 0, cs_blits = 0, max_tex_side_gen;
- unsigned max_tex_layers;
- bool pass;
- bool do_partial_copies = rand() & 1;
-
- /* generate a random test case */
- tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY;
- tsrc.depth0 = tdst.depth0 = 1;
-
- tsrc.format = tdst.format = choose_format();
-
- max_tex_side_gen = generate_max_tex_side(max_tex_side);
- max_tex_layers = rand() % 4 ? 1 : 5;
-
- tsrc.width0 = (rand() % max_tex_side_gen) + 1;
- tsrc.height0 = (rand() % max_tex_side_gen) + 1;
- tsrc.array_size = (rand() % max_tex_layers) + 1;
-
- if (tsrc.format == PIPE_FORMAT_G8R8_B8R8_UNORM)
- tsrc.width0 = align(tsrc.width0, 2);
-
- /* Have a 1/4 chance of getting power-of-two dimensions. */
- if (rand() % 4 == 0) {
- tsrc.width0 = util_next_power_of_two(tsrc.width0);
- tsrc.height0 = util_next_power_of_two(tsrc.height0);
- }
-
- if (!do_partial_copies) {
- /* whole-surface copies only, same dimensions */
- tdst = tsrc;
- } else {
- max_tex_side_gen = generate_max_tex_side(max_tex_side);
- max_tex_layers = rand() % 4 ? 1 : 5;
-
- /* many partial copies, dimensions can be different */
- tdst.width0 = (rand() % max_tex_side_gen) + 1;
- tdst.height0 = (rand() % max_tex_side_gen) + 1;
- tdst.array_size = (rand() % max_tex_layers) + 1;
-
- /* Have a 1/4 chance of getting power-of-two dimensions. */
- if (rand() % 4 == 0) {
- tdst.width0 = util_next_power_of_two(tdst.width0);
- tdst.height0 = util_next_power_of_two(tdst.height0);
- }
- }
-
- /* check texture sizes */
- if ((uint64_t) util_format_get_nblocks(tsrc.format, tsrc.width0, tsrc.height0)
- * tsrc.array_size * util_format_get_blocksize(tsrc.format) +
- (uint64_t) util_format_get_nblocks(tdst.format, tdst.width0, tdst.height0)
- * tdst.array_size * util_format_get_blocksize(tdst.format) >
- max_alloc_size) {
- /* too large, try again */
- i--;
- continue;
- }
-
- /* VRAM + the tiling mode depends on dimensions (3/4 of cases),
- * or GTT + linear only (1/4 of cases)
- */
- tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
- tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
-
- /* Allocate textures (both the GPU and CPU copies).
- * The CPU will emulate what the GPU should be doing.
- */
- src = screen->resource_create(screen, &tsrc);
- dst = screen->resource_create(screen, &tdst);
- assert(src);
- assert(dst);
- sdst = (struct si_texture*)dst;
- ssrc = (struct si_texture*)src;
- alloc_cpu_texture(&src_cpu, &tsrc);
- alloc_cpu_texture(&dst_cpu, &tdst);
-
- printf("%4u: dst = (%5u x %5u x %u, %s), "
- " src = (%5u x %5u x %u, %s), format = %s, ",
- i, tdst.width0, tdst.height0, tdst.array_size,
- array_mode_to_string(sscreen, &sdst->surface),
- tsrc.width0, tsrc.height0, tsrc.array_size,
- array_mode_to_string(sscreen, &ssrc->surface),
- util_format_description(tsrc.format)->name);
- fflush(stdout);
-
- /* set src pixels */
- set_random_pixels(ctx, src, &src_cpu);
-
- /* clear dst pixels */
- uint32_t zero = 0;
- si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4,
- SI_COHERENCY_SHADER, false);
- memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
-
- /* preparation */
- max_width = MIN2(tsrc.width0, tdst.width0);
- max_height = MIN2(tsrc.height0, tdst.height0);
- max_depth = MIN2(tsrc.array_size, tdst.array_size);
-
- num = do_partial_copies ? num_partial_copies : 1;
- for (j = 0; j < num; j++) {
- int width, height, depth;
- int srcx, srcy, srcz, dstx, dsty, dstz;
- struct pipe_box box;
- unsigned old_num_draw_calls = sctx->num_draw_calls;
- unsigned old_num_dma_calls = sctx->num_dma_calls;
- unsigned old_num_cs_calls = sctx->num_compute_calls;
-
- if (!do_partial_copies) {
- /* copy whole src to dst */
- width = max_width;
- height = max_height;
- depth = max_depth;
-
- srcx = srcy = srcz = dstx = dsty = dstz = 0;
- } else {
- /* random sub-rectangle copies from src to dst */
- depth = (rand() % max_depth) + 1;
- srcz = rand() % (tsrc.array_size - depth + 1);
- dstz = rand() % (tdst.array_size - depth + 1);
-
- /* special code path to hit the tiled partial copies */
- if (!ssrc->surface.is_linear &&
- !sdst->surface.is_linear &&
- rand() & 1) {
- if (max_width < 8 || max_height < 8)
- continue;
- width = ((rand() % (max_width / 8)) + 1) * 8;
- height = ((rand() % (max_height / 8)) + 1) * 8;
-
- srcx = rand() % (tsrc.width0 - width + 1) & ~0x7;
- srcy = rand() % (tsrc.height0 - height + 1) & ~0x7;
-
- dstx = rand() % (tdst.width0 - width + 1) & ~0x7;
- dsty = rand() % (tdst.height0 - height + 1) & ~0x7;
- } else {
- /* just make sure that it doesn't divide by zero */
- assert(max_width > 0 && max_height > 0);
-
- width = (rand() % max_width) + 1;
- height = (rand() % max_height) + 1;
-
- srcx = rand() % (tsrc.width0 - width + 1);
- srcy = rand() % (tsrc.height0 - height + 1);
-
- dstx = rand() % (tdst.width0 - width + 1);
- dsty = rand() % (tdst.height0 - height + 1);
- }
-
- /* special code path to hit out-of-bounds reads in L2T */
- if (ssrc->surface.is_linear &&
- !sdst->surface.is_linear &&
- rand() % 4 == 0) {
- srcx = 0;
- srcy = 0;
- srcz = 0;
- }
- }
-
- /* GPU copy */
- u_box_3d(srcx, srcy, srcz, width, height, depth, &box);
- sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box);
-
- /* See which engine was used. */
- gfx_blits += sctx->num_draw_calls > old_num_draw_calls;
- dma_blits += sctx->num_dma_calls > old_num_dma_calls;
- cs_blits += sctx->num_compute_calls > old_num_cs_calls;
-
- /* CPU copy */
- util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride,
- dst_cpu.layer_stride,
- dstx, dsty, dstz, width, height, depth,
- src_cpu.ptr, src_cpu.stride,
- src_cpu.layer_stride,
- srcx, srcy, srcz);
- }
-
- pass = compare_textures(ctx, dst, &dst_cpu);
- if (pass)
- num_pass++;
- else
- num_fail++;
-
- printf("BLITs: GFX = %2u, DMA = %2u, CS = %2u, %s [%u/%u]\n",
- gfx_blits, dma_blits, cs_blits, pass ? "pass" : "fail",
- num_pass, num_pass+num_fail);
-
- /* cleanup */
- pipe_resource_reference(&src, NULL);
- pipe_resource_reference(&dst, NULL);
- free(src_cpu.ptr);
- free(dst_cpu.ptr);
- }
-
- ctx->destroy(ctx);
- exit(0);
+ struct pipe_screen *screen = &sscreen->b;
+ struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
+ struct si_context *sctx = (struct si_context *)ctx;
+ uint64_t max_alloc_size;
+ unsigned i, iterations, num_partial_copies, max_tex_side;
+ unsigned num_pass = 0, num_fail = 0;
+
+ max_tex_side = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_SIZE);
+
+ /* Max 128 MB allowed for both textures. */
+ max_alloc_size = 128 * 1024 * 1024;
+
+ /* the seed for random test parameters */
+ srand(0x9b47d95b);
+ /* the seed for random pixel data */
+ s_rand_xorshift128plus(seed_xorshift128plus, false);
+
+ iterations = 1000000000; /* just kill it when you are bored */
+ num_partial_copies = 30;
+
+ /* These parameters are randomly generated per test:
+ * - whether to do one whole-surface copy or N partial copies per test
+ * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D)
+ * - which texture dimensions to use
+ * - whether to use VRAM (all tiling modes) and GTT (staging, linear
+ * only) allocations
+ * - random initial pixels in src
+ * - generate random subrectangle copies for partial blits
+ */
+ for (i = 0; i < iterations; i++) {
+ struct pipe_resource tsrc = {}, tdst = {}, *src, *dst;
+ struct si_texture *sdst;
+ struct si_texture *ssrc;
+ struct cpu_texture src_cpu, dst_cpu;
+ unsigned max_width, max_height, max_depth, j, num;
+ unsigned gfx_blits = 0, dma_blits = 0, cs_blits = 0, max_tex_side_gen;
+ unsigned max_tex_layers;
+ bool pass;
+ bool do_partial_copies = rand() & 1;
+
+ /* generate a random test case */
+ tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY;
+ tsrc.depth0 = tdst.depth0 = 1;
+
+ tsrc.format = tdst.format = choose_format();
+
+ max_tex_side_gen = generate_max_tex_side(max_tex_side);
+ max_tex_layers = rand() % 4 ? 1 : 5;
+
+ tsrc.width0 = (rand() % max_tex_side_gen) + 1;
+ tsrc.height0 = (rand() % max_tex_side_gen) + 1;
+ tsrc.array_size = (rand() % max_tex_layers) + 1;
+
+ if (tsrc.format == PIPE_FORMAT_G8R8_B8R8_UNORM)
+ tsrc.width0 = align(tsrc.width0, 2);
+
+ /* Have a 1/4 chance of getting power-of-two dimensions. */
+ if (rand() % 4 == 0) {
+ tsrc.width0 = util_next_power_of_two(tsrc.width0);
+ tsrc.height0 = util_next_power_of_two(tsrc.height0);
+ }
+
+ if (!do_partial_copies) {
+ /* whole-surface copies only, same dimensions */
+ tdst = tsrc;
+ } else {
+ max_tex_side_gen = generate_max_tex_side(max_tex_side);
+ max_tex_layers = rand() % 4 ? 1 : 5;
+
+ /* many partial copies, dimensions can be different */
+ tdst.width0 = (rand() % max_tex_side_gen) + 1;
+ tdst.height0 = (rand() % max_tex_side_gen) + 1;
+ tdst.array_size = (rand() % max_tex_layers) + 1;
+
+ /* Have a 1/4 chance of getting power-of-two dimensions. */
+ if (rand() % 4 == 0) {
+ tdst.width0 = util_next_power_of_two(tdst.width0);
+ tdst.height0 = util_next_power_of_two(tdst.height0);
+ }
+ }
+
+ /* check texture sizes */
+ if ((uint64_t)util_format_get_nblocks(tsrc.format, tsrc.width0, tsrc.height0) *
+ tsrc.array_size * util_format_get_blocksize(tsrc.format) +
+ (uint64_t)util_format_get_nblocks(tdst.format, tdst.width0, tdst.height0) *
+ tdst.array_size * util_format_get_blocksize(tdst.format) >
+ max_alloc_size) {
+ /* too large, try again */
+ i--;
+ continue;
+ }
+
+ /* VRAM + the tiling mode depends on dimensions (3/4 of cases),
+ * or GTT + linear only (1/4 of cases)
+ */
+ tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
+ tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
+
+ /* Allocate textures (both the GPU and CPU copies).
+ * The CPU will emulate what the GPU should be doing.
+ */
+ src = screen->resource_create(screen, &tsrc);
+ dst = screen->resource_create(screen, &tdst);
+ assert(src);
+ assert(dst);
+ sdst = (struct si_texture *)dst;
+ ssrc = (struct si_texture *)src;
+ alloc_cpu_texture(&src_cpu, &tsrc);
+ alloc_cpu_texture(&dst_cpu, &tdst);
+
+ printf("%4u: dst = (%5u x %5u x %u, %s), "
+ " src = (%5u x %5u x %u, %s), format = %s, ",
+ i, tdst.width0, tdst.height0, tdst.array_size,
+ array_mode_to_string(sscreen, &sdst->surface), tsrc.width0, tsrc.height0,
+ tsrc.array_size, array_mode_to_string(sscreen, &ssrc->surface),
+ util_format_description(tsrc.format)->name);
+ fflush(stdout);
+
+ /* set src pixels */
+ set_random_pixels(ctx, src, &src_cpu);
+
+ /* clear dst pixels */
+ uint32_t zero = 0;
+ si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4, SI_COHERENCY_SHADER, false);
+ memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
+
+ /* preparation */
+ max_width = MIN2(tsrc.width0, tdst.width0);
+ max_height = MIN2(tsrc.height0, tdst.height0);
+ max_depth = MIN2(tsrc.array_size, tdst.array_size);
+
+ num = do_partial_copies ? num_partial_copies : 1;
+ for (j = 0; j < num; j++) {
+ int width, height, depth;
+ int srcx, srcy, srcz, dstx, dsty, dstz;
+ struct pipe_box box;
+ unsigned old_num_draw_calls = sctx->num_draw_calls;
+ unsigned old_num_dma_calls = sctx->num_dma_calls;
+ unsigned old_num_cs_calls = sctx->num_compute_calls;
+
+ if (!do_partial_copies) {
+ /* copy whole src to dst */
+ width = max_width;
+ height = max_height;
+ depth = max_depth;
+
+ srcx = srcy = srcz = dstx = dsty = dstz = 0;
+ } else {
+ /* random sub-rectangle copies from src to dst */
+ depth = (rand() % max_depth) + 1;
+ srcz = rand() % (tsrc.array_size - depth + 1);
+ dstz = rand() % (tdst.array_size - depth + 1);
+
+ /* special code path to hit the tiled partial copies */
+ if (!ssrc->surface.is_linear && !sdst->surface.is_linear && rand() & 1) {
+ if (max_width < 8 || max_height < 8)
+ continue;
+ width = ((rand() % (max_width / 8)) + 1) * 8;
+ height = ((rand() % (max_height / 8)) + 1) * 8;
+
+ srcx = rand() % (tsrc.width0 - width + 1) & ~0x7;
+ srcy = rand() % (tsrc.height0 - height + 1) & ~0x7;
+
+ dstx = rand() % (tdst.width0 - width + 1) & ~0x7;
+ dsty = rand() % (tdst.height0 - height + 1) & ~0x7;
+ } else {
+ /* just make sure that it doesn't divide by zero */
+ assert(max_width > 0 && max_height > 0);
+
+ width = (rand() % max_width) + 1;
+ height = (rand() % max_height) + 1;
+
+ srcx = rand() % (tsrc.width0 - width + 1);
+ srcy = rand() % (tsrc.height0 - height + 1);
+
+ dstx = rand() % (tdst.width0 - width + 1);
+ dsty = rand() % (tdst.height0 - height + 1);
+ }
+
+ /* special code path to hit out-of-bounds reads in L2T */
+ if (ssrc->surface.is_linear && !sdst->surface.is_linear && rand() % 4 == 0) {
+ srcx = 0;
+ srcy = 0;
+ srcz = 0;
+ }
+ }
+
+ /* GPU copy */
+ u_box_3d(srcx, srcy, srcz, width, height, depth, &box);
+ sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box);
+
+ /* See which engine was used. */
+ gfx_blits += sctx->num_draw_calls > old_num_draw_calls;
+ dma_blits += sctx->num_dma_calls > old_num_dma_calls;
+ cs_blits += sctx->num_compute_calls > old_num_cs_calls;
+
+ /* CPU copy */
+ util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride, dst_cpu.layer_stride, dstx, dsty,
+ dstz, width, height, depth, src_cpu.ptr, src_cpu.stride,
+ src_cpu.layer_stride, srcx, srcy, srcz);
+ }
+
+ pass = compare_textures(ctx, dst, &dst_cpu);
+ if (pass)
+ num_pass++;
+ else
+ num_fail++;
+
+ printf("BLITs: GFX = %2u, DMA = %2u, CS = %2u, %s [%u/%u]\n", gfx_blits, dma_blits, cs_blits,
+ pass ? "pass" : "fail", num_pass, num_pass + num_fail);
+
+ /* cleanup */
+ pipe_resource_reference(&src, NULL);
+ pipe_resource_reference(&dst, NULL);
+ free(src_cpu.ptr);
+ free(dst_cpu.ptr);
+ }
+
+ ctx->destroy(ctx);
+ exit(0);
}
#include "si_pipe.h"
#include "si_query.h"
-#define MIN_SIZE 512
-#define MAX_SIZE (128 * 1024 * 1024)
-#define SIZE_SHIFT 1
-#define NUM_RUNS 128
+#define MIN_SIZE 512
+#define MAX_SIZE (128 * 1024 * 1024)
+#define SIZE_SHIFT 1
+#define NUM_RUNS 128
static double get_MBps_rate(unsigned num_bytes, unsigned ns)
{
- return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
+ return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
}
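/* Editor's note -- a quick sanity check of the formula above, not part of
 * this patch (it would have to be compiled alongside get_MBps_rate):
 * transferring 64 MB in 50 ms should report 1280 MB/s. */
#include <assert.h>
#include <math.h>

static void check_mbps_rate(void)
{
   /* 64 MB / 0.05 s = 1280 MB/s. */
   double rate = get_MBps_rate(64 * 1024 * 1024, 50 * 1000 * 1000);

   assert(fabs(rate - 1280.0) < 1e-6);
}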
void si_test_dma_perf(struct si_screen *sscreen)
{
- struct pipe_screen *screen = &sscreen->b;
- struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
- struct si_context *sctx = (struct si_context*)ctx;
- const uint32_t clear_value = 0x12345678;
- static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
- static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
+ struct pipe_screen *screen = &sscreen->b;
+ struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
+ struct si_context *sctx = (struct si_context *)ctx;
+ const uint32_t clear_value = 0x12345678;
+ static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
+ static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
-#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
-
- static const char *method_str[] = {
- "CP MC ",
- "CP L2 ",
- "CP L2 ",
- "SDMA ",
- };
- static const char *placement_str[] = {
- /* Clear */
- "fill->VRAM",
- "fill->GTT ",
- /* Copy */
- "VRAM->VRAM",
- "VRAM->GTT ",
- "GTT ->VRAM",
- };
-
- printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
- printf("Heap ,Method ,L2p,Wa,");
- for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
- if (size >= 1024)
- printf("%6uKB,", size / 1024);
- else
- printf(" %6uB,", size);
- }
- printf("\n");
-
- /* results[log2(size)][placement][method][] */
- struct si_result {
- bool is_valid;
- bool is_cp;
- bool is_sdma;
- bool is_cs;
- unsigned cache_policy;
- unsigned dwords_per_thread;
- unsigned waves_per_sh;
- unsigned score;
- unsigned index; /* index in results[x][y][index] */
- } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
-
- /* Run benchmarks. */
- for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
- bool is_copy = placement >= 2;
-
- printf("-----------,--------,---,--,");
- for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
- printf("--------,");
- printf("\n");
-
- for (unsigned method = 0; method < NUM_METHODS; method++) {
- bool test_cp = method <= 2;
- bool test_sdma = method == 3;
- bool test_cs = method >= 4;
- unsigned cs_method = method - 4;
- STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
- unsigned cs_waves_per_sh =
- test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0;
- cs_method %= 2*NUM_SHADERS;
- unsigned cache_policy = test_cp ? method % 3 :
- test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
- unsigned cs_dwords_per_thread =
- test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
-
- if (test_sdma && !sctx->sdma_cs)
- continue;
-
- if (sctx->chip_class == GFX6) {
- /* GFX6 doesn't support CP DMA operations through L2. */
- if (test_cp && cache_policy != L2_BYPASS)
- continue;
- /* WAVES_PER_SH is in multiples of 16 on GFX6. */
- if (test_cs && cs_waves_per_sh % 16 != 0)
- continue;
- }
-
- printf("%s ,", placement_str[placement]);
- if (test_cs) {
- printf("CS x%-4u,%3s,", cs_dwords_per_thread,
- cache_policy == L2_LRU ? "LRU" :
- cache_policy == L2_STREAM ? "Str" : "");
- } else {
- printf("%s,%3s,", method_str[method],
- method == L2_LRU ? "LRU" :
- method == L2_STREAM ? "Str" : "");
- }
- if (test_cs && cs_waves_per_sh)
- printf("%2u,", cs_waves_per_sh);
- else
- printf(" ,");
-
- double score = 0;
- for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
- /* Don't test bigger sizes if it's too slow. Print 0. */
- if (size >= 512*1024 &&
- score < 400 * (size / (4*1024*1024))) {
- printf("%7.0f ,", 0.0);
- continue;
- }
-
- enum pipe_resource_usage dst_usage, src_usage;
- struct pipe_resource *dst, *src;
- struct pipe_query *q[NUM_RUNS];
- unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
-
- if (test_sdma) {
- if (sctx->chip_class == GFX6)
- query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
- else
- query_type = SI_QUERY_TIME_ELAPSED_SDMA;
- }
-
- if (placement == 0 || placement == 2 || placement == 4)
- dst_usage = PIPE_USAGE_DEFAULT;
- else
- dst_usage = PIPE_USAGE_STREAM;
-
- if (placement == 2 || placement == 3)
- src_usage = PIPE_USAGE_DEFAULT;
- else
- src_usage = PIPE_USAGE_STREAM;
-
- dst = pipe_buffer_create(screen, 0, dst_usage, size);
- src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
-
- /* Run tests. */
- for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
- q[iter] = ctx->create_query(ctx, query_type, 0);
- ctx->begin_query(ctx, q[iter]);
-
- if (test_cp) {
- /* CP DMA */
- if (is_copy) {
- si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0,
- SI_COHERENCY_NONE, cache_policy);
- } else {
- si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size,
- clear_value, 0,
- SI_COHERENCY_NONE, cache_policy);
- }
- } else if (test_sdma) {
- /* SDMA */
- if (is_copy) {
- si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
- } else {
- si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
- }
- } else {
- /* Compute */
- /* The memory accesses are coalesced, meaning that the 1st instruction writes
- * the 1st contiguous block of data for the whole wave, the 2nd instruction
- * writes the 2nd contiguous block of data, etc.
- */
- unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
- unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
- unsigned dwords_per_wave = cs_dwords_per_thread * 64;
-
- unsigned num_dwords = size / 4;
- unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
-
- void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
- cache_policy == L2_STREAM, is_copy);
-
- struct pipe_grid_info info = {};
- info.block[0] = MIN2(64, num_instructions);
- info.block[1] = 1;
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
- info.grid[1] = 1;
- info.grid[2] = 1;
-
- struct pipe_shader_buffer sb[2] = {};
- sb[0].buffer = dst;
- sb[0].buffer_size = size;
-
- if (is_copy) {
- sb[1].buffer = src;
- sb[1].buffer_size = size;
- } else {
- for (unsigned i = 0; i < 4; i++)
- sctx->cs_user_data[i] = clear_value;
- }
-
- sctx->flags |= SI_CONTEXT_INV_VCACHE |
- SI_CONTEXT_INV_SCACHE;
-
- ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0,
- is_copy ? 2 : 1, sb, 0x1);
- ctx->bind_compute_state(ctx, cs);
- sctx->cs_max_waves_per_sh = cs_waves_per_sh;
-
- ctx->launch_grid(ctx, &info);
-
- ctx->bind_compute_state(ctx, NULL);
- ctx->delete_compute_state(ctx, cs);
- sctx->cs_max_waves_per_sh = 0; /* disable the limit */
-
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
- }
-
- /* Flush L2, so that we don't just test L2 cache performance. */
- if (!test_sdma) {
- sctx->flags |= SI_CONTEXT_WB_L2;
- sctx->emit_cache_flush(sctx);
- }
-
- ctx->end_query(ctx, q[iter]);
- ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
- }
- pipe_resource_reference(&dst, NULL);
- pipe_resource_reference(&src, NULL);
-
- /* Get results. */
- uint64_t min = ~0ull, max = 0, total = 0;
-
- for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
- union pipe_query_result result;
-
- ctx->get_query_result(ctx, q[iter], true, &result);
- ctx->destroy_query(ctx, q[iter]);
-
- min = MIN2(min, result.u64);
- max = MAX2(max, result.u64);
- total += result.u64;
- }
-
- score = get_MBps_rate(size, total / (double)NUM_RUNS);
- printf("%7.0f ,", score);
- fflush(stdout);
-
- struct si_result *r = &results[util_logbase2(size)][placement][method];
- r->is_valid = true;
- r->is_cp = test_cp;
- r->is_sdma = test_sdma;
- r->is_cs = test_cs;
- r->cache_policy = cache_policy;
- r->dwords_per_thread = cs_dwords_per_thread;
- r->waves_per_sh = cs_waves_per_sh;
- r->score = score;
- r->index = method;
- }
- puts("");
- }
- }
-
- puts("");
- puts("static struct si_method");
- printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n",
- sctx->screen->info.name);
- puts("{");
- puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
-
- /* Analyze results and find the best methods. */
- for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
- if (placement == 0)
- puts(" if (dst == RADEON_DOMAIN_VRAM) {");
- else if (placement == 1)
- puts(" } else { /* GTT */");
- else if (placement == 2) {
- puts("}");
- puts("");
- puts("static struct si_method");
- printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
- sctx->screen->info.name);
- printf(" uint64_t size64, bool async, bool cached)\n");
- puts("{");
- puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
- puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
- } else if (placement == 3)
- puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
- else
- puts(" } else { /* GTT -> VRAM */");
-
- for (unsigned mode = 0; mode < 3; mode++) {
- bool async = mode == 0;
- bool cached = mode == 1;
-
- if (async)
- puts(" if (async) { /* SDMA or async compute */");
- else if (cached)
- puts(" if (cached) { /* gfx ring */");
- else
- puts(" } else { /* gfx ring - uncached */");
-
- /* The list of best chosen methods. */
- struct si_result *methods[32];
- unsigned method_max_size[32];
- unsigned num_methods = 0;
-
- for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
- /* Find the best method. */
- struct si_result *best = NULL;
-
- for (unsigned i = 0; i < NUM_METHODS; i++) {
- struct si_result *r = &results[util_logbase2(size)][placement][i];
-
- if (!r->is_valid)
- continue;
-
- /* Ban CP DMA clears via MC on <= GFX8. They are super slow
- * on GTT, which we can get due to BO evictions.
- */
- if (sctx->chip_class <= GFX8 && placement == 1 &&
- r->is_cp && r->cache_policy == L2_BYPASS)
- continue;
-
- if (async) {
- /* The following constraints for compute IBs try to limit
- * resource usage so as not to decrease the performance
- * of gfx IBs too much.
- */
-
- /* Don't use CP DMA on asynchronous rings, because
- * the engine is shared with gfx IBs.
- */
- if (r->is_cp)
- continue;
-
- /* Don't use L2 caching on asynchronous rings to minimize
- * L2 usage.
- */
- if (r->cache_policy == L2_LRU)
- continue;
-
- /* Asynchronous compute recommends waves_per_sh != 0
- * to limit CU usage. */
- if (r->is_cs && r->waves_per_sh == 0)
- continue;
- } else {
- /* SDMA is always asynchronous */
- if (r->is_sdma)
- continue;
-
- if (cached && r->cache_policy == L2_BYPASS)
- continue;
- if (!cached && r->cache_policy == L2_LRU)
- continue;
- }
-
- if (!best) {
- best = r;
- continue;
- }
-
- /* Assume some measurement error. Earlier methods occupy fewer
- * resources, so the next method is always more greedy, and we
- * don't want to select it due to a measurement error.
- */
- double min_improvement = 1.03;
-
- if (best->score * min_improvement < r->score)
- best = r;
- }
-
- if (num_methods > 0) {
- unsigned prev_index = num_methods - 1;
- struct si_result *prev = methods[prev_index];
- struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index];
-
- /* If the best one is also the best for the previous size,
- * just bump the size for the previous one.
- *
- * If there is no best, it means all methods were too slow
- * for this size and were not tested. Use the best one for
- * the previous size.
- */
- if (!best ||
- /* If it's the same method as for the previous size: */
- (prev->is_cp == best->is_cp &&
- prev->is_sdma == best->is_sdma &&
- prev->is_cs == best->is_cs &&
- prev->cache_policy == best->cache_policy &&
- prev->dwords_per_thread == best->dwords_per_thread &&
- prev->waves_per_sh == best->waves_per_sh) ||
- /* If the method for the previous size is also the best
- * for this size: */
- (prev_this_size->is_valid &&
- prev_this_size->score * 1.03 > best->score)) {
- method_max_size[prev_index] = size;
- continue;
- }
- }
-
- /* Add it to the list. */
- assert(num_methods < ARRAY_SIZE(methods));
- methods[num_methods] = best;
- method_max_size[num_methods] = size;
- num_methods++;
- }
-
- for (unsigned i = 0; i < num_methods; i++) {
- struct si_result *best = methods[i];
- unsigned size = method_max_size[i];
-
- /* The size threshold is between the current benchmarked
- * size and the next benchmarked size. */
- if (i < num_methods - 1)
- printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
- else if (i > 0)
- printf(" else ");
- else
- printf(" ");
- printf("return ");
-
- assert(best);
- if (best->is_cp) {
- printf("CP_DMA(%s);\n",
- best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
- best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM");
- }
- if (best->is_sdma)
- printf("SDMA;\n");
- if (best->is_cs) {
- printf("COMPUTE(%s, %u, %u);\n",
- best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM",
- best->dwords_per_thread,
- best->waves_per_sh);
- }
- }
- }
- puts(" }");
- }
- puts(" }");
- puts("}");
-
- ctx->destroy(ctx);
- exit(0);
+#define NUM_METHODS (4 + 2 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
+
+ static const char *method_str[] = {
+ "CP MC ",
+ "CP L2 ",
+ "CP L2 ",
+ "SDMA ",
+ };
+ static const char *placement_str[] = {
+ /* Clear */
+ "fill->VRAM",
+ "fill->GTT ",
+ /* Copy */
+ "VRAM->VRAM",
+ "VRAM->GTT ",
+ "GTT ->VRAM",
+ };
+
+ printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
+ printf("Heap ,Method ,L2p,Wa,");
+ for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+ if (size >= 1024)
+ printf("%6uKB,", size / 1024);
+ else
+ printf(" %6uB,", size);
+ }
+ printf("\n");
+
+ /* results[log2(size)][placement][method][] */
+ struct si_result {
+ bool is_valid;
+ bool is_cp;
+ bool is_sdma;
+ bool is_cs;
+ unsigned cache_policy;
+ unsigned dwords_per_thread;
+ unsigned waves_per_sh;
+ unsigned score;
+ unsigned index; /* index in results[x][y][index] */
+ } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
+
+ /* Run benchmarks. */
+ for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+ bool is_copy = placement >= 2;
+
+ printf("-----------,--------,---,--,");
+ for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
+ printf("--------,");
+ printf("\n");
+
+ for (unsigned method = 0; method < NUM_METHODS; method++) {
+ bool test_cp = method <= 2;
+ bool test_sdma = method == 3;
+ bool test_cs = method >= 4;
+ unsigned cs_method = method - 4;
+ STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
+ unsigned cs_waves_per_sh =
+ test_cs ? cs_waves_per_sh_list[cs_method / (2 * NUM_SHADERS)] : 0;
+ cs_method %= 2 * NUM_SHADERS;
+ unsigned cache_policy =
+ test_cp ? method % 3 : test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
+ unsigned cs_dwords_per_thread =
+ test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
+
+ if (test_sdma && !sctx->sdma_cs)
+ continue;
+
+ if (sctx->chip_class == GFX6) {
+ /* GFX6 doesn't support CP DMA operations through L2. */
+ if (test_cp && cache_policy != L2_BYPASS)
+ continue;
+ /* WAVES_PER_SH is in multiples of 16 on GFX6. */
+ if (test_cs && cs_waves_per_sh % 16 != 0)
+ continue;
+ }
+
+ printf("%s ,", placement_str[placement]);
+ if (test_cs) {
+ printf("CS x%-4u,%3s,", cs_dwords_per_thread,
+ cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
+ } else {
+ printf("%s,%3s,", method_str[method],
+ method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
+ }
+ if (test_cs && cs_waves_per_sh)
+ printf("%2u,", cs_waves_per_sh);
+ else
+ printf(" ,");
+
+ double score = 0;
+ for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+ /* Don't test bigger sizes if it's too slow. Print 0. */
+ if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
+ printf("%7.0f ,", 0.0);
+ continue;
+ }
+
+ enum pipe_resource_usage dst_usage, src_usage;
+ struct pipe_resource *dst, *src;
+ struct pipe_query *q[NUM_RUNS];
+ unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
+
+ if (test_sdma) {
+ if (sctx->chip_class == GFX6)
+ query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
+ else
+ query_type = SI_QUERY_TIME_ELAPSED_SDMA;
+ }
+
+ if (placement == 0 || placement == 2 || placement == 4)
+ dst_usage = PIPE_USAGE_DEFAULT;
+ else
+ dst_usage = PIPE_USAGE_STREAM;
+
+ if (placement == 2 || placement == 3)
+ src_usage = PIPE_USAGE_DEFAULT;
+ else
+ src_usage = PIPE_USAGE_STREAM;
+
+ dst = pipe_buffer_create(screen, 0, dst_usage, size);
+ src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
+
+ /* Run tests. */
+ for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+ q[iter] = ctx->create_query(ctx, query_type, 0);
+ ctx->begin_query(ctx, q[iter]);
+
+ if (test_cp) {
+ /* CP DMA */
+ if (is_copy) {
+ si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, SI_COHERENCY_NONE,
+ cache_policy);
+ } else {
+ si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, clear_value, 0,
+ SI_COHERENCY_NONE, cache_policy);
+ }
+ } else if (test_sdma) {
+ /* SDMA */
+ if (is_copy) {
+ si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
+ } else {
+ si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
+ }
+ } else {
+ /* Compute */
+ /* The memory accesses are coalesced, meaning that the 1st instruction writes
+ * the 1st contiguous block of data for the whole wave, the 2nd instruction
+ * writes the 2nd contiguous block of data, etc.
+ */
+ unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
+ unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
+ unsigned dwords_per_wave = cs_dwords_per_thread * 64;
+
+ unsigned num_dwords = size / 4;
+ unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+ void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
+ cache_policy == L2_STREAM, is_copy);
+
+ struct pipe_grid_info info = {};
+ info.block[0] = MIN2(64, num_instructions);
+ info.block[1] = 1;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+ info.grid[1] = 1;
+ info.grid[2] = 1;
+
+ struct pipe_shader_buffer sb[2] = {};
+ sb[0].buffer = dst;
+ sb[0].buffer_size = size;
+
+ if (is_copy) {
+ sb[1].buffer = src;
+ sb[1].buffer_size = size;
+ } else {
+ for (unsigned i = 0; i < 4; i++)
+ sctx->cs_user_data[i] = clear_value;
+ }
+
+ sctx->flags |= SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_SCACHE;
+
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
+ ctx->bind_compute_state(ctx, cs);
+ sctx->cs_max_waves_per_sh = cs_waves_per_sh;
+
+ ctx->launch_grid(ctx, &info);
+
+ ctx->bind_compute_state(ctx, NULL);
+ ctx->delete_compute_state(ctx, cs);
+ sctx->cs_max_waves_per_sh = 0; /* disable the limit */
+
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+ }
+
+ /* Flush L2, so that we don't just test L2 cache performance. */
+ if (!test_sdma) {
+ sctx->flags |= SI_CONTEXT_WB_L2;
+ sctx->emit_cache_flush(sctx);
+ }
+
+ ctx->end_query(ctx, q[iter]);
+ ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
+ }
+ pipe_resource_reference(&dst, NULL);
+ pipe_resource_reference(&src, NULL);
+
+ /* Get results. */
+ uint64_t min = ~0ull, max = 0, total = 0;
+
+ for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+ union pipe_query_result result;
+
+ ctx->get_query_result(ctx, q[iter], true, &result);
+ ctx->destroy_query(ctx, q[iter]);
+
+ min = MIN2(min, result.u64);
+ max = MAX2(max, result.u64);
+ total += result.u64;
+ }
+
+ score = get_MBps_rate(size, total / (double)NUM_RUNS);
+ printf("%7.0f ,", score);
+ fflush(stdout);
+
+ struct si_result *r = &results[util_logbase2(size)][placement][method];
+ r->is_valid = true;
+ r->is_cp = test_cp;
+ r->is_sdma = test_sdma;
+ r->is_cs = test_cs;
+ r->cache_policy = cache_policy;
+ r->dwords_per_thread = cs_dwords_per_thread;
+ r->waves_per_sh = cs_waves_per_sh;
+ r->score = score;
+ r->index = method;
+ }
+ puts("");
+ }
+ }
+
+ puts("");
+ puts("static struct si_method");
+ printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
+ "cached)\n",
+ sctx->screen->info.name);
+ puts("{");
+ puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
+
+ /* Analyze results and find the best methods. */
+ for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+ if (placement == 0)
+ puts(" if (dst == RADEON_DOMAIN_VRAM) {");
+ else if (placement == 1)
+ puts(" } else { /* GTT */");
+ else if (placement == 2) {
+ puts("}");
+ puts("");
+ puts("static struct si_method");
+ printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
+ sctx->screen->info.name);
+ printf(" uint64_t size64, bool async, bool cached)\n");
+ puts("{");
+ puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
+ puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
+ } else if (placement == 3)
+ puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
+ else
+ puts(" } else { /* GTT -> VRAM */");
+
+ for (unsigned mode = 0; mode < 3; mode++) {
+ bool async = mode == 0;
+ bool cached = mode == 1;
+
+ if (async)
+ puts(" if (async) { /* SDMA or async compute */");
+ else if (cached)
+ puts(" if (cached) { /* gfx ring */");
+ else
+ puts(" } else { /* gfx ring - uncached */");
+
+ /* The list of best chosen methods. */
+ struct si_result *methods[32];
+ unsigned method_max_size[32];
+ unsigned num_methods = 0;
+
+ for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+ /* Find the best method. */
+ struct si_result *best = NULL;
+
+ for (unsigned i = 0; i < NUM_METHODS; i++) {
+ struct si_result *r = &results[util_logbase2(size)][placement][i];
+
+ if (!r->is_valid)
+ continue;
+
+ /* Ban CP DMA clears via MC on <= GFX8. They are super slow
+ * on GTT, which we can get due to BO evictions.
+ */
+ if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp &&
+ r->cache_policy == L2_BYPASS)
+ continue;
+
+ if (async) {
+ /* The following constraints for compute IBs try to limit
+ * resource usage so as not to decrease the performance
+ * of gfx IBs too much.
+ */
+
+ /* Don't use CP DMA on asynchronous rings, because
+ * the engine is shared with gfx IBs.
+ */
+ if (r->is_cp)
+ continue;
+
+ /* Don't use L2 caching on asynchronous rings to minimize
+ * L2 usage.
+ */
+ if (r->cache_policy == L2_LRU)
+ continue;
+
+ /* Asynchronous compute recommends waves_per_sh != 0
+ * to limit CU usage. */
+ if (r->is_cs && r->waves_per_sh == 0)
+ continue;
+ } else {
+ /* SDMA is always asynchronous */
+ if (r->is_sdma)
+ continue;
+
+ if (cached && r->cache_policy == L2_BYPASS)
+ continue;
+ if (!cached && r->cache_policy == L2_LRU)
+ continue;
+ }
+
+ if (!best) {
+ best = r;
+ continue;
+ }
+
+ /* Assume some measurement error. Earlier methods occupy fewer
+ * resources, so the next method is always more greedy, and we
+ * don't want to select it due to a measurement error.
+ */
+ double min_improvement = 1.03;
+
+ if (best->score * min_improvement < r->score)
+ best = r;
+ }
+
+ if (num_methods > 0) {
+ unsigned prev_index = num_methods - 1;
+ struct si_result *prev = methods[prev_index];
+ struct si_result *prev_this_size =
+ &results[util_logbase2(size)][placement][prev->index];
+
+ /* If the best one is also the best for the previous size,
+ * just bump the size for the previous one.
+ *
+ * If there is no best, it means all methods were too slow
+ * for this size and were not tested. Use the best one for
+ * the previous size.
+ */
+ if (!best ||
+ /* If it's the same method as for the previous size: */
+ (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma &&
+ prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
+ prev->dwords_per_thread == best->dwords_per_thread &&
+ prev->waves_per_sh == best->waves_per_sh) ||
+ /* If the method for the previous size is also the best
+ * for this size: */
+ (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
+ method_max_size[prev_index] = size;
+ continue;
+ }
+ }
+
+ /* Add it to the list. */
+ assert(num_methods < ARRAY_SIZE(methods));
+ methods[num_methods] = best;
+ method_max_size[num_methods] = size;
+ num_methods++;
+ }
+
+ for (unsigned i = 0; i < num_methods; i++) {
+ struct si_result *best = methods[i];
+ unsigned size = method_max_size[i];
+
+ /* The size threshold is between the current benchmarked
+ * size and the next benchmarked size. */
+ if (i < num_methods - 1)
+ printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
+ else if (i > 0)
+ printf(" else ");
+ else
+ printf(" ");
+ printf("return ");
+
+ assert(best);
+ if (best->is_cp) {
+ printf("CP_DMA(%s);\n",
+ best->cache_policy == L2_BYPASS
+ ? "L2_BYPASS"
+ : best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM");
+ }
+ if (best->is_sdma)
+ printf("SDMA;\n");
+ if (best->is_cs) {
+ printf("COMPUTE(%s, %u, %u);\n",
+ best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM",
+ best->dwords_per_thread, best->waves_per_sh);
+ }
+ }
+ }
+ puts(" }");
+ }
+ puts(" }");
+ puts("}");
+
+ ctx->destroy(ctx);
+ exit(0);
}
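
For reference, the compute path in the benchmark above derives its dispatch size purely from the buffer size and the per-thread DWORD count. The following standalone sketch is not part of the patch; it only mirrors that arithmetic (assuming a 64-lane wave and the usual MAX2/MIN2/DIV_ROUND_UP semantics) and prints the resulting block/grid dimensions for a 1 MiB buffer at 16 DWORDs per thread.

#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
   unsigned cs_dwords_per_thread = 16; /* one entry of cs_dwords_per_thread_list */
   unsigned size = 1024 * 1024;        /* 1 MiB test buffer */

   unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);             /* 4 */
   unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread; /* 4 */
   unsigned dwords_per_wave = cs_dwords_per_thread * 64;                             /* 1024 */
   unsigned num_dwords = size / 4;                                                   /* 262144 */
   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);     /* 65536 */

   printf("block[0] = %u, grid[0] = %u\n", MIN2(64, num_instructions),
          DIV_ROUND_UP(num_dwords, dwords_per_wave)); /* block[0] = 64, grid[0] = 256 */
   return 0;
}
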
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
+#include "drm-uapi/drm_fourcc.h"
#include "si_pipe.h"
#include "si_query.h"
+#include "sid.h"
+#include "state_tracker/drm_driver.h"
#include "util/format/u_format.h"
+#include "util/os_time.h"
#include "util/u_log.h"
#include "util/u_memory.h"
#include "util/u_pack_color.h"
#include "util/u_resource.h"
#include "util/u_surface.h"
#include "util/u_transfer.h"
-#include "util/os_time.h"
+
#include <errno.h>
#include <inttypes.h>
-#include "state_tracker/drm_driver.h"
-#include "sid.h"
-#include "amd/addrlib/inc/addrinterface.h"
-#include "drm-uapi/drm_fourcc.h"
-static enum radeon_surf_mode
-si_choose_tiling(struct si_screen *sscreen,
- const struct pipe_resource *templ, bool tc_compatible_htile);
+#include "amd/addrlib/inc/addrinterface.h"
+static enum radeon_surf_mode si_choose_tiling(struct si_screen *sscreen,
+ const struct pipe_resource *templ,
+ bool tc_compatible_htile);
-bool si_prepare_for_dma_blit(struct si_context *sctx,
- struct si_texture *dst,
- unsigned dst_level, unsigned dstx,
- unsigned dsty, unsigned dstz,
- struct si_texture *src,
- unsigned src_level,
- const struct pipe_box *src_box)
+bool si_prepare_for_dma_blit(struct si_context *sctx, struct si_texture *dst, unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz, struct si_texture *src,
+ unsigned src_level, const struct pipe_box *src_box)
{
- if (!sctx->sdma_cs)
- return false;
-
- if (dst->surface.bpe != src->surface.bpe)
- return false;
-
- /* MSAA: Blits don't exist in the real world. */
- if (src->buffer.b.b.nr_samples > 1 ||
- dst->buffer.b.b.nr_samples > 1)
- return false;
-
- /* Depth-stencil surfaces:
- * When dst is linear, the DB->CB copy preserves HTILE.
- * When dst is tiled, the 3D path must be used to update HTILE.
- */
- if (src->is_depth || dst->is_depth)
- return false;
-
- /* DCC as:
- * src: Use the 3D path. DCC decompression is expensive.
- * dst: Use the 3D path to compress the pixels with DCC.
- */
- if (vi_dcc_enabled(src, src_level) ||
- vi_dcc_enabled(dst, dst_level))
- return false;
-
- /* CMASK as:
- * src: Both texture and SDMA paths need decompression. Use SDMA.
- * dst: If overwriting the whole texture, discard CMASK and use
- * SDMA. Otherwise, use the 3D path.
- */
- if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) {
- /* The CMASK clear is only enabled for the first level. */
- assert(dst_level == 0);
- if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level,
- dstx, dsty, dstz, src_box->width,
- src_box->height, src_box->depth))
- return false;
-
- si_texture_discard_cmask(sctx->screen, dst);
- }
-
- /* All requirements are met. Prepare textures for SDMA. */
- if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level))
- sctx->b.flush_resource(&sctx->b, &src->buffer.b.b);
-
- assert(!(src->dirty_level_mask & (1 << src_level)));
- assert(!(dst->dirty_level_mask & (1 << dst_level)));
-
- return true;
+ if (!sctx->sdma_cs)
+ return false;
+
+ if (dst->surface.bpe != src->surface.bpe)
+ return false;
+
+ /* MSAA: Blits don't exist in the real world. */
+ if (src->buffer.b.b.nr_samples > 1 || dst->buffer.b.b.nr_samples > 1)
+ return false;
+
+ /* Depth-stencil surfaces:
+ * When dst is linear, the DB->CB copy preserves HTILE.
+ * When dst is tiled, the 3D path must be used to update HTILE.
+ */
+ if (src->is_depth || dst->is_depth)
+ return false;
+
+ /* DCC as:
+ * src: Use the 3D path. DCC decompression is expensive.
+ * dst: Use the 3D path to compress the pixels with DCC.
+ */
+ if (vi_dcc_enabled(src, src_level) || vi_dcc_enabled(dst, dst_level))
+ return false;
+
+ /* CMASK as:
+ * src: Both texture and SDMA paths need decompression. Use SDMA.
+ * dst: If overwriting the whole texture, discard CMASK and use
+ * SDMA. Otherwise, use the 3D path.
+ */
+ if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) {
+ /* The CMASK clear is only enabled for the first level. */
+ assert(dst_level == 0);
+ if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level, dstx, dsty, dstz,
+ src_box->width, src_box->height, src_box->depth))
+ return false;
+
+ si_texture_discard_cmask(sctx->screen, dst);
+ }
+
+ /* All requirements are met. Prepare textures for SDMA. */
+ if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level))
+ sctx->b.flush_resource(&sctx->b, &src->buffer.b.b);
+
+ assert(!(src->dirty_level_mask & (1 << src_level)));
+ assert(!(dst->dirty_level_mask & (1 << dst_level)));
+
+ return true;
}
/* Same as resource_copy_region, except that both upsampling and downsampling are allowed. */
-static void si_copy_region_with_blit(struct pipe_context *pipe,
- struct pipe_resource *dst,
- unsigned dst_level,
- unsigned dstx, unsigned dsty, unsigned dstz,
- struct pipe_resource *src,
- unsigned src_level,
- const struct pipe_box *src_box)
+static void si_copy_region_with_blit(struct pipe_context *pipe, struct pipe_resource *dst,
+ unsigned dst_level, unsigned dstx, unsigned dsty,
+ unsigned dstz, struct pipe_resource *src, unsigned src_level,
+ const struct pipe_box *src_box)
{
- struct pipe_blit_info blit;
-
- memset(&blit, 0, sizeof(blit));
- blit.src.resource = src;
- blit.src.format = src->format;
- blit.src.level = src_level;
- blit.src.box = *src_box;
- blit.dst.resource = dst;
- blit.dst.format = dst->format;
- blit.dst.level = dst_level;
- blit.dst.box.x = dstx;
- blit.dst.box.y = dsty;
- blit.dst.box.z = dstz;
- blit.dst.box.width = src_box->width;
- blit.dst.box.height = src_box->height;
- blit.dst.box.depth = src_box->depth;
- blit.mask = util_format_get_mask(dst->format);
- blit.filter = PIPE_TEX_FILTER_NEAREST;
-
- if (blit.mask) {
- pipe->blit(pipe, &blit);
- }
+ struct pipe_blit_info blit;
+
+ memset(&blit, 0, sizeof(blit));
+ blit.src.resource = src;
+ blit.src.format = src->format;
+ blit.src.level = src_level;
+ blit.src.box = *src_box;
+ blit.dst.resource = dst;
+ blit.dst.format = dst->format;
+ blit.dst.level = dst_level;
+ blit.dst.box.x = dstx;
+ blit.dst.box.y = dsty;
+ blit.dst.box.z = dstz;
+ blit.dst.box.width = src_box->width;
+ blit.dst.box.height = src_box->height;
+ blit.dst.box.depth = src_box->depth;
+ blit.mask = util_format_get_mask(dst->format);
+ blit.filter = PIPE_TEX_FILTER_NEAREST;
+
+ if (blit.mask) {
+ pipe->blit(pipe, &blit);
+ }
}
/* Copy from a full GPU texture to a transfer's staging one. */
static void si_copy_to_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
- struct pipe_resource *dst = &stransfer->staging->b.b;
- struct pipe_resource *src = transfer->resource;
-
- if (src->nr_samples > 1 || ((struct si_texture*)src)->is_depth) {
- si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0,
- src, transfer->level, &transfer->box);
- return;
- }
-
- sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level,
- &transfer->box);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct pipe_transfer *transfer = (struct pipe_transfer *)stransfer;
+ struct pipe_resource *dst = &stransfer->staging->b.b;
+ struct pipe_resource *src = transfer->resource;
+
+ if (src->nr_samples > 1 || ((struct si_texture *)src)->is_depth) {
+ si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0, src, transfer->level, &transfer->box);
+ return;
+ }
+
+ sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level, &transfer->box);
}
/* Copy from a transfer's staging texture to a full GPU one. */
static void si_copy_from_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
- struct pipe_resource *dst = transfer->resource;
- struct pipe_resource *src = &stransfer->staging->b.b;
- struct pipe_box sbox;
-
- u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox);
-
- if (dst->nr_samples > 1 || ((struct si_texture*)dst)->is_depth) {
- si_copy_region_with_blit(ctx, dst, transfer->level,
- transfer->box.x, transfer->box.y, transfer->box.z,
- src, 0, &sbox);
- return;
- }
-
- if (util_format_is_compressed(dst->format)) {
- sbox.width = util_format_get_nblocksx(dst->format, sbox.width);
- sbox.height = util_format_get_nblocksx(dst->format, sbox.height);
- }
-
- sctx->dma_copy(ctx, dst, transfer->level,
- transfer->box.x, transfer->box.y, transfer->box.z,
- src, 0, &sbox);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct pipe_transfer *transfer = (struct pipe_transfer *)stransfer;
+ struct pipe_resource *dst = transfer->resource;
+ struct pipe_resource *src = &stransfer->staging->b.b;
+ struct pipe_box sbox;
+
+ u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox);
+
+ if (dst->nr_samples > 1 || ((struct si_texture *)dst)->is_depth) {
+ si_copy_region_with_blit(ctx, dst, transfer->level, transfer->box.x, transfer->box.y,
+ transfer->box.z, src, 0, &sbox);
+ return;
+ }
+
+ if (util_format_is_compressed(dst->format)) {
+ sbox.width = util_format_get_nblocksx(dst->format, sbox.width);
+      sbox.height = util_format_get_nblocksy(dst->format, sbox.height);
+ }
+
+ sctx->dma_copy(ctx, dst, transfer->level, transfer->box.x, transfer->box.y, transfer->box.z, src,
+ 0, &sbox);
}
-static unsigned si_texture_get_offset(struct si_screen *sscreen,
- struct si_texture *tex, unsigned level,
- const struct pipe_box *box,
- unsigned *stride,
- unsigned *layer_stride)
+static unsigned si_texture_get_offset(struct si_screen *sscreen, struct si_texture *tex,
+ unsigned level, const struct pipe_box *box, unsigned *stride,
+ unsigned *layer_stride)
{
- if (sscreen->info.chip_class >= GFX9) {
- *stride = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe;
- *layer_stride = tex->surface.u.gfx9.surf_slice_size;
-
- if (!box)
- return 0;
-
- /* Each texture is an array of slices. Each slice is an array
- * of mipmap levels. */
- return tex->surface.u.gfx9.surf_offset +
- box->z * tex->surface.u.gfx9.surf_slice_size +
- tex->surface.u.gfx9.offset[level] +
- (box->y / tex->surface.blk_h *
- tex->surface.u.gfx9.surf_pitch +
- box->x / tex->surface.blk_w) * tex->surface.bpe;
- } else {
- *stride = tex->surface.u.legacy.level[level].nblk_x *
- tex->surface.bpe;
- assert((uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX);
- *layer_stride = (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4;
-
- if (!box)
- return tex->surface.u.legacy.level[level].offset;
-
- /* Each texture is an array of mipmap levels. Each level is
- * an array of slices. */
- return tex->surface.u.legacy.level[level].offset +
- box->z * (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 +
- (box->y / tex->surface.blk_h *
- tex->surface.u.legacy.level[level].nblk_x +
- box->x / tex->surface.blk_w) * tex->surface.bpe;
- }
+ if (sscreen->info.chip_class >= GFX9) {
+ *stride = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe;
+ *layer_stride = tex->surface.u.gfx9.surf_slice_size;
+
+ if (!box)
+ return 0;
+
+ /* Each texture is an array of slices. Each slice is an array
+ * of mipmap levels. */
+ return tex->surface.u.gfx9.surf_offset + box->z * tex->surface.u.gfx9.surf_slice_size +
+ tex->surface.u.gfx9.offset[level] +
+ (box->y / tex->surface.blk_h * tex->surface.u.gfx9.surf_pitch +
+ box->x / tex->surface.blk_w) *
+ tex->surface.bpe;
+ } else {
+ *stride = tex->surface.u.legacy.level[level].nblk_x * tex->surface.bpe;
+ assert((uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX);
+ *layer_stride = (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4;
+
+ if (!box)
+ return tex->surface.u.legacy.level[level].offset;
+
+ /* Each texture is an array of mipmap levels. Each level is
+ * an array of slices. */
+ return tex->surface.u.legacy.level[level].offset +
+ box->z * (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 +
+ (box->y / tex->surface.blk_h * tex->surface.u.legacy.level[level].nblk_x +
+ box->x / tex->surface.blk_w) *
+ tex->surface.bpe;
+ }
}
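
To make the pre-GFX9 branch above concrete, here is a small worked sketch of the same offset formula. It is not part of the patch; the surface parameters (bpe, pitch, slice size, block size) are hypothetical placeholders, and only the arithmetic mirrors the code.

#include <stdint.h>

/* Hypothetical legacy surface: 32bpp, 1x1 blocks, 256 blocks per row,
 * 262144-byte slices, level offset 0. Mirrors the return expression above. */
static uint64_t example_legacy_offset(unsigned x, unsigned y, unsigned z)
{
   const unsigned bpe = 4, blk_w = 1, blk_h = 1, nblk_x = 256;
   const uint64_t level_offset = 0, slice_size = 262144;

   return level_offset + z * slice_size +
          ((uint64_t)(y / blk_h) * nblk_x + x / blk_w) * bpe;
}
/* example_legacy_offset(8, 2, 1) == 262144 + (2 * 256 + 8) * 4 == 264224 */
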
-static int si_init_surface(struct si_screen *sscreen,
- struct radeon_surf *surface,
- const struct pipe_resource *ptex,
- enum radeon_surf_mode array_mode,
- unsigned pitch_in_bytes_override,
- bool is_imported,
- bool is_scanout,
- bool is_flushed_depth,
- bool tc_compatible_htile)
+static int si_init_surface(struct si_screen *sscreen, struct radeon_surf *surface,
+ const struct pipe_resource *ptex, enum radeon_surf_mode array_mode,
+ unsigned pitch_in_bytes_override, bool is_imported, bool is_scanout,
+ bool is_flushed_depth, bool tc_compatible_htile)
{
- const struct util_format_description *desc =
- util_format_description(ptex->format);
- bool is_depth, is_stencil;
- int r;
- unsigned bpe, flags = 0;
-
- is_depth = util_format_has_depth(desc);
- is_stencil = util_format_has_stencil(desc);
-
- if (!is_flushed_depth &&
- ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
- bpe = 4; /* stencil is allocated separately */
- } else {
- bpe = util_format_get_blocksize(ptex->format);
- assert(util_is_power_of_two_or_zero(bpe));
- }
-
- if (!is_flushed_depth && is_depth) {
- flags |= RADEON_SURF_ZBUFFER;
-
- if (sscreen->debug_flags & DBG(NO_HYPERZ)) {
- flags |= RADEON_SURF_NO_HTILE;
- } else if (tc_compatible_htile &&
- (sscreen->info.chip_class >= GFX9 ||
- array_mode == RADEON_SURF_MODE_2D)) {
- /* TC-compatible HTILE only supports Z32_FLOAT.
- * GFX9 also supports Z16_UNORM.
- * On GFX8, promote Z16 to Z32. DB->CB copies will convert
- * the format for transfers.
- */
- if (sscreen->info.chip_class == GFX8)
- bpe = 4;
-
- flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
- }
-
- if (is_stencil)
- flags |= RADEON_SURF_SBUFFER;
- }
-
- if (sscreen->info.chip_class >= GFX8 &&
- (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC ||
- ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT ||
- (ptex->nr_samples >= 2 && !sscreen->dcc_msaa_allowed)))
- flags |= RADEON_SURF_DISABLE_DCC;
-
- /* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */
- if (sscreen->info.family == CHIP_STONEY &&
- bpe == 16 && ptex->nr_samples >= 2)
- flags |= RADEON_SURF_DISABLE_DCC;
-
- /* GFX8: DCC clear for 4x and 8x MSAA array textures unimplemented. */
- if (sscreen->info.chip_class == GFX8 &&
- ptex->nr_storage_samples >= 4 &&
- ptex->array_size > 1)
- flags |= RADEON_SURF_DISABLE_DCC;
-
- /* GFX9: DCC clear for 4x and 8x MSAA textures unimplemented. */
- if (sscreen->info.chip_class == GFX9 &&
- (ptex->nr_storage_samples >= 4 ||
- (sscreen->info.family == CHIP_RAVEN &&
- ptex->nr_storage_samples >= 2 && bpe < 4)))
- flags |= RADEON_SURF_DISABLE_DCC;
-
- /* TODO: GFX10: DCC causes corruption with MSAA. */
- if (sscreen->info.chip_class >= GFX10 &&
- ptex->nr_storage_samples >= 2)
- flags |= RADEON_SURF_DISABLE_DCC;
-
- /* Shared textures must always set up DCC.
- * If it's not present, it will be disabled by
- * si_get_opaque_metadata later.
- */
- if (!is_imported && (sscreen->debug_flags & DBG(NO_DCC)))
- flags |= RADEON_SURF_DISABLE_DCC;
-
- if (is_scanout) {
- /* This should catch bugs in gallium users setting incorrect flags. */
- assert(ptex->nr_samples <= 1 &&
- ptex->array_size == 1 &&
- ptex->depth0 == 1 &&
- ptex->last_level == 0 &&
- !(flags & RADEON_SURF_Z_OR_SBUFFER));
-
- flags |= RADEON_SURF_SCANOUT;
- }
-
- if (ptex->bind & PIPE_BIND_SHARED)
- flags |= RADEON_SURF_SHAREABLE;
- if (is_imported)
- flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE;
- if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING))
- flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE;
- if (sscreen->debug_flags & DBG(NO_FMASK))
- flags |= RADEON_SURF_NO_FMASK;
-
- if (sscreen->info.chip_class == GFX9 &&
- (ptex->flags & SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE)) {
- flags |= RADEON_SURF_FORCE_MICRO_TILE_MODE;
- surface->micro_tile_mode = SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(ptex->flags);
- }
-
- if (sscreen->info.chip_class >= GFX10 &&
- (ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) {
- flags |= RADEON_SURF_FORCE_SWIZZLE_MODE;
- surface->u.gfx9.surf.swizzle_mode = ADDR_SW_64KB_R_X;
- }
-
- r = sscreen->ws->surface_init(sscreen->ws, ptex, flags, bpe,
- array_mode, surface);
- if (r) {
- return r;
- }
-
- unsigned pitch = pitch_in_bytes_override / bpe;
-
- if (sscreen->info.chip_class >= GFX9) {
- if (pitch) {
- surface->u.gfx9.surf_pitch = pitch;
- if (ptex->last_level == 0)
- surface->u.gfx9.surf.epitch = pitch - 1;
- surface->u.gfx9.surf_slice_size =
- (uint64_t)pitch * surface->u.gfx9.surf_height * bpe;
- }
- } else {
- if (pitch) {
- surface->u.legacy.level[0].nblk_x = pitch;
- surface->u.legacy.level[0].slice_size_dw =
- ((uint64_t)pitch * surface->u.legacy.level[0].nblk_y * bpe) / 4;
- }
- }
- return 0;
+ const struct util_format_description *desc = util_format_description(ptex->format);
+ bool is_depth, is_stencil;
+ int r;
+ unsigned bpe, flags = 0;
+
+ is_depth = util_format_has_depth(desc);
+ is_stencil = util_format_has_stencil(desc);
+
+ if (!is_flushed_depth && ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
+ bpe = 4; /* stencil is allocated separately */
+ } else {
+ bpe = util_format_get_blocksize(ptex->format);
+ assert(util_is_power_of_two_or_zero(bpe));
+ }
+
+ if (!is_flushed_depth && is_depth) {
+ flags |= RADEON_SURF_ZBUFFER;
+
+ if (sscreen->debug_flags & DBG(NO_HYPERZ)) {
+ flags |= RADEON_SURF_NO_HTILE;
+ } else if (tc_compatible_htile &&
+ (sscreen->info.chip_class >= GFX9 || array_mode == RADEON_SURF_MODE_2D)) {
+ /* TC-compatible HTILE only supports Z32_FLOAT.
+ * GFX9 also supports Z16_UNORM.
+ * On GFX8, promote Z16 to Z32. DB->CB copies will convert
+ * the format for transfers.
+ */
+ if (sscreen->info.chip_class == GFX8)
+ bpe = 4;
+
+ flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
+ }
+
+ if (is_stencil)
+ flags |= RADEON_SURF_SBUFFER;
+ }
+
+ if (sscreen->info.chip_class >= GFX8 &&
+ (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC || ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT ||
+ (ptex->nr_samples >= 2 && !sscreen->dcc_msaa_allowed)))
+ flags |= RADEON_SURF_DISABLE_DCC;
+
+ /* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */
+ if (sscreen->info.family == CHIP_STONEY && bpe == 16 && ptex->nr_samples >= 2)
+ flags |= RADEON_SURF_DISABLE_DCC;
+
+ /* GFX8: DCC clear for 4x and 8x MSAA array textures unimplemented. */
+ if (sscreen->info.chip_class == GFX8 && ptex->nr_storage_samples >= 4 && ptex->array_size > 1)
+ flags |= RADEON_SURF_DISABLE_DCC;
+
+ /* GFX9: DCC clear for 4x and 8x MSAA textures unimplemented. */
+ if (sscreen->info.chip_class == GFX9 &&
+ (ptex->nr_storage_samples >= 4 ||
+ (sscreen->info.family == CHIP_RAVEN && ptex->nr_storage_samples >= 2 && bpe < 4)))
+ flags |= RADEON_SURF_DISABLE_DCC;
+
+ /* TODO: GFX10: DCC causes corruption with MSAA. */
+ if (sscreen->info.chip_class >= GFX10 && ptex->nr_storage_samples >= 2)
+ flags |= RADEON_SURF_DISABLE_DCC;
+
+ /* Shared textures must always set up DCC.
+ * If it's not present, it will be disabled by
+ * si_get_opaque_metadata later.
+ */
+ if (!is_imported && (sscreen->debug_flags & DBG(NO_DCC)))
+ flags |= RADEON_SURF_DISABLE_DCC;
+
+ if (is_scanout) {
+ /* This should catch bugs in gallium users setting incorrect flags. */
+ assert(ptex->nr_samples <= 1 && ptex->array_size == 1 && ptex->depth0 == 1 &&
+ ptex->last_level == 0 && !(flags & RADEON_SURF_Z_OR_SBUFFER));
+
+ flags |= RADEON_SURF_SCANOUT;
+ }
+
+ if (ptex->bind & PIPE_BIND_SHARED)
+ flags |= RADEON_SURF_SHAREABLE;
+ if (is_imported)
+ flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE;
+ if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING))
+ flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE;
+ if (sscreen->debug_flags & DBG(NO_FMASK))
+ flags |= RADEON_SURF_NO_FMASK;
+
+ if (sscreen->info.chip_class == GFX9 && (ptex->flags & SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE)) {
+ flags |= RADEON_SURF_FORCE_MICRO_TILE_MODE;
+ surface->micro_tile_mode = SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(ptex->flags);
+ }
+
+ if (sscreen->info.chip_class >= GFX10 && (ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) {
+ flags |= RADEON_SURF_FORCE_SWIZZLE_MODE;
+ surface->u.gfx9.surf.swizzle_mode = ADDR_SW_64KB_R_X;
+ }
+
+ r = sscreen->ws->surface_init(sscreen->ws, ptex, flags, bpe, array_mode, surface);
+ if (r) {
+ return r;
+ }
+
+ unsigned pitch = pitch_in_bytes_override / bpe;
+
+ if (sscreen->info.chip_class >= GFX9) {
+ if (pitch) {
+ surface->u.gfx9.surf_pitch = pitch;
+ if (ptex->last_level == 0)
+ surface->u.gfx9.surf.epitch = pitch - 1;
+ surface->u.gfx9.surf_slice_size = (uint64_t)pitch * surface->u.gfx9.surf_height * bpe;
+ }
+ } else {
+ if (pitch) {
+ surface->u.legacy.level[0].nblk_x = pitch;
+ surface->u.legacy.level[0].slice_size_dw =
+ ((uint64_t)pitch * surface->u.legacy.level[0].nblk_y * bpe) / 4;
+ }
+ }
+ return 0;
}
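
As a quick sanity check on the pitch-override handling above, the sketch below (not part of the patch, with made-up numbers) reproduces the GFX9 branch: a byte pitch override is converted to elements and the slice size is recomputed from it.

#include <stdint.h>

static uint64_t example_gfx9_override_slice_size(void)
{
   const unsigned bpe = 4;                        /* 32bpp */
   const unsigned pitch_in_bytes_override = 4096; /* hypothetical */
   const unsigned surf_height = 768;              /* hypothetical */

   unsigned pitch = pitch_in_bytes_override / bpe; /* 1024 elements; epitch = 1023 */
   return (uint64_t)pitch * surf_height * bpe;     /* 3145728 bytes */
}
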
-static void si_get_display_metadata(struct si_screen *sscreen,
- struct radeon_surf *surf,
- struct radeon_bo_metadata *metadata,
- enum radeon_surf_mode *array_mode,
- bool *is_scanout)
+static void si_get_display_metadata(struct si_screen *sscreen, struct radeon_surf *surf,
+ struct radeon_bo_metadata *metadata,
+ enum radeon_surf_mode *array_mode, bool *is_scanout)
{
- if (sscreen->info.chip_class >= GFX9) {
- if (metadata->u.gfx9.swizzle_mode > 0)
- *array_mode = RADEON_SURF_MODE_2D;
- else
- *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
-
- surf->u.gfx9.surf.swizzle_mode = metadata->u.gfx9.swizzle_mode;
- *is_scanout = metadata->u.gfx9.scanout;
-
- if (metadata->u.gfx9.dcc_offset_256B) {
- surf->u.gfx9.display_dcc_pitch_max = metadata->u.gfx9.dcc_pitch_max;
- assert(metadata->u.gfx9.dcc_independent_64B == 1);
- }
- } else {
- surf->u.legacy.pipe_config = metadata->u.legacy.pipe_config;
- surf->u.legacy.bankw = metadata->u.legacy.bankw;
- surf->u.legacy.bankh = metadata->u.legacy.bankh;
- surf->u.legacy.tile_split = metadata->u.legacy.tile_split;
- surf->u.legacy.mtilea = metadata->u.legacy.mtilea;
- surf->u.legacy.num_banks = metadata->u.legacy.num_banks;
-
- if (metadata->u.legacy.macrotile == RADEON_LAYOUT_TILED)
- *array_mode = RADEON_SURF_MODE_2D;
- else if (metadata->u.legacy.microtile == RADEON_LAYOUT_TILED)
- *array_mode = RADEON_SURF_MODE_1D;
- else
- *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
-
- *is_scanout = metadata->u.legacy.scanout;
- }
+ if (sscreen->info.chip_class >= GFX9) {
+ if (metadata->u.gfx9.swizzle_mode > 0)
+ *array_mode = RADEON_SURF_MODE_2D;
+ else
+ *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ surf->u.gfx9.surf.swizzle_mode = metadata->u.gfx9.swizzle_mode;
+ *is_scanout = metadata->u.gfx9.scanout;
+
+ if (metadata->u.gfx9.dcc_offset_256B) {
+ surf->u.gfx9.display_dcc_pitch_max = metadata->u.gfx9.dcc_pitch_max;
+ assert(metadata->u.gfx9.dcc_independent_64B == 1);
+ }
+ } else {
+ surf->u.legacy.pipe_config = metadata->u.legacy.pipe_config;
+ surf->u.legacy.bankw = metadata->u.legacy.bankw;
+ surf->u.legacy.bankh = metadata->u.legacy.bankh;
+ surf->u.legacy.tile_split = metadata->u.legacy.tile_split;
+ surf->u.legacy.mtilea = metadata->u.legacy.mtilea;
+ surf->u.legacy.num_banks = metadata->u.legacy.num_banks;
+
+ if (metadata->u.legacy.macrotile == RADEON_LAYOUT_TILED)
+ *array_mode = RADEON_SURF_MODE_2D;
+ else if (metadata->u.legacy.microtile == RADEON_LAYOUT_TILED)
+ *array_mode = RADEON_SURF_MODE_1D;
+ else
+ *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ *is_scanout = metadata->u.legacy.scanout;
+ }
}
-void si_eliminate_fast_color_clear(struct si_context *sctx,
- struct si_texture *tex)
+void si_eliminate_fast_color_clear(struct si_context *sctx, struct si_texture *tex)
{
- struct si_screen *sscreen = sctx->screen;
- struct pipe_context *ctx = &sctx->b;
+ struct si_screen *sscreen = sctx->screen;
+ struct pipe_context *ctx = &sctx->b;
- if (ctx == sscreen->aux_context)
- simple_mtx_lock(&sscreen->aux_context_lock);
+ if (ctx == sscreen->aux_context)
+ simple_mtx_lock(&sscreen->aux_context_lock);
- unsigned n = sctx->num_decompress_calls;
- ctx->flush_resource(ctx, &tex->buffer.b.b);
+ unsigned n = sctx->num_decompress_calls;
+ ctx->flush_resource(ctx, &tex->buffer.b.b);
- /* Flush only if any fast clear elimination took place. */
- if (n != sctx->num_decompress_calls)
- ctx->flush(ctx, NULL, 0);
+ /* Flush only if any fast clear elimination took place. */
+ if (n != sctx->num_decompress_calls)
+ ctx->flush(ctx, NULL, 0);
- if (ctx == sscreen->aux_context)
- simple_mtx_unlock(&sscreen->aux_context_lock);
+ if (ctx == sscreen->aux_context)
+ simple_mtx_unlock(&sscreen->aux_context_lock);
}
-void si_texture_discard_cmask(struct si_screen *sscreen,
- struct si_texture *tex)
+void si_texture_discard_cmask(struct si_screen *sscreen, struct si_texture *tex)
{
- if (!tex->cmask_buffer)
- return;
+ if (!tex->cmask_buffer)
+ return;
- assert(tex->buffer.b.b.nr_samples <= 1);
+ assert(tex->buffer.b.b.nr_samples <= 1);
- /* Disable CMASK. */
- tex->cmask_base_address_reg = tex->buffer.gpu_address >> 8;
- tex->dirty_level_mask = 0;
+ /* Disable CMASK. */
+ tex->cmask_base_address_reg = tex->buffer.gpu_address >> 8;
+ tex->dirty_level_mask = 0;
- tex->cb_color_info &= ~S_028C70_FAST_CLEAR(1);
+ tex->cb_color_info &= ~S_028C70_FAST_CLEAR(1);
- if (tex->cmask_buffer != &tex->buffer)
- si_resource_reference(&tex->cmask_buffer, NULL);
+ if (tex->cmask_buffer != &tex->buffer)
+ si_resource_reference(&tex->cmask_buffer, NULL);
- tex->cmask_buffer = NULL;
+ tex->cmask_buffer = NULL;
- /* Notify all contexts about the change. */
- p_atomic_inc(&sscreen->dirty_tex_counter);
- p_atomic_inc(&sscreen->compressed_colortex_counter);
+ /* Notify all contexts about the change. */
+ p_atomic_inc(&sscreen->dirty_tex_counter);
+ p_atomic_inc(&sscreen->compressed_colortex_counter);
}
static bool si_can_disable_dcc(struct si_texture *tex)
{
- /* We can't disable DCC if it can be written by another process. */
- return tex->surface.dcc_offset &&
- (!tex->buffer.b.is_shared ||
- !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE));
+ /* We can't disable DCC if it can be written by another process. */
+ return tex->surface.dcc_offset &&
+ (!tex->buffer.b.is_shared ||
+ !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE));
}
static void si_texture_zero_dcc_fields(struct si_texture *tex)
{
- tex->surface.dcc_offset = 0;
- tex->surface.display_dcc_offset = 0;
- tex->surface.dcc_retile_map_offset = 0;
+ tex->surface.dcc_offset = 0;
+ tex->surface.display_dcc_offset = 0;
+ tex->surface.dcc_retile_map_offset = 0;
}
-static bool si_texture_discard_dcc(struct si_screen *sscreen,
- struct si_texture *tex)
+static bool si_texture_discard_dcc(struct si_screen *sscreen, struct si_texture *tex)
{
- if (!si_can_disable_dcc(tex))
- return false;
+ if (!si_can_disable_dcc(tex))
+ return false;
- assert(tex->dcc_separate_buffer == NULL);
+ assert(tex->dcc_separate_buffer == NULL);
- /* Disable DCC. */
- si_texture_zero_dcc_fields(tex);
+ /* Disable DCC. */
+ si_texture_zero_dcc_fields(tex);
- /* Notify all contexts about the change. */
- p_atomic_inc(&sscreen->dirty_tex_counter);
- return true;
+ /* Notify all contexts about the change. */
+ p_atomic_inc(&sscreen->dirty_tex_counter);
+ return true;
}
/**
* \param sctx the current context if you have one, or sscreen->aux_context
* if you don't.
*/
-bool si_texture_disable_dcc(struct si_context *sctx,
- struct si_texture *tex)
+bool si_texture_disable_dcc(struct si_context *sctx, struct si_texture *tex)
{
- struct si_screen *sscreen = sctx->screen;
+ struct si_screen *sscreen = sctx->screen;
- if (!sctx->has_graphics)
- return si_texture_discard_dcc(sscreen, tex);
+ if (!sctx->has_graphics)
+ return si_texture_discard_dcc(sscreen, tex);
- if (!si_can_disable_dcc(tex))
- return false;
+ if (!si_can_disable_dcc(tex))
+ return false;
- if (&sctx->b == sscreen->aux_context)
- simple_mtx_lock(&sscreen->aux_context_lock);
+ if (&sctx->b == sscreen->aux_context)
+ simple_mtx_lock(&sscreen->aux_context_lock);
- /* Decompress DCC. */
- si_decompress_dcc(sctx, tex);
- sctx->b.flush(&sctx->b, NULL, 0);
+ /* Decompress DCC. */
+ si_decompress_dcc(sctx, tex);
+ sctx->b.flush(&sctx->b, NULL, 0);
- if (&sctx->b == sscreen->aux_context)
- simple_mtx_unlock(&sscreen->aux_context_lock);
+ if (&sctx->b == sscreen->aux_context)
+ simple_mtx_unlock(&sscreen->aux_context_lock);
- return si_texture_discard_dcc(sscreen, tex);
+ return si_texture_discard_dcc(sscreen, tex);
}
-static void si_reallocate_texture_inplace(struct si_context *sctx,
- struct si_texture *tex,
- unsigned new_bind_flag,
- bool invalidate_storage)
+static void si_reallocate_texture_inplace(struct si_context *sctx, struct si_texture *tex,
+ unsigned new_bind_flag, bool invalidate_storage)
{
- struct pipe_screen *screen = sctx->b.screen;
- struct si_texture *new_tex;
- struct pipe_resource templ = tex->buffer.b.b;
- unsigned i;
-
- templ.bind |= new_bind_flag;
-
- if (tex->buffer.b.is_shared || tex->num_planes > 1)
- return;
-
- if (new_bind_flag == PIPE_BIND_LINEAR) {
- if (tex->surface.is_linear)
- return;
-
- /* This fails with MSAA, depth, and compressed textures. */
- if (si_choose_tiling(sctx->screen, &templ, false) !=
- RADEON_SURF_MODE_LINEAR_ALIGNED)
- return;
- }
-
- new_tex = (struct si_texture*)screen->resource_create(screen, &templ);
- if (!new_tex)
- return;
-
- /* Copy the pixels to the new texture. */
- if (!invalidate_storage) {
- for (i = 0; i <= templ.last_level; i++) {
- struct pipe_box box;
-
- u_box_3d(0, 0, 0,
- u_minify(templ.width0, i), u_minify(templ.height0, i),
- util_num_layers(&templ, i), &box);
-
- sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0,
- &tex->buffer.b.b, i, &box);
- }
- }
-
- if (new_bind_flag == PIPE_BIND_LINEAR) {
- si_texture_discard_cmask(sctx->screen, tex);
- si_texture_discard_dcc(sctx->screen, tex);
- }
-
- /* Replace the structure fields of tex. */
- tex->buffer.b.b.bind = templ.bind;
- pb_reference(&tex->buffer.buf, new_tex->buffer.buf);
- tex->buffer.gpu_address = new_tex->buffer.gpu_address;
- tex->buffer.vram_usage = new_tex->buffer.vram_usage;
- tex->buffer.gart_usage = new_tex->buffer.gart_usage;
- tex->buffer.bo_size = new_tex->buffer.bo_size;
- tex->buffer.bo_alignment = new_tex->buffer.bo_alignment;
- tex->buffer.domains = new_tex->buffer.domains;
- tex->buffer.flags = new_tex->buffer.flags;
-
- tex->surface = new_tex->surface;
- si_texture_reference(&tex->flushed_depth_texture,
- new_tex->flushed_depth_texture);
-
- tex->surface.fmask_offset = new_tex->surface.fmask_offset;
- tex->surface.cmask_offset = new_tex->surface.cmask_offset;
- tex->cmask_base_address_reg = new_tex->cmask_base_address_reg;
-
- if (tex->cmask_buffer == &tex->buffer)
- tex->cmask_buffer = NULL;
- else
- si_resource_reference(&tex->cmask_buffer, NULL);
-
- if (new_tex->cmask_buffer == &new_tex->buffer)
- tex->cmask_buffer = &tex->buffer;
- else
- si_resource_reference(&tex->cmask_buffer, new_tex->cmask_buffer);
-
- tex->surface.dcc_offset = new_tex->surface.dcc_offset;
- tex->cb_color_info = new_tex->cb_color_info;
- memcpy(tex->color_clear_value, new_tex->color_clear_value,
- sizeof(tex->color_clear_value));
- tex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode;
-
- tex->surface.htile_offset = new_tex->surface.htile_offset;
- tex->depth_clear_value = new_tex->depth_clear_value;
- tex->dirty_level_mask = new_tex->dirty_level_mask;
- tex->stencil_dirty_level_mask = new_tex->stencil_dirty_level_mask;
- tex->db_render_format = new_tex->db_render_format;
- tex->stencil_clear_value = new_tex->stencil_clear_value;
- tex->tc_compatible_htile = new_tex->tc_compatible_htile;
- tex->depth_cleared = new_tex->depth_cleared;
- tex->stencil_cleared = new_tex->stencil_cleared;
- tex->upgraded_depth = new_tex->upgraded_depth;
- tex->db_compatible = new_tex->db_compatible;
- tex->can_sample_z = new_tex->can_sample_z;
- tex->can_sample_s = new_tex->can_sample_s;
-
- tex->separate_dcc_dirty = new_tex->separate_dcc_dirty;
- tex->displayable_dcc_dirty = new_tex->displayable_dcc_dirty;
- tex->dcc_gather_statistics = new_tex->dcc_gather_statistics;
- si_resource_reference(&tex->dcc_separate_buffer,
- new_tex->dcc_separate_buffer);
- si_resource_reference(&tex->last_dcc_separate_buffer,
- new_tex->last_dcc_separate_buffer);
-
- if (new_bind_flag == PIPE_BIND_LINEAR) {
- assert(!tex->surface.htile_offset);
- assert(!tex->cmask_buffer);
- assert(!tex->surface.fmask_size);
- assert(!tex->surface.dcc_offset);
- assert(!tex->is_depth);
- }
-
- si_texture_reference(&new_tex, NULL);
-
- p_atomic_inc(&sctx->screen->dirty_tex_counter);
+ struct pipe_screen *screen = sctx->b.screen;
+ struct si_texture *new_tex;
+ struct pipe_resource templ = tex->buffer.b.b;
+ unsigned i;
+
+ templ.bind |= new_bind_flag;
+
+ if (tex->buffer.b.is_shared || tex->num_planes > 1)
+ return;
+
+ if (new_bind_flag == PIPE_BIND_LINEAR) {
+ if (tex->surface.is_linear)
+ return;
+
+ /* This fails with MSAA, depth, and compressed textures. */
+ if (si_choose_tiling(sctx->screen, &templ, false) != RADEON_SURF_MODE_LINEAR_ALIGNED)
+ return;
+ }
+
+ new_tex = (struct si_texture *)screen->resource_create(screen, &templ);
+ if (!new_tex)
+ return;
+
+ /* Copy the pixels to the new texture. */
+ if (!invalidate_storage) {
+ for (i = 0; i <= templ.last_level; i++) {
+ struct pipe_box box;
+
+ u_box_3d(0, 0, 0, u_minify(templ.width0, i), u_minify(templ.height0, i),
+ util_num_layers(&templ, i), &box);
+
+ sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0, &tex->buffer.b.b, i, &box);
+ }
+ }
+
+ if (new_bind_flag == PIPE_BIND_LINEAR) {
+ si_texture_discard_cmask(sctx->screen, tex);
+ si_texture_discard_dcc(sctx->screen, tex);
+ }
+
+ /* Replace the structure fields of tex. */
+ tex->buffer.b.b.bind = templ.bind;
+ pb_reference(&tex->buffer.buf, new_tex->buffer.buf);
+ tex->buffer.gpu_address = new_tex->buffer.gpu_address;
+ tex->buffer.vram_usage = new_tex->buffer.vram_usage;
+ tex->buffer.gart_usage = new_tex->buffer.gart_usage;
+ tex->buffer.bo_size = new_tex->buffer.bo_size;
+ tex->buffer.bo_alignment = new_tex->buffer.bo_alignment;
+ tex->buffer.domains = new_tex->buffer.domains;
+ tex->buffer.flags = new_tex->buffer.flags;
+
+ tex->surface = new_tex->surface;
+ si_texture_reference(&tex->flushed_depth_texture, new_tex->flushed_depth_texture);
+
+ tex->surface.fmask_offset = new_tex->surface.fmask_offset;
+ tex->surface.cmask_offset = new_tex->surface.cmask_offset;
+ tex->cmask_base_address_reg = new_tex->cmask_base_address_reg;
+
+ if (tex->cmask_buffer == &tex->buffer)
+ tex->cmask_buffer = NULL;
+ else
+ si_resource_reference(&tex->cmask_buffer, NULL);
+
+ if (new_tex->cmask_buffer == &new_tex->buffer)
+ tex->cmask_buffer = &tex->buffer;
+ else
+ si_resource_reference(&tex->cmask_buffer, new_tex->cmask_buffer);
+
+ tex->surface.dcc_offset = new_tex->surface.dcc_offset;
+ tex->cb_color_info = new_tex->cb_color_info;
+ memcpy(tex->color_clear_value, new_tex->color_clear_value, sizeof(tex->color_clear_value));
+ tex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode;
+
+ tex->surface.htile_offset = new_tex->surface.htile_offset;
+ tex->depth_clear_value = new_tex->depth_clear_value;
+ tex->dirty_level_mask = new_tex->dirty_level_mask;
+ tex->stencil_dirty_level_mask = new_tex->stencil_dirty_level_mask;
+ tex->db_render_format = new_tex->db_render_format;
+ tex->stencil_clear_value = new_tex->stencil_clear_value;
+ tex->tc_compatible_htile = new_tex->tc_compatible_htile;
+ tex->depth_cleared = new_tex->depth_cleared;
+ tex->stencil_cleared = new_tex->stencil_cleared;
+ tex->upgraded_depth = new_tex->upgraded_depth;
+ tex->db_compatible = new_tex->db_compatible;
+ tex->can_sample_z = new_tex->can_sample_z;
+ tex->can_sample_s = new_tex->can_sample_s;
+
+ tex->separate_dcc_dirty = new_tex->separate_dcc_dirty;
+ tex->displayable_dcc_dirty = new_tex->displayable_dcc_dirty;
+ tex->dcc_gather_statistics = new_tex->dcc_gather_statistics;
+ si_resource_reference(&tex->dcc_separate_buffer, new_tex->dcc_separate_buffer);
+ si_resource_reference(&tex->last_dcc_separate_buffer, new_tex->last_dcc_separate_buffer);
+
+ if (new_bind_flag == PIPE_BIND_LINEAR) {
+ assert(!tex->surface.htile_offset);
+ assert(!tex->cmask_buffer);
+ assert(!tex->surface.fmask_size);
+ assert(!tex->surface.dcc_offset);
+ assert(!tex->is_depth);
+ }
+
+ si_texture_reference(&new_tex, NULL);
+
+ p_atomic_inc(&sctx->screen->dirty_tex_counter);
}
static uint32_t si_get_bo_metadata_word1(struct si_screen *sscreen)
{
- return (ATI_VENDOR_ID << 16) | sscreen->info.pci_id;
+ return (ATI_VENDOR_ID << 16) | sscreen->info.pci_id;
}
-static void si_set_tex_bo_metadata(struct si_screen *sscreen,
- struct si_texture *tex)
+static void si_set_tex_bo_metadata(struct si_screen *sscreen, struct si_texture *tex)
{
- struct radeon_surf *surface = &tex->surface;
- struct pipe_resource *res = &tex->buffer.b.b;
- struct radeon_bo_metadata md;
-
- memset(&md, 0, sizeof(md));
-
- if (sscreen->info.chip_class >= GFX9) {
- md.u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode;
- md.u.gfx9.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
-
- if (tex->surface.dcc_offset && !tex->dcc_separate_buffer) {
- uint64_t dcc_offset =
- tex->surface.display_dcc_offset ? tex->surface.display_dcc_offset
- : tex->surface.dcc_offset;
-
- assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24));
- md.u.gfx9.dcc_offset_256B = dcc_offset >> 8;
- md.u.gfx9.dcc_pitch_max = tex->surface.u.gfx9.display_dcc_pitch_max;
- md.u.gfx9.dcc_independent_64B = 1;
- }
- } else {
- md.u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ?
- RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
- md.u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ?
- RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
- md.u.legacy.pipe_config = surface->u.legacy.pipe_config;
- md.u.legacy.bankw = surface->u.legacy.bankw;
- md.u.legacy.bankh = surface->u.legacy.bankh;
- md.u.legacy.tile_split = surface->u.legacy.tile_split;
- md.u.legacy.mtilea = surface->u.legacy.mtilea;
- md.u.legacy.num_banks = surface->u.legacy.num_banks;
- md.u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe;
- md.u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
- }
-
- assert(tex->dcc_separate_buffer == NULL);
- assert(tex->surface.fmask_size == 0);
-
- /* Metadata image format format version 1:
- * [0] = 1 (metadata format identifier)
- * [1] = (VENDOR_ID << 16) | PCI_ID
- * [2:9] = image descriptor for the whole resource
- * [2] is always 0, because the base address is cleared
- * [9] is the DCC offset bits [39:8] from the beginning of
- * the buffer
- * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level
- */
-
- md.metadata[0] = 1; /* metadata image format version 1 */
-
- /* TILE_MODE_INDEX is ambiguous without a PCI ID. */
- md.metadata[1] = si_get_bo_metadata_word1(sscreen);
-
- static const unsigned char swizzle[] = {
- PIPE_SWIZZLE_X,
- PIPE_SWIZZLE_Y,
- PIPE_SWIZZLE_Z,
- PIPE_SWIZZLE_W
- };
- bool is_array = util_texture_is_array(res->target);
- uint32_t desc[8];
-
- sscreen->make_texture_descriptor(sscreen, tex, true,
- res->target, res->format,
- swizzle, 0, res->last_level, 0,
- is_array ? res->array_size - 1 : 0,
- res->width0, res->height0, res->depth0,
- desc, NULL);
-
- si_set_mutable_tex_desc_fields(sscreen, tex, &tex->surface.u.legacy.level[0],
- 0, 0, tex->surface.blk_w, false, desc);
-
- /* Clear the base address and set the relative DCC offset. */
- desc[0] = 0;
- desc[1] &= C_008F14_BASE_ADDRESS_HI;
-
- switch (sscreen->info.chip_class) {
- case GFX6:
- case GFX7:
- break;
- case GFX8:
- desc[7] = tex->surface.dcc_offset >> 8;
- break;
- case GFX9:
- desc[7] = tex->surface.dcc_offset >> 8;
- desc[5] &= C_008F24_META_DATA_ADDRESS;
- desc[5] |= S_008F24_META_DATA_ADDRESS(tex->surface.dcc_offset >> 40);
- break;
- case GFX10:
- desc[6] &= C_00A018_META_DATA_ADDRESS_LO;
- desc[6] |= S_00A018_META_DATA_ADDRESS_LO(tex->surface.dcc_offset >> 8);
- desc[7] = tex->surface.dcc_offset >> 16;
- break;
- default:
- assert(0);
- }
-
-
- /* Dwords [2:9] contain the image descriptor. */
- memcpy(&md.metadata[2], desc, sizeof(desc));
- md.size_metadata = 10 * 4;
-
- /* Dwords [10:..] contain the mipmap level offsets. */
- if (sscreen->info.chip_class <= GFX8) {
- for (unsigned i = 0; i <= res->last_level; i++)
- md.metadata[10+i] = tex->surface.u.legacy.level[i].offset >> 8;
-
- md.size_metadata += (1 + res->last_level) * 4;
- }
-
- sscreen->ws->buffer_set_metadata(tex->buffer.buf, &md);
+ struct radeon_surf *surface = &tex->surface;
+ struct pipe_resource *res = &tex->buffer.b.b;
+ struct radeon_bo_metadata md;
+
+ memset(&md, 0, sizeof(md));
+
+ if (sscreen->info.chip_class >= GFX9) {
+ md.u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode;
+ md.u.gfx9.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
+
+ if (tex->surface.dcc_offset && !tex->dcc_separate_buffer) {
+ uint64_t dcc_offset = tex->surface.display_dcc_offset ? tex->surface.display_dcc_offset
+ : tex->surface.dcc_offset;
+
+ assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24));
+ md.u.gfx9.dcc_offset_256B = dcc_offset >> 8;
+ md.u.gfx9.dcc_pitch_max = tex->surface.u.gfx9.display_dcc_pitch_max;
+ md.u.gfx9.dcc_independent_64B = 1;
+ }
+ } else {
+ md.u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D
+ ? RADEON_LAYOUT_TILED
+ : RADEON_LAYOUT_LINEAR;
+ md.u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D
+ ? RADEON_LAYOUT_TILED
+ : RADEON_LAYOUT_LINEAR;
+ md.u.legacy.pipe_config = surface->u.legacy.pipe_config;
+ md.u.legacy.bankw = surface->u.legacy.bankw;
+ md.u.legacy.bankh = surface->u.legacy.bankh;
+ md.u.legacy.tile_split = surface->u.legacy.tile_split;
+ md.u.legacy.mtilea = surface->u.legacy.mtilea;
+ md.u.legacy.num_banks = surface->u.legacy.num_banks;
+ md.u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe;
+ md.u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
+ }
+
+ assert(tex->dcc_separate_buffer == NULL);
+ assert(tex->surface.fmask_size == 0);
+
+   /* Metadata image format version 1:
+ * [0] = 1 (metadata format identifier)
+ * [1] = (VENDOR_ID << 16) | PCI_ID
+ * [2:9] = image descriptor for the whole resource
+ * [2] is always 0, because the base address is cleared
+ * [9] is the DCC offset bits [39:8] from the beginning of
+ * the buffer
+ * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level
+ */
+
+ md.metadata[0] = 1; /* metadata image format version 1 */
+
+ /* TILE_MODE_INDEX is ambiguous without a PCI ID. */
+ md.metadata[1] = si_get_bo_metadata_word1(sscreen);
+
+ static const unsigned char swizzle[] = {PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z,
+ PIPE_SWIZZLE_W};
+ bool is_array = util_texture_is_array(res->target);
+ uint32_t desc[8];
+
+ sscreen->make_texture_descriptor(sscreen, tex, true, res->target, res->format, swizzle, 0,
+ res->last_level, 0, is_array ? res->array_size - 1 : 0,
+ res->width0, res->height0, res->depth0, desc, NULL);
+
+ si_set_mutable_tex_desc_fields(sscreen, tex, &tex->surface.u.legacy.level[0], 0, 0,
+ tex->surface.blk_w, false, desc);
+
+ /* Clear the base address and set the relative DCC offset. */
+ desc[0] = 0;
+ desc[1] &= C_008F14_BASE_ADDRESS_HI;
+
+ switch (sscreen->info.chip_class) {
+ case GFX6:
+ case GFX7:
+ break;
+ case GFX8:
+ desc[7] = tex->surface.dcc_offset >> 8;
+ break;
+ case GFX9:
+ desc[7] = tex->surface.dcc_offset >> 8;
+ desc[5] &= C_008F24_META_DATA_ADDRESS;
+ desc[5] |= S_008F24_META_DATA_ADDRESS(tex->surface.dcc_offset >> 40);
+ break;
+ case GFX10:
+ desc[6] &= C_00A018_META_DATA_ADDRESS_LO;
+ desc[6] |= S_00A018_META_DATA_ADDRESS_LO(tex->surface.dcc_offset >> 8);
+ desc[7] = tex->surface.dcc_offset >> 16;
+ break;
+ default:
+ assert(0);
+ }
+
+ /* Dwords [2:9] contain the image descriptor. */
+ memcpy(&md.metadata[2], desc, sizeof(desc));
+ md.size_metadata = 10 * 4;
+
+ /* Dwords [10:..] contain the mipmap level offsets. */
+ if (sscreen->info.chip_class <= GFX8) {
+ for (unsigned i = 0; i <= res->last_level; i++)
+ md.metadata[10 + i] = tex->surface.u.legacy.level[i].offset >> 8;
+
+ md.size_metadata += (1 + res->last_level) * 4;
+ }
+
+ sscreen->ws->buffer_set_metadata(tex->buffer.buf, &md);
}
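
The metadata word layout documented in the function above can be checked from the consumer side as well. The helper below is a hedged sketch, not part of the patch: it only unpacks word 1, whose encoding follows si_get_bo_metadata_word1().

#include <stdint.h>

static void decode_metadata_word1(uint32_t word1, uint16_t *vendor_id, uint16_t *pci_id)
{
   *vendor_id = word1 >> 16; /* ATI_VENDOR_ID for this driver */
   *pci_id = word1 & 0xffff; /* PCI device id */
}
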
-static bool si_read_tex_bo_metadata(struct si_screen *sscreen,
- struct si_texture *tex,
- uint64_t offset,
- struct radeon_bo_metadata *md)
+static bool si_read_tex_bo_metadata(struct si_screen *sscreen, struct si_texture *tex,
+ uint64_t offset, struct radeon_bo_metadata *md)
{
- uint32_t *desc = &md->metadata[2];
-
- if (offset || /* Non-zero planes ignore metadata. */
- md->size_metadata < 10 * 4 || /* at least 2(header) + 8(desc) dwords */
- md->metadata[0] == 0 || /* invalid version number */
- md->metadata[1] != si_get_bo_metadata_word1(sscreen)) /* invalid PCI ID */ {
- /* Disable DCC because it might not be enabled. */
- si_texture_zero_dcc_fields(tex);
-
- /* Don't report an error if the texture comes from an incompatible driver,
- * but this might not work.
- */
- return true;
- }
-
- /* Validate that sample counts and the number of mipmap levels match. */
- unsigned last_level = G_008F1C_LAST_LEVEL(desc[3]);
- unsigned type = G_008F1C_TYPE(desc[3]);
-
- if (type == V_008F1C_SQ_RSRC_IMG_2D_MSAA ||
- type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
- unsigned log_samples =
- util_logbase2(MAX2(1, tex->buffer.b.b.nr_storage_samples));
-
- if (last_level != log_samples) {
- fprintf(stderr, "radeonsi: invalid MSAA texture import, "
- "metadata has log2(samples) = %u, the caller set %u\n",
- last_level, log_samples);
- return false;
- }
- } else {
- if (last_level != tex->buffer.b.b.last_level) {
- fprintf(stderr, "radeonsi: invalid mipmapped texture import, "
- "metadata has last_level = %u, the caller set %u\n",
- last_level, tex->buffer.b.b.last_level);
- return false;
- }
- }
-
- if (sscreen->info.chip_class >= GFX8 &&
- G_008F28_COMPRESSION_EN(desc[6])) {
- /* Read DCC information. */
- switch (sscreen->info.chip_class) {
- case GFX8:
- tex->surface.dcc_offset = (uint64_t)desc[7] << 8;
- break;
-
- case GFX9:
- tex->surface.dcc_offset =
- ((uint64_t)desc[7] << 8) |
- ((uint64_t)G_008F24_META_DATA_ADDRESS(desc[5]) << 40);
- tex->surface.u.gfx9.dcc.pipe_aligned =
- G_008F24_META_PIPE_ALIGNED(desc[5]);
- tex->surface.u.gfx9.dcc.rb_aligned =
- G_008F24_META_RB_ALIGNED(desc[5]);
-
- /* If DCC is unaligned, this can only be a displayable image. */
- if (!tex->surface.u.gfx9.dcc.pipe_aligned &&
- !tex->surface.u.gfx9.dcc.rb_aligned)
- assert(tex->surface.is_displayable);
- break;
-
- case GFX10:
- tex->surface.dcc_offset =
- ((uint64_t)G_00A018_META_DATA_ADDRESS_LO(desc[6]) << 8) |
- ((uint64_t)desc[7] << 16);
- tex->surface.u.gfx9.dcc.pipe_aligned =
- G_00A018_META_PIPE_ALIGNED(desc[6]);
- break;
-
- default:
- assert(0);
- return false;
- }
- } else {
- /* Disable DCC. dcc_offset is always set by texture_from_handle
- * and must be cleared here.
- */
- si_texture_zero_dcc_fields(tex);
- }
-
- return true;
+ uint32_t *desc = &md->metadata[2];
+
+ if (offset || /* Non-zero planes ignore metadata. */
+ md->size_metadata < 10 * 4 || /* at least 2(header) + 8(desc) dwords */
+ md->metadata[0] == 0 || /* invalid version number */
+ md->metadata[1] != si_get_bo_metadata_word1(sscreen)) /* invalid PCI ID */ {
+ /* Disable DCC because it might not be enabled. */
+ si_texture_zero_dcc_fields(tex);
+
+ /* Don't report an error if the texture comes from an incompatible driver,
+ * but this might not work.
+ */
+ return true;
+ }
+
+ /* Validate that sample counts and the number of mipmap levels match. */
+ unsigned last_level = G_008F1C_LAST_LEVEL(desc[3]);
+ unsigned type = G_008F1C_TYPE(desc[3]);
+
+ if (type == V_008F1C_SQ_RSRC_IMG_2D_MSAA || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
+ unsigned log_samples = util_logbase2(MAX2(1, tex->buffer.b.b.nr_storage_samples));
+
+ if (last_level != log_samples) {
+ fprintf(stderr,
+ "radeonsi: invalid MSAA texture import, "
+ "metadata has log2(samples) = %u, the caller set %u\n",
+ last_level, log_samples);
+ return false;
+ }
+ } else {
+ if (last_level != tex->buffer.b.b.last_level) {
+ fprintf(stderr,
+ "radeonsi: invalid mipmapped texture import, "
+ "metadata has last_level = %u, the caller set %u\n",
+ last_level, tex->buffer.b.b.last_level);
+ return false;
+ }
+ }
+
+ if (sscreen->info.chip_class >= GFX8 && G_008F28_COMPRESSION_EN(desc[6])) {
+ /* Read DCC information. */
+ switch (sscreen->info.chip_class) {
+ case GFX8:
+ tex->surface.dcc_offset = (uint64_t)desc[7] << 8;
+ break;
+
+ case GFX9:
+ tex->surface.dcc_offset =
+ ((uint64_t)desc[7] << 8) | ((uint64_t)G_008F24_META_DATA_ADDRESS(desc[5]) << 40);
+ tex->surface.u.gfx9.dcc.pipe_aligned = G_008F24_META_PIPE_ALIGNED(desc[5]);
+ tex->surface.u.gfx9.dcc.rb_aligned = G_008F24_META_RB_ALIGNED(desc[5]);
+
+ /* If DCC is unaligned, this can only be a displayable image. */
+ if (!tex->surface.u.gfx9.dcc.pipe_aligned && !tex->surface.u.gfx9.dcc.rb_aligned)
+ assert(tex->surface.is_displayable);
+ break;
+
+ case GFX10:
+ tex->surface.dcc_offset =
+ ((uint64_t)G_00A018_META_DATA_ADDRESS_LO(desc[6]) << 8) | ((uint64_t)desc[7] << 16);
+ tex->surface.u.gfx9.dcc.pipe_aligned = G_00A018_META_PIPE_ALIGNED(desc[6]);
+ break;
+
+ default:
+ assert(0);
+ return false;
+ }
+ } else {
+ /* Disable DCC. dcc_offset is always set by texture_from_handle
+ * and must be cleared here.
+ */
+ si_texture_zero_dcc_fields(tex);
+ }
+
+ return true;
}
static bool si_has_displayable_dcc(struct si_texture *tex)
{
- struct si_screen *sscreen = (struct si_screen*)tex->buffer.b.b.screen;
-
- if (sscreen->info.chip_class <= GFX8)
- return false;
-
- /* This needs a cache flush before scanout.
- * (it can't be scanned out and rendered to simultaneously)
- */
- if (sscreen->info.use_display_dcc_unaligned &&
- tex->surface.dcc_offset &&
- !tex->surface.u.gfx9.dcc.pipe_aligned &&
- !tex->surface.u.gfx9.dcc.rb_aligned)
- return true;
-
- /* This needs an explicit flush (flush_resource). */
- if (sscreen->info.use_display_dcc_with_retile_blit &&
- tex->surface.display_dcc_offset)
- return true;
-
- return false;
+ struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen;
+
+ if (sscreen->info.chip_class <= GFX8)
+ return false;
+
+ /* This needs a cache flush before scanout.
+ * (it can't be scanned out and rendered to simultaneously)
+ */
+ if (sscreen->info.use_display_dcc_unaligned && tex->surface.dcc_offset &&
+ !tex->surface.u.gfx9.dcc.pipe_aligned && !tex->surface.u.gfx9.dcc.rb_aligned)
+ return true;
+
+ /* This needs an explicit flush (flush_resource). */
+ if (sscreen->info.use_display_dcc_with_retile_blit && tex->surface.display_dcc_offset)
+ return true;
+
+ return false;
}
-static bool si_resource_get_param(struct pipe_screen *screen,
- struct pipe_context *context,
- struct pipe_resource *resource,
- unsigned plane,
- unsigned layer,
- enum pipe_resource_param param,
- unsigned handle_usage,
- uint64_t *value)
+static bool si_resource_get_param(struct pipe_screen *screen, struct pipe_context *context,
+ struct pipe_resource *resource, unsigned plane, unsigned layer,
+ enum pipe_resource_param param, unsigned handle_usage,
+ uint64_t *value)
{
- for (unsigned i = 0; i < plane; i++)
- resource = resource->next;
-
- struct si_screen *sscreen = (struct si_screen*)screen;
- struct si_texture *tex = (struct si_texture*)resource;
- struct winsys_handle whandle;
-
- switch (param) {
- case PIPE_RESOURCE_PARAM_NPLANES:
- *value = resource->target == PIPE_BUFFER ? 1 : tex->num_planes;
- return true;
-
- case PIPE_RESOURCE_PARAM_STRIDE:
- if (resource->target == PIPE_BUFFER)
- *value = 0;
- else if (sscreen->info.chip_class >= GFX9)
- *value = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe;
- else
- *value = tex->surface.u.legacy.level[0].nblk_x * tex->surface.bpe;
- return true;
-
- case PIPE_RESOURCE_PARAM_OFFSET:
- if (resource->target == PIPE_BUFFER)
- *value = 0;
- else if (sscreen->info.chip_class >= GFX9)
- *value = tex->surface.u.gfx9.surf_offset +
- layer * tex->surface.u.gfx9.surf_slice_size;
- else
- *value = tex->surface.u.legacy.level[0].offset +
- layer * (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4;
- return true;
-
- case PIPE_RESOURCE_PARAM_MODIFIER:
- *value = DRM_FORMAT_MOD_INVALID;
- return true;
-
- case PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED:
- case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS:
- case PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD:
- memset(&whandle, 0, sizeof(whandle));
-
- if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED)
- whandle.type = WINSYS_HANDLE_TYPE_SHARED;
- else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS)
- whandle.type = WINSYS_HANDLE_TYPE_KMS;
- else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD)
- whandle.type = WINSYS_HANDLE_TYPE_FD;
-
- if (!screen->resource_get_handle(screen, context, resource,
- &whandle, handle_usage))
- return false;
-
- *value = whandle.handle;
- return true;
- }
- return false;
+ for (unsigned i = 0; i < plane; i++)
+ resource = resource->next;
+
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct si_texture *tex = (struct si_texture *)resource;
+ struct winsys_handle whandle;
+
+ switch (param) {
+ case PIPE_RESOURCE_PARAM_NPLANES:
+ *value = resource->target == PIPE_BUFFER ? 1 : tex->num_planes;
+ return true;
+
+ case PIPE_RESOURCE_PARAM_STRIDE:
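+ /* Return the row stride in bytes: pitch in blocks times bytes per element. */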
+ if (resource->target == PIPE_BUFFER)
+ *value = 0;
+ else if (sscreen->info.chip_class >= GFX9)
+ *value = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe;
+ else
+ *value = tex->surface.u.legacy.level[0].nblk_x * tex->surface.bpe;
+ return true;
+
+ case PIPE_RESOURCE_PARAM_OFFSET:
+ if (resource->target == PIPE_BUFFER)
+ *value = 0;
+ else if (sscreen->info.chip_class >= GFX9)
+ *value = tex->surface.u.gfx9.surf_offset + layer * tex->surface.u.gfx9.surf_slice_size;
+ else
+ *value = tex->surface.u.legacy.level[0].offset +
+ layer * (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4;
+ return true;
+
+ case PIPE_RESOURCE_PARAM_MODIFIER:
+ *value = DRM_FORMAT_MOD_INVALID;
+ return true;
+
+ case PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED:
+ case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS:
+ case PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD:
+ memset(&whandle, 0, sizeof(whandle));
+
+ if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED)
+ whandle.type = WINSYS_HANDLE_TYPE_SHARED;
+ else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS)
+ whandle.type = WINSYS_HANDLE_TYPE_KMS;
+ else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD)
+ whandle.type = WINSYS_HANDLE_TYPE_FD;
+
+ if (!screen->resource_get_handle(screen, context, resource, &whandle, handle_usage))
+ return false;
+
+ *value = whandle.handle;
+ return true;
+ }
+ return false;
}
-static void si_texture_get_info(struct pipe_screen* screen,
- struct pipe_resource *resource,
- unsigned *pstride,
- unsigned *poffset)
+static void si_texture_get_info(struct pipe_screen *screen, struct pipe_resource *resource,
+ unsigned *pstride, unsigned *poffset)
{
- uint64_t value;
-
- if (pstride) {
- si_resource_get_param(screen, NULL, resource, 0, 0,
- PIPE_RESOURCE_PARAM_STRIDE, 0, &value);
- *pstride = value;
- }
-
- if (poffset) {
- si_resource_get_param(screen, NULL, resource, 0, 0,
- PIPE_RESOURCE_PARAM_OFFSET, 0, &value);
- *poffset = value;
- }
+ uint64_t value;
+
+ if (pstride) {
+ si_resource_get_param(screen, NULL, resource, 0, 0, PIPE_RESOURCE_PARAM_STRIDE, 0, &value);
+ *pstride = value;
+ }
+
+ if (poffset) {
+ si_resource_get_param(screen, NULL, resource, 0, 0, PIPE_RESOURCE_PARAM_OFFSET, 0, &value);
+ *poffset = value;
+ }
}
-static bool si_texture_get_handle(struct pipe_screen* screen,
- struct pipe_context *ctx,
- struct pipe_resource *resource,
- struct winsys_handle *whandle,
- unsigned usage)
+static bool si_texture_get_handle(struct pipe_screen *screen, struct pipe_context *ctx,
+ struct pipe_resource *resource, struct winsys_handle *whandle,
+ unsigned usage)
{
- struct si_screen *sscreen = (struct si_screen*)screen;
- struct si_context *sctx;
- struct si_resource *res = si_resource(resource);
- struct si_texture *tex = (struct si_texture*)resource;
- bool update_metadata = false;
- unsigned stride, offset, slice_size;
- bool flush = false;
-
- ctx = threaded_context_unwrap_sync(ctx);
- sctx = (struct si_context*)(ctx ? ctx : sscreen->aux_context);
-
- if (resource->target != PIPE_BUFFER) {
- /* Individual planes are chained pipe_resource instances. */
- for (unsigned i = 0; i < whandle->plane; i++) {
- resource = resource->next;
- res = si_resource(resource);
- tex = (struct si_texture*)resource;
- }
-
- /* This is not supported now, but it might be required for OpenCL
- * interop in the future.
- */
- if (resource->nr_samples > 1 || tex->is_depth)
- return false;
-
- /* Move a suballocated texture into a non-suballocated allocation. */
- if (sscreen->ws->buffer_is_suballocated(res->buf) ||
- tex->surface.tile_swizzle ||
- (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
- sscreen->info.has_local_buffers)) {
- assert(!res->b.is_shared);
- si_reallocate_texture_inplace(sctx, tex,
- PIPE_BIND_SHARED, false);
- flush = true;
- assert(res->b.b.bind & PIPE_BIND_SHARED);
- assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
- assert(!(res->flags & RADEON_FLAG_NO_INTERPROCESS_SHARING));
- assert(tex->surface.tile_swizzle == 0);
- }
-
- /* Since shader image stores don't support DCC on GFX8,
- * disable it for external clients that want write
- * access.
- */
- if ((usage & PIPE_HANDLE_USAGE_SHADER_WRITE && tex->surface.dcc_offset) ||
- /* Displayable DCC requires an explicit flush. */
- (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
- si_has_displayable_dcc(tex))) {
- if (si_texture_disable_dcc(sctx, tex)) {
- update_metadata = true;
- /* si_texture_disable_dcc flushes the context */
- flush = false;
- }
- }
-
- if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
- (tex->cmask_buffer || tex->surface.dcc_offset)) {
- /* Eliminate fast clear (both CMASK and DCC) */
- si_eliminate_fast_color_clear(sctx, tex);
- /* eliminate_fast_color_clear flushes the context */
- flush = false;
-
- /* Disable CMASK if flush_resource isn't going
- * to be called.
- */
- if (tex->cmask_buffer)
- si_texture_discard_cmask(sscreen, tex);
- }
-
- /* Set metadata. */
- if ((!res->b.is_shared || update_metadata) && whandle->offset == 0)
- si_set_tex_bo_metadata(sscreen, tex);
-
- if (sscreen->info.chip_class >= GFX9) {
- slice_size = tex->surface.u.gfx9.surf_slice_size;
- } else {
- slice_size = (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4;
- }
- } else {
- /* Buffer exports are for the OpenCL interop. */
- /* Move a suballocated buffer into a non-suballocated allocation. */
- if (sscreen->ws->buffer_is_suballocated(res->buf) ||
- /* A DMABUF export always fails if the BO is local. */
- (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
- sscreen->info.has_local_buffers)) {
- assert(!res->b.is_shared);
-
- /* Allocate a new buffer with PIPE_BIND_SHARED. */
- struct pipe_resource templ = res->b.b;
- templ.bind |= PIPE_BIND_SHARED;
-
- struct pipe_resource *newb =
- screen->resource_create(screen, &templ);
- if (!newb)
- return false;
-
- /* Copy the old buffer contents to the new one. */
- struct pipe_box box;
- u_box_1d(0, newb->width0, &box);
- sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0,
- &res->b.b, 0, &box);
- flush = true;
- /* Move the new buffer storage to the old pipe_resource. */
- si_replace_buffer_storage(&sctx->b, &res->b.b, newb);
- pipe_resource_reference(&newb, NULL);
-
- assert(res->b.b.bind & PIPE_BIND_SHARED);
- assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
- }
-
- /* Buffers */
- slice_size = 0;
- }
-
- si_texture_get_info(screen, resource, &stride, &offset);
-
- if (flush)
- sctx->b.flush(&sctx->b, NULL, 0);
-
- if (res->b.is_shared) {
- /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user
- * doesn't set it.
- */
- res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
- if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
- res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
- } else {
- res->b.is_shared = true;
- res->external_usage = usage;
- }
-
- whandle->stride = stride;
- whandle->offset = offset + slice_size * whandle->layer;
-
- return sscreen->ws->buffer_get_handle(sscreen->ws, res->buf, whandle);
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct si_context *sctx;
+ struct si_resource *res = si_resource(resource);
+ struct si_texture *tex = (struct si_texture *)resource;
+ bool update_metadata = false;
+ unsigned stride, offset, slice_size;
+ bool flush = false;
+
+ ctx = threaded_context_unwrap_sync(ctx);
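+ /* If the caller didn't provide a context (screen-only entry points), use the screen's auxiliary context. */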
+ sctx = (struct si_context *)(ctx ? ctx : sscreen->aux_context);
+
+ if (resource->target != PIPE_BUFFER) {
+ /* Individual planes are chained pipe_resource instances. */
+ for (unsigned i = 0; i < whandle->plane; i++) {
+ resource = resource->next;
+ res = si_resource(resource);
+ tex = (struct si_texture *)resource;
+ }
+
+ /* This is not supported now, but it might be required for OpenCL
+ * interop in the future.
+ */
+ if (resource->nr_samples > 1 || tex->is_depth)
+ return false;
+
+ /* Move a suballocated texture into a non-suballocated allocation. */
+ if (sscreen->ws->buffer_is_suballocated(res->buf) || tex->surface.tile_swizzle ||
+ (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
+ sscreen->info.has_local_buffers)) {
+ assert(!res->b.is_shared);
+ si_reallocate_texture_inplace(sctx, tex, PIPE_BIND_SHARED, false);
+ flush = true;
+ assert(res->b.b.bind & PIPE_BIND_SHARED);
+ assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
+ assert(!(res->flags & RADEON_FLAG_NO_INTERPROCESS_SHARING));
+ assert(tex->surface.tile_swizzle == 0);
+ }
+
+ /* Since shader image stores don't support DCC on GFX8,
+ * disable it for external clients that want write
+ * access.
+ */
+ if ((usage & PIPE_HANDLE_USAGE_SHADER_WRITE && tex->surface.dcc_offset) ||
+ /* Displayable DCC requires an explicit flush. */
+ (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && si_has_displayable_dcc(tex))) {
+ if (si_texture_disable_dcc(sctx, tex)) {
+ update_metadata = true;
+ /* si_texture_disable_dcc flushes the context */
+ flush = false;
+ }
+ }
+
+ if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
+ (tex->cmask_buffer || tex->surface.dcc_offset)) {
+ /* Eliminate fast clear (both CMASK and DCC) */
+ si_eliminate_fast_color_clear(sctx, tex);
+ /* eliminate_fast_color_clear flushes the context */
+ flush = false;
+
+ /* Disable CMASK if flush_resource isn't going
+ * to be called.
+ */
+ if (tex->cmask_buffer)
+ si_texture_discard_cmask(sscreen, tex);
+ }
+
+ /* Set metadata. */
+ if ((!res->b.is_shared || update_metadata) && whandle->offset == 0)
+ si_set_tex_bo_metadata(sscreen, tex);
+
+ if (sscreen->info.chip_class >= GFX9) {
+ slice_size = tex->surface.u.gfx9.surf_slice_size;
+ } else {
+ slice_size = (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4;
+ }
+ } else {
+ /* Buffer exports are for the OpenCL interop. */
+ /* Move a suballocated buffer into a non-suballocated allocation. */
+ if (sscreen->ws->buffer_is_suballocated(res->buf) ||
+ /* A DMABUF export always fails if the BO is local. */
+ (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
+ sscreen->info.has_local_buffers)) {
+ assert(!res->b.is_shared);
+
+ /* Allocate a new buffer with PIPE_BIND_SHARED. */
+ struct pipe_resource templ = res->b.b;
+ templ.bind |= PIPE_BIND_SHARED;
+
+ struct pipe_resource *newb = screen->resource_create(screen, &templ);
+ if (!newb)
+ return false;
+
+ /* Copy the old buffer contents to the new one. */
+ struct pipe_box box;
+ u_box_1d(0, newb->width0, &box);
+ sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0, &res->b.b, 0, &box);
+ flush = true;
+ /* Move the new buffer storage to the old pipe_resource. */
+ si_replace_buffer_storage(&sctx->b, &res->b.b, newb);
+ pipe_resource_reference(&newb, NULL);
+
+ assert(res->b.b.bind & PIPE_BIND_SHARED);
+ assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
+ }
+
+ /* Buffers */
+ slice_size = 0;
+ }
+
+ si_texture_get_info(screen, resource, &stride, &offset);
+
+ if (flush)
+ sctx->b.flush(&sctx->b, NULL, 0);
+
+ if (res->b.is_shared) {
+ /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user
+ * doesn't set it.
+ */
+ res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
+ if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+ res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
+ } else {
+ res->b.is_shared = true;
+ res->external_usage = usage;
+ }
+
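+ /* Return the stride and the byte offset of the requested layer through the handle. */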
+ whandle->stride = stride;
+ whandle->offset = offset + slice_size * whandle->layer;
+
+ return sscreen->ws->buffer_get_handle(sscreen->ws, res->buf, whandle);
}
-static void si_texture_destroy(struct pipe_screen *screen,
- struct pipe_resource *ptex)
+static void si_texture_destroy(struct pipe_screen *screen, struct pipe_resource *ptex)
{
- struct si_screen *sscreen = (struct si_screen*)screen;
- struct si_texture *tex = (struct si_texture*)ptex;
- struct si_resource *resource = &tex->buffer;
-
- if (sscreen->info.chip_class >= GFX9)
- free(tex->surface.u.gfx9.dcc_retile_map);
-
- si_texture_reference(&tex->flushed_depth_texture, NULL);
-
- if (tex->cmask_buffer != &tex->buffer) {
- si_resource_reference(&tex->cmask_buffer, NULL);
- }
- pb_reference(&resource->buf, NULL);
- si_resource_reference(&tex->dcc_separate_buffer, NULL);
- si_resource_reference(&tex->last_dcc_separate_buffer, NULL);
- FREE(tex);
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct si_texture *tex = (struct si_texture *)ptex;
+ struct si_resource *resource = &tex->buffer;
+
+ if (sscreen->info.chip_class >= GFX9)
+ free(tex->surface.u.gfx9.dcc_retile_map);
+
+ si_texture_reference(&tex->flushed_depth_texture, NULL);
+
+ if (tex->cmask_buffer != &tex->buffer) {
+ si_resource_reference(&tex->cmask_buffer, NULL);
+ }
+ pb_reference(&resource->buf, NULL);
+ si_resource_reference(&tex->dcc_separate_buffer, NULL);
+ si_resource_reference(&tex->last_dcc_separate_buffer, NULL);
+ FREE(tex);
}
static const struct u_resource_vtbl si_texture_vtbl;
-void si_print_texture_info(struct si_screen *sscreen,
- struct si_texture *tex, struct u_log_context *log)
+void si_print_texture_info(struct si_screen *sscreen, struct si_texture *tex,
+ struct u_log_context *log)
{
- int i;
-
- /* Common parameters. */
- u_log_printf(log, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, "
- "blk_h=%u, array_size=%u, last_level=%u, "
- "bpe=%u, nsamples=%u, flags=0x%x, %s\n",
- tex->buffer.b.b.width0, tex->buffer.b.b.height0,
- tex->buffer.b.b.depth0, tex->surface.blk_w,
- tex->surface.blk_h,
- tex->buffer.b.b.array_size, tex->buffer.b.b.last_level,
- tex->surface.bpe, tex->buffer.b.b.nr_samples,
- tex->surface.flags, util_format_short_name(tex->buffer.b.b.format));
-
- if (sscreen->info.chip_class >= GFX9) {
- u_log_printf(log, " Surf: size=%"PRIu64", slice_size=%"PRIu64", "
- "alignment=%u, swmode=%u, epitch=%u, pitch=%u\n",
- tex->surface.surf_size,
- tex->surface.u.gfx9.surf_slice_size,
- tex->surface.surf_alignment,
- tex->surface.u.gfx9.surf.swizzle_mode,
- tex->surface.u.gfx9.surf.epitch,
- tex->surface.u.gfx9.surf_pitch);
-
- if (tex->surface.fmask_offset) {
- u_log_printf(log, " FMASK: offset=%"PRIu64", size=%"PRIu64", "
- "alignment=%u, swmode=%u, epitch=%u\n",
- tex->surface.fmask_offset,
- tex->surface.fmask_size,
- tex->surface.fmask_alignment,
- tex->surface.u.gfx9.fmask.swizzle_mode,
- tex->surface.u.gfx9.fmask.epitch);
- }
-
- if (tex->cmask_buffer) {
- u_log_printf(log, " CMask: offset=%"PRIu64", size=%u, "
- "alignment=%u, rb_aligned=%u, pipe_aligned=%u\n",
- tex->surface.cmask_offset,
- tex->surface.cmask_size,
- tex->surface.cmask_alignment,
- tex->surface.u.gfx9.cmask.rb_aligned,
- tex->surface.u.gfx9.cmask.pipe_aligned);
- }
-
- if (tex->surface.htile_offset) {
- u_log_printf(log, " HTile: offset=%"PRIu64", size=%u, alignment=%u, "
- "rb_aligned=%u, pipe_aligned=%u\n",
- tex->surface.htile_offset,
- tex->surface.htile_size,
- tex->surface.htile_alignment,
- tex->surface.u.gfx9.htile.rb_aligned,
- tex->surface.u.gfx9.htile.pipe_aligned);
- }
-
- if (tex->surface.dcc_offset) {
- u_log_printf(log, " DCC: offset=%"PRIu64", size=%u, "
- "alignment=%u, pitch_max=%u, num_dcc_levels=%u\n",
- tex->surface.dcc_offset, tex->surface.dcc_size,
- tex->surface.dcc_alignment,
- tex->surface.u.gfx9.display_dcc_pitch_max,
- tex->surface.num_dcc_levels);
- }
-
- if (tex->surface.u.gfx9.stencil_offset) {
- u_log_printf(log, " Stencil: offset=%"PRIu64", swmode=%u, epitch=%u\n",
- tex->surface.u.gfx9.stencil_offset,
- tex->surface.u.gfx9.stencil.swizzle_mode,
- tex->surface.u.gfx9.stencil.epitch);
- }
- return;
- }
-
- u_log_printf(log, " Layout: size=%"PRIu64", alignment=%u, bankw=%u, "
- "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n",
- tex->surface.surf_size, tex->surface.surf_alignment, tex->surface.u.legacy.bankw,
- tex->surface.u.legacy.bankh, tex->surface.u.legacy.num_banks, tex->surface.u.legacy.mtilea,
- tex->surface.u.legacy.tile_split, tex->surface.u.legacy.pipe_config,
- (tex->surface.flags & RADEON_SURF_SCANOUT) != 0);
-
- if (tex->surface.fmask_offset)
- u_log_printf(log, " FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, "
- "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n",
- tex->surface.fmask_offset, tex->surface.fmask_size, tex->surface.fmask_alignment,
- tex->surface.u.legacy.fmask.pitch_in_pixels,
- tex->surface.u.legacy.fmask.bankh,
- tex->surface.u.legacy.fmask.slice_tile_max,
- tex->surface.u.legacy.fmask.tiling_index);
-
- if (tex->cmask_buffer)
- u_log_printf(log, " CMask: offset=%"PRIu64", size=%u, alignment=%u, "
- "slice_tile_max=%u\n",
- tex->surface.cmask_offset, tex->surface.cmask_size, tex->surface.cmask_alignment,
- tex->surface.u.legacy.cmask_slice_tile_max);
-
- if (tex->surface.htile_offset)
- u_log_printf(log, " HTile: offset=%"PRIu64", size=%u, "
- "alignment=%u, TC_compatible = %u\n",
- tex->surface.htile_offset, tex->surface.htile_size,
- tex->surface.htile_alignment,
- tex->tc_compatible_htile);
-
- if (tex->surface.dcc_offset) {
- u_log_printf(log, " DCC: offset=%"PRIu64", size=%u, alignment=%u\n",
- tex->surface.dcc_offset, tex->surface.dcc_size,
- tex->surface.dcc_alignment);
- for (i = 0; i <= tex->buffer.b.b.last_level; i++)
- u_log_printf(log, " DCCLevel[%i]: enabled=%u, offset=%u, "
- "fast_clear_size=%u\n",
- i, i < tex->surface.num_dcc_levels,
- tex->surface.u.legacy.level[i].dcc_offset,
- tex->surface.u.legacy.level[i].dcc_fast_clear_size);
- }
-
- for (i = 0; i <= tex->buffer.b.b.last_level; i++)
- u_log_printf(log, " Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", "
- "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
- "mode=%u, tiling_index = %u\n",
- i, tex->surface.u.legacy.level[i].offset,
- (uint64_t)tex->surface.u.legacy.level[i].slice_size_dw * 4,
- u_minify(tex->buffer.b.b.width0, i),
- u_minify(tex->buffer.b.b.height0, i),
- u_minify(tex->buffer.b.b.depth0, i),
- tex->surface.u.legacy.level[i].nblk_x,
- tex->surface.u.legacy.level[i].nblk_y,
- tex->surface.u.legacy.level[i].mode,
- tex->surface.u.legacy.tiling_index[i]);
-
- if (tex->surface.has_stencil) {
- u_log_printf(log, " StencilLayout: tilesplit=%u\n",
- tex->surface.u.legacy.stencil_tile_split);
- for (i = 0; i <= tex->buffer.b.b.last_level; i++) {
- u_log_printf(log, " StencilLevel[%i]: offset=%"PRIu64", "
- "slice_size=%"PRIu64", npix_x=%u, "
- "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
- "mode=%u, tiling_index = %u\n",
- i, tex->surface.u.legacy.stencil_level[i].offset,
- (uint64_t)tex->surface.u.legacy.stencil_level[i].slice_size_dw * 4,
- u_minify(tex->buffer.b.b.width0, i),
- u_minify(tex->buffer.b.b.height0, i),
- u_minify(tex->buffer.b.b.depth0, i),
- tex->surface.u.legacy.stencil_level[i].nblk_x,
- tex->surface.u.legacy.stencil_level[i].nblk_y,
- tex->surface.u.legacy.stencil_level[i].mode,
- tex->surface.u.legacy.stencil_tiling_index[i]);
- }
- }
+ int i;
+
+ /* Common parameters. */
+ u_log_printf(log,
+ " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, "
+ "blk_h=%u, array_size=%u, last_level=%u, "
+ "bpe=%u, nsamples=%u, flags=0x%x, %s\n",
+ tex->buffer.b.b.width0, tex->buffer.b.b.height0, tex->buffer.b.b.depth0,
+ tex->surface.blk_w, tex->surface.blk_h, tex->buffer.b.b.array_size,
+ tex->buffer.b.b.last_level, tex->surface.bpe, tex->buffer.b.b.nr_samples,
+ tex->surface.flags, util_format_short_name(tex->buffer.b.b.format));
+
+ if (sscreen->info.chip_class >= GFX9) {
+ u_log_printf(log,
+ " Surf: size=%" PRIu64 ", slice_size=%" PRIu64 ", "
+ "alignment=%u, swmode=%u, epitch=%u, pitch=%u\n",
+ tex->surface.surf_size, tex->surface.u.gfx9.surf_slice_size,
+ tex->surface.surf_alignment, tex->surface.u.gfx9.surf.swizzle_mode,
+ tex->surface.u.gfx9.surf.epitch, tex->surface.u.gfx9.surf_pitch);
+
+ if (tex->surface.fmask_offset) {
+ u_log_printf(log,
+ " FMASK: offset=%" PRIu64 ", size=%" PRIu64 ", "
+ "alignment=%u, swmode=%u, epitch=%u\n",
+ tex->surface.fmask_offset, tex->surface.fmask_size,
+ tex->surface.fmask_alignment, tex->surface.u.gfx9.fmask.swizzle_mode,
+ tex->surface.u.gfx9.fmask.epitch);
+ }
+
+ if (tex->cmask_buffer) {
+ u_log_printf(log,
+ " CMask: offset=%" PRIu64 ", size=%u, "
+ "alignment=%u, rb_aligned=%u, pipe_aligned=%u\n",
+ tex->surface.cmask_offset, tex->surface.cmask_size,
+ tex->surface.cmask_alignment, tex->surface.u.gfx9.cmask.rb_aligned,
+ tex->surface.u.gfx9.cmask.pipe_aligned);
+ }
+
+ if (tex->surface.htile_offset) {
+ u_log_printf(log,
+ " HTile: offset=%" PRIu64 ", size=%u, alignment=%u, "
+ "rb_aligned=%u, pipe_aligned=%u\n",
+ tex->surface.htile_offset, tex->surface.htile_size,
+ tex->surface.htile_alignment, tex->surface.u.gfx9.htile.rb_aligned,
+ tex->surface.u.gfx9.htile.pipe_aligned);
+ }
+
+ if (tex->surface.dcc_offset) {
+ u_log_printf(log,
+ " DCC: offset=%" PRIu64 ", size=%u, "
+ "alignment=%u, pitch_max=%u, num_dcc_levels=%u\n",
+ tex->surface.dcc_offset, tex->surface.dcc_size, tex->surface.dcc_alignment,
+ tex->surface.u.gfx9.display_dcc_pitch_max, tex->surface.num_dcc_levels);
+ }
+
+ if (tex->surface.u.gfx9.stencil_offset) {
+ u_log_printf(log, " Stencil: offset=%" PRIu64 ", swmode=%u, epitch=%u\n",
+ tex->surface.u.gfx9.stencil_offset, tex->surface.u.gfx9.stencil.swizzle_mode,
+ tex->surface.u.gfx9.stencil.epitch);
+ }
+ return;
+ }
+
+ u_log_printf(log,
+ " Layout: size=%" PRIu64 ", alignment=%u, bankw=%u, "
+ "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n",
+ tex->surface.surf_size, tex->surface.surf_alignment, tex->surface.u.legacy.bankw,
+ tex->surface.u.legacy.bankh, tex->surface.u.legacy.num_banks,
+ tex->surface.u.legacy.mtilea, tex->surface.u.legacy.tile_split,
+ tex->surface.u.legacy.pipe_config, (tex->surface.flags & RADEON_SURF_SCANOUT) != 0);
+
+ if (tex->surface.fmask_offset)
+ u_log_printf(
+ log,
+ " FMask: offset=%" PRIu64 ", size=%" PRIu64 ", alignment=%u, pitch_in_pixels=%u, "
+ "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n",
+ tex->surface.fmask_offset, tex->surface.fmask_size, tex->surface.fmask_alignment,
+ tex->surface.u.legacy.fmask.pitch_in_pixels, tex->surface.u.legacy.fmask.bankh,
+ tex->surface.u.legacy.fmask.slice_tile_max, tex->surface.u.legacy.fmask.tiling_index);
+
+ if (tex->cmask_buffer)
+ u_log_printf(log,
+ " CMask: offset=%" PRIu64 ", size=%u, alignment=%u, "
+ "slice_tile_max=%u\n",
+ tex->surface.cmask_offset, tex->surface.cmask_size, tex->surface.cmask_alignment,
+ tex->surface.u.legacy.cmask_slice_tile_max);
+
+ if (tex->surface.htile_offset)
+ u_log_printf(log,
+ " HTile: offset=%" PRIu64 ", size=%u, "
+ "alignment=%u, TC_compatible = %u\n",
+ tex->surface.htile_offset, tex->surface.htile_size, tex->surface.htile_alignment,
+ tex->tc_compatible_htile);
+
+ if (tex->surface.dcc_offset) {
+ u_log_printf(log, " DCC: offset=%" PRIu64 ", size=%u, alignment=%u\n",
+ tex->surface.dcc_offset, tex->surface.dcc_size, tex->surface.dcc_alignment);
+ for (i = 0; i <= tex->buffer.b.b.last_level; i++)
+ u_log_printf(log,
+ " DCCLevel[%i]: enabled=%u, offset=%u, "
+ "fast_clear_size=%u\n",
+ i, i < tex->surface.num_dcc_levels, tex->surface.u.legacy.level[i].dcc_offset,
+ tex->surface.u.legacy.level[i].dcc_fast_clear_size);
+ }
+
+ for (i = 0; i <= tex->buffer.b.b.last_level; i++)
+ u_log_printf(log,
+ " Level[%i]: offset=%" PRIu64 ", slice_size=%" PRIu64 ", "
+ "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
+ "mode=%u, tiling_index = %u\n",
+ i, tex->surface.u.legacy.level[i].offset,
+ (uint64_t)tex->surface.u.legacy.level[i].slice_size_dw * 4,
+ u_minify(tex->buffer.b.b.width0, i), u_minify(tex->buffer.b.b.height0, i),
+ u_minify(tex->buffer.b.b.depth0, i), tex->surface.u.legacy.level[i].nblk_x,
+ tex->surface.u.legacy.level[i].nblk_y, tex->surface.u.legacy.level[i].mode,
+ tex->surface.u.legacy.tiling_index[i]);
+
+ if (tex->surface.has_stencil) {
+ u_log_printf(log, " StencilLayout: tilesplit=%u\n",
+ tex->surface.u.legacy.stencil_tile_split);
+ for (i = 0; i <= tex->buffer.b.b.last_level; i++) {
+ u_log_printf(log,
+ " StencilLevel[%i]: offset=%" PRIu64 ", "
+ "slice_size=%" PRIu64 ", npix_x=%u, "
+ "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
+ "mode=%u, tiling_index = %u\n",
+ i, tex->surface.u.legacy.stencil_level[i].offset,
+ (uint64_t)tex->surface.u.legacy.stencil_level[i].slice_size_dw * 4,
+ u_minify(tex->buffer.b.b.width0, i), u_minify(tex->buffer.b.b.height0, i),
+ u_minify(tex->buffer.b.b.depth0, i),
+ tex->surface.u.legacy.stencil_level[i].nblk_x,
+ tex->surface.u.legacy.stencil_level[i].nblk_y,
+ tex->surface.u.legacy.stencil_level[i].mode,
+ tex->surface.u.legacy.stencil_tiling_index[i]);
+ }
+ }
}
/**
* \param alloc_size the size to allocate if plane0 != NULL
* \param alignment alignment for the allocation
*/
-static struct si_texture *
-si_texture_create_object(struct pipe_screen *screen,
- const struct pipe_resource *base,
- const struct radeon_surf *surface,
- const struct si_texture *plane0,
- struct pb_buffer *imported_buf,
- uint64_t offset,
- uint64_t alloc_size,
- unsigned alignment)
+static struct si_texture *si_texture_create_object(struct pipe_screen *screen,
+ const struct pipe_resource *base,
+ const struct radeon_surf *surface,
+ const struct si_texture *plane0,
+ struct pb_buffer *imported_buf, uint64_t offset,
+ uint64_t alloc_size, unsigned alignment)
{
- struct si_texture *tex;
- struct si_resource *resource;
- struct si_screen *sscreen = (struct si_screen*)screen;
-
- tex = CALLOC_STRUCT(si_texture);
- if (!tex)
- goto error;
-
- resource = &tex->buffer;
- resource->b.b = *base;
- resource->b.b.next = NULL;
- resource->b.vtbl = &si_texture_vtbl;
- pipe_reference_init(&resource->b.b.reference, 1);
- resource->b.b.screen = screen;
-
- /* don't include stencil-only formats which we don't support for rendering */
- tex->is_depth = util_format_has_depth(util_format_description(tex->buffer.b.b.format));
- tex->surface = *surface;
- tex->tc_compatible_htile = tex->surface.htile_size != 0 &&
- (tex->surface.flags &
- RADEON_SURF_TC_COMPATIBLE_HTILE);
-
- /* TC-compatible HTILE:
- * - GFX8 only supports Z32_FLOAT.
- * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */
- if (tex->tc_compatible_htile) {
- if (sscreen->info.chip_class >= GFX9 &&
- base->format == PIPE_FORMAT_Z16_UNORM)
- tex->db_render_format = base->format;
- else {
- tex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
- tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT &&
- base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
- }
- } else {
- tex->db_render_format = base->format;
- }
-
- /* Applies to GCN. */
- tex->last_msaa_resolve_target_micro_mode = tex->surface.micro_tile_mode;
-
- /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers
- * between frames, so the only thing that can enable separate DCC
- * with DRI2 is multiple slow clears within a frame.
- */
- tex->ps_draw_ratio = 0;
-
- if (sscreen->info.chip_class >= GFX9) {
- tex->surface.u.gfx9.surf_offset = offset;
- } else {
- for (unsigned i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i)
- tex->surface.u.legacy.level[i].offset += offset;
- }
-
- if (tex->is_depth) {
- if (sscreen->info.chip_class >= GFX9) {
- tex->can_sample_z = true;
- tex->can_sample_s = true;
-
- /* Stencil texturing with HTILE doesn't work
- * with mipmapping on Navi10-14. */
- if ((sscreen->info.family == CHIP_NAVI10 ||
- sscreen->info.family == CHIP_NAVI12 ||
- sscreen->info.family == CHIP_NAVI14) &&
- base->last_level > 0)
- tex->htile_stencil_disabled = true;
- } else {
- tex->can_sample_z = !tex->surface.u.legacy.depth_adjusted;
- tex->can_sample_s = !tex->surface.u.legacy.stencil_adjusted;
- }
-
- tex->db_compatible = surface->flags & RADEON_SURF_ZBUFFER;
- } else {
- if (tex->surface.cmask_offset) {
- tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
- tex->cmask_buffer = &tex->buffer;
- }
- }
-
- if (plane0) {
- /* The buffer is shared with the first plane. */
- resource->bo_size = plane0->buffer.bo_size;
- resource->bo_alignment = plane0->buffer.bo_alignment;
- resource->flags = plane0->buffer.flags;
- resource->domains = plane0->buffer.domains;
- resource->vram_usage = plane0->buffer.vram_usage;
- resource->gart_usage = plane0->buffer.gart_usage;
-
- pb_reference(&resource->buf, plane0->buffer.buf);
- resource->gpu_address = plane0->buffer.gpu_address;
- } else if (!(surface->flags & RADEON_SURF_IMPORTED)) {
- /* Create the backing buffer. */
- si_init_resource_fields(sscreen, resource, alloc_size, alignment);
-
- if (!si_alloc_resource(sscreen, resource))
- goto error;
- } else {
- resource->buf = imported_buf;
- resource->gpu_address = sscreen->ws->buffer_get_virtual_address(resource->buf);
- resource->bo_size = imported_buf->size;
- resource->bo_alignment = imported_buf->alignment;
- resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf);
- if (resource->domains & RADEON_DOMAIN_VRAM)
- resource->vram_usage = resource->bo_size;
- else if (resource->domains & RADEON_DOMAIN_GTT)
- resource->gart_usage = resource->bo_size;
- }
-
- if (tex->cmask_buffer) {
- /* Initialize the cmask to 0xCC (= compressed state). */
- si_screen_clear_buffer(sscreen, &tex->cmask_buffer->b.b,
- tex->surface.cmask_offset, tex->surface.cmask_size,
- 0xCCCCCCCC);
- }
- if (tex->surface.htile_offset) {
- uint32_t clear_value = 0;
-
- if (sscreen->info.chip_class >= GFX9 || tex->tc_compatible_htile)
- clear_value = 0x0000030F;
-
- si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
- tex->surface.htile_offset,
- tex->surface.htile_size,
- clear_value);
- }
-
- /* Initialize DCC only if the texture is not being imported. */
- if (!(surface->flags & RADEON_SURF_IMPORTED) && tex->surface.dcc_offset) {
- /* Clear DCC to black for all tiles with DCC enabled.
- *
- * This fixes corruption in 3DMark Slingshot Extreme, which
- * uses uninitialized textures.
- */
- if (tex->surface.num_dcc_levels == tex->buffer.b.b.last_level + 1 &&
- tex->buffer.b.b.nr_samples <= 2) {
- /* Simple case - all tiles have DCC enabled. */
- si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
- tex->surface.dcc_offset,
- tex->surface.dcc_size,
- DCC_CLEAR_COLOR_0000);
- } else if (sscreen->info.chip_class >= GFX9) {
- /* Clear to uncompressed. Clearing this to black is complicated. */
- si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
- tex->surface.dcc_offset,
- tex->surface.dcc_size,
- DCC_UNCOMPRESSED);
- } else {
- /* GFX8: Initialize mipmap levels and multisamples separately. */
- if (tex->buffer.b.b.nr_samples >= 2) {
- /* Clearing this to black is complicated. */
- si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
- tex->surface.dcc_offset,
- tex->surface.dcc_size,
- DCC_UNCOMPRESSED);
- } else {
- /* Clear the enabled mipmap levels to black. */
- unsigned size = 0;
-
- for (unsigned i = 0; i < tex->surface.num_dcc_levels; i++) {
- if (!tex->surface.u.legacy.level[i].dcc_fast_clear_size)
- break;
-
- size = tex->surface.u.legacy.level[i].dcc_offset +
- tex->surface.u.legacy.level[i].dcc_fast_clear_size;
- }
-
- /* Mipmap levels with DCC. */
- if (size) {
- si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
- tex->surface.dcc_offset, size,
- DCC_CLEAR_COLOR_0000);
- }
- /* Mipmap levels without DCC. */
- if (size != tex->surface.dcc_size) {
- si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
- tex->surface.dcc_offset + size,
- tex->surface.dcc_size - size,
- DCC_UNCOMPRESSED);
- }
- }
- }
-
- /* Initialize displayable DCC that requires the retile blit. */
- if (tex->surface.dcc_retile_map_offset) {
- /* Uninitialized DCC can hang the display hw.
- * Clear to white to indicate that. */
- si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
- tex->surface.display_dcc_offset,
- tex->surface.u.gfx9.display_dcc_size,
- DCC_CLEAR_COLOR_1111);
-
- /* Upload the DCC retile map.
- * Use a staging buffer for the upload, because
- * the buffer backing the texture is unmappable.
- */
- bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
- unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
- struct si_resource *buf =
- si_aligned_buffer_create(screen, 0, PIPE_USAGE_STREAM,
- num_elements * (use_uint16 ? 2 : 4),
- sscreen->info.tcc_cache_line_size);
- uint32_t *ui = (uint32_t*)sscreen->ws->buffer_map(buf->buf, NULL,
- PIPE_TRANSFER_WRITE);
- uint16_t *us = (uint16_t*)ui;
-
- /* Upload the retile map into a staging buffer. */
- if (use_uint16) {
- for (unsigned i = 0; i < num_elements; i++)
- us[i] = tex->surface.u.gfx9.dcc_retile_map[i];
- } else {
- for (unsigned i = 0; i < num_elements; i++)
- ui[i] = tex->surface.u.gfx9.dcc_retile_map[i];
- }
-
- /* Copy the staging buffer to the buffer backing the texture. */
- struct si_context *sctx = (struct si_context*)sscreen->aux_context;
-
- assert(tex->surface.dcc_retile_map_offset <= UINT_MAX);
- simple_mtx_lock(&sscreen->aux_context_lock);
- si_sdma_copy_buffer(sctx, &tex->buffer.b.b, &buf->b.b,
- tex->surface.dcc_retile_map_offset,
- 0, buf->b.b.width0);
- sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
- simple_mtx_unlock(&sscreen->aux_context_lock);
-
- si_resource_reference(&buf, NULL);
- }
- }
-
- /* Initialize the CMASK base register value. */
- tex->cmask_base_address_reg =
- (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8;
-
- if (sscreen->debug_flags & DBG(VM)) {
- fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Texture %ix%ix%i, %i levels, %i samples, %s\n",
- tex->buffer.gpu_address,
- tex->buffer.gpu_address + tex->buffer.buf->size,
- base->width0, base->height0, util_num_layers(base, 0), base->last_level+1,
- base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format));
- }
-
- if (sscreen->debug_flags & DBG(TEX)) {
- puts("Texture:");
- struct u_log_context log;
- u_log_context_init(&log);
- si_print_texture_info(sscreen, tex, &log);
- u_log_new_page_print(&log, stdout);
- fflush(stdout);
- u_log_context_destroy(&log);
- }
-
- return tex;
+ struct si_texture *tex;
+ struct si_resource *resource;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+
+ tex = CALLOC_STRUCT(si_texture);
+ if (!tex)
+ goto error;
+
+ resource = &tex->buffer;
+ resource->b.b = *base;
+ resource->b.b.next = NULL;
+ resource->b.vtbl = &si_texture_vtbl;
+ pipe_reference_init(&resource->b.b.reference, 1);
+ resource->b.b.screen = screen;
+
+ /* don't include stencil-only formats which we don't support for rendering */
+ tex->is_depth = util_format_has_depth(util_format_description(tex->buffer.b.b.format));
+ tex->surface = *surface;
+ tex->tc_compatible_htile =
+ tex->surface.htile_size != 0 && (tex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE);
+
+ /* TC-compatible HTILE:
+ * - GFX8 only supports Z32_FLOAT.
+ * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */
+ if (tex->tc_compatible_htile) {
+ if (sscreen->info.chip_class >= GFX9 && base->format == PIPE_FORMAT_Z16_UNORM)
+ tex->db_render_format = base->format;
+ else {
+ tex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
+ tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT &&
+ base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
+ }
+ } else {
+ tex->db_render_format = base->format;
+ }
+
+ /* Applies to GCN. */
+ tex->last_msaa_resolve_target_micro_mode = tex->surface.micro_tile_mode;
+
+ /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers
+ * between frames, so the only thing that can enable separate DCC
+ * with DRI2 is multiple slow clears within a frame.
+ */
+ tex->ps_draw_ratio = 0;
+
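+ /* Apply the plane offset: GFX9+ keeps a single surface offset, older chips offset every mip level. */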
+ if (sscreen->info.chip_class >= GFX9) {
+ tex->surface.u.gfx9.surf_offset = offset;
+ } else {
+ for (unsigned i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i)
+ tex->surface.u.legacy.level[i].offset += offset;
+ }
+
+ if (tex->is_depth) {
+ if (sscreen->info.chip_class >= GFX9) {
+ tex->can_sample_z = true;
+ tex->can_sample_s = true;
+
+ /* Stencil texturing with HTILE doesn't work
+ * with mipmapping on Navi10-14. */
+ if ((sscreen->info.family == CHIP_NAVI10 || sscreen->info.family == CHIP_NAVI12 ||
+ sscreen->info.family == CHIP_NAVI14) &&
+ base->last_level > 0)
+ tex->htile_stencil_disabled = true;
+ } else {
+ tex->can_sample_z = !tex->surface.u.legacy.depth_adjusted;
+ tex->can_sample_s = !tex->surface.u.legacy.stencil_adjusted;
+ }
+
+ tex->db_compatible = surface->flags & RADEON_SURF_ZBUFFER;
+ } else {
+ if (tex->surface.cmask_offset) {
+ tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
+ tex->cmask_buffer = &tex->buffer;
+ }
+ }
+
+ if (plane0) {
+ /* The buffer is shared with the first plane. */
+ resource->bo_size = plane0->buffer.bo_size;
+ resource->bo_alignment = plane0->buffer.bo_alignment;
+ resource->flags = plane0->buffer.flags;
+ resource->domains = plane0->buffer.domains;
+ resource->vram_usage = plane0->buffer.vram_usage;
+ resource->gart_usage = plane0->buffer.gart_usage;
+
+ pb_reference(&resource->buf, plane0->buffer.buf);
+ resource->gpu_address = plane0->buffer.gpu_address;
+ } else if (!(surface->flags & RADEON_SURF_IMPORTED)) {
+ /* Create the backing buffer. */
+ si_init_resource_fields(sscreen, resource, alloc_size, alignment);
+
+ if (!si_alloc_resource(sscreen, resource))
+ goto error;
+ } else {
+ resource->buf = imported_buf;
+ resource->gpu_address = sscreen->ws->buffer_get_virtual_address(resource->buf);
+ resource->bo_size = imported_buf->size;
+ resource->bo_alignment = imported_buf->alignment;
+ resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf);
+ if (resource->domains & RADEON_DOMAIN_VRAM)
+ resource->vram_usage = resource->bo_size;
+ else if (resource->domains & RADEON_DOMAIN_GTT)
+ resource->gart_usage = resource->bo_size;
+ }
+
+ if (tex->cmask_buffer) {
+ /* Initialize the cmask to 0xCC (= compressed state). */
+ si_screen_clear_buffer(sscreen, &tex->cmask_buffer->b.b, tex->surface.cmask_offset,
+ tex->surface.cmask_size, 0xCCCCCCCC);
+ }
+ if (tex->surface.htile_offset) {
+ uint32_t clear_value = 0;
+
+ if (sscreen->info.chip_class >= GFX9 || tex->tc_compatible_htile)
+ clear_value = 0x0000030F;
+
+ si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.htile_offset,
+ tex->surface.htile_size, clear_value);
+ }
+
+ /* Initialize DCC only if the texture is not being imported. */
+ if (!(surface->flags & RADEON_SURF_IMPORTED) && tex->surface.dcc_offset) {
+ /* Clear DCC to black for all tiles with DCC enabled.
+ *
+ * This fixes corruption in 3DMark Slingshot Extreme, which
+ * uses uninitialized textures.
+ */
+ if (tex->surface.num_dcc_levels == tex->buffer.b.b.last_level + 1 &&
+ tex->buffer.b.b.nr_samples <= 2) {
+ /* Simple case - all tiles have DCC enabled. */
+ si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset,
+ tex->surface.dcc_size, DCC_CLEAR_COLOR_0000);
+ } else if (sscreen->info.chip_class >= GFX9) {
+ /* Clear to uncompressed. Clearing this to black is complicated. */
+ si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset,
+ tex->surface.dcc_size, DCC_UNCOMPRESSED);
+ } else {
+ /* GFX8: Initialize mipmap levels and multisamples separately. */
+ if (tex->buffer.b.b.nr_samples >= 2) {
+ /* Clearing this to black is complicated. */
+ si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset,
+ tex->surface.dcc_size, DCC_UNCOMPRESSED);
+ } else {
+ /* Clear the enabled mipmap levels to black. */
+ unsigned size = 0;
+
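+ /* Compute how many DCC bytes cover the mip levels that can be fast-cleared. */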
+ for (unsigned i = 0; i < tex->surface.num_dcc_levels; i++) {
+ if (!tex->surface.u.legacy.level[i].dcc_fast_clear_size)
+ break;
+
+ size = tex->surface.u.legacy.level[i].dcc_offset +
+ tex->surface.u.legacy.level[i].dcc_fast_clear_size;
+ }
+
+ /* Mipmap levels with DCC. */
+ if (size) {
+ si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset, size,
+ DCC_CLEAR_COLOR_0000);
+ }
+ /* Mipmap levels without DCC. */
+ if (size != tex->surface.dcc_size) {
+ si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset + size,
+ tex->surface.dcc_size - size, DCC_UNCOMPRESSED);
+ }
+ }
+ }
+
+ /* Initialize displayable DCC that requires the retile blit. */
+ if (tex->surface.dcc_retile_map_offset) {
+ /* Uninitialized DCC can hang the display hw.
+ * Clear to white to indicate that. */
+ si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.display_dcc_offset,
+ tex->surface.u.gfx9.display_dcc_size, DCC_CLEAR_COLOR_1111);
+
+ /* Upload the DCC retile map.
+ * Use a staging buffer for the upload, because
+ * the buffer backing the texture is unmappable.
+ */
+ bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
+ unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
+ struct si_resource *buf = si_aligned_buffer_create(screen, 0, PIPE_USAGE_STREAM,
+ num_elements * (use_uint16 ? 2 : 4),
+ sscreen->info.tcc_cache_line_size);
+ uint32_t *ui = (uint32_t *)sscreen->ws->buffer_map(buf->buf, NULL, PIPE_TRANSFER_WRITE);
+ uint16_t *us = (uint16_t *)ui;
+
+ /* Upload the retile map into a staging buffer. */
+ if (use_uint16) {
+ for (unsigned i = 0; i < num_elements; i++)
+ us[i] = tex->surface.u.gfx9.dcc_retile_map[i];
+ } else {
+ for (unsigned i = 0; i < num_elements; i++)
+ ui[i] = tex->surface.u.gfx9.dcc_retile_map[i];
+ }
+
+ /* Copy the staging buffer to the buffer backing the texture. */
+ struct si_context *sctx = (struct si_context *)sscreen->aux_context;
+
+ assert(tex->surface.dcc_retile_map_offset <= UINT_MAX);
+ simple_mtx_lock(&sscreen->aux_context_lock);
+ si_sdma_copy_buffer(sctx, &tex->buffer.b.b, &buf->b.b, tex->surface.dcc_retile_map_offset,
+ 0, buf->b.b.width0);
+ sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
+ simple_mtx_unlock(&sscreen->aux_context_lock);
+
+ si_resource_reference(&buf, NULL);
+ }
+ }
+
+ /* Initialize the CMASK base register value. */
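+ /* The hardware register takes a 256-byte-aligned address, hence the shift by 8. */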
+ tex->cmask_base_address_reg = (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8;
+
+ if (sscreen->debug_flags & DBG(VM)) {
+ fprintf(stderr,
+ "VM start=0x%" PRIX64 " end=0x%" PRIX64
+ " | Texture %ix%ix%i, %i levels, %i samples, %s\n",
+ tex->buffer.gpu_address, tex->buffer.gpu_address + tex->buffer.buf->size,
+ base->width0, base->height0, util_num_layers(base, 0), base->last_level + 1,
+ base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format));
+ }
+
+ if (sscreen->debug_flags & DBG(TEX)) {
+ puts("Texture:");
+ struct u_log_context log;
+ u_log_context_init(&log);
+ si_print_texture_info(sscreen, tex, &log);
+ u_log_new_page_print(&log, stdout);
+ fflush(stdout);
+ u_log_context_destroy(&log);
+ }
+
+ return tex;
error:
- FREE(tex);
- if (sscreen->info.chip_class >= GFX9)
- free(surface->u.gfx9.dcc_retile_map);
- return NULL;
+ FREE(tex);
+ if (sscreen->info.chip_class >= GFX9)
+ free(surface->u.gfx9.dcc_retile_map);
+ return NULL;
}
-static enum radeon_surf_mode
-si_choose_tiling(struct si_screen *sscreen,
- const struct pipe_resource *templ, bool tc_compatible_htile)
+static enum radeon_surf_mode si_choose_tiling(struct si_screen *sscreen,
+ const struct pipe_resource *templ,
+ bool tc_compatible_htile)
{
- const struct util_format_description *desc = util_format_description(templ->format);
- bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING;
- bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) &&
- !(templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH);
-
- /* MSAA resources must be 2D tiled. */
- if (templ->nr_samples > 1)
- return RADEON_SURF_MODE_2D;
-
- /* Transfer resources should be linear. */
- if (templ->flags & SI_RESOURCE_FLAG_TRANSFER)
- return RADEON_SURF_MODE_LINEAR_ALIGNED;
-
- /* Avoid Z/S decompress blits by forcing TC-compatible HTILE on GFX8,
- * which requires 2D tiling.
- */
- if (sscreen->info.chip_class == GFX8 && tc_compatible_htile)
- return RADEON_SURF_MODE_2D;
-
- /* Handle common candidates for the linear mode.
- * Compressed textures and DB surfaces must always be tiled.
- */
- if (!force_tiling &&
- !is_depth_stencil &&
- !util_format_is_compressed(templ->format)) {
- if (sscreen->debug_flags & DBG(NO_TILING))
- return RADEON_SURF_MODE_LINEAR_ALIGNED;
-
- /* Tiling doesn't work with the 422 (SUBSAMPLED) formats. */
- if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED)
- return RADEON_SURF_MODE_LINEAR_ALIGNED;
-
- /* Cursors are linear on AMD GCN.
- * (XXX double-check, maybe also use RADEON_SURF_SCANOUT) */
- if (templ->bind & PIPE_BIND_CURSOR)
- return RADEON_SURF_MODE_LINEAR_ALIGNED;
-
- if (templ->bind & PIPE_BIND_LINEAR)
- return RADEON_SURF_MODE_LINEAR_ALIGNED;
-
- /* Textures with a very small height are recommended to be linear. */
- if (templ->target == PIPE_TEXTURE_1D ||
- templ->target == PIPE_TEXTURE_1D_ARRAY ||
- /* Only very thin and long 2D textures should benefit from
- * linear_aligned. */
- (templ->width0 > 8 && templ->height0 <= 2))
- return RADEON_SURF_MODE_LINEAR_ALIGNED;
-
- /* Textures likely to be mapped often. */
- if (templ->usage == PIPE_USAGE_STAGING ||
- templ->usage == PIPE_USAGE_STREAM)
- return RADEON_SURF_MODE_LINEAR_ALIGNED;
- }
-
- /* Make small textures 1D tiled. */
- if (templ->width0 <= 16 || templ->height0 <= 16 ||
- (sscreen->debug_flags & DBG(NO_2D_TILING)))
- return RADEON_SURF_MODE_1D;
-
- /* The allocator will switch to 1D if needed. */
- return RADEON_SURF_MODE_2D;
+ const struct util_format_description *desc = util_format_description(templ->format);
+ bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING;
+ bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) &&
+ !(templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH);
+
+ /* MSAA resources must be 2D tiled. */
+ if (templ->nr_samples > 1)
+ return RADEON_SURF_MODE_2D;
+
+ /* Transfer resources should be linear. */
+ if (templ->flags & SI_RESOURCE_FLAG_TRANSFER)
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ /* Avoid Z/S decompress blits by forcing TC-compatible HTILE on GFX8,
+ * which requires 2D tiling.
+ */
+ if (sscreen->info.chip_class == GFX8 && tc_compatible_htile)
+ return RADEON_SURF_MODE_2D;
+
+ /* Handle common candidates for the linear mode.
+ * Compressed textures and DB surfaces must always be tiled.
+ */
+ if (!force_tiling && !is_depth_stencil && !util_format_is_compressed(templ->format)) {
+ if (sscreen->debug_flags & DBG(NO_TILING))
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ /* Tiling doesn't work with the 422 (SUBSAMPLED) formats. */
+ if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED)
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ /* Cursors are linear on AMD GCN.
+ * (XXX double-check, maybe also use RADEON_SURF_SCANOUT) */
+ if (templ->bind & PIPE_BIND_CURSOR)
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ if (templ->bind & PIPE_BIND_LINEAR)
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ /* Textures with a very small height are recommended to be linear. */
+ if (templ->target == PIPE_TEXTURE_1D || templ->target == PIPE_TEXTURE_1D_ARRAY ||
+ /* Only very thin and long 2D textures should benefit from
+ * linear_aligned. */
+ (templ->width0 > 8 && templ->height0 <= 2))
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+ /* Textures likely to be mapped often. */
+ if (templ->usage == PIPE_USAGE_STAGING || templ->usage == PIPE_USAGE_STREAM)
+ return RADEON_SURF_MODE_LINEAR_ALIGNED;
+ }
+
+ /* Make small textures 1D tiled. */
+ if (templ->width0 <= 16 || templ->height0 <= 16 || (sscreen->debug_flags & DBG(NO_2D_TILING)))
+ return RADEON_SURF_MODE_1D;
+
+ /* The allocator will switch to 1D if needed. */
+ return RADEON_SURF_MODE_2D;
}
struct pipe_resource *si_texture_create(struct pipe_screen *screen,
- const struct pipe_resource *templ)
+ const struct pipe_resource *templ)
{
- struct si_screen *sscreen = (struct si_screen*)screen;
- bool is_zs = util_format_is_depth_or_stencil(templ->format);
-
- if (templ->nr_samples >= 2) {
- /* This is hackish (overwriting the const pipe_resource template),
- * but should be harmless and state trackers can also see
- * the overridden number of samples in the created pipe_resource.
- */
- if (is_zs && sscreen->eqaa_force_z_samples) {
- ((struct pipe_resource*)templ)->nr_samples =
- ((struct pipe_resource*)templ)->nr_storage_samples =
- sscreen->eqaa_force_z_samples;
- } else if (!is_zs && sscreen->eqaa_force_color_samples) {
- ((struct pipe_resource*)templ)->nr_samples =
- sscreen->eqaa_force_coverage_samples;
- ((struct pipe_resource*)templ)->nr_storage_samples =
- sscreen->eqaa_force_color_samples;
- }
- }
-
- bool is_flushed_depth = templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH ||
- templ->flags & SI_RESOURCE_FLAG_TRANSFER;
- bool tc_compatible_htile =
- sscreen->info.chip_class >= GFX8 &&
- /* There are issues with TC-compatible HTILE on Tonga (and
- * Iceland is the same design), and documented bug workarounds
- * don't help. For example, this fails:
- * piglit/bin/tex-miplevel-selection 'texture()' 2DShadow -auto
- */
- sscreen->info.family != CHIP_TONGA &&
- sscreen->info.family != CHIP_ICELAND &&
- (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) &&
- !(sscreen->debug_flags & DBG(NO_HYPERZ)) &&
- !is_flushed_depth &&
- templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */
- is_zs;
- enum radeon_surf_mode tile_mode = si_choose_tiling(sscreen, templ,
- tc_compatible_htile);
-
- /* This allocates textures with multiple planes like NV12 in 1 buffer. */
- enum { SI_TEXTURE_MAX_PLANES = 3 };
- struct radeon_surf surface[SI_TEXTURE_MAX_PLANES] = {};
- struct pipe_resource plane_templ[SI_TEXTURE_MAX_PLANES];
- uint64_t plane_offset[SI_TEXTURE_MAX_PLANES] = {};
- uint64_t total_size = 0;
- unsigned max_alignment = 0;
- unsigned num_planes = util_format_get_num_planes(templ->format);
- assert(num_planes <= SI_TEXTURE_MAX_PLANES);
-
- /* Compute texture or plane layouts and offsets. */
- for (unsigned i = 0; i < num_planes; i++) {
- plane_templ[i] = *templ;
- plane_templ[i].format = util_format_get_plane_format(templ->format, i);
- plane_templ[i].width0 = util_format_get_plane_width(templ->format, i, templ->width0);
- plane_templ[i].height0 = util_format_get_plane_height(templ->format, i, templ->height0);
-
- /* Multi-plane allocations need PIPE_BIND_SHARED, because we can't
- * reallocate the storage to add PIPE_BIND_SHARED, because it's
- * shared by 3 pipe_resources.
- */
- if (num_planes > 1)
- plane_templ[i].bind |= PIPE_BIND_SHARED;
-
- if (si_init_surface(sscreen, &surface[i], &plane_templ[i],
- tile_mode, 0, false,
- plane_templ[i].bind & PIPE_BIND_SCANOUT,
- is_flushed_depth, tc_compatible_htile))
- return NULL;
-
- plane_offset[i] = align64(total_size, surface[i].surf_alignment);
- total_size = plane_offset[i] + surface[i].total_size;
- max_alignment = MAX2(max_alignment, surface[i].surf_alignment);
- }
-
- struct si_texture *plane0 = NULL, *last_plane = NULL;
-
- for (unsigned i = 0; i < num_planes; i++) {
- struct si_texture *tex =
- si_texture_create_object(screen, &plane_templ[i], &surface[i],
- plane0, NULL, plane_offset[i],
- total_size, max_alignment);
- if (!tex) {
- si_texture_reference(&plane0, NULL);
- return NULL;
- }
-
- tex->plane_index = i;
- tex->num_planes = num_planes;
-
- if (!plane0) {
- plane0 = last_plane = tex;
- } else {
- last_plane->buffer.b.b.next = &tex->buffer.b.b;
- last_plane = tex;
- }
- }
-
- return (struct pipe_resource *)plane0;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ bool is_zs = util_format_is_depth_or_stencil(templ->format);
+
+ if (templ->nr_samples >= 2) {
+ /* This is hackish (overwriting the const pipe_resource template),
+ * but should be harmless and state trackers can also see
+ * the overridden number of samples in the created pipe_resource.
+ */
+ if (is_zs && sscreen->eqaa_force_z_samples) {
+ ((struct pipe_resource *)templ)->nr_samples =
+ ((struct pipe_resource *)templ)->nr_storage_samples = sscreen->eqaa_force_z_samples;
+ } else if (!is_zs && sscreen->eqaa_force_color_samples) {
+ ((struct pipe_resource *)templ)->nr_samples = sscreen->eqaa_force_coverage_samples;
+ ((struct pipe_resource *)templ)->nr_storage_samples = sscreen->eqaa_force_color_samples;
+ }
+ }
+
+ bool is_flushed_depth =
+ templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH || templ->flags & SI_RESOURCE_FLAG_TRANSFER;
+ bool tc_compatible_htile =
+ sscreen->info.chip_class >= GFX8 &&
+ /* There are issues with TC-compatible HTILE on Tonga (and
+ * Iceland is the same design), and documented bug workarounds
+ * don't help. For example, this fails:
+ * piglit/bin/tex-miplevel-selection 'texture()' 2DShadow -auto
+ */
+ sscreen->info.family != CHIP_TONGA && sscreen->info.family != CHIP_ICELAND &&
+ (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) &&
+ !(sscreen->debug_flags & DBG(NO_HYPERZ)) && !is_flushed_depth &&
+ templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */
+ is_zs;
+ enum radeon_surf_mode tile_mode = si_choose_tiling(sscreen, templ, tc_compatible_htile);
+
+ /* This allocates textures with multiple planes like NV12 in 1 buffer. */
+ enum
+ {
+ SI_TEXTURE_MAX_PLANES = 3
+ };
+ struct radeon_surf surface[SI_TEXTURE_MAX_PLANES] = {};
+ struct pipe_resource plane_templ[SI_TEXTURE_MAX_PLANES];
+ uint64_t plane_offset[SI_TEXTURE_MAX_PLANES] = {};
+ uint64_t total_size = 0;
+ unsigned max_alignment = 0;
+ unsigned num_planes = util_format_get_num_planes(templ->format);
+ assert(num_planes <= SI_TEXTURE_MAX_PLANES);
+
+ /* Compute texture or plane layouts and offsets. */
+ for (unsigned i = 0; i < num_planes; i++) {
+ plane_templ[i] = *templ;
+ plane_templ[i].format = util_format_get_plane_format(templ->format, i);
+ plane_templ[i].width0 = util_format_get_plane_width(templ->format, i, templ->width0);
+ plane_templ[i].height0 = util_format_get_plane_height(templ->format, i, templ->height0);
+
+ /* Multi-plane allocations need PIPE_BIND_SHARED, because we can't
+ * reallocate the storage to add PIPE_BIND_SHARED later, since it's
+ * shared by 3 pipe_resources.
+ */
+ if (num_planes > 1)
+ plane_templ[i].bind |= PIPE_BIND_SHARED;
+
+ if (si_init_surface(sscreen, &surface[i], &plane_templ[i], tile_mode, 0, false,
+ plane_templ[i].bind & PIPE_BIND_SCANOUT, is_flushed_depth,
+ tc_compatible_htile))
+ return NULL;
+
+ plane_offset[i] = align64(total_size, surface[i].surf_alignment);
+ total_size = plane_offset[i] + surface[i].total_size;
+ max_alignment = MAX2(max_alignment, surface[i].surf_alignment);
+ }
+
+ struct si_texture *plane0 = NULL, *last_plane = NULL;
+
+ for (unsigned i = 0; i < num_planes; i++) {
+ struct si_texture *tex =
+ si_texture_create_object(screen, &plane_templ[i], &surface[i], plane0, NULL,
+ plane_offset[i], total_size, max_alignment);
+ if (!tex) {
+ si_texture_reference(&plane0, NULL);
+ return NULL;
+ }
+
+ tex->plane_index = i;
+ tex->num_planes = num_planes;
+
+ if (!plane0) {
+ plane0 = last_plane = tex;
+ } else {
+ last_plane->buffer.b.b.next = &tex->buffer.b.b;
+ last_plane = tex;
+ }
+ }
+
+ return (struct pipe_resource *)plane0;
}
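The plane loop above packs NV12-style planes back to back in one buffer: each plane starts at the running total rounded up to that plane's surface alignment, and the final total becomes the buffer size. A minimal standalone sketch of that packing arithmetic follows; the plane sizes and alignments are made-up placeholders, not values si_init_surface would compute.

#include <inttypes.h>
#include <stdio.h>

/* Round "value" up to the next multiple of a power-of-two "alignment"
 * (same contract as the driver's align64 helper). */
static uint64_t align_u64(uint64_t value, uint64_t alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}

int main(void)
{
   /* Hypothetical NV12-like layout: plane 0 = full-res Y,
    * plane 1 = half-res interleaved CbCr. */
   uint64_t plane_size[2] = {1920 * 1080, 960 * 540 * 2};
   uint64_t plane_align[2] = {65536, 65536};
   uint64_t plane_offset[2], total_size = 0;

   for (unsigned i = 0; i < 2; i++) {
      plane_offset[i] = align_u64(total_size, plane_align[i]);
      total_size = plane_offset[i] + plane_size[i];
      printf("plane %u: offset %" PRIu64 "\n", i, plane_offset[i]);
   }
   printf("total buffer size: %" PRIu64 "\n", total_size);
   return 0;
}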
static struct pipe_resource *si_texture_from_winsys_buffer(struct si_screen *sscreen,
- const struct pipe_resource *templ,
- struct pb_buffer *buf,
- unsigned stride,
- unsigned offset,
- unsigned usage,
- bool dedicated)
+ const struct pipe_resource *templ,
+ struct pb_buffer *buf, unsigned stride,
+ unsigned offset, unsigned usage,
+ bool dedicated)
{
- enum radeon_surf_mode array_mode;
- struct radeon_surf surface = {};
- struct radeon_bo_metadata metadata = {};
- struct si_texture *tex;
- bool is_scanout;
- int r;
-
- /* Ignore metadata for non-zero planes. */
- if (offset != 0)
- dedicated = false;
-
- if (dedicated) {
- sscreen->ws->buffer_get_metadata(buf, &metadata);
- si_get_display_metadata(sscreen, &surface, &metadata,
- &array_mode, &is_scanout);
- } else {
- /**
- * The bo metadata is unset for un-dedicated images. So we fall
- * back to linear. See answer to question 5 of the
- * VK_KHX_external_memory spec for some details.
- *
- * It is possible that this case isn't going to work if the
- * surface pitch isn't correctly aligned by default.
- *
- * In order to support it correctly we require multi-image
- * metadata to be syncrhonized between radv and radeonsi. The
- * semantics of associating multiple image metadata to a memory
- * object on the vulkan export side are not concretely defined
- * either.
- *
- * All the use cases we are aware of at the moment for memory
- * objects use dedicated allocations. So lets keep the initial
- * implementation simple.
- *
- * A possible alternative is to attempt to reconstruct the
- * tiling information when the TexParameter TEXTURE_TILING_EXT
- * is set.
- */
- array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
- is_scanout = false;
- }
-
- r = si_init_surface(sscreen, &surface, templ,
- array_mode, stride, true, is_scanout,
- false, false);
- if (r)
- return NULL;
-
- tex = si_texture_create_object(&sscreen->b, templ, &surface, NULL, buf,
- offset, 0, 0);
- if (!tex)
- return NULL;
-
- tex->buffer.b.is_shared = true;
- tex->buffer.external_usage = usage;
- tex->num_planes = 1;
-
- if (!si_read_tex_bo_metadata(sscreen, tex, offset, &metadata)) {
- si_texture_reference(&tex, NULL);
- return NULL;
- }
-
- /* Displayable DCC requires an explicit flush. */
- if (dedicated && offset == 0 &&
- !(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
- si_has_displayable_dcc(tex)) {
- /* TODO: do we need to decompress DCC? */
- if (si_texture_discard_dcc(sscreen, tex)) {
- /* Update BO metadata after disabling DCC. */
- si_set_tex_bo_metadata(sscreen, tex);
- }
- }
-
- assert(tex->surface.tile_swizzle == 0);
- return &tex->buffer.b.b;
+ enum radeon_surf_mode array_mode;
+ struct radeon_surf surface = {};
+ struct radeon_bo_metadata metadata = {};
+ struct si_texture *tex;
+ bool is_scanout;
+ int r;
+
+ /* Ignore metadata for non-zero planes. */
+ if (offset != 0)
+ dedicated = false;
+
+ if (dedicated) {
+ sscreen->ws->buffer_get_metadata(buf, &metadata);
+ si_get_display_metadata(sscreen, &surface, &metadata, &array_mode, &is_scanout);
+ } else {
+ /**
+ * The bo metadata is unset for un-dedicated images. So we fall
+ * back to linear. See answer to question 5 of the
+ * VK_KHX_external_memory spec for some details.
+ *
+ * It is possible that this case isn't going to work if the
+ * surface pitch isn't correctly aligned by default.
+ *
+ * In order to support it correctly we require multi-image
+ * metadata to be synchronized between radv and radeonsi. The
+ * semantics of associating multiple image metadata to a memory
+ * object on the vulkan export side are not concretely defined
+ * either.
+ *
+ * All the use cases we are aware of at the moment for memory
+ * objects use dedicated allocations. So let's keep the initial
+ * implementation simple.
+ *
+ * A possible alternative is to attempt to reconstruct the
+ * tiling information when the TexParameter TEXTURE_TILING_EXT
+ * is set.
+ */
+ array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+ is_scanout = false;
+ }
+
+ r =
+ si_init_surface(sscreen, &surface, templ, array_mode, stride, true, is_scanout, false, false);
+ if (r)
+ return NULL;
+
+ tex = si_texture_create_object(&sscreen->b, templ, &surface, NULL, buf, offset, 0, 0);
+ if (!tex)
+ return NULL;
+
+ tex->buffer.b.is_shared = true;
+ tex->buffer.external_usage = usage;
+ tex->num_planes = 1;
+
+ if (!si_read_tex_bo_metadata(sscreen, tex, offset, &metadata)) {
+ si_texture_reference(&tex, NULL);
+ return NULL;
+ }
+
+ /* Displayable DCC requires an explicit flush. */
+ if (dedicated && offset == 0 && !(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
+ si_has_displayable_dcc(tex)) {
+ /* TODO: do we need to decompress DCC? */
+ if (si_texture_discard_dcc(sscreen, tex)) {
+ /* Update BO metadata after disabling DCC. */
+ si_set_tex_bo_metadata(sscreen, tex);
+ }
+ }
+
+ assert(tex->surface.tile_swizzle == 0);
+ return &tex->buffer.b.b;
}
static struct pipe_resource *si_texture_from_handle(struct pipe_screen *screen,
- const struct pipe_resource *templ,
- struct winsys_handle *whandle,
- unsigned usage)
+ const struct pipe_resource *templ,
+ struct winsys_handle *whandle, unsigned usage)
{
- struct si_screen *sscreen = (struct si_screen*)screen;
- struct pb_buffer *buf = NULL;
-
- /* Support only 2D textures without mipmaps */
- if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT &&
- templ->target != PIPE_TEXTURE_2D_ARRAY) ||
- templ->last_level != 0)
- return NULL;
-
- buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle,
- sscreen->info.max_alignment);
- if (!buf)
- return NULL;
-
- return si_texture_from_winsys_buffer(sscreen, templ, buf,
- whandle->stride, whandle->offset,
- usage, true);
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct pb_buffer *buf = NULL;
+
+ /* Support only 2D textures without mipmaps */
+ if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT &&
+ templ->target != PIPE_TEXTURE_2D_ARRAY) ||
+ templ->last_level != 0)
+ return NULL;
+
+ buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, sscreen->info.max_alignment);
+ if (!buf)
+ return NULL;
+
+ return si_texture_from_winsys_buffer(sscreen, templ, buf, whandle->stride, whandle->offset,
+ usage, true);
}
-bool si_init_flushed_depth_texture(struct pipe_context *ctx,
- struct pipe_resource *texture)
+bool si_init_flushed_depth_texture(struct pipe_context *ctx, struct pipe_resource *texture)
{
- struct si_texture *tex = (struct si_texture*)texture;
- struct pipe_resource resource;
- enum pipe_format pipe_format = texture->format;
-
- assert(!tex->flushed_depth_texture);
-
- if (!tex->can_sample_z && tex->can_sample_s) {
- switch (pipe_format) {
- case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
- /* Save memory by not allocating the S plane. */
- pipe_format = PIPE_FORMAT_Z32_FLOAT;
- break;
- case PIPE_FORMAT_Z24_UNORM_S8_UINT:
- case PIPE_FORMAT_S8_UINT_Z24_UNORM:
- /* Save memory bandwidth by not copying the
- * stencil part during flush.
- *
- * This potentially increases memory bandwidth
- * if an application uses both Z and S texturing
- * simultaneously (a flushed Z24S8 texture
- * would be stored compactly), but how often
- * does that really happen?
- */
- pipe_format = PIPE_FORMAT_Z24X8_UNORM;
- break;
- default:;
- }
- } else if (!tex->can_sample_s && tex->can_sample_z) {
- assert(util_format_has_stencil(util_format_description(pipe_format)));
-
- /* DB->CB copies to an 8bpp surface don't work. */
- pipe_format = PIPE_FORMAT_X24S8_UINT;
- }
-
- memset(&resource, 0, sizeof(resource));
- resource.target = texture->target;
- resource.format = pipe_format;
- resource.width0 = texture->width0;
- resource.height0 = texture->height0;
- resource.depth0 = texture->depth0;
- resource.array_size = texture->array_size;
- resource.last_level = texture->last_level;
- resource.nr_samples = texture->nr_samples;
- resource.usage = PIPE_USAGE_DEFAULT;
- resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL;
- resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH;
-
- tex->flushed_depth_texture = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource);
- if (!tex->flushed_depth_texture) {
- PRINT_ERR("failed to create temporary texture to hold flushed depth\n");
- return false;
- }
- return true;
+ struct si_texture *tex = (struct si_texture *)texture;
+ struct pipe_resource resource;
+ enum pipe_format pipe_format = texture->format;
+
+ assert(!tex->flushed_depth_texture);
+
+ if (!tex->can_sample_z && tex->can_sample_s) {
+ switch (pipe_format) {
+ case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+ /* Save memory by not allocating the S plane. */
+ pipe_format = PIPE_FORMAT_Z32_FLOAT;
+ break;
+ case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+ case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+ /* Save memory bandwidth by not copying the
+ * stencil part during flush.
+ *
+ * This potentially increases memory bandwidth
+ * if an application uses both Z and S texturing
+ * simultaneously (a flushed Z24S8 texture
+ * would be stored compactly), but how often
+ * does that really happen?
+ */
+ pipe_format = PIPE_FORMAT_Z24X8_UNORM;
+ break;
+ default:;
+ }
+ } else if (!tex->can_sample_s && tex->can_sample_z) {
+ assert(util_format_has_stencil(util_format_description(pipe_format)));
+
+ /* DB->CB copies to an 8bpp surface don't work. */
+ pipe_format = PIPE_FORMAT_X24S8_UINT;
+ }
+
+ memset(&resource, 0, sizeof(resource));
+ resource.target = texture->target;
+ resource.format = pipe_format;
+ resource.width0 = texture->width0;
+ resource.height0 = texture->height0;
+ resource.depth0 = texture->depth0;
+ resource.array_size = texture->array_size;
+ resource.last_level = texture->last_level;
+ resource.nr_samples = texture->nr_samples;
+ resource.usage = PIPE_USAGE_DEFAULT;
+ resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL;
+ resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH;
+
+ tex->flushed_depth_texture =
+ (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource);
+ if (!tex->flushed_depth_texture) {
+ PRINT_ERR("failed to create temporary texture to hold flushed depth\n");
+ return false;
+ }
+ return true;
}
/**
* Initialize the pipe_resource descriptor to be of the same size as the box,
* which is supposed to hold a subregion of the texture "orig" at the given
* mipmap level.
*/
-static void si_init_temp_resource_from_box(struct pipe_resource *res,
- struct pipe_resource *orig,
- const struct pipe_box *box,
- unsigned level, unsigned flags)
+static void si_init_temp_resource_from_box(struct pipe_resource *res, struct pipe_resource *orig,
+ const struct pipe_box *box, unsigned level,
+ unsigned flags)
{
- memset(res, 0, sizeof(*res));
- res->format = orig->format;
- res->width0 = box->width;
- res->height0 = box->height;
- res->depth0 = 1;
- res->array_size = 1;
- res->usage = flags & SI_RESOURCE_FLAG_TRANSFER ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT;
- res->flags = flags;
-
- if (flags & SI_RESOURCE_FLAG_TRANSFER &&
- util_format_is_compressed(orig->format)) {
- /* Transfer resources are allocated with linear tiling, which is
- * not supported for compressed formats.
- */
- unsigned blocksize =
- util_format_get_blocksize(orig->format);
-
- if (blocksize == 8) {
- res->format = PIPE_FORMAT_R16G16B16A16_UINT;
- } else {
- assert(blocksize == 16);
- res->format = PIPE_FORMAT_R32G32B32A32_UINT;
- }
-
- res->width0 = util_format_get_nblocksx(orig->format, box->width);
- res->height0 = util_format_get_nblocksy(orig->format, box->height);
- }
-
- /* We must set the correct texture target and dimensions for a 3D box. */
- if (box->depth > 1 && util_max_layer(orig, level) > 0) {
- res->target = PIPE_TEXTURE_2D_ARRAY;
- res->array_size = box->depth;
- } else {
- res->target = PIPE_TEXTURE_2D;
- }
+ memset(res, 0, sizeof(*res));
+ res->format = orig->format;
+ res->width0 = box->width;
+ res->height0 = box->height;
+ res->depth0 = 1;
+ res->array_size = 1;
+ res->usage = flags & SI_RESOURCE_FLAG_TRANSFER ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT;
+ res->flags = flags;
+
+ if (flags & SI_RESOURCE_FLAG_TRANSFER && util_format_is_compressed(orig->format)) {
+ /* Transfer resources are allocated with linear tiling, which is
+ * not supported for compressed formats.
+ */
+ unsigned blocksize = util_format_get_blocksize(orig->format);
+
+ if (blocksize == 8) {
+ res->format = PIPE_FORMAT_R16G16B16A16_UINT;
+ } else {
+ assert(blocksize == 16);
+ res->format = PIPE_FORMAT_R32G32B32A32_UINT;
+ }
+
+ res->width0 = util_format_get_nblocksx(orig->format, box->width);
+ res->height0 = util_format_get_nblocksy(orig->format, box->height);
+ }
+
+ /* We must set the correct texture target and dimensions for a 3D box. */
+ if (box->depth > 1 && util_max_layer(orig, level) > 0) {
+ res->target = PIPE_TEXTURE_2D_ARRAY;
+ res->array_size = box->depth;
+ } else {
+ res->target = PIPE_TEXTURE_2D;
+ }
}
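Because the linear transfer path can't allocate compressed formats, the function above switches the staging resource to a wide uint format that holds one texel per compressed block and shrinks width/height to block counts. A rough numeric sketch, assuming a BC1-style format (4x4 blocks, 8 bytes per block); div_round_up and the struct below are local stand-ins for the util_format helpers.

#include <stdio.h>

struct block_layout {
   unsigned width, height, bytes; /* block dimensions and compressed size */
};

static unsigned div_round_up(unsigned a, unsigned b)
{
   return (a + b - 1) / b;
}

int main(void)
{
   /* BC1/DXT1: 4x4 pixel blocks, 8 bytes each (well-known format property). */
   struct block_layout bc1 = {4, 4, 8};
   unsigned box_w = 130, box_h = 66; /* arbitrary transfer box in pixels */

   /* The staging texture stores one "fat" uint texel per compressed block:
    * 8-byte blocks map to R16G16B16A16_UINT, 16-byte blocks to
    * R32G32B32A32_UINT. */
   unsigned staging_w = div_round_up(box_w, bc1.width);
   unsigned staging_h = div_round_up(box_h, bc1.height);
   const char *staging_fmt =
      bc1.bytes == 8 ? "PIPE_FORMAT_R16G16B16A16_UINT" : "PIPE_FORMAT_R32G32B32A32_UINT";

   printf("staging: %ux%u texels of %s\n", staging_w, staging_h, staging_fmt);
   return 0;
}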
-static bool si_can_invalidate_texture(struct si_screen *sscreen,
- struct si_texture *tex,
- unsigned transfer_usage,
- const struct pipe_box *box)
+static bool si_can_invalidate_texture(struct si_screen *sscreen, struct si_texture *tex,
+ unsigned transfer_usage, const struct pipe_box *box)
{
- return !tex->buffer.b.is_shared &&
- !(tex->surface.flags & RADEON_SURF_IMPORTED) &&
- !(transfer_usage & PIPE_TRANSFER_READ) &&
- tex->buffer.b.b.last_level == 0 &&
- util_texrange_covers_whole_level(&tex->buffer.b.b, 0,
- box->x, box->y, box->z,
- box->width, box->height,
- box->depth);
+ return !tex->buffer.b.is_shared && !(tex->surface.flags & RADEON_SURF_IMPORTED) &&
+ !(transfer_usage & PIPE_TRANSFER_READ) && tex->buffer.b.b.last_level == 0 &&
+ util_texrange_covers_whole_level(&tex->buffer.b.b, 0, box->x, box->y, box->z, box->width,
+ box->height, box->depth);
}
-static void si_texture_invalidate_storage(struct si_context *sctx,
- struct si_texture *tex)
+static void si_texture_invalidate_storage(struct si_context *sctx, struct si_texture *tex)
{
- struct si_screen *sscreen = sctx->screen;
+ struct si_screen *sscreen = sctx->screen;
- /* There is no point in discarding depth and tiled buffers. */
- assert(!tex->is_depth);
- assert(tex->surface.is_linear);
+ /* There is no point in discarding depth and tiled buffers. */
+ assert(!tex->is_depth);
+ assert(tex->surface.is_linear);
- /* Reallocate the buffer in the same pipe_resource. */
- si_alloc_resource(sscreen, &tex->buffer);
+ /* Reallocate the buffer in the same pipe_resource. */
+ si_alloc_resource(sscreen, &tex->buffer);
- /* Initialize the CMASK base address (needed even without CMASK). */
- tex->cmask_base_address_reg =
- (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8;
+ /* Initialize the CMASK base address (needed even without CMASK). */
+ tex->cmask_base_address_reg = (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8;
- p_atomic_inc(&sscreen->dirty_tex_counter);
+ p_atomic_inc(&sscreen->dirty_tex_counter);
- sctx->num_alloc_tex_transfer_bytes += tex->surface.total_size;
+ sctx->num_alloc_tex_transfer_bytes += tex->surface.total_size;
}
-static void *si_texture_transfer_map(struct pipe_context *ctx,
- struct pipe_resource *texture,
- unsigned level,
- unsigned usage,
- const struct pipe_box *box,
- struct pipe_transfer **ptransfer)
+static void *si_texture_transfer_map(struct pipe_context *ctx, struct pipe_resource *texture,
+ unsigned level, unsigned usage, const struct pipe_box *box,
+ struct pipe_transfer **ptransfer)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_texture *tex = (struct si_texture*)texture;
- struct si_transfer *trans;
- struct si_resource *buf;
- unsigned offset = 0;
- char *map;
- bool use_staging_texture = false;
-
- assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER));
- assert(box->width && box->height && box->depth);
-
- if (tex->is_depth) {
- /* Depth textures use staging unconditionally. */
- use_staging_texture = true;
- } else {
- /* Degrade the tile mode if we get too many transfers on APUs.
- * On dGPUs, the staging texture is always faster.
- * Only count uploads that are at least 4x4 pixels large.
- */
- if (!sctx->screen->info.has_dedicated_vram &&
- level == 0 &&
- box->width >= 4 && box->height >= 4 &&
- p_atomic_inc_return(&tex->num_level0_transfers) == 10) {
- bool can_invalidate =
- si_can_invalidate_texture(sctx->screen, tex,
- usage, box);
-
- si_reallocate_texture_inplace(sctx, tex,
- PIPE_BIND_LINEAR,
- can_invalidate);
- }
-
- /* Tiled textures need to be converted into a linear texture for CPU
- * access. The staging texture is always linear and is placed in GART.
- *
- * Reading from VRAM or GTT WC is slow, always use the staging
- * texture in this case.
- *
- * Use the staging texture for uploads if the underlying BO
- * is busy.
- */
- if (!tex->surface.is_linear)
- use_staging_texture = true;
- else if (usage & PIPE_TRANSFER_READ)
- use_staging_texture =
- tex->buffer.domains & RADEON_DOMAIN_VRAM ||
- tex->buffer.flags & RADEON_FLAG_GTT_WC;
- /* Write & linear only: */
- else if (si_rings_is_buffer_referenced(sctx, tex->buffer.buf,
- RADEON_USAGE_READWRITE) ||
- !sctx->ws->buffer_wait(tex->buffer.buf, 0,
- RADEON_USAGE_READWRITE)) {
- /* It's busy. */
- if (si_can_invalidate_texture(sctx->screen, tex,
- usage, box))
- si_texture_invalidate_storage(sctx, tex);
- else
- use_staging_texture = true;
- }
- }
-
- trans = CALLOC_STRUCT(si_transfer);
- if (!trans)
- return NULL;
- pipe_resource_reference(&trans->b.b.resource, texture);
- trans->b.b.level = level;
- trans->b.b.usage = usage;
- trans->b.b.box = *box;
-
- if (use_staging_texture) {
- struct pipe_resource resource;
- struct si_texture *staging;
-
- si_init_temp_resource_from_box(&resource, texture, box, level,
- SI_RESOURCE_FLAG_TRANSFER);
- resource.usage = (usage & PIPE_TRANSFER_READ) ?
- PIPE_USAGE_STAGING : PIPE_USAGE_STREAM;
-
- /* Since depth-stencil textures don't support linear tiling,
- * blit from ZS to color and vice versa. u_blitter will do
- * the packing for these formats.
- */
- if (tex->is_depth)
- resource.format = util_blitter_get_color_format_for_zs(resource.format);
-
- /* Create the temporary texture. */
- staging = (struct si_texture*)ctx->screen->resource_create(ctx->screen, &resource);
- if (!staging) {
- PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
- goto fail_trans;
- }
- trans->staging = &staging->buffer;
-
- /* Just get the strides. */
- si_texture_get_offset(sctx->screen, staging, 0, NULL,
- &trans->b.b.stride,
- &trans->b.b.layer_stride);
-
- if (usage & PIPE_TRANSFER_READ)
- si_copy_to_staging_texture(ctx, trans);
- else
- usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
-
- buf = trans->staging;
- } else {
- /* the resource is mapped directly */
- offset = si_texture_get_offset(sctx->screen, tex, level, box,
- &trans->b.b.stride,
- &trans->b.b.layer_stride);
- buf = &tex->buffer;
- }
-
- /* Always unmap texture CPU mappings on 32-bit architectures, so that
- * we don't run out of the CPU address space.
- */
- if (sizeof(void*) == 4)
- usage |= RADEON_TRANSFER_TEMPORARY;
-
- if (!(map = si_buffer_map_sync_with_rings(sctx, buf, usage)))
- goto fail_trans;
-
- *ptransfer = &trans->b.b;
- return map + offset;
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_texture *tex = (struct si_texture *)texture;
+ struct si_transfer *trans;
+ struct si_resource *buf;
+ unsigned offset = 0;
+ char *map;
+ bool use_staging_texture = false;
+
+ assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER));
+ assert(box->width && box->height && box->depth);
+
+ if (tex->is_depth) {
+ /* Depth textures use staging unconditionally. */
+ use_staging_texture = true;
+ } else {
+ /* Degrade the tile mode if we get too many transfers on APUs.
+ * On dGPUs, the staging texture is always faster.
+ * Only count uploads that are at least 4x4 pixels large.
+ */
+ if (!sctx->screen->info.has_dedicated_vram && level == 0 && box->width >= 4 &&
+ box->height >= 4 && p_atomic_inc_return(&tex->num_level0_transfers) == 10) {
+ bool can_invalidate = si_can_invalidate_texture(sctx->screen, tex, usage, box);
+
+ si_reallocate_texture_inplace(sctx, tex, PIPE_BIND_LINEAR, can_invalidate);
+ }
+
+ /* Tiled textures need to be converted into a linear texture for CPU
+ * access. The staging texture is always linear and is placed in GART.
+ *
+ * Reading from VRAM or GTT WC is slow; always use the staging
+ * texture in this case.
+ *
+ * Use the staging texture for uploads if the underlying BO
+ * is busy.
+ */
+ if (!tex->surface.is_linear)
+ use_staging_texture = true;
+ else if (usage & PIPE_TRANSFER_READ)
+ use_staging_texture =
+ tex->buffer.domains & RADEON_DOMAIN_VRAM || tex->buffer.flags & RADEON_FLAG_GTT_WC;
+ /* Write & linear only: */
+ else if (si_rings_is_buffer_referenced(sctx, tex->buffer.buf, RADEON_USAGE_READWRITE) ||
+ !sctx->ws->buffer_wait(tex->buffer.buf, 0, RADEON_USAGE_READWRITE)) {
+ /* It's busy. */
+ if (si_can_invalidate_texture(sctx->screen, tex, usage, box))
+ si_texture_invalidate_storage(sctx, tex);
+ else
+ use_staging_texture = true;
+ }
+ }
+
+ trans = CALLOC_STRUCT(si_transfer);
+ if (!trans)
+ return NULL;
+ pipe_resource_reference(&trans->b.b.resource, texture);
+ trans->b.b.level = level;
+ trans->b.b.usage = usage;
+ trans->b.b.box = *box;
+
+ if (use_staging_texture) {
+ struct pipe_resource resource;
+ struct si_texture *staging;
+
+ si_init_temp_resource_from_box(&resource, texture, box, level, SI_RESOURCE_FLAG_TRANSFER);
+ resource.usage = (usage & PIPE_TRANSFER_READ) ? PIPE_USAGE_STAGING : PIPE_USAGE_STREAM;
+
+ /* Since depth-stencil textures don't support linear tiling,
+ * blit from ZS to color and vice versa. u_blitter will do
+ * the packing for these formats.
+ */
+ if (tex->is_depth)
+ resource.format = util_blitter_get_color_format_for_zs(resource.format);
+
+ /* Create the temporary texture. */
+ staging = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource);
+ if (!staging) {
+ PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
+ goto fail_trans;
+ }
+ trans->staging = &staging->buffer;
+
+ /* Just get the strides. */
+ si_texture_get_offset(sctx->screen, staging, 0, NULL, &trans->b.b.stride,
+ &trans->b.b.layer_stride);
+
+ if (usage & PIPE_TRANSFER_READ)
+ si_copy_to_staging_texture(ctx, trans);
+ else
+ usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+
+ buf = trans->staging;
+ } else {
+ /* the resource is mapped directly */
+ offset = si_texture_get_offset(sctx->screen, tex, level, box, &trans->b.b.stride,
+ &trans->b.b.layer_stride);
+ buf = &tex->buffer;
+ }
+
+ /* Always unmap texture CPU mappings on 32-bit architectures, so that
+ * we don't run out of the CPU address space.
+ */
+ if (sizeof(void *) == 4)
+ usage |= RADEON_TRANSFER_TEMPORARY;
+
+ if (!(map = si_buffer_map_sync_with_rings(sctx, buf, usage)))
+ goto fail_trans;
+
+ *ptransfer = &trans->b.b;
+ return map + offset;
fail_trans:
- si_resource_reference(&trans->staging, NULL);
- pipe_resource_reference(&trans->b.b.resource, NULL);
- FREE(trans);
- return NULL;
+ si_resource_reference(&trans->staging, NULL);
+ pipe_resource_reference(&trans->b.b.resource, NULL);
+ FREE(trans);
+ return NULL;
}
-static void si_texture_transfer_unmap(struct pipe_context *ctx,
- struct pipe_transfer* transfer)
+static void si_texture_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer *transfer)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct si_transfer *stransfer = (struct si_transfer*)transfer;
- struct pipe_resource *texture = transfer->resource;
- struct si_texture *tex = (struct si_texture*)texture;
-
- /* Always unmap texture CPU mappings on 32-bit architectures, so that
- * we don't run out of the CPU address space.
- */
- if (sizeof(void*) == 4) {
- struct si_resource *buf =
- stransfer->staging ? stransfer->staging : &tex->buffer;
-
- sctx->ws->buffer_unmap(buf->buf);
- }
-
- if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging)
- si_copy_from_staging_texture(ctx, stransfer);
-
- if (stransfer->staging) {
- sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size;
- si_resource_reference(&stransfer->staging, NULL);
- }
-
- /* Heuristic for {upload, draw, upload, draw, ..}:
- *
- * Flush the gfx IB if we've allocated too much texture storage.
- *
- * The idea is that we don't want to build IBs that use too much
- * memory and put pressure on the kernel memory manager and we also
- * want to make temporary and invalidated buffers go idle ASAP to
- * decrease the total memory usage or make them reusable. The memory
- * usage will be slightly higher than given here because of the buffer
- * cache in the winsys.
- *
- * The result is that the kernel memory manager is never a bottleneck.
- */
- if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) {
- si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
- sctx->num_alloc_tex_transfer_bytes = 0;
- }
-
- pipe_resource_reference(&transfer->resource, NULL);
- FREE(transfer);
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_transfer *stransfer = (struct si_transfer *)transfer;
+ struct pipe_resource *texture = transfer->resource;
+ struct si_texture *tex = (struct si_texture *)texture;
+
+ /* Always unmap texture CPU mappings on 32-bit architectures, so that
+ * we don't run out of the CPU address space.
+ */
+ if (sizeof(void *) == 4) {
+ struct si_resource *buf = stransfer->staging ? stransfer->staging : &tex->buffer;
+
+ sctx->ws->buffer_unmap(buf->buf);
+ }
+
+ if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging)
+ si_copy_from_staging_texture(ctx, stransfer);
+
+ if (stransfer->staging) {
+ sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size;
+ si_resource_reference(&stransfer->staging, NULL);
+ }
+
+ /* Heuristic for {upload, draw, upload, draw, ..}:
+ *
+ * Flush the gfx IB if we've allocated too much texture storage.
+ *
+ * The idea is that we don't want to build IBs that use too much
+ * memory and put pressure on the kernel memory manager, and we also
+ * want to make temporary and invalidated buffers go idle ASAP to
+ * decrease the total memory usage or make them reusable. The memory
+ * usage will be slightly higher than given here because of the buffer
+ * cache in the winsys.
+ *
+ * The result is that the kernel memory manager is never a bottleneck.
+ */
+ if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) {
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+ sctx->num_alloc_tex_transfer_bytes = 0;
+ }
+
+ pipe_resource_reference(&transfer->resource, NULL);
+ FREE(transfer);
}
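For context, this is roughly how a gallium state tracker drives the transfer_map/transfer_unmap hooks implemented above to upload a small 2D region. It is a sketch, not driver code: pipe and tex are assumed to be a live context and an RGBA8-style 2D texture provided by the caller.

#include <stdint.h>
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_box.h"

void upload_red_square(struct pipe_context *pipe, struct pipe_resource *tex)
{
   struct pipe_transfer *transfer;
   struct pipe_box box;
   u_box_2d(0, 0, 16, 16, &box); /* x, y, width, height */

   /* Write-only mapping of level 0; the driver decides internally whether
    * this goes through a staging texture or maps the BO directly. */
   uint32_t *map = pipe->transfer_map(pipe, tex, 0, PIPE_TRANSFER_WRITE, &box, &transfer);
   if (!map)
      return;

   for (unsigned y = 0; y < 16; y++) {
      /* transfer->stride is the row pitch in bytes. */
      uint32_t *row = (uint32_t *)((char *)map + y * transfer->stride);
      for (unsigned x = 0; x < 16; x++)
         row[x] = 0xff0000ffu; /* assumes an RGBA8-style 32-bit packing */
   }

   pipe->transfer_unmap(pipe, transfer);
}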
-static const struct u_resource_vtbl si_texture_vtbl =
-{
- NULL, /* get_handle */
- si_texture_destroy, /* resource_destroy */
- si_texture_transfer_map, /* transfer_map */
- u_default_transfer_flush_region, /* transfer_flush_region */
- si_texture_transfer_unmap, /* transfer_unmap */
+static const struct u_resource_vtbl si_texture_vtbl = {
+ NULL, /* get_handle */
+ si_texture_destroy, /* resource_destroy */
+ si_texture_transfer_map, /* transfer_map */
+ u_default_transfer_flush_region, /* transfer_flush_region */
+ si_texture_transfer_unmap, /* transfer_unmap */
};
/* Return if it's allowed to reinterpret one format as another with DCC enabled.
*/
-bool vi_dcc_formats_compatible(struct si_screen *sscreen,
- enum pipe_format format1,
- enum pipe_format format2)
+bool vi_dcc_formats_compatible(struct si_screen *sscreen, enum pipe_format format1,
+ enum pipe_format format2)
{
- const struct util_format_description *desc1, *desc2;
-
- /* No format change - exit early. */
- if (format1 == format2)
- return true;
-
- format1 = si_simplify_cb_format(format1);
- format2 = si_simplify_cb_format(format2);
-
- /* Check again after format adjustments. */
- if (format1 == format2)
- return true;
-
- desc1 = util_format_description(format1);
- desc2 = util_format_description(format2);
-
- if (desc1->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
- desc2->layout != UTIL_FORMAT_LAYOUT_PLAIN)
- return false;
-
- /* Float and non-float are totally incompatible. */
- if ((desc1->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) !=
- (desc2->channel[0].type == UTIL_FORMAT_TYPE_FLOAT))
- return false;
-
- /* Channel sizes must match across DCC formats.
- * Comparing just the first 2 channels should be enough.
- */
- if (desc1->channel[0].size != desc2->channel[0].size ||
- (desc1->nr_channels >= 2 &&
- desc1->channel[1].size != desc2->channel[1].size))
- return false;
-
- /* Everything below is not needed if the driver never uses the DCC
- * clear code with the value of 1.
- */
-
- /* If the clear values are all 1 or all 0, this constraint can be
- * ignored. */
- if (vi_alpha_is_on_msb(sscreen, format1) != vi_alpha_is_on_msb(sscreen, format2))
- return false;
-
- /* Channel types must match if the clear value of 1 is used.
- * The type categories are only float, signed, unsigned.
- * NORM and INT are always compatible.
- */
- if (desc1->channel[0].type != desc2->channel[0].type ||
- (desc1->nr_channels >= 2 &&
- desc1->channel[1].type != desc2->channel[1].type))
- return false;
-
- return true;
+ const struct util_format_description *desc1, *desc2;
+
+ /* No format change - exit early. */
+ if (format1 == format2)
+ return true;
+
+ format1 = si_simplify_cb_format(format1);
+ format2 = si_simplify_cb_format(format2);
+
+ /* Check again after format adjustments. */
+ if (format1 == format2)
+ return true;
+
+ desc1 = util_format_description(format1);
+ desc2 = util_format_description(format2);
+
+ if (desc1->layout != UTIL_FORMAT_LAYOUT_PLAIN || desc2->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+ return false;
+
+ /* Float and non-float are totally incompatible. */
+ if ((desc1->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) !=
+ (desc2->channel[0].type == UTIL_FORMAT_TYPE_FLOAT))
+ return false;
+
+ /* Channel sizes must match across DCC formats.
+ * Comparing just the first 2 channels should be enough.
+ */
+ if (desc1->channel[0].size != desc2->channel[0].size ||
+ (desc1->nr_channels >= 2 && desc1->channel[1].size != desc2->channel[1].size))
+ return false;
+
+ /* Everything below is not needed if the driver never uses the DCC
+ * clear code with the value of 1.
+ */
+
+ /* If the clear values are all 1 or all 0, this constraint can be
+ * ignored. */
+ if (vi_alpha_is_on_msb(sscreen, format1) != vi_alpha_is_on_msb(sscreen, format2))
+ return false;
+
+ /* Channel types must match if the clear value of 1 is used.
+ * The type categories are only float, signed, unsigned.
+ * NORM and INT are always compatible.
+ */
+ if (desc1->channel[0].type != desc2->channel[0].type ||
+ (desc1->nr_channels >= 2 && desc1->channel[1].type != desc2->channel[1].type))
+ return false;
+
+ return true;
}
-bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex,
- unsigned level,
- enum pipe_format view_format)
+bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, unsigned level,
+ enum pipe_format view_format)
{
- struct si_texture *stex = (struct si_texture *)tex;
+ struct si_texture *stex = (struct si_texture *)tex;
- return vi_dcc_enabled(stex, level) &&
- !vi_dcc_formats_compatible((struct si_screen*)tex->screen,
- tex->format, view_format);
+ return vi_dcc_enabled(stex, level) &&
+ !vi_dcc_formats_compatible((struct si_screen *)tex->screen, tex->format, view_format);
}
/* This can't be merged with the above function, because
* vi_dcc_formats_compatible should be called only when DCC is enabled. */
-void vi_disable_dcc_if_incompatible_format(struct si_context *sctx,
- struct pipe_resource *tex,
- unsigned level,
- enum pipe_format view_format)
+void vi_disable_dcc_if_incompatible_format(struct si_context *sctx, struct pipe_resource *tex,
+ unsigned level, enum pipe_format view_format)
{
- struct si_texture *stex = (struct si_texture *)tex;
+ struct si_texture *stex = (struct si_texture *)tex;
- if (vi_dcc_formats_are_incompatible(tex, level, view_format))
- if (!si_texture_disable_dcc(sctx, stex))
- si_decompress_dcc(sctx, stex);
+ if (vi_dcc_formats_are_incompatible(tex, level, view_format))
+ if (!si_texture_disable_dcc(sctx, stex))
+ si_decompress_dcc(sctx, stex);
}
struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe,
- struct pipe_resource *texture,
- const struct pipe_surface *templ,
- unsigned width0, unsigned height0,
- unsigned width, unsigned height)
+ struct pipe_resource *texture,
+ const struct pipe_surface *templ, unsigned width0,
+ unsigned height0, unsigned width, unsigned height)
{
- struct si_surface *surface = CALLOC_STRUCT(si_surface);
-
- if (!surface)
- return NULL;
-
- assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level));
- assert(templ->u.tex.last_layer <= util_max_layer(texture, templ->u.tex.level));
-
- pipe_reference_init(&surface->base.reference, 1);
- pipe_resource_reference(&surface->base.texture, texture);
- surface->base.context = pipe;
- surface->base.format = templ->format;
- surface->base.width = width;
- surface->base.height = height;
- surface->base.u = templ->u;
-
- surface->width0 = width0;
- surface->height0 = height0;
-
- surface->dcc_incompatible =
- texture->target != PIPE_BUFFER &&
- vi_dcc_formats_are_incompatible(texture, templ->u.tex.level,
- templ->format);
- return &surface->base;
+ struct si_surface *surface = CALLOC_STRUCT(si_surface);
+
+ if (!surface)
+ return NULL;
+
+ assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level));
+ assert(templ->u.tex.last_layer <= util_max_layer(texture, templ->u.tex.level));
+
+ pipe_reference_init(&surface->base.reference, 1);
+ pipe_resource_reference(&surface->base.texture, texture);
+ surface->base.context = pipe;
+ surface->base.format = templ->format;
+ surface->base.width = width;
+ surface->base.height = height;
+ surface->base.u = templ->u;
+
+ surface->width0 = width0;
+ surface->height0 = height0;
+
+ surface->dcc_incompatible =
+ texture->target != PIPE_BUFFER &&
+ vi_dcc_formats_are_incompatible(texture, templ->u.tex.level, templ->format);
+ return &surface->base;
}
-static struct pipe_surface *si_create_surface(struct pipe_context *pipe,
- struct pipe_resource *tex,
- const struct pipe_surface *templ)
+static struct pipe_surface *si_create_surface(struct pipe_context *pipe, struct pipe_resource *tex,
+ const struct pipe_surface *templ)
{
- unsigned level = templ->u.tex.level;
- unsigned width = u_minify(tex->width0, level);
- unsigned height = u_minify(tex->height0, level);
- unsigned width0 = tex->width0;
- unsigned height0 = tex->height0;
-
- if (tex->target != PIPE_BUFFER && templ->format != tex->format) {
- const struct util_format_description *tex_desc
- = util_format_description(tex->format);
- const struct util_format_description *templ_desc
- = util_format_description(templ->format);
-
- assert(tex_desc->block.bits == templ_desc->block.bits);
-
- /* Adjust size of surface if and only if the block width or
- * height is changed. */
- if (tex_desc->block.width != templ_desc->block.width ||
- tex_desc->block.height != templ_desc->block.height) {
- unsigned nblks_x = util_format_get_nblocksx(tex->format, width);
- unsigned nblks_y = util_format_get_nblocksy(tex->format, height);
-
- width = nblks_x * templ_desc->block.width;
- height = nblks_y * templ_desc->block.height;
-
- width0 = util_format_get_nblocksx(tex->format, width0);
- height0 = util_format_get_nblocksy(tex->format, height0);
- }
- }
-
- return si_create_surface_custom(pipe, tex, templ,
- width0, height0,
- width, height);
+ unsigned level = templ->u.tex.level;
+ unsigned width = u_minify(tex->width0, level);
+ unsigned height = u_minify(tex->height0, level);
+ unsigned width0 = tex->width0;
+ unsigned height0 = tex->height0;
+
+ if (tex->target != PIPE_BUFFER && templ->format != tex->format) {
+ const struct util_format_description *tex_desc = util_format_description(tex->format);
+ const struct util_format_description *templ_desc = util_format_description(templ->format);
+
+ assert(tex_desc->block.bits == templ_desc->block.bits);
+
+ /* Adjust size of surface if and only if the block width or
+ * height is changed. */
+ if (tex_desc->block.width != templ_desc->block.width ||
+ tex_desc->block.height != templ_desc->block.height) {
+ unsigned nblks_x = util_format_get_nblocksx(tex->format, width);
+ unsigned nblks_y = util_format_get_nblocksy(tex->format, height);
+
+ width = nblks_x * templ_desc->block.width;
+ height = nblks_y * templ_desc->block.height;
+
+ width0 = util_format_get_nblocksx(tex->format, width0);
+ height0 = util_format_get_nblocksy(tex->format, height0);
+ }
+ }
+
+ return si_create_surface_custom(pipe, tex, templ, width0, height0, width, height);
}
-static void si_surface_destroy(struct pipe_context *pipe,
- struct pipe_surface *surface)
+static void si_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surface)
{
- pipe_resource_reference(&surface->texture, NULL);
- FREE(surface);
+ pipe_resource_reference(&surface->texture, NULL);
+ FREE(surface);
}
unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap)
{
- const struct util_format_description *desc = util_format_description(format);
-
-#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz)
-
- if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
- return V_028C70_SWAP_STD;
-
- if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
- return ~0U;
-
- switch (desc->nr_channels) {
- case 1:
- if (HAS_SWIZZLE(0,X))
- return V_028C70_SWAP_STD; /* X___ */
- else if (HAS_SWIZZLE(3,X))
- return V_028C70_SWAP_ALT_REV; /* ___X */
- break;
- case 2:
- if ((HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,Y)) ||
- (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,NONE)) ||
- (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,Y)))
- return V_028C70_SWAP_STD; /* XY__ */
- else if ((HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,X)) ||
- (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,NONE)) ||
- (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,X)))
- /* YX__ */
- return (do_endian_swap ? V_028C70_SWAP_STD : V_028C70_SWAP_STD_REV);
- else if (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(3,Y))
- return V_028C70_SWAP_ALT; /* X__Y */
- else if (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(3,X))
- return V_028C70_SWAP_ALT_REV; /* Y__X */
- break;
- case 3:
- if (HAS_SWIZZLE(0,X))
- return (do_endian_swap ? V_028C70_SWAP_STD_REV : V_028C70_SWAP_STD);
- else if (HAS_SWIZZLE(0,Z))
- return V_028C70_SWAP_STD_REV; /* ZYX */
- break;
- case 4:
- /* check the middle channels, the 1st and 4th channel can be NONE */
- if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) {
- return V_028C70_SWAP_STD; /* XYZW */
- } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) {
- return V_028C70_SWAP_STD_REV; /* WZYX */
- } else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) {
- return V_028C70_SWAP_ALT; /* ZYXW */
- } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,W)) {
- /* YZWX */
- if (desc->is_array)
- return V_028C70_SWAP_ALT_REV;
- else
- return (do_endian_swap ? V_028C70_SWAP_ALT : V_028C70_SWAP_ALT_REV);
- }
- break;
- }
- return ~0U;
+ const struct util_format_description *desc = util_format_description(format);
+
+#define HAS_SWIZZLE(chan, swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz)
+
+ if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
+ return V_028C70_SWAP_STD;
+
+ if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+ return ~0U;
+
+ switch (desc->nr_channels) {
+ case 1:
+ if (HAS_SWIZZLE(0, X))
+ return V_028C70_SWAP_STD; /* X___ */
+ else if (HAS_SWIZZLE(3, X))
+ return V_028C70_SWAP_ALT_REV; /* ___X */
+ break;
+ case 2:
+ if ((HAS_SWIZZLE(0, X) && HAS_SWIZZLE(1, Y)) || (HAS_SWIZZLE(0, X) && HAS_SWIZZLE(1, NONE)) ||
+ (HAS_SWIZZLE(0, NONE) && HAS_SWIZZLE(1, Y)))
+ return V_028C70_SWAP_STD; /* XY__ */
+ else if ((HAS_SWIZZLE(0, Y) && HAS_SWIZZLE(1, X)) ||
+ (HAS_SWIZZLE(0, Y) && HAS_SWIZZLE(1, NONE)) ||
+ (HAS_SWIZZLE(0, NONE) && HAS_SWIZZLE(1, X)))
+ /* YX__ */
+ return (do_endian_swap ? V_028C70_SWAP_STD : V_028C70_SWAP_STD_REV);
+ else if (HAS_SWIZZLE(0, X) && HAS_SWIZZLE(3, Y))
+ return V_028C70_SWAP_ALT; /* X__Y */
+ else if (HAS_SWIZZLE(0, Y) && HAS_SWIZZLE(3, X))
+ return V_028C70_SWAP_ALT_REV; /* Y__X */
+ break;
+ case 3:
+ if (HAS_SWIZZLE(0, X))
+ return (do_endian_swap ? V_028C70_SWAP_STD_REV : V_028C70_SWAP_STD);
+ else if (HAS_SWIZZLE(0, Z))
+ return V_028C70_SWAP_STD_REV; /* ZYX */
+ break;
+ case 4:
+ /* check the middle channels, the 1st and 4th channel can be NONE */
+ if (HAS_SWIZZLE(1, Y) && HAS_SWIZZLE(2, Z)) {
+ return V_028C70_SWAP_STD; /* XYZW */
+ } else if (HAS_SWIZZLE(1, Z) && HAS_SWIZZLE(2, Y)) {
+ return V_028C70_SWAP_STD_REV; /* WZYX */
+ } else if (HAS_SWIZZLE(1, Y) && HAS_SWIZZLE(2, X)) {
+ return V_028C70_SWAP_ALT; /* ZYXW */
+ } else if (HAS_SWIZZLE(1, Z) && HAS_SWIZZLE(2, W)) {
+ /* YZWX */
+ if (desc->is_array)
+ return V_028C70_SWAP_ALT_REV;
+ else
+ return (do_endian_swap ? V_028C70_SWAP_ALT : V_028C70_SWAP_ALT_REV);
+ }
+ break;
+ }
+ return ~0U;
}
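For 4-channel formats, only the two middle swizzle components decide which hardware swap is used, which is why RGBA8-style and BGRA8-style layouts end up as SWAP_STD and SWAP_ALT respectively. A small self-contained illustration of that classification; the enum and swizzle encoding below are local stand-ins for the V_028C70_SWAP_* values and the util_format swizzle tables.

#include <stdio.h>

enum swap { SWAP_STD, SWAP_STD_REV, SWAP_ALT, SWAP_ALT_REV, SWAP_UNKNOWN };

/* swz[i] = which stored channel feeds output component i (0=X .. 3=W). */
static enum swap classify4(const unsigned char swz[4])
{
   /* Mirrors the "case 4" branch above: the 1st and 4th components may be
    * NONE, so only the middle two are checked. */
   if (swz[1] == 1 && swz[2] == 2)
      return SWAP_STD;     /* XYZW, e.g. an RGBA8-style layout */
   if (swz[1] == 2 && swz[2] == 1)
      return SWAP_STD_REV; /* WZYX */
   if (swz[1] == 1 && swz[2] == 0)
      return SWAP_ALT;     /* ZYXW, e.g. a BGRA8-style layout */
   if (swz[1] == 2 && swz[2] == 3)
      return SWAP_ALT_REV; /* YZWX */
   return SWAP_UNKNOWN;
}

int main(void)
{
   const unsigned char rgba8[4] = {0, 1, 2, 3}; /* X, Y, Z, W */
   const unsigned char bgra8[4] = {2, 1, 0, 3}; /* Z, Y, X, W */

   printf("RGBA8 -> %d (SWAP_STD), BGRA8 -> %d (SWAP_ALT)\n",
          classify4(rgba8), classify4(bgra8));
   return 0;
}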
/* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */
-static void vi_dcc_clean_up_context_slot(struct si_context *sctx,
- int slot)
+static void vi_dcc_clean_up_context_slot(struct si_context *sctx, int slot)
{
- int i;
+ int i;
- if (sctx->dcc_stats[slot].query_active)
- vi_separate_dcc_stop_query(sctx,
- sctx->dcc_stats[slot].tex);
+ if (sctx->dcc_stats[slot].query_active)
+ vi_separate_dcc_stop_query(sctx, sctx->dcc_stats[slot].tex);
- for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats[slot].ps_stats); i++)
- if (sctx->dcc_stats[slot].ps_stats[i]) {
- sctx->b.destroy_query(&sctx->b,
- sctx->dcc_stats[slot].ps_stats[i]);
- sctx->dcc_stats[slot].ps_stats[i] = NULL;
- }
+ for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats[slot].ps_stats); i++)
+ if (sctx->dcc_stats[slot].ps_stats[i]) {
+ sctx->b.destroy_query(&sctx->b, sctx->dcc_stats[slot].ps_stats[i]);
+ sctx->dcc_stats[slot].ps_stats[i] = NULL;
+ }
- si_texture_reference(&sctx->dcc_stats[slot].tex, NULL);
+ si_texture_reference(&sctx->dcc_stats[slot].tex, NULL);
}
/**
* Return the per-context slot where DCC statistics queries for the texture live.
*/
-static unsigned vi_get_context_dcc_stats_index(struct si_context *sctx,
- struct si_texture *tex)
+static unsigned vi_get_context_dcc_stats_index(struct si_context *sctx, struct si_texture *tex)
{
- int i, empty_slot = -1;
-
- /* Remove zombie textures (textures kept alive by this array only). */
- for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++)
- if (sctx->dcc_stats[i].tex &&
- sctx->dcc_stats[i].tex->buffer.b.b.reference.count == 1)
- vi_dcc_clean_up_context_slot(sctx, i);
-
- /* Find the texture. */
- for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
- /* Return if found. */
- if (sctx->dcc_stats[i].tex == tex) {
- sctx->dcc_stats[i].last_use_timestamp = os_time_get();
- return i;
- }
-
- /* Record the first seen empty slot. */
- if (empty_slot == -1 && !sctx->dcc_stats[i].tex)
- empty_slot = i;
- }
-
- /* Not found. Remove the oldest member to make space in the array. */
- if (empty_slot == -1) {
- int oldest_slot = 0;
-
- /* Find the oldest slot. */
- for (i = 1; i < ARRAY_SIZE(sctx->dcc_stats); i++)
- if (sctx->dcc_stats[oldest_slot].last_use_timestamp >
- sctx->dcc_stats[i].last_use_timestamp)
- oldest_slot = i;
-
- /* Clean up the oldest slot. */
- vi_dcc_clean_up_context_slot(sctx, oldest_slot);
- empty_slot = oldest_slot;
- }
-
- /* Add the texture to the new slot. */
- si_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex);
- sctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get();
- return empty_slot;
+ int i, empty_slot = -1;
+
+ /* Remove zombie textures (textures kept alive by this array only). */
+ for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++)
+ if (sctx->dcc_stats[i].tex && sctx->dcc_stats[i].tex->buffer.b.b.reference.count == 1)
+ vi_dcc_clean_up_context_slot(sctx, i);
+
+ /* Find the texture. */
+ for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
+ /* Return if found. */
+ if (sctx->dcc_stats[i].tex == tex) {
+ sctx->dcc_stats[i].last_use_timestamp = os_time_get();
+ return i;
+ }
+
+ /* Record the first seen empty slot. */
+ if (empty_slot == -1 && !sctx->dcc_stats[i].tex)
+ empty_slot = i;
+ }
+
+ /* Not found. Remove the oldest member to make space in the array. */
+ if (empty_slot == -1) {
+ int oldest_slot = 0;
+
+ /* Find the oldest slot. */
+ for (i = 1; i < ARRAY_SIZE(sctx->dcc_stats); i++)
+ if (sctx->dcc_stats[oldest_slot].last_use_timestamp >
+ sctx->dcc_stats[i].last_use_timestamp)
+ oldest_slot = i;
+
+ /* Clean up the oldest slot. */
+ vi_dcc_clean_up_context_slot(sctx, oldest_slot);
+ empty_slot = oldest_slot;
+ }
+
+ /* Add the texture to the new slot. */
+ si_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex);
+ sctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get();
+ return empty_slot;
}
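vi_get_context_dcc_stats_index behaves like a small LRU cache over a fixed slot array: reuse a slot if the texture is already tracked, otherwise take the first free slot, otherwise evict the slot with the oldest timestamp. A generic sketch of that slot-selection policy; struct slot is a bare stand-in for the driver's dcc_stats entries, and the zombie-texture cleanup that precedes the real search is omitted.

#include <stdint.h>

#define NUM_SLOTS 8

struct slot {
   const void *key;            /* NULL means the slot is free */
   int64_t last_use_timestamp;
};

unsigned pick_slot(struct slot *slots, const void *key, int64_t now)
{
   int empty_slot = -1;

   for (int i = 0; i < NUM_SLOTS; i++) {
      if (slots[i].key == key) {
         /* Already tracked: refresh the timestamp and reuse the slot. */
         slots[i].last_use_timestamp = now;
         return i;
      }
      if (empty_slot == -1 && !slots[i].key)
         empty_slot = i;        /* remember the first free slot */
   }

   if (empty_slot == -1) {
      /* No free slot: evict the least recently used entry. */
      int oldest = 0;
      for (int i = 1; i < NUM_SLOTS; i++)
         if (slots[i].last_use_timestamp < slots[oldest].last_use_timestamp)
            oldest = i;
      empty_slot = oldest;
   }

   slots[empty_slot].key = key;
   slots[empty_slot].last_use_timestamp = now;
   return empty_slot;
}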
-static struct pipe_query *
-vi_create_resuming_pipestats_query(struct si_context *sctx)
+static struct pipe_query *vi_create_resuming_pipestats_query(struct si_context *sctx)
{
- struct si_query_hw *query = (struct si_query_hw*)
- sctx->b.create_query(&sctx->b, PIPE_QUERY_PIPELINE_STATISTICS, 0);
+ struct si_query_hw *query =
+ (struct si_query_hw *)sctx->b.create_query(&sctx->b, PIPE_QUERY_PIPELINE_STATISTICS, 0);
- query->flags |= SI_QUERY_HW_FLAG_BEGIN_RESUMES;
- return (struct pipe_query*)query;
+ query->flags |= SI_QUERY_HW_FLAG_BEGIN_RESUMES;
+ return (struct pipe_query *)query;
}
/**
* Called when binding a color buffer.
*/
-void vi_separate_dcc_start_query(struct si_context *sctx,
- struct si_texture *tex)
+void vi_separate_dcc_start_query(struct si_context *sctx, struct si_texture *tex)
{
- unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
+ unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
- assert(!sctx->dcc_stats[i].query_active);
+ assert(!sctx->dcc_stats[i].query_active);
- if (!sctx->dcc_stats[i].ps_stats[0])
- sctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(sctx);
+ if (!sctx->dcc_stats[i].ps_stats[0])
+ sctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(sctx);
- /* begin or resume the query */
- sctx->b.begin_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]);
- sctx->dcc_stats[i].query_active = true;
+ /* begin or resume the query */
+ sctx->b.begin_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]);
+ sctx->dcc_stats[i].query_active = true;
}
/**
* Called when unbinding a color buffer.
*/
-void vi_separate_dcc_stop_query(struct si_context *sctx,
- struct si_texture *tex)
+void vi_separate_dcc_stop_query(struct si_context *sctx, struct si_texture *tex)
{
- unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
+ unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
- assert(sctx->dcc_stats[i].query_active);
- assert(sctx->dcc_stats[i].ps_stats[0]);
+ assert(sctx->dcc_stats[i].query_active);
+ assert(sctx->dcc_stats[i].ps_stats[0]);
- /* pause or end the query */
- sctx->b.end_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]);
- sctx->dcc_stats[i].query_active = false;
+ /* pause or end the query */
+ sctx->b.end_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]);
+ sctx->dcc_stats[i].query_active = false;
}
static bool vi_should_enable_separate_dcc(struct si_texture *tex)
{
- /* The minimum number of fullscreen draws per frame that is required
- * to enable DCC. */
- return tex->ps_draw_ratio + tex->num_slow_clears >= 5;
+ /* The minimum number of fullscreen draws per frame that is required
+ * to enable DCC. */
+ return tex->ps_draw_ratio + tex->num_slow_clears >= 5;
}
/* Called by fast clear. */
-void vi_separate_dcc_try_enable(struct si_context *sctx,
- struct si_texture *tex)
+void vi_separate_dcc_try_enable(struct si_context *sctx, struct si_texture *tex)
{
- /* The intent is to use this with shared displayable back buffers,
- * but it's not strictly limited only to them.
- */
- if (!tex->buffer.b.is_shared ||
- !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
- tex->buffer.b.b.target != PIPE_TEXTURE_2D ||
- tex->buffer.b.b.last_level > 0 ||
- !tex->surface.dcc_size ||
- sctx->screen->debug_flags & DBG(NO_DCC) ||
- sctx->screen->debug_flags & DBG(NO_DCC_FB))
- return;
-
- assert(sctx->chip_class >= GFX8);
-
- if (tex->surface.dcc_offset)
- return; /* already enabled */
-
- /* Enable the DCC stat gathering. */
- if (!tex->dcc_gather_statistics) {
- tex->dcc_gather_statistics = true;
- vi_separate_dcc_start_query(sctx, tex);
- }
-
- if (!vi_should_enable_separate_dcc(tex))
- return; /* stats show that DCC decompression is too expensive */
-
- assert(tex->surface.num_dcc_levels);
- assert(!tex->dcc_separate_buffer);
-
- si_texture_discard_cmask(sctx->screen, tex);
-
- /* Get a DCC buffer. */
- if (tex->last_dcc_separate_buffer) {
- assert(tex->dcc_gather_statistics);
- assert(!tex->dcc_separate_buffer);
- tex->dcc_separate_buffer = tex->last_dcc_separate_buffer;
- tex->last_dcc_separate_buffer = NULL;
- } else {
- tex->dcc_separate_buffer =
- si_aligned_buffer_create(sctx->b.screen,
- SI_RESOURCE_FLAG_UNMAPPABLE,
- PIPE_USAGE_DEFAULT,
- tex->surface.dcc_size,
- tex->surface.dcc_alignment);
- if (!tex->dcc_separate_buffer)
- return;
- }
-
- /* dcc_offset is the absolute GPUVM address. */
- tex->surface.dcc_offset = tex->dcc_separate_buffer->gpu_address;
-
- /* no need to flag anything since this is called by fast clear that
- * flags framebuffer state
- */
+ /* The intent is to use this with shared displayable back buffers,
+ * but it's not strictly limited only to them.
+ */
+ if (!tex->buffer.b.is_shared ||
+ !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
+ tex->buffer.b.b.target != PIPE_TEXTURE_2D || tex->buffer.b.b.last_level > 0 ||
+ !tex->surface.dcc_size || sctx->screen->debug_flags & DBG(NO_DCC) ||
+ sctx->screen->debug_flags & DBG(NO_DCC_FB))
+ return;
+
+ assert(sctx->chip_class >= GFX8);
+
+ if (tex->surface.dcc_offset)
+ return; /* already enabled */
+
+ /* Enable the DCC stat gathering. */
+ if (!tex->dcc_gather_statistics) {
+ tex->dcc_gather_statistics = true;
+ vi_separate_dcc_start_query(sctx, tex);
+ }
+
+ if (!vi_should_enable_separate_dcc(tex))
+ return; /* stats show that DCC decompression is too expensive */
+
+ assert(tex->surface.num_dcc_levels);
+ assert(!tex->dcc_separate_buffer);
+
+ si_texture_discard_cmask(sctx->screen, tex);
+
+ /* Get a DCC buffer. */
+ if (tex->last_dcc_separate_buffer) {
+ assert(tex->dcc_gather_statistics);
+ assert(!tex->dcc_separate_buffer);
+ tex->dcc_separate_buffer = tex->last_dcc_separate_buffer;
+ tex->last_dcc_separate_buffer = NULL;
+ } else {
+ tex->dcc_separate_buffer =
+ si_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
+ tex->surface.dcc_size, tex->surface.dcc_alignment);
+ if (!tex->dcc_separate_buffer)
+ return;
+ }
+
+ /* dcc_offset is the absolute GPUVM address. */
+ tex->surface.dcc_offset = tex->dcc_separate_buffer->gpu_address;
+
+ /* no need to flag anything since this is called by fast clear that
+ * flags framebuffer state
+ */
}
/**
* Called by pipe_context::flush_resource, the place where DCC decompression
* takes place.
*/
-void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
- struct si_texture *tex)
+void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, struct si_texture *tex)
{
- struct si_context *sctx = (struct si_context*)ctx;
- struct pipe_query *tmp;
- unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
- bool query_active = sctx->dcc_stats[i].query_active;
- bool disable = false;
-
- if (sctx->dcc_stats[i].ps_stats[2]) {
- union pipe_query_result result;
-
- /* Read the results. */
- struct pipe_query *query = sctx->dcc_stats[i].ps_stats[2];
- ctx->get_query_result(ctx, query,
- true, &result);
- si_query_buffer_reset(sctx, &((struct si_query_hw*)query)->buffer);
-
- /* Compute the approximate number of fullscreen draws. */
- tex->ps_draw_ratio =
- result.pipeline_statistics.ps_invocations /
- (tex->buffer.b.b.width0 * tex->buffer.b.b.height0);
- sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio;
-
- disable = tex->dcc_separate_buffer &&
- !vi_should_enable_separate_dcc(tex);
- }
-
- tex->num_slow_clears = 0;
-
- /* stop the statistics query for ps_stats[0] */
- if (query_active)
- vi_separate_dcc_stop_query(sctx, tex);
-
- /* Move the queries in the queue by one. */
- tmp = sctx->dcc_stats[i].ps_stats[2];
- sctx->dcc_stats[i].ps_stats[2] = sctx->dcc_stats[i].ps_stats[1];
- sctx->dcc_stats[i].ps_stats[1] = sctx->dcc_stats[i].ps_stats[0];
- sctx->dcc_stats[i].ps_stats[0] = tmp;
-
- /* create and start a new query as ps_stats[0] */
- if (query_active)
- vi_separate_dcc_start_query(sctx, tex);
-
- if (disable) {
- assert(!tex->last_dcc_separate_buffer);
- tex->last_dcc_separate_buffer = tex->dcc_separate_buffer;
- tex->dcc_separate_buffer = NULL;
- tex->surface.dcc_offset = 0;
- /* no need to flag anything since this is called after
- * decompression that re-sets framebuffer state
- */
- }
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct pipe_query *tmp;
+ unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
+ bool query_active = sctx->dcc_stats[i].query_active;
+ bool disable = false;
+
+ if (sctx->dcc_stats[i].ps_stats[2]) {
+ union pipe_query_result result;
+
+ /* Read the results. */
+ struct pipe_query *query = sctx->dcc_stats[i].ps_stats[2];
+ ctx->get_query_result(ctx, query, true, &result);
+ si_query_buffer_reset(sctx, &((struct si_query_hw *)query)->buffer);
+
+ /* Compute the approximate number of fullscreen draws. */
+ tex->ps_draw_ratio = result.pipeline_statistics.ps_invocations /
+ (tex->buffer.b.b.width0 * tex->buffer.b.b.height0);
+ sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio;
+
+ disable = tex->dcc_separate_buffer && !vi_should_enable_separate_dcc(tex);
+ }
+
+ tex->num_slow_clears = 0;
+
+ /* stop the statistics query for ps_stats[0] */
+ if (query_active)
+ vi_separate_dcc_stop_query(sctx, tex);
+
+ /* Move the queries in the queue by one. */
+ tmp = sctx->dcc_stats[i].ps_stats[2];
+ sctx->dcc_stats[i].ps_stats[2] = sctx->dcc_stats[i].ps_stats[1];
+ sctx->dcc_stats[i].ps_stats[1] = sctx->dcc_stats[i].ps_stats[0];
+ sctx->dcc_stats[i].ps_stats[0] = tmp;
+
+ /* create and start a new query as ps_stats[0] */
+ if (query_active)
+ vi_separate_dcc_start_query(sctx, tex);
+
+ if (disable) {
+ assert(!tex->last_dcc_separate_buffer);
+ tex->last_dcc_separate_buffer = tex->dcc_separate_buffer;
+ tex->dcc_separate_buffer = NULL;
+ tex->surface.dcc_offset = 0;
+ /* no need to flag anything since this is called after
+ * decompression that re-sets framebuffer state
+ */
+ }
}
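The enable/disable decision above reduces to simple arithmetic: take ps_invocations from the oldest query in the 3-deep ring, turn it into an approximate number of fullscreen draws per frame, and compare it to the same threshold vi_should_enable_separate_dcc uses. A condensed sketch of that check, with the query result passed in as a plain number rather than read from a real pipeline-statistics query:

#include <stdbool.h>
#include <stdint.h>

/* ps_invocations would come from a PIPE_QUERY_PIPELINE_STATISTICS result in
 * the driver; here it is just a parameter. */
bool separate_dcc_worth_keeping(uint64_t ps_invocations, unsigned width,
                                unsigned height, unsigned num_slow_clears)
{
   unsigned ps_draw_ratio = ps_invocations / ((uint64_t)width * height);

   /* Same threshold as vi_should_enable_separate_dcc(): roughly five
    * fullscreen draws (or slow clears) per frame. */
   return ps_draw_ratio + num_slow_clears >= 5;
}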
static struct pipe_memory_object *
-si_memobj_from_handle(struct pipe_screen *screen,
- struct winsys_handle *whandle,
- bool dedicated)
+si_memobj_from_handle(struct pipe_screen *screen, struct winsys_handle *whandle, bool dedicated)
{
- struct si_screen *sscreen = (struct si_screen*)screen;
- struct si_memory_object *memobj = CALLOC_STRUCT(si_memory_object);
- struct pb_buffer *buf = NULL;
-
- if (!memobj)
- return NULL;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct si_memory_object *memobj = CALLOC_STRUCT(si_memory_object);
+ struct pb_buffer *buf = NULL;
- buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle,
- sscreen->info.max_alignment);
- if (!buf) {
- free(memobj);
- return NULL;
- }
+ if (!memobj)
+ return NULL;
- memobj->b.dedicated = dedicated;
- memobj->buf = buf;
- memobj->stride = whandle->stride;
+ buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, sscreen->info.max_alignment);
+ if (!buf) {
+ free(memobj);
+ return NULL;
+ }
- return (struct pipe_memory_object *)memobj;
+ memobj->b.dedicated = dedicated;
+ memobj->buf = buf;
+ memobj->stride = whandle->stride;
+ return (struct pipe_memory_object *)memobj;
}
-static void
-si_memobj_destroy(struct pipe_screen *screen,
- struct pipe_memory_object *_memobj)
+static void si_memobj_destroy(struct pipe_screen *screen, struct pipe_memory_object *_memobj)
{
- struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
+ struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
- pb_reference(&memobj->buf, NULL);
- free(memobj);
+ pb_reference(&memobj->buf, NULL);
+ free(memobj);
}
-static struct pipe_resource *
-si_texture_from_memobj(struct pipe_screen *screen,
- const struct pipe_resource *templ,
- struct pipe_memory_object *_memobj,
- uint64_t offset)
+static struct pipe_resource *si_texture_from_memobj(struct pipe_screen *screen,
+ const struct pipe_resource *templ,
+ struct pipe_memory_object *_memobj,
+ uint64_t offset)
{
- struct si_screen *sscreen = (struct si_screen*)screen;
- struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
- struct pipe_resource *tex =
- si_texture_from_winsys_buffer(sscreen, templ, memobj->buf,
- memobj->stride, offset,
- PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE |
- PIPE_HANDLE_USAGE_SHADER_WRITE,
- memobj->b.dedicated);
- if (!tex)
- return NULL;
-
- /* si_texture_from_winsys_buffer doesn't increment refcount of
- * memobj->buf, so increment it here.
- */
- struct pb_buffer *buf = NULL;
- pb_reference(&buf, memobj->buf);
- return tex;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
+ struct pipe_resource *tex = si_texture_from_winsys_buffer(
+ sscreen, templ, memobj->buf, memobj->stride, offset,
+ PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE | PIPE_HANDLE_USAGE_SHADER_WRITE, memobj->b.dedicated);
+ if (!tex)
+ return NULL;
+
+ /* si_texture_from_winsys_buffer doesn't increment refcount of
+ * memobj->buf, so increment it here.
+ */
+ struct pb_buffer *buf = NULL;
+ pb_reference(&buf, memobj->buf);
+ return tex;
}
-static bool si_check_resource_capability(struct pipe_screen *screen,
- struct pipe_resource *resource,
- unsigned bind)
+static bool si_check_resource_capability(struct pipe_screen *screen, struct pipe_resource *resource,
+ unsigned bind)
{
- struct si_texture *tex = (struct si_texture*)resource;
+ struct si_texture *tex = (struct si_texture *)resource;
- /* Buffers only support the linear flag. */
- if (resource->target == PIPE_BUFFER)
- return (bind & ~PIPE_BIND_LINEAR) == 0;
+ /* Buffers only support the linear flag. */
+ if (resource->target == PIPE_BUFFER)
+ return (bind & ~PIPE_BIND_LINEAR) == 0;
- if (bind & PIPE_BIND_LINEAR && !tex->surface.is_linear)
- return false;
+ if (bind & PIPE_BIND_LINEAR && !tex->surface.is_linear)
+ return false;
- if (bind & PIPE_BIND_SCANOUT && !tex->surface.is_displayable)
- return false;
+ if (bind & PIPE_BIND_SCANOUT && !tex->surface.is_displayable)
+ return false;
- /* TODO: PIPE_BIND_CURSOR - do we care? */
- return true;
+ /* TODO: PIPE_BIND_CURSOR - do we care? */
+ return true;
}
void si_init_screen_texture_functions(struct si_screen *sscreen)
{
- sscreen->b.resource_from_handle = si_texture_from_handle;
- sscreen->b.resource_get_handle = si_texture_get_handle;
- sscreen->b.resource_get_param = si_resource_get_param;
- sscreen->b.resource_get_info = si_texture_get_info;
- sscreen->b.resource_from_memobj = si_texture_from_memobj;
- sscreen->b.memobj_create_from_handle = si_memobj_from_handle;
- sscreen->b.memobj_destroy = si_memobj_destroy;
- sscreen->b.check_resource_capability = si_check_resource_capability;
+ sscreen->b.resource_from_handle = si_texture_from_handle;
+ sscreen->b.resource_get_handle = si_texture_get_handle;
+ sscreen->b.resource_get_param = si_resource_get_param;
+ sscreen->b.resource_get_info = si_texture_get_info;
+ sscreen->b.resource_from_memobj = si_texture_from_memobj;
+ sscreen->b.memobj_create_from_handle = si_memobj_from_handle;
+ sscreen->b.memobj_destroy = si_memobj_destroy;
+ sscreen->b.check_resource_capability = si_check_resource_capability;
}
void si_init_context_texture_functions(struct si_context *sctx)
{
- sctx->b.create_surface = si_create_surface;
- sctx->b.surface_destroy = si_surface_destroy;
+ sctx->b.create_surface = si_create_surface;
+ sctx->b.surface_destroy = si_surface_destroy;
}
*
**************************************************************************/
-#include "si_pipe.h"
-#include "radeon/radeon_video.h"
#include "radeon/radeon_uvd.h"
+#include "radeon/radeon_uvd_enc.h"
#include "radeon/radeon_vce.h"
#include "radeon/radeon_vcn_dec.h"
#include "radeon/radeon_vcn_enc.h"
-#include "radeon/radeon_uvd_enc.h"
+#include "radeon/radeon_video.h"
+#include "si_pipe.h"
#include "util/u_video.h"
/**
 * creates a video buffer with a UVD-compatible memory layout
*/
struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
- const struct pipe_video_buffer *tmpl)
+ const struct pipe_video_buffer *tmpl)
{
- struct pipe_video_buffer vidbuf = *tmpl;
- /* TODO: get tiling working */
- vidbuf.bind |= PIPE_BIND_LINEAR;
+ struct pipe_video_buffer vidbuf = *tmpl;
+ /* TODO: get tiling working */
+ vidbuf.bind |= PIPE_BIND_LINEAR;
- return vl_video_buffer_create_as_resource(pipe, &vidbuf);
+ return vl_video_buffer_create_as_resource(pipe, &vidbuf);
}
/* set the decoding target buffer offsets */
-static struct pb_buffer* si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
+static struct pb_buffer *si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
{
- struct si_screen *sscreen = (struct si_screen*)buf->base.context->screen;
- struct si_texture *luma = (struct si_texture *)buf->resources[0];
- struct si_texture *chroma = (struct si_texture *)buf->resources[1];
- enum ruvd_surface_type type = (sscreen->info.chip_class >= GFX9) ?
- RUVD_SURFACE_TYPE_GFX9 :
- RUVD_SURFACE_TYPE_LEGACY;
+ struct si_screen *sscreen = (struct si_screen *)buf->base.context->screen;
+ struct si_texture *luma = (struct si_texture *)buf->resources[0];
+ struct si_texture *chroma = (struct si_texture *)buf->resources[1];
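+   /* GFX9 and newer describe the decode target with the gfx9 surface layout; older chips use the legacy layout. */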
+ enum ruvd_surface_type type =
+ (sscreen->info.chip_class >= GFX9) ? RUVD_SURFACE_TYPE_GFX9 : RUVD_SURFACE_TYPE_LEGACY;
- msg->body.decode.dt_field_mode = buf->base.interlaced;
+ msg->body.decode.dt_field_mode = buf->base.interlaced;
- si_uvd_set_dt_surfaces(msg, &luma->surface, (chroma) ? &chroma->surface : NULL, type);
+ si_uvd_set_dt_surfaces(msg, &luma->surface, (chroma) ? &chroma->surface : NULL, type);
- return luma->buffer.buf;
+ return luma->buffer.buf;
}
/* get the radeon resources for VCE */
-static void si_vce_get_buffer(struct pipe_resource *resource,
- struct pb_buffer **handle,
- struct radeon_surf **surface)
+static void si_vce_get_buffer(struct pipe_resource *resource, struct pb_buffer **handle,
+ struct radeon_surf **surface)
{
- struct si_texture *res = (struct si_texture *)resource;
+ struct si_texture *res = (struct si_texture *)resource;
- if (handle)
- *handle = res->buffer.buf;
+ if (handle)
+ *handle = res->buffer.buf;
- if (surface)
- *surface = &res->surface;
+ if (surface)
+ *surface = &res->surface;
}
/**
 * creates a UVD-compatible decoder
*/
struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
- const struct pipe_video_codec *templ)
+ const struct pipe_video_codec *templ)
{
- struct si_context *ctx = (struct si_context *)context;
- bool vcn = ctx->family >= CHIP_RAVEN;
+ struct si_context *ctx = (struct si_context *)context;
+ bool vcn = ctx->family >= CHIP_RAVEN;
- if (templ->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
- if (vcn) {
- return radeon_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
- } else {
- if (u_reduce_video_profile(templ->profile) == PIPE_VIDEO_FORMAT_HEVC)
- return radeon_uvd_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
- else
- return si_vce_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
- }
- }
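+   /* VCN (Raven and later) provides a unified encoder; older chips encode HEVC through UVD and other codecs through VCE. */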
+ if (templ->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
+ if (vcn) {
+ return radeon_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
+ } else {
+ if (u_reduce_video_profile(templ->profile) == PIPE_VIDEO_FORMAT_HEVC)
+ return radeon_uvd_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
+ else
+ return si_vce_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
+ }
+ }
- return (vcn) ? radeon_create_decoder(context, templ) :
- si_common_uvd_create_decoder(context, templ, si_uvd_set_dtb);
+ return (vcn) ? radeon_create_decoder(context, templ)
+ : si_common_uvd_create_decoder(context, templ, si_uvd_set_dtb);
}