From: Marek Olšák Date: Fri, 13 Sep 2019 01:13:08 +0000 (-0400) Subject: radeonsi: add FMASK slots for shader images (for MSAA images) X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=743a9d85e2ca5aef93e40fe7833742a067a5943d;p=mesa.git radeonsi: add FMASK slots for shader images (for MSAA images) Acked-by: Pierre-Eric Pelloux-Prayer --- diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 28fe5c1e585..12ab1f51e97 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -794,8 +794,6 @@ static void si_set_shader_image(struct si_context *ctx, struct si_images *images = &ctx->images[shader]; struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); struct si_resource *res; - unsigned desc_slot = si_get_image_slot(slot); - uint32_t *desc = descs->list + desc_slot * 8; if (!view || !view->resource) { si_disable_shader_image(ctx, shader, slot); @@ -807,7 +805,9 @@ static void si_set_shader_image(struct si_context *ctx, if (&images->views[slot] != view) util_copy_image_view(&images->views[slot], view); - si_set_shader_image_desc(ctx, view, skip_decompress, desc, NULL); + si_set_shader_image_desc(ctx, view, skip_decompress, + descs->list + si_get_image_slot(slot) * 8, + descs->list + si_get_image_slot(slot + SI_NUM_IMAGES) * 8); if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { @@ -1981,18 +1981,19 @@ static void si_update_bindless_image_descriptor(struct si_context *sctx, struct si_descriptors *desc = &sctx->bindless_descriptors; unsigned desc_slot_offset = img_handle->desc_slot * 16; struct pipe_image_view *view = &img_handle->view; - uint32_t desc_list[8]; + struct pipe_resource *res = view->resource; + uint32_t image_desc[16]; + unsigned desc_size = (res->nr_samples >= 2 ? 16 : 8) * 4; - if (view->resource->target == PIPE_BUFFER) + if (res->target == PIPE_BUFFER) return; - memcpy(desc_list, desc->list + desc_slot_offset, - sizeof(desc_list)); + memcpy(image_desc, desc->list + desc_slot_offset, desc_size); si_set_shader_image_desc(sctx, view, true, - desc->list + desc_slot_offset, NULL); + desc->list + desc_slot_offset, + desc->list + desc_slot_offset + 8); - if (memcmp(desc_list, desc->list + desc_slot_offset, - sizeof(desc_list))) { + if (memcmp(image_desc, desc->list + desc_slot_offset, desc_size)) { img_handle->desc_dirty = true; sctx->bindless_descriptors_dirty = true; } @@ -2584,7 +2585,7 @@ static uint64_t si_create_image_handle(struct pipe_context *ctx, { struct si_context *sctx = (struct si_context *)ctx; struct si_image_handle *img_handle; - uint32_t desc_list[8]; + uint32_t desc_list[16]; uint64_t handle; if (!view || !view->resource) @@ -2595,9 +2596,9 @@ static uint64_t si_create_image_handle(struct pipe_context *ctx, return 0; memset(desc_list, 0, sizeof(desc_list)); - si_init_descriptor_list(&desc_list[0], 8, 1, null_image_descriptor); + si_init_descriptor_list(&desc_list[0], 8, 2, null_image_descriptor); - si_set_shader_image_desc(sctx, view, false, &desc_list[0], NULL); + si_set_shader_image_desc(sctx, view, false, &desc_list[0], &desc_list[8]); img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, sizeof(desc_list)); @@ -2764,7 +2765,7 @@ void si_init_all_descriptors(struct si_context *sctx) bool is_2nd = sctx->chip_class >= GFX9 && (i == PIPE_SHADER_TESS_CTRL || i == PIPE_SHADER_GEOMETRY); - unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS; + unsigned num_sampler_slots = SI_NUM_IMAGE_SLOTS / 2 + SI_NUM_SAMPLERS; unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS; int rel_dw_offset; struct si_descriptors *desc; @@ -2809,9 +2810,9 @@ void si_init_all_descriptors(struct si_context *sctx) si_init_descriptors(desc, rel_dw_offset, 16, num_sampler_slots); int j; - for (j = 0; j < SI_NUM_IMAGES; j++) + for (j = 0; j < SI_NUM_IMAGE_SLOTS; j++) memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4); - for (; j < SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2; j++) + for (; j < SI_NUM_IMAGE_SLOTS + SI_NUM_SAMPLERS * 2; j++) memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4); } diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 3bf30c72436..680930bb523 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -1114,13 +1114,13 @@ si_nir_load_sampler_desc(struct ac_shader_abi *abi, if (image) { index = LLVMBuildSub(ctx->ac.builder, - LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0), + LLVMConstInt(ctx->i32, SI_NUM_IMAGE_SLOTS - 1, 0), index, ""); return si_load_image_desc(ctx, list, index, desc_type, write, false); } index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), ""); + LLVMConstInt(ctx->i32, SI_NUM_IMAGE_SLOTS / 2, 0), ""); return si_load_sampler_desc(ctx, list, index, desc_type); } diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c index 243286cc629..10fbb808e9b 100644 --- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c +++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c @@ -235,7 +235,7 @@ image_fetch_rsrc( image->Register.Index, ctx->num_images); index = LLVMBuildSub(ctx->ac.builder, - LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0), + LLVMConstInt(ctx->i32, SI_NUM_IMAGE_SLOTS - 1, 0), index, ""); } @@ -1126,7 +1126,7 @@ static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base, reg->Register.Index, ctx->num_samplers); index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), ""); + LLVMConstInt(ctx->i32, SI_NUM_IMAGE_SLOTS / 2, 0), ""); } else { index = LLVMConstInt(ctx->i32, si_get_sampler_slot(reg->Register.Index), 0); diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index e3e6cf293e1..88e01512cd6 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -37,6 +37,7 @@ #define SI_NUM_SAMPLERS 32 /* OpenGL textures units per shader */ #define SI_NUM_CONST_BUFFERS 16 #define SI_NUM_IMAGES 16 +#define SI_NUM_IMAGE_SLOTS (SI_NUM_IMAGES * 2) /* the second half are FMASK slots */ #define SI_NUM_SHADER_BUFFERS 16 struct si_screen; @@ -647,14 +648,16 @@ static inline unsigned si_get_shaderbuf_slot(unsigned slot) static inline unsigned si_get_sampler_slot(unsigned slot) { - /* samplers are in slots [8..39], ascending */ - return SI_NUM_IMAGES / 2 + slot; + /* 32 samplers are in sampler slots [16..47], 16 dw per slot, ascending */ + /* those are equivalent to image slots [32..95], 8 dw per slot, ascending */ + return SI_NUM_IMAGE_SLOTS / 2 + slot; } static inline unsigned si_get_image_slot(unsigned slot) { - /* images are in slots [15..0] (sampler slots [7..0]), descending */ - return SI_NUM_IMAGES - 1 - slot; + /* image slots are in [31..0] (sampler slots [15..0]), descending */ + /* images are in slots [31..16], while FMASKs are in slots [15..0] */ + return SI_NUM_IMAGE_SLOTS - 1 - slot; } #endif diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index bbdd0d08b42..04443db7a44 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2633,12 +2633,13 @@ void si_get_active_slot_masks(const struct tgsi_shader_info *info, uint32_t *const_and_shader_buffers, uint64_t *samplers_and_images) { - unsigned start, num_shaderbufs, num_constbufs, num_images, num_samplers; + unsigned start, num_shaderbufs, num_constbufs, num_images, num_msaa_images, num_samplers; num_shaderbufs = util_last_bit(info->shader_buffers_declared); num_constbufs = util_last_bit(info->const_buffers_declared); /* two 8-byte images share one 16-byte slot */ num_images = align(util_last_bit(info->images_declared), 2); + num_msaa_images = align(util_last_bit(info->msaa_images_declared), 2); num_samplers = util_last_bit(info->samplers_declared); /* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */ @@ -2646,7 +2647,18 @@ void si_get_active_slot_masks(const struct tgsi_shader_info *info, *const_and_shader_buffers = u_bit_consecutive(start, num_shaderbufs + num_constbufs); - /* The layout is: image[last] ... image[0], sampler[0] ... sampler[last] */ + /* The layout is: + * - fmask[last] ... fmask[0] go to [15-last .. 15] + * - image[last] ... image[0] go to [31-last .. 31] + * - sampler[0] ... sampler[last] go to [32 .. 32+last*2] + * + * FMASKs for images are placed separately, because MSAA images are rare, + * and so we can benefit from a better cache hit rate if we keep image + * descriptors together. + */ + if (num_msaa_images) + num_images = SI_NUM_IMAGES + num_msaa_images; /* add FMASK descriptors */ + start = si_get_image_slot(num_images - 1) / 2; *samplers_and_images = u_bit_consecutive64(start, num_images / 2 + num_samplers);