diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 2b911585a69..e6c58ffca78 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -19,9 +19,6 @@
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *      Marek Olšák
  */
 
 /* Resource binding slots and sampler states (each described with 8 or
@@ -60,6 +57,8 @@
 #include "sid.h"
 #include "gfx9d.h"
 
+#include "util/hash_table.h"
+#include "util/u_idalloc.h"
 #include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_upload_mgr.h"
@@ -95,174 +94,102 @@ static uint32_t null_image_descriptor[8] = {
 	 * descriptor */
 };
 
-static void si_init_descriptors(struct si_descriptors *desc,
-				unsigned shader_userdata_index,
-				unsigned element_dw_size,
-				unsigned num_elements,
-				const uint32_t *null_descriptor,
-				unsigned *ce_offset)
+static uint64_t si_desc_extract_buffer_address(uint32_t *desc)
 {
-	int i;
-
-	assert(num_elements <= sizeof(desc->dirty_mask)*8);
-
-	desc->list = CALLOC(num_elements, element_dw_size * 4);
-	desc->element_dw_size = element_dw_size;
-	desc->num_elements = num_elements;
-	desc->dirty_mask = num_elements == 32 ? ~0u : (1u << num_elements) - 1;
-	desc->shader_userdata_offset = shader_userdata_index * 4;
-
-	if (ce_offset) {
-		desc->uses_ce = true;
-		desc->ce_offset = *ce_offset;
+	return desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);
+}
 
-		/* make sure that ce_offset stays 32 byte aligned */
-		*ce_offset += align(element_dw_size * num_elements * 4, 32);
-	}
+static void si_init_descriptor_list(uint32_t *desc_list,
+				    unsigned element_dw_size,
+				    unsigned num_elements,
+				    const uint32_t *null_descriptor)
+{
+	int i;
 
 	/* Initialize the array to NULL descriptors if the element size is 8. 
*/ if (null_descriptor) { assert(element_dw_size % 8 == 0); for (i = 0; i < num_elements * element_dw_size / 8; i++) - memcpy(desc->list + i * 8, null_descriptor, - 8 * 4); - } -} - -static void si_release_descriptors(struct si_descriptors *desc) -{ - r600_resource_reference(&desc->buffer, NULL); - FREE(desc->list); -} - -static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size, - unsigned *out_offset, struct r600_resource **out_buf) { - uint64_t va; - - u_suballocator_alloc(sctx->ce_suballocator, size, - sctx->screen->b.info.tcc_cache_line_size, - out_offset, (struct pipe_resource**)out_buf); - if (!out_buf) - return false; - - va = (*out_buf)->gpu_address + *out_offset; - - radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0)); - radeon_emit(sctx->ce_ib, ce_offset); - radeon_emit(sctx->ce_ib, size / 4); - radeon_emit(sctx->ce_ib, va); - radeon_emit(sctx->ce_ib, va >> 32); - - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *out_buf, - RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS); - - sctx->ce_need_synchronization = true; - return true; -} - -static void si_ce_reinitialize_descriptors(struct si_context *sctx, - struct si_descriptors *desc) -{ - if (desc->buffer) { - struct r600_resource *buffer = (struct r600_resource*)desc->buffer; - unsigned list_size = desc->num_elements * desc->element_dw_size * 4; - uint64_t va = buffer->gpu_address + desc->buffer_offset; - struct radeon_winsys_cs *ib = sctx->ce_preamble_ib; - - if (!ib) - ib = sctx->ce_ib; - - list_size = align(list_size, 32); - - radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0)); - radeon_emit(ib, va); - radeon_emit(ib, va >> 32); - radeon_emit(ib, list_size / 4); - radeon_emit(ib, desc->ce_offset); - - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, - RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); + memcpy(desc_list + i * 8, null_descriptor, 8 * 4); } - desc->ce_ram_dirty = false; } -void si_ce_reinitialize_all_descriptors(struct si_context *sctx) +static void si_init_descriptors(struct si_descriptors *desc, + short shader_userdata_rel_index, + unsigned element_dw_size, + unsigned num_elements) { - int i; - - for (i = 0; i < SI_NUM_DESCS; ++i) - si_ce_reinitialize_descriptors(sctx, &sctx->descriptors[i]); + desc->list = CALLOC(num_elements, element_dw_size * 4); + desc->element_dw_size = element_dw_size; + desc->num_elements = num_elements; + desc->shader_userdata_offset = shader_userdata_rel_index * 4; + desc->slot_index_to_bind_directly = -1; } -void si_ce_enable_loads(struct radeon_winsys_cs *ib) +static void si_release_descriptors(struct si_descriptors *desc) { - radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); - radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) | - CONTEXT_CONTROL_LOAD_CE_RAM(1)); - radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1)); + r600_resource_reference(&desc->buffer, NULL); + FREE(desc->list); } static bool si_upload_descriptors(struct si_context *sctx, - struct si_descriptors *desc, - struct r600_atom * atom) + struct si_descriptors *desc) { - unsigned list_size = desc->num_elements * desc->element_dw_size * 4; + unsigned slot_size = desc->element_dw_size * 4; + unsigned first_slot_offset = desc->first_active_slot * slot_size; + unsigned upload_size = desc->num_active_slots * slot_size; - if (!desc->dirty_mask) + /* Skip the upload if no shader is using the descriptors. dirty_mask + * will stay dirty and the descriptors will be uploaded when there is + * a shader using them. 
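+	 * (For example, the descriptor arrays of a shader stage that has
+	 * never been bound keep num_active_slots == 0, so a workload that
+	 * never uses that stage also never uploads its descriptors. This
+	 * is an illustrative case, not an exhaustive list.)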
+ */ + if (!upload_size) return true; - if (sctx->ce_ib && desc->uses_ce) { - uint32_t const* list = (uint32_t const*)desc->list; - - if (desc->ce_ram_dirty) - si_ce_reinitialize_descriptors(sctx, desc); - - while(desc->dirty_mask) { - int begin, count; - u_bit_scan_consecutive_range(&desc->dirty_mask, &begin, - &count); - - begin *= desc->element_dw_size; - count *= desc->element_dw_size; - - radeon_emit(sctx->ce_ib, - PKT3(PKT3_WRITE_CONST_RAM, count, 0)); - radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4); - radeon_emit_array(sctx->ce_ib, list + begin, count); - } - - if (!si_ce_upload(sctx, desc->ce_offset, list_size, - &desc->buffer_offset, &desc->buffer)) - return false; - } else { - void *ptr; + /* If there is just one active descriptor, bind it directly. */ + if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly && + desc->num_active_slots == 1) { + uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly * + desc->element_dw_size]; + + /* The buffer is already in the buffer list. */ + r600_resource_reference(&desc->buffer, NULL); + desc->gpu_list = NULL; + desc->gpu_address = si_desc_extract_buffer_address(descriptor); + si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom); + return true; + } - u_upload_alloc(sctx->b.b.const_uploader, 0, list_size, - sctx->screen->b.info.tcc_cache_line_size, - &desc->buffer_offset, - (struct pipe_resource**)&desc->buffer, &ptr); - if (!desc->buffer) - return false; /* skip the draw call */ + uint32_t *ptr; + int buffer_offset; + u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size, + si_optimal_tcc_alignment(sctx, upload_size), + (unsigned*)&buffer_offset, + (struct pipe_resource**)&desc->buffer, + (void**)&ptr); + if (!desc->buffer) { + desc->gpu_address = 0; + return false; /* skip the draw call */ + } - util_memcpy_cpu_to_le32(ptr, desc->list, list_size); - desc->gpu_list = ptr; + util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset, + upload_size); + desc->gpu_list = ptr - first_slot_offset / 4; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, - RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); - } - desc->dirty_mask = 0; + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, + RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); - if (atom) - si_mark_atom_dirty(sctx, atom); + /* The shader pointer should point to slot 0. 
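+	 * (E.g. with first_active_slot = 4 and element_dw_size = 16, slots
+	 * are 64 bytes and the upload starts 256 bytes into the list;
+	 * rebasing buffer_offset by -256 makes gpu_address + slot * 64
+	 * address the uploaded copy of each active slot. Illustrative
+	 * numbers only.)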
*/ + buffer_offset -= first_slot_offset; + desc->gpu_address = desc->buffer->gpu_address + buffer_offset; + si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom); return true; } static void si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc) { - desc->ce_ram_dirty = true; - if (!desc->buffer) return; @@ -272,25 +199,37 @@ si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc /* SAMPLER VIEWS */ +static inline enum radeon_bo_priority +si_get_sampler_view_priority(struct r600_resource *res) +{ + if (res->b.b.target == PIPE_BUFFER) + return RADEON_PRIO_SAMPLER_BUFFER; + + if (res->b.b.nr_samples > 1) + return RADEON_PRIO_SAMPLER_TEXTURE_MSAA; + + return RADEON_PRIO_SAMPLER_TEXTURE; +} + static unsigned -si_sampler_descriptors_idx(unsigned shader) +si_sampler_and_image_descriptors_idx(unsigned shader) { return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS + - SI_SHADER_DESCS_SAMPLERS; + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES; } static struct si_descriptors * -si_sampler_descriptors(struct si_context *sctx, unsigned shader) +si_sampler_and_image_descriptors(struct si_context *sctx, unsigned shader) { - return &sctx->descriptors[si_sampler_descriptors_idx(shader)]; + return &sctx->descriptors[si_sampler_and_image_descriptors_idx(shader)]; } -static void si_release_sampler_views(struct si_sampler_views *views) +static void si_release_sampler_views(struct si_samplers *samplers) { int i; - for (i = 0; i < ARRAY_SIZE(views->views); i++) { - pipe_sampler_view_reference(&views->views[i], NULL); + for (i = 0; i < ARRAY_SIZE(samplers->views); i++) { + pipe_sampler_view_reference(&samplers->views[i], NULL); } } @@ -310,12 +249,12 @@ static void si_sampler_view_add_buffer(struct si_context *sctx, if (resource->target != PIPE_BUFFER) { struct r600_texture *tex = (struct r600_texture*)resource; - if (tex->is_depth && !r600_can_sample_zs(tex, is_stencil_sampler)) + if (tex->is_depth && !si_can_sample_zs(tex, is_stencil_sampler)) resource = &tex->flushed_depth_texture->resource.b.b; } rres = (struct r600_resource*)resource; - priority = r600_get_sampler_view_priority(rres); + priority = si_get_sampler_view_priority(rres); radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx, rres, usage, priority, @@ -331,25 +270,17 @@ static void si_sampler_view_add_buffer(struct si_context *sctx, rtex->dcc_separate_buffer, usage, RADEON_PRIO_DCC, check_mem); } - - if (rtex->htile_buffer && - rtex->tc_compatible_htile && - !is_stencil_sampler) { - radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx, - rtex->htile_buffer, usage, - RADEON_PRIO_HTILE, check_mem); - } } static void si_sampler_views_begin_new_cs(struct si_context *sctx, - struct si_sampler_views *views) + struct si_samplers *samplers) { - unsigned mask = views->enabled_mask; + unsigned mask = samplers->enabled_mask; /* Add buffers to the CS. 
*/ while (mask) { int i = u_bit_scan(&mask); - struct si_sampler_view *sview = (struct si_sampler_view *)views->views[i]; + struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[i]; si_sampler_view_add_buffer(sctx, sview->base.texture, RADEON_USAGE_READ, @@ -387,14 +318,14 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, { uint64_t va, meta_va = 0; - if (tex->is_depth && !r600_can_sample_zs(tex, is_stencil)) { + if (tex->is_depth && !si_can_sample_zs(tex, is_stencil)) { tex = tex->flushed_depth_texture; is_stencil = false; } va = tex->resource.gpu_address; - if (sscreen->b.chip_class >= GFX9) { + if (sscreen->info.chip_class >= GFX9) { /* Only stencil_offset needs to be added here. */ if (is_stencil) va += tex->surface.u.gfx9.stencil_offset; @@ -404,29 +335,42 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, va += base_level_info->offset; } - if (vi_dcc_enabled(tex, first_level)) { - meta_va = (!tex->dcc_separate_buffer ? tex->resource.gpu_address : 0) + - tex->dcc_offset; - - if (sscreen->b.chip_class <= VI) - meta_va += base_level_info->dcc_offset; - } else if (tex->tc_compatible_htile && !is_stencil) { - meta_va = tex->htile_buffer->gpu_address; - } - state[0] = va >> 8; state[1] &= C_008F14_BASE_ADDRESS_HI; state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); - state[6] &= C_008F28_COMPRESSION_EN; - state[7] = 0; + /* Only macrotiled modes can set tile swizzle. + * GFX9 doesn't use (legacy) base_level_info. + */ + if (sscreen->info.chip_class >= GFX9 || + base_level_info->mode == RADEON_SURF_MODE_2D) + state[0] |= tex->surface.tile_swizzle; + + if (sscreen->info.chip_class >= VI) { + state[6] &= C_008F28_COMPRESSION_EN; + state[7] = 0; + + if (vi_dcc_enabled(tex, first_level)) { + meta_va = (!tex->dcc_separate_buffer ? 
tex->resource.gpu_address : 0) + + tex->dcc_offset; + + if (sscreen->info.chip_class == VI) { + meta_va += base_level_info->dcc_offset; + assert(base_level_info->mode == RADEON_SURF_MODE_2D); + } + + meta_va |= (uint32_t)tex->surface.tile_swizzle << 8; + } else if (vi_tc_compat_htile_enabled(tex, first_level)) { + meta_va = tex->resource.gpu_address + tex->htile_offset; + } - if (meta_va) { - state[6] |= S_008F28_COMPRESSION_EN(1); - state[7] = meta_va >> 8; + if (meta_va) { + state[6] |= S_008F28_COMPRESSION_EN(1); + state[7] = meta_va >> 8; + } } - if (sscreen->b.chip_class >= GFX9) { + if (sscreen->info.chip_class >= GFX9) { state[3] &= C_008F1C_SW_MODE; state[4] &= C_008F20_PITCH_GFX9; @@ -465,69 +409,130 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, } } +static void si_set_sampler_state_desc(struct si_sampler_state *sstate, + struct si_sampler_view *sview, + struct r600_texture *tex, + uint32_t *desc) +{ + if (sview && sview->is_integer) + memcpy(desc, sstate->integer_val, 4*4); + else if (tex && tex->upgraded_depth && + (!sview || !sview->is_stencil_sampler)) + memcpy(desc, sstate->upgraded_depth_val, 4*4); + else + memcpy(desc, sstate->val, 4*4); +} + +static void si_set_sampler_view_desc(struct si_context *sctx, + struct si_sampler_view *sview, + struct si_sampler_state *sstate, + uint32_t *desc) +{ + struct pipe_sampler_view *view = &sview->base; + struct r600_texture *rtex = (struct r600_texture *)view->texture; + bool is_buffer = rtex->resource.b.b.target == PIPE_BUFFER; + + if (unlikely(!is_buffer && sview->dcc_incompatible)) { + if (vi_dcc_enabled(rtex, view->u.tex.first_level)) + if (!si_texture_disable_dcc(&sctx->b, rtex)) + sctx->b.decompress_dcc(&sctx->b.b, rtex); + + sview->dcc_incompatible = false; + } + + assert(rtex); /* views with texture == NULL aren't supported */ + memcpy(desc, sview->state, 8*4); + + if (is_buffer) { + si_set_buf_desc_address(&rtex->resource, + sview->base.u.buf.offset, + desc + 4); + } else { + bool is_separate_stencil = rtex->db_compatible && + sview->is_stencil_sampler; + + si_set_mutable_tex_desc_fields(sctx->screen, rtex, + sview->base_level_info, + sview->base_level, + sview->base.u.tex.first_level, + sview->block_width, + is_separate_stencil, + desc); + } + + if (!is_buffer && rtex->fmask.size) { + memcpy(desc + 8, sview->fmask_state, 8*4); + } else { + /* Disable FMASK and bind sampler state in [12:15]. */ + memcpy(desc + 8, null_texture_descriptor, 4*4); + + if (sstate) + si_set_sampler_state_desc(sstate, sview, + is_buffer ? NULL : rtex, + desc + 12); + } +} + +static bool color_needs_decompression(struct r600_texture *rtex) +{ + return rtex->fmask.size || + (rtex->dirty_level_mask && + (rtex->cmask.size || rtex->dcc_offset)); +} + +static bool depth_needs_decompression(struct r600_texture *rtex) +{ + /* If the depth/stencil texture is TC-compatible, no decompression + * will be done. The decompression function will only flush DB caches + * to make it coherent with shaders. That's necessary because the driver + * doesn't flush DB caches in any other case. 
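+	 * (E.g. when sampling a TC-compatible-HTILE depth buffer right
+	 * after a depth pass, the texture unit can read the compressed
+	 * data as-is, so only the DB cache flush is needed for coherency.)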
+ */ + return rtex->db_compatible; +} + static void si_set_sampler_view(struct si_context *sctx, unsigned shader, unsigned slot, struct pipe_sampler_view *view, bool disallow_early_out) { - struct si_sampler_views *views = &sctx->samplers[shader].views; + struct si_samplers *samplers = &sctx->samplers[shader]; struct si_sampler_view *rview = (struct si_sampler_view*)view; - struct si_descriptors *descs = si_sampler_descriptors(sctx, shader); - uint32_t *desc = descs->list + slot * 16; + struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); + unsigned desc_slot = si_get_sampler_slot(slot); + uint32_t *desc = descs->list + desc_slot * 16; - if (views->views[slot] == view && !disallow_early_out) + if (samplers->views[slot] == view && !disallow_early_out) return; if (view) { struct r600_texture *rtex = (struct r600_texture *)view->texture; - bool is_buffer = rtex->resource.b.b.target == PIPE_BUFFER; - - if (unlikely(!is_buffer && rview->dcc_incompatible)) { - if (vi_dcc_enabled(rtex, view->u.tex.first_level)) - if (!r600_texture_disable_dcc(&sctx->b, rtex)) - sctx->b.decompress_dcc(&sctx->b.b, rtex); - rview->dcc_incompatible = false; - } - - assert(rtex); /* views with texture == NULL aren't supported */ - pipe_sampler_view_reference(&views->views[slot], view); - memcpy(desc, rview->state, 8*4); + si_set_sampler_view_desc(sctx, rview, + samplers->sampler_states[slot], desc); - if (is_buffer) { + if (rtex->resource.b.b.target == PIPE_BUFFER) { rtex->resource.bind_history |= PIPE_BIND_SAMPLER_VIEW; - - si_set_buf_desc_address(&rtex->resource, - view->u.buf.offset, - desc + 4); - } else { - bool is_separate_stencil = - rtex->db_compatible && - rview->is_stencil_sampler; - - si_set_mutable_tex_desc_fields(sctx->screen, rtex, - rview->base_level_info, - rview->base_level, - rview->base.u.tex.first_level, - rview->block_width, - is_separate_stencil, - desc); - } - - if (!is_buffer && rtex->fmask.size) { - memcpy(desc + 8, - rview->fmask_state, 8*4); + samplers->needs_depth_decompress_mask &= ~(1u << slot); + samplers->needs_color_decompress_mask &= ~(1u << slot); } else { - /* Disable FMASK and bind sampler state in [12:15]. */ - memcpy(desc + 8, - null_texture_descriptor, 4*4); + if (depth_needs_decompression(rtex)) { + samplers->needs_depth_decompress_mask |= 1u << slot; + } else { + samplers->needs_depth_decompress_mask &= ~(1u << slot); + } + if (color_needs_decompression(rtex)) { + samplers->needs_color_decompress_mask |= 1u << slot; + } else { + samplers->needs_color_decompress_mask &= ~(1u << slot); + } - if (views->sampler_states[slot]) - memcpy(desc + 12, - views->sampler_states[slot]->val, 4*4); + if (rtex->dcc_offset && + p_atomic_read(&rtex->framebuffers_bound)) + sctx->need_check_render_feedback = true; } - views->enabled_mask |= 1u << slot; + pipe_sampler_view_reference(&samplers->views[slot], view); + samplers->enabled_mask |= 1u << slot; /* Since this can flush, it must be done after enabled_mask is * updated. */ @@ -535,40 +540,35 @@ static void si_set_sampler_view(struct si_context *sctx, RADEON_USAGE_READ, rview->is_stencil_sampler, true); } else { - pipe_sampler_view_reference(&views->views[slot], NULL); + pipe_sampler_view_reference(&samplers->views[slot], NULL); memcpy(desc, null_texture_descriptor, 8*4); /* Only clear the lower dwords of FMASK. */ memcpy(desc + 8, null_texture_descriptor, 4*4); /* Re-set the sampler state if we are transitioning from FMASK. 
*/ - if (views->sampler_states[slot]) - memcpy(desc + 12, - views->sampler_states[slot]->val, 4*4); + if (samplers->sampler_states[slot]) + si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, + desc + 12); - views->enabled_mask &= ~(1u << slot); + samplers->enabled_mask &= ~(1u << slot); + samplers->needs_depth_decompress_mask &= ~(1u << slot); + samplers->needs_color_decompress_mask &= ~(1u << slot); } - descs->dirty_mask |= 1u << slot; - sctx->descriptors_dirty |= 1u << si_sampler_descriptors_idx(shader); -} - -static bool is_compressed_colortex(struct r600_texture *rtex) -{ - return rtex->cmask.size || rtex->fmask.size || - (rtex->dcc_offset && rtex->dirty_level_mask); + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); } -static void si_update_compressed_tex_shader_mask(struct si_context *sctx, - unsigned shader) +static void si_update_shader_needs_decompress_mask(struct si_context *sctx, + unsigned shader) { - struct si_textures_info *samplers = &sctx->samplers[shader]; + struct si_samplers *samplers = &sctx->samplers[shader]; unsigned shader_bit = 1 << shader; - if (samplers->depth_texture_mask || - samplers->compressed_colortex_mask || - sctx->images[shader].compressed_colortex_mask) - sctx->compressed_tex_shader_mask |= shader_bit; + if (samplers->needs_depth_decompress_mask || + samplers->needs_color_decompress_mask || + sctx->images[shader].needs_color_decompress_mask) + sctx->shader_needs_decompress_mask |= shader_bit; else - sctx->compressed_tex_shader_mask &= ~shader_bit; + sctx->shader_needs_decompress_mask &= ~shader_bit; } static void si_set_sampler_views(struct pipe_context *ctx, @@ -577,69 +577,38 @@ static void si_set_sampler_views(struct pipe_context *ctx, struct pipe_sampler_view **views) { struct si_context *sctx = (struct si_context *)ctx; - struct si_textures_info *samplers = &sctx->samplers[shader]; int i; if (!count || shader >= SI_NUM_SHADERS) return; - for (i = 0; i < count; i++) { - unsigned slot = start + i; - - if (!views || !views[i]) { - samplers->depth_texture_mask &= ~(1u << slot); - samplers->compressed_colortex_mask &= ~(1u << slot); - si_set_sampler_view(sctx, shader, slot, NULL, false); - continue; - } - - si_set_sampler_view(sctx, shader, slot, views[i], false); - - if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) { - struct r600_texture *rtex = - (struct r600_texture*)views[i]->texture; - struct si_sampler_view *rview = (struct si_sampler_view *)views[i]; - - if (rtex->db_compatible && - (!rtex->tc_compatible_htile || rview->is_stencil_sampler)) { - samplers->depth_texture_mask |= 1u << slot; - } else { - samplers->depth_texture_mask &= ~(1u << slot); - } - if (is_compressed_colortex(rtex)) { - samplers->compressed_colortex_mask |= 1u << slot; - } else { - samplers->compressed_colortex_mask &= ~(1u << slot); - } - - if (rtex->dcc_offset && - p_atomic_read(&rtex->framebuffers_bound)) - sctx->need_check_render_feedback = true; - } else { - samplers->depth_texture_mask &= ~(1u << slot); - samplers->compressed_colortex_mask &= ~(1u << slot); - } + if (views) { + for (i = 0; i < count; i++) + si_set_sampler_view(sctx, shader, start + i, views[i], false); + } else { + for (i = 0; i < count; i++) + si_set_sampler_view(sctx, shader, start + i, NULL, false); } - si_update_compressed_tex_shader_mask(sctx, shader); + si_update_shader_needs_decompress_mask(sctx, shader); } static void -si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers) 
+si_samplers_update_needs_color_decompress_mask(struct si_samplers *samplers) { - unsigned mask = samplers->views.enabled_mask; + unsigned mask = samplers->enabled_mask; while (mask) { int i = u_bit_scan(&mask); - struct pipe_resource *res = samplers->views.views[i]->texture; + struct pipe_resource *res = samplers->views[i]->texture; if (res && res->target != PIPE_BUFFER) { struct r600_texture *rtex = (struct r600_texture *)res; - if (is_compressed_colortex(rtex)) { - samplers->compressed_colortex_mask |= 1u << i; + if (color_needs_decompression(rtex)) { + samplers->needs_color_decompress_mask |= 1u << i; } else { - samplers->compressed_colortex_mask &= ~(1u << i); + samplers->needs_color_decompress_mask &= ~(1u << i); } } } @@ -647,21 +616,8 @@ si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers) /* IMAGE VIEWS */ -static unsigned -si_image_descriptors_idx(unsigned shader) -{ - return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS + - SI_SHADER_DESCS_IMAGES; -} - -static struct si_descriptors* -si_image_descriptors(struct si_context *sctx, unsigned shader) -{ - return &sctx->descriptors[si_image_descriptors_idx(shader)]; -} - static void -si_release_image_views(struct si_images_info *images) +si_release_image_views(struct si_images *images) { unsigned i; @@ -673,7 +629,7 @@ si_release_image_views(struct si_images_info *images) } static void -si_image_views_begin_new_cs(struct si_context *sctx, struct si_images_info *images) +si_image_views_begin_new_cs(struct si_context *sctx, struct si_images *images) { uint mask = images->enabled_mask; @@ -692,18 +648,18 @@ si_image_views_begin_new_cs(struct si_context *sctx, struct si_images_info *imag static void si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot) { - struct si_images_info *images = &ctx->images[shader]; + struct si_images *images = &ctx->images[shader]; if (images->enabled_mask & (1u << slot)) { - struct si_descriptors *descs = si_image_descriptors(ctx, shader); + struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); + unsigned desc_slot = si_get_image_slot(slot); pipe_resource_reference(&images->views[slot].resource, NULL); - images->compressed_colortex_mask &= ~(1 << slot); + images->needs_color_decompress_mask &= ~(1 << slot); - memcpy(descs->list + slot*8, null_image_descriptor, 8*4); + memcpy(descs->list + desc_slot*8, null_image_descriptor, 8*4); images->enabled_mask &= ~(1u << slot); - descs->dirty_mask |= 1u << slot; - ctx->descriptors_dirty |= 1u << si_image_descriptors_idx(shader); + ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); } } @@ -719,27 +675,16 @@ si_mark_image_range_valid(const struct pipe_image_view *view) view->u.buf.offset + view->u.buf.size); } -static void si_set_shader_image(struct si_context *ctx, - unsigned shader, - unsigned slot, const struct pipe_image_view *view, - bool skip_decompress) +static void si_set_shader_image_desc(struct si_context *ctx, + const struct pipe_image_view *view, + bool skip_decompress, + uint32_t *desc) { struct si_screen *screen = ctx->screen; - struct si_images_info *images = &ctx->images[shader]; - struct si_descriptors *descs = si_image_descriptors(ctx, shader); struct r600_resource *res; - uint32_t *desc = descs->list + slot * 8; - - if (!view || !view->resource) { - si_disable_shader_image(ctx, shader, slot); - return; - } res = (struct r600_resource *)view->resource; - if (&images->views[slot] != view) - util_copy_image_view(&images->views[slot], view); - if 
(res->b.b.target == PIPE_BUFFER) { if (view->access & PIPE_IMAGE_ACCESS_WRITE) si_mark_image_range_valid(view); @@ -747,18 +692,24 @@ static void si_set_shader_image(struct si_context *ctx, si_make_buffer_descriptor(screen, res, view->format, view->u.buf.offset, - view->u.buf.size, - descs->list + slot * 8); + view->u.buf.size, desc); si_set_buf_desc_address(res, view->u.buf.offset, desc + 4); - - images->compressed_colortex_mask &= ~(1 << slot); - res->bind_history |= PIPE_BIND_SHADER_IMAGE; } else { static const unsigned char swizzle[4] = { 0, 1, 2, 3 }; struct r600_texture *tex = (struct r600_texture *)res; unsigned level = view->u.tex.level; - unsigned width, height, depth; + unsigned width, height, depth, hw_level; bool uses_dcc = vi_dcc_enabled(tex, level); + unsigned access = view->access; + + /* Clear the write flag when writes can't occur. + * Note that DCC_DECOMPRESS for MSAA doesn't work in some cases, + * so we don't wanna trigger it. + */ + if (tex->is_depth || tex->resource.b.b.nr_samples >= 2) { + assert(!"Z/S and MSAA image stores are not supported"); + access &= ~PIPE_IMAGE_ACCESS_WRITE; + } assert(!tex->is_depth); assert(tex->fmask.size == 0); @@ -770,36 +721,35 @@ static void si_set_shader_image(struct si_context *ctx, * The decompression is relatively cheap if the surface * has been decompressed already. */ - if (r600_texture_disable_dcc(&ctx->b, tex)) - uses_dcc = false; - else + if (!si_texture_disable_dcc(&ctx->b, tex)) ctx->b.decompress_dcc(&ctx->b.b, tex); } - if (is_compressed_colortex(tex)) { - images->compressed_colortex_mask |= 1 << slot; + if (ctx->b.chip_class >= GFX9) { + /* Always set the base address. The swizzle modes don't + * allow setting mipmap level offsets as the base. + */ + width = res->b.b.width0; + height = res->b.b.height0; + depth = res->b.b.depth0; + hw_level = level; } else { - images->compressed_colortex_mask &= ~(1 << slot); + /* Always force the base level to the selected level. + * + * This is required for 3D textures, where otherwise + * selecting a single slice for non-layered bindings + * fails. It doesn't hurt the other targets. + */ + width = u_minify(res->b.b.width0, level); + height = u_minify(res->b.b.height0, level); + depth = u_minify(res->b.b.depth0, level); + hw_level = 0; } - if (uses_dcc && - p_atomic_read(&tex->framebuffers_bound)) - ctx->need_check_render_feedback = true; - - /* Always force the base level to the selected level. - * - * This is required for 3D textures, where otherwise - * selecting a single slice for non-layered bindings - * fails. It doesn't hurt the other targets. 
- */ - width = u_minify(res->b.b.width0, level); - height = u_minify(res->b.b.height0, level); - depth = u_minify(res->b.b.depth0, level); - si_make_texture_descriptor(screen, tex, false, res->b.b.target, view->format, swizzle, - 0, 0, + hw_level, hw_level, view->u.tex.first_layer, view->u.tex.last_layer, width, height, depth, @@ -810,14 +760,57 @@ static void si_set_shader_image(struct si_context *ctx, util_format_get_blockwidth(view->format), false, desc); } +} + +static void si_set_shader_image(struct si_context *ctx, + unsigned shader, + unsigned slot, const struct pipe_image_view *view, + bool skip_decompress) +{ + struct si_images *images = &ctx->images[shader]; + struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); + struct r600_resource *res; + unsigned desc_slot = si_get_image_slot(slot); + uint32_t *desc = descs->list + desc_slot * 8; + + if (!view || !view->resource) { + si_disable_shader_image(ctx, shader, slot); + return; + } + + res = (struct r600_resource *)view->resource; + + if (&images->views[slot] != view) + util_copy_image_view(&images->views[slot], view); + + si_set_shader_image_desc(ctx, view, skip_decompress, desc); + + if (res->b.b.target == PIPE_BUFFER) { + images->needs_color_decompress_mask &= ~(1 << slot); + res->bind_history |= PIPE_BIND_SHADER_IMAGE; + } else { + struct r600_texture *tex = (struct r600_texture *)res; + unsigned level = view->u.tex.level; + + if (color_needs_decompression(tex)) { + images->needs_color_decompress_mask |= 1 << slot; + } else { + images->needs_color_decompress_mask &= ~(1 << slot); + } + + if (vi_dcc_enabled(tex, level) && + p_atomic_read(&tex->framebuffers_bound)) + ctx->need_check_render_feedback = true; + } images->enabled_mask |= 1u << slot; - descs->dirty_mask |= 1u << slot; - ctx->descriptors_dirty |= 1u << si_image_descriptors_idx(shader); + ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); /* Since this can flush, it must be done after enabled_mask is updated. */ si_sampler_view_add_buffer(ctx, &res->b.b, - RADEON_USAGE_READWRITE, false, true); + (view->access & PIPE_IMAGE_ACCESS_WRITE) ? 
+ RADEON_USAGE_READWRITE : RADEON_USAGE_READ, + false, true); } static void @@ -844,11 +837,11 @@ si_set_shader_images(struct pipe_context *pipe, si_set_shader_image(ctx, shader, slot, NULL, false); } - si_update_compressed_tex_shader_mask(ctx, shader); + si_update_shader_needs_decompress_mask(ctx, shader); } static void -si_images_update_compressed_colortex_mask(struct si_images_info *images) +si_images_update_needs_color_decompress_mask(struct si_images *images) { unsigned mask = images->enabled_mask; @@ -859,10 +852,10 @@ si_images_update_compressed_colortex_mask(struct si_images_info *images) if (res && res->target != PIPE_BUFFER) { struct r600_texture *rtex = (struct r600_texture *)res; - if (is_compressed_colortex(rtex)) { - images->compressed_colortex_mask |= 1 << i; + if (color_needs_decompression(rtex)) { + images->needs_color_decompress_mask |= 1 << i; } else { - images->compressed_colortex_mask &= ~(1 << i); + images->needs_color_decompress_mask &= ~(1 << i); } } } @@ -875,8 +868,8 @@ static void si_bind_sampler_states(struct pipe_context *ctx, unsigned start, unsigned count, void **states) { struct si_context *sctx = (struct si_context *)ctx; - struct si_textures_info *samplers = &sctx->samplers[shader]; - struct si_descriptors *desc = si_sampler_descriptors(sctx, shader); + struct si_samplers *samplers = &sctx->samplers[shader]; + struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, shader); struct si_sampler_state **sstates = (struct si_sampler_state**)states; int i; @@ -885,28 +878,36 @@ static void si_bind_sampler_states(struct pipe_context *ctx, for (i = 0; i < count; i++) { unsigned slot = start + i; + unsigned desc_slot = si_get_sampler_slot(slot); if (!sstates[i] || - sstates[i] == samplers->views.sampler_states[slot]) + sstates[i] == samplers->sampler_states[slot]) continue; #ifdef DEBUG assert(sstates[i]->magic == SI_SAMPLER_STATE_MAGIC); #endif - samplers->views.sampler_states[slot] = sstates[i]; + samplers->sampler_states[slot] = sstates[i]; /* If FMASK is bound, don't overwrite it. * The sampler state will be set after FMASK is unbound. 
*/ - if (samplers->views.views[slot] && - samplers->views.views[slot]->texture && - samplers->views.views[slot]->texture->target != PIPE_BUFFER && - ((struct r600_texture*)samplers->views.views[slot]->texture)->fmask.size) + struct si_sampler_view *sview = + (struct si_sampler_view *)samplers->views[slot]; + + struct r600_texture *tex = NULL; + + if (sview && sview->base.texture && + sview->base.texture->target != PIPE_BUFFER) + tex = (struct r600_texture *)sview->base.texture; + + if (tex && tex->fmask.size) continue; - memcpy(desc->list + slot * 16 + 12, sstates[i]->val, 4*4); - desc->dirty_mask |= 1u << slot; - sctx->descriptors_dirty |= 1u << si_sampler_descriptors_idx(shader); + si_set_sampler_state_desc(sstates[i], sview, tex, + desc->list + desc_slot * 16 + 12); + + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); } } @@ -915,17 +916,19 @@ static void si_bind_sampler_states(struct pipe_context *ctx, static void si_init_buffer_resources(struct si_buffer_resources *buffers, struct si_descriptors *descs, unsigned num_buffers, - unsigned shader_userdata_index, + short shader_userdata_rel_index, enum radeon_bo_usage shader_usage, + enum radeon_bo_usage shader_usage_constbuf, enum radeon_bo_priority priority, - unsigned *ce_offset) + enum radeon_bo_priority priority_constbuf) { buffers->shader_usage = shader_usage; + buffers->shader_usage_constbuf = shader_usage_constbuf; buffers->priority = priority; + buffers->priority_constbuf = priority_constbuf; buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*)); - si_init_descriptors(descs, shader_userdata_index, 4, - num_buffers, NULL, ce_offset); + si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers); } static void si_release_buffer_resources(struct si_buffer_resources *buffers, @@ -950,8 +953,11 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx, int i = u_bit_scan(&mask); radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - (struct r600_resource*)buffers->buffers[i], - buffers->shader_usage, buffers->priority); + r600_resource(buffers->buffers[i]), + i < SI_NUM_SHADER_BUFFERS ? buffers->shader_usage : + buffers->shader_usage_constbuf, + i < SI_NUM_SHADER_BUFFERS ? buffers->priority : + buffers->priority_constbuf); } } @@ -980,38 +986,35 @@ static void si_get_buffer_from_descriptors(struct si_buffer_resources *buffers, static void si_vertex_buffers_begin_new_cs(struct si_context *sctx) { - struct si_descriptors *desc = &sctx->vertex_buffers; int count = sctx->vertex_elements ? 
sctx->vertex_elements->count : 0; int i; for (i = 0; i < count; i++) { - int vb = sctx->vertex_elements->elements[i].vertex_buffer_index; + int vb = sctx->vertex_elements->vertex_buffer_index[i]; if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) continue; - if (!sctx->vertex_buffer[vb].buffer) + if (!sctx->vertex_buffer[vb].buffer.resource) continue; radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - (struct r600_resource*)sctx->vertex_buffer[vb].buffer, + (struct r600_resource*)sctx->vertex_buffer[vb].buffer.resource, RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); } - if (!desc->buffer) + if (!sctx->vb_descriptors_buffer) return; radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - desc->buffer, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); + sctx->vb_descriptors_buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); } bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) { - struct si_vertex_element *velems = sctx->vertex_elements; - struct si_descriptors *desc = &sctx->vertex_buffers; + struct si_vertex_elements *velems = sctx->vertex_elements; unsigned i, count; unsigned desc_list_byte_size; unsigned first_vb_use_mask; - uint64_t va; uint32_t *ptr; if (!sctx->vertex_buffers_dirty || !velems) @@ -1032,54 +1035,56 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) u_upload_alloc(sctx->b.b.const_uploader, 0, desc_list_byte_size, si_optimal_tcc_alignment(sctx, desc_list_byte_size), - &desc->buffer_offset, - (struct pipe_resource**)&desc->buffer, (void**)&ptr); - if (!desc->buffer) + &sctx->vb_descriptors_offset, + (struct pipe_resource**)&sctx->vb_descriptors_buffer, + (void**)&ptr); + if (!sctx->vb_descriptors_buffer) { + sctx->vb_descriptors_offset = 0; + sctx->vb_descriptors_gpu_list = NULL; return false; + } + sctx->vb_descriptors_gpu_list = ptr; radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - desc->buffer, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); + sctx->vb_descriptors_buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); assert(count <= SI_MAX_ATTRIBS); for (i = 0; i < count; i++) { - struct pipe_vertex_element *ve = &velems->elements[i]; struct pipe_vertex_buffer *vb; struct r600_resource *rbuffer; - unsigned offset; - unsigned vbo_index = ve->vertex_buffer_index; + unsigned vbo_index = velems->vertex_buffer_index[i]; uint32_t *desc = &ptr[i*4]; vb = &sctx->vertex_buffer[vbo_index]; - rbuffer = (struct r600_resource*)vb->buffer; + rbuffer = (struct r600_resource*)vb->buffer.resource; if (!rbuffer) { memset(desc, 0, 16); continue; } - offset = vb->buffer_offset + ve->src_offset; - va = rbuffer->gpu_address + offset; - - /* Fill in T# buffer resource description */ - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(vb->stride); + int64_t offset = (int64_t)((int)vb->buffer_offset) + + velems->src_offset[i]; + uint64_t va = rbuffer->gpu_address + offset; + int64_t num_records = (int64_t)rbuffer->b.b.width0 - offset; if (sctx->b.chip_class != VI && vb->stride) { /* Round up by rounding down and adding 1 */ - desc[2] = (vb->buffer->width0 - offset - - velems->format_size[i]) / - vb->stride + 1; - } else { - desc[2] = vb->buffer->width0 - offset; + num_records = (num_records - velems->format_size[i]) / + vb->stride + 1; } + assert(num_records >= 0 && num_records <= UINT_MAX); + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | + S_008F04_STRIDE(vb->stride); + desc[2] = num_records; desc[3] = velems->rsrc_word3[i]; if (first_vb_use_mask & (1 << i)) { radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - (struct 
r600_resource*)vb->buffer, + (struct r600_resource*)vb->buffer.resource, RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); } } @@ -1088,11 +1093,10 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) * on performance (confirmed by testing). New descriptors are always * uploaded to a fresh new buffer, so I don't think flushing the const * cache is needed. */ - si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom); - if (sctx->b.chip_class >= CIK) - si_mark_atom_dirty(sctx, &sctx->prefetch_L2); + si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom); sctx->vertex_buffers_dirty = false; sctx->vertex_buffer_pointer_dirty = true; + sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; return true; } @@ -1100,16 +1104,16 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) /* CONSTANT BUFFERS */ static unsigned -si_const_buffer_descriptors_idx(unsigned shader) +si_const_and_shader_buffer_descriptors_idx(unsigned shader) { return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS + - SI_SHADER_DESCS_CONST_BUFFERS; + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS; } static struct si_descriptors * -si_const_buffer_descriptors(struct si_context *sctx, unsigned shader) +si_const_and_shader_buffer_descriptors(struct si_context *sctx, unsigned shader) { - return &sctx->descriptors[si_const_buffer_descriptors_idx(shader)]; + return &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(shader)]; } void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer, @@ -1180,8 +1184,8 @@ static void si_set_constant_buffer(struct si_context *sctx, buffers->buffers[slot] = buffer; radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx, (struct r600_resource*)buffer, - buffers->shader_usage, - buffers->priority, true); + buffers->shader_usage_constbuf, + buffers->priority_constbuf, true); buffers->enabled_mask |= 1u << slot; } else { /* Clear the descriptor. 
*/ @@ -1189,7 +1193,6 @@ static void si_set_constant_buffer(struct si_context *sctx, buffers->enabled_mask &= ~(1u << slot); } - descs->dirty_mask |= 1u << slot; sctx->descriptors_dirty |= 1u << descriptors_idx; } @@ -1209,8 +1212,15 @@ static void si_pipe_set_constant_buffer(struct pipe_context *ctx, if (shader >= SI_NUM_SHADERS) return; - si_set_constant_buffer(sctx, &sctx->const_buffers[shader], - si_const_buffer_descriptors_idx(shader), + if (slot == 0 && input && input->buffer && + !(r600_resource(input->buffer)->flags & RADEON_FLAG_32BIT)) { + assert(!"constant buffer 0 must have a 32-bit VM address, use const_uploader"); + return; + } + + slot = si_get_constbuf_slot(slot); + si_set_constant_buffer(sctx, &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors_idx(shader), slot, input); } @@ -1219,35 +1229,22 @@ void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, { cbuf->user_buffer = NULL; si_get_buffer_from_descriptors( - &sctx->const_buffers[shader], - si_const_buffer_descriptors(sctx, shader), - slot, &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size); + &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors(sctx, shader), + si_get_constbuf_slot(slot), + &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size); } /* SHADER BUFFERS */ -static unsigned -si_shader_buffer_descriptors_idx(enum pipe_shader_type shader) -{ - return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS + - SI_SHADER_DESCS_SHADER_BUFFERS; -} - -static struct si_descriptors * -si_shader_buffer_descriptors(struct si_context *sctx, - enum pipe_shader_type shader) -{ - return &sctx->descriptors[si_shader_buffer_descriptors_idx(shader)]; -} - static void si_set_shader_buffers(struct pipe_context *ctx, enum pipe_shader_type shader, unsigned start_slot, unsigned count, const struct pipe_shader_buffer *sbuffers) { struct si_context *sctx = (struct si_context *)ctx; - struct si_buffer_resources *buffers = &sctx->shader_buffers[shader]; - struct si_descriptors *descs = si_shader_buffer_descriptors(sctx, shader); + struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; + struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader); unsigned i; assert(start_slot + count <= SI_NUM_SHADER_BUFFERS); @@ -1255,7 +1252,7 @@ static void si_set_shader_buffers(struct pipe_context *ctx, for (i = 0; i < count; ++i) { const struct pipe_shader_buffer *sbuffer = sbuffers ? 
&sbuffers[i] : NULL; struct r600_resource *buf; - unsigned slot = start_slot + i; + unsigned slot = si_get_shaderbuf_slot(start_slot + i); uint32_t *desc = descs->list + slot * 4; uint64_t va; @@ -1263,9 +1260,8 @@ static void si_set_shader_buffers(struct pipe_context *ctx, pipe_resource_reference(&buffers->buffers[slot], NULL); memset(desc, 0, sizeof(uint32_t) * 4); buffers->enabled_mask &= ~(1u << slot); - descs->dirty_mask |= 1u << slot; sctx->descriptors_dirty |= - 1u << si_shader_buffer_descriptors_idx(shader); + 1u << si_const_and_shader_buffer_descriptors_idx(shader); continue; } @@ -1290,9 +1286,8 @@ static void si_set_shader_buffers(struct pipe_context *ctx, buf->bind_history |= PIPE_BIND_SHADER_BUFFER; buffers->enabled_mask |= 1u << slot; - descs->dirty_mask |= 1u << slot; sctx->descriptors_dirty |= - 1u << si_shader_buffer_descriptors_idx(shader); + 1u << si_const_and_shader_buffer_descriptors_idx(shader); util_range_add(&buf->valid_buffer_range, sbuffer->buffer_offset, sbuffer->buffer_offset + sbuffer->buffer_size); @@ -1304,12 +1299,13 @@ void si_get_shader_buffers(struct si_context *sctx, uint start_slot, uint count, struct pipe_shader_buffer *sbuf) { - struct si_buffer_resources *buffers = &sctx->shader_buffers[shader]; - struct si_descriptors *descs = si_shader_buffer_descriptors(sctx, shader); + struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; + struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader); for (unsigned i = 0; i < count; ++i) { si_get_buffer_from_descriptors( - buffers, descs, start_slot + i, + buffers, descs, + si_get_shaderbuf_slot(start_slot + i), &sbuf[i].buffer, &sbuf[i].buffer_offset, &sbuf[i].buffer_size); } @@ -1409,132 +1405,15 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint slot, buffers->enabled_mask &= ~(1u << slot); } - descs->dirty_mask |= 1u << slot; sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; } -/* STREAMOUT BUFFERS */ - -static void si_set_streamout_targets(struct pipe_context *ctx, - unsigned num_targets, - struct pipe_stream_output_target **targets, - const unsigned *offsets) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_buffer_resources *buffers = &sctx->rw_buffers; - struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; - unsigned old_num_targets = sctx->b.streamout.num_targets; - unsigned i, bufidx; - - /* We are going to unbind the buffers. Mark which caches need to be flushed. */ - if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) { - /* Since streamout uses vector writes which go through TC L2 - * and most other clients can use TC L2 as well, we don't need - * to flush it. - * - * The only cases which requires flushing it is VGT DMA index - * fetching (on <= CIK) and indirect draw data, which are rare - * cases. Thus, flag the TC L2 dirtiness in the resource and - * handle it at draw call time. - */ - for (i = 0; i < sctx->b.streamout.num_targets; i++) - if (sctx->b.streamout.targets[i]) - r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true; - - /* Invalidate the scalar cache in case a streamout buffer is - * going to be used as a constant buffer. - * - * Invalidate TC L1, because streamout bypasses it (done by - * setting GLC=1 in the store instruction), but it can contain - * outdated data of streamout buffers. - * - * VS_PARTIAL_FLUSH is required if the buffers are going to be - * used as an input immediately. 
- */ - sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_INV_VMEM_L1 | - SI_CONTEXT_VS_PARTIAL_FLUSH; - } - - /* All readers of the streamout targets need to be finished before we can - * start writing to the targets. - */ - if (num_targets) - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; - - /* Streamout buffers must be bound in 2 places: - * 1) in VGT by setting the VGT_STRMOUT registers - * 2) as shader resources - */ - - /* Set the VGT regs. */ - r600_set_streamout_targets(ctx, num_targets, targets, offsets); - - /* Set the shader resources.*/ - for (i = 0; i < num_targets; i++) { - bufidx = SI_VS_STREAMOUT_BUF0 + i; - - if (targets[i]) { - struct pipe_resource *buffer = targets[i]->buffer; - uint64_t va = r600_resource(buffer)->gpu_address; - - /* Set the descriptor. - * - * On VI, the format must be non-INVALID, otherwise - * the buffer will be considered not bound and store - * instructions will be no-ops. - */ - uint32_t *desc = descs->list + bufidx*4; - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); - desc[2] = 0xffffffff; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - - /* Set the resource. */ - pipe_resource_reference(&buffers->buffers[bufidx], - buffer); - radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx, - (struct r600_resource*)buffer, - buffers->shader_usage, - RADEON_PRIO_SHADER_RW_BUFFER, - true); - r600_resource(buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT; - - buffers->enabled_mask |= 1u << bufidx; - } else { - /* Clear the descriptor and unset the resource. */ - memset(descs->list + bufidx*4, 0, - sizeof(uint32_t) * 4); - pipe_resource_reference(&buffers->buffers[bufidx], - NULL); - buffers->enabled_mask &= ~(1u << bufidx); - } - descs->dirty_mask |= 1u << bufidx; - } - for (; i < old_num_targets; i++) { - bufidx = SI_VS_STREAMOUT_BUF0 + i; - /* Clear the descriptor and unset the resource. */ - memset(descs->list + bufidx*4, 0, sizeof(uint32_t) * 4); - pipe_resource_reference(&buffers->buffers[bufidx], NULL); - buffers->enabled_mask &= ~(1u << bufidx); - descs->dirty_mask |= 1u << bufidx; - } - - sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; -} - -static void si_desc_reset_buffer_offset(struct pipe_context *ctx, - uint32_t *desc, uint64_t old_buf_va, - struct pipe_resource *new_buf) +static void si_desc_reset_buffer_offset(struct pipe_context *ctx, + uint32_t *desc, uint64_t old_buf_va, + struct pipe_resource *new_buf) { /* Retrieve the buffer offset from the descriptor. 
*/ - uint64_t old_desc_va = - desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32); + uint64_t old_desc_va = si_desc_extract_buffer_address(desc); assert(old_buf_va <= old_desc_va); uint64_t offset_within_buffer = old_desc_va - old_buf_va; @@ -1565,17 +1444,59 @@ static void si_set_polygon_stipple(struct pipe_context *ctx, /* TEXTURE METADATA ENABLE/DISABLE */ +static void +si_resident_handles_update_needs_color_decompress(struct si_context *sctx) +{ + util_dynarray_clear(&sctx->resident_tex_needs_color_decompress); + util_dynarray_clear(&sctx->resident_img_needs_color_decompress); + + util_dynarray_foreach(&sctx->resident_tex_handles, + struct si_texture_handle *, tex_handle) { + struct pipe_resource *res = (*tex_handle)->view->texture; + struct r600_texture *rtex; + + if (!res || res->target == PIPE_BUFFER) + continue; + + rtex = (struct r600_texture *)res; + if (!color_needs_decompression(rtex)) + continue; + + util_dynarray_append(&sctx->resident_tex_needs_color_decompress, + struct si_texture_handle *, *tex_handle); + } + + util_dynarray_foreach(&sctx->resident_img_handles, + struct si_image_handle *, img_handle) { + struct pipe_image_view *view = &(*img_handle)->view; + struct pipe_resource *res = view->resource; + struct r600_texture *rtex; + + if (!res || res->target == PIPE_BUFFER) + continue; + + rtex = (struct r600_texture *)res; + if (!color_needs_decompression(rtex)) + continue; + + util_dynarray_append(&sctx->resident_img_needs_color_decompress, + struct si_image_handle *, *img_handle); + } +} + /* CMASK can be enabled (for fast clear) and disabled (for texture export) * while the texture is bound, possibly by a different context. In that case, - * call this function to update compressed_colortex_masks. + * call this function to update needs_*_decompress_masks. */ -void si_update_compressed_colortex_masks(struct si_context *sctx) +void si_update_needs_color_decompress_masks(struct si_context *sctx) { for (int i = 0; i < SI_NUM_SHADERS; ++i) { - si_samplers_update_compressed_colortex_mask(&sctx->samplers[i]); - si_images_update_compressed_colortex_mask(&sctx->images[i]); - si_update_compressed_tex_shader_mask(sctx, i); + si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]); + si_images_update_needs_color_decompress_mask(&sctx->images[i]); + si_update_shader_needs_decompress_mask(sctx, i); } + + si_resident_handles_update_needs_color_decompress(sctx); } /* BUFFER DISCARD/INVALIDATION */ @@ -1584,11 +1505,14 @@ void si_update_compressed_colortex_masks(struct si_context *sctx) static void si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_resources *buffers, unsigned descriptors_idx, + unsigned slot_mask, struct pipe_resource *buf, - uint64_t old_va) + uint64_t old_va, + enum radeon_bo_usage usage, + enum radeon_bo_priority priority) { struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; - unsigned mask = buffers->enabled_mask; + unsigned mask = buffers->enabled_mask & slot_mask; while (mask) { unsigned i = u_bit_scan(&mask); @@ -1596,36 +1520,24 @@ static void si_reset_buffer_resources(struct si_context *sctx, si_desc_reset_buffer_offset(&sctx->b.b, descs->list + i*4, old_va, buf); - descs->dirty_mask |= 1u << i; sctx->descriptors_dirty |= 1u << descriptors_idx; radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx, (struct r600_resource *)buf, - buffers->shader_usage, - buffers->priority, true); + usage, priority, true); } } } -/* Reallocate a buffer a update all resource bindings where the buffer is - * bound. 
- * - * This is used to avoid CPU-GPU synchronizations, because it makes the buffer - * idle by discarding its contents. Apps usually tell us when to do this using - * map_buffer flags, for example. - */ -static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf) +static void si_rebind_buffer(struct pipe_context *ctx, struct pipe_resource *buf, + uint64_t old_va) { struct si_context *sctx = (struct si_context*)ctx; struct r600_resource *rbuffer = r600_resource(buf); unsigned i, shader; - uint64_t old_va = rbuffer->gpu_address; unsigned num_elems = sctx->vertex_elements ? sctx->vertex_elements->count : 0; - /* Reallocate the buffer in the same pipe_resource. */ - r600_alloc_resource(&sctx->screen->b, rbuffer); - /* We changed the buffer, now we need to bind it where the old one * was bound. This consists of 2 things: * 1) Updating the resource descriptor and dirtying it. @@ -1635,14 +1547,14 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource /* Vertex buffers. */ if (rbuffer->bind_history & PIPE_BIND_VERTEX_BUFFER) { for (i = 0; i < num_elems; i++) { - int vb = sctx->vertex_elements->elements[i].vertex_buffer_index; + int vb = sctx->vertex_elements->vertex_buffer_index[i]; if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) continue; - if (!sctx->vertex_buffer[vb].buffer) + if (!sctx->vertex_buffer[vb].buffer.resource) continue; - if (sctx->vertex_buffer[vb].buffer == buf) { + if (sctx->vertex_buffer[vb].buffer.resource == buf) { sctx->vertex_buffers_dirty = true; break; } @@ -1661,7 +1573,6 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource si_desc_reset_buffer_offset(ctx, descs->list + i*4, old_va, buf); - descs->dirty_mask |= 1u << i; sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx, @@ -1670,47 +1581,54 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource true); /* Update the streamout state. */ - if (sctx->b.streamout.begin_emitted) - r600_emit_streamout_end(&sctx->b); - sctx->b.streamout.append_bitmask = - sctx->b.streamout.enabled_mask; - r600_streamout_buffers_dirty(&sctx->b); + if (sctx->streamout.begin_emitted) + si_emit_streamout_end(sctx); + sctx->streamout.append_bitmask = + sctx->streamout.enabled_mask; + si_streamout_buffers_dirty(sctx); } } /* Constant and shader buffers. 
*/ if (rbuffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) { for (shader = 0; shader < SI_NUM_SHADERS; shader++) - si_reset_buffer_resources(sctx, &sctx->const_buffers[shader], - si_const_buffer_descriptors_idx(shader), - buf, old_va); + si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors_idx(shader), + u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS), + buf, old_va, + sctx->const_and_shader_buffers[shader].shader_usage_constbuf, + sctx->const_and_shader_buffers[shader].priority_constbuf); } if (rbuffer->bind_history & PIPE_BIND_SHADER_BUFFER) { for (shader = 0; shader < SI_NUM_SHADERS; shader++) - si_reset_buffer_resources(sctx, &sctx->shader_buffers[shader], - si_shader_buffer_descriptors_idx(shader), - buf, old_va); + si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors_idx(shader), + u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS), + buf, old_va, + sctx->const_and_shader_buffers[shader].shader_usage, + sctx->const_and_shader_buffers[shader].priority); } if (rbuffer->bind_history & PIPE_BIND_SAMPLER_VIEW) { /* Texture buffers - update bindings. */ for (shader = 0; shader < SI_NUM_SHADERS; shader++) { - struct si_sampler_views *views = &sctx->samplers[shader].views; + struct si_samplers *samplers = &sctx->samplers[shader]; struct si_descriptors *descs = - si_sampler_descriptors(sctx, shader); - unsigned mask = views->enabled_mask; + si_sampler_and_image_descriptors(sctx, shader); + unsigned mask = samplers->enabled_mask; while (mask) { unsigned i = u_bit_scan(&mask); - if (views->views[i]->texture == buf) { + if (samplers->views[i]->texture == buf) { + unsigned desc_slot = si_get_sampler_slot(i); + si_desc_reset_buffer_offset(ctx, descs->list + - i * 16 + 4, + desc_slot * 16 + 4, old_va, buf); - descs->dirty_mask |= 1u << i; sctx->descriptors_dirty |= - 1u << si_sampler_descriptors_idx(shader); + 1u << si_sampler_and_image_descriptors_idx(shader); radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx, rbuffer, RADEON_USAGE_READ, @@ -1724,24 +1642,25 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource /* Shader images */ if (rbuffer->bind_history & PIPE_BIND_SHADER_IMAGE) { for (shader = 0; shader < SI_NUM_SHADERS; ++shader) { - struct si_images_info *images = &sctx->images[shader]; + struct si_images *images = &sctx->images[shader]; struct si_descriptors *descs = - si_image_descriptors(sctx, shader); + si_sampler_and_image_descriptors(sctx, shader); unsigned mask = images->enabled_mask; while (mask) { unsigned i = u_bit_scan(&mask); if (images->views[i].resource == buf) { + unsigned desc_slot = si_get_image_slot(i); + if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE) si_mark_image_range_valid(&images->views[i]); si_desc_reset_buffer_offset( - ctx, descs->list + i * 8 + 4, + ctx, descs->list + desc_slot * 8 + 4, old_va, buf); - descs->dirty_mask |= 1u << i; sctx->descriptors_dirty |= - 1u << si_image_descriptors_idx(shader); + 1u << si_sampler_and_image_descriptors_idx(shader); radeon_add_to_buffer_list_check_mem( &sctx->b, &sctx->b.gfx, rbuffer, @@ -1751,6 +1670,204 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource } } } + + /* Bindless texture handles */ + if (rbuffer->texture_handle_allocated) { + struct si_descriptors *descs = &sctx->bindless_descriptors; + + util_dynarray_foreach(&sctx->resident_tex_handles, + struct si_texture_handle *, tex_handle) { + struct pipe_sampler_view 
*view = (*tex_handle)->view;
+			unsigned desc_slot = (*tex_handle)->desc_slot;
+
+			if (view->texture == buf) {
+				si_set_buf_desc_address(rbuffer,
+							view->u.buf.offset,
+							descs->list +
+							desc_slot * 16 + 4);
+
+				(*tex_handle)->desc_dirty = true;
+				sctx->bindless_descriptors_dirty = true;
+
+				radeon_add_to_buffer_list_check_mem(
+					&sctx->b, &sctx->b.gfx, rbuffer,
+					RADEON_USAGE_READ,
+					RADEON_PRIO_SAMPLER_BUFFER, true);
+			}
+		}
+	}
+
+	/* Bindless image handles */
+	if (rbuffer->image_handle_allocated) {
+		struct si_descriptors *descs = &sctx->bindless_descriptors;
+
+		util_dynarray_foreach(&sctx->resident_img_handles,
+				      struct si_image_handle *, img_handle) {
+			struct pipe_image_view *view = &(*img_handle)->view;
+			unsigned desc_slot = (*img_handle)->desc_slot;
+
+			if (view->resource == buf) {
+				if (view->access & PIPE_IMAGE_ACCESS_WRITE)
+					si_mark_image_range_valid(view);
+
+				si_set_buf_desc_address(rbuffer,
+							view->u.buf.offset,
+							descs->list +
+							desc_slot * 16 + 4);
+
+				(*img_handle)->desc_dirty = true;
+				sctx->bindless_descriptors_dirty = true;
+
+				radeon_add_to_buffer_list_check_mem(
+					&sctx->b, &sctx->b.gfx, rbuffer,
+					RADEON_USAGE_READWRITE,
+					RADEON_PRIO_SAMPLER_BUFFER, true);
+			}
+		}
+	}
+}
+
+/* Reallocate a buffer and update all resource bindings where the buffer is
+ * bound.
+ *
+ * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
+ * idle by discarding its contents. Apps usually tell us when to do this using
+ * map_buffer flags, for example.
+ */
+static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	struct r600_resource *rbuffer = r600_resource(buf);
+	uint64_t old_va = rbuffer->gpu_address;
+
+	/* Reallocate the buffer in the same pipe_resource. */
+	si_alloc_resource(sctx->screen, rbuffer);
+
+	si_rebind_buffer(ctx, buf, old_va);
+}
+
+static void si_upload_bindless_descriptor(struct si_context *sctx,
+					  unsigned desc_slot,
+					  unsigned num_dwords)
+{
+	struct si_descriptors *desc = &sctx->bindless_descriptors;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+	unsigned desc_slot_offset = desc_slot * 16;
+	uint32_t *data;
+	uint64_t va;
+
+	data = desc->list + desc_slot_offset;
+	va = desc->gpu_address + desc_slot_offset * 4;
+
+	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + num_dwords, 0));
+	radeon_emit(cs, S_370_DST_SEL(V_370_TC_L2) |
+		    S_370_WR_CONFIRM(1) |
+		    S_370_ENGINE_SEL(V_370_ME));
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+	radeon_emit_array(cs, data, num_dwords);
+}
+
+static void si_upload_bindless_descriptors(struct si_context *sctx)
+{
+	if (!sctx->bindless_descriptors_dirty)
+		return;
+
+	/* Wait for graphics/compute to be idle before updating the resident
+	 * descriptors directly in memory, in case the GPU is using them.
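+	 * A full graphics+compute stall is a heavy hammer; the assumption
+	 * is that resident descriptors change rarely compared to the normal
+	 * descriptor path, which uploads into a freshly suballocated buffer
+	 * and therefore never needs this wait.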
+	 */
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+			 SI_CONTEXT_CS_PARTIAL_FLUSH;
+	si_emit_cache_flush(sctx);
+
+	util_dynarray_foreach(&sctx->resident_tex_handles,
+			      struct si_texture_handle *, tex_handle) {
+		unsigned desc_slot = (*tex_handle)->desc_slot;
+
+		if (!(*tex_handle)->desc_dirty)
+			continue;
+
+		si_upload_bindless_descriptor(sctx, desc_slot, 16);
+		(*tex_handle)->desc_dirty = false;
+	}
+
+	util_dynarray_foreach(&sctx->resident_img_handles,
+			      struct si_image_handle *, img_handle) {
+		unsigned desc_slot = (*img_handle)->desc_slot;
+
+		if (!(*img_handle)->desc_dirty)
+			continue;
+
+		si_upload_bindless_descriptor(sctx, desc_slot, 8);
+		(*img_handle)->desc_dirty = false;
+	}
+
+	/* Invalidate L1 because it doesn't know that L2 changed. */
+	sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1;
+	si_emit_cache_flush(sctx);
+
+	sctx->bindless_descriptors_dirty = false;
+}
+
+/* Update the mutable image descriptor fields of a single resident texture. */
+static void si_update_bindless_texture_descriptor(struct si_context *sctx,
+						  struct si_texture_handle *tex_handle)
+{
+	struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view;
+	struct si_descriptors *desc = &sctx->bindless_descriptors;
+	unsigned desc_slot_offset = tex_handle->desc_slot * 16;
+	uint32_t desc_list[16];
+
+	if (sview->base.texture->target == PIPE_BUFFER)
+		return;
+
+	memcpy(desc_list, desc->list + desc_slot_offset, sizeof(desc_list));
+	si_set_sampler_view_desc(sctx, sview, &tex_handle->sstate,
+				 desc->list + desc_slot_offset);
+
+	if (memcmp(desc_list, desc->list + desc_slot_offset,
+		   sizeof(desc_list))) {
+		tex_handle->desc_dirty = true;
+		sctx->bindless_descriptors_dirty = true;
+	}
+}
+
+static void si_update_bindless_image_descriptor(struct si_context *sctx,
+						struct si_image_handle *img_handle)
+{
+	struct si_descriptors *desc = &sctx->bindless_descriptors;
+	unsigned desc_slot_offset = img_handle->desc_slot * 16;
+	struct pipe_image_view *view = &img_handle->view;
+	uint32_t desc_list[8];
+
+	if (view->resource->target == PIPE_BUFFER)
+		return;
+
+	memcpy(desc_list, desc->list + desc_slot_offset,
+	       sizeof(desc_list));
+	si_set_shader_image_desc(sctx, view, true,
+				 desc->list + desc_slot_offset);
+
+	if (memcmp(desc_list, desc->list + desc_slot_offset,
+		   sizeof(desc_list))) {
+		img_handle->desc_dirty = true;
+		sctx->bindless_descriptors_dirty = true;
+	}
+}
+
+static void si_update_all_resident_texture_descriptors(struct si_context *sctx)
+{
+	util_dynarray_foreach(&sctx->resident_tex_handles,
+			      struct si_texture_handle *, tex_handle) {
+		si_update_bindless_texture_descriptor(sctx, *tex_handle);
+	}
+
+	util_dynarray_foreach(&sctx->resident_img_handles,
+			      struct si_image_handle *, img_handle) {
+		si_update_bindless_image_descriptor(sctx, *img_handle);
+	}
+
+	si_upload_bindless_descriptors(sctx);
 }
 
 /* Update mutable image descriptor fields of all bound textures. */
@@ -1759,8 +1876,8 @@ void si_update_all_texture_descriptors(struct si_context *sctx)
 	unsigned shader;
 
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
-		struct si_sampler_views *samplers = &sctx->samplers[shader].views;
-		struct si_images_info *images = &sctx->images[shader];
+		struct si_samplers *samplers = &sctx->samplers[shader];
+		struct si_images *images = &sctx->images[shader];
 		unsigned mask;
 
 		/* Images.
*/ @@ -1791,8 +1908,10 @@ void si_update_all_texture_descriptors(struct si_context *sctx) samplers->views[i], true); } - si_update_compressed_tex_shader_mask(sctx, shader); + si_update_shader_needs_decompress_mask(sctx, shader); } + + si_update_all_resident_texture_descriptors(sctx); } /* SHADER USER DATA */ @@ -1805,16 +1924,18 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, SI_NUM_SHADER_DESCS); if (shader == PIPE_SHADER_VERTEX) - sctx->vertex_buffer_pointer_dirty = sctx->vertex_buffers.buffer != NULL; + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; - si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom); + si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom); } -static void si_shader_userdata_begin_new_cs(struct si_context *sctx) +static void si_shader_pointers_begin_new_cs(struct si_context *sctx) { sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS); - sctx->vertex_buffer_pointer_dirty = sctx->vertex_buffers.buffer != NULL; - si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom); + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom); + sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; + sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; } /* Set a base register address for user data constants in the given shader. @@ -1823,13 +1944,17 @@ static void si_shader_userdata_begin_new_cs(struct si_context *sctx) static void si_set_user_data_base(struct si_context *sctx, unsigned shader, uint32_t new_base) { - uint32_t *base = &sctx->shader_userdata.sh_base[shader]; + uint32_t *base = &sctx->shader_pointers.sh_base[shader]; if (*base != new_base) { *base = new_base; - if (new_base) + if (new_base) { si_mark_shader_pointers_dirty(sctx, shader); + + if (shader == PIPE_SHADER_VERTEX) + sctx->last_vs_state = ~0; + } } } @@ -1871,89 +1996,613 @@ void si_shader_change_notify(struct si_context *sctx) } } +static void si_emit_shader_pointer_head(struct radeon_winsys_cs *cs, + unsigned sh_offset, + unsigned pointer_count) +{ + radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count * (HAVE_32BIT_POINTERS ? 
1 : 2), 0)); + radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2); +} + +static void si_emit_shader_pointer_body(struct si_screen *sscreen, + struct radeon_winsys_cs *cs, + uint64_t va) +{ + radeon_emit(cs, va); + + if (HAVE_32BIT_POINTERS) + assert((va >> 32) == sscreen->info.address32_hi); + else + radeon_emit(cs, va >> 32); +} + static void si_emit_shader_pointer(struct si_context *sctx, struct si_descriptors *desc, unsigned sh_base) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; - uint64_t va; + unsigned sh_offset = sh_base + desc->shader_userdata_offset; - assert(desc->buffer); + si_emit_shader_pointer_head(cs, sh_offset, 1); + si_emit_shader_pointer_body(sctx->screen, cs, desc->gpu_address); +} - va = desc->buffer->gpu_address + - desc->buffer_offset; +static void si_emit_consecutive_shader_pointers(struct si_context *sctx, + unsigned pointer_mask, + unsigned sh_base) +{ + if (!sh_base) + return; - radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0)); - radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; + unsigned mask = sctx->shader_pointers_dirty & pointer_mask; + + while (mask) { + int start, count; + u_bit_scan_consecutive_range(&mask, &start, &count); + + struct si_descriptors *descs = &sctx->descriptors[start]; + unsigned sh_offset = sh_base + descs->shader_userdata_offset; + + si_emit_shader_pointer_head(cs, sh_offset, count); + for (int i = 0; i < count; i++) + si_emit_shader_pointer_body(sctx->screen, cs, + descs[i].gpu_address); + } } -void si_emit_graphics_shader_userdata(struct si_context *sctx, - struct r600_atom *atom) +static void si_emit_disjoint_shader_pointers(struct si_context *sctx, + unsigned pointer_mask, + unsigned sh_base) { - unsigned mask; - uint32_t *sh_base = sctx->shader_userdata.sh_base; - struct si_descriptors *descs; + if (!sh_base) + return; - descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; + unsigned mask = sctx->shader_pointers_dirty & pointer_mask; - if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) { - si_emit_shader_pointer(sctx, descs, - R_00B030_SPI_SHADER_USER_DATA_PS_0); - si_emit_shader_pointer(sctx, descs, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - si_emit_shader_pointer(sctx, descs, - R_00B330_SPI_SHADER_USER_DATA_ES_0); + while (mask) { + struct si_descriptors *descs = &sctx->descriptors[u_bit_scan(&mask)]; + unsigned sh_offset = sh_base + descs->shader_userdata_offset; - /* GFX9 merged LS-HS and ES-GS. Only set RW_BUFFERS for ES and LS. */ - if (sctx->b.chip_class >= GFX9) { - si_emit_shader_pointer(sctx, descs, - R_00B430_SPI_SHADER_USER_DATA_LS_0); - } else { - si_emit_shader_pointer(sctx, descs, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - si_emit_shader_pointer(sctx, descs, - R_00B430_SPI_SHADER_USER_DATA_HS_0); - } + si_emit_shader_pointer_head(cs, sh_offset, 1); + si_emit_shader_pointer_body(sctx->screen, cs, descs->gpu_address); + } +} + +static void si_emit_global_shader_pointers(struct si_context *sctx, + struct si_descriptors *descs) +{ + if (sctx->b.chip_class == GFX9) { + /* Broadcast it to all shader stages. 
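+		 * One SET_SH_REG to the COMMON address presumably reaches
+		 * every stage at once, replacing the six per-stage writes
+		 * used on older chips below.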
*/ + si_emit_shader_pointer(sctx, descs, + R_00B530_SPI_SHADER_USER_DATA_COMMON_0); + return; } - mask = sctx->shader_pointers_dirty & - u_bit_consecutive(SI_DESCS_FIRST_SHADER, - SI_DESCS_FIRST_COMPUTE - SI_DESCS_FIRST_SHADER); + si_emit_shader_pointer(sctx, descs, + R_00B030_SPI_SHADER_USER_DATA_PS_0); + si_emit_shader_pointer(sctx, descs, + R_00B130_SPI_SHADER_USER_DATA_VS_0); + si_emit_shader_pointer(sctx, descs, + R_00B330_SPI_SHADER_USER_DATA_ES_0); + si_emit_shader_pointer(sctx, descs, + R_00B230_SPI_SHADER_USER_DATA_GS_0); + si_emit_shader_pointer(sctx, descs, + R_00B430_SPI_SHADER_USER_DATA_HS_0); + si_emit_shader_pointer(sctx, descs, + R_00B530_SPI_SHADER_USER_DATA_LS_0); +} + +void si_emit_graphics_shader_pointers(struct si_context *sctx, + struct r600_atom *atom) +{ + uint32_t *sh_base = sctx->shader_pointers.sh_base; - while (mask) { - unsigned i = u_bit_scan(&mask); - unsigned shader = (i - SI_DESCS_FIRST_SHADER) / SI_NUM_SHADER_DESCS; - unsigned base = sh_base[shader]; + if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) { + si_emit_global_shader_pointers(sctx, + &sctx->descriptors[SI_DESCS_RW_BUFFERS]); + } - if (base) - si_emit_shader_pointer(sctx, descs + i, base); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX), + sh_base[PIPE_SHADER_VERTEX]); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL), + sh_base[PIPE_SHADER_TESS_EVAL]); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT), + sh_base[PIPE_SHADER_FRAGMENT]); + if (HAVE_32BIT_POINTERS || sctx->b.chip_class <= VI) { + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL), + sh_base[PIPE_SHADER_TESS_CTRL]); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY), + sh_base[PIPE_SHADER_GEOMETRY]); + } else { + si_emit_disjoint_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL), + sh_base[PIPE_SHADER_TESS_CTRL]); + si_emit_disjoint_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY), + sh_base[PIPE_SHADER_GEOMETRY]); } + sctx->shader_pointers_dirty &= ~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE); if (sctx->vertex_buffer_pointer_dirty) { - si_emit_shader_pointer(sctx, &sctx->vertex_buffers, - sh_base[PIPE_SHADER_VERTEX]); + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; + + /* Find the location of the VB descriptor pointer. */ + /* TODO: In the future, the pointer will be packed in unused + * bits of the first 2 VB descriptors. 
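+		 * (Presumably the two 32-bit halves would live in unused
+		 * dwords of those descriptors and be reassembled in the
+		 * shader, freeing this user-SGPR slot; speculative reading
+		 * of the TODO above.)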
*/ + unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR; + if (sctx->b.chip_class >= GFX9) { + if (sctx->tes_shader.cso) + sh_dw_offset = GFX9_TCS_NUM_USER_SGPR; + else if (sctx->gs_shader.cso) + sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR; + } + + unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4; + si_emit_shader_pointer_head(cs, sh_offset, 1); + si_emit_shader_pointer_body(sctx->screen, cs, + sctx->vb_descriptors_buffer->gpu_address + + sctx->vb_descriptors_offset); sctx->vertex_buffer_pointer_dirty = false; } + + if (sctx->graphics_bindless_pointer_dirty) { + si_emit_global_shader_pointers(sctx, + &sctx->bindless_descriptors); + sctx->graphics_bindless_pointer_dirty = false; + } } -void si_emit_compute_shader_userdata(struct si_context *sctx) +void si_emit_compute_shader_pointers(struct si_context *sctx) { unsigned base = R_00B900_COMPUTE_USER_DATA_0; - struct si_descriptors *descs = sctx->descriptors; - unsigned compute_mask = - u_bit_consecutive(SI_DESCS_FIRST_COMPUTE, SI_NUM_SHADER_DESCS); - unsigned mask = sctx->shader_pointers_dirty & compute_mask; - while (mask) { - unsigned i = u_bit_scan(&mask); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE), + R_00B900_COMPUTE_USER_DATA_0); + sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE); + + if (sctx->compute_bindless_pointer_dirty) { + si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base); + sctx->compute_bindless_pointer_dirty = false; + } +} + +/* BINDLESS */ + +static void si_init_bindless_descriptors(struct si_context *sctx, + struct si_descriptors *desc, + short shader_userdata_rel_index, + unsigned num_elements) +{ + MAYBE_UNUSED unsigned desc_slot; + + si_init_descriptors(desc, shader_userdata_rel_index, 16, num_elements); + sctx->bindless_descriptors.num_active_slots = num_elements; - si_emit_shader_pointer(sctx, descs + i, base); + /* The first bindless descriptor is stored at slot 1, because 0 is not + * considered to be a valid handle. + */ + sctx->num_bindless_descriptors = 1; + + /* Track which bindless slots are used (or not). */ + util_idalloc_init(&sctx->bindless_used_slots); + util_idalloc_resize(&sctx->bindless_used_slots, num_elements); + + /* Reserve slot 0 because it's an invalid handle for bindless. */ + desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); + assert(desc_slot == 0); +} + +static void si_release_bindless_descriptors(struct si_context *sctx) +{ + si_release_descriptors(&sctx->bindless_descriptors); + util_idalloc_fini(&sctx->bindless_used_slots); +} + +static unsigned si_get_first_free_bindless_slot(struct si_context *sctx) +{ + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot; + + desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); + if (desc_slot >= desc->num_elements) { + /* The array of bindless descriptors is full, resize it. */ + unsigned slot_size = desc->element_dw_size * 4; + unsigned new_num_elements = desc->num_elements * 2; + + desc->list = REALLOC(desc->list, desc->num_elements * slot_size, + new_num_elements * slot_size); + desc->num_elements = new_num_elements; + desc->num_active_slots = new_num_elements; + } + + assert(desc_slot); + return desc_slot; +} + +static unsigned +si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list, + unsigned size) +{ + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot, desc_slot_offset; + + /* Find a free slot. 
*/ + desc_slot = si_get_first_free_bindless_slot(sctx); + + /* For simplicity, sampler and image bindless descriptors use fixed + * 16-dword slots for now. Image descriptors only need 8-dword but this + * doesn't really matter because no real apps use image handles. + */ + desc_slot_offset = desc_slot * 16; + + /* Copy the descriptor into the array. */ + memcpy(desc->list + desc_slot_offset, desc_list, size); + + /* Re-upload the whole array of bindless descriptors into a new buffer. + */ + if (!si_upload_descriptors(sctx, desc)) + return 0; + + /* Make sure to re-emit the shader pointers for all stages. */ + sctx->graphics_bindless_pointer_dirty = true; + sctx->compute_bindless_pointer_dirty = true; + + return desc_slot; +} + +static void si_update_bindless_buffer_descriptor(struct si_context *sctx, + unsigned desc_slot, + struct pipe_resource *resource, + uint64_t offset, + bool *desc_dirty) +{ + struct si_descriptors *desc = &sctx->bindless_descriptors; + struct r600_resource *buf = r600_resource(resource); + unsigned desc_slot_offset = desc_slot * 16; + uint32_t *desc_list = desc->list + desc_slot_offset + 4; + uint64_t old_desc_va; + + assert(resource->target == PIPE_BUFFER); + + /* Retrieve the old buffer addr from the descriptor. */ + old_desc_va = si_desc_extract_buffer_address(desc_list); + + if (old_desc_va != buf->gpu_address + offset) { + /* The buffer has been invalidated when the handle wasn't + * resident, update the descriptor and the dirty flag. + */ + si_set_buf_desc_address(buf, offset, &desc_list[0]); + + *desc_dirty = true; + } +} + +static uint64_t si_create_texture_handle(struct pipe_context *ctx, + struct pipe_sampler_view *view, + const struct pipe_sampler_state *state) +{ + struct si_sampler_view *sview = (struct si_sampler_view *)view; + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture_handle *tex_handle; + struct si_sampler_state *sstate; + uint32_t desc_list[16]; + uint64_t handle; + + tex_handle = CALLOC_STRUCT(si_texture_handle); + if (!tex_handle) + return 0; + + memset(desc_list, 0, sizeof(desc_list)); + si_init_descriptor_list(&desc_list[0], 16, 1, null_texture_descriptor); + + sstate = ctx->create_sampler_state(ctx, state); + if (!sstate) { + FREE(tex_handle); + return 0; + } + + si_set_sampler_view_desc(sctx, sview, sstate, &desc_list[0]); + memcpy(&tex_handle->sstate, sstate, sizeof(*sstate)); + ctx->delete_sampler_state(ctx, sstate); + + tex_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, + sizeof(desc_list)); + if (!tex_handle->desc_slot) { + FREE(tex_handle); + return 0; + } + + handle = tex_handle->desc_slot; + + if (!_mesa_hash_table_insert(sctx->tex_handles, + (void *)(uintptr_t)handle, + tex_handle)) { + FREE(tex_handle); + return 0; + } + + pipe_sampler_view_reference(&tex_handle->view, view); + + r600_resource(sview->base.texture)->texture_handle_allocated = true; + + return handle; +} + +static void si_delete_texture_handle(struct pipe_context *ctx, uint64_t handle) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture_handle *tex_handle; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->tex_handles, + (void *)(uintptr_t)handle); + if (!entry) + return; + + tex_handle = (struct si_texture_handle *)entry->data; + + /* Allow this descriptor slot to be re-used. 
*/ + util_idalloc_free(&sctx->bindless_used_slots, tex_handle->desc_slot); + + pipe_sampler_view_reference(&tex_handle->view, NULL); + _mesa_hash_table_remove(sctx->tex_handles, entry); + FREE(tex_handle); +} + +static void si_make_texture_handle_resident(struct pipe_context *ctx, + uint64_t handle, bool resident) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture_handle *tex_handle; + struct si_sampler_view *sview; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->tex_handles, + (void *)(uintptr_t)handle); + if (!entry) + return; + + tex_handle = (struct si_texture_handle *)entry->data; + sview = (struct si_sampler_view *)tex_handle->view; + + if (resident) { + if (sview->base.texture->target != PIPE_BUFFER) { + struct r600_texture *rtex = + (struct r600_texture *)sview->base.texture; + + if (depth_needs_decompression(rtex)) { + util_dynarray_append( + &sctx->resident_tex_needs_depth_decompress, + struct si_texture_handle *, + tex_handle); + } + + if (color_needs_decompression(rtex)) { + util_dynarray_append( + &sctx->resident_tex_needs_color_decompress, + struct si_texture_handle *, + tex_handle); + } + + if (rtex->dcc_offset && + p_atomic_read(&rtex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + + si_update_bindless_texture_descriptor(sctx, tex_handle); + } else { + si_update_bindless_buffer_descriptor(sctx, + tex_handle->desc_slot, + sview->base.texture, + sview->base.u.buf.offset, + &tex_handle->desc_dirty); + } + + /* Re-upload the descriptor if it has been updated while it + * wasn't resident. + */ + if (tex_handle->desc_dirty) + sctx->bindless_descriptors_dirty = true; + + /* Add the texture handle to the per-context list. */ + util_dynarray_append(&sctx->resident_tex_handles, + struct si_texture_handle *, tex_handle); + + /* Add the buffers to the current CS in case si_begin_new_cs() + * is not going to be called. + */ + si_sampler_view_add_buffer(sctx, sview->base.texture, + RADEON_USAGE_READ, + sview->is_stencil_sampler, false); + } else { + /* Remove the texture handle from the per-context list. 
*/ + util_dynarray_delete_unordered(&sctx->resident_tex_handles, + struct si_texture_handle *, + tex_handle); + + if (sview->base.texture->target != PIPE_BUFFER) { + util_dynarray_delete_unordered( + &sctx->resident_tex_needs_depth_decompress, + struct si_texture_handle *, tex_handle); + + util_dynarray_delete_unordered( + &sctx->resident_tex_needs_color_decompress, + struct si_texture_handle *, tex_handle); + } + } +} + +static uint64_t si_create_image_handle(struct pipe_context *ctx, + const struct pipe_image_view *view) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_image_handle *img_handle; + uint32_t desc_list[8]; + uint64_t handle; + + if (!view || !view->resource) + return 0; + + img_handle = CALLOC_STRUCT(si_image_handle); + if (!img_handle) + return 0; + + memset(desc_list, 0, sizeof(desc_list)); + si_init_descriptor_list(&desc_list[0], 8, 1, null_image_descriptor); + + si_set_shader_image_desc(sctx, view, false, &desc_list[0]); + + img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, + sizeof(desc_list)); + if (!img_handle->desc_slot) { + FREE(img_handle); + return 0; + } + + handle = img_handle->desc_slot; + + if (!_mesa_hash_table_insert(sctx->img_handles, + (void *)(uintptr_t)handle, + img_handle)) { + FREE(img_handle); + return 0; + } + + util_copy_image_view(&img_handle->view, view); + + r600_resource(view->resource)->image_handle_allocated = true; + + return handle; +} + +static void si_delete_image_handle(struct pipe_context *ctx, uint64_t handle) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_image_handle *img_handle; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->img_handles, + (void *)(uintptr_t)handle); + if (!entry) + return; + + img_handle = (struct si_image_handle *)entry->data; + + util_copy_image_view(&img_handle->view, NULL); + _mesa_hash_table_remove(sctx->img_handles, entry); + FREE(img_handle); +} + +static void si_make_image_handle_resident(struct pipe_context *ctx, + uint64_t handle, unsigned access, + bool resident) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_image_handle *img_handle; + struct pipe_image_view *view; + struct r600_resource *res; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->img_handles, + (void *)(uintptr_t)handle); + if (!entry) + return; + + img_handle = (struct si_image_handle *)entry->data; + view = &img_handle->view; + res = (struct r600_resource *)view->resource; + + if (resident) { + if (res->b.b.target != PIPE_BUFFER) { + struct r600_texture *rtex = (struct r600_texture *)res; + unsigned level = view->u.tex.level; + + if (color_needs_decompression(rtex)) { + util_dynarray_append( + &sctx->resident_img_needs_color_decompress, + struct si_image_handle *, + img_handle); + } + + if (vi_dcc_enabled(rtex, level) && + p_atomic_read(&rtex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + + si_update_bindless_image_descriptor(sctx, img_handle); + } else { + si_update_bindless_buffer_descriptor(sctx, + img_handle->desc_slot, + view->resource, + view->u.buf.offset, + &img_handle->desc_dirty); + } + + /* Re-upload the descriptor if it has been updated while it + * wasn't resident. + */ + if (img_handle->desc_dirty) + sctx->bindless_descriptors_dirty = true; + + /* Add the image handle to the per-context list. 
*/ + util_dynarray_append(&sctx->resident_img_handles, + struct si_image_handle *, img_handle); + + /* Add the buffers to the current CS in case si_begin_new_cs() + * is not going to be called. + */ + si_sampler_view_add_buffer(sctx, view->resource, + (access & PIPE_IMAGE_ACCESS_WRITE) ? + RADEON_USAGE_READWRITE : + RADEON_USAGE_READ, false, false); + } else { + /* Remove the image handle from the per-context list. */ + util_dynarray_delete_unordered(&sctx->resident_img_handles, + struct si_image_handle *, + img_handle); + + if (res->b.b.target != PIPE_BUFFER) { + util_dynarray_delete_unordered( + &sctx->resident_img_needs_color_decompress, + struct si_image_handle *, + img_handle); + } + } +} + + +void si_all_resident_buffers_begin_new_cs(struct si_context *sctx) +{ + unsigned num_resident_tex_handles, num_resident_img_handles; + + num_resident_tex_handles = sctx->resident_tex_handles.size / + sizeof(struct si_texture_handle *); + num_resident_img_handles = sctx->resident_img_handles.size / + sizeof(struct si_image_handle *); + + /* Add all resident texture handles. */ + util_dynarray_foreach(&sctx->resident_tex_handles, + struct si_texture_handle *, tex_handle) { + struct si_sampler_view *sview = + (struct si_sampler_view *)(*tex_handle)->view; + + si_sampler_view_add_buffer(sctx, sview->base.texture, + RADEON_USAGE_READ, + sview->is_stencil_sampler, false); } - sctx->shader_pointers_dirty &= ~compute_mask; + + /* Add all resident image handles. */ + util_dynarray_foreach(&sctx->resident_img_handles, + struct si_image_handle *, img_handle) { + struct pipe_image_view *view = &(*img_handle)->view; + + si_sampler_view_add_buffer(sctx, view->resource, + RADEON_USAGE_READWRITE, + false, false); + } + + sctx->b.num_resident_handles += num_resident_tex_handles + + num_resident_img_handles; } /* INIT/DEINIT/UPLOAD */ @@ -1961,56 +2610,84 @@ void si_emit_compute_shader_userdata(struct si_context *sctx) void si_init_all_descriptors(struct si_context *sctx) { int i; - unsigned ce_offset = 0; + +#if !HAVE_32BIT_POINTERS + STATIC_ASSERT(GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES % 2 == 0); +#endif for (i = 0; i < SI_NUM_SHADERS; i++) { - /* GFX9 has only 4KB of CE, while previous chips had 32KB. - * Rarely used descriptors don't use CE RAM. - */ - bool big_ce = sctx->b.chip_class <= VI; - bool images_use_ce = big_ce; - bool shaderbufs_use_ce = big_ce || - i == PIPE_SHADER_COMPUTE; - bool samplers_use_ce = big_ce || - i == PIPE_SHADER_FRAGMENT; - - si_init_buffer_resources(&sctx->const_buffers[i], - si_const_buffer_descriptors(sctx, i), - SI_NUM_CONST_BUFFERS, SI_SGPR_CONST_BUFFERS, - RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER, - &ce_offset); - si_init_buffer_resources(&sctx->shader_buffers[i], - si_shader_buffer_descriptors(sctx, i), - SI_NUM_SHADER_BUFFERS, SI_SGPR_SHADER_BUFFERS, - RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RW_BUFFER, - shaderbufs_use_ce ? &ce_offset : NULL); - - si_init_descriptors(si_sampler_descriptors(sctx, i), - SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS, - null_texture_descriptor, - samplers_use_ce ? &ce_offset : NULL); - - si_init_descriptors(si_image_descriptors(sctx, i), - SI_SGPR_IMAGES, 8, SI_NUM_IMAGES, - null_image_descriptor, - images_use_ce ? 
&ce_offset : NULL); + bool is_2nd = sctx->b.chip_class >= GFX9 && + (i == PIPE_SHADER_TESS_CTRL || + i == PIPE_SHADER_GEOMETRY); + unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS; + unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS; + int rel_dw_offset; + struct si_descriptors *desc; + + if (is_2nd) { + if (i == PIPE_SHADER_TESS_CTRL) { + rel_dw_offset = (R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS - + R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; + } else { /* PIPE_SHADER_GEOMETRY */ + rel_dw_offset = (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - + R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; + } + } else { + rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS; + } + desc = si_const_and_shader_buffer_descriptors(sctx, i); + si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc, + num_buffer_slots, rel_dw_offset, + RADEON_USAGE_READWRITE, + RADEON_USAGE_READ, + RADEON_PRIO_SHADER_RW_BUFFER, + RADEON_PRIO_CONST_BUFFER); + desc->slot_index_to_bind_directly = si_get_constbuf_slot(0); + + if (is_2nd) { +#if HAVE_32BIT_POINTERS + if (i == PIPE_SHADER_TESS_CTRL) { + rel_dw_offset = (R_00B40C_SPI_SHADER_USER_DATA_ADDR_HI_HS - + R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; + } else { /* PIPE_SHADER_GEOMETRY */ + rel_dw_offset = (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - + R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; + } +#else + rel_dw_offset = GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES; +#endif + } else { + rel_dw_offset = SI_SGPR_SAMPLERS_AND_IMAGES; + } + + desc = si_sampler_and_image_descriptors(sctx, i); + si_init_descriptors(desc, rel_dw_offset, 16, num_sampler_slots); + + int j; + for (j = 0; j < SI_NUM_IMAGES; j++) + memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4); + for (; j < SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2; j++) + memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4); } si_init_buffer_resources(&sctx->rw_buffers, &sctx->descriptors[SI_DESCS_RW_BUFFERS], SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, - RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS, - &ce_offset); - si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS, - 4, SI_NUM_VERTEX_BUFFERS, NULL, NULL); + /* The second set of usage/priority is used by + * const buffers in RW buffer slots. */ + RADEON_USAGE_READWRITE, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER); + sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS; + + /* Initialize an array of 1024 bindless descriptors, when the limit is + * reached, just make it larger and re-upload the whole array. + */ + si_init_bindless_descriptors(sctx, &sctx->bindless_descriptors, + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, + 1024); sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS); - if (sctx->b.chip_class >= GFX9) - assert(ce_offset <= 4096); - else - assert(ce_offset <= 32768); - /* Set pipe_context functions. 
*/ sctx->b.b.bind_sampler_states = si_bind_sampler_states; sctx->b.b.set_shader_images = si_set_shader_images; @@ -2018,12 +2695,18 @@ void si_init_all_descriptors(struct si_context *sctx) sctx->b.b.set_polygon_stipple = si_set_polygon_stipple; sctx->b.b.set_shader_buffers = si_set_shader_buffers; sctx->b.b.set_sampler_views = si_set_sampler_views; - sctx->b.b.set_stream_output_targets = si_set_streamout_targets; + sctx->b.b.create_texture_handle = si_create_texture_handle; + sctx->b.b.delete_texture_handle = si_delete_texture_handle; + sctx->b.b.make_texture_handle_resident = si_make_texture_handle_resident; + sctx->b.b.create_image_handle = si_create_image_handle; + sctx->b.b.delete_image_handle = si_delete_image_handle; + sctx->b.b.make_image_handle_resident = si_make_image_handle_resident; sctx->b.invalidate_buffer = si_invalidate_buffer; + sctx->b.rebind_buffer = si_rebind_buffer; /* Shader user data. */ - si_init_atom(sctx, &sctx->shader_userdata.atom, &sctx->atoms.s.shader_userdata, - si_emit_graphics_shader_userdata); + si_init_atom(sctx, &sctx->shader_pointers.atom, &sctx->atoms.s.shader_pointers, + si_emit_graphics_shader_pointers); /* Set default and immutable mappings. */ si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0); @@ -2042,9 +2725,8 @@ void si_init_all_descriptors(struct si_context *sctx) si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0); } -bool si_upload_graphics_shader_descriptors(struct si_context *sctx) +static bool si_upload_shader_descriptors(struct si_context *sctx, unsigned mask) { - const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE); unsigned dirty = sctx->descriptors_dirty & mask; /* Assume nothing will go wrong: */ @@ -2053,15 +2735,23 @@ bool si_upload_graphics_shader_descriptors(struct si_context *sctx) while (dirty) { unsigned i = u_bit_scan(&dirty); - if (!si_upload_descriptors(sctx, &sctx->descriptors[i], - &sctx->shader_userdata.atom)) + if (!si_upload_descriptors(sctx, &sctx->descriptors[i])) return false; } sctx->descriptors_dirty &= ~mask; + + si_upload_bindless_descriptors(sctx); + return true; } +bool si_upload_graphics_shader_descriptors(struct si_context *sctx) +{ + const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE); + return si_upload_shader_descriptors(sctx, mask); +} + bool si_upload_compute_shader_descriptors(struct si_context *sctx) { /* Does not update rw_buffers as that is not needed for compute shaders @@ -2069,21 +2759,7 @@ bool si_upload_compute_shader_descriptors(struct si_context *sctx) */ const unsigned mask = u_bit_consecutive(SI_DESCS_FIRST_COMPUTE, SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE); - unsigned dirty = sctx->descriptors_dirty & mask; - - /* Assume nothing will go wrong: */ - sctx->shader_pointers_dirty |= dirty; - - while (dirty) { - unsigned i = u_bit_scan(&dirty); - - if (!si_upload_descriptors(sctx, &sctx->descriptors[i], NULL)) - return false; - } - - sctx->descriptors_dirty &= ~mask; - - return true; + return si_upload_shader_descriptors(sctx, mask); } void si_release_all_descriptors(struct si_context *sctx) @@ -2091,19 +2767,23 @@ void si_release_all_descriptors(struct si_context *sctx) int i; for (i = 0; i < SI_NUM_SHADERS; i++) { - si_release_buffer_resources(&sctx->const_buffers[i], - si_const_buffer_descriptors(sctx, i)); - si_release_buffer_resources(&sctx->shader_buffers[i], - si_shader_buffer_descriptors(sctx, i)); - si_release_sampler_views(&sctx->samplers[i].views); + 
si_release_buffer_resources(&sctx->const_and_shader_buffers[i], + si_const_and_shader_buffer_descriptors(sctx, i)); + si_release_sampler_views(&sctx->samplers[i]); si_release_image_views(&sctx->images[i]); } si_release_buffer_resources(&sctx->rw_buffers, &sctx->descriptors[SI_DESCS_RW_BUFFERS]); + for (i = 0; i < SI_NUM_VERTEX_BUFFERS; i++) + pipe_vertex_buffer_unreference(&sctx->vertex_buffer[i]); for (i = 0; i < SI_NUM_DESCS; ++i) si_release_descriptors(&sctx->descriptors[i]); - si_release_descriptors(&sctx->vertex_buffers); + + r600_resource_reference(&sctx->vb_descriptors_buffer, NULL); + sctx->vb_descriptors_gpu_list = NULL; /* points into a mapped buffer */ + + si_release_bindless_descriptors(sctx); } void si_all_descriptors_begin_new_cs(struct si_context *sctx) @@ -2111,9 +2791,8 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx) int i; for (i = 0; i < SI_NUM_SHADERS; i++) { - si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]); - si_buffer_resources_begin_new_cs(sctx, &sctx->shader_buffers[i]); - si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views); + si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[i]); + si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i]); si_image_views_begin_new_cs(sctx, &sctx->images[i]); } si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers); @@ -2121,6 +2800,45 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx) for (i = 0; i < SI_NUM_DESCS; ++i) si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]); + si_descriptors_begin_new_cs(sctx, &sctx->bindless_descriptors); + + si_shader_pointers_begin_new_cs(sctx); +} + +void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, + uint64_t new_active_mask) +{ + struct si_descriptors *desc = &sctx->descriptors[desc_idx]; + + /* Ignore no-op updates and updates that disable all slots. */ + if (!new_active_mask || + new_active_mask == u_bit_consecutive64(desc->first_active_slot, + desc->num_active_slots)) + return; + + int first, count; + u_bit_scan_consecutive_range64(&new_active_mask, &first, &count); + assert(new_active_mask == 0); + + /* Upload/dump descriptors if slots are being enabled. */ + if (first < desc->first_active_slot || + first + count > desc->first_active_slot + desc->num_active_slots) + sctx->descriptors_dirty |= 1u << desc_idx; + + desc->first_active_slot = first; + desc->num_active_slots = count; +} + +void si_set_active_descriptors_for_shader(struct si_context *sctx, + struct si_shader_selector *sel) +{ + if (!sel) + return; - si_shader_userdata_begin_new_cs(sctx); + si_set_active_descriptors(sctx, + si_const_and_shader_buffer_descriptors_idx(sel->type), + sel->active_const_and_shader_buffers); + si_set_active_descriptors(sctx, + si_sampler_and_image_descriptors_idx(sel->type), + sel->active_samplers_and_images); }
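
/*
 * Illustrative sketch (an editor's addition, not part of the patch): the
 * active-slot tracking above leans on a "pop the lowest run of set bits"
 * scan -- u_bit_scan_consecutive_range64() in Mesa's util headers, with a
 * 32-bit sibling used by si_emit_consecutive_shader_pointers(). A minimal
 * standalone version, assuming a non-zero mask whose lowest run of set
 * bits is shorter than 64:
 */
#include <assert.h>
#include <stdint.h>

/* Pop the lowest run of consecutive set bits from *mask; return its
 * start index and length. */
static void scan_consecutive_range64(uint64_t *mask, int *start, int *count)
{
	*start = __builtin_ffsll(*mask) - 1;		  /* lowest set bit */
	*count = __builtin_ffsll(~(*mask >> *start)) - 1; /* run length */
	*mask &= ~(((UINT64_C(1) << *count) - 1) << *start);
}

int main(void)
{
	/* E.g. six constant buffers active in slots 4..9. */
	uint64_t mask = 0x3f0;
	int first, count;

	scan_consecutive_range64(&mask, &first, &count);
	assert(first == 4 && count == 6 && mask == 0);
	return 0;
}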