From b0528118dfb1af00e7d08cdb637191b80c14c2ba Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 25 Jul 2015 00:53:16 +0200 Subject: [PATCH] radeonsi: completely rework updating descriptors without CP DMA MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The patch has a better explanation. Just a summary here: - The CPU always uploads a whole descriptor array to previously-unused memory. - CP DMA isn't used. - No caches need to be flushed. - All descriptors are always up-to-date in memory even after a hang, because CP DMA doesn't serve as a middle man to update them. This should bring: - better hang recovery (descriptors are always up-to-date) - better GPU performance (no KCACHE and TC flushes) - worse CPU performance for partial updates (only whole arrays are uploaded) - less used IB space (no CP_DMA and WRITE_DATA packets) - simpler code - hopefully, some of the corruption issues with SI cards will go away. If not, we'll know the issue is not here. Reviewed-by: Michel Dänzer --- src/gallium/drivers/radeonsi/si_descriptors.c | 354 ++++++------------ src/gallium/drivers/radeonsi/si_pipe.h | 6 - src/gallium/drivers/radeonsi/si_state.h | 32 +- src/gallium/drivers/radeonsi/si_state_draw.c | 7 +- 4 files changed, 128 insertions(+), 271 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 14bb6e1fa21..48ec9b72043 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -24,14 +24,23 @@ * Marek Olšák */ -/* Resource binding slots and sampler states (each described with 8 or 4 dwords) - * live in memory on SI. +/* Resource binding slots and sampler states (each described with 8 or + * 4 dwords) are stored in lists in memory which is accessed by shaders + * using scalar load instructions. 
* - * This file is responsible for managing lists of resources and sampler states - * in memory and binding them, which means updating those structures in memory. + * This file is responsible for managing such lists. It keeps a copy of all + * descriptors in CPU memory and re-uploads a whole list if some slots have + * been changed. * - * There is also code for updating shader pointers to resources and sampler - * states. CP DMA functions are here too. + * This code is also responsible for updating shader pointers to those lists. + * + * Note that CP DMA can't be used for updating the lists, because a GPU hang + * could leave the list in a mid-IB state and the next IB would get wrong + * descriptors and the whole context would be unusable at that point. + * (Note: The register shadowing can't be used due to the same reason) + * + * Also, uploading descriptors to newly allocated memory doesn't require + * a KCACHE flush. */ #include "radeon/r600_cs.h" @@ -42,7 +51,6 @@ #include "util/u_memory.h" #include "util/u_upload_mgr.h" -#define SI_NUM_CONTEXTS 16 /* NULL image and buffer descriptor. 
* @@ -139,159 +147,62 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx, } } -static void si_init_descriptors(struct si_context *sctx, - struct si_descriptors *desc, +static void si_init_descriptors(struct si_descriptors *desc, unsigned shader_userdata_index, unsigned element_dw_size, - unsigned num_elements, - void (*emit_func)(struct si_context *ctx, struct r600_atom *state)) + unsigned num_elements) { + int i; + assert(num_elements <= sizeof(desc->enabled_mask)*8); - assert(num_elements <= sizeof(desc->dirty_mask)*8); - desc->atom.emit = (void*)emit_func; - desc->shader_userdata_offset = shader_userdata_index * 4; + desc->list = CALLOC(num_elements, element_dw_size * 4); desc->element_dw_size = element_dw_size; desc->num_elements = num_elements; - desc->context_size = num_elements * element_dw_size * 4; - - desc->buffer = (struct r600_resource*) - pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, - SI_NUM_CONTEXTS * desc->context_size); - - r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer, - RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); + desc->list_dirty = true; /* upload the list before the next draw */ + desc->shader_userdata_offset = shader_userdata_index * 4; - /* We don't check for CS space here, because this should be called - * only once at context initialization. */ - si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address, - desc->buffer->b.b.width0, 0, - R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2); + /* Initialize the array to NULL descriptors if the element size is 8. 
*/ + if (element_dw_size == 8) + for (i = 0; i < num_elements; i++) + memcpy(desc->list + i*element_dw_size, null_descriptor, + sizeof(null_descriptor)); } static void si_release_descriptors(struct si_descriptors *desc) { pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL); + FREE(desc->list); } -static void si_update_descriptors(struct si_context *sctx, +static bool si_upload_descriptors(struct si_context *sctx, struct si_descriptors *desc) { - if (desc->dirty_mask) { - desc->atom.num_dw = - 7 + /* copy */ - (4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask); /* update */ - - desc->atom.dirty = true; - desc->pointer_dirty = true; - sctx->shader_userdata.atom.dirty = true; - - /* TODO: Investigate if these flushes can be removed after - * adding CE support. */ - - /* The descriptors are read with the K cache. */ - sctx->b.flags |= SI_CONTEXT_INV_KCACHE; - - /* Since SI uses uncached CP DMA to update descriptors, - * we have to flush TC L2, which is used to fetch constants - * along with KCACHE. */ - if (sctx->b.chip_class == SI) - sctx->b.flags |= SI_CONTEXT_INV_TC_L2; - } else { - desc->atom.dirty = false; - } -} + unsigned list_size = desc->num_elements * desc->element_dw_size * 4; + void *ptr; -static void si_emit_descriptors(struct si_context *sctx, - struct si_descriptors *desc, - uint32_t **descriptors) -{ - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; - uint64_t va_base; - int packet_start = 0; - int packet_size = 0; - int last_index = desc->num_elements; /* point to a non-existing element */ - uint64_t dirty_mask = desc->dirty_mask; - unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS; - - assert(dirty_mask); - - va_base = desc->buffer->gpu_address; - - /* Copy the descriptors to a new context slot. 
*/ - si_emit_cp_dma_copy_buffer(sctx, - va_base + new_context_id * desc->context_size, - va_base + desc->current_context_id * desc->context_size, - desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2); - - va_base += new_context_id * desc->context_size; - - /* Update the descriptors. - * Updates of consecutive descriptors are merged to one WRITE_DATA packet. - * - * XXX When unbinding lots of resources, consider clearing the memory - * with CP DMA instead of emitting zeros. - */ - while (dirty_mask) { - int i = u_bit_scan64(&dirty_mask); - - assert(i < desc->num_elements); + if (!desc->list_dirty) + return true; - if (last_index+1 == i && packet_size) { - /* Append new data at the end of the last packet. */ - packet_size += desc->element_dw_size; - cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0); - } else { - /* Start a new packet. */ - uint64_t va = va_base + i * desc->element_dw_size * 4; - - packet_start = cs->cdw; - packet_size = 2 + desc->element_dw_size; - - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0)); - radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ? 
- PKT3_WRITE_DATA_DST_SEL_MEM_SYNC : - PKT3_WRITE_DATA_DST_SEL_TC_L2) | - PKT3_WRITE_DATA_WR_CONFIRM | - PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME)); - radeon_emit(cs, va & 0xFFFFFFFFUL); - radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL); - } + u_upload_alloc(sctx->b.uploader, 0, list_size, + &desc->buffer_offset, + (struct pipe_resource**)&desc->buffer, &ptr); + if (!desc->buffer) + return false; /* skip the draw call */ - radeon_emit_array(cs, descriptors[i], desc->element_dw_size); + util_memcpy_cpu_to_le32(ptr, desc->list, list_size); - last_index = i; - } + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer, + RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); - desc->dirty_mask = 0; - desc->current_context_id = new_context_id; + desc->list_dirty = false; + desc->pointer_dirty = true; + sctx->shader_userdata.atom.dirty = true; + return true; } /* SAMPLER VIEWS */ -static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom) -{ - struct si_sampler_views *views = (struct si_sampler_views*)atom; - - si_emit_descriptors(sctx, &views->desc, views->desc_data); -} - -static void si_init_sampler_views(struct si_context *sctx, - struct si_sampler_views *views) -{ - int i; - - si_init_descriptors(sctx, &views->desc, SI_SGPR_RESOURCE, - 8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views); - - for (i = 0; i < views->desc.num_elements; i++) { - views->desc_data[i] = null_descriptor; - views->desc.dirty_mask |= 1llu << i; - } - si_update_descriptors(sctx, &views->desc); -} - static void si_release_sampler_views(struct si_sampler_views *views) { int i; @@ -332,6 +243,8 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx, si_get_resource_ro_priority(rview->resource)); } + if (!views->desc.buffer) + return; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); } @@ -354,17 +267,16 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader, 
rview->resource, RADEON_USAGE_READ, si_get_resource_ro_priority(rview->resource)); - pipe_sampler_view_reference(&views->views[slot], view); - views->desc_data[slot] = view_desc; + memcpy(views->desc.list + slot*8, view_desc, 8*4); views->desc.enabled_mask |= 1llu << slot; } else { pipe_sampler_view_reference(&views->views[slot], NULL); - views->desc_data[slot] = null_descriptor; + memcpy(views->desc.list + slot*8, null_descriptor, 8*4); views->desc.enabled_mask &= ~(1llu << slot); } - views->desc.dirty_mask |= 1llu << slot; + views->desc.list_dirty = true; } static void si_set_sampler_views(struct pipe_context *ctx, @@ -423,22 +335,15 @@ static void si_set_sampler_views(struct pipe_context *ctx, NULL, NULL); } } - - si_update_descriptors(sctx, &samplers->views.desc); } /* SAMPLER STATES */ -static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom) -{ - struct si_sampler_states *states = (struct si_sampler_states*)atom; - - si_emit_descriptors(sctx, &states->desc, states->desc_data); -} - static void si_sampler_states_begin_new_cs(struct si_context *sctx, struct si_sampler_states *states) { + if (!states->desc.buffer) + return; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); } @@ -460,64 +365,39 @@ void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader, for (i = 0; i < count; i++) { unsigned slot = start + i; - if (!sstates[i]) { - samplers->desc.dirty_mask &= ~(1llu << slot); + if (!sstates[i]) continue; - } - samplers->desc_data[slot] = sstates[i]->val; - samplers->desc.dirty_mask |= 1llu << slot; + memcpy(samplers->desc.list + slot*4, sstates[i]->val, 4*4); + samplers->desc.list_dirty = true; } - - si_update_descriptors(sctx, &samplers->desc); } /* BUFFER RESOURCES */ -static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom) -{ - struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom; - - 
si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data); -} - -static void si_init_buffer_resources(struct si_context *sctx, - struct si_buffer_resources *buffers, +static void si_init_buffer_resources(struct si_buffer_resources *buffers, unsigned num_buffers, unsigned shader_userdata_index, enum radeon_bo_usage shader_usage, enum radeon_bo_priority priority) { - int i; - - buffers->num_buffers = num_buffers; buffers->shader_usage = shader_usage; buffers->priority = priority; buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*)); - buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4); - /* si_emit_descriptors only accepts an array of arrays. - * This adds such an array. */ - buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*)); - for (i = 0; i < num_buffers; i++) { - buffers->desc_data[i] = &buffers->desc_storage[i*4]; - } - - si_init_descriptors(sctx, &buffers->desc, shader_userdata_index, 4, - num_buffers, si_emit_buffer_resources); + si_init_descriptors(&buffers->desc, shader_userdata_index, 4, + num_buffers); } static void si_release_buffer_resources(struct si_buffer_resources *buffers) { int i; - for (i = 0; i < buffers->num_buffers; i++) { + for (i = 0; i < buffers->desc.num_elements; i++) { pipe_resource_reference(&buffers->buffers[i], NULL); } FREE(buffers->buffers); - FREE(buffers->desc_storage); - FREE(buffers->desc_data); si_release_descriptors(&buffers->desc); } @@ -535,6 +415,8 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx, buffers->shader_usage, buffers->priority); } + if (!buffers->desc.buffer) + return; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, buffers->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); @@ -560,12 +442,15 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx) (struct r600_resource*)sctx->vertex_buffer[vb].buffer, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO); } + + if (!desc->buffer) + return; r600_context_bo_reloc(&sctx->b, 
&sctx->b.rings.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); } -void si_update_vertex_buffers(struct si_context *sctx) +static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) { struct si_descriptors *desc = &sctx->vertex_buffers; bool bound[SI_NUM_VERTEX_BUFFERS] = {}; @@ -573,8 +458,10 @@ void si_update_vertex_buffers(struct si_context *sctx) uint64_t va; uint32_t *ptr; + if (!sctx->vertex_buffers_dirty) + return true; if (!count || !sctx->vertex_elements) - return; + return true; /* Vertex buffer descriptors are the only ones which are uploaded * directly through a staging buffer and don't go through @@ -582,13 +469,14 @@ void si_update_vertex_buffers(struct si_context *sctx) */ u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset, (struct pipe_resource**)&desc->buffer, (void**)&ptr); + if (!desc->buffer) + return false; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA); assert(count <= SI_NUM_VERTEX_BUFFERS); - assert(desc->current_context_id == 0); for (i = 0; i < count; i++) { struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i]; @@ -640,6 +528,8 @@ void si_update_vertex_buffers(struct si_context *sctx) * cache is needed. */ desc->pointer_dirty = true; sctx->shader_userdata.atom.dirty = true; + sctx->vertex_buffers_dirty = false; + return true; } @@ -664,7 +554,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s if (shader >= SI_NUM_SHADERS) return; - assert(slot < buffers->num_buffers); + assert(slot < buffers->desc.num_elements); pipe_resource_reference(&buffers->buffers[slot], NULL); /* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy @@ -691,7 +581,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s } /* Set the descriptor. 
*/ - uint32_t *desc = buffers->desc_data[slot]; + uint32_t *desc = buffers->desc.list + slot*4; desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0); @@ -710,12 +600,11 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s buffers->desc.enabled_mask |= 1llu << slot; } else { /* Clear the descriptor. */ - memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4); + memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4); buffers->desc.enabled_mask &= ~(1llu << slot); } - buffers->desc.dirty_mask |= 1llu << slot; - si_update_descriptors(sctx, &buffers->desc); + buffers->desc.list_dirty = true; } /* RING BUFFERS */ @@ -735,7 +624,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, /* The stride field in the resource descriptor has 14 bits */ assert(stride < (1 << 14)); - assert(slot < buffers->num_buffers); + assert(slot < buffers->desc.num_elements); pipe_resource_reference(&buffers->buffers[slot], NULL); if (buffer) { @@ -780,7 +669,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, } /* Set the descriptor. */ - uint32_t *desc = buffers->desc_data[slot]; + uint32_t *desc = buffers->desc.list + slot*4; desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride) | @@ -803,12 +692,11 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, buffers->desc.enabled_mask |= 1llu << slot; } else { /* Clear the descriptor. */ - memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4); + memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4); buffers->desc.enabled_mask &= ~(1llu << slot); } - buffers->desc.dirty_mask |= 1llu << slot; - si_update_descriptors(sctx, &buffers->desc); + buffers->desc.list_dirty = true; } /* STREAMOUT BUFFERS */ @@ -870,7 +758,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, uint64_t va = r600_resource(buffer)->gpu_address; /* Set the descriptor. 
*/ - uint32_t *desc = buffers->desc_data[bufidx]; + uint32_t *desc = buffers->desc.list + bufidx*4; desc[0] = va; desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); desc[2] = 0xffffffff; @@ -888,24 +776,22 @@ static void si_set_streamout_targets(struct pipe_context *ctx, buffers->desc.enabled_mask |= 1llu << bufidx; } else { /* Clear the descriptor and unset the resource. */ - memset(buffers->desc_data[bufidx], 0, + memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4); pipe_resource_reference(&buffers->buffers[bufidx], NULL); buffers->desc.enabled_mask &= ~(1llu << bufidx); } - buffers->desc.dirty_mask |= 1llu << bufidx; } for (; i < old_num_targets; i++) { bufidx = SI_SO_BUF_OFFSET + i; /* Clear the descriptor and unset the resource. */ - memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4); + memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4); pipe_resource_reference(&buffers->buffers[bufidx], NULL); buffers->desc.enabled_mask &= ~(1llu << bufidx); - buffers->desc.dirty_mask |= 1llu << bufidx; } - si_update_descriptors(sctx, &buffers->desc); + buffers->desc.list_dirty = true; } static void si_desc_reset_buffer_offset(struct pipe_context *ctx, @@ -974,22 +860,19 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource /* Read/Write buffers. */ for (shader = 0; shader < SI_NUM_SHADERS; shader++) { struct si_buffer_resources *buffers = &sctx->rw_buffers[shader]; - bool found = false; uint64_t mask = buffers->desc.enabled_mask; while (mask) { i = u_bit_scan64(&mask); if (buffers->buffers[i] == buf) { - si_desc_reset_buffer_offset(ctx, buffers->desc_data[i], + si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4, old_va, buf); + buffers->desc.list_dirty = true; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, rbuffer, buffers->shader_usage, buffers->priority); - buffers->desc.dirty_mask |= 1llu << i; - found = true; - if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) { /* Update the streamout state. 
*/ if (sctx->b.streamout.begin_emitted) { @@ -1001,34 +884,25 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource } } } - if (found) { - si_update_descriptors(sctx, &buffers->desc); - } } /* Constant buffers. */ for (shader = 0; shader < SI_NUM_SHADERS; shader++) { struct si_buffer_resources *buffers = &sctx->const_buffers[shader]; - bool found = false; uint64_t mask = buffers->desc.enabled_mask; while (mask) { unsigned i = u_bit_scan64(&mask); if (buffers->buffers[i] == buf) { - si_desc_reset_buffer_offset(ctx, buffers->desc_data[i], + si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4, old_va, buf); + buffers->desc.list_dirty = true; r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, rbuffer, buffers->shader_usage, buffers->priority); - - buffers->desc.dirty_mask |= 1llu << i; - found = true; } } - if (found) { - si_update_descriptors(sctx, &buffers->desc); - } } /* Texture buffers - update virtual addresses in sampler view descriptors. */ @@ -1040,23 +914,20 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource /* Texture buffers - update bindings. 
*/ for (shader = 0; shader < SI_NUM_SHADERS; shader++) { struct si_sampler_views *views = &sctx->samplers[shader].views; - bool found = false; uint64_t mask = views->desc.enabled_mask; while (mask) { unsigned i = u_bit_scan64(&mask); if (views->views[i]->texture == buf) { + si_desc_reset_buffer_offset(ctx, views->desc.list + i*8+4, + old_va, buf); + views->desc.list_dirty = true; + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO); - - views->desc.dirty_mask |= 1llu << i; - found = true; } } - if (found) { - si_update_descriptors(sctx, &views->desc); - } } } @@ -1297,11 +1168,10 @@ static void si_emit_shader_pointer(struct si_context *sctx, struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; uint64_t va; - if (!desc->pointer_dirty) + if (!desc->pointer_dirty || !desc->buffer) return; va = desc->buffer->gpu_address + - desc->current_context_id * desc->context_size + desc->buffer_offset; radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0)); @@ -1351,34 +1221,28 @@ static void si_emit_shader_userdata(struct si_context *sctx, si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false); } -/* INIT/DEINIT */ +/* INIT/DEINIT/UPLOAD */ void si_init_all_descriptors(struct si_context *sctx) { int i; for (i = 0; i < SI_NUM_SHADERS; i++) { - si_init_buffer_resources(sctx, &sctx->const_buffers[i], + si_init_buffer_resources(&sctx->const_buffers[i], SI_NUM_CONST_BUFFERS, SI_SGPR_CONST, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO); - si_init_buffer_resources(sctx, &sctx->rw_buffers[i], + si_init_buffer_resources(&sctx->rw_buffers[i], SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW); - si_init_sampler_views(sctx, &sctx->samplers[i].views); - - si_init_descriptors(sctx, &sctx->samplers[i].states.desc, - SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES, - si_emit_sampler_states); - - sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom; - 
sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom; - sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom; - sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom; + si_init_descriptors(&sctx->samplers[i].views.desc, + SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS); + si_init_descriptors(&sctx->samplers[i].states.desc, + SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES); } - si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER, - 4, SI_NUM_VERTEX_BUFFERS, NULL); + si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER, + 4, SI_NUM_VERTEX_BUFFERS); /* Set pipe_context functions. */ sctx->b.b.set_constant_buffer = si_set_constant_buffer; @@ -1401,6 +1265,20 @@ void si_init_all_descriptors(struct si_context *sctx) si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0); } +bool si_upload_shader_descriptors(struct si_context *sctx) +{ + int i; + + for (i = 0; i < SI_NUM_SHADERS; i++) { + if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) || + !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) || + !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) || + !si_upload_descriptors(sctx, &sctx->samplers[i].states.desc)) + return false; + } + return si_upload_vertex_buffer_descriptors(sctx); +} + void si_release_all_descriptors(struct si_context *sctx) { int i; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 7b2263b1162..28cb4e990ae 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -142,12 +142,6 @@ struct si_context { union { struct { /* The order matters. */ - struct r600_atom *const_buffers[SI_NUM_SHADERS]; - struct r600_atom *rw_buffers[SI_NUM_SHADERS]; - struct r600_atom *sampler_views[SI_NUM_SHADERS]; - struct r600_atom *sampler_states[SI_NUM_SHADERS]; - /* Caches must be flushed after resource descriptors are - * updated in memory. 
*/ struct r600_atom *cache_flush; struct r600_atom *streamout_begin; struct r600_atom *streamout_enable; /* must be after streamout_begin */ diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index e4d859a4fb7..e6bacdf22fb 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -158,60 +158,48 @@ struct si_shader_data { #define SI_NUM_VERTEX_BUFFERS 16 -/* This represents resource descriptors in memory, such as buffer resources, +/* This represents descriptors in memory, such as buffer resources, * image resources, and sampler states. */ struct si_descriptors { - struct r600_atom atom; - - /* The size of one resource descriptor. */ + /* The list of descriptors in malloc'd memory. */ + uint32_t *list; + /* The size of one descriptor. */ unsigned element_dw_size; - /* The maximum number of resource descriptors. */ + /* The maximum number of descriptors. */ unsigned num_elements; + /* Whether the list has been changed and should be re-uploaded. */ + bool list_dirty; - /* The buffer where resource descriptors are stored. */ + /* The buffer where the descriptors have been uploaded. */ struct r600_resource *buffer; unsigned buffer_offset; - /* The i-th bit is set if that element is dirty (changed but not emitted). */ - uint64_t dirty_mask; /* The i-th bit is set if that element is enabled (non-NULL resource). */ uint64_t enabled_mask; - /* We can't update descriptors directly because the GPU might be - * reading them at the same time, so we have to update them - * in a copy-on-write manner. Each such copy is called a context, - * which is just another array descriptors in the same buffer. */ - unsigned current_context_id; - /* The size of a context, should be equal to 4*element_dw_size*num_elements. */ - unsigned context_size; - /* The shader userdata offset within a shader where the 64-bit pointer to the descriptor * array will be stored. 
*/ unsigned shader_userdata_offset; + /* Whether the pointer should be re-emitted. */ bool pointer_dirty; }; struct si_sampler_views { struct si_descriptors desc; struct pipe_sampler_view *views[SI_NUM_SAMPLER_VIEWS]; - uint32_t *desc_data[SI_NUM_SAMPLER_VIEWS]; }; struct si_sampler_states { struct si_descriptors desc; - uint32_t *desc_data[SI_NUM_SAMPLER_STATES]; void *saved_states[2]; /* saved for u_blitter */ }; struct si_buffer_resources { struct si_descriptors desc; - unsigned num_buffers; enum radeon_bo_usage shader_usage; /* READ, WRITE, or READWRITE */ enum radeon_bo_priority priority; struct pipe_resource **buffers; /* this has num_buffers elements */ - uint32_t *desc_storage; /* this has num_buffers*4 elements */ - uint32_t **desc_data; /* an array of pointers pointing to desc_storage */ }; #define si_pm4_block_idx(member) \ @@ -247,13 +235,13 @@ struct si_buffer_resources { /* si_descriptors.c */ void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader, unsigned start, unsigned count, void **states); -void si_update_vertex_buffers(struct si_context *sctx); void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, struct pipe_resource *buffer, unsigned stride, unsigned num_records, bool add_tid, bool swizzle, unsigned element_size, unsigned index_stride, uint64_t offset); void si_init_all_descriptors(struct si_context *sctx); +bool si_upload_shader_descriptors(struct si_context *sctx); void si_release_all_descriptors(struct si_context *sctx); void si_all_descriptors_begin_new_cs(struct si_context *sctx); void si_copy_buffer(struct si_context *sctx, diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index ec8dd84c9dd..e8faf405afc 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -743,11 +743,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) sctx->current_rast_prim = info->mode; 
si_update_shaders(sctx); - - if (sctx->vertex_buffers_dirty) { - si_update_vertex_buffers(sctx); - sctx->vertex_buffers_dirty = false; - } + if (!si_upload_shader_descriptors(sctx)) + return; if (info->indexed) { /* Initialize the index buffer struct. */ -- 2.30.2