From 0ed0bf06961677906c7e2c5250935148dcd9e860 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 9 Jul 2014 04:00:53 +0200 Subject: [PATCH] radeonsi: move vertex buffer descriptors from IB to memory MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This removes the intermediate storage (pm4 state) and generates descriptors directly in a staging buffer. It also reduces the number of flushes, because the descriptors no longer take CS space. Reviewed-by: Michel Dänzer --- src/gallium/drivers/radeonsi/si_descriptors.c | 131 +++++++++++++++++- src/gallium/drivers/radeonsi/si_pipe.h | 4 +- src/gallium/drivers/radeonsi/si_pm4.c | 31 ----- src/gallium/drivers/radeonsi/si_pm4.h | 4 - src/gallium/drivers/radeonsi/si_state.c | 1 + src/gallium/drivers/radeonsi/si_state.h | 4 +- src/gallium/drivers/radeonsi/si_state_draw.c | 64 +-------- 7 files changed, 133 insertions(+), 106 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 0b0704c7e28..c3236375e4b 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -166,11 +166,13 @@ static void si_update_descriptors(struct si_context *sctx, } static void si_emit_shader_pointer(struct si_context *sctx, - struct si_descriptors *desc) + struct r600_atom *atom) { + struct si_descriptors *desc = (struct si_descriptors*)atom; struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; uint64_t va = r600_resource_va(sctx->b.b.screen, &desc->buffer->b.b) + - desc->current_context_id * desc->context_size; + desc->current_context_id * desc->context_size + + desc->buffer_offset; radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0)); radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2); @@ -253,7 +255,7 @@ static void si_emit_descriptors(struct si_context *sctx, desc->current_context_id = new_context_id; /* Now update the shader userdata pointer. 
*/ - si_emit_shader_pointer(sctx, desc); + si_emit_shader_pointer(sctx, &desc->atom); } static unsigned si_get_shader_user_data_base(unsigned shader) @@ -330,7 +332,7 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx, r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); - si_emit_shader_pointer(sctx, &views->desc); + si_emit_shader_pointer(sctx, &views->desc.atom); } static void si_set_sampler_view(struct si_context *sctx, unsigned shader, @@ -432,7 +434,7 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx, { r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); - si_emit_shader_pointer(sctx, &states->desc); + si_emit_shader_pointer(sctx, &states->desc.atom); } void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader, @@ -533,9 +535,119 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx, buffers->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA); - si_emit_shader_pointer(sctx, &buffers->desc); + si_emit_shader_pointer(sctx, &buffers->desc.atom); } +/* VERTEX BUFFERS */ + +static void si_vertex_buffers_begin_new_cs(struct si_context *sctx) +{ + struct si_descriptors *desc = &sctx->vertex_buffers; + int count = sctx->vertex_elements ? 
sctx->vertex_elements->count : 0; + int i; + + for (i = 0; i < count; i++) { + int vb = sctx->vertex_elements->elements[i].vertex_buffer_index; + + if (vb >= sctx->nr_vertex_buffers) + continue; + if (!sctx->vertex_buffer[vb].buffer) + continue; + + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, + (struct r600_resource*)sctx->vertex_buffer[vb].buffer, + RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO); + } + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, + desc->buffer, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_DATA); + + si_emit_shader_pointer(sctx, &desc->atom); +} + +void si_update_vertex_buffers(struct si_context *sctx) +{ + struct pipe_context *ctx = &sctx->b.b; + struct si_descriptors *desc = &sctx->vertex_buffers; + bool bound[SI_NUM_VERTEX_BUFFERS] = {}; + unsigned i, count = sctx->vertex_elements ? sctx->vertex_elements->count : 0; + uint64_t va; + uint32_t *ptr; + /* count is 0 when no vertex-elements state is bound; the pointer is tested before its count is read. */ + if (!count) + return; + + /* Vertex buffer descriptors are the only ones which are uploaded + * directly through a staging buffer and don't go through + * the fine-grained upload path. 
+ */ + u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset, + (struct pipe_resource**)&desc->buffer, (void**)&ptr); + + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, + desc->buffer, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_DATA); + + assert(count <= SI_NUM_VERTEX_BUFFERS); + assert(desc->current_context_id == 0); + + for (i = 0; i < count; i++) { + struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i]; + struct pipe_vertex_buffer *vb; + struct r600_resource *rbuffer; + unsigned offset; + uint32_t *desc = &ptr[i*4]; + + if (ve->vertex_buffer_index >= sctx->nr_vertex_buffers) { + memset(desc, 0, 16); + continue; + } + + vb = &sctx->vertex_buffer[ve->vertex_buffer_index]; + rbuffer = (struct r600_resource*)vb->buffer; + if (rbuffer == NULL) { + memset(desc, 0, 16); + continue; + } + + offset = vb->buffer_offset + ve->src_offset; + + va = r600_resource_va(ctx->screen, (void*)rbuffer); + va += offset; + + /* Fill in T# buffer resource description */ + desc[0] = va & 0xFFFFFFFF; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | + S_008F04_STRIDE(vb->stride); + if (vb->stride) + /* Round up by rounding down and adding 1 */ + desc[2] = (vb->buffer->width0 - offset - + sctx->vertex_elements->format_size[i]) / + vb->stride + 1; + else + desc[2] = vb->buffer->width0 - offset; + + desc[3] = sctx->vertex_elements->rsrc_word3[i]; + + if (!bound[ve->vertex_buffer_index]) { + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, + (struct r600_resource*)vb->buffer, + RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO); + bound[ve->vertex_buffer_index] = true; + } + } + + desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */ + desc->atom.dirty = true; + + /* Don't flush the const cache. It would have a very negative effect + * on performance (confirmed by testing). New descriptors are always + * uploaded to a fresh new buffer, so I don't think flushing the const + * cache is needed. 
*/ + sctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE; +} + + /* CONSTANT BUFFERS */ void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer, @@ -1096,6 +1208,11 @@ void si_init_all_descriptors(struct si_context *sctx) sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom; } + si_init_descriptors(sctx, &sctx->vertex_buffers, + si_get_shader_user_data_base(PIPE_SHADER_VERTEX) + + SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS, + si_emit_shader_pointer); + sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom; /* Set pipe_context functions. */ sctx->b.b.set_constant_buffer = si_set_constant_buffer; @@ -1115,6 +1232,7 @@ void si_release_all_descriptors(struct si_context *sctx) si_release_sampler_views(&sctx->samplers[i].views); si_release_descriptors(&sctx->samplers[i].states.desc); } + si_release_descriptors(&sctx->vertex_buffers); } void si_all_descriptors_begin_new_cs(struct si_context *sctx) @@ -1127,4 +1245,5 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx) si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views); si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states); } + si_vertex_buffers_begin_new_cs(sctx); } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index dd1f3565a55..227b8fad3d8 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -97,6 +97,7 @@ struct si_context { union { struct { /* The order matters. 
*/ + struct r600_atom *vertex_buffers; struct r600_atom *const_buffers[SI_NUM_SHADERS]; struct r600_atom *rw_buffers[SI_NUM_SHADERS]; struct r600_atom *sampler_views[SI_NUM_SHADERS]; @@ -124,9 +125,10 @@ struct si_context { struct si_cs_shader_state cs_shader_state; /* shader information */ unsigned sprite_coord_enable; + struct si_descriptors vertex_buffers; struct si_buffer_resources const_buffers[SI_NUM_SHADERS]; struct si_buffer_resources rw_buffers[SI_NUM_SHADERS]; - struct si_textures_info samplers[SI_NUM_SHADERS]; + struct si_textures_info samplers[SI_NUM_SHADERS]; struct r600_resource *border_color_table; unsigned border_color_offset; diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c index 082da85e03d..705b226d4dd 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.c +++ b/src/gallium/drivers/radeonsi/si_pm4.c @@ -103,37 +103,6 @@ void si_pm4_add_bo(struct si_pm4_state *state, state->bo_priority[idx] = priority; } -void si_pm4_sh_data_begin(struct si_pm4_state *state) -{ - si_pm4_cmd_begin(state, PKT3_NOP); -} - -void si_pm4_sh_data_add(struct si_pm4_state *state, uint32_t dw) -{ - si_pm4_cmd_add(state, dw); -} - -void si_pm4_sh_data_end(struct si_pm4_state *state, unsigned base, unsigned idx) -{ - unsigned offs = state->last_pm4 + 1; - unsigned reg = base + idx * 4; - - /* Bail if no data was added */ - if (state->ndw == offs) { - state->ndw--; - return; - } - - si_pm4_cmd_end(state, false); - - si_pm4_cmd_begin(state, PKT3_SET_SH_REG_OFFSET); - si_pm4_cmd_add(state, (reg - SI_SH_REG_OFFSET) >> 2); - state->relocs[state->nrelocs++] = state->ndw; - si_pm4_cmd_add(state, offs << 2); - si_pm4_cmd_add(state, 0); - si_pm4_cmd_end(state, false); -} - void si_pm4_inval_shader_cache(struct si_pm4_state *state) { state->cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h index a71958601aa..0702bd46faa 100644 --- 
a/src/gallium/drivers/radeonsi/si_pm4.h +++ b/src/gallium/drivers/radeonsi/si_pm4.h @@ -76,10 +76,6 @@ void si_pm4_add_bo(struct si_pm4_state *state, enum radeon_bo_usage usage, enum radeon_bo_priority priority); -void si_pm4_sh_data_begin(struct si_pm4_state *state); -void si_pm4_sh_data_add(struct si_pm4_state *state, uint32_t dw); -void si_pm4_sh_data_end(struct si_pm4_state *state, unsigned base, unsigned idx); - void si_pm4_inval_shader_cache(struct si_pm4_state *state); void si_pm4_inval_texture_cache(struct si_pm4_state *state); diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 604f8d3456e..31d387e974e 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -2800,6 +2800,7 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])) | S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format); + v->format_size[i] = desc->block.bits / 8; } memcpy(v->elements, elements, sizeof(struct pipe_vertex_element) * count); diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index a765e243ce6..82bea790cff 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -72,6 +72,7 @@ struct si_vertex_element { unsigned count; uint32_t rsrc_word3[PIPE_MAX_ATTRIBS]; + uint32_t format_size[PIPE_MAX_ATTRIBS]; struct pipe_vertex_element elements[PIPE_MAX_ATTRIBS]; }; @@ -97,7 +98,6 @@ union si_state { struct si_pm4_state *vs; struct si_pm4_state *ps; struct si_pm4_state *spi; - struct si_pm4_state *vertex_buffers; struct si_pm4_state *draw_info; struct si_pm4_state *draw; } named; @@ -147,6 +147,7 @@ struct si_descriptors { /* The buffer where resource descriptors are stored. */ struct r600_resource *buffer; + unsigned buffer_offset; /* The i-th bit is set if that element is dirty (changed but not emitted). 
*/ unsigned dirty_mask; @@ -221,6 +222,7 @@ struct si_buffer_resources { /* si_descriptors.c */ void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader, unsigned start, unsigned count, void **states); +void si_update_vertex_buffers(struct si_context *sctx); void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, struct pipe_constant_buffer *input, unsigned stride, unsigned num_records, diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index bac18464a1d..a0078c0c355 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -658,68 +658,6 @@ static void si_update_derived_state(struct si_context *sctx) } } -static void si_vertex_buffer_update(struct si_context *sctx) -{ - struct pipe_context *ctx = &sctx->b.b; - struct si_pm4_state *pm4 = si_pm4_alloc_state(sctx); - bool bound[PIPE_MAX_ATTRIBS] = {}; - unsigned i, count; - uint64_t va; - - sctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE; - - count = sctx->vertex_elements->count; - assert(count <= 256 / 4); - - si_pm4_sh_data_begin(pm4); - for (i = 0 ; i < count; i++) { - struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i]; - struct pipe_vertex_buffer *vb; - struct r600_resource *rbuffer; - unsigned offset; - - if (ve->vertex_buffer_index >= sctx->nr_vertex_buffers) - continue; - - vb = &sctx->vertex_buffer[ve->vertex_buffer_index]; - rbuffer = (struct r600_resource*)vb->buffer; - if (rbuffer == NULL) - continue; - - offset = 0; - offset += vb->buffer_offset; - offset += ve->src_offset; - - va = r600_resource_va(ctx->screen, (void*)rbuffer); - va += offset; - - /* Fill in T# buffer resource description */ - si_pm4_sh_data_add(pm4, va & 0xFFFFFFFF); - si_pm4_sh_data_add(pm4, (S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(vb->stride))); - if (vb->stride) - /* Round up by rounding down and adding 1 */ - si_pm4_sh_data_add(pm4, - (vb->buffer->width0 - offset - 
- util_format_get_blocksize(ve->src_format)) / - vb->stride + 1); - else - si_pm4_sh_data_add(pm4, vb->buffer->width0 - offset); - si_pm4_sh_data_add(pm4, sctx->vertex_elements->rsrc_word3[i]); - - if (!bound[ve->vertex_buffer_index]) { - si_pm4_add_bo(pm4, rbuffer, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_BUFFER_RO); - bound[ve->vertex_buffer_index] = true; - } - } - si_pm4_sh_data_end(pm4, sctx->gs_shader ? - R_00B330_SPI_SHADER_USER_DATA_ES_0 : - R_00B130_SPI_SHADER_USER_DATA_VS_0, - SI_SGPR_VERTEX_BUFFER); - si_pm4_set_state(sctx, vertex_buffers, pm4); -} - static void si_state_draw(struct si_context *sctx, const struct pipe_draw_info *info, const struct pipe_index_buffer *ib) @@ -954,7 +892,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) return; si_update_derived_state(sctx); - si_vertex_buffer_update(sctx); + si_update_vertex_buffers(sctx); if (info->indexed) { /* Initialize the index buffer struct. */ -- 2.30.2