radeonsi: move vertex buffer descriptors from IB to memory
author Marek Olšák <marek.olsak@amd.com>
Wed, 9 Jul 2014 02:00:53 +0000 (04:00 +0200)
committer Marek Olšák <marek.olsak@amd.com>
Thu, 17 Jul 2014 23:58:59 +0000 (01:58 +0200)
This removes the intermediate storage (pm4 state) and generates descriptors
directly in a staging buffer.

It also reduces the number of flushes, because the descriptors no longer
take CS space.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_pm4.c
src/gallium/drivers/radeonsi/si_pm4.h
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state.h
src/gallium/drivers/radeonsi/si_state_draw.c

index 0b0704c7e284f5826358a5123caf5f2f483ac219..c3236375e4b8fd25973d09d5ff6019a64fd1548c 100644 (file)
@@ -166,11 +166,13 @@ static void si_update_descriptors(struct si_context *sctx,
 }
 
 static void si_emit_shader_pointer(struct si_context *sctx,
-                                  struct si_descriptors *desc)
+                                  struct r600_atom *atom)
 {
+       struct si_descriptors *desc = (struct si_descriptors*)atom;
        struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
        uint64_t va = r600_resource_va(sctx->b.b.screen, &desc->buffer->b.b) +
-                     desc->current_context_id * desc->context_size;
+                     desc->current_context_id * desc->context_size +
+                     desc->buffer_offset;
 
        radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
        radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
@@ -253,7 +255,7 @@ static void si_emit_descriptors(struct si_context *sctx,
        desc->current_context_id = new_context_id;
 
        /* Now update the shader userdata pointer. */
-       si_emit_shader_pointer(sctx, desc);
+       si_emit_shader_pointer(sctx, &desc->atom);
 }
 
 static unsigned si_get_shader_user_data_base(unsigned shader)
@@ -330,7 +332,7 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
        r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
 
-       si_emit_shader_pointer(sctx, &views->desc);
+       si_emit_shader_pointer(sctx, &views->desc.atom);
 }
 
 static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
@@ -432,7 +434,7 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx,
 {
        r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
-       si_emit_shader_pointer(sctx, &states->desc);
+       si_emit_shader_pointer(sctx, &states->desc.atom);
 }
 
 void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
@@ -533,9 +535,119 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
                              buffers->desc.buffer, RADEON_USAGE_READWRITE,
                              RADEON_PRIO_SHADER_DATA);
 
-       si_emit_shader_pointer(sctx, &buffers->desc);
+       si_emit_shader_pointer(sctx, &buffers->desc.atom);
 }
 
+/* VERTEX BUFFERS */
+
+/* Invoked from si_all_descriptors_begin_new_cs(): re-add relocs, re-emit pointer. */
+static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
+{
+       struct si_descriptors *descs = &sctx->vertex_buffers;
+       int num_elems = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
+       int elem;
+
+       for (elem = 0; elem < num_elems; elem++) {
+               int slot = sctx->vertex_elements->elements[elem].vertex_buffer_index;
+
+               /* Skip slots that are out of range or have no buffer bound. */
+               if (slot >= sctx->nr_vertex_buffers ||
+                   !sctx->vertex_buffer[slot].buffer)
+                       continue;
+
+               r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+                                     (struct r600_resource*)sctx->vertex_buffer[slot].buffer,
+                                     RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
+       }
+       /* The descriptor buffer itself is added with read-only usage. */
+       r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, descs->buffer,
+                             RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
+       si_emit_shader_pointer(sctx, &descs->atom);
+}
+
+/* Upload one 16-byte descriptor per vertex element to a staging buffer and
+ * mark the atom dirty; no-op when vertex elements are unbound or empty. */
+void si_update_vertex_buffers(struct si_context *sctx)
+{
+       struct pipe_context *ctx = &sctx->b.b;
+       struct si_descriptors *desc = &sctx->vertex_buffers;
+       bool bound[SI_NUM_VERTEX_BUFFERS] = {};
+       unsigned i, count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
+       uint64_t va;
+       uint32_t *ptr;
+
+       if (!count)
+               return;
+
+       /* Vertex buffer descriptors are the only ones which are uploaded
+        * directly through a staging buffer and don't go through
+        * the fine-grained upload path. */
+       u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
+                      (struct pipe_resource**)&desc->buffer, (void**)&ptr);
+
+       r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+                             desc->buffer, RADEON_USAGE_READ,
+                             RADEON_PRIO_SHADER_DATA);
+
+       assert(count <= SI_NUM_VERTEX_BUFFERS);
+       assert(desc->current_context_id == 0);
+
+       for (i = 0; i < count; i++) {
+               struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
+               struct pipe_vertex_buffer *vb;
+               struct r600_resource *rbuffer;
+               unsigned offset;
+               uint32_t *dw = &ptr[i*4]; /* the 4 dwords of descriptor i */
+
+               if (ve->vertex_buffer_index >= sctx->nr_vertex_buffers) {
+                       memset(dw, 0, 16);
+                       continue;
+               }
+
+               vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
+               rbuffer = (struct r600_resource*)vb->buffer;
+               if (rbuffer == NULL) {
+                       memset(dw, 0, 16);
+                       continue;
+               }
+
+               offset = vb->buffer_offset + ve->src_offset;
+               va = r600_resource_va(ctx->screen, (void*)rbuffer);
+               va += offset;
+
+               /* Fill in T# buffer resource description */
+               dw[0] = va & 0xFFFFFFFF;
+               dw[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
+                       S_008F04_STRIDE(vb->stride);
+               if (vb->stride)
+                       /* Round up by rounding down and adding 1 */
+                       dw[2] = (vb->buffer->width0 - offset -
+                                sctx->vertex_elements->format_size[i]) /
+                               vb->stride + 1;
+               else
+                       dw[2] = vb->buffer->width0 - offset;
+
+               dw[3] = sctx->vertex_elements->rsrc_word3[i];
+
+               if (!bound[ve->vertex_buffer_index]) {
+                       r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+                                             (struct r600_resource*)vb->buffer,
+                                             RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
+                       bound[ve->vertex_buffer_index] = true;
+               }
+       }
+
+       desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */
+       desc->atom.dirty = true;
+
+       /* Don't flush the const cache. It would have a very negative effect
+        * on performance (confirmed by testing). New descriptors are always
+        * uploaded to a fresh new buffer, so I don't think flushing the const
+        * cache is needed. */
+       sctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
+}
+
+
 /* CONSTANT BUFFERS */
 
 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
@@ -1096,6 +1208,11 @@ void si_init_all_descriptors(struct si_context *sctx)
                sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
        }
 
+       si_init_descriptors(sctx, &sctx->vertex_buffers,
+                           si_get_shader_user_data_base(PIPE_SHADER_VERTEX) +
+                           SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS,
+                           si_emit_shader_pointer);
+       sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom;
 
        /* Set pipe_context functions. */
        sctx->b.b.set_constant_buffer = si_set_constant_buffer;
@@ -1115,6 +1232,7 @@ void si_release_all_descriptors(struct si_context *sctx)
                si_release_sampler_views(&sctx->samplers[i].views);
                si_release_descriptors(&sctx->samplers[i].states.desc);
        }
+       si_release_descriptors(&sctx->vertex_buffers);
 }
 
 void si_all_descriptors_begin_new_cs(struct si_context *sctx)
@@ -1127,4 +1245,5 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
                si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
                si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
        }
+       si_vertex_buffers_begin_new_cs(sctx);
 }
index dd1f3565a553938b02070781bb35f607195a0d42..227b8fad3d860eaba5f777354beb0d80dbfc3709 100644 (file)
@@ -97,6 +97,7 @@ struct si_context {
        union {
                struct {
                        /* The order matters. */
+                       struct r600_atom *vertex_buffers;
                        struct r600_atom *const_buffers[SI_NUM_SHADERS];
                        struct r600_atom *rw_buffers[SI_NUM_SHADERS];
                        struct r600_atom *sampler_views[SI_NUM_SHADERS];
@@ -124,9 +125,10 @@ struct si_context {
        struct si_cs_shader_state       cs_shader_state;
        /* shader information */
        unsigned                        sprite_coord_enable;
+       struct si_descriptors           vertex_buffers;
        struct si_buffer_resources      const_buffers[SI_NUM_SHADERS];
        struct si_buffer_resources      rw_buffers[SI_NUM_SHADERS];
-       struct si_textures_info samplers[SI_NUM_SHADERS];
+       struct si_textures_info         samplers[SI_NUM_SHADERS];
        struct r600_resource            *border_color_table;
        unsigned                        border_color_offset;
 
index 082da85e03dd3d7caf359102d0e3d01adbaa9bdc..705b226d4ddee228a40508fcb2107d1204adda8b 100644 (file)
@@ -103,37 +103,6 @@ void si_pm4_add_bo(struct si_pm4_state *state,
        state->bo_priority[idx] = priority;
 }
 
-void si_pm4_sh_data_begin(struct si_pm4_state *state)
-{
-       si_pm4_cmd_begin(state, PKT3_NOP);
-}
-
-void si_pm4_sh_data_add(struct si_pm4_state *state, uint32_t dw)
-{
-       si_pm4_cmd_add(state, dw);
-}
-
-void si_pm4_sh_data_end(struct si_pm4_state *state, unsigned base, unsigned idx)
-{
-       unsigned offs = state->last_pm4 + 1;
-       unsigned reg = base + idx * 4;
-
-       /* Bail if no data was added */
-       if (state->ndw == offs) {
-               state->ndw--;
-               return;
-       }
-
-       si_pm4_cmd_end(state, false);
-
-       si_pm4_cmd_begin(state, PKT3_SET_SH_REG_OFFSET);
-       si_pm4_cmd_add(state, (reg - SI_SH_REG_OFFSET) >> 2);
-       state->relocs[state->nrelocs++] = state->ndw;
-       si_pm4_cmd_add(state, offs << 2);
-       si_pm4_cmd_add(state, 0);
-       si_pm4_cmd_end(state, false);
-}
-
 void si_pm4_inval_shader_cache(struct si_pm4_state *state)
 {
        state->cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
index a71958601aaa8687d038201efbd1ef8ecd372dc4..0702bd46faa5412c603b4fe68cc6447b8923288e 100644 (file)
@@ -76,10 +76,6 @@ void si_pm4_add_bo(struct si_pm4_state *state,
                   enum radeon_bo_usage usage,
                   enum radeon_bo_priority priority);
 
-void si_pm4_sh_data_begin(struct si_pm4_state *state);
-void si_pm4_sh_data_add(struct si_pm4_state *state, uint32_t dw);
-void si_pm4_sh_data_end(struct si_pm4_state *state, unsigned base, unsigned idx);
-
 void si_pm4_inval_shader_cache(struct si_pm4_state *state);
 void si_pm4_inval_texture_cache(struct si_pm4_state *state);
 
index 604f8d3456e33c5e793c80dfeac549b91f7d4e61..31d387e974e19915810797281b5284d65886d889 100644 (file)
@@ -2800,6 +2800,7 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
                                   S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])) |
                                   S_008F0C_NUM_FORMAT(num_format) |
                                   S_008F0C_DATA_FORMAT(data_format);
+               v->format_size[i] = desc->block.bits / 8;
        }
        memcpy(v->elements, elements, sizeof(struct pipe_vertex_element) * count);
 
index a765e243ce626c4a3631fbe2dceef11527765b1c..82bea790cff2f17a3c26881f756cf564b939a1be 100644 (file)
@@ -72,6 +72,7 @@ struct si_vertex_element
 {
        unsigned                        count;
        uint32_t                        rsrc_word3[PIPE_MAX_ATTRIBS];
+       uint32_t                        format_size[PIPE_MAX_ATTRIBS];
        struct pipe_vertex_element      elements[PIPE_MAX_ATTRIBS];
 };
 
@@ -97,7 +98,6 @@ union si_state {
                struct si_pm4_state             *vs;
                struct si_pm4_state             *ps;
                struct si_pm4_state             *spi;
-               struct si_pm4_state             *vertex_buffers;
                struct si_pm4_state             *draw_info;
                struct si_pm4_state             *draw;
        } named;
@@ -147,6 +147,7 @@ struct si_descriptors {
 
        /* The buffer where resource descriptors are stored. */
        struct r600_resource *buffer;
+       unsigned buffer_offset;
 
        /* The i-th bit is set if that element is dirty (changed but not emitted). */
        unsigned dirty_mask;
@@ -221,6 +222,7 @@ struct si_buffer_resources {
 /* si_descriptors.c */
 void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
                                unsigned start, unsigned count, void **states);
+void si_update_vertex_buffers(struct si_context *sctx);
 void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
                        struct pipe_constant_buffer *input,
                        unsigned stride, unsigned num_records,
index bac18464a1d5b2ca7acb4a562040be99009cbf0e..a0078c0c355e1285c836dd6770da4d2c9e5189c6 100644 (file)
@@ -658,68 +658,6 @@ static void si_update_derived_state(struct si_context *sctx)
        }
 }
 
-static void si_vertex_buffer_update(struct si_context *sctx)
-{
-       struct pipe_context *ctx = &sctx->b.b;
-       struct si_pm4_state *pm4 = si_pm4_alloc_state(sctx);
-       bool bound[PIPE_MAX_ATTRIBS] = {};
-       unsigned i, count;
-       uint64_t va;
-
-       sctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
-
-       count = sctx->vertex_elements->count;
-       assert(count <= 256 / 4);
-
-       si_pm4_sh_data_begin(pm4);
-       for (i = 0 ; i < count; i++) {
-               struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
-               struct pipe_vertex_buffer *vb;
-               struct r600_resource *rbuffer;
-               unsigned offset;
-
-               if (ve->vertex_buffer_index >= sctx->nr_vertex_buffers)
-                       continue;
-
-               vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
-               rbuffer = (struct r600_resource*)vb->buffer;
-               if (rbuffer == NULL)
-                       continue;
-
-               offset = 0;
-               offset += vb->buffer_offset;
-               offset += ve->src_offset;
-
-               va = r600_resource_va(ctx->screen, (void*)rbuffer);
-               va += offset;
-
-               /* Fill in T# buffer resource description */
-               si_pm4_sh_data_add(pm4, va & 0xFFFFFFFF);
-               si_pm4_sh_data_add(pm4, (S_008F04_BASE_ADDRESS_HI(va >> 32) |
-                                        S_008F04_STRIDE(vb->stride)));
-               if (vb->stride)
-                       /* Round up by rounding down and adding 1 */
-                       si_pm4_sh_data_add(pm4,
-                                          (vb->buffer->width0 - offset -
-                                           util_format_get_blocksize(ve->src_format)) /
-                                          vb->stride + 1);
-               else
-                       si_pm4_sh_data_add(pm4, vb->buffer->width0 - offset);
-               si_pm4_sh_data_add(pm4, sctx->vertex_elements->rsrc_word3[i]);
-
-               if (!bound[ve->vertex_buffer_index]) {
-                       si_pm4_add_bo(pm4, rbuffer, RADEON_USAGE_READ,
-                                     RADEON_PRIO_SHADER_BUFFER_RO);
-                       bound[ve->vertex_buffer_index] = true;
-               }
-       }
-       si_pm4_sh_data_end(pm4, sctx->gs_shader ?
-                          R_00B330_SPI_SHADER_USER_DATA_ES_0 :
-                          R_00B130_SPI_SHADER_USER_DATA_VS_0,
-                          SI_SGPR_VERTEX_BUFFER);
-       si_pm4_set_state(sctx, vertex_buffers, pm4);
-}
-
 static void si_state_draw(struct si_context *sctx,
                          const struct pipe_draw_info *info,
                          const struct pipe_index_buffer *ib)
@@ -954,7 +892,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                return;
 
        si_update_derived_state(sctx);
-       si_vertex_buffer_update(sctx);
+       si_update_vertex_buffers(sctx);
 
        if (info->indexed) {
                /* Initialize the index buffer struct. */