From 90c34aed1d2f814ff8baca87b338d250257ae1d0 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Mon, 1 Jun 2020 15:56:12 -0400
Subject: [PATCH] gallium/u_vbuf: add a faster path for uploading
 non-interleaved attribs

+1% higher FPS in torcs.

Reviewed-by: Alyssa Rosenzweig
Part-of:
---
 src/gallium/auxiliary/util/u_vbuf.c | 117 ++++++++++++++++++++--------
 1 file changed, 83 insertions(+), 34 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index 999fcb80135..7e2631c2e86 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -131,6 +131,9 @@ struct u_vbuf_elements {
     * non-instanced. */
    uint32_t noninstance_vb_mask_any;
 
+   /* Which buffers are used by multiple vertex attribs. */
+   uint32_t interleaved_vb_mask;
+
    void *driver_cso;
 };
 
@@ -802,6 +805,9 @@ u_vbuf_create_vertex_elements(struct u_vbuf *mgr, unsigned count,
 
       ve->src_format_size[i] = util_format_get_blocksize(format);
 
+      if (used_buffers & vb_index_bit)
+         ve->interleaved_vb_mask |= vb_index_bit;
+
       used_buffers |= vb_index_bit;
 
       if (!ve->ve[i].instance_divisor) {
@@ -955,6 +961,49 @@ void u_vbuf_set_vertex_buffers(struct u_vbuf *mgr,
    mgr->dirty_real_vb_mask |= ~mask;
 }
 
+static ALWAYS_INLINE bool
+get_upload_offset_size(struct u_vbuf *mgr,
+                       const struct pipe_vertex_buffer *vb,
+                       struct u_vbuf_elements *ve,
+                       const struct pipe_vertex_element *velem,
+                       unsigned vb_index, unsigned velem_index,
+                       int start_vertex, unsigned num_vertices,
+                       int start_instance, unsigned num_instances,
+                       unsigned *offset, unsigned *size)
+{
+   /* Skip the buffers generated by translate. */
+   if ((1 << vb_index) & mgr->fallback_vbs_mask || !vb->is_user_buffer)
+      return false;
+
+   unsigned instance_div = velem->instance_divisor;
+   *offset = vb->buffer_offset + velem->src_offset;
+
+   if (!vb->stride) {
+      /* Constant attrib. */
+      *size = ve->src_format_size[velem_index];
+   } else if (instance_div) {
+      /* Per-instance attrib. */
+
+      /* Figure out how many instances we'll render given instance_div. We
+       * can't use the typical div_round_up() pattern because the CTS uses
+       * instance_div = ~0 for a test, which overflows div_round_up()'s
+       * addition.
+       */
+      unsigned count = num_instances / instance_div;
+      if (count * instance_div != num_instances)
+         count++;
+
+      *offset += vb->stride * start_instance;
+      *size = vb->stride * (count - 1) + ve->src_format_size[velem_index];
+   } else {
+      /* Per-vertex attrib. */
+      *offset += vb->stride * start_vertex;
+      *size = vb->stride * (num_vertices - 1) + ve->src_format_size[velem_index];
+   }
+   return true;
+}
+
+
 static enum pipe_error
 u_vbuf_upload_buffers(struct u_vbuf *mgr,
                       int start_vertex, unsigned num_vertices,
@@ -965,51 +1014,51 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr,
    unsigned nr_velems = ve->count;
    const struct pipe_vertex_element *velems =
          mgr->using_translate ? mgr->fallback_velems.velems : ve->ve;
+
+   /* Faster path when no vertex attribs are interleaved. */
+   if ((ve->interleaved_vb_mask & mgr->user_vb_mask) == 0) {
+      for (i = 0; i < nr_velems; i++) {
+         const struct pipe_vertex_element *velem = &velems[i];
+         unsigned index = velem->vertex_buffer_index;
+         struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[index];
+         unsigned offset, size;
+
+         if (!get_upload_offset_size(mgr, vb, ve, velem, index, i, start_vertex,
+                                     num_vertices, start_instance, num_instances,
+                                     &offset, &size))
+            continue;
+
+         struct pipe_vertex_buffer *real_vb = &mgr->real_vertex_buffer[index];
+         const uint8_t *ptr = mgr->vertex_buffer[index].buffer.user;
+
+         u_upload_data(mgr->pipe->stream_uploader,
+                       mgr->has_signed_vb_offset ? 0 : offset,
+                       size, 4, ptr + offset, &real_vb->buffer_offset,
+                       &real_vb->buffer.resource);
+         if (!real_vb->buffer.resource)
+            return PIPE_ERROR_OUT_OF_MEMORY;
+
+         real_vb->buffer_offset -= offset;
+      }
+      return PIPE_OK;
+   }
+
    unsigned start_offset[PIPE_MAX_ATTRIBS];
    unsigned end_offset[PIPE_MAX_ATTRIBS];
    uint32_t buffer_mask = 0;
 
+   /* Slower path supporting interleaved vertex attribs using 2 loops. */
    /* Determine how much data needs to be uploaded. */
    for (i = 0; i < nr_velems; i++) {
       const struct pipe_vertex_element *velem = &velems[i];
       unsigned index = velem->vertex_buffer_index;
       struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[index];
-      unsigned instance_div, first, size, index_bit;
+      unsigned first, size, index_bit;
 
-      /* Skip the buffers generated by translate. */
-      if ((1 << index) & mgr->fallback_vbs_mask) {
+      if (!get_upload_offset_size(mgr, vb, ve, velem, index, i, start_vertex,
+                                  num_vertices, start_instance, num_instances,
+                                  &first, &size))
          continue;
-      }
-
-      if (!vb->is_user_buffer) {
-         continue;
-      }
-
-      instance_div = velem->instance_divisor;
-      first = vb->buffer_offset + velem->src_offset;
-
-      if (!vb->stride) {
-         /* Constant attrib. */
-         size = ve->src_format_size[i];
-      } else if (instance_div) {
-         /* Per-instance attrib. */
-
-         /* Figure out how many instances we'll render given instance_div. We
-          * can't use the typical div_round_up() pattern because the CTS uses
-          * instance_div = ~0 for a test, which overflows div_round_up()'s
-          * addition.
-          */
-         unsigned count = num_instances / instance_div;
-         if (count * instance_div != num_instances)
-            count++;
-
-         first += vb->stride * start_instance;
-         size = vb->stride * (count - 1) + ve->src_format_size[i];
-      } else {
-         /* Per-vertex attrib. */
-         first += vb->stride * start_vertex;
-         size = vb->stride * (num_vertices - 1) + ve->src_format_size[i];
-      }
 
       index_bit = 1 << index;
 
-- 
2.30.2
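
Reviewer note, not part of the patch: a minimal standalone sketch of why
get_upload_offset_size() avoids the usual div_round_up() pattern for the
per-instance case. With instance_div = ~0, the addition in (n + d - 1) / d
wraps around, while dividing first and bumping on a non-zero remainder does
not. The helper names below are hypothetical and only illustrate the two
rounding patterns discussed in the comment above.

#include <stdio.h>

/* The usual round-up division; the addition can wrap when d is close to
 * UINT_MAX. */
static unsigned div_round_up_naive(unsigned n, unsigned d)
{
   return (n + d - 1) / d;
}

/* The overflow-safe pattern used in the patch: divide, then bump if the
 * division was inexact. */
static unsigned div_round_up_safe(unsigned n, unsigned d)
{
   unsigned count = n / d;
   if (count * d != n)
      count++;
   return count;
}

int main(void)
{
   unsigned num_instances = 4;
   unsigned instance_div = ~0u;   /* the CTS case mentioned in the comment */

   /* naive: 4 + 0xffffffff - 1 wraps to 2, so 2 / 0xffffffff yields 0 */
   printf("naive: %u\n", div_round_up_naive(num_instances, instance_div));
   /* safe: 4 / 0xffffffff = 0 with a non-zero remainder, so the result is 1 */
   printf("safe:  %u\n", div_round_up_safe(num_instances, instance_div));
   return 0;
}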