From: Marek Olšák Date: Tue, 17 Jul 2018 05:52:25 +0000 (-0400) Subject: gallium/u_vbuf: handle indirect multidraws correctly and efficiently (v3) X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=0f79b2015bc0c44a8ed470684b6789f0e2e6aa6c;p=mesa.git gallium/u_vbuf: handle indirect multidraws correctly and efficiently (v3) v2: need to do MAX{start+count} instead of MAX{count} added piglit tests v3: use malloc Cc: 18.2 Reviewed-by: Eric Anholt --- diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c index 746ff1085ce..a7a8a3be21b 100644 --- a/src/gallium/auxiliary/util/u_vbuf.c +++ b/src/gallium/auxiliary/util/u_vbuf.c @@ -1131,6 +1131,31 @@ static void u_vbuf_set_driver_vertex_buffers(struct u_vbuf *mgr) mgr->dirty_real_vb_mask = 0; } +static void +u_vbuf_split_indexed_multidraw(struct u_vbuf *mgr, struct pipe_draw_info *info, + unsigned *indirect_data, unsigned stride, + unsigned draw_count) +{ + assert(info->index_size); + info->indirect = NULL; + + for (unsigned i = 0; i < draw_count; i++) { + unsigned offset = i * stride / 4; + + info->count = indirect_data[offset + 0]; + info->instance_count = indirect_data[offset + 1]; + + if (!info->count || !info->instance_count) + continue; + + info->start = indirect_data[offset + 2]; + info->index_bias = indirect_data[offset + 3]; + info->start_instance = indirect_data[offset + 4]; + + u_vbuf_draw_vbo(mgr, info); + } +} + void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info) { struct pipe_context *pipe = mgr->pipe; @@ -1160,33 +1185,163 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info) new_info = *info; - /* Fallback. We need to know all the parameters. */ + /* Handle indirect (multi)draws. */ if (new_info.indirect) { - struct pipe_transfer *transfer = NULL; - int *data; - - if (new_info.index_size) { - data = pipe_buffer_map_range(pipe, new_info.indirect->buffer, - new_info.indirect->offset, 20, - PIPE_TRANSFER_READ, &transfer); - new_info.index_bias = data[3]; - new_info.start_instance = data[4]; - } - else { - data = pipe_buffer_map_range(pipe, new_info.indirect->buffer, - new_info.indirect->offset, 16, - PIPE_TRANSFER_READ, &transfer); - new_info.start_instance = data[3]; + const struct pipe_draw_indirect_info *indirect = new_info.indirect; + unsigned draw_count = 0; + + /* Get the number of draws. */ + if (indirect->indirect_draw_count) { + pipe_buffer_read(pipe, indirect->indirect_draw_count, + indirect->indirect_draw_count_offset, + 4, &draw_count); + } else { + draw_count = indirect->draw_count; } - new_info.count = data[0]; - new_info.instance_count = data[1]; - new_info.start = data[2]; - pipe_buffer_unmap(pipe, transfer); - new_info.indirect = NULL; - - if (!new_info.count) + if (!draw_count) return; + + unsigned data_size = (draw_count - 1) * indirect->stride + + (new_info.index_size ? 20 : 16); + unsigned *data = malloc(data_size); + if (!data) + return; /* report an error? */ + + /* Read the used buffer range only once, because the read can be + * uncached. + */ + pipe_buffer_read(pipe, indirect->buffer, indirect->offset, data_size, + data); + + if (info->index_size) { + /* Indexed multidraw. */ + unsigned index_bias0 = data[3]; + bool index_bias_same = true; + + /* If we invoke the translate path, we have to split the multidraw. */ + if (incompatible_vb_mask || + mgr->ve->incompatible_elem_mask) { + u_vbuf_split_indexed_multidraw(mgr, &new_info, data, + indirect->stride, draw_count); + free(data); + return; + } + + /* See if index_bias is the same for all draws. */ + for (unsigned i = 1; i < draw_count; i++) { + if (data[i * indirect->stride / 4 + 3] != index_bias0) { + index_bias_same = false; + break; + } + } + + /* Split the multidraw if index_bias is different. */ + if (!index_bias_same) { + u_vbuf_split_indexed_multidraw(mgr, &new_info, data, + indirect->stride, draw_count); + free(data); + return; + } + + /* If we don't need to use the translate path and index_bias is + * the same, we can process the multidraw with the time complexity + * equal to 1 draw call (except for the index range computation). + * We only need to compute the index range covering all draw calls + * of the multidraw. + * + * The driver will not look at these values because indirect != NULL. + * These values determine the user buffer bounds to upload. + */ + new_info.index_bias = index_bias0; + new_info.min_index = ~0u; + new_info.max_index = 0; + new_info.start_instance = ~0u; + unsigned end_instance = 0; + + struct pipe_transfer *transfer = NULL; + const uint8_t *indices; + + if (info->has_user_indices) { + indices = (uint8_t*)info->index.user; + } else { + indices = (uint8_t*)pipe_buffer_map(pipe, info->index.resource, + PIPE_TRANSFER_READ, &transfer); + } + + for (unsigned i = 0; i < draw_count; i++) { + unsigned offset = i * indirect->stride / 4; + unsigned start = data[offset + 2]; + unsigned count = data[offset + 0]; + unsigned start_instance = data[offset + 4]; + unsigned instance_count = data[offset + 1]; + + if (!count || !instance_count) + continue; + + /* Update the ranges of instances. */ + new_info.start_instance = MIN2(new_info.start_instance, + start_instance); + end_instance = MAX2(end_instance, start_instance + instance_count); + + /* Update the index range. */ + unsigned min, max; + new_info.count = count; /* only used by get_minmax_index */ + u_vbuf_get_minmax_index_mapped(&new_info, + indices + + new_info.index_size * start, + &min, &max); + + new_info.min_index = MIN2(new_info.min_index, min); + new_info.max_index = MAX2(new_info.max_index, max); + } + free(data); + + if (transfer) + pipe_buffer_unmap(pipe, transfer); + + /* Set the final instance count. */ + new_info.instance_count = end_instance - new_info.start_instance; + + if (new_info.start_instance == ~0u || !new_info.instance_count) + return; + } else { + /* Non-indexed multidraw. + * + * Keep the draw call indirect and compute minimums & maximums, + * which will determine the user buffer bounds to upload, but + * the driver will not look at these values because indirect != NULL. + * + * This efficiently processes the multidraw with the time complexity + * equal to 1 draw call. + */ + new_info.start = ~0u; + new_info.start_instance = ~0u; + unsigned end_vertex = 0; + unsigned end_instance = 0; + + for (unsigned i = 0; i < draw_count; i++) { + unsigned offset = i * indirect->stride / 4; + unsigned start = data[offset + 2]; + unsigned count = data[offset + 0]; + unsigned start_instance = data[offset + 3]; + unsigned instance_count = data[offset + 1]; + + new_info.start = MIN2(new_info.start, start); + new_info.start_instance = MIN2(new_info.start_instance, + start_instance); + + end_vertex = MAX2(end_vertex, start + count); + end_instance = MAX2(end_instance, start_instance + instance_count); + } + + /* Set the final counts. */ + new_info.count = end_vertex - new_info.start; + new_info.instance_count = end_instance - new_info.start_instance; + + if (new_info.start == ~0u || !new_info.count || !new_info.instance_count) + return; + } } if (new_info.index_size) { @@ -1211,7 +1366,8 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info) * We would have to break this drawing operation into several ones. */ /* Use some heuristic to see if unrolling indices improves * performance. */ - if (!new_info.primitive_restart && + if (!info->indirect && + !new_info.primitive_restart && num_vertices > new_info.count*2 && num_vertices - new_info.count > 32 && !u_vbuf_mapping_vertex_buffer_blocks(mgr)) {