u_vbuf: implement another upload codepath which unrolls indices
author Marek Olšák <maraeo@gmail.com>
Tue, 3 Jan 2012 21:01:03 +0000 (22:01 +0100)
committer Marek Olšák <maraeo@gmail.com>
Thu, 5 Jan 2012 17:29:11 +0000 (18:29 +0100)
Improves performance from about 1 fps to 23 fps in Cogs.
This new codepath is not always used; instead, a heuristic
determines whether to use it. Using translate for uploads is generally
slower than the upload path we already had, so it's a win only in a few cases.

src/gallium/auxiliary/util/u_vbuf.c
src/gallium/auxiliary/util/u_vbuf.h
src/gallium/drivers/r600/r600_state_common.c

index 5dfee42bcbda309a9b766a09f686d82fde4622d7..08f5c627a25eea306d7de418006c98fd6b7089b2 100644 (file)
@@ -166,7 +166,9 @@ void u_vbuf_destroy(struct u_vbuf *mgrb)
 static void
 u_vbuf_translate_buffers(struct u_vbuf_priv *mgr, struct translate_key *key,
                          unsigned vb_mask, unsigned out_vb,
-                         int start_vertex, unsigned num_vertices)
+                         int start_vertex, unsigned num_vertices,
+                         int start_index, unsigned num_indices, int min_index,
+                         bool unroll_indices)
 {
    struct translate *tr;
    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {0};
@@ -198,21 +200,65 @@ u_vbuf_translate_buffers(struct u_vbuf_priv *mgr, struct translate_key *key,
                                         PIPE_TRANSFER_READ, &vb_transfer[i]);
          }
 
+         /* Subtract min_index so that indexing with the index buffer works. */
+         if (unroll_indices) {
+            map -= vb->stride * min_index;
+         }
+
          tr->set_buffer(tr, i, map, vb->stride, ~0);
       }
    }
 
-   /* Create and map the output buffer. */
-   u_upload_alloc(mgr->b.uploader,
-                  key->output_stride * start_vertex,
-                  key->output_stride * num_vertices,
-                  &out_offset, &out_buffer,
-                  (void**)&out_map);
+   /* Translate. */
+   if (unroll_indices) {
+      struct pipe_index_buffer *ib = &mgr->b.index_buffer;
+      struct pipe_transfer *transfer = NULL;
+      unsigned offset = ib->offset + start_index * ib->index_size;
+      uint8_t *map;
 
-   out_offset -= key->output_stride * start_vertex;
+      assert(ib->buffer && ib->index_size);
 
-   /* Translate. */
-   tr->run(tr, 0, num_vertices, 0, out_map);
+      if (u_vbuf_resource(ib->buffer)->user_ptr) {
+         map = u_vbuf_resource(ib->buffer)->user_ptr + offset;
+      } else {
+         map = pipe_buffer_map_range(mgr->pipe, ib->buffer, offset,
+                                     num_indices * ib->index_size,
+                                     PIPE_TRANSFER_READ, &transfer);
+      }
+
+      /* Create and map the output buffer. */
+      u_upload_alloc(mgr->b.uploader, 0,
+                     key->output_stride * num_indices,
+                     &out_offset, &out_buffer,
+                     (void**)&out_map);
+
+      switch (ib->index_size) {
+      case 4:
+         tr->run_elts(tr, (unsigned*)map, num_indices, 0, out_map);
+         break;
+      case 2:
+         tr->run_elts16(tr, (uint16_t*)map, num_indices, 0, out_map);
+         break;
+      case 1:
+         tr->run_elts8(tr, map, num_indices, 0, out_map);
+         break;
+      }
+
+      if (transfer) {
+         pipe_buffer_unmap(mgr->pipe, transfer);
+      }
+   } else {
+      /* Create and map the output buffer. */
+      u_upload_alloc(mgr->b.uploader,
+                     key->output_stride * start_vertex,
+                     key->output_stride * num_vertices,
+                     &out_offset, &out_buffer,
+                     (void**)&out_map);
+
+      out_offset -= key->output_stride * start_vertex;
+
+      tr->run(tr, 0, num_vertices, 0, out_map);
+   }
 
    /* Unmap all buffers. */
    for (i = 0; i < mgr->b.nr_vertex_buffers; i++) {
@@ -283,7 +329,9 @@ u_vbuf_translate_find_free_vb_slots(struct u_vbuf_priv *mgr,
 static boolean
 u_vbuf_translate_begin(struct u_vbuf_priv *mgr,
                        int start_vertex, unsigned num_vertices,
-                       int start_instance, unsigned num_instances)
+                       int start_instance, unsigned num_instances,
+                       int start_index, unsigned num_indices, int min_index,
+                       bool unroll_indices)
 {
    unsigned mask[VB_NUM] = {0};
    struct translate_key key[VB_NUM];
@@ -310,16 +358,24 @@ u_vbuf_translate_begin(struct u_vbuf_priv *mgr,
    for (i = 0; i < mgr->ve->count; i++) {
       unsigned vb_index = mgr->ve->ve[i].vertex_buffer_index;
 
-      if (!mgr->ve->incompatible_layout_elem[i] &&
-          !mgr->incompatible_vb[vb_index]) {
-         continue;
-      }
-
       if (!mgr->b.vertex_buffer[vb_index].stride) {
+         if (!mgr->ve->incompatible_layout_elem[i] &&
+             !mgr->incompatible_vb[vb_index]) {
+            continue;
+         }
          mask[VB_CONST] |= 1 << vb_index;
       } else if (mgr->ve->ve[i].instance_divisor) {
+         if (!mgr->ve->incompatible_layout_elem[i] &&
+             !mgr->incompatible_vb[vb_index]) {
+            continue;
+         }
          mask[VB_INSTANCE] |= 1 << vb_index;
       } else {
+         if (!unroll_indices &&
+             !mgr->ve->incompatible_layout_elem[i] &&
+             !mgr->incompatible_vb[vb_index]) {
+            continue;
+         }
          mask[VB_VERTEX] |= 1 << vb_index;
       }
    }
@@ -336,15 +392,16 @@ u_vbuf_translate_begin(struct u_vbuf_priv *mgr,
       struct translate_key *k;
       struct translate_element *te;
       unsigned bit, vb_index = mgr->ve->ve[i].vertex_buffer_index;
+      bit = 1 << vb_index;
 
       if (!mgr->ve->incompatible_layout_elem[i] &&
-          !mgr->incompatible_vb[vb_index]) {
+          !mgr->incompatible_vb[vb_index] &&
+          (!unroll_indices || !(mask[VB_VERTEX] & bit))) {
          continue;
       }
 
       /* Set type to what we will translate.
        * Whether vertex, instance, or constant attribs. */
-      bit = 1 << vb_index;
       for (type = 0; type < VB_NUM; type++) {
          if (mask[type] & bit) {
             break;
@@ -376,7 +433,9 @@ u_vbuf_translate_begin(struct u_vbuf_priv *mgr,
       if (key[type].nr_elements) {
          u_vbuf_translate_buffers(mgr, &key[type], mask[type],
                                   mgr->fallback_vbs[type],
-                                  start[type], num[type]);
+                                  start[type], num[type],
+                                  start_index, num_indices, min_index,
+                                  unroll_indices && type == VB_VERTEX);
 
          /* Fixup the stride for constant attribs. */
          if (type == VB_CONST) {
@@ -917,11 +976,12 @@ static void u_vbuf_get_minmax_index(struct pipe_context *pipe,
 
 enum u_vbuf_return_flags
 u_vbuf_draw_begin(struct u_vbuf *mgrb,
-                  const struct pipe_draw_info *info)
+                  struct pipe_draw_info *info)
 {
    struct u_vbuf_priv *mgr = (struct u_vbuf_priv*)mgrb;
-   int start_vertex;
+   int start_vertex, min_index;
    unsigned num_vertices;
+   bool unroll_indices = false;
 
    if (!mgr->incompatible_vb_layout &&
        !mgr->ve->incompatible_layout &&
@@ -930,7 +990,7 @@ u_vbuf_draw_begin(struct u_vbuf *mgrb,
    }
 
    if (info->indexed) {
-      int min_index, max_index;
+      int max_index;
       bool index_bounds_valid = false;
 
       if (info->max_index != ~0) {
@@ -950,6 +1010,17 @@ u_vbuf_draw_begin(struct u_vbuf *mgrb,
 
          start_vertex = min_index + info->index_bias;
          num_vertices = max_index + 1 - min_index;
+
+         /* Primitive restart doesn't work when unrolling indices.
+          * We would have to break this drawing operation into several ones. */
+         /* Use some heuristic to see if unrolling indices improves
+          * performance. */
+         if (!info->primitive_restart &&
+             num_vertices > info->count*2 &&
+             num_vertices-info->count > 32) {
+            /*printf("num_vertices=%i count=%i\n", num_vertices, info->count);*/
+            unroll_indices = true;
+         }
       } else {
          /* Nothing to do for per-vertex attribs. */
          start_vertex = 0;
@@ -959,13 +1030,18 @@ u_vbuf_draw_begin(struct u_vbuf *mgrb,
    } else {
       start_vertex = info->start;
       num_vertices = info->count;
+      min_index = 0;
    }
 
    /* Translate vertices with non-native layouts or formats. */
-   if (mgr->incompatible_vb_layout || mgr->ve->incompatible_layout) {
+   if (unroll_indices ||
+       mgr->incompatible_vb_layout ||
+       mgr->ve->incompatible_layout) {
       /* XXX check the return value */
       u_vbuf_translate_begin(mgr, start_vertex, num_vertices,
-                             info->start_instance, info->instance_count);
+                             info->start_instance, info->instance_count,
+                             info->start, info->count, min_index,
+                             unroll_indices);
    }
 
    /* Upload user buffers. */
@@ -974,7 +1050,15 @@ u_vbuf_draw_begin(struct u_vbuf *mgrb,
                             info->start_instance, info->instance_count);
    }
 
-   /*unsigned i;
+   /*
+   if (unroll_indices) {
+      printf("unrolling indices: start_vertex = %i, num_vertices = %i\n",
+             start_vertex, num_vertices);
+      util_dump_draw_info(stdout, info);
+      printf("\n");
+   }
+
+   unsigned i;
    for (i = 0; i < mgr->b.nr_vertex_buffers; i++) {
       printf("input %i: ", i);
       util_dump_vertex_buffer(stdout, mgr->b.vertex_buffer+i);
@@ -984,7 +1068,16 @@ u_vbuf_draw_begin(struct u_vbuf *mgrb,
       printf("real %i: ", i);
       util_dump_vertex_buffer(stdout, mgr->b.real_vertex_buffer+i);
       printf("\n");
-   }*/
+   }
+   */
+
+   if (unroll_indices) {
+      info->indexed = FALSE;
+      info->index_bias = 0;
+      info->min_index = 0;
+      info->max_index = info->count - 1;
+      info->start = 0;
+   }
 
    return U_VBUF_BUFFERS_UPDATED;
 }
index 57b93ddea6bbadf881502b4a11705586a0ca3bab..3669c9b874ae171e43de2ad4d626c1cf76e77c9f 100644 (file)
@@ -130,7 +130,7 @@ void u_vbuf_set_index_buffer(struct u_vbuf *mgr,
                              const struct pipe_index_buffer *ib);
 
 enum u_vbuf_return_flags u_vbuf_draw_begin(struct u_vbuf *mgr,
-                                           const struct pipe_draw_info *info);
+                                           struct pipe_draw_info *info);
 
 unsigned u_vbuf_draw_max_vertex_count(struct u_vbuf *mgr);
 
index 9f6f5142c09fb931088fcdfd3f78e0aa7f5b5862..054ab90595c3deb9a89b9d0ce21f71c47178a17b 100644 (file)
@@ -605,7 +605,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
 
        r600_update_derived_state(rctx);
 
-       u_vbuf_draw_begin(rctx->vbuf_mgr, dinfo);
+       u_vbuf_draw_begin(rctx->vbuf_mgr, &info);
        r600_vertex_buffer_update(rctx);
 
        rdraw.vgt_num_indices = info.count;