From 1c2e5c223da28cdffe156b6b430fcdf638909021 Mon Sep 17 00:00:00 2001
From: Zack Rusin <zackr@vmware.com>
Date: Thu, 27 Jun 2013 20:40:10 -0400
Subject: [PATCH] draw/translate: fix instancing

We were incorrectly computing the buffer offset when using the
instances. The buffer offset is always equal to:
start_instance * stride + (instance_num / instance_divisor) *
stride
We were completely ignoring the start instance quite
often producing instances that completely wrong, e.g. if
start instance = 5, instance divisor = 2, then on the first
iteration it should be:
5 * stride, not (5/2) * stride as we'd have currently, and if
start instance = 1, instance divisor = 3, then on the first
iteration it should be:
1 * stride, not 0 as we'd have.
This fixes it and adjusts all the code to the changes.

Signed-off-by: Zack Rusin <zackr@vmware.com>
---
 src/gallium/auxiliary/draw/draw_llvm.c        | 18 ++++++++---
 src/gallium/auxiliary/draw/draw_pipe_vbuf.c   |  2 +-
 src/gallium/auxiliary/draw/draw_private.h     |  1 +
 src/gallium/auxiliary/draw/draw_pt.c          |  1 +
 src/gallium/auxiliary/draw/draw_pt_emit.c     |  2 ++
 src/gallium/auxiliary/draw/draw_pt_fetch.c    |  2 ++
 .../auxiliary/draw/draw_pt_fetch_emit.c       |  3 ++
 src/gallium/auxiliary/draw/draw_pt_so_emit.c  | 23 +++++++++++--
 src/gallium/auxiliary/draw/draw_vs_variant.c  |  4 +++
 src/gallium/auxiliary/translate/translate.h   |  4 +++
 .../auxiliary/translate/translate_generic.c   | 17 +++++++---
 .../auxiliary/translate/translate_sse.c       | 32 +++++++++++++++----
 src/gallium/auxiliary/util/u_vbuf.c           |  8 ++---
 src/gallium/drivers/nv30/nv30_push.c          |  8 ++---
 src/gallium/drivers/nv50/nv50_push.c          |  8 ++---
 src/gallium/drivers/nvc0/nvc0_push.c          |  8 ++---
 src/gallium/drivers/nvc0/nvc0_vbo_translate.c |  8 ++---
 17 files changed, 109 insertions(+), 40 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 97b463f4ff8..f9bcadc6cfa 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -674,6 +674,7 @@ generate_vs(struct draw_llvm_variant *variant,
 
 static void
 generate_fetch(struct gallivm_state *gallivm,
+               struct draw_context *draw,
                LLVMValueRef vbuffers_ptr,
                LLVMValueRef *res,
                struct pipe_vertex_element *velem,
@@ -704,10 +705,17 @@ generate_fetch(struct gallivm_state *gallivm,
    struct lp_build_if_state if_ctx;
 
    if (velem->instance_divisor) {
-      /* array index = instance_id / instance_divisor */
-      index = LLVMBuildUDiv(builder, instance_id,
-                            lp_build_const_int32(gallivm, velem->instance_divisor),
-                            "instance_divisor");
+      /* Index is equal to the start instance plus the number of current 
+       * instance divided by the divisor. In this case we compute it as:
+       * index = start_instance + ((instance_id - start_instance) / divisor)
+       */
+      LLVMValueRef current_instance;
+      index = lp_build_const_int32(gallivm, draw->start_instance);
+      current_instance = LLVMBuildSub(builder, instance_id, index, "");
+      current_instance = LLVMBuildUDiv(builder, current_instance,
+                                       lp_build_const_int32(gallivm, velem->instance_divisor),
+                                       "instance_divisor");
+      index = LLVMBuildAdd(builder, index, current_instance, "instance");
    }
 
    stride = lp_build_umul_overflow(gallivm, vb_stride, index, &ofbit);
@@ -1697,7 +1705,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
             LLVMValueRef vb_index =
                lp_build_const_int32(gallivm, velem->vertex_buffer_index);
             LLVMValueRef vb = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, "");
-            generate_fetch(gallivm, vbuffers_ptr,
+            generate_fetch(gallivm, draw, vbuffers_ptr,
                            &aos_attribs[j][i], velem, vb, true_index,
                            system_values.instance_id);
          }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index 578433c2006..d3b38eb2df6 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -138,7 +138,7 @@ emit_vertex( struct vbuf_stage *vbuf,
       /* Note: we really do want data[0] here, not data[pos]: 
        */
       vbuf->translate->set_buffer(vbuf->translate, 0, vertex->data[0], 0, ~0);
-      vbuf->translate->run(vbuf->translate, 0, 1, 0, vbuf->vertex_ptr);
+      vbuf->translate->run(vbuf->translate, 0, 1, 0, 0, vbuf->vertex_ptr);
 
       if (0) draw_dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr);
       
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index fd52c2d6b4c..f42cded118a 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -306,6 +306,7 @@ struct draw_context
    } extra_shader_outputs;
 
    unsigned instance_id;
+   unsigned start_instance;
 
 #ifdef HAVE_LLVM
    struct draw_llvm *llvm;
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index c4d06de84bb..e89ccd25401 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -533,6 +533,7 @@ draw_vbo(struct draw_context *draw,
 
    for (instance = 0; instance < info->instance_count; instance++) {
       draw->instance_id = instance + info->start_instance;
+      draw->start_instance = info->start_instance;
       /* check for overflow */
       if (draw->instance_id < instance ||
           draw->instance_id < info->start_instance) {
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index 4c96d7414ee..fc64048be1b 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -171,6 +171,7 @@ draw_pt_emit(struct pt_emit *emit,
    translate->run(translate,
 		  0,
 		  vertex_count,
+                  draw->start_instance,
                   draw->instance_id,
 		  hw_verts );
 
@@ -234,6 +235,7 @@ draw_pt_emit_linear(struct pt_emit *emit,
    translate->run(translate,
                   0,
                   count,
+                  draw->start_instance,
                   draw->instance_id,
                   hw_verts);
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c
index 3740deab5a8..8716f657777 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -168,6 +168,7 @@ draw_pt_fetch_run(struct pt_fetch *fetch,
    translate->run_elts( translate,
 			elts,
 			count,
+                        draw->start_instance,
                         draw->instance_id,
 			verts );
 }
@@ -195,6 +196,7 @@ draw_pt_fetch_run_linear(struct pt_fetch *fetch,
    translate->run( translate,
                    start,
                    count,
+                   draw->start_instance,
                    draw->instance_id,
                    verts );
 }
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index dc6decba70e..22ec8d6b2b5 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -210,6 +210,7 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
    feme->translate->run_elts( feme->translate,
 			      fetch_elts,
 			      fetch_count,
+                              draw->start_instance,
                               draw->instance_id,
 			      hw_verts );
 
@@ -267,6 +268,7 @@ static void fetch_emit_run_linear( struct draw_pt_middle_end *middle,
    feme->translate->run( feme->translate,
                          start,
                          count,
+                         draw->start_instance,
                          draw->instance_id,
                          hw_verts );
 
@@ -326,6 +328,7 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle,
    feme->translate->run( feme->translate,
                          start,
                          count,
+                         draw->start_instance,
                          draw->instance_id,
                          hw_verts );
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
index d624a990bc6..a6d1da4eb2e 100644
--- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
@@ -182,12 +182,29 @@ static void so_emit_prim(struct pt_so_emit *so,
 
          buffer = (float *)((char *)draw->so.targets[ob]->mapping +
                             draw->so.targets[ob]->target.buffer_offset +
-                            draw->so.targets[ob]->internal_offset) + state->output[slot].dst_offset;
+                            draw->so.targets[ob]->internal_offset) +
+            state->output[slot].dst_offset;
          
          if (idx == so->pos_idx && pcp_ptr)
-            memcpy(buffer, &pre_clip_pos[start_comp], num_comps * sizeof(float));
+            memcpy(buffer, &pre_clip_pos[start_comp],
+                   num_comps * sizeof(float));
          else
-            memcpy(buffer, &input[idx][start_comp], num_comps * sizeof(float));
+            memcpy(buffer, &input[idx][start_comp],
+                   num_comps * sizeof(float));
+#if 0
+         {
+            int j;
+            debug_printf("VERT[%d], offset = %d, slot[%d] sc = %d, num_c = %d, idx = %d = [",
+                         i + draw->so.targets[ob]->emitted_vertices,
+                         draw->so.targets[ob]->internal_offset,
+                         slot, start_comp, num_comps, idx);
+            for (j = 0; j < num_comps; ++j) {
+               unsigned *ubuffer = (unsigned*)buffer;
+               debug_printf("%d (0x%x), ", ubuffer[j], ubuffer[j]);
+            }
+            debug_printf("]\n");
+         }
+#endif
       }
       for (ob = 0; ob < draw->so.num_targets; ++ob) {
          struct draw_so_target *target = draw->so.targets[ob];
diff --git a/src/gallium/auxiliary/draw/draw_vs_variant.c b/src/gallium/auxiliary/draw/draw_vs_variant.c
index 152c1303183..37500c7db8d 100644
--- a/src/gallium/auxiliary/draw/draw_vs_variant.c
+++ b/src/gallium/auxiliary/draw/draw_vs_variant.c
@@ -168,6 +168,7 @@ static void PIPE_CDECL vsvg_run_elts( struct draw_vs_variant *variant,
    vsvg->fetch->run_elts( vsvg->fetch, 
                           elts,
                           count,
+                          vsvg->draw->start_instance,
                           vsvg->draw->instance_id,
                           temp_buffer );
 
@@ -211,6 +212,7 @@ static void PIPE_CDECL vsvg_run_elts( struct draw_vs_variant *variant,
 
    vsvg->emit->run( vsvg->emit,
                     0, count,
+                    vsvg->draw->start_instance,
                     vsvg->draw->instance_id,
                     output_buffer );
 
@@ -234,6 +236,7 @@ static void PIPE_CDECL vsvg_run_linear( struct draw_vs_variant *variant,
    vsvg->fetch->run( vsvg->fetch, 
                      start,
                      count,
+                     vsvg->draw->start_instance,
                      vsvg->draw->instance_id,
                      temp_buffer );
 
@@ -274,6 +277,7 @@ static void PIPE_CDECL vsvg_run_linear( struct draw_vs_variant *variant,
    
    vsvg->emit->run( vsvg->emit,
                     0, count,
+                    vsvg->draw->start_instance,
                     vsvg->draw->instance_id,
                     output_buffer );
 
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
index 850ef39ef21..1132114de9d 100644
--- a/src/gallium/auxiliary/translate/translate.h
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -74,24 +74,28 @@ struct translate;
 typedef void (PIPE_CDECL *run_elts_func)(struct translate *,
                                          const unsigned *elts,
                                          unsigned count,
+                                         unsigned start_instance,
                                          unsigned instance_id,
                                          void *output_buffer);
 
 typedef void (PIPE_CDECL *run_elts16_func)(struct translate *,
                                            const uint16_t *elts,
                                            unsigned count,
+                                           unsigned start_instance,
                                            unsigned instance_id,
                                            void *output_buffer);
 
 typedef void (PIPE_CDECL *run_elts8_func)(struct translate *,
                                           const uint8_t *elts,
                                           unsigned count,
+                                          unsigned start_instance,
                                           unsigned instance_id,
                                           void *output_buffer);
 
 typedef void (PIPE_CDECL *run_func)(struct translate *,
                                     unsigned start,
                                     unsigned count,
+                                    unsigned start_instance,
                                     unsigned instance_id,
                                     void *output_buffer);
 
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index 894c1684813..96e35b0eb41 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -607,6 +607,7 @@ static emit_func get_emit_func( enum pipe_format format )
 
 static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *tg,
                                          unsigned elt,
+                                         unsigned start_instance,
                                          unsigned instance_id,
                                          void *vert )
 {
@@ -623,7 +624,9 @@ static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *
          int copy_size;
 
          if (tg->attrib[attr].instance_divisor) {
-            index = instance_id / tg->attrib[attr].instance_divisor;
+            index = start_instance;
+            index += (instance_id - start_instance) /
+               tg->attrib[attr].instance_divisor;
             /* XXX we need to clamp the index here too, but to a
              * per-array max value, not the draw->pt.max_index value
              * that's being given to us via translate->set_buffer().
@@ -674,6 +677,7 @@ static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *
 static void PIPE_CDECL generic_run_elts( struct translate *translate,
                                          const unsigned *elts,
                                          unsigned count,
+                                         unsigned start_instance,
                                          unsigned instance_id,
                                          void *output_buffer )
 {
@@ -682,7 +686,7 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
    unsigned i;
 
    for (i = 0; i < count; i++) {
-      generic_run_one(tg, *elts++, instance_id, vert);
+      generic_run_one(tg, *elts++, start_instance, instance_id, vert);
       vert += tg->translate.key.output_stride;
    }
 }
@@ -690,6 +694,7 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
 static void PIPE_CDECL generic_run_elts16( struct translate *translate,
                                          const uint16_t *elts,
                                          unsigned count,
+                                         unsigned start_instance,
                                          unsigned instance_id,
                                          void *output_buffer )
 {
@@ -698,7 +703,7 @@ static void PIPE_CDECL generic_run_elts16( struct translate *translate,
    unsigned i;
 
    for (i = 0; i < count; i++) {
-      generic_run_one(tg, *elts++, instance_id, vert);
+      generic_run_one(tg, *elts++, start_instance, instance_id, vert);
       vert += tg->translate.key.output_stride;
    }
 }
@@ -706,6 +711,7 @@ static void PIPE_CDECL generic_run_elts16( struct translate *translate,
 static void PIPE_CDECL generic_run_elts8( struct translate *translate,
                                          const uint8_t *elts,
                                          unsigned count,
+                                         unsigned start_instance, 
                                          unsigned instance_id,
                                          void *output_buffer )
 {
@@ -714,7 +720,7 @@ static void PIPE_CDECL generic_run_elts8( struct translate *translate,
    unsigned i;
 
    for (i = 0; i < count; i++) {
-      generic_run_one(tg, *elts++, instance_id, vert);
+      generic_run_one(tg, *elts++, start_instance, instance_id, vert);
       vert += tg->translate.key.output_stride;
    }
 }
@@ -722,6 +728,7 @@ static void PIPE_CDECL generic_run_elts8( struct translate *translate,
 static void PIPE_CDECL generic_run( struct translate *translate,
                                     unsigned start,
                                     unsigned count,
+                                    unsigned start_instance,
                                     unsigned instance_id,
                                     void *output_buffer )
 {
@@ -730,7 +737,7 @@ static void PIPE_CDECL generic_run( struct translate *translate,
    unsigned i;
 
    for (i = 0; i < count; i++) {
-      generic_run_one(tg, start + i, instance_id, vert);
+      generic_run_one(tg, start + i, start_instance, instance_id, vert);
       vert += tg->translate.key.output_stride;
    }
 }
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index c2dd42db96e..a4f7b243c13 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -112,6 +112,7 @@ struct translate_sse {
 
    boolean use_instancing;
    unsigned instance_id;
+   unsigned start_instance;
 
    /* these are actually known values, but putting them in a struct
     * like this is helpful to keep them in sync across the file.
@@ -1061,6 +1062,8 @@ static boolean init_inputs( struct translate_sse *p,
    unsigned i;
    struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
                                               get_offset(p, &p->instance_id));
+   struct x86_reg start_instance = x86_make_disp(p->machine_EDI,
+                                                 get_offset(p, &p->start_instance));
 
    for (i = 0; i < p->nr_buffer_variants; i++) {
       struct translate_buffer_variant *variant = &p->buffer_variant[i];
@@ -1082,7 +1085,8 @@ static boolean init_inputs( struct translate_sse *p,
           *   base_ptr + stride * index, where index depends on instance divisor
           */
          if (variant->instance_divisor) {
-            /* Our index is instance ID divided by instance divisor.
+            /* Start with instance = instance_id
+             * which is true if divisor is 1.
              */
             x86_mov(p->func, tmp_EAX, instance_id);
 
@@ -1090,13 +1094,22 @@ static boolean init_inputs( struct translate_sse *p,
                struct x86_reg tmp_EDX = p->tmp2_EDX;
                struct x86_reg tmp_ECX = p->src_ECX;
 
+               /* instance_num = instance_id - start_instance */
+               x86_mov(p->func, tmp_EDX, start_instance);
+               x86_sub(p->func, tmp_EAX, tmp_EDX);
+
                /* TODO: Add x86_shr() to rtasm and use it whenever
                 *       instance divisor is power of two.
                 */
-
                x86_xor(p->func, tmp_EDX, tmp_EDX);
                x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
                x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
+
+               /* instance = (instance_id - start_instance) / divisor + 
+                *             start_instance 
+                */
+               x86_mov(p->func, tmp_EDX, start_instance);
+               x86_add(p->func, tmp_EAX, tmp_EDX);
             }
 
             /* XXX we need to clamp the index here too, but to a
@@ -1312,16 +1325,23 @@ static boolean build_vertex_emit( struct translate_sse *p,
    x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
 
    if(x86_target(p->func) != X86_32)
-      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
+      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
    else
-      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
+      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
 
    /* Load instance ID.
     */
-   if (p->use_instancing) {
+   if (p->use_instancing) {      
       x86_mov(p->func,
-              p->tmp_EAX,
+              p->tmp2_EDX,
               x86_fn_arg(p->func, 4));
+      x86_mov(p->func,
+              x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance)),
+              p->tmp2_EDX);
+
+      x86_mov(p->func,
+              p->tmp_EAX,
+              x86_fn_arg(p->func, 5));
       x86_mov(p->func,
               x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
               p->tmp_EAX);
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index 5936f74a039..52b360ed7aa 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -403,13 +403,13 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
 
       switch (ib->index_size) {
       case 4:
-         tr->run_elts(tr, (unsigned*)map, num_indices, 0, out_map);
+         tr->run_elts(tr, (unsigned*)map, num_indices, 0, 0, out_map);
          break;
       case 2:
-         tr->run_elts16(tr, (uint16_t*)map, num_indices, 0, out_map);
+         tr->run_elts16(tr, (uint16_t*)map, num_indices, 0, 0, out_map);
          break;
       case 1:
-         tr->run_elts8(tr, map, num_indices, 0, out_map);
+         tr->run_elts8(tr, map, num_indices, 0, 0, out_map);
          break;
       }
 
@@ -428,7 +428,7 @@ u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
 
       out_offset -= key->output_stride * start_vertex;
 
-      tr->run(tr, 0, num_vertices, 0, out_map);
+      tr->run(tr, 0, num_vertices, 0, 0, out_map);
    }
 
    /* Unmap all buffers. */
diff --git a/src/gallium/drivers/nv30/nv30_push.c b/src/gallium/drivers/nv30/nv30_push.c
index 854d2d7222a..d37af2c7b2b 100644
--- a/src/gallium/drivers/nv30/nv30_push.c
+++ b/src/gallium/drivers/nv30/nv30_push.c
@@ -99,7 +99,7 @@ emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
 
       BEGIN_NI04(ctx->push, NV30_3D(VERTEX_DATA), size);
 
-      ctx->translate->run_elts8(ctx->translate, elts, nr, 0, ctx->push->cur);
+      ctx->translate->run_elts8(ctx->translate, elts, nr, 0, 0, ctx->push->cur);
 
       ctx->push->cur += size;
       count -= nr;
@@ -131,7 +131,7 @@ emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
 
       BEGIN_NI04(ctx->push, NV30_3D(VERTEX_DATA), size);
 
-      ctx->translate->run_elts16(ctx->translate, elts, nr, 0, ctx->push->cur);
+      ctx->translate->run_elts16(ctx->translate, elts, nr, 0, 0, ctx->push->cur);
 
       ctx->push->cur += size;
       count -= nr;
@@ -163,7 +163,7 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
 
       BEGIN_NI04(ctx->push, NV30_3D(VERTEX_DATA), size);
 
-      ctx->translate->run_elts(ctx->translate, elts, nr, 0, ctx->push->cur);
+      ctx->translate->run_elts(ctx->translate, elts, nr, 0, 0, ctx->push->cur);
 
       ctx->push->cur += size;
       count -= nr;
@@ -187,7 +187,7 @@ emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
 
       BEGIN_NI04(ctx->push, NV30_3D(VERTEX_DATA), size);
 
-      ctx->translate->run(ctx->translate, start, push, 0, ctx->push->cur);
+      ctx->translate->run(ctx->translate, start, push, 0, 0, ctx->push->cur);
       ctx->push->cur += size;
       count -= push;
       start += push;
diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c
index eb6bfbccfa7..73804328138 100644
--- a/src/gallium/drivers/nv50/nv50_push.c
+++ b/src/gallium/drivers/nv50/nv50_push.c
@@ -76,7 +76,7 @@ emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
 
       BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
 
-      ctx->translate->run_elts8(ctx->translate, elts, nr, ctx->instance_id,
+      ctx->translate->run_elts8(ctx->translate, elts, nr, 0, ctx->instance_id,
                                 ctx->push->cur);
 
       ctx->push->cur += size;
@@ -109,7 +109,7 @@ emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
 
       BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
 
-      ctx->translate->run_elts16(ctx->translate, elts, nr, ctx->instance_id,
+      ctx->translate->run_elts16(ctx->translate, elts, nr, 0, ctx->instance_id,
                                  ctx->push->cur);
 
       ctx->push->cur += size;
@@ -142,7 +142,7 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
 
       BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
 
-      ctx->translate->run_elts(ctx->translate, elts, nr, ctx->instance_id,
+      ctx->translate->run_elts(ctx->translate, elts, nr, 0, ctx->instance_id,
                                ctx->push->cur);
 
       ctx->push->cur += size;
@@ -167,7 +167,7 @@ emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
 
       BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
 
-      ctx->translate->run(ctx->translate, start, push, ctx->instance_id,
+      ctx->translate->run(ctx->translate, start, push, 0, ctx->instance_id,
                           ctx->push->cur);
       ctx->push->cur += size;
       count -= push;
diff --git a/src/gallium/drivers/nvc0/nvc0_push.c b/src/gallium/drivers/nvc0/nvc0_push.c
index 78cc5c20641..75caf8a895c 100644
--- a/src/gallium/drivers/nvc0/nvc0_push.c
+++ b/src/gallium/drivers/nvc0/nvc0_push.c
@@ -137,7 +137,7 @@ emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
 
       BEGIN_NIC0(ctx->push, NVC0_3D(VERTEX_DATA), size);
 
-      ctx->translate->run_elts8(ctx->translate, elts, nr, ctx->instance_id,
+      ctx->translate->run_elts8(ctx->translate, elts, nr, 0, ctx->instance_id,
                                 ctx->push->cur);
       ctx->push->cur += size;
 
@@ -178,7 +178,7 @@ emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
 
       BEGIN_NIC0(ctx->push, NVC0_3D(VERTEX_DATA), size);
 
-      ctx->translate->run_elts16(ctx->translate, elts, nr, ctx->instance_id,
+      ctx->translate->run_elts16(ctx->translate, elts, nr, 0, ctx->instance_id,
                                  ctx->push->cur);
       ctx->push->cur += size;
 
@@ -219,7 +219,7 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
 
       BEGIN_NIC0(ctx->push, NVC0_3D(VERTEX_DATA), size);
 
-      ctx->translate->run_elts(ctx->translate, elts, nr, ctx->instance_id,
+      ctx->translate->run_elts(ctx->translate, elts, nr, 0, ctx->instance_id,
                                ctx->push->cur);
       ctx->push->cur += size;
 
@@ -252,7 +252,7 @@ emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
 
       BEGIN_NIC0(ctx->push, NVC0_3D(VERTEX_DATA), size);
 
-      ctx->translate->run(ctx->translate, start, push, ctx->instance_id,
+      ctx->translate->run(ctx->translate, start, push, 0, ctx->instance_id,
                           ctx->push->cur);
       ctx->push->cur += size;
 
diff --git a/src/gallium/drivers/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nvc0/nvc0_vbo_translate.c
index ec52de2b568..6ea4220ea60 100644
--- a/src/gallium/drivers/nvc0/nvc0_vbo_translate.c
+++ b/src/gallium/drivers/nvc0/nvc0_vbo_translate.c
@@ -215,7 +215,7 @@ disp_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
       if (unlikely(ctx->prim_restart))
          nR = prim_restart_search_i08(elts, nR, ctx->restart_index);
 
-      translate->run_elts8(translate, elts, nR, ctx->instance_id, ctx->dest);
+      translate->run_elts8(translate, elts, nR, 0, ctx->instance_id, ctx->dest);
       count -= nR;
       ctx->dest += nR * ctx->vertex_size;
 
@@ -271,7 +271,7 @@ disp_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
       if (unlikely(ctx->prim_restart))
          nR = prim_restart_search_i16(elts, nR, ctx->restart_index);
 
-      translate->run_elts16(translate, elts, nR, ctx->instance_id, ctx->dest);
+      translate->run_elts16(translate, elts, nR, 0, ctx->instance_id, ctx->dest);
       count -= nR;
       ctx->dest += nR * ctx->vertex_size;
 
@@ -327,7 +327,7 @@ disp_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
       if (unlikely(ctx->prim_restart))
          nR = prim_restart_search_i32(elts, nR, ctx->restart_index);
 
-      translate->run_elts(translate, elts, nR, ctx->instance_id, ctx->dest);
+      translate->run_elts(translate, elts, nR, 0, ctx->instance_id, ctx->dest);
       count -= nR;
       ctx->dest += nR * ctx->vertex_size;
 
@@ -376,7 +376,7 @@ disp_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
    struct translate *translate = ctx->translate;
    unsigned pos = 0;
 
-   translate->run(translate, start, count, ctx->instance_id, ctx->dest);
+   translate->run(translate, start, count, 0, ctx->instance_id, ctx->dest);
    do {
       unsigned nr = count;
 
-- 
2.30.2