X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fsvga%2Fsvga_draw.c;h=6de233888f6f32e16e9f735c2fec9cf4c7885580;hb=28feb63580e94085dd47d5391f9f6f20d69eea6c;hp=41e0e1adc2f040230e5cb62476bc136105e5a3d0;hpb=105ef87842d4ba82dc0235ec154e662cce56a927;p=mesa.git diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c index 41e0e1adc2f..6de233888f6 100644 --- a/src/gallium/drivers/svga/svga_draw.c +++ b/src/gallium/drivers/svga/svga_draw.c @@ -26,34 +26,34 @@ #include "pipe/p_compiler.h" #include "util/u_inlines.h" #include "pipe/p_defines.h" +#include "util/u_helpers.h" #include "util/u_memory.h" #include "util/u_math.h" -#include "util/u_upload_mgr.h" #include "svga_context.h" #include "svga_draw.h" #include "svga_draw_private.h" #include "svga_debug.h" #include "svga_screen.h" +#include "svga_resource.h" #include "svga_resource_buffer.h" #include "svga_resource_texture.h" +#include "svga_shader.h" #include "svga_surface.h" #include "svga_winsys.h" #include "svga_cmd.h" -struct svga_hwtnl *svga_hwtnl_create( struct svga_context *svga, - struct u_upload_mgr *upload_ib, - struct svga_winsys_context *swc ) +struct svga_hwtnl * +svga_hwtnl_create(struct svga_context *svga) { struct svga_hwtnl *hwtnl = CALLOC_STRUCT(svga_hwtnl); - if (hwtnl == NULL) + if (!hwtnl) goto fail; hwtnl->svga = svga; - hwtnl->upload_ib = upload_ib; - - hwtnl->cmd.swc = swc; + + hwtnl->cmd.swc = svga->swc; return hwtnl; @@ -61,70 +61,98 @@ fail: return NULL; } -void svga_hwtnl_destroy( struct svga_hwtnl *hwtnl ) + +void +svga_hwtnl_destroy(struct svga_hwtnl *hwtnl) { - int i, j; + unsigned i, j; for (i = 0; i < PIPE_PRIM_MAX; i++) { for (j = 0; j < IDX_CACHE_MAX; j++) { - pipe_resource_reference( &hwtnl->index_cache[i][j].buffer, - NULL ); + pipe_resource_reference(&hwtnl->index_cache[i][j].buffer, NULL); } } - for (i = 0; i < hwtnl->cmd.vdecl_count; i++) - pipe_resource_reference(&hwtnl->cmd.vdecl_vb[i], NULL); + for (i = 0; i < hwtnl->cmd.vbuf_count; i++) + pipe_resource_reference(&hwtnl->cmd.vbufs[i].buffer, NULL); for (i = 0; i < hwtnl->cmd.prim_count; i++) pipe_resource_reference(&hwtnl->cmd.prim_ib[i], NULL); - FREE(hwtnl); } -void svga_hwtnl_set_flatshade( struct svga_hwtnl *hwtnl, - boolean flatshade, - boolean flatshade_first ) +void +svga_hwtnl_set_flatshade(struct svga_hwtnl *hwtnl, + boolean flatshade, boolean flatshade_first) { - hwtnl->hw_pv = PV_FIRST; + struct svga_screen *svgascreen = svga_screen(hwtnl->svga->pipe.screen); + + /* User-specified PV */ hwtnl->api_pv = (flatshade && !flatshade_first) ? 
PV_LAST : PV_FIRST; -} -void svga_hwtnl_set_unfilled( struct svga_hwtnl *hwtnl, - unsigned mode ) + /* Device supported PV */ + if (svgascreen->haveProvokingVertex) { + /* use the mode specified by the user */ + hwtnl->hw_pv = hwtnl->api_pv; + } + else { + /* the device only support first provoking vertex */ + hwtnl->hw_pv = PV_FIRST; + } +} + + +void +svga_hwtnl_set_fillmode(struct svga_hwtnl *hwtnl, unsigned mode) { hwtnl->api_fillmode = mode; -} +} -void svga_hwtnl_reset_vdecl( struct svga_hwtnl *hwtnl, - unsigned count ) -{ - unsigned i; +void +svga_hwtnl_vertex_decls(struct svga_hwtnl *hwtnl, + unsigned count, + const SVGA3dVertexDecl * decls, + const unsigned *buffer_indexes, + SVGA3dElementLayoutId layout_id) +{ assert(hwtnl->cmd.prim_count == 0); - - for (i = count; i < hwtnl->cmd.vdecl_count; i++) { - pipe_resource_reference(&hwtnl->cmd.vdecl_vb[i], - NULL); - } - hwtnl->cmd.vdecl_count = count; + hwtnl->cmd.vdecl_layout_id = layout_id; + memcpy(hwtnl->cmd.vdecl, decls, count * sizeof(*decls)); + memcpy(hwtnl->cmd.vdecl_buffer_index, buffer_indexes, + count * sizeof(unsigned)); } -void svga_hwtnl_vdecl( struct svga_hwtnl *hwtnl, - unsigned i, - const SVGA3dVertexDecl *decl, - struct pipe_resource *vb) +/** + * Specify vertex buffers for hardware drawing. + */ +void +svga_hwtnl_vertex_buffers(struct svga_hwtnl *hwtnl, + unsigned count, struct pipe_vertex_buffer *buffers) { - assert(hwtnl->cmd.prim_count == 0); + struct pipe_vertex_buffer *dst = hwtnl->cmd.vbufs; + const struct pipe_vertex_buffer *src = buffers; + unsigned i; - assert( i < hwtnl->cmd.vdecl_count ); + for (i = 0; i < count; i++) { + pipe_resource_reference(&dst[i].buffer, src[i].buffer); + dst[i].user_buffer = src[i].user_buffer; + dst[i].stride = src[i].stride; + dst[i].buffer_offset = src[i].buffer_offset; + } - hwtnl->cmd.vdecl[i] = *decl; + /* release old buffer references */ + for ( ; i < hwtnl->cmd.vbuf_count; i++) { + pipe_resource_reference(&dst[i].buffer, NULL); + dst[i].user_buffer = NULL; /* just to be safe */ + /* don't bother zeroing stride/offset fields */ + } - pipe_resource_reference(&hwtnl->cmd.vdecl_vb[i], vb); + hwtnl->cmd.vbuf_count = count; } @@ -133,8 +161,8 @@ void svga_hwtnl_vdecl( struct svga_hwtnl *hwtnl, * for which no commands have been written yet. 
*/ boolean -svga_hwtnl_is_buffer_referred( struct svga_hwtnl *hwtnl, - struct pipe_resource *buffer) +svga_hwtnl_is_buffer_referred(struct svga_hwtnl *hwtnl, + struct pipe_resource *buffer) { unsigned i; @@ -146,8 +174,8 @@ svga_hwtnl_is_buffer_referred( struct svga_hwtnl *hwtnl, return FALSE; } - for (i = 0; i < hwtnl->cmd.vdecl_count; ++i) { - if (hwtnl->cmd.vdecl_vb[i] == buffer) { + for (i = 0; i < hwtnl->cmd.vbuf_count; ++i) { + if (hwtnl->cmd.vbufs[i].buffer == buffer) { return TRUE; } } @@ -162,121 +190,590 @@ svga_hwtnl_is_buffer_referred( struct svga_hwtnl *hwtnl, } -enum pipe_error -svga_hwtnl_flush( struct svga_hwtnl *hwtnl ) +static enum pipe_error +draw_vgpu9(struct svga_hwtnl *hwtnl) { struct svga_winsys_context *swc = hwtnl->cmd.swc; struct svga_context *svga = hwtnl->svga; enum pipe_error ret; + struct svga_winsys_surface *vb_handle[SVGA3D_INPUTREG_MAX]; + struct svga_winsys_surface *ib_handle[QSZ]; + struct svga_winsys_surface *handle; + SVGA3dVertexDecl *vdecl; + SVGA3dPrimitiveRange *prim; + unsigned i; - if (hwtnl->cmd.prim_count) { - struct svga_winsys_surface *vb_handle[SVGA3D_INPUTREG_MAX]; - struct svga_winsys_surface *ib_handle[QSZ]; - struct svga_winsys_surface *handle; - SVGA3dVertexDecl *vdecl; - SVGA3dPrimitiveRange *prim; - unsigned i; + for (i = 0; i < hwtnl->cmd.vdecl_count; i++) { + unsigned j = hwtnl->cmd.vdecl_buffer_index[i]; + handle = svga_buffer_handle(svga, hwtnl->cmd.vbufs[j].buffer); + if (!handle) + return PIPE_ERROR_OUT_OF_MEMORY; - /* Unmap upload manager vertex buffers */ - u_upload_unmap(svga->upload_vb); + vb_handle[i] = handle; + } - for (i = 0; i < hwtnl->cmd.vdecl_count; i++) { - handle = svga_buffer_handle(svga, hwtnl->cmd.vdecl_vb[i]); - if (handle == NULL) + for (i = 0; i < hwtnl->cmd.prim_count; i++) { + if (hwtnl->cmd.prim_ib[i]) { + handle = svga_buffer_handle(svga, hwtnl->cmd.prim_ib[i]); + if (!handle) return PIPE_ERROR_OUT_OF_MEMORY; + } + else + handle = NULL; - vb_handle[i] = handle; + ib_handle[i] = handle; + } + + if (svga->rebind.flags.rendertargets) { + ret = svga_reemit_framebuffer_bindings(svga); + if (ret != PIPE_OK) { + return ret; + } + } + + if (svga->rebind.flags.texture_samplers) { + ret = svga_reemit_tss_bindings(svga); + if (ret != PIPE_OK) { + return ret; } + } + + if (svga->rebind.flags.vs) { + ret = svga_reemit_vs_bindings(svga); + if (ret != PIPE_OK) { + return ret; + } + } + + if (svga->rebind.flags.fs) { + ret = svga_reemit_fs_bindings(svga); + if (ret != PIPE_OK) { + return ret; + } + } + + SVGA_DBG(DEBUG_DMA, "draw to sid %p, %d prims\n", + svga->curr.framebuffer.cbufs[0] ? + svga_surface(svga->curr.framebuffer.cbufs[0])->handle : NULL, + hwtnl->cmd.prim_count); + + ret = SVGA3D_BeginDrawPrimitives(swc, + &vdecl, + hwtnl->cmd.vdecl_count, + &prim, hwtnl->cmd.prim_count); + if (ret != PIPE_OK) + return ret; + + memcpy(vdecl, + hwtnl->cmd.vdecl, + hwtnl->cmd.vdecl_count * sizeof hwtnl->cmd.vdecl[0]); + + for (i = 0; i < hwtnl->cmd.vdecl_count; i++) { + /* check for 4-byte alignment */ + assert(vdecl[i].array.offset % 4 == 0); + assert(vdecl[i].array.stride % 4 == 0); + + /* Given rangeHint is considered to be relative to indexBias, and + * indexBias varies per primitive, we cannot accurately supply an + * rangeHint when emitting more than one primitive per draw command. 
+ */ + if (hwtnl->cmd.prim_count == 1) { + vdecl[i].rangeHint.first = hwtnl->cmd.min_index[0]; + vdecl[i].rangeHint.last = hwtnl->cmd.max_index[0] + 1; + } + else { + vdecl[i].rangeHint.first = 0; + vdecl[i].rangeHint.last = 0; + } + + swc->surface_relocation(swc, + &vdecl[i].array.surfaceId, + NULL, vb_handle[i], SVGA_RELOC_READ); + } + + memcpy(prim, + hwtnl->cmd.prim, hwtnl->cmd.prim_count * sizeof hwtnl->cmd.prim[0]); + + for (i = 0; i < hwtnl->cmd.prim_count; i++) { + swc->surface_relocation(swc, + &prim[i].indexArray.surfaceId, + NULL, ib_handle[i], SVGA_RELOC_READ); + pipe_resource_reference(&hwtnl->cmd.prim_ib[i], NULL); + } + + SVGA_FIFOCommitAll(swc); + + hwtnl->cmd.prim_count = 0; - /* Unmap upload manager index buffers */ - u_upload_unmap(svga->upload_ib); + return PIPE_OK; +} + + +static SVGA3dSurfaceFormat +xlate_index_format(unsigned indexWidth) +{ + if (indexWidth == 2) { + return SVGA3D_R16_UINT; + } + else if (indexWidth == 4) { + return SVGA3D_R32_UINT; + } + else { + assert(!"Bad indexWidth"); + return SVGA3D_R32_UINT; + } +} - for (i = 0; i < hwtnl->cmd.prim_count; i++) { - if (hwtnl->cmd.prim_ib[i]) { - handle = svga_buffer_handle(svga, hwtnl->cmd.prim_ib[i]); - if (handle == NULL) - return PIPE_ERROR_OUT_OF_MEMORY; + +static enum pipe_error +validate_sampler_resources(struct svga_context *svga) +{ + enum pipe_shader_type shader; + + assert(svga_have_vgpu10(svga)); + + for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_GEOMETRY; shader++) { + unsigned count = svga->curr.num_sampler_views[shader]; + unsigned i; + struct svga_winsys_surface *surfaces[PIPE_MAX_SAMPLERS]; + enum pipe_error ret; + + /* + * Reference bound sampler resources to ensure pending updates are + * noticed by the device. + */ + for (i = 0; i < count; i++) { + struct svga_pipe_sampler_view *sv = + svga_pipe_sampler_view(svga->curr.sampler_views[shader][i]); + + if (sv) { + if (sv->base.texture->target == PIPE_BUFFER) { + surfaces[i] = svga_buffer_handle(svga, sv->base.texture); + } + else { + surfaces[i] = svga_texture(sv->base.texture)->handle; + } } - else - handle = NULL; + else { + surfaces[i] = NULL; + } + } + + if (shader == PIPE_SHADER_FRAGMENT && + svga->curr.rast->templ.poly_stipple_enable) { + const unsigned unit = svga->state.hw_draw.fs->pstipple_sampler_unit; + struct svga_pipe_sampler_view *sv = + svga->polygon_stipple.sampler_view; - ib_handle[i] = handle; + assert(sv); + surfaces[unit] = svga_texture(sv->base.texture)->handle; + count = MAX2(count, unit+1); } - if (svga->rebind.rendertargets) { - ret = svga_reemit_framebuffer_bindings(svga); - if (ret != PIPE_OK) { - return ret; + /* rebind the shader resources if needed */ + if (svga->rebind.flags.texture_samplers) { + for (i = 0; i < count; i++) { + if (surfaces[i]) { + ret = svga->swc->resource_rebind(svga->swc, + surfaces[i], + NULL, + SVGA_RELOC_READ); + if (ret != PIPE_OK) + return ret; + } } } + } + svga->rebind.flags.texture_samplers = FALSE; - if (svga->rebind.texture_samplers) { - ret = svga_reemit_tss_bindings(svga); - if (ret != PIPE_OK) { - return ret; + return PIPE_OK; +} + + +static enum pipe_error +validate_constant_buffers(struct svga_context *svga) +{ + enum pipe_shader_type shader; + + assert(svga_have_vgpu10(svga)); + + for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_GEOMETRY; shader++) { + enum pipe_error ret; + struct svga_buffer *buffer; + struct svga_winsys_surface *handle; + unsigned enabled_constbufs; + + /* Rebind the default constant buffer if needed */ + if (svga->rebind.flags.constbufs) { + 
buffer = svga_buffer(svga->state.hw_draw.constbuf[shader]); + if (buffer) { + ret = svga->swc->resource_rebind(svga->swc, + buffer->handle, + NULL, + SVGA_RELOC_READ); + if (ret != PIPE_OK) + return ret; } } - SVGA_DBG(DEBUG_DMA, "draw to sid %p, %d prims\n", - svga->curr.framebuffer.cbufs[0] ? - svga_surface(svga->curr.framebuffer.cbufs[0])->handle : NULL, - hwtnl->cmd.prim_count); + /* + * Reference other bound constant buffers to ensure pending updates are + * noticed by the device. + */ + enabled_constbufs = svga->state.hw_draw.enabled_constbufs[shader] & ~1u; + while (enabled_constbufs) { + unsigned i = u_bit_scan(&enabled_constbufs); + buffer = svga_buffer(svga->curr.constbufs[shader][i].buffer); + if (buffer) { + handle = svga_buffer_handle(svga, &buffer->b.b); + + if (svga->rebind.flags.constbufs) { + ret = svga->swc->resource_rebind(svga->swc, + handle, + NULL, + SVGA_RELOC_READ); + if (ret != PIPE_OK) + return ret; + } + } + } + } + svga->rebind.flags.constbufs = FALSE; + + return PIPE_OK; +} + - ret = SVGA3D_BeginDrawPrimitives(swc, - &vdecl, - hwtnl->cmd.vdecl_count, - &prim, - hwtnl->cmd.prim_count); - if (ret != PIPE_OK) +/** + * Was the last command put into the command buffer a drawing command? + * We use this to determine if we can skip emitting buffer re-bind + * commands when we have a sequence of drawing commands that use the + * same vertex/index buffers with no intervening commands. + * + * The first drawing command will bind the vertex/index buffers. If + * the immediately following command is also a drawing command using the + * same buffers, we shouldn't have to rebind them. + */ +static bool +last_command_was_draw(const struct svga_context *svga) +{ + switch (SVGA3D_GetLastCommand(svga->swc)) { + case SVGA_3D_CMD_DX_DRAW: + case SVGA_3D_CMD_DX_DRAW_INDEXED: + case SVGA_3D_CMD_DX_DRAW_INSTANCED: + case SVGA_3D_CMD_DX_DRAW_INDEXED_INSTANCED: + case SVGA_3D_CMD_DX_DRAW_AUTO: + return true; + default: + return false; + } +} + + +static enum pipe_error +draw_vgpu10(struct svga_hwtnl *hwtnl, + const SVGA3dPrimitiveRange *range, + unsigned vcount, + unsigned min_index, + unsigned max_index, struct pipe_resource *ib, + unsigned start_instance, unsigned instance_count) +{ + struct svga_context *svga = hwtnl->svga; + struct pipe_resource *vbuffers[SVGA3D_INPUTREG_MAX]; + struct svga_winsys_surface *vbuffer_handles[SVGA3D_INPUTREG_MAX]; + struct svga_winsys_surface *ib_handle; + const unsigned vbuf_count = hwtnl->cmd.vbuf_count; + int last_vbuf = -1; + enum pipe_error ret; + unsigned i; + + assert(svga_have_vgpu10(svga)); + assert(hwtnl->cmd.prim_count == 0); + + /* We need to reemit all the current resource bindings along with the Draw + * command to be sure that the referenced resources are available for the + * Draw command, just in case the surfaces associated with the resources + * are paged out. + */ + if (svga->rebind.val) { + ret = svga_rebind_framebuffer_bindings(svga); + if (ret != PIPE_OK) return ret; - - memcpy( vdecl, - hwtnl->cmd.vdecl, - hwtnl->cmd.vdecl_count * sizeof hwtnl->cmd.vdecl[0]); + ret = svga_rebind_shaders(svga); + if (ret != PIPE_OK) + return ret; - for (i = 0; i < hwtnl->cmd.vdecl_count; i++) { - /* Given rangeHint is considered to be relative to indexBias, and - * indexBias varies per primitive, we cannot accurately supply an - * rangeHint when emitting more than one primitive per draw command. 
+ /* Rebind stream output targets */ + ret = svga_rebind_stream_output_targets(svga); + if (ret != PIPE_OK) + return ret; + + /* No need to explicitly rebind index buffer and vertex buffers here. + * Even if the same index buffer or vertex buffers are referenced for this + * draw and we skip emitting the redundant set command, we will still + * reference the associated resources. + */ + } + + ret = validate_sampler_resources(svga); + if (ret != PIPE_OK) + return ret; + + ret = validate_constant_buffers(svga); + if (ret != PIPE_OK) + return ret; + + /* Get handle for each referenced vertex buffer */ + for (i = 0; i < vbuf_count; i++) { + struct svga_buffer *sbuf = svga_buffer(hwtnl->cmd.vbufs[i].buffer); + + if (sbuf) { + assert(sbuf->key.flags & SVGA3D_SURFACE_BIND_VERTEX_BUFFER); + vbuffer_handles[i] = svga_buffer_handle(svga, &sbuf->b.b); + if (vbuffer_handles[i] == NULL) + return PIPE_ERROR_OUT_OF_MEMORY; + vbuffers[i] = &sbuf->b.b; + last_vbuf = i; + } + else { + vbuffers[i] = NULL; + vbuffer_handles[i] = NULL; + } + } + + for (; i < svga->state.hw_draw.num_vbuffers; i++) { + vbuffers[i] = NULL; + vbuffer_handles[i] = NULL; + } + + /* Get handle for the index buffer */ + if (ib) { + struct svga_buffer *sbuf = svga_buffer(ib); + + assert(sbuf->key.flags & SVGA3D_SURFACE_BIND_INDEX_BUFFER); + (void) sbuf; /* silence unused var warning */ + + ib_handle = svga_buffer_handle(svga, ib); + if (!ib_handle) + return PIPE_ERROR_OUT_OF_MEMORY; + } + else { + ib_handle = NULL; + } + + /* setup vertex attribute input layout */ + if (svga->state.hw_draw.layout_id != hwtnl->cmd.vdecl_layout_id) { + ret = SVGA3D_vgpu10_SetInputLayout(svga->swc, + hwtnl->cmd.vdecl_layout_id); + if (ret != PIPE_OK) + return ret; + + svga->state.hw_draw.layout_id = hwtnl->cmd.vdecl_layout_id; + } + + /* setup vertex buffers */ + { + SVGA3dVertexBuffer vbuffer_attrs[PIPE_MAX_ATTRIBS]; + + for (i = 0; i < vbuf_count; i++) { + vbuffer_attrs[i].stride = hwtnl->cmd.vbufs[i].stride; + vbuffer_attrs[i].offset = hwtnl->cmd.vbufs[i].buffer_offset; + vbuffer_attrs[i].sid = 0; + } + + /* If we haven't yet emitted a drawing command or if any + * vertex buffer state is changing, issue that state now. + */ + if (((hwtnl->cmd.swc->hints & SVGA_HINT_FLAG_CAN_PRE_FLUSH) == 0) || + vbuf_count != svga->state.hw_draw.num_vbuffers || + memcmp(vbuffer_attrs, svga->state.hw_draw.vbuffer_attrs, + vbuf_count * sizeof(vbuffer_attrs[0])) || + memcmp(vbuffers, svga->state.hw_draw.vbuffers, + vbuf_count * sizeof(vbuffers[0]))) { + + unsigned num_vbuffers; + + /* get the max of the current bound vertex buffers count and + * the to-be-bound vertex buffers count, so as to unbind + * the unused vertex buffers. + */ + num_vbuffers = MAX2(vbuf_count, svga->state.hw_draw.num_vbuffers); + + /* Zero-out the old buffers we want to unbind (the number of loop + * iterations here is typically very small, and often zero.) 
*/ - if (hwtnl->cmd.prim_count == 1) { - vdecl[i].rangeHint.first = hwtnl->cmd.min_index[0]; - vdecl[i].rangeHint.last = hwtnl->cmd.max_index[0] + 1; + for (i = vbuf_count; i < num_vbuffers; i++) { + vbuffer_attrs[i].sid = 0; + vbuffer_attrs[i].stride = 0; + vbuffer_attrs[i].offset = 0; + vbuffer_handles[i] = NULL; } - else { - vdecl[i].rangeHint.first = 0; - vdecl[i].rangeHint.last = 0; + + if (num_vbuffers > 0) { + + ret = SVGA3D_vgpu10_SetVertexBuffers(svga->swc, num_vbuffers, + 0, /* startBuffer */ + vbuffer_attrs, + vbuffer_handles); + if (ret != PIPE_OK) + return ret; + + /* save the number of vertex buffers sent to the device, not + * including trailing unbound vertex buffers. + */ + svga->state.hw_draw.num_vbuffers = last_vbuf + 1; + memcpy(svga->state.hw_draw.vbuffer_attrs, vbuffer_attrs, + num_vbuffers * sizeof(vbuffer_attrs[0])); + for (i = 0; i < num_vbuffers; i++) { + pipe_resource_reference(&svga->state.hw_draw.vbuffers[i], + vbuffers[i]); + } + } + } + else { + /* Even though we can avoid emitting the redundant SetVertexBuffers + * command, we still need to reference the vertex buffers surfaces. + */ + for (i = 0; i < vbuf_count; i++) { + if (vbuffer_handles[i] && !last_command_was_draw(svga)) { + ret = svga->swc->resource_rebind(svga->swc, vbuffer_handles[i], + NULL, SVGA_RELOC_READ); + if (ret != PIPE_OK) + return ret; + } } + } + } + + /* Set primitive type (line, tri, etc) */ + if (svga->state.hw_draw.topology != range->primType) { + ret = SVGA3D_vgpu10_SetTopology(svga->swc, range->primType); + if (ret != PIPE_OK) + return ret; - swc->surface_relocation(swc, - &vdecl[i].array.surfaceId, - vb_handle[i], - SVGA_RELOC_READ); + svga->state.hw_draw.topology = range->primType; + } + + if (ib_handle) { + /* indexed drawing */ + SVGA3dSurfaceFormat indexFormat = xlate_index_format(range->indexWidth); + + /* setup index buffer */ + if (ib != svga->state.hw_draw.ib || + indexFormat != svga->state.hw_draw.ib_format || + range->indexArray.offset != svga->state.hw_draw.ib_offset) { + + assert(indexFormat != SVGA3D_FORMAT_INVALID); + ret = SVGA3D_vgpu10_SetIndexBuffer(svga->swc, ib_handle, + indexFormat, + range->indexArray.offset); + if (ret != PIPE_OK) + return ret; + + pipe_resource_reference(&svga->state.hw_draw.ib, ib); + svga->state.hw_draw.ib_format = indexFormat; + svga->state.hw_draw.ib_offset = range->indexArray.offset; + } + else { + /* Even though we can avoid emitting the redundant SetIndexBuffer + * command, we still need to reference the index buffer surface. 
+ */ + if (!last_command_was_draw(svga)) { + ret = svga->swc->resource_rebind(svga->swc, ib_handle, + NULL, SVGA_RELOC_READ); + if (ret != PIPE_OK) + return ret; + } } - memcpy( prim, - hwtnl->cmd.prim, - hwtnl->cmd.prim_count * sizeof hwtnl->cmd.prim[0]); + if (instance_count > 1) { + ret = SVGA3D_vgpu10_DrawIndexedInstanced(svga->swc, + vcount, + instance_count, + 0, /* startIndexLocation */ + range->indexBias, + start_instance); + if (ret != PIPE_OK) + return ret; + } + else { + /* non-instanced drawing */ + ret = SVGA3D_vgpu10_DrawIndexed(svga->swc, + vcount, + 0, /* startIndexLocation */ + range->indexBias); + if (ret != PIPE_OK) + return ret; + } + } + else { + /* non-indexed drawing */ + if (svga->state.hw_draw.ib_format != SVGA3D_FORMAT_INVALID || + svga->state.hw_draw.ib != NULL) { + /* Unbind previously bound index buffer */ + ret = SVGA3D_vgpu10_SetIndexBuffer(svga->swc, NULL, + SVGA3D_FORMAT_INVALID, 0); + if (ret != PIPE_OK) + return ret; + pipe_resource_reference(&svga->state.hw_draw.ib, NULL); + svga->state.hw_draw.ib_format = SVGA3D_FORMAT_INVALID; + } + + assert(svga->state.hw_draw.ib == NULL); - for (i = 0; i < hwtnl->cmd.prim_count; i++) { - swc->surface_relocation(swc, - &prim[i].indexArray.surfaceId, - ib_handle[i], - SVGA_RELOC_READ); - pipe_resource_reference(&hwtnl->cmd.prim_ib[i], NULL); + if (instance_count > 1) { + ret = SVGA3D_vgpu10_DrawInstanced(svga->swc, + vcount, + instance_count, + range->indexBias, + start_instance); + if (ret != PIPE_OK) + return ret; + } + else { + /* non-instanced */ + ret = SVGA3D_vgpu10_Draw(svga->swc, + vcount, + range->indexBias); + if (ret != PIPE_OK) + return ret; } - - SVGA_FIFOCommitAll( swc ); - hwtnl->cmd.prim_count = 0; } + hwtnl->cmd.prim_count = 0; + return PIPE_OK; } -void svga_hwtnl_set_index_bias( struct svga_hwtnl *hwtnl, - int index_bias) + +/** + * Emit any pending drawing commands to the command buffer. + * When we receive VGPU9 drawing commands we accumulate them and don't + * immediately emit them into the command buffer. + * This function needs to be called before we change state that could + * effect those pending draws. + */ +enum pipe_error +svga_hwtnl_flush(struct svga_hwtnl *hwtnl) +{ + enum pipe_error ret = PIPE_OK; + + SVGA_STATS_TIME_PUSH(svga_sws(hwtnl->svga), SVGA_STATS_TIME_HWTNLFLUSH); + + if (!svga_have_vgpu10(hwtnl->svga) && hwtnl->cmd.prim_count) { + /* we only queue up primitive for VGPU9 */ + ret = draw_vgpu9(hwtnl); + } + + SVGA_STATS_TIME_POP(svga_screen(hwtnl->svga->pipe.screen)->sws); + return ret; +} + + +void +svga_hwtnl_set_index_bias(struct svga_hwtnl *hwtnl, int index_bias) { hwtnl->index_bias = index_bias; } @@ -287,155 +784,205 @@ void svga_hwtnl_set_index_bias( struct svga_hwtnl *hwtnl, * Internal functions: */ -enum pipe_error svga_hwtnl_prim( struct svga_hwtnl *hwtnl, - const SVGA3dPrimitiveRange *range, - unsigned min_index, - unsigned max_index, - struct pipe_resource *ib ) +/** + * For debugging only. + */ +static void +check_draw_params(struct svga_hwtnl *hwtnl, + const SVGA3dPrimitiveRange *range, + unsigned min_index, unsigned max_index, + struct pipe_resource *ib) { - enum pipe_error ret = PIPE_OK; + unsigned i; -#ifdef DEBUG - { - unsigned i; - for (i = 0; i < hwtnl->cmd.vdecl_count; i++) { - struct pipe_resource *vb = hwtnl->cmd.vdecl_vb[i]; - unsigned size = vb ? 
vb->width0 : 0; - unsigned offset = hwtnl->cmd.vdecl[i].array.offset; - unsigned stride = hwtnl->cmd.vdecl[i].array.stride; - int index_bias = (int) range->indexBias + hwtnl->index_bias; - unsigned width; - - assert(vb); - assert(size); - assert(offset < size); - assert(min_index <= max_index); - - switch (hwtnl->cmd.vdecl[i].identity.type) { - case SVGA3D_DECLTYPE_FLOAT1: - width = 4; - break; - case SVGA3D_DECLTYPE_FLOAT2: - width = 4*2; - break; - case SVGA3D_DECLTYPE_FLOAT3: - width = 4*3; - break; - case SVGA3D_DECLTYPE_FLOAT4: - width = 4*4; - break; - case SVGA3D_DECLTYPE_D3DCOLOR: - width = 4; - break; - case SVGA3D_DECLTYPE_UBYTE4: - width = 1*4; - break; - case SVGA3D_DECLTYPE_SHORT2: - width = 2*2; - break; - case SVGA3D_DECLTYPE_SHORT4: - width = 2*4; - break; - case SVGA3D_DECLTYPE_UBYTE4N: - width = 1*4; - break; - case SVGA3D_DECLTYPE_SHORT2N: - width = 2*2; - break; - case SVGA3D_DECLTYPE_SHORT4N: - width = 2*4; - break; - case SVGA3D_DECLTYPE_USHORT2N: - width = 2*2; - break; - case SVGA3D_DECLTYPE_USHORT4N: - width = 2*4; - break; - case SVGA3D_DECLTYPE_UDEC3: - width = 4; - break; - case SVGA3D_DECLTYPE_DEC3N: - width = 4; - break; - case SVGA3D_DECLTYPE_FLOAT16_2: - width = 2*2; - break; - case SVGA3D_DECLTYPE_FLOAT16_4: - width = 2*4; - break; - default: - assert(0); - width = 0; - break; - } + assert(!svga_have_vgpu10(hwtnl->svga)); + + for (i = 0; i < hwtnl->cmd.vdecl_count; i++) { + unsigned j = hwtnl->cmd.vdecl_buffer_index[i]; + const struct pipe_vertex_buffer *vb = &hwtnl->cmd.vbufs[j]; + unsigned size = vb->buffer ? vb->buffer->width0 : 0; + unsigned offset = hwtnl->cmd.vdecl[i].array.offset; + unsigned stride = hwtnl->cmd.vdecl[i].array.stride; + int index_bias = (int) range->indexBias + hwtnl->index_bias; + unsigned width; + + if (size == 0) + continue; + + assert(vb); + assert(size); + assert(offset < size); + assert(min_index <= max_index); + (void) width; + (void) stride; + (void) offset; + (void) size; + + switch (hwtnl->cmd.vdecl[i].identity.type) { + case SVGA3D_DECLTYPE_FLOAT1: + width = 4; + break; + case SVGA3D_DECLTYPE_FLOAT2: + width = 4 * 2; + break; + case SVGA3D_DECLTYPE_FLOAT3: + width = 4 * 3; + break; + case SVGA3D_DECLTYPE_FLOAT4: + width = 4 * 4; + break; + case SVGA3D_DECLTYPE_D3DCOLOR: + width = 4; + break; + case SVGA3D_DECLTYPE_UBYTE4: + width = 1 * 4; + break; + case SVGA3D_DECLTYPE_SHORT2: + width = 2 * 2; + break; + case SVGA3D_DECLTYPE_SHORT4: + width = 2 * 4; + break; + case SVGA3D_DECLTYPE_UBYTE4N: + width = 1 * 4; + break; + case SVGA3D_DECLTYPE_SHORT2N: + width = 2 * 2; + break; + case SVGA3D_DECLTYPE_SHORT4N: + width = 2 * 4; + break; + case SVGA3D_DECLTYPE_USHORT2N: + width = 2 * 2; + break; + case SVGA3D_DECLTYPE_USHORT4N: + width = 2 * 4; + break; + case SVGA3D_DECLTYPE_UDEC3: + width = 4; + break; + case SVGA3D_DECLTYPE_DEC3N: + width = 4; + break; + case SVGA3D_DECLTYPE_FLOAT16_2: + width = 2 * 2; + break; + case SVGA3D_DECLTYPE_FLOAT16_4: + width = 2 * 4; + break; + default: + assert(0); + width = 0; + break; + } - if (index_bias >= 0) { - assert(offset + index_bias*stride + width <= size); - } + if (index_bias >= 0) { + assert(offset + index_bias * stride + width <= size); + } - /* - * min_index/max_index are merely conservative guesses, so we can't - * make buffer overflow detection based on their values. - */ + /* + * min_index/max_index are merely conservative guesses, so we can't + * make buffer overflow detection based on their values. 
+ */ + } + + assert(range->indexWidth == range->indexArray.stride); + + if (ib) { + MAYBE_UNUSED unsigned size = ib->width0; + MAYBE_UNUSED unsigned offset = range->indexArray.offset; + MAYBE_UNUSED unsigned stride = range->indexArray.stride; + MAYBE_UNUSED unsigned count; + + assert(size); + assert(offset < size); + assert(stride); + + switch (range->primType) { + case SVGA3D_PRIMITIVE_POINTLIST: + count = range->primitiveCount; + break; + case SVGA3D_PRIMITIVE_LINELIST: + count = range->primitiveCount * 2; + break; + case SVGA3D_PRIMITIVE_LINESTRIP: + count = range->primitiveCount + 1; + break; + case SVGA3D_PRIMITIVE_TRIANGLELIST: + count = range->primitiveCount * 3; + break; + case SVGA3D_PRIMITIVE_TRIANGLESTRIP: + count = range->primitiveCount + 2; + break; + case SVGA3D_PRIMITIVE_TRIANGLEFAN: + count = range->primitiveCount + 2; + break; + default: + assert(0); + count = 0; + break; } - assert(range->indexWidth == range->indexArray.stride); - - if(ib) { - unsigned size = ib->width0; - unsigned offset = range->indexArray.offset; - unsigned stride = range->indexArray.stride; - unsigned count; - - assert(size); - assert(offset < size); - assert(stride); - - switch (range->primType) { - case SVGA3D_PRIMITIVE_POINTLIST: - count = range->primitiveCount; - break; - case SVGA3D_PRIMITIVE_LINELIST: - count = range->primitiveCount * 2; - break; - case SVGA3D_PRIMITIVE_LINESTRIP: - count = range->primitiveCount + 1; - break; - case SVGA3D_PRIMITIVE_TRIANGLELIST: - count = range->primitiveCount * 3; - break; - case SVGA3D_PRIMITIVE_TRIANGLESTRIP: - count = range->primitiveCount + 2; - break; - case SVGA3D_PRIMITIVE_TRIANGLEFAN: - count = range->primitiveCount + 2; - break; - default: - assert(0); - count = 0; - break; - } + assert(offset + count * stride <= size); + } +} + - assert(offset + count*stride <= size); +/** + * All drawing filters down into this function, either directly + * on the hardware path or after doing software vertex processing. 
+ */ +enum pipe_error +svga_hwtnl_prim(struct svga_hwtnl *hwtnl, + const SVGA3dPrimitiveRange * range, + unsigned vcount, + unsigned min_index, + unsigned max_index, struct pipe_resource *ib, + unsigned start_instance, unsigned instance_count) +{ + enum pipe_error ret = PIPE_OK; + + SVGA_STATS_TIME_PUSH(svga_sws(hwtnl->svga), SVGA_STATS_TIME_HWTNLPRIM); + + if (svga_have_vgpu10(hwtnl->svga)) { + /* draw immediately */ + ret = draw_vgpu10(hwtnl, range, vcount, min_index, max_index, ib, + start_instance, instance_count); + if (ret != PIPE_OK) { + svga_context_flush(hwtnl->svga, NULL); + ret = draw_vgpu10(hwtnl, range, vcount, min_index, max_index, ib, + start_instance, instance_count); + assert(ret == PIPE_OK); } } + else { + /* batch up drawing commands */ +#ifdef DEBUG + check_draw_params(hwtnl, range, min_index, max_index, ib); + assert(start_instance == 0); + assert(instance_count <= 1); +#else + (void) check_draw_params; #endif - if (hwtnl->cmd.prim_count+1 >= QSZ) { - ret = svga_hwtnl_flush( hwtnl ); - if (ret != PIPE_OK) - return ret; - } - - /* min/max indices are relative to bias */ - hwtnl->cmd.min_index[hwtnl->cmd.prim_count] = min_index; - hwtnl->cmd.max_index[hwtnl->cmd.prim_count] = max_index; + if (hwtnl->cmd.prim_count + 1 >= QSZ) { + ret = svga_hwtnl_flush(hwtnl); + if (ret != PIPE_OK) + goto done; + } + + /* min/max indices are relative to bias */ + hwtnl->cmd.min_index[hwtnl->cmd.prim_count] = min_index; + hwtnl->cmd.max_index[hwtnl->cmd.prim_count] = max_index; - hwtnl->cmd.prim[hwtnl->cmd.prim_count] = *range; - hwtnl->cmd.prim[hwtnl->cmd.prim_count].indexBias += hwtnl->index_bias; + hwtnl->cmd.prim[hwtnl->cmd.prim_count] = *range; + hwtnl->cmd.prim[hwtnl->cmd.prim_count].indexBias += hwtnl->index_bias; - pipe_resource_reference(&hwtnl->cmd.prim_ib[hwtnl->cmd.prim_count], ib); - hwtnl->cmd.prim_count++; + pipe_resource_reference(&hwtnl->cmd.prim_ib[hwtnl->cmd.prim_count], ib); + hwtnl->cmd.prim_count++; + } +done: + SVGA_STATS_TIME_POP(svga_screen(hwtnl->svga->pipe.screen)->sws); return ret; }
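
For orientation, a minimal, hypothetical caller sketch follows (it is not part of the commit above). It shows how the reworked svga_hwtnl interface introduced by this diff might be driven for one indexed triangle-list draw. The wrapper name `emit_indexed_tri_draw` and its parameter list are illustrative only; the `svga_hwtnl_*` calls, the `SVGA3dPrimitiveRange` fields, and the VGPU9/VGPU10 behaviour noted in the comments are taken from the code in the diff.

/* Illustrative sketch only -- not part of the diff above.
 * Assumes the declarations from svga_draw.h / svga_draw_private.h and the
 * SVGA3D device headers that svga_draw.c itself includes (plus <string.h>
 * or an equivalent for memset).
 */
static enum pipe_error
emit_indexed_tri_draw(struct svga_hwtnl *hwtnl,
                      struct pipe_vertex_buffer *vbufs, unsigned num_vbufs,
                      const SVGA3dVertexDecl *decls,
                      const unsigned *decl_buffer_indexes, unsigned num_decls,
                      SVGA3dElementLayoutId layout_id,
                      struct pipe_resource *ib,   /* 32-bit index buffer */
                      unsigned index_count,
                      unsigned min_index, unsigned max_index,
                      unsigned start_instance, unsigned instance_count)
{
   SVGA3dPrimitiveRange range;

   /* Bind vertex buffers and the vertex element layout.  Both calls only
    * update hwtnl->cmd state; nothing is emitted to the device yet.
    */
   svga_hwtnl_vertex_buffers(hwtnl, num_vbufs, vbufs);
   svga_hwtnl_vertex_decls(hwtnl, num_decls, decls,
                           decl_buffer_indexes, layout_id);

   /* Describe one triangle-list range.  indexWidth must match
    * indexArray.stride (2 or 4 bytes).
    */
   memset(&range, 0, sizeof(range));
   range.primType = SVGA3D_PRIMITIVE_TRIANGLELIST;
   range.primitiveCount = index_count / 3;
   range.indexArray.offset = 0;
   range.indexArray.stride = 4;
   range.indexWidth = 4;
   range.indexBias = 0;

   /* On VGPU10 this emits the draw command immediately (instancing is
    * supported); on VGPU9 the primitive is queued until svga_hwtnl_flush()
    * and instance_count must be <= 1.
    */
   return svga_hwtnl_prim(hwtnl, &range, index_count,
                          min_index, max_index, ib,
                          start_instance, instance_count);
}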