gallium/ureg: Set the next shader stage from the shader info.
[mesa.git] / src/gallium/auxiliary/cso_cache/cso_context.c
index 8cfb6746e61143b1b77615fbc4b0a74224c39415..3911ee73d302eb28324091580dd947bd3bfe72c1 100644 (file)
 
 
 /**
- * Info related to samplers and sampler views.
- * We have one of these for fragment samplers and another for vertex samplers.
+ * Per-shader sampler information.
  */
 struct sampler_info
 {
+   struct cso_sampler *cso_samplers[PIPE_MAX_SAMPLERS];
    void *samplers[PIPE_MAX_SAMPLERS];
-   unsigned nr_samplers;
 };
 
 
@@ -65,7 +64,10 @@ struct sampler_info
 struct cso_context {
    struct pipe_context *pipe;
    struct cso_cache *cache;
+
    struct u_vbuf *vbuf;
+   struct u_vbuf *vbuf_current;
+   bool always_use_vbuf;
 
    boolean has_geometry_shader;
    boolean has_tessellation;
@@ -80,18 +82,23 @@ struct cso_context {
    struct pipe_sampler_view *fragment_views_saved[PIPE_MAX_SHADER_SAMPLER_VIEWS];
    unsigned nr_fragment_views_saved;
 
-   void *fragment_samplers_saved[PIPE_MAX_SAMPLERS];
-   unsigned nr_fragment_samplers_saved;
-
+   struct sampler_info fragment_samplers_saved;
    struct sampler_info samplers[PIPE_SHADER_TYPES];
 
-   struct pipe_vertex_buffer aux_vertex_buffer_current;
-   struct pipe_vertex_buffer aux_vertex_buffer_saved;
-   unsigned aux_vertex_buffer_index;
+   /* Highest sampler slot seen by cso_single_sampler since the last
+    * cso_single_sampler_done call; -1 means no samplers are staged.
+    */
+   int max_sampler_seen;
+
+   struct pipe_vertex_buffer vertex_buffer0_current;
+   struct pipe_vertex_buffer vertex_buffer0_saved;
 
    struct pipe_constant_buffer aux_constbuf_current[PIPE_SHADER_TYPES];
    struct pipe_constant_buffer aux_constbuf_saved[PIPE_SHADER_TYPES];
 
+   struct pipe_image_view fragment_image0_current;
+   struct pipe_image_view fragment_image0_saved;
+
    unsigned nr_so_targets;
    struct pipe_stream_output_target *so_targets[PIPE_MAX_SO_BUFFERS];
 
@@ -123,6 +130,10 @@ struct cso_context {
    struct pipe_stencil_ref stencil_ref, stencil_ref_saved;
 };
 
+struct pipe_context *cso_get_pipe_context(struct cso_context *cso)
+{
+   return cso->pipe;
+}
 
 static boolean delete_blend_state(struct cso_context *ctx, void *state)
 {
@@ -152,7 +163,7 @@ static boolean delete_depth_stencil_state(struct cso_context *ctx, void *state)
    return TRUE;
 }
 
-static boolean delete_sampler_state(struct cso_context *ctx, void *state)
+static boolean delete_sampler_state(UNUSED struct cso_context *ctx, void *state)
 {
    struct cso_sampler *cso = (struct cso_sampler *)state;
    if (cso->delete_state)
@@ -219,37 +230,87 @@ sanitize_hash(struct cso_hash *hash, enum cso_cache_type type,
    int hash_size = cso_hash_size(hash);
    int max_entries = (max_size > hash_size) ? max_size : hash_size;
    int to_remove =  (max_size < max_entries) * max_entries/4;
-   struct cso_hash_iter iter = cso_hash_first_node(hash);
+   struct cso_hash_iter iter;
+   struct cso_sampler **samplers_to_restore = NULL;
+   unsigned to_restore = 0;
+
    if (hash_size > max_size)
       to_remove += hash_size - max_size;
+
+   if (to_remove == 0)
+      return;
+
+   if (type == CSO_SAMPLER) {
+      int i, j;
+
+      samplers_to_restore = MALLOC(PIPE_SHADER_TYPES * PIPE_MAX_SAMPLERS *
+                                   sizeof(*samplers_to_restore));
+
+      /* Temporarily remove the currently bound sampler states from the
+       * hash table to prevent them from being deleted.
+       */
+      for (i = 0; i < PIPE_SHADER_TYPES; i++) {
+         for (j = 0; j < PIPE_MAX_SAMPLERS; j++) {
+            struct cso_sampler *sampler = ctx->samplers[i].cso_samplers[j];
+
+            if (sampler && cso_hash_take(hash, sampler->hash_key))
+               samplers_to_restore[to_restore++] = sampler;
+         }
+      }
+   }
+
+   iter = cso_hash_first_node(hash);
    while (to_remove) {
       /* Remove elements until we're good. */
       /* FIXME: currently we pick the nodes to remove at random. */
       void *cso = cso_hash_iter_data(iter);
+
+      if (!cso)
+         break;
+
       if (delete_cso(ctx, cso, type)) {
          iter = cso_hash_erase(hash, iter);
          --to_remove;
       } else
          iter = cso_hash_iter_next(iter);
    }
+
+   if (type == CSO_SAMPLER) {
+      /* Put currently bound sampler states back into the hash table */
+      while (to_restore--) {
+         struct cso_sampler *sampler = samplers_to_restore[to_restore];
+
+         cso_hash_insert(hash, sampler->hash_key, sampler);
+      }
+
+      FREE(samplers_to_restore);
+   }
 }
 
-static void cso_init_vbuf(struct cso_context *cso)
+static void cso_init_vbuf(struct cso_context *cso, unsigned flags)
 {
    struct u_vbuf_caps caps;
+   bool uses_user_vertex_buffers = !(flags & CSO_NO_USER_VERTEX_BUFFERS);
+   bool needs64b = !(flags & CSO_NO_64B_VERTEX_BUFFERS);
 
-   /* Install u_vbuf if there is anything unsupported. */
-   if (u_vbuf_get_caps(cso->pipe->screen, &caps)) {
-      cso->vbuf = u_vbuf_create(cso->pipe, &caps,
-                                cso->aux_vertex_buffer_index);
+   u_vbuf_get_caps(cso->pipe->screen, &caps, needs64b);
+
+   /* Enable u_vbuf if needed. */
+   if (caps.fallback_always ||
+       (uses_user_vertex_buffers &&
+        caps.fallback_only_for_user_vbuffers)) {
+      cso->vbuf = u_vbuf_create(cso->pipe, &caps);
+      cso->vbuf_current = cso->vbuf;
+      cso->always_use_vbuf = caps.fallback_always;
    }
 }
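
For reference, a state tracker that never passes user pointers as vertex buffers can now say so at creation time, and u_vbuf is only installed when the hardware caps force it. A minimal caller-side sketch (the wrapper name is illustrative, not part of this patch):

#include "cso_cache/cso_context.h"

/* Hypothetical caller: this driver only ever binds real pipe_resource
 * vertex buffers, so u_vbuf is needed only if caps.fallback_always. */
static struct cso_context *
create_cso_for_vbo_only_driver(struct pipe_context *pipe)
{
   return cso_create_context(pipe, CSO_NO_USER_VERTEX_BUFFERS);
}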
 
-struct cso_context *cso_create_context( struct pipe_context *pipe )
+struct cso_context *
+cso_create_context(struct pipe_context *pipe, unsigned flags)
 {
    struct cso_context *ctx = CALLOC_STRUCT(cso_context);
    if (!ctx)
-      goto out;
+      return NULL;
 
    ctx->cache = cso_cache_create();
    if (ctx->cache == NULL)
@@ -261,9 +322,7 @@ struct cso_context *cso_create_context( struct pipe_context *pipe )
    ctx->pipe = pipe;
    ctx->sample_mask = ~0;
 
-   ctx->aux_vertex_buffer_index = 0; /* 0 for now */
-
-   cso_init_vbuf(ctx);
+   cso_init_vbuf(ctx, flags);
 
    /* Enable for testing: */
    if (0) cso_set_maximum_cache_size( ctx->cache, 4 );
@@ -278,13 +337,20 @@ struct cso_context *cso_create_context( struct pipe_context *pipe )
    }
    if (pipe->screen->get_shader_param(pipe->screen, PIPE_SHADER_COMPUTE,
                                       PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0) {
-      ctx->has_compute_shader = TRUE;
+      int supported_irs =
+         pipe->screen->get_shader_param(pipe->screen, PIPE_SHADER_COMPUTE,
+                                        PIPE_SHADER_CAP_SUPPORTED_IRS);
+      if (supported_irs & ((1 << PIPE_SHADER_IR_TGSI) |
+                           (1 << PIPE_SHADER_IR_NIR))) {
+         ctx->has_compute_shader = TRUE;
+      }
    }
    if (pipe->screen->get_param(pipe->screen,
                                PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS) != 0) {
       ctx->has_streamout = TRUE;
    }
 
+   ctx->max_sampler_seen = -1;
    return ctx;
 
 out:
@@ -300,8 +366,6 @@ void cso_destroy_context( struct cso_context *ctx )
    unsigned i;
 
    if (ctx->pipe) {
-      ctx->pipe->set_index_buffer(ctx->pipe, NULL);
-
       ctx->pipe->bind_blend_state( ctx->pipe, NULL );
       ctx->pipe->bind_rasterizer_state( ctx->pipe, NULL );
 
@@ -309,7 +373,7 @@ void cso_destroy_context( struct cso_context *ctx )
          static struct pipe_sampler_view *views[PIPE_MAX_SHADER_SAMPLER_VIEWS] = { NULL };
          static void *zeros[PIPE_MAX_SAMPLERS] = { NULL };
          struct pipe_screen *scr = ctx->pipe->screen;
-         unsigned sh;
+         enum pipe_shader_type sh;
          for (sh = 0; sh < PIPE_SHADER_TYPES; sh++) {
             int maxsam = scr->get_shader_param(scr, sh,
                                                PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS);
@@ -351,22 +415,27 @@ void cso_destroy_context( struct cso_context *ctx )
          ctx->pipe->set_stream_output_targets(ctx->pipe, 0, NULL, NULL);
    }
 
-   for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) {
+   for (i = 0; i < ctx->nr_fragment_views; i++) {
       pipe_sampler_view_reference(&ctx->fragment_views[i], NULL);
+   }
+   for (i = 0; i < ctx->nr_fragment_views_saved; i++) {
       pipe_sampler_view_reference(&ctx->fragment_views_saved[i], NULL);
    }
 
    util_unreference_framebuffer_state(&ctx->fb);
    util_unreference_framebuffer_state(&ctx->fb_saved);
 
-   pipe_resource_reference(&ctx->aux_vertex_buffer_current.buffer, NULL);
-   pipe_resource_reference(&ctx->aux_vertex_buffer_saved.buffer, NULL);
+   pipe_vertex_buffer_unreference(&ctx->vertex_buffer0_current);
+   pipe_vertex_buffer_unreference(&ctx->vertex_buffer0_saved);
 
    for (i = 0; i < PIPE_SHADER_TYPES; i++) {
       pipe_resource_reference(&ctx->aux_constbuf_current[i].buffer, NULL);
       pipe_resource_reference(&ctx->aux_constbuf_saved[i].buffer, NULL);
    }
 
+   pipe_resource_reference(&ctx->fragment_image0_current.resource, NULL);
+   pipe_resource_reference(&ctx->fragment_image0_saved.resource, NULL);
+
    for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
       pipe_so_target_reference(&ctx->so_targets[i], NULL);
       pipe_so_target_reference(&ctx->so_targets_saved[i], NULL);
@@ -533,6 +602,11 @@ enum pipe_error cso_set_rasterizer(struct cso_context *ctx,
                                                        (void*)templ, key_size);
    void *handle = NULL;
 
+   /* We can't have both point_quad_rasterization (sprites) and point_smooth
+    * (round AA points) enabled at the same time.
+    */
+   assert(!(templ->point_quad_rasterization && templ->point_smooth));
+
    if (cso_hash_iter_is_null(iter)) {
       struct cso_rasterizer *cso = MALLOC(sizeof(struct cso_rasterizer));
       if (!cso)
@@ -589,16 +663,6 @@ void cso_set_fragment_shader_handle(struct cso_context *ctx, void *handle )
    }
 }
 
-void cso_delete_fragment_shader(struct cso_context *ctx, void *handle )
-{
-   if (handle == ctx->fragment_shader) {
-      /* unbind before deleting */
-      ctx->pipe->bind_fs_state(ctx->pipe, NULL);
-      ctx->fragment_shader = NULL;
-   }
-   ctx->pipe->delete_fs_state(ctx->pipe, handle);
-}
-
 static void
 cso_save_fragment_shader(struct cso_context *ctx)
 {
@@ -625,16 +689,6 @@ void cso_set_vertex_shader_handle(struct cso_context *ctx, void *handle)
    }
 }
 
-void cso_delete_vertex_shader(struct cso_context *ctx, void *handle )
-{
-   if (handle == ctx->vertex_shader) {
-      /* unbind before deleting */
-      ctx->pipe->bind_vs_state(ctx->pipe, NULL);
-      ctx->vertex_shader = NULL;
-   }
-   ctx->pipe->delete_vs_state(ctx->pipe, handle);
-}
-
 static void
 cso_save_vertex_shader(struct cso_context *ctx)
 {
@@ -703,6 +757,10 @@ cso_set_viewport_dims(struct cso_context *ctx,
    vp.translate[0] = 0.5f * width;
    vp.translate[1] = 0.5f * height;
    vp.translate[2] = 0.5f;
+   vp.swizzle_x = PIPE_VIEWPORT_SWIZZLE_POSITIVE_X;
+   vp.swizzle_y = PIPE_VIEWPORT_SWIZZLE_POSITIVE_Y;
+   vp.swizzle_z = PIPE_VIEWPORT_SWIZZLE_POSITIVE_Z;
+   vp.swizzle_w = PIPE_VIEWPORT_SWIZZLE_POSITIVE_W;
    cso_set_viewport(ctx, &vp);
 }
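
To make the numbers concrete (an illustrative note, not part of the patch; the matching scale[] values are assigned just above this hunk):

/* For width = 800: scale[0] = translate[0] = 400, so the usual mapping
 *    x_window = translate[0] + scale[0] * x_ndc
 * sends NDC x = -1 to 0 and x = +1 to 800. The new swizzle fields select
 * the default +X/+Y/+Z/+W ordering, so behavior is unchanged on drivers
 * without viewport swizzling.
 */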
 
@@ -800,7 +858,8 @@ cso_restore_stencil_ref(struct cso_context *ctx)
 
 void cso_set_render_condition(struct cso_context *ctx,
                               struct pipe_query *query,
-                              boolean condition, uint mode)
+                              boolean condition,
+                              enum pipe_render_cond_flag mode)
 {
    struct pipe_context *pipe = ctx->pipe;
 
@@ -840,16 +899,6 @@ void cso_set_geometry_shader_handle(struct cso_context *ctx, void *handle)
    }
 }
 
-void cso_delete_geometry_shader(struct cso_context *ctx, void *handle)
-{
-    if (handle == ctx->geometry_shader) {
-      /* unbind before deleting */
-      ctx->pipe->bind_gs_state(ctx->pipe, NULL);
-      ctx->geometry_shader = NULL;
-   }
-   ctx->pipe->delete_gs_state(ctx->pipe, handle);
-}
-
 static void
 cso_save_geometry_shader(struct cso_context *ctx)
 {
@@ -885,16 +934,6 @@ void cso_set_tessctrl_shader_handle(struct cso_context *ctx, void *handle)
    }
 }
 
-void cso_delete_tessctrl_shader(struct cso_context *ctx, void *handle)
-{
-    if (handle == ctx->tessctrl_shader) {
-      /* unbind before deleting */
-      ctx->pipe->bind_tcs_state(ctx->pipe, NULL);
-      ctx->tessctrl_shader = NULL;
-   }
-   ctx->pipe->delete_tcs_state(ctx->pipe, handle);
-}
-
 static void
 cso_save_tessctrl_shader(struct cso_context *ctx)
 {
@@ -930,16 +969,6 @@ void cso_set_tesseval_shader_handle(struct cso_context *ctx, void *handle)
    }
 }
 
-void cso_delete_tesseval_shader(struct cso_context *ctx, void *handle)
-{
-    if (handle == ctx->tesseval_shader) {
-      /* unbind before deleting */
-      ctx->pipe->bind_tes_state(ctx->pipe, NULL);
-      ctx->tesseval_shader = NULL;
-   }
-   ctx->pipe->delete_tes_state(ctx->pipe, handle);
-}
-
 static void
 cso_save_tesseval_shader(struct cso_context *ctx)
 {
@@ -975,52 +1004,33 @@ void cso_set_compute_shader_handle(struct cso_context *ctx, void *handle)
    }
 }
 
-void cso_delete_compute_shader(struct cso_context *ctx, void *handle)
-{
-    if (handle == ctx->compute_shader) {
-      /* unbind before deleting */
-      ctx->pipe->bind_compute_state(ctx->pipe, NULL);
-      ctx->compute_shader = NULL;
-   }
-   ctx->pipe->delete_compute_state(ctx->pipe, handle);
-}
-
-enum pipe_error
-cso_set_vertex_elements(struct cso_context *ctx,
-                        unsigned count,
-                        const struct pipe_vertex_element *states)
+static void
+cso_set_vertex_elements_direct(struct cso_context *ctx,
+                               const struct cso_velems_state *velems)
 {
-   struct u_vbuf *vbuf = ctx->vbuf;
    unsigned key_size, hash_key;
    struct cso_hash_iter iter;
    void *handle;
-   struct cso_velems_state velems_state;
-
-   if (vbuf) {
-      u_vbuf_set_vertex_elements(vbuf, count, states);
-      return PIPE_OK;
-   }
 
    /* Need to include the count in the stored state data too.
     * Otherwise, the first few pipe_vertex_elements could be identical
     * even if the counts differ, and there's no guarantee the hashes
     * would differ in that case either.
     */
-   key_size = sizeof(struct pipe_vertex_element) * count + sizeof(unsigned);
-   velems_state.count = count;
-   memcpy(velems_state.velems, states,
-          sizeof(struct pipe_vertex_element) * count);
-   hash_key = cso_construct_key((void*)&velems_state, key_size);
+   key_size = sizeof(struct pipe_vertex_element) * velems->count +
+              sizeof(unsigned);
+   hash_key = cso_construct_key((void*)velems, key_size);
    iter = cso_find_state_template(ctx->cache, hash_key, CSO_VELEMENTS,
-                                  (void*)&velems_state, key_size);
+                                  (void*)velems, key_size);
 
    if (cso_hash_iter_is_null(iter)) {
       struct cso_velements *cso = MALLOC(sizeof(struct cso_velements));
       if (!cso)
-         return PIPE_ERROR_OUT_OF_MEMORY;
+         return;
 
-      memcpy(&cso->state, &velems_state, key_size);
-      cso->data = ctx->pipe->create_vertex_elements_state(ctx->pipe, count,
+      memcpy(&cso->state, velems, key_size);
+      cso->data = ctx->pipe->create_vertex_elements_state(ctx->pipe,
+                                                          velems->count,
                                                       &cso->state.velems[0]);
       cso->delete_state =
          (cso_state_callback) ctx->pipe->delete_vertex_elements_state;
@@ -1029,7 +1039,7 @@ cso_set_vertex_elements(struct cso_context *ctx,
       iter = cso_insert_state(ctx->cache, hash_key, CSO_VELEMENTS, cso);
       if (cso_hash_iter_is_null(iter)) {
          FREE(cso);
-         return PIPE_ERROR_OUT_OF_MEMORY;
+         return;
       }
 
       handle = cso->data;
@@ -1042,13 +1052,27 @@ cso_set_vertex_elements(struct cso_context *ctx,
       ctx->velements = handle;
       ctx->pipe->bind_vertex_elements_state(ctx->pipe, handle);
    }
+}
+
+enum pipe_error
+cso_set_vertex_elements(struct cso_context *ctx,
+                        const struct cso_velems_state *velems)
+{
+   struct u_vbuf *vbuf = ctx->vbuf_current;
+
+   if (vbuf) {
+      u_vbuf_set_vertex_elements(vbuf, velems);
+      return PIPE_OK;
+   }
+
+   cso_set_vertex_elements_direct(ctx, velems);
    return PIPE_OK;
 }
 
 static void
 cso_save_vertex_elements(struct cso_context *ctx)
 {
-   struct u_vbuf *vbuf = ctx->vbuf;
+   struct u_vbuf *vbuf = ctx->vbuf_current;
 
    if (vbuf) {
       u_vbuf_save_vertex_elements(vbuf);
@@ -1062,7 +1086,7 @@ cso_save_vertex_elements(struct cso_context *ctx)
 static void
 cso_restore_vertex_elements(struct cso_context *ctx)
 {
-   struct u_vbuf *vbuf = ctx->vbuf;
+   struct u_vbuf *vbuf = ctx->vbuf_current;
 
    if (vbuf) {
       u_vbuf_restore_vertex_elements(vbuf);
@@ -1078,137 +1102,192 @@ cso_restore_vertex_elements(struct cso_context *ctx)
 
 /* vertex buffers */
 
+static void
+cso_set_vertex_buffers_direct(struct cso_context *ctx,
+                              unsigned start_slot, unsigned count,
+                              const struct pipe_vertex_buffer *buffers)
+{
+   /* Remember what's bound in slot 0, so that it can be saved and
+    * restored for meta ops.
+    */
+   if (start_slot == 0) {
+      if (buffers) {
+         pipe_vertex_buffer_reference(&ctx->vertex_buffer0_current,
+                                      buffers);
+      } else {
+         pipe_vertex_buffer_unreference(&ctx->vertex_buffer0_current);
+      }
+   }
+
+   ctx->pipe->set_vertex_buffers(ctx->pipe, start_slot, count, buffers);
+}
+
+
 void cso_set_vertex_buffers(struct cso_context *ctx,
                             unsigned start_slot, unsigned count,
                             const struct pipe_vertex_buffer *buffers)
 {
-   struct u_vbuf *vbuf = ctx->vbuf;
+   struct u_vbuf *vbuf = ctx->vbuf_current;
+
+   if (!count)
+      return;
 
    if (vbuf) {
       u_vbuf_set_vertex_buffers(vbuf, start_slot, count, buffers);
       return;
    }
 
-   /* Save what's in the auxiliary slot, so that we can save and restore it
-    * for meta ops. */
-   if (start_slot <= ctx->aux_vertex_buffer_index &&
-       start_slot+count > ctx->aux_vertex_buffer_index) {
-      if (buffers) {
-         const struct pipe_vertex_buffer *vb =
-               buffers + (ctx->aux_vertex_buffer_index - start_slot);
-
-         pipe_resource_reference(&ctx->aux_vertex_buffer_current.buffer,
-                                 vb->buffer);
-         memcpy(&ctx->aux_vertex_buffer_current, vb,
-                sizeof(struct pipe_vertex_buffer));
-      }
-      else {
-         pipe_resource_reference(&ctx->aux_vertex_buffer_current.buffer,
-                                 NULL);
-         ctx->aux_vertex_buffer_current.user_buffer = NULL;
-      }
-   }
-
-   ctx->pipe->set_vertex_buffers(ctx->pipe, start_slot, count, buffers);
+   cso_set_vertex_buffers_direct(ctx, start_slot, count, buffers);
 }
 
 static void
-cso_save_aux_vertex_buffer_slot(struct cso_context *ctx)
+cso_save_vertex_buffer0(struct cso_context *ctx)
 {
-   struct u_vbuf *vbuf = ctx->vbuf;
+   struct u_vbuf *vbuf = ctx->vbuf_current;
 
    if (vbuf) {
-      u_vbuf_save_aux_vertex_buffer_slot(vbuf);
+      u_vbuf_save_vertex_buffer0(vbuf);
       return;
    }
 
-   pipe_resource_reference(&ctx->aux_vertex_buffer_saved.buffer,
-                           ctx->aux_vertex_buffer_current.buffer);
-   memcpy(&ctx->aux_vertex_buffer_saved, &ctx->aux_vertex_buffer_current,
-          sizeof(struct pipe_vertex_buffer));
+   pipe_vertex_buffer_reference(&ctx->vertex_buffer0_saved,
+                                &ctx->vertex_buffer0_current);
 }
 
 static void
-cso_restore_aux_vertex_buffer_slot(struct cso_context *ctx)
+cso_restore_vertex_buffer0(struct cso_context *ctx)
 {
-   struct u_vbuf *vbuf = ctx->vbuf;
+   struct u_vbuf *vbuf = ctx->vbuf_current;
 
    if (vbuf) {
-      u_vbuf_restore_aux_vertex_buffer_slot(vbuf);
+      u_vbuf_restore_vertex_buffer0(vbuf);
       return;
    }
 
-   cso_set_vertex_buffers(ctx, ctx->aux_vertex_buffer_index, 1,
-                          &ctx->aux_vertex_buffer_saved);
-   pipe_resource_reference(&ctx->aux_vertex_buffer_saved.buffer, NULL);
+   cso_set_vertex_buffers(ctx, 0, 1, &ctx->vertex_buffer0_saved);
+   pipe_vertex_buffer_unreference(&ctx->vertex_buffer0_saved);
 }
 
-unsigned cso_get_aux_vertex_buffer_slot(struct cso_context *ctx)
+/**
+ * Set vertex buffers and vertex elements. Skip u_vbuf if it's only needed
+ * for user vertex buffers and user vertex buffers are not set by this call.
+ * u_vbuf will be disabled. To re-enable u_vbuf, call this function again.
+ *
+ * Skipping u_vbuf decreases CPU overhead for draw calls that don't need it,
+ * such as VBOs, glBegin/End, and display lists.
+ *
+ * Internal operations that do "save states, draw, restore states" shouldn't
+ * use this, because the states are only saved in either cso_context or
+ * u_vbuf, not both.
+ */
+void
+cso_set_vertex_buffers_and_elements(struct cso_context *ctx,
+                                    const struct cso_velems_state *velems,
+                                    unsigned vb_count,
+                                    unsigned unbind_trailing_vb_count,
+                                    const struct pipe_vertex_buffer *vbuffers,
+                                    bool uses_user_vertex_buffers)
 {
-   return ctx->aux_vertex_buffer_index;
-}
+   struct u_vbuf *vbuf = ctx->vbuf;
 
+   if (vbuf && (ctx->always_use_vbuf || uses_user_vertex_buffers)) {
+      if (!ctx->vbuf_current) {
+         /* Unbind all buffers in cso_context, because we'll use u_vbuf. */
+         unsigned unbind_vb_count = vb_count + unbind_trailing_vb_count;
+         if (unbind_vb_count)
+            cso_set_vertex_buffers_direct(ctx, 0, unbind_vb_count, NULL);
+
+         /* Unset this to make sure the CSO is re-bound on the next use. */
+         ctx->velements = NULL;
+         ctx->vbuf_current = vbuf;
+      } else if (unbind_trailing_vb_count) {
+         u_vbuf_set_vertex_buffers(vbuf, vb_count, unbind_trailing_vb_count,
+                                   NULL);
+      }
 
-/**************** fragment/vertex sampler view state *************************/
+      if (vb_count)
+         u_vbuf_set_vertex_buffers(vbuf, 0, vb_count, vbuffers);
+      u_vbuf_set_vertex_elements(vbuf, velems);
+      return;
+   }
 
-enum pipe_error
-cso_single_sampler(struct cso_context *ctx, unsigned shader_stage,
+   if (ctx->vbuf_current) {
+      /* Unbind all buffers in u_vbuf, because we'll use cso_context. */
+      unsigned unbind_vb_count = vb_count + unbind_trailing_vb_count;
+      if (unbind_vb_count)
+         u_vbuf_set_vertex_buffers(vbuf, 0, unbind_vb_count, NULL);
+
+      /* Unset this to make sure the CSO is re-bound on the next use. */
+      u_vbuf_unset_vertex_elements(vbuf);
+      ctx->vbuf_current = NULL;
+   } else if (unbind_trailing_vb_count) {
+      cso_set_vertex_buffers_direct(ctx, vb_count, unbind_trailing_vb_count,
+                                    NULL);
+   }
+
+   if (vb_count)
+      cso_set_vertex_buffers_direct(ctx, 0, vb_count, vbuffers);
+   cso_set_vertex_elements_direct(ctx, velems);
+}
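
A sketch of how a state tracker's draw path might call this entry point, assuming it has already filled in a cso_velems_state and an array of pipe_vertex_buffer (the wrapper and the last_vb_count bookkeeping are illustrative):

#include "pipe/p_state.h"
#include "cso_cache/cso_context.h"

/* Hypothetical fast path: bind vertex buffers and elements in one call,
 * unbinding any trailing slots left over from the previous draw. */
static void
bind_vertex_arrays(struct cso_context *cso,
                   const struct cso_velems_state *velems,
                   struct pipe_vertex_buffer *vbuffers,
                   unsigned vb_count, unsigned last_vb_count,
                   bool uses_user_pointers)
{
   unsigned unbind_trailing =
      last_vb_count > vb_count ? last_vb_count - vb_count : 0;

   cso_set_vertex_buffers_and_elements(cso, velems, vb_count,
                                       unbind_trailing, vbuffers,
                                       uses_user_pointers);
}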
+
+void
+cso_single_sampler(struct cso_context *ctx, enum pipe_shader_type shader_stage,
                    unsigned idx, const struct pipe_sampler_state *templ)
 {
-   void *handle = NULL;
-
    if (templ) {
       unsigned key_size = sizeof(struct pipe_sampler_state);
       unsigned hash_key = cso_construct_key((void*)templ, key_size);
+      struct cso_sampler *cso;
       struct cso_hash_iter iter =
          cso_find_state_template(ctx->cache,
                                  hash_key, CSO_SAMPLER,
                                  (void *) templ, key_size);
 
       if (cso_hash_iter_is_null(iter)) {
-         struct cso_sampler *cso = MALLOC(sizeof(struct cso_sampler));
+         cso = MALLOC(sizeof(struct cso_sampler));
          if (!cso)
-            return PIPE_ERROR_OUT_OF_MEMORY;
+            return;
 
          memcpy(&cso->state, templ, sizeof(*templ));
          cso->data = ctx->pipe->create_sampler_state(ctx->pipe, &cso->state);
          cso->delete_state =
             (cso_state_callback) ctx->pipe->delete_sampler_state;
          cso->context = ctx->pipe;
+         cso->hash_key = hash_key;
 
          iter = cso_insert_state(ctx->cache, hash_key, CSO_SAMPLER, cso);
          if (cso_hash_iter_is_null(iter)) {
             FREE(cso);
-            return PIPE_ERROR_OUT_OF_MEMORY;
+            return;
          }
-
-         handle = cso->data;
       }
       else {
-         handle = ((struct cso_sampler *)cso_hash_iter_data(iter))->data;
+         cso = cso_hash_iter_data(iter);
       }
-   }
 
-   ctx->samplers[shader_stage].samplers[idx] = handle;
-   return PIPE_OK;
+      ctx->samplers[shader_stage].cso_samplers[idx] = cso;
+      ctx->samplers[shader_stage].samplers[idx] = cso->data;
+      ctx->max_sampler_seen = MAX2(ctx->max_sampler_seen, (int)idx);
+   }
 }
 
 
+/**
+ * Send staged sampler state to the driver.
+ */
 void
-cso_single_sampler_done(struct cso_context *ctx, unsigned shader_stage)
+cso_single_sampler_done(struct cso_context *ctx,
+                        enum pipe_shader_type shader_stage)
 {
    struct sampler_info *info = &ctx->samplers[shader_stage];
-   unsigned i;
 
-   /* find highest non-null sampler */
-   for (i = PIPE_MAX_SAMPLERS; i > 0; i--) {
-      if (info->samplers[i - 1] != NULL)
-         break;
-   }
+   if (ctx->max_sampler_seen == -1)
+      return;
 
-   info->nr_samplers = i;
-   ctx->pipe->bind_sampler_states(ctx->pipe, shader_stage, 0, i,
+   ctx->pipe->bind_sampler_states(ctx->pipe, shader_stage, 0,
+                                  ctx->max_sampler_seen + 1,
                                   info->samplers);
+   ctx->max_sampler_seen = -1;
 }
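
The intended calling pattern, for reference: stage each slot with cso_single_sampler() and flush once, so the driver sees a single bind_sampler_states() call covering slots 0..max_sampler_seen. A hedged sketch (loop and names are illustrative):

/* Illustrative only: stage `count` fragment samplers, then bind them all. */
static void
update_fragment_samplers(struct cso_context *cso,
                         const struct pipe_sampler_state *states,
                         unsigned count)
{
   for (unsigned i = 0; i < count; i++)
      cso_single_sampler(cso, PIPE_SHADER_FRAGMENT, i, &states[i]);

   /* One bind_sampler_states() call for slots [0, max_sampler_seen]. */
   cso_single_sampler_done(cso, PIPE_SHADER_FRAGMENT);
}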
 
 
@@ -1217,44 +1296,27 @@ cso_single_sampler_done(struct cso_context *ctx, unsigned shader_stage)
  * last one. Done to always try to set as many samplers
  * as possible.
  */
-enum pipe_error
+void
 cso_set_samplers(struct cso_context *ctx,
-                 unsigned shader_stage,
+                 enum pipe_shader_type shader_stage,
                  unsigned nr,
                  const struct pipe_sampler_state **templates)
 {
-   struct sampler_info *info = &ctx->samplers[shader_stage];
-   unsigned i;
-   enum pipe_error temp, error = PIPE_OK;
-
-   /* TODO: fastpath
-    */
-
-   for (i = 0; i < nr; i++) {
-      temp = cso_single_sampler(ctx, shader_stage, i, templates[i]);
-      if (temp != PIPE_OK)
-         error = temp;
-   }
-
-   for ( ; i < info->nr_samplers; i++) {
-      temp = cso_single_sampler(ctx, shader_stage, i, NULL);
-      if (temp != PIPE_OK)
-         error = temp;
-   }
+   for (unsigned i = 0; i < nr; i++)
+      cso_single_sampler(ctx, shader_stage, i, templates[i]);
 
    cso_single_sampler_done(ctx, shader_stage);
-
-   return error;
 }
 
 static void
 cso_save_fragment_samplers(struct cso_context *ctx)
 {
    struct sampler_info *info = &ctx->samplers[PIPE_SHADER_FRAGMENT];
+   struct sampler_info *saved = &ctx->fragment_samplers_saved;
 
-   ctx->nr_fragment_samplers_saved = info->nr_samplers;
-   memcpy(ctx->fragment_samplers_saved, info->samplers,
-          sizeof(info->samplers));
+   memcpy(saved->cso_samplers, info->cso_samplers,
+          sizeof(info->cso_samplers));
+   memcpy(saved->samplers, info->samplers, sizeof(info->samplers));
 }
 
 
@@ -1262,17 +1324,26 @@ static void
 cso_restore_fragment_samplers(struct cso_context *ctx)
 {
    struct sampler_info *info = &ctx->samplers[PIPE_SHADER_FRAGMENT];
+   struct sampler_info *saved = &ctx->fragment_samplers_saved;
+
+   memcpy(info->cso_samplers, saved->cso_samplers,
+          sizeof(info->cso_samplers));
+   memcpy(info->samplers, saved->samplers, sizeof(info->samplers));
+
+   for (int i = PIPE_MAX_SAMPLERS - 1; i >= 0; i--) {
+      if (info->samplers[i]) {
+         ctx->max_sampler_seen = i;
+         break;
+      }
+   }
 
-   info->nr_samplers = ctx->nr_fragment_samplers_saved;
-   memcpy(info->samplers, ctx->fragment_samplers_saved,
-          sizeof(info->samplers));
    cso_single_sampler_done(ctx, PIPE_SHADER_FRAGMENT);
 }
 
 
 void
 cso_set_sampler_views(struct cso_context *ctx,
-                      unsigned shader_stage,
+                      enum pipe_shader_type shader_stage,
                       unsigned count,
                       struct pipe_sampler_view **views)
 {
@@ -1347,6 +1418,36 @@ cso_restore_fragment_sampler_views(struct cso_context *ctx)
 }
 
 
+void
+cso_set_shader_images(struct cso_context *ctx,
+                      enum pipe_shader_type shader_stage,
+                      unsigned start, unsigned count,
+                      struct pipe_image_view *images)
+{
+   if (shader_stage == PIPE_SHADER_FRAGMENT && start == 0 && count >= 1) {
+      util_copy_image_view(&ctx->fragment_image0_current, &images[0]);
+   }
+
+   ctx->pipe->set_shader_images(ctx->pipe, shader_stage, start, count, images);
+}
+
+
+static void
+cso_save_fragment_image0(struct cso_context *ctx)
+{
+   util_copy_image_view(&ctx->fragment_image0_saved,
+                        &ctx->fragment_image0_current);
+}
+
+
+static void
+cso_restore_fragment_image0(struct cso_context *ctx)
+{
+   cso_set_shader_images(ctx, PIPE_SHADER_FRAGMENT, 0, 1,
+                         &ctx->fragment_image0_saved);
+}
+
+
 void
 cso_set_stream_outputs(struct cso_context *ctx,
                        unsigned num_targets,
@@ -1436,7 +1537,8 @@ cso_restore_stream_outputs(struct cso_context *ctx)
 /* constant buffers */
 
 void
-cso_set_constant_buffer(struct cso_context *cso, unsigned shader_stage,
+cso_set_constant_buffer(struct cso_context *cso,
+                        enum pipe_shader_type shader_stage,
                         unsigned index, struct pipe_constant_buffer *cb)
 {
    struct pipe_context *pipe = cso->pipe;
@@ -1450,7 +1552,7 @@ cso_set_constant_buffer(struct cso_context *cso, unsigned shader_stage,
 
 void
 cso_set_constant_buffer_resource(struct cso_context *cso,
-                                 unsigned shader_stage,
+                                 enum pipe_shader_type shader_stage,
                                  unsigned index,
                                  struct pipe_resource *buffer)
 {
@@ -1466,9 +1568,26 @@ cso_set_constant_buffer_resource(struct cso_context *cso,
    }
 }
 
+void
+cso_set_constant_user_buffer(struct cso_context *cso,
+                             enum pipe_shader_type shader_stage,
+                             unsigned index, void *ptr, unsigned size)
+{
+   if (ptr) {
+      struct pipe_constant_buffer cb;
+      cb.buffer = NULL;
+      cb.buffer_offset = 0;
+      cb.buffer_size = size;
+      cb.user_buffer = ptr;
+      cso_set_constant_buffer(cso, shader_stage, index, &cb);
+   } else {
+      cso_set_constant_buffer(cso, shader_stage, index, NULL);
+   }
+}
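
For example, a caller with a few immediate constants can hand the pointer straight to the new helper instead of filling a pipe_constant_buffer by hand (the wrapper is illustrative):

/* Illustrative only: wrap `data` as user constant buffer 0 for the FS;
 * passing NULL data unbinds the slot. */
static void
push_fs_constants(struct cso_context *cso, void *data, unsigned size)
{
   cso_set_constant_user_buffer(cso, PIPE_SHADER_FRAGMENT, 0, data, size);
}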
+
 void
 cso_save_constant_buffer_slot0(struct cso_context *cso,
-                                  unsigned shader_stage)
+                               enum pipe_shader_type shader_stage)
 {
    util_copy_constant_buffer(&cso->aux_constbuf_saved[shader_stage],
                              &cso->aux_constbuf_current[shader_stage]);
@@ -1476,7 +1595,7 @@ cso_save_constant_buffer_slot0(struct cso_context *cso,
 
 void
 cso_restore_constant_buffer_slot0(struct cso_context *cso,
-                                     unsigned shader_stage)
+                                  enum pipe_shader_type shader_stage)
 {
    cso_set_constant_buffer(cso, shader_stage, 0,
                            &cso->aux_constbuf_saved[shader_stage]);
@@ -1497,7 +1616,7 @@ cso_save_state(struct cso_context *cso, unsigned state_mask)
    cso->saved_state = state_mask;
 
    if (state_mask & CSO_BIT_AUX_VERTEX_BUFFER_SLOT)
-      cso_save_aux_vertex_buffer_slot(cso);
+      cso_save_vertex_buffer0(cso);
    if (state_mask & CSO_BIT_BLEND)
       cso_save_blend(cso);
    if (state_mask & CSO_BIT_DEPTH_STENCIL_ALPHA)
@@ -1534,6 +1653,10 @@ cso_save_state(struct cso_context *cso, unsigned state_mask)
       cso_save_vertex_shader(cso);
    if (state_mask & CSO_BIT_VIEWPORT)
       cso_save_viewport(cso);
+   if (state_mask & CSO_BIT_PAUSE_QUERIES)
+      cso->pipe->set_active_query_state(cso->pipe, false);
+   if (state_mask & CSO_BIT_FRAGMENT_IMAGE0)
+      cso_save_fragment_image0(cso);
 }
 
 
@@ -1548,7 +1671,7 @@ cso_restore_state(struct cso_context *cso)
    assert(state_mask);
 
    if (state_mask & CSO_BIT_AUX_VERTEX_BUFFER_SLOT)
-      cso_restore_aux_vertex_buffer_slot(cso);
+      cso_restore_vertex_buffer0(cso);
    if (state_mask & CSO_BIT_BLEND)
       cso_restore_blend(cso);
    if (state_mask & CSO_BIT_DEPTH_STENCIL_ALPHA)
@@ -1585,6 +1708,10 @@ cso_restore_state(struct cso_context *cso)
       cso_restore_vertex_shader(cso);
    if (state_mask & CSO_BIT_VIEWPORT)
       cso_restore_viewport(cso);
+   if (state_mask & CSO_BIT_PAUSE_QUERIES)
+      cso->pipe->set_active_query_state(cso->pipe, true);
+   if (state_mask & CSO_BIT_FRAGMENT_IMAGE0)
+      cso_restore_fragment_image0(cso);
 
    cso->saved_state = 0;
 }
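
The new CSO_BIT_FRAGMENT_IMAGE0 and CSO_BIT_PAUSE_QUERIES bits fit the usual meta-op pattern. A hedged sketch, using only bits visible in this patch (the emit callback is a placeholder):

/* Illustrative meta-op skeleton: save the state the op will clobber,
 * pause queries, let the caller draw, then restore everything. */
static void
run_meta_op(struct cso_context *cso, void (*emit)(struct cso_context *))
{
   cso_save_state(cso, CSO_BIT_BLEND |
                       CSO_BIT_DEPTH_STENCIL_ALPHA |
                       CSO_BIT_VIEWPORT |
                       CSO_BIT_AUX_VERTEX_BUFFER_SLOT |
                       CSO_BIT_FRAGMENT_IMAGE0 |
                       CSO_BIT_PAUSE_QUERIES);

   emit(cso);              /* bind meta state and issue the draw */

   cso_restore_state(cso); /* put everything back and resume queries */
}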
@@ -1593,25 +1720,17 @@ cso_restore_state(struct cso_context *cso)
 
 /* drawing */
 
-void
-cso_set_index_buffer(struct cso_context *cso,
-                     const struct pipe_index_buffer *ib)
-{
-   struct u_vbuf *vbuf = cso->vbuf;
-
-   if (vbuf) {
-      u_vbuf_set_index_buffer(vbuf, ib);
-   } else {
-      struct pipe_context *pipe = cso->pipe;
-      pipe->set_index_buffer(pipe, ib);
-   }
-}
-
 void
 cso_draw_vbo(struct cso_context *cso,
              const struct pipe_draw_info *info)
 {
-   struct u_vbuf *vbuf = cso->vbuf;
+   struct u_vbuf *vbuf = cso->vbuf_current;
+
+   /* We can't have both indirect drawing and SO-vertex-count drawing */
+   assert(info->indirect == NULL || info->count_from_stream_output == NULL);
+
+   /* We can't have SO-vertex-count drawing with an index buffer */
+   assert(info->count_from_stream_output == NULL || info->index_size == 0);
 
    if (vbuf) {
       u_vbuf_draw_vbo(vbuf, info);