st/xa: Fix a memory leak
[mesa.git] / src / gallium / state_trackers / nine / nine_state.c
index a5fa55328ca9180e49871155c63964a4b078d377..273be88e2b81f293246eacdf1f57b072ae4f3638 100644 (file)
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+#define NINE_STATE
+
 #include "device9.h"
+#include "swapchain9.h"
 #include "basetexture9.h"
 #include "buffer9.h"
 #include "indexbuffer9.h"
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
 #include "cso_cache/cso_context.h"
+#include "util/u_atomic.h"
 #include "util/u_upload_mgr.h"
 #include "util/u_math.h"
 #include "util/u_box.h"
 #include "util/u_simple_shaders.h"
+#include "util/u_gen_mipmap.h"
+
+/* CSMT headers */
+#include "nine_queue.h"
+#include "nine_csmt_helper.h"
+#include "os/os_thread.h"
 
 #define DBG_CHANNEL DBG_DEVICE
 
+/* Nine CSMT */
+
+/* Header of a queued CSMT command.
+ * The worker thread calls func with the device and a pointer to this
+ * instruction; a non-zero return value makes the worker signal the
+ * "processed" event. */
+struct csmt_instruction {
+    int (* func)(struct NineDevice9 *This, struct csmt_instruction *instr);
+};
+
+/* State shared between the threads submitting commands and the CSMT worker. */
+struct csmt_context {
+    /* Worker thread handle returned by u_thread_create(). */
+    thrd_t worker;
+    /* Command queue the worker pulls instructions from. */
+    struct nine_queue_pool* pool;
+    /* Set to TRUE by nine_csmt_destroy() to make the worker leave its loop. */
+    BOOL terminate;
+    /* Condition variable / mutex pair protecting and announcing 'processed'. */
+    cnd_t event_processed;
+    mtx_t mutex_processed;
+    /* Device passed as first argument to every instruction callback. */
+    struct NineDevice9 *device;
+    /* Reset to FALSE before a flush; set to TRUE by the worker when an
+     * instruction returns non-zero or when it terminates. */
+    BOOL processed;
+    /* Set by nine_csmt_pause() to ask the worker to release thread_running. */
+    BOOL toPause;
+    /* TRUE while nine_csmt_pause() holds both mutexes below; cleared by
+     * nine_csmt_resume(). */
+    BOOL hasPaused;
+    /* Held by the worker while it executes instructions, and by the pausing
+     * thread between nine_csmt_pause() and nine_csmt_resume(). */
+    mtx_t thread_running;
+    /* Held by the pausing thread between pause and resume; the worker blocks
+     * on it after it has been asked to pause. */
+    mtx_t thread_resume;
+};
+
+/* Wait for instruction to be processed.
+ * Caller has to ensure that only one thread waits at a time.
+ *
+ * Blocks on event_processed until ctx->processed reads as TRUE. The flag is
+ * re-checked after every wake-up, so a wake-up without the flag being set
+ * simply waits again.
+ */
+static void
+nine_csmt_wait_processed(struct csmt_context *ctx)
+{
+    mtx_lock(&ctx->mutex_processed);
+    while (!p_atomic_read(&ctx->processed)) {
+        /* Releases mutex_processed while sleeping, re-acquires on wake-up. */
+        cnd_wait(&ctx->event_processed, &ctx->mutex_processed);
+    }
+    mtx_unlock(&ctx->mutex_processed);
+}
+
+/* CSMT worker thread.
+ *
+ * arg is the struct csmt_context created by nine_csmt_create().
+ * Repeatedly waits for a flush of the queue, then executes the queued
+ * instructions while holding thread_running. Leaves the loop once
+ * ctx->terminate is set and always returns 0.
+ */
+static
+int
+nine_csmt_worker(void *arg)
+{
+    struct csmt_context *ctx = arg;
+    struct csmt_instruction *instr;
+    DBG("CSMT worker spawned\n");
+
+    u_thread_setname("CSMT-Worker");
+
+    while (1) {
+        nine_queue_wait_flush(ctx->pool);
+        /* Held for the whole batch so nine_csmt_pause() can tell when the
+         * worker is not executing instructions. */
+        mtx_lock(&ctx->thread_running);
+
+        /* Get instruction. NULL on empty cmdbuf. */
+        while (!p_atomic_read(&ctx->terminate) &&
+               (instr = (struct csmt_instruction *)nine_queue_get(ctx->pool))) {
+
+            /* decode */
+            if (instr->func(ctx->device, instr)) {
+                /* Non-zero return: wake the thread blocked in
+                 * nine_csmt_wait_processed(). */
+                mtx_lock(&ctx->mutex_processed);
+                p_atomic_set(&ctx->processed, TRUE);
+                cnd_signal(&ctx->event_processed);
+                mtx_unlock(&ctx->mutex_processed);
+            }
+            if (p_atomic_read(&ctx->toPause)) {
+                /* Hand thread_running over to the pausing thread. */
+                mtx_unlock(&ctx->thread_running);
+                /* will wait here until the thread can be resumed */
+                mtx_lock(&ctx->thread_resume);
+                mtx_lock(&ctx->thread_running);
+                mtx_unlock(&ctx->thread_resume);
+            }
+        }
+
+        mtx_unlock(&ctx->thread_running);
+        if (p_atomic_read(&ctx->terminate)) {
+            /* Tell nine_csmt_destroy() that the worker is about to exit. */
+            mtx_lock(&ctx->mutex_processed);
+            p_atomic_set(&ctx->processed, TRUE);
+            cnd_signal(&ctx->event_processed);
+            mtx_unlock(&ctx->mutex_processed);
+            break;
+        }
+    }
+
+    DBG("CSMT worker destroyed\n");
+    return 0;
+}
+
+/* Create a CSMT context.
+ * Spawns a worker thread.
+ *
+ * Returns the new context, or NULL if the context, its command queue or the
+ * worker thread could not be created. On failure everything that was set up
+ * so far (queue, mutexes, condition variable, context) is released again.
+ */
+struct csmt_context *
+nine_csmt_create( struct NineDevice9 *This )
+{
+    struct csmt_context *ctx;
+
+    ctx = CALLOC_STRUCT(csmt_context);
+    if (!ctx)
+        return NULL;
+
+    ctx->pool = nine_queue_create();
+    if (!ctx->pool) {
+        FREE(ctx);
+        return NULL;
+    }
+    cnd_init(&ctx->event_processed);
+    (void) mtx_init(&ctx->mutex_processed, mtx_plain);
+    (void) mtx_init(&ctx->thread_running, mtx_plain);
+    (void) mtx_init(&ctx->thread_resume, mtx_plain);
+
+#if DEBUG
+    u_thread_setname("Main thread");
+#endif
+
+    /* Set before the worker is spawned: the worker passes ctx->device to
+     * every instruction it executes. */
+    ctx->device = This;
+
+    ctx->worker = u_thread_create(nine_csmt_worker, ctx);
+    if (!ctx->worker) {
+        /* No worker was started, so nothing else can be using these
+         * objects; release them in the same order as nine_csmt_destroy(). */
+        nine_queue_delete(ctx->pool);
+
+        mtx_destroy(&ctx->thread_resume);
+        mtx_destroy(&ctx->thread_running);
+
+        mtx_destroy(&ctx->mutex_processed);
+        cnd_destroy(&ctx->event_processed);
+
+        FREE(ctx);
+        return NULL;
+    }
+
+    DBG("Returning context %p\n", ctx);
+
+    return ctx;
+}
+
+/* Instruction callback that does nothing.
+ * Always returns 1, so the worker signals "processed" after executing it. */
+static int
+nop_func( struct NineDevice9 *This, struct csmt_instruction *instr )
+{
+    /* Both arguments are intentionally unused. */
+    (void) instr;
+    (void) This;
+
+    return 1;
+}
+
+/* Append a nop marker, flush the queue and block until the worker has
+ * reported it as processed.
+ * Returns immediately when CSMT is not active or the queue is empty. */
+void
+nine_csmt_process( struct NineDevice9 *device )
+{
+    struct csmt_context *ctx = device->csmt_ctx;
+    struct csmt_instruction *marker;
+
+    if (!device->csmt_active || nine_queue_isempty(ctx->pool))
+        return;
+
+    DBG("device=%p\n", device);
+
+    /* The nop returns non-zero, which is what triggers the wake-up below. */
+    marker = nine_queue_alloc(ctx->pool, sizeof(*marker));
+    assert(marker);
+    marker->func = nop_func;
+
+    /* Clear the flag before the worker can see the marker. */
+    p_atomic_set(&ctx->processed, FALSE);
+    nine_queue_flush(ctx->pool);
+
+    nine_csmt_wait_processed(ctx);
+}
+
+/* Destroys a CSMT context.
+ * Waits for the worker thread to terminate.
+ *
+ * The worker is joined before the queue, the synchronisation objects and the
+ * context itself are released, so none of them can be torn down while the
+ * worker is still returning from its final mtx_unlock().
+ */
+void
+nine_csmt_destroy( struct NineDevice9 *device, struct csmt_context *ctx )
+{
+    struct csmt_instruction* instr;
+    thrd_t render_thread = ctx->worker;
+
+    DBG("device=%p ctx=%p\n", device, ctx);
+
+    /* Push nop and flush the queue. */
+    instr = nine_queue_alloc(ctx->pool, sizeof(struct csmt_instruction));
+    assert(instr);
+    instr->func = nop_func;
+
+    p_atomic_set(&ctx->processed, FALSE);
+    /* Signal worker to terminate. */
+    p_atomic_set(&ctx->terminate, TRUE);
+    nine_queue_flush(ctx->pool);
+
+    nine_csmt_wait_processed(ctx);
+
+    /* The worker has announced its exit; wait until it is really gone
+     * before touching anything it might still reference. */
+    thrd_join(render_thread, NULL);
+
+    nine_queue_delete(ctx->pool);
+
+    mtx_destroy(&ctx->thread_resume);
+    mtx_destroy(&ctx->thread_running);
+
+    mtx_destroy(&ctx->mutex_processed);
+    cnd_destroy(&ctx->event_processed);
+
+    FREE(ctx);
+}
+
+/* Stop the worker from executing further instructions.
+ *
+ * Returns immediately when CSMT is inactive or the queue reports no flushed
+ * work. Otherwise returns with both thread_resume and thread_running held
+ * and hasPaused set; nine_csmt_resume() releases them again.
+ */
+static void
+nine_csmt_pause( struct NineDevice9 *device )
+{
+    struct csmt_context *ctx = device->csmt_ctx;
+
+    if (!device->csmt_active)
+        return;
+
+    /* No need to pause the thread */
+    if (nine_queue_no_flushed_work(ctx->pool))
+        return;
+
+    /* Taken first so the worker blocks on it once it honours toPause. */
+    mtx_lock(&ctx->thread_resume);
+    p_atomic_set(&ctx->toPause, TRUE);
+
+    /* Wait until the thread is paused */
+    mtx_lock(&ctx->thread_running);
+    ctx->hasPaused = TRUE;
+    p_atomic_set(&ctx->toPause, FALSE);
+}
+
+/* Undo a previous nine_csmt_pause().
+ * Does nothing when CSMT is inactive or when the pause did not actually
+ * take the locks (hasPaused is not set). */
+static void
+nine_csmt_resume( struct NineDevice9 *device )
+{
+    struct csmt_context *ctx = device->csmt_ctx;
+
+    if (!device->csmt_active || !ctx->hasPaused)
+        return;
+
+    ctx->hasPaused = FALSE;
+
+    /* Release in this order: the worker takes thread_resume first and
+     * thread_running second. */
+    mtx_unlock(&ctx->thread_running);
+    mtx_unlock(&ctx->thread_resume);
+}
+
+/* Flush pending CSMT work, wait for it, then hand out the pipe context. */
+struct pipe_context *
+nine_context_get_pipe( struct NineDevice9 *device )
+{
+    struct pipe_context *pipe;
+
+    nine_csmt_process(device);
+
+    pipe = device->context.pipe;
+    return pipe;
+}
+
+/* Like nine_context_get_pipe(), but skips the flush-and-wait when CSMT is
+ * inactive or when the caller is the worker thread itself. */
+struct pipe_context *
+nine_context_get_pipe_multithread( struct NineDevice9 *device )
+{
+    if (device->csmt_active) {
+        struct csmt_context *ctx = device->csmt_ctx;
+
+        if (!u_thread_is_self(ctx->worker))
+            nine_csmt_process(device);
+    }
+
+    return device->context.pipe;
+}
+
+/* Pause the CSMT worker and return the pipe context.
+ * Pair every call with nine_context_get_pipe_release(). */
+struct pipe_context *
+nine_context_get_pipe_acquire( struct NineDevice9 *device )
+{
+    struct pipe_context *pipe;
+
+    nine_csmt_pause(device);
+
+    pipe = device->context.pipe;
+    return pipe;
+}
+
+/* Counterpart of nine_context_get_pipe_acquire(): lets the CSMT worker
+ * run again if it was paused. */
+void
+nine_context_get_pipe_release( struct NineDevice9 *device )
+{
+    nine_csmt_resume(device);
+}
+
+/* Nine state functions */
+
 /* Check if some states need to be set dirty */
 
 static inline DWORD
@@ -64,21 +340,21 @@ check_multisample(struct NineDevice9 *device)
 static inline void
 prepare_blend(struct NineDevice9 *device)
 {
-    nine_convert_blend_state(&device->context.pipe.blend, device->context.rs);
+    nine_convert_blend_state(&device->context.pipe_data.blend, device->context.rs);
     device->context.commit |= NINE_STATE_COMMIT_BLEND;
 }
 
 static inline void
 prepare_dsa(struct NineDevice9 *device)
 {
-    nine_convert_dsa_state(&device->context.pipe.dsa, device->context.rs);
+    nine_convert_dsa_state(&device->context.pipe_data.dsa, device->context.rs);
     device->context.commit |= NINE_STATE_COMMIT_DSA;
 }
 
 static inline void
 prepare_rasterizer(struct NineDevice9 *device)
 {
-    nine_convert_rasterizer_state(device, &device->context.pipe.rast, device->context.rs);
+    nine_convert_rasterizer_state(device, &device->context.pipe_data.rast, device->context.rs);
     device->context.commit |= NINE_STATE_COMMIT_RASTERIZER;
 }
 
@@ -111,17 +387,14 @@ prepare_vs_constants_userbuf_swvp(struct NineDevice9 *device)
             cb.user_buffer = dst;
         }
 
-        /* Do not erase the buffer field.
-         * It is either NULL (user_cbufs), or a resource.
-         * u_upload_data will do the proper refcount */
-        context->pipe.cb0_swvp.buffer_offset = cb.buffer_offset;
-        context->pipe.cb0_swvp.buffer_size = cb.buffer_size;
-        context->pipe.cb0_swvp.user_buffer = cb.user_buffer;
+        context->pipe_data.cb0_swvp.buffer_offset = cb.buffer_offset;
+        context->pipe_data.cb0_swvp.buffer_size = cb.buffer_size;
+        context->pipe_data.cb0_swvp.user_buffer = cb.user_buffer;
 
         cb.user_buffer = (char *)cb.user_buffer + 4096 * sizeof(float[4]);
-        context->pipe.cb1_swvp.buffer_offset = cb.buffer_offset;
-        context->pipe.cb1_swvp.buffer_size = cb.buffer_size;
-        context->pipe.cb1_swvp.user_buffer = cb.user_buffer;
+        context->pipe_data.cb1_swvp.buffer_offset = cb.buffer_offset;
+        context->pipe_data.cb1_swvp.buffer_size = cb.buffer_size;
+        context->pipe_data.cb1_swvp.user_buffer = cb.user_buffer;
 
         context->changed.vs_const_f = 0;
     }
@@ -133,9 +406,9 @@ prepare_vs_constants_userbuf_swvp(struct NineDevice9 *device)
         cb.buffer_size = 2048 * sizeof(float[4]);
         cb.user_buffer = context->vs_const_i;
 
-        context->pipe.cb2_swvp.buffer_offset = cb.buffer_offset;
-        context->pipe.cb2_swvp.buffer_size = cb.buffer_size;
-        context->pipe.cb2_swvp.user_buffer = cb.user_buffer;
+        context->pipe_data.cb2_swvp.buffer_offset = cb.buffer_offset;
+        context->pipe_data.cb2_swvp.buffer_size = cb.buffer_size;
+        context->pipe_data.cb2_swvp.user_buffer = cb.user_buffer;
         context->changed.vs_const_i = 0;
     }
 
@@ -146,58 +419,12 @@ prepare_vs_constants_userbuf_swvp(struct NineDevice9 *device)
         cb.buffer_size = 512 * sizeof(float[4]);
         cb.user_buffer = context->vs_const_b;
 
-        context->pipe.cb3_swvp.buffer_offset = cb.buffer_offset;
-        context->pipe.cb3_swvp.buffer_size = cb.buffer_size;
-        context->pipe.cb3_swvp.user_buffer = cb.user_buffer;
+        context->pipe_data.cb3_swvp.buffer_offset = cb.buffer_offset;
+        context->pipe_data.cb3_swvp.buffer_size = cb.buffer_size;
+        context->pipe_data.cb3_swvp.user_buffer = cb.user_buffer;
         context->changed.vs_const_b = 0;
     }
 
-    if (!device->driver_caps.user_cbufs) {
-        struct pipe_constant_buffer *cb = &(context->pipe.cb0_swvp);
-        u_upload_data(device->constbuf_uploader,
-                      0,
-                      cb->buffer_size,
-                      device->constbuf_alignment,
-                      cb->user_buffer,
-                      &(cb->buffer_offset),
-                      &(cb->buffer));
-        u_upload_unmap(device->constbuf_uploader);
-        cb->user_buffer = NULL;
-
-        cb = &(context->pipe.cb1_swvp);
-        u_upload_data(device->constbuf_uploader,
-                      0,
-                      cb->buffer_size,
-                      device->constbuf_alignment,
-                      cb->user_buffer,
-                      &(cb->buffer_offset),
-                      &(cb->buffer));
-        u_upload_unmap(device->constbuf_uploader);
-        cb->user_buffer = NULL;
-
-        cb = &(context->pipe.cb2_swvp);
-        u_upload_data(device->constbuf_uploader,
-                      0,
-                      cb->buffer_size,
-                      device->constbuf_alignment,
-                      cb->user_buffer,
-                      &(cb->buffer_offset),
-                      &(cb->buffer));
-        u_upload_unmap(device->constbuf_uploader);
-        cb->user_buffer = NULL;
-
-        cb = &(context->pipe.cb3_swvp);
-        u_upload_data(device->constbuf_uploader,
-                      0,
-                      cb->buffer_size,
-                      device->constbuf_alignment,
-                      cb->user_buffer,
-                      &(cb->buffer_offset),
-                      &(cb->buffer));
-        u_upload_unmap(device->constbuf_uploader);
-        cb->user_buffer = NULL;
-    }
-
     context->changed.group &= ~NINE_STATE_VS_CONST;
     context->commit |= NINE_STATE_COMMIT_CONST_VS;
 }
@@ -212,7 +439,7 @@ prepare_vs_constants_userbuf(struct NineDevice9 *device)
     cb.buffer_size = context->vs->const_used_size;
     cb.user_buffer = context->vs_const_f;
 
-    if (device->swvp) {
+    if (context->swvp) {
         prepare_vs_constants_userbuf_swvp(device);
         return;
     }
@@ -251,20 +478,7 @@ prepare_vs_constants_userbuf(struct NineDevice9 *device)
         cb.user_buffer = dst;
     }
 
-    if (!device->driver_caps.user_cbufs) {
-        context->pipe.cb_vs.buffer_size = cb.buffer_size;
-        u_upload_data(device->constbuf_uploader,
-                      0,
-                      cb.buffer_size,
-                      device->constbuf_alignment,
-                      cb.user_buffer,
-                      &context->pipe.cb_vs.buffer_offset,
-                      &context->pipe.cb_vs.buffer);
-        u_upload_unmap(device->constbuf_uploader);
-        context->pipe.cb_vs.user_buffer = NULL;
-    } else
-        context->pipe.cb_vs = cb;
-
+    context->pipe_data.cb_vs = cb;
     context->changed.vs_const_f = 0;
 
     context->changed.group &= ~NINE_STATE_VS_CONST;
@@ -322,20 +536,7 @@ prepare_ps_constants_userbuf(struct NineDevice9 *device)
     if (!cb.buffer_size)
         return;
 
-    if (!device->driver_caps.user_cbufs) {
-        context->pipe.cb_ps.buffer_size = cb.buffer_size;
-        u_upload_data(device->constbuf_uploader,
-                      0,
-                      cb.buffer_size,
-                      device->constbuf_alignment,
-                      cb.user_buffer,
-                      &context->pipe.cb_ps.buffer_offset,
-                      &context->pipe.cb_ps.buffer);
-        u_upload_unmap(device->constbuf_uploader);
-        context->pipe.cb_ps.user_buffer = NULL;
-    } else
-        context->pipe.cb_ps = cb;
-
+    context->pipe_data.cb_ps = cb;
     context->changed.ps_const_f = 0;
 
     context->changed.group &= ~NINE_STATE_PS_CONST;
@@ -358,10 +559,10 @@ prepare_vs(struct NineDevice9 *device, uint8_t shader_changed)
 
     /* likely because we dislike FF */
     if (likely(context->programmable_vs)) {
-        context->cso.vs = NineVertexShader9_GetVariant(vs);
+        context->cso_shader.vs = NineVertexShader9_GetVariant(vs);
     } else {
         vs = device->ff.vs;
-        context->cso.vs = vs->ff_cso;
+        context->cso_shader.vs = vs->ff_cso;
     }
 
     if (context->rs[NINED3DRS_VSPOINTSIZE] != vs->point_size) {
@@ -392,10 +593,10 @@ prepare_ps(struct NineDevice9 *device, uint8_t shader_changed)
         return 0;
 
     if (likely(ps)) {
-        context->cso.ps = NinePixelShader9_GetVariant(ps);
+        context->cso_shader.ps = NinePixelShader9_GetVariant(ps);
     } else {
         ps = device->ff.ps;
-        context->cso.ps = ps->ff_cso;
+        context->cso_shader.ps = ps->ff_cso;
     }
 
     if ((context->bound_samplers_mask_ps & ps->sampler_mask) != ps->sampler_mask)
@@ -413,9 +614,9 @@ prepare_ps(struct NineDevice9 *device, uint8_t shader_changed)
 static void
 update_framebuffer(struct NineDevice9 *device, bool is_clear)
 {
-    struct pipe_context *pipe = device->pipe;
     struct nine_context *context = &device->context;
-    struct pipe_framebuffer_state *fb = &context->pipe.fb;
+    struct pipe_context *pipe = context->pipe;
+    struct pipe_framebuffer_state *fb = &context->pipe_data.fb;
     unsigned i;
     struct NineSurface9 *rt0 = context->rt[0];
     unsigned w = rt0->desc.Width;
@@ -455,12 +656,6 @@ update_framebuffer(struct NineDevice9 *device, bool is_clear)
             fb->cbufs[i] = NineSurface9_GetSurface(rt, sRGB);
             context->rt_mask |= 1 << i;
             fb->nr_cbufs = i + 1;
-
-            if (unlikely(rt->desc.Usage & D3DUSAGE_AUTOGENMIPMAP)) {
-                assert(rt->texture == D3DRTYPE_TEXTURE ||
-                       rt->texture == D3DRTYPE_CUBETEXTURE);
-                NineBaseTexture9(rt->base.base.container)->dirty_mip = TRUE;
-            }
         } else {
             /* Color outputs must match RT slot,
              * drivers will have to handle NULL entries for GL, too.
@@ -489,7 +684,8 @@ update_framebuffer(struct NineDevice9 *device, bool is_clear)
 static void
 update_viewport(struct NineDevice9 *device)
 {
-    const D3DVIEWPORT9 *vport = &device->context.viewport;
+    struct nine_context *context = &device->context;
+    const D3DVIEWPORT9 *vport = &context->viewport;
     struct pipe_viewport_state pvport;
 
     /* D3D coordinates are:
@@ -527,7 +723,7 @@ update_viewport(struct NineDevice9 *device)
         pvport.translate[1] -= 1.0f / 128.0f;
     }
 
-    cso_set_viewport(device->cso, &pvport);
+    cso_set_viewport(context->cso, &pvport);
 }
 
 /* Loop through VS inputs and pick the vertex elements with the declared
@@ -617,14 +813,14 @@ update_vertex_elements(struct NineDevice9 *device)
         context->dummy_vbo_bound_at = dummy_vbo_stream;
     }
 
-    cso_set_vertex_elements(device->cso, vs->num_inputs, ve);
+    cso_set_vertex_elements(context->cso, vs->num_inputs, ve);
 }
 
 static void
 update_vertex_buffers(struct NineDevice9 *device)
 {
-    struct pipe_context *pipe = device->pipe;
     struct nine_context *context = &device->context;
+    struct pipe_context *pipe = context->pipe;
     struct pipe_vertex_buffer dummy_vtxbuf;
     uint32_t mask = context->changed.vtxbuf;
     unsigned i;
@@ -633,9 +829,9 @@ update_vertex_buffers(struct NineDevice9 *device)
 
     if (context->dummy_vbo_bound_at >= 0) {
         if (!context->vbo_bound_done) {
-            dummy_vtxbuf.buffer = device->dummy_vbo;
+            dummy_vtxbuf.buffer.resource = device->dummy_vbo;
             dummy_vtxbuf.stride = 0;
-            dummy_vtxbuf.user_buffer = NULL;
+            dummy_vtxbuf.is_user_buffer = false;
             dummy_vtxbuf.buffer_offset = 0;
             pipe->set_vertex_buffers(pipe, context->dummy_vbo_bound_at,
                                      1, &dummy_vtxbuf);
@@ -646,7 +842,7 @@ update_vertex_buffers(struct NineDevice9 *device)
 
     for (i = 0; mask; mask >>= 1, ++i) {
         if (mask & 1) {
-            if (context->vtxbuf[i].buffer)
+            if (context->vtxbuf[i].buffer.resource)
                 pipe->set_vertex_buffers(pipe, i, 1, &context->vtxbuf[i]);
             else
                 pipe->set_vertex_buffers(pipe, i, 1, NULL);
@@ -661,20 +857,20 @@ update_sampler_derived(struct nine_context *context, unsigned s)
 {
     boolean changed = FALSE;
 
-    if (context->samp[s][NINED3DSAMP_SHADOW] != context->texture[s]->shadow) {
+    if (context->samp[s][NINED3DSAMP_SHADOW] != context->texture[s].shadow) {
         changed = TRUE;
-        context->samp[s][NINED3DSAMP_SHADOW] = context->texture[s]->shadow;
+        context->samp[s][NINED3DSAMP_SHADOW] = context->texture[s].shadow;
     }
 
     if (context->samp[s][NINED3DSAMP_CUBETEX] !=
-        (NineResource9(context->texture[s])->type == D3DRTYPE_CUBETEXTURE)) {
+        (context->texture[s].type == D3DRTYPE_CUBETEXTURE)) {
         changed = TRUE;
         context->samp[s][NINED3DSAMP_CUBETEX] =
-                NineResource9(context->texture[s])->type == D3DRTYPE_CUBETEXTURE;
+                context->texture[s].type == D3DRTYPE_CUBETEXTURE;
     }
 
     if (context->samp[s][D3DSAMP_MIPFILTER] != D3DTEXF_NONE) {
-        int lod = context->samp[s][D3DSAMP_MAXMIPLEVEL] - context->texture[s]->managed.lod;
+        int lod = context->samp[s][D3DSAMP_MAXMIPLEVEL] - context->texture[s].lod;
         if (lod < 0)
             lod = 0;
         if (context->samp[s][NINED3DSAMP_MINLOD] != lod) {
@@ -708,21 +904,21 @@ update_textures_and_samplers(struct NineDevice9 *device)
         const unsigned s = NINE_SAMPLER_PS(i);
         int sRGB;
 
-        if (!context->texture[s] && !(sampler_mask & (1 << i))) {
+        if (!context->texture[s].enabled && !(sampler_mask & (1 << i))) {
             view[i] = NULL;
             continue;
         }
 
-        if (context->texture[s]) {
+        if (context->texture[s].enabled) {
             sRGB = context->samp[s][D3DSAMP_SRGBTEXTURE] ? 1 : 0;
 
-            view[i] = NineBaseTexture9_GetSamplerView(context->texture[s], sRGB);
+            view[i] = context->texture[s].view[sRGB];
             num_textures = i + 1;
 
             if (update_sampler_derived(context, s) || (context->changed.sampler[s] & 0x05fe)) {
                 context->changed.sampler[s] = 0;
                 commit_samplers = TRUE;
-                nine_convert_sampler_state(device->cso, s, context->samp[s]);
+                nine_convert_sampler_state(context->cso, s, context->samp[s]);
             }
         } else {
             /* Bind dummy sampler. We do not bind dummy sampler when
@@ -734,7 +930,7 @@ update_textures_and_samplers(struct NineDevice9 *device)
             view[i] = device->dummy_sampler_view;
             num_textures = i + 1;
 
-            cso_single_sampler(device->cso, PIPE_SHADER_FRAGMENT,
+            cso_single_sampler(context->cso, PIPE_SHADER_FRAGMENT,
                                s - NINE_SAMPLER_PS(0), &device->dummy_sampler_state);
 
             commit_samplers = TRUE;
@@ -744,10 +940,10 @@ update_textures_and_samplers(struct NineDevice9 *device)
         context->bound_samplers_mask_ps |= (1 << s);
     }
 
-    cso_set_sampler_views(device->cso, PIPE_SHADER_FRAGMENT, num_textures, view);
+    cso_set_sampler_views(context->cso, PIPE_SHADER_FRAGMENT, num_textures, view);
 
     if (commit_samplers)
-        cso_single_sampler_done(device->cso, PIPE_SHADER_FRAGMENT);
+        cso_single_sampler_done(context->cso, PIPE_SHADER_FRAGMENT);
 
     commit_samplers = FALSE;
     sampler_mask = context->programmable_vs ? context->vs->sampler_mask : 0;
@@ -756,21 +952,21 @@ update_textures_and_samplers(struct NineDevice9 *device)
         const unsigned s = NINE_SAMPLER_VS(i);
         int sRGB;
 
-        if (!context->texture[s] && !(sampler_mask & (1 << i))) {
+        if (!context->texture[s].enabled && !(sampler_mask & (1 << i))) {
             view[i] = NULL;
             continue;
         }
 
-        if (context->texture[s]) {
+        if (context->texture[s].enabled) {
             sRGB = context->samp[s][D3DSAMP_SRGBTEXTURE] ? 1 : 0;
 
-            view[i] = NineBaseTexture9_GetSamplerView(context->texture[s], sRGB);
+            view[i] = context->texture[s].view[sRGB];
             num_textures = i + 1;
 
             if (update_sampler_derived(context, s) || (context->changed.sampler[s] & 0x05fe)) {
                 context->changed.sampler[s] = 0;
                 commit_samplers = TRUE;
-                nine_convert_sampler_state(device->cso, s, context->samp[s]);
+                nine_convert_sampler_state(context->cso, s, context->samp[s]);
             }
         } else {
             /* Bind dummy sampler. We do not bind dummy sampler when
@@ -782,20 +978,20 @@ update_textures_and_samplers(struct NineDevice9 *device)
             view[i] = device->dummy_sampler_view;
             num_textures = i + 1;
 
-            cso_single_sampler(device->cso, PIPE_SHADER_VERTEX,
+            cso_single_sampler(context->cso, PIPE_SHADER_VERTEX,
                                s - NINE_SAMPLER_VS(0), &device->dummy_sampler_state);
 
             commit_samplers = TRUE;
             context->changed.sampler[s] = ~0;
         }
 
-        context->bound_samplers_mask_vs |= (1 << s);
+        context->bound_samplers_mask_vs |= (1 << i);
     }
 
-    cso_set_sampler_views(device->cso, PIPE_SHADER_VERTEX, num_textures, view);
+    cso_set_sampler_views(context->cso, PIPE_SHADER_VERTEX, num_textures, view);
 
     if (commit_samplers)
-        cso_single_sampler_done(device->cso, PIPE_SHADER_VERTEX);
+        cso_single_sampler_done(context->cso, PIPE_SHADER_VERTEX);
 }
 
 /* State commit only */
@@ -803,54 +999,52 @@ update_textures_and_samplers(struct NineDevice9 *device)
 static inline void
 commit_blend(struct NineDevice9 *device)
 {
-    cso_set_blend(device->cso, &device->context.pipe.blend);
+    struct nine_context *context = &device->context;
+
+    cso_set_blend(context->cso, &context->pipe_data.blend);
 }
 
 static inline void
 commit_dsa(struct NineDevice9 *device)
 {
-    cso_set_depth_stencil_alpha(device->cso, &device->context.pipe.dsa);
+    struct nine_context *context = &device->context;
+
+    cso_set_depth_stencil_alpha(context->cso, &context->pipe_data.dsa);
 }
 
 static inline void
 commit_scissor(struct NineDevice9 *device)
 {
-    struct pipe_context *pipe = device->pipe;
+    struct nine_context *context = &device->context;
+    struct pipe_context *pipe = context->pipe;
 
-    pipe->set_scissor_states(pipe, 0, 1, &device->context.scissor);
+    pipe->set_scissor_states(pipe, 0, 1, &context->scissor);
 }
 
 static inline void
 commit_rasterizer(struct NineDevice9 *device)
 {
-    cso_set_rasterizer(device->cso, &device->context.pipe.rast);
-}
+    struct nine_context *context = &device->context;
 
-static inline void
-commit_index_buffer(struct NineDevice9 *device)
-{
-    struct pipe_context *pipe = device->pipe;
-    if (device->context.idxbuf.buffer)
-        pipe->set_index_buffer(pipe, &device->context.idxbuf);
-    else
-        pipe->set_index_buffer(pipe, NULL);
+    cso_set_rasterizer(context->cso, &context->pipe_data.rast);
 }
 
 static inline void
 commit_vs_constants(struct NineDevice9 *device)
 {
-    struct pipe_context *pipe = device->pipe;
+    struct nine_context *context = &device->context;
+    struct pipe_context *pipe = context->pipe;
 
-    if (unlikely(!device->context.programmable_vs))
-        pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &device->context.pipe.cb_vs_ff);
+    if (unlikely(!context->programmable_vs))
+        pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &context->pipe_data.cb_vs_ff);
     else {
-        if (device->swvp) {
-            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &device->context.pipe.cb0_swvp);
-            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 1, &device->context.pipe.cb1_swvp);
-            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 2, &device->context.pipe.cb2_swvp);
-            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 3, &device->context.pipe.cb3_swvp);
+        if (context->swvp) {
+            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &context->pipe_data.cb0_swvp);
+            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 1, &context->pipe_data.cb1_swvp);
+            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 2, &context->pipe_data.cb2_swvp);
+            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 3, &context->pipe_data.cb3_swvp);
         } else {
-            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &device->context.pipe.cb_vs);
+            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &context->pipe_data.cb_vs);
         }
     }
 }
@@ -858,12 +1052,13 @@ commit_vs_constants(struct NineDevice9 *device)
 static inline void
 commit_ps_constants(struct NineDevice9 *device)
 {
-    struct pipe_context *pipe = device->pipe;
+    struct nine_context *context = &device->context;
+    struct pipe_context *pipe = context->pipe;
 
-    if (unlikely(!device->context.ps))
-        pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &device->context.pipe.cb_ps_ff);
+    if (unlikely(!context->ps))
+        pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &context->pipe_data.cb_ps_ff);
     else
-        pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &device->context.pipe.cb_ps);
+        pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &context->pipe_data.cb_ps);
 }
 
 static inline void
@@ -871,7 +1066,7 @@ commit_vs(struct NineDevice9 *device)
 {
     struct nine_context *context = &device->context;
 
-    device->pipe->bind_vs_state(device->pipe, context->cso.vs);
+    context->pipe->bind_vs_state(context->pipe, context->cso_shader.vs);
 }
 
 
@@ -880,22 +1075,20 @@ commit_ps(struct NineDevice9 *device)
 {
     struct nine_context *context = &device->context;
 
-    device->pipe->bind_fs_state(device->pipe, context->cso.ps);
+    context->pipe->bind_fs_state(context->pipe, context->cso_shader.ps);
 }
 /* State Update */
 
 #define NINE_STATE_SHADER_CHANGE_VS \
    (NINE_STATE_VS |         \
     NINE_STATE_TEXTURE |    \
-    NINE_STATE_FOG_SHADER | \
-    NINE_STATE_POINTSIZE_SHADER | \
+    NINE_STATE_VS_PARAMS_MISC | \
     NINE_STATE_SWVP)
 
 #define NINE_STATE_SHADER_CHANGE_PS \
    (NINE_STATE_PS |         \
     NINE_STATE_TEXTURE |    \
-    NINE_STATE_FOG_SHADER | \
-    NINE_STATE_PS1X_SHADER)
+    NINE_STATE_PS_PARAMS_MISC)
 
 #define NINE_STATE_FREQUENT \
    (NINE_STATE_RASTERIZER | \
@@ -920,33 +1113,11 @@ commit_ps(struct NineDevice9 *device)
     NINE_STATE_STENCIL_REF | \
     NINE_STATE_SAMPLE_MASK)
 
-
-/* TODO: only go through dirty textures */
-static void
-validate_textures(struct NineDevice9 *device)
-{
-    struct NineBaseTexture9 *tex, *ptr;
-    LIST_FOR_EACH_ENTRY_SAFE(tex, ptr, &device->update_textures, list) {
-        list_delinit(&tex->list);
-        NineBaseTexture9_Validate(tex);
-    }
-}
-
-static void
-update_managed_buffers(struct NineDevice9 *device)
-{
-    struct NineBuffer9 *buf, *ptr;
-    LIST_FOR_EACH_ENTRY_SAFE(buf, ptr, &device->update_buffers, managed.list) {
-        list_delinit(&buf->managed.list);
-        NineBuffer9_Upload(buf);
-    }
-}
-
 static void
 nine_update_state(struct NineDevice9 *device)
 {
-    struct pipe_context *pipe = device->pipe;
     struct nine_context *context = &device->context;
+    struct pipe_context *pipe = context->pipe;
     uint32_t group;
 
     DBG("changed state groups: %x\n", context->changed.group);
@@ -957,8 +1128,6 @@ nine_update_state(struct NineDevice9 *device)
      * into update_textures. Except, we also need to re-validate textures that
      * may be dirty anyway, even if no texture bindings changed.
      */
-    validate_textures(device); /* may clobber state */
-    update_managed_buffers(device);
 
     /* ff_update may change VS/PS dirty bits */
     if (unlikely(!context->programmable_vs || !context->ps))
@@ -983,8 +1152,6 @@ nine_update_state(struct NineDevice9 *device)
             update_viewport(device);
         if (group & (NINE_STATE_VDECL | NINE_STATE_VS | NINE_STATE_STREAMFREQ))
             update_vertex_elements(device);
-        if (group & NINE_STATE_IDXBUF)
-            commit_index_buffer(device);
     }
 
     if (likely(group & (NINE_STATE_FREQUENT | NINE_STATE_VS | NINE_STATE_PS | NINE_STATE_SWVP))) {
@@ -1062,18 +1229,17 @@ NineDevice9_ResolveZ( struct NineDevice9 *device )
     struct nine_context *context = &device->context;
     const struct util_format_description *desc;
     struct NineSurface9 *source = context->ds;
-    struct NineBaseTexture9 *destination = context->texture[0];
     struct pipe_resource *src, *dst;
     struct pipe_blit_info blit;
 
     DBG("RESZ resolve\n");
 
-    if (!source || !destination ||
-        destination->base.type != D3DRTYPE_TEXTURE)
+    if (!source || !context->texture[0].enabled ||
+        context->texture[0].type != D3DRTYPE_TEXTURE)
         return;
 
     src = source->base.resource;
-    dst = destination->base.resource;
+    dst = context->texture[0].resource;
 
     if (!src || !dst)
         return;
@@ -1108,17 +1274,50 @@ NineDevice9_ResolveZ( struct NineDevice9 *device )
     blit.filter = PIPE_TEX_FILTER_NEAREST;
     blit.scissor_enable = FALSE;
 
-    device->pipe->blit(device->pipe, &blit);
+    context->pipe->blit(context->pipe, &blit);
 }
 
 #define ALPHA_TO_COVERAGE_ENABLE   MAKEFOURCC('A', '2', 'M', '1')
 #define ALPHA_TO_COVERAGE_DISABLE  MAKEFOURCC('A', '2', 'M', '0')
 
+/* Nine_context functions.
+ * Serialized through CSMT macros.
+ */
 
-void
-nine_context_set_render_state(struct NineDevice9 *device,
-                              D3DRENDERSTATETYPE State,
-                              DWORD Value)
+static void
+nine_context_set_texture_apply(struct NineDevice9 *device,
+                               DWORD stage,
+                               BOOL enabled,
+                               BOOL shadow,
+                               DWORD lod,
+                               D3DRESOURCETYPE type,
+                               uint8_t pstype,
+                               struct pipe_resource *res,
+                               struct pipe_sampler_view *view0,
+                               struct pipe_sampler_view *view1);
+static void
+nine_context_set_stream_source_apply(struct NineDevice9 *device,
+                                    UINT StreamNumber,
+                                    struct pipe_resource *res,
+                                    UINT OffsetInBytes,
+                                    UINT Stride);
+
+static void
+nine_context_set_indices_apply(struct NineDevice9 *device,
+                               struct pipe_resource *res,
+                               UINT IndexSize,
+                               UINT OffsetInBytes);
+
+static void
+nine_context_set_pixel_shader_constant_i_transformed(struct NineDevice9 *device,
+                                                     UINT StartRegister,
+                                                     const int *pConstantData,
+                                                     unsigned pConstantData_size,
+                                                     UINT Vector4iCount);
+
+CSMT_ITEM_NO_WAIT(nine_context_set_render_state,
+                  ARG_VAL(D3DRENDERSTATETYPE, State),
+                  ARG_VAL(DWORD, Value))
 {
     struct nine_context *context = &device->context;
 
@@ -1157,27 +1356,69 @@ nine_context_set_render_state(struct NineDevice9 *device,
     context->changed.group |= nine_render_state_group[State];
 }
 
-void
-nine_context_set_texture(struct NineDevice9 *device,
-                         DWORD Stage,
-                         struct NineBaseTexture9 *tex)
+/* Copies the flattened description of the texture for sampler `stage`
+ * (enable flag, shadow flag, lod, resource type, pstype, resource and
+ * both sampler views) into nine_context and flags NINE_STATE_TEXTURE.
+ * Called from nine_context_set_texture(). */
+CSMT_ITEM_NO_WAIT(nine_context_set_texture_apply,
+                  ARG_VAL(DWORD, stage),
+                  ARG_VAL(BOOL, enabled),
+                  ARG_VAL(BOOL, shadow),
+                  ARG_VAL(DWORD, lod),
+                  ARG_VAL(D3DRESOURCETYPE, type),
+                  ARG_VAL(uint8_t, pstype),
+                  ARG_BIND_RES(struct pipe_resource, res),
+                  ARG_BIND_VIEW(struct pipe_sampler_view, view0),
+                  ARG_BIND_VIEW(struct pipe_sampler_view, view1))
 {
     struct nine_context *context = &device->context;
 
-    context->samplers_shadow &= ~(1 << Stage);
-    if (tex)
-        context->samplers_shadow |= tex->shadow << Stage;
-
-    nine_bind(&context->texture[Stage], tex);
+    /* Replace this stage's bit in the shadow-sampler mask. */
+    context->samplers_shadow &= ~(1 << stage);
+    context->samplers_shadow |= shadow << stage;
+
+    /* Plain per-stage values. */
+    context->texture[stage].enabled = enabled;
+    context->texture[stage].shadow = shadow;
+    context->texture[stage].lod = lod;
+    context->texture[stage].type = type;
+    context->texture[stage].pstype = pstype;
+
+    /* Resource first, then view 0 and view 1. */
+    pipe_resource_reference(&context->texture[stage].resource, res);
+    pipe_sampler_view_reference(&context->texture[stage].view[0], view0);
+    pipe_sampler_view_reference(&context->texture[stage].view[1], view1);
 
     context->changed.group |= NINE_STATE_TEXTURE;
 }
 
+/* Flattens `tex` into plain values (flags, lod, type, pstype, resource,
+ * two sampler views) and forwards them to
+ * nine_context_set_texture_apply() for sampler Stage. A NULL texture, or
+ * one without a resource, is forwarded as "disabled" with NULL
+ * resource and views. */
 void
-nine_context_set_sampler_state(struct NineDevice9 *device,
-                               DWORD Sampler,
-                               D3DSAMPLERSTATETYPE Type,
-                               DWORD Value)
+nine_context_set_texture(struct NineDevice9 *device,
+                         DWORD Stage,
+                         struct NineBaseTexture9 *tex)
+{
+    /* Values forwarded when nothing usable is bound. */
+    struct pipe_sampler_view *view0 = NULL, *view1 = NULL;
+    struct pipe_resource *res = NULL;
+    D3DRESOURCETYPE type = D3DRTYPE_TEXTURE;
+    uint8_t pstype = 0;
+    DWORD lod = 0;
+    BOOL shadow = FALSE;
+    BOOL enabled = FALSE;
+
+    /* For managed pool, the data can be initially incomplete.
+     * In that case, the texture is rebound later
+     * (in NineBaseTexture9_Validate/NineBaseTexture9_UploadSelf). */
+    if (tex && tex->base.resource) {
+        /* Read the plain fields before asking for the sampler views. */
+        res = tex->base.resource;
+        type = tex->base.type;
+        pstype = tex->pstype;
+        lod = tex->managed.lod;
+        shadow = tex->shadow;
+        enabled = TRUE;
+
+        view0 = NineBaseTexture9_GetSamplerView(tex, 0);
+        view1 = NineBaseTexture9_GetSamplerView(tex, 1);
+    }
+
+    nine_context_set_texture_apply(device, Stage, enabled,
+                                   shadow, lod, type, pstype,
+                                   res, view0, view1);
+}
+
+CSMT_ITEM_NO_WAIT(nine_context_set_sampler_state,
+                  ARG_VAL(DWORD, Sampler),
+                  ARG_VAL(D3DSAMPLERSTATETYPE, Type),
+                  ARG_VAL(DWORD, Value))
 {
     struct nine_context *context = &device->context;
 
@@ -1189,6 +1430,22 @@ nine_context_set_sampler_state(struct NineDevice9 *device,
     context->changed.sampler[Sampler] |= 1 << Type;
 }
 
+/* Binds `res` to vertex buffer slot StreamNumber of the context, records
+ * its byte offset and stride, and sets the slot's bit in changed.vtxbuf.
+ * Called from nine_context_set_stream_source(). */
+CSMT_ITEM_NO_WAIT(nine_context_set_stream_source_apply,
+                  ARG_VAL(UINT, StreamNumber),
+                  ARG_BIND_RES(struct pipe_resource, res),
+                  ARG_VAL(UINT, OffsetInBytes),
+                  ARG_VAL(UINT, Stride))
+{
+    struct nine_context *ctx = &device->context;
+    const unsigned slot = StreamNumber;
+
+    /* Flag the slot as changed. */
+    ctx->changed.vtxbuf |= 1 << StreamNumber;
+
+    pipe_resource_reference(&ctx->vtxbuf[slot].buffer.resource, res);
+    ctx->vtxbuf[slot].buffer_offset = OffsetInBytes;
+    ctx->vtxbuf[slot].stride = Stride;
+}
+
 void
 nine_context_set_stream_source(struct NineDevice9 *device,
                                UINT StreamNumber,
@@ -1196,23 +1453,22 @@ nine_context_set_stream_source(struct NineDevice9 *device,
                                UINT OffsetInBytes,
                                UINT Stride)
 {
-    struct nine_context *context = &device->context;
-    const unsigned i = StreamNumber;
+    struct pipe_resource *res = NULL;
+    unsigned offset = 0;
 
-    context->changed.vtxbuf |= 1 << StreamNumber;
+    if (pVBuf9)
+        res = NineVertexBuffer9_GetResource(pVBuf9, &offset);
+    /* in the future when there is internal offset, add it
+     * to OffsetInBytes */
 
-    if (pVBuf9) {
-        context->vtxbuf[i].stride = Stride;
-        context->vtxbuf[i].buffer_offset = OffsetInBytes;
-    }
-    pipe_resource_reference(&context->vtxbuf[i].buffer,
-                            pVBuf9 ? NineVertexBuffer9_GetResource(pVBuf9) : NULL);
+    nine_context_set_stream_source_apply(device, StreamNumber,
+                                         res, offset + OffsetInBytes,
+                                         Stride);
 }
 
-void
-nine_context_set_stream_source_freq(struct NineDevice9 *device,
-                                    UINT StreamNumber,
-                                    UINT Setting)
+CSMT_ITEM_NO_WAIT(nine_context_set_stream_source_freq,
+                  ARG_VAL(UINT, StreamNumber),
+                  ARG_VAL(UINT, Setting))
 {
     struct nine_context *context = &device->context;
 
@@ -1227,28 +1483,38 @@ nine_context_set_stream_source_freq(struct NineDevice9 *device,
         context->changed.group |= NINE_STATE_STREAMFREQ;
 }
 
+/* Stores the index buffer resource, index size and byte offset in the
+ * context and flags NINE_STATE_IDXBUF.
+ * Called from nine_context_set_indices(). */
+CSMT_ITEM_NO_WAIT(nine_context_set_indices_apply,
+                  ARG_BIND_RES(struct pipe_resource, res),
+                  ARG_VAL(UINT, IndexSize),
+                  ARG_VAL(UINT, OffsetInBytes))
+{
+    struct nine_context *ctx = &device->context;
+
+    pipe_resource_reference(&ctx->idxbuf, res);
+    ctx->index_offset = OffsetInBytes;
+    ctx->index_size = IndexSize;
+
+    ctx->changed.group |= NINE_STATE_IDXBUF;
+}
+
 void
 nine_context_set_indices(struct NineDevice9 *device,
                          struct NineIndexBuffer9 *idxbuf)
 {
-    struct nine_context *context = &device->context;
-    const struct pipe_index_buffer *pipe_idxbuf;
+    struct pipe_resource *res = NULL; /* NULL / 0 / 0 is what gets forwarded when idxbuf is NULL */
+    UINT IndexSize = 0;
+    unsigned OffsetInBytes = 0;
 
     if (idxbuf) {
-        pipe_idxbuf = NineIndexBuffer9_GetBuffer(idxbuf);
-        context->idxbuf.index_size = pipe_idxbuf->index_size;
-        pipe_resource_reference(&context->idxbuf.buffer, pipe_idxbuf->buffer);
-        context->idxbuf.offset = pipe_idxbuf->offset;
-        context->idxbuf.user_buffer = NULL;
-    } else
-        pipe_resource_reference(&context->idxbuf.buffer, NULL);
+        res = NineIndexBuffer9_GetBuffer(idxbuf, &OffsetInBytes); /* presumably fills OffsetInBytes via the out-pointer - confirm */
+        IndexSize = idxbuf->index_size;
+    }
 
-    context->changed.group |= NINE_STATE_IDXBUF;
+    nine_context_set_indices_apply(device, res, IndexSize, OffsetInBytes); /* the context itself is written by the _apply item */
 }
 
-void
-nine_context_set_vertex_declaration(struct NineDevice9 *device,
-                                    struct NineVertexDeclaration9 *vdecl)
+CSMT_ITEM_NO_WAIT(nine_context_set_vertex_declaration,
+                  ARG_BIND_REF(struct NineVertexDeclaration9, vdecl))
 {
     struct nine_context *context = &device->context;
     BOOL was_programmable_vs = context->programmable_vs;
@@ -1264,9 +1530,8 @@ nine_context_set_vertex_declaration(struct NineDevice9 *device,
     context->changed.group |= NINE_STATE_VDECL;
 }
 
-void
-nine_context_set_vertex_shader(struct NineDevice9 *device,
-                               struct NineVertexShader9 *pShader)
+CSMT_ITEM_NO_WAIT(nine_context_set_vertex_shader,
+                  ARG_BIND_REF(struct NineVertexShader9, pShader))
 {
     struct nine_context *context = &device->context;
     BOOL was_programmable_vs = context->programmable_vs;
@@ -1282,18 +1547,18 @@ nine_context_set_vertex_shader(struct NineDevice9 *device,
     context->changed.group |= NINE_STATE_VS;
 }
 
-void
-nine_context_set_vertex_shader_constant_f(struct NineDevice9 *device,
-                                          UINT StartRegister,
-                                          const float *pConstantData,
-                                          UINT Vector4fCount)
+CSMT_ITEM_NO_WAIT(nine_context_set_vertex_shader_constant_f,
+                  ARG_VAL(UINT, StartRegister),
+                  ARG_MEM(float, pConstantData),
+                  ARG_MEM_SIZE(unsigned, pConstantData_size),
+                  ARG_VAL(UINT, Vector4fCount))
 {
     struct nine_context *context = &device->context;
     float *vs_const_f = device->may_swvp ? context->vs_const_f_swvp : context->vs_const_f;
 
     memcpy(&vs_const_f[StartRegister * 4],
            pConstantData,
-           Vector4fCount * 4 * sizeof(context->vs_const_f[0]));
+           pConstantData_size);
 
     if (device->may_swvp) {
         Vector4fCount = MIN2(StartRegister + Vector4fCount, NINE_MAX_CONST_F) - StartRegister;
@@ -1307,12 +1572,11 @@ nine_context_set_vertex_shader_constant_f(struct NineDevice9 *device,
     context->changed.group |= NINE_STATE_VS_CONST;
 }
 
-
-void
-nine_context_set_vertex_shader_constant_i(struct NineDevice9 *device,
-                                          UINT StartRegister,
-                                          const int *pConstantData,
-                                          UINT Vector4iCount)
+CSMT_ITEM_NO_WAIT(nine_context_set_vertex_shader_constant_i,
+                  ARG_VAL(UINT, StartRegister),
+                  ARG_MEM(int, pConstantData),
+                  ARG_MEM_SIZE(unsigned, pConstantData_size),
+                  ARG_VAL(UINT, Vector4iCount))
 {
     struct nine_context *context = &device->context;
     int i;
@@ -1320,7 +1584,7 @@ nine_context_set_vertex_shader_constant_i(struct NineDevice9 *device,
     if (device->driver_caps.vs_integer) {
         memcpy(&context->vs_const_i[4 * StartRegister],
                pConstantData,
-               Vector4iCount * sizeof(int[4]));
+               pConstantData_size);
     } else {
         for (i = 0; i < Vector4iCount; i++) {
             context->vs_const_i[4 * (StartRegister + i)] = fui((float)(pConstantData[4 * i]));
@@ -1334,16 +1598,18 @@ nine_context_set_vertex_shader_constant_i(struct NineDevice9 *device,
     context->changed.group |= NINE_STATE_VS_CONST;
 }
 
-void
-nine_context_set_vertex_shader_constant_b(struct NineDevice9 *device,
-                                          UINT StartRegister,
-                                          const BOOL *pConstantData,
-                                          UINT BoolCount)
+CSMT_ITEM_NO_WAIT(nine_context_set_vertex_shader_constant_b,
+                  ARG_VAL(UINT, StartRegister),
+                  ARG_MEM(BOOL, pConstantData),
+                  ARG_MEM_SIZE(unsigned, pConstantData_size),
+                  ARG_VAL(UINT, BoolCount))
 {
     struct nine_context *context = &device->context;
     int i;
     uint32_t bool_true = device->driver_caps.vs_integer ? 0xFFFFFFFF : fui(1.0f);
 
+    (void) pConstantData_size;
+
     for (i = 0; i < BoolCount; i++)
         context->vs_const_b[StartRegister + i] = pConstantData[i] ? bool_true : 0;
 
@@ -1351,9 +1617,8 @@ nine_context_set_vertex_shader_constant_b(struct NineDevice9 *device,
     context->changed.group |= NINE_STATE_VS_CONST;
 }
 
-void
-nine_context_set_pixel_shader(struct NineDevice9 *device,
-                              struct NinePixelShader9* ps)
+CSMT_ITEM_NO_WAIT(nine_context_set_pixel_shader,
+                  ARG_BIND_REF(struct NinePixelShader9, ps))
 {
     struct nine_context *context = &device->context;
     unsigned old_mask = context->ps ? context->ps->rt_mask : 1;
@@ -1374,27 +1639,44 @@ nine_context_set_pixel_shader(struct NineDevice9 *device,
         context->changed.group |= NINE_STATE_FB;
 }
 
-void
-nine_context_set_pixel_shader_constant_f(struct NineDevice9 *device,
-                                        UINT StartRegister,
-                                        const float *pConstantData,
-                                        UINT Vector4fCount)
+CSMT_ITEM_NO_WAIT(nine_context_set_pixel_shader_constant_f, /* copies float constants into context->ps_const_f */
+                  ARG_VAL(UINT, StartRegister), /* first vec4 register; multiplied by 4 to index floats */
+                  ARG_MEM(float, pConstantData),
+                  ARG_MEM_SIZE(unsigned, pConstantData_size), /* used as the memcpy length */
+                  ARG_VAL(UINT, Vector4fCount)) /* not read in this body */
 {
     struct nine_context *context = &device->context;
 
     memcpy(&context->ps_const_f[StartRegister * 4],
            pConstantData,
-           Vector4fCount * 4 * sizeof(context->ps_const_f[0]));
+           pConstantData_size); /* replaces the length formerly derived from Vector4fCount */
 
     context->changed.ps_const_f = TRUE;
     context->changed.group |= NINE_STATE_PS_CONST;
 }
 
-void
-nine_context_set_pixel_shader_constant_i(struct NineDevice9 *device,
-                                         UINT StartRegister,
-                                         const int *pConstantData,
-                                         UINT Vector4iCount)
+/* For stateblocks: copies Vector4iCount integer vec4 constants, starting
+ * at register StartRegister, into ps_const_i without any conversion
+ * (nine_context_set_pixel_shader_constant_i converts through fui() when
+ * ps_integer is not set). Flags the pixel shader integer constants as
+ * changed. */
+CSMT_ITEM_NO_WAIT(nine_context_set_pixel_shader_constant_i_transformed,
+                  ARG_VAL(UINT, StartRegister),
+                  ARG_MEM(int, pConstantData),
+                  ARG_MEM_SIZE(unsigned, pConstantData_size),
+                  ARG_VAL(UINT, Vector4iCount))
+{
+    struct nine_context *ctx = &device->context;
+
+    /* The length comes from the vector count; pConstantData_size is not
+     * consulted here. */
+    memcpy(&ctx->ps_const_i[StartRegister][0],
+           pConstantData,
+           Vector4iCount * sizeof(ctx->ps_const_i[0]));
+
+    ctx->changed.group |= NINE_STATE_PS_CONST;
+    ctx->changed.ps_const_i = TRUE;
+}
+
+CSMT_ITEM_NO_WAIT(nine_context_set_pixel_shader_constant_i,
+                  ARG_VAL(UINT, StartRegister),
+                  ARG_MEM(int, pConstantData),
+                  ARG_MEM_SIZE(unsigned, pConstantData_size),
+                  ARG_VAL(UINT, Vector4iCount))
 {
     struct nine_context *context = &device->context;
     int i;
@@ -1402,7 +1684,7 @@ nine_context_set_pixel_shader_constant_i(struct NineDevice9 *device,
     if (device->driver_caps.ps_integer) {
         memcpy(&context->ps_const_i[StartRegister][0],
                pConstantData,
-               Vector4iCount * sizeof(context->ps_const_i[0]));
+               pConstantData_size);
     } else {
         for (i = 0; i < Vector4iCount; i++) {
             context->ps_const_i[StartRegister+i][0] = fui((float)(pConstantData[4*i]));
@@ -1415,16 +1697,18 @@ nine_context_set_pixel_shader_constant_i(struct NineDevice9 *device,
     context->changed.group |= NINE_STATE_PS_CONST;
 }
 
-void
-nine_context_set_pixel_shader_constant_b(struct NineDevice9 *device,
-                                         UINT StartRegister,
-                                         const BOOL *pConstantData,
-                                         UINT BoolCount)
+CSMT_ITEM_NO_WAIT(nine_context_set_pixel_shader_constant_b,
+                  ARG_VAL(UINT, StartRegister),
+                  ARG_MEM(BOOL, pConstantData),
+                  ARG_MEM_SIZE(unsigned, pConstantData_size),
+                  ARG_VAL(UINT, BoolCount))
 {
     struct nine_context *context = &device->context;
     int i;
     uint32_t bool_true = device->driver_caps.ps_integer ? 0xFFFFFFFF : fui(1.0f);
 
+    (void) pConstantData_size;
+
     for (i = 0; i < BoolCount; i++)
         context->ps_const_b[StartRegister + i] = pConstantData[i] ? bool_true : 0;
 
@@ -1432,10 +1716,10 @@ nine_context_set_pixel_shader_constant_b(struct NineDevice9 *device,
     context->changed.group |= NINE_STATE_PS_CONST;
 }
 
-void
-nine_context_set_render_target(struct NineDevice9 *device,
-                               DWORD RenderTargetIndex,
-                               struct NineSurface9 *rt)
+/* XXX: use resource, as resource might change */
+CSMT_ITEM_NO_WAIT(nine_context_set_render_target,
+                  ARG_VAL(DWORD, RenderTargetIndex),
+                  ARG_BIND_REF(struct NineSurface9, rt))
 {
     struct nine_context *context = &device->context;
     const unsigned i = RenderTargetIndex;
@@ -1467,9 +1751,9 @@ nine_context_set_render_target(struct NineDevice9 *device,
     }
 }
 
-void
-nine_context_set_depth_stencil(struct NineDevice9 *device,
-                               struct NineSurface9 *ds)
+/* XXX: use resource instead of ds, as resource might change */
+CSMT_ITEM_NO_WAIT(nine_context_set_depth_stencil,
+                  ARG_BIND_REF(struct NineSurface9, ds))
 {
     struct nine_context *context = &device->context;
 
@@ -1477,9 +1761,8 @@ nine_context_set_depth_stencil(struct NineDevice9 *device,
     context->changed.group |= NINE_STATE_FB;
 }
 
-void
-nine_context_set_viewport(struct NineDevice9 *device,
-                          const D3DVIEWPORT9 *viewport)
+CSMT_ITEM_NO_WAIT(nine_context_set_viewport,
+                  ARG_COPY_REF(D3DVIEWPORT9, viewport))
 {
     struct nine_context *context = &device->context;
 
@@ -1487,9 +1770,8 @@ nine_context_set_viewport(struct NineDevice9 *device,
     context->changed.group |= NINE_STATE_VIEWPORT;
 }
 
-void
-nine_context_set_scissor(struct NineDevice9 *device,
-                         const struct pipe_scissor_state *scissor)
+CSMT_ITEM_NO_WAIT(nine_context_set_scissor,
+                  ARG_COPY_REF(struct pipe_scissor_state, scissor))
 {
     struct nine_context *context = &device->context;
 
@@ -1497,10 +1779,9 @@ nine_context_set_scissor(struct NineDevice9 *device,
     context->changed.group |= NINE_STATE_SCISSOR;
 }
 
-void
-nine_context_set_transform(struct NineDevice9 *device,
-                           D3DTRANSFORMSTATETYPE State,
-                           const D3DMATRIX *pMatrix)
+CSMT_ITEM_NO_WAIT(nine_context_set_transform,
+                  ARG_VAL(D3DTRANSFORMSTATETYPE, State),
+                  ARG_COPY_REF(D3DMATRIX, pMatrix))
 {
     struct nine_context *context = &device->context;
     D3DMATRIX *M = nine_state_access_transform(&context->ff, State, TRUE);
@@ -1510,9 +1791,8 @@ nine_context_set_transform(struct NineDevice9 *device,
     context->changed.group |= NINE_STATE_FF;
 }
 
-void
-nine_context_set_material(struct NineDevice9 *device,
-                          const D3DMATERIAL9 *pMaterial)
+CSMT_ITEM_NO_WAIT(nine_context_set_material,
+                  ARG_COPY_REF(D3DMATERIAL9, pMaterial))
 {
     struct nine_context *context = &device->context;
 
@@ -1520,10 +1800,9 @@ nine_context_set_material(struct NineDevice9 *device,
     context->changed.group |= NINE_STATE_FF_MATERIAL;
 }
 
-void
-nine_context_set_light(struct NineDevice9 *device,
-                       DWORD Index,
-                       const D3DLIGHT9 *pLight)
+CSMT_ITEM_NO_WAIT(nine_context_set_light,
+                  ARG_VAL(DWORD, Index),
+                  ARG_COPY_REF(D3DLIGHT9, pLight))
 {
     struct nine_context *context = &device->context;
 
@@ -1531,21 +1810,36 @@ nine_context_set_light(struct NineDevice9 *device,
     context->changed.group |= NINE_STATE_FF_LIGHTING;
 }
 
-void
-nine_context_light_enable(struct NineDevice9 *device,
-                          DWORD Index,
-                          BOOL Enable)
+
+/* For stateblocks: overwrites the context's whole active-light list with
+ * NINE_MAX_LIGHTS_ACTIVE entries copied from active_light, stores
+ * num_lights_active and flags NINE_STATE_FF_LIGHTING. Unlike the
+ * CSMT_ITEM_* setters in this file, it writes to nine_context directly,
+ * right after calling nine_csmt_process(device). */
+static void
+nine_context_light_enable_stateblock(struct NineDevice9 *device,
+                                     const uint16_t active_light[NINE_MAX_LIGHTS_ACTIVE], /* TODO: use a pointer that conveys the size for csmt */
+                                     unsigned int num_lights_active)
 {
     struct nine_context *context = &device->context;
 
-    nine_state_light_enable(&context->ff, &context->changed.group, Index, Enable);
+    /* TODO: Use CSMT_* to avoid calling nine_csmt_process.
+     * NOTE(review): nine_csmt_process presumably lets pending CSMT work
+     * finish before the direct writes below - confirm. */
+    nine_csmt_process(device);
+    memcpy(context->ff.active_light, active_light, NINE_MAX_LIGHTS_ACTIVE * sizeof(context->ff.active_light[0]));
+    context->ff.num_lights_active = num_lights_active;
+    context->changed.group |= NINE_STATE_FF_LIGHTING;
 }
 
-void
-nine_context_set_texture_stage_state(struct NineDevice9 *device,
-                                     DWORD Stage,
-                                     D3DTEXTURESTAGESTATETYPE Type,
-                                     DWORD Value)
+/* Enables or disables light Index in the fixed-function state through
+ * nine_state_light_enable(), then flags NINE_STATE_FF_LIGHTING. */
+CSMT_ITEM_NO_WAIT(nine_context_light_enable,
+                  ARG_VAL(DWORD, Index),
+                  ARG_VAL(BOOL, Enable))
+{
+    nine_state_light_enable(&device->context.ff, Index, Enable);
+
+    device->context.changed.group |= NINE_STATE_FF_LIGHTING;
+}
+
+CSMT_ITEM_NO_WAIT(nine_context_set_texture_stage_state,
+                  ARG_VAL(DWORD, Stage),
+                  ARG_VAL(D3DTEXTURESTAGESTATETYPE, Type),
+                  ARG_VAL(DWORD, Value))
 {
     struct nine_context *context = &device->context;
     int bumpmap_index = -1;
@@ -1571,7 +1865,7 @@ nine_context_set_texture_stage_state(struct NineDevice9 *device,
         bumpmap_index = 4 * 8 + 2 * Stage + 1;
         break;
     case D3DTSS_TEXTURETRANSFORMFLAGS:
-        context->changed.group |= NINE_STATE_PS1X_SHADER;
+        context->changed.group |= NINE_STATE_PS_PARAMS_MISC;
         break;
     default:
         break;
@@ -1582,14 +1876,13 @@ nine_context_set_texture_stage_state(struct NineDevice9 *device,
         context->changed.group |= NINE_STATE_PS_CONST;
     }
 
-    context->changed.group |= NINE_STATE_FF_PSSTAGES;
+    context->changed.group |= NINE_STATE_FF_PS_CONSTS;
     context->ff.changed.tex_stage[Stage][Type / 32] |= 1 << (Type % 32);
 }
 
-void
-nine_context_set_clip_plane(struct NineDevice9 *device,
-                            DWORD Index,
-                            const float *pPlane)
+CSMT_ITEM_NO_WAIT(nine_context_set_clip_plane,
+                  ARG_VAL(DWORD, Index),
+                  ARG_COPY_REF(struct nine_clipplane, pPlane))
 {
     struct nine_context *context = &device->context;
 
@@ -1597,21 +1890,35 @@ nine_context_set_clip_plane(struct NineDevice9 *device,
     context->changed.ucp = TRUE;
 }
 
+/* Stores the swvp flag in the context and flags NINE_STATE_SWVP. */
+CSMT_ITEM_NO_WAIT(nine_context_set_swvp,
+                  ARG_VAL(boolean, swvp))
+{
+    device->context.swvp = swvp;
+    device->context.changed.group |= NINE_STATE_SWVP;
+}
+
+/* Applies the stateblock through the nine_context_set_* calls instead of
+ * writing to nine_context directly. This is slower, but works with csmt.
+ * TODO: write a special csmt version that records the list of commands
+ * as much as possible, and use the version above otherwise.
+ */
 void
 nine_context_apply_stateblock(struct NineDevice9 *device,
                               const struct nine_state *src)
 {
-    struct nine_context *context = &device->context;
     int i;
 
-    context->changed.group |= src->changed.group;
+    /* No need to apply src->changed.group, since all calls do
+    * set context->changed.group */
 
     for (i = 0; i < ARRAY_SIZE(src->changed.rs); ++i) {
         uint32_t m = src->changed.rs[i];
         while (m) {
             const int r = ffs(m) - 1;
             m &= ~(1 << r);
-            context->rs[i * 32 + r] = nine_fix_render_state_value(i * 32 + r, src->rs_advertised[i * 32 + r]);
+            nine_context_set_render_state(device, i * 32 + r, src->rs_advertised[i * 32 + r]);
         }
     }
 
@@ -1620,15 +1927,11 @@ nine_context_apply_stateblock(struct NineDevice9 *device,
         uint32_t m = src->changed.texture;
         unsigned s;
 
-        context->samplers_shadow &= ~m;
-
         for (s = 0; m; ++s, m >>= 1) {
             struct NineBaseTexture9 *tex = src->texture[s];
             if (!(m & 1))
                 continue;
-            if (tex)
-                context->samplers_shadow |= tex->shadow << s;
-            nine_bind(&context->texture[s], src->texture[s]);
+            nine_context_set_texture(device, s, tex);
         }
     }
 
@@ -1641,10 +1944,8 @@ nine_context_apply_stateblock(struct NineDevice9 *device,
             while (m) {
                 const int i = ffs(m) - 1;
                 m &= ~(1 << i);
-                if (nine_check_sampler_state_value(i, src->samp_advertised[s][i]))
-                    context->samp[s][i] = src->samp_advertised[s][i];
+                nine_context_set_sampler_state(device, s, i, src->samp_advertised[s][i]);
             }
-            context->changed.sampler[s] |= src->changed.sampler[s];
         }
     }
 
@@ -1652,23 +1953,11 @@ nine_context_apply_stateblock(struct NineDevice9 *device,
     if (src->changed.vtxbuf | src->changed.stream_freq) {
         uint32_t m = src->changed.vtxbuf | src->changed.stream_freq;
         for (i = 0; m; ++i, m >>= 1) {
-            if (src->changed.vtxbuf & (1 << i)) {
-                if (src->stream[i]) {
-                    context->vtxbuf[i].buffer_offset = src->vtxbuf[i].buffer_offset;
-                    pipe_resource_reference(&context->vtxbuf[i].buffer,
-                        src->stream[i] ? NineVertexBuffer9_GetResource(src->stream[i]) : NULL);
-                    context->vtxbuf[i].stride = src->vtxbuf[i].stride;
-                }
-            }
-            if (src->changed.stream_freq & (1 << i)) {
-                context->stream_freq[i] = src->stream_freq[i];
-                if (src->stream_freq[i] & D3DSTREAMSOURCE_INSTANCEDATA)
-                    context->stream_instancedata_mask |= 1 << i;
-                else
-                    context->stream_instancedata_mask &= ~(1 << i);
-            }
+            if (src->changed.vtxbuf & (1 << i))
+                nine_context_set_stream_source(device, i, src->stream[i], src->vtxbuf[i].buffer_offset, src->vtxbuf[i].stride);
+            if (src->changed.stream_freq & (1 << i))
+                nine_context_set_stream_source_freq(device, i, src->stream_freq[i]);
         }
-        context->changed.vtxbuf |= src->changed.vtxbuf;
     }
 
     /* Index buffer */
@@ -1681,94 +1970,69 @@ nine_context_apply_stateblock(struct NineDevice9 *device,
 
     /* Vertex shader */
     if (src->changed.group & NINE_STATE_VS)
-        nine_bind(&context->vs, src->vs);
-
-    context->programmable_vs = context->vs && !(context->vdecl && context->vdecl->position_t);
+        nine_context_set_vertex_shader(device, src->vs);
 
     /* Pixel shader */
     if (src->changed.group & NINE_STATE_PS)
-        nine_bind(&context->ps, src->ps);
+        nine_context_set_pixel_shader(device, src->ps);
 
     /* Vertex constants */
     if (src->changed.group & NINE_STATE_VS_CONST) {
         struct nine_range *r;
-        if (device->may_swvp) {
-            for (r = src->changed.vs_const_f; r; r = r->next) {
-                int bgn = r->bgn;
-                int end = r->end;
-                memcpy(&context->vs_const_f_swvp[bgn * 4],
-                       &src->vs_const_f[bgn * 4],
-                       (end - bgn) * 4 * sizeof(float));
-                if (bgn < device->max_vs_const_f) {
-                    end = MIN2(end, device->max_vs_const_f);
-                    memcpy(&context->vs_const_f[bgn * 4],
-                           &src->vs_const_f[bgn * 4],
-                           (end - bgn) * 4 * sizeof(float));
-                }
-            }
-        } else {
-            for (r = src->changed.vs_const_f; r; r = r->next) {
-                memcpy(&context->vs_const_f[r->bgn * 4],
-                       &src->vs_const_f[r->bgn * 4],
-                       (r->end - r->bgn) * 4 * sizeof(float));
-            }
-        }
-        for (r = src->changed.vs_const_i; r; r = r->next) {
-            memcpy(&context->vs_const_i[r->bgn * 4],
-                   &src->vs_const_i[r->bgn * 4],
-                   (r->end - r->bgn) * 4 * sizeof(int));
-        }
-        for (r = src->changed.vs_const_b; r; r = r->next) {
-            memcpy(&context->vs_const_b[r->bgn],
-                   &src->vs_const_b[r->bgn],
-                   (r->end - r->bgn) * sizeof(int));
-        }
-        context->changed.vs_const_f = !!src->changed.vs_const_f;
-        context->changed.vs_const_i = !!src->changed.vs_const_i;
-        context->changed.vs_const_b = !!src->changed.vs_const_b;
+        for (r = src->changed.vs_const_f; r; r = r->next)
+            nine_context_set_vertex_shader_constant_f(device, r->bgn,
+                                                      &src->vs_const_f[r->bgn * 4],
+                                                      sizeof(float[4]) * (r->end - r->bgn),
+                                                      r->end - r->bgn);
+        for (r = src->changed.vs_const_i; r; r = r->next)
+            nine_context_set_vertex_shader_constant_i(device, r->bgn,
+                                                      &src->vs_const_i[r->bgn * 4],
+                                                      sizeof(int[4]) * (r->end - r->bgn),
+                                                      r->end - r->bgn);
+        for (r = src->changed.vs_const_b; r; r = r->next)
+            nine_context_set_vertex_shader_constant_b(device, r->bgn,
+                                                      &src->vs_const_b[r->bgn * 4],
+                                                      sizeof(BOOL) * (r->end - r->bgn),
+                                                      r->end - r->bgn);
     }
 
     /* Pixel constants */
     if (src->changed.group & NINE_STATE_PS_CONST) {
         struct nine_range *r;
-        for (r = src->changed.ps_const_f; r; r = r->next) {
-            memcpy(&context->ps_const_f[r->bgn * 4],
-                   &src->ps_const_f[r->bgn * 4],
-                   (r->end - r->bgn) * 4 * sizeof(float));
-        }
+        for (r = src->changed.ps_const_f; r; r = r->next)
+            nine_context_set_pixel_shader_constant_f(device, r->bgn,
+                                                     &src->ps_const_f[r->bgn * 4],
+                                                     sizeof(float[4]) * (r->end - r->bgn),
+                                                     r->end - r->bgn);
         if (src->changed.ps_const_i) {
             uint16_t m = src->changed.ps_const_i;
             for (i = ffs(m) - 1, m >>= i; m; ++i, m >>= 1)
                 if (m & 1)
-                    memcpy(context->ps_const_i[i], src->ps_const_i[i], 4 * sizeof(int));
+                    nine_context_set_pixel_shader_constant_i_transformed(device, i,
+                                                                         src->ps_const_i[i], sizeof(int[4]), 1);
         }
         if (src->changed.ps_const_b) {
             uint16_t m = src->changed.ps_const_b;
             for (i = ffs(m) - 1, m >>= i; m; ++i, m >>= 1)
                 if (m & 1)
-                    context->ps_const_b[i] = src->ps_const_b[i];
+                    nine_context_set_pixel_shader_constant_b(device, i,
+                                                             &src->ps_const_b[i], sizeof(BOOL), 1);
         }
-        context->changed.ps_const_f = !!src->changed.ps_const_f;
-        context->changed.ps_const_i = !!src->changed.ps_const_i;
-        context->changed.ps_const_b = !!src->changed.ps_const_b;
     }
 
     /* Viewport */
     if (src->changed.group & NINE_STATE_VIEWPORT)
-        context->viewport = src->viewport;
+        nine_context_set_viewport(device, &src->viewport);
 
     /* Scissor */
     if (src->changed.group & NINE_STATE_SCISSOR)
-        context->scissor = src->scissor;
+        nine_context_set_scissor(device, &src->scissor);
 
     /* User Clip Planes */
-    if (src->changed.ucp) {
+    if (src->changed.ucp)
         for (i = 0; i < PIPE_MAX_CLIP_PLANES; ++i)
             if (src->changed.ucp & (1 << i))
-                memcpy(context->clip.ucp[i],
-                       src->clip.ucp[i], sizeof(src->clip.ucp[0]));
-        context->changed.ucp = TRUE;
-    }
+                nine_context_set_clip_plane(device, i, (struct nine_clipplane*)&src->clip.ucp[i][0]);
 
     if (!(src->changed.group & NINE_STATE_FF))
         return;
@@ -1776,38 +2040,22 @@ nine_context_apply_stateblock(struct NineDevice9 *device,
     /* Fixed function state. */
 
     if (src->changed.group & NINE_STATE_FF_MATERIAL)
-        context->ff.material = src->ff.material;
+        nine_context_set_material(device, &src->ff.material);
 
-    if (src->changed.group & NINE_STATE_FF_PSSTAGES) {
+    if (src->changed.group & NINE_STATE_FF_PS_CONSTS) {
         unsigned s;
         for (s = 0; s < NINE_MAX_TEXTURE_STAGES; ++s) {
             for (i = 0; i < NINED3DTSS_COUNT; ++i)
                 if (src->ff.changed.tex_stage[s][i / 32] & (1 << (i % 32)))
-                    context->ff.tex_stage[s][i] = src->ff.tex_stage[s][i];
+                   nine_context_set_texture_stage_state(device, s, i, src->ff.tex_stage[s][i]);
         }
     }
     if (src->changed.group & NINE_STATE_FF_LIGHTING) {
-        unsigned num_lights = MAX2(context->ff.num_lights, src->ff.num_lights);
-        /* Can happen if the stateblock had recorded the creation of
-         * new lights. */
-        if (context->ff.num_lights < num_lights) {
-            context->ff.light = REALLOC(context->ff.light,
-                                    context->ff.num_lights * sizeof(D3DLIGHT9),
-                                    num_lights * sizeof(D3DLIGHT9));
-            memset(&context->ff.light[context->ff.num_lights], 0, (num_lights - context->ff.num_lights) * sizeof(D3DLIGHT9));
-            for (i = context->ff.num_lights; i < num_lights; ++i)
-                context->ff.light[i].Type = (D3DLIGHTTYPE)NINED3DLIGHT_INVALID;
-            context->ff.num_lights = num_lights;
-        }
-        /* src->ff.num_lights < num_lights has been handled before */
-        assert (src->ff.num_lights == num_lights);
-
-        for (i = 0; i < num_lights; ++i)
+        for (i = 0; i < src->ff.num_lights; ++i)
             if (src->ff.light[i].Type != NINED3DLIGHT_INVALID)
-                context->ff.light[i] = src->ff.light[i];
+                nine_context_set_light(device, i, &src->ff.light[i]);
 
-        memcpy(context->ff.active_light, src->ff.active_light, sizeof(src->ff.active_light) );
-        context->ff.num_lights_active = src->ff.num_lights_active;
+        nine_context_light_enable_stateblock(device, src->ff.active_light, src->ff.num_lights_active);
     }
     if (src->changed.group & NINE_STATE_FF_VSTRANSF) {
         for (i = 0; i < ARRAY_SIZE(src->ff.changed.transform); ++i) {
@@ -1817,11 +2065,17 @@ nine_context_apply_stateblock(struct NineDevice9 *device,
             for (s = i * 32; s < (i * 32 + 32); ++s) {
                 if (!(src->ff.changed.transform[i] & (1 << (s % 32))))
                     continue;
-                *nine_state_access_transform(&context->ff, s, TRUE) =
-                    *nine_state_access_transform( /* const because !alloc */
-                        (struct nine_ff_state *)&src->ff, s, FALSE);
+                /* MaxVertexBlendMatrixIndex is 8, which means
+                 * we don't read past index D3DTS_WORLDMATRIX(8).
+                 * swvp is supposed to allow all 256, but we don't
+                 * implement it for now. */
+                if (s > D3DTS_WORLDMATRIX(8))
+                    break;
+                nine_context_set_transform(device, s,
+                                           nine_state_access_transform(
+                                               (struct nine_ff_state *)&src->ff,
+                                                                       s, FALSE));
             }
-            context->ff.changed.transform[i] |= src->ff.changed.transform[i];
         }
     }
 }
@@ -1831,26 +2085,22 @@ nine_update_state_framebuffer_clear(struct NineDevice9 *device)
 {
     struct nine_context *context = &device->context;
 
-    validate_textures(device);
-
     if (context->changed.group & NINE_STATE_FB)
         update_framebuffer(device, TRUE);
 }
 
-/* Checks were already done before the call */
-void
-nine_context_clear_fb(struct NineDevice9 *device,
-              DWORD Count,
-              const D3DRECT *pRects,
-              DWORD Flags,
-              D3DCOLOR Color,
-              float Z,
-              DWORD Stencil)
+CSMT_ITEM_NO_WAIT(nine_context_clear_fb,
+                  ARG_VAL(DWORD, Count),
+                  ARG_COPY_REF(D3DRECT, pRects),
+                  ARG_VAL(DWORD, Flags),
+                  ARG_VAL(D3DCOLOR, Color),
+                  ARG_VAL(float, Z),
+                  ARG_VAL(DWORD, Stencil))
 {
     struct nine_context *context = &device->context;
     const int sRGB = context->rs[D3DRS_SRGBWRITEENABLE] ? 1 : 0;
     struct pipe_surface *cbuf, *zsbuf;
-    struct pipe_context *pipe = device->pipe;
+    struct pipe_context *pipe = context->pipe;
     struct NineSurface9 *zsbuf_surf = context->ds;
     struct NineSurface9 *rt;
     unsigned bufs = 0;
@@ -1863,7 +2113,7 @@ nine_context_clear_fb(struct NineDevice9 *device,
 
     if (Flags & D3DCLEAR_TARGET) bufs |= PIPE_CLEAR_COLOR;
     /* Ignore Z buffer if not bound */
-    if (context->pipe.fb.zsbuf != NULL) {
+    if (context->pipe_data.fb.zsbuf != NULL) {
         if (Flags & D3DCLEAR_ZBUFFER) bufs |= PIPE_CLEAR_DEPTH;
         if (Flags & D3DCLEAR_STENCIL) bufs |= PIPE_CLEAR_STENCIL;
     }
@@ -1894,7 +2144,7 @@ nine_context_clear_fb(struct NineDevice9 *device,
         }
     }
 
-    if (rect.x1 >= context->pipe.fb.width || rect.y1 >= context->pipe.fb.height)
+    if (rect.x1 >= context->pipe_data.fb.width || rect.y1 >= context->pipe_data.fb.height)
         return;
 
     for (i = 0; i < device->caps.NumSimultaneousRTs; ++i) {
@@ -1908,8 +2158,8 @@ nine_context_clear_fb(struct NineDevice9 *device,
         rect.x1 == 0 && rect.y1 == 0 &&
         /* Case we clear only render target. Check clear region vs rt. */
         ((!(bufs & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) &&
-         rect.x2 >= context->pipe.fb.width &&
-         rect.y2 >= context->pipe.fb.height) ||
+         rect.x2 >= context->pipe_data.fb.width &&
+         rect.y2 >= context->pipe_data.fb.height) ||
         /* Case we clear the depth buffer (and possibly the rt too).
          * Depth buffer size is always >= rt size. Compare to clear region */
         ((bufs & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) &&
@@ -1994,101 +2244,297 @@ init_draw_info(struct pipe_draw_info *info,
     if (dev->context.stream_instancedata_mask & dev->context.stream_usage_mask)
         info->instance_count = MAX2(dev->context.stream_freq[0] & 0x7FFFFF, 1);
     info->primitive_restart = FALSE;
+    info->has_user_indices = FALSE;
     info->restart_index = 0;
     info->count_from_stream_output = NULL;
     info->indirect = NULL;
-    info->indirect_params = NULL;
 }
 
-void
-nine_context_draw_primitive(struct NineDevice9 *device,
-                            D3DPRIMITIVETYPE PrimitiveType,
-                            UINT StartVertex,
-                            UINT PrimitiveCount)
+CSMT_ITEM_NO_WAIT(nine_context_draw_primitive,
+                  ARG_VAL(D3DPRIMITIVETYPE, PrimitiveType),
+                  ARG_VAL(UINT, StartVertex),
+                  ARG_VAL(UINT, PrimitiveCount))
 {
+    struct nine_context *context = &device->context;
     struct pipe_draw_info info;
 
     nine_update_state(device);
 
     init_draw_info(&info, device, PrimitiveType, PrimitiveCount);
-    info.indexed = FALSE;
+    info.index_size = 0;
     info.start = StartVertex;
     info.index_bias = 0;
     info.min_index = info.start;
     info.max_index = info.count - 1;
+    info.index.resource = NULL;
 
-    device->pipe->draw_vbo(device->pipe, &info);
+    context->pipe->draw_vbo(context->pipe, &info);
 }
 
-void
-nine_context_draw_indexed_primitive(struct NineDevice9 *device,
-                                    D3DPRIMITIVETYPE PrimitiveType,
-                                    INT BaseVertexIndex,
-                                    UINT MinVertexIndex,
-                                    UINT NumVertices,
-                                    UINT StartIndex,
-                                    UINT PrimitiveCount)
+CSMT_ITEM_NO_WAIT(nine_context_draw_indexed_primitive,
+                  ARG_VAL(D3DPRIMITIVETYPE, PrimitiveType),
+                  ARG_VAL(INT, BaseVertexIndex),
+                  ARG_VAL(UINT, MinVertexIndex),
+                  ARG_VAL(UINT, NumVertices),
+                  ARG_VAL(UINT, StartIndex),
+                  ARG_VAL(UINT, PrimitiveCount))
 {
+    struct nine_context *context = &device->context;
     struct pipe_draw_info info;
 
     nine_update_state(device);
 
     init_draw_info(&info, device, PrimitiveType, PrimitiveCount);
-    info.indexed = TRUE;
-    info.start = StartIndex;
+    info.index_size = context->index_size;
+    info.start = context->index_offset / context->index_size + StartIndex;
     info.index_bias = BaseVertexIndex;
     /* These don't include index bias: */
     info.min_index = MinVertexIndex;
     info.max_index = MinVertexIndex + NumVertices - 1;
+    info.index.resource = context->idxbuf;
 
-    device->pipe->draw_vbo(device->pipe, &info);
+    context->pipe->draw_vbo(context->pipe, &info);
 }
 
-void
-nine_context_draw_primitive_from_vtxbuf(struct NineDevice9 *device,
-                                        D3DPRIMITIVETYPE PrimitiveType,
-                                        UINT PrimitiveCount,
-                                        struct pipe_vertex_buffer *vtxbuf)
+CSMT_ITEM_NO_WAIT(nine_context_draw_primitive_from_vtxbuf,
+                  ARG_VAL(D3DPRIMITIVETYPE, PrimitiveType),
+                  ARG_VAL(UINT, PrimitiveCount),
+                  ARG_BIND_VBUF(struct pipe_vertex_buffer, vtxbuf))
 {
+    struct nine_context *context = &device->context;
     struct pipe_draw_info info;
 
     nine_update_state(device);
 
     init_draw_info(&info, device, PrimitiveType, PrimitiveCount);
-    info.indexed = FALSE;
+    info.index_size = 0;
     info.start = 0;
     info.index_bias = 0;
     info.min_index = 0;
     info.max_index = info.count - 1;
+    info.index.resource = NULL;
 
-    device->pipe->set_vertex_buffers(device->pipe, 0, 1, vtxbuf);
+    context->pipe->set_vertex_buffers(context->pipe, 0, 1, vtxbuf);
 
-    device->pipe->draw_vbo(device->pipe, &info);
+    context->pipe->draw_vbo(context->pipe, &info);
 }
 
-void
-nine_context_draw_indexed_primitive_from_vtxbuf_idxbuf(struct NineDevice9 *device,
-                                                       D3DPRIMITIVETYPE PrimitiveType,
-                                                       UINT MinVertexIndex,
-                                                       UINT NumVertices,
-                                                       UINT PrimitiveCount,
-                                                       struct pipe_vertex_buffer *vbuf,
-                                                       struct pipe_index_buffer *ibuf)
+CSMT_ITEM_NO_WAIT(nine_context_draw_indexed_primitive_from_vtxbuf_idxbuf,
+                  ARG_VAL(D3DPRIMITIVETYPE, PrimitiveType),
+                  ARG_VAL(UINT, MinVertexIndex),
+                  ARG_VAL(UINT, NumVertices),
+                  ARG_VAL(UINT, PrimitiveCount),
+                  ARG_BIND_VBUF(struct pipe_vertex_buffer, vbuf),
+                  ARG_BIND_RES(struct pipe_resource, ibuf),
+                  ARG_VAL(void *, user_ibuf),
+                  ARG_VAL(UINT, index_offset),
+                  ARG_VAL(UINT, index_size))
 {
+    struct nine_context *context = &device->context;
     struct pipe_draw_info info;
 
     nine_update_state(device);
 
     init_draw_info(&info, device, PrimitiveType, PrimitiveCount);
-    info.indexed = TRUE;
-    info.start = 0;
+    info.index_size = index_size;
+    info.start = index_offset / info.index_size;
     info.index_bias = 0;
     info.min_index = MinVertexIndex;
     info.max_index = MinVertexIndex + NumVertices - 1;
-    device->pipe->set_vertex_buffers(device->pipe, 0, 1, vbuf);
-    device->pipe->set_index_buffer(device->pipe, ibuf);
+    info.has_user_indices = ibuf == NULL;
+    if (ibuf)
+        info.index.resource = ibuf;
+    else
+        info.index.user = user_ibuf;
+
+    context->pipe->set_vertex_buffers(context->pipe, 0, 1, vbuf);
 
-    device->pipe->draw_vbo(device->pipe, &info);
+    context->pipe->draw_vbo(context->pipe, &info);
+}
+
+CSMT_ITEM_NO_WAIT(nine_context_resource_copy_region,
+                  ARG_BIND_REF(struct NineUnknown, dst),
+                  ARG_BIND_REF(struct NineUnknown, src),
+                  ARG_BIND_RES(struct pipe_resource, dst_res),
+                  ARG_VAL(unsigned, dst_level),
+                  ARG_COPY_REF(struct pipe_box, dst_box),
+                  ARG_BIND_RES(struct pipe_resource, src_res),
+                  ARG_VAL(unsigned, src_level),
+                  ARG_COPY_REF(struct pipe_box, src_box))
+{
+    struct nine_context *context = &device->context;
+
+    (void) dst;
+    (void) src;
+
+    context->pipe->resource_copy_region(context->pipe,
+            dst_res, dst_level,
+            dst_box->x, dst_box->y, dst_box->z,
+            src_res, src_level,
+            src_box);
+}
+
+CSMT_ITEM_NO_WAIT(nine_context_blit,
+                  ARG_BIND_REF(struct NineUnknown, dst),
+                  ARG_BIND_REF(struct NineUnknown, src),
+                  ARG_BIND_BLIT(struct pipe_blit_info, blit))
+{
+    struct nine_context *context = &device->context;
+
+    (void) dst;
+    (void) src;
+
+    context->pipe->blit(context->pipe, blit);
+}
+
+CSMT_ITEM_NO_WAIT(nine_context_clear_render_target,
+                  ARG_BIND_REF(struct NineSurface9, surface),
+                  ARG_VAL(D3DCOLOR, color),
+                  ARG_VAL(UINT, x),
+                  ARG_VAL(UINT, y),
+                  ARG_VAL(UINT, width),
+                  ARG_VAL(UINT, height))
+{
+    struct nine_context *context = &device->context;
+    struct pipe_surface *surf;
+    union pipe_color_union rgba;
+
+    d3dcolor_to_pipe_color_union(&rgba, color);
+    surf = NineSurface9_GetSurface(surface, 0);
+    context->pipe->clear_render_target(context->pipe, surf, &rgba, x, y, width, height, false);
+}
+
+CSMT_ITEM_NO_WAIT(nine_context_gen_mipmap,
+                  ARG_BIND_REF(struct NineUnknown, dst),
+                  ARG_BIND_RES(struct pipe_resource, res),
+                  ARG_VAL(UINT, base_level),
+                  ARG_VAL(UINT, last_level),
+                  ARG_VAL(UINT, first_layer),
+                  ARG_VAL(UINT, last_layer),
+                  ARG_VAL(UINT, filter))
+{
+    struct nine_context *context = &device->context;
+
+    /* We just bind dst for the bind count */
+    (void)dst;
+
+    util_gen_mipmap(context->pipe, res, res->format, base_level,
+                    last_level, first_layer, last_layer, filter);
+}
+
+CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_range_upload,
+                               ARG_BIND_RES(struct pipe_resource, res),
+                               ARG_VAL(unsigned, offset),
+                               ARG_VAL(unsigned, size),
+                               ARG_VAL(const void *, data))
+{
+    struct nine_context *context = &device->context;
+
+    context->pipe->buffer_subdata(context->pipe, res, 0, offset, size, data);
+}
+
+CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_box_upload,
+                               ARG_BIND_REF(struct NineUnknown, dst),
+                               ARG_BIND_RES(struct pipe_resource, res),
+                               ARG_VAL(unsigned, level),
+                               ARG_COPY_REF(struct pipe_box, dst_box),
+                               ARG_VAL(enum pipe_format, src_format),
+                               ARG_VAL(const void *, src),
+                               ARG_VAL(unsigned, src_stride),
+                               ARG_VAL(unsigned, src_layer_stride),
+                               ARG_COPY_REF(struct pipe_box, src_box))
+{
+    struct nine_context *context = &device->context;
+    struct pipe_context *pipe = context->pipe;
+    struct pipe_transfer *transfer = NULL;
+    uint8_t *map;
+
+    /* We just bind dst for the bind count */
+    (void)dst;
+
+    map = pipe->transfer_map(pipe,
+                             res,
+                             level,
+                             PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
+                             dst_box, &transfer);
+    if (!map)
+        return;
+
+    /* Note: if the formats are the same, this falls back
+     * to a plain memcpy */
+    (void) util_format_translate_3d(res->format,
+                                    map, transfer->stride,
+                                    transfer->layer_stride,
+                                    0, 0, 0,
+                                    src_format,
+                                    src, src_stride,
+                                    src_layer_stride,
+                                    src_box->x, src_box->y, src_box->z,
+                                    dst_box->width, dst_box->height,
+                                    dst_box->depth);
+
+    pipe_transfer_unmap(pipe, transfer);
+}
+
+struct pipe_query *
+nine_context_create_query(struct NineDevice9 *device, unsigned query_type)
+{
+    struct pipe_context *pipe;
+    struct pipe_query *res;
+
+    pipe = nine_context_get_pipe_acquire(device);
+    res = pipe->create_query(pipe, query_type, 0);
+    nine_context_get_pipe_release(device);
+    return res;
+}
+
+CSMT_ITEM_DO_WAIT(nine_context_destroy_query,
+                  ARG_REF(struct pipe_query, query))
+{
+    struct nine_context *context = &device->context;
+
+    context->pipe->destroy_query(context->pipe, query);
+}
+
+CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_begin_query,
+                               ARG_REF(struct pipe_query, query))
+{
+    struct nine_context *context = &device->context;
+
+    (void) context->pipe->begin_query(context->pipe, query);
+}
+
+CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_end_query,
+                               ARG_REF(struct pipe_query, query))
+{
+    struct nine_context *context = &device->context;
+
+    (void) context->pipe->end_query(context->pipe, query);
+}
+
+boolean
+nine_context_get_query_result(struct NineDevice9 *device, struct pipe_query *query,
+                              unsigned *counter, boolean flush, boolean wait,
+                              union pipe_query_result *result)
+{
+    struct pipe_context *pipe;
+    boolean ret;
+
+    if (wait)
+        nine_csmt_process(device);
+    else if (p_atomic_read(counter) > 0) {
+        if (flush && device->csmt_active)
+            nine_queue_flush(device->csmt_ctx->pool);
+        DBG("Pending begin/end. Returning\n");
+        return false;
+    }
+
+    pipe = nine_context_get_pipe_acquire(device);
+    ret = pipe->get_query_result(pipe, query, wait, result);
+    nine_context_get_pipe_release(device);
+
+    DBG("Query result %s\n", ret ? "found" : "not yet available");
+    return ret;
 }
 
 /* State defaults */
@@ -2249,6 +2695,9 @@ static const DWORD nine_samp_state_defaults[NINED3DSAMP_LAST + 1] =
     [NINED3DSAMP_CUBETEX] = 0
 };
 
+/* Note: The following 4 functions assume there are no
+ * pending commands */
+
 void nine_state_restore_non_cso(struct NineDevice9 *device)
 {
     struct nine_context *context = &device->context;
@@ -2361,10 +2810,32 @@ nine_state_clear(struct nine_state *state, const boolean device)
 }
 
 void
-nine_context_clear(struct nine_context *context)
+nine_context_clear(struct NineDevice9 *device)
 {
+    struct nine_context *context = &device->context;
+    struct pipe_context *pipe = context->pipe;
+    struct cso_context *cso = context->cso;
     unsigned i;
 
+    /* Early device ctor failure. Nothing to do */
+    if (!pipe || !cso)
+        return;
+
+    pipe->bind_vs_state(pipe, NULL);
+    pipe->bind_fs_state(pipe, NULL);
+
+    /* Don't unbind constant buffers, they're device-private and
+     * do not change on Reset.
+     */
+
+    cso_set_samplers(cso, PIPE_SHADER_VERTEX, 0, NULL);
+    cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, 0, NULL);
+
+    cso_set_sampler_views(cso, PIPE_SHADER_VERTEX, 0, NULL);
+    cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, 0, NULL);
+
+    pipe->set_vertex_buffers(pipe, 0, device->caps.MaxStreams, NULL);
+
     for (i = 0; i < ARRAY_SIZE(context->rt); ++i)
        nine_bind(&context->rt[i], NULL);
     nine_bind(&context->ds, NULL);
@@ -2372,11 +2843,18 @@ nine_context_clear(struct nine_context *context)
     nine_bind(&context->ps, NULL);
     nine_bind(&context->vdecl, NULL);
     for (i = 0; i < PIPE_MAX_ATTRIBS; ++i)
-        pipe_resource_reference(&context->vtxbuf[i].buffer, NULL);
-    pipe_resource_reference(&context->idxbuf.buffer, NULL);
+        pipe_vertex_buffer_unreference(&context->vtxbuf[i]);
+    pipe_resource_reference(&context->idxbuf, NULL);
 
-    for (i = 0; i < NINE_MAX_SAMPLERS; ++i)
-        nine_bind(&context->texture[i], NULL);
+    for (i = 0; i < NINE_MAX_SAMPLERS; ++i) {
+        context->texture[i].enabled = FALSE;
+        pipe_resource_reference(&context->texture[i].resource,
+                                NULL);
+        pipe_sampler_view_reference(&context->texture[i].view[0],
+                                    NULL);
+        pipe_sampler_view_reference(&context->texture[i].view[1],
+                                    NULL);
+    }
 }
 
 void
@@ -2485,7 +2963,7 @@ update_vertex_elements_sw(struct NineDevice9 *device)
 static void
 update_vertex_buffers_sw(struct NineDevice9 *device, int start_vertice, int num_vertices)
 {
-    struct pipe_context *pipe = device->pipe;
+    struct pipe_context *pipe = nine_context_get_pipe_acquire(device);
     struct pipe_context *pipe_sw = device->pipe_sw;
     struct nine_state *state = &device->state;
     struct nine_state_sw_internal *sw_internal = &device->state_sw_internal;
@@ -2500,39 +2978,44 @@ update_vertex_buffers_sw(struct NineDevice9 *device, int start_vertice, int num_
     for (i = 0; mask; mask >>= 1, ++i) {
         if (mask & 1) {
             if (state->stream[i]) {
+                unsigned offset;
                 struct pipe_resource *buf;
                 struct pipe_box box;
+                void *userbuf;
 
                 vtxbuf = state->vtxbuf[i];
-                vtxbuf.buffer = NineVertexBuffer9_GetResource(state->stream[i]);
+                buf = NineVertexBuffer9_GetResource(state->stream[i], &offset);
 
-                DBG("Locking %p (offset %d, length %d)\n", vtxbuf.buffer,
+                DBG("Locking %p (offset %d, length %d)\n", buf,
                     vtxbuf.buffer_offset, num_vertices * vtxbuf.stride);
 
-                u_box_1d(vtxbuf.buffer_offset + start_vertice * vtxbuf.stride,
+                u_box_1d(vtxbuf.buffer_offset + offset + start_vertice * vtxbuf.stride,
                          num_vertices * vtxbuf.stride, &box);
-                buf = vtxbuf.buffer;
-                vtxbuf.user_buffer = pipe->transfer_map(pipe, buf, 0, PIPE_TRANSFER_READ, &box,
-                                                        &(sw_internal->transfers_so[i]));
-                vtxbuf.buffer = NULL;
+
+                userbuf = pipe->transfer_map(pipe, buf, 0, PIPE_TRANSFER_READ, &box,
+                                             &(sw_internal->transfers_so[i]));
+                vtxbuf.is_user_buffer = true;
+                vtxbuf.buffer.user = userbuf;
+
                 if (!device->driver_caps.user_sw_vbufs) {
-                    u_upload_data(device->vertex_sw_uploader,
+                    vtxbuf.buffer.resource = NULL;
+                    vtxbuf.is_user_buffer = false;
+                    u_upload_data(device->pipe_sw->stream_uploader,
                                   0,
                                   box.width,
                                   16,
-                                  vtxbuf.user_buffer,
+                                  userbuf,
                                   &(vtxbuf.buffer_offset),
-                                  &(vtxbuf.buffer));
-                    u_upload_unmap(device->vertex_sw_uploader);
-                    vtxbuf.user_buffer = NULL;
+                                  &(vtxbuf.buffer.resource));
+                    u_upload_unmap(device->pipe_sw->stream_uploader);
                 }
                 pipe_sw->set_vertex_buffers(pipe_sw, i, 1, &vtxbuf);
-                if (vtxbuf.buffer)
-                    pipe_resource_reference(&vtxbuf.buffer, NULL);
+                pipe_vertex_buffer_unreference(&vtxbuf);
             } else
                 pipe_sw->set_vertex_buffers(pipe_sw, i, 1, NULL);
         }
     }
+    nine_context_get_pipe_release(device);
 }
 
 static void
@@ -2570,34 +3053,12 @@ update_vs_constants_sw(struct NineDevice9 *device)
         }
 
         buf = cb.user_buffer;
-        if (!device->driver_caps.user_sw_cbufs) {
-            u_upload_data(device->constbuf_sw_uploader,
-                          0,
-                          cb.buffer_size,
-                          16,
-                          cb.user_buffer,
-                          &(cb.buffer_offset),
-                          &(cb.buffer));
-            u_upload_unmap(device->constbuf_sw_uploader);
-            cb.user_buffer = NULL;
-        }
 
         pipe_sw->set_constant_buffer(pipe_sw, PIPE_SHADER_VERTEX, 0, &cb);
         if (cb.buffer)
             pipe_resource_reference(&cb.buffer, NULL);
 
         cb.user_buffer = (char *)buf + 4096 * sizeof(float[4]);
-        if (!device->driver_caps.user_sw_cbufs) {
-            u_upload_data(device->constbuf_sw_uploader,
-                          0,
-                          cb.buffer_size,
-                          16,
-                          cb.user_buffer,
-                          &(cb.buffer_offset),
-                          &(cb.buffer));
-            u_upload_unmap(device->constbuf_sw_uploader);
-            cb.user_buffer = NULL;
-        }
 
         pipe_sw->set_constant_buffer(pipe_sw, PIPE_SHADER_VERTEX, 1, &cb);
         if (cb.buffer)
@@ -2612,18 +3073,6 @@ update_vs_constants_sw(struct NineDevice9 *device)
         cb.buffer_size = 2048 * sizeof(float[4]);
         cb.user_buffer = state->vs_const_i;
 
-        if (!device->driver_caps.user_sw_cbufs) {
-            u_upload_data(device->constbuf_sw_uploader,
-                          0,
-                          cb.buffer_size,
-                          16,
-                          cb.user_buffer,
-                          &(cb.buffer_offset),
-                          &(cb.buffer));
-            u_upload_unmap(device->constbuf_sw_uploader);
-            cb.user_buffer = NULL;
-        }
-
         pipe_sw->set_constant_buffer(pipe_sw, PIPE_SHADER_VERTEX, 2, &cb);
         if (cb.buffer)
             pipe_resource_reference(&cb.buffer, NULL);
@@ -2637,18 +3086,6 @@ update_vs_constants_sw(struct NineDevice9 *device)
         cb.buffer_size = 512 * sizeof(float[4]);
         cb.user_buffer = state->vs_const_b;
 
-        if (!device->driver_caps.user_sw_cbufs) {
-            u_upload_data(device->constbuf_sw_uploader,
-                          0,
-                          cb.buffer_size,
-                          16,
-                          cb.user_buffer,
-                          &(cb.buffer_offset),
-                          &(cb.buffer));
-            u_upload_unmap(device->constbuf_sw_uploader);
-            cb.user_buffer = NULL;
-        }
-
         pipe_sw->set_constant_buffer(pipe_sw, PIPE_SHADER_VERTEX, 3, &cb);
         if (cb.buffer)
             pipe_resource_reference(&cb.buffer, NULL);
@@ -2669,14 +3106,14 @@ update_vs_constants_sw(struct NineDevice9 *device)
         cb.user_buffer = viewport_data;
 
         {
-            u_upload_data(device->constbuf_sw_uploader,
+            u_upload_data(device->pipe_sw->const_uploader,
                           0,
                           cb.buffer_size,
                           16,
                           cb.user_buffer,
                           &(cb.buffer_offset),
                           &(cb.buffer));
-            u_upload_unmap(device->constbuf_sw_uploader);
+            u_upload_unmap(device->pipe_sw->const_uploader);
             cb.user_buffer = NULL;
         }
 
@@ -2710,7 +3147,7 @@ void
 nine_state_after_draw_sw(struct NineDevice9 *device)
 {
     struct nine_state_sw_internal *sw_internal = &device->state_sw_internal;
-    struct pipe_context *pipe = device->pipe;
+    struct pipe_context *pipe = nine_context_get_pipe_acquire(device);
     struct pipe_context *pipe_sw = device->pipe_sw;
     int i;
 
@@ -2720,6 +3157,7 @@ nine_state_after_draw_sw(struct NineDevice9 *device)
             pipe->transfer_unmap(pipe, sw_internal->transfers_so[i]);
         sw_internal->transfers_so[i] = NULL;
     }
+    nine_context_get_pipe_release(device);
 }
 
 void
@@ -2873,14 +3311,14 @@ const uint32_t nine_render_state_group[NINED3DRS_LAST + 1] =
     [D3DRS_ALPHAFUNC] = NINE_STATE_DSA,
     [D3DRS_DITHERENABLE] = NINE_STATE_BLEND,
     [D3DRS_ALPHABLENDENABLE] = NINE_STATE_BLEND,
-    [D3DRS_FOGENABLE] = NINE_STATE_FF_OTHER | NINE_STATE_FOG_SHADER | NINE_STATE_PS_CONST,
+    [D3DRS_FOGENABLE] = NINE_STATE_FF_SHADER | NINE_STATE_VS_PARAMS_MISC | NINE_STATE_PS_PARAMS_MISC | NINE_STATE_PS_CONST,
     [D3DRS_SPECULARENABLE] = NINE_STATE_FF_LIGHTING,
-    [D3DRS_FOGCOLOR] = NINE_STATE_FF_OTHER | NINE_STATE_PS_CONST,
-    [D3DRS_FOGTABLEMODE] = NINE_STATE_FF_OTHER | NINE_STATE_FOG_SHADER | NINE_STATE_PS_CONST,
-    [D3DRS_FOGSTART] = NINE_STATE_FF_OTHER | NINE_STATE_PS_CONST,
-    [D3DRS_FOGEND] = NINE_STATE_FF_OTHER | NINE_STATE_PS_CONST,
-    [D3DRS_FOGDENSITY] = NINE_STATE_FF_OTHER | NINE_STATE_PS_CONST,
-    [D3DRS_RANGEFOGENABLE] = NINE_STATE_FF_OTHER,
+    [D3DRS_FOGCOLOR] = NINE_STATE_FF_PS_CONSTS | NINE_STATE_PS_CONST,
+    [D3DRS_FOGTABLEMODE] = NINE_STATE_FF_SHADER | NINE_STATE_PS_PARAMS_MISC | NINE_STATE_PS_CONST,
+    [D3DRS_FOGSTART] = NINE_STATE_FF_VS_OTHER | NINE_STATE_FF_PS_CONSTS | NINE_STATE_PS_CONST,
+    [D3DRS_FOGEND] = NINE_STATE_FF_VS_OTHER | NINE_STATE_FF_PS_CONSTS | NINE_STATE_PS_CONST,
+    [D3DRS_FOGDENSITY] = NINE_STATE_FF_VS_OTHER | NINE_STATE_FF_PS_CONSTS | NINE_STATE_PS_CONST,
+    [D3DRS_RANGEFOGENABLE] = NINE_STATE_FF_SHADER,
     [D3DRS_STENCILENABLE] = NINE_STATE_DSA | NINE_STATE_MULTISAMPLE,
     [D3DRS_STENCILFAIL] = NINE_STATE_DSA,
     [D3DRS_STENCILZFAIL] = NINE_STATE_DSA,
@@ -2889,7 +3327,7 @@ const uint32_t nine_render_state_group[NINED3DRS_LAST + 1] =
     [D3DRS_STENCILREF] = NINE_STATE_STENCIL_REF,
     [D3DRS_STENCILMASK] = NINE_STATE_DSA,
     [D3DRS_STENCILWRITEMASK] = NINE_STATE_DSA,
-    [D3DRS_TEXTUREFACTOR] = NINE_STATE_FF_PSSTAGES,
+    [D3DRS_TEXTUREFACTOR] = NINE_STATE_FF_PS_CONSTS,
     [D3DRS_WRAP0] = NINE_STATE_UNHANDLED, /* cylindrical wrap is crazy */
     [D3DRS_WRAP1] = NINE_STATE_UNHANDLED,
     [D3DRS_WRAP2] = NINE_STATE_UNHANDLED,
@@ -2901,31 +3339,31 @@ const uint32_t nine_render_state_group[NINED3DRS_LAST + 1] =
     [D3DRS_CLIPPING] = 0, /* software vertex processing only */
     [D3DRS_LIGHTING] = NINE_STATE_FF_LIGHTING,
     [D3DRS_AMBIENT] = NINE_STATE_FF_LIGHTING | NINE_STATE_FF_MATERIAL,
-    [D3DRS_FOGVERTEXMODE] = NINE_STATE_FF_OTHER,
+    [D3DRS_FOGVERTEXMODE] = NINE_STATE_FF_SHADER,
     [D3DRS_COLORVERTEX] = NINE_STATE_FF_LIGHTING,
     [D3DRS_LOCALVIEWER] = NINE_STATE_FF_LIGHTING,
-    [D3DRS_NORMALIZENORMALS] = NINE_STATE_FF_OTHER,
+    [D3DRS_NORMALIZENORMALS] = NINE_STATE_FF_SHADER,
     [D3DRS_DIFFUSEMATERIALSOURCE] = NINE_STATE_FF_LIGHTING,
     [D3DRS_SPECULARMATERIALSOURCE] = NINE_STATE_FF_LIGHTING,
     [D3DRS_AMBIENTMATERIALSOURCE] = NINE_STATE_FF_LIGHTING,
     [D3DRS_EMISSIVEMATERIALSOURCE] = NINE_STATE_FF_LIGHTING,
-    [D3DRS_VERTEXBLEND] = NINE_STATE_FF_OTHER,
+    [D3DRS_VERTEXBLEND] = NINE_STATE_FF_SHADER,
     [D3DRS_CLIPPLANEENABLE] = NINE_STATE_RASTERIZER,
-    [D3DRS_POINTSIZE] = NINE_STATE_RASTERIZER,
-    [D3DRS_POINTSIZE_MIN] = NINE_STATE_RASTERIZER | NINE_STATE_POINTSIZE_SHADER,
+    [D3DRS_POINTSIZE] = NINE_STATE_RASTERIZER | NINE_STATE_FF_VS_OTHER,
+    [D3DRS_POINTSIZE_MIN] = NINE_STATE_RASTERIZER | NINE_STATE_FF_VS_OTHER | NINE_STATE_VS_PARAMS_MISC,
     [D3DRS_POINTSPRITEENABLE] = NINE_STATE_RASTERIZER,
-    [D3DRS_POINTSCALEENABLE] = NINE_STATE_FF_OTHER,
-    [D3DRS_POINTSCALE_A] = NINE_STATE_FF_OTHER,
-    [D3DRS_POINTSCALE_B] = NINE_STATE_FF_OTHER,
-    [D3DRS_POINTSCALE_C] = NINE_STATE_FF_OTHER,
+    [D3DRS_POINTSCALEENABLE] = NINE_STATE_FF_SHADER,
+    [D3DRS_POINTSCALE_A] = NINE_STATE_FF_VS_OTHER,
+    [D3DRS_POINTSCALE_B] = NINE_STATE_FF_VS_OTHER,
+    [D3DRS_POINTSCALE_C] = NINE_STATE_FF_VS_OTHER,
     [D3DRS_MULTISAMPLEANTIALIAS] = NINE_STATE_MULTISAMPLE,
     [D3DRS_MULTISAMPLEMASK] = NINE_STATE_SAMPLE_MASK,
     [D3DRS_PATCHEDGESTYLE] = NINE_STATE_UNHANDLED,
     [D3DRS_DEBUGMONITORTOKEN] = NINE_STATE_UNHANDLED,
-    [D3DRS_POINTSIZE_MAX] = NINE_STATE_RASTERIZER | NINE_STATE_POINTSIZE_SHADER,
-    [D3DRS_INDEXEDVERTEXBLENDENABLE] = NINE_STATE_FF_OTHER,
+    [D3DRS_POINTSIZE_MAX] = NINE_STATE_RASTERIZER | NINE_STATE_FF_VS_OTHER | NINE_STATE_VS_PARAMS_MISC,
+    [D3DRS_INDEXEDVERTEXBLENDENABLE] = NINE_STATE_FF_SHADER,
     [D3DRS_COLORWRITEENABLE] = NINE_STATE_BLEND,
-    [D3DRS_TWEENFACTOR] = NINE_STATE_FF_OTHER,
+    [D3DRS_TWEENFACTOR] = NINE_STATE_FF_VS_OTHER,
     [D3DRS_BLENDOP] = NINE_STATE_BLEND,
     [D3DRS_POSITIONDEGREE] = NINE_STATE_UNHANDLED,
     [D3DRS_NORMALDEGREE] = NINE_STATE_UNHANDLED,
@@ -2966,14 +3404,31 @@ const uint32_t nine_render_state_group[NINED3DRS_LAST + 1] =
 
 /* Misc */
 
+static D3DMATRIX nine_state_identity = { .m[0] = { 1, 0, 0, 0 }, /* 4x4 identity: ones on the diagonal, zeros elsewhere */
+                                         .m[1] = { 0, 1, 0, 0 },
+                                         .m[2] = { 0, 0, 1, 0 },
+                                         .m[3] = { 0, 0, 0, 1 } }; /* NOTE(review): not const, and nine_state_access_transform returns its address - callers should presumably treat it as read-only; confirm */
+/* Grow ff_state->transform to N matrices, setting each added entry to identity; does nothing when N <= num_transforms. */
+void
+nine_state_resize_transform(struct nine_ff_state *ff_state, unsigned N)
+{
+    unsigned n = ff_state->num_transforms;
+    /* The array is only ever grown here, never shrunk. */
+    if (N <= n)
+        return;
+    /* NOTE(review): the REALLOC result is stored unchecked; if REALLOC can return NULL the old array is lost and the loop below writes through NULL - confirm and handle. */
+    ff_state->transform = REALLOC(ff_state->transform,
+                                  n * sizeof(D3DMATRIX),
+                                  N * sizeof(D3DMATRIX));
+    for (; n < N; ++n) /* n starts at the old count, so only the newly added entries are written */
+        ff_state->transform[n] = nine_state_identity;
+    ff_state->num_transforms = N;
+}
+
 D3DMATRIX *
 nine_state_access_transform(struct nine_ff_state *ff_state, D3DTRANSFORMSTATETYPE t,
                             boolean alloc)
 {
-    static D3DMATRIX Identity = { .m[0] = { 1, 0, 0, 0 },
-                                  .m[1] = { 0, 1, 0, 0 },
-                                  .m[2] = { 0, 0, 1, 0 },
-                                  .m[3] = { 0, 0, 0, 1 } };
     unsigned index;
 
     switch (t) {
@@ -2995,17 +3450,9 @@ nine_state_access_transform(struct nine_ff_state *ff_state, D3DTRANSFORMSTATETYP
     }
 
     if (index >= ff_state->num_transforms) {
-        unsigned N = index + 1;
-        unsigned n = ff_state->num_transforms;
-
         if (!alloc)
-            return &Identity;
-        ff_state->transform = REALLOC(ff_state->transform,
-                                      n * sizeof(D3DMATRIX),
-                                      N * sizeof(D3DMATRIX));
-        for (; n < N; ++n)
-            ff_state->transform[n] = Identity;
-        ff_state->num_transforms = N;
+            return &nine_state_identity;
+        nine_state_resize_transform(ff_state, index + 1);
     }
     return &ff_state->transform[index];
 }
@@ -3039,7 +3486,7 @@ nine_state_set_light(struct nine_ff_state *ff_state, DWORD Index,
 }
 
 HRESULT
-nine_state_light_enable(struct nine_ff_state *ff_state, uint32_t *change_group,
+nine_state_light_enable(struct nine_ff_state *ff_state,
                         DWORD Index, BOOL Enable)
 {
     unsigned i;
@@ -3068,8 +3515,6 @@ nine_state_light_enable(struct nine_ff_state *ff_state, uint32_t *change_group,
             ff_state->active_light[i] = ff_state->active_light[i + 1];
     }
 
-    *change_group |= NINE_STATE_FF_LIGHTING;
-
     return D3D_OK;
 }