gallium: add flags parameter to pipe_screen::context_create
[mesa.git] / src / gallium / state_trackers / nine / device9.c
index fdcf7a2875687b50f2d734e44d59042cd176d130..f14ffea13e13709026c07599274ab132ff70f89d 100644 (file)
@@ -41,6 +41,7 @@
 
 #include "pipe/p_screen.h"
 #include "pipe/p_context.h"
+#include "pipe/p_config.h"
 #include "util/u_math.h"
 #include "util/u_inlines.h"
 #include "util/u_hash_table.h"
 
 #define DBG_CHANNEL DBG_DEVICE
 
+#if defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))
+
+#include <fpu_control.h>
+
+static void nine_setup_fpu()
+{
+    fpu_control_t c;
+
+    _FPU_GETCW(c);
+    /* clear the control word */
+    c &= _FPU_RESERVED;
+    /* d3d9 doc/wine tests: mask all exceptions, use single-precision
+     * and round to nearest */
+    c |= _FPU_MASK_IM | _FPU_MASK_DM | _FPU_MASK_ZM | _FPU_MASK_OM |
+         _FPU_MASK_UM | _FPU_MASK_PM | _FPU_SINGLE | _FPU_RC_NEAREST;
+    _FPU_SETCW(c);
+}
+
+#else
+
+static void nine_setup_fpu(void)
+{
+    WARN_ONCE("FPU setup not supported on non-x86 platforms\n");
+}
+
+#endif
+
 static void
 NineDevice9_SetDefaultState( struct NineDevice9 *This, boolean is_reset )
 {
@@ -91,48 +119,6 @@ NineDevice9_SetDefaultState( struct NineDevice9 *This, boolean is_reset )
             This, (IDirect3DSurface9 *)This->swapchains[0]->zsbuf);
 }
 
-void
-NineDevice9_RestoreNonCSOState( struct NineDevice9 *This, unsigned mask )
-{
-    struct pipe_context *pipe = This->pipe;
-
-    DBG("This=%p mask=%u\n", This, mask);
-
-    if (mask & 0x1) {
-        struct pipe_constant_buffer cb;
-        cb.buffer_offset = 0;
-
-        if (This->prefer_user_constbuf) {
-            cb.buffer = NULL;
-            cb.user_buffer = This->state.vs_const_f;
-        } else {
-            cb.buffer = This->constbuf_vs;
-            cb.user_buffer = NULL;
-        }
-        cb.buffer_size = This->vs_const_size;
-        pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &cb);
-
-        if (This->prefer_user_constbuf) {
-            cb.user_buffer = This->state.ps_const_f;
-        } else {
-            cb.buffer = This->constbuf_ps;
-        }
-        cb.buffer_size = This->ps_const_size;
-        pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &cb);
-    }
-
-    if (mask & 0x2) {
-        struct pipe_poly_stipple stipple;
-        memset(&stipple, ~0, sizeof(stipple));
-        pipe->set_polygon_stipple(pipe, &stipple);
-    }
-
-    This->state.changed.group = NINE_STATE_ALL;
-    This->state.changed.vtxbuf = (1ULL << This->caps.MaxStreams) - 1;
-    This->state.changed.ucp = (1 << PIPE_MAX_CLIP_PLANES) - 1;
-    This->state.changed.texture = NINE_PS_SAMPLERS_MASK | NINE_VS_SAMPLERS_MASK;
-}
-
 #define GET_PCAP(n) pScreen->get_param(pScreen, PIPE_CAP_##n)
 HRESULT
 NineDevice9_ctor( struct NineDevice9 *This,
@@ -158,6 +144,7 @@ NineDevice9_ctor( struct NineDevice9 *This,
     if (FAILED(hr)) { return hr; }
 
     list_inithead(&This->update_textures);
+    list_inithead(&This->managed_textures);
 
     This->screen = pScreen;
     This->caps = *pCaps;
@@ -168,7 +155,15 @@ NineDevice9_ctor( struct NineDevice9 *This,
     IDirect3D9_AddRef(This->d3d9);
     ID3DPresentGroup_AddRef(This->present);
 
-    This->pipe = This->screen->context_create(This->screen, NULL);
+    if (!(This->params.BehaviorFlags & D3DCREATE_FPU_PRESERVE))
+        nine_setup_fpu();
+
+    if (This->params.BehaviorFlags & D3DCREATE_SOFTWARE_VERTEXPROCESSING)
+        DBG("Application asked full Software Vertex Processing. Ignoring.\n");
+    if (This->params.BehaviorFlags & D3DCREATE_MIXED_VERTEXPROCESSING)
+        DBG("Application asked mixed Software Vertex Processing. Ignoring.\n");
+
+    This->pipe = This->screen->context_create(This->screen, NULL, 0);
     if (!This->pipe) { return E_OUTOFMEMORY; } /* guess */
 
     This->cso = cso_create_context(This->pipe);
@@ -221,6 +216,42 @@ NineDevice9_ctor( struct NineDevice9 *This,
         NineUnknown_ConvertRefToBind(NineUnknown(This->state.rt[i]));
     }
 
+    /* Initialize a dummy VBO to be used when a a vertex declaration does not
+     * specify all the inputs needed by vertex shader, on win default behavior
+     * is to pass 0,0,0,0 to the shader */
+    {
+        struct pipe_transfer *transfer;
+        struct pipe_resource tmpl;
+        struct pipe_box box;
+        unsigned char *data;
+
+        tmpl.target = PIPE_BUFFER;
+        tmpl.format = PIPE_FORMAT_R8_UNORM;
+        tmpl.width0 = 16; /* 4 floats */
+        tmpl.height0 = 1;
+        tmpl.depth0 = 1;
+        tmpl.array_size = 1;
+        tmpl.last_level = 0;
+        tmpl.nr_samples = 0;
+        tmpl.usage = PIPE_USAGE_DEFAULT;
+        tmpl.bind = PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_TRANSFER_WRITE;
+        tmpl.flags = 0;
+        This->dummy_vbo = pScreen->resource_create(pScreen, &tmpl);
+
+        if (!This->dummy_vbo)
+            return D3DERR_OUTOFVIDEOMEMORY;
+
+        u_box_1d(0, 16, &box);
+        data = This->pipe->transfer_map(This->pipe, This->dummy_vbo, 0,
+                                        PIPE_TRANSFER_WRITE |
+                                        PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE,
+                                        &box, &transfer);
+        assert(data);
+        assert(transfer);
+        memset(data, 0, 16);
+        This->pipe->transfer_unmap(This->pipe, transfer);
+    }
+
     This->cursor.software = FALSE;
     This->cursor.hotspot.x = -1;
     This->cursor.hotspot.y = -1;
@@ -256,7 +287,7 @@ NineDevice9_ctor( struct NineDevice9 *This,
                             NINE_MAX_CONST_ALL);
         /* ps 3.0: 224 float constants. All cards supported support at least
          * 256 constants for ps */
-        max_const_ps = 224 + (NINE_MAX_CONST_I + NINE_MAX_CONST_B / 4);
+        max_const_ps = NINE_MAX_CONST_F_PS3 + (NINE_MAX_CONST_I + NINE_MAX_CONST_B / 4);
 
         This->max_vs_const_f = max_const_vs -
                                (NINE_MAX_CONST_I + NINE_MAX_CONST_B / 4);
@@ -269,13 +300,18 @@ NineDevice9_ctor( struct NineDevice9 *This,
         This->state.vs_const_f = CALLOC(This->vs_const_size, 1);
         This->state.ps_const_f = CALLOC(This->ps_const_size, 1);
         This->state.vs_lconstf_temp = CALLOC(This->vs_const_size,1);
+        This->state.ps_lconstf_temp = CALLOC(This->ps_const_size,1);
         if (!This->state.vs_const_f || !This->state.ps_const_f ||
-            !This->state.vs_lconstf_temp)
+            !This->state.vs_lconstf_temp || !This->state.ps_lconstf_temp)
             return E_OUTOFMEMORY;
 
         if (strstr(pScreen->get_name(pScreen), "AMD") ||
-            strstr(pScreen->get_name(pScreen), "ATI"))
-            This->prefer_user_constbuf = TRUE;
+            strstr(pScreen->get_name(pScreen), "ATI")) {
+            This->driver_bugs.buggy_barycentrics = TRUE;
+        }
+
+        /* Disable NV path for now, needs some fixes */
+        This->prefer_user_constbuf = TRUE;
 
         tmpl.target = PIPE_BUFFER;
         tmpl.format = PIPE_FORMAT_R8_UNORM;
@@ -298,17 +334,73 @@ NineDevice9_ctor( struct NineDevice9 *This,
             return E_OUTOFMEMORY;
     }
 
-    /* Allocate upload helper for drivers that suck (from st pov ;). */
+    /* allocate dummy texture/sampler for when there are missing ones bound */
     {
-        unsigned bind = 0;
+        struct pipe_resource tmplt;
+        struct pipe_sampler_view templ;
+        struct pipe_sampler_state samp;
+        memset(&samp, 0, sizeof(samp));
+
+        tmplt.target = PIPE_TEXTURE_2D;
+        tmplt.width0 = 1;
+        tmplt.height0 = 1;
+        tmplt.depth0 = 1;
+        tmplt.last_level = 0;
+        tmplt.array_size = 1;
+        tmplt.usage = PIPE_USAGE_DEFAULT;
+        tmplt.flags = 0;
+        tmplt.format = PIPE_FORMAT_B8G8R8A8_UNORM;
+        tmplt.bind = PIPE_BIND_SAMPLER_VIEW;
+        tmplt.nr_samples = 0;
+
+        This->dummy_texture = This->screen->resource_create(This->screen, &tmplt);
+        if (!This->dummy_texture)
+            return D3DERR_DRIVERINTERNALERROR;
+
+        templ.format = PIPE_FORMAT_B8G8R8A8_UNORM;
+        templ.u.tex.first_layer = 0;
+        templ.u.tex.last_layer = 0;
+        templ.u.tex.first_level = 0;
+        templ.u.tex.last_level = 0;
+        templ.swizzle_r = PIPE_SWIZZLE_ZERO;
+        templ.swizzle_g = PIPE_SWIZZLE_ZERO;
+        templ.swizzle_b = PIPE_SWIZZLE_ZERO;
+        templ.swizzle_a = PIPE_SWIZZLE_ONE;
+        templ.target = This->dummy_texture->target;
+
+        This->dummy_sampler_view = This->pipe->create_sampler_view(This->pipe, This->dummy_texture, &templ);
+        if (!This->dummy_sampler_view)
+            return D3DERR_DRIVERINTERNALERROR;
+
+        samp.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+        samp.max_lod = 15.0f;
+        samp.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+        samp.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+        samp.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+        samp.min_img_filter = PIPE_TEX_FILTER_NEAREST;
+        samp.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
+        samp.compare_mode = PIPE_TEX_COMPARE_NONE;
+        samp.compare_func = PIPE_FUNC_LEQUAL;
+        samp.normalized_coords = 1;
+        samp.seamless_cube_map = 1;
+        This->dummy_sampler_state = samp;
+    }
+
+    /* Allocate upload helper for drivers that suck (from st pov ;). */
 
-        This->driver_caps.user_vbufs = GET_PCAP(USER_VERTEX_BUFFERS);
-        This->driver_caps.user_ibufs = GET_PCAP(USER_INDEX_BUFFERS);
+    This->driver_caps.user_vbufs = GET_PCAP(USER_VERTEX_BUFFERS);
+    This->driver_caps.user_ibufs = GET_PCAP(USER_INDEX_BUFFERS);
+    This->driver_caps.user_cbufs = GET_PCAP(USER_CONSTANT_BUFFERS);
+
+    if (!This->driver_caps.user_vbufs)
+        This->vertex_uploader = u_upload_create(This->pipe, 65536, 4, PIPE_BIND_VERTEX_BUFFER);
+    if (!This->driver_caps.user_ibufs)
+        This->index_uploader = u_upload_create(This->pipe, 128 * 1024, 4, PIPE_BIND_INDEX_BUFFER);
+    if (!This->driver_caps.user_cbufs) {
+        unsigned alignment = GET_PCAP(CONSTANT_BUFFER_OFFSET_ALIGNMENT);
 
-        if (!This->driver_caps.user_vbufs) bind |= PIPE_BIND_VERTEX_BUFFER;
-        if (!This->driver_caps.user_ibufs) bind |= PIPE_BIND_INDEX_BUFFER;
-        if (bind)
-            This->upload = u_upload_create(This->pipe, 1 << 20, 4, bind);
+        This->constbuf_uploader = u_upload_create(This->pipe, This->vs_const_size,
+                                                  alignment, PIPE_BIND_CONSTANT_BUFFER);
     }
 
     This->driver_caps.window_space_position_support = GET_PCAP(TGSI_VS_WINDOW_SPACE_POSITION);
@@ -318,10 +410,15 @@ NineDevice9_ctor( struct NineDevice9 *This,
     nine_ff_init(This); /* initialize fixed function code */
 
     NineDevice9_SetDefaultState(This, FALSE);
-    NineDevice9_RestoreNonCSOState(This, ~0);
+
+    {
+        struct pipe_poly_stipple stipple;
+        memset(&stipple, ~0, sizeof(stipple));
+        This->pipe->set_polygon_stipple(This->pipe, &stipple);
+    }
 
     This->update = &This->state;
-    nine_update_state(This, ~0);
+    nine_update_state(This);
 
     ID3DPresentGroup_Release(This->present);
 
@@ -341,16 +438,24 @@ NineDevice9_dtor( struct NineDevice9 *This )
     nine_ff_fini(This);
     nine_state_clear(&This->state, TRUE);
 
-    if (This->upload)
-        u_upload_destroy(This->upload);
+    if (This->vertex_uploader)
+        u_upload_destroy(This->vertex_uploader);
+    if (This->index_uploader)
+        u_upload_destroy(This->index_uploader);
+    if (This->constbuf_uploader)
+        u_upload_destroy(This->constbuf_uploader);
 
     nine_bind(&This->record, NULL);
 
+    pipe_sampler_view_reference(&This->dummy_sampler_view, NULL);
+    pipe_resource_reference(&This->dummy_texture, NULL);
     pipe_resource_reference(&This->constbuf_vs, NULL);
     pipe_resource_reference(&This->constbuf_ps, NULL);
+    pipe_resource_reference(&This->dummy_vbo, NULL);
     FREE(This->state.vs_const_f);
     FREE(This->state.ps_const_f);
     FREE(This->state.vs_lconstf_temp);
+    FREE(This->state.ps_lconstf_temp);
 
     if (This->swapchains) {
         for (i = 0; i < This->nswapchains; ++i)
@@ -396,7 +501,7 @@ NineDevice9_GetCaps( struct NineDevice9 *This )
     return &This->caps;
 }
 
-static INLINE void
+static inline void
 NineDevice9_PauseRecording( struct NineDevice9 *This )
 {
     if (This->record) {
@@ -405,7 +510,7 @@ NineDevice9_PauseRecording( struct NineDevice9 *This )
     }
 }
 
-static INLINE void
+static inline void
 NineDevice9_ResumeRecording( struct NineDevice9 *This )
 {
     if (This->record) {
@@ -433,10 +538,14 @@ NineDevice9_GetAvailableTextureMem( struct NineDevice9 *This )
 HRESULT WINAPI
 NineDevice9_EvictManagedResources( struct NineDevice9 *This )
 {
-    /* We don't really need to do anything here, but might want to free up
-     * the GPU virtual address space by killing pipe_resources.
-     */
-    STUB(D3D_OK);
+    struct NineBaseTexture9 *tex;
+
+    DBG("This=%p\n", This);
+    LIST_FOR_EACH_ENTRY(tex, &This->managed_textures, list2) {
+        NineBaseTexture9_UnLoad(tex);
+    }
+
+    return D3D_OK;
 }
 
 HRESULT WINAPI
@@ -485,11 +594,11 @@ NineDevice9_SetCursorProperties( struct NineDevice9 *This,
                                  UINT YHotSpot,
                                  IDirect3DSurface9 *pCursorBitmap )
 {
-    /* TODO: hardware cursor */
     struct NineSurface9 *surf = NineSurface9(pCursorBitmap);
     struct pipe_context *pipe = This->pipe;
     struct pipe_box box;
     struct pipe_transfer *transfer;
+    BOOL hw_cursor;
     void *ptr;
 
     DBG_FLAG(DBG_SWAPCHAIN, "This=%p XHotSpot=%u YHotSpot=%u "
@@ -497,8 +606,15 @@ NineDevice9_SetCursorProperties( struct NineDevice9 *This,
 
     user_assert(pCursorBitmap, D3DERR_INVALIDCALL);
 
-    This->cursor.w = MIN2(surf->desc.Width, This->cursor.image->width0);
-    This->cursor.h = MIN2(surf->desc.Height, This->cursor.image->height0);
+    if (This->swapchains[0]->params.Windowed) {
+        This->cursor.w = MIN2(surf->desc.Width, 32);
+        This->cursor.h = MIN2(surf->desc.Height, 32);
+        hw_cursor = 1; /* always use hw cursor for windowed mode */
+    } else {
+        This->cursor.w = MIN2(surf->desc.Width, This->cursor.image->width0);
+        This->cursor.h = MIN2(surf->desc.Height, This->cursor.image->height0);
+        hw_cursor = This->cursor.w == 32 && This->cursor.h == 32;
+    }
 
     u_box_origin_2d(This->cursor.w, This->cursor.h, &box);
 
@@ -529,16 +645,21 @@ NineDevice9_SetCursorProperties( struct NineDevice9 *This,
                                  lock.pBits, lock.Pitch,
                                  This->cursor.w, This->cursor.h);
 
-        if (!This->cursor.software &&
-            This->cursor.w == 32 && This->cursor.h == 32)
-            ID3DPresent_SetCursor(This->swapchains[0]->present,
-                                  lock.pBits, &This->cursor.hotspot,
-                                  This->cursor.visible);
+        if (hw_cursor)
+            hw_cursor = ID3DPresent_SetCursor(This->swapchains[0]->present,
+                                              lock.pBits,
+                                              &This->cursor.hotspot,
+                                              This->cursor.visible) == D3D_OK;
 
         NineSurface9_UnlockRect(surf);
     }
     pipe->transfer_unmap(pipe, transfer);
 
+    /* hide cursor if we emulate it */
+    if (!hw_cursor)
+        ID3DPresent_SetCursor(This->swapchains[0]->present, NULL, NULL, FALSE);
+    This->cursor.software = !hw_cursor;
+
     return D3D_OK;
 }
 
@@ -556,7 +677,7 @@ NineDevice9_SetCursorPosition( struct NineDevice9 *This,
     This->cursor.pos.y = Y;
 
     if (!This->cursor.software)
-        ID3DPresent_SetCursorPos(swap->present, &This->cursor.pos);
+        This->cursor.software = ID3DPresent_SetCursorPos(swap->present, &This->cursor.pos) != D3D_OK;
 }
 
 BOOL WINAPI
@@ -569,7 +690,7 @@ NineDevice9_ShowCursor( struct NineDevice9 *This,
 
     This->cursor.visible = bShow && (This->cursor.hotspot.x != -1);
     if (!This->cursor.software)
-        ID3DPresent_SetCursor(This->swapchains[0]->present, NULL, NULL, bShow);
+        This->cursor.software = ID3DPresent_SetCursor(This->swapchains[0]->present, NULL, NULL, bShow) != D3D_OK;
 
     return old;
 }
@@ -638,8 +759,8 @@ NineDevice9_Reset( struct NineDevice9 *This,
     for (i = 0; i < This->nswapchains; ++i) {
         D3DPRESENT_PARAMETERS *params = &pPresentationParameters[i];
         hr = NineSwapChain9_Resize(This->swapchains[i], params, NULL);
-        if (FAILED(hr))
-            return (hr == D3DERR_OUTOFVIDEOMEMORY) ? hr : D3DERR_DEVICELOST;
+        if (hr != D3D_OK)
+            return hr;
     }
 
     nine_pipe_context_clear(This);
@@ -971,7 +1092,7 @@ create_zs_or_rt_surface(struct NineDevice9 *This,
     templ.bind = PIPE_BIND_SAMPLER_VIEW; /* StretchRect */
     switch (type) {
     case 0: templ.bind |= PIPE_BIND_RENDER_TARGET; break;
-    case 1: templ.bind |= PIPE_BIND_DEPTH_STENCIL; break;
+    case 1: templ.bind = d3d9_get_pipe_depth_format_bindings(Format); break;
     default:
         assert(type == 2);
         break;
@@ -994,6 +1115,13 @@ create_zs_or_rt_surface(struct NineDevice9 *This,
     default: break;
     }
 
+    if (compressed_format(Format)) {
+        const unsigned w = util_format_get_blockwidth(templ.format);
+        const unsigned h = util_format_get_blockheight(templ.format);
+
+        user_assert(!(Width % w) && !(Height % h), D3DERR_INVALIDCALL);
+    }
+
     if (Pool == D3DPOOL_DEFAULT && Format != D3DFMT_NULL) {
         /* resource_create doesn't return an error code, so check format here */
         user_assert(templ.format != PIPE_FORMAT_NONE, D3DERR_INVALIDCALL);
@@ -1042,6 +1170,8 @@ NineDevice9_CreateDepthStencilSurface( struct NineDevice9 *This,
                                        HANDLE *pSharedHandle )
 {
     *ppSurface = NULL;
+    if (!depth_stencil_format(Format))
+        return D3DERR_NOTAVAILABLE;
     return create_zs_or_rt_surface(This, 1, D3DPOOL_DEFAULT,
                                    Width, Height, Format,
                                    MultiSample, MultisampleQuality,
@@ -1057,6 +1187,8 @@ NineDevice9_UpdateSurface( struct NineDevice9 *This,
 {
     struct NineSurface9 *dst = NineSurface9(pDestinationSurface);
     struct NineSurface9 *src = NineSurface9(pSourceSurface);
+    int copy_width, copy_height;
+    RECT destRect;
 
     DBG("This=%p pSourceSurface=%p pDestinationSurface=%p "
         "pSourceRect=%p pDestPoint=%p\n", This,
@@ -1068,13 +1200,75 @@ NineDevice9_UpdateSurface( struct NineDevice9 *This,
     if (pDestPoint)
         DBG("pDestPoint = (%u,%u)\n", pDestPoint->x, pDestPoint->y);
 
+    user_assert(dst && src, D3DERR_INVALIDCALL);
+
     user_assert(dst->base.pool == D3DPOOL_DEFAULT, D3DERR_INVALIDCALL);
     user_assert(src->base.pool == D3DPOOL_SYSTEMMEM, D3DERR_INVALIDCALL);
 
     user_assert(dst->desc.MultiSampleType == D3DMULTISAMPLE_NONE, D3DERR_INVALIDCALL);
     user_assert(src->desc.MultiSampleType == D3DMULTISAMPLE_NONE, D3DERR_INVALIDCALL);
 
-    return NineSurface9_CopySurface(dst, src, pDestPoint, pSourceRect);
+    user_assert(!src->lock_count, D3DERR_INVALIDCALL);
+    user_assert(!dst->lock_count, D3DERR_INVALIDCALL);
+
+    user_assert(dst->desc.Format == src->desc.Format, D3DERR_INVALIDCALL);
+    user_assert(!depth_stencil_format(dst->desc.Format), D3DERR_INVALIDCALL);
+
+    if (pSourceRect) {
+        copy_width = pSourceRect->right - pSourceRect->left;
+        copy_height = pSourceRect->bottom - pSourceRect->top;
+
+        user_assert(pSourceRect->left >= 0 &&
+                    copy_width > 0 &&
+                    pSourceRect->right <= src->desc.Width &&
+                    pSourceRect->top >= 0 &&
+                    copy_height > 0 &&
+                    pSourceRect->bottom <= src->desc.Height,
+                    D3DERR_INVALIDCALL);
+    } else {
+        copy_width = src->desc.Width;
+        copy_height = src->desc.Height;
+    }
+
+    destRect.right = copy_width;
+    destRect.bottom = copy_height;
+
+    if (pDestPoint) {
+        user_assert(pDestPoint->x >= 0 && pDestPoint->y >= 0,
+                    D3DERR_INVALIDCALL);
+        destRect.right += pDestPoint->x;
+        destRect.bottom += pDestPoint->y;
+    }
+
+    user_assert(destRect.right <= dst->desc.Width &&
+                destRect.bottom <= dst->desc.Height,
+                D3DERR_INVALIDCALL);
+
+    if (compressed_format(dst->desc.Format)) {
+        const unsigned w = util_format_get_blockwidth(dst->base.info.format);
+        const unsigned h = util_format_get_blockheight(dst->base.info.format);
+
+        if (pDestPoint) {
+            user_assert(!(pDestPoint->x % w) && !(pDestPoint->y % h),
+                        D3DERR_INVALIDCALL);
+        }
+
+        if (pSourceRect) {
+            user_assert(!(pSourceRect->left % w) && !(pSourceRect->top % h),
+                        D3DERR_INVALIDCALL);
+        }
+        if (!(copy_width == src->desc.Width &&
+              copy_width == dst->desc.Width &&
+              copy_height == src->desc.Height &&
+              copy_height == dst->desc.Height)) {
+            user_assert(!(copy_width  % w) && !(copy_height % h),
+                        D3DERR_INVALIDCALL);
+        }
+    }
+
+    NineSurface9_CopyMemToDefault(dst, src, pDestPoint, pSourceRect);
+
+    return D3D_OK;
 }
 
 HRESULT WINAPI
@@ -1086,6 +1280,7 @@ NineDevice9_UpdateTexture( struct NineDevice9 *This,
     struct NineBaseTexture9 *srcb = NineBaseTexture9(pSourceTexture);
     unsigned l, m;
     unsigned last_level = dstb->base.info.last_level;
+    RECT rect;
 
     DBG("This=%p pSourceTexture=%p pDestinationTexture=%p\n", This,
         pSourceTexture, pDestinationTexture);
@@ -1098,16 +1293,19 @@ NineDevice9_UpdateTexture( struct NineDevice9 *This,
     if (dstb->base.usage & D3DUSAGE_AUTOGENMIPMAP) {
         /* Only the first level is updated, the others regenerated. */
         last_level = 0;
+        /* if the source has D3DUSAGE_AUTOGENMIPMAP, we have to ignore
+         * the sublevels, thus level 0 has to match */
+        user_assert(!(srcb->base.usage & D3DUSAGE_AUTOGENMIPMAP) ||
+                    (srcb->base.info.width0 == dstb->base.info.width0 &&
+                     srcb->base.info.height0 == dstb->base.info.height0 &&
+                     srcb->base.info.depth0 == dstb->base.info.depth0),
+                    D3DERR_INVALIDCALL);
     } else {
         user_assert(!(srcb->base.usage & D3DUSAGE_AUTOGENMIPMAP), D3DERR_INVALIDCALL);
     }
 
     user_assert(dstb->base.type == srcb->base.type, D3DERR_INVALIDCALL);
 
-    /* TODO: We can restrict the update to the dirty portions of the source.
-     * Yes, this seems silly, but it's what MSDN says ...
-     */
-
     /* Find src level that matches dst level 0: */
     user_assert(srcb->base.info.width0 >= dstb->base.info.width0 &&
                 srcb->base.info.height0 >= dstb->base.info.height0 &&
@@ -1131,9 +1329,25 @@ NineDevice9_UpdateTexture( struct NineDevice9 *This,
         struct NineTexture9 *dst = NineTexture9(dstb);
         struct NineTexture9 *src = NineTexture9(srcb);
 
-        for (l = 0; l <= last_level; ++l, ++m)
-            NineSurface9_CopySurface(dst->surfaces[l],
-                                     src->surfaces[m], NULL, NULL);
+        if (src->dirty_rect.width == 0)
+            return D3D_OK;
+
+        pipe_box_to_rect(&rect, &src->dirty_rect);
+        for (l = 0; l < m; ++l)
+            rect_minify_inclusive(&rect);
+
+        for (l = 0; l <= last_level; ++l, ++m) {
+            fit_rect_format_inclusive(dst->base.base.info.format,
+                                      &rect,
+                                      dst->surfaces[l]->desc.Width,
+                                      dst->surfaces[l]->desc.Height);
+            NineSurface9_CopyMemToDefault(dst->surfaces[l],
+                                          src->surfaces[m],
+                                          (POINT *)&rect,
+                                          &rect);
+            rect_minify_inclusive(&rect);
+        }
+        u_box_origin_2d(0, 0, &src->dirty_rect);
     } else
     if (dstb->base.type == D3DRTYPE_CUBETEXTURE) {
         struct NineCubeTexture9 *dst = NineCubeTexture9(dstb);
@@ -1142,10 +1356,25 @@ NineDevice9_UpdateTexture( struct NineDevice9 *This,
 
         /* GPUs usually have them stored as arrays of mip-mapped 2D textures. */
         for (z = 0; z < 6; ++z) {
+            if (src->dirty_rect[z].width == 0)
+                continue;
+
+            pipe_box_to_rect(&rect, &src->dirty_rect[z]);
+            for (l = 0; l < m; ++l)
+                rect_minify_inclusive(&rect);
+
             for (l = 0; l <= last_level; ++l, ++m) {
-                NineSurface9_CopySurface(dst->surfaces[l * 6 + z],
-                                         src->surfaces[m * 6 + z], NULL, NULL);
+                fit_rect_format_inclusive(dst->base.base.info.format,
+                                          &rect,
+                                          dst->surfaces[l * 6 + z]->desc.Width,
+                                          dst->surfaces[l * 6 + z]->desc.Height);
+                NineSurface9_CopyMemToDefault(dst->surfaces[l * 6 + z],
+                                              src->surfaces[m * 6 + z],
+                                              (POINT *)&rect,
+                                              &rect);
+                rect_minify_inclusive(&rect);
             }
+            u_box_origin_2d(0, 0, &src->dirty_rect[z]);
             m -= l;
         }
     } else
@@ -1153,15 +1382,20 @@ NineDevice9_UpdateTexture( struct NineDevice9 *This,
         struct NineVolumeTexture9 *dst = NineVolumeTexture9(dstb);
         struct NineVolumeTexture9 *src = NineVolumeTexture9(srcb);
 
+        if (src->dirty_box.width == 0)
+            return D3D_OK;
         for (l = 0; l <= last_level; ++l, ++m)
-            NineVolume9_CopyVolume(dst->volumes[l],
-                                   src->volumes[m], 0, 0, 0, NULL);
+            NineVolume9_CopyMemToDefault(dst->volumes[l],
+                                         src->volumes[m], 0, 0, 0, NULL);
+        u_box_3d(0, 0, 0, 0, 0, 0, &src->dirty_box);
     } else{
         assert(!"invalid texture type");
     }
 
-    if (dstb->base.usage & D3DUSAGE_AUTOGENMIPMAP)
+    if (dstb->base.usage & D3DUSAGE_AUTOGENMIPMAP) {
+        dstb->dirty_mip = TRUE;
         NineBaseTexture9_GenerateMipSubLevels(dstb);
+    }
 
     return D3D_OK;
 }
@@ -1183,7 +1417,12 @@ NineDevice9_GetRenderTargetData( struct NineDevice9 *This,
     user_assert(dst->desc.MultiSampleType < 2, D3DERR_INVALIDCALL);
     user_assert(src->desc.MultiSampleType < 2, D3DERR_INVALIDCALL);
 
-    return NineSurface9_CopySurface(dst, src, NULL, NULL);
+    user_assert(src->desc.Width == dst->desc.Width, D3DERR_INVALIDCALL);
+    user_assert(src->desc.Height == dst->desc.Height, D3DERR_INVALIDCALL);
+
+    NineSurface9_CopyDefaultToMem(dst, src);
+
+    return D3D_OK;
 }
 
 HRESULT WINAPI
@@ -1264,6 +1503,7 @@ NineDevice9_StretchRect( struct NineDevice9 *This,
                 (pSourceRect->left <= pSourceRect->right &&
                  pSourceRect->top <= pSourceRect->bottom), D3DERR_INVALIDCALL);
 
+    memset(&blit, 0, sizeof(blit));
     blit.dst.resource = dst_res;
     blit.dst.level = dst->level;
     blit.dst.box.z = dst->layer;
@@ -1322,6 +1562,7 @@ NineDevice9_StretchRect( struct NineDevice9 *This,
     blit.filter = Filter == D3DTEXF_LINEAR ?
        PIPE_TEX_FILTER_LINEAR : PIPE_TEX_FILTER_NEAREST;
     blit.scissor_enable = FALSE;
+    blit.alpha_blend = FALSE;
 
     /* If both of a src and dst dimension are negative, flip them. */
     if (blit.dst.box.width < 0 && blit.src.box.width < 0) {
@@ -1338,8 +1579,12 @@ NineDevice9_StretchRect( struct NineDevice9 *This,
 
     user_assert(!scaled || dst != src, D3DERR_INVALIDCALL);
     user_assert(!scaled ||
-                !NineSurface9_IsOffscreenPlain(dst) ||
+                !NineSurface9_IsOffscreenPlain(dst), D3DERR_INVALIDCALL);
+    user_assert(!NineSurface9_IsOffscreenPlain(dst) ||
                 NineSurface9_IsOffscreenPlain(src), D3DERR_INVALIDCALL);
+    user_assert(NineSurface9_IsOffscreenPlain(dst) ||
+                dst->desc.Usage & (D3DUSAGE_RENDERTARGET | D3DUSAGE_DEPTHSTENCIL),
+                D3DERR_INVALIDCALL);
     user_assert(!scaled ||
                 (!util_format_is_compressed(dst->base.info.format) &&
                  !util_format_is_compressed(src->base.info.format)),
@@ -1392,6 +1637,9 @@ NineDevice9_StretchRect( struct NineDevice9 *This,
             &blit.src.box);
     }
 
+    /* Communicate the container it needs to update sublevels - if apply */
+    NineSurface9_MarkContainerDirty(dst);
+
     return D3D_OK;
 }
 
@@ -1432,11 +1680,8 @@ NineDevice9_ColorFill( struct NineDevice9 *This,
     }
     d3dcolor_to_pipe_color_union(&rgba, color);
 
-    fallback =
-        !This->screen->is_format_supported(This->screen, surf->base.info.format,
-                                           surf->base.info.target,
-                                           surf->base.info.nr_samples,
-                                           PIPE_BIND_RENDER_TARGET);
+    fallback = !(surf->base.info.bind & PIPE_BIND_RENDER_TARGET);
+
     if (!fallback) {
         psurf = NineSurface9_GetSurface(surf, 0);
         if (!psurf)
@@ -1608,11 +1853,15 @@ NineDevice9_Clear( struct NineDevice9 *This,
                    float Z,
                    DWORD Stencil )
 {
+    const int sRGB = This->state.rs[D3DRS_SRGBWRITEENABLE] ? 1 : 0;
+    struct pipe_surface *cbuf, *zsbuf;
     struct pipe_context *pipe = This->pipe;
-    struct NineSurface9 *zsbuf = This->state.ds;
+    struct NineSurface9 *zsbuf_surf = This->state.ds;
+    struct NineSurface9 *rt;
     unsigned bufs = 0;
     unsigned r, i;
     union pipe_color_union rgba;
+    unsigned rt_mask = 0;
     D3DRECT rect;
 
     DBG("This=%p Count=%u pRects=%p Flags=%x Color=%08x Z=%f Stencil=%x\n",
@@ -1621,8 +1870,8 @@ NineDevice9_Clear( struct NineDevice9 *This,
     user_assert(This->state.ds || !(Flags & NINED3DCLEAR_DEPTHSTENCIL),
                 D3DERR_INVALIDCALL);
     user_assert(!(Flags & D3DCLEAR_STENCIL) ||
-                (zsbuf &&
-                 util_format_is_depth_and_stencil(zsbuf->base.info.format)),
+                (zsbuf_surf &&
+                 util_format_is_depth_and_stencil(zsbuf_surf->base.info.format)),
                 D3DERR_INVALIDCALL);
 #ifdef NINE_STRICT
     user_assert((Count && pRects) || (!Count && !pRects), D3DERR_INVALIDCALL);
@@ -1641,7 +1890,7 @@ NineDevice9_Clear( struct NineDevice9 *This,
         return D3D_OK;
     d3dcolor_to_pipe_color_union(&rgba, Color);
 
-    nine_update_state(This, NINE_STATE_FB);
+    nine_update_state_framebuffer(This);
 
     rect.x1 = This->state.viewport.X;
     rect.y1 = This->state.viewport.Y;
@@ -1668,25 +1917,42 @@ NineDevice9_Clear( struct NineDevice9 *This,
 
     if (rect.x1 >= This->state.fb.width || rect.y1 >= This->state.fb.height)
         return D3D_OK;
+
+    for (i = 0; i < This->caps.NumSimultaneousRTs; ++i) {
+        if (This->state.rt[i] && This->state.rt[i]->desc.Format != D3DFMT_NULL)
+            rt_mask |= 1 << i;
+    }
+
+    /* fast path, clears everything at once */
     if (!Count &&
-        rect.x1 == 0 && rect.x2 >= This->state.fb.width &&
-        rect.y1 == 0 && rect.y2 >= This->state.fb.height) {
-        /* fast path, clears everything at once */
-        DBG("fast path\n");
+        (!(bufs & PIPE_CLEAR_COLOR) || (rt_mask == This->state.rt_mask)) &&
+        rect.x1 == 0 && rect.y1 == 0 &&
+        /* Case we clear only render target. Check clear region vs rt. */
+        ((!(bufs & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) &&
+         rect.x2 >= This->state.fb.width &&
+         rect.y2 >= This->state.fb.height) ||
+        /* Case we clear depth buffer (and eventually rt too).
+         * depth buffer size is always >= rt size. Compare to clear region */
+        ((bufs & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) &&
+         This->state.fb.zsbuf != NULL &&
+         rect.x2 >= zsbuf_surf->desc.Width &&
+         rect.y2 >= zsbuf_surf->desc.Height))) {
+        DBG("Clear fast path\n");
         pipe->clear(pipe, bufs, &rgba, Z, Stencil);
         return D3D_OK;
     }
-    rect.x2 = MIN2(rect.x2, This->state.fb.width);
-    rect.y2 = MIN2(rect.y2, This->state.fb.height);
 
     if (!Count) {
         Count = 1;
         pRects = &rect;
     }
 
-    for (i = 0; (i < This->state.fb.nr_cbufs); ++i) {
-        if (!This->state.fb.cbufs[i] || !(Flags & D3DCLEAR_TARGET))
+    for (i = 0; i < This->caps.NumSimultaneousRTs; ++i) {
+        rt = This->state.rt[i];
+        if (!rt || rt->desc.Format == D3DFMT_NULL ||
+            !(Flags & D3DCLEAR_TARGET))
             continue; /* save space, compiler should hoist this */
+        cbuf = NineSurface9_GetSurface(rt, sRGB);
         for (r = 0; r < Count; ++r) {
             /* Don't trust users to pass these in the right order. */
             unsigned x1 = MIN2(pRects[r].x1, pRects[r].x2);
@@ -1701,11 +1967,11 @@ NineDevice9_Clear( struct NineDevice9 *This,
 
             x1 = MAX2(x1, rect.x1);
             y1 = MAX2(y1, rect.y1);
-            x2 = MIN2(x2, rect.x2);
-            y2 = MIN2(y2, rect.y2);
+            x2 = MIN3(x2, rect.x2, rt->desc.Width);
+            y2 = MIN3(y2, rect.y2, rt->desc.Height);
 
             DBG("Clearing (%u..%u)x(%u..%u)\n", x1, x2, y1, y2);
-            pipe->clear_render_target(pipe, This->state.fb.cbufs[i], &rgba,
+            pipe->clear_render_target(pipe, cbuf, &rgba,
                                       x1, y1, x2 - x1, y2 - y1);
         }
     }
@@ -1727,10 +1993,12 @@ NineDevice9_Clear( struct NineDevice9 *This,
 
         x1 = MIN2(x1, rect.x1);
         y1 = MIN2(y1, rect.y1);
-        x2 = MIN2(x2, rect.x2);
-        y2 = MIN2(y2, rect.y2);
+        x2 = MIN3(x2, rect.x2, zsbuf_surf->desc.Width);
+        y2 = MIN3(y2, rect.y2, zsbuf_surf->desc.Height);
 
-        pipe->clear_depth_stencil(pipe, This->state.fb.zsbuf, bufs, Z, Stencil,
+        zsbuf = NineSurface9_GetSurface(zsbuf_surf, 0);
+        assert(zsbuf);
+        pipe->clear_depth_stencil(pipe, zsbuf, bufs, Z, Stencil,
                                   x1, y1, x2 - x1, y2 - y1);
     }
     return D3D_OK;
@@ -1860,8 +2128,10 @@ NineDevice9_SetLight( struct NineDevice9 *This,
             return E_OUTOFMEMORY;
         state->ff.num_lights = N;
 
-        for (; n < Index; ++n)
+        for (; n < Index; ++n) {
+            memset(&state->ff.light[n], 0, sizeof(D3DLIGHT9));
             state->ff.light[n].Type = (D3DLIGHTTYPE)NINED3DLIGHT_INVALID;
+        }
     }
     state->ff.light[Index] = *pLight;
 
@@ -1963,7 +2233,9 @@ NineDevice9_GetLightEnable( struct NineDevice9 *This,
     for (i = 0; i < state->ff.num_lights_active; ++i)
         if (state->ff.active_light[i] == Index)
             break;
-    *pEnable = i != state->ff.num_lights_active;
+
+    *pEnable = i != state->ff.num_lights_active ? 128 : 0; // Taken from wine
+
     return D3D_OK;
 }
 
@@ -2027,6 +2299,7 @@ NineDevice9_ResolveZ( struct NineDevice9 *This )
     desc = util_format_description(dst->format);
     user_assert(desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS, D3DERR_INVALIDCALL);
 
+    memset(&blit, 0, sizeof(blit));
     blit.src.resource = src;
     blit.src.level = 0;
     blit.src.format = src->format;
@@ -2299,14 +2572,8 @@ NineDevice9_SetTexture( struct NineDevice9 *This,
                 Stage == D3DDMAPSAMPLER ||
                 (Stage >= D3DVERTEXTEXTURESAMPLER0 &&
                  Stage <= D3DVERTEXTEXTURESAMPLER3), D3DERR_INVALIDCALL);
-    user_assert(!tex || tex->base.pool != D3DPOOL_SCRATCH, D3DERR_INVALIDCALL);
-
-    if (unlikely(tex && tex->base.pool == D3DPOOL_SYSTEMMEM)) {
-        /* TODO: Currently not implemented. Better return error
-         * with message telling what's wrong */
-        ERR("This=%p D3DPOOL_SYSTEMMEM not implemented for SetTexture\n", This);
-        user_assert(tex->base.pool != D3DPOOL_SYSTEMMEM, D3DERR_INVALIDCALL);
-    }
+    user_assert(!tex || (tex->base.pool != D3DPOOL_SCRATCH &&
+                tex->base.pool != D3DPOOL_SYSTEMMEM), D3DERR_INVALIDCALL);
 
     if (Stage >= D3DDMAPSAMPLER)
         Stage = Stage - D3DDMAPSAMPLER + NINE_MAX_SAMPLERS_PS;
@@ -2320,7 +2587,7 @@ NineDevice9_SetTexture( struct NineDevice9 *This,
         if (tex) {
             state->samplers_shadow |= tex->shadow << Stage;
 
-            if ((tex->dirty | tex->dirty_mip) && LIST_IS_EMPTY(&tex->list))
+            if ((tex->managed.dirty | tex->dirty_mip) && LIST_IS_EMPTY(&tex->list))
                 list_add(&tex->list, &This->update_textures);
 
             tex->bind_count++;
@@ -2359,6 +2626,7 @@ NineDevice9_SetTextureStageState( struct NineDevice9 *This,
                                   DWORD Value )
 {
     struct nine_state *state = This->update;
+    int bumpmap_index = -1;
 
     DBG("Stage=%u Type=%u Value=%08x\n", Stage, Type, Value);
     nine_dump_D3DTSS_value(DBG_FF, Type, Value);
@@ -2367,6 +2635,36 @@ NineDevice9_SetTextureStageState( struct NineDevice9 *This,
     user_assert(Type < Elements(state->ff.tex_stage[0]), D3DERR_INVALIDCALL);
 
     state->ff.tex_stage[Stage][Type] = Value;
+    switch (Type) {
+    case D3DTSS_BUMPENVMAT00:
+        bumpmap_index = 4 * Stage;
+        break;
+    case D3DTSS_BUMPENVMAT10:
+        bumpmap_index = 4 * Stage + 1;
+        break;
+    case D3DTSS_BUMPENVMAT01:
+        bumpmap_index = 4 * Stage + 2;
+        break;
+    case D3DTSS_BUMPENVMAT11:
+        bumpmap_index = 4 * Stage + 3;
+        break;
+    case D3DTSS_BUMPENVLSCALE:
+        bumpmap_index = 4 * 8 + 2 * Stage;
+        break;
+    case D3DTSS_BUMPENVLOFFSET:
+        bumpmap_index = 4 * 8 + 2 * Stage + 1;
+        break;
+    case D3DTSS_TEXTURETRANSFORMFLAGS:
+        state->changed.group |= NINE_STATE_PS1X_SHADER;
+        break;
+    default:
+        break;
+    }
+
+    if (bumpmap_index >= 0) {
+        state->bumpmap_vars[bumpmap_index] = Value;
+        state->changed.group |= NINE_STATE_PS_CONST;
+    }
 
     state->changed.group |= NINE_STATE_FF_PSSTAGES;
     state->ff.changed.tex_stage[Stage][Type / 32] |= 1 << (Type % 32);
@@ -2411,12 +2709,11 @@ NineDevice9_SetSamplerState( struct NineDevice9 *This,
     if (Sampler >= D3DDMAPSAMPLER)
         Sampler = Sampler - D3DDMAPSAMPLER + NINE_MAX_SAMPLERS_PS;
 
-    state->samp[Sampler][Type] = Value;
-    state->changed.group |= NINE_STATE_SAMPLER;
-    state->changed.sampler[Sampler] |= 1 << Type;
-
-    if (Type == D3DSAMP_SRGBTEXTURE)
-        state->changed.srgb = TRUE;
+    if (state->samp[Sampler][Type] != Value || unlikely(This->is_recording)) {
+        state->samp[Sampler][Type] = Value;
+        state->changed.group |= NINE_STATE_SAMPLER;
+        state->changed.sampler[Sampler] |= 1 << Type;
+    }
 
     return D3D_OK;
 }
@@ -2548,7 +2845,7 @@ NineDevice9_GetNPatchMode( struct NineDevice9 *This )
     STUB(0);
 }
 
-static INLINE void
+static inline void
 init_draw_info(struct pipe_draw_info *info,
                struct NineDevice9 *dev, D3DPRIMITIVETYPE type, UINT count)
 {
@@ -2575,7 +2872,7 @@ NineDevice9_DrawPrimitive( struct NineDevice9 *This,
     DBG("iface %p, PrimitiveType %u, StartVertex %u, PrimitiveCount %u\n",
         This, PrimitiveType, StartVertex, PrimitiveCount);
 
-    nine_update_state(This, ~0);
+    nine_update_state(This);
 
     init_draw_info(&info, This, PrimitiveType, PrimitiveCount);
     info.indexed = FALSE;
@@ -2608,7 +2905,7 @@ NineDevice9_DrawIndexedPrimitive( struct NineDevice9 *This,
     user_assert(This->state.idxbuf, D3DERR_INVALIDCALL);
     user_assert(This->state.vdecl, D3DERR_INVALIDCALL);
 
-    nine_update_state(This, ~0);
+    nine_update_state(This);
 
     init_draw_info(&info, This, PrimitiveType, PrimitiveCount);
     info.indexed = TRUE;
@@ -2640,7 +2937,7 @@ NineDevice9_DrawPrimitiveUP( struct NineDevice9 *This,
     user_assert(pVertexStreamZeroData && VertexStreamZeroStride,
                 D3DERR_INVALIDCALL);
 
-    nine_update_state(This, ~0);
+    nine_update_state(This);
 
     init_draw_info(&info, This, PrimitiveType, PrimitiveCount);
     info.indexed = FALSE;
@@ -2654,13 +2951,16 @@ NineDevice9_DrawPrimitiveUP( struct NineDevice9 *This,
     vtxbuf.buffer = NULL;
     vtxbuf.user_buffer = pVertexStreamZeroData;
 
-    if (!This->driver_caps.user_vbufs)
-        u_upload_data(This->upload,
+    if (!This->driver_caps.user_vbufs) {
+        u_upload_data(This->vertex_uploader,
                       0,
                       (info.max_index + 1) * VertexStreamZeroStride, /* XXX */
                       vtxbuf.user_buffer,
                       &vtxbuf.buffer_offset,
                       &vtxbuf.buffer);
+        u_upload_unmap(This->vertex_uploader);
+        vtxbuf.user_buffer = NULL;
+    }
 
     This->pipe->set_vertex_buffers(This->pipe, 0, 1, &vtxbuf);
 
@@ -2702,7 +3002,7 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 *This,
     user_assert(IndexDataFormat == D3DFMT_INDEX16 ||
                 IndexDataFormat == D3DFMT_INDEX32, D3DERR_INVALIDCALL);
 
-    nine_update_state(This, ~0);
+    nine_update_state(This);
 
     init_draw_info(&info, This, PrimitiveType, PrimitiveCount);
     info.indexed = TRUE;
@@ -2723,23 +3023,28 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 *This,
 
     if (!This->driver_caps.user_vbufs) {
         const unsigned base = info.min_index * VertexStreamZeroStride;
-        u_upload_data(This->upload,
+        u_upload_data(This->vertex_uploader,
                       base,
                       (info.max_index -
                        info.min_index + 1) * VertexStreamZeroStride, /* XXX */
                       (const uint8_t *)vbuf.user_buffer + base,
                       &vbuf.buffer_offset,
                       &vbuf.buffer);
+        u_upload_unmap(This->vertex_uploader);
         /* Won't be used: */
         vbuf.buffer_offset -= base;
+        vbuf.user_buffer = NULL;
     }
-    if (!This->driver_caps.user_ibufs)
-        u_upload_data(This->upload,
+    if (!This->driver_caps.user_ibufs) {
+        u_upload_data(This->index_uploader,
                       0,
                       info.count * ibuf.index_size,
                       ibuf.user_buffer,
                       &ibuf.offset,
                       &ibuf.buffer);
+        u_upload_unmap(This->index_uploader);
+        ibuf.user_buffer = NULL;
+    }
 
     This->pipe->set_vertex_buffers(This->pipe, 0, 1, &vbuf);
     This->pipe->set_index_buffer(This->pipe, &ibuf);
@@ -2786,7 +3091,7 @@ NineDevice9_ProcessVertices( struct NineDevice9 *This,
     if (!screen->get_param(screen, PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS))
         STUB(D3DERR_INVALIDCALL);
 
-    nine_update_state(This, ~0);
+    nine_update_state(This);
 
     /* TODO: Create shader with stream output. */
     STUB(D3DERR_INVALIDCALL);
@@ -2956,6 +3261,13 @@ NineDevice9_SetVertexShader( struct NineDevice9 *This,
 
     DBG("This=%p pShader=%p\n", This, pShader);
 
+    if (!This->is_recording && state->vs == (struct NineVertexShader9*)pShader)
+      return D3D_OK;
+
+    /* ff -> non-ff: commit back non-ff constants */
+    if (!state->vs && pShader)
+        state->commit |= NINE_STATE_COMMIT_CONST_VS;
+
     nine_bind(&state->vs, pShader);
 
     state->changed.group |= NINE_STATE_VS;
@@ -2990,6 +3302,12 @@ NineDevice9_SetVertexShaderConstantF( struct NineDevice9 *This,
        return D3D_OK;
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
+    if (!This->is_recording) {
+        if (!memcmp(&state->vs_const_f[StartRegister * 4], pConstantData,
+                    Vector4fCount * 4 * sizeof(state->vs_const_f[0])))
+            return D3D_OK;
+    }
+
     memcpy(&state->vs_const_f[StartRegister * 4],
            pConstantData,
            Vector4fCount * 4 * sizeof(state->vs_const_f[0]));
@@ -3039,6 +3357,11 @@ NineDevice9_SetVertexShaderConstantI( struct NineDevice9 *This,
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
     if (This->driver_caps.vs_integer) {
+        if (!This->is_recording) {
+            if (!memcmp(&state->vs_const_i[StartRegister][0], pConstantData,
+                        Vector4iCount * sizeof(state->vs_const_i[0])))
+                return D3D_OK;
+        }
         memcpy(&state->vs_const_i[StartRegister][0],
                pConstantData,
                Vector4iCount * sizeof(state->vs_const_i[0]));
@@ -3103,6 +3426,16 @@ NineDevice9_SetVertexShaderConstantB( struct NineDevice9 *This,
     user_assert(StartRegister + BoolCount <= NINE_MAX_CONST_B, D3DERR_INVALIDCALL);
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
+    if (!This->is_recording) {
+        bool noChange = true;
+        for (i = 0; i < BoolCount; i++) {
+            if (!!state->vs_const_b[StartRegister + i] != !!pConstantData[i])
+              noChange = false;
+        }
+        if (noChange)
+            return D3D_OK;
+    }
+
     for (i = 0; i < BoolCount; i++)
         state->vs_const_b[StartRegister + i] = pConstantData[i] ? bool_true : 0;
 
@@ -3279,13 +3612,28 @@ NineDevice9_SetPixelShader( struct NineDevice9 *This,
                             IDirect3DPixelShader9 *pShader )
 {
     struct nine_state *state = This->update;
+    unsigned old_mask = state->ps ? state->ps->rt_mask : 1;
+    unsigned mask;
 
     DBG("This=%p pShader=%p\n", This, pShader);
 
+    if (!This->is_recording && state->ps == (struct NinePixelShader9*)pShader)
+      return D3D_OK;
+
+    /* ff -> non-ff: commit back non-ff constants */
+    if (!state->ps && pShader)
+        state->commit |= NINE_STATE_COMMIT_CONST_PS;
+
     nine_bind(&state->ps, pShader);
 
     state->changed.group |= NINE_STATE_PS;
 
+    mask = state->ps ? state->ps->rt_mask : 1;
+    /* We need to update cbufs if the pixel shader would
+     * write to different render targets */
+    if (mask != old_mask)
+        state->changed.group |= NINE_STATE_FB;
+
     return D3D_OK;
 }
 
@@ -3309,13 +3657,19 @@ NineDevice9_SetPixelShaderConstantF( struct NineDevice9 *This,
     DBG("This=%p StartRegister=%u pConstantData=%p Vector4fCount=%u\n",
         This, StartRegister, pConstantData, Vector4fCount);
 
-    user_assert(StartRegister                  < NINE_MAX_CONST_F, D3DERR_INVALIDCALL);
-    user_assert(StartRegister + Vector4fCount <= NINE_MAX_CONST_F, D3DERR_INVALIDCALL);
+    user_assert(StartRegister                  < NINE_MAX_CONST_F_PS3, D3DERR_INVALIDCALL);
+    user_assert(StartRegister + Vector4fCount <= NINE_MAX_CONST_F_PS3, D3DERR_INVALIDCALL);
 
     if (!Vector4fCount)
        return D3D_OK;
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
+    if (!This->is_recording) {
+        if (!memcmp(&state->ps_const_f[StartRegister * 4], pConstantData,
+                    Vector4fCount * 4 * sizeof(state->ps_const_f[0])))
+            return D3D_OK;
+    }
+
     memcpy(&state->ps_const_f[StartRegister * 4],
            pConstantData,
            Vector4fCount * 4 * sizeof(state->ps_const_f[0]));
@@ -3337,8 +3691,8 @@ NineDevice9_GetPixelShaderConstantF( struct NineDevice9 *This,
 {
     const struct nine_state *state = &This->state;
 
-    user_assert(StartRegister                  < NINE_MAX_CONST_F, D3DERR_INVALIDCALL);
-    user_assert(StartRegister + Vector4fCount <= NINE_MAX_CONST_F, D3DERR_INVALIDCALL);
+    user_assert(StartRegister                  < NINE_MAX_CONST_F_PS3, D3DERR_INVALIDCALL);
+    user_assert(StartRegister + Vector4fCount <= NINE_MAX_CONST_F_PS3, D3DERR_INVALIDCALL);
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
     memcpy(pConstantData,
@@ -3365,6 +3719,11 @@ NineDevice9_SetPixelShaderConstantI( struct NineDevice9 *This,
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
     if (This->driver_caps.ps_integer) {
+        if (!This->is_recording) {
+            if (!memcmp(&state->ps_const_i[StartRegister][0], pConstantData,
+                        Vector4iCount * sizeof(state->ps_const_i[0])))
+                return D3D_OK;
+        }
         memcpy(&state->ps_const_i[StartRegister][0],
                pConstantData,
                Vector4iCount * sizeof(state->ps_const_i[0]));
@@ -3428,6 +3787,16 @@ NineDevice9_SetPixelShaderConstantB( struct NineDevice9 *This,
     user_assert(StartRegister + BoolCount <= NINE_MAX_CONST_B, D3DERR_INVALIDCALL);
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
+    if (!This->is_recording) {
+        bool noChange = true;
+        for (i = 0; i < BoolCount; i++) {
+            if (!!state->ps_const_b[StartRegister + i] != !!pConstantData[i])
+              noChange = false;
+        }
+        if (noChange)
+            return D3D_OK;
+    }
+
     for (i = 0; i < BoolCount; i++)
         state->ps_const_b[StartRegister + i] = pConstantData[i] ? bool_true : 0;