st/nine: Partial software vertex processing support
author Axel Davy <axel.davy@ens.fr>
Sat, 17 Sep 2016 12:16:41 +0000 (14:16 +0200)
committer Axel Davy <axel.davy@ens.fr>
Mon, 10 Oct 2016 21:43:49 +0000 (23:43 +0200)
Software Vertex Processing allows:
. Fewer limitations for shaders (more loops, etc.)
. Fewer limitations for ff (more enabled lights, 255
matrices for VertexBlend)

In particular shaders can get more constants.
This patch implements support for this (not using software
rendering, but hardware rendering, as llvmpipe and dx10+ hw
have the same limits...)

This is considered a second class path. Even apps asking for
"Mixed Vertex processing" (i.e. the ability to switch to swvp
on demand) do not use the feature much. Some just initialize
more constants than the normal limit at the start of the
application, but never use more than the normal limit.
When the apps do not need the software vertex processing
features, they do not seem to turn it on. This means it is
ok if that path is slow.
Thus no care has been taken to optimize this path.

Signed-off-by: Axel Davy <axel.davy@ens.fr>
src/gallium/state_trackers/nine/device9.c
src/gallium/state_trackers/nine/device9.h
src/gallium/state_trackers/nine/nine_shader.c
src/gallium/state_trackers/nine/nine_state.c
src/gallium/state_trackers/nine/nine_state.h
src/gallium/state_trackers/nine/stateblock9.c
src/gallium/state_trackers/nine/vertexshader9.c
src/gallium/state_trackers/nine/vertexshader9.h

index dca75c53e6e83518a47a91151d61a71cf7c5eeca..f1354904344204315c1535716b031a21490e5a80 100644 (file)
@@ -168,12 +168,31 @@ NineDevice9_ctor( struct NineDevice9 *This,
     if (This->params.BehaviorFlags & D3DCREATE_SOFTWARE_VERTEXPROCESSING) {
         DBG("Application asked full Software Vertex Processing.\n");
         This->swvp = true;
+        This->may_swvp = true;
     } else
         This->swvp = false;
-    if (This->params.BehaviorFlags & D3DCREATE_MIXED_VERTEXPROCESSING)
+    if (This->params.BehaviorFlags & D3DCREATE_MIXED_VERTEXPROCESSING) {
         DBG("Application asked mixed Software Vertex Processing.\n");
+        This->may_swvp = true;
+    }
     /* TODO: check if swvp is resetted by device Resets */
 
+    if (This->may_swvp &&
+        (This->screen->get_shader_param(This->screen, PIPE_SHADER_VERTEX,
+                                        PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE)
+                                     < (NINE_MAX_CONST_F_SWVP/2) * sizeof(float[4]) ||
+         This->screen->get_shader_param(This->screen, PIPE_SHADER_VERTEX,
+                                        PIPE_SHADER_CAP_MAX_CONST_BUFFERS) < 5)) {
+        /* Note: We just go on, some apps never use the abilities of
+         * swvp, and just set more constants than allowed at init.
+         * Only cards we support that are affected are the r500 */
+        WARN("Card unable to handle Software Vertex Processing. Game may fail\n");
+    }
+
+    /* When may_swvp, SetConstant* limits are different */
+    if (This->may_swvp)
+        This->caps.MaxVertexShaderConst = NINE_MAX_CONST_F_SWVP;
+
     This->pipe = This->screen->context_create(This->screen, NULL, 0);
     if (!This->pipe) { return E_OUTOFMEMORY; } /* guess */
 
@@ -322,12 +341,22 @@ NineDevice9_ctor( struct NineDevice9 *This,
         This->vs_const_size = max_const_vs * sizeof(float[4]);
         This->ps_const_size = max_const_ps * sizeof(float[4]);
         /* Include space for I,B constants for user constbuf. */
+        if (This->may_swvp) {
+            This->state.vs_const_f_swvp = CALLOC(NINE_MAX_CONST_F_SWVP * sizeof(float[4]),1);
+            if (!This->state.vs_const_f_swvp)
+                return E_OUTOFMEMORY;
+            This->state.vs_lconstf_temp = CALLOC(NINE_MAX_CONST_F_SWVP * sizeof(float[4]),1);
+            This->state.vs_const_i = CALLOC(NINE_MAX_CONST_I_SWVP * sizeof(int[4]), 1);
+            This->state.vs_const_b = CALLOC(NINE_MAX_CONST_B_SWVP * sizeof(BOOL), 1);
+        } else {
+            This->state.vs_const_f_swvp = NULL;
+            This->state.vs_lconstf_temp = CALLOC(This->vs_const_size,1);
+            This->state.vs_const_i = CALLOC(NINE_MAX_CONST_I * sizeof(int[4]), 1);
+            This->state.vs_const_b = CALLOC(NINE_MAX_CONST_B * sizeof(BOOL), 1);
+        }
         This->state.vs_const_f = CALLOC(This->vs_const_size, 1);
         This->state.ps_const_f = CALLOC(This->ps_const_size, 1);
-        This->state.vs_lconstf_temp = CALLOC(This->vs_const_size,1);
         This->state.ps_lconstf_temp = CALLOC(This->ps_const_size,1);
-        This->state.vs_const_i = CALLOC(NINE_MAX_CONST_I * sizeof(int[4]), 1);
-        This->state.vs_const_b = CALLOC(NINE_MAX_CONST_B * sizeof(BOOL), 1);
         if (!This->state.vs_const_f || !This->state.ps_const_f ||
             !This->state.vs_lconstf_temp || !This->state.ps_lconstf_temp ||
             !This->state.vs_const_i || !This->state.vs_const_b)
@@ -464,6 +493,7 @@ NineDevice9_dtor( struct NineDevice9 *This )
     FREE(This->state.ps_lconstf_temp);
     FREE(This->state.vs_const_i);
     FREE(This->state.vs_const_b);
+    FREE(This->state.vs_const_f_swvp);
 
     if (This->swapchains) {
         for (i = 0; i < This->nswapchains; ++i)
@@ -2490,11 +2520,11 @@ NineDevice9_CreateStateBlock( struct NineDevice9 *This,
        /* TODO: texture/sampler state */
        memcpy(dst->changed.rs,
               nine_render_states_vertex, sizeof(dst->changed.rs));
-       nine_ranges_insert(&dst->changed.vs_const_f, 0, This->max_vs_const_f,
+       nine_ranges_insert(&dst->changed.vs_const_f, 0, This->may_swvp ? NINE_MAX_CONST_F_SWVP : This->max_vs_const_f,
                           &This->range_pool);
-       nine_ranges_insert(&dst->changed.vs_const_i, 0, NINE_MAX_CONST_I,
+       nine_ranges_insert(&dst->changed.vs_const_i, 0, This->may_swvp ? NINE_MAX_CONST_I_SWVP : NINE_MAX_CONST_I,
                           &This->range_pool);
-       nine_ranges_insert(&dst->changed.vs_const_b, 0, NINE_MAX_CONST_B,
+       nine_ranges_insert(&dst->changed.vs_const_b, 0, This->may_swvp ? NINE_MAX_CONST_B_SWVP : NINE_MAX_CONST_B,
                           &This->range_pool);
        for (s = 0; s < NINE_MAX_SAMPLERS; ++s)
            dst->changed.sampler[s] |= 1 << D3DSAMP_DMAPOFFSET;
@@ -2890,6 +2920,7 @@ NineDevice9_SetSoftwareVertexProcessing( struct NineDevice9 *This,
 {
     if (This->params.BehaviorFlags & D3DCREATE_MIXED_VERTEXPROCESSING) {
         This->swvp = bSoftware;
+        This->state.changed.group |= NINE_STATE_SWVP;
         return D3D_OK;
     } else
         return D3DERR_INVALIDCALL; /* msdn. TODO: check in practice */
@@ -3376,6 +3407,7 @@ NineDevice9_SetVertexShaderConstantF( struct NineDevice9 *This,
                                       UINT Vector4fCount )
 {
     struct nine_state *state = This->update;
+    float *vs_const_f = This->may_swvp ? state->vs_const_f_swvp : state->vs_const_f;
 
     DBG("This=%p StartRegister=%u pConstantData=%p Vector4fCount=%u\n",
         This, StartRegister, pConstantData, Vector4fCount);
@@ -3388,12 +3420,12 @@ NineDevice9_SetVertexShaderConstantF( struct NineDevice9 *This,
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
     if (!This->is_recording) {
-        if (!memcmp(&state->vs_const_f[StartRegister * 4], pConstantData,
+        if (!memcmp(&vs_const_f[StartRegister * 4], pConstantData,
                     Vector4fCount * 4 * sizeof(state->vs_const_f[0])))
             return D3D_OK;
     }
 
-    memcpy(&state->vs_const_f[StartRegister * 4],
+    memcpy(&vs_const_f[StartRegister * 4],
            pConstantData,
            Vector4fCount * 4 * sizeof(state->vs_const_f[0]));
 
@@ -3401,6 +3433,14 @@ NineDevice9_SetVertexShaderConstantF( struct NineDevice9 *This,
                        StartRegister, StartRegister + Vector4fCount,
                        &This->range_pool);
 
+    if (This->may_swvp) {
+        Vector4fCount = MIN2(StartRegister + Vector4fCount, NINE_MAX_CONST_F) - StartRegister;
+        if (StartRegister < NINE_MAX_CONST_F)
+            memcpy(&state->vs_const_f[StartRegister * 4],
+                   pConstantData,
+                   Vector4fCount * 4 * sizeof(state->vs_const_f[0]));
+    }
+
     state->changed.group |= NINE_STATE_VS_CONST;
 
     return D3D_OK;
@@ -3413,13 +3453,14 @@ NineDevice9_GetVertexShaderConstantF( struct NineDevice9 *This,
                                       UINT Vector4fCount )
 {
     const struct nine_state *state = &This->state;
+    float *vs_const_f = This->may_swvp ? state->vs_const_f_swvp : state->vs_const_f;
 
     user_assert(StartRegister                  < This->caps.MaxVertexShaderConst, D3DERR_INVALIDCALL);
     user_assert(StartRegister + Vector4fCount <= This->caps.MaxVertexShaderConst, D3DERR_INVALIDCALL);
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
     memcpy(pConstantData,
-           &state->vs_const_f[StartRegister * 4],
+           &vs_const_f[StartRegister * 4],
            Vector4fCount * 4 * sizeof(state->vs_const_f[0]));
 
     return D3D_OK;
@@ -3437,8 +3478,10 @@ NineDevice9_SetVertexShaderConstantI( struct NineDevice9 *This,
     DBG("This=%p StartRegister=%u pConstantData=%p Vector4iCount=%u\n",
         This, StartRegister, pConstantData, Vector4iCount);
 
-    user_assert(StartRegister                  < NINE_MAX_CONST_I, D3DERR_INVALIDCALL);
-    user_assert(StartRegister + Vector4iCount <= NINE_MAX_CONST_I, D3DERR_INVALIDCALL);
+    user_assert(StartRegister < (This->may_swvp ? NINE_MAX_CONST_I_SWVP : NINE_MAX_CONST_I),
+                D3DERR_INVALIDCALL);
+    user_assert(StartRegister + Vector4iCount <= (This->may_swvp ? NINE_MAX_CONST_I_SWVP : NINE_MAX_CONST_I),
+                D3DERR_INVALIDCALL);
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
     if (This->driver_caps.vs_integer) {
@@ -3476,8 +3519,10 @@ NineDevice9_GetVertexShaderConstantI( struct NineDevice9 *This,
     const struct nine_state *state = &This->state;
     int i;
 
-    user_assert(StartRegister                  < NINE_MAX_CONST_I, D3DERR_INVALIDCALL);
-    user_assert(StartRegister + Vector4iCount <= NINE_MAX_CONST_I, D3DERR_INVALIDCALL);
+    user_assert(StartRegister < (This->may_swvp ? NINE_MAX_CONST_I_SWVP : NINE_MAX_CONST_I),
+                D3DERR_INVALIDCALL);
+    user_assert(StartRegister + Vector4iCount <= (This->may_swvp ? NINE_MAX_CONST_I_SWVP : NINE_MAX_CONST_I),
+                D3DERR_INVALIDCALL);
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
     if (This->driver_caps.vs_integer) {
@@ -3509,8 +3554,10 @@ NineDevice9_SetVertexShaderConstantB( struct NineDevice9 *This,
     DBG("This=%p StartRegister=%u pConstantData=%p BoolCount=%u\n",
         This, StartRegister, pConstantData, BoolCount);
 
-    user_assert(StartRegister              < NINE_MAX_CONST_B, D3DERR_INVALIDCALL);
-    user_assert(StartRegister + BoolCount <= NINE_MAX_CONST_B, D3DERR_INVALIDCALL);
+    user_assert(StartRegister < (This->may_swvp ? NINE_MAX_CONST_B_SWVP : NINE_MAX_CONST_B),
+                D3DERR_INVALIDCALL);
+    user_assert(StartRegister + BoolCount <= (This->may_swvp ? NINE_MAX_CONST_B_SWVP : NINE_MAX_CONST_B),
+                D3DERR_INVALIDCALL);
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
     if (!This->is_recording) {
@@ -3543,8 +3590,10 @@ NineDevice9_GetVertexShaderConstantB( struct NineDevice9 *This,
     const struct nine_state *state = &This->state;
     int i;
 
-    user_assert(StartRegister              < NINE_MAX_CONST_B, D3DERR_INVALIDCALL);
-    user_assert(StartRegister + BoolCount <= NINE_MAX_CONST_B, D3DERR_INVALIDCALL);
+    user_assert(StartRegister < (This->may_swvp ? NINE_MAX_CONST_B_SWVP : NINE_MAX_CONST_B),
+                D3DERR_INVALIDCALL);
+    user_assert(StartRegister + BoolCount <= (This->may_swvp ? NINE_MAX_CONST_B_SWVP : NINE_MAX_CONST_B),
+                D3DERR_INVALIDCALL);
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
     for (i = 0; i < BoolCount; i++)
index f2fd164cc12f35e81e5c3682f878b60dae89caa6..b6aa5e065316dd9fb241dbadf3a90796a9092282 100644 (file)
@@ -48,6 +48,7 @@ struct NineDevice9
 {
     struct NineUnknown base;
     boolean ex;
+    boolean may_swvp;
 
     /* G3D context */
     struct pipe_screen *screen;
index 2d4e323a4ae9ae2b1610bfbaa6b2057dd6804d76..2b573e6879e78eef92a55bb0fae6e135dacc9055 100644 (file)
@@ -3501,7 +3501,10 @@ nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info)
     tx->parse++; /* for byte_size */
 
     if (tx->failure) {
-        ERR("Encountered buggy shader\n");
+        /* For VS shaders, we print the warning later,
+         * we first try with swvp. */
+        if (IS_PS)
+            ERR("Encountered buggy shader\n");
         ureg_destroy(tx->ureg);
         hr = D3DERR_INVALIDCALL;
         goto out;
index 2faca121fe6f950b9c6f1974890787bf50174142..024e639f92f9777b60de629573a9f977ab5874d2 100644 (file)
@@ -78,6 +78,143 @@ prepare_rasterizer(struct NineDevice9 *device)
     device->state.commit |= NINE_STATE_COMMIT_RASTERIZER;
 }
 
+static void
+prepare_vs_constants_userbuf_swvp(struct NineDevice9 *device)
+{
+    struct nine_state *state = &device->state;
+
+    if (state->changed.vs_const_f || state->changed.group & NINE_STATE_SWVP) {
+        struct pipe_constant_buffer cb;
+
+        cb.buffer = NULL;
+        cb.buffer_offset = 0;
+        cb.buffer_size = 4096 * sizeof(float[4]);
+        cb.user_buffer = state->vs_const_f_swvp;
+
+        if (state->vs->lconstf.ranges) {
+            const struct nine_lconstf *lconstf =  &device->state.vs->lconstf;
+            const struct nine_range *r = lconstf->ranges;
+            unsigned n = 0;
+            float *dst = device->state.vs_lconstf_temp;
+            float *src = (float *)cb.user_buffer;
+            memcpy(dst, src, cb.buffer_size);
+            while (r) {
+                unsigned p = r->bgn;
+                unsigned c = r->end - r->bgn;
+                memcpy(&dst[p * 4], &lconstf->data[n * 4], c * 4 * sizeof(float));
+                n += c;
+                r = r->next;
+            }
+            cb.user_buffer = dst;
+        }
+
+        state->pipe.cb0_swvp = cb;
+
+        cb.user_buffer = (char *)cb.user_buffer + 4096 * sizeof(float[4]);
+        state->pipe.cb1_swvp = cb;
+    }
+
+    if (state->changed.vs_const_i || state->changed.group & NINE_STATE_SWVP) {
+        struct pipe_constant_buffer cb;
+
+        cb.buffer = NULL;
+        cb.buffer_offset = 0;
+        cb.buffer_size = 2048 * sizeof(float[4]);
+        cb.user_buffer = state->vs_const_i;
+
+        state->pipe.cb2_swvp = cb;
+        state->changed.vs_const_i = 0;
+    }
+
+    if (state->changed.vs_const_b || state->changed.group & NINE_STATE_SWVP) {
+        struct pipe_constant_buffer cb;
+
+        cb.buffer = NULL;
+        cb.buffer_offset = 0;
+        cb.buffer_size = 512 * sizeof(float[4]);
+        cb.user_buffer = state->vs_const_b;
+
+        state->pipe.cb3_swvp = cb;
+        state->changed.vs_const_b = 0;
+    }
+
+    if (!device->driver_caps.user_cbufs) {
+        struct pipe_constant_buffer *cb = &(state->pipe.cb0_swvp);
+        u_upload_data(device->constbuf_uploader,
+                      0,
+                      cb->buffer_size,
+                      device->constbuf_alignment,
+                      cb->user_buffer,
+                      &(cb->buffer_offset),
+                      &(cb->buffer));
+        u_upload_unmap(device->constbuf_uploader);
+        cb->user_buffer = NULL;
+
+        cb = &(state->pipe.cb1_swvp);
+        u_upload_data(device->constbuf_uploader,
+                      0,
+                      cb->buffer_size,
+                      device->constbuf_alignment,
+                      cb->user_buffer,
+                      &(cb->buffer_offset),
+                      &(cb->buffer));
+        u_upload_unmap(device->constbuf_uploader);
+        cb->user_buffer = NULL;
+
+        cb = &(state->pipe.cb2_swvp);
+        u_upload_data(device->constbuf_uploader,
+                      0,
+                      cb->buffer_size,
+                      device->constbuf_alignment,
+                      cb->user_buffer,
+                      &(cb->buffer_offset),
+                      &(cb->buffer));
+        u_upload_unmap(device->constbuf_uploader);
+        cb->user_buffer = NULL;
+
+        cb = &(state->pipe.cb3_swvp);
+        u_upload_data(device->constbuf_uploader,
+                      0,
+                      cb->buffer_size,
+                      device->constbuf_alignment,
+                      cb->user_buffer,
+                      &(cb->buffer_offset),
+                      &(cb->buffer));
+        u_upload_unmap(device->constbuf_uploader);
+        cb->user_buffer = NULL;
+    }
+
+    if (device->state.changed.vs_const_f) {
+        struct nine_range *r = device->state.changed.vs_const_f;
+        struct nine_range *p = r;
+        while (p->next)
+            p = p->next;
+        nine_range_pool_put_chain(&device->range_pool, r, p);
+        device->state.changed.vs_const_f = NULL;
+    }
+
+    if (device->state.changed.vs_const_i) {
+        struct nine_range *r = device->state.changed.vs_const_i;
+        struct nine_range *p = r;
+        while (p->next)
+            p = p->next;
+        nine_range_pool_put_chain(&device->range_pool, r, p);
+        device->state.changed.vs_const_i = NULL;
+    }
+
+    if (device->state.changed.vs_const_b) {
+        struct nine_range *r = device->state.changed.vs_const_b;
+        struct nine_range *p = r;
+        while (p->next)
+            p = p->next;
+        nine_range_pool_put_chain(&device->range_pool, r, p);
+        device->state.changed.vs_const_b = NULL;
+    }
+
+    state->changed.group &= ~NINE_STATE_VS_CONST;
+    state->commit |= NINE_STATE_COMMIT_CONST_VS;
+}
+
 static void
 prepare_vs_constants_userbuf(struct NineDevice9 *device)
 {
@@ -88,21 +225,27 @@ prepare_vs_constants_userbuf(struct NineDevice9 *device)
     cb.buffer_size = device->state.vs->const_used_size;
     cb.user_buffer = device->state.vs_const_f;
 
-    if (!cb.buffer_size)
+    if (device->swvp) {
+        prepare_vs_constants_userbuf_swvp(device);
         return;
+    }
 
-    if (state->changed.vs_const_i) {
+    if (state->changed.vs_const_i || state->changed.group & NINE_STATE_SWVP) {
         int *idst = (int *)&state->vs_const_f[4 * device->max_vs_const_f];
         memcpy(idst, state->vs_const_i, NINE_MAX_CONST_I * sizeof(int[4]));
         state->changed.vs_const_i = 0;
     }
-    if (state->changed.vs_const_b) {
+
+    if (state->changed.vs_const_b || state->changed.group & NINE_STATE_SWVP) {
         int *idst = (int *)&state->vs_const_f[4 * device->max_vs_const_f];
         uint32_t *bdst = (uint32_t *)&idst[4 * NINE_MAX_CONST_I];
         memcpy(bdst, state->vs_const_b, NINE_MAX_CONST_B * sizeof(BOOL));
         state->changed.vs_const_b = 0;
     }
 
+    if (!cb.buffer_size)
+        return;
+
     if (device->state.vs->lconstf.ranges) {
         /* TODO: Can we make it so that we don't have to copy everything ? */
         const struct nine_lconstf *lconstf =  &device->state.vs->lconstf;
@@ -251,7 +394,7 @@ prepare_vs(struct NineDevice9 *device, uint8_t shader_changed)
     int has_key_changed = 0;
 
     if (likely(state->programmable_vs))
-        has_key_changed = NineVertexShader9_UpdateKey(vs, state);
+        has_key_changed = NineVertexShader9_UpdateKey(vs, device);
 
     if (!shader_changed && !has_key_changed)
         return 0;
@@ -740,8 +883,16 @@ commit_vs_constants(struct NineDevice9 *device)
 
     if (unlikely(!device->state.programmable_vs))
         pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &device->state.pipe.cb_vs_ff);
-    else
-        pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &device->state.pipe.cb_vs);
+    else {
+        if (device->swvp) {
+            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &device->state.pipe.cb0_swvp);
+            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 1, &device->state.pipe.cb1_swvp);
+            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 2, &device->state.pipe.cb2_swvp);
+            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 3, &device->state.pipe.cb3_swvp);
+        } else {
+            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &device->state.pipe.cb_vs);
+        }
+    }
 }
 
 static inline void
@@ -777,7 +928,8 @@ commit_ps(struct NineDevice9 *device)
    (NINE_STATE_VS |         \
     NINE_STATE_TEXTURE |    \
     NINE_STATE_FOG_SHADER | \
-    NINE_STATE_POINTSIZE_SHADER)
+    NINE_STATE_POINTSIZE_SHADER | \
+    NINE_STATE_SWVP)
 
 #define NINE_STATE_SHADER_CHANGE_PS \
    (NINE_STATE_PS |         \
@@ -886,14 +1038,14 @@ nine_update_state(struct NineDevice9 *device)
             commit_index_buffer(device);
     }
 
-    if (likely(group & (NINE_STATE_FREQUENT | NINE_STATE_VS | NINE_STATE_PS))) {
+    if (likely(group & (NINE_STATE_FREQUENT | NINE_STATE_VS | NINE_STATE_PS | NINE_STATE_SWVP))) {
         if (group & NINE_STATE_MULTISAMPLE)
             group |= check_multisample(device);
         if (group & NINE_STATE_RASTERIZER)
             prepare_rasterizer(device);
         if (group & (NINE_STATE_TEXTURE | NINE_STATE_SAMPLER))
             update_textures_and_samplers(device);
-        if ((group & (NINE_STATE_VS_CONST | NINE_STATE_VS)) && state->programmable_vs)
+        if ((group & (NINE_STATE_VS_CONST | NINE_STATE_VS | NINE_STATE_SWVP)) && state->programmable_vs)
             prepare_vs_constants_userbuf(device);
         if ((group & (NINE_STATE_PS_CONST | NINE_STATE_PS)) && state->ps)
             prepare_ps_constants_userbuf(device);
index 8c9483231e05a0447e5b5512ee75168d74c41483..2aa424d46a7453261cb3b9dd355c16237bcca489 100644 (file)
@@ -84,8 +84,9 @@
 #define NINE_STATE_PS1X_SHADER (1 << 26)
 #define NINE_STATE_POINTSIZE_SHADER (1 << 27)
 #define NINE_STATE_MULTISAMPLE (1 << 28)
-#define NINE_STATE_ALL          0x1fffffff
-#define NINE_STATE_UNHANDLED   (1 << 29)
+#define NINE_STATE_SWVP        (1 << 29)
+#define NINE_STATE_ALL          0x3fffffff
+#define NINE_STATE_UNHANDLED   (1 << 30)
 
 #define NINE_STATE_COMMIT_DSA  (1 << 0)
 #define NINE_STATE_COMMIT_RASTERIZER (1 << 1)
 #define NINE_MAX_CONST_F   256
 #define NINE_MAX_CONST_I   16
 #define NINE_MAX_CONST_B   16
+#define NINE_MAX_CONST_F_SWVP   8192
+#define NINE_MAX_CONST_I_SWVP   2048
+#define NINE_MAX_CONST_B_SWVP   2048
 #define NINE_MAX_CONST_ALL 276 /* B consts count only 1/4 th */
 
 #define NINE_CONST_I_BASE(nconstf) \
@@ -157,6 +161,7 @@ struct nine_state
      */
     struct NineVertexShader9 *vs;
     float *vs_const_f;
+    float *vs_const_f_swvp;
     int   *vs_const_i;
     BOOL  *vs_const_b;
     float *vs_lconstf_temp;
@@ -229,6 +234,10 @@ struct nine_state
         struct pipe_rasterizer_state rast;
         struct pipe_blend_state blend;
         struct pipe_constant_buffer cb_vs;
+        struct pipe_constant_buffer cb0_swvp;
+        struct pipe_constant_buffer cb1_swvp;
+        struct pipe_constant_buffer cb2_swvp;
+        struct pipe_constant_buffer cb3_swvp;
         struct pipe_constant_buffer cb_ps;
         struct pipe_constant_buffer cb_vs_ff;
         struct pipe_constant_buffer cb_ps_ff;
index 19c3766b3c680221838e4cacebd5280c4c97dde1..102213e417e86d982bb83c22956aebaeda13138a 100644 (file)
@@ -30,8 +30,9 @@
 
 /* XXX TODO: handling of lights is broken */
 
-#define VS_CONST_I_SIZE (NINE_MAX_CONST_I * sizeof(int[4]))
-#define VS_CONST_B_SIZE (NINE_MAX_CONST_B * sizeof(BOOL))
+#define VS_CONST_I_SIZE(device) (device->may_swvp ? (NINE_MAX_CONST_I_SWVP * sizeof(int[4])) : (NINE_MAX_CONST_I * sizeof(int[4])))
+#define VS_CONST_B_SIZE(device) (device->may_swvp ? (NINE_MAX_CONST_B_SWVP * sizeof(BOOL)) : (NINE_MAX_CONST_B * sizeof(BOOL)))
+#define VS_CONST_F_SWVP_SIZE    (NINE_MAX_CONST_F_SWVP * sizeof(float[4]))
 
 HRESULT
 NineStateBlock9_ctor( struct NineStateBlock9 *This,
@@ -49,12 +50,19 @@ NineStateBlock9_ctor( struct NineStateBlock9 *This,
 
     This->state.vs_const_f = MALLOC(This->base.device->vs_const_size);
     This->state.ps_const_f = MALLOC(This->base.device->ps_const_size);
-    This->state.vs_const_i = MALLOC(VS_CONST_I_SIZE);
-    This->state.vs_const_b = MALLOC(VS_CONST_B_SIZE);
+    This->state.vs_const_i = MALLOC(VS_CONST_I_SIZE(This->base.device));
+    This->state.vs_const_b = MALLOC(VS_CONST_B_SIZE(This->base.device));
     if (!This->state.vs_const_f || !This->state.ps_const_f ||
         !This->state.vs_const_i || !This->state.vs_const_b)
         return E_OUTOFMEMORY;
 
+    if (This->base.device->may_swvp) {
+        This->state.vs_const_f_swvp = MALLOC(VS_CONST_F_SWVP_SIZE);
+        if (!This->state.vs_const_f_swvp)
+            return E_OUTOFMEMORY;
+    } else
+        This->state.vs_const_f_swvp = NULL;
+
     return D3D_OK;
 }
 
@@ -71,6 +79,7 @@ NineStateBlock9_dtor( struct NineStateBlock9 *This )
     FREE(state->ps_const_f);
     FREE(state->vs_const_i);
     FREE(state->vs_const_b);
+    FREE(state->vs_const_f_swvp);
 
     FREE(state->ff.light);
 
@@ -101,7 +110,8 @@ NineStateBlock9_dtor( struct NineStateBlock9 *This )
  * TODO: compare ?
  */
 static void
-nine_state_copy_common(struct nine_state *dst,
+nine_state_copy_common(struct NineDevice9 *device,
+                       struct nine_state *dst,
                        struct nine_state *src,
                        struct nine_state *mask, /* aliases either src or dst */
                        const boolean apply,
@@ -130,13 +140,32 @@ nine_state_copy_common(struct nine_state *dst,
      */
     if (mask->changed.group & NINE_STATE_VS_CONST) {
         struct nine_range *r;
-        for (r = mask->changed.vs_const_f; r; r = r->next) {
-            memcpy(&dst->vs_const_f[r->bgn * 4],
-                   &src->vs_const_f[r->bgn * 4],
-                   (r->end - r->bgn) * 4 * sizeof(float));
-            if (apply)
-                nine_ranges_insert(&dst->changed.vs_const_f, r->bgn, r->end,
-                                   pool);
+        if (device->may_swvp) {
+            for (r = mask->changed.vs_const_f; r; r = r->next) {
+                int bgn = r->bgn;
+                int end = r->end;
+                memcpy(&dst->vs_const_f_swvp[bgn * 4],
+                       &src->vs_const_f_swvp[bgn * 4],
+                       (end - bgn) * 4 * sizeof(float));
+                if (apply)
+                    nine_ranges_insert(&dst->changed.vs_const_f, bgn, end,
+                                       pool);
+                if (bgn < device->max_vs_const_f) {
+                    end = MIN2(end, device->max_vs_const_f);
+                    memcpy(&dst->vs_const_f[bgn * 4],
+                           &src->vs_const_f[bgn * 4],
+                           (end - bgn) * 4 * sizeof(float));
+                }
+            }
+        } else {
+            for (r = mask->changed.vs_const_f; r; r = r->next) {
+                memcpy(&dst->vs_const_f[r->bgn * 4],
+                       &src->vs_const_f[r->bgn * 4],
+                       (r->end - r->bgn) * 4 * sizeof(float));
+                if (apply)
+                    nine_ranges_insert(&dst->changed.vs_const_f, r->bgn, r->end,
+                                       pool);
+            }
         }
         for (r = mask->changed.vs_const_i; r; r = r->next) {
             memcpy(&dst->vs_const_i[r->bgn * 4],
@@ -342,7 +371,8 @@ nine_state_copy_common(struct nine_state *dst,
 }
 
 static void
-nine_state_copy_common_all(struct nine_state *dst,
+nine_state_copy_common_all(struct NineDevice9 *device,
+                           struct nine_state *dst,
                            const struct nine_state *src,
                            struct nine_state *help,
                            const boolean apply,
@@ -369,12 +399,15 @@ nine_state_copy_common_all(struct nine_state *dst,
     if (1) {
         struct nine_range *r = help->changed.vs_const_f;
         memcpy(&dst->vs_const_f[0],
-               &src->vs_const_f[0], (r->end - r->bgn) * 4 * sizeof(float));
+               &src->vs_const_f[0], device->max_vs_const_f * 4 * sizeof(float));
+        if (device->may_swvp)
+            memcpy(dst->vs_const_f_swvp,
+                   src->vs_const_f_swvp, VS_CONST_F_SWVP_SIZE);
         if (apply)
             nine_ranges_insert(&dst->changed.vs_const_f, r->bgn, r->end, pool);
 
-        memcpy(dst->vs_const_i, src->vs_const_i, VS_CONST_I_SIZE);
-        memcpy(dst->vs_const_b, src->vs_const_b, VS_CONST_B_SIZE);
+        memcpy(dst->vs_const_i, src->vs_const_i, VS_CONST_I_SIZE(device));
+        memcpy(dst->vs_const_b, src->vs_const_b, VS_CONST_B_SIZE(device));
         if (apply) {
             r = help->changed.vs_const_i;
             nine_ranges_insert(&dst->changed.vs_const_i, r->bgn, r->end, pool);
@@ -491,17 +524,18 @@ nine_state_copy_common_all(struct nine_state *dst,
 HRESULT NINE_WINAPI
 NineStateBlock9_Capture( struct NineStateBlock9 *This )
 {
+    struct NineDevice9 *device = This->base.device;
     struct nine_state *dst = &This->state;
-    struct nine_state *src = &This->base.device->state;
-    const int MaxStreams = This->base.device->caps.MaxStreams;
+    struct nine_state *src = &device->state;
+    const int MaxStreams = device->caps.MaxStreams;
     unsigned s;
 
     DBG("This=%p\n", This);
 
     if (This->type == NINESBT_ALL)
-        nine_state_copy_common_all(dst, src, dst, FALSE, NULL, MaxStreams);
+        nine_state_copy_common_all(device, dst, src, dst, FALSE, NULL, MaxStreams);
     else
-        nine_state_copy_common(dst, src, dst, FALSE, NULL);
+        nine_state_copy_common(device, dst, src, dst, FALSE, NULL);
 
     if (dst->changed.group & NINE_STATE_VDECL)
         nine_bind(&dst->vdecl, src->vdecl);
@@ -521,18 +555,19 @@ NineStateBlock9_Capture( struct NineStateBlock9 *This )
 HRESULT NINE_WINAPI
 NineStateBlock9_Apply( struct NineStateBlock9 *This )
 {
-    struct nine_state *dst = &This->base.device->state;
+    struct NineDevice9 *device = This->base.device;
+    struct nine_state *dst = &device->state;
     struct nine_state *src = &This->state;
-    struct nine_range_pool *pool = &This->base.device->range_pool;
-    const int MaxStreams = This->base.device->caps.MaxStreams;
+    struct nine_range_pool *pool = &device->range_pool;
+    const int MaxStreams = device->caps.MaxStreams;
     unsigned s;
 
     DBG("This=%p\n", This);
 
     if (This->type == NINESBT_ALL)
-        nine_state_copy_common_all(dst, src, src, TRUE, pool, MaxStreams);
+        nine_state_copy_common_all(device, dst, src, src, TRUE, pool, MaxStreams);
     else
-        nine_state_copy_common(dst, src, src, TRUE, pool);
+        nine_state_copy_common(device, dst, src, src, TRUE, pool);
 
     if ((src->changed.group & NINE_STATE_VDECL) && src->vdecl)
         NineDevice9_SetVertexDeclaration(This->base.device, (IDirect3DVertexDeclaration9 *)src->vdecl);
index bc09a413fab85bb6f2a5dc62e70c4f106465841c..92f8f6bb5813ea1f3353683d0207c061f510435f 100644 (file)
@@ -63,12 +63,21 @@ NineVertexShader9_ctor( struct NineVertexShader9 *This,
     info.fog_enable = 0;
     info.point_size_min = 0;
     info.point_size_max = 0;
-    info.swvp_on = false;
+    info.swvp_on = !!(device->params.BehaviorFlags & D3DCREATE_SOFTWARE_VERTEXPROCESSING);
 
     hr = nine_translate_shader(device, &info);
+    if (hr == D3DERR_INVALIDCALL &&
+        (device->params.BehaviorFlags & D3DCREATE_MIXED_VERTEXPROCESSING)) {
+        /* Retry with a swvp shader. It will require swvp to be on. */
+        info.swvp_on = true;
+        hr = nine_translate_shader(device, &info);
+    }
+    if (hr == D3DERR_INVALIDCALL)
+        ERR("Encountered buggy shader\n");
     if (FAILED(hr))
         return hr;
     This->byte_code.version = info.version;
+    This->swvp_only = info.swvp_on;
 
     This->byte_code.tokens = mem_dup(pFunction, info.byte_size);
     if (!This->byte_code.tokens)
@@ -77,7 +86,7 @@ NineVertexShader9_ctor( struct NineVertexShader9 *This,
 
     This->variant.cso = info.cso;
     This->last_cso = info.cso;
-    This->last_key = 0;
+    This->last_key = (uint32_t) (info.swvp_on << 9);
 
     This->const_used_size = info.const_used_size;
     This->lconstf = info.lconstf;
@@ -168,7 +177,7 @@ NineVertexShader9_GetVariant( struct NineVertexShader9 *This )
         info.fog_enable = device->state.rs[D3DRS_FOGENABLE];
         info.point_size_min = asfloat(device->state.rs[D3DRS_POINTSIZE_MIN]);
         info.point_size_max = asfloat(device->state.rs[D3DRS_POINTSIZE_MAX]);
-        info.swvp_on = false;
+        info.swvp_on = device->swvp;
 
         hr = nine_translate_shader(This->base.device, &info);
         if (FAILED(hr))
index 3c9db7990a003b5a592170df6eaedecdfea66a56..823c71aa85e781f91a9323bc05855d8d693c1877 100644 (file)
@@ -26,6 +26,7 @@
 #include "util/u_half.h"
 
 #include "iunknown.h"
+#include "device9.h"
 #include "nine_helpers.h"
 #include "nine_shader.h"
 #include "nine_state.h"
@@ -50,6 +51,7 @@ struct NineVertexShader9
 
     boolean position_t; /* if true, disable vport transform */
     boolean point_size; /* if true, set rasterizer.point_size_per_vertex to 1 */
+    boolean swvp_only;
 
     unsigned const_used_size; /* in bytes */
 
@@ -73,8 +75,9 @@ NineVertexShader9( void *data )
 
 static inline BOOL
 NineVertexShader9_UpdateKey( struct NineVertexShader9 *vs,
-                             struct nine_state *state )
+                             struct NineDevice9 *device )
 {
+    struct nine_state *state = &(device->state);
     uint8_t samplers_shadow;
     uint64_t key;
     BOOL res;
@@ -84,7 +87,8 @@ NineVertexShader9_UpdateKey( struct NineVertexShader9 *vs,
     key = samplers_shadow;
 
     if (vs->byte_code.version < 0x30)
-        key |= (uint32_t) (state->rs[D3DRS_FOGENABLE] << 8);
+        key |= (uint32_t) ((!!state->rs[D3DRS_FOGENABLE]) << 8);
+    key |= (uint32_t) (device->swvp << 9);
 
     /* We want to use a 64 bits key for performance.
      * Use compressed float16 values for the pointsize min/max in the key.