st/nine: Recompile optimized shaders based on b/i consts
author Axel Davy <davyaxel0@gmail.com>
Sun, 8 Apr 2018 15:06:00 +0000 (17:06 +0200)
committer Axel Davy <davyaxel0@gmail.com>
Tue, 30 Apr 2019 17:18:51 +0000 (19:18 +0200)
Boolean and integer constants are used in d3d9 for flow control.

Boolean constants are used for if/then/else and integer constants
for loops.
The compilers can generate better code if these values are known
at compilation.
So far I haven't encountered a game that changes the values of these
constants frequently (and when they do, they set them to the values used
for the previous draw call, and thus the changes get filtered out).

Thus it makes sense to inline these constants and recompile the shaders.
The commit sets a bound on the number of variants for a given shader
to avoid generating too many shaders.

One drawback is it means more shader compilations. It would probably
make sense to compile these shaders asynchronously or let the user
control the behaviour with an env var, but this is not done here.

The games I tested hit very few shader variants, and the performance
impact was negligible, but it could help for games with uber shaders.

Signed-off-by: Axel Davy <davyaxel0@gmail.com>
src/gallium/state_trackers/nine/device9.c
src/gallium/state_trackers/nine/nine_shader.c
src/gallium/state_trackers/nine/nine_shader.h
src/gallium/state_trackers/nine/nine_state.c
src/gallium/state_trackers/nine/nine_state.h
src/gallium/state_trackers/nine/pixelshader9.c
src/gallium/state_trackers/nine/pixelshader9.h
src/gallium/state_trackers/nine/vertexshader9.c
src/gallium/state_trackers/nine/vertexshader9.h

index 2cc3a9465fa90f11210e78d682d1d4f1894807b9..78ca58d3c270cbd2173b33cdcd39c48966cac111 100644 (file)
@@ -491,6 +491,10 @@ NineDevice9_ctor( struct NineDevice9 *This,
     This->driver_caps.ps_integer = pScreen->get_shader_param(pScreen, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_INTEGERS);
     This->driver_caps.offset_units_unscaled = GET_PCAP(POLYGON_OFFSET_UNITS_UNSCALED);
 
+    /* Code would be needed when integers are not available to correctly
+     * handle the conversion of integer constants */
+    This->context.inline_constants = This->driver_caps.vs_integer && This->driver_caps.ps_integer;
+
     nine_ff_init(This); /* initialize fixed function code */
 
     NineDevice9_SetDefaultState(This, FALSE);
index 57bf5efd5dcc0fe95e14a9db60e06ea23d30d428..3035b76d816d6f986d67d69fdf15d49bd737f017 100644 (file)
@@ -3486,6 +3486,9 @@ tx_ctor(struct shader_translator *tx, struct nine_shader_info *info)
     info->position_t = FALSE;
     info->point_size = FALSE;
 
+    memset(info->int_slots_used, 0, sizeof(info->int_slots_used));
+    memset(info->bool_slots_used, 0, sizeof(info->bool_slots_used));
+
     tx->info->const_float_slots = 0;
     tx->info->const_int_slots = 0;
     tx->info->const_bool_slots = 0;
@@ -3696,6 +3699,32 @@ nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info,
     if (tx->mul_zero_wins)
        ureg_property(tx->ureg, TGSI_PROPERTY_MUL_ZERO_WINS, 1);
 
+    /* Add additional definition of constants */
+    if (info->add_constants_defs.c_combination) {
+        unsigned i;
+
+        assert(info->add_constants_defs.int_const_added);
+        assert(info->add_constants_defs.bool_const_added);
+        /* We only add constants that are used by the shader
+         * and that are not defined in the shader */
+        for (i = 0; i < NINE_MAX_CONST_I; ++i) {
+            if ((*info->add_constants_defs.int_const_added)[i]) {
+                DBG("Defining const i%i : { %i %i %i %i }\n", i,
+                    info->add_constants_defs.c_combination->const_i[i][0],
+                    info->add_constants_defs.c_combination->const_i[i][1],
+                    info->add_constants_defs.c_combination->const_i[i][2],
+                    info->add_constants_defs.c_combination->const_i[i][3]);
+                tx_set_lconsti(tx, i, info->add_constants_defs.c_combination->const_i[i]);
+            }
+        }
+        for (i = 0; i < NINE_MAX_CONST_B; ++i) {
+            if ((*info->add_constants_defs.bool_const_added)[i]) {
+                DBG("Defining const b%i : %i\n", i, (int)(info->add_constants_defs.c_combination->const_b[i] != 0));
+                tx_set_lconstb(tx, i, info->add_constants_defs.c_combination->const_b[i]);
+            }
+        }
+    }
+
     while (!sm1_parse_eof(tx) && !tx->failure)
         sm1_parse_instruction(tx);
     tx->parse++; /* for byte_size */
index 8d98e9e597ff76d0e4c2c33746fb6e3b0712128a..8b5be52de09925fd5b9779610dcf7202f969fae6 100644 (file)
@@ -27,6 +27,7 @@
 #include "d3d9caps.h"
 #include "nine_defines.h"
 #include "nine_helpers.h"
+#include "nine_state.h"
 #include "pipe/p_state.h" /* PIPE_MAX_ATTRIBS */
 #include "util/u_memory.h"
 
@@ -39,6 +40,8 @@ struct nine_lconstf /* NOTE: both pointers should be FREE'd by the user */
     float *data;
 };
 
+struct nine_shader_constant_combination;
+
 struct nine_shader_info
 {
     unsigned type; /* in, PIPE_SHADER_x */
@@ -72,6 +75,9 @@ struct nine_shader_info
     unsigned const_b_base; /* in vec4 (16 byte) units */
     unsigned const_used_size;
 
+    boolean int_slots_used[NINE_MAX_CONST_I];
+    boolean bool_slots_used[NINE_MAX_CONST_B];
+
     unsigned const_float_slots;
     unsigned const_int_slots;
     unsigned const_bool_slots;
@@ -79,6 +85,12 @@ struct nine_shader_info
     struct nine_lconstf lconstf; /* out, NOTE: members to be free'd by user */
     uint8_t bumpenvmat_needed;
 
+    struct {
+        struct nine_shader_constant_combination* c_combination;
+        boolean (*int_const_added)[NINE_MAX_CONST_I];
+        boolean (*bool_const_added)[NINE_MAX_CONST_B];
+    } add_constants_defs;
+
     boolean swvp_on;
 
     boolean process_vertices;
@@ -103,12 +115,16 @@ nine_info_mark_const_f_used(struct nine_shader_info *info, int idx)
 static inline void
 nine_info_mark_const_i_used(struct nine_shader_info *info, int idx)
 {
+    if (!info->swvp_on) /* per-slot usage is only recorded when swvp is off */
+        info->int_slots_used[idx] = TRUE; /* integer constant slot idx is referenced by the shader */
     if (info->const_int_slots < (idx + 1))
         info->const_int_slots = idx + 1;
 }
 static inline void
 nine_info_mark_const_b_used(struct nine_shader_info *info, int idx)
 {
+    if (!info->swvp_on) /* per-slot usage is only recorded when swvp is off */
+        info->bool_slots_used[idx] = TRUE; /* boolean constant slot idx is referenced by the shader */
     if (info->const_bool_slots < (idx + 1))
         info->const_bool_slots = idx + 1;
 }
@@ -224,4 +240,91 @@ nine_shader_variants_so_free(struct nine_shader_variant_so *list)
         nine_bind(&list->vdecl, NULL);
 }
 
+struct nine_shader_constant_combination /* one recorded set of i#/b# constant values */
+{
+    struct nine_shader_constant_combination *next; /* next node of the singly linked list, NULL at the end */
+    int const_i[NINE_MAX_CONST_I][4]; /* integer constants, 4 components per slot */
+    BOOL const_b[NINE_MAX_CONST_B]; /* boolean constants, one value per slot */
+};
+
+#define NINE_MAX_CONSTANT_COMBINATION_VARIANTS 32
+
+/* Look up (or append) the constant combination matching the used i#/b# slots.
+ * Returns its 1-based index, or 0 to fall back to the variant without inlining. */
+static inline uint8_t
+nine_shader_constant_combination_key(struct nine_shader_constant_combination **list,
+                                     boolean *int_slots_used, /* i# slots to compare */
+                                     boolean *bool_slots_used, /* b# slots to compare */
+                                     int *const_i, /* current values, 4 ints per slot */
+                                     BOOL *const_b) /* current values, one per slot */
+{
+    int i;
+    uint8_t index = 0;
+    struct nine_shader_constant_combination **next_allocate = list, *current = *list;
+
+    assert(int_slots_used);
+    assert(bool_slots_used);
+    assert(const_i);
+    assert(const_b);
+
+    while (current) {
+        boolean match = TRUE;
+        index++; /* start at 1. 0 is for the variant without constant replacement */
+        for (i = 0; i < NINE_MAX_CONST_I; ++i) {
+            if (int_slots_used[i]) /* values of unused slots are ignored */
+                match &= !memcmp(const_i + 4*i, current->const_i[i], sizeof(current->const_i[0]));
+        }
+        for (i = 0; i < NINE_MAX_CONST_B; ++i) {
+            if (bool_slots_used[i])
+                match &= const_b[i] == current->const_b[i];
+        }
+        if (match)
+            return index;
+        next_allocate = &current->next; /* where a new node would be linked */
+        current = current->next;
+    }
+
+    if (index >= NINE_MAX_CONSTANT_COMBINATION_VARIANTS)
+        return 0; /* Too many variants, revert to no replacement */
+    current = MALLOC_STRUCT(nine_shader_constant_combination);
+    if (!current)
+        return 0; /* Allocation failed, revert to no replacement as well */
+    current->next = NULL;
+    memcpy(current->const_i, const_i, sizeof(current->const_i));
+    memcpy(current->const_b, const_b, sizeof(current->const_b));
+    *next_allocate = current; /* link into the list only once initialized */
+    return index + 1;
+}
+
+/* Fetch the combination stored at 1-based position index in list;
+ * index 0 designates the variant without constant replacement (NULL). */
+static inline struct nine_shader_constant_combination *
+nine_shader_constant_combination_get(struct nine_shader_constant_combination *list, uint8_t index)
+{
+    if (!index)
+        return NULL;
+
+    for (; index > 1; --index) {
+        assert(list != NULL);
+        list = list->next;
+    }
+    assert(list != NULL);
+    return list;
+}
+
+/* Release every node of a constant combination list.
+ *
+ * Accepts a NULL list (nothing to do). The caller's pointer is not reset,
+ * so it must not be used again after this call. */
+static inline void
+nine_shader_constant_combination_free(struct nine_shader_constant_combination *list)
+{
+    while (list) {
+        struct nine_shader_constant_combination *following = list->next;
+
+        FREE(list);
+        list = following;
+    }
+}
+
 #endif /* _NINE_SHADER_H_ */
index 21d51c9715d0180e5f804d5ab9f42bc77233f9e3..475925ffda9aac27a24bcdc4b685c7eb9ba4f951 100644 (file)
@@ -1595,7 +1595,7 @@ CSMT_ITEM_NO_WAIT(nine_context_set_vertex_shader_constant_i,
     }
 
     context->changed.vs_const_i = TRUE;
-    context->changed.group |= NINE_STATE_VS_CONST;
+    context->changed.group |= NINE_STATE_VS_CONST | NINE_STATE_VS_PARAMS_MISC;
 }
 
 CSMT_ITEM_NO_WAIT(nine_context_set_vertex_shader_constant_b,
@@ -1614,7 +1614,7 @@ CSMT_ITEM_NO_WAIT(nine_context_set_vertex_shader_constant_b,
         context->vs_const_b[StartRegister + i] = pConstantData[i] ? bool_true : 0;
 
     context->changed.vs_const_b = TRUE;
-    context->changed.group |= NINE_STATE_VS_CONST;
+    context->changed.group |= NINE_STATE_VS_CONST | NINE_STATE_VS_PARAMS_MISC;
 }
 
 CSMT_ITEM_NO_WAIT(nine_context_set_pixel_shader,
@@ -1669,7 +1669,7 @@ CSMT_ITEM_NO_WAIT(nine_context_set_pixel_shader_constant_i_transformed,
            Vector4iCount * sizeof(context->ps_const_i[0]));
 
     context->changed.ps_const_i = TRUE;
-    context->changed.group |= NINE_STATE_PS_CONST;
+    context->changed.group |= NINE_STATE_PS_CONST | NINE_STATE_PS_PARAMS_MISC;
 }
 
 CSMT_ITEM_NO_WAIT(nine_context_set_pixel_shader_constant_i,
@@ -1694,7 +1694,7 @@ CSMT_ITEM_NO_WAIT(nine_context_set_pixel_shader_constant_i,
         }
     }
     context->changed.ps_const_i = TRUE;
-    context->changed.group |= NINE_STATE_PS_CONST;
+    context->changed.group |= NINE_STATE_PS_CONST | NINE_STATE_PS_PARAMS_MISC;
 }
 
 CSMT_ITEM_NO_WAIT(nine_context_set_pixel_shader_constant_b,
@@ -1713,7 +1713,7 @@ CSMT_ITEM_NO_WAIT(nine_context_set_pixel_shader_constant_b,
         context->ps_const_b[StartRegister + i] = pConstantData[i] ? bool_true : 0;
 
     context->changed.ps_const_b = TRUE;
-    context->changed.group |= NINE_STATE_PS_CONST;
+    context->changed.group |= NINE_STATE_PS_CONST | NINE_STATE_PS_PARAMS_MISC;
 }
 
 /* XXX: use resource, as resource might change */
index d8f7230e5b34e5fdf158161dcb832d201ed1f8dc..376dc5616977a5ffe5eaebca714c31ef063e551f 100644 (file)
@@ -300,6 +300,8 @@ struct nine_context {
     int dummy_vbo_bound_at; /* -1 = not bound , >= 0 = bound index */
     boolean vbo_bound_done;
 
+    boolean inline_constants;
+
     struct nine_ff_state ff;
 
     /* software vertex processing */
index 5d79019a1bcd7a991ba62e9564849fdc85e29d2f..8db4b6e5c375e2a27ab77656d29d321cabfaacc8 100644 (file)
@@ -60,6 +60,9 @@ NinePixelShader9_ctor( struct NinePixelShader9 *This,
     info.sampler_ps1xtypes = 0x0;
     info.fog_enable = 0;
     info.projected = 0;
+    info.add_constants_defs.c_combination = NULL;
+    info.add_constants_defs.int_const_added = NULL;
+    info.add_constants_defs.bool_const_added = NULL;
     info.process_vertices = false;
 
     pipe = nine_context_get_pipe_acquire(device);
@@ -82,6 +85,15 @@ NinePixelShader9_ctor( struct NinePixelShader9 *This,
     This->rt_mask = info.rt_mask;
     This->const_used_size = info.const_used_size;
     This->bumpenvmat_needed = info.bumpenvmat_needed;
+
+    memcpy(This->int_slots_used, info.int_slots_used, sizeof(This->int_slots_used));
+    memcpy(This->bool_slots_used, info.bool_slots_used, sizeof(This->bool_slots_used));
+
+    This->const_int_slots = info.const_int_slots;
+    This->const_bool_slots = info.const_bool_slots;
+
+    This->c_combinations = NULL;
+
     /* no constant relative addressing for ps */
     assert(info.lconstf.data == NULL);
     assert(info.lconstf.ranges == NULL);
@@ -115,6 +127,8 @@ NinePixelShader9_dtor( struct NinePixelShader9 *This )
     }
     nine_shader_variants_free(&This->variant);
 
+    nine_shader_constant_combination_free(This->c_combinations);
+
     FREE((void *)This->byte_code.tokens); /* const_cast */
 
     NineUnknown_dtor(&This->base);
@@ -169,6 +183,10 @@ NinePixelShader9_GetVariant( struct NinePixelShader9 *This )
         info.fog_mode = device->context.rs[D3DRS_FOGTABLEMODE];
         info.force_color_in_centroid = key >> 34 & 1;
         info.projected = (key >> 48) & 0xffff;
+        info.add_constants_defs.c_combination =
+            nine_shader_constant_combination_get(This->c_combinations, (key >> 40) & 0xff);
+        info.add_constants_defs.int_const_added = &This->int_slots_used;
+        info.add_constants_defs.bool_const_added = &This->bool_slots_used;
         info.process_vertices = false;
 
         hr = nine_translate_shader(This->base.device, &info, pipe);
index bcbadd710573772f71de287b24c377323644926c..b616d9d7dee14ebd6fb5b6ea562f8fcbc490746e 100644 (file)
@@ -49,6 +49,14 @@ struct NinePixelShader9
     uint16_t sampler_mask;
     uint8_t rt_mask;
 
+    boolean int_slots_used[NINE_MAX_CONST_I];
+    boolean bool_slots_used[NINE_MAX_CONST_B];
+
+    unsigned const_int_slots;
+    unsigned const_bool_slots;
+
+    struct nine_shader_constant_combination *c_combinations;
+
     uint64_t ff_key[6];
     void *ff_cso;
 
@@ -99,6 +107,13 @@ NinePixelShader9_UpdateKey( struct NinePixelShader9 *ps,
     if (context->rt[0]->base.info.nr_samples)
         key |= ((uint64_t)1) << 34;
 
+    if ((ps->const_int_slots > 0 || ps->const_bool_slots > 0) && context->inline_constants)
+        key |= ((uint64_t)nine_shader_constant_combination_key(&ps->c_combinations,
+                                                               ps->int_slots_used,
+                                                               ps->bool_slots_used,
+                                                               (void *)context->ps_const_i,
+                                                               context->ps_const_b)) << 40;
+
     if (unlikely(ps->byte_code.version < 0x14)) {
         projected = nine_ff_get_projected_key(context);
         key |= ((uint64_t) projected) << 48;
index f104a9ad13472c0f067e6731a3d5f0c6e093338c..cffe850167796f679fbf1e52e160d463fbe701fd 100644 (file)
@@ -66,6 +66,9 @@ NineVertexShader9_ctor( struct NineVertexShader9 *This,
     info.fog_enable = 0;
     info.point_size_min = 0;
     info.point_size_max = 0;
+    info.add_constants_defs.c_combination = NULL;
+    info.add_constants_defs.int_const_added = NULL;
+    info.add_constants_defs.bool_const_added = NULL;
     info.swvp_on = !!(device->params.BehaviorFlags & D3DCREATE_SOFTWARE_VERTEXPROCESSING);
     info.process_vertices = false;
 
@@ -100,6 +103,14 @@ NineVertexShader9_ctor( struct NineVertexShader9 *This,
     This->position_t = info.position_t;
     This->point_size = info.point_size;
 
+    memcpy(This->int_slots_used, info.int_slots_used, sizeof(This->int_slots_used));
+    memcpy(This->bool_slots_used, info.bool_slots_used, sizeof(This->bool_slots_used));
+
+    This->const_int_slots = info.const_int_slots;
+    This->const_bool_slots = info.const_bool_slots;
+
+    This->c_combinations = NULL;
+
     for (i = 0; i < info.num_inputs && i < ARRAY_SIZE(This->input_map); ++i)
         This->input_map[i].ndecl = info.input_map[i];
     This->num_inputs = i;
@@ -142,6 +153,8 @@ NineVertexShader9_dtor( struct NineVertexShader9 *This )
     nine_shader_variants_free(&This->variant);
     nine_shader_variants_so_free(&This->variant_so);
 
+    nine_shader_constant_combination_free(This->c_combinations);
+
     FREE((void *)This->byte_code.tokens); /* const_cast */
 
     FREE(This->lconstf.data);
@@ -195,6 +208,10 @@ NineVertexShader9_GetVariant( struct NineVertexShader9 *This )
         info.fog_enable = device->context.rs[D3DRS_FOGENABLE];
         info.point_size_min = asfloat(device->context.rs[D3DRS_POINTSIZE_MIN]);
         info.point_size_max = asfloat(device->context.rs[D3DRS_POINTSIZE_MAX]);
+        info.add_constants_defs.c_combination =
+            nine_shader_constant_combination_get(This->c_combinations, (key >> 16) & 0xff);
+        info.add_constants_defs.int_const_added = &This->int_slots_used;
+        info.add_constants_defs.bool_const_added = &This->bool_slots_used;
         info.swvp_on = device->context.swvp;
         info.process_vertices = false;
 
@@ -232,6 +249,9 @@ NineVertexShader9_GetVariantProcessVertices( struct NineVertexShader9 *This,
     info.fog_enable = false;
     info.point_size_min = 0;
     info.point_size_max = 0;
+    info.add_constants_defs.c_combination = NULL;
+    info.add_constants_defs.int_const_added = NULL;
+    info.add_constants_defs.bool_const_added = NULL;
     info.swvp_on = true;
     info.vdecl_out = vdecl_out;
     info.process_vertices = true;
index 888f1de35be361b04d14556d773d9ce15739329b..766b2fd13eadcc1b212bbae54737dc3353829360 100644 (file)
@@ -59,6 +59,14 @@ struct NineVertexShader9
 
     struct nine_lconstf lconstf;
 
+    boolean int_slots_used[NINE_MAX_CONST_I];
+    boolean bool_slots_used[NINE_MAX_CONST_B];
+
+    unsigned const_int_slots;
+    unsigned const_bool_slots;
+
+    struct nine_shader_constant_combination *c_combinations;
+
     uint64_t ff_key[3];
     void *ff_cso;
 
@@ -93,6 +101,13 @@ NineVertexShader9_UpdateKey( struct NineVertexShader9 *vs,
         key |= (uint32_t) ((!!context->rs[D3DRS_FOGENABLE]) << 8);
     key |= (uint32_t) (context->swvp << 9);
 
+    if ((vs->const_int_slots > 0 || vs->const_bool_slots > 0) && context->inline_constants && !context->swvp)
+        key |= ((uint64_t)nine_shader_constant_combination_key(&vs->c_combinations,
+                                                               vs->int_slots_used,
+                                                               vs->bool_slots_used,
+                                                               context->vs_const_i,
+                                                               context->vs_const_b)) << 16;
+
     /* We want to use a 64 bits key for performance.
      * Use compressed float16 values for the pointsize min/max in the key.
      * Shaders do not usually output psize.*/