vc4: Avoid VS shader recompiles by keeping a set of FS inputs seen so far.
author Eric Anholt <eric@anholt.net>
Wed, 3 Aug 2016 18:55:55 +0000 (11:55 -0700)
committer Eric Anholt <eric@anholt.net>
Thu, 4 Aug 2016 15:48:27 +0000 (08:48 -0700)
We don't want to bake the whole array of FS inputs into the VS key,
because of the hashing overhead.  But we can keep a set of the arrays
seen, and use a pointer to the copy in the set as the array's proxy.

Between this and the previous patch, gl-1.0-blend-func now passes on
hardware, where previously it was filling the 256MB CMA area with shaders
and OOMing.

Drops 712 shaders from shader-db.

src/gallium/drivers/vc4/vc4_context.h
src/gallium/drivers/vc4/vc4_program.c
src/gallium/drivers/vc4/vc4_qir.h

index b656539611cdfd1dc8dbfab0fcaf0d7bbdc936ea..c3474a04963f6184e3476468103e0d2079204047 100644 (file)
@@ -69,6 +69,7 @@
 #define VC4_DIRTY_COMPILED_CS   (1 << 23)
 #define VC4_DIRTY_COMPILED_VS   (1 << 24)
 #define VC4_DIRTY_COMPILED_FS   (1 << 25)
+#define VC4_DIRTY_FS_INPUTS     (1 << 26)
 
 struct vc4_sampler_view {
         struct pipe_sampler_view base;
@@ -123,6 +124,17 @@ struct vc4_ubo_range {
         uint32_t size;
 };
 
+struct vc4_fs_inputs {
+        /**
+         * Array of the meanings of the VPM inputs this shader needs.
+         *
+         * It doesn't include those that aren't part of the VPM, like
+         * point/line coordinates.
+         */
+        struct vc4_varying_slot *input_slots;
+        uint32_t num_inputs;
+};
+
 struct vc4_compiled_shader {
         uint64_t program_id;
         struct vc4_bo *bo;
@@ -152,13 +164,7 @@ struct vc4_compiled_shader {
         uint8_t vattr_offsets[9];
         uint8_t vattrs_live;
 
-        /**
-         * Array of the meanings of the VPM inputs this shader needs.
-         *
-         * It doesn't include those that aren't part of the VPM, like
-         * point/line coordinates.
-         */
-        struct vc4_varying_slot *input_slots;
+        const struct vc4_fs_inputs *fs_inputs;
 };
 
 struct vc4_program_stateobj {
@@ -270,6 +276,7 @@ struct vc4_context {
         struct primconvert_context *primconvert;
 
         struct hash_table *fs_cache, *vs_cache;
+        struct set *fs_inputs_set;
         uint32_t next_uncompiled_program_id;
         uint64_t next_compiled_program_id;
 
index 487491ad6674219bbda8bd01cf9035dde9cda3b9..d87caa7c5179c1fe59bcdfd36835cf8c257fb314 100644 (file)
@@ -2100,8 +2100,8 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
                 break;
         case QSTAGE_VERT:
                 emit_vert_end(c,
-                              vc4->prog.fs->input_slots,
-                              vc4->prog.fs->num_inputs);
+                              c->vs_key->fs_inputs->input_slots,
+                              c->vs_key->fs_inputs->num_inputs);
                 break;
         case QSTAGE_COORD:
                 emit_coord_end(c);
@@ -2207,6 +2207,13 @@ static void
 vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c,
                              struct vc4_compiled_shader *shader)
 {
+        struct vc4_fs_inputs inputs;
+
+        memset(&inputs, 0, sizeof(inputs));
+        inputs.input_slots = ralloc_array(shader,
+                                          struct vc4_varying_slot,
+                                          c->num_input_slots);
+
         bool input_live[c->num_input_slots];
 
         memset(input_live, 0, sizeof(input_live));
@@ -2217,10 +2224,6 @@ vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c,
                 }
         }
 
-        shader->input_slots = ralloc_array(shader,
-                                           struct vc4_varying_slot,
-                                           c->num_input_slots);
-
         for (int i = 0; i < c->num_input_slots; i++) {
                 struct vc4_varying_slot *slot = &c->input_slots[i];
 
@@ -2235,11 +2238,33 @@ vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c,
                     slot->slot == VARYING_SLOT_COL1 ||
                     slot->slot == VARYING_SLOT_BFC0 ||
                     slot->slot == VARYING_SLOT_BFC1) {
-                        shader->color_inputs |= (1 << shader->num_inputs);
+                        shader->color_inputs |= (1 << inputs.num_inputs);
                 }
 
-                shader->input_slots[shader->num_inputs] = *slot;
-                shader->num_inputs++;
+                inputs.input_slots[inputs.num_inputs] = *slot;
+                inputs.num_inputs++;
+        }
+        shader->num_inputs = inputs.num_inputs;
+
+        /* Add our set of inputs to the set of all inputs seen.  This way, we
+         * can have a single pointer that identifies an FS inputs set,
+         * allowing VS to avoid recompiling when the FS is recompiled (or a
+         * new one is bound using separate shader objects) but the inputs
+         * don't change.
+         */
+        struct set_entry *entry = _mesa_set_search(vc4->fs_inputs_set, &inputs);
+        if (entry) {
+                shader->fs_inputs = entry->key;
+                ralloc_free(inputs.input_slots);
+        } else {
+                struct vc4_fs_inputs *alloc_inputs;
+
+                alloc_inputs = rzalloc(vc4->fs_inputs_set, struct vc4_fs_inputs);
+                memcpy(alloc_inputs, &inputs, sizeof(inputs));
+                ralloc_steal(alloc_inputs, inputs.input_slots);
+                _mesa_set_add(vc4->fs_inputs_set, alloc_inputs);
+
+                shader->fs_inputs = alloc_inputs;
         }
 }
 
@@ -2434,10 +2459,14 @@ vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode)
                 return;
 
         vc4->dirty |= VC4_DIRTY_COMPILED_FS;
+
         if (vc4->rasterizer->base.flatshade &&
             old_fs && vc4->prog.fs->color_inputs != old_fs->color_inputs) {
                 vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS;
         }
+
+        if (old_fs && vc4->prog.fs->fs_inputs != old_fs->fs_inputs)
+                vc4->dirty |= VC4_DIRTY_FS_INPUTS;
 }
 
 static void
@@ -2451,14 +2480,14 @@ vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode)
                             VC4_DIRTY_VERTTEX |
                             VC4_DIRTY_VTXSTATE |
                             VC4_DIRTY_UNCOMPILED_VS |
-                            VC4_DIRTY_COMPILED_FS))) {
+                            VC4_DIRTY_FS_INPUTS))) {
                 return;
         }
 
         memset(key, 0, sizeof(*key));
         vc4_setup_shared_key(vc4, &key->base, &vc4->verttex);
         key->base.shader_state = vc4->prog.bind_vs;
-        key->compiled_fs_id = vc4->prog.fs->program_id;
+        key->fs_inputs = vc4->prog.fs->fs_inputs;
         key->clamp_color = vc4->rasterizer->base.clamp_vertex_color;
 
         for (int i = 0; i < ARRAY_SIZE(key->attr_formats); i++)
@@ -2477,7 +2506,7 @@ vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode)
 
         key->is_coord = true;
         /* Coord shaders don't care what the FS inputs are. */
-        key->compiled_fs_id = 0;
+        key->fs_inputs = NULL;
         struct vc4_compiled_shader *cs =
                 vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base);
         if (cs != vc4->prog.cs) {
@@ -2517,6 +2546,29 @@ vs_cache_compare(const void *key1, const void *key2)
         return memcmp(key1, key2, sizeof(struct vc4_vs_key)) == 0;
 }
 
+static uint32_t
+fs_inputs_hash(const void *key)
+{
+        const struct vc4_fs_inputs *inputs = key;
+
+        return _mesa_hash_data(inputs->input_slots,
+                               sizeof(*inputs->input_slots) *
+                               inputs->num_inputs);
+}
+
+static bool
+fs_inputs_compare(const void *key1, const void *key2)
+{
+        const struct vc4_fs_inputs *inputs1 = key1;
+        const struct vc4_fs_inputs *inputs2 = key2;
+
+        return (inputs1->num_inputs == inputs2->num_inputs &&
+                memcmp(inputs1->input_slots,
+                       inputs2->input_slots,
+                       sizeof(*inputs1->input_slots) *
+                       inputs1->num_inputs) == 0);
+}
+
 static void
 delete_from_cache_if_matches(struct hash_table *ht,
                              struct hash_entry *entry,
@@ -2582,6 +2634,8 @@ vc4_program_init(struct pipe_context *pctx)
                                                 fs_cache_compare);
         vc4->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash,
                                                 vs_cache_compare);
+        vc4->fs_inputs_set = _mesa_set_create(pctx, fs_inputs_hash,
+                                              fs_inputs_compare);
 }
 
 void
index b8ded30711ca7d0a94f20ea24da99e920b0c99c2..e6297c5c82ce3e965073dce062b1ec522165018b 100644 (file)
@@ -352,12 +352,7 @@ struct vc4_fs_key {
 struct vc4_vs_key {
         struct vc4_key base;
 
-        /**
-         * This is a proxy for the array of FS input semantics, which is
-         * larger than we would want to put in the key.
-         */
-        uint64_t compiled_fs_id;
-
+        const struct vc4_fs_inputs *fs_inputs;
         enum pipe_format attr_formats[8];
         bool is_coord;
         bool per_vertex_point_size;