vc4: Upload CS/VS UBO uniforms together.
author Eric Anholt <eric@anholt.net>
Thu, 1 Dec 2016 20:15:10 +0000 (12:15 -0800)
committer Eric Anholt <eric@anholt.net>
Wed, 10 Apr 2019 18:45:30 +0000 (11:45 -0700)
Same as I did for V3D, drop all this code trying to GC the
non-indirectly-loaded uniforms from the UBO that's used for indirect
access of gallium cb[0].  While it does successfully drop some of those,
it came at the cost of uploading the VS's indirect uniforms twice, for the
bin and render versions of the shader.

With the UBO loads simplified, I was also able to easily backport V3D's
change to pack a UBO offset into the uniform_data[] field so that we don't
need to do the add of the uniform base in the shader.

As a bonus, now vc4 doesn't depend on mesa/st type_size functions.

total uniforms in shared programs: 25514 -> 25490 (-0.09%)
total instructions in shared programs: 77019 -> 76836 (-0.24%)

src/gallium/drivers/vc4/vc4_context.h
src/gallium/drivers/vc4/vc4_draw.c
src/gallium/drivers/vc4/vc4_program.c
src/gallium/drivers/vc4/vc4_qir.h
src/gallium/drivers/vc4/vc4_uniforms.c

index e7cb831774ce2a4b1a5a8eb8e60addaa6bfa399e..f02992f07ee5c19aa15ec94b64e1cdf870c5656f 100644 (file)
@@ -121,25 +121,6 @@ struct vc4_uncompiled_shader {
         struct pipe_shader_state base;
 };
 
-struct vc4_ubo_range {
-        /**
-         * offset in bytes from the start of the ubo where this range is
-         * uploaded.
-         *
-         * Only set once used is set.
-         */
-        uint32_t dst_offset;
-
-        /**
-         * offset in bytes from the start of the gallium uniforms where the
-         * data comes from.
-         */
-        uint32_t src_offset;
-
-        /** size in bytes of this ubo range */
-        uint32_t size;
-};
-
 struct vc4_fs_inputs {
         /**
          * Array of the meanings of the VPM inputs this shader needs.
@@ -157,9 +138,6 @@ struct vc4_compiled_shader {
 
         struct vc4_shader_uniform_info uniforms;
 
-        struct vc4_ubo_range *ubo_ranges;
-        uint32_t num_ubo_ranges;
-        uint32_t ubo_size;
         /**
          * VC4_DIRTY_* flags that, when set in vc4->dirty, mean that the
          * uniforms have to be rewritten (and therefore the shader state
index 06785516cae2367f5a99e4f46020fd2c08fbd5f8..df95c313f34c9851da51a9594506c69dc2d99566 100644 (file)
@@ -343,6 +343,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
         vc4_emit_state(pctx);
 
         bool needs_drawarrays_shader_state = false;
+
         if ((vc4->dirty & (VC4_DIRTY_VTXBUF |
                            VC4_DIRTY_VTXSTATE |
                            VC4_DIRTY_PRIM_MODE |
index 4c284b6cd7c94770231ffe1e682202ea706e70c5..7d13544ab2850737823a6a3236c5bdc0e9af64b9 100644 (file)
@@ -38,7 +38,6 @@
 #include "vc4_context.h"
 #include "vc4_qpu.h"
 #include "vc4_qir.h"
-#include "mesa/state_tracker/st_glsl_types.h"
 
 static struct qreg
 ntq_get_src(struct vc4_compile *c, nir_src src, int i);
@@ -51,12 +50,6 @@ type_size(const struct glsl_type *type)
    return glsl_count_attribute_slots(type, false);
 }
 
-static int
-uniforms_type_size(const struct glsl_type *type)
-{
-        return st_glsl_storage_type_size(type, false);
-}
-
 static void
 resize_qreg_array(struct vc4_compile *c,
                   struct qreg **regs,
@@ -99,43 +92,17 @@ static struct qreg
 indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
 {
         struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
-        uint32_t offset = nir_intrinsic_base(intr);
-        struct vc4_compiler_ubo_range *range = NULL;
-        unsigned i;
-        for (i = 0; i < c->num_uniform_ranges; i++) {
-                range = &c->ubo_ranges[i];
-                if (offset >= range->src_offset &&
-                    offset < range->src_offset + range->size) {
-                        break;
-                }
-        }
-        /* The driver-location-based offset always has to be within a declared
-         * uniform range.
-         */
-        assert(range);
-        if (!range->used) {
-                range->used = true;
-                range->dst_offset = c->next_ubo_dst_offset;
-                c->next_ubo_dst_offset += range->size;
-                c->num_ubo_ranges++;
-        }
-
-        offset -= range->src_offset;
-
-        /* Adjust for where we stored the TGSI register base. */
-        indirect_offset = qir_ADD(c, indirect_offset,
-                                  qir_uniform_ui(c, (range->dst_offset +
-                                                     offset)));
 
         /* Clamp to [0, array size).  Note that MIN/MAX are signed. */
+        uint32_t range = nir_intrinsic_range(intr);
         indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0));
         indirect_offset = qir_MIN_NOIMM(c, indirect_offset,
-                                        qir_uniform_ui(c, (range->dst_offset +
-                                                           range->size - 4)));
+                                        qir_uniform_ui(c, range - 4));
 
         qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
                      indirect_offset,
-                     qir_uniform(c, QUNIFORM_UBO0_ADDR, 0));
+                     qir_uniform(c, QUNIFORM_UBO0_ADDR,
+                                 nir_intrinsic_base(intr)));
 
         c->num_texture_samples++;
 
@@ -859,24 +826,6 @@ add_output(struct vc4_compile *c,
         c->output_slots[decl_offset].swizzle = swizzle;
 }
 
-static void
-declare_uniform_range(struct vc4_compile *c, uint32_t start, uint32_t size)
-{
-        unsigned array_id = c->num_uniform_ranges++;
-        if (array_id >= c->ubo_ranges_array_size) {
-                c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
-                                                array_id + 1);
-                c->ubo_ranges = reralloc(c, c->ubo_ranges,
-                                         struct vc4_compiler_ubo_range,
-                                         c->ubo_ranges_array_size);
-        }
-
-        c->ubo_ranges[array_id].dst_offset = 0;
-        c->ubo_ranges[array_id].src_offset = start;
-        c->ubo_ranges[array_id].size = size;
-        c->ubo_ranges[array_id].used = false;
-}
-
 static bool
 ntq_src_is_only_ssa_def_user(nir_src *src)
 {
@@ -1698,19 +1647,6 @@ ntq_setup_outputs(struct vc4_compile *c)
         }
 }
 
-static void
-ntq_setup_uniforms(struct vc4_compile *c)
-{
-        nir_foreach_variable(var, &c->s->uniforms) {
-                uint32_t vec4_count = uniforms_type_size(var->type);
-                unsigned vec4_size = 4 * sizeof(float);
-
-                declare_uniform_range(c, var->data.driver_location * vec4_size,
-                                      vec4_count * vec4_size);
-
-        }
-}
-
 /**
  * Sets up the mapping from nir_register to struct qreg *.
  *
@@ -2216,7 +2152,6 @@ nir_to_qir(struct vc4_compile *c)
 
         ntq_setup_inputs(c);
         ntq_setup_outputs(c);
-        ntq_setup_uniforms(c);
 
         /* Find the main function and emit the body. */
         nir_foreach_function(function, c->s) {
@@ -2677,39 +2612,6 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
 
         shader->fs_threaded = c->fs_threaded;
 
-        /* Copy the compiler UBO range state to the compiled shader, dropping
-         * out arrays that were never referenced by an indirect load.
-         *
-         * (Note that QIR dead code elimination of an array access still
-         * leaves that array alive, though)
-         */
-        if (c->num_ubo_ranges) {
-                shader->num_ubo_ranges = c->num_ubo_ranges;
-                shader->ubo_ranges = ralloc_array(shader, struct vc4_ubo_range,
-                                                  c->num_ubo_ranges);
-                uint32_t j = 0;
-                for (int i = 0; i < c->num_uniform_ranges; i++) {
-                        struct vc4_compiler_ubo_range *range =
-                                &c->ubo_ranges[i];
-                        if (!range->used)
-                                continue;
-
-                        shader->ubo_ranges[j].dst_offset = range->dst_offset;
-                        shader->ubo_ranges[j].src_offset = range->src_offset;
-                        shader->ubo_ranges[j].size = range->size;
-                        shader->ubo_size += c->ubo_ranges[i].size;
-                        j++;
-                }
-        }
-        if (shader->ubo_size) {
-                if (vc4_debug & VC4_DEBUG_SHADERDB) {
-                        fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
-                                qir_get_stage_name(c->stage),
-                                c->program_id, c->variant_id,
-                                shader->ubo_size / 4);
-                }
-        }
-
         if ((vc4_debug & VC4_DEBUG_SHADERDB) && stage == QSTAGE_FRAG) {
                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d FS threads\n",
                         qir_get_stage_name(c->stage),
index 17a0d0febc0002f6a73cfff64536ba858cc4440c..4d8bf60cf4401193924a18155c6a71503fb55ec3 100644 (file)
@@ -300,31 +300,6 @@ struct vc4_varying_slot {
         uint8_t swizzle;
 };
 
-struct vc4_compiler_ubo_range {
-        /**
-         * offset in bytes from the start of the ubo where this range is
-         * uploaded.
-         *
-         * Only set once used is set.
-         */
-        uint32_t dst_offset;
-
-        /**
-         * offset in bytes from the start of the gallium uniforms where the
-         * data comes from.
-         */
-        uint32_t src_offset;
-
-        /** size in bytes of this ubo range */
-        uint32_t size;
-
-        /**
-         * Set if this range is used by the shader for indirect uniforms
-         * access.
-         */
-        bool used;
-};
-
 struct vc4_key {
         struct vc4_uncompiled_shader *shader_state;
         struct {
@@ -441,14 +416,6 @@ struct vc4_compile {
         uint32_t outputs_array_size;
         uint32_t uniforms_array_size;
 
-        struct vc4_compiler_ubo_range *ubo_ranges;
-        uint32_t ubo_ranges_array_size;
-        /** Number of uniform areas declared in ubo_ranges. */
-        uint32_t num_uniform_ranges;
-        /** Number of uniform areas used for indirect addressed loads. */
-        uint32_t num_ubo_ranges;
-        uint32_t next_ubo_dst_offset;
-
         /* State for whether we're executing on each channel currently.  0 if
          * yes, otherwise a block number + 1 that the channel jumped to.
          */
index d12f5667045fba88cb9c56dabc2204e49289f2bd..dd07487ab16d7a6a5d4ab1d8900b3fef23442f32 100644 (file)
@@ -22,6 +22,7 @@
  */
 
 #include "util/u_pack_color.h"
+#include "util/u_upload_mgr.h"
 #include "util/format_srgb.h"
 
 #include "vc4_context.h"
@@ -186,26 +187,6 @@ get_texrect_scale(struct vc4_texture_stateobj *texstate,
         return fui(1.0f / dim);
 }
 
-static struct vc4_bo *
-vc4_upload_ubo(struct vc4_context *vc4,
-               struct vc4_compiled_shader *shader,
-               const uint32_t *gallium_uniforms)
-{
-        if (!shader->ubo_size)
-                return NULL;
-
-        struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo");
-        void *data = vc4_bo_map(ubo);
-        for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) {
-                memcpy(data + shader->ubo_ranges[i].dst_offset,
-                       ((const void *)gallium_uniforms +
-                        shader->ubo_ranges[i].src_offset),
-                       shader->ubo_ranges[i].size);
-        }
-
-        return ubo;
-}
-
 void
 vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                    struct vc4_constbuf_stateobj *cb,
@@ -214,7 +195,6 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
         struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
         struct vc4_job *job = vc4->job;
         const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
-        struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms);
 
         cl_ensure_space(&job->uniforms, (uinfo->count +
                                          uinfo->num_texture_samples) * 4);
@@ -272,8 +252,23 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                         break;
 
                 case QUNIFORM_UBO0_ADDR:
+                        /* Constant buffer 0 may be a system memory pointer,
+                         * in which case we want to upload a shadow copy to
+                         * the GPU.
+                        */
+                        if (!cb->cb[0].buffer) {
+                                u_upload_data(vc4->uploader, 0,
+                                              cb->cb[0].buffer_size, 16,
+                                              cb->cb[0].user_buffer,
+                                              &cb->cb[0].buffer_offset,
+                                              &cb->cb[0].buffer);
+                        }
+
                         cl_aligned_reloc(job, &job->uniforms,
-                                         &uniforms, ubo, data);
+                                         &uniforms,
+                                         vc4_resource(cb->cb[0].buffer)->bo,
+                                         cb->cb[0].buffer_offset +
+                                         data);
                         break;
 
                 case QUNIFORM_UBO1_ADDR: {
@@ -374,8 +369,6 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
         }
 
         cl_end(&job->uniforms, uniforms);
-
-        vc4_bo_unreference(&ubo);
 }
 
 void