panfrost: Implement transform feedback
author Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Fri, 9 Aug 2019 19:32:49 +0000 (12:32 -0700)
committer Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Tue, 13 Aug 2019 16:43:41 +0000 (09:43 -0700)
Midgard has no hardware support for transform feedback, so we simulate
it in software. Lucky us.

What Midgard does do is write out vertex shader outputs to main memory
unconditionally. Fragment shaders read varyings back from main memory;
there's no on-chip storage for varyings. Whether this was a reasonable
design is a question I will not be engaging with in this commit message.

What that does mean is that, in some sense, Midgard *always* does
transform feedback unconditionally, and there's no way to turn off
transform feedback. Normally, we would allocate some scratch memory
every frame to store the varyings in an arbitrary format (interleaved
for simplicity), and then feed that scratch to the fragment shader and
discard when the rendering completes.

The only difference now is that sometimes, for some buffers, we use a BO
provided to us by Gallium and a format provided by Gallium, instead of
allocating the memory and choosing the format ourselves. This has some
limitations -- in particular, it only works at vec4 granularity, so a
corresponding GLSL linkage patch is needed to correctly implement
transform feedback for non-vec4 types. Nevertheless, given the hardware
already works in this admittedly-bizarre fashion, transform feedback is
"free". Or, at least, it's no more expensive than any other rendering.

Specifically not implemented is dynamically-sized transform feedback
(i.e. with geometry/tessellation shaders).

Spoiler alert: Midgard has no support for geometry *or* tessellation
shaders, despite advertising support. They get compiled to *massive*
compute shaders. How's that for checkbox compliance?

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
src/gallium/drivers/panfrost/pan_varyings.c

index 40d7d98bf650a44490b2da089273a03751a68fa8..69e9e6d036d9980d8fd75106cc84428bf24a59a8 100644 (file)
@@ -24,6 +24,7 @@
  */
 
 #include "pan_context.h"
+#include "util/u_prim.h"
 
 static mali_ptr
 panfrost_emit_varyings(
@@ -45,6 +46,33 @@ panfrost_emit_varyings(
         return transfer.gpu;
 }
 
+static void
+panfrost_emit_streamout(
+        struct panfrost_context *ctx,
+        union mali_attr *slot,
+        unsigned stride,
+        unsigned offset,
+        unsigned count,
+        struct pipe_stream_output_target *target)
+{
+        /* Fill out the descriptor */
+        slot->stride = stride * 4;
+        slot->shift = slot->extra_flags = 0;
+
+        unsigned max_size = target->buffer_size;
+        unsigned expected_size = slot->stride * count;
+
+        slot->size = MIN2(max_size, expected_size);
+
+        /* Grab the BO and bind it to the batch */
+        struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx);
+        struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
+        panfrost_job_add_bo(batch, bo);
+
+        mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
+        slot->elements = addr;
+}
+
 static void
 panfrost_emit_point_coord(union mali_attr *slot)
 {
@@ -110,6 +138,44 @@ panfrost_emit_varying_meta(
         }
 }
 
+static bool
+has_point_coord(unsigned mask, gl_varying_slot loc)
+{
+        if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
+                return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
+        else if (loc == VARYING_SLOT_PNTC)
+                return (mask & (1 << 8));
+        else
+                return false;
+}
+
+/* Helpers for manipulating stream out information so we can pack varyings
+ * accordingly. Compute the src_offset for a given captured varying */
+
+static struct pipe_stream_output
+pan_get_so(struct pipe_stream_output_info info, gl_varying_slot loc)
+{
+        for (unsigned i = 0; i < info.num_outputs; ++i) {
+                if (info.output[i].register_index == loc)
+                        return  info.output[i];
+        }
+
+        unreachable("Varying not captured");
+}
+
+/* TODO: Integers */
+static enum mali_format
+pan_xfb_format(unsigned nr_components)
+{
+        switch (nr_components) {
+                case 1: return MALI_R32F;
+                case 2: return MALI_RG32F;
+                case 3: return MALI_RGB32F;
+                case 4: return MALI_RGBA32F;
+                default: unreachable("Invalid format");
+        }
+}
+
 void
 panfrost_emit_varying_descriptor(
         struct panfrost_context *ctx,
@@ -129,53 +195,55 @@ panfrost_emit_varying_descriptor(
         struct panfrost_transfer trans = panfrost_allocate_transient(ctx,
                                          vs_size + fs_size);
 
-        for (unsigned i = 0; i < vs->tripipe->varying_count; i++) {
-                if (!is_special_varying(vs->varyings_loc[i]))
-                        vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
-        }
-
-        for (unsigned i = 0; i < fs->tripipe->varying_count; i++) {
-                unsigned j;
+        struct pipe_stream_output_info so = vs->stream_output;
 
-                /* If we have a point sprite replacement, handle that here. We
-                 * have to translate location first.  TODO: Flip y in shader.
-                 * We're already keying ... just time crunch .. */
+        /* Check if this varying is linked by us. This is the case for
+         * general-purpose, non-captured varyings. If it is, link it. If it's
+         * not, use the provided stream out information to determine the
+         * offset, since it was already linked for us. */
 
-                unsigned loc = fs->varyings_loc[i];
-                unsigned pnt_loc =
-                        (loc >= VARYING_SLOT_TEX0) ? (loc - VARYING_SLOT_TEX0) :
-                        (loc == VARYING_SLOT_PNTC) ? 8 :
-                        ~0;
+        for (unsigned i = 0; i < vs->tripipe->varying_count; i++) {
+                gl_varying_slot loc = vs->varyings_loc[i];
 
-                if (~pnt_loc && fs->point_sprite_mask & (1 << pnt_loc)) {
-                        /* gl_PointCoord index by convention */
-                        fs->varyings[i].index = 3;
-                        fs->reads_point_coord = true;
+                bool special = is_special_varying(loc);
+                bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
 
-                        /* Swizzle out the z/w to 0/1 */
-                        fs->varyings[i].format = MALI_RG16F;
-                        fs->varyings[i].swizzle =
-                                panfrost_get_default_swizzle(2);
+                if (captured) {
+                        struct pipe_stream_output o = pan_get_so(so, loc);
 
-                        continue;
+                        unsigned dst_offset = o.dst_offset * 4; /* dwords */
+                        vs->varyings[i].src_offset = dst_offset;
+                } else if (!special) {
+                        vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
                 }
+        }
 
-                if (fs->varyings[i].index)
-                        continue;
+        /* Conversely, we need to set src_offset for the captured varyings.
+         * Here, the layout is defined by the stream out info, not us */
+
+        /* Link up with fragment varyings */
+        bool reads_point_coord = fs->reads_point_coord;
+
+        for (unsigned i = 0; i < fs->tripipe->varying_count; i++) {
+                gl_varying_slot loc = fs->varyings_loc[i];
+                signed vs_idx = -1;
 
-                /*
-                 * Re-use the VS general purpose varying pos if it exists,
-                 * create a new one otherwise.
-                 */
-                for (j = 0; j < vs->tripipe->varying_count; j++) {
-                        if (fs->varyings_loc[i] == vs->varyings_loc[j])
+                /* Link up */
+                for (unsigned j = 0; j < vs->tripipe->varying_count; ++j) {
+                        if (vs->varyings_loc[j] == loc) {
+                                vs_idx = j;
                                 break;
+                        }
                 }
 
-                if (j < vs->tripipe->varying_count)
-                        fs->varyings[i].src_offset = vs->varyings[j].src_offset;
+                /* Either assign or reuse */
+                if (vs_idx >= 0)
+                        fs->varyings[i].src_offset = vs->varyings[vs_idx].src_offset;
                 else
                         fs->varyings[i].src_offset = 16 * (num_gen_varyings++);
+
+                if (has_point_coord(fs->point_sprite_mask, loc))
+                        reads_point_coord |= true;
         }
 
         memcpy(trans.cpu, vs->varyings, vs_size);
@@ -183,13 +251,45 @@ panfrost_emit_varying_descriptor(
 
         union mali_attr varyings[PIPE_MAX_ATTRIBS];
 
-        unsigned idx = 0;
+        /* Figure out how many streamout buffers could be bound */
+        unsigned so_count = ctx->streamout.num_targets;
+        for (unsigned i = 0; i < vs->tripipe->varying_count; i++) {
+                gl_varying_slot loc = vs->varyings_loc[i];
+
+                bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
+                if (!captured) continue;
+
+                struct pipe_stream_output o = pan_get_so(so, loc);
+                so_count = MAX2(so_count, o.output_buffer + 1);
+        }
+
+        signed idx = so_count;
         signed general = idx++;
         signed gl_Position = idx++;
         signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
-        signed gl_PointCoord = fs->reads_point_coord ? (idx++) : -1;
+        signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
         signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
 
+        /* Emit the stream out buffers */
+
+        unsigned output_count = u_stream_outputs_for_vertices(
+                        ctx->active_prim, ctx->vertex_count);
+
+        for (unsigned i = 0; i < so_count; ++i) {
+                struct pipe_stream_output_target *target =
+                        (i < ctx->streamout.num_targets) ? ctx->streamout.targets[i] : NULL;
+
+                if (target) {
+                        panfrost_emit_streamout(ctx, &varyings[i], so.stride[i], ctx->streamout.offsets[i], output_count, target);
+                } else {
+                        /* Emit a dummy buffer */
+                        panfrost_emit_varyings(ctx, &varyings[i], so.stride[i] * 4, output_count);
+
+                        /* Clear the attribute type */
+                        varyings[i].elements &= ~0xF;
+                }
+        }
+
         panfrost_emit_varyings(ctx, &varyings[general], num_gen_varyings * 16,
                                vertex_count);
 
@@ -204,7 +304,7 @@ panfrost_emit_varying_descriptor(
                         panfrost_emit_varyings(ctx, &varyings[gl_PointSize],
                                                2, vertex_count);
 
-        if (fs->reads_point_coord)
+        if (reads_point_coord)
                 panfrost_emit_point_coord(&varyings[gl_PointCoord]);
 
         if (fs->reads_face)
@@ -221,6 +321,86 @@ panfrost_emit_varying_descriptor(
                 general, gl_Position, gl_PointSize,
                 gl_PointCoord, gl_FrontFacing);
 
+        /* Replace streamout */
+
+        struct mali_attr_meta *ovs = (struct mali_attr_meta *) (trans.cpu);
+        struct mali_attr_meta *ofs = (struct mali_attr_meta *) (trans.cpu + vs_size);
+
+        for (unsigned i = 0; i < vs->tripipe->varying_count; i++) {
+                gl_varying_slot loc = vs->varyings_loc[i];
+
+                bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
+                if (!captured) continue;
+
+                struct pipe_stream_output o = pan_get_so(so, loc);
+                ovs[i].index = o.output_buffer;
+
+                /* Set the type appropriately. TODO: Integer varyings XXX */
+                assert(o.stream == 0);
+                ovs[i].format = pan_xfb_format(o.num_components);
+                ovs[i].swizzle = panfrost_get_default_swizzle(o.num_components);
+
+                /* Link to the fragment */
+                signed fs_idx = -1;
+
+                /* Link up */
+                for (unsigned j = 0; j < fs->tripipe->varying_count; ++j) {
+                        if (fs->varyings_loc[j] == loc) {
+                                fs_idx = j;
+                                break;
+                        }
+                }
+
+                if (fs_idx >= 0) {
+                        ofs[fs_idx].index = ovs[i].index;
+                        ofs[fs_idx].format = ovs[i].format;
+                        ofs[fs_idx].swizzle = ovs[i].swizzle;
+                }
+        }
+
+        /* Replace point sprite */
+        for (unsigned i = 0; i < fs->tripipe->varying_count; i++) {
+                /* If we have a point sprite replacement, handle that here. We
+                 * have to translate location first.  TODO: Flip y in shader.
+                 * We're already keying ... just time crunch .. */
+
+                if (has_point_coord(fs->point_sprite_mask, fs->varyings_loc[i])) {
+                        ofs[i].index = gl_PointCoord;
+
+                        /* Swizzle out the z/w to 0/1 */
+                        ofs[i].format = MALI_RG16F;
+                        ofs[i].swizzle =
+                                panfrost_get_default_swizzle(2);
+                }
+        }
+
+        /* Fix up unaligned addresses */
+        for (unsigned i = 0; i < so_count; ++i) {
+                unsigned align = (varyings[i].elements & 63);
+
+                /* While we're at it, the SO buffers are linear */
+
+                if (!align) {
+                        varyings[i].elements |= MALI_ATTR_LINEAR;
+                        continue;
+                }
+
+                /* We need to adjust alignment */
+                varyings[i].elements &= ~63;
+                varyings[i].elements |= MALI_ATTR_LINEAR;
+                varyings[i].size += align;
+
+                for (unsigned v = 0; v < vs->tripipe->varying_count; ++v) {
+                        if (ovs[v].index == i)
+                                ovs[v].src_offset = vs->varyings[v].src_offset + align;
+                }
+
+                for (unsigned f = 0; f < fs->tripipe->varying_count; ++f) {
+                        if (ofs[f].index == i)
+                                ofs[f].src_offset = fs->varyings[f].src_offset + align;
+                }
+        }
+
         mali_ptr varyings_p = panfrost_upload_transient(ctx, &varyings, idx * sizeof(union mali_attr));
         ctx->payloads[PIPE_SHADER_VERTEX].postfix.varyings = varyings_p;
         ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.varyings = varyings_p;