From 29cfd154e387c5acb6c4827afc826503966bfd4b Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 9 Aug 2019 12:32:49 -0700 Subject: [PATCH] panfrost: Implement transform feedback Midgard has no hardware support for transform feedback, so we simulate it in software. Lucky us. What Midgard does do is write out vertex shader outputs to main memory unconditionally. Fragment shaders read varyings back from main memory; there's no on-chip storage for varyings. Whether this was a reasonable design is a question I will not be engaging in this commit message. What that does mean is that, in some sense, Midgard *always* does transform feedback unconditionally, and there's no way to turn off transform feedback. Normally, we would allocate some scratch memory every frame to store the varyings in an arbitrary format (interleaved for simplicity), and then feed that scratch to the fragment shader and discard when the rendering completes. The only difference now is that sometimes, for some buffers, we use a BO provided to us by Gallium and a format provided by Gallium, instead of allocating the memory and choosing the format ourselves. This has some limitations -- in particular, it only works at vec4 granularity, so a corresponding GLSL linkage patch is needed to correctly implement transform feedback for non-vec4 types. Nevertheless, given the hardware already works in this admittedly-bizarre fashion, transform feedback is "free". Or, at least, it's no more expensive than any other rendering. Specifically not implemented is dynamically-sized transform feedback (i.e. with geometry/tessellation shaders). Spoiler alert: Midgard has no support for geometry *or* tessellation shaders, despite advertising support. They get compiled to *massive* compute shaders. How's that for checkbox compliance? 
Signed-off-by: Alyssa Rosenzweig Reviewed-by: Boris Brezillon --- src/gallium/drivers/panfrost/pan_varyings.c | 254 +++++++++++++++++--- 1 file changed, 217 insertions(+), 37 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_varyings.c b/src/gallium/drivers/panfrost/pan_varyings.c index 40d7d98bf65..69e9e6d036d 100644 --- a/src/gallium/drivers/panfrost/pan_varyings.c +++ b/src/gallium/drivers/panfrost/pan_varyings.c @@ -24,6 +24,7 @@ */ #include "pan_context.h" +#include "util/u_prim.h" static mali_ptr panfrost_emit_varyings( @@ -45,6 +46,33 @@ panfrost_emit_varyings( return transfer.gpu; } +static void +panfrost_emit_streamout( + struct panfrost_context *ctx, + union mali_attr *slot, + unsigned stride, + unsigned offset, + unsigned count, + struct pipe_stream_output_target *target) +{ + /* Fill out the descriptor */ + slot->stride = stride * 4; + slot->shift = slot->extra_flags = 0; + + unsigned max_size = target->buffer_size; + unsigned expected_size = slot->stride * count; + + slot->size = MIN2(max_size, expected_size); + + /* Grab the BO and bind it to the batch */ + struct panfrost_job *batch = panfrost_get_job_for_fbo(ctx); + struct panfrost_bo *bo = pan_resource(target->buffer)->bo; + panfrost_job_add_bo(batch, bo); + + mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride); + slot->elements = addr; +} + static void panfrost_emit_point_coord(union mali_attr *slot) { @@ -110,6 +138,44 @@ panfrost_emit_varying_meta( } } +static bool +has_point_coord(unsigned mask, gl_varying_slot loc) +{ + if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7)) + return (mask & (1 << (loc - VARYING_SLOT_TEX0))); + else if (loc == VARYING_SLOT_PNTC) + return (mask & (1 << 8)); + else + return false; +} + +/* Helpers for manipulating stream out information so we can pack varyings + * accordingly. 
Compute the src_offset for a given captured varying */ + +static struct pipe_stream_output +pan_get_so(struct pipe_stream_output_info info, gl_varying_slot loc) +{ + for (unsigned i = 0; i < info.num_outputs; ++i) { + if (info.output[i].register_index == loc) + return info.output[i]; + } + + unreachable("Varying not captured"); +} + +/* TODO: Integers */ +static enum mali_format +pan_xfb_format(unsigned nr_components) +{ + switch (nr_components) { + case 1: return MALI_R32F; + case 2: return MALI_RG32F; + case 3: return MALI_RGB32F; + case 4: return MALI_RGBA32F; + default: unreachable("Invalid format"); + } +} + void panfrost_emit_varying_descriptor( struct panfrost_context *ctx, @@ -129,53 +195,55 @@ panfrost_emit_varying_descriptor( struct panfrost_transfer trans = panfrost_allocate_transient(ctx, vs_size + fs_size); - for (unsigned i = 0; i < vs->tripipe->varying_count; i++) { - if (!is_special_varying(vs->varyings_loc[i])) - vs->varyings[i].src_offset = 16 * (num_gen_varyings++); - } - - for (unsigned i = 0; i < fs->tripipe->varying_count; i++) { - unsigned j; + struct pipe_stream_output_info so = vs->stream_output; - /* If we have a point sprite replacement, handle that here. We - * have to translate location first. TODO: Flip y in shader. - * We're already keying ... just time crunch .. */ + /* Check if this varying is linked by us. This is the case for + * general-purpose, non-captured varyings. If it is, link it. If it's + * not, use the provided stream out information to determine the + * offset, since it was already linked for us. */ - unsigned loc = fs->varyings_loc[i]; - unsigned pnt_loc = - (loc >= VARYING_SLOT_TEX0) ? (loc - VARYING_SLOT_TEX0) : - (loc == VARYING_SLOT_PNTC) ? 
8 : - ~0; + for (unsigned i = 0; i < vs->tripipe->varying_count; i++) { + gl_varying_slot loc = vs->varyings_loc[i]; - if (~pnt_loc && fs->point_sprite_mask & (1 << pnt_loc)) { - /* gl_PointCoord index by convention */ - fs->varyings[i].index = 3; - fs->reads_point_coord = true; + bool special = is_special_varying(loc); + bool captured = ((vs->so_mask & (1ll << loc)) ? true : false); - /* Swizzle out the z/w to 0/1 */ - fs->varyings[i].format = MALI_RG16F; - fs->varyings[i].swizzle = - panfrost_get_default_swizzle(2); + if (captured) { + struct pipe_stream_output o = pan_get_so(so, loc); - continue; + unsigned dst_offset = o.dst_offset * 4; /* dwords */ + vs->varyings[i].src_offset = dst_offset; + } else if (!special) { + vs->varyings[i].src_offset = 16 * (num_gen_varyings++); } + } - if (fs->varyings[i].index) - continue; + /* Conversely, we need to set src_offset for the captured varyings. + * Here, the layout is defined by the stream out info, not us */ + + /* Link up with fragment varyings */ + bool reads_point_coord = fs->reads_point_coord; + + for (unsigned i = 0; i < fs->tripipe->varying_count; i++) { + gl_varying_slot loc = fs->varyings_loc[i]; + signed vs_idx = -1; - /* - * Re-use the VS general purpose varying pos if it exists, - * create a new one otherwise. 
- */ - for (j = 0; j < vs->tripipe->varying_count; j++) { - if (fs->varyings_loc[i] == vs->varyings_loc[j]) + /* Link up */ + for (unsigned j = 0; j < vs->tripipe->varying_count; ++j) { + if (vs->varyings_loc[j] == loc) { + vs_idx = j; break; + } } - if (j < vs->tripipe->varying_count) - fs->varyings[i].src_offset = vs->varyings[j].src_offset; + /* Either assign or reuse */ + if (vs_idx >= 0) + fs->varyings[i].src_offset = vs->varyings[vs_idx].src_offset; else fs->varyings[i].src_offset = 16 * (num_gen_varyings++); + + if (has_point_coord(fs->point_sprite_mask, loc)) + reads_point_coord |= true; } memcpy(trans.cpu, vs->varyings, vs_size); @@ -183,13 +251,45 @@ panfrost_emit_varying_descriptor( union mali_attr varyings[PIPE_MAX_ATTRIBS]; - unsigned idx = 0; + /* Figure out how many streamout buffers could be bound */ + unsigned so_count = ctx->streamout.num_targets; + for (unsigned i = 0; i < vs->tripipe->varying_count; i++) { + gl_varying_slot loc = vs->varyings_loc[i]; + + bool captured = ((vs->so_mask & (1ll << loc)) ? true : false); + if (!captured) continue; + + struct pipe_stream_output o = pan_get_so(so, loc); + so_count = MAX2(so_count, o.output_buffer + 1); + } + + signed idx = so_count; signed general = idx++; signed gl_Position = idx++; signed gl_PointSize = vs->writes_point_size ? (idx++) : -1; - signed gl_PointCoord = fs->reads_point_coord ? (idx++) : -1; + signed gl_PointCoord = reads_point_coord ? (idx++) : -1; signed gl_FrontFacing = fs->reads_face ? (idx++) : -1; + /* Emit the stream out buffers */ + + unsigned output_count = u_stream_outputs_for_vertices( + ctx->active_prim, ctx->vertex_count); + + for (unsigned i = 0; i < so_count; ++i) { + struct pipe_stream_output_target *target = + (i < ctx->streamout.num_targets) ? 
ctx->streamout.targets[i] : NULL; + + if (target) { + panfrost_emit_streamout(ctx, &varyings[i], so.stride[i], ctx->streamout.offsets[i], output_count, target); + } else { + /* Emit a dummy buffer */ + panfrost_emit_varyings(ctx, &varyings[i], so.stride[i] * 4, output_count); + + /* Clear the attribute type */ + varyings[i].elements &= ~0xF; + } + } + panfrost_emit_varyings(ctx, &varyings[general], num_gen_varyings * 16, vertex_count); @@ -204,7 +304,7 @@ panfrost_emit_varying_descriptor( panfrost_emit_varyings(ctx, &varyings[gl_PointSize], 2, vertex_count); - if (fs->reads_point_coord) + if (reads_point_coord) panfrost_emit_point_coord(&varyings[gl_PointCoord]); if (fs->reads_face) @@ -221,6 +321,86 @@ panfrost_emit_varying_descriptor( general, gl_Position, gl_PointSize, gl_PointCoord, gl_FrontFacing); + /* Replace streamout */ + + struct mali_attr_meta *ovs = (struct mali_attr_meta *) (trans.cpu); + struct mali_attr_meta *ofs = (struct mali_attr_meta *) (trans.cpu + vs_size); + + for (unsigned i = 0; i < vs->tripipe->varying_count; i++) { + gl_varying_slot loc = vs->varyings_loc[i]; + + bool captured = ((vs->so_mask & (1ll << loc)) ? true : false); + if (!captured) continue; + + struct pipe_stream_output o = pan_get_so(so, loc); + ovs[i].index = o.output_buffer; + + /* Set the type appropriately. TODO: Integer varyings XXX */ + assert(o.stream == 0); + ovs[i].format = pan_xfb_format(o.num_components); + ovs[i].swizzle = panfrost_get_default_swizzle(o.num_components); + + /* Link to the fragment */ + signed fs_idx = -1; + + /* Link up */ + for (unsigned j = 0; j < fs->tripipe->varying_count; ++j) { + if (fs->varyings_loc[j] == loc) { + fs_idx = j; + break; + } + } + + if (fs_idx >= 0) { + ofs[fs_idx].index = ovs[i].index; + ofs[fs_idx].format = ovs[i].format; + ofs[fs_idx].swizzle = ovs[i].swizzle; + } + } + + /* Replace point sprite */ + for (unsigned i = 0; i < fs->tripipe->varying_count; i++) { + /* If we have a point sprite replacement, handle that here. 
We + * have to translate location first. TODO: Flip y in shader. + * We're already keying ... just time crunch .. */ + + if (has_point_coord(fs->point_sprite_mask, fs->varyings_loc[i])) { + ofs[i].index = gl_PointCoord; + + /* Swizzle out the z/w to 0/1 */ + ofs[i].format = MALI_RG16F; + ofs[i].swizzle = + panfrost_get_default_swizzle(2); + } + } + + /* Fix up unaligned addresses */ + for (unsigned i = 0; i < so_count; ++i) { + unsigned align = (varyings[i].elements & 63); + + /* While we're at it, the SO buffers are linear */ + + if (!align) { + varyings[i].elements |= MALI_ATTR_LINEAR; + continue; + } + + /* We need to adjust alignment */ + varyings[i].elements &= ~63; + varyings[i].elements |= MALI_ATTR_LINEAR; + varyings[i].size += align; + + for (unsigned v = 0; v < vs->tripipe->varying_count; ++v) { + if (ovs[v].index == i) + ovs[v].src_offset = vs->varyings[v].src_offset + align; + } + + for (unsigned f = 0; f < fs->tripipe->varying_count; ++f) { + if (ofs[f].index == i) + ofs[f].src_offset = fs->varyings[f].src_offset + align; + } + } + mali_ptr varyings_p = panfrost_upload_transient(ctx, &varyings, idx * sizeof(union mali_attr)); ctx->payloads[PIPE_SHADER_VERTEX].postfix.varyings = varyings_p; ctx->payloads[PIPE_SHADER_FRAGMENT].postfix.varyings = varyings_p; -- 2.30.2