+/* Emits one hardware attribute buffer record (union mali_attr) per vertex
+ * element and uploads the array, pointing vertex_postfix->attributes at the
+ * GPU copy. Instanced elements with NPOT divisors may expand to extra
+ * records, and two trailing records are always added for the special
+ * gl_VertexID/gl_InstanceID buffers. */
+void
+panfrost_emit_vertex_data(struct panfrost_batch *batch,
+ struct mali_vertex_tiler_postfix *vertex_postfix)
+{
+ struct panfrost_context *ctx = batch->ctx;
+ struct panfrost_vertex_state *so = ctx->vertex;
+
+ /* Staged mali_attr, and index into them. i =/= k, depending on the
+ * vertex buffer mask and instancing. Twice as much room is allocated,
+ * for a worst case of NPOT_DIVIDEs which take up extra slot */
+ union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
+ unsigned k = 0;
+
+ for (unsigned i = 0; i < so->num_elements; ++i) {
+ /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
+ * means duplicating some vertex buffers (who cares? aside from
+ * maybe some caching implications but I somehow doubt that
+ * matters) */
+
+ struct pipe_vertex_element *elem = &so->pipe[i];
+ unsigned vbi = elem->vertex_buffer_index;
+
+ /* The exception to 1:1 mapping is that we can have multiple
+ * entries (NPOT divisors), so we fixup anyways */
+
+ so->hw[i].index = k;
+
+ /* Skip elements whose vertex buffer is not bound.
+ * NOTE(review): hw[i].index was already written above, so for
+ * a skipped element it points at whichever record lands in
+ * slot k next -- presumably harmless since the element is
+ * unused, but confirm. */
+ if (!(ctx->vb_mask & (1 << vbi)))
+ continue;
+
+ struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
+ struct panfrost_resource *rsrc;
+
+ rsrc = pan_resource(buf->buffer.resource);
+ if (!rsrc)
+ continue;
+
+ /* Align to 64 bytes by masking off the lower bits. This
+ * will be adjusted back when we fixup the src_offset in
+ * mali_attr_meta */
+
+ mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
+ mali_ptr addr = raw_addr & ~63;
+ unsigned chopped_addr = raw_addr - addr;
+
+ /* Add a dependency of the batch on the vertex buffer */
+ panfrost_batch_add_bo(batch, rsrc->bo,
+ PAN_BO_ACCESS_SHARED |
+ PAN_BO_ACCESS_READ |
+ PAN_BO_ACCESS_VERTEX_TILER);
+
+ /* Set common fields */
+ attrs[k].elements = addr;
+ attrs[k].stride = buf->stride;
+
+ /* Since we advanced the base pointer, we shrink the buffer
+ * size */
+ attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
+
+ /* We need to add the extra size we masked off (for
+ * correctness) so the data doesn't get clamped away */
+ attrs[k].size += chopped_addr;
+
+ /* For non-instancing make sure we initialize */
+ attrs[k].shift = attrs[k].extra_flags = 0;
+
+ /* Instancing uses a dramatically different code path than
+ * linear, so dispatch for the actual emission now that the
+ * common code is finished */
+
+ unsigned divisor = elem->instance_divisor;
+
+ if (divisor && ctx->instance_count == 1) {
+ /* Silly corner case where there's a divisor(=1) but
+ * there's no legitimate instancing. So we want *every*
+ * attribute to be the same. So set stride to zero so
+ * we don't go anywhere. */
+
+ /* Size covers exactly one stride's worth of data
+ * (plus the masked-off bytes), so only element 0 is
+ * addressable */
+ attrs[k].size = attrs[k].stride + chopped_addr;
+ attrs[k].stride = 0;
+ attrs[k++].elements |= MALI_ATTR_LINEAR;
+ } else if (ctx->instance_count <= 1) {
+ /* Normal, non-instanced attributes */
+ attrs[k++].elements |= MALI_ATTR_LINEAR;
+ } else {
+ unsigned instance_shift = vertex_postfix->instance_shift;
+ unsigned instance_odd = vertex_postfix->instance_odd;
+
+ /* May emit more than one record (NPOT divisor), so
+ * advance k by however many were written */
+ k += panfrost_vertex_instanced(ctx->padded_count,
+ instance_shift,
+ instance_odd,
+ divisor, &attrs[k]);
+ }
+ }
+
+ /* Add special gl_VertexID/gl_InstanceID buffers */
+
+ panfrost_vertex_id(ctx->padded_count, &attrs[k]);
+ so->hw[PAN_VERTEX_ID].index = k++;
+ panfrost_instance_id(ctx->padded_count, &attrs[k]);
+ so->hw[PAN_INSTANCE_ID].index = k++;
+
+ /* Upload whatever we emitted and go */
+
+ vertex_postfix->attributes = panfrost_pool_upload(&batch->pool, attrs,
+ k * sizeof(*attrs));
+}
+
+/* Allocates a linear varying buffer of count elements of the given byte
+ * stride from the batch's pool, fills out the descriptor in *slot to point
+ * at it, and returns the GPU address of the fresh storage. */
+static mali_ptr
+panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
+ unsigned stride, unsigned count)
+{
+ /* Fill out the descriptor */
+ slot->stride = stride;
+ slot->size = stride * count;
+ slot->shift = slot->extra_flags = 0;
+
+ struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
+ slot->size);
+
+ slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
+
+ return transfer.gpu;
+}
+
+/* Computes the misalignment (low 6 bits) of a stream output target's write
+ * address: the bytes left over after rounding the address down to 64.
+ * stride is in dwords and offset in vertices, matching
+ * pipe_stream_output_info / streamout state. */
+static unsigned
+panfrost_streamout_offset(unsigned stride, unsigned offset,
+ struct pipe_stream_output_target *target)
+{
+ unsigned stride_bytes = stride * 4;
+ unsigned byte_offset = target->buffer_offset + (offset * stride_bytes);
+
+ return byte_offset & 63;
+}
+
+/* Fills out a varying buffer descriptor that aliases a transform feedback
+ * target, so varying writes land directly in the streamout buffer. stride
+ * is in dwords and offset in vertices (same units as
+ * panfrost_streamout_offset). The base address is rounded down to 64 bytes,
+ * with the chopped remainder added back into size. */
+static void
+panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
+ unsigned stride, unsigned offset, unsigned count,
+ struct pipe_stream_output_target *target)
+{
+ /* Fill out the descriptor */
+ slot->stride = stride * 4;
+ slot->shift = slot->extra_flags = 0;
+
+ unsigned max_size = target->buffer_size;
+ unsigned expected_size = slot->stride * count;
+
+ /* Grab the BO and bind it to the batch */
+ struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
+
+ /* Varyings are WRITE from the perspective of the VERTEX but READ from
+ * the perspective of the TILER and FRAGMENT.
+ */
+ panfrost_batch_add_bo(batch, bo,
+ PAN_BO_ACCESS_SHARED |
+ PAN_BO_ACCESS_RW |
+ PAN_BO_ACCESS_VERTEX_TILER |
+ PAN_BO_ACCESS_FRAGMENT);
+
+ /* We will have an offset applied to get alignment */
+ mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
+ slot->elements = (addr & ~63) | MALI_ATTR_LINEAR;
+ /* Clamp to the bound buffer's size so we never address past it */
+ slot->size = MIN2(max_size, expected_size) + (addr & 63);
+}
+
+/* Checks whether the point sprite coordinate replacement mask covers the
+ * given varying slot. Bits 0-7 of the mask correspond to TEX0-TEX7 and bit
+ * 8 to gl_PointCoord; every other slot is never replaced. */
+static bool
+has_point_coord(unsigned mask, gl_varying_slot loc)
+{
+ if (loc == VARYING_SLOT_PNTC)
+ return mask & (1 << 8);
+
+ if (loc < VARYING_SLOT_TEX0 || loc > VARYING_SLOT_TEX7)
+ return false;
+
+ return mask & (1 << (loc - VARYING_SLOT_TEX0));
+}
+
+/* Helpers for manipulating stream out information so we can pack varyings
+ * accordingly. Compute the src_offset for a given captured varying */
+
+/* Looks up the stream output record that captures varying slot loc. The
+ * caller must already know the varying is captured (e.g. via
+ * panfrost_xfb_captured or so_mask); otherwise this aborts. */
+static struct pipe_stream_output *
+pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
+{
+ for (unsigned i = 0; i < info->num_outputs; ++i) {
+ if (info->output[i].register_index == loc)
+ return &info->output[i];
+ }
+
+ unreachable("Varying not captured");
+}
+
+/* Returns the size in bytes of a single vector of the given varying format:
+ * bytes-per-channel times channel count. */
+static unsigned
+pan_varying_size(enum mali_format fmt)
+{
+ unsigned type = MALI_EXTRACT_TYPE(fmt);
+ unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
+ unsigned bits = MALI_EXTRACT_BITS(fmt);
+ unsigned bpc = 0;
+
+ if (bits == MALI_CHANNEL_FLOAT) {
+ /* No doubles */
+ /* NOTE(review): with the FLOAT bits code, the type field
+ * apparently distinguishes fp16 (encoded as SINT) from fp32
+ * (encoded as UNORM), per the assert below -- confirm against
+ * the mali_format encoding docs */
+ bool fp16 = (type == MALI_FORMAT_SINT);
+ assert(fp16 || (type == MALI_FORMAT_UNORM));
+
+ bpc = fp16 ? 2 : 4;
+ } else {
+ assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
+
+ /* See the enums */
+ /* The bits field encodes log2 of the channel size */
+ bits = 1 << bits;
+ assert(bits >= 8);
+ bpc = bits / 8;
+ }
+
+ return bpc * chan;
+}
+
+/* Indices for named (non-XFB) varyings that are present. These are packed
+ * tightly so they correspond to a bitfield present (P) indexed by (1 <<
+ * PAN_VARY_*). This has the nice property that you can lookup the buffer index
+ * of a given special field given a shift S by:
+ *
+ * idx = popcount(P & ((1 << S) - 1))
+ *
+ * That is, look at all of the varyings that come earlier in the mask and
+ * count them; that count is the index of this one. Likewise, the total number
+ * of special buffers required is simply popcount(P).
+ */
+
+/* Each value is a bit position in the "present" mask computed by
+ * pan_varying_present; PAN_VARY_GENERAL also serves as the slot for the
+ * packed general-purpose varying buffer. */
+enum pan_special_varying {
+ PAN_VARY_GENERAL = 0,
+ PAN_VARY_POSITION = 1,
+ PAN_VARY_PSIZ = 2,
+ PAN_VARY_PNTCOORD = 3,
+ PAN_VARY_FACE = 4,
+ PAN_VARY_FRAGCOORD = 5,
+
+ /* Keep last */
+ PAN_VARY_MAX,
+};
+
+/* Given a varying, figure out which index it corresponds to */
+
+/* Returns the buffer index of special varying v: the number of present
+ * varyings whose bits precede it in the mask (see pan_special_varying). */
+static inline unsigned
+pan_varying_index(unsigned present, enum pan_special_varying v)
+{
+ unsigned earlier = present & ((1 << v) - 1);
+
+ return util_bitcount(earlier);
+}
+
+/* Get the base offset for XFB buffers, which by convention come after
+ * everything else. Wrapper function for semantic reasons; by construction this
+ * is just popcount. */
+
+/* present is the special-varying bitmask from pan_varying_present */
+static inline unsigned
+pan_xfb_base(unsigned present)
+{
+ return util_bitcount(present);
+}
+
+/* Computes the present mask for varyings so we can start emitting varying records */
+
+/* Returns the bitmask of (1 << PAN_VARY_*) buffers required by the linked
+ * vertex/fragment shader pair. quirks carries device quirk bits such as
+ * IS_BIFROST. */
+static inline unsigned
+pan_varying_present(
+ struct panfrost_shader_state *vs,
+ struct panfrost_shader_state *fs,
+ unsigned quirks)
+{
+ /* At the moment we always emit general and position buffers. Not
+ * strictly necessary but usually harmless */
+
+ unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
+
+ /* Enable special buffers by the shader info */
+
+ if (vs->writes_point_size)
+ present |= (1 << PAN_VARY_PSIZ);
+
+ if (fs->reads_point_coord)
+ present |= (1 << PAN_VARY_PNTCOORD);
+
+ if (fs->reads_face)
+ present |= (1 << PAN_VARY_FACE);
+
+ /* NOTE(review): no FRAGCOORD buffer on Bifrost -- presumably
+ * gl_FragCoord is handled differently there; confirm */
+ if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
+ present |= (1 << PAN_VARY_FRAGCOORD);
+
+ /* Also, if we have a point sprite, we need a point coord buffer */
+
+ for (unsigned i = 0; i < fs->varying_count; i++) {
+ gl_varying_slot loc = fs->varyings_loc[i];
+
+ if (has_point_coord(fs->point_sprite_mask, loc))
+ present |= (1 << PAN_VARY_PNTCOORD);
+ }
+
+ return present;
+}
+
+/* Emitters for varying records */
+
+/* Builds a varying record (mali_attr_meta) referencing special buffer buf
+ * at the given byte offset, with the given hardware format. quirks selects
+ * the Midgard vs Bifrost encoding of the swizzle and magic bits. */
+static struct mali_attr_meta
+pan_emit_vary(unsigned present, enum pan_special_varying buf,
+ unsigned quirks, enum mali_format format,
+ unsigned offset)
+{
+ unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
+
+ struct mali_attr_meta meta = {
+ .index = pan_varying_index(present, buf),
+ /* Magic per-architecture constant; meaning unknown */
+ .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
+ .swizzle = quirks & HAS_SWIZZLES ?
+ panfrost_get_default_swizzle(nr_channels) :
+ panfrost_bifrost_swizzle(nr_channels),
+ .format = format,
+ .src_offset = offset
+ };
+
+ return meta;
+}
+
+/* General varying that is unused */
+
+/* Emits a record with the MALI_VARYING_DISCARD format, used when the
+ * producing stage writes a varying no consumer reads. */
+static struct mali_attr_meta
+pan_emit_vary_only(unsigned present, unsigned quirks)
+{
+ return pan_emit_vary(present, 0, quirks, MALI_VARYING_DISCARD, 0);
+}
+
+/* Special records */
+
+/* Fixed hardware formats for each special varying, indexed by
+ * pan_special_varying; PAN_VARY_GENERAL has no fixed format and is left
+ * zero here. */
+static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
+ [PAN_VARY_POSITION] = MALI_VARYING_POS,
+ [PAN_VARY_PSIZ] = MALI_R16F,
+ [PAN_VARY_PNTCOORD] = MALI_R16F,
+ [PAN_VARY_FACE] = MALI_R32I,
+ [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
+};
+
+/* Emits the record for a special (non-general, non-XFB) varying, using its
+ * canonical format from pan_varying_formats at offset 0. */
+static struct mali_attr_meta
+pan_emit_vary_special(unsigned present, enum pan_special_varying buf,
+ unsigned quirks)
+{
+ assert(buf < PAN_VARY_MAX);
+ return pan_emit_vary(present, buf, quirks, pan_varying_formats[buf], 0);
+}
+
+/* Promotes a varying format to the 32-bit-per-channel variant with nr
+ * channels, as used for transform feedback capture. */
+static enum mali_format
+pan_xfb_format(enum mali_format format, unsigned nr)
+{
+ if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
+ return MALI_R32F | MALI_NR_CHANNELS(nr);
+ else
+ return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
+}
+
+/* Transform feedback records. Note struct pipe_stream_output is (if packed as
+ * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
+ * value. */
+
+/* Emits a record that captures a varying into XFB buffer o.output_buffer,
+ * at the dword offset from the stream output info plus that buffer's
+ * alignment remainder (see panfrost_streamout_offset). */
+static struct mali_attr_meta
+pan_emit_vary_xfb(unsigned present,
+ unsigned max_xfb,
+ unsigned *streamout_offsets,
+ unsigned quirks,
+ enum mali_format format,
+ struct pipe_stream_output o)
+{
+ /* NOTE(review): max_xfb is unused in this function; kept for symmetry
+ * with the callers */
+
+ /* Otherwise construct a record for it */
+ struct mali_attr_meta meta = {
+ /* XFB buffers come after everything else */
+ .index = pan_xfb_base(present) + o.output_buffer,
+
+ /* As usual unknown bit */
+ .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
+
+ /* Override swizzle with number of channels */
+ .swizzle = quirks & HAS_SWIZZLES ?
+ panfrost_get_default_swizzle(o.num_components) :
+ panfrost_bifrost_swizzle(o.num_components),
+
+ /* Override number of channels and precision to highp */
+ .format = pan_xfb_format(format, o.num_components),
+
+ /* Apply given offsets together */
+ .src_offset = (o.dst_offset * 4) /* dwords */
+ + streamout_offsets[o.output_buffer]
+ };
+
+ return meta;
+}
+
+/* Determine if we should capture a varying for XFB. This requires actually
+ * having a buffer for it. If we don't capture it, we'll fallback to a general
+ * varying path (linked or unlinked, possibly discarding the write) */
+
+static bool
+panfrost_xfb_captured(struct panfrost_shader_state *xfb,
+ unsigned loc, unsigned max_xfb)
+{
+ /* loc can exceed 31, hence the 64-bit mask */
+ if (!(xfb->so_mask & (1ll << loc)))
+ return false;
+
+ /* Captured only if a buffer is actually bound for its target */
+ struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
+ return o->output_buffer < max_xfb;
+}
+
+/* Higher-level wrapper around all of the above, classifying a varying into one
+ * of the above types */
+
+/* Emits the record for varying idx of stage. stage/other are the
+ * producer/consumer pair as seen by the caller (vs/fs on the first pass,
+ * fs/vs on the second); xfb owns the stream output info. When should_alloc
+ * is set (first pass), general varyings are packed into the general buffer
+ * by bumping *gen_stride; offsets are recorded in gen_offsets keyed by the
+ * vertex shader's varying index and formats in gen_formats keyed by the
+ * fragment shader's index, so the second pass can look them up (see the two
+ * loops in panfrost_emit_varying_descriptor). */
+static struct mali_attr_meta
+panfrost_emit_varying(
+ struct panfrost_shader_state *stage,
+ struct panfrost_shader_state *other,
+ struct panfrost_shader_state *xfb,
+ unsigned present,
+ unsigned max_xfb,
+ unsigned *streamout_offsets,
+ unsigned quirks,
+ unsigned *gen_offsets,
+ enum mali_format *gen_formats,
+ unsigned *gen_stride,
+ unsigned idx,
+ bool should_alloc,
+ bool is_fragment)
+{
+ gl_varying_slot loc = stage->varyings_loc[idx];
+ enum mali_format format = stage->varyings[idx];
+
+ /* Override format to match linkage */
+ if (!should_alloc && gen_formats[idx])
+ format = gen_formats[idx];
+
+ if (has_point_coord(stage->point_sprite_mask, loc)) {
+ return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
+ } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
+ struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
+ return pan_emit_vary_xfb(present, max_xfb, streamout_offsets, quirks, format, *o);
+ } else if (loc == VARYING_SLOT_POS) {
+ if (is_fragment)
+ return pan_emit_vary_special(present, PAN_VARY_FRAGCOORD, quirks);
+ else
+ return pan_emit_vary_special(present, PAN_VARY_POSITION, quirks);
+ } else if (loc == VARYING_SLOT_PSIZ) {
+ return pan_emit_vary_special(present, PAN_VARY_PSIZ, quirks);
+ } else if (loc == VARYING_SLOT_PNTC) {
+ return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
+ } else if (loc == VARYING_SLOT_FACE) {
+ return pan_emit_vary_special(present, PAN_VARY_FACE, quirks);
+ }
+
+ /* We've exhausted special cases, so it's otherwise a general varying. Check if we're linked */
+ signed other_idx = -1;
+
+ for (unsigned j = 0; j < other->varying_count; ++j) {
+ if (other->varyings_loc[j] == loc) {
+ other_idx = j;
+ break;
+ }
+ }
+
+ if (other_idx < 0)
+ return pan_emit_vary_only(present, quirks);
+
+ /* On the second pass this is the offset the first pass recorded
+ * under the producer's index; on the first pass it is recomputed
+ * below */
+ unsigned offset = gen_offsets[other_idx];
+
+ if (should_alloc) {
+ /* We're linked, so allocate a space via a watermark allocation */
+ enum mali_format alt = other->varyings[other_idx];
+
+ /* Do interpolation at minimum precision */
+ unsigned size_main = pan_varying_size(format);
+ unsigned size_alt = pan_varying_size(alt);
+ unsigned size = MIN2(size_main, size_alt);
+
+ /* If a varying is marked for XFB but not actually captured, we
+ * should match the format to the format that would otherwise
+ * be used for XFB, since dEQP checks for invariance here. It's
+ * unclear if this is required by the spec. */
+
+ if (xfb->so_mask & (1ull << loc)) {
+ struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
+ format = pan_xfb_format(format, o->num_components);
+ size = pan_varying_size(format);
+ } else if (size == size_alt) {
+ format = alt;
+ }
+
+ /* Offsets keyed by this (vertex) stage's index, formats by
+ * the other (fragment) stage's index -- see header comment */
+ gen_offsets[idx] = *gen_stride;
+ gen_formats[other_idx] = format;
+ offset = *gen_stride;
+ *gen_stride += size;
+ }
+
+ return pan_emit_vary(present, PAN_VARY_GENERAL,
+ quirks, format, offset);
+}
+
+/* Writes the buffer descriptor for special input v at its computed index,
+ * if v is present. The slot is written exactly once, from a fully zeroed
+ * temporary, with only the magic elements address set. */
+static void
+pan_emit_special_input(union mali_attr *varyings,
+ unsigned present,
+ enum pan_special_varying v,
+ mali_ptr addr)
+{
+ if (!(present & (1 << v)))
+ return;
+
+ /* Write once, fields zeroed, to avoid flakes */
+ union mali_attr slot = {
+ .elements = addr
+ };
+
+ varyings[pan_varying_index(present, v)] = slot;
+}
+
+/* Emits the varying records for both shader stages and the varying buffer
+ * descriptors they reference: allocates backing storage for the general,
+ * position and (if present) point size buffers, binds any streamout
+ * targets, and writes the resulting GPU pointers into the vertex/tiler
+ * postfixes and primitive_size. */
+void
+panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
+ unsigned vertex_count,
+ struct mali_vertex_tiler_postfix *vertex_postfix,
+ struct mali_vertex_tiler_postfix *tiler_postfix,
+ union midgard_primitive_size *primitive_size)
+{
+ /* Load the shaders */
+ struct panfrost_context *ctx = batch->ctx;
+ struct panfrost_device *dev = pan_device(ctx->base.screen);
+ struct panfrost_shader_state *vs, *fs;
+ size_t vs_size, fs_size;
+
+ /* Allocate the varying descriptor */
+
+ vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
+ fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
+ vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
+ fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
+
+ /* trans holds the per-varying records for both stages, vs first */
+ struct panfrost_transfer trans = panfrost_pool_alloc(&batch->pool,
+ vs_size +
+ fs_size);
+
+ struct pipe_stream_output_info *so = &vs->stream_output;
+ unsigned present = pan_varying_present(vs, fs, dev->quirks);
+
+ /* Check if this varying is linked by us. This is the case for
+ * general-purpose, non-captured varyings. If it is, link it. If it's
+ * not, use the provided stream out information to determine the
+ * offset, since it was already linked for us. */
+
+ unsigned gen_offsets[32];
+ enum mali_format gen_formats[32];
+ memset(gen_offsets, 0, sizeof(gen_offsets));
+ memset(gen_formats, 0, sizeof(gen_formats));
+
+ unsigned gen_stride = 0;
+ assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
+ assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
+
+ /* Alignment remainders for each bound streamout buffer */
+ unsigned streamout_offsets[32];
+
+ for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
+ streamout_offsets[i] = panfrost_streamout_offset(
+ so->stride[i],
+ ctx->streamout.offsets[i],
+ ctx->streamout.targets[i]);
+ }
+
+ struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
+ struct mali_attr_meta *ofs = ovs + vs->varying_count;
+
+ /* First pass (vertex stage) links general varyings and allocates
+ * offsets in the general buffer (should_alloc = true) */
+ for (unsigned i = 0; i < vs->varying_count; i++) {
+ ovs[i] = panfrost_emit_varying(vs, fs, vs, present,
+ ctx->streamout.num_targets, streamout_offsets,
+ dev->quirks,
+ gen_offsets, gen_formats, &gen_stride, i, true, false);
+ }
+
+ /* Second pass (fragment stage) reuses the recorded offsets/formats */
+ for (unsigned i = 0; i < fs->varying_count; i++) {
+ ofs[i] = panfrost_emit_varying(fs, vs, vs, present,
+ ctx->streamout.num_targets, streamout_offsets,
+ dev->quirks,
+ gen_offsets, gen_formats, &gen_stride, i, false, true);
+ }
+
+ /* T holds the buffer descriptors: special buffers first, then one per
+ * streamout target */
+ unsigned xfb_base = pan_xfb_base(present);
+ struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
+ sizeof(union mali_attr) * (xfb_base + ctx->streamout.num_targets));
+ union mali_attr *varyings = (union mali_attr *) T.cpu;
+
+ /* Emit the stream out buffers */
+
+ unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
+ ctx->vertex_count);
+
+ for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
+ panfrost_emit_streamout(batch, &varyings[xfb_base + i],
+ so->stride[i],
+ ctx->streamout.offsets[i],
+ out_count,
+ ctx->streamout.targets[i]);
+ }
+
+ panfrost_emit_varyings(batch,
+ &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
+ gen_stride, vertex_count);
+
+ /* fp32 vec4 gl_Position */
+ tiler_postfix->position_varying = panfrost_emit_varyings(batch,
+ &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
+ sizeof(float) * 4, vertex_count);
+
+ /* fp16 point size, matching MALI_R16F in pan_varying_formats */
+ if (present & (1 << PAN_VARY_PSIZ)) {
+ primitive_size->pointer = panfrost_emit_varyings(batch,
+ &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
+ 2, vertex_count);
+ }
+
+ pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_VARYING_POINT_COORD);
+ pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_VARYING_FRONT_FACING);
+ pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_VARYING_FRAG_COORD);
+
+ vertex_postfix->varyings = T.gpu;
+ tiler_postfix->varyings = T.gpu;
+
+ vertex_postfix->varying_meta = trans.gpu;
+ tiler_postfix->varying_meta = trans.gpu + vs_size;
+}
+