+/* Make sure every stream-out varying has an entry in the linkage map.
+ * Varyings that the fragment shader does not consume would otherwise be
+ * stripped out and would not be available for stream-out.
+ *
+ * l: linkage map, extended/updated in place
+ * v: shader variant whose stream-output info is being linked
+ */
+static void
+tu6_link_streamout(struct ir3_shader_linkage *l,
+                   const struct ir3_shader_variant *v)
+{
+   const struct ir3_stream_output_info *so = &v->shader->stream_output;
+
+   for (unsigned n = 0; n < so->num_outputs; n++) {
+      const struct ir3_stream_output *so_out = &so->output[n];
+      const unsigned reg = so_out->register_index;
+      const unsigned comp_end =
+         so_out->num_components + so_out->start_component;
+      /* mask covering components 0 .. comp_end-1 */
+      const unsigned wanted = (1 << comp_end) - 1;
+
+      /* Position and point size have to be the last entries in the
+       * linkage map and are added elsewhere, so they are skipped here.
+       */
+      const bool is_pos_or_psiz =
+         v->outputs[reg].slot == VARYING_SLOT_PSIZ ||
+         v->outputs[reg].slot == VARYING_SLOT_POS;
+      if (is_pos_or_psiz)
+         continue;
+
+      /* Look for an existing entry with the same register.  While
+       * scanning, track the first location past every entry seen so far
+       * (4 locations are reserved per entry).
+       */
+      unsigned entry = 0, free_loc = 0;
+      while (entry < l->cnt &&
+             l->var[entry].regid != v->outputs[reg].regid) {
+         free_loc = MAX2(free_loc, l->var[entry].loc + 4);
+         entry++;
+      }
+
+      /* Not in the linkage map yet: append it.  The new entry is expected
+       * to land at index 'entry' (the old l->cnt).
+       */
+      if (entry == l->cnt)
+         ir3_link_add(l, v->outputs[reg].regid, wanted, free_loc);
+
+      /* Stream-out may need components the fragment shader does not
+       * consume; widen the component mask and max_loc to cover them.
+       */
+      if (wanted & ~l->var[entry].compmask) {
+         l->var[entry].compmask |= wanted;
+         l->max_loc = MAX2(l->max_loc, l->var[entry].loc +
+                           util_last_bit(l->var[entry].compmask));
+      }
+   }
+}
+
+/* Build the stream-out state 'tf' from the variant's stream-output info
+ * and the linkage map: per-buffer strides and component counts, the
+ * VPC_SO_PROG words and the VPC_SO_BUF_CNTL value.
+ *
+ * v:  shader variant providing the stream-output info and output regids
+ * l:  linkage map used to find the location of each stream-out varying
+ * tf: output state; fully overwritten (zeroed first)
+ */
+static void
+tu6_setup_streamout(const struct ir3_shader_variant *v,
+                    struct ir3_shader_linkage *l,
+                    struct tu_streamout_state *tf)
+{
+   const struct ir3_stream_output_info *so = &v->shader->stream_output;
+
+   memset(tf, 0, sizeof(*tf));
+
+   /* each prog word holds two locations (an A half and a B half) */
+   tf->prog_count = align(l->max_loc, 2) / 2;
+   debug_assert(tf->prog_count < ARRAY_SIZE(tf->prog));
+
+   /* copy the per-buffer strides into the streamout state */
+   for (unsigned buf = 0; buf < IR3_MAX_SO_BUFFERS; buf++)
+      tf->stride[buf] = so->stride[buf];
+
+   for (unsigned n = 0; n < so->num_outputs; n++) {
+      const struct ir3_stream_output *so_out = &so->output[n];
+      const unsigned reg = so_out->register_index;
+
+      tf->ncomp[so_out->output_buffer] += so_out->num_components;
+
+      /* The linkage map is ordered the way the fragment shader wants
+       * it, so a linear search by regid is needed here.
+       */
+      unsigned entry = 0;
+      while (entry < l->cnt &&
+             l->var[entry].regid != v->outputs[reg].regid)
+         entry++;
+
+      debug_assert(entry < l->cnt);
+
+      for (unsigned comp = 0; comp < so_out->num_components; comp++) {
+         const unsigned loc =
+            l->var[entry].loc + (comp + so_out->start_component);
+         /* destination offset in dwords; scaled by 4 when programmed */
+         const unsigned dword_off = comp + so_out->dst_offset;
+         uint32_t half;
+
+         /* odd locations use the B fields, even locations the A fields */
+         if (loc & 1) {
+            half = A6XX_VPC_SO_PROG_B_EN |
+                   A6XX_VPC_SO_PROG_B_BUF(so_out->output_buffer) |
+                   A6XX_VPC_SO_PROG_B_OFF(dword_off * 4);
+         } else {
+            half = A6XX_VPC_SO_PROG_A_EN |
+                   A6XX_VPC_SO_PROG_A_BUF(so_out->output_buffer) |
+                   A6XX_VPC_SO_PROG_A_OFF(dword_off * 4);
+         }
+
+         tf->prog[loc / 2] |= half;
+      }
+   }
+
+   /* enable stream-out, plus each buffer that receives any component */
+   uint32_t buf_cntl = A6XX_VPC_SO_BUF_CNTL_ENABLE;
+   if (tf->ncomp[0] > 0)
+      buf_cntl |= A6XX_VPC_SO_BUF_CNTL_BUF0;
+   if (tf->ncomp[1] > 0)
+      buf_cntl |= A6XX_VPC_SO_BUF_CNTL_BUF1;
+   if (tf->ncomp[2] > 0)
+      buf_cntl |= A6XX_VPC_SO_BUF_CNTL_BUF2;
+   if (tf->ncomp[3] > 0)
+      buf_cntl |= A6XX_VPC_SO_BUF_CNTL_BUF3;
+   tf->vpc_so_buf_cntl = buf_cntl;
+}
+
+/* Emit a CP_LOAD_STATE6-style packet that loads constants directly from
+ * the command stream (SS6_DIRECT).
+ *
+ * cs:     command stream to emit into
+ * opcode: packet opcode passed to tu_cs_emit_pkt7()
+ * base:   destination offset, programmed into the DST_OFF field
+ * block:  state block the constants are loaded into
+ * offset: byte offset into 'dwords' at which the payload starts
+ * size:   number of dwords to emit; must be a multiple of 4 (NUM_UNIT is
+ *         programmed as size / 4)
+ * dwords: source data
+ */
+static void
+tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base,
+               enum a6xx_state_block block, uint32_t offset,
+               uint32_t size, uint32_t *dwords)
+{
+   assert(size % 4 == 0);
+
+   /* three header dwords followed by the payload */
+   tu_cs_emit_pkt7(cs, opcode, 3 + size);
+
+   const uint32_t header = CP_LOAD_STATE6_0_DST_OFF(base) |
+                           CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                           CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                           CP_LOAD_STATE6_0_STATE_BLOCK(block) |
+                           CP_LOAD_STATE6_0_NUM_UNIT(size / 4);
+   tu_cs_emit(cs, header);
+
+   /* the external source address is written as zero for direct loads */
+   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+
+   /* 'offset' is in bytes, so step through a byte pointer */
+   uint32_t *payload = (uint32_t *)((uint8_t *)dwords + offset);
+
+   tu_cs_emit_array(cs, payload, size);
+}
+
+/* Upload the link map produced by ir3_link_geometry_stages() for the
+ * producer/consumer pair into the consumer's constants, starting at
+ * const_state->offsets.primitive_map, using the SB6_GS_SHADER state block.
+ *
+ * Only the part of the map that fits below consumer->constlen is
+ * uploaded.  If nothing fits (the map starts at or beyond constlen, or
+ * the map is empty) no packet is emitted.  Without this guard the size
+ * computation wraps around and a packet with a bogus, huge payload size
+ * would be emitted.
+ *
+ * cs:       command stream to emit into
+ * producer: variant of the stage writing the varyings
+ * consumer: variant of the stage reading them; receives the constants
+ */
+static void
+tu6_emit_link_map(struct tu_cs *cs,
+                  const struct ir3_shader_variant *producer,
+                  const struct ir3_shader_variant *consumer)
+{
+   const struct ir3_const_state *const_state = &consumer->shader->const_state;
+   /* base and constlen are presumably in vec4 units, like the rounded-up
+    * location count below -- TODO confirm against ir3_const_state.
+    */
+   const uint32_t base = const_state->offsets.primitive_map;
+   const uint32_t constlen = consumer->constlen;
+   uint32_t patch_locs[MAX_VARYING] = { 0 };
+   const uint32_t num_loc =
+      ir3_link_geometry_stages(producer, consumer, patch_locs);
+
+   /* the map lies entirely outside the consumer's constants */
+   if (base >= constlen)
+      return;
+
+   /* clamp to what fits; same as MIN2(n + base, constlen) - base when
+    * base < constlen, but without relying on unsigned wrap-around
+    */
+   const uint32_t vec4_count =
+      MIN2(DIV_ROUND_UP(num_loc, 4), constlen - base);
+   if (vec4_count == 0)
+      return;
+
+   /* tu6_emit_const() takes its size in dwords: 4 per unit */
+   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, SB6_GS_SHADER, 0,
+                  vec4_count * 4, patch_locs);
+}
+
+/* Translate a GL primitive enum into the matching TESS_* value.
+ *
+ * Only GL_POINTS, GL_LINE_STRIP and GL_TRIANGLE_STRIP are handled; any
+ * other value hits unreachable().
+ */
+static uint16_t
+gl_primitive_to_tess(uint16_t primitive)
+{
+   if (primitive == GL_POINTS)
+      return TESS_POINTS;
+
+   if (primitive == GL_LINE_STRIP)
+      return TESS_LINES;
+
+   if (primitive == GL_TRIANGLE_STRIP)
+      return TESS_CW_TRIS;
+
+   unreachable("");
+}
+