+ }
+
+ assert(color_reads_for_sample[component].file != QFILE_NULL);
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_MOV(c, color_reads_for_sample[component]));
+}
+
+static void
+ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
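+ /* A constant offset lets us index the uniform stream
+ * directly; otherwise we fall back to a general TMU access.
+ */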
+ if (nir_src_is_const(instr->src[0])) {
+ int offset = (nir_intrinsic_base(instr) +
+ nir_src_as_uint(instr->src[0]));
+ assert(offset % 4 == 0);
+ /* Convert the byte offset to a dword index into the uniform stream. */
+ offset = offset / 4;
+ for (int i = 0; i < instr->num_components; i++) {
+ ntq_store_dest(c, &instr->dest, i,
+ vir_uniform(c, QUNIFORM_UNIFORM,
+ offset + i));
+ }
+ } else {
+ ntq_emit_tmu_general(c, instr, false);
+ }
+}
+
+static void
+ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset)
+ * and enable PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR.
+ */
+ unsigned offset =
+ nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
+
+ if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) {
+ /* Emit the LDVPM directly now, rather than at the top
+ * of the shader like we did for V3D 3.x (which needs
+ * vpmsetup when not just taking the next offset).
+ *
+ * Note that delaying like this may introduce stalls,
+ * as LDVPMV takes a minimum of 1 instruction but may
+ * be slower if the VPM unit is busy with another QPU's
+ * access.
+ */
+ int index = 0;
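+ /* InstanceID/VertexID, when read, occupy the first VPM
+ * slots, so the attribute data starts after them.
+ */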
+ if (c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_INSTANCE_ID)) {
+ index++;
+ }
+ if (c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_VERTEX_ID)) {
+ index++;
+ }
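+ /* Skip the VPM words used by all lower-numbered attributes. */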
+ for (int i = 0; i < offset; i++)
+ index += c->vattr_sizes[i];
+ index += nir_intrinsic_component(instr);
+ for (int i = 0; i < instr->num_components; i++) {
+ struct qreg vpm_offset = vir_uniform_ui(c, index++);
+ ntq_store_dest(c, &instr->dest, i,
+ vir_LDVPMV_IN(c, vpm_offset));
+ }
+ } else {
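+ /* V3D 3.x (and fragment shaders) load all inputs into
+ * c->inputs at the top of the shader, so just move the
+ * requested components into place.
+ */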
+ for (int i = 0; i < instr->num_components; i++) {
+ int comp = nir_intrinsic_component(instr) + i;
+ ntq_store_dest(c, &instr->dest, i,
+ vir_MOV(c, c->inputs[offset * 4 + comp]));
+ }
+ }
+}
+
+static void
+ntq_emit_per_sample_color_write(struct v3d_compile *c,
+ nir_intrinsic_instr *instr)
+{
+ assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
+
+ unsigned rt = nir_src_as_uint(instr->src[1]);
+ assert(rt < V3D_MAX_DRAW_BUFFERS);
+
+ unsigned sample_idx = nir_intrinsic_base(instr);
+ assert(sample_idx < V3D_MAX_SAMPLES);
+
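+ /* Stash the components in c->sample_colors, laid out as 4
+ * channels per sample and V3D_MAX_SAMPLES samples per render
+ * target; the TLB writes themselves are emitted at the end
+ * of the shader.
+ */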
+ unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
+ for (int i = 0; i < instr->num_components; i++) {
+ c->sample_colors[offset + i] =
+ vir_MOV(c, ntq_get_src(c, instr->src[0], i));
+ }
+}
+
+static void
+ntq_emit_color_write(struct v3d_compile *c,
+ nir_intrinsic_instr *instr)
+{
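+ /* As with the per-sample path, just record the components in
+ * c->outputs; the TLB writes are emitted once at the end of
+ * the shader.
+ */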
+ unsigned offset = (nir_intrinsic_base(instr) +
+ nir_src_as_uint(instr->src[1])) * 4 +
+ nir_intrinsic_component(instr);
+ for (int i = 0; i < instr->num_components; i++) {
+ c->outputs[offset + i] =
+ vir_MOV(c, ntq_get_src(c, instr->src[0], i));
+ }
+}
+
+static void
+emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ assert(instr->num_components == 1);
+
+ struct qreg offset = ntq_get_src(c, instr->src[1], 0);
+
+ uint32_t base_offset = nir_intrinsic_base(instr);
+
+ if (base_offset)
+ offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset);
+
+ /* Usually, for VS or FS, we only emit outputs once at program end so
+ * our VPM writes are never in non-uniform control flow, but this
+ * is not true for GS, where we are emitting multiple vertices.
+ */
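+ /* c->execute is 0 for channels that are currently active, so
+ * PUSHZ sets the A flag exactly for those channels.
+ */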
+ if (vir_in_nonuniform_control_flow(c)) {
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ }
+
+ struct qreg val = ntq_get_src(c, instr->src[0], 0);
+
+ /* The offset isn't necessarily dynamically uniform for a geometry
+ * shader. This can happen if the shader sometimes doesn't emit one of
+ * the vertices. In that case subsequent vertices will be written to
+ * different offsets in the VPM and we need to use the scatter write
+ * instruction to have a different offset for each lane.
+ */
+ if (nir_src_is_dynamically_uniform(instr->src[1]))
+ vir_VPM_WRITE_indirect(c, val, offset);
+ else
+ vir_STVPMD(c, offset, val);
+
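+ /* Predicate the VPM write we just emitted on the
+ * active-channel flags pushed above.
+ */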
+ if (vir_in_nonuniform_control_flow(c)) {
+ struct qinst *last_inst =
+ (struct qinst *)c->cur_block->instructions.prev;
+ vir_set_cond(last_inst, V3D_QPU_COND_IFA);
+ }
+}
+
+static void
+ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ /* XXX perf: Use stvpmv with uniform non-constant offsets and
+ * stvpmd with non-uniform offsets and enable
+ * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
+ */
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+ ntq_emit_color_write(c, instr);
+ } else if (c->s->info.stage == MESA_SHADER_GEOMETRY) {
+ emit_store_output_gs(c, instr);
+ } else {
+ assert(c->s->info.stage == MESA_SHADER_VERTEX);
+ assert(instr->num_components == 1);
+
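+ /* VS outputs go to the VPM slot given directly by the
+ * intrinsic base, one component at a time.
+ */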
+ vir_VPM_WRITE(c,
+ ntq_get_src(c, instr->src[0], 0),
+ nir_intrinsic_base(instr));
+ }
+}
+
+static void
+ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_uniform:
+ ntq_emit_load_uniform(c, instr);