v3d/compiler: Fix sorting the gs and fs inputs

[mesa.git] / src / broadcom / compiler / nir_to_vir.c
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c

index 0f832254a5572b182a23f4b7ffc2a0215e6de19a..8f568d4779ade6d76ccd3c2483fdfedbd84f3579 100644 (file)
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -22,7 +22,8 @@
   */
  
  #include <inttypes.h>
-#include "util/u_format.h"
+#include "util/format/u_format.h"
+#include "util/u_helpers.h"
  #include "util/u_math.h"
  #include "util/u_memory.h"
  #include "util/ralloc.h"
@@ -192,17 +193,28 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
           * need/can to do things slightly different, like not loading the
           * amount to add/sub, as that is implicit.
           */
-        bool atomic_add_replaced = ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
-                                     instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
-                                    (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
-                                     tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
+        bool atomic_add_replaced =
+                ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
+                  instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
+                 (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
+                  tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
+
          bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
                           instr->intrinsic == nir_intrinsic_store_scratch ||
                           instr->intrinsic == nir_intrinsic_store_shared);
+
+        bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
+                        instr->intrinsic == nir_intrinsic_load_ubo ||
+                        instr->intrinsic == nir_intrinsic_load_ssbo ||
+                        instr->intrinsic == nir_intrinsic_load_scratch ||
+                        instr->intrinsic == nir_intrinsic_load_shared);
+
+        if (!is_load)
+                c->tmu_dirty_rcl = true;
+
          bool has_index = !is_shared_or_scratch;
  
          int offset_src;
-        int tmu_writes = 1; /* address */
          if (instr->intrinsic == nir_intrinsic_load_uniform) {
                  offset_src = 0;
          } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
@@ -213,25 +225,8 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                  offset_src = 0 + has_index;
          } else if (is_store) {
                  offset_src = 1 + has_index;
-                for (int i = 0; i < instr->num_components; i++) {
-                        vir_MOV_dest(c,
-                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
-                                     ntq_get_src(c, instr->src[0], i));
-                        tmu_writes++;
-                }
          } else {
                  offset_src = 0 + has_index;
-                vir_MOV_dest(c,
-                             vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
-                             ntq_get_src(c, instr->src[1 + has_index], 0));
-                tmu_writes++;
-                if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
-                        vir_MOV_dest(c,
-                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
-                                     ntq_get_src(c, instr->src[2 + has_index],
-                                                 0));
-                        tmu_writes++;
-                }
          }
  
          bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
@@ -239,25 +234,20 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
          if (!dynamic_src)
                  const_offset = nir_src_as_uint(instr->src[offset_src]);
  
-        /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
-         * storing at the same time.
-         */
-        while (tmu_writes > 16 / c->threads)
-                c->threads /= 2;
-
-        struct qreg offset;
+        struct qreg base_offset;
          if (instr->intrinsic == nir_intrinsic_load_uniform) {
                  const_offset += nir_intrinsic_base(instr);
-                offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
-                                     v3d_unit_data_create(0, const_offset));
+                base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+                                          v3d_unit_data_create(0, const_offset));
                  const_offset = 0;
          } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
                  uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
                  /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
                   * 1 (0 is gallium's constant buffer 0).
                   */
-                offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
-                                     v3d_unit_data_create(index, const_offset));
+                base_offset =
+                        vir_uniform(c, QUNIFORM_UBO_ADDR,
+                                    v3d_unit_data_create(index, const_offset));
                  const_offset = 0;
          } else if (is_shared_or_scratch) {
                  /* Shared and scratch variables have no buffer index, and all
@@ -266,81 +256,149 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                   */
                  if (instr->intrinsic == nir_intrinsic_load_scratch ||
                      instr->intrinsic == nir_intrinsic_store_scratch) {
-                        offset = c->spill_base;
+                        base_offset = c->spill_base;
                  } else {
-                        offset = c->cs_shared_offset;
+                        base_offset = c->cs_shared_offset;
                          const_offset += nir_intrinsic_base(instr);
                  }
          } else {
-                offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
-                                     nir_src_as_uint(instr->src[is_store ?
-                                                                1 : 0]));
+                base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
+                                          nir_src_as_uint(instr->src[is_store ?
+                                                                      1 : 0]));
          }
  
-        /* The spec says that for atomics, the TYPE field is ignored, but that
-         * doesn't seem to be the case for CMPXCHG.  Just use the number of
-         * tmud writes we did to decide the type (or choose "32bit" for atomic
-         * reads, which has been fine).
-         */
-        int num_components;
-        if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH)
-                num_components = 2;
-        else
-                num_components = instr->num_components;
+        struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
+        unsigned writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
+        uint32_t base_const_offset = const_offset;
+        int first_component = -1;
+        int last_component = -1;
+        do {
+                int tmu_writes = 1; /* address */
  
-        uint32_t config = (0xffffff00 |
-                           tmu_op << 3|
-                           GENERAL_TMU_LOOKUP_PER_PIXEL);
-        if (num_components == 1) {
-                config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
-        } else {
-                config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2;
-        }
+                if (is_store) {
+                        /* Find the first set of consecutive components that
+                         * are enabled in the writemask and emit the TMUD
+                         * instructions for them.
+                         */
+                        first_component = ffs(writemask) - 1;
+                        last_component = first_component;
+                        while (writemask & BITFIELD_BIT(last_component + 1))
+                                last_component++;
+
+                        assert(first_component >= 0 &&
+                               first_component <= last_component &&
+                               last_component < instr->num_components);
+
+                        struct qreg tmud = vir_reg(QFILE_MAGIC,
+                                                   V3D_QPU_WADDR_TMUD);
+                        for (int i = first_component; i <= last_component; i++) {
+                                struct qreg data =
+                                        ntq_get_src(c, instr->src[0], i);
+                                vir_MOV_dest(c, tmud, data);
+                                tmu_writes++;
+                        }
  
-        if (vir_in_nonuniform_control_flow(c)) {
-                vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
-                           V3D_QPU_PF_PUSHZ);
-        }
+                        /* Update the offset for the TMU write based on the
+                         * first component we are writing.
+                         */
+                        const_offset = base_const_offset + first_component * 4;
+
+                        /* Clear these components from the writemask */
+                        uint32_t written_mask =
+                                BITFIELD_RANGE(first_component, tmu_writes - 1);
+                        writemask &= ~written_mask;
+                } else if (!is_load && !atomic_add_replaced) {
+                        struct qreg data =
+                                ntq_get_src(c, instr->src[1 + has_index], 0);
+                        vir_MOV_dest(c, tmud, data);
+                        tmu_writes++;
+                        if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
+                                data = ntq_get_src(c, instr->src[2 + has_index],
+                                                   0);
+                                vir_MOV_dest(c, tmud, data);
+                                tmu_writes++;
+                        }
+                }
  
-        struct qreg tmua;
-        if (config == ~0)
-                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
-        else
-                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+                /* Make sure we won't exceed the 16-entry TMU fifo if each
+                 * thread is storing at the same time.
+                 */
+                while (tmu_writes > 16 / c->threads)
+                        c->threads /= 2;
  
-        struct qinst *tmu;
-        if (dynamic_src) {
-                if (const_offset != 0) {
-                        offset = vir_ADD(c, offset,
-                                         vir_uniform_ui(c, const_offset));
+                /* The spec says that for atomics, the TYPE field is ignored,
+                 * but that doesn't seem to be the case for CMPXCHG.  Just use
+                 * the number of tmud writes we did to decide the type (or
+                 * choose "32bit" for atomic reads, which has been fine).
+                 */
+                uint32_t num_components;
+                if (is_load || atomic_add_replaced) {
+                        num_components = instr->num_components;
+                } else {
+                        assert(tmu_writes > 1);
+                        num_components = tmu_writes - 1;
                  }
-                tmu = vir_ADD_dest(c, tmua, offset,
-                                   ntq_get_src(c, instr->src[offset_src], 0));
-        } else {
-                if (const_offset != 0) {
-                        tmu = vir_ADD_dest(c, tmua, offset,
-                                           vir_uniform_ui(c, const_offset));
+
+                uint32_t config = (0xffffff00 |
+                                   tmu_op << 3|
+                                   GENERAL_TMU_LOOKUP_PER_PIXEL);
+                if (num_components == 1) {
+                        config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
                  } else {
-                        tmu = vir_MOV_dest(c, tmua, offset);
+                        config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
+                                  num_components - 2;
                  }
-        }
  
-        if (config != ~0) {
-                tmu->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
-                                                     config);
-        }
+                if (vir_in_nonuniform_control_flow(c)) {
+                        vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+                                   V3D_QPU_PF_PUSHZ);
+                }
+
+                struct qreg tmua;
+                if (config == ~0)
+                        tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
+                else
+                        tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+
+                struct qinst *tmu;
+                if (dynamic_src) {
+                        struct qreg offset = base_offset;
+                        if (const_offset != 0) {
+                                offset = vir_ADD(c, offset,
+                                                 vir_uniform_ui(c, const_offset));
+                        }
+                        struct qreg data =
+                                ntq_get_src(c, instr->src[offset_src], 0);
+                        tmu = vir_ADD_dest(c, tmua, offset, data);
+                } else {
+                        if (const_offset != 0) {
+                                tmu = vir_ADD_dest(c, tmua, base_offset,
+                                                   vir_uniform_ui(c, const_offset));
+                        } else {
+                                tmu = vir_MOV_dest(c, tmua, base_offset);
+                        }
+                }
  
-        if (vir_in_nonuniform_control_flow(c))
-                vir_set_cond(tmu, V3D_QPU_COND_IFA);
+                if (config != ~0) {
+                        tmu->uniform =
+                                vir_get_uniform_index(c, QUNIFORM_CONSTANT,
+                                                      config);
+                }
  
-        vir_emit_thrsw(c);
+                if (vir_in_nonuniform_control_flow(c))
+                        vir_set_cond(tmu, V3D_QPU_COND_IFA);
  
-        /* Read the result, or wait for the TMU op to complete. */
-        for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
-                ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
+                vir_emit_thrsw(c);
  
-        if (nir_intrinsic_dest_components(instr) == 0)
-                vir_TMUWT(c);
+                /* Read the result, or wait for the TMU op to complete. */
+                for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
+                        ntq_store_dest(c, &instr->dest, i,
+                                       vir_MOV(c, vir_LDTMU(c)));
+                }
+
+                if (nir_intrinsic_dest_components(instr) == 0)
+                        vir_TMUWT(c);
+        } while (is_store && writemask != 0);
  }
  
  static struct qreg *
@@ -352,6 +410,20 @@ ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
          return qregs;
  }
  
+static bool
+is_ld_signal(const struct v3d_qpu_sig *sig)
+{
+        return (sig->ldunif ||
+                sig->ldunifa ||
+                sig->ldunifrf ||
+                sig->ldunifarf ||
+                sig->ldtmu ||
+                sig->ldvary ||
+                sig->ldvpm ||
+                sig->ldtlb ||
+                sig->ldtlbu);
+}
+
  /**
   * This function is responsible for getting VIR results into the associated
   * storage for a NIR instruction.
@@ -372,7 +444,7 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
                 struct qreg result)
  {
          struct qinst *last_inst = NULL;
-        if (!list_empty(&c->cur_block->instructions))
+        if (!list_is_empty(&c->cur_block->instructions))
                  last_inst = (struct qinst *)c->cur_block->instructions.prev;
  
          assert((result.file == QFILE_TEMP &&
@@ -399,11 +471,12 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
                          _mesa_hash_table_search(c->def_ht, reg);
                  struct qreg *qregs = entry->data;
  
-                /* Insert a MOV if the source wasn't an SSA def in the
-                 * previous instruction.
+                /* If the previous instruction can't be predicated for
+                 * the store into the nir_register, then emit a MOV
+                 * that can be.
                   */
-                if ((vir_in_nonuniform_control_flow(c) &&
-                     c->defs[last_inst->dst.index]->qpu.sig.ldunif)) {
+                if (vir_in_nonuniform_control_flow(c) &&
+                    is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig)) {
                          result = vir_MOV(c, result);
                          last_inst = c->defs[result.index];
                  }
@@ -944,7 +1017,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
          case nir_op_sge:
          case nir_op_slt: {
                  enum v3d_qpu_cond cond;
-                MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
+                ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
                  assert(ok);
                  result = vir_MOV(c, vir_SEL(c, cond,
                                              vir_uniform_f(c, 1.0),
@@ -965,7 +1038,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
          case nir_op_ilt32:
          case nir_op_ult32: {
                  enum v3d_qpu_cond cond;
-                MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
+                ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
                  assert(ok);
                  result = vir_MOV(c, vir_SEL(c, cond,
                                              vir_uniform_ui(c, ~0),
@@ -1121,12 +1194,12 @@ vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
          struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
  
          nir_variable *var = c->output_color_var[rt];
-        struct qreg *color = &c->outputs[var->data.driver_location * 4];
          int num_components = glsl_get_vector_elements(var->type);
          uint32_t conf = 0xffffff00;
          struct qinst *inst;
  
-        conf |= TLB_SAMPLE_MODE_PER_PIXEL;
+        conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
+                                            TLB_SAMPLE_MODE_PER_PIXEL;
          conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
  
          if (c->fs_key->swap_color_rb & (1 << rt))
@@ -1160,41 +1233,55 @@ vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
                  }
          }
  
-        struct qreg r = color[0];
-        struct qreg g = color[1];
-        struct qreg b = color[2];
-        struct qreg a = color[3];
+        int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
+        for (int i = 0; i < num_samples; i++) {
+                struct qreg *color = c->msaa_per_sample_output ?
+                        &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
+                        &c->outputs[var->data.driver_location * 4];
  
-        if (c->fs_key->swap_color_rb & (1 << rt))  {
-                r = color[2];
-                b = color[0];
-        }
+                struct qreg r = color[0];
+                struct qreg g = color[1];
+                struct qreg b = color[2];
+                struct qreg a = color[3];
  
-        if (c->fs_key->sample_alpha_to_one)
-                a = vir_uniform_f(c, 1.0);
+                if (c->fs_key->swap_color_rb & (1 << rt))  {
+                        r = color[2];
+                        b = color[0];
+                }
  
-        if (is_32b_tlb_format) {
-                inst = vir_MOV_dest(c, tlbu_reg, r);
-                inst->uniform =
-                        vir_get_uniform_index(c, QUNIFORM_CONSTANT, conf);
+                if (c->fs_key->sample_alpha_to_one)
+                        a = vir_uniform_f(c, 1.0);
  
-                if (num_components >= 2)
-                        vir_MOV_dest(c, tlb_reg, g);
-                if (num_components >= 3)
-                        vir_MOV_dest(c, tlb_reg, b);
-                if (num_components >= 4)
-                        vir_MOV_dest(c, tlb_reg, a);
-        } else {
-                inst = vir_VFPACK_dest(c, tlb_reg, r, g);
-                if (conf != ~0) {
-                        inst->dst = tlbu_reg;
-                        inst->uniform = vir_get_uniform_index(c,
+                if (is_32b_tlb_format) {
+                        if (i == 0) {
+                                inst = vir_MOV_dest(c, tlbu_reg, r);
+                                inst->uniform =
+                                        vir_get_uniform_index(c,
                                                                QUNIFORM_CONSTANT,
                                                                conf);
-                }
+                        } else {
+                                inst = vir_MOV_dest(c, tlb_reg, r);
+                        }
+
+                        if (num_components >= 2)
+                                vir_MOV_dest(c, tlb_reg, g);
+                        if (num_components >= 3)
+                                vir_MOV_dest(c, tlb_reg, b);
+                        if (num_components >= 4)
+                                vir_MOV_dest(c, tlb_reg, a);
+                } else {
+                        inst = vir_VFPACK_dest(c, tlb_reg, r, g);
+                        if (conf != ~0 && i == 0) {
+                                inst->dst = tlbu_reg;
+                                inst->uniform =
+                                        vir_get_uniform_index(c,
+                                                              QUNIFORM_CONSTANT,
+                                                              conf);
+                        }
  
-                if (num_components >= 3)
-                        inst = vir_VFPACK_dest(c, tlb_reg, b, a);
+                        if (num_components >= 3)
+                                inst = vir_VFPACK_dest(c, tlb_reg, b, a);
+                }
          }
  }
  
@@ -1281,11 +1368,20 @@ emit_frag_end(struct v3d_compile *c)
                  vir_emit_tlb_color_write(c, rt);
  }
  
+static inline void
+vir_VPM_WRITE_indirect(struct v3d_compile *c,
+                       struct qreg val,
+                       struct qreg vpm_index)
+{
+        assert(c->devinfo->ver >= 40);
+        vir_STVPMV(c, vpm_index, val);
+}
+
  static void
  vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
  {
          if (c->devinfo->ver >= 40) {
-                vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val);
+                vir_VPM_WRITE_indirect(c, val, vir_uniform_ui(c, vpm_index));
          } else {
                  /* XXX: v3d33_vir_vpm_write_setup(c); */
                  vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
@@ -1301,6 +1397,15 @@ emit_vert_end(struct v3d_compile *c)
                  vir_VPMWT(c);
  }
  
+static void
+emit_geom_end(struct v3d_compile *c)
+{
+        /* GFXH-1684: VPM writes need to be complete by the end of the shader.
+         */
+        if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
+                vir_VPMWT(c);
+}
+
  void
  v3d_optimize_nir(struct nir_shader *s)
  {
@@ -1314,7 +1419,7 @@ v3d_optimize_nir(struct nir_shader *s)
                  progress = false;
  
                  NIR_PASS_V(s, nir_lower_vars_to_ssa);
-                NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL);
+                NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
                  NIR_PASS(progress, s, nir_lower_phis_to_scalar);
                  NIR_PASS(progress, s, nir_copy_prop);
                  NIR_PASS(progress, s, nir_opt_remove_phis);
@@ -1346,7 +1451,7 @@ v3d_optimize_nir(struct nir_shader *s)
                  NIR_PASS(progress, s, nir_opt_undef);
          } while (progress);
  
-        NIR_PASS(progress, s, nir_opt_move_load_ubo);
+        NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo);
  }
  
  static int
@@ -1355,6 +1460,9 @@ driver_location_compare(const void *in_a, const void *in_b)
          const nir_variable *const *a = in_a;
          const nir_variable *const *b = in_b;
  
+        if ((*a)->data.driver_location == (*b)->data.driver_location)
+                return (*a)->data.location_frac - (*b)->data.location_frac;
+
          return (*a)->data.driver_location - (*b)->data.driver_location;
  }
  
@@ -1388,7 +1496,7 @@ ntq_emit_vpm_read(struct v3d_compile *c,
  }
  
  static void
-ntq_setup_vpm_inputs(struct v3d_compile *c)
+ntq_setup_vs_inputs(struct v3d_compile *c)
  {
          /* Figure out how many components of each vertex attribute the shader
           * uses.  Each variable should have been split to individual
@@ -1458,48 +1566,83 @@ ntq_setup_vpm_inputs(struct v3d_compile *c)
          }
  }
  
-static bool
-var_needs_point_coord(struct v3d_compile *c, nir_variable *var)
-{
-        return (var->data.location == VARYING_SLOT_PNTC ||
-                (var->data.location >= VARYING_SLOT_VAR0 &&
-                 (c->fs_key->point_sprite_mask &
-                  (1 << (var->data.location - VARYING_SLOT_VAR0)))));
-}
-
  static bool
  program_reads_point_coord(struct v3d_compile *c)
  {
          nir_foreach_variable(var, &c->s->inputs) {
-                if (var_needs_point_coord(c, var))
+                if (util_varying_is_point_coord(var->data.location,
+                                                c->fs_key->point_sprite_mask)) {
                          return true;
+                }
          }
  
          return false;
  }
  
  static void
-ntq_setup_fs_inputs(struct v3d_compile *c)
+get_sorted_input_variables(struct v3d_compile *c,
+                           unsigned *num_entries,
+                           nir_variable ***vars)
  {
-        unsigned num_entries = 0;
-        unsigned num_components = 0;
-        nir_foreach_variable(var, &c->s->inputs) {
-                num_entries++;
-                num_components += glsl_get_components(var->type);
-        }
+        *num_entries = 0;
+        nir_foreach_variable(var, &c->s->inputs)
+                (*num_entries)++;
  
-        nir_variable *vars[num_entries];
+        *vars = ralloc_array(c, nir_variable *, *num_entries);
  
          unsigned i = 0;
          nir_foreach_variable(var, &c->s->inputs)
-                vars[i++] = var;
+                (*vars)[i++] = var;
  
          /* Sort the variables so that we emit the input setup in
           * driver_location order.  This is required for VPM reads, whose data
           * is fetched into the VPM in driver_location (TGSI register index)
           * order.
           */
-        qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
+        qsort(*vars, *num_entries, sizeof(**vars), driver_location_compare);
+}
+
+static void
+ntq_setup_gs_inputs(struct v3d_compile *c)
+{
+        nir_variable **vars;
+        unsigned num_entries;
+        get_sorted_input_variables(c, &num_entries, &vars);
+
+        for (unsigned i = 0; i < num_entries; i++) {
+                nir_variable *var = vars[i];
+
+                /* All GS inputs are arrays with as many entries as vertices
+                 * in the input primitive, but here we only care about the
+                 * per-vertex input type.
+                 */
+                const struct glsl_type *type = glsl_without_array(var->type);
+                unsigned array_len = MAX2(glsl_get_length(type), 1);
+                unsigned loc = var->data.driver_location;
+
+                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
+                                  (loc + array_len) * 4);
+
+                for (unsigned j = 0; j < array_len; j++) {
+                        unsigned num_elements = glsl_get_vector_elements(type);
+                        for (unsigned k = 0; k < num_elements; k++) {
+                                unsigned chan = var->data.location_frac + k;
+                                unsigned input_idx = c->num_inputs++;
+                                struct v3d_varying_slot slot =
+                                        v3d_slot_from_slot_and_component(var->data.location + j, chan);
+                                c->input_slots[input_idx] = slot;
+                        }
+                }
+        }
+}
+
+
+static void
+ntq_setup_fs_inputs(struct v3d_compile *c)
+{
+        nir_variable **vars;
+        unsigned num_entries;
+        get_sorted_input_variables(c, &num_entries, &vars);
  
          for (unsigned i = 0; i < num_entries; i++) {
                  nir_variable *var = vars[i];
@@ -1511,7 +1654,8 @@ ntq_setup_fs_inputs(struct v3d_compile *c)
  
                  if (var->data.location == VARYING_SLOT_POS) {
                          emit_fragcoord_input(c, loc);
-                } else if (var_needs_point_coord(c, var)) {
+                } else if (util_varying_is_point_coord(var->data.location,
+                                                       c->fs_key->point_sprite_mask)) {
                          c->inputs[loc * 4 + 0] = c->point_x;
                          c->inputs[loc * 4 + 1] = c->point_y;
                  } else {
@@ -1614,17 +1758,17 @@ ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr)
  static void
  ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
  {
-        assert(instr->intrinsic == nir_intrinsic_image_deref_size);
-        nir_variable *var = nir_intrinsic_get_var(instr, 0);
-        unsigned image_index = var->data.driver_location;
-        const struct glsl_type *sampler_type = glsl_without_array(var->type);
-        bool is_array = glsl_sampler_type_is_array(sampler_type);
+        unsigned image_index = nir_src_as_uint(instr->src[0]);
+        bool is_array = nir_intrinsic_image_array(instr);
  
          ntq_store_dest(c, &instr->dest, 0,
                         vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
          if (instr->num_components > 1) {
                  ntq_store_dest(c, &instr->dest, 1,
-                               vir_uniform(c, QUNIFORM_IMAGE_HEIGHT,
+                               vir_uniform(c,
+                                           instr->num_components == 2 && is_array ?
+                                                   QUNIFORM_IMAGE_ARRAY_SIZE :
+                                                   QUNIFORM_IMAGE_HEIGHT,
                                             image_index));
          }
          if (instr->num_components > 2) {
@@ -1833,6 +1977,77 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
          }
  }
  
+static void
+ntq_emit_per_sample_color_write(struct v3d_compile *c,
+                                nir_intrinsic_instr *instr)
+{
+        assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
+
+        unsigned rt = nir_src_as_uint(instr->src[1]);
+        assert(rt < V3D_MAX_DRAW_BUFFERS);
+
+        unsigned sample_idx = nir_intrinsic_base(instr);
+        assert(sample_idx < V3D_MAX_SAMPLES);
+
+        unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
+        for (int i = 0; i < instr->num_components; i++) {
+                c->sample_colors[offset + i] =
+                        vir_MOV(c, ntq_get_src(c, instr->src[0], i));
+        }
+}
+
+static void
+ntq_emit_color_write(struct v3d_compile *c,
+                     nir_intrinsic_instr *instr)
+{
+        unsigned offset = (nir_intrinsic_base(instr) +
+                           nir_src_as_uint(instr->src[1])) * 4 +
+                          nir_intrinsic_component(instr);
+        for (int i = 0; i < instr->num_components; i++) {
+                c->outputs[offset + i] =
+                        vir_MOV(c, ntq_get_src(c, instr->src[0], i));
+        }
+}
+
+static void
+emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+        assert(instr->num_components == 1);
+
+        uint32_t base_offset = nir_intrinsic_base(instr);
+        struct qreg src_offset = ntq_get_src(c, instr->src[1], 0);
+        struct qreg offset =
+                vir_ADD(c, vir_uniform_ui(c, base_offset), src_offset);
+
+        /* Usually, for VS or FS, we only emit outputs once at program end so
+         * our VPM writes are never in non-uniform control flow, but this
+         * is not true for GS, where we are emitting multiple vertices.
+         */
+        if (vir_in_nonuniform_control_flow(c)) {
+                vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+                           V3D_QPU_PF_PUSHZ);
+        }
+
+        struct qreg val = ntq_get_src(c, instr->src[0], 0);
+
+        /* The offset isn't necessarily dynamically uniform for a geometry
+         * shader. This can happen if the shader sometimes doesn't emit one of
+         * the vertices. In that case subsequent vertices will be written to
+         * different offsets in the VPM and we need to use the scatter write
+         * instruction to have a different offset for each lane.
+         */
+        if (nir_src_is_dynamically_uniform(instr->src[1]))
+                vir_VPM_WRITE_indirect(c, val, offset);
+        else
+                vir_STVPMD(c, offset, val);
+
+        if (vir_in_nonuniform_control_flow(c)) {
+                struct qinst *last_inst =
+                        (struct qinst *)c->cur_block->instructions.prev;
+                vir_set_cond(last_inst, V3D_QPU_COND_IFA);
+        }
+}
+
  static void
  ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
  {
@@ -1841,19 +2056,16 @@ ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
           * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
           */
          if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
-            unsigned offset = ((nir_intrinsic_base(instr) +
-                    nir_src_as_uint(instr->src[1])) * 4 +
-                    nir_intrinsic_component(instr));
-            for (int i = 0; i < instr->num_components; i++) {
-                    c->outputs[offset + i] =
-                            vir_MOV(c, ntq_get_src(c, instr->src[0], i));
-            }
+               ntq_emit_color_write(c, instr);
+        } else if (c->s->info.stage == MESA_SHADER_GEOMETRY)  {
+               emit_store_output_gs(c, instr);
          } else {
-                assert(instr->num_components == 1);
+               assert(c->s->info.stage == MESA_SHADER_VERTEX);
+               assert(instr->num_components == 1);
  
-                vir_VPM_WRITE(c,
-                              ntq_get_src(c, instr->src[0], 0),
-                              nir_intrinsic_base(instr));
+               vir_VPM_WRITE(c,
+                             ntq_get_src(c, instr->src[0], 0),
+                             nir_intrinsic_base(instr));
          }
  }
  
@@ -1901,16 +2113,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                  ntq_emit_tmu_general(c, instr, true);
                  break;
  
-        case nir_intrinsic_image_deref_load:
-        case nir_intrinsic_image_deref_store:
-        case nir_intrinsic_image_deref_atomic_add:
-        case nir_intrinsic_image_deref_atomic_min:
-        case nir_intrinsic_image_deref_atomic_max:
-        case nir_intrinsic_image_deref_atomic_and:
-        case nir_intrinsic_image_deref_atomic_or:
-        case nir_intrinsic_image_deref_atomic_xor:
-        case nir_intrinsic_image_deref_atomic_exchange:
-        case nir_intrinsic_image_deref_atomic_comp_swap:
+        case nir_intrinsic_image_load:
+        case nir_intrinsic_image_store:
+        case nir_intrinsic_image_atomic_add:
+        case nir_intrinsic_image_atomic_imin:
+        case nir_intrinsic_image_atomic_umin:
+        case nir_intrinsic_image_atomic_imax:
+        case nir_intrinsic_image_atomic_umax:
+        case nir_intrinsic_image_atomic_and:
+        case nir_intrinsic_image_atomic_or:
+        case nir_intrinsic_image_atomic_xor:
+        case nir_intrinsic_image_atomic_exchange:
+        case nir_intrinsic_image_atomic_comp_swap:
                  v3d40_vir_emit_image_load_store(c, instr);
                  break;
  
@@ -1921,7 +2135,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                  break;
  
          case nir_intrinsic_load_user_clip_plane:
-                for (int i = 0; i < instr->num_components; i++) {
+                for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
                          ntq_store_dest(c, &instr->dest, i,
                                         vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
                                                     nir_intrinsic_ucp_id(instr) *
@@ -1954,6 +2168,20 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                                 vir_uniform(c, QUNIFORM_ALPHA_REF, 0));
                  break;
  
+        case nir_intrinsic_load_line_coord:
+                ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x));
+                break;
+
+        case nir_intrinsic_load_line_width:
+                ntq_store_dest(c, &instr->dest, 0,
+                               vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
+                break;
+
+        case nir_intrinsic_load_aa_line_width:
+                ntq_store_dest(c, &instr->dest, 0,
+                               vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
+                break;
+
          case nir_intrinsic_load_sample_mask_in:
                  ntq_store_dest(c, &instr->dest, 0, vir_MSF(c));
                  break;
@@ -1992,11 +2220,15 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                  ntq_emit_load_input(c, instr);
                  break;
  
-        case nir_intrinsic_store_output:
+        case nir_intrinsic_store_tlb_sample_color_v3d:
+               ntq_emit_per_sample_color_write(c, instr);
+               break;
+
+       case nir_intrinsic_store_output:
                  ntq_emit_store_output(c, instr);
                  break;
  
-        case nir_intrinsic_image_deref_size:
+        case nir_intrinsic_image_size:
                  ntq_emit_image_size(c, instr);
                  break;
  
@@ -2034,10 +2266,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
          }
  
          case nir_intrinsic_memory_barrier:
-        case nir_intrinsic_memory_barrier_atomic_counter:
          case nir_intrinsic_memory_barrier_buffer:
          case nir_intrinsic_memory_barrier_image:
          case nir_intrinsic_memory_barrier_shared:
+        case nir_intrinsic_memory_barrier_tcs_patch:
          case nir_intrinsic_group_memory_barrier:
                  /* We don't do any instruction scheduling of these NIR
                   * instructions between each other, so we just need to make
@@ -2048,7 +2280,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                   */
                  break;
  
-        case nir_intrinsic_barrier:
+        case nir_intrinsic_control_barrier:
                  /* Emit a TSY op to get all invocations in the workgroup
                   * (actually supergroup) to block until the last invocation
                   * reaches the TSY op.
@@ -2105,6 +2337,43 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                  ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
                  break;
  
+        case nir_intrinsic_load_per_vertex_input: {
+                /* col: vertex index, row: varying index */
+                struct qreg col = ntq_get_src(c, instr->src[0], 0);
+                uint32_t row_idx = nir_intrinsic_base(instr) * 4 +
+                                   nir_intrinsic_component(instr);
+                for (int i = 0; i < instr->num_components; i++) {
+                        struct qreg row = vir_uniform_ui(c, row_idx++);
+                        ntq_store_dest(c, &instr->dest, i,
+                                       vir_LDVPMG_IN(c, row, col));
+                }
+                break;
+        }
+
+        case nir_intrinsic_emit_vertex:
+        case nir_intrinsic_end_primitive:
+                unreachable("Should have been lowered in v3d_nir_lower_io");
+                break;
+
+        case nir_intrinsic_load_primitive_id: {
+                /* gl_PrimitiveIdIn is written by the GBG in the first word of
+                 * the VPM output header. According to docs, we should read this
+                 * using ldvpm(v,d)_in (See Table 71).
+                 */
+                ntq_store_dest(c, &instr->dest, 0,
+                               vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
+                break;
+        }
+
+        case nir_intrinsic_load_invocation_id:
+                ntq_store_dest(c, &instr->dest, 0, vir_IID(c));
+                break;
+
+        case nir_intrinsic_load_fb_layers_v3d:
+                ntq_store_dest(c, &instr->dest, 0,
+                               vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
+                break;
+
          default:
                  fprintf(stderr, "Unknown intrinsic: ");
                  nir_print_instr(&instr->instr, stderr);
@@ -2307,10 +2576,6 @@ static void
  ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
  {
          switch (instr->type) {
-        case nir_instr_type_deref:
-                /* ignored, will be walked by the intrinsic using it. */
-                break;
-
          case nir_instr_type_alu:
                  ntq_emit_alu(c, nir_instr_as_alu(instr));
                  break;
@@ -2472,7 +2737,10 @@ nir_to_vir(struct v3d_compile *c)
                          c->point_x = emit_fragment_varying(c, NULL, 0, 0);
                          c->point_y = emit_fragment_varying(c, NULL, 0, 0);
                          c->uses_implicit_point_line_varyings = true;
-                } else if (c->fs_key->is_lines && c->devinfo->ver < 40) {
+                } else if (c->fs_key->is_lines &&
+                           (c->devinfo->ver < 40 ||
+                            (c->s->info.system_values_read &
+                             BITFIELD64_BIT(SYSTEM_VALUE_LINE_COORD)))) {
                          c->line_x = emit_fragment_varying(c, NULL, 0, 0);
                          c->uses_implicit_point_line_varyings = true;
                  }
@@ -2527,10 +2795,21 @@ nir_to_vir(struct v3d_compile *c)
                  c->spill_size += V3D_CHANNELS * c->s->scratch_size;
          }
  
-        if (c->s->info.stage == MESA_SHADER_FRAGMENT)
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                ntq_setup_vs_inputs(c);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                ntq_setup_gs_inputs(c);
+                break;
+        case MESA_SHADER_FRAGMENT:
                  ntq_setup_fs_inputs(c);
-        else
-                ntq_setup_vpm_inputs(c);
+                break;
+        case MESA_SHADER_COMPUTE:
+                break;
+        default:
+                unreachable("unsupported shader stage");
+        }
  
          ntq_setup_outputs(c);
  
@@ -2574,6 +2853,7 @@ const nir_shader_compiler_options v3d_nir_options = {
          .lower_mul_high = true,
          .lower_wpos_pntc = true,
          .lower_rotate = true,
+        .lower_to_scalar = true,
  };
  
  /**
@@ -2675,6 +2955,9 @@ v3d_nir_to_vir(struct v3d_compile *c)
          case MESA_SHADER_FRAGMENT:
                  emit_frag_end(c);
                  break;
+        case MESA_SHADER_GEOMETRY:
+                emit_geom_end(c);
+                break;
          case MESA_SHADER_VERTEX:
                  emit_vert_end(c);
                  break;