From: Iago Toral Quiroga <itoral@igalia.com>
Date: Mon, 28 Oct 2019 12:24:44 +0000 (+0100)
Subject: v3d: add initial compiler plumbing for geometry shaders
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=5d578c27cecb4682074778b90b3e4d57a5cc0ebe;p=mesa.git

v3d: add initial compiler plumbing for geometry shaders

Most of the relevant work happens in the v3d_nir_lower_io. Since
geometry shaders can write any number of output vertices, this pass
injects a few variables into the shader code to keep track of things
like the number of vertices emitted or the offsets into the VPM
of the current vertex output, etc. This is also where we handle
EmitVertex() and EmitPrimitive() intrinsics.

The geometry shader VPM output layout has a specific structure
with a 32-bit general header, then another 32-bit header slot for
each output vertex, and finally the actual vertex data.

When vertex shaders are paired with geometry shaders we also need
to consider the following:
  - Only geometry shaders emit fixed function outputs.
  - The coordinate shader used for the vertex stage during binning must
    not drop varyings other than those used by transform feedback, since
    these may be read by the binning GS.

v2:
 - Use MAX3 instead of a chain of MAX2 (Alejandro).
 - Make all loop variables unsigned in ntq_setup_gs_inputs (Alejandro)
 - Update comment in IO owering so it includes the GS stage (Alejandro)

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
---

diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h
index d65edddab74..e02582035f1 100644
--- a/src/broadcom/common/v3d_limits.h
+++ b/src/broadcom/common/v3d_limits.h
@@ -30,8 +30,11 @@
 #define V3D_CHANNELS 16
 
 #define V3D_MAX_FS_INPUTS 64
+#define V3D_MAX_GS_INPUTS 64
 #define V3D_MAX_VS_INPUTS 64
-#define V3D_MAX_ANY_STAGE_INPUTS MAX2(V3D_MAX_VS_INPUTS, V3D_MAX_FS_INPUTS)
+#define V3D_MAX_ANY_STAGE_INPUTS MAX3(V3D_MAX_VS_INPUTS, \
+                                      V3D_MAX_GS_INPUTS, \
+                                      V3D_MAX_FS_INPUTS)
 
 /* Not specifically a hardware limit, just coordination between compiler and
  * driver.
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 6b566c68e07..d7bef12fef9 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1367,11 +1367,20 @@ emit_frag_end(struct v3d_compile *c)
                 vir_emit_tlb_color_write(c, rt);
 }
 
+static inline void
+vir_VPM_WRITE_indirect(struct v3d_compile *c,
+                       struct qreg val,
+                       struct qreg vpm_index)
+{
+        assert(c->devinfo->ver >= 40);
+        vir_STVPMV(c, vpm_index, val);
+}
+
 static void
 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
 {
         if (c->devinfo->ver >= 40) {
-                vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val);
+                vir_VPM_WRITE_indirect(c, val, vir_uniform_ui(c, vpm_index));
         } else {
                 /* XXX: v3d33_vir_vpm_write_setup(c); */
                 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
@@ -1387,6 +1396,15 @@ emit_vert_end(struct v3d_compile *c)
                 vir_VPMWT(c);
 }
 
+static void
+emit_geom_end(struct v3d_compile *c)
+{
+        /* GFXH-1684: VPM writes need to be complete by the end of the shader.
+         */
+        if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
+                vir_VPMWT(c);
+}
+
 void
 v3d_optimize_nir(struct nir_shader *s)
 {
@@ -1474,7 +1492,7 @@ ntq_emit_vpm_read(struct v3d_compile *c,
 }
 
 static void
-ntq_setup_vpm_inputs(struct v3d_compile *c)
+ntq_setup_vs_inputs(struct v3d_compile *c)
 {
         /* Figure out how many components of each vertex attribute the shader
          * uses.  Each variable should have been split to individual
@@ -1565,24 +1583,69 @@ program_reads_point_coord(struct v3d_compile *c)
 }
 
 static void
-ntq_setup_fs_inputs(struct v3d_compile *c)
+get_sorted_input_variables(struct v3d_compile *c,
+                           unsigned *num_entries,
+                           nir_variable ***vars)
 {
-        unsigned num_entries = 0;
+        *num_entries = 0;
         nir_foreach_variable(var, &c->s->inputs)
-                num_entries++;
+                (*num_entries)++;
 
-        nir_variable *vars[num_entries];
+        *vars = ralloc_array(c, nir_variable *, *num_entries);
 
         unsigned i = 0;
         nir_foreach_variable(var, &c->s->inputs)
-                vars[i++] = var;
+                (*vars)[i++] = var;
 
         /* Sort the variables so that we emit the input setup in
          * driver_location order.  This is required for VPM reads, whose data
          * is fetched into the VPM in driver_location (TGSI register index)
          * order.
          */
-        qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
+        qsort(*vars, *num_entries, sizeof(**vars), driver_location_compare);
+}
+
+static void
+ntq_setup_gs_inputs(struct v3d_compile *c)
+{
+        nir_variable **vars;
+        unsigned num_entries;
+        get_sorted_input_variables(c, &num_entries, &vars);
+
+        for (unsigned i = 0; i < num_entries; i++) {
+                nir_variable *var = vars[i];
+
+                /* All GS inputs are arrays with as many entries as vertices
+                 * in the input primitive, but here we only care about the
+                 * per-vertex input type.
+                 */
+                const struct glsl_type *type = glsl_without_array(var->type);
+                unsigned array_len = MAX2(glsl_get_length(type), 1);
+                unsigned loc = var->data.driver_location;
+
+                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
+                                  (loc + array_len) * 4);
+
+                for (unsigned j = 0; j < array_len; j++) {
+                        unsigned num_elements = glsl_get_vector_elements(type);
+                        for (unsigned k = 0; k < num_elements; k++) {
+                                unsigned chan = var->data.location_frac + k;
+                                unsigned input_idx = c->num_inputs++;
+                                struct v3d_varying_slot slot =
+                                        v3d_slot_from_slot_and_component(var->data.location + j, chan);
+                                c->input_slots[input_idx] = slot;
+                        }
+                }
+        }
+}
+
+
+static void
+ntq_setup_fs_inputs(struct v3d_compile *c)
+{
+        nir_variable **vars;
+        unsigned num_entries;
+        get_sorted_input_variables(c, &num_entries, &vars);
 
         for (unsigned i = 0; i < num_entries; i++) {
                 nir_variable *var = vars[i];
@@ -1948,6 +2011,40 @@ ntq_emit_color_write(struct v3d_compile *c,
         }
 }
 
+static void
+emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+        assert(instr->num_components == 1);
+
+        uint32_t base_offset = nir_intrinsic_base(instr);
+        struct qreg src_offset = ntq_get_src(c, instr->src[1], 0);
+        struct qreg offset =
+                vir_ADD(c, vir_uniform_ui(c, base_offset), src_offset);
+
+        vir_VPM_WRITE_indirect(c, ntq_get_src(c, instr->src[0], 0), offset);
+}
+
+static void
+ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+        /* XXX perf: Use stvpmv with uniform non-constant offsets and
+         * stvpmd with non-uniform offsets and enable
+         * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
+         */
+        if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+               ntq_emit_color_write(c, instr);
+        } else if (c->s->info.stage == MESA_SHADER_GEOMETRY)  {
+               emit_store_output_gs(c, instr);
+        } else {
+               assert(c->s->info.stage == MESA_SHADER_VERTEX);
+               assert(instr->num_components == 1);
+
+               vir_VPM_WRITE(c,
+                             ntq_get_src(c, instr->src[0], 0),
+                             nir_intrinsic_base(instr));
+        }
+}
+
 static void
 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 {
@@ -2090,19 +2187,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                break;
 
        case nir_intrinsic_store_output:
-                /* XXX perf: Use stvpmv with uniform non-constant offsets and
-                 * stvpmd with non-uniform offsets and enable
-                 * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
-                 */
-                if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
-                        ntq_emit_color_write(c, instr);
-                } else {
-                        assert(instr->num_components == 1);
-
-                        vir_VPM_WRITE(c,
-                                      ntq_get_src(c, instr->src[0], 0),
-                                      nir_intrinsic_base(instr));
-                }
+                ntq_emit_store_output(c, instr);
                 break;
 
         case nir_intrinsic_image_deref_size:
@@ -2214,6 +2299,34 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
                 break;
 
+        case nir_intrinsic_load_per_vertex_input: {
+                /* col: vertex index, row = varying index */
+                struct qreg col = ntq_get_src(c, instr->src[0], 0);
+                uint32_t row_idx = nir_intrinsic_base(instr) * 4 +
+                                   nir_intrinsic_component(instr);
+                for (int i = 0; i < instr->num_components; i++) {
+                        struct qreg row = vir_uniform_ui(c, row_idx++);
+                        ntq_store_dest(c, &instr->dest, i,
+                                       vir_LDVPMG_IN(c, row, col));
+                }
+                break;
+        }
+
+        case nir_intrinsic_emit_vertex:
+        case nir_intrinsic_end_primitive:
+                unreachable("Should have been lowered in v3d_nir_lower_io");
+                break;
+
+        case nir_intrinsic_load_primitive_id: {
+                /* gl_PrimitiveIdIn is written by the GBG in the first word of
+                 * VPM output header. According to docs, we should read this
+                 * using ldvpm(v,d)_in (See Table 71).
+                 */
+                ntq_store_dest(c, &instr->dest, 0,
+                               vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
+                break;
+        }
+
         default:
                 fprintf(stderr, "Unknown intrinsic: ");
                 nir_print_instr(&instr->instr, stderr);
@@ -2636,10 +2749,21 @@ nir_to_vir(struct v3d_compile *c)
                 c->spill_size += V3D_CHANNELS * c->s->scratch_size;
         }
 
-        if (c->s->info.stage == MESA_SHADER_FRAGMENT)
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                ntq_setup_vs_inputs(c);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                ntq_setup_gs_inputs(c);
+                break;
+        case MESA_SHADER_FRAGMENT:
                 ntq_setup_fs_inputs(c);
-        else
-                ntq_setup_vpm_inputs(c);
+                break;
+        case MESA_SHADER_COMPUTE:
+                break;
+        default:
+                unreachable("unsupported shader stage");
+        }
 
         ntq_setup_outputs(c);
 
@@ -2785,6 +2909,9 @@ v3d_nir_to_vir(struct v3d_compile *c)
         case MESA_SHADER_FRAGMENT:
                 emit_frag_end(c);
                 break;
+        case MESA_SHADER_GEOMETRY:
+                emit_geom_end(c);
+                break;
         case MESA_SHADER_VERTEX:
                 emit_vert_end(c);
                 break;
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 29057bdf4df..9b08e4a270e 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -329,6 +329,7 @@ struct v3d_key {
                 bool clamp_r:1;
         } tex[V3D_MAX_TEXTURE_SAMPLERS];
         uint8_t ucp_enables;
+        bool is_last_geometry_stage;
 };
 
 struct v3d_fs_key {
@@ -371,6 +372,16 @@ struct v3d_fs_key {
         struct pipe_rt_blend_state blend;
 };
 
+struct v3d_gs_key {
+        struct v3d_key base;
+
+        struct v3d_varying_slot used_outputs[V3D_MAX_FS_INPUTS];
+        uint8_t num_used_outputs;
+
+        bool is_coord;
+        bool per_vertex_point_size;
+};
+
 struct v3d_vs_key {
         struct v3d_key base;
 
@@ -552,6 +563,7 @@ struct v3d_compile {
         int local_invocation_index_bits;
 
         uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];
+        uint8_t gs_input_sizes[V3D_MAX_GS_INPUTS];
         uint32_t vpm_output_size;
 
         /* Size in bytes of registers that have been spilled. This is how much
@@ -586,6 +598,7 @@ struct v3d_compile {
         struct pipe_shader_state *shader_state;
         struct v3d_key *key;
         struct v3d_fs_key *fs_key;
+        struct v3d_gs_key *gs_key;
         struct v3d_vs_key *vs_key;
 
         /* Live ranges of temps. */
@@ -687,6 +700,26 @@ struct v3d_vs_prog_data {
         uint8_t vcm_cache_size;
 };
 
+struct v3d_gs_prog_data {
+        struct v3d_prog_data base;
+
+        /* Whether the program reads gl_PrimitiveIDIn */
+        bool uses_pid;
+
+        /* Number of components read from each input varying. */
+        uint8_t input_sizes[V3D_MAX_GS_INPUTS / 4];
+
+        /* Number of inputs */
+        uint8_t num_inputs;
+        struct v3d_varying_slot input_slots[V3D_MAX_GS_INPUTS];
+
+        /* Total number of components written, for the shader state record. */
+        uint32_t vpm_output_size;
+
+        /* Output primitive type */
+        uint8_t out_prim_type;
+};
+
 struct v3d_fs_prog_data {
         struct v3d_prog_data base;
 
diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c
index 3145c560a14..3c9279a2fee 100644
--- a/src/broadcom/compiler/v3d_nir_lower_io.c
+++ b/src/broadcom/compiler/v3d_nir_lower_io.c
@@ -45,22 +45,46 @@ struct v3d_nir_lower_io_state {
         int psiz_vpm_offset;
         int varyings_vpm_offset;
 
+        /* Geometry shader state */
+        struct {
+                /* VPM offset for the current vertex data output */
+                nir_variable *output_offset_var;
+                /* VPM offset for the current vertex header */
+                nir_variable *header_offset_var;
+                /* VPM header for the current vertex */
+                nir_variable *header_var;
+
+                /* Size of the complete VPM output header */
+                uint32_t output_header_size;
+                /* Size of the output data for a single vertex */
+                uint32_t output_vertex_data_size;
+        } gs;
+
         BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_ANY_STAGE_INPUTS)];
 
         nir_ssa_def *pos[4];
 };
 
 static void
-v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *chan)
+v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
+                            struct v3d_nir_lower_io_state *state);
+
+static void
+v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset,
+                     nir_ssa_def *chan)
 {
         nir_intrinsic_instr *intr =
-                nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+                nir_intrinsic_instr_create(b->shader,
+                                           nir_intrinsic_store_output);
         nir_ssa_dest_init(&intr->instr, &intr->dest,
                           1, intr->dest.ssa.bit_size, NULL);
         intr->num_components = 1;
 
         intr->src[0] = nir_src_for_ssa(chan);
-        intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
+        if (offset)
+                intr->src[1] = nir_src_for_ssa(offset);
+        else
+                intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
 
         nir_intrinsic_set_base(intr, base);
         nir_intrinsic_set_write_mask(intr, 0x1);
@@ -91,8 +115,23 @@ v3d_varying_slot_vpm_offset(struct v3d_compile *c, nir_variable *var, int chan)
 {
         int component = var->data.location_frac + chan;
 
-        for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
-                struct v3d_varying_slot slot = c->vs_key->used_outputs[i];
+        uint32_t num_used_outputs = 0;
+        struct v3d_varying_slot *used_outputs = NULL;
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                num_used_outputs = c->vs_key->num_used_outputs;
+                used_outputs = c->vs_key->used_outputs;
+                break;
+        case MESA_SHADER_GEOMETRY:
+                num_used_outputs = c->gs_key->num_used_outputs;
+                used_outputs = c->gs_key->used_outputs;
+                break;
+        default:
+                unreachable("Unsupported shader stage");
+        }
+
+        for (int i = 0; i < num_used_outputs; i++) {
+                struct v3d_varying_slot slot = used_outputs[i];
 
                 if (v3d_slot_get_slot(slot) == var->data.location &&
                     v3d_slot_get_component(slot) == component) {
@@ -105,6 +144,9 @@ v3d_varying_slot_vpm_offset(struct v3d_compile *c, nir_variable *var, int chan)
 
 /* Lowers a store_output(gallium driver location) to a series of store_outputs
  * with a driver_location equal to the offset in the VPM.
+ *
+ * For geometry shaders we need to emit multiple vertices so the VPM offsets
+ * need to be computed in the shader code based on the current vertex index.
  */
 static void
 v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
@@ -113,6 +155,13 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
 {
         b->cursor = nir_before_instr(&intr->instr);
 
+        /* If this is a geometry shader we need to emit our outputs
+         * to the current vertex offset in the VPM.
+         */
+        nir_ssa_def *offset_reg =
+                c->s->info.stage == MESA_SHADER_GEOMETRY ?
+                        nir_load_var(b, state->gs.output_offset_var) : NULL;
+
         int start_comp = nir_intrinsic_component(intr);
         nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0],
                                            intr->num_components);
@@ -141,7 +190,7 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
         /* Just psiz to the position in the FF header right now. */
         if (var->data.location == VARYING_SLOT_PSIZ &&
             state->psiz_vpm_offset != -1) {
-                v3d_nir_store_output(b, state->psiz_vpm_offset, src);
+                v3d_nir_store_output(b, state->psiz_vpm_offset, offset_reg, src);
         }
 
         /* Scalarize outputs if it hasn't happened already, since we want to
@@ -161,12 +210,73 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
                 BITSET_SET(state->varyings_stored, vpm_offset);
 
                 v3d_nir_store_output(b, state->varyings_vpm_offset + vpm_offset,
-                                     nir_channel(b, src, i));
+                                     offset_reg, nir_channel(b, src, i));
         }
 
         nir_instr_remove(&intr->instr);
 }
 
+static inline void
+reset_gs_header(nir_builder *b, struct v3d_nir_lower_io_state *state)
+{
+        const uint8_t NEW_PRIMITIVE_OFFSET = 0;
+        const uint8_t VERTEX_DATA_LENGTH_OFFSET = 8;
+
+        uint32_t vertex_data_size = state->gs.output_vertex_data_size;
+        assert((vertex_data_size & 0xffffff00) == 0);
+
+        uint32_t header;
+        header  = 1 << NEW_PRIMITIVE_OFFSET;
+        header |= vertex_data_size << VERTEX_DATA_LENGTH_OFFSET;
+        nir_store_var(b, state->gs.header_var, nir_imm_int(b, header), 0x1);
+}
+
+static void
+v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b,
+                          nir_intrinsic_instr *instr,
+                          struct v3d_nir_lower_io_state *state)
+{
+        b->cursor = nir_before_instr(&instr->instr);
+
+        nir_ssa_def *header = nir_load_var(b, state->gs.header_var);
+        nir_ssa_def *header_offset = nir_load_var(b, state->gs.header_offset_var);
+        nir_ssa_def *output_offset = nir_load_var(b, state->gs.output_offset_var);
+
+        /* Emit fixed function outputs */
+        v3d_nir_emit_ff_vpm_outputs(c, b, state);
+
+        /* Emit vertex header */
+        v3d_nir_store_output(b, 0, header_offset, header);
+
+        /* Update VPM offset for next vertex output data and header */
+        output_offset =
+                nir_iadd(b, output_offset,
+                            nir_imm_int(b, state->gs.output_vertex_data_size));
+
+        header_offset = nir_iadd(b, header_offset, nir_imm_int(b, 1));
+
+        /* Reset the New Primitive bit */
+        header = nir_iand(b, header, nir_imm_int(b, 0xfffffffe));
+
+        nir_store_var(b, state->gs.output_offset_var, output_offset, 0x1);
+        nir_store_var(b, state->gs.header_offset_var, header_offset, 0x1);
+        nir_store_var(b, state->gs.header_var, header, 0x1);
+
+        nir_instr_remove(&instr->instr);
+}
+
+static void
+v3d_nir_lower_end_primitive(struct v3d_compile *c, nir_builder *b,
+                            nir_intrinsic_instr *instr,
+                            struct v3d_nir_lower_io_state *state)
+{
+        assert(state->gs.header_var);
+        b->cursor = nir_before_instr(&instr->instr);
+        reset_gs_header(b, state);
+
+        nir_instr_remove(&instr->instr);
+}
+
 static void
 v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
                        struct nir_instr *instr,
@@ -182,8 +292,18 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
                 break;
 
         case nir_intrinsic_store_output:
-                if (c->s->info.stage == MESA_SHADER_VERTEX)
+                if (c->s->info.stage == MESA_SHADER_VERTEX ||
+                    c->s->info.stage == MESA_SHADER_GEOMETRY) {
                         v3d_nir_lower_vpm_output(c, b, intr, state);
+                }
+                break;
+
+        case nir_intrinsic_emit_vertex:
+                v3d_nir_lower_emit_vertex(c, b, intr, state);
+                break;
+
+        case nir_intrinsic_end_primitive:
+                v3d_nir_lower_end_primitive(c, b, intr, state);
                 break;
 
         default:
@@ -226,12 +346,64 @@ v3d_nir_lower_io_update_output_var_base(struct v3d_compile *c,
 }
 
 static void
-v3d_nir_setup_vpm_layout(struct v3d_compile *c,
-                         struct v3d_nir_lower_io_state *state)
+v3d_nir_setup_vpm_layout_vs(struct v3d_compile *c,
+                            struct v3d_nir_lower_io_state *state)
 {
         uint32_t vpm_offset = 0;
 
-        if (c->vs_key->is_coord) {
+        state->pos_vpm_offset = -1;
+        state->vp_vpm_offset = -1;
+        state->zs_vpm_offset = -1;
+        state->rcp_wc_vpm_offset = -1;
+        state->psiz_vpm_offset = -1;
+
+        bool needs_ff_outputs = c->vs_key->base.is_last_geometry_stage;
+        if (needs_ff_outputs) {
+                if (c->vs_key->is_coord) {
+                        state->pos_vpm_offset = vpm_offset;
+                        vpm_offset += 4;
+                }
+
+                state->vp_vpm_offset = vpm_offset;
+                vpm_offset += 2;
+
+                if (!c->vs_key->is_coord) {
+                        state->zs_vpm_offset = vpm_offset++;
+                        state->rcp_wc_vpm_offset = vpm_offset++;
+                }
+
+                if (c->vs_key->per_vertex_point_size)
+                        state->psiz_vpm_offset = vpm_offset++;
+        }
+
+        state->varyings_vpm_offset = vpm_offset;
+
+        c->vpm_output_size = vpm_offset + c->vs_key->num_used_outputs;
+}
+
+static void
+v3d_nir_setup_vpm_layout_gs(struct v3d_compile *c,
+                            struct v3d_nir_lower_io_state *state)
+{
+        /* 1 header slot for number of output vertices */
+        uint32_t vpm_offset = 1;
+
+        /* 1 header slot per output vertex */
+        const uint32_t num_vertices = c->s->info.gs.vertices_out;
+        vpm_offset += num_vertices;
+
+        state->gs.output_header_size = vpm_offset;
+
+        /* Vertex data: here we only compute offsets into a generic vertex data
+         * elements. When it is time to actually write a particular vertex to
+         * the VPM, we will add the offset for that vertex into the VPM output
+         * to these offsets.
+         *
+         * If geometry shaders are present, they are always the last shader
+         * stage before rasterization, so we always emit fixed function outputs.
+         */
+        vpm_offset = 0;
+        if (c->gs_key->is_coord) {
                 state->pos_vpm_offset = vpm_offset;
                 vpm_offset += 4;
         } else {
@@ -241,7 +413,7 @@ v3d_nir_setup_vpm_layout(struct v3d_compile *c,
         state->vp_vpm_offset = vpm_offset;
         vpm_offset += 2;
 
-        if (!c->vs_key->is_coord) {
+        if (!c->gs_key->is_coord) {
                 state->zs_vpm_offset = vpm_offset++;
                 state->rcp_wc_vpm_offset = vpm_offset++;
         } else {
@@ -249,20 +421,34 @@ v3d_nir_setup_vpm_layout(struct v3d_compile *c,
                 state->rcp_wc_vpm_offset = -1;
         }
 
-        if (c->vs_key->per_vertex_point_size)
+        /* Mesa enables OES_geometry_shader_point_size automatically with
+         * OES_geometry_shader so we always need to handle point size
+         * writes if present.
+         */
+        if (c->gs_key->per_vertex_point_size)
                 state->psiz_vpm_offset = vpm_offset++;
-        else
-                state->psiz_vpm_offset = -1;
 
         state->varyings_vpm_offset = vpm_offset;
 
-        c->vpm_output_size = vpm_offset + c->vs_key->num_used_outputs;
+        state->gs.output_vertex_data_size =
+                state->varyings_vpm_offset + c->gs_key->num_used_outputs;
+
+        c->vpm_output_size =
+                state->gs.output_header_size +
+                state->gs.output_vertex_data_size * num_vertices;
 }
 
 static void
 v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
                             struct v3d_nir_lower_io_state *state)
 {
+        /* If this is a geometry shader we need to emit our fixed function
+         * outputs to the current vertex offset in the VPM.
+         */
+        nir_ssa_def *offset_reg =
+                c->s->info.stage == MESA_SHADER_GEOMETRY ?
+                        nir_load_var(b, state->gs.output_offset_var) : NULL;
+
         for (int i = 0; i < 4; i++) {
                 if (!state->pos[i])
                         state->pos[i] = nir_ssa_undef(b, 1, 32);
@@ -273,23 +459,25 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
         if (state->pos_vpm_offset != -1) {
                 for (int i = 0; i < 4; i++) {
                         v3d_nir_store_output(b, state->pos_vpm_offset + i,
-                                             state->pos[i]);
+                                             offset_reg, state->pos[i]);
                 }
         }
 
-        for (int i = 0; i < 2; i++) {
-                nir_ssa_def *pos;
-                nir_ssa_def *scale;
-                pos = state->pos[i];
-                if (i == 0)
-                        scale = nir_load_viewport_x_scale(b);
-                else
-                        scale = nir_load_viewport_y_scale(b);
-                pos = nir_fmul(b, pos, scale);
-                pos = nir_fmul(b, pos, rcp_wc);
-                pos = nir_f2i32(b, nir_fround_even(b, pos));
-                v3d_nir_store_output(b, state->vp_vpm_offset + i,
-                                     pos);
+        if (state->vp_vpm_offset != -1) {
+                for (int i = 0; i < 2; i++) {
+                        nir_ssa_def *pos;
+                        nir_ssa_def *scale;
+                        pos = state->pos[i];
+                        if (i == 0)
+                                scale = nir_load_viewport_x_scale(b);
+                        else
+                                scale = nir_load_viewport_y_scale(b);
+                        pos = nir_fmul(b, pos, scale);
+                        pos = nir_fmul(b, pos, rcp_wc);
+                        pos = nir_f2i32(b, nir_fround_even(b, pos));
+                        v3d_nir_store_output(b, state->vp_vpm_offset + i,
+                                             offset_reg, pos);
+                }
         }
 
         if (state->zs_vpm_offset != -1) {
@@ -297,38 +485,118 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
                 z = nir_fmul(b, z, nir_load_viewport_z_scale(b));
                 z = nir_fmul(b, z, rcp_wc);
                 z = nir_fadd(b, z, nir_load_viewport_z_offset(b));
-                v3d_nir_store_output(b, state->zs_vpm_offset, z);
+                v3d_nir_store_output(b, state->zs_vpm_offset, offset_reg, z);
         }
 
-        if (state->rcp_wc_vpm_offset != -1)
-                v3d_nir_store_output(b, state->rcp_wc_vpm_offset, rcp_wc);
+        if (state->rcp_wc_vpm_offset != -1) {
+                v3d_nir_store_output(b, state->rcp_wc_vpm_offset,
+                                     offset_reg, rcp_wc);
+        }
 
-        /* Store 0 to varyings requested by the FS but not stored in the VS.
-         * This should be undefined behavior, but glsl-routing seems to rely
-         * on it.
+        /* Store 0 to varyings requested by the FS but not stored by the
+         * previous stage. This should be undefined behavior, but
+         * glsl-routing seems to rely on it.
          */
-        for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
+        uint32_t num_used_outputs;
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                num_used_outputs = c->vs_key->num_used_outputs;
+                break;
+        case MESA_SHADER_GEOMETRY:
+                num_used_outputs = c->gs_key->num_used_outputs;
+                break;
+        default:
+                unreachable("Unsupported shader stage");
+        }
+
+        for (int i = 0; i < num_used_outputs; i++) {
                 if (!BITSET_TEST(state->varyings_stored, i)) {
                         v3d_nir_store_output(b, state->varyings_vpm_offset + i,
-                                             nir_imm_int(b, 0));
+                                             offset_reg, nir_imm_int(b, 0));
                 }
         }
 }
 
+static void
+emit_gs_prolog(struct v3d_compile *c, nir_builder *b,
+               nir_function_impl *impl,
+               struct v3d_nir_lower_io_state *state)
+{
+        nir_block *first = nir_start_block(impl);
+        b->cursor = nir_before_block(first);
+
+        const struct glsl_type *uint_type = glsl_uint_type();
+
+        assert(!state->gs.output_offset_var);
+        state->gs.output_offset_var =
+                nir_local_variable_create(impl, uint_type, "output_offset");
+        nir_store_var(b, state->gs.output_offset_var,
+                      nir_imm_int(b, state->gs.output_header_size), 0x1);
+
+        assert(!state->gs.header_offset_var);
+        state->gs.header_offset_var =
+                nir_local_variable_create(impl, uint_type, "header_offset");
+        nir_store_var(b, state->gs.header_offset_var, nir_imm_int(b, 1), 0x1);
+
+        assert(!state->gs.header_var);
+        state->gs.header_var =
+                nir_local_variable_create(impl, uint_type, "header");
+        reset_gs_header(b, state);
+}
+
+static void
+emit_gs_vpm_output_header_prolog(struct v3d_compile *c, nir_builder *b,
+                                 struct v3d_nir_lower_io_state *state)
+{
+        const uint8_t VERTEX_COUNT_OFFSET = 16;
+
+        /* Our GS header has 1 generic header slot (at VPM offset 0) and then
+         * one slot per output vertex after it. This means we don't need to
+         * have a variable just to keep track of the number of vertices we
+         * emitted and instead we can just compute it here from the header
+         * offset variable by removing the one generic header slot that always
+         * goes at the begining of out header.
+         */
+        nir_ssa_def *header_offset =
+                nir_load_var(b, state->gs.header_offset_var);
+        nir_ssa_def *vertex_count =
+                nir_isub(b, header_offset, nir_imm_int(b, 1));
+        nir_ssa_def *header =
+                nir_ior(b, nir_imm_int(b, state->gs.output_header_size),
+                           nir_ishl(b, vertex_count,
+                                    nir_imm_int(b, VERTEX_COUNT_OFFSET)));
+
+        v3d_nir_store_output(b, 0, NULL, header);
+}
+
 void
 v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
 {
         struct v3d_nir_lower_io_state state = { 0 };
 
         /* Set up the layout of the VPM outputs. */
-        if (s->info.stage == MESA_SHADER_VERTEX)
-                v3d_nir_setup_vpm_layout(c, &state);
+        switch (s->info.stage) {
+        case MESA_SHADER_VERTEX:
+                v3d_nir_setup_vpm_layout_vs(c, &state);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                v3d_nir_setup_vpm_layout_gs(c, &state);
+                break;
+        case MESA_SHADER_FRAGMENT:
+        case MESA_SHADER_COMPUTE:
+                break;
+        default:
+                unreachable("Unsupported shader stage");
+        }
 
         nir_foreach_function(function, s) {
                 if (function->impl) {
                         nir_builder b;
                         nir_builder_init(&b, function->impl);
 
+                        if (c->s->info.stage == MESA_SHADER_GEOMETRY)
+                                emit_gs_prolog(c, &b, function->impl, &state);
+
                         nir_foreach_block(block, function->impl) {
                                 nir_foreach_instr_safe(instr, block)
                                         v3d_nir_lower_io_instr(c, &b, instr,
@@ -337,8 +605,11 @@ v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
 
                         nir_block *last = nir_impl_last_block(function->impl);
                         b.cursor = nir_after_block(last);
-                        if (s->info.stage == MESA_SHADER_VERTEX)
+                        if (s->info.stage == MESA_SHADER_VERTEX) {
                                 v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
+                        } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
+                                emit_gs_vpm_output_header_prolog(c, &b, &state);
+                        }
 
                         nir_metadata_preserve(function->impl,
                                               nir_metadata_block_index |
@@ -346,6 +617,8 @@ v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
                 }
         }
 
-        if (s->info.stage == MESA_SHADER_VERTEX)
+        if (s->info.stage == MESA_SHADER_VERTEX ||
+            s->info.stage == MESA_SHADER_GEOMETRY) {
                 v3d_nir_lower_io_update_output_var_base(c, &state);
+        }
 }
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 340cda903e9..dc966bc80ca 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -23,6 +23,7 @@
 
 #include "broadcom/common/v3d_device_info.h"
 #include "v3d_compiler.h"
+#include "util/u_prim.h"
 
 int
 vir_get_nsrc(struct qinst *inst)
@@ -660,6 +661,28 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
         prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
 }
 
+static void
+v3d_gs_set_prog_data(struct v3d_compile *c,
+                     struct v3d_gs_prog_data *prog_data)
+{
+        prog_data->num_inputs = c->num_inputs;
+        memcpy(prog_data->input_slots, c->input_slots,
+               c->num_inputs * sizeof(*c->input_slots));
+
+        /* gl_PrimitiveIdIn is written by the GBG into the first word of the
+         * VPM output header automatically and the shader will overwrite
+         * it after reading it if necessary, so it doesn't add to the VPM
+         * size requirements.
+         */
+        prog_data->uses_pid = (c->s->info.system_values_read &
+                               (1ull << SYSTEM_VALUE_PRIMITIVE_ID));
+
+        /* Output segment size is in sectors (8 rows of 32 bits per channel) */
+        prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
+
+        prog_data->out_prim_type = c->s->info.gs.output_primitive;
+}
+
 static void
 v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
                             struct v3d_fs_prog_data *prog_data)
@@ -714,13 +737,21 @@ v3d_set_prog_data(struct v3d_compile *c,
 
         v3d_set_prog_data_uniforms(c, prog_data);
 
-        if (c->s->info.stage == MESA_SHADER_COMPUTE) {
-                v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
-        } else if (c->s->info.stage == MESA_SHADER_VERTEX) {
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
                 v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
-        } else {
-                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
+                break;
+        case MESA_SHADER_FRAGMENT:
                 v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
+                break;
+        case MESA_SHADER_COMPUTE:
+                v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
+                break;
+        default:
+                unreachable("unsupported shader stage");
         }
 }
 
@@ -771,6 +802,37 @@ v3d_nir_lower_vs_early(struct v3d_compile *c)
         NIR_PASS_V(c->s, nir_opt_dce);
 }
 
+static void
+v3d_nir_lower_gs_early(struct v3d_compile *c)
+{
+        /* Split our I/O vars and dead code eliminate the unused
+         * components.
+         */
+        NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
+                   nir_var_shader_in | nir_var_shader_out);
+        uint64_t used_outputs[4] = {0};
+        for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
+                int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
+                int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
+                used_outputs[comp] |= 1ull << slot;
+        }
+        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
+                   &c->s->outputs, used_outputs, NULL); /* demotes to globals */
+        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
+        v3d_optimize_nir(c->s);
+        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in);
+
+        /* This must go before nir_lower_io */
+        if (c->gs_key->per_vertex_point_size)
+                NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
+
+        NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+                   type_size_vec4,
+                   (nir_lower_io_options)0);
+        /* clean up nir_lower_io's deref_var remains */
+        NIR_PASS_V(c->s, nir_opt_dce);
+}
+
 static void
 v3d_fixup_fs_output_types(struct v3d_compile *c)
 {
@@ -818,6 +880,18 @@ v3d_nir_lower_fs_early(struct v3d_compile *c)
         }
 }
 
+static void
+v3d_nir_lower_gs_late(struct v3d_compile *c)
+{
+        if (c->key->ucp_enables) {
+                NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables,
+                           false, NULL);
+        }
+
+        /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
+        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
+}
+
 static void
 v3d_nir_lower_vs_late(struct v3d_compile *c)
 {
@@ -907,6 +981,10 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                 c->vs_key = (struct v3d_vs_key *)key;
                 prog_data = rzalloc_size(NULL, sizeof(struct v3d_vs_prog_data));
                 break;
+        case MESA_SHADER_GEOMETRY:
+                c->gs_key = (struct v3d_gs_key *)key;
+                prog_data = rzalloc_size(NULL, sizeof(struct v3d_gs_prog_data));
+                break;
         case MESA_SHADER_FRAGMENT:
                 c->fs_key = (struct v3d_fs_key *)key;
                 prog_data = rzalloc_size(NULL, sizeof(struct v3d_fs_prog_data));
@@ -919,20 +997,35 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                 unreachable("unsupported shader stage");
         }
 
-        if (c->s->info.stage == MESA_SHADER_VERTEX) {
+
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
                 v3d_nir_lower_vs_early(c);
-        } else if (c->s->info.stage != MESA_SHADER_COMPUTE) {
-                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                v3d_nir_lower_gs_early(c);
+                break;
+        case MESA_SHADER_FRAGMENT:
                 v3d_nir_lower_fs_early(c);
+                break;
+        default:
+                break;
         }
 
         v3d_lower_nir(c);
 
-        if (c->s->info.stage == MESA_SHADER_VERTEX) {
+        switch (c->s->info.stage) {
+        case MESA_SHADER_VERTEX:
                 v3d_nir_lower_vs_late(c);
-        } else if (c->s->info.stage != MESA_SHADER_COMPUTE)  {
-                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
+                break;
+        case MESA_SHADER_GEOMETRY:
+                v3d_nir_lower_gs_late(c);
+                break;
+        case MESA_SHADER_FRAGMENT:
                 v3d_nir_lower_fs_late(c);
+                break;
+        default:
+                break;
         }
 
         NIR_PASS_V(c->s, v3d_nir_lower_io, c);
@@ -1134,7 +1227,9 @@ const char *
 vir_get_stage_name(struct v3d_compile *c)
 {
         if (c->vs_key && c->vs_key->is_coord)
-                return "MESA_SHADER_COORD";
+                return "MESA_SHADER_VERTEX_BIN";
+        else if (c->gs_key && c->gs_key->is_coord)
+                return "MESA_SHADER_GEOMETRY_BIN";
         else
                 return gl_shader_stage_name(c->s->info.stage);
 }
diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h
index ecedbaf9efb..bf85b42eb9d 100644
--- a/src/gallium/drivers/v3d/v3d_context.h
+++ b/src/gallium/drivers/v3d/v3d_context.h
@@ -59,7 +59,8 @@ void v3d_job_add_bo(struct v3d_job *job, struct v3d_bo *bo);
 #define VC5_DIRTY_ZSA                 (1ull <<  2)
 #define VC5_DIRTY_COMPTEX             (1ull <<  3)
 #define VC5_DIRTY_VERTTEX             (1ull <<  4)
-#define VC5_DIRTY_FRAGTEX             (1ull <<  5)
+#define VC5_DIRTY_GEOMTEX             (1ull <<  5)
+#define VC5_DIRTY_FRAGTEX             (1ull <<  6)
 
 #define VC5_DIRTY_SHADER_IMAGE        (1ull <<  9)
 #define VC5_DIRTY_BLEND_COLOR         (1ull << 10)
@@ -77,18 +78,22 @@ void v3d_job_add_bo(struct v3d_job *job, struct v3d_bo *bo);
 #define VC5_DIRTY_CLIP                (1ull << 22)
 #define VC5_DIRTY_UNCOMPILED_CS       (1ull << 23)
 #define VC5_DIRTY_UNCOMPILED_VS       (1ull << 24)
-#define VC5_DIRTY_UNCOMPILED_FS       (1ull << 25)
+#define VC5_DIRTY_UNCOMPILED_GS       (1ull << 25)
+#define VC5_DIRTY_UNCOMPILED_FS       (1ull << 26)
 
 #define VC5_DIRTY_COMPILED_CS         (1ull << 29)
 #define VC5_DIRTY_COMPILED_VS         (1ull << 30)
-#define VC5_DIRTY_COMPILED_FS         (1ull << 31)
-
-#define VC5_DIRTY_FS_INPUTS           (1ull << 35)
-#define VC5_DIRTY_STREAMOUT           (1ull << 36)
-#define VC5_DIRTY_OQ                  (1ull << 37)
-#define VC5_DIRTY_CENTROID_FLAGS      (1ull << 38)
-#define VC5_DIRTY_NOPERSPECTIVE_FLAGS (1ull << 39)
-#define VC5_DIRTY_SSBO                (1ull << 40)
+#define VC5_DIRTY_COMPILED_GS_BIN     (1ULL << 31)
+#define VC5_DIRTY_COMPILED_GS         (1ULL << 32)
+#define VC5_DIRTY_COMPILED_FS         (1ull << 33)
+
+#define VC5_DIRTY_FS_INPUTS           (1ull << 38)
+#define VC5_DIRTY_GS_INPUTS           (1ull << 39)
+#define VC5_DIRTY_STREAMOUT           (1ull << 40)
+#define VC5_DIRTY_OQ                  (1ull << 41)
+#define VC5_DIRTY_CENTROID_FLAGS      (1ull << 42)
+#define VC5_DIRTY_NOPERSPECTIVE_FLAGS (1ull << 43)
+#define VC5_DIRTY_SSBO                (1ull << 44)
 
 #define VC5_MAX_FS_INPUTS 64
 
@@ -206,6 +211,7 @@ struct v3d_compiled_shader {
         union {
                 struct v3d_prog_data *base;
                 struct v3d_vs_prog_data *vs;
+                struct v3d_gs_prog_data *gs;
                 struct v3d_fs_prog_data *fs;
                 struct v3d_compute_prog_data *compute;
         } prog_data;
@@ -219,8 +225,8 @@ struct v3d_compiled_shader {
 };
 
 struct v3d_program_stateobj {
-        struct v3d_uncompiled_shader *bind_vs, *bind_fs, *bind_compute;
-        struct v3d_compiled_shader *cs, *vs, *fs, *compute;
+        struct v3d_uncompiled_shader *bind_vs, *bind_gs, *bind_fs, *bind_compute;
+        struct v3d_compiled_shader *cs, *vs, *gs_bin, *gs, *fs, *compute;
 
         struct hash_table *cache[MESA_SHADER_STAGES];
 
diff --git a/src/gallium/drivers/v3d/v3d_program.c b/src/gallium/drivers/v3d/v3d_program.c
index 0f7762f119d..7bbdbe409e2 100644
--- a/src/gallium/drivers/v3d/v3d_program.c
+++ b/src/gallium/drivers/v3d/v3d_program.c
@@ -205,8 +205,12 @@ v3d_shader_precompile(struct v3d_context *v3d,
                 v3d_setup_shared_precompile_key(so, &key.base);
                 v3d_get_compiled_shader(v3d, &key.base, sizeof(key));
         } else {
+                /* FIXME: add geometry shaders */
+
                 struct v3d_vs_key key = {
                         .base.shader_state = so,
+                        /* Emit fixed function outputs */
+                        .base.is_last_geometry_stage = true,
                 };
 
                 v3d_setup_shared_precompile_key(so, &key.base);
@@ -271,8 +275,10 @@ v3d_uncompiled_shader_create(struct pipe_context *pctx,
         }
 
         nir_variable_mode lower_mode = nir_var_all & ~nir_var_uniform;
-        if (s->info.stage == MESA_SHADER_VERTEX)
+        if (s->info.stage == MESA_SHADER_VERTEX ||
+            s->info.stage == MESA_SHADER_GEOMETRY) {
                 lower_mode &= ~(nir_var_shader_in | nir_var_shader_out);
+        }
         NIR_PASS_V(s, nir_lower_io, lower_mode,
                    type_size,
                    (nir_lower_io_options)0);
@@ -609,55 +615,153 @@ v3d_update_compiled_fs(struct v3d_context *v3d, uint8_t prim_mode)
 }
 
 static void
-v3d_update_compiled_vs(struct v3d_context *v3d, uint8_t prim_mode)
+v3d_update_compiled_gs(struct v3d_context *v3d, uint8_t prim_mode)
 {
-        struct v3d_vs_key local_key;
-        struct v3d_vs_key *key = &local_key;
+        struct v3d_gs_key local_key;
+        struct v3d_gs_key *key = &local_key;
 
-        if (!(v3d->dirty & (VC5_DIRTY_PRIM_MODE |
+        if (!(v3d->dirty & (VC5_DIRTY_GEOMTEX |
                             VC5_DIRTY_RASTERIZER |
-                            VC5_DIRTY_VERTTEX |
-                            VC5_DIRTY_VTXSTATE |
-                            VC5_DIRTY_UNCOMPILED_VS |
+                            VC5_DIRTY_UNCOMPILED_GS |
+                            VC5_DIRTY_PRIM_MODE |
                             VC5_DIRTY_FS_INPUTS))) {
                 return;
         }
 
+        if (!v3d->prog.bind_gs) {
+                v3d->prog.gs = NULL;
+                v3d->prog.gs_bin = NULL;
+                return;
+        }
+
         memset(key, 0, sizeof(*key));
-        v3d_setup_shared_key(v3d, &key->base, &v3d->tex[PIPE_SHADER_VERTEX]);
-        key->base.shader_state = v3d->prog.bind_vs;
+        v3d_setup_shared_key(v3d, &key->base, &v3d->tex[PIPE_SHADER_GEOMETRY]);
+        key->base.shader_state = v3d->prog.bind_gs;
         key->base.ucp_enables = v3d->rasterizer->base.clip_plane_enable;
+        key->base.is_last_geometry_stage = true;
         key->num_used_outputs = v3d->prog.fs->prog_data.fs->num_inputs;
         STATIC_ASSERT(sizeof(key->used_outputs) ==
                       sizeof(v3d->prog.fs->prog_data.fs->input_slots));
         memcpy(key->used_outputs, v3d->prog.fs->prog_data.fs->input_slots,
                sizeof(key->used_outputs));
-        key->clamp_color = v3d->rasterizer->base.clamp_vertex_color;
 
         key->per_vertex_point_size =
                 (prim_mode == PIPE_PRIM_POINTS &&
                  v3d->rasterizer->base.point_size_per_vertex);
 
-        struct v3d_compiled_shader *vs =
+        struct v3d_compiled_shader *gs =
                 v3d_get_compiled_shader(v3d, &key->base, sizeof(*key));
-        if (vs != v3d->prog.vs) {
-                v3d->prog.vs = vs;
-                v3d->dirty |= VC5_DIRTY_COMPILED_VS;
+        if (gs != v3d->prog.gs) {
+                v3d->prog.gs = gs;
+                v3d->dirty |= VC5_DIRTY_COMPILED_GS;
         }
 
         key->is_coord = true;
-        /* Coord shaders only output varyings used by transform feedback. */
+
+        /* The last bin-mode shader in the geometry pipeline only outputs
+         * varyings used by transform feedback.
+         */
         struct v3d_uncompiled_shader *shader_state = key->base.shader_state;
         memcpy(key->used_outputs, shader_state->tf_outputs,
                sizeof(*key->used_outputs) * shader_state->num_tf_outputs);
         if (shader_state->num_tf_outputs < key->num_used_outputs) {
+                uint32_t size = sizeof(*key->used_outputs) *
+                                (key->num_used_outputs -
+                                 shader_state->num_tf_outputs);
                 memset(&key->used_outputs[shader_state->num_tf_outputs],
-                       0,
-                       sizeof(*key->used_outputs) * (key->num_used_outputs -
-                                                  shader_state->num_tf_outputs));
+                       0, size);
         }
         key->num_used_outputs = shader_state->num_tf_outputs;
 
+        struct v3d_compiled_shader *old_gs = v3d->prog.gs;
+        struct v3d_compiled_shader *gs_bin =
+                v3d_get_compiled_shader(v3d, &key->base, sizeof(*key));
+        if (gs_bin != old_gs) {
+                v3d->prog.gs_bin = gs_bin;
+                v3d->dirty |= VC5_DIRTY_COMPILED_GS_BIN;
+        }
+
+        if (old_gs && memcmp(v3d->prog.gs->prog_data.gs->input_slots,
+                             old_gs->prog_data.gs->input_slots,
+                             sizeof(v3d->prog.gs->prog_data.gs->input_slots))) {
+                v3d->dirty |= VC5_DIRTY_GS_INPUTS;
+        }
+}
+
+static void
+v3d_update_compiled_vs(struct v3d_context *v3d, uint8_t prim_mode)
+{
+        struct v3d_vs_key local_key;
+        struct v3d_vs_key *key = &local_key;
+
+        if (!(v3d->dirty & (VC5_DIRTY_VERTTEX |
+                            VC5_DIRTY_VTXSTATE |
+                            VC5_DIRTY_UNCOMPILED_VS |
+                            (v3d->prog.bind_gs ? 0 : VC5_DIRTY_RASTERIZER) |
+                            (v3d->prog.bind_gs ? 0 : VC5_DIRTY_PRIM_MODE) |
+                            (v3d->prog.bind_gs ? VC5_DIRTY_GS_INPUTS :
+                                                 VC5_DIRTY_FS_INPUTS)))) {
+                return;
+        }
+
+        memset(key, 0, sizeof(*key));
+        v3d_setup_shared_key(v3d, &key->base, &v3d->tex[PIPE_SHADER_VERTEX]);
+        key->base.shader_state = v3d->prog.bind_vs;
+        key->base.ucp_enables = v3d->rasterizer->base.clip_plane_enable;
+        key->base.is_last_geometry_stage = !v3d->prog.bind_gs;
+
+        if (!v3d->prog.bind_gs) {
+            key->num_used_outputs = v3d->prog.fs->prog_data.fs->num_inputs;
+            STATIC_ASSERT(sizeof(key->used_outputs) ==
+                          sizeof(v3d->prog.fs->prog_data.fs->input_slots));
+            memcpy(key->used_outputs, v3d->prog.fs->prog_data.fs->input_slots,
+                   sizeof(key->used_outputs));
+        } else {
+            key->num_used_outputs = v3d->prog.gs->prog_data.gs->num_inputs;
+            STATIC_ASSERT(sizeof(key->used_outputs) ==
+                          sizeof(v3d->prog.gs->prog_data.gs->input_slots));
+            memcpy(key->used_outputs, v3d->prog.gs->prog_data.gs->input_slots,
+                   sizeof(key->used_outputs));
+        }
+
+        key->clamp_color = v3d->rasterizer->base.clamp_vertex_color;
+
+        key->per_vertex_point_size =
+                (prim_mode == PIPE_PRIM_POINTS &&
+                 v3d->rasterizer->base.point_size_per_vertex);
+
+        struct v3d_compiled_shader *vs =
+                v3d_get_compiled_shader(v3d, &key->base, sizeof(*key));
+        if (vs != v3d->prog.vs) {
+                v3d->prog.vs = vs;
+                v3d->dirty |= VC5_DIRTY_COMPILED_VS;
+        }
+
+        key->is_coord = true;
+
+        /* Coord shaders only output varyings used by transform feedback,
+         * unless they are linked to other shaders in the geometry side
+         * of the pipeline, since in that case any of the output varyings
+         * could be required in later geometry stages to compute
+         * gl_Position or TF outputs.
+         */
+        if (!v3d->prog.bind_gs) {
+                struct v3d_uncompiled_shader *shader_state =
+                        key->base.shader_state;
+                memcpy(key->used_outputs, shader_state->tf_outputs,
+                       sizeof(*key->used_outputs) *
+                       shader_state->num_tf_outputs);
+                if (shader_state->num_tf_outputs < key->num_used_outputs) {
+                        uint32_t tail_bytes =
+                                sizeof(*key->used_outputs) *
+                                (key->num_used_outputs -
+                                 shader_state->num_tf_outputs);
+                        memset(&key->used_outputs[shader_state->num_tf_outputs],
+                               0, tail_bytes);
+                }
+                key->num_used_outputs = shader_state->num_tf_outputs;
+        }
+
         struct v3d_compiled_shader *cs =
                 v3d_get_compiled_shader(v3d, &key->base, sizeof(*key));
         if (cs != v3d->prog.cs) {
@@ -670,6 +774,7 @@ void
 v3d_update_compiled_shaders(struct v3d_context *v3d, uint8_t prim_mode)
 {
         v3d_update_compiled_fs(v3d, prim_mode);
+        v3d_update_compiled_gs(v3d, prim_mode);
         v3d_update_compiled_vs(v3d, prim_mode);
 }
 
@@ -702,6 +807,12 @@ fs_cache_hash(const void *key)
         return _mesa_hash_data(key, sizeof(struct v3d_fs_key));
 }
 
+static uint32_t
+gs_cache_hash(const void *key)
+{
+        return _mesa_hash_data(key, sizeof(struct v3d_gs_key));
+}
+
 static uint32_t
 vs_cache_hash(const void *key)
 {
@@ -720,6 +831,12 @@ fs_cache_compare(const void *key1, const void *key2)
         return memcmp(key1, key2, sizeof(struct v3d_fs_key)) == 0;
 }
 
+static bool
+gs_cache_compare(const void *key1, const void *key2)
+{
+        return memcmp(key1, key2, sizeof(struct v3d_gs_key)) == 0;
+}
+
 static bool
 vs_cache_compare(const void *key1, const void *key2)
 {
@@ -771,6 +888,14 @@ v3d_fp_state_bind(struct pipe_context *pctx, void *hwcso)
         v3d->dirty |= VC5_DIRTY_UNCOMPILED_FS;
 }
 
+static void
+v3d_gp_state_bind(struct pipe_context *pctx, void *hwcso)
+{
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->prog.bind_gs = hwcso;
+        v3d->dirty |= VC5_DIRTY_UNCOMPILED_GS;
+}
+
 static void
 v3d_vp_state_bind(struct pipe_context *pctx, void *hwcso)
 {
@@ -804,10 +929,14 @@ v3d_program_init(struct pipe_context *pctx)
         pctx->create_vs_state = v3d_shader_state_create;
         pctx->delete_vs_state = v3d_shader_state_delete;
 
+        pctx->create_gs_state = v3d_shader_state_create;
+        pctx->delete_gs_state = v3d_shader_state_delete;
+
         pctx->create_fs_state = v3d_shader_state_create;
         pctx->delete_fs_state = v3d_shader_state_delete;
 
         pctx->bind_fs_state = v3d_fp_state_bind;
+        pctx->bind_gs_state = v3d_gp_state_bind;
         pctx->bind_vs_state = v3d_vp_state_bind;
 
         if (v3d->screen->has_csd) {
@@ -818,6 +947,8 @@ v3d_program_init(struct pipe_context *pctx)
 
         v3d->prog.cache[MESA_SHADER_VERTEX] =
                 _mesa_hash_table_create(pctx, vs_cache_hash, vs_cache_compare);
+        v3d->prog.cache[MESA_SHADER_GEOMETRY] =
+                _mesa_hash_table_create(pctx, gs_cache_hash, gs_cache_compare);
         v3d->prog.cache[MESA_SHADER_FRAGMENT] =
                 _mesa_hash_table_create(pctx, fs_cache_hash, fs_cache_compare);
         v3d->prog.cache[MESA_SHADER_COMPUTE] =