* output targets.
  */
 void
-v3d_tf_update_counters(struct v3d_context *v3d)
+v3d_update_primitive_counters(struct v3d_context *v3d)
 {
         struct v3d_job *job = v3d_get_job_for_fbo(v3d);
         if (job->draw_calls_queued == 0)
 
 
 struct v3d_fence *v3d_fence_create(struct v3d_context *v3d);
 
-void v3d_tf_update_counters(struct v3d_context *v3d);
+void v3d_update_primitive_counters(struct v3d_context *v3d);
 
 #ifdef v3dX
 #  include "v3dx_context.h"
 
         if (v3d_bo_wait(rsc->bo, PIPE_TIMEOUT_INFINITE, "prim-counts")) {
                 uint32_t *map = v3d_bo_map(rsc->bo) + v3d->prim_counts_offset;
                 v3d->tf_prims_generated += map[V3D_PRIM_COUNTS_TF_WRITTEN];
+                /* When we only have a vertex shader we determine the primitive
+                 * count on the CPU, so don't update it here again.
+                 */
+                if (v3d->prog.gs)
+                        v3d->prims_generated += map[V3D_PRIM_COUNTS_WRITTEN];
         }
 }
 
 
 
         switch (q->type) {
         case PIPE_QUERY_PRIMITIVES_GENERATED:
+                /* If we are using PRIMITIVE_COUNTS_FEEDBACK to retrieve
+                 * primitive counts from the GPU (which we need when a GS
+                 * is present), then we need to update our counters now
+                 * to discard any primitives generated before this.
+                 */
+                if (v3d->prog.gs)
+                        v3d_update_primitive_counters(v3d);
                 q->start = v3d->prims_generated;
                 break;
         case PIPE_QUERY_PRIMITIVES_EMITTED:
                  * primitive counts to skip primtives recorded before this.
                  */
                 if (v3d->streamout.num_targets > 0)
-                        v3d_tf_update_counters(v3d);
+                        v3d_update_primitive_counters(v3d);
                 q->start = v3d->tf_prims_generated;
                 break;
         case PIPE_QUERY_OCCLUSION_COUNTER:
 
         switch (q->type) {
         case PIPE_QUERY_PRIMITIVES_GENERATED:
+                /* If we are using PRIMITIVE_COUNTS_FEEDBACK to retrieve
+                 * primitive counts from the GPU (which we need when a GS
+                 * is present), then we need to update our counters now.
+                 */
+                if (v3d->prog.gs)
+                        v3d_update_primitive_counters(v3d);
                 q->end = v3d->prims_generated;
                 break;
         case PIPE_QUERY_PRIMITIVES_EMITTED:
                  * time. Otherwise, we have to do it now.
                  */
                 if (v3d->streamout.num_targets > 0)
-                        v3d_tf_update_counters(v3d);
+                        v3d_update_primitive_counters(v3d);
                 q->end = v3d->tf_prims_generated;
                 break;
         case PIPE_QUERY_OCCLUSION_COUNTER:
 
 }
 
 /**
- * Updates the number of primitvies generated from the number of vertices
- * to draw. We do this here instead of using PRIMITIVE_COUNTS_FEEDBACK because
- * using the GPU packet for this might require sync waits and this is trivial
- * to handle in the CPU instead.
+ * Updates the number of primitives generated from the number of vertices
+ * to draw. This only works when no GS is present, since otherwise the number
+ * of primitives generated cannot be determined in advance and we need to
+ * use the PRIMITIVE_COUNTS_FEEDBACK command instead. That command requires
+ * a sync wait for the draw to complete, so we only use it when a GS is present.
  */
 static void
 v3d_update_primitives_generated_counter(struct v3d_context *v3d,
                                         const struct pipe_draw_info *info)
 {
+        assert(!v3d->prog.gs);
+
         if (!v3d->active_queries)
                 return;
 
          */
         if (v3d->streamout.num_targets > 0 &&
             u_base_prim_type(info->mode) != u_base_prim_type(v3d->prim_mode)) {
-                v3d_tf_update_counters(v3d);
+                v3d_update_primitive_counters(v3d);
         }
 
         struct v3d_job *job = v3d_get_job_for_fbo(v3d);
                 prim_tf_enable = (V3D_PRIM_POINTS_TF - V3D_PRIM_POINTS);
 #endif
 
-        v3d_update_primitives_generated_counter(v3d, info);
+        if (!v3d->prog.gs)
+                v3d_update_primitives_generated_counter(v3d, info);
 
         uint32_t hw_prim_type = v3d_hw_prim_type(info->mode);
         if (info->index_size) {
 
          * draw we need to do it here as well.
          */
         if (num_targets == 0 && so->num_targets > 0)
-                v3d_tf_update_counters(ctx);
+                v3d_update_primitive_counters(ctx);
 
         for (i = 0; i < num_targets; i++) {
                 if (offsets[i] != -1)