<value name="packed complete patches" value="2"/>
   </enum>
 
+  <!-- Indices of the individual counters in the primitive counts buffer;
+       the driver reads that buffer as an array of 32-bit words and uses
+       these values (V3D_PRIM_COUNTS_*) to index it.
+       NOTE(review): "Primitve" in the name below looks like a typo for
+       "Primitive"; confirm nothing depends on the current spelling before
+       renaming. -->
+  <enum name="Primitve counters" prefix="V3D_PRIM_COUNTS">
+    <value name="tf_words_buffer0" value="0"/>
+    <value name="tf_words_buffer1" value="1"/>
+    <value name="tf_words_buffer2" value="2"/>
+    <value name="tf_words_buffer3" value="3"/>
+    <value name="written" value="4"/>
+    <value name="tf_written" value="5"/>
+    <value name="tf_overflow" value="6"/>
+  </enum>
+
   <packet code="0" name="Halt"/>
   <packet code="1" name="NOP"/>
   <packet code="4" name="Flush"/>
 
 #include "util/u_memory.h"
 #include "util/u_blitter.h"
 #include "util/u_upload_mgr.h"
+#include "util/u_prim.h"
 #include "indices/u_primconvert.h"
 #include "pipe/p_screen.h"
 
                 job->store &= ~(PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL);
 }
 
+/**
+ * Brings the transform feedback counters up to date.
+ *
+ * The current job is submitted so that its primitive counts get accumulated
+ * into v3d->tf_prims_generated. Whatever that submission added is then
+ * converted to a vertex count, using the base primitive type of
+ * v3d->prim_mode, and added to the recorded vertex count of every bound
+ * stream output target.
+ *
+ * Nothing happens if the job has no draw calls queued.
+ */
+void
+v3d_tf_update_counters(struct v3d_context *v3d)
+{
+        struct v3d_job *job = v3d_get_job_for_fbo(v3d);
+        if (!job->draw_calls_queued)
+                return;
+
+        /* The counts only reach memory once the job has executed, so it has
+         * to be submitted here. Note that this implies a sync wait for the
+         * buffer write.
+         */
+        const uint32_t old_total = v3d->tf_prims_generated;
+        v3d_job_submit(v3d, job);
+        const uint32_t new_prims = v3d->tf_prims_generated - old_total;
+        if (new_prims == 0)
+                return;
+
+        /* Translate the newly written primitives into vertices. */
+        enum pipe_prim_type base_prim = u_base_prim_type(v3d->prim_mode);
+        uint32_t new_verts = u_vertices_for_prims(base_prim, new_prims);
+
+        for (int t = 0; t < v3d->streamout.num_targets; t++) {
+                struct v3d_stream_output_target *target =
+                        v3d_stream_output_target(v3d->streamout.targets[t]);
+                target->recorded_vertex_count += new_verts;
+        }
+}
+
 static void
 v3d_context_destroy(struct pipe_context *pctx)
 {
         if (v3d->state_uploader)
                 u_upload_destroy(v3d->state_uploader);
 
+        if (v3d->prim_counts)
+                pipe_resource_reference(&v3d->prim_counts, NULL);
+
         slab_destroy_child(&v3d->transfer_pool);
 
         pipe_surface_reference(&v3d->framebuffer.cbufs[0], NULL);
 
         struct v3d_vertexbuf_stateobj vertexbuf;
         struct v3d_streamout_stateobj streamout;
         struct v3d_bo *current_oq;
+        struct pipe_resource *prim_counts;
+        uint32_t prim_counts_offset;
         struct pipe_debug_callback debug;
         /** @} */
 };
 
 struct v3d_fence *v3d_fence_create(struct v3d_context *v3d);
 
+void v3d_tf_update_counters(struct v3d_context *v3d);
+
 #ifdef v3dX
 #  include "v3dx_context.h"
 #else
 
         clif_dump_destroy(clif);
 }
 
+/**
+ * Waits for the primitive counts BO to become idle and adds its
+ * "TF written" counter to v3d->tf_prims_generated.
+ *
+ * The counters are sub-allocated with u_upload_data(), so they live
+ * v3d->prim_counts_offset bytes into the BO rather than at its start. That
+ * same offset is what the PRIMITIVE_COUNTS_FEEDBACK packet is given as the
+ * write address, so the CPU readback has to apply it as well.
+ *
+ * If the wait fails, the counter is left untouched.
+ */
+static void
+v3d_read_and_accumulate_primitive_counters(struct v3d_context *v3d)
+{
+        assert(v3d->prim_counts);
+
+        perf_debug("stalling on TF counts readback");
+        struct v3d_resource *rsc = v3d_resource(v3d->prim_counts);
+        if (v3d_bo_wait(rsc->bo, PIPE_TIMEOUT_INFINITE, "prim-counts")) {
+                /* The mapping starts at the BO base; step to our counters. */
+                const uint8_t *base = v3d_bo_map(rsc->bo);
+                const uint32_t *counts =
+                        (const uint32_t *)(base + v3d->prim_counts_offset);
+                v3d->tf_prims_generated += counts[V3D_PRIM_COUNTS_TF_WRITTEN];
+        }
+}
+
 /**
  * Submits the job to the kernel and then reinitializes it.
  */
                                         "Expect corruption.\n", strerror(errno));
                         warned = true;
                 }
+
+                /* If we are submitting a job in the middle of transform
+                 * feedback we need to read the primitive counts and accumulate
+                 * them, otherwise they will be reset at the start of the next
+                 * draw when we emit the Tile Binning Mode Configuration packet.
+                 */
+                if (v3d->streamout.num_targets)
+                        v3d_read_and_accumulate_primitive_counters(v3d);
         }
 
 done:
 
                 q->start = v3d->prims_generated;
                 break;
         case PIPE_QUERY_PRIMITIVES_EMITTED:
+                /* If we are inside transform feedback we need to update the
+                 * primitive counts to skip primitives recorded before this.
+                 */
+                if (v3d->streamout.num_targets > 0)
+                        v3d_tf_update_counters(v3d);
                 q->start = v3d->tf_prims_generated;
                 break;
         case PIPE_QUERY_OCCLUSION_COUNTER:
                 q->end = v3d->prims_generated;
                 break;
         case PIPE_QUERY_PRIMITIVES_EMITTED:
+                /* If transform feedback has ended, then we have already
+                 * updated the primitive counts at glEndTransformFeedback()
+                 * time. Otherwise, we have to do it now.
+                 */
+                if (v3d->streamout.num_targets > 0)
+                        v3d_tf_update_counters(v3d);
                 q->end = v3d->tf_prims_generated;
                 break;
         case PIPE_QUERY_OCCLUSION_COUNTER:
 
 }
 
 /**
- * Computes the various transform feedback statistics, since they can't be
- * recorded by CL packets.
+ * Updates the number of primitives generated from the number of vertices
+ * to draw. We do this here instead of using PRIMITIVE_COUNTS_FEEDBACK because
+ * using the GPU packet for this might require sync waits and this is trivial
+ * to handle in the CPU instead.
  */
 static void
-v3d_tf_statistics_record(struct v3d_context *v3d,
-                         const struct pipe_draw_info *info)
+v3d_update_primitives_generated_counter(struct v3d_context *v3d,
+                                        const struct pipe_draw_info *info)
 {
         if (!v3d->active_queries)
                 return;
 
         uint32_t prims = u_prims_for_vertices(info->mode, info->count);
         v3d->prims_generated += prims;
-
-        if (v3d->streamout.num_targets <= 0)
-                return;
-
-        /* XXX: Only count if we didn't overflow. */
-        v3d->tf_prims_generated += prims;
-        for (int i = 0; i < v3d->streamout.num_targets; i++) {
-                struct v3d_stream_output_target *target =
-                        v3d_stream_output_target(v3d->streamout.targets[i]);
-                target->recorded_vertex_count += info->count;
-        }
 }
 
 static void
 
         v3d_predraw_check_outputs(pctx);
 
+        /* If transform feedback is active and we are switching primitive type
+         * we need to submit the job before drawing and update the vertex count
+         * written to TF based on the primitive type since we will need to
+         * know the exact vertex count if the application decides to call
+         * glDrawTransformFeedback() later.
+         */
+        if (v3d->streamout.num_targets > 0 &&
+            u_base_prim_type(info->mode) != u_base_prim_type(v3d->prim_mode)) {
+                v3d_tf_update_counters(v3d);
+        }
+
         struct v3d_job *job = v3d_get_job_for_fbo(v3d);
 
         /* If vertex texturing depends on the output of rendering, we need to
                 prim_tf_enable = (V3D_PRIM_POINTS_TF - V3D_PRIM_POINTS);
 #endif
 
-        v3d_tf_statistics_record(v3d, info);
+        v3d_update_primitives_generated_counter(v3d, info);
 
         /* Note that the primitive type fields match with OpenGL/gallium
          * definitions, up to but not including QUADS.
 
 #endif
                                                 cl_packet_length(FLUSH));
 
+                if (job->tf_enabled) {
+                        /* Write primitive counts to memory. */
+                        assert(v3d->prim_counts);
+                        struct v3d_resource *rsc =
+                                v3d_resource(v3d->prim_counts);
+                        cl_emit(&job->bcl, PRIMITIVE_COUNTS_FEEDBACK, counter) {
+                                counter.address =
+                                        cl_address(rsc->bo,
+                                                   v3d->prim_counts_offset);
+                                counter.read_write_64byte = false;
+                                counter.op = 0;
+                        }
+                }
+
                 /* Disable TF at the end of the CL, so that the TF block
                  * cleans up and finishes before it gets reset by the next
                  * frame's tile binning mode cfg packet. (SWVC5-718).
 
 
         assert(num_targets <= ARRAY_SIZE(so->targets));
 
+        /* Update recorded vertex counts when we are ending the recording of
+         * transform feedback. We do this when we switch primitive types
+         * at draw time, but if we haven't switched primitives in our last
+         * draw we need to do it here as well.
+         */
+        if (num_targets == 0 && so->num_targets > 0)
+                v3d_tf_update_counters(ctx);
+
         for (i = 0; i < num_targets; i++) {
                 if (offsets[i] != -1)
                         so->offsets[i] = offsets[i];
 
         so->num_targets = num_targets;
 
+        /* Create primitive counters BO if needed */
+        if (num_targets > 0 && !ctx->prim_counts) {
+                uint32_t zeroes[7] = { 0 }; /* Init all 7 counters to 0 */
+                u_upload_data(ctx->uploader,
+                              0, sizeof(zeroes), 32, zeroes,
+                              &ctx->prim_counts_offset,
+                              &ctx->prim_counts);
+        }
+
         ctx->dirty |= VC5_DIRTY_STREAMOUT;
 }