v3d: use the GPU to record primitives written to transform feedback
author Iago Toral Quiroga <itoral@igalia.com>
Thu, 1 Aug 2019 10:30:34 +0000 (12:30 +0200)
committer Iago Toral Quiroga <itoral@igalia.com>
Thu, 8 Aug 2019 06:36:52 +0000 (08:36 +0200)
We can use the PRIMITIVE_COUNTS_FEEDBACK packet to write various primitive
counts to a buffer, including the number of primitives written to transform
feedback buffers, which will handle buffer overflow correctly.

There are a couple of caveats with this:

Primitive counters are reset when we emit a 'Tile Binning Mode Configuration'
packet, which can happen in the middle of a primitives query, so we need to
read the buffer when we submit a job and accumulate the counts in the context
so we don't lose them.

We also need to do the same when we switch primitive type during transform
feedback so we can compute the correct number of recorded vertices from
the number of primitives. This is necessary so we can provide an accurate
vertex count for draw from transform feedback.

v2:
 - When computing the number of vertices for a primitive, pass in the base
   primitive, since that is what the hardware will count.
 - No need to update primitive counts when switching primitive types if
   the base primitives are the same.
 - Log perf warning when mapping the primitive counts BO for readback (Eric).
 - Only emit the primitive counts packet once at job end (Eric).
 - Use u_upload mechanism for the primitive counts buffer (Eric).
 - Use the XML to generate indices into the primitive counters buffer (Eric).

Fixes piglit tests:
spec/ext_transform_feedback/overflow-edge-cases
spec/ext_transform_feedback/query-primitives_written-bufferrange
spec/ext_transform_feedback/query-primitives_written-bufferrange-discard
spec/ext_transform_feedback/change-size base-shrink
spec/ext_transform_feedback/change-size base-grow
spec/ext_transform_feedback/change-size offset-shrink
spec/ext_transform_feedback/change-size offset-grow
spec/ext_transform_feedback/change-size range-shrink
spec/ext_transform_feedback/change-size range-grow
spec/ext_transform_feedback/intervening-read prims-written

Reviewed-by: Eric Anholt <eric@anholt.net>
src/broadcom/cle/v3d_packet_v33.xml
src/gallium/drivers/v3d/v3d_context.c
src/gallium/drivers/v3d/v3d_context.h
src/gallium/drivers/v3d/v3d_job.c
src/gallium/drivers/v3d/v3d_query.c
src/gallium/drivers/v3d/v3dx_draw.c
src/gallium/drivers/v3d/v3dx_job.c
src/gallium/drivers/v3d/v3dx_state.c

index ebb7264f4234dce35b49947c81a462614a4a86d0..f40796612f9ce55d2c6bceb16a3d1dbb05748fdb 100644 (file)
     <value name="packed complete patches" value="2"/>
   </enum>
 
+  <enum name="Primitve counters" prefix="V3D_PRIM_COUNTS">
+    <value name="tf_words_buffer0" value="0"/>
+    <value name="tf_words_buffer1" value="1"/>
+    <value name="tf_words_buffer2" value="2"/>
+    <value name="tf_words_buffer3" value="3"/>
+    <value name="written" value="4"/>
+    <value name="tf_written" value="5"/>
+    <value name="tf_overflow" value="6"/>
+  </enum>
+
   <packet code="0" name="Halt"/>
   <packet code="1" name="NOP"/>
   <packet code="4" name="Flush"/>
index fcd2d5ec69ba70ceb89ce0e7aaaa097ee7c7990c..8dc8dd635816677d3c5b273af4d00fd663ee59d6 100644 (file)
@@ -31,6 +31,7 @@
 #include "util/u_memory.h"
 #include "util/u_blitter.h"
 #include "util/u_upload_mgr.h"
+#include "util/u_prim.h"
 #include "indices/u_primconvert.h"
 #include "pipe/p_screen.h"
 
@@ -109,6 +110,39 @@ v3d_invalidate_resource(struct pipe_context *pctx, struct pipe_resource *prsc)
                 job->store &= ~(PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL);
 }
 
+/**
+ * Flushes the current job to get up-to-date primitive counts written to the
+ * primitive counts BO, then accumulates the transform feedback primitive count
+ * in the context and the corresponding vertex counts in the bound stream
+ * output targets. Does nothing if the job has no draw calls queued.
+ */
+void
+v3d_tf_update_counters(struct v3d_context *v3d)
+{
+        struct v3d_job *job = v3d_get_job_for_fbo(v3d);
+        if (job->draw_calls_queued == 0)
+                return;
+
+        /* In order to get up-to-date primitive counts we need to submit
+         * the job for execution so we get the counts written to memory.
+         * Notice that this will require a sync wait for the buffer write.
+         */
+        uint32_t prims_before = v3d->tf_prims_generated;
+        v3d_job_submit(v3d, job);
+        uint32_t prims_after = v3d->tf_prims_generated;
+        if (prims_before == prims_after)
+                return;
+
+        enum pipe_prim_type prim_type = u_base_prim_type(v3d->prim_mode);
+        uint32_t num_verts = u_vertices_for_prims(prim_type,
+                                                  prims_after - prims_before);
+        for (int i = 0; i < v3d->streamout.num_targets; i++) {
+                struct v3d_stream_output_target *so =
+                        v3d_stream_output_target(v3d->streamout.targets[i]);
+                so->recorded_vertex_count += num_verts;
+        }
+}
+
 static void
 v3d_context_destroy(struct pipe_context *pctx)
 {
@@ -127,6 +161,9 @@ v3d_context_destroy(struct pipe_context *pctx)
         if (v3d->state_uploader)
                 u_upload_destroy(v3d->state_uploader);
 
+        if (v3d->prim_counts)
+                pipe_resource_reference(&v3d->prim_counts, NULL);
+
         slab_destroy_child(&v3d->transfer_pool);
 
         pipe_surface_reference(&v3d->framebuffer.cbufs[0], NULL);
index 5f19504dbae10d2c42c61aa013807f3f09329a3b..b2c917df2409d418efb3e74c9a4c0d995c5987d8 100644 (file)
@@ -498,6 +498,8 @@ struct v3d_context {
         struct v3d_vertexbuf_stateobj vertexbuf;
         struct v3d_streamout_stateobj streamout;
         struct v3d_bo *current_oq;
+        struct pipe_resource *prim_counts;
+        uint32_t prim_counts_offset;
         struct pipe_debug_callback debug;
         /** @} */
 };
@@ -652,6 +654,8 @@ bool v3d_generate_mipmap(struct pipe_context *pctx,
 
 struct v3d_fence *v3d_fence_create(struct v3d_context *v3d);
 
+void v3d_tf_update_counters(struct v3d_context *v3d);
+
 #ifdef v3dX
 #  include "v3dx_context.h"
 #else
index 3226059161acd4db99068d32de2b67b88d84acfc..17fc41d317ab76c11392475ef037552928337b2f 100644 (file)
@@ -429,6 +429,19 @@ v3d_clif_dump(struct v3d_context *v3d, struct v3d_job *job)
         clif_dump_destroy(clif);
 }
 
+/**
+ * Waits for the primitive counts BO to be idle and adds the number of
+ * primitives the GPU reported as written to transform feedback to
+ * v3d->tf_prims_generated.
+ *
+ * The counters do not necessarily live at the start of the BO: the buffer
+ * is sub-allocated with u_upload_data() and the PRIMITIVE_COUNTS_FEEDBACK
+ * packet is pointed at v3d->prim_counts_offset, so the CPU readback has to
+ * apply the same offset. Nothing is accumulated if the wait fails.
+ */
+static void
+v3d_read_and_accumulate_primitive_counters(struct v3d_context *v3d)
+{
+        assert(v3d->prim_counts);
+
+        perf_debug("stalling on TF counts readback");
+        struct v3d_resource *rsc = v3d_resource(v3d->prim_counts);
+        if (!v3d_bo_wait(rsc->bo, PIPE_TIMEOUT_INFINITE, "prim-counts"))
+                return;
+
+        /* prim_counts_offset is the same byte offset handed to cl_address()
+         * for the GPU write, so step in bytes before indexing the 32-bit
+         * counters.
+         */
+        const uint8_t *base = (const uint8_t *)v3d_bo_map(rsc->bo);
+        const uint32_t *counts =
+                (const uint32_t *)(base + v3d->prim_counts_offset);
+        v3d->tf_prims_generated += counts[V3D_PRIM_COUNTS_TF_WRITTEN];
+}
+
 /**
  * Submits the job to the kernel and then reinitializes it.
  */
@@ -488,6 +501,14 @@ v3d_job_submit(struct v3d_context *v3d, struct v3d_job *job)
                                         "Expect corruption.\n", strerror(errno));
                         warned = true;
                 }
+
+                /* If we are submitting a job in the middle of transform
+                 * feedback we need to read the primitive counts and accumulate
+                 * them, otherwise they will be reset at the start of the next
+                 * draw when we emit the Tile Binning Mode Configuration packet.
+                 */
+                if (v3d->streamout.num_targets)
+                        v3d_read_and_accumulate_primitive_counters(v3d);
         }
 
 done:
index d31b9dd896b98a801e27f70482f914f4d458880a..72bb2e43c51a1c04a3d7e50dcf13517aaf7d4434 100644 (file)
@@ -75,6 +75,11 @@ v3d_begin_query(struct pipe_context *pctx, struct pipe_query *query)
                 q->start = v3d->prims_generated;
                 break;
         case PIPE_QUERY_PRIMITIVES_EMITTED:
+                /* If we are inside transform feedback we need to update the
+                 * primitive counts to skip primitives recorded before this.
+                 */
+                if (v3d->streamout.num_targets > 0)
+                        v3d_tf_update_counters(v3d);
                 q->start = v3d->tf_prims_generated;
                 break;
         case PIPE_QUERY_OCCLUSION_COUNTER:
@@ -105,6 +110,12 @@ v3d_end_query(struct pipe_context *pctx, struct pipe_query *query)
                 q->end = v3d->prims_generated;
                 break;
         case PIPE_QUERY_PRIMITIVES_EMITTED:
+                /* If transform feedback has ended, then we have already
+                 * updated the primitive counts at glEndTransformFeedback()
+                 * time. Otherwise, we have to do it now.
+                 */
+                if (v3d->streamout.num_targets > 0)
+                        v3d_tf_update_counters(v3d);
                 q->end = v3d->tf_prims_generated;
                 break;
         case PIPE_QUERY_OCCLUSION_COUNTER:
index c2b3ecd8a131a543b7d25a0ec5caf2d778a652f3..fec9d54c25b58f1d79f94b89f76e8c569020dfa9 100644 (file)
@@ -545,29 +545,20 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
 }
 
 /**
- * Computes the various transform feedback statistics, since they can't be
- * recorded by CL packets.
+ * Updates the number of primitives generated from the number of vertices
+ * to draw. We do this here instead of using PRIMITIVE_COUNTS_FEEDBACK because
+ * using the GPU packet for this might require sync waits and this is trivial
+ * to handle on the CPU instead.
  */
 static void
-v3d_tf_statistics_record(struct v3d_context *v3d,
-                         const struct pipe_draw_info *info)
+v3d_update_primitives_generated_counter(struct v3d_context *v3d,
+                                        const struct pipe_draw_info *info)
 {
         if (!v3d->active_queries)
                 return;
 
         uint32_t prims = u_prims_for_vertices(info->mode, info->count);
         v3d->prims_generated += prims;
-
-        if (v3d->streamout.num_targets <= 0)
-                return;
-
-        /* XXX: Only count if we didn't overflow. */
-        v3d->tf_prims_generated += prims;
-        for (int i = 0; i < v3d->streamout.num_targets; i++) {
-                struct v3d_stream_output_target *target =
-                        v3d_stream_output_target(v3d->streamout.targets[i]);
-                target->recorded_vertex_count += info->count;
-        }
 }
 
 static void
@@ -665,6 +656,17 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
         v3d_predraw_check_outputs(pctx);
 
+        /* If transform feedback is active and we are switching primitive type
+         * we need to submit the job before drawing and update the vertex count
+         * written to TF based on the primitive type since we will need to
+         * know the exact vertex count if the application decides to call
+         * glDrawTransformFeedback() later.
+         */
+        if (v3d->streamout.num_targets > 0 &&
+            u_base_prim_type(info->mode) != u_base_prim_type(v3d->prim_mode)) {
+                v3d_tf_update_counters(v3d);
+        }
+
         struct v3d_job *job = v3d_get_job_for_fbo(v3d);
 
         /* If vertex texturing depends on the output of rendering, we need to
@@ -762,7 +764,7 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 prim_tf_enable = (V3D_PRIM_POINTS_TF - V3D_PRIM_POINTS);
 #endif
 
-        v3d_tf_statistics_record(v3d, info);
+        v3d_update_primitives_generated_counter(v3d, info);
 
         /* Note that the primitive type fields match with OpenGL/gallium
          * definitions, up to but not including QUADS.
index 1dbd20b225162416acf1be78b1b5a050d3abd8b7..84228a4876010fbf89b0c8ff6b98b3c2a30a41c9 100644 (file)
@@ -38,6 +38,20 @@ void v3dX(bcl_epilogue)(struct v3d_context *v3d, struct v3d_job *job)
 #endif
                                                 cl_packet_length(FLUSH));
 
+                if (job->tf_enabled) {
+                        /* Write primitive counts to memory. */
+                        assert(v3d->prim_counts);
+                        struct v3d_resource *rsc =
+                                v3d_resource(v3d->prim_counts);
+                        cl_emit(&job->bcl, PRIMITIVE_COUNTS_FEEDBACK, counter) {
+                                counter.address =
+                                        cl_address(rsc->bo,
+                                                   v3d->prim_counts_offset);
+                                counter.read_write_64byte = false;
+                                counter.op = 0;
+                        }
+                }
+
                 /* Disable TF at the end of the CL, so that the TF block
                  * cleans up and finishes before it gets reset by the next
                  * frame's tile binning mode cfg packet. (SWVC5-718).
index b6a57249044a26862b4791fa6f3d7509e7b212cb..c709b476f99933df78440ea62fba6cce01893de7 100644 (file)
@@ -1222,6 +1222,14 @@ v3d_set_stream_output_targets(struct pipe_context *pctx,
 
         assert(num_targets <= ARRAY_SIZE(so->targets));
 
+        /* Update recorded vertex counts when we are ending the recording of
+         * transform feedback. We do this when we switch primitive types
+         * at draw time, but if we haven't switched primitives in our last
+         * draw we need to do it here as well.
+         */
+        if (num_targets == 0 && so->num_targets > 0)
+                v3d_tf_update_counters(ctx);
+
         for (i = 0; i < num_targets; i++) {
                 if (offsets[i] != -1)
                         so->offsets[i] = offsets[i];
@@ -1234,6 +1242,15 @@ v3d_set_stream_output_targets(struct pipe_context *pctx,
 
         so->num_targets = num_targets;
 
+        /* Create primitive counters BO if needed */
+        if (num_targets > 0 && !ctx->prim_counts) {
+                uint32_t zeroes[7] = { 0 }; /* Init all 7 counters to 0 */
+                u_upload_data(ctx->uploader,
+                              0, sizeof(zeroes), 32, zeroes,
+                              &ctx->prim_counts_offset,
+                              &ctx->prim_counts);
+        }
+
         ctx->dirty |= VC5_DIRTY_STREAMOUT;
 }