radeonsi: fix user fence space when MCBP is enabled
[mesa.git] / src / gallium / drivers / v3d / v3d_job.c
index 20f8bcc3f322510c6cf71ffd05c577b9480e6ed8..58410484d3f93a44350564847f85fae814dd3ec7 100644 (file)
 #include "util/set.h"
 #include "broadcom/clif/clif_dump.h"
 
-static void
-remove_from_ht(struct hash_table *ht, void *key)
-{
-        struct hash_entry *entry = _mesa_hash_table_search(ht, key);
-        _mesa_hash_table_remove(ht, entry);
-}
-
-static void
+void
 v3d_job_free(struct v3d_context *v3d, struct v3d_job *job)
 {
-        struct set_entry *entry;
-
         set_foreach(job->bos, entry) {
                 struct v3d_bo *bo = (struct v3d_bo *)entry->key;
                 v3d_bo_unreference(&bo);
         }
 
-        remove_from_ht(v3d->jobs, &job->key);
+        _mesa_hash_table_remove_key(v3d->jobs, &job->key);
 
         if (job->write_prscs) {
-                struct set_entry *entry;
-
                 set_foreach(job->write_prscs, entry) {
                         const struct pipe_resource *prsc = entry->key;
 
-                        remove_from_ht(v3d->write_jobs, (void *)prsc);
+                        _mesa_hash_table_remove_key(v3d->write_jobs, prsc);
                 }
         }
 
-        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+        for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
                 if (job->cbufs[i]) {
-                        remove_from_ht(v3d->write_jobs, job->cbufs[i]->texture);
+                        _mesa_hash_table_remove_key(v3d->write_jobs,
+                                                    job->cbufs[i]->texture);
                         pipe_surface_reference(&job->cbufs[i], NULL);
                 }
         }
         if (job->zsbuf) {
                 struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture);
                 if (rsc->separate_stencil)
-                        remove_from_ht(v3d->write_jobs,
-                                       &rsc->separate_stencil->base);
+                        _mesa_hash_table_remove_key(v3d->write_jobs,
+                                                    &rsc->separate_stencil->base);
 
-                remove_from_ht(v3d->write_jobs, job->zsbuf->texture);
+                _mesa_hash_table_remove_key(v3d->write_jobs,
+                                            job->zsbuf->texture);
                 pipe_surface_reference(&job->zsbuf, NULL);
         }
 
@@ -94,7 +85,7 @@ v3d_job_free(struct v3d_context *v3d, struct v3d_job *job)
         ralloc_free(job);
 }
 
-static struct v3d_job *
+struct v3d_job *
 v3d_job_create(struct v3d_context *v3d)
 {
         struct v3d_job *job = rzalloc(v3d, struct v3d_job);
@@ -156,36 +147,130 @@ v3d_job_add_write_resource(struct v3d_job *job, struct pipe_resource *prsc)
 }
 
 void
-v3d_flush_jobs_writing_resource(struct v3d_context *v3d,
+v3d_flush_jobs_using_bo(struct v3d_context *v3d, struct v3d_bo *bo)
+{
+        hash_table_foreach(v3d->jobs, entry) {
+                struct v3d_job *job = entry->data;
+
+                if (_mesa_set_search(job->bos, bo))
+                        v3d_job_submit(v3d, job);
+        }
+}
+
+void
+v3d_job_add_tf_write_resource(struct v3d_job *job, struct pipe_resource *prsc)
+{
+        v3d_job_add_write_resource(job, prsc);
+
+        if (!job->tf_write_prscs)
+                job->tf_write_prscs = _mesa_pointer_set_create(job);
+
+        _mesa_set_add(job->tf_write_prscs, prsc);
+}
+
+static bool
+v3d_job_writes_resource_from_tf(struct v3d_job *job,
                                 struct pipe_resource *prsc)
+{
+        if (!job->tf_enabled)
+                return false;
+
+        if (!job->tf_write_prscs)
+                return false;
+
+        return _mesa_set_search(job->tf_write_prscs, prsc) != NULL;
+}
+
+void
+v3d_flush_jobs_writing_resource(struct v3d_context *v3d,
+                                struct pipe_resource *prsc,
+                                enum v3d_flush_cond flush_cond,
+                                bool is_compute_pipeline)
 {
         struct hash_entry *entry = _mesa_hash_table_search(v3d->write_jobs,
                                                            prsc);
-        if (entry) {
-                struct v3d_job *job = entry->data;
-                v3d_job_submit(v3d, job);
+        struct v3d_resource *rsc = v3d_resource(prsc);
+
+        /* We need to sync if the graphics pipeline reads a resource written
+         * by the compute pipeline. The same would be needed for a
+         * graphics-compute dependency, but nowadays all compute jobs
+         * are serialized with the previously submitted job.
+         */
+        if (!is_compute_pipeline && rsc->bo != NULL && rsc->compute_written) {
+           v3d->sync_on_last_compute_job = true;
+           rsc->compute_written = false;
+        }
+
+        if (!entry)
+                return;
+
+        struct v3d_job *job = entry->data;
+
+        bool needs_flush;
+        switch (flush_cond) {
+        case V3D_FLUSH_ALWAYS:
+                needs_flush = true;
+                break;
+        case V3D_FLUSH_NOT_CURRENT_JOB:
+                needs_flush = !v3d->job || v3d->job != job;
+                break;
+        case V3D_FLUSH_DEFAULT:
+        default:
+                /* For writes from TF in the same job we use the "Wait for TF"
+                 * feature provided by the hardware so we don't want to flush.
+                 * The exception to this is when the caller is about to map the
+                 * resource since in that case we don't have a 'Wait for TF'
+                 * command in the command stream. In this scenario the caller
+                 * is expected to pass V3D_FLUSH_ALWAYS as 'flush_cond'.
+                 */
+                needs_flush = !v3d_job_writes_resource_from_tf(job, prsc);
         }
+
+        if (needs_flush)
+                v3d_job_submit(v3d, job);
 }
 
 void
 v3d_flush_jobs_reading_resource(struct v3d_context *v3d,
-                                struct pipe_resource *prsc)
+                                struct pipe_resource *prsc,
+                                enum v3d_flush_cond flush_cond,
+                                bool is_compute_pipeline)
 {
         struct v3d_resource *rsc = v3d_resource(prsc);
 
-        v3d_flush_jobs_writing_resource(v3d, prsc);
+        /* We only need to force the flush on TF writes, which is the only
+         * case where we might skip the flush to use the 'Wait for TF'
+         * command. Here we are flushing for a read, which means that the
+         * caller intends to write to the resource, so we don't care if
+         * there was a previous TF write to it.
+         */
+        v3d_flush_jobs_writing_resource(v3d, prsc, flush_cond,
+                                        is_compute_pipeline);
 
-        struct hash_entry *entry;
         hash_table_foreach(v3d->jobs, entry) {
                 struct v3d_job *job = entry->data;
 
-                if (_mesa_set_search(job->bos, rsc->bo)) {
-                        v3d_job_submit(v3d, job);
-                        /* Reminder: v3d->jobs is safe to keep iterating even
-                         * after deletion of an entry.
-                         */
+                if (!_mesa_set_search(job->bos, rsc->bo))
                         continue;
+
+                bool needs_flush;
+                switch (flush_cond) {
+                case V3D_FLUSH_NOT_CURRENT_JOB:
+                        needs_flush = !v3d->job || v3d->job != job;
+                        break;
+                case V3D_FLUSH_ALWAYS:
+                case V3D_FLUSH_DEFAULT:
+                default:
+                        needs_flush = true;
                 }
+
+                if (needs_flush)
+                        v3d_job_submit(v3d, job);
+
+                /* Reminder: v3d->jobs is safe to keep iterating even
+                 * after deletion of an entry.
+                 */
+                continue;
         }
 }
 
@@ -209,7 +294,7 @@ v3d_job_set_tile_buffer_size(struct v3d_job *job)
                 tile_size_index++;
 
         int max_bpp = RENDER_TARGET_MAXIMUM_32BPP;
-        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+        for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
                 if (job->cbufs[i]) {
                         struct v3d_surface *surf = v3d_surface(job->cbufs[i]);
                         max_bpp = MAX2(max_bpp, surf->internal_bpp);
@@ -227,7 +312,7 @@ v3d_job_set_tile_buffer_size(struct v3d_job *job)
 /**
  * Returns a v3d_job struture for tracking V3D rendering to a particular FBO.
  *
- * If we've already started rendering to this FBO, then return old same job,
+ * If we've already started rendering to this FBO, then return the same job,
  * otherwise make a new one.  If we're beginning rendering to an FBO, make
  * sure that any previous reads of the FBO (or writes to its color/Z surfaces)
  * have been flushed.
@@ -256,9 +341,11 @@ v3d_get_job(struct v3d_context *v3d,
          */
         struct v3d_job *job = v3d_job_create(v3d);
 
-        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+        for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
                 if (cbufs[i]) {
-                        v3d_flush_jobs_reading_resource(v3d, cbufs[i]->texture);
+                        v3d_flush_jobs_reading_resource(v3d, cbufs[i]->texture,
+                                                        V3D_FLUSH_DEFAULT,
+                                                        false);
                         pipe_surface_reference(&job->cbufs[i], cbufs[i]);
 
                         if (cbufs[i]->texture->nr_samples > 1)
@@ -266,15 +353,15 @@ v3d_get_job(struct v3d_context *v3d,
                 }
         }
         if (zsbuf) {
-                v3d_flush_jobs_reading_resource(v3d, zsbuf->texture);
+                v3d_flush_jobs_reading_resource(v3d, zsbuf->texture,
+                                                V3D_FLUSH_DEFAULT,
+                                                false);
                 pipe_surface_reference(&job->zsbuf, zsbuf);
                 if (zsbuf->texture->nr_samples > 1)
                         job->msaa = true;
         }
 
-        v3d_job_set_tile_buffer_size(job);
-
-        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+        for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
                 if (cbufs[i])
                         _mesa_hash_table_insert(v3d->write_jobs,
                                                 cbufs[i]->texture, job);
@@ -285,7 +372,9 @@ v3d_get_job(struct v3d_context *v3d,
                 struct v3d_resource *rsc = v3d_resource(zsbuf->texture);
                 if (rsc->separate_stencil) {
                         v3d_flush_jobs_reading_resource(v3d,
-                                                        &rsc->separate_stencil->base);
+                                                        &rsc->separate_stencil->base,
+                                                        V3D_FLUSH_DEFAULT,
+                                                        false);
                         _mesa_hash_table_insert(v3d->write_jobs,
                                                 &rsc->separate_stencil->base,
                                                 job);
@@ -308,6 +397,11 @@ v3d_get_job_for_fbo(struct v3d_context *v3d)
         struct pipe_surface *zsbuf = v3d->framebuffer.zsbuf;
         struct v3d_job *job = v3d_get_job(v3d, cbufs, zsbuf);
 
+        if (v3d->framebuffer.samples >= 1)
+                job->msaa = true;
+
+        v3d_job_set_tile_buffer_size(job);
+
         /* The dirty flags are tracking what's been updated while v3d->job has
          * been bound, so set them all to ~0 when switching between jobs.  We
          * also need to reset all state at the start of rendering.
@@ -328,7 +422,13 @@ v3d_get_job_for_fbo(struct v3d_context *v3d)
         if (zsbuf) {
                 struct v3d_resource *rsc = v3d_resource(zsbuf->texture);
                 if (!rsc->writes)
-                        job->clear |= PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL;
+                        job->clear |= PIPE_CLEAR_DEPTH;
+
+                if (rsc->separate_stencil)
+                        rsc = rsc->separate_stencil;
+
+                if (!rsc->writes)
+                        job->clear |= PIPE_CLEAR_STENCIL;
         }
 
         job->draw_tiles_x = DIV_ROUND_UP(v3d->framebuffer.width,
@@ -344,13 +444,13 @@ v3d_get_job_for_fbo(struct v3d_context *v3d)
 static void
 v3d_clif_dump(struct v3d_context *v3d, struct v3d_job *job)
 {
-        if (!(V3D_DEBUG & V3D_DEBUG_CL))
+        if (!(V3D_DEBUG & (V3D_DEBUG_CL | V3D_DEBUG_CLIF)))
                 return;
 
         struct clif_dump *clif = clif_dump_init(&v3d->screen->devinfo,
-                                                stderr);
+                                                stderr,
+                                                V3D_DEBUG & V3D_DEBUG_CL);
 
-        struct set_entry *entry;
         set_foreach(job->bos, entry) {
                 struct v3d_bo *bo = (void *)entry->key;
                 char *name = ralloc_asprintf(NULL, "%s_0x%x",
@@ -362,30 +462,41 @@ v3d_clif_dump(struct v3d_context *v3d, struct v3d_job *job)
                 ralloc_free(name);
         }
 
-        fprintf(stderr, "BCL: 0x%08x..0x%08x\n",
-                job->submit.bcl_start, job->submit.bcl_end);
-
-        clif_dump_add_cl(clif, job->submit.bcl_start, job->submit.bcl_end);
-
-        fprintf(stderr, "RCL: 0x%08x..0x%08x\n",
-                job->submit.rcl_start, job->submit.rcl_end);
-        clif_dump_add_cl(clif, job->submit.rcl_start, job->submit.rcl_end);
+        clif_dump(clif, &job->submit);
 
         clif_dump_destroy(clif);
 }
 
+static void
+v3d_read_and_accumulate_primitive_counters(struct v3d_context *v3d)
+{
+        assert(v3d->prim_counts);
+
+        perf_debug("stalling on TF counts readback\n");
+        struct v3d_resource *rsc = v3d_resource(v3d->prim_counts);
+        if (v3d_bo_wait(rsc->bo, PIPE_TIMEOUT_INFINITE, "prim-counts")) {
+                uint32_t *map = v3d_bo_map(rsc->bo) + v3d->prim_counts_offset;
+                v3d->tf_prims_generated += map[V3D_PRIM_COUNTS_TF_WRITTEN];
+                /* When we only have a vertex shader we determine the primitive
+                 * count in the CPU so don't update it here again.
+                 */
+                if (v3d->prog.gs)
+                        v3d->prims_generated += map[V3D_PRIM_COUNTS_WRITTEN];
+        }
+}
+
 /**
  * Submits the job to the kernel and then reinitializes it.
  */
 void
 v3d_job_submit(struct v3d_context *v3d, struct v3d_job *job)
 {
-        MAYBE_UNUSED struct v3d_screen *screen = v3d->screen;
+        struct v3d_screen *screen = v3d->screen;
 
         if (!job->needs_flush)
                 goto done;
 
-        if (v3d->screen->devinfo.ver >= 41)
+        if (screen->devinfo.ver >= 41)
                 v3d41_emit_rcl(job);
         else
                 v3d33_emit_rcl(job);
@@ -397,10 +508,22 @@ v3d_job_submit(struct v3d_context *v3d, struct v3d_job *job)
                         v3d33_bcl_epilogue(v3d, job);
         }
 
+        /* While the RCL will implicitly depend on the last RCL to have
+         * finished, we also need to block on any previous TFU job we may have
+         * dispatched.
+         */
+        job->submit.in_sync_rcl = v3d->out_sync;
+
+        /* Update the sync object for the last rendering by our context. */
         job->submit.out_sync = v3d->out_sync;
+
         job->submit.bcl_end = job->bcl.bo->offset + cl_offset(&job->bcl);
         job->submit.rcl_end = job->rcl.bo->offset + cl_offset(&job->rcl);
 
+        job->submit.flags = 0;
+        if (job->tmu_dirty_rcl && screen->has_cache_flush)
+                job->submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
+
         /* On V3D 4.1, the tile alloc/state setup moved to register writes
          * instead of binner packets.
          */
@@ -418,17 +541,29 @@ v3d_job_submit(struct v3d_context *v3d, struct v3d_job *job)
         if (!(V3D_DEBUG & V3D_DEBUG_NORAST)) {
                 int ret;
 
-#ifndef USE_V3D_SIMULATOR
-                ret = drmIoctl(v3d->fd, DRM_IOCTL_V3D_SUBMIT_CL, &job->submit);
-#else
-                ret = v3d_simulator_flush(v3d, &job->submit, job);
-#endif
+                ret = v3d_ioctl(v3d->fd, DRM_IOCTL_V3D_SUBMIT_CL, &job->submit);
                 static bool warned = false;
                 if (ret && !warned) {
                         fprintf(stderr, "Draw call returned %s.  "
                                         "Expect corruption.\n", strerror(errno));
                         warned = true;
                 }
+
+                /* If we are submitting a job in the middle of transform
+                 * feedback we need to read the primitive counts and accumulate
+                 * them, otherwise they will be reset at the start of the next
+                 * draw when we emit the Tile Binning Mode Configuration packet.
+                 *
+                 * If the job doesn't have any TF draw calls, then we know
+                 * the primitive count must be zero and we can skip stalling
+                 * for this. This also fixes a problem because it seems that
+                 * in this scenario the counters are not reset with the Tile
+                 * Binning Mode Configuration packet, which would translate
+                 * to us reading an obsolete (possibly non-zero) value from
+                 * the GPU counters.
+                 */
+                if (v3d->streamout.num_targets && job->tf_draw_calls_queued > 0)
+                        v3d_read_and_accumulate_primitive_counters(v3d);
         }
 
 done: