panfrost: Rewrite scoreboarding routines
author     Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
           Sun, 16 Feb 2020 19:59:11 +0000 (14:59 -0500)
committer  Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
           Tue, 18 Feb 2020 13:45:21 +0000 (08:45 -0500)
Rather than manipulating job descriptor headers as fat pointers (slow),
using fancy manipulation functions to build the tree programmatically in
arbitrary order (slow and complicated), and then doing a topological sort
at runtime every frame (slow) that requires traversing said headers in
GPU memory (slow!)... we finally know enough about the hardware to just
get things right the first time (or the second, for next_job linking).
So rip out all that code and replace it with a much better routine that
creates, uploads, and queues a job all in one, since those are now
essentially the same operation (which is much better for memory access
patterns, by the way), and almost everything falls into place gracefully
according to the rules we've set out. Even wallpapering isn't *so*
terrible if you just... move that one little... giant... hack out of
sight... ahem....

panfrost_scoreboard_link_batch is no longer a bottleneck, mostly because
it no longer exists :-)
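
As a sketch of the shape change (both halves lifted from the
pan_compute.c hunk below; the full panfrost_new_job signature is in the
pan_job.h hunk):

    /* Before: build the header on the CPU, upload it fused with the
     * payload, then queue the fat pointer for sorting at submit time */
    struct mali_job_descriptor_header job = {
            .job_type = JOB_TYPE_COMPUTE,
            .job_descriptor_size = 1,
            .job_barrier = 1
    };

    struct panfrost_transfer transfer =
            panfrost_allocate_transient(batch, sizeof(job) + sizeof(*payload));
    memcpy(transfer.cpu, &job, sizeof(job));
    memcpy(transfer.cpu + sizeof(job), payload, sizeof(*payload));
    panfrost_scoreboard_queue_compute_job(batch, transfer);

    /* After: one call creates, uploads, and queues, and returns the job
     * index for dependency management */
    panfrost_new_job(batch, JOB_TYPE_COMPUTE, true, 0,
                     payload, sizeof(*payload), false);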

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3836>

src/gallium/drivers/panfrost/pan_compute.c
src/gallium/drivers/panfrost/pan_context.c
src/gallium/drivers/panfrost/pan_context.h
src/gallium/drivers/panfrost/pan_fragment.c
src/gallium/drivers/panfrost/pan_job.c
src/gallium/drivers/panfrost/pan_job.h
src/gallium/drivers/panfrost/pan_scoreboard.c

src/gallium/drivers/panfrost/pan_compute.c
index 1901f58dda7f73654b268f818d5ab406805ef452..9747c5cdab7bae9fb764280b6b5b43c4249403b5 100644
@@ -104,12 +104,6 @@ panfrost_launch_grid(struct pipe_context *pipe,
 
         ctx->compute_grid = info;
 
-        struct mali_job_descriptor_header job = {
-                .job_type = JOB_TYPE_COMPUTE,
-                .job_descriptor_size = 1,
-                .job_barrier = 1
-        };
-
         /* TODO: Stub */
         struct midgard_payload_vertex_tiler *payload = &ctx->payloads[PIPE_SHADER_COMPUTE];
         struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
@@ -152,15 +146,7 @@ panfrost_launch_grid(struct pipe_context *pipe,
                         info->grid[0], info->grid[1], info->grid[2],
                         info->block[0], info->block[1], info->block[2], false);
 
-        /* Upload the payload */
-
-        struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sizeof(job) + sizeof(*payload));
-        memcpy(transfer.cpu, &job, sizeof(job));
-        memcpy(transfer.cpu + sizeof(job), payload, sizeof(*payload));
-
-        /* Queue the job */
-        panfrost_scoreboard_queue_compute_job(batch, transfer);
-
+        panfrost_new_job(batch, JOB_TYPE_COMPUTE, true, 0, payload, sizeof(*payload), false);
         panfrost_flush_all_batches(ctx, true);
 }
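
For reference, the new call above with its parameters annotated (a
readability gloss; meanings follow the panfrost_new_job declaration in
the pan_job.h hunk below):

        panfrost_new_job(batch,
                         JOB_TYPE_COMPUTE,           /* enum mali_job_type type */
                         true,                       /* barrier: sets job_barrier */
                         0,                          /* local_dep: no dependency */
                         payload, sizeof(*payload),  /* uploaded after the header */
                         false);                     /* inject: append, don't prepend */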
 
src/gallium/drivers/panfrost/pan_context.c
index 9bb2cf7923ab12332aabe026307873b28bb27130..6e03839e8ea24e582d0ba7021aca339d56274858 100644
@@ -359,29 +359,6 @@ panfrost_default_shader_backend(struct panfrost_context *ctx)
         memcpy(&ctx->fragment_shader_core, &shader, sizeof(shader));
 }
 
-/* Generates a vertex/tiler job. This is, in some sense, the heart of the
- * graphics command stream. It should be called once per draw, accordding to
- * presentations. Set is_tiler for "tiler" jobs (fragment shader jobs, but in
- * Mali parlance, "fragment" refers to framebuffer writeout). Clear it for
- * vertex jobs. */
-
-struct panfrost_transfer
-panfrost_vertex_tiler_job(struct panfrost_context *ctx, bool is_tiler)
-{
-        struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
-        struct mali_job_descriptor_header job = {
-                .job_type = is_tiler ? JOB_TYPE_TILER : JOB_TYPE_VERTEX,
-                .job_descriptor_size = 1,
-        };
-
-        struct midgard_payload_vertex_tiler *payload = is_tiler ? &ctx->payloads[PIPE_SHADER_FRAGMENT] : &ctx->payloads[PIPE_SHADER_VERTEX];
-
-        struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sizeof(job) + sizeof(*payload));
-        memcpy(transfer.cpu, &job, sizeof(job));
-        memcpy(transfer.cpu + sizeof(job), payload, sizeof(*payload));
-        return transfer;
-}
-
 mali_ptr
 panfrost_vertex_buffer_address(struct panfrost_context *ctx, unsigned i)
 {
@@ -1272,20 +1249,23 @@ panfrost_queue_draw(struct panfrost_context *ctx)
         bool rasterizer_discard = ctx->rasterizer
                                   && ctx->rasterizer->base.rasterizer_discard;
 
-        struct panfrost_transfer vertex = panfrost_vertex_tiler_job(ctx, false);
-        struct panfrost_transfer tiler;
 
-        if (!rasterizer_discard)
-                tiler = panfrost_vertex_tiler_job(ctx, true);
+        struct midgard_payload_vertex_tiler *vertex_payload = &ctx->payloads[PIPE_SHADER_VERTEX];
+        struct midgard_payload_vertex_tiler *tiler_payload = &ctx->payloads[PIPE_SHADER_FRAGMENT];
 
         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
+        bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
 
-        if (rasterizer_discard)
-                panfrost_scoreboard_queue_vertex_job(batch, vertex, FALSE);
-        else if (ctx->wallpaper_batch && batch->first_tiler.gpu)
-                panfrost_scoreboard_queue_fused_job_prepend(batch, vertex, tiler);
-        else
-                panfrost_scoreboard_queue_fused_job(batch, vertex, tiler);
+        if (wallpapering) {
+                /* Inject in reverse order, with "predicted" job indices. THIS IS A HACK XXX */
+                panfrost_new_job(batch, JOB_TYPE_TILER, false, batch->job_index + 2, tiler_payload, sizeof(*tiler_payload), true);
+                panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0, vertex_payload, sizeof(*vertex_payload), true);
+        } else {
+                unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0, vertex_payload, sizeof(*vertex_payload), false);
+
+                if (!rasterizer_discard)
+                        panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tiler_payload, sizeof(*tiler_payload), false);
+        }
 
         for (unsigned i = 0; i < PIPE_SHADER_TYPES; ++i) {
                 struct panfrost_shader_variants *all = ctx->shader[i];
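
The "predicted" indices above work out as follows: panfrost_new_job (see
the pan_scoreboard.c hunk below) assigns index = ++batch->job_index, so
with job_index == N when wallpapering begins, a hedged trace:

        /* Hypothetical trace, batch->job_index == N at entry:
         *
         *   panfrost_new_job(.., JOB_TYPE_TILER,  .., local_dep = N + 2, .., inject)
         *        -> the tiler job receives index N + 1
         *   panfrost_new_job(.., JOB_TYPE_VERTEX, .., local_dep = 0, .., inject)
         *        -> the vertex job receives index N + 2
         *
         * So the tiler's local_dep (N + 2) is exactly the index the vertex
         * job is about to receive. Each inject prepends via
         * job.next_job = batch->first_job, so the chain order ends up
         * VERTEX -> TILER -> (previous chain head), dependencies intact. */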
src/gallium/drivers/panfrost/pan_context.h
index b2736d46d2471d16e1b9deea6b0284739f841290..f33f6ec846ccb4775a803b4abef3c8d9a195086a 100644
@@ -317,8 +317,7 @@ struct midgard_tiler_descriptor
 panfrost_emit_midg_tiler(struct panfrost_batch *batch, unsigned vertex_count);
 
 mali_ptr
-panfrost_fragment_job(struct panfrost_batch *batch, bool has_draws,
-                      struct mali_job_descriptor_header **header_cpu);
+panfrost_fragment_job(struct panfrost_batch *batch, bool has_draws);
 
 void
 panfrost_shader_compile(
src/gallium/drivers/panfrost/pan_fragment.c
index 88b2db3c94900deeae95c1714006d218de500853..e2d71c57a10b2271b206785a97a1271f5bae205c 100644
@@ -49,8 +49,7 @@ panfrost_initialize_surface(
  * presentations, this is supposed to correspond to eglSwapBuffers) */
 
 mali_ptr
-panfrost_fragment_job(struct panfrost_batch *batch, bool has_draws,
-                      struct mali_job_descriptor_header **header_cpu)
+panfrost_fragment_job(struct panfrost_batch *batch, bool has_draws)
 {
         struct panfrost_screen *screen = pan_screen(batch->ctx->base.screen);
 
@@ -105,6 +104,5 @@ panfrost_fragment_job(struct panfrost_batch *batch, bool has_draws,
         struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sizeof(header) + sizeof(payload));
         memcpy(transfer.cpu, &header, sizeof(header));
         memcpy(transfer.cpu + sizeof(header), &payload, sizeof(payload));
-        *header_cpu = (struct mali_job_descriptor_header *)transfer.cpu;
         return transfer.gpu;
 }
src/gallium/drivers/panfrost/pan_job.c
index 35d5e0be67e6caa5b71ba69a5bb54483fe21a8b4..62ce2e36bc9b8e4cc48117cc967480a39b586562 100644
@@ -114,9 +114,6 @@ panfrost_create_batch(struct panfrost_context *ctx,
         batch->maxx = batch->maxy = 0;
         batch->transient_offset = 0;
 
-        util_dynarray_init(&batch->headers, batch);
-        util_dynarray_init(&batch->gpu_headers, batch);
-        util_dynarray_init(&batch->dependencies, batch);
         batch->out_sync = panfrost_create_batch_fence(batch);
         util_copy_framebuffer_state(&batch->key, key);
 
@@ -181,9 +178,6 @@ panfrost_free_batch(struct panfrost_batch *batch)
                 panfrost_batch_fence_unreference(*dep);
         }
 
-        util_dynarray_fini(&batch->headers);
-        util_dynarray_fini(&batch->gpu_headers);
-
         /* The out_sync fence lifetime is different from the batch one
          * since other batches might want to wait on a fence of already
          * submitted/signaled batch. All we need to do here is make sure the
@@ -308,7 +302,7 @@ panfrost_get_fresh_batch_for_fbo(struct panfrost_context *ctx)
          * Note that it's perfectly fine to re-use a batch with an
          * existing clear, we'll just update it with the new clear request.
          */
-        if (!batch->last_job.gpu)
+        if (!batch->first_job)
                 return batch;
 
         /* Otherwise, we need to freeze the existing one and instantiate a new
@@ -744,7 +738,7 @@ panfrost_batch_draw_wallpaper(struct panfrost_batch *batch)
         /* No draw calls, and no clear on the depth/stencil bufs.
          * Drawing the wallpaper would be useless.
          */
-        if (!batch->last_tiler.gpu &&
+        if (!batch->tiler_dep &&
             !(batch->clear & PIPE_CLEAR_DEPTHSTENCIL))
                 return;
 
@@ -846,8 +840,7 @@ panfrost_batch_draw_wallpaper(struct panfrost_batch *batch)
 static int
 panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
                             mali_ptr first_job_desc,
-                            uint32_t reqs,
-                            struct mali_job_descriptor_header *header)
+                            uint32_t reqs)
 {
         struct panfrost_context *ctx = batch->ctx;
         struct pipe_context *gallium = (struct pipe_context *) ctx;
@@ -857,7 +850,7 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
         bool is_fragment_shader;
         int ret;
 
-        is_fragment_shader = (reqs & PANFROST_JD_REQ_FS) && batch->first_job.gpu;
+        is_fragment_shader = (reqs & PANFROST_JD_REQ_FS) && batch->first_job;
         if (is_fragment_shader)
                 submit.in_sync_count = 1;
         else
@@ -934,20 +927,17 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
 static int
 panfrost_batch_submit_jobs(struct panfrost_batch *batch)
 {
-        bool has_draws = batch->first_job.gpu;
-        struct mali_job_descriptor_header *header;
+        bool has_draws = batch->first_job;
         int ret = 0;
 
         if (has_draws) {
-                header = (struct mali_job_descriptor_header *)batch->first_job.cpu;
-                ret = panfrost_batch_submit_ioctl(batch, batch->first_job.gpu, 0, header);
+                ret = panfrost_batch_submit_ioctl(batch, batch->first_job, 0);
                 assert(!ret);
         }
 
-        if (batch->first_tiler.gpu || batch->clear) {
-                mali_ptr fragjob = panfrost_fragment_job(batch, has_draws, &header);
-
-                ret = panfrost_batch_submit_ioctl(batch, fragjob, PANFROST_JD_REQ_FS, header);
+        if (batch->tiler_dep || batch->clear) {
+                mali_ptr fragjob = panfrost_fragment_job(batch, has_draws);
+                ret = panfrost_batch_submit_ioctl(batch, fragjob, PANFROST_JD_REQ_FS);
                 assert(!ret);
         }
 
@@ -969,7 +959,7 @@ panfrost_batch_submit(struct panfrost_batch *batch)
         int ret;
 
         /* Nothing to do! */
-        if (!batch->last_job.gpu && !batch->clear) {
+        if (!batch->first_job && !batch->clear) {
                 /* Mark the fence as signaled so the fence logic does not try
                  * to wait on it.
                  */
@@ -982,7 +972,7 @@ panfrost_batch_submit(struct panfrost_batch *batch)
         /* Now that all draws are in, we can finally prepare the
          * FBD for the batch */
 
-        if (batch->framebuffer.gpu && batch->first_job.gpu) {
+        if (batch->framebuffer.gpu && batch->first_job) {
                 struct panfrost_context *ctx = batch->ctx;
                 struct pipe_context *gallium = (struct pipe_context *) ctx;
                 struct panfrost_screen *screen = pan_screen(gallium->screen);
@@ -993,7 +983,7 @@ panfrost_batch_submit(struct panfrost_batch *batch)
                         panfrost_attach_mfbd(batch, ~0);
         }
 
-        panfrost_scoreboard_link_batch(batch);
+        panfrost_scoreboard_initialize_tiler(batch);
 
         ret = panfrost_batch_submit_jobs(batch);
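
Distilled, the submit path after this patch looks like the following (a
simplified sketch of the panfrost_batch_submit / panfrost_batch_submit_jobs
hunks above, with error handling elided):

        /* Prepend the WRITE_VALUE job iff tiler work was queued */
        panfrost_scoreboard_initialize_tiler(batch);

        /* Primary chain (WRITE_VALUE -> VERTEX/TILER jobs), one ioctl */
        bool has_draws = batch->first_job;

        if (has_draws)
                panfrost_batch_submit_ioctl(batch, batch->first_job, 0);

        /* Secondary chain: the single FRAGMENT job, one more ioctl */
        if (batch->tiler_dep || batch->clear)
                panfrost_batch_submit_ioctl(batch,
                        panfrost_fragment_job(batch, has_draws),
                        PANFROST_JD_REQ_FS);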
 
src/gallium/drivers/panfrost/pan_job.h
index 55da645530296c82e53fc0f1cdd9c32c1826a2ed..59279925e36412ed351ab8f5915defad98ad22f9 100644
@@ -99,40 +99,22 @@ struct panfrost_batch {
         unsigned minx, miny;
         unsigned maxx, maxy;
 
-        /* CPU pointers to the job descriptor headers. next_job is only
-         * set at submit time (since only then are all the dependencies
-         * known). The upshot is that this is append-only.
-         *
-         * These arrays contain the headers for the "primary batch", our jargon
-         * referring to the part of the panfrost_job that actually contains
-         * meaningful work. In an OpenGL ES setting, that means the
-         * WRITE_VALUE/VERTEX/TILER jobs. Excluded is specifically the FRAGMENT
-         * job, which is sent on as a secondary batch containing only a single
-         * hardware job. Since there's one and only one FRAGMENT job issued per
-         * panfrost_job, there is no need to do any scoreboarding / management;
-         * it's easy enough to open-code it and it's not like we can get any
-         * better anyway. */
-        struct util_dynarray headers;
-
-        /* (And the GPU versions; TODO maybe combine) */
-        struct util_dynarray gpu_headers;
-
-        /* The last job in the primary batch */
-        struct panfrost_transfer last_job;
-
-        /* The first/last tiler job */
-        struct panfrost_transfer first_tiler;
-        struct panfrost_transfer last_tiler;
-
-        /* The first vertex job used as the input to a tiler job */
-        struct panfrost_transfer first_vertex_for_tiler;
-
-        /* The first job. Notice we've created a linked list */
-        struct panfrost_transfer first_job;
+        /* The first job in the batch */
+        mali_ptr first_job;
 
         /* The number of jobs in the primary batch, essentially */
         unsigned job_index;
 
+        /* A CPU-side pointer to the previous job for next_job linking */
+        struct mali_job_descriptor_header *prev_job;
+
+        /* The dependency for tiler jobs (i.e. the index of the last emitted
+         * tiler job, or zero if none have been emitted) */
+        unsigned tiler_dep;
+
+        /* The job index of the WRITE_VALUE job (before it has been created) */
+        unsigned write_value_index;
+
         /* BOs referenced -- will be used for flushing logic */
         struct hash_table *bos;
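
Given the replacement fields above, a hedged worked example of how the
scoreboard state evolves for a fresh batch with a single draw (indices
follow the panfrost_new_job logic in the pan_scoreboard.c hunk below):

        /* Fresh batch: job_index = 0, first_job = 0, prev_job = NULL,
         * tiler_dep = 0, write_value_index = 0.
         *
         * Vertex job: index = 1                 (job_index: 0 -> 1)
         * Tiler job:  write_value_index = 2     (reserved; job_index: 1 -> 2)
         *             index = 3, tiler_dep = 3  (job_index: 2 -> 3)
         *             dep_1 = 1 (vertex), dep_2 = 2 (write value)
         *
         * At submit, the WRITE_VALUE job is emitted with the reserved index
         * 2 and prepended, so the final chain reads
         * WRITE_VALUE(2) -> VERTEX(1) -> TILER(3). */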
 
@@ -241,35 +223,16 @@ panfrost_batch_intersection_scissor(struct panfrost_batch *batch,
 
 /* Scoreboarding */
 
-void
-panfrost_scoreboard_queue_compute_job(
-        struct panfrost_batch *batch,
-        struct panfrost_transfer job);
-
-void
-panfrost_scoreboard_queue_vertex_job(
-        struct panfrost_batch *batch,
-        struct panfrost_transfer vertex,
-        bool requires_tiling);
-
-void
-panfrost_scoreboard_queue_tiler_job(
-        struct panfrost_batch *batch,
-        struct panfrost_transfer tiler);
-
-void
-panfrost_scoreboard_queue_fused_job(
-        struct panfrost_batch *batch,
-        struct panfrost_transfer vertex,
-        struct panfrost_transfer tiler);
-void
-panfrost_scoreboard_queue_fused_job_prepend(
-        struct panfrost_batch *batch,
-        struct panfrost_transfer vertex,
-        struct panfrost_transfer tiler);
+unsigned
+panfrost_new_job(
+                struct panfrost_batch *batch,
+                enum mali_job_type type,
+                bool barrier,
+                unsigned local_dep,
+                void *payload, size_t payload_size,
+                bool inject);
 
-void
-panfrost_scoreboard_link_batch(struct panfrost_batch *batch);
+void panfrost_scoreboard_initialize_tiler(struct panfrost_batch *batch);
 
 bool
 panfrost_batch_is_scanout(struct panfrost_batch *batch);
src/gallium/drivers/panfrost/pan_scoreboard.c
index 927a6f61f6b3313f9d630defdbd832d7df53493b..0e27a0ae64ce7b854f5fd623ee2f6437561c8d4a 100644
  *
  */
 
-/* Coerce a panfrost_transfer to a header */
-
-static inline struct mali_job_descriptor_header *
-job_descriptor_header(struct panfrost_transfer t)
+/* Generates, uploads, and queues a new job. All fields are written in order
+ * except for next_job accounting (TODO: should we be clever and defer the
+ * upload of the header here until the next job, to keep the access pattern
+ * totally linear? Or is that just a micro-optimization at this point?).
+ * Returns the generated index for dep management.
+ *
+ * Inject is used to inject a job at the front, for wallpapering. If you are
+ * not wallpapering and set this, dragons will eat you. */
+
+unsigned
+panfrost_new_job(
+                struct panfrost_batch *batch,
+                enum mali_job_type type,
+                bool barrier,
+                unsigned local_dep,
+                void *payload, size_t payload_size,
+                bool inject)
 {
-        return (struct mali_job_descriptor_header *) t.cpu;
-}
+        unsigned global_dep = 0;
+
+        if (type == JOB_TYPE_TILER) {
+                /* Tiler jobs must be chained, and the first tiler job must
+                 * depend on the write value job, whose index we reserve now */
+
+                if (batch->tiler_dep)
+                        global_dep = batch->tiler_dep;
+                else {
+                        batch->write_value_index = ++batch->job_index;
+                        global_dep = batch->write_value_index;
+                }
+        }
 
-static void
-panfrost_assign_index(
-        struct panfrost_batch *batch,
-        struct panfrost_transfer transfer)
-{
         /* Assign the index */
         unsigned index = ++batch->job_index;
-        job_descriptor_header(transfer)->job_index = index;
-}
 
-/* Helper to add a dependency to a job */
+        struct mali_job_descriptor_header job = {
+                .job_descriptor_size = 1,
+                .job_type = type,
+                .job_barrier = barrier,
+                .job_index = index,
+                .job_dependency_index_1 = local_dep,
+                .job_dependency_index_2 = global_dep,
+        };
 
-static void
-panfrost_add_dependency(
-        struct panfrost_transfer depender,
-        struct panfrost_transfer dependent)
-{
+        if (inject)
+                job.next_job = batch->first_job;
 
-        struct mali_job_descriptor_header *first =
-                job_descriptor_header(dependent);
+        struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sizeof(job) + payload_size);
+        memcpy(transfer.cpu, &job, sizeof(job));
+        memcpy(transfer.cpu + sizeof(job), payload, payload_size);
 
-        struct mali_job_descriptor_header *second =
-                job_descriptor_header(depender);
+        if (inject) {
+                batch->first_job = transfer.gpu;
+                return index;
+        }
 
-        /* Look for an open slot */
+        /* Form a chain */
+        if (type == JOB_TYPE_TILER)
+                batch->tiler_dep = index;
 
-        if (!second->job_dependency_index_1)
-                second->job_dependency_index_1 = first->job_index;
-        else if (!second->job_dependency_index_2)
-                second->job_dependency_index_2 = first->job_index;
+        if (batch->prev_job)
+                batch->prev_job->next_job = transfer.gpu;
         else
-                unreachable("No available slot for new dependency");
-}
-
-/* Queues a job WITHOUT updating pointers. Be careful. */
-
-static void
-panfrost_scoreboard_queue_job_internal(
-        struct panfrost_batch *batch,
-        struct panfrost_transfer job)
-{
-        panfrost_assign_index(batch, job);
-
-        /* Queue a pointer to the job */
-        util_dynarray_append(&batch->headers, void*, job.cpu);
-        util_dynarray_append(&batch->gpu_headers, mali_ptr, job.gpu);
-}
-
-
-/* Queues a compute job, with no special dependencies. This is a bit of a
- * misnomer -- internally, all job types are queued with this function, but
- * outside of this file, it's for pure compute jobs */
-
-void
-panfrost_scoreboard_queue_compute_job(
-        struct panfrost_batch *batch,
-        struct panfrost_transfer job)
-{
-        panfrost_scoreboard_queue_job_internal(batch, job);
-
-        /* Update the linked list metadata as appropriate */
-        batch->last_job = job;
-
-        if (!batch->first_job.gpu)
-                batch->first_job = job;
-}
-
-/* Queues a vertex job. There are no special dependencies yet, but if
- * tiling is required (anytime 'rasterize discard' is disabled), we have
- * some extra bookkeeping for later */
-
-void
-panfrost_scoreboard_queue_vertex_job(
-        struct panfrost_batch *batch,
-        struct panfrost_transfer vertex,
-        bool requires_tiling)
-{
-        panfrost_scoreboard_queue_compute_job(batch, vertex);
+                batch->first_job = transfer.gpu;
 
-        if (requires_tiling && !batch->first_vertex_for_tiler.gpu)
-                batch->first_vertex_for_tiler = vertex;
+        batch->prev_job = (struct mali_job_descriptor_header *) transfer.cpu;
+        return index;
 }
 
-/* Queues a tiler job, respecting the dependency of each tiler job on the
- * previous */
+/* Generates a write value job, used to initialize the tiler structures. Note
+ * this is called right before frame submission. */
 
 void
-panfrost_scoreboard_queue_tiler_job(
-        struct panfrost_batch *batch,
-        struct panfrost_transfer tiler)
-{
-        panfrost_scoreboard_queue_compute_job(batch, tiler);
-
-        if (!batch->first_tiler.gpu)
-                batch->first_tiler = tiler;
-
-        if (batch->last_tiler.gpu)
-                panfrost_add_dependency(tiler, batch->last_tiler);
-
-        batch->last_tiler = tiler;
-}
-
-/* Queues a fused (vertex/tiler) job, or a pair of vertex/tiler jobs if
- * fused jobs are not supported (the default until Bifrost rolls out) */
-
-void
-panfrost_scoreboard_queue_fused_job(
-        struct panfrost_batch *batch,
-        struct panfrost_transfer vertex,
-        struct panfrost_transfer tiler)
-{
-        panfrost_scoreboard_queue_vertex_job(batch, vertex, true);
-        panfrost_scoreboard_queue_tiler_job(batch, tiler);
-        panfrost_add_dependency(tiler, vertex);
-}
-
-/* Queues a fused (vertex/tiler) job prepended *before* the usual set, used for
- * wallpaper blits */
-
-void
-panfrost_scoreboard_queue_fused_job_prepend(
-        struct panfrost_batch *batch,
-        struct panfrost_transfer vertex,
-        struct panfrost_transfer tiler)
+panfrost_scoreboard_initialize_tiler(struct panfrost_batch *batch)
 {
-        /* Sanity check */
-        assert(batch->last_tiler.gpu);
-        assert(batch->first_tiler.gpu);
-
-        /* First, we add the vertex job directly to the queue, forcing it to
-         * the front */
-
-        panfrost_scoreboard_queue_job_internal(batch, vertex);
-        batch->first_job = vertex;
-        batch->first_vertex_for_tiler = vertex;
-
-        /* Similarly, we add the tiler job directly to the queue, forcing it to
-         * the front (second place), manually setting the tiler on vertex
-         * dependency (since this is pseudofused) and forcing a dependency of
-         * the now-second tiler on us (since all tiler jobs are linked in order
-         * and we're injecting ourselves at the front) */
+        /* Check if we even need tiling */
+        if (!batch->tiler_dep)
+                return;
 
-        panfrost_scoreboard_queue_job_internal(batch, tiler);
-        panfrost_add_dependency(tiler, vertex);
-        panfrost_add_dependency(batch->first_tiler, tiler);
-        batch->first_tiler = tiler;
-}
+        /* Okay, we do. Let's generate it. We'll need the job's polygon list
+         * regardless of size. */
 
-/* Generates a write value job, used to initialize the tiler structures. */
+        mali_ptr polygon_list = panfrost_batch_get_polygon_list(batch,
+                MALI_TILER_MINIMUM_HEADER_SIZE);
 
-static struct panfrost_transfer
-panfrost_write_value_job(struct panfrost_batch *batch, mali_ptr polygon_list)
-{
         struct mali_job_descriptor_header job = {
                 .job_type = JOB_TYPE_WRITE_VALUE,
+                .job_index = batch->write_value_index,
                 .job_descriptor_size = 1,
+                .next_job = batch->first_job
         };
 
         struct mali_payload_write_value payload = {
@@ -275,224 +201,5 @@ panfrost_write_value_job(struct panfrost_batch *batch, mali_ptr polygon_list)
         memcpy(transfer.cpu, &job, sizeof(job));
         memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));
 
-        return transfer;
-}
-
-/* If there are any tiler jobs, we need to initialize the tiler by writing
- * zeroes to a magic tiler structure. We do so via a WRITE_VALUE job linked to
- * the first vertex job feeding into tiling. */
-
-static void
-panfrost_scoreboard_initialize_tiler(struct panfrost_batch *batch)
-{
-        /* Check if we even need tiling */
-        if (!batch->last_tiler.gpu)
-                return;
-
-        /* Okay, we do. Let's generate it. We'll need the job's polygon list
-         * regardless of size. */
-
-        mali_ptr polygon_list = panfrost_batch_get_polygon_list(batch,
-                MALI_TILER_MINIMUM_HEADER_SIZE);
-
-        struct panfrost_transfer job =
-                panfrost_write_value_job(batch, polygon_list);
-
-        /* Queue it */
-        panfrost_scoreboard_queue_compute_job(batch, job);
-
-        /* Tiler jobs need us */
-        panfrost_add_dependency(batch->first_tiler, job);
-}
-
-/* Once all jobs have been added to a batch and we're ready to submit, we need
- * to order them to set each of the next_job fields, obeying the golden rule:
- * "A job's dependencies must appear earlier in the job chain than itself".
- * Fortunately, computing this job chain is a well-studied graph theory problem
- * known as "topological sorting", which has linear time algorithms. We let
- * each job represent a node, each dependency a directed edge, and the entire
- * set of jobs to be a dependency graph. This graph is inherently acyclic, as
- * otherwise there are unresolveable dependencies.
- *
- * We implement Kahn's algorithm here to compute the next_job chain:
- * https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
- *
- * A few implementation notes: we represent S explicitly with a bitset, L
- * implicitly in the next_job fields. The indices of the bitset are off-by-one:
- * nodes are numbered [0, node_count - 1], whereas in reality job_index in the
- * hardware and dependencies are [1, node_count].
- *
- * We represent edge removal implicitly with another pair of bitsets, rather
- * than explicitly removing the edges, since we need to keep the dependencies
- * there for the hardware.
- */
-
-#define DESCRIPTOR_FOR_NODE(count) \
-        *(util_dynarray_element(&batch->headers, \
-                struct mali_job_descriptor_header*, count))
-
-#define GPU_ADDRESS_FOR_NODE(count) \
-        *(util_dynarray_element(&batch->gpu_headers, \
-                mali_ptr, count))
-
-void
-panfrost_scoreboard_link_batch(struct panfrost_batch *batch)
-{
-        /* Finalize the batch */
-        panfrost_scoreboard_initialize_tiler(batch);
-
-        /* Let no_incoming represent the set S described. */
-
-        unsigned node_count = batch->job_index;
-
-        size_t sz = BITSET_WORDS(node_count) * sizeof(BITSET_WORD);
-        BITSET_WORD *no_incoming = calloc(sz, 1);
-
-        /* Sets for edges being removed in dep 1 or 2 respectively */
-
-        BITSET_WORD *edge_removal_1 = calloc(sz, 1);
-        BITSET_WORD *edge_removal_2 = calloc(sz, 1);
-
-        /* We compute no_incoming by traversing the batch. Simultaneously, we
-         * would like to keep track of a parity-reversed version of the
-         * dependency graph. Dependency indices are 16-bit and in practice (for
-         * ES3.0, at least), we can guarantee a given node will be depended on
-         * by no more than one other nodes. P.f:
-         *
-         * Proposition: Given a node N of type T, no more than one other node
-         * depends on N.
-         *
-         * If type is WRITE_VALUE: The only dependency added against us is from
-         * the first tiler job, so there is 1 dependent.
-         *
-         * If type is VERTEX: If there is a tiler node, that tiler node depends
-         * on us; if there is not (transform feedback), nothing depends on us.
-         * Therefore there is at most 1 dependent.
-         *
-         * If type is TILER: If there is another TILER job in succession, that
-         * node depends on us. No other job type depends on us. Therefore there
-         * is at most 1 dependent.
-         *
-         * If type is FRAGMENT: This type cannot be in a primary chain, so it
-         * is irrelevant. Just for kicks, nobody would depend on us, so there
-         * are zero dependents, so it holds anyway.
-         *
-         * TODO: Revise this logic for ES3.1 and above. This result may not
-         * hold for COMPUTE/FUSED/GEOMETRY jobs; we might need to special case
-         * those. Can FBO dependencies be expressed within a chain?
-         * ---
-         *
-         * Point is, we only need to hold a single dependent, which is a pretty
-         * helpful result.
-         */
-
-        unsigned *dependents = calloc(node_count, sizeof(unsigned));
-
-        for (unsigned i = 0; i < node_count; ++i) {
-                struct mali_job_descriptor_header *node = DESCRIPTOR_FOR_NODE(i);
-
-                unsigned dep_1 = node->job_dependency_index_1;
-                unsigned dep_2 = node->job_dependency_index_2;
-
-                /* Record no_incoming info for this node */
-
-                if (!(dep_1 || dep_2))
-                        BITSET_SET(no_incoming, i);
-
-                /* Record this node as the dependent of each of its
-                 * dependencies */
-
-                if (dep_1) {
-                        assert(!dependents[dep_1 - 1]);
-                        dependents[dep_1 - 1] = i + 1;
-                }
-
-                if (dep_2) {
-                        assert(!dependents[dep_2 - 1]);
-                        dependents[dep_2 - 1] = i + 1;
-                }
-        }
-
-        /* No next_job fields are set at the beginning, so L is implciitly the
-         * empty set. As next_job fields are filled, L is implicitly set. Tail
-         * is the tail of L, however. */
-
-        struct mali_job_descriptor_header *tail = NULL;
-
-        /* We iterate, popping off elements of S. A simple foreach won't do,
-         * since we mutate S as we go (even adding elements) */
-
-        unsigned arr_size = BITSET_WORDS(node_count);
-
-        for (unsigned node_n_1 = __bitset_ffs(no_incoming, arr_size);
-             (node_n_1 != 0);
-             node_n_1 = __bitset_ffs(no_incoming, arr_size)) {
-
-                unsigned node_n = node_n_1 - 1;
-
-                /* We've got a node n, pop it off */
-                BITSET_CLEAR(no_incoming, node_n);
-
-                /* Add it to the list */
-                struct mali_job_descriptor_header *n =
-                        DESCRIPTOR_FOR_NODE(node_n);
-
-                mali_ptr addr = GPU_ADDRESS_FOR_NODE(node_n);
-
-                if (tail) {
-                        /* Link us to the last node */
-                        tail->next_job = addr;
-                } else {
-                        /* We are the first/last node */
-                        batch->first_job.cpu = (uint8_t *) n;
-                        batch->first_job.gpu = addr;
-                }
-
-                tail = n;
-
-                /* Grab the dependent, if there is one */
-                unsigned node_m_1 = dependents[node_n];
-
-                if (node_m_1) {
-                        unsigned node_m = node_m_1 - 1;
-
-                        struct mali_job_descriptor_header *m =
-                                DESCRIPTOR_FOR_NODE(node_m);
-
-                        /* Get the deps, accounting for removal */
-                        unsigned dep_1 = m->job_dependency_index_1;
-                        unsigned dep_2 = m->job_dependency_index_2;
-
-                        if (BITSET_TEST(edge_removal_1, node_m))
-                                dep_1 = 0;
-
-                        if (BITSET_TEST(edge_removal_2, node_m))
-                                dep_2 = 0;
-
-                        /* Pretend to remove edges */
-                        if (dep_1 == node_n_1) {
-                                BITSET_SET(edge_removal_1, node_m);
-                                dep_1 = 0;
-                        } else if (dep_2 == node_n_1) {
-                                BITSET_SET(edge_removal_2, node_m);
-                                dep_2 = 0;
-                        } else {
-                                /* This node has no relevant dependencies */
-                                assert(0);
-                        }
-
-                        /* Are there edges left? If not, add us to S */
-                        bool has_edges = dep_1 || dep_2;
-
-                        if (!has_edges)
-                                BITSET_SET(no_incoming, node_m);
-                }
-        }
-
-        /* Cleanup */
-        free(no_incoming);
-        free(dependents);
-        free(edge_removal_1);
-        free(edge_removal_2);
-
+        batch->first_job = transfer.gpu;
 }
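
For readability, here is the new panfrost_new_job reassembled from the
added and context lines in the hunk above (diff markers stripped, one
added clarifying comment; behavior identical to the patch, relying on
the surrounding pan_scoreboard.c context for declarations):

unsigned
panfrost_new_job(
                struct panfrost_batch *batch,
                enum mali_job_type type,
                bool barrier,
                unsigned local_dep,
                void *payload, size_t payload_size,
                bool inject)
{
        unsigned global_dep = 0;

        if (type == JOB_TYPE_TILER) {
                /* Tiler jobs must be chained, and the first tiler job must
                 * depend on the write value job, whose index we reserve now */

                if (batch->tiler_dep)
                        global_dep = batch->tiler_dep;
                else {
                        batch->write_value_index = ++batch->job_index;
                        global_dep = batch->write_value_index;
                }
        }

        /* Assign the index */
        unsigned index = ++batch->job_index;

        struct mali_job_descriptor_header job = {
                .job_descriptor_size = 1,
                .job_type = type,
                .job_barrier = barrier,
                .job_index = index,
                .job_dependency_index_1 = local_dep,
                .job_dependency_index_2 = global_dep,
        };

        /* Injected jobs are prepended, so they point at the old head */
        if (inject)
                job.next_job = batch->first_job;

        struct panfrost_transfer transfer =
                panfrost_allocate_transient(batch, sizeof(job) + payload_size);
        memcpy(transfer.cpu, &job, sizeof(job));
        memcpy(transfer.cpu + sizeof(job), payload, payload_size);

        if (inject) {
                batch->first_job = transfer.gpu;
                return index;
        }

        /* Form a chain */
        if (type == JOB_TYPE_TILER)
                batch->tiler_dep = index;

        if (batch->prev_job)
                batch->prev_job->next_job = transfer.gpu;
        else
                batch->first_job = transfer.gpu;

        batch->prev_job = (struct mali_job_descriptor_header *) transfer.cpu;
        return index;
}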