panfrost: Stop passing screen around for BO operations
[mesa.git] src/gallium/drivers/panfrost/pan_scoreboard.c
/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include "pan_context.h"
#include "pan_job.h"
#include "pan_allocate.h"
#include "util/bitset.h"

/*
 * Within a batch (panfrost_job), there are various types of Mali jobs:
 *
 * - SET_VALUE: initializes the tiler
 * - VERTEX: runs a vertex shader
 * - TILER: runs tiling and sets up a fragment shader
 * - FRAGMENT: runs fragment shaders and writes out
 * - COMPUTE: runs a compute shader
 * - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost)
 * - GEOMETRY: runs a geometry shader (unimplemented)
 * - CACHE_FLUSH: unseen in the wild, theoretically a cache flush
 *
 * In between a full batch and a single Mali job is the "job chain", a series
 * of Mali jobs together forming a linked list. Within the job chain, each Mali
 * job can set (up to) two dependencies on other earlier jobs in the chain.
 * This dependency graph forms a scoreboard. The general idea of a scoreboard
 * applies: when there is a data dependency of job B on job A, job B sets one
 * of its dependency indices to job A, ensuring that job B won't start until
 * job A finishes.
 *
 * More specifically, here are a set of rules:
 *
 * - A set value job must appear if and only if there is at least one tiler job.
 *
 * - Vertex jobs and tiler jobs are independent.
 *
 * - A tiler job must have a dependency on its data source. If it's getting
 *   data from a vertex job, it depends on the vertex job. If it's getting data
 *   from software, this is null.
 *
 * - The first vertex job used as the input to tiling must depend on the set
 *   value job, if it is present.
 *
 * - Tiler jobs must be strictly ordered. So each tiler job must depend on the
 *   previous tiler job in the chain.
 *
 * - Linking jobs via next_job has no bearing on the order of execution;
 *   rather, it just establishes the linked list of jobs, EXCEPT:
 *
 * - A job's dependencies must appear earlier in the linked list (job chain).
 *
 * Justification for each rule:
 *
 * - Set value jobs set up tiling, essentially. If tiling occurs, they are
 *   needed; if it does not, we cannot emit them, since tiling would then
 *   partially occur, which is bad.
 *
 * - The hardware has no notion of a "vertex/tiler job" (at least not our
 *   hardware -- other revs have fused jobs, but -- crap, this just got even
 *   more complicated). They are independent units that take in data, process
 *   it, and spit out data.
 *
 * - Any job must depend on its data source, or risk a read-before-write
 *   hazard. Tiler jobs get their data from vertex jobs, ergo tiler jobs
 *   depend on the corresponding vertex job (if it's there).
 *
 * - In fact, tiling depends on the set value job, but tiler jobs depend on the
 *   corresponding vertex jobs and on each other, so this rule ensures each
 *   tiler job automatically depends on the set value job.
 *
 * - The tiler is not thread-safe; this dependency prevents race conditions
 *   between two different jobs trying to write to the tiler outputs at the
 *   same time.
 *
 * - Internally, jobs are scoreboarded; the next_job fields just form a linked
 *   list that allows the jobs to be read in, while the execution order comes
 *   from resolving the dependency fields instead.
 *
 * - The hardware cannot set a dependency on a job it doesn't know about yet,
 *   and dependencies are processed in order of the next_job fields.
 *
 */
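
/*
 * To make the rules concrete, here is a purely illustrative example (the job
 * indices are hypothetical, not captured from hardware). Consider a batch
 * containing two draws, each queued as a vertex/tiler pair, with the set
 * value job appended at link time as panfrost_scoreboard_set_value() does
 * below:
 *
 *   index  type        dep_1  dep_2
 *   -----  ----------  -----  -----
 *     1    VERTEX        -      -
 *     2    TILER         1      5    (its vertex; set value, added at link)
 *     3    VERTEX        -      -
 *     4    TILER         2      3    (previous tiler; its vertex)
 *     5    SET_VALUE     -      -
 *
 * Each tiler job depends on its data source and on the previous tiler job,
 * and the first tiler job picks up a dependency on the set value job, so all
 * tiling is transitively ordered after it. next_job merely links the jobs
 * into a chain; execution order is resolved from the dependency indices.
 */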

/* Accessor to set the next job field */

static void
panfrost_set_job_next(struct mali_job_descriptor_header *first, mali_ptr next)
{
        if (first->job_descriptor_size)
                first->next_job_64 = (u64) (uintptr_t) next;
        else
                first->next_job_32 = (u32) (uintptr_t) next;
}

/* Coerce a panfrost_transfer to a header */

static inline struct mali_job_descriptor_header *
job_descriptor_header(struct panfrost_transfer t)
{
        return (struct mali_job_descriptor_header *) t.cpu;
}

static void
panfrost_assign_index(
        struct panfrost_batch *batch,
        struct panfrost_transfer transfer)
{
        /* Assign the index */
        unsigned index = ++batch->job_index;
        job_descriptor_header(transfer)->job_index = index;
}

/* Helper to add a dependency to a job */

static void
panfrost_add_dependency(
        struct panfrost_transfer depender,
        struct panfrost_transfer dependent)
{
        struct mali_job_descriptor_header *first =
                job_descriptor_header(dependent);

        struct mali_job_descriptor_header *second =
                job_descriptor_header(depender);

        /* Look for an open slot */

        if (!second->job_dependency_index_1)
                second->job_dependency_index_1 = first->job_index;
        else if (!second->job_dependency_index_2)
                second->job_dependency_index_2 = first->job_index;
        else
                unreachable("No available slot for new dependency");
}

/* Queues a job WITHOUT updating pointers. Be careful. */

static void
panfrost_scoreboard_queue_job_internal(
        struct panfrost_batch *batch,
        struct panfrost_transfer job)
{
        panfrost_assign_index(batch, job);

        /* Queue a pointer to the job */
        util_dynarray_append(&batch->headers, void*, job.cpu);
        util_dynarray_append(&batch->gpu_headers, mali_ptr, job.gpu);
}

/* Queues a compute job, with no special dependencies. This is a bit of a
 * misnomer -- internally, all job types are queued with this function, but
 * outside of this file, it's for pure compute jobs */

void
panfrost_scoreboard_queue_compute_job(
        struct panfrost_batch *batch,
        struct panfrost_transfer job)
{
        panfrost_scoreboard_queue_job_internal(batch, job);

        /* Update the linked list metadata as appropriate */
        batch->last_job = job;

        if (!batch->first_job.gpu)
                batch->first_job = job;
}

/* Queues a vertex job. There are no special dependencies yet, but if
 * tiling is required (anytime 'rasterize discard' is disabled), we have
 * some extra bookkeeping for later */

void
panfrost_scoreboard_queue_vertex_job(
        struct panfrost_batch *batch,
        struct panfrost_transfer vertex,
        bool requires_tiling)
{
        panfrost_scoreboard_queue_compute_job(batch, vertex);

        if (requires_tiling && !batch->first_vertex_for_tiler.gpu)
                batch->first_vertex_for_tiler = vertex;
}

/* Queues a tiler job, respecting the dependency of each tiler job on the
 * previous */

void
panfrost_scoreboard_queue_tiler_job(
        struct panfrost_batch *batch,
        struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_compute_job(batch, tiler);

        if (!batch->first_tiler.gpu)
                batch->first_tiler = tiler;

        if (batch->last_tiler.gpu)
                panfrost_add_dependency(tiler, batch->last_tiler);

        batch->last_tiler = tiler;
}

/* Queues a fused (vertex/tiler) job, or a pair of vertex/tiler jobs if
 * fused jobs are not supported (the default until Bifrost rolls out) */

void
panfrost_scoreboard_queue_fused_job(
        struct panfrost_batch *batch,
        struct panfrost_transfer vertex,
        struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_vertex_job(batch, vertex, true);
        panfrost_scoreboard_queue_tiler_job(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
}

/* Queues a fused (vertex/tiler) job prepended *before* the usual set, used for
 * wallpaper blits */

void
panfrost_scoreboard_queue_fused_job_prepend(
        struct panfrost_batch *batch,
        struct panfrost_transfer vertex,
        struct panfrost_transfer tiler)
{
        /* Sanity check */
        assert(batch->last_tiler.gpu);
        assert(batch->first_tiler.gpu);

        /* First, we add the vertex job directly to the queue, forcing it to
         * the front */

        panfrost_scoreboard_queue_job_internal(batch, vertex);
        batch->first_job = vertex;
        batch->first_vertex_for_tiler = vertex;

        /* Similarly, we add the tiler job directly to the queue, forcing it to
         * the front (second place), manually setting the tiler-on-vertex
         * dependency (since this is pseudofused) and forcing a dependency of
         * the now-second tiler on us (since all tiler jobs are linked in order
         * and we're injecting ourselves at the front) */

        panfrost_scoreboard_queue_job_internal(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
        panfrost_add_dependency(batch->first_tiler, tiler);
        batch->first_tiler = tiler;
}

/* Generates a set value job, used below as part of TILER job scheduling. */

static struct panfrost_transfer
panfrost_set_value_job(struct panfrost_batch *batch, mali_ptr polygon_list)
{
        struct mali_job_descriptor_header job = {
                .job_type = JOB_TYPE_SET_VALUE,
                .job_descriptor_size = 1,
        };

        struct mali_payload_set_value payload = {
                .out = polygon_list,
                .unknown = 0x3,
        };

        struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sizeof(job) + sizeof(payload));
        memcpy(transfer.cpu, &job, sizeof(job));
        memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));

        return transfer;
}

/* If there are any tiler jobs, there needs to be a corresponding set value job
 * linked to the first vertex job feeding into tiling. */

static void
panfrost_scoreboard_set_value(struct panfrost_batch *batch)
{
        /* Check if we even need tiling */
        if (!batch->last_tiler.gpu)
                return;

        /* Okay, we do. Let's generate it. We'll need the job's polygon list
         * regardless of size. */

        mali_ptr polygon_list = panfrost_batch_get_polygon_list(batch, 0);

        struct panfrost_transfer job =
                panfrost_set_value_job(batch, polygon_list);

        /* Queue it */
        panfrost_scoreboard_queue_compute_job(batch, job);

        /* Tiler jobs need us */
        panfrost_add_dependency(batch->first_tiler, job);
}

/* Once all jobs have been added to a batch and we're ready to submit, we need
 * to order them to set each of the next_job fields, obeying the golden rule:
 * "A job's dependencies must appear earlier in the job chain than the job
 * itself". Fortunately, computing this job chain is a well-studied graph
 * theory problem known as "topological sorting", which has linear-time
 * algorithms. We let each job represent a node and each dependency a directed
 * edge, so the entire set of jobs forms a dependency graph. This graph is
 * inherently acyclic, as otherwise there would be unresolvable dependencies.
 *
 * We implement Kahn's algorithm here to compute the next_job chain:
 * https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
 *
 * A few implementation notes: we represent S explicitly with a bitset, L
 * implicitly in the next_job fields. The indices of the bitset are off by one:
 * nodes are numbered [0, node_count - 1], whereas job_index in the hardware
 * and the dependency fields are numbered [1, node_count].
 *
 * We represent edge removal implicitly with another pair of bitsets, rather
 * than explicitly removing the edges, since we need to keep the dependencies
 * there for the hardware.
 */
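
/*
 * As an illustrative trace (the job indices are hypothetical), apply the sort
 * to the two-draw example near the top of this file: jobs 1, 3 and 5 have no
 * incoming edges, so S starts as {1, 3, 5}. Popping in bitset order emits 1,
 * then 3, then 5; removing the edge from 5 leaves job 2 with no remaining
 * dependencies, so 2 is emitted next, which in turn frees job 4. The resulting
 * next_job chain is 1 -> 3 -> 5 -> 2 -> 4, and every job's dependencies indeed
 * appear earlier in the chain than the job itself.
 */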

#define DESCRIPTOR_FOR_NODE(count) \
        *(util_dynarray_element(&batch->headers, \
                struct mali_job_descriptor_header*, count))

#define GPU_ADDRESS_FOR_NODE(count) \
        *(util_dynarray_element(&batch->gpu_headers, \
                mali_ptr, count))

void
panfrost_scoreboard_link_batch(struct panfrost_batch *batch)
{
        /* Finalize the batch */
        panfrost_scoreboard_set_value(batch);

        /* Let no_incoming represent the set S described. */

        unsigned node_count = batch->job_index;

        size_t sz = BITSET_WORDS(node_count) * sizeof(BITSET_WORD);
        BITSET_WORD *no_incoming = calloc(sz, 1);

        /* Sets for edges being removed in dep 1 or 2 respectively */

        BITSET_WORD *edge_removal_1 = calloc(sz, 1);
        BITSET_WORD *edge_removal_2 = calloc(sz, 1);

        /* We compute no_incoming by traversing the batch. Simultaneously, we
         * would like to keep track of a reversed version of the dependency
         * graph, mapping each node to its dependent. Dependency indices are
         * 16-bit and in practice (for ES3.0, at least), we can guarantee a
         * given node will be depended on by no more than one other node.
         * Proof:
         *
         * Proposition: Given a node N of type T, no more than one other node
         * depends on N.
         *
         * If type is SET_VALUE: The only dependency added against us is from
         * the first tiler job, so there is 1 dependent.
         *
         * If type is VERTEX: If there is a tiler node, that tiler node depends
         * on us; if there is not (transform feedback), nothing depends on us.
         * Therefore there is at most 1 dependent.
         *
         * If type is TILER: If there is another TILER job in succession, that
         * node depends on us. No other job type depends on us. Therefore there
         * is at most 1 dependent.
         *
         * If type is FRAGMENT: This type cannot be in a primary chain, so it
         * is irrelevant. Just for kicks, nobody would depend on us, so there
         * are zero dependents, so it holds anyway.
         *
         * TODO: Revise this logic for ES3.1 and above. This result may not
         * hold for COMPUTE/FUSED/GEOMETRY jobs; we might need to special-case
         * those. Can FBO dependencies be expressed within a chain?
         * ---
         *
         * Point is, we only need to hold a single dependent, which is a pretty
         * helpful result.
         */
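
        /* For the illustrative two-draw example above (hypothetical indices),
         * the dependents array built just below would end up as
         * { 2, 4, 4, 0, 2 }: job 1 is depended on by job 2, jobs 2 and 3 by
         * job 4, job 4 by nobody, and the set value job 5 by job 2. */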

        unsigned *dependents = calloc(node_count, sizeof(unsigned));

        for (unsigned i = 0; i < node_count; ++i) {
                struct mali_job_descriptor_header *node = DESCRIPTOR_FOR_NODE(i);

                unsigned dep_1 = node->job_dependency_index_1;
                unsigned dep_2 = node->job_dependency_index_2;

                /* Record no_incoming info for this node */

                if (!(dep_1 || dep_2))
                        BITSET_SET(no_incoming, i);

                /* Record this node as the dependent of each of its
                 * dependencies */

                if (dep_1) {
                        assert(!dependents[dep_1 - 1]);
                        dependents[dep_1 - 1] = i + 1;
                }

                if (dep_2) {
                        assert(!dependents[dep_2 - 1]);
                        dependents[dep_2 - 1] = i + 1;
                }
        }

        /* No next_job fields are set at the beginning, so L is implicitly the
         * empty set. As the next_job fields are filled in, L is implicitly
         * built up. tail tracks the tail of L. */

        struct mali_job_descriptor_header *tail = NULL;

        /* We iterate, popping off elements of S. A simple foreach won't do,
         * since we mutate S as we go (even adding elements) */

        unsigned arr_size = BITSET_WORDS(node_count);

        for (unsigned node_n_1 = __bitset_ffs(no_incoming, arr_size);
             (node_n_1 != 0);
             node_n_1 = __bitset_ffs(no_incoming, arr_size)) {

                unsigned node_n = node_n_1 - 1;

                /* We've got a node n, pop it off */
                BITSET_CLEAR(no_incoming, node_n);

                /* Add it to the list */
                struct mali_job_descriptor_header *n =
                        DESCRIPTOR_FOR_NODE(node_n);

                mali_ptr addr = GPU_ADDRESS_FOR_NODE(node_n);

                if (tail) {
                        /* Link us to the last node */
                        panfrost_set_job_next(tail, addr);
                } else {
                        /* We are the first/last node */
                        batch->first_job.cpu = (uint8_t *) n;
                        batch->first_job.gpu = addr;
                }

                tail = n;

                /* Grab the dependent, if there is one */
                unsigned node_m_1 = dependents[node_n];

                if (node_m_1) {
                        unsigned node_m = node_m_1 - 1;

                        struct mali_job_descriptor_header *m =
                                DESCRIPTOR_FOR_NODE(node_m);

                        /* Get the deps, accounting for removal */
                        unsigned dep_1 = m->job_dependency_index_1;
                        unsigned dep_2 = m->job_dependency_index_2;

                        if (BITSET_TEST(edge_removal_1, node_m))
                                dep_1 = 0;

                        if (BITSET_TEST(edge_removal_2, node_m))
                                dep_2 = 0;

                        /* Pretend to remove edges */
                        if (dep_1 == node_n_1) {
                                BITSET_SET(edge_removal_1, node_m);
                                dep_1 = 0;
                        } else if (dep_2 == node_n_1) {
                                BITSET_SET(edge_removal_2, node_m);
                                dep_2 = 0;
                        } else {
                                /* This node has no relevant dependencies */
                                assert(0);
                        }

                        /* Are there edges left? If not, add us to S */
                        bool has_edges = dep_1 || dep_2;

                        if (!has_edges)
                                BITSET_SET(no_incoming, node_m);
                }
        }

        /* Cleanup */
        free(no_incoming);
        free(dependents);
        free(edge_removal_1);
        free(edge_removal_2);
}