/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "pan_context.h"
#include "pan_job.h"
#include "pan_allocate.h"
#include "util/bitset.h"
/*
 * Within a batch (panfrost_job), there are various types of Mali jobs:
 *
 *  - SET_VALUE: initializes tiler
 *  - VERTEX: runs a vertex shader
 *  - TILER: runs tiling and sets up a fragment shader
 *  - FRAGMENT: runs fragment shaders and writes out
 *  - COMPUTE: runs a compute shader
 *  - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost)
 *  - GEOMETRY: runs a geometry shader (unimplemented)
 *  - CACHE_FLUSH: unseen in the wild, theoretically a cache flush
 *
 * In between a full batch and a single Mali job is the "job chain", a series
 * of Mali jobs together forming a linked list. Within the job chain, each Mali
 * job can set (up to) two dependencies on other earlier jobs in the chain.
 * This dependency graph forms a scoreboard. The general idea of a scoreboard
 * applies: when there is a data dependency of job B on job A, job B sets one
 * of its dependency indices to job A, ensuring that job B won't start until
 * job A finishes.
 *
 * More specifically, here are a set of rules:
 *
 * - A set value job must appear if and only if there is at least one tiler
 *   job.
 *
 * - Vertex jobs and tiler jobs are independent.
 *
 * - A tiler job must have a dependency on its data source. If it's getting
 *   data from a vertex job, it depends on the vertex job. If it's getting
 *   data from software, this is null.
 *
 * - The first vertex job used as the input to tiling must depend on the set
 *   value job, if it is present.
 *
 * - Tiler jobs must be strictly ordered, so each tiler job must depend on
 *   the previous tiler job in the chain.
 *
 * - Job linking via next_job has no bearing on the order of execution; it
 *   just establishes the linked list of jobs, EXCEPT:
 *
 * - A job's dependencies must appear earlier in the linked list (job chain).
 *
 * Justification for each rule:
 *
 * - Set value jobs set up tiling, essentially. If tiling occurs, they are
 *   needed; if it does not, we cannot emit them, since then tiling would
 *   occur only partially, which is wrong.
 *
 * - The hardware has no notion of a "vertex/tiler job" (at least not our
 *   hardware -- other revs have fused jobs, which only complicates matters).
 *   Vertex and tiler jobs are independent units that take in data, process
 *   it, and spit out data.
 *
 * - Any job must depend on its data source, or risk a read-before-write
 *   hazard. Tiler jobs get their data from vertex jobs, ergo tiler jobs
 *   depend on the corresponding vertex job (if it's there).
 *
 * - In fact, tiling depends on the set value job, but tiler jobs depend on
 *   the corresponding vertex jobs and each other, so this rule ensures each
 *   tiler job automatically depends on the set value job.
 *
 * - The tiler is not thread-safe; this dependency prevents race conditions
 *   between two different jobs trying to write to the tiler outputs at the
 *   same time.
 *
 * - Internally, jobs are scoreboarded; the next_job fields just form a linked
 *   list to allow the jobs to be read in. The execution order comes from
 *   resolving the dependency fields instead.
 *
 * - The hardware cannot set a dependency on a job it doesn't know about yet,
 *   and dependencies are processed in order of the next_job fields.
 */
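/* As a concrete sketch of these rules (job indices here are hypothetical,
 * chosen only for illustration), a batch of two draws queued as
 * (pseudo)fused vertex/tiler pairs might scoreboard as:
 *
 *    1: VERTEX v1        no dependencies
 *    2: TILER  t1        depends on: 1 (v1), 5 (set value)
 *    3: VERTEX v2        no dependencies
 *    4: TILER  t2        depends on: 2 (t1), 3 (v2)
 *    5: SET_VALUE        no dependencies
 *
 * The vertex jobs can run in any order (even concurrently), while the tiler
 * jobs are serialized through the previous-tiler dependency and,
 * transitively, through the set value job (which
 * panfrost_scoreboard_set_value below attaches to the first tiler job at
 * link time). */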
/* Accessor to set the next job field */

static void
panfrost_set_job_next(struct mali_job_descriptor_header *first, mali_ptr next)
{
        if (first->job_descriptor_size)
                first->next_job_64 = (u64) (uintptr_t) next;
        else
                first->next_job_32 = (u32) (uintptr_t) next;
}
/* Coerce a panfrost_transfer to a header */

static inline struct mali_job_descriptor_header *
job_descriptor_header(struct panfrost_transfer t)
{
        return (struct mali_job_descriptor_header *) t.cpu;
}
static void
panfrost_assign_index(
        struct panfrost_job *job,
        struct panfrost_transfer transfer)
{
        /* Assign the index */
        unsigned index = ++job->job_index;
        job_descriptor_header(transfer)->job_index = index;
}
/* Helper to add a dependency to a job */

static void
panfrost_add_dependency(
        struct panfrost_transfer depender,
        struct panfrost_transfer dependent)
{
        struct mali_job_descriptor_header *first =
                job_descriptor_header(dependent);

        struct mali_job_descriptor_header *second =
                job_descriptor_header(depender);

        /* Ensure we're ready for dependencies */
        assert(second->job_index);
        assert(first->job_index);

        /* Look for an open slot */

        if (!second->job_dependency_index_1)
                second->job_dependency_index_1 = first->job_index;
        else if (!second->job_dependency_index_2)
                second->job_dependency_index_2 = first->job_index;
        else
                unreachable("No available slot for new dependency");
}
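/* To illustrate the slot logic with hypothetical indices: if a job already
 * has job_dependency_index_1 = 2, a new dependency on job 3 lands in
 * job_dependency_index_2; a third distinct dependency has no slot left and
 * hits the unreachable() above, so callers must never require more than two
 * dependencies per job. */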
/* Queues a job WITHOUT updating pointers. Be careful. */

static void
panfrost_scoreboard_queue_job_internal(
        struct panfrost_job *batch,
        struct panfrost_transfer job)
{
        panfrost_assign_index(batch, job);

        /* Queue a pointer to the job */
        util_dynarray_append(&batch->headers, void*, job.cpu);
        util_dynarray_append(&batch->gpu_headers, mali_ptr, job.gpu);
}
/* Queues a compute job, with no special dependencies. This is a bit of a
 * misnomer -- internally, all job types are queued with this function, but
 * outside of this file, it's for pure compute jobs */

void
panfrost_scoreboard_queue_compute_job(
        struct panfrost_job *batch,
        struct panfrost_transfer job)
{
        panfrost_scoreboard_queue_job_internal(batch, job);

        /* Update the linked list metadata as appropriate */
        batch->last_job = job;

        if (!batch->first_job.gpu)
                batch->first_job = job;
}
/* Queues a vertex job. There are no special dependencies yet, but if
 * tiling is required (anytime 'rasterize discard' is disabled), we have
 * some extra bookkeeping for later */

void
panfrost_scoreboard_queue_vertex_job(
        struct panfrost_job *batch,
        struct panfrost_transfer vertex,
        bool requires_tiling)
{
        panfrost_scoreboard_queue_compute_job(batch, vertex);

        if (requires_tiling && !batch->first_vertex_for_tiler.gpu)
                batch->first_vertex_for_tiler = vertex;
}
/* Queues a tiler job, respecting the dependency of each tiler job on the
 * previous tiler job in the batch (tiler jobs are strictly ordered) */

void
panfrost_scoreboard_queue_tiler_job(
        struct panfrost_job *batch,
        struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_compute_job(batch, tiler);

        if (!batch->first_tiler.gpu)
                batch->first_tiler = tiler;

        if (batch->last_tiler.gpu)
                panfrost_add_dependency(tiler, batch->last_tiler);

        batch->last_tiler = tiler;
}
/* Queues a fused (vertex/tiler) job, or a pair of vertex/tiler jobs if
 * fused jobs are not supported (the default until Bifrost rolls out) */

void
panfrost_scoreboard_queue_fused_job(
        struct panfrost_job *batch,
        struct panfrost_transfer vertex,
        struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_vertex_job(batch, vertex, true);
        panfrost_scoreboard_queue_tiler_job(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
}
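/* Note the resulting dependencies on the tiler job: the previous tiler (if
 * any), added by panfrost_scoreboard_queue_tiler_job, plus its own vertex
 * job, added just above -- which together fill at most the hardware's two
 * dependency slots. */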
/* Queues a fused (vertex/tiler) job prepended *before* the usual set, used
 * for wallpaper blits */

void
panfrost_scoreboard_queue_fused_job_prepend(
        struct panfrost_job *batch,
        struct panfrost_transfer vertex,
        struct panfrost_transfer tiler)
{
        /* There must already be a tiler chain to prepend to */
        assert(batch->last_tiler.gpu);
        assert(batch->first_tiler.gpu);

        /* First, we add the vertex job directly to the queue, forcing it to
         * the front */

        panfrost_scoreboard_queue_job_internal(batch, vertex);
        batch->first_job = vertex;
        batch->first_vertex_for_tiler = vertex;

        /* Similarly, we add the tiler job directly to the queue, forcing it
         * to the front (second place), manually setting the tiler-on-vertex
         * dependency (since this is pseudofused) and forcing a dependency of
         * the now-second tiler on us (since all tiler jobs are linked in
         * order and we're injecting ourselves at the front) */
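        /* Schematically (a sketch with hypothetical jobs, not actual
         * output): if the chain previously began v1 -> t1 -> ..., then
         * after prepending the pair (vw, tw) it begins
         * vw -> tw -> v1 -> t1 -> ..., where tw depends on vw and the
         * formerly-first tiler t1 picks up a dependency on tw, preserving
         * the strict tiler ordering. */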
        panfrost_scoreboard_queue_job_internal(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
        panfrost_add_dependency(batch->first_tiler, tiler);
        batch->first_tiler = tiler;
}
/* Generates a set value job, used below as part of TILER job scheduling. */

static struct panfrost_transfer
panfrost_set_value_job(struct panfrost_context *ctx, mali_ptr polygon_list)
{
        struct mali_job_descriptor_header job = {
                .job_type = JOB_TYPE_SET_VALUE,
                .job_descriptor_size = 1,
        };

        struct mali_payload_set_value payload = {
                .out = polygon_list,
                .unknown = 0x3, /* TODO */
        };

        struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(payload));
        memcpy(transfer.cpu, &job, sizeof(job));
        memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));

        return transfer;
}
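/* For reference, the transient allocation above is laid out as the
 * descriptor header immediately followed by its payload:
 *
 *    transfer.gpu + 0           -> mali_job_descriptor_header
 *    transfer.gpu + sizeof(job) -> mali_payload_set_value
 */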
/* If there are any tiler jobs, there needs to be a corresponding set value
 * job linked to the first vertex job feeding into tiling. */

static void
panfrost_scoreboard_set_value(struct panfrost_job *batch)
{
        /* Check if we even need tiling */
        if (!batch->last_tiler.gpu)
                return;

        /* Okay, we do. Let's generate it */

        struct panfrost_context *ctx = batch->ctx;
        mali_ptr polygon_list = ctx->tiler_polygon_list.bo->gpu;

        struct panfrost_transfer job =
                panfrost_set_value_job(ctx, polygon_list);

        /* Queue it */
        panfrost_scoreboard_queue_compute_job(batch, job);

        /* Tiler jobs need us */
        panfrost_add_dependency(batch->first_tiler, job);
}
/* Once all jobs have been added to a batch and we're ready to submit, we
 * need to order them to set each of the next_job fields, obeying the golden
 * rule: "A job's dependencies must appear earlier in the job chain than the
 * job itself". Fortunately, computing such an ordering is a well-studied
 * graph theory problem known as "topological sorting", which has linear-time
 * algorithms. We let each job represent a node and each dependency a
 * directed edge, so the entire set of jobs forms a dependency graph. This
 * graph is inherently acyclic, as otherwise there would be unresolvable
 * dependencies.
 *
 * We implement Kahn's algorithm here to compute the next_job chain:
 * https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
 *
 * A few implementation notes: we represent S explicitly with a bitset, and L
 * implicitly in the next_job fields. The indices of the bitset are
 * off-by-one: nodes are numbered [0, node_count - 1], whereas the hardware's
 * job_index and the dependency fields use [1, node_count].
 *
 * We represent edge removal implicitly with another pair of bitsets, rather
 * than explicitly removing the edges, since we need to keep the dependencies
 * there for the hardware.
 */
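/* For reference, Kahn's algorithm in pseudocode, adapted from the page
 * linked above (S is the ready set, L the output ordering):
 *
 *    L = empty list
 *    S = set of all nodes with no incoming edge
 *    while S is non-empty:
 *        remove a node n from S
 *        append n to L
 *        for each node m with an edge e from n to m:
 *            remove edge e from the graph
 *            if m has no other incoming edges:
 *                insert m into S
 *
 * Below, "append n to L" becomes setting the previous job's next_job field,
 * and "remove edge e" becomes marking the dependency slot in the removal
 * bitsets. */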
#define DESCRIPTOR_FOR_NODE(count) \
        *(util_dynarray_element(&batch->headers, \
                struct mali_job_descriptor_header*, count))

#define GPU_ADDRESS_FOR_NODE(count) \
        *(util_dynarray_element(&batch->gpu_headers, \
                mali_ptr, count))
void
panfrost_scoreboard_link_batch(struct panfrost_job *batch)
{
        /* Finalize the batch */
        panfrost_scoreboard_set_value(batch);

        /* Let no_incoming represent the set S described. */

        unsigned node_count = batch->job_index;

        size_t sz = BITSET_WORDS(node_count) * sizeof(BITSET_WORD);
        BITSET_WORD *no_incoming = calloc(sz, 1);

        /* Sets for edges being removed in dep 1 or 2 respectively */

        BITSET_WORD *edge_removal_1 = calloc(sz, 1);
        BITSET_WORD *edge_removal_2 = calloc(sz, 1);
        /* We compute no_incoming by traversing the batch. */

        for (unsigned i = 0; i < node_count; ++i) {
                struct mali_job_descriptor_header *node = DESCRIPTOR_FOR_NODE(i);

                unsigned dep_1 = node->job_dependency_index_1;
                unsigned dep_2 = node->job_dependency_index_2;

                if (!(dep_1 || dep_2))
                        BITSET_SET(no_incoming, i);
        }
        /* No next_job fields are set at the beginning, so L is implicitly
         * the empty set. As next_job fields are filled, L is implicitly
         * built up; tail points to the tail of L. */

        struct mali_job_descriptor_header *tail = NULL;
        /* We iterate, popping off elements of S. A simple foreach won't do,
         * since we mutate S as we go (even adding elements) */

        unsigned arr_size = BITSET_WORDS(node_count);

        for (unsigned node_n_1 = __bitset_ffs(no_incoming, arr_size);
                        (node_n_1 != 0);
                        node_n_1 = __bitset_ffs(no_incoming, arr_size)) {

                unsigned node_n = node_n_1 - 1;

                /* We've got a node n, pop it off */
                BITSET_CLEAR(no_incoming, node_n);

                /* Add it to the list */
                struct mali_job_descriptor_header *n =
                        DESCRIPTOR_FOR_NODE(node_n);

                mali_ptr addr = GPU_ADDRESS_FOR_NODE(node_n);

                if (tail) {
                        /* Link us to the last node */
                        panfrost_set_job_next(tail, addr);
                } else {
                        /* We are the first/last node */
                        batch->first_job.cpu = (uint8_t *) n;
                        batch->first_job.gpu = addr;
                }

                tail = n;
                /* Scan dependencies */
                for (unsigned node_m = 0; node_m < node_count; ++node_m) {
                        struct mali_job_descriptor_header *m =
                                DESCRIPTOR_FOR_NODE(node_m);

                        /* Get the deps, accounting for removal */
                        unsigned dep_1 = m->job_dependency_index_1;
                        unsigned dep_2 = m->job_dependency_index_2;

                        if (BITSET_TEST(edge_removal_1, node_m))
                                dep_1 = 0;

                        if (BITSET_TEST(edge_removal_2, node_m))
                                dep_2 = 0;

                        /* Pretend to remove edges */
                        if (dep_1 == node_n_1) {
                                BITSET_SET(edge_removal_1, node_m);
                                dep_1 = 0;
                        } else if (dep_2 == node_n_1) {
                                BITSET_SET(edge_removal_2, node_m);
                                dep_2 = 0;
                        } else {
                                /* This node has no relevant dependencies */
                                continue;
                        }

                        /* Are there edges left? If not, add us to S */
                        bool has_edges = dep_1 || dep_2;

                        if (!has_edges)
                                BITSET_SET(no_incoming, node_m);
                }
        }

        free(no_incoming);
        free(edge_removal_1);
        free(edge_removal_2);
}