panfrost: Stop passing screen around for BO operations
[mesa.git] src/gallium/drivers/panfrost/pan_scoreboard.c
/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include "pan_context.h"
#include "pan_job.h"
#include "pan_allocate.h"
#include "util/bitset.h"

/*
 * Within a batch (panfrost_job), there are various types of Mali jobs:
 *
 * - SET_VALUE: initializes the tiler
 * - VERTEX: runs a vertex shader
 * - TILER: runs tiling and sets up a fragment shader
 * - FRAGMENT: runs fragment shaders and writes out
 * - COMPUTE: runs a compute shader
 * - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost)
 * - GEOMETRY: runs a geometry shader (unimplemented)
 * - CACHE_FLUSH: unseen in the wild, theoretically a cache flush
 *
 * In between a full batch and a single Mali job is the "job chain", a series
 * of Mali jobs together forming a linked list. Within the job chain, each Mali
 * job can set (up to) two dependencies on other earlier jobs in the chain.
 * This dependency graph forms a scoreboard. The general idea of a scoreboard
 * applies: when there is a data dependency of job B on job A, job B sets one
 * of its dependency indices to job A, ensuring that job B won't start until
 * job A finishes.
 *
 * More specifically, here are a set of rules:
 *
 * - A set value job must appear if and only if there is at least one tiler job.
 *
 * - Vertex jobs and tiler jobs are independent.
 *
 * - A tiler job must have a dependency on its data source. If it's getting
 *   data from a vertex job, it depends on the vertex job. If it's getting data
 *   from software, this is null.
 *
 * - The first vertex job used as the input to tiling must depend on the set
 *   value job, if it is present.
 *
 * - Tiler jobs must be strictly ordered. So each tiler job must depend on the
 *   previous tiler job in the chain.
 *
 * - Linking jobs via next_job has no bearing on the order of execution;
 *   rather, it just establishes the linked list of jobs, EXCEPT:
 *
 * - A job's dependencies must appear earlier in the linked list (job chain).
 *
 * Justification for each rule:
 *
 * - Set value jobs set up tiling, essentially. If tiling occurs, they are
 *   needed; if it does not, we cannot emit them, since tiling would then
 *   partially occur, which is bad.
 *
 * - The hardware has no notion of a "vertex/tiler job" (at least not our
 *   hardware -- other revs have fused jobs, but -- crap, this just got even
 *   more complicated). They are independent units that take in data, process
 *   it, and spit out data.
 *
 * - Any job must depend on its data source, or risk a read-before-write
 *   hazard. Tiler jobs get their data from vertex jobs, ergo tiler jobs
 *   depend on the corresponding vertex job (if it's there).
 *
 * - In fact, tiling depends on the set value job, but tiler jobs depend on the
 *   corresponding vertex jobs and on each other, so this rule ensures each
 *   tiler job automatically depends on the set value job.
 *
 * - The tiler is not thread-safe; this dependency prevents race conditions
 *   between two different jobs trying to write to the tiler outputs at the
 *   same time.
 *
 * - Internally, jobs are scoreboarded; the next_job fields just form a linked
 *   list that allows the jobs to be read in, while the execution order comes
 *   from resolving the dependency fields instead.
 *
 * - The hardware cannot set a dependency on a job it doesn't know about yet,
 *   and dependencies are processed in order of the next_job fields.
 *
 */
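
/*
 * To make the rules concrete, here is a purely illustrative example (the job
 * indices are hypothetical, not captured from hardware). Consider a batch
 * containing two draws, each queued as a vertex/tiler pair, with the set
 * value job appended at link time as panfrost_scoreboard_set_value() does
 * below:
 *
 *   index  type        dep_1  dep_2
 *   -----  ----------  -----  -----
 *     1    VERTEX        -      -
 *     2    TILER         1      5    (its vertex; set value, added at link)
 *     3    VERTEX        -      -
 *     4    TILER         2      3    (previous tiler; its vertex)
 *     5    SET_VALUE     -      -
 *
 * Each tiler job depends on its data source and on the previous tiler job,
 * and the first tiler job picks up a dependency on the set value job, so all
 * tiling is transitively ordered after it. next_job merely links the jobs
 * into a chain; execution order is resolved from the dependency indices.
 */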

/* Accessor to set the next job field */

static void
panfrost_set_job_next(struct mali_job_descriptor_header *first, mali_ptr next)
{
        if (first->job_descriptor_size)
                first->next_job_64 = (u64) (uintptr_t) next;
        else
                first->next_job_32 = (u32) (uintptr_t) next;
}

/* Coerce a panfrost_transfer to a header */

static inline struct mali_job_descriptor_header *
job_descriptor_header(struct panfrost_transfer t)
{
        return (struct mali_job_descriptor_header *) t.cpu;
}

static void
panfrost_assign_index(
        struct panfrost_batch *batch,
        struct panfrost_transfer transfer)
{
        /* Assign the index */
        unsigned index = ++batch->job_index;
        job_descriptor_header(transfer)->job_index = index;
}

/* Helper to add a dependency to a job */

static void
panfrost_add_dependency(
        struct panfrost_transfer depender,
        struct panfrost_transfer dependent)
{
        struct mali_job_descriptor_header *first =
                job_descriptor_header(dependent);

        struct mali_job_descriptor_header *second =
                job_descriptor_header(depender);

        /* Look for an open slot */

        if (!second->job_dependency_index_1)
                second->job_dependency_index_1 = first->job_index;
        else if (!second->job_dependency_index_2)
                second->job_dependency_index_2 = first->job_index;
        else
                unreachable("No available slot for new dependency");
}

/* Queues a job WITHOUT updating pointers. Be careful. */

static void
panfrost_scoreboard_queue_job_internal(
        struct panfrost_batch *batch,
        struct panfrost_transfer job)
{
        panfrost_assign_index(batch, job);

        /* Queue a pointer to the job */
        util_dynarray_append(&batch->headers, void*, job.cpu);
        util_dynarray_append(&batch->gpu_headers, mali_ptr, job.gpu);
}

/* Queues a compute job, with no special dependencies. This is a bit of a
 * misnomer -- internally, all job types are queued with this function, but
 * outside of this file, it's for pure compute jobs */

void
panfrost_scoreboard_queue_compute_job(
        struct panfrost_batch *batch,
        struct panfrost_transfer job)
{
        panfrost_scoreboard_queue_job_internal(batch, job);

        /* Update the linked list metadata as appropriate */
        batch->last_job = job;

        if (!batch->first_job.gpu)
                batch->first_job = job;
}

/* Queues a vertex job. There are no special dependencies yet, but if
 * tiling is required (anytime 'rasterize discard' is disabled), we have
 * some extra bookkeeping for later */

void
panfrost_scoreboard_queue_vertex_job(
        struct panfrost_batch *batch,
        struct panfrost_transfer vertex,
        bool requires_tiling)
{
        panfrost_scoreboard_queue_compute_job(batch, vertex);

        if (requires_tiling && !batch->first_vertex_for_tiler.gpu)
                batch->first_vertex_for_tiler = vertex;
}

/* Queues a tiler job, respecting the dependency of each tiler job on the
 * previous */

void
panfrost_scoreboard_queue_tiler_job(
        struct panfrost_batch *batch,
        struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_compute_job(batch, tiler);

        if (!batch->first_tiler.gpu)
                batch->first_tiler = tiler;

        if (batch->last_tiler.gpu)
                panfrost_add_dependency(tiler, batch->last_tiler);

        batch->last_tiler = tiler;
}

/* Queues a fused (vertex/tiler) job, or a pair of vertex/tiler jobs if
 * fused jobs are not supported (the default until Bifrost rolls out) */

void
panfrost_scoreboard_queue_fused_job(
        struct panfrost_batch *batch,
        struct panfrost_transfer vertex,
        struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_vertex_job(batch, vertex, true);
        panfrost_scoreboard_queue_tiler_job(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
}

/* Queues a fused (vertex/tiler) job prepended *before* the usual set, used for
 * wallpaper blits */

void
panfrost_scoreboard_queue_fused_job_prepend(
        struct panfrost_batch *batch,
        struct panfrost_transfer vertex,
        struct panfrost_transfer tiler)
{
        /* Sanity check */
        assert(batch->last_tiler.gpu);
        assert(batch->first_tiler.gpu);

        /* First, we add the vertex job directly to the queue, forcing it to
         * the front */

        panfrost_scoreboard_queue_job_internal(batch, vertex);
        batch->first_job = vertex;
        batch->first_vertex_for_tiler = vertex;

        /* Similarly, we add the tiler job directly to the queue, forcing it to
         * the front (second place), manually setting the tiler-on-vertex
         * dependency (since this is pseudofused) and forcing a dependency of
         * the now-second tiler on us (since all tiler jobs are linked in order
         * and we're injecting ourselves at the front) */

        panfrost_scoreboard_queue_job_internal(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
        panfrost_add_dependency(batch->first_tiler, tiler);
        batch->first_tiler = tiler;
}

/* Generates a set value job, used below as part of TILER job scheduling. */

static struct panfrost_transfer
panfrost_set_value_job(struct panfrost_batch *batch, mali_ptr polygon_list)
{
        struct mali_job_descriptor_header job = {
                .job_type = JOB_TYPE_SET_VALUE,
                .job_descriptor_size = 1,
        };

        struct mali_payload_set_value payload = {
                .out = polygon_list,
                .unknown = 0x3,
        };

        struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sizeof(job) + sizeof(payload));
        memcpy(transfer.cpu, &job, sizeof(job));
        memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));

        return transfer;
}

/* If there are any tiler jobs, there needs to be a corresponding set value job
 * linked to the first vertex job feeding into tiling. */

static void
panfrost_scoreboard_set_value(struct panfrost_batch *batch)
{
        /* Check if we even need tiling */
        if (!batch->last_tiler.gpu)
                return;

        /* Okay, we do. Let's generate it. We'll need the job's polygon list
         * regardless of size. */

        mali_ptr polygon_list = panfrost_batch_get_polygon_list(batch, 0);

        struct panfrost_transfer job =
                panfrost_set_value_job(batch, polygon_list);

        /* Queue it */
        panfrost_scoreboard_queue_compute_job(batch, job);

        /* Tiler jobs need us */
        panfrost_add_dependency(batch->first_tiler, job);
}

/* Once all jobs have been added to a batch and we're ready to submit, we need
 * to order them to set each of the next_job fields, obeying the golden rule:
 * "A job's dependencies must appear earlier in the job chain than the job
 * itself". Fortunately, computing this job chain is a well-studied graph
 * theory problem known as "topological sorting", which has linear-time
 * algorithms. We let each job represent a node and each dependency a directed
 * edge, so the entire set of jobs forms a dependency graph. This graph is
 * inherently acyclic, as otherwise there would be unresolvable dependencies.
 *
 * We implement Kahn's algorithm here to compute the next_job chain:
 * https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
 *
 * A few implementation notes: we represent S explicitly with a bitset, L
 * implicitly in the next_job fields. The indices of the bitset are off by one:
 * nodes are numbered [0, node_count - 1], whereas job_index in the hardware
 * and the dependency fields are numbered [1, node_count].
 *
 * We represent edge removal implicitly with another pair of bitsets, rather
 * than explicitly removing the edges, since we need to keep the dependencies
 * there for the hardware.
 */
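
/*
 * As an illustrative trace (the job indices are hypothetical), apply the sort
 * to the two-draw example near the top of this file: jobs 1, 3 and 5 have no
 * incoming edges, so S starts as {1, 3, 5}. Popping in bitset order emits 1,
 * then 3, then 5; removing the edge from 5 leaves job 2 with no remaining
 * dependencies, so 2 is emitted next, which in turn frees job 4. The resulting
 * next_job chain is 1 -> 3 -> 5 -> 2 -> 4, and every job's dependencies indeed
 * appear earlier in the chain than the job itself.
 */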

#define DESCRIPTOR_FOR_NODE(count) \
        *(util_dynarray_element(&batch->headers, \
                struct mali_job_descriptor_header*, count))

#define GPU_ADDRESS_FOR_NODE(count) \
        *(util_dynarray_element(&batch->gpu_headers, \
                mali_ptr, count))

void
panfrost_scoreboard_link_batch(struct panfrost_batch *batch)
{
        /* Finalize the batch */
        panfrost_scoreboard_set_value(batch);

        /* Let no_incoming represent the set S described. */

        unsigned node_count = batch->job_index;

        size_t sz = BITSET_WORDS(node_count) * sizeof(BITSET_WORD);
        BITSET_WORD *no_incoming = calloc(sz, 1);

        /* Sets for edges being removed in dep 1 or 2 respectively */

        BITSET_WORD *edge_removal_1 = calloc(sz, 1);
        BITSET_WORD *edge_removal_2 = calloc(sz, 1);

        /* We compute no_incoming by traversing the batch. Simultaneously, we
         * would like to keep track of a reversed version of the dependency
         * graph, mapping each node to its dependent. Dependency indices are
         * 16-bit and in practice (for ES3.0, at least), we can guarantee a
         * given node will be depended on by no more than one other node.
         * Proof:
         *
         * Proposition: Given a node N of type T, no more than one other node
         * depends on N.
         *
         * If type is SET_VALUE: The only dependency added against us is from
         * the first tiler job, so there is 1 dependent.
         *
         * If type is VERTEX: If there is a tiler node, that tiler node depends
         * on us; if there is not (transform feedback), nothing depends on us.
         * Therefore there is at most 1 dependent.
         *
         * If type is TILER: If there is another TILER job in succession, that
         * node depends on us. No other job type depends on us. Therefore there
         * is at most 1 dependent.
         *
         * If type is FRAGMENT: This type cannot be in a primary chain, so it
         * is irrelevant. Just for kicks, nobody would depend on us, so there
         * are zero dependents, so it holds anyway.
         *
         * TODO: Revise this logic for ES3.1 and above. This result may not
         * hold for COMPUTE/FUSED/GEOMETRY jobs; we might need to special-case
         * those. Can FBO dependencies be expressed within a chain?
         * ---
         *
         * Point is, we only need to hold a single dependent, which is a pretty
         * helpful result.
         */
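
        /* For the illustrative two-draw example above (hypothetical indices),
         * the dependents array built just below would end up as
         * { 2, 4, 4, 0, 2 }: job 1 is depended on by job 2, jobs 2 and 3 by
         * job 4, job 4 by nobody, and the set value job 5 by job 2. */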

        unsigned *dependents = calloc(node_count, sizeof(unsigned));

        for (unsigned i = 0; i < node_count; ++i) {
                struct mali_job_descriptor_header *node = DESCRIPTOR_FOR_NODE(i);

                unsigned dep_1 = node->job_dependency_index_1;
                unsigned dep_2 = node->job_dependency_index_2;

                /* Record no_incoming info for this node */

                if (!(dep_1 || dep_2))
                        BITSET_SET(no_incoming, i);

                /* Record this node as the dependent of each of its
                 * dependencies */

                if (dep_1) {
                        assert(!dependents[dep_1 - 1]);
                        dependents[dep_1 - 1] = i + 1;
                }

                if (dep_2) {
                        assert(!dependents[dep_2 - 1]);
                        dependents[dep_2 - 1] = i + 1;
                }
        }

        /* No next_job fields are set at the beginning, so L is implicitly the
         * empty set. As the next_job fields are filled in, L is implicitly
         * built up. tail tracks the tail of L. */

        struct mali_job_descriptor_header *tail = NULL;

        /* We iterate, popping off elements of S. A simple foreach won't do,
         * since we mutate S as we go (even adding elements) */

        unsigned arr_size = BITSET_WORDS(node_count);

        for (unsigned node_n_1 = __bitset_ffs(no_incoming, arr_size);
             (node_n_1 != 0);
             node_n_1 = __bitset_ffs(no_incoming, arr_size)) {

                unsigned node_n = node_n_1 - 1;

                /* We've got a node n, pop it off */
                BITSET_CLEAR(no_incoming, node_n);

                /* Add it to the list */
                struct mali_job_descriptor_header *n =
                        DESCRIPTOR_FOR_NODE(node_n);

                mali_ptr addr = GPU_ADDRESS_FOR_NODE(node_n);

                if (tail) {
                        /* Link us to the last node */
                        panfrost_set_job_next(tail, addr);
                } else {
                        /* We are the first/last node */
                        batch->first_job.cpu = (uint8_t *) n;
                        batch->first_job.gpu = addr;
                }

                tail = n;

                /* Grab the dependent, if there is one */
                unsigned node_m_1 = dependents[node_n];

                if (node_m_1) {
                        unsigned node_m = node_m_1 - 1;

                        struct mali_job_descriptor_header *m =
                                DESCRIPTOR_FOR_NODE(node_m);

                        /* Get the deps, accounting for removal */
                        unsigned dep_1 = m->job_dependency_index_1;
                        unsigned dep_2 = m->job_dependency_index_2;

                        if (BITSET_TEST(edge_removal_1, node_m))
                                dep_1 = 0;

                        if (BITSET_TEST(edge_removal_2, node_m))
                                dep_2 = 0;

                        /* Pretend to remove edges */
                        if (dep_1 == node_n_1) {
                                BITSET_SET(edge_removal_1, node_m);
                                dep_1 = 0;
                        } else if (dep_2 == node_n_1) {
                                BITSET_SET(edge_removal_2, node_m);
                                dep_2 = 0;
                        } else {
                                /* This node has no relevant dependencies */
                                assert(0);
                        }

                        /* Are there edges left? If not, add us to S */
                        bool has_edges = dep_1 || dep_2;

                        if (!has_edges)
                                BITSET_SET(no_incoming, node_m);
                }
        }

        /* Cleanup */
        free(no_incoming);
        free(dependents);
        free(edge_removal_1);
        free(edge_removal_2);
}