/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include "pan_context.h"
#include "pan_job.h"
#include "pan_allocate.h"
#include "util/bitset.h"

/*
 * Within a batch (panfrost_job), there are various types of Mali jobs:
 *
 * - SET_VALUE: initializes tiler
 * - VERTEX: runs a vertex shader
 * - TILER: runs tiling and sets up a fragment shader
 * - FRAGMENT: runs fragment shaders and writes out
 * - COMPUTE: runs a compute shader
 * - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost)
 * - GEOMETRY: runs a geometry shader (unimplemented)
 * - CACHE_FLUSH: unseen in the wild, theoretically cache flush
 *
 * In between a full batch and a single Mali job is the "job chain", a series
 * of Mali jobs together forming a linked list. Within the job chain, each Mali
 * job can set (up to) two dependencies on other earlier jobs in the chain.
 * This dependency graph forms a scoreboard. The general idea of a scoreboard
 * applies: when there is a data dependency of job B on job A, job B sets one
 * of its dependency indices to job A, ensuring that job B won't start until
 * job A finishes.
 *
 * More specifically, here are a set of rules:
 *
 * - A set value job must appear if and only if there is at least one tiler
 *   job.
 *
 * - Vertex jobs and tiler jobs are independent.
 *
 * - A tiler job must have a dependency on its data source. If it's getting
 *   data from a vertex job, it depends on the vertex job. If it's getting
 *   data from software, this is null.
 *
 * - The first vertex job used as the input to tiling must depend on the set
 *   value job, if it is present.
 *
 * - Tiler jobs must be strictly ordered. So each tiler job must depend on the
 *   previous tiler job in the chain.
 *
 * - Linking jobs via next_job has no bearing on the order of execution; it
 *   just establishes the linked list of jobs, EXCEPT:
 *
 * - A job's dependencies must appear earlier in the linked list (job chain).
 *
 * Justification for each rule:
 *
 * - Set value jobs set up tiling, essentially. If tiling occurs, they are
 *   needed; if it does not, we must not emit one, since tiling would then
 *   partially occur, which is bad.
 *
 * - The hardware has no notion of a "vertex/tiler job" (at least not our
 *   hardware -- other revisions have fused vertex/tiler jobs, which
 *   complicates matters). Vertex and tiler jobs are independent units that
 *   take in data, process it, and spit out data.
 *
 * - Any job must depend on its data source, in fact, or risk a
 *   read-before-write hazard. Tiler jobs get their data from vertex jobs,
 *   ergo tiler jobs depend on the corresponding vertex job (if it's there).
 *
 * - Tiling as a whole depends on the set value job; since tiler jobs depend
 *   on the corresponding vertex jobs and on each other, this rule ensures
 *   each tiler job automatically depends on the set value job.
 *
 * - The tiler is not thread-safe; this dependency prevents race conditions
 *   between two different jobs trying to write to the tiler outputs at the
 *   same time.
 *
 * - Internally, jobs are scoreboarded; the next job fields just form a linked
 *   list to allow the jobs to be read in; the execution order is determined
 *   by resolving the dependency fields instead.
 *
 * - The hardware cannot set a dependency on a job it doesn't know about yet,
 *   and dependencies are processed in order of the next job fields.
 *
 */
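
/* As a purely illustrative sketch (hypothetical indices, not taken from a
 * real trace), two back-to-back draws queued through the fused vertex/tiler
 * path could end up with job indices and dependency slots like:
 *
 *   VERTEX     index 1   deps: -, -
 *   TILER      index 2   deps: 1, 5   (its vertex job; the set value job)
 *   VERTEX     index 3   deps: -, -
 *   TILER      index 4   deps: 2, 3   (the previous tiler; its vertex job)
 *   SET_VALUE  index 5   deps: -, -   (queued at link time by
 *                                      panfrost_scoreboard_set_value)
 *
 * Note that job 2 depends on job 5, which is queued later; the topological
 * sort in panfrost_scoreboard_link_batch is what guarantees job 5 still
 * appears earlier in the final next_job chain. */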

/* Accessor to set the next job field */

static void
panfrost_set_job_next(struct mali_job_descriptor_header *first, mali_ptr next)
{
        if (first->job_descriptor_size)
                first->next_job_64 = (u64) (uintptr_t) next;
        else
                first->next_job_32 = (u32) (uintptr_t) next;
}

/* Coerce a panfrost_transfer to a header */

static inline struct mali_job_descriptor_header *
job_descriptor_header(struct panfrost_transfer t)
{
        return (struct mali_job_descriptor_header *) t.cpu;
}

static void
panfrost_assign_index(
                struct panfrost_job *job,
                struct panfrost_transfer transfer)
{
        /* Assign the index */
        unsigned index = ++job->job_index;
        job_descriptor_header(transfer)->job_index = index;
}

/* Helper to add a dependency to a job: makes `depender` wait on `dependent`
 * by writing dependent's job index into one of depender's two dependency
 * slots */

static void
panfrost_add_dependency(
                struct panfrost_transfer depender,
                struct panfrost_transfer dependent)
{
        struct mali_job_descriptor_header *first =
                job_descriptor_header(dependent);

        struct mali_job_descriptor_header *second =
                job_descriptor_header(depender);

        /* Ensure we're ready for dependencies */
        assert(second->job_index);
        assert(first->job_index);

        /* Look for an open slot */

        if (!second->job_dependency_index_1)
                second->job_dependency_index_1 = first->job_index;
        else if (!second->job_dependency_index_2)
                second->job_dependency_index_2 = first->job_index;
        else
                unreachable("No available slot for new dependency");
}

/* Queues a job WITHOUT updating pointers. Be careful. */

static void
panfrost_scoreboard_queue_job_internal(
                struct panfrost_job *batch,
                struct panfrost_transfer job)
{
        panfrost_assign_index(batch, job);

        /* Queue a pointer to the job */
        util_dynarray_append(&batch->headers, void*, job.cpu);
        util_dynarray_append(&batch->gpu_headers, mali_ptr, job.gpu);
}


/* Queues a compute job, with no special dependencies. This is a bit of a
 * misnomer -- internally, all job types are queued with this function, but
 * outside of this file, it's for pure compute jobs */

void
panfrost_scoreboard_queue_compute_job(
                struct panfrost_job *batch,
                struct panfrost_transfer job)
{
        panfrost_scoreboard_queue_job_internal(batch, job);

        /* Update the linked list metadata as appropriate */
        batch->last_job = job;

        if (!batch->first_job.gpu)
                batch->first_job = job;
}

/* Queues a vertex job. There are no special dependencies yet, but if
 * tiling is required (anytime 'rasterize discard' is disabled), we have
 * some extra bookkeeping for later */

void
panfrost_scoreboard_queue_vertex_job(
                struct panfrost_job *batch,
                struct panfrost_transfer vertex,
                bool requires_tiling)
{
        panfrost_scoreboard_queue_compute_job(batch, vertex);

        if (requires_tiling && !batch->first_vertex_for_tiler.gpu)
                batch->first_vertex_for_tiler = vertex;
}

/* Queues a tiler job, respecting the dependency of each tiler job on the
 * previous */

void
panfrost_scoreboard_queue_tiler_job(
                struct panfrost_job *batch,
                struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_compute_job(batch, tiler);

        if (!batch->first_tiler.gpu)
                batch->first_tiler = tiler;

        if (batch->last_tiler.gpu)
                panfrost_add_dependency(tiler, batch->last_tiler);

        batch->last_tiler = tiler;
}

/* Queues a fused (vertex/tiler) job, or a pair of vertex/tiler jobs if
 * fused jobs are not supported (the default until Bifrost rolls out) */

void
panfrost_scoreboard_queue_fused_job(
                struct panfrost_job *batch,
                struct panfrost_transfer vertex,
                struct panfrost_transfer tiler)
{
        panfrost_scoreboard_queue_vertex_job(batch, vertex, true);
        panfrost_scoreboard_queue_tiler_job(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
}

/* Queues a fused (vertex/tiler) job prepended *before* the usual set, used for
 * wallpaper blits (see the illustrative sketch after this function) */

void
panfrost_scoreboard_queue_fused_job_prepend(
                struct panfrost_job *batch,
                struct panfrost_transfer vertex,
                struct panfrost_transfer tiler)
{
        /* Sanity check */
        assert(batch->last_tiler.gpu);
        assert(batch->first_tiler.gpu);

        /* First, we add the vertex job directly to the queue, forcing it to
         * the front */

        panfrost_scoreboard_queue_job_internal(batch, vertex);
        batch->first_job = vertex;
        batch->first_vertex_for_tiler = vertex;

        /* Similarly, we add the tiler job directly to the queue, forcing it to
         * the front (second place), manually setting the tiler on vertex
         * dependency (since this is pseudofused) and forcing a dependency of
         * the now-second tiler on us (since all tiler jobs are linked in order
         * and we're injecting ourselves at the front) */

        panfrost_scoreboard_queue_job_internal(batch, tiler);
        panfrost_add_dependency(tiler, vertex);
        panfrost_add_dependency(batch->first_tiler, tiler);
        batch->first_tiler = tiler;
}
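
/* As an illustrative sketch (hypothetical job names, not from a real trace):
 * if the batch previously started with a tiler job T1 depending only on its
 * vertex job V1, prepending a wallpaper pair (Vw, Tw) leaves Tw depending on
 * Vw, makes T1 additionally depend on Tw, and promotes Vw and Tw to
 * first_job / first_tiler, so the wallpaper's tiling is ordered before all of
 * the batch's other tiler work. */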

/* Generates a set value job, used below as part of TILER job scheduling. */

static struct panfrost_transfer
panfrost_set_value_job(struct panfrost_context *ctx, mali_ptr polygon_list)
{
        struct mali_job_descriptor_header job = {
                .job_type = JOB_TYPE_SET_VALUE,
                .job_descriptor_size = 1,
        };

        struct mali_payload_set_value payload = {
                .out = polygon_list,
                .unknown = 0x3,
        };

        struct panfrost_transfer transfer = panfrost_allocate_transient(ctx, sizeof(job) + sizeof(payload));
        memcpy(transfer.cpu, &job, sizeof(job));
        memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));

        return transfer;
}

/* If there are any tiler jobs, there needs to be a corresponding set value
 * job, which the first tiler job is made to depend on. */

static void
panfrost_scoreboard_set_value(struct panfrost_job *batch)
{
        /* Check if we even need tiling */
        if (!batch->last_tiler.gpu)
                return;

        /* Okay, we do. Let's generate it */

        struct panfrost_context *ctx = batch->ctx;
        mali_ptr polygon_list = ctx->tiler_polygon_list.bo->gpu;

        struct panfrost_transfer job =
                panfrost_set_value_job(ctx, polygon_list);

        /* Queue it */
        panfrost_scoreboard_queue_compute_job(batch, job);

        /* Tiler jobs need us */
        panfrost_add_dependency(batch->first_tiler, job);
}

/* Once all jobs have been added to a batch and we're ready to submit, we need
 * to order them to set each of the next_job fields, obeying the golden rule:
 * "A job's dependencies must appear earlier in the job chain than itself".
 * Fortunately, computing this job chain is a well-studied graph theory problem
 * known as "topological sorting", which has linear time algorithms. We let
 * each job represent a node, each dependency a directed edge, and the entire
 * set of jobs be the dependency graph. This graph is inherently acyclic, as
 * otherwise there would be unresolvable dependencies.
 *
 * We implement Kahn's algorithm here to compute the next_job chain:
 * https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
 *
 * A few implementation notes: we represent S explicitly with a bitset, L
 * implicitly in the next_job fields. The indices of the bitset are off-by-one:
 * nodes are numbered [0, node_count - 1], whereas in reality job_index in the
 * hardware and dependencies are [1, node_count].
 *
 * We represent edge removal implicitly with another pair of bitsets, rather
 * than explicitly removing the edges, since we need to keep the dependencies
 * there for the hardware.
 */
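
/* A small hypothetical example (not from a real trace): with three jobs where
 * job_index 1 has no dependencies, job_index 2 depends on 1, and job_index 3
 * depends on 1 and 2, the corresponding bitset nodes are 0, 1 and 2. S starts
 * as {0}; popping node 0 links job 1 first and "removes" its outgoing edges,
 * which puts node 1 into S; popping node 1 links job 2 and frees node 2; the
 * resulting next_job chain is 1 -> 2 -> 3, satisfying the golden rule. */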

#define DESCRIPTOR_FOR_NODE(count) \
        *(util_dynarray_element(&batch->headers, \
                struct mali_job_descriptor_header*, count))

#define GPU_ADDRESS_FOR_NODE(count) \
        *(util_dynarray_element(&batch->gpu_headers, \
                mali_ptr, count))

void
panfrost_scoreboard_link_batch(struct panfrost_job *batch)
{
        /* Finalize the batch */
        panfrost_scoreboard_set_value(batch);

        /* Let no_incoming represent the set S described. */

        unsigned node_count = batch->job_index;

        size_t sz = BITSET_WORDS(node_count) * sizeof(BITSET_WORD);
        BITSET_WORD *no_incoming = calloc(sz, 1);

        /* Sets for edges being removed in dep 1 or 2 respectively */

        BITSET_WORD *edge_removal_1 = calloc(sz, 1);
        BITSET_WORD *edge_removal_2 = calloc(sz, 1);

        /* We compute no_incoming by traversing the batch. */

        for (unsigned i = 0; i < node_count; ++i) {
                struct mali_job_descriptor_header *node = DESCRIPTOR_FOR_NODE(i);

                unsigned dep_1 = node->job_dependency_index_1;
                unsigned dep_2 = node->job_dependency_index_2;

                if (!(dep_1 || dep_2))
                        BITSET_SET(no_incoming, i);
        }

        /* No next_job fields are set at the beginning, so L is implicitly the
         * empty set. As next_job fields are filled, L is implicitly set. Tail
         * is the tail of L, however. */

        struct mali_job_descriptor_header *tail = NULL;

        /* We iterate, popping off elements of S. A simple foreach won't do,
         * since we mutate S as we go (even adding elements) */

        unsigned arr_size = BITSET_WORDS(node_count);

        for (unsigned node_n_1 = __bitset_ffs(no_incoming, arr_size);
                        (node_n_1 != 0);
                        node_n_1 = __bitset_ffs(no_incoming, arr_size)) {

                unsigned node_n = node_n_1 - 1;

                /* We've got a node n, pop it off */
                BITSET_CLEAR(no_incoming, node_n);

                /* Add it to the list */
                struct mali_job_descriptor_header *n =
                        DESCRIPTOR_FOR_NODE(node_n);

                mali_ptr addr = GPU_ADDRESS_FOR_NODE(node_n);

                if (tail) {
                        /* Link us to the last node */
                        panfrost_set_job_next(tail, addr);
                } else {
                        /* We are the first node (and, so far, the last) */
                        batch->first_job.cpu = (uint8_t *) n;
                        batch->first_job.gpu = addr;
                }

                tail = n;

                /* Scan dependencies */
                for (unsigned node_m = 0; node_m < node_count; ++node_m) {
                        struct mali_job_descriptor_header *m =
                                DESCRIPTOR_FOR_NODE(node_m);

                        /* Get the deps, accounting for removal */
                        unsigned dep_1 = m->job_dependency_index_1;
                        unsigned dep_2 = m->job_dependency_index_2;

                        if (BITSET_TEST(edge_removal_1, node_m))
                                dep_1 = 0;

                        if (BITSET_TEST(edge_removal_2, node_m))
                                dep_2 = 0;

                        /* Pretend to remove edges */
                        if (dep_1 == node_n_1) {
                                BITSET_SET(edge_removal_1, node_m);
                                dep_1 = 0;
                        } else if (dep_2 == node_n_1) {
                                BITSET_SET(edge_removal_2, node_m);
                                dep_2 = 0;
                        } else {
                                /* This node has no relevant dependencies */
                                continue;
                        }

                        /* Are there edges left? If not, add us to S */
                        bool has_edges = dep_1 || dep_2;

                        if (!has_edges)
                                BITSET_SET(no_incoming, node_m);
                }
        }

        /* The bitsets were only needed while sorting; the dependency and
         * next_job fields now carry everything the hardware needs */
        free(no_incoming);
        free(edge_removal_1);
        free(edge_removal_2);
}