panfrost/midgard: Use _safe iterator
[mesa.git] src/panfrost/midgard/midgard_schedule.c
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "midgard_ops.h"
#include "util/u_memory.h"

/* Create a mask of accessed components from a swizzle to figure out vector
 * dependencies */

static unsigned
swizzle_to_access_mask(unsigned swizzle)
{
        unsigned component_mask = 0;

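        /* Swizzles pack four 2-bit component selects; gather each selected
         * component into the mask */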
        for (int i = 0; i < 4; ++i) {
                unsigned c = (swizzle >> (2 * i)) & 3;
                component_mask |= (1 << c);
        }

        return component_mask;
}

/* Does the mask cover exactly one component, i.e. is the operation
 * effectively scalar? */

static bool
is_single_component_mask(unsigned mask)
{
        int components = 0;

        for (int c = 0; c < 8; ++c) {
                if (mask & (1 << c))
                        components++;
        }

        return components == 1;
}

/* Checks for an SSA data hazard between two adjacent instructions, keeping in
 * mind that we are a vector architecture and we can write to different
 * components simultaneously */

static bool
can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
{
        /* Each instruction reads some registers and writes to a register. See
         * where the first writes */

        /* Figure out where exactly we wrote to */
        int source = first->ssa_args.dest;
        int source_mask = first->mask;

        /* As long as the second doesn't read from the first, we're okay */
        if (second->ssa_args.src0 == source) {
                if (first->type == TAG_ALU_4) {
                        /* Figure out which components the second instruction
                         * reads from the first's destination. Note that
                         * ssa_args.src0 is encoded by the alu.src1 field (and
                         * ssa_args.src1 by alu.src2). */

                        int q = second->alu.src1;
                        midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;

                        /* Check if there are components in common, and fail if so */
                        if (swizzle_to_access_mask(m->swizzle) & source_mask)
                                return false;
                } else
                        return false;
        }

        if (second->ssa_args.src1 == source)
                return false;

        /* Otherwise, it's safe in that regard. Another data hazard is both
         * writing to the same place, of course */

        if (second->ssa_args.dest == source) {
                /* ...but only if the components overlap */

                if (second->mask & source_mask)
                        return false;
        }

        /* ...That's it */
        return true;
}

static bool
midgard_has_hazard(
        midgard_instruction **segment, unsigned segment_size,
        midgard_instruction *ains)
{
        for (int s = 0; s < segment_size; ++s)
                if (!can_run_concurrent_ssa(segment[s], ains))
                        return true;

        return false;
}

/* Schedules, but does not emit, a single bundle rooted at the given
 * instruction. After scheduling, the final tag and size of the bundle are
 * known, which are necessary for branching */

static midgard_bundle
schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
{
        int instructions_emitted = 0, packed_idx = 0;
        midgard_bundle bundle = { 0 };

        uint8_t tag = ins->type;

        /* Default to the instruction's tag */
        bundle.tag = tag;

        switch (ins->type) {
        case TAG_ALU_4: {
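                /* control accumulates the unit-enable bits of every
                 * instruction placed in this bundle; it both keeps two
                 * instructions off the same unit and, at the end, forms the
                 * bundle's control word. bytes_emitted tracks how large the
                 * packed bundle will be, which determines the tag. */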
                uint32_t control = 0;
                size_t bytes_emitted = sizeof(control);

                /* TODO: Constant combining */
                int index = 0, last_unit = 0;

                /* Previous instructions, for the purpose of parallelism */
                midgard_instruction *segment[4] = {0};
                int segment_size = 0;

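                /* instructions_emitted counts instructions folded into this
                 * bundle beyond the first one, so it doubles as the caller's
                 * skip count. Start at -1 so the first iteration below brings
                 * it to zero. */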
                instructions_emitted = -1;
                midgard_instruction *pins = ins;

                unsigned constant_count = 0;

                for (;;) {
                        midgard_instruction *ains = pins;

                        /* Advance instruction pointer */
                        if (index) {
                                ains = mir_next_op(pins);
                                pins = ains;
                        }

                        /* Out-of-work condition */
                        if ((struct list_head *) ains == &block->instructions)
                                break;

                        /* Ensure that the chain can continue */
                        if (ains->type != TAG_ALU_4) break;

                        /* If there's already something in the bundle and we
                         * have weird scheduler constraints, break now */
                        if (ains->precede_break && index) break;

                        /* According to the presentation "The ARM
                         * Mali-T880 Mobile GPU" from HotChips 27,
                         * there are two pipeline stages. Branching
                         * position determined experimentally. Lines
                         * are executed in parallel:
                         *
                         * [ VMUL ] [ SADD ]
                         * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
                         *
                         * Verify that there are no ordering dependencies here.
                         *
                         * TODO: Allow for parallelism!!!
                         */

                        /* Pick a unit for it if it doesn't force a particular unit */

                        int unit = ains->unit;

                        if (!unit) {
                                int op = ains->alu.op;
                                int units = alu_opcode_props[op].props;

                                bool scalarable = units & UNITS_SCALAR;
                                bool could_scalar = is_single_component_mask(ains->mask);

                                /* Only 16/32-bit can run on a scalar unit */
                                could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8;
                                could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64;
                                could_scalar &= ains->alu.dest_override == midgard_dest_override_none;

                                if (ains->alu.reg_mode == midgard_reg_mode_16) {
                                        /* If we're running in 16-bit mode, we
                                         * can't have any 8-bit sources on the
                                         * scalar unit (since the scalar unit
                                         * doesn't understand 8-bit) */

                                        midgard_vector_alu_src s1 =
                                                vector_alu_from_unsigned(ains->alu.src1);

                                        could_scalar &= !s1.half;

                                        if (!ains->ssa_args.inline_constant) {
                                                midgard_vector_alu_src s2 =
                                                        vector_alu_from_unsigned(ains->alu.src2);

                                                could_scalar &= !s2.half;
                                        }
                                }

                                bool scalar = could_scalar && scalarable;

                                /* TODO: Check ahead-of-time for other scalar
                                 * hazards that otherwise get aborted out */

                                if (scalar)
                                        assert(units & UNITS_SCALAR);

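                                /* Pick the unit: walk the options in pipeline
                                 * order and claim the first unit this op
                                 * supports that has not already been taken in
                                 * this bundle (tracked via control and
                                 * last_unit), bailing out of the bundle if
                                 * nothing is left. */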
                                if (!scalar) {
                                        if (last_unit >= UNIT_VADD) {
                                                if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL)
                                                        unit = UNIT_VMUL;
                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
                                                        unit = UNIT_VADD;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        }
                                } else {
                                        if (last_unit >= UNIT_VADD) {
                                                if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
                                                        unit = UNIT_SMUL;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
                                                        unit = UNIT_SADD;
                                                else if (units & UNIT_SMUL)
                                                        unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL;
                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
                                                        unit = UNIT_VADD;
                                                else
                                                        break;
                                        }
                                }

                                assert(unit & units);
                        }

                        /* Late unit check, this time for encoding (not parallelism) */
                        if (unit <= last_unit) break;

                        /* Crossing from the first pipeline stage (VMUL/SADD)
                         * into the second (VADD/SMUL/LUT) clears the hazard
                         * segment, since the stages execute back-to-back and
                         * the second stage may consume first-stage results */
                        if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
                                segment_size = 0;

                        if (midgard_has_hazard(segment, segment_size, ains))
                                break;

                        /* We're good to go -- emit the instruction */
                        ains->unit = unit;

                        segment[segment_size++] = ains;

                        /* We try to reuse constants if possible, by adjusting
                         * the swizzle */

                        if (ains->has_blend_constant) {
                                /* Everything conflicts with the blend constant */
                                if (bundle.has_embedded_constants)
                                        break;

                                bundle.has_blend_constant = 1;
                                bundle.has_embedded_constants = 1;
                        } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) {
                                /* TODO: DRY with the analysis pass */

                                if (bundle.has_blend_constant)
                                        break;

                                if (constant_count)
                                        break;

                                /* TODO: Fix packing XXX */
                                uint16_t *bundles = (uint16_t *) bundle.constants;
                                uint32_t *constants = (uint32_t *) ains->constants;

                                /* Copy them wholesale */
                                for (unsigned i = 0; i < 4; ++i)
                                        bundles[i] = constants[i];

                                bundle.has_embedded_constants = true;
                                constant_count = 4;
                        } else if (ains->has_constants) {
                                /* By definition, blend constants conflict with
                                 * everything, so if there are already
                                 * constants we break the bundle *now* */

                                if (bundle.has_blend_constant)
                                        break;

                                /* For anything but blend constants, we can do
                                 * proper analysis, however */

                                /* TODO: Mask by which are used */
                                uint32_t *constants = (uint32_t *) ains->constants;
                                uint32_t *bundles = (uint32_t *) bundle.constants;

                                uint32_t indices[4] = { 0 };
                                bool break_bundle = false;

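                                /* Map each of the instruction's constants onto
                                 * a slot of the bundle's embedded constant
                                 * quadword, reusing slots that already hold
                                 * the same value; the resulting indices become
                                 * a swizzle applied to the constant register
                                 * below */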
                                for (unsigned i = 0; i < 4; ++i) {
                                        uint32_t cons = constants[i];
                                        bool constant_found = false;

                                        /* Search for the constant */
                                        for (unsigned j = 0; j < constant_count; ++j) {
                                                if (bundles[j] != cons)
                                                        continue;

                                                /* We found it, reuse */
                                                indices[i] = j;
                                                constant_found = true;
                                                break;
                                        }

                                        if (constant_found)
                                                continue;

                                        /* We didn't find it, so allocate it */
                                        unsigned idx = constant_count++;

                                        if (idx >= 4) {
                                                /* Uh-oh, out of space */
                                                break_bundle = true;
                                                break;
                                        }

                                        /* We have space, copy it in! */
                                        bundles[idx] = cons;
                                        indices[i] = idx;
                                }

                                if (break_bundle)
                                        break;

                                /* Cool, we have it in. So use indices as a
                                 * swizzle */

                                unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
                                unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);

                                if (ains->ssa_args.src0 == r_constant)
                                        ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);

                                if (ains->ssa_args.src1 == r_constant)
                                        ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);

                                bundle.has_embedded_constants = true;
                        }

                        if (ains->unit & UNITS_ANY_VECTOR) {
                                bytes_emitted += sizeof(midgard_reg_info);
                                bytes_emitted += sizeof(midgard_vector_alu);
                        } else if (ains->compact_branch) {
                                /* All of r0 has to be written out along with
                                 * the branch writeout */

                                if (ains->writeout) {
                                        /* "Bare" writeout is only safe when
                                         * all components of r0 are written in
                                         * the final bundle, earlier than VLUT,
                                         * with any register dependencies of r0
                                         * coming from an earlier bundle. We
                                         * can't verify this before RA, so we
                                         * don't try. */

                                        if (index != 0)
                                                break;

                                        /* Inject a move */
                                        midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
                                        ins.unit = UNIT_VMUL;
                                        control |= ins.unit;

                                        /* TODO don't leak */
                                        midgard_instruction *move =
                                                mem_dup(&ins, sizeof(midgard_instruction));
                                        bytes_emitted += sizeof(midgard_reg_info);
                                        bytes_emitted += sizeof(midgard_vector_alu);
                                        bundle.instructions[packed_idx++] = move;
                                }

                                if (ains->unit == ALU_ENAB_BRANCH) {
                                        bytes_emitted += sizeof(midgard_branch_extended);
                                } else {
                                        bytes_emitted += sizeof(ains->br_compact);
                                }
                        } else {
                                bytes_emitted += sizeof(midgard_reg_info);
                                bytes_emitted += sizeof(midgard_scalar_alu);
                        }

                        /* Defer marking until after writing to allow for break */
                        control |= ains->unit;
                        last_unit = ains->unit;
                        ++instructions_emitted;
                        ++index;
                }

                int padding = 0;

                /* Pad ALU op to nearest word */

                if (bytes_emitted & 15) {
                        padding = 16 - (bytes_emitted & 15);
                        bytes_emitted += padding;
                }

                /* Constants must always be quadwords */
                if (bundle.has_embedded_constants)
                        bytes_emitted += 16;

                /* Size ALU instruction for tag */
                bundle.tag = TAG_ALU_4 + (bytes_emitted / 16) - 1;
                bundle.padding = padding;
                bundle.control = bundle.tag | control;

                break;
        }

        case TAG_LOAD_STORE_4: {
                /* A load/store bundle packs two words at once. If we only
                 * have one instruction queued up, we need to NOP pad.
                 * Otherwise, we pack both in succession to save space and
                 * cycles -- letting them go in parallel -- and skip the next
                 * instruction. The usefulness of this optimisation depends
                 * greatly on the quality of the instruction scheduler.
                 */

                midgard_instruction *next_op = mir_next_op(ins);

                if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
                        /* TODO: Concurrency check */
                        instructions_emitted++;
                }

                break;
        }

        case TAG_TEXTURE_4: {
                /* Which tag we use depends on the shader stage */
                bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
                bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
                break;
        }

        default:
                unreachable("Unknown tag");
                break;
        }

        /* Copy the instructions into the bundle */
        bundle.instruction_count = instructions_emitted + 1 + packed_idx;

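        /* packed_idx already counts any moves injected above, so the copy
         * starts after them and they stay at the front of the bundle */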
        midgard_instruction *uins = ins;
        for (; packed_idx < bundle.instruction_count; ++packed_idx) {
                bundle.instructions[packed_idx] = uins;
                uins = mir_next_op(uins);
        }

        *skip = instructions_emitted;

        return bundle;
}

/* Schedules a single block by iterating over its instructions to create
 * bundles. As we go, we tally the bundle sizes to compute the block size. */

static void
schedule_block(compiler_context *ctx, midgard_block *block)
{
        util_dynarray_init(&block->bundles, NULL);

        block->quadword_count = 0;

        mir_foreach_instr_in_block_safe(block, ins) {
                int skip;
                midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
                util_dynarray_append(&block->bundles, midgard_bundle, bundle);

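                /* Remember where the blend constant will live in the binary
                 * so it can be patched in later; each scheduled quadword is
                 * 16 (0x10) bytes */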
                if (bundle.has_blend_constant) {
                        /* TODO: Multiblock? */
                        int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1;
                        ctx->blend_constant_offset = quadwords_within_block * 0x10;
                }

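                /* Skip over the instructions that were folded into this
                 * bundle, so the (safe) iterator resumes at the next
                 * unscheduled instruction */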
                while (skip--)
                        ins = mir_next_op(ins);

                block->quadword_count += quadword_size(bundle.tag);
        }

        block->is_scheduled = true;
}

void
schedule_program(compiler_context *ctx)
{
        /* Schedule every block first; register allocation then runs on the
         * scheduled program */

        mir_foreach_block(ctx, block) {
                schedule_block(ctx, block);
        }

        /* Pipeline register creation is a prepass before RA */
        mir_create_pipeline_registers(ctx);

        struct ra_graph *g = allocate_registers(ctx);
        install_registers(ctx, g);
}