panfrost/midgard: Hoist mask field
[mesa.git] src/gallium/drivers/panfrost/midgard/midgard_schedule.c
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "midgard_ops.h"
#include "util/u_memory.h"

/* Create a mask of accessed components from a swizzle to figure out vector
 * dependencies */
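/* For example, the identity swizzle 0xE4 (two bits per component: x, y, z, w)
 * yields the full mask 0xF, while a splatted .xxxx swizzle (0x00) yields just
 * 0x1. */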

static unsigned
swizzle_to_access_mask(unsigned swizzle)
{
        unsigned component_mask = 0;

        for (int i = 0; i < 4; ++i) {
                unsigned c = (swizzle >> (2 * i)) & 3;
                component_mask |= (1 << c);
        }

        return component_mask;
}

/* Does the mask cover only a single component? */
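/* e.g. a mask of 0x4 (just .z) qualifies, while 0x3 (.xy) does not */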

static bool
is_single_component_mask(unsigned mask)
{
        int components = 0;

        for (int c = 0; c < 8; ++c) {
                if (mask & (1 << c))
                        components++;
        }

        return components == 1;
}

/* Checks for an SSA data hazard between two adjacent instructions, keeping in
 * mind that we are a vector architecture and we can write to different
 * components simultaneously */
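/* For instance, writing .x and then reading only .y of the same value is not
 * a hazard, whereas writing .x and then reading .x is. */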

static bool
can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
{
        /* Each instruction reads some registers and writes to a register. See
         * where the first writes */

        /* Figure out where exactly we wrote to */
        int source = first->ssa_args.dest;
        int source_mask = first->mask;

        /* As long as the second doesn't read from the first, we're okay */
        if (second->ssa_args.src0 == source) {
                if (first->type == TAG_ALU_4) {
                        /* Figure out which components we just read from */

                        int q = second->alu.src1;
                        midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;

                        /* Check if there are components in common, and fail if so */
                        if (swizzle_to_access_mask(m->swizzle) & source_mask)
                                return false;
                } else
                        return false;

        }

        if (second->ssa_args.src1 == source)
                return false;

        /* Otherwise, it's safe in that regard. Another data hazard is both
         * writing to the same place, of course */

        if (second->ssa_args.dest == source) {
                /* ...but only if the components overlap */

                if (second->mask & source_mask)
                        return false;
        }

        /* ...That's it */
        return true;
}

static bool
midgard_has_hazard(
        midgard_instruction **segment, unsigned segment_size,
        midgard_instruction *ains)
{
        for (int s = 0; s < segment_size; ++s)
                if (!can_run_concurrent_ssa(segment[s], ains))
                        return true;

        return false;
}

/* Schedules, but does not emit, a single bundle. After scheduling, the final
 * tag and size of the bundle are known, which are necessary for branching */
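/* The caller is expected to skip past *skip additional instructions, since
 * those were folded into the bundle returned here. */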

static midgard_bundle
schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
{
        int instructions_emitted = 0, packed_idx = 0;
        midgard_bundle bundle = { 0 };

        uint8_t tag = ins->type;

        /* Default to the instruction's tag */
        bundle.tag = tag;

        switch (ins->type) {
        case TAG_ALU_4: {
                uint32_t control = 0;
                size_t bytes_emitted = sizeof(control);
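                /* control accumulates the unit enable bits of everything
                 * placed in this bundle (doubling as an occupancy mask when
                 * picking units below), while bytes_emitted tracks the packed
                 * size so the bundle can be padded to 16 bytes and sized for
                 * its tag. */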

                /* TODO: Constant combining */
                int index = 0, last_unit = 0;

                /* Previous instructions, for the purpose of parallelism */
                midgard_instruction *segment[4] = {0};
                int segment_size = 0;
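                /* The segment holds instructions already placed in the
                 * current pipeline stage; it is cleared when we cross from
                 * the first stage (VMUL/SADD) into the second, since only
                 * co-issued instructions need the hazard check. */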

                instructions_emitted = -1;
                midgard_instruction *pins = ins;

                unsigned constant_count = 0;

                for (;;) {
                        midgard_instruction *ains = pins;

                        /* Advance instruction pointer */
                        if (index) {
                                ains = mir_next_op(pins);
                                pins = ains;
                        }

                        /* Out-of-work condition */
                        if ((struct list_head *) ains == &block->instructions)
                                break;

                        /* Ensure that the chain can continue */
                        if (ains->type != TAG_ALU_4) break;

                        /* If there's already something in the bundle and we
                         * have weird scheduler constraints, break now */
                        if (ains->precede_break && index) break;

                        /* According to the presentation "The ARM
                         * Mali-T880 Mobile GPU" from HotChips 27,
                         * there are two pipeline stages. Branching
                         * position determined experimentally. Lines
                         * are executed in parallel:
                         *
                         * [ VMUL ] [ SADD ]
                         * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
                         *
                         * Verify that there are no ordering dependencies here.
                         *
                         * TODO: Allow for parallelism!!!
                         */
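                        /* The UNIT_* enable bits are numbered in issue order
                         * (VMUL, SADD, VADD, SMUL, LUT, BRANCH), so the
                         * "unit <= last_unit" check below keeps the bundle
                         * packed in pipeline order. */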

                        /* Pick a unit for it if it doesn't force a particular unit */

                        int unit = ains->unit;

                        if (!unit) {
                                int op = ains->alu.op;
                                int units = alu_opcode_props[op].props;

                                bool vectorable = units & UNITS_ANY_VECTOR;
                                bool scalarable = units & UNITS_SCALAR;
                                bool could_scalar = is_single_component_mask(ains->mask);

                                /* Only 16/32-bit can run on a scalar unit */
                                could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8;
                                could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64;
                                could_scalar &= ains->alu.dest_override == midgard_dest_override_none;

                                bool vector = vectorable && !(could_scalar && scalarable);

                                /* TODO: Check ahead-of-time for other scalar
                                 * hazards that otherwise get aborted out */

                                if (!vector)
                                        assert(units & UNITS_SCALAR);

                                if (vector) {
                                        if (last_unit >= UNIT_VADD) {
                                                if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_VMUL) && !(control & UNIT_VMUL))
                                                        unit = UNIT_VMUL;
                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
                                                        unit = UNIT_VADD;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        }
                                } else {
                                        if (last_unit >= UNIT_VADD) {
                                                if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
                                                        unit = UNIT_SMUL;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
                                                        unit = UNIT_SADD;
                                                else if (units & UNIT_SMUL)
                                                        unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL;
                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
                                                        unit = UNIT_VADD;
                                                else
                                                        break;
                                        }
                                }

                                assert(unit & units);
                        }

                        /* Late unit check, this time for encoding (not parallelism) */
                        if (unit <= last_unit) break;

                        /* Clear the segment */
                        if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
                                segment_size = 0;

                        if (midgard_has_hazard(segment, segment_size, ains))
                                break;

                        /* We're good to go -- emit the instruction */
                        ains->unit = unit;

                        segment[segment_size++] = ains;

                        /* We try to reuse constants if possible, by adjusting
                         * the swizzle */
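                        /* e.g. if this instruction needs an inline constant
                         * that an earlier instruction in the bundle already
                         * embedded, we point its constant swizzle at the
                         * existing slot instead of burning a new one. */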

                        if (ains->has_blend_constant) {
                                /* Everything conflicts with the blend constant */
                                if (bundle.has_embedded_constants)
                                        break;

                                bundle.has_blend_constant = 1;
                                bundle.has_embedded_constants = 1;
                        } else if (ains->has_constants) {
                                /* By definition, blend constants conflict with
                                 * everything, so if there are already
                                 * constants we break the bundle *now* */

                                if (bundle.has_blend_constant)
                                        break;

                                /* For anything but blend constants, we can do
                                 * proper analysis, however */

                                /* TODO: Mask by which are used */
                                uint32_t *constants = (uint32_t *) ains->constants;
                                uint32_t *bundles = (uint32_t *) bundle.constants;

                                uint32_t indices[4] = { 0 };
                                bool break_bundle = false;

                                for (unsigned i = 0; i < 4; ++i) {
                                        uint32_t cons = constants[i];
                                        bool constant_found = false;

                                        /* Search for the constant */
                                        for (unsigned j = 0; j < constant_count; ++j) {
                                                if (bundles[j] != cons)
                                                        continue;

                                                /* We found it, reuse */
                                                indices[i] = j;
                                                constant_found = true;
                                                break;
                                        }

                                        if (constant_found)
                                                continue;

                                        /* We didn't find it, so allocate it */
                                        unsigned idx = constant_count++;

                                        if (idx >= 4) {
                                                /* Uh-oh, out of space */
                                                break_bundle = true;
                                                break;
                                        }

                                        /* We have space, copy it in! */
                                        bundles[idx] = cons;
                                        indices[i] = idx;
                                }

                                if (break_bundle)
                                        break;

                                /* Cool, we have it in. So use indices as a
                                 * swizzle */

                                unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
                                unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);

                                if (ains->ssa_args.src0 == r_constant)
                                        ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);

                                if (ains->ssa_args.src1 == r_constant)
                                        ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);

                                bundle.has_embedded_constants = true;
                        }

                        if (ains->unit & UNITS_ANY_VECTOR) {
                                bytes_emitted += sizeof(midgard_reg_info);
                                bytes_emitted += sizeof(midgard_vector_alu);
                        } else if (ains->compact_branch) {
                                /* All of r0 has to be written out along with
                                 * the branch writeout */

                                if (ains->writeout) {
                                        /* "Bare" writeout is only safe when
                                         * all components of r0 are written
                                         * out in the final bundle, earlier
                                         * than VLUT, and any register
                                         * dependencies of r0 come from an
                                         * earlier bundle. We can't verify
                                         * this before RA, so we don't try. */

                                        if (index != 0)
                                                break;

                                        /* Inject a move */
                                        midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
                                        ins.unit = UNIT_VMUL;
                                        control |= ins.unit;

                                        /* TODO don't leak */
                                        midgard_instruction *move =
                                                mem_dup(&ins, sizeof(midgard_instruction));
                                        bytes_emitted += sizeof(midgard_reg_info);
                                        bytes_emitted += sizeof(midgard_vector_alu);
                                        bundle.instructions[packed_idx++] = move;
                                }

                                if (ains->unit == ALU_ENAB_BRANCH) {
                                        bytes_emitted += sizeof(midgard_branch_extended);
                                } else {
                                        bytes_emitted += sizeof(ains->br_compact);
                                }
                        } else {
                                bytes_emitted += sizeof(midgard_reg_info);
                                bytes_emitted += sizeof(midgard_scalar_alu);
                        }

                        /* Defer marking until after writing to allow for break */
                        control |= ains->unit;
                        last_unit = ains->unit;
                        ++instructions_emitted;
                        ++index;
                }

                int padding = 0;

                /* Pad ALU op to nearest word */

                if (bytes_emitted & 15) {
                        padding = 16 - (bytes_emitted & 15);
                        bytes_emitted += padding;
                }

                /* Constants must always be quadwords */
                if (bundle.has_embedded_constants)
                        bytes_emitted += 16;

                /* Size ALU instruction for tag */
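                /* ALU tags are consecutive (TAG_ALU_4, _8, _12, _16), one per
                 * 16 bytes of packed payload, so the tag encodes the bundle
                 * size in quadwords. */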
                bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
                bundle.padding = padding;
                bundle.control = bundle.tag | control;

                break;
        }

        case TAG_LOAD_STORE_4: {
                /* A load/store bundle holds two instruction words at once. If
                 * we only have one queued up, we need to NOP pad. Otherwise,
                 * we schedule both in succession to save space and cycles --
                 * letting them go in parallel -- and skip the next. The
                 * usefulness of this optimisation is greatly dependent on the
                 * quality of the instruction scheduler.
                 */

                midgard_instruction *next_op = mir_next_op(ins);

                if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
                        /* TODO: Concurrency check */
                        instructions_emitted++;
                }

                break;
        }

        case TAG_TEXTURE_4: {
                /* Which tag we use depends on the shader stage */
                bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
                bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
                break;
        }

        default:
                unreachable("Unknown tag");
                break;
        }

        /* Copy the instructions into the bundle */
        bundle.instruction_count = instructions_emitted + 1 + packed_idx;

        midgard_instruction *uins = ins;
        for (; packed_idx < bundle.instruction_count; ++packed_idx) {
                bundle.instructions[packed_idx] = uins;
                uins = mir_next_op(uins);
        }

        *skip = instructions_emitted;

        return bundle;
}

/* Schedule a single block by iterating over its instructions to create
 * bundles. As we go, tally the bundle sizes to compute the block size. */
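/* Downstream, branch offsets are expressed in quadwords, which is why the
 * per-block quadword tally matters. */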

static void
schedule_block(compiler_context *ctx, midgard_block *block)
{
        util_dynarray_init(&block->bundles, NULL);

        block->quadword_count = 0;

        mir_foreach_instr_in_block(block, ins) {
                int skip;
                midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
                util_dynarray_append(&block->bundles, midgard_bundle, bundle);

                if (bundle.has_blend_constant) {
                        /* TODO: Multiblock? */
                        int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1;
                        ctx->blend_constant_offset = quadwords_within_block * 0x10;
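                        /* The embedded constants occupy the last quadword of
                         * the bundle, hence the -1 before converting to a
                         * byte offset. */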
                }

                while (skip--)
                        ins = mir_next_op(ins);

                block->quadword_count += quadword_size(bundle.tag);
        }

        block->is_scheduled = true;
}

void
schedule_program(compiler_context *ctx)
{
        /* Scheduling runs first; register allocation follows on the
         * scheduled program */

        mir_foreach_block(ctx, block) {
                schedule_block(ctx, block);
        }

        /* Pipeline register creation is a prepass before RA */
        mir_create_pipeline_registers(ctx);

        struct ra_graph *g = allocate_registers(ctx);
        install_registers(ctx, g);
}