panfrost/midgard: Allow fp16 in scalar ALU
[mesa.git] src/gallium/drivers/panfrost/midgard/midgard_schedule.c
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "midgard_ops.h"
#include "util/u_memory.h"

/* Create a mask of accessed components from a swizzle to figure out vector
 * dependencies */
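/* For example, a swizzle selecting .xyzx (0b00100100) touches components
 * x, y, and z, so the resulting access mask is 0b0111 */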

static unsigned
swizzle_to_access_mask(unsigned swizzle)
{
        unsigned component_mask = 0;

        for (int i = 0; i < 4; ++i) {
                unsigned c = (swizzle >> (2 * i)) & 3;
                component_mask |= (1 << c);
        }

        return component_mask;
}

/* Does the mask cover exactly one component (and could therefore run on a scalar unit)? */
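/* e.g. in half mode a mask of 0x10 names a single component, while in full
 * (32-bit) mode each component owns two mask bits, so 0xC0 does too */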

static bool
is_single_component_mask(unsigned mask, bool full)
{
        int components = 0;

        for (int c = 0; c < 8; ++c) {
                if (mask & (1 << c))
                        components++;

                /* In full (32-bit) mode, each component uses two mask bits,
                 * so skip the second bit */
                if (full)
                        c++;
        }

        return components == 1;
}

/* Checks for an SSA data hazard between two adjacent instructions, keeping in
 * mind that we are a vector architecture and we can write to different
 * components simultaneously */

static bool
can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
{
        /* Each instruction reads some registers and writes to a register. See
         * where the first writes */

        /* Figure out where exactly we wrote to */
        int source = first->ssa_args.dest;
        int source_mask = first->type == TAG_ALU_4 ? squeeze_writemask(first->alu.mask) : 0xF;

        /* As long as the second doesn't read from the first, we're okay */
        if (second->ssa_args.src0 == source) {
                if (first->type == TAG_ALU_4) {
                        /* Figure out which components the second instruction reads */

                        int q = second->alu.src1;
                        midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;

                        /* Check if there are components in common, and fail if so */
                        if (swizzle_to_access_mask(m->swizzle) & source_mask)
                                return false;
                } else
                        return false;
        }

        if (second->ssa_args.src1 == source)
                return false;

        /* Otherwise, it's safe in that regard. Another data hazard is both
         * writing to the same place, of course */

        if (second->ssa_args.dest == source) {
                /* ...but only if the components overlap */
                int dest_mask = second->type == TAG_ALU_4 ? squeeze_writemask(second->alu.mask) : 0xF;

                if (dest_mask & source_mask)
                        return false;
        }

        /* ...That's it */
        return true;
}

static bool
midgard_has_hazard(
        midgard_instruction **segment, unsigned segment_size,
        midgard_instruction *ains)
{
        for (int s = 0; s < segment_size; ++s)
                if (!can_run_concurrent_ssa(segment[s], ains))
                        return true;

        return false;
}

/* Schedules, but does not emit, a single bundle within a basic block. After
 * scheduling, the final tag and size of the bundle are known, which are
 * necessary for branching */

static midgard_bundle
schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
{
        int instructions_emitted = 0, packed_idx = 0;
        midgard_bundle bundle = { 0 };

        uint8_t tag = ins->type;

        /* Default to the instruction's tag */
        bundle.tag = tag;

        switch (ins->type) {
        case TAG_ALU_4: {
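                /* "control" accumulates the unit bits claimed so far in this
                 * bundle; at the end it is folded into bundle.control */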
                uint32_t control = 0;
                size_t bytes_emitted = sizeof(control);

                /* TODO: Constant combining */
                int index = 0, last_unit = 0;

                /* Previous instructions, for the purpose of parallelism */
                midgard_instruction *segment[4] = {0};
                int segment_size = 0;

                instructions_emitted = -1;
                midgard_instruction *pins = ins;

                unsigned constant_count = 0;

                for (;;) {
                        midgard_instruction *ains = pins;

                        /* Advance instruction pointer */
                        if (index) {
                                ains = mir_next_op(pins);
                                pins = ains;
                        }

                        /* Out-of-work condition */
                        if ((struct list_head *) ains == &block->instructions)
                                break;

                        /* Ensure that the chain can continue */
                        if (ains->type != TAG_ALU_4) break;

                        /* If there's already something in the bundle and we
                         * have weird scheduler constraints, break now */
                        if (ains->precede_break && index) break;

                        /* According to the presentation "The ARM
                         * Mali-T880 Mobile GPU" from HotChips 27,
                         * there are two pipeline stages. Branching
                         * position determined experimentally. Lines
                         * are executed in parallel:
                         *
                         * [ VMUL ] [ SADD ]
                         * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
                         *
                         * Verify that there are no ordering dependencies here.
                         *
                         * TODO: Allow for parallelism!!!
                         */

                        /* Pick a unit for it if it doesn't force a particular unit */

                        int unit = ains->unit;

                        if (!unit) {
                                int op = ains->alu.op;
                                int units = alu_opcode_props[op].props;

                                bool vectorable = units & UNITS_ANY_VECTOR;
                                bool scalarable = units & UNITS_SCALAR;
                                bool full = ains->alu.reg_mode == midgard_reg_mode_32;
                                bool could_scalar = is_single_component_mask(ains->alu.mask, full);
                                bool vector = vectorable && !(could_scalar && scalarable);

                                /* Only 16/32-bit can run on a scalar unit */
                                could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8;
                                could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64;

                                /* TODO: Check ahead-of-time for other scalar
                                 * hazards that otherwise get aborted out */

                                if (!vector)
                                        assert(units & UNITS_SCALAR);

                                if (vector) {
                                        if (last_unit >= UNIT_VADD) {
                                                if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_VMUL) && !(control & UNIT_VMUL))
                                                        unit = UNIT_VMUL;
                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
                                                        unit = UNIT_VADD;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        }
                                } else {
                                        if (last_unit >= UNIT_VADD) {
                                                if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
                                                        unit = UNIT_SMUL;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
                                                        unit = UNIT_SADD;
                                                else if (units & UNIT_SMUL)
                                                        unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL;
                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
                                                        unit = UNIT_VADD;
                                                else
                                                        break;
                                        }
                                }

                                assert(unit & units);
                        }

                        /* Late unit check, this time for encoding (not parallelism) */
                        if (unit <= last_unit) break;

                        /* Clear the segment */
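                        /* (the second pipeline stage can presumably consume
                         * first-stage results, so earlier hazards no longer
                         * apply once we cross into VADD/SMUL/LUT) */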
                        if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
                                segment_size = 0;

                        if (midgard_has_hazard(segment, segment_size, ains))
                                break;

                        /* We're good to go -- emit the instruction */
                        ains->unit = unit;

                        segment[segment_size++] = ains;

                        /* We try to reuse constants if possible, by adjusting
                         * the swizzle */

                        if (ains->has_blend_constant) {
                                /* Everything conflicts with the blend constant */
                                if (bundle.has_embedded_constants)
                                        break;

                                bundle.has_blend_constant = 1;
                                bundle.has_embedded_constants = 1;
                        } else if (ains->has_constants) {
                                /* By definition, blend constants conflict with
                                 * everything, so if there are already
                                 * constants we break the bundle *now* */

                                if (bundle.has_blend_constant)
                                        break;

                                /* For anything but blend constants, we can do
                                 * proper analysis, however */

                                /* TODO: Mask by which are used */
                                uint32_t *constants = (uint32_t *) ains->constants;
                                uint32_t *bundles = (uint32_t *) bundle.constants;

                                uint32_t indices[4] = { 0 };
                                bool break_bundle = false;

                                for (unsigned i = 0; i < 4; ++i) {
                                        uint32_t cons = constants[i];
                                        bool constant_found = false;

                                        /* Search for the constant */
                                        for (unsigned j = 0; j < constant_count; ++j) {
                                                if (bundles[j] != cons)
                                                        continue;

                                                /* We found it, reuse */
                                                indices[i] = j;
                                                constant_found = true;
                                                break;
                                        }

                                        if (constant_found)
                                                continue;

                                        /* We didn't find it, so allocate it */
                                        unsigned idx = constant_count++;

                                        if (idx >= 4) {
                                                /* Uh-oh, out of space */
                                                break_bundle = true;
                                                break;
                                        }

                                        /* We have space, copy it in! */
                                        bundles[idx] = cons;
                                        indices[i] = idx;
                                }

                                if (break_bundle)
                                        break;

                                /* Cool, we have it in. So use indices as a
                                 * swizzle */
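                                /* e.g. if our first two constants already live
                                 * in bundle slots 2 and 0, indices begins
                                 * { 2, 0, ... } and the swizzle below redirects
                                 * the constant reads to those slots */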

                                unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
                                unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);

                                if (ains->ssa_args.src0 == r_constant)
                                        ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);

                                if (ains->ssa_args.src1 == r_constant)
                                        ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);

                                bundle.has_embedded_constants = true;
                        }

                        if (ains->unit & UNITS_ANY_VECTOR) {
                                bytes_emitted += sizeof(midgard_reg_info);
                                bytes_emitted += sizeof(midgard_vector_alu);
                        } else if (ains->compact_branch) {
                                /* All of r0 has to be written out along with
                                 * the branch writeout */

                                if (ains->writeout) {
                                        /* "Bare" writeout is only safe when
                                         * all components of r0 are written in
                                         * the final bundle, earlier than VLUT,
                                         * and any register dependencies of r0
                                         * come from an earlier bundle. We
                                         * can't verify this before RA, so we
                                         * don't try. */

                                        if (index != 0)
                                                break;

                                        /* Inject a move */
                                        midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
                                        ins.unit = UNIT_VMUL;
                                        control |= ins.unit;

                                        /* TODO don't leak */
                                        midgard_instruction *move =
                                                mem_dup(&ins, sizeof(midgard_instruction));
                                        bytes_emitted += sizeof(midgard_reg_info);
                                        bytes_emitted += sizeof(midgard_vector_alu);
                                        bundle.instructions[packed_idx++] = move;
                                }

                                if (ains->unit == ALU_ENAB_BRANCH) {
                                        bytes_emitted += sizeof(midgard_branch_extended);
                                } else {
                                        bytes_emitted += sizeof(ains->br_compact);
                                }
                        } else {
                                bytes_emitted += sizeof(midgard_reg_info);
                                bytes_emitted += sizeof(midgard_scalar_alu);
                        }

                        /* Defer marking until after writing to allow for break */
                        control |= ains->unit;
                        last_unit = ains->unit;
                        ++instructions_emitted;
                        ++index;
                }

                int padding = 0;

                /* Pad the ALU bundle to the nearest quadword (16 bytes) */

                if (bytes_emitted & 15) {
                        padding = 16 - (bytes_emitted & 15);
                        bytes_emitted += padding;
                }

                /* Constants must always be quadwords */
                if (bundle.has_embedded_constants)
                        bytes_emitted += 16;

                /* Size ALU instruction for tag */
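                /* (this relies on the ALU tags being consecutive: one quadword
                 * keeps TAG_ALU_4, two selects TAG_ALU_8, and so on) */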
                bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
                bundle.padding = padding;
                bundle.control = bundle.tag | control;

                break;
        }

        case TAG_LOAD_STORE_4: {
                /* A load/store bundle holds two words at once. If we only
                 * have one instruction queued up, we need to NOP pad.
                 * Otherwise, we pack both in succession to save space and
                 * cycles -- letting them go in parallel -- and skip the next
                 * instruction. The usefulness of this optimisation depends
                 * greatly on the quality of the instruction scheduler.
                 */

                midgard_instruction *next_op = mir_next_op(ins);

                if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
                        /* TODO: Concurrency check */
                        instructions_emitted++;
                }

                break;
        }

        case TAG_TEXTURE_4: {
                /* Which tag we use depends on the shader stage */
                bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
                bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
                break;
        }

        default:
                unreachable("Unknown tag");
                break;
        }

        /* Copy the instructions into the bundle */
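        /* (instructions_emitted counts instructions beyond the first, and
         * packed_idx already covers any moves injected above) */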
        bundle.instruction_count = instructions_emitted + 1 + packed_idx;

        midgard_instruction *uins = ins;
        for (; packed_idx < bundle.instruction_count; ++packed_idx) {
                bundle.instructions[packed_idx] = uins;
                uins = mir_next_op(uins);
        }

        *skip = instructions_emitted;

        return bundle;
}

/* Schedule a single block by iterating its instructions to create bundles.
 * As we go, tally the bundle sizes to compute the block size. */

static void
schedule_block(compiler_context *ctx, midgard_block *block)
{
        util_dynarray_init(&block->bundles, NULL);

        block->quadword_count = 0;

        mir_foreach_instr_in_block(block, ins) {
                int skip;
                midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
                util_dynarray_append(&block->bundles, midgard_bundle, bundle);

                if (bundle.has_blend_constant) {
                        /* TODO: Multiblock? */
                        int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1;
                        ctx->blend_constant_offset = quadwords_within_block * 0x10;
                }

                while (skip--)
                        ins = mir_next_op(ins);

                block->quadword_count += quadword_size(bundle.tag);
        }

        block->is_scheduled = true;
}

void
schedule_program(compiler_context *ctx)
{
        /* We schedule the blocks first; RA runs afterwards on the scheduled
         * program */

        mir_foreach_block(ctx, block) {
                schedule_block(ctx, block);
        }

        /* Pipeline register creation is a prepass before RA */
        mir_create_pipeline_registers(ctx);

        struct ra_graph *g = allocate_registers(ctx);
        install_registers(ctx, g);
}