panfrost/midgard: Fix blend constant scheduling bug
[mesa.git] / src / gallium / drivers / panfrost / midgard / midgard_schedule.c
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "midgard_ops.h"
#include "util/u_memory.h"

/* Create a mask of accessed components from a swizzle to figure out vector
 * dependencies */
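/* For example, the identity swizzle xyzw (0xE4) touches every component and
 * yields the mask 0xF, whereas a broadcast swizzle xxxx (0x00) yields 0x1. */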

static unsigned
swizzle_to_access_mask(unsigned swizzle)
{
        unsigned component_mask = 0;

        for (int i = 0; i < 4; ++i) {
                unsigned c = (swizzle >> (2 * i)) & 3;
                component_mask |= (1 << c);
        }

        return component_mask;
}

/* Does the mask cover just a single component, i.e. could the op go to a
 * scalar unit? */
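/* Note: this takes the 8-bit ALU mask, which carries two bits per 32-bit
 * component, hence the 3 << (2 * c) test below. */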

static bool
is_single_component_mask(unsigned mask)
{
        int components = 0;

        for (int c = 0; c < 4; ++c)
                if (mask & (3 << (2 * c)))
                        components++;

        return components == 1;
}

/* Checks for an SSA data hazard between two adjacent instructions, keeping in
 * mind that we are a vector architecture and we can write to different
 * components simultaneously */

static bool
can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
{
        /* Each instruction reads some registers and writes to a register. See
         * where the first writes */

        /* Figure out where exactly we wrote to */
        int source = first->ssa_args.dest;
        int source_mask = first->type == TAG_ALU_4 ? squeeze_writemask(first->alu.mask) : 0xF;

        /* As long as the second doesn't read from the first, we're okay */
        if (second->ssa_args.src0 == source) {
                if (first->type == TAG_ALU_4) {
                        /* Figure out which components we just read from */

                        int q = second->alu.src1;
                        midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;

                        /* Check if there are components in common, and fail if so */
                        if (swizzle_to_access_mask(m->swizzle) & source_mask)
                                return false;
                } else
                        return false;

        }

        if (second->ssa_args.src1 == source)
                return false;

        /* Otherwise, it's safe in that regard. Another data hazard is both
         * writing to the same place, of course */

        if (second->ssa_args.dest == source) {
                /* ...but only if the components overlap */
                int dest_mask = second->type == TAG_ALU_4 ? squeeze_writemask(second->alu.mask) : 0xF;

                if (dest_mask & source_mask)
                        return false;
        }

        /* ...That's it */
        return true;
}

static bool
midgard_has_hazard(
        midgard_instruction **segment, unsigned segment_size,
        midgard_instruction *ains)
{
        for (int s = 0; s < segment_size; ++s)
                if (!can_run_concurrent_ssa(segment[s], ains))
                        return true;

        return false;
}

/* Schedules, but does not emit, a single bundle. After scheduling, the final
 * tag and size of the bundle are known, which are necessary for branching */
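/* Scheduling is greedy: starting from the given instruction, we pack as many
 * following instructions as the tag's rules allow into one bundle, and report
 * how many extra instructions were consumed through *skip. */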

static midgard_bundle
schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
{
        int instructions_emitted = 0, packed_idx = 0;
        midgard_bundle bundle = { 0 };

        uint8_t tag = ins->type;

        /* Default to the instruction's tag */
        bundle.tag = tag;

        switch (ins->type) {
        case TAG_ALU_4: {
                uint32_t control = 0;
                size_t bytes_emitted = sizeof(control);

                /* TODO: Constant combining */
                int index = 0, last_unit = 0;

                /* Previous instructions, for the purpose of parallelism */
                midgard_instruction *segment[4] = {0};
                int segment_size = 0;

                instructions_emitted = -1;
                midgard_instruction *pins = ins;

                unsigned constant_count = 0;

                for (;;) {
                        midgard_instruction *ains = pins;

                        /* Advance instruction pointer */
                        if (index) {
                                ains = mir_next_op(pins);
                                pins = ains;
                        }

                        /* Out-of-work condition */
                        if ((struct list_head *) ains == &block->instructions)
                                break;

                        /* Ensure that the chain can continue */
                        if (ains->type != TAG_ALU_4) break;

                        /* If there's already something in the bundle and we
                         * have weird scheduler constraints, break now */
                        if (ains->precede_break && index) break;

                        /* According to the presentation "The ARM
                         * Mali-T880 Mobile GPU" from HotChips 27,
                         * there are two pipeline stages. Branching
                         * position determined experimentally. Lines
                         * are executed in parallel:
                         *
                         * [ VMUL ] [ SADD ]
                         * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
                         *
                         * Verify that there are no ordering dependencies here.
                         *
                         * TODO: Allow for parallelism!!!
                         */
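                        /* segment[] holds the instructions already placed in
                         * the current line; it is cleared below when we cross
                         * from the first line (VMUL/SADD) into the second,
                         * presumably because the second line can consume the
                         * first line's results within the same bundle, so
                         * only intra-line hazards need checking. */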

                        /* Pick a unit for it if it doesn't force a particular unit */

                        int unit = ains->unit;

                        if (!unit) {
                                int op = ains->alu.op;
                                int units = alu_opcode_props[op].props;

                                bool vectorable = units & UNITS_ANY_VECTOR;
                                bool scalarable = units & UNITS_SCALAR;
                                bool could_scalar = is_single_component_mask(ains->alu.mask);
                                bool vector = vectorable && !(could_scalar && scalarable);

                                if (!vector)
                                        assert(units & UNITS_SCALAR);

                                if (vector) {
                                        if (last_unit >= UNIT_VADD) {
                                                if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_VMUL) && !(control & UNIT_VMUL))
                                                        unit = UNIT_VMUL;
                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
                                                        unit = UNIT_VADD;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        }
                                } else {
                                        if (last_unit >= UNIT_VADD) {
                                                if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
                                                        unit = UNIT_SMUL;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
                                                        unit = UNIT_SADD;
                                                else if (units & UNIT_SMUL)
                                                        unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL;
                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
                                                        unit = UNIT_VADD;
                                                else
                                                        break;
                                        }
                                }

                                assert(unit & units);
                        }

                        /* Late unit check, this time for encoding (not parallelism) */
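                        /* The UNIT_* enables are single bits laid out in
                         * pipeline order, so insisting on a strictly
                         * increasing unit keeps at most one instruction per
                         * unit and lets the bundle be packed in the order the
                         * hardware expects. */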
                        if (unit <= last_unit) break;

                        /* Clear the segment */
                        if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
                                segment_size = 0;

                        if (midgard_has_hazard(segment, segment_size, ains))
                                break;

                        /* We're good to go -- emit the instruction */
                        ains->unit = unit;

                        segment[segment_size++] = ains;

                        /* We try to reuse constants if possible, by adjusting
                         * the swizzle */

                        if (ains->has_blend_constant) {
                                /* Everything conflicts with the blend constant */
                                if (bundle.has_embedded_constants)
                                        break;

                                bundle.has_blend_constant = 1;
                                bundle.has_embedded_constants = 1;
                        } else if (ains->has_constants) {
                                /* By definition, blend constants conflict with
                                 * everything, so if there are already
                                 * constants we break the bundle *now* */

                                if (bundle.has_blend_constant)
                                        break;

                                /* For anything but blend constants, we can do
                                 * proper analysis, however */

                                /* TODO: Mask by which are used */
                                uint32_t *constants = (uint32_t *) ains->constants;
                                uint32_t *bundles = (uint32_t *) bundle.constants;

                                uint32_t indices[4] = { 0 };
                                bool break_bundle = false;

                                for (unsigned i = 0; i < 4; ++i) {
                                        uint32_t cons = constants[i];
                                        bool constant_found = false;

                                        /* Search for the constant */
                                        for (unsigned j = 0; j < constant_count; ++j) {
                                                if (bundles[j] != cons)
                                                        continue;

                                                /* We found it, reuse */
                                                indices[i] = j;
                                                constant_found = true;
                                                break;
                                        }

                                        if (constant_found)
                                                continue;

                                        /* We didn't find it, so allocate it */
                                        unsigned idx = constant_count++;

                                        if (idx >= 4) {
                                                /* Uh-oh, out of space */
                                                break_bundle = true;
                                                break;
                                        }

                                        /* We have space, copy it in! */
                                        bundles[idx] = cons;
                                        indices[i] = idx;
                                }

                                if (break_bundle)
                                        break;

                                /* Cool, we have it in. So use indices as a
                                 * swizzle */
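                                /* e.g. constants { A, B, A, C } against a
                                 * bundle already holding { B } give
                                 * indices = { 1, 0, 1, 2 } and leave
                                 * constant_count at 3. */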

                                unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
                                unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);

                                if (ains->ssa_args.src0 == r_constant)
                                        ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);

                                if (ains->ssa_args.src1 == r_constant)
                                        ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);

                                bundle.has_embedded_constants = true;
                        }

                        if (ains->unit & UNITS_ANY_VECTOR) {
                                bytes_emitted += sizeof(midgard_reg_info);
                                bytes_emitted += sizeof(midgard_vector_alu);
                        } else if (ains->compact_branch) {
                                /* All of r0 has to be written out along with
                                 * the branch writeout */

                                if (ains->writeout) {
                                        /* "Bare" writeout is only safe when
                                         * all components of r0 are written in
                                         * the final bundle, earlier than
                                         * VLUT, and any register dependencies
                                         * of r0 come from an earlier bundle.
                                         * We can't verify this before RA, so
                                         * we don't try. */
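                                        /* So take the conservative path:
                                         * force the writeout branch into its
                                         * own bundle and inject a VMUL move
                                         * into r0 so the register is written
                                         * in the same bundle as the branch. */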

                                        if (index != 0)
                                                break;

                                        /* Inject a move */
                                        midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
                                        ins.unit = UNIT_VMUL;
                                        control |= ins.unit;

                                        /* TODO don't leak */
                                        midgard_instruction *move =
                                                mem_dup(&ins, sizeof(midgard_instruction));
                                        bytes_emitted += sizeof(midgard_reg_info);
                                        bytes_emitted += sizeof(midgard_vector_alu);
                                        bundle.instructions[packed_idx++] = move;
                                }

                                if (ains->unit == ALU_ENAB_BRANCH) {
                                        bytes_emitted += sizeof(midgard_branch_extended);
                                } else {
                                        bytes_emitted += sizeof(ains->br_compact);
                                }
                        } else {
                                bytes_emitted += sizeof(midgard_reg_info);
                                bytes_emitted += sizeof(midgard_scalar_alu);
                        }

                        /* Defer marking until after writing to allow for break */
                        control |= ains->unit;
                        last_unit = ains->unit;
                        ++instructions_emitted;
                        ++index;
                }

                int padding = 0;

                /* Pad ALU op to nearest word */

                if (bytes_emitted & 15) {
                        padding = 16 - (bytes_emitted & 15);
                        bytes_emitted += padding;
                }

                /* Constants must always be quadwords */
                if (bundle.has_embedded_constants)
                        bytes_emitted += 16;

                /* Size ALU instruction for tag */
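                /* The ALU tags TAG_ALU_4..TAG_ALU_16 are consecutive, one per
                 * quadword of bundle size, so e.g. 32 bytes of words plus a
                 * 16-byte constant quadword is 3 quadwords, i.e. TAG_ALU_12. */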
                bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
                bundle.padding = padding;
                bundle.control = bundle.tag | control;

                break;
        }

        case TAG_LOAD_STORE_4: {
                /* Load/store instructions pack two words at once. If we only
                 * have one queued up, we need to NOP pad. Otherwise, we store
                 * both in succession to save space and cycles -- letting them
                 * go in parallel -- and skip the next instruction. How useful
                 * this optimisation is depends greatly on the quality of the
                 * instruction scheduler.
                 */
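                /* Bumping instructions_emitted both packs the second word
                 * into this bundle (via the copy loop at the end of this
                 * function) and makes schedule_block skip it via *skip. */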

                midgard_instruction *next_op = mir_next_op(ins);

                if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
                        /* TODO: Concurrency check */
                        instructions_emitted++;
                }

                break;
        }

        case TAG_TEXTURE_4: {
                /* Which tag we use depends on the shader stage */
                bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
                bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
                break;
        }

        default:
                unreachable("Unknown tag");
                break;
        }

        /* Copy the instructions into the bundle */
        bundle.instruction_count = instructions_emitted + 1 + packed_idx;

        midgard_instruction *uins = ins;
        for (; packed_idx < bundle.instruction_count; ++packed_idx) {
                bundle.instructions[packed_idx] = uins;
                uins = mir_next_op(uins);
        }

        *skip = instructions_emitted;

        return bundle;
}

/* Schedule a single block by iterating over its instructions to create
 * bundles. As we go, tally the bundle sizes to compute the block size. */

static void
schedule_block(compiler_context *ctx, midgard_block *block)
{
        util_dynarray_init(&block->bundles, NULL);

        block->quadword_count = 0;

        mir_foreach_instr_in_block(block, ins) {
                int skip;
                midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
                util_dynarray_append(&block->bundles, midgard_bundle, bundle);

                if (bundle.has_blend_constant) {
                        /* TODO: Multiblock? */
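                        /* The embedded constant occupies the last quadword of
                         * this bundle, so its offset from the start of the
                         * block is (quadwords so far + bundle size - 1),
                         * scaled to bytes. */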
                        int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1;
                        ctx->blend_constant_offset = quadwords_within_block * 0x10;
                }

                while (skip--)
                        ins = mir_next_op(ins);

                block->quadword_count += quadword_size(bundle.tag);
        }

        block->is_scheduled = true;
}

void
schedule_program(compiler_context *ctx)
{
        /* Scheduling runs first; RA is performed on the scheduled program below */

        mir_foreach_block(ctx, block) {
                schedule_block(ctx, block);
        }

        /* Pipeline registers creation is a prepass before RA */
        mir_create_pipeline_registers(ctx);

        struct ra_graph *g = allocate_registers(ctx);
        install_registers(ctx, g);
}