panfrost/midgard: Differentiate vertex/fragment texture tags
[mesa.git] / src / gallium / drivers / panfrost / midgard / midgard_schedule.c
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "midgard_ops.h"
#include "util/u_memory.h"

/* Create a mask of accessed components from a swizzle to figure out vector
 * dependencies */
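/* For example, the identity swizzle xyzw is encoded as 0xE4 (selectors 3, 2,
 * 1, 0 in two-bit fields), which touches every component and yields the mask
 * 0xF, while a broadcast swizzle xxxx (0x00) yields just 0x1. */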

static unsigned
swizzle_to_access_mask(unsigned swizzle)
{
        unsigned component_mask = 0;

        for (int i = 0; i < 4; ++i) {
                unsigned c = (swizzle >> (2 * i)) & 3;
                component_mask |= (1 << c);
        }

        return component_mask;
}

/* Does the mask cover exactly one component (i.e. is the op scalar)? Note the
 * ALU writemask checked here uses two bits per component, hence the two-bit
 * strides below. */
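/* For instance, a mask of 0x03 touches only component 0 and counts as scalar,
 * whereas 0x0F touches components 0 and 1 and does not. */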

static bool
is_single_component_mask(unsigned mask)
{
        int components = 0;

        for (int c = 0; c < 4; ++c)
                if (mask & (3 << (2 * c)))
                        components++;

        return components == 1;
}

/* Checks for an SSA data hazard between two adjacent instructions, keeping in
 * mind that we are a vector architecture and we can write to different
 * components simultaneously */
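/* For example, if the first instruction writes only the x component of a value
 * and the second reads only its y component (per its swizzle), the two can
 * still be bundled together; any overlap in accessed components is treated as
 * a hazard. */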

static bool
can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
{
        /* Each instruction reads some registers and writes to a register. See
         * where the first writes */

        /* Figure out where exactly we wrote to */
        int source = first->ssa_args.dest;
        int source_mask = first->type == TAG_ALU_4 ? squeeze_writemask(first->alu.mask) : 0xF;

        /* As long as the second doesn't read from the first, we're okay */
        if (second->ssa_args.src0 == source) {
                if (first->type == TAG_ALU_4) {
                        /* Figure out which components we just read from */

                        int q = second->alu.src1;
                        midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;

                        /* Check if there are components in common, and fail if so */
                        if (swizzle_to_access_mask(m->swizzle) & source_mask)
                                return false;
                } else
                        return false;
        }

        if (second->ssa_args.src1 == source)
                return false;

        /* Otherwise, it's safe in that regard. Another data hazard is both
         * writing to the same place, of course */

        if (second->ssa_args.dest == source) {
                /* ...but only if the components overlap */
                int dest_mask = second->type == TAG_ALU_4 ? squeeze_writemask(second->alu.mask) : 0xF;

                if (dest_mask & source_mask)
                        return false;
        }

        /* ...That's it */
        return true;
}

static bool
midgard_has_hazard(
        midgard_instruction **segment, unsigned segment_size,
        midgard_instruction *ains)
{
        for (int s = 0; s < segment_size; ++s)
                if (!can_run_concurrent_ssa(segment[s], ains))
                        return true;

        return false;
}

/* Schedules, but does not emit, a single bundle starting from the given
 * instruction. After scheduling, the final tag and size of the bundle are
 * known, which are necessary for branching */
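/* On return, *skip holds how many instructions beyond the first were folded
 * into the bundle, so the caller can advance past them (see schedule_block). */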

static midgard_bundle
schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
{
        int instructions_emitted = 0, packed_idx = 0;
        midgard_bundle bundle = { 0 };

        uint8_t tag = ins->type;

        /* Default to the instruction's tag */
        bundle.tag = tag;

        switch (ins->type) {
        case TAG_ALU_4: {
                uint32_t control = 0;
                size_t bytes_emitted = sizeof(control);

                /* TODO: Constant combining */
                int index = 0, last_unit = 0;

                /* Previous instructions, for the purpose of parallelism */
                midgard_instruction *segment[4] = {0};
                int segment_size = 0;

                instructions_emitted = -1;
                midgard_instruction *pins = ins;

                for (;;) {
                        midgard_instruction *ains = pins;

                        /* Advance instruction pointer */
                        if (index) {
                                ains = mir_next_op(pins);
                                pins = ains;
                        }

                        /* Out-of-work condition */
                        if ((struct list_head *) ains == &block->instructions)
                                break;

                        /* Ensure that the chain can continue */
                        if (ains->type != TAG_ALU_4) break;

                        /* If there's already something in the bundle and we
                         * have weird scheduler constraints, break now */
                        if (ains->precede_break && index) break;

                        /* According to the presentation "The ARM
                         * Mali-T880 Mobile GPU" from HotChips 27,
                         * there are two pipeline stages. Branching
                         * position determined experimentally. Lines
                         * are executed in parallel:
                         *
                         * [ VMUL ] [ SADD ]
                         * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
                         *
                         * Verify that there are no ordering dependencies here.
                         *
                         * TODO: Allow for parallelism!!!
                         */
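                        /* The UNIT_* enables used below are distinct
                         * single-bit flags whose numeric values increase in
                         * issue order (VMUL and SADD in the first stage, then
                         * VADD, SMUL, LUT and the branch), so control can
                         * accumulate them as a mask and the last_unit
                         * comparisons enforce in-order packing. */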

                        /* Pick a unit for it if it doesn't force a particular unit */

                        int unit = ains->unit;

                        if (!unit) {
                                int op = ains->alu.op;
                                int units = alu_opcode_props[op].props;

                                bool vectorable = units & UNITS_ANY_VECTOR;
                                bool scalarable = units & UNITS_SCALAR;
                                bool could_scalar = is_single_component_mask(ains->alu.mask);
                                bool vector = vectorable && !(could_scalar && scalarable);

                                if (!vector)
                                        assert(units & UNITS_SCALAR);

                                if (vector) {
                                        if (last_unit >= UNIT_VADD) {
                                                if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_VMUL) && !(control & UNIT_VMUL))
                                                        unit = UNIT_VMUL;
                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
                                                        unit = UNIT_VADD;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        }
                                } else {
                                        if (last_unit >= UNIT_VADD) {
                                                if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
                                                        unit = UNIT_SMUL;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
                                                        unit = UNIT_SADD;
                                                else if (units & UNIT_SMUL)
                                                        unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL;
                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
                                                        unit = UNIT_VADD;
                                                else
                                                        break;
                                        }
                                }

                                assert(unit & units);
                        }

                        /* Late unit check, this time for encoding (not parallelism) */
                        if (unit <= last_unit) break;

                        /* Clear the segment */
                        if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
                                segment_size = 0;

                        if (midgard_has_hazard(segment, segment_size, ains))
                                break;

                        /* We're good to go -- emit the instruction */
                        ains->unit = unit;

                        segment[segment_size++] = ains;

                        /* Only one set of embedded constants per
                         * bundle possible; if we have more, we must
                         * break the chain early, unfortunately */
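                        /* For example, two ops embedding the same inline
                         * constants can share the bundle's single constant
                         * slot, but differing constants, or a blend constant
                         * alongside any other constants, force a new bundle. */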

                        if (ains->has_constants) {
                                if (bundle.has_embedded_constants) {
                                        /* The blend constant needs to be
                                         * alone, since it conflicts with
                                         * everything by definition */

                                        if (ains->has_blend_constant || bundle.has_blend_constant)
                                                break;

                                        /* ...but if there are already
                                         * constants but these are the
                                         * *same* constants, we let it
                                         * through */

                                        if (memcmp(bundle.constants, ains->constants, sizeof(bundle.constants)))
                                                break;
                                } else {
                                        bundle.has_embedded_constants = true;
                                        memcpy(bundle.constants, ains->constants, sizeof(bundle.constants));

                                        /* If this is a blend shader special constant, track it for patching */
                                        bundle.has_blend_constant |= ains->has_blend_constant;
                                }
                        }

                        if (ains->unit & UNITS_ANY_VECTOR) {
                                bytes_emitted += sizeof(midgard_reg_info);
                                bytes_emitted += sizeof(midgard_vector_alu);
                        } else if (ains->compact_branch) {
                                /* All of r0 has to be written out along with
                                 * the branch writeout */

                                if (ains->writeout) {
                                        /* The rules for when "bare" writeout
                                         * is safe are when all components of
                                         * r0 are written out in the final
                                         * bundle, earlier than VLUT, where any
                                         * register dependencies of r0 are from
                                         * an earlier bundle. We can't verify
                                         * this before RA, so we don't try. */

                                        if (index != 0)
                                                break;

                                        /* Inject a move */
                                        midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
                                        ins.unit = UNIT_VMUL;
                                        control |= ins.unit;

                                        /* TODO don't leak */
                                        midgard_instruction *move =
                                                mem_dup(&ins, sizeof(midgard_instruction));
                                        bytes_emitted += sizeof(midgard_reg_info);
                                        bytes_emitted += sizeof(midgard_vector_alu);
                                        bundle.instructions[packed_idx++] = move;
                                }

                                if (ains->unit == ALU_ENAB_BRANCH) {
                                        bytes_emitted += sizeof(midgard_branch_extended);
                                } else {
                                        bytes_emitted += sizeof(ains->br_compact);
                                }
                        } else {
                                bytes_emitted += sizeof(midgard_reg_info);
                                bytes_emitted += sizeof(midgard_scalar_alu);
                        }

                        /* Defer marking until after writing to allow for break */
                        control |= ains->unit;
                        last_unit = ains->unit;
                        ++instructions_emitted;
                        ++index;
                }

                int padding = 0;

                /* Pad ALU op to nearest word */

                if (bytes_emitted & 15) {
                        padding = 16 - (bytes_emitted & 15);
                        bytes_emitted += padding;
                }

                /* Constants must always be quadwords */
                if (bundle.has_embedded_constants)
                        bytes_emitted += 16;

                /* Size ALU instruction for tag */
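                /* The ALU tags encode the bundle size in quadwords: 16 bytes
                 * of packed words map to TAG_ALU_4, 32 bytes to the next tag
                 * up, and so on, which is what the arithmetic below relies
                 * on. */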
                bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
                bundle.padding = padding;
                bundle.control = bundle.tag | control;

                break;
        }

        case TAG_LOAD_STORE_4: {
                /* A load/store bundle packs two instruction words at once. If
                 * we only have one queued up, we need to NOP pad. Otherwise,
                 * we pack both in succession to save space and cycles --
                 * letting them go in parallel -- and skip the next
                 * instruction. The usefulness of this optimisation is greatly
                 * dependent on the quality of the instruction scheduler.
                 */
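                /* When we do pair, the copy loop at the end of this function
                 * places both this instruction and the next into the bundle,
                 * and the extra count reported through *skip makes the caller
                 * step over the second one. */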

                midgard_instruction *next_op = mir_next_op(ins);

                if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
                        /* TODO: Concurrency check */
                        instructions_emitted++;
                }

                break;
        }

        case TAG_TEXTURE_4: {
                /* Which tag we use depends on the shader stage */
                bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
                bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
                break;
        }

        default:
                unreachable("Unknown tag");
                break;
        }

        /* Copy the instructions into the bundle */
        bundle.instruction_count = instructions_emitted + 1 + packed_idx;

        midgard_instruction *uins = ins;
        for (; packed_idx < bundle.instruction_count; ++packed_idx) {
                bundle.instructions[packed_idx] = uins;
                uins = mir_next_op(uins);
        }

        *skip = instructions_emitted;

        return bundle;
}

/* Schedule a single block by iterating its instructions to create bundles.
 * As we go, tally up the bundle sizes to compute the block size. */

static void
schedule_block(compiler_context *ctx, midgard_block *block)
{
        util_dynarray_init(&block->bundles, NULL);

        block->quadword_count = 0;

        mir_foreach_instr_in_block(block, ins) {
                int skip;
                midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
                util_dynarray_append(&block->bundles, midgard_bundle, bundle);

                if (bundle.has_blend_constant) {
                        /* TODO: Multiblock? */
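                        /* Record a byte offset for later patching: the blend
                         * constant lives in the last quadword of this bundle,
                         * and quadwords are 16 bytes each. */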
                        int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1;
                        ctx->blend_constant_offset = quadwords_within_block * 0x10;
                }

                while (skip--)
                        ins = mir_next_op(ins);

                block->quadword_count += quadword_size(bundle.tag);
        }

        block->is_scheduled = true;
}

void
schedule_program(compiler_context *ctx)
{
        /* Schedule each block first; RA then runs on the scheduled program */

        mir_foreach_block(ctx, block) {
                schedule_block(ctx, block);
        }

        /* Pipeline register creation is a prepass before RA */
        mir_create_pipeline_registers(ctx);

        struct ra_graph *g = allocate_registers(ctx);
        install_registers(ctx, g);
}