freedreno/a2xx: ir2: fix incorrect instruction reordering
[mesa.git] / src/gallium/drivers/freedreno/a2xx/ir2.c
/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

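/* an ALU instruction can use the scalar slot only when it has a scalar
 * opcode and reads just a single source component
 */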
static bool scalar_possible(struct ir2_instr *instr)
{
   if (instr->alu.scalar_opc == SCALAR_NONE)
      return false;

   return src_ncomp(instr) == 1;
}

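/* whether instruction "b" can be co-issued in the same ALU slot as the
 * already-selected instruction "a" (a may be NULL if none is selected yet)
 */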
static bool is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
{
   if (!a)
      return true;

   /* don't use the same instruction twice */
   if (a == b)
      return false;

   /* PRED_SET must be alone */
   if (b->alu.scalar_opc >= PRED_SETEs &&
       b->alu.scalar_opc <= PRED_SET_RESTOREs)
      return false;

   /* must write to the same export (issues otherwise?) */
   return a->alu.export == b->alu.export;
}

/* priority of vector instruction for scheduling (lower=higher prio) */
static unsigned alu_vector_prio(struct ir2_instr *instr)
{
   if (instr->alu.vector_opc == VECTOR_NONE)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* TODO check src type and ncomps */
   if (instr->src_count == 3)
      return 0;

   if (!scalar_possible(instr))
      return 1;

   return instr->src_count == 2 ? 2 : 3;
}

/* priority of scalar instruction for scheduling (lower=higher prio) */
static unsigned alu_scalar_prio(struct ir2_instr *instr)
{
   if (!scalar_possible(instr))
      return ~0u;

   /* this case is dealt with later */
   if (instr->src_count > 1)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* PRED goes to the end of the block */
   if (instr->alu.scalar_opc >= PRED_SETEs &&
       instr->alu.scalar_opc <= PRED_SET_RESTOREs)
      return 5;

   /* scalar-only instructions get the highest priority */
   return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
}

/* this is a bit messy:
 * we want to find a slot where we can insert a scalar MOV alongside
 * a vector instruction that has already been scheduled
 */
static struct ir2_sched_instr*
insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
       struct ir2_src src1, unsigned *comp)
{
   struct ir2_sched_instr *sched = NULL, *s;
   unsigned i, mask = 0xf;

   /* walk backwards to the earliest point where the mov can be inserted */
   for (i = ctx->instr_sched_count - 1; i > 0; i--) {
      s = &ctx->instr_sched[i - 1];

      if (s->instr && s->instr->block_idx != block_idx)
         break;
      if (s->instr_s && s->instr_s->block_idx != block_idx)
         break;

      if (src1.type == IR2_SRC_SSA) {
         if ((s->instr && s->instr->idx == src1.num) ||
             (s->instr_s && s->instr_s->idx == src1.num))
            break;
      }

      /* this slot's 4-bit state for the register, inverted to get the
       * components that are still free
       */
      unsigned mr = ~(s->reg_state[reg_idx / 8] >> reg_idx % 8 * 4 & 0xf);
      if ((mask & mr) == 0)
         break;

      mask &= mr;
      if (s->instr_s || s->instr->src_count == 3)
         continue;

      if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
         continue;

      sched = s;
   }
   *comp = ffs(mask) - 1;
   return sched;
}

/* case 1:
 * insert a mov to place the 2nd src into the same reg
 * (scalar sources come from the same register)
 *
 * this is a common case which works when one of the srcs is an input/const,
 * but for instrs which have two ssa/reg srcs it's not ideal
 */
static bool
scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
{
   struct ir2_src src0 = instr->src[ order];
   struct ir2_src src1 = instr->src[!order];
   struct ir2_sched_instr *sched;
   struct ir2_instr *ins;
   struct ir2_reg *reg;
   unsigned idx, comp;

   switch (src0.type) {
   case IR2_SRC_CONST:
   case IR2_SRC_INPUT:
      return false;
   default:
      break;
   }

   /* TODO, insert() needs logic for this */
   if (src1.type == IR2_SRC_REG)
      return false;

   /* we could do something if they match src1.. */
   if (src0.negate || src0.abs)
      return false;

   reg = get_reg_src(ctx, &src0);

   /* the result is not used any more, since we will overwrite it */
   for (int i = 0; i < 4; i++)
      if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
         return false;

   /* find a place to insert the mov */
   sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
   if (!sched)
      return false;

   ins = &ctx->instr[idx = ctx->instr_count++];
   ins->idx = idx;
   ins->type = IR2_ALU;
   ins->src[0] = src1;
   ins->src_count = 1;
   ins->is_ssa = true;
   ins->ssa.idx = reg->idx;
   ins->ssa.ncomp = 1;
   ins->ssa.comp[0].c = comp;
   ins->alu.scalar_opc = MAXs;
   ins->alu.export = -1;
   ins->alu.write_mask = 1;
   ins->pred = instr->pred;
   ins->block_idx = instr->block_idx;

   instr->src[0] = src0;
   instr->alu.src1_swizzle = comp;

   sched->instr_s = ins;
   return true;
}

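/* note: an a2xx ALU instruction bundles a vector operation with an
 * optionally co-issued scalar operation, which is why a scheduled slot
 * can hold both an "instr" (vector) and an "instr_s" (scalar)
 */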
/* fill sched with the next fetch or (vector and/or scalar) alu instruction */
static int sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
{
   struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
   unsigned avail_count = 0;

   instr_alloc_type_t export = ~0u;
   int block_idx = -1;

   /* XXX merge this loop with the other one somehow? */
   ir2_foreach_instr(instr, ctx) {
      if (!instr->need_emit)
         continue;
      if (is_export(instr))
         export = MIN2(export, export_buf(instr->alu.export));
   }

   ir2_foreach_instr(instr, ctx) {
      if (!instr->need_emit)
         continue;

      /* don't mix exports */
      if (is_export(instr) && export_buf(instr->alu.export) != export)
         continue;

      if (block_idx < 0)
         block_idx = instr->block_idx;
      else if (block_idx != instr->block_idx || /* must be same block */
               instr->type == IR2_CF || /* CF/MEM must be alone */
               (is_export(instr) && export == SQ_MEMORY))
         break;
      /* this works because IR2_CF is always at the end of a block,
       * and it's a similar idea with MEM exports, which might not be
       * alone but will at least end up in order
       */

      /* check if dependencies are satisfied */
      bool is_ok = true;
      ir2_foreach_src(src, instr) {
         if (src->type == IR2_SRC_REG) {
            /* need to check if all previous instructions in the block
             * which write the reg have been emitted
             * slow..
             * XXX: check components instead of whole register
             */
            struct ir2_reg *reg = get_reg_src(ctx, src);
            ir2_foreach_instr(p, ctx) {
               if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
                  is_ok &= !p->need_emit;
            }
         } else if (src->type == IR2_SRC_SSA) {
            /* in this case it's easy, just check need_emit */
            is_ok &= !ctx->instr[src->num].need_emit;
         }
      }
      /* don't reorder a non-ssa write before a read of the same reg */
      if (!instr->is_ssa) {
         ir2_foreach_instr(p, ctx) {
            if (!p->need_emit || p->idx >= instr->idx)
               continue;

            ir2_foreach_src(src, p) {
               if (get_reg_src(ctx, src) == instr->reg)
                  is_ok = false;
            }
         }
      }
      /* don't reorder across predicates */
      if (avail_count && instr->pred != avail[0]->pred)
         is_ok = false;

      if (!is_ok)
         continue;

      avail[avail_count++] = instr;
   }

   if (!avail_count) {
      assert(block_idx == -1);
      return -1;
   }

   /* priority to FETCH instructions */
   ir2_foreach_avail(instr) {
      if (instr->type == IR2_ALU)
         continue;

      ra_src_free(ctx, instr);
      ra_reg(ctx, get_reg(instr), -1, false, 0);

      instr->need_emit = false;
      sched->instr = instr;
      sched->instr_s = NULL;
      return block_idx;
   }

   /* TODO precompute priorities */

   unsigned prio_v = ~0u, prio_s = ~0u, prio;
   ir2_foreach_avail(instr) {
      prio = alu_vector_prio(instr);
      if (prio < prio_v) {
         instr_v = instr;
         prio_v = prio;
      }
   }

   /* TODO can still insert scalar if src_count=3, if smart about it */
   if (!instr_v || instr_v->src_count < 3) {
      ir2_foreach_avail(instr) {
         bool compat = is_alu_compatible(instr_v, instr);

         prio = alu_scalar_prio(instr);
         if (prio >= prio_v && !compat)
            continue;

         if (prio < prio_s) {
            instr_s = instr;
            prio_s = prio;
            if (!compat)
               instr_v = NULL;
         }
      }
   }

   assert(instr_v || instr_s);

   /* now, try a more complex insertion of an instruction into the scalar slot
    * TODO: if we are smart we can still insert if instr_v->src_count==3
    */
   if (!instr_s && instr_v->src_count < 3) {
      ir2_foreach_avail(instr) {
         if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
            continue;

         /* at this point, src_count should always be 2 */
         assert(instr->src_count == 2);

         if (scalarize_case1(ctx, instr, 0)) {
            instr_s = instr;
            break;
         }
         if (scalarize_case1(ctx, instr, 1)) {
            instr_s = instr;
            break;
         }
      }
   }

   /* free src registers */
   if (instr_v) {
      instr_v->need_emit = false;
      ra_src_free(ctx, instr_v);
   }

   if (instr_s) {
      instr_s->need_emit = false;
      ra_src_free(ctx, instr_s);
   }

   /* allocate dst registers */
   if (instr_v)
      ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v), instr_v->alu.write_mask);

   if (instr_s)
      ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s), instr_s->alu.write_mask);

   sched->instr = instr_v;
   sched->instr_s = instr_s;
   return block_idx;
}

/* scheduling: determine the order of instructions; the register state
 * after each scheduled slot is saved so that insert() can later find
 * free components for scalar MOVs
 */
static void schedule_instrs(struct ir2_context *ctx)
{
   struct ir2_sched_instr *sched;
   int block_idx;

   /* allocate input registers */
   for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
      if (ctx->input[idx].initialized)
         ra_reg(ctx, &ctx->input[idx], idx, false, 0);

   for (;;) {
      sched = &ctx->instr_sched[ctx->instr_sched_count++];
      block_idx = sched_next(ctx, sched);
      if (block_idx < 0)
         break;
      memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));

      /* catch texture fetch after scheduling and insert the
       * SET_TEX_LOD right before it if necessary
       * TODO clean this up
       */
      struct ir2_instr *instr = sched->instr, *tex_lod;
      if (instr && instr->type == IR2_FETCH &&
          instr->fetch.opc == TEX_FETCH && instr->src_count == 2) {
         /* generate the SET_TEX_LOD instruction */
         tex_lod = &ctx->instr[ctx->instr_count++];
         tex_lod->type = IR2_FETCH;
         tex_lod->block_idx = instr->block_idx;
         tex_lod->pred = instr->pred;
         tex_lod->fetch.opc = TEX_SET_TEX_LOD;
         tex_lod->src[0] = instr->src[1];
         tex_lod->src_count = 1;

         /* shift the TEX_FETCH into the next slot and put SET_TEX_LOD
          * in its place
          */
         sched[1] = sched[0];
         sched->instr = tex_lod;
         ctx->instr_sched_count++;
      }

      bool free_block = true;
      ir2_foreach_instr(instr, ctx)
         free_block &= instr->block_idx != block_idx;
      if (free_block)
         ra_block_free(ctx, block_idx);
   }
   ctx->instr_sched_count--;
}

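/* compile one variant of a shader; when compiling a vertex shader, "fp" is
 * the fragment shader to link against (a vertex shader compiled without
 * one is the binning variant)
 */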
void
ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
            struct fd2_shader_stateobj *fp)
{
   struct ir2_context ctx = { };
   bool binning = !fp && so->type == MESA_SHADER_VERTEX;

   if (fp)
      so->variant[variant].f = fp->variant[0].f;

   ctx.so = so;
   ctx.info = &so->variant[variant].info;
   ctx.f = &so->variant[variant].f;
   ctx.info->max_reg = -1;

   /* convert nir to internal representation */
   ir2_nir_compile(&ctx, binning);

   /* copy propagate srcs */
   cp_src(&ctx);

   /* get ref_counts and kill non-needed instructions */
   ra_count_refs(&ctx);

   /* remove movs used to write outputs */
   cp_export(&ctx);

   /* instruction order.. and vector->scalar conversions */
   schedule_instrs(&ctx);

   /* finally, assemble to bitcode */
   assemble(&ctx, binning);
}