freedreno/a3xx/compiler: handle kill properly (new compiler)
[mesa.git] / src / gallium / drivers / freedreno / a3xx / ir3_ra.c
1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
2
3 /*
4 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 *
25 * Authors:
26 * Rob Clark <robclark@freedesktop.org>
27 */
28
29 #include "pipe/p_shader_tokens.h"
30 #include "util/u_math.h"
31
32 #include "ir3.h"
33 #include "ir3_visitor.h"
34
35 /*
36 * Register Assignment:
37 *
38 * NOTE: currently only works on a single basic block.. need to think
39 * about how multiple basic blocks are going to get scheduled. But
40 * I think I want to re-arrange how blocks work, ie. get rid of the
41 * block nesting thing..
42 *
43 * NOTE: we could do register coalescing (eliminate moves) as part of
44 * the RA step.. OTOH I think we need to do scheduling before register
45 * assignment. And if we remove a mov that affects scheduling (unless
46 * we leave a placeholder nop, which seems lame), so I'm not really
47 * sure how practical this is to do both in a single stage. But OTOH
48 * I'm not really sure a sane way for the CP stage to realize when it
49 * cannot remove a mov due to multi-register constraints..
50 *
51 */
52
53 struct ir3_ra_ctx {
54 struct ir3_block *block;
55 enum shader_t type;
56 int cnt;
57 bool error;
58 };
59
/* Result of ra_calc(): describes the contiguous register range that has
 * to be allocated for an instruction, and where that instruction's dst
 * sits inside it.
 */
struct ir3_ra_assignment {
	/* offset of instruction dst within range */
	int8_t off;
	/* number of components for the range */
	uint8_t num;
};
64
/* forward declarations (ra_assign() and ra_calc() are used before they
 * are defined further down):
 */
static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr);
static void ra_assign(struct ir3_ra_ctx *ctx,
		struct ir3_instruction *assigner, int num);
68
69 /*
70 * Register Allocation:
71 */
72
/* Build a temporary 'struct ir3_register' (compound literal) for register
 * number 'n' with write mask TGSI_WRITEMASK_<wm>.  Used to query/update
 * regmask_t's for registers that have no ir3_register of their own.
 */
#define REG(n, wm) (struct ir3_register){ \
		/*.flags = ((so)->half_precision) ? IR3_REG_HALF : 0,*/ \
		.wrmask = TGSI_WRITEMASK_ ## wm, \
		.num = (n), \
	}
78
79 /* check that the register exists, is a GPR and is not special (a0/p0) */
80 static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n)
81 {
82 if ((n < instr->regs_count) && reg_gpr(instr->regs[n]))
83 return instr->regs[n];
84 return NULL;
85 }
86
87 static int output_base(struct ir3_ra_ctx *ctx)
88 {
89 /* ugg, for fragment shader we need to have input at r0.x
90 * (or at least if there is a way to configure it, I can't
91 * see how because the blob driver always uses r0.x (ie.
92 * all zeros)
93 */
94 if (ctx->type == SHADER_FRAGMENT)
95 return 2;
96 return 0;
97 }
98
99 /* live means read before written */
100 static void compute_liveregs(struct ir3_ra_ctx *ctx,
101 struct ir3_instruction *instr, regmask_t *liveregs)
102 {
103 struct ir3_block *block = instr->block;
104 regmask_t written;
105 unsigned i, j;
106
107 regmask_init(liveregs);
108 regmask_init(&written);
109
110 for (instr = instr->next; instr; instr = instr->next) {
111 struct ir3_register *r;
112
113 if (is_meta(instr))
114 continue;
115
116 /* check first src's read: */
117 for (j = 1; j < instr->regs_count; j++) {
118 r = reg_check(instr, j);
119 if (r)
120 regmask_set_if_not(liveregs, r, &written);
121 }
122
123 /* then dst written (if assigned already): */
124 if (instr->flags & IR3_INSTR_MARK) {
125 r = reg_check(instr, 0);
126 if (r)
127 regmask_set(&written, r);
128 }
129 }
130
131 /* be sure to account for output registers too: */
132 for (i = 0; i < block->noutputs; i++) {
133 struct ir3_register reg = REG(output_base(ctx) + i, X);
134 regmask_set_if_not(liveregs, &reg, &written);
135 }
136 }
137
138 /* calculate registers that are clobbered before last use of 'assigner'.
139 * This needs to be done backwards, although it could possibly be
140 * combined into compute_liveregs(). (Ie. compute_liveregs() could
141 * reverse the list, then do this part backwards reversing the list
142 * again back to original order.) Otoh, probably I should try to
143 * construct a proper interference graph instead.
144 *
145 * XXX this need to follow the same recursion path that is used for
146 * to rename/assign registers (ie. ra_assign_src()).. this is a bit
147 * ugly right now, maybe refactor into node iterator sort of things
148 * that iterates nodes in the correct order?
149 */
150 static bool compute_clobbers(struct ir3_ra_ctx *ctx,
151 struct ir3_instruction *instr, struct ir3_instruction *assigner,
152 regmask_t *liveregs)
153 {
154 unsigned i;
155 bool live = false, was_live = false;
156
157 if (instr == NULL) {
158 struct ir3_block *block = ctx->block;
159
160 /* if at the end, check outputs: */
161 for (i = 0; i < block->noutputs; i++)
162 if (block->outputs[i] == assigner)
163 return true;
164 return false;
165 }
166
167 for (i = 1; i < instr->regs_count; i++) {
168 struct ir3_register *reg = instr->regs[i];
169 if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) {
170 if (is_meta(instr)) {
171 switch (instr->opc) {
172 case OPC_META_INPUT:
173 // TODO
174 assert(0);
175 break;
176 case OPC_META_FO:
177 case OPC_META_FI:
178 was_live |= compute_clobbers(ctx, instr->next,
179 instr, liveregs);
180 break;
181 default:
182 break;
183 }
184 }
185 live = true;
186 break;
187 }
188 }
189
190 was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs);
191
192 if (was_live && (instr->regs_count > 0) &&
193 (instr->flags & IR3_INSTR_MARK) &&
194 !is_meta(instr))
195 regmask_set(liveregs, instr->regs[0]);
196
197 return live || was_live;
198 }
199
200 static int find_available(regmask_t *liveregs, int size)
201 {
202 unsigned i;
203 for (i = 0; i < MAX_REG - size; i++) {
204 if (!regmask_get(liveregs, &REG(i, X))) {
205 unsigned start = i++;
206 for (; (i < MAX_REG) && ((i - start) < size); i++)
207 if (regmask_get(liveregs, &REG(i, X)))
208 break;
209 if ((i - start) >= size)
210 return start;
211 }
212 }
213 assert(0);
214 return -1;
215 }
216
217 static int alloc_block(struct ir3_ra_ctx *ctx,
218 struct ir3_instruction *instr, int size)
219 {
220 if (!instr) {
221 /* special case, allocating shader outputs. At this
222 * point, nothing is allocated, just start the shader
223 * outputs at r0.x and let compute_liveregs() take
224 * care of the rest from here:
225 */
226 return 0;
227 } else {
228 regmask_t liveregs;
229 compute_liveregs(ctx, instr, &liveregs);
230
231 // XXX XXX XXX XXX XXX XXX XXX XXX XXX
232 // XXX hack.. maybe ra_calc should give us a list of
233 // instrs to compute_clobbers() on?
234 if (is_meta(instr) && (instr->opc == OPC_META_INPUT) &&
235 (instr->regs_count == 1)) {
236 unsigned i, base = instr->regs[0]->num & ~0x3;
237 for (i = 0; i < 4; i++) {
238 struct ir3_instruction *in = ctx->block->inputs[base + i];
239 if (in)
240 compute_clobbers(ctx, in->next, in, &liveregs);
241 }
242 } else
243 // XXX XXX XXX XXX XXX XXX XXX XXX XXX
244 compute_clobbers(ctx, instr->next, instr, &liveregs);
245 return find_available(&liveregs, size);
246 }
247 }
248
249 /*
250 * Constraint Calculation:
251 */
252
253 struct ra_calc_visitor {
254 struct ir3_visitor base;
255 struct ir3_ra_assignment a;
256 };
257
/* Downcast from the embedded base visitor to the ra_calc visitor. */
static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v)
{
	struct ra_calc_visitor *c = (struct ra_calc_visitor *)v;
	return c;
}
262
263 /* calculate register assignment for the instruction. If the register
264 * written by this instruction is required to be part of a range, to
265 * handle other (input/output/sam/bary.f/etc) contiguous register range
266 * constraints, that is calculated handled here.
267 */
268 static void ra_calc_dst(struct ir3_visitor *v,
269 struct ir3_instruction *instr, struct ir3_register *reg)
270 {
271 struct ra_calc_visitor *c = ra_calc_visitor(v);
272 if (is_tex(instr)) {
273 c->a.off = 0;
274 c->a.num = 4;
275 } else {
276 c->a.off = 0;
277 c->a.num = 1;
278 }
279 }
280
281 static void
282 ra_calc_dst_shader_input(struct ir3_visitor *v,
283 struct ir3_instruction *instr, struct ir3_register *reg)
284 {
285 struct ra_calc_visitor *c = ra_calc_visitor(v);
286 struct ir3_block *block = instr->block;
287 struct ir3_register *dst = instr->regs[0];
288 unsigned base = dst->num & ~0x3;
289 unsigned i, num = 0;
290
291 assert(!(dst->flags & IR3_REG_IA));
292
293 /* check what input components we need: */
294 for (i = 0; i < 4; i++) {
295 unsigned idx = base + i;
296 if ((idx < block->ninputs) && block->inputs[idx])
297 num = i + 1;
298 }
299
300 c->a.off = dst->num - base;
301 c->a.num = num;
302 }
303
304 static void ra_calc_src_fanin(struct ir3_visitor *v,
305 struct ir3_instruction *instr, struct ir3_register *reg)
306 {
307 struct ra_calc_visitor *c = ra_calc_visitor(v);
308 unsigned srcn = ir3_instr_regno(instr, reg) - 1;
309 c->a.off -= srcn;
310 c->a.num += srcn;
311 c->a.num = MAX2(c->a.num, instr->regs_count - 1);
312 }
313
314 static const struct ir3_visitor_funcs calc_visitor_funcs = {
315 .instr = ir3_visit_instr,
316 .dst_shader_input = ra_calc_dst_shader_input,
317 .dst_fanout = ra_calc_dst,
318 .dst_fanin = ra_calc_dst,
319 .dst = ra_calc_dst,
320 .src_fanout = ir3_visit_reg,
321 .src_fanin = ra_calc_src_fanin,
322 .src = ir3_visit_reg,
323 };
324
325 static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner)
326 {
327 struct ra_calc_visitor v = {
328 .base.funcs = &calc_visitor_funcs,
329 };
330
331 ir3_visit_instr(&v.base, assigner);
332
333 return v.a;
334 }
335
336 /*
337 * Register Assignment:
338 */
339
340 struct ra_assign_visitor {
341 struct ir3_visitor base;
342 struct ir3_ra_ctx *ctx;
343 int num;
344 };
345
/* Downcast from the embedded base visitor to the ra_assign visitor. */
static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
{
	struct ra_assign_visitor *a = (struct ra_assign_visitor *)v;
	return a;
}
350
351 static void ra_assign_reg(struct ir3_visitor *v,
352 struct ir3_instruction *instr, struct ir3_register *reg)
353 {
354 struct ra_assign_visitor *a = ra_assign_visitor(v);
355 reg->flags &= ~IR3_REG_SSA;
356 reg->num = a->num;
357 }
358
359 static void ra_assign_dst_shader_input(struct ir3_visitor *v,
360 struct ir3_instruction *instr, struct ir3_register *reg)
361 {
362 struct ra_assign_visitor *a = ra_assign_visitor(v);
363 unsigned i, base = reg->num & ~0x3;
364 int off = base - reg->num;
365
366 ra_assign_reg(v, instr, reg);
367 reg->flags |= IR3_REG_IA;
368
369 /* trigger assignment of all our companion input components: */
370 for (i = 0; i < 4; i++) {
371 struct ir3_instruction *in = instr->block->inputs[i+base];
372 if (in && is_meta(in) && (in->opc == OPC_META_INPUT))
373 ra_assign(a->ctx, in, a->num + off + i);
374 }
375 }
376
377 static void ra_assign_dst_fanout(struct ir3_visitor *v,
378 struct ir3_instruction *instr, struct ir3_register *reg)
379 {
380 struct ra_assign_visitor *a = ra_assign_visitor(v);
381 struct ir3_register *src = instr->regs[1];
382 ra_assign_reg(v, instr, reg);
383 if (src->flags & IR3_REG_SSA)
384 ra_assign(a->ctx, src->instr, a->num - instr->fo.off);
385 }
386
387 static void ra_assign_src_fanout(struct ir3_visitor *v,
388 struct ir3_instruction *instr, struct ir3_register *reg)
389 {
390 struct ra_assign_visitor *a = ra_assign_visitor(v);
391 ra_assign_reg(v, instr, reg);
392 ra_assign(a->ctx, instr, a->num + instr->fo.off);
393 }
394
395
396 static void ra_assign_src_fanin(struct ir3_visitor *v,
397 struct ir3_instruction *instr, struct ir3_register *reg)
398 {
399 struct ra_assign_visitor *a = ra_assign_visitor(v);
400 unsigned j, srcn = ir3_instr_regno(instr, reg) - 1;
401 ra_assign_reg(v, instr, reg);
402 ra_assign(a->ctx, instr, a->num - srcn);
403 for (j = 1; j < instr->regs_count; j++) {
404 struct ir3_register *reg = instr->regs[j];
405 if (reg->flags & IR3_REG_SSA) /* could be renamed already */
406 ra_assign(a->ctx, reg->instr, a->num - srcn + j - 1);
407 }
408 }
409
410 static const struct ir3_visitor_funcs assign_visitor_funcs = {
411 .instr = ir3_visit_instr,
412 .dst_shader_input = ra_assign_dst_shader_input,
413 .dst_fanout = ra_assign_dst_fanout,
414 .dst_fanin = ra_assign_reg,
415 .dst = ra_assign_reg,
416 .src_fanout = ra_assign_src_fanout,
417 .src_fanin = ra_assign_src_fanin,
418 .src = ra_assign_reg,
419 };
420
421 static void ra_assign(struct ir3_ra_ctx *ctx,
422 struct ir3_instruction *assigner, int num)
423 {
424 struct ra_assign_visitor v = {
425 .base.funcs = &assign_visitor_funcs,
426 .ctx = ctx,
427 .num = num,
428 };
429
430 /* if we've already visited this instruction, bail now: */
431 if (ir3_instr_check_mark(assigner)) {
432 debug_assert(assigner->regs[0]->num == num);
433 if (assigner->regs[0]->num != num) {
434 /* impossible situation, should have been resolved
435 * at an earlier stage by inserting extra mov's:
436 */
437 ctx->error = true;
438 }
439 return;
440 }
441
442 ir3_visit_instr(&v.base, assigner);
443 }
444
445 /*
446 *
447 */
448
449 static void ir3_instr_ra(struct ir3_ra_ctx *ctx,
450 struct ir3_instruction *instr)
451 {
452 struct ir3_ra_assignment a;
453 unsigned num;
454
455 /* skip over nop's */
456 if (instr->regs_count == 0)
457 return;
458
459 /* skip writes to a0, p0, etc */
460 if (!reg_gpr(instr->regs[0]))
461 return;
462
463 /* if we've already visited this instruction, bail now: */
464 if (instr->flags & IR3_INSTR_MARK)
465 return;
466
467 /* allocate register(s): */
468 a = ra_calc(instr);
469 num = alloc_block(ctx, instr, a.num) + a.off;
470
471 ra_assign(ctx, instr, num);
472 }
473
474 /* flatten into shader: */
475 // XXX this should probably be somewhere else:
476 static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block)
477 {
478 struct ir3_instruction *n;
479 struct ir3_shader *shader = block->shader;
480 struct ir3_instruction *end =
481 ir3_instr_create(block, 0, OPC_END);
482 struct ir3_instruction *last_input = NULL;
483 regmask_t needs_ss;
484 regmask_t needs_sy;
485
486 regmask_init(&needs_ss);
487 regmask_init(&needs_sy);
488
489 shader->instrs_count = 0;
490
491 for (n = block->head; n; n = n->next) {
492 unsigned i;
493
494 if (is_meta(n))
495 continue;
496
497 for (i = 1; i < n->regs_count; i++) {
498 struct ir3_register *reg = n->regs[i];
499
500 if (reg_gpr(reg)) {
501
502 /* TODO: we probably only need (ss) for alu
503 * instr consuming sfu result.. need to make
504 * some tests for both this and (sy)..
505 */
506 if (regmask_get(&needs_ss, reg)) {
507 n->flags |= IR3_INSTR_SS;
508 regmask_init(&needs_ss);
509 }
510
511 if (regmask_get(&needs_sy, reg)) {
512 n->flags |= IR3_INSTR_SY;
513 regmask_init(&needs_sy);
514 }
515 }
516 }
517
518 shader->instrs[shader->instrs_count++] = n;
519
520 if (is_sfu(n))
521 regmask_set(&needs_ss, n->regs[0]);
522 if (is_tex(n))
523 regmask_set(&needs_sy, n->regs[0]);
524 if (is_input(n))
525 last_input = n;
526 }
527
528 if (last_input)
529 last_input->regs[0]->flags |= IR3_REG_EI;
530
531 shader->instrs[shader->instrs_count++] = end;
532
533 shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
534 }
535
536 static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
537 {
538 struct ir3_instruction *n;
539
540 if (!block->parent) {
541 unsigned i;
542 int base, off = output_base(ctx);
543
544 base = alloc_block(ctx, NULL, block->noutputs + off);
545
546 for (i = 0; i < block->noutputs; i++)
547 if (block->outputs[i])
548 ra_assign(ctx, block->outputs[i], base + i + off);
549
550 if (ctx->type == SHADER_FRAGMENT) {
551 for (i = 0; i < block->ninputs; i++)
552 if (block->inputs[i])
553 ra_assign(ctx, block->inputs[i], base + i);
554 } else {
555 for (i = 0; i < block->ninputs; i++)
556 if (block->inputs[i])
557 ir3_instr_ra(ctx, block->inputs[i]);
558 }
559 }
560
561 /* then loop over instruction list and assign registers:
562 */
563 n = block->head;
564 while (n) {
565 ir3_instr_ra(ctx, n);
566 if (ctx->error)
567 return -1;
568 n = n->next;
569 }
570
571 legalize(ctx, block);
572
573 return 0;
574 }
575
576 int ir3_block_ra(struct ir3_block *block, enum shader_t type)
577 {
578 struct ir3_ra_ctx ctx = {
579 .block = block,
580 .type = type,
581 };
582 ir3_shader_clear_mark(block->shader);
583 return block_ra(&ctx, block);
584 }