freedreno/ir3: split out legalize pass
[mesa.git] / src / gallium / drivers / freedreno / ir3 / ir3_ra.c
1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
2
3 /*
4 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 *
25 * Authors:
26 * Rob Clark <robclark@freedesktop.org>
27 */
28
29 #include "pipe/p_shader_tokens.h"
30 #include "util/u_math.h"
31
32 #include "ir3.h"
33 #include "ir3_visitor.h"
34
/*
 * Register Assignment:
 *
 * NOTE: currently only works on a single basic block.. need to think
 * about how multiple basic blocks are going to get scheduled.  But
 * I think I want to re-arrange how blocks work, ie. get rid of the
 * block nesting thing..
 *
 * NOTE: we could do register coalescing (eliminate moves) as part of
 * the RA step.. OTOH I think we need to do scheduling before register
 * assignment.  And if we remove a mov, that affects scheduling (unless
 * we leave a placeholder nop, which seems lame), so I'm not really
 * sure how practical it is to do both in a single stage.  But OTOH
 * I'm not really sure of a sane way for the CP stage to realize when
 * it cannot remove a mov due to multi-register constraints..
 *
 */
52
53 struct ir3_ra_ctx {
54 struct ir3_block *block;
55 enum shader_t type;
56 bool half_precision;
57 bool frag_coord;
58 bool frag_face;
59 int cnt;
60 bool error;
61 };
62
/* Compile-time switch for the debug dumps below (0 disables them). */
#define ra_debug 0

/* When ra_debug is non-zero, print "-- <msg>" and then dump the
 * instruction list starting at 'n'.
 */
#define ra_dump_list(msg, n) do { \
		if (ra_debug) { \
			debug_printf("-- " msg); \
			ir3_dump_instr_list(n); \
		} \
	} while (0)

/* When ra_debug is non-zero, print ">> <msg>" and then dump the single
 * instruction 'n'.
 */
#define ra_dump_instr(msg, n) do { \
		if (ra_debug) { \
			debug_printf(">> " msg); \
			ir3_dump_instr_single(n); \
		} \
	} while (0)
78
/* Half-precision support is retrofitted by OR'ing a high bit into the
 * register number rather than passing an extra parameter around.  All
 * the low-value arithmetic on the number (ie. +/- offset within a
 * contiguous vec4, etc) keeps working as long as it does not underflow
 * (and that would go badly anyways).
 */
#define REG_HALF 0x8000
86
/* Placement constraint computed by ra_calc() for one instruction. */
struct ir3_ra_assignment {
	/* offset of the instruction's dst within the register range */
	int8_t off;
	/* number of components (registers) in the range */
	uint8_t num;
};
91
/* Forward declarations: the allocation and assignment helpers below
 * refer to these before their definitions appear.
 */
static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr);
static void ra_assign(struct ir3_ra_ctx *ctx,
		struct ir3_instruction *assigner, int num);
95
96 /*
97 * Register Allocation:
98 */
99
/* Build a temporary ir3_register (compound literal) with register
 * number 'n', write mask TGSI_WRITEMASK_<wm> and flags 'f'.
 */
#define REG(n, wm, f) (struct ir3_register){ \
		.num = (n), \
		.wrmask = TGSI_WRITEMASK_ ## wm, \
		.flags = (f), \
	}
105
106 /* check that the register exists, is a GPR and is not special (a0/p0) */
107 static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n)
108 {
109 if ((n < instr->regs_count) && reg_gpr(instr->regs[n]) &&
110 !(instr->regs[n]->flags & IR3_REG_SSA))
111 return instr->regs[n];
112 return NULL;
113 }
114
115 static int output_base(struct ir3_ra_ctx *ctx)
116 {
117 /* ugg, for fragment shader we need to have input at r0.x
118 * (or at least if there is a way to configure it, I can't
119 * see how because the blob driver always uses r0.x (ie.
120 * all zeros)
121 */
122 if (ctx->type == SHADER_FRAGMENT) {
123 if (ctx->half_precision)
124 return ctx->frag_face ? 4 : 3;
125 return ctx->frag_coord ? 8 : 4;
126 }
127 return 0;
128 }
129
130 /* live means read before written */
131 static void compute_liveregs(struct ir3_ra_ctx *ctx,
132 struct ir3_instruction *instr, regmask_t *liveregs)
133 {
134 struct ir3_block *block = instr->block;
135 regmask_t written;
136 unsigned i, j;
137
138 regmask_init(liveregs);
139 regmask_init(&written);
140
141 for (instr = instr->next; instr; instr = instr->next) {
142 struct ir3_register *r;
143
144 if (is_meta(instr))
145 continue;
146
147 /* check first src's read: */
148 for (j = 1; j < instr->regs_count; j++) {
149 r = reg_check(instr, j);
150 if (r)
151 regmask_set_if_not(liveregs, r, &written);
152 }
153
154 /* then dst written (if assigned already): */
155 if (instr->flags & IR3_INSTR_MARK) {
156 r = reg_check(instr, 0);
157 if (r)
158 regmask_set(&written, r);
159 }
160 }
161
162 /* be sure to account for output registers too: */
163 for (i = 0; i < block->noutputs; i++) {
164 struct ir3_register reg = REG(output_base(ctx) + i, X, 0);
165 regmask_set_if_not(liveregs, &reg, &written);
166 }
167 }
168
169 /* calculate registers that are clobbered before last use of 'assigner'.
170 * This needs to be done backwards, although it could possibly be
171 * combined into compute_liveregs(). (Ie. compute_liveregs() could
172 * reverse the list, then do this part backwards reversing the list
173 * again back to original order.) Otoh, probably I should try to
174 * construct a proper interference graph instead.
175 *
176 * XXX this need to follow the same recursion path that is used for
177 * to rename/assign registers (ie. ra_assign_src()).. this is a bit
178 * ugly right now, maybe refactor into node iterator sort of things
179 * that iterates nodes in the correct order?
180 */
181 static bool compute_clobbers(struct ir3_ra_ctx *ctx,
182 struct ir3_instruction *instr, struct ir3_instruction *assigner,
183 regmask_t *liveregs)
184 {
185 unsigned i;
186 bool live = false, was_live = false;
187
188 if (instr == NULL) {
189 struct ir3_block *block = ctx->block;
190
191 /* if at the end, check outputs: */
192 for (i = 0; i < block->noutputs; i++)
193 if (block->outputs[i] == assigner)
194 return true;
195 return false;
196 }
197
198 for (i = 1; i < instr->regs_count; i++) {
199 struct ir3_register *reg = instr->regs[i];
200 if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) {
201 if (is_meta(instr)) {
202 switch (instr->opc) {
203 case OPC_META_INPUT:
204 // TODO
205 assert(0);
206 break;
207 case OPC_META_FO:
208 case OPC_META_FI:
209 was_live |= compute_clobbers(ctx, instr->next,
210 instr, liveregs);
211 break;
212 default:
213 break;
214 }
215 }
216 live = true;
217 break;
218 }
219 }
220
221 was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs);
222
223 if (was_live && (instr->regs_count > 0) &&
224 (instr->flags & IR3_INSTR_MARK) &&
225 !is_meta(instr))
226 regmask_set(liveregs, instr->regs[0]);
227
228 return live || was_live;
229 }
230
231 static int find_available(regmask_t *liveregs, int size, bool half)
232 {
233 unsigned i;
234 unsigned f = half ? IR3_REG_HALF : 0;
235 for (i = 0; i < MAX_REG - size; i++) {
236 if (!regmask_get(liveregs, &REG(i, X, f))) {
237 unsigned start = i++;
238 for (; (i < MAX_REG) && ((i - start) < size); i++)
239 if (regmask_get(liveregs, &REG(i, X, f)))
240 break;
241 if ((i - start) >= size)
242 return start;
243 }
244 }
245 assert(0);
246 return -1;
247 }
248
249 static int alloc_block(struct ir3_ra_ctx *ctx,
250 struct ir3_instruction *instr, int size)
251 {
252 if (!instr) {
253 /* special case, allocating shader outputs. At this
254 * point, nothing is allocated, just start the shader
255 * outputs at r0.x and let compute_liveregs() take
256 * care of the rest from here:
257 */
258 return 0;
259 } else {
260 struct ir3_register *dst = instr->regs[0];
261 regmask_t liveregs;
262
263 compute_liveregs(ctx, instr, &liveregs);
264
265 // XXX XXX XXX XXX XXX XXX XXX XXX XXX
266 // XXX hack.. maybe ra_calc should give us a list of
267 // instrs to compute_clobbers() on?
268 if (is_meta(instr) && (instr->opc == OPC_META_INPUT) &&
269 (instr->regs_count == 1)) {
270 unsigned i, base = instr->regs[0]->num & ~0x3;
271 for (i = 0; i < 4; i++) {
272 struct ir3_instruction *in = NULL;
273 if ((base + i) < ctx->block->ninputs)
274 in = ctx->block->inputs[base + i];
275 if (in)
276 compute_clobbers(ctx, in->next, in, &liveregs);
277 }
278 } else
279 // XXX XXX XXX XXX XXX XXX XXX XXX XXX
280 compute_clobbers(ctx, instr->next, instr, &liveregs);
281
282 return find_available(&liveregs, size,
283 !!(dst->flags & IR3_REG_HALF));
284 }
285 }
286
287 /*
288 * Constraint Calculation:
289 */
290
291 struct ra_calc_visitor {
292 struct ir3_visitor base;
293 struct ir3_ra_assignment a;
294 };
295
296 static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v)
297 {
298 return (struct ra_calc_visitor *)v;
299 }
300
301 /* calculate register assignment for the instruction. If the register
302 * written by this instruction is required to be part of a range, to
303 * handle other (input/output/sam/bary.f/etc) contiguous register range
304 * constraints, that is calculated handled here.
305 */
306 static void ra_calc_dst(struct ir3_visitor *v,
307 struct ir3_instruction *instr, struct ir3_register *reg)
308 {
309 struct ra_calc_visitor *c = ra_calc_visitor(v);
310 if (is_tex(instr)) {
311 c->a.off = 0;
312 c->a.num = 4;
313 } else {
314 c->a.off = 0;
315 c->a.num = 1;
316 }
317 }
318
319 static void
320 ra_calc_dst_shader_input(struct ir3_visitor *v,
321 struct ir3_instruction *instr, struct ir3_register *reg)
322 {
323 struct ra_calc_visitor *c = ra_calc_visitor(v);
324 struct ir3_block *block = instr->block;
325 struct ir3_register *dst = instr->regs[0];
326 unsigned base = dst->num & ~0x3;
327 unsigned i, num = 0;
328
329 assert(!(dst->flags & IR3_REG_IA));
330
331 /* check what input components we need: */
332 for (i = 0; i < 4; i++) {
333 unsigned idx = base + i;
334 if ((idx < block->ninputs) && block->inputs[idx])
335 num = i + 1;
336 }
337
338 c->a.off = dst->num - base;
339 c->a.num = num;
340 }
341
342 static void ra_calc_src_fanin(struct ir3_visitor *v,
343 struct ir3_instruction *instr, struct ir3_register *reg)
344 {
345 struct ra_calc_visitor *c = ra_calc_visitor(v);
346 unsigned srcn = ir3_instr_regno(instr, reg) - 1;
347 c->a.off += srcn;
348 c->a.num += srcn;
349 c->a.num = MAX2(c->a.num, instr->regs_count - 1);
350 }
351
352 static const struct ir3_visitor_funcs calc_visitor_funcs = {
353 .instr = ir3_visit_instr,
354 .dst_shader_input = ra_calc_dst_shader_input,
355 .dst_fanout = ra_calc_dst,
356 .dst_fanin = ra_calc_dst,
357 .dst = ra_calc_dst,
358 .src_fanout = ir3_visit_reg,
359 .src_fanin = ra_calc_src_fanin,
360 .src = ir3_visit_reg,
361 };
362
363 static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner)
364 {
365 struct ra_calc_visitor v = {
366 .base.funcs = &calc_visitor_funcs,
367 };
368
369 ir3_visit_instr(&v.base, assigner);
370
371 return v.a;
372 }
373
374 /*
375 * Register Assignment:
376 */
377
378 struct ra_assign_visitor {
379 struct ir3_visitor base;
380 struct ir3_ra_ctx *ctx;
381 int num;
382 };
383
384 static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
385 {
386 return (struct ra_assign_visitor *)v;
387 }
388
389 static type_t half_type(type_t type)
390 {
391 switch (type) {
392 case TYPE_F32: return TYPE_F16;
393 case TYPE_U32: return TYPE_U16;
394 case TYPE_S32: return TYPE_S16;
395 /* instructions may already be fixed up: */
396 case TYPE_F16:
397 case TYPE_U16:
398 case TYPE_S16:
399 return type;
400 default:
401 assert(0);
402 return ~0;
403 }
404 }
405
406 /* some instructions need fix-up if dst register is half precision: */
407 static void fixup_half_instr_dst(struct ir3_instruction *instr)
408 {
409 switch (instr->category) {
410 case 1: /* move instructions */
411 instr->cat1.dst_type = half_type(instr->cat1.dst_type);
412 break;
413 case 3:
414 switch (instr->opc) {
415 case OPC_MAD_F32:
416 instr->opc = OPC_MAD_F16;
417 break;
418 case OPC_SEL_B32:
419 instr->opc = OPC_SEL_B16;
420 break;
421 case OPC_SEL_S32:
422 instr->opc = OPC_SEL_S16;
423 break;
424 case OPC_SEL_F32:
425 instr->opc = OPC_SEL_F16;
426 break;
427 case OPC_SAD_S32:
428 instr->opc = OPC_SAD_S16;
429 break;
430 /* instructions may already be fixed up: */
431 case OPC_MAD_F16:
432 case OPC_SEL_B16:
433 case OPC_SEL_S16:
434 case OPC_SEL_F16:
435 case OPC_SAD_S16:
436 break;
437 default:
438 assert(0);
439 break;
440 }
441 break;
442 case 5:
443 instr->cat5.type = half_type(instr->cat5.type);
444 break;
445 }
446 }
447 /* some instructions need fix-up if src register is half precision: */
448 static void fixup_half_instr_src(struct ir3_instruction *instr)
449 {
450 switch (instr->category) {
451 case 1: /* move instructions */
452 instr->cat1.src_type = half_type(instr->cat1.src_type);
453 break;
454 }
455 }
456
457 static void ra_assign_reg(struct ir3_visitor *v,
458 struct ir3_instruction *instr, struct ir3_register *reg)
459 {
460 struct ra_assign_visitor *a = ra_assign_visitor(v);
461
462 reg->flags &= ~IR3_REG_SSA;
463 reg->num = a->num & ~REG_HALF;
464
465 assert(reg->num >= 0);
466
467 if (a->num & REG_HALF) {
468 reg->flags |= IR3_REG_HALF;
469 /* if dst reg being assigned, patch up the instr: */
470 if (reg == instr->regs[0])
471 fixup_half_instr_dst(instr);
472 else
473 fixup_half_instr_src(instr);
474 }
475 }
476
477 static void ra_assign_dst_shader_input(struct ir3_visitor *v,
478 struct ir3_instruction *instr, struct ir3_register *reg)
479 {
480 struct ra_assign_visitor *a = ra_assign_visitor(v);
481 unsigned i, base = reg->num & ~0x3;
482 int off = base - reg->num;
483
484 ra_assign_reg(v, instr, reg);
485 reg->flags |= IR3_REG_IA;
486
487 /* trigger assignment of all our companion input components: */
488 for (i = 0; i < 4; i++) {
489 struct ir3_instruction *in = NULL;
490 if ((base + i) < instr->block->ninputs)
491 in = instr->block->inputs[base + i];
492 if (in && is_meta(in) && (in->opc == OPC_META_INPUT))
493 ra_assign(a->ctx, in, a->num + off + i);
494 }
495 }
496
497 static void ra_assign_dst_fanout(struct ir3_visitor *v,
498 struct ir3_instruction *instr, struct ir3_register *reg)
499 {
500 struct ra_assign_visitor *a = ra_assign_visitor(v);
501 struct ir3_register *src = instr->regs[1];
502 ra_assign_reg(v, instr, reg);
503 if (src->flags & IR3_REG_SSA)
504 ra_assign(a->ctx, src->instr, a->num - instr->fo.off);
505 }
506
507 static void ra_assign_src_fanout(struct ir3_visitor *v,
508 struct ir3_instruction *instr, struct ir3_register *reg)
509 {
510 struct ra_assign_visitor *a = ra_assign_visitor(v);
511 ra_assign_reg(v, instr, reg);
512 ra_assign(a->ctx, instr, a->num + instr->fo.off);
513 }
514
515
516 static void ra_assign_src_fanin(struct ir3_visitor *v,
517 struct ir3_instruction *instr, struct ir3_register *reg)
518 {
519 struct ra_assign_visitor *a = ra_assign_visitor(v);
520 unsigned j, srcn = ir3_instr_regno(instr, reg) - 1;
521 ra_assign_reg(v, instr, reg);
522 ra_assign(a->ctx, instr, a->num - srcn);
523 for (j = 1; j < instr->regs_count; j++) {
524 struct ir3_register *reg = instr->regs[j];
525 if (reg->flags & IR3_REG_SSA) /* could be renamed already */
526 ra_assign(a->ctx, reg->instr, a->num - srcn + j - 1);
527 }
528 }
529
530 static const struct ir3_visitor_funcs assign_visitor_funcs = {
531 .instr = ir3_visit_instr,
532 .dst_shader_input = ra_assign_dst_shader_input,
533 .dst_fanout = ra_assign_dst_fanout,
534 .dst_fanin = ra_assign_reg,
535 .dst = ra_assign_reg,
536 .src_fanout = ra_assign_src_fanout,
537 .src_fanin = ra_assign_src_fanin,
538 .src = ra_assign_reg,
539 };
540
541 static void ra_assign(struct ir3_ra_ctx *ctx,
542 struct ir3_instruction *assigner, int num)
543 {
544 struct ra_assign_visitor v = {
545 .base.funcs = &assign_visitor_funcs,
546 .ctx = ctx,
547 .num = num,
548 };
549
550 /* if we've already visited this instruction, bail now: */
551 if (ir3_instr_check_mark(assigner)) {
552 debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
553 if (assigner->regs[0]->num != (num & ~REG_HALF)) {
554 /* impossible situation, should have been resolved
555 * at an earlier stage by inserting extra mov's:
556 */
557 ctx->error = true;
558 }
559 return;
560 }
561
562 ir3_visit_instr(&v.base, assigner);
563 }
564
565 /*
566 *
567 */
568
569 static void ir3_instr_ra(struct ir3_ra_ctx *ctx,
570 struct ir3_instruction *instr)
571 {
572 struct ir3_register *dst;
573 unsigned num;
574
575 /* skip over nop's */
576 if (instr->regs_count == 0)
577 return;
578
579 dst = instr->regs[0];
580
581 /* if we've already visited this instruction, bail now: */
582 if (instr->flags & IR3_INSTR_MARK)
583 return;
584
585 /* allocate register(s): */
586 if (is_addr(instr)) {
587 num = instr->regs[2]->num;
588 } else if (reg_gpr(dst)) {
589 struct ir3_ra_assignment a;
590 a = ra_calc(instr);
591 num = alloc_block(ctx, instr, a.num) + a.off;
592 } else if (dst->flags & IR3_REG_ADDR) {
593 dst->flags &= ~IR3_REG_ADDR;
594 num = regid(REG_A0, 0) | REG_HALF;
595 } else {
596 /* predicate register (p0).. etc */
597 num = regid(REG_P0, 0);
598 debug_assert(dst->num == num);
599 }
600
601 ra_assign(ctx, instr, num);
602 }
603
604 static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
605 {
606 struct ir3_instruction *n;
607
608 ra_dump_list("before:\n", block->head);
609
610 if (!block->parent) {
611 unsigned i, j;
612 int base, off = output_base(ctx);
613
614 base = alloc_block(ctx, NULL, block->noutputs + off);
615
616 if (ctx->half_precision)
617 base |= REG_HALF;
618
619 for (i = 0; i < block->noutputs; i++)
620 if (block->outputs[i] && !is_kill(block->outputs[i]))
621 ra_assign(ctx, block->outputs[i], base + i + off);
622
623 if (ctx->type == SHADER_FRAGMENT) {
624 i = 0;
625 if (ctx->frag_face) {
626 /* if we have frag_face, it gets hr0.x */
627 ra_assign(ctx, block->inputs[i], REG_HALF | 0);
628 i += 4;
629 }
630 for (j = 0; i < block->ninputs; i++, j++)
631 if (block->inputs[i])
632 ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + j);
633 } else {
634 for (i = 0; i < block->ninputs; i++)
635 if (block->inputs[i])
636 ir3_instr_ra(ctx, block->inputs[i]);
637 }
638 }
639
640 ra_dump_list("after:\n", block->head);
641
642 /* then loop over instruction list and assign registers:
643 */
644 for (n = block->head; n; n = n->next) {
645 ra_dump_instr("ASSIGN: ", n);
646 ir3_instr_ra(ctx, n);
647 if (ctx->error)
648 return -1;
649 ra_dump_list("-------", block->head);
650 }
651
652 return 0;
653 }
654
655 int ir3_block_ra(struct ir3_block *block, enum shader_t type,
656 bool half_precision, bool frag_coord, bool frag_face)
657 {
658 struct ir3_instruction *n;
659 struct ir3_ra_ctx ctx = {
660 .block = block,
661 .type = type,
662 .half_precision = half_precision,
663 .frag_coord = frag_coord,
664 .frag_face = frag_face,
665 };
666 int ret;
667
668 /* mark dst registers w/ SSA flag so we can see which
669 * have been assigned so far:
670 */
671 for (n = block->head; n; n = n->next)
672 if (n->regs_count > 0)
673 n->regs[0]->flags |= IR3_REG_SSA;
674
675 ir3_clear_mark(block->shader);
676 ret = block_ra(&ctx, block);
677
678 return ret;
679 }