freedreno/ir3: drop instr_clone() stuff
[mesa.git] / src / gallium / drivers / freedreno / ir3 / ir3_ra.c
1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
2
3 /*
4 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 *
25 * Authors:
26 * Rob Clark <robclark@freedesktop.org>
27 */
28
29 #include "pipe/p_shader_tokens.h"
30 #include "util/u_math.h"
31
32 #include "ir3.h"
33 #include "ir3_visitor.h"
34
/*
 * Register Assignment:
 *
 * NOTE: currently only works on a single basic block.. need to think
 * about how multiple basic blocks are going to get scheduled.  But
 * I think I want to re-arrange how blocks work, ie. get rid of the
 * block nesting thing..
 *
 * NOTE: we could do register coalescing (eliminate moves) as part of
 * the RA step.. OTOH I think we need to do scheduling before register
 * assignment, and removing a mov affects scheduling (unless we leave
 * a placeholder nop, which seems lame), so I'm not really sure how
 * practical it is to do both in a single stage.  But OTOH I'm not
 * really sure of a sane way for the CP stage to realize when it
 * cannot remove a mov due to multi-register constraints..
 *
 */
52
53 struct ir3_ra_ctx {
54 struct ir3_block *block;
55 enum shader_t type;
56 bool half_precision;
57 bool frag_coord;
58 bool frag_face;
59 int cnt;
60 bool error;
61 };
62
/* ra_debug is non-zero when optimizer debug messages are requested;
 * in non-DEBUG builds it is the constant 0, so the dump helpers
 * below never print anything.
 */
#ifndef DEBUG
#  define ra_debug 0
#else
#  include "freedreno_util.h"
#  define ra_debug (fd_mesa_debug & FD_DBG_OPTMSGS)
#endif

/* print msg (prefixed with "-- ") followed by the instruction list n */
#define ra_dump_list(msg, n) do {			\
		if (ra_debug) {				\
			debug_printf("-- " msg);	\
			ir3_dump_instr_list(n);		\
		}					\
	} while (0)

/* print msg (prefixed with ">> ") followed by the single instruction n */
#define ra_dump_instr(msg, n) do {			\
		if (ra_debug) {				\
			debug_printf(">> " msg);	\
			ir3_dump_instr_single(n);	\
		}					\
	} while (0)
83
/* Half-precision support is retrofitted by OR'ing a high bit into the
 * register number instead of passing an extra parameter around.  Low
 * value arithmetic on the number (ie. +/- an offset within a contiguous
 * vec4, etc) keeps working as long as it does not underflow (and that
 * would go badly anyways).
 */
#define REG_HALF 0x8000
91
/* Result of the constraint calculation for one instruction's dst. */
struct ir3_ra_assignment {
	int8_t  off;    /* offset of instruction dst within range */
	uint8_t num;    /* number of components for the range */
};

/* forward declarations for functions defined further down: */
static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr);
static void ra_assign(struct ir3_ra_ctx *ctx,
		struct ir3_instruction *assigner, int num);
100
101 /*
102 * Register Allocation:
103 */
104
/* Build a temporary struct ir3_register (compound literal) with register
 * number n, write mask TGSI_WRITEMASK_<wm> and flags f.
 */
#define REG(n, wm, f) (struct ir3_register){		\
		.num    = (n),				\
		.wrmask = TGSI_WRITEMASK_ ## wm,	\
		.flags  = (f),				\
	}
110
111 /* check that the register exists, is a GPR and is not special (a0/p0) */
112 static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n)
113 {
114 if ((n < instr->regs_count) && reg_gpr(instr->regs[n]) &&
115 !(instr->regs[n]->flags & IR3_REG_SSA))
116 return instr->regs[n];
117 return NULL;
118 }
119
120 static int output_base(struct ir3_ra_ctx *ctx)
121 {
122 /* ugg, for fragment shader we need to have input at r0.x
123 * (or at least if there is a way to configure it, I can't
124 * see how because the blob driver always uses r0.x (ie.
125 * all zeros)
126 */
127 if (ctx->type == SHADER_FRAGMENT) {
128 if (ctx->half_precision)
129 return ctx->frag_face ? 4 : 3;
130 return ctx->frag_coord ? 8 : 4;
131 }
132 return 0;
133 }
134
135 /* live means read before written */
136 static void compute_liveregs(struct ir3_ra_ctx *ctx,
137 struct ir3_instruction *instr, regmask_t *liveregs)
138 {
139 struct ir3_block *block = instr->block;
140 regmask_t written;
141 unsigned i, j;
142
143 regmask_init(liveregs);
144 regmask_init(&written);
145
146 for (instr = instr->next; instr; instr = instr->next) {
147 struct ir3_register *r;
148
149 if (is_meta(instr))
150 continue;
151
152 /* check first src's read: */
153 for (j = 1; j < instr->regs_count; j++) {
154 r = reg_check(instr, j);
155 if (r)
156 regmask_set_if_not(liveregs, r, &written);
157 }
158
159 /* then dst written (if assigned already): */
160 if (instr->flags & IR3_INSTR_MARK) {
161 r = reg_check(instr, 0);
162 if (r)
163 regmask_set(&written, r);
164 }
165 }
166
167 /* be sure to account for output registers too: */
168 for (i = 0; i < block->noutputs; i++) {
169 struct ir3_register reg = REG(output_base(ctx) + i, X, 0);
170 regmask_set_if_not(liveregs, &reg, &written);
171 }
172 }
173
174 /* calculate registers that are clobbered before last use of 'assigner'.
175 * This needs to be done backwards, although it could possibly be
176 * combined into compute_liveregs(). (Ie. compute_liveregs() could
177 * reverse the list, then do this part backwards reversing the list
178 * again back to original order.) Otoh, probably I should try to
179 * construct a proper interference graph instead.
180 *
181 * XXX this need to follow the same recursion path that is used for
182 * to rename/assign registers (ie. ra_assign_src()).. this is a bit
183 * ugly right now, maybe refactor into node iterator sort of things
184 * that iterates nodes in the correct order?
185 */
186 static bool compute_clobbers(struct ir3_ra_ctx *ctx,
187 struct ir3_instruction *instr, struct ir3_instruction *assigner,
188 regmask_t *liveregs)
189 {
190 unsigned i;
191 bool live = false, was_live = false;
192
193 if (instr == NULL) {
194 struct ir3_block *block = ctx->block;
195
196 /* if at the end, check outputs: */
197 for (i = 0; i < block->noutputs; i++)
198 if (block->outputs[i] == assigner)
199 return true;
200 return false;
201 }
202
203 for (i = 1; i < instr->regs_count; i++) {
204 struct ir3_register *reg = instr->regs[i];
205 if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) {
206 if (is_meta(instr)) {
207 switch (instr->opc) {
208 case OPC_META_INPUT:
209 // TODO
210 assert(0);
211 break;
212 case OPC_META_FO:
213 case OPC_META_FI:
214 was_live |= compute_clobbers(ctx, instr->next,
215 instr, liveregs);
216 break;
217 default:
218 break;
219 }
220 }
221 live = true;
222 break;
223 }
224 }
225
226 was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs);
227
228 if (was_live && (instr->regs_count > 0) &&
229 (instr->flags & IR3_INSTR_MARK) &&
230 !is_meta(instr))
231 regmask_set(liveregs, instr->regs[0]);
232
233 return live || was_live;
234 }
235
236 static int find_available(regmask_t *liveregs, int size, bool half)
237 {
238 unsigned i;
239 unsigned f = half ? IR3_REG_HALF : 0;
240 for (i = 0; i < MAX_REG - size; i++) {
241 if (!regmask_get(liveregs, &REG(i, X, f))) {
242 unsigned start = i++;
243 for (; (i < MAX_REG) && ((i - start) < size); i++)
244 if (regmask_get(liveregs, &REG(i, X, f)))
245 break;
246 if ((i - start) >= size)
247 return start;
248 }
249 }
250 assert(0);
251 return -1;
252 }
253
254 static int alloc_block(struct ir3_ra_ctx *ctx,
255 struct ir3_instruction *instr, int size)
256 {
257 if (!instr) {
258 /* special case, allocating shader outputs. At this
259 * point, nothing is allocated, just start the shader
260 * outputs at r0.x and let compute_liveregs() take
261 * care of the rest from here:
262 */
263 return 0;
264 } else {
265 struct ir3_register *dst = instr->regs[0];
266 regmask_t liveregs;
267
268 compute_liveregs(ctx, instr, &liveregs);
269
270 // XXX XXX XXX XXX XXX XXX XXX XXX XXX
271 // XXX hack.. maybe ra_calc should give us a list of
272 // instrs to compute_clobbers() on?
273 if (is_meta(instr) && (instr->opc == OPC_META_INPUT) &&
274 (instr->regs_count == 1)) {
275 unsigned i, base = instr->regs[0]->num & ~0x3;
276 for (i = 0; i < 4; i++) {
277 struct ir3_instruction *in = NULL;
278 if ((base + i) < ctx->block->ninputs)
279 in = ctx->block->inputs[base + i];
280 if (in)
281 compute_clobbers(ctx, in->next, in, &liveregs);
282 }
283 } else
284 // XXX XXX XXX XXX XXX XXX XXX XXX XXX
285 compute_clobbers(ctx, instr->next, instr, &liveregs);
286
287 return find_available(&liveregs, size,
288 !!(dst->flags & IR3_REG_HALF));
289 }
290 }
291
292 /*
293 * Constraint Calculation:
294 */
295
296 struct ra_calc_visitor {
297 struct ir3_visitor base;
298 struct ir3_ra_assignment a;
299 };
300
301 static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v)
302 {
303 return (struct ra_calc_visitor *)v;
304 }
305
306 /* calculate register assignment for the instruction. If the register
307 * written by this instruction is required to be part of a range, to
308 * handle other (input/output/sam/bary.f/etc) contiguous register range
309 * constraints, that is calculated handled here.
310 */
311 static void ra_calc_dst(struct ir3_visitor *v,
312 struct ir3_instruction *instr, struct ir3_register *reg)
313 {
314 struct ra_calc_visitor *c = ra_calc_visitor(v);
315 if (is_tex(instr)) {
316 c->a.off = 0;
317 c->a.num = 4;
318 } else {
319 c->a.off = 0;
320 c->a.num = 1;
321 }
322 }
323
324 static void
325 ra_calc_dst_shader_input(struct ir3_visitor *v,
326 struct ir3_instruction *instr, struct ir3_register *reg)
327 {
328 struct ra_calc_visitor *c = ra_calc_visitor(v);
329 struct ir3_block *block = instr->block;
330 struct ir3_register *dst = instr->regs[0];
331 unsigned base = dst->num & ~0x3;
332 unsigned i, num = 0;
333
334 assert(!(dst->flags & IR3_REG_IA));
335
336 /* check what input components we need: */
337 for (i = 0; i < 4; i++) {
338 unsigned idx = base + i;
339 if ((idx < block->ninputs) && block->inputs[idx])
340 num = i + 1;
341 }
342
343 c->a.off = dst->num - base;
344 c->a.num = num;
345 }
346
347 static void ra_calc_src_fanin(struct ir3_visitor *v,
348 struct ir3_instruction *instr, struct ir3_register *reg)
349 {
350 struct ra_calc_visitor *c = ra_calc_visitor(v);
351 unsigned srcn = ir3_instr_regno(instr, reg) - 1;
352 c->a.off += srcn;
353 c->a.num += srcn;
354 c->a.num = MAX2(c->a.num, instr->regs_count - 1);
355 }
356
357 static const struct ir3_visitor_funcs calc_visitor_funcs = {
358 .instr = ir3_visit_instr,
359 .dst_shader_input = ra_calc_dst_shader_input,
360 .dst_fanout = ra_calc_dst,
361 .dst_fanin = ra_calc_dst,
362 .dst = ra_calc_dst,
363 .src_fanout = ir3_visit_reg,
364 .src_fanin = ra_calc_src_fanin,
365 .src = ir3_visit_reg,
366 };
367
368 static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner)
369 {
370 struct ra_calc_visitor v = {
371 .base.funcs = &calc_visitor_funcs,
372 };
373
374 ir3_visit_instr(&v.base, assigner);
375
376 return v.a;
377 }
378
379 /*
380 * Register Assignment:
381 */
382
383 struct ra_assign_visitor {
384 struct ir3_visitor base;
385 struct ir3_ra_ctx *ctx;
386 int num;
387 };
388
389 static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
390 {
391 return (struct ra_assign_visitor *)v;
392 }
393
394 static type_t half_type(type_t type)
395 {
396 switch (type) {
397 case TYPE_F32: return TYPE_F16;
398 case TYPE_U32: return TYPE_U16;
399 case TYPE_S32: return TYPE_S16;
400 /* instructions may already be fixed up: */
401 case TYPE_F16:
402 case TYPE_U16:
403 case TYPE_S16:
404 return type;
405 default:
406 assert(0);
407 return ~0;
408 }
409 }
410
411 /* some instructions need fix-up if dst register is half precision: */
412 static void fixup_half_instr_dst(struct ir3_instruction *instr)
413 {
414 switch (instr->category) {
415 case 1: /* move instructions */
416 instr->cat1.dst_type = half_type(instr->cat1.dst_type);
417 break;
418 case 3:
419 switch (instr->opc) {
420 case OPC_MAD_F32:
421 instr->opc = OPC_MAD_F16;
422 break;
423 case OPC_SEL_B32:
424 instr->opc = OPC_SEL_B16;
425 break;
426 case OPC_SEL_S32:
427 instr->opc = OPC_SEL_S16;
428 break;
429 case OPC_SEL_F32:
430 instr->opc = OPC_SEL_F16;
431 break;
432 case OPC_SAD_S32:
433 instr->opc = OPC_SAD_S16;
434 break;
435 /* instructions may already be fixed up: */
436 case OPC_MAD_F16:
437 case OPC_SEL_B16:
438 case OPC_SEL_S16:
439 case OPC_SEL_F16:
440 case OPC_SAD_S16:
441 break;
442 default:
443 assert(0);
444 break;
445 }
446 break;
447 case 5:
448 instr->cat5.type = half_type(instr->cat5.type);
449 break;
450 }
451 }
452 /* some instructions need fix-up if src register is half precision: */
453 static void fixup_half_instr_src(struct ir3_instruction *instr)
454 {
455 switch (instr->category) {
456 case 1: /* move instructions */
457 instr->cat1.src_type = half_type(instr->cat1.src_type);
458 break;
459 }
460 }
461
462 static void ra_assign_reg(struct ir3_visitor *v,
463 struct ir3_instruction *instr, struct ir3_register *reg)
464 {
465 struct ra_assign_visitor *a = ra_assign_visitor(v);
466
467 reg->flags &= ~IR3_REG_SSA;
468 reg->num = a->num & ~REG_HALF;
469
470 assert(reg->num >= 0);
471
472 if (a->num & REG_HALF) {
473 reg->flags |= IR3_REG_HALF;
474 /* if dst reg being assigned, patch up the instr: */
475 if (reg == instr->regs[0])
476 fixup_half_instr_dst(instr);
477 else
478 fixup_half_instr_src(instr);
479 }
480 }
481
482 static void ra_assign_dst_shader_input(struct ir3_visitor *v,
483 struct ir3_instruction *instr, struct ir3_register *reg)
484 {
485 struct ra_assign_visitor *a = ra_assign_visitor(v);
486 unsigned i, base = reg->num & ~0x3;
487 int off = base - reg->num;
488
489 ra_assign_reg(v, instr, reg);
490 reg->flags |= IR3_REG_IA;
491
492 /* trigger assignment of all our companion input components: */
493 for (i = 0; i < 4; i++) {
494 struct ir3_instruction *in = NULL;
495 if ((base + i) < instr->block->ninputs)
496 in = instr->block->inputs[base + i];
497 if (in && is_meta(in) && (in->opc == OPC_META_INPUT))
498 ra_assign(a->ctx, in, a->num + off + i);
499 }
500 }
501
502 static void ra_assign_dst_fanout(struct ir3_visitor *v,
503 struct ir3_instruction *instr, struct ir3_register *reg)
504 {
505 struct ra_assign_visitor *a = ra_assign_visitor(v);
506 struct ir3_register *src = instr->regs[1];
507 ra_assign_reg(v, instr, reg);
508 if (src->flags & IR3_REG_SSA)
509 ra_assign(a->ctx, src->instr, a->num - instr->fo.off);
510 }
511
512 static void ra_assign_src_fanout(struct ir3_visitor *v,
513 struct ir3_instruction *instr, struct ir3_register *reg)
514 {
515 struct ra_assign_visitor *a = ra_assign_visitor(v);
516 ra_assign_reg(v, instr, reg);
517 ra_assign(a->ctx, instr, a->num + instr->fo.off);
518 }
519
520
521 static void ra_assign_src_fanin(struct ir3_visitor *v,
522 struct ir3_instruction *instr, struct ir3_register *reg)
523 {
524 struct ra_assign_visitor *a = ra_assign_visitor(v);
525 unsigned j, srcn = ir3_instr_regno(instr, reg) - 1;
526 ra_assign_reg(v, instr, reg);
527 ra_assign(a->ctx, instr, a->num - srcn);
528 for (j = 1; j < instr->regs_count; j++) {
529 struct ir3_register *reg = instr->regs[j];
530 if (reg->flags & IR3_REG_SSA) /* could be renamed already */
531 ra_assign(a->ctx, reg->instr, a->num - srcn + j - 1);
532 }
533 }
534
535 static const struct ir3_visitor_funcs assign_visitor_funcs = {
536 .instr = ir3_visit_instr,
537 .dst_shader_input = ra_assign_dst_shader_input,
538 .dst_fanout = ra_assign_dst_fanout,
539 .dst_fanin = ra_assign_reg,
540 .dst = ra_assign_reg,
541 .src_fanout = ra_assign_src_fanout,
542 .src_fanin = ra_assign_src_fanin,
543 .src = ra_assign_reg,
544 };
545
546 static void ra_assign(struct ir3_ra_ctx *ctx,
547 struct ir3_instruction *assigner, int num)
548 {
549 struct ra_assign_visitor v = {
550 .base.funcs = &assign_visitor_funcs,
551 .ctx = ctx,
552 .num = num,
553 };
554
555 /* if we've already visited this instruction, bail now: */
556 if (ir3_instr_check_mark(assigner)) {
557 debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
558 if (assigner->regs[0]->num != (num & ~REG_HALF)) {
559 /* impossible situation, should have been resolved
560 * at an earlier stage by inserting extra mov's:
561 */
562 ctx->error = true;
563 }
564 return;
565 }
566
567 ir3_visit_instr(&v.base, assigner);
568 }
569
570 /*
571 *
572 */
573
574 static void ir3_instr_ra(struct ir3_ra_ctx *ctx,
575 struct ir3_instruction *instr)
576 {
577 struct ir3_register *dst;
578 unsigned num;
579
580 /* skip over nop's */
581 if (instr->regs_count == 0)
582 return;
583
584 dst = instr->regs[0];
585
586 /* if we've already visited this instruction, bail now: */
587 if (instr->flags & IR3_INSTR_MARK)
588 return;
589
590 /* allocate register(s): */
591 if (is_addr(instr)) {
592 num = instr->regs[2]->num;
593 } else if (reg_gpr(dst)) {
594 struct ir3_ra_assignment a;
595 a = ra_calc(instr);
596 num = alloc_block(ctx, instr, a.num) + a.off;
597 } else if (dst->flags & IR3_REG_ADDR) {
598 dst->flags &= ~IR3_REG_ADDR;
599 num = regid(REG_A0, 0) | REG_HALF;
600 } else {
601 /* predicate register (p0).. etc */
602 num = regid(REG_P0, 0);
603 debug_assert(dst->num == num);
604 }
605
606 ra_assign(ctx, instr, num);
607 }
608
609 static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
610 {
611 struct ir3_instruction *n;
612
613 ra_dump_list("before:\n", block->head);
614
615 if (!block->parent) {
616 unsigned i, j;
617 int base, off = output_base(ctx);
618
619 base = alloc_block(ctx, NULL, block->noutputs + off);
620
621 if (ctx->half_precision)
622 base |= REG_HALF;
623
624 for (i = 0; i < block->noutputs; i++)
625 if (block->outputs[i] && !is_kill(block->outputs[i]))
626 ra_assign(ctx, block->outputs[i], base + i + off);
627
628 if (ctx->type == SHADER_FRAGMENT) {
629 i = 0;
630 if (ctx->frag_face) {
631 /* if we have frag_face, it gets hr0.x */
632 ra_assign(ctx, block->inputs[i], REG_HALF | 0);
633 i += 4;
634 }
635 for (j = 0; i < block->ninputs; i++, j++)
636 if (block->inputs[i])
637 ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + j);
638 } else {
639 for (i = 0; i < block->ninputs; i++)
640 if (block->inputs[i])
641 ir3_instr_ra(ctx, block->inputs[i]);
642 }
643 }
644
645 ra_dump_list("after:\n", block->head);
646
647 /* then loop over instruction list and assign registers:
648 */
649 for (n = block->head; n; n = n->next) {
650 ra_dump_instr("ASSIGN: ", n);
651 ir3_instr_ra(ctx, n);
652 if (ctx->error)
653 return -1;
654 ra_dump_list("-------", block->head);
655 }
656
657 return 0;
658 }
659
660 int ir3_block_ra(struct ir3_block *block, enum shader_t type,
661 bool half_precision, bool frag_coord, bool frag_face)
662 {
663 struct ir3_instruction *n;
664 struct ir3_ra_ctx ctx = {
665 .block = block,
666 .type = type,
667 .half_precision = half_precision,
668 .frag_coord = frag_coord,
669 .frag_face = frag_face,
670 };
671 int ret;
672
673 /* mark dst registers w/ SSA flag so we can see which
674 * have been assigned so far:
675 */
676 for (n = block->head; n; n = n->next)
677 if (n->regs_count > 0)
678 n->regs[0]->flags |= IR3_REG_SSA;
679
680 ir3_clear_mark(block->shader);
681 ret = block_ra(&ctx, block);
682
683 return ret;
684 }