1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
4 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 * Rob Clark <robclark@freedesktop.org>
29 #include "pipe/p_shader_tokens.h"
30 #include "util/u_math.h"
33 #include "ir3_visitor.h"
36 * Register Assignment:
38 * NOTE: currently only works on a single basic block.. need to think
39 * about how multiple basic blocks are going to get scheduled. But
40 * I think I want to re-arrange how blocks work, ie. get rid of the
41 * block nesting thing..
43 * NOTE: we could do register coalescing (eliminate moves) as part of
44 * the RA step.. OTOH I think we need to do scheduling before register
45 * assignment. And if we remove a mov that effects scheduling (unless
46 * we leave a placeholder nop, which seems lame), so I'm not really
47 * sure how practical this is to do both in a single stage. But OTOH
48 * I'm not really sure a sane way for the CP stage to realize when it
49 * cannot remove a mov due to multi-register constraints..
54 struct ir3_block
*block
;
65 /* sorta ugly way to retrofit half-precision support.. rather than
66 * passing extra param around, just OR in a high bit. All the low
67 * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
68 * will continue to work as long as you don't underflow (and that
69 * would go badly anyways).
71 #define REG_HALF 0x8000
/* Result of constraint calculation (ra_calc()): describes where the
 * instruction's dst sits within a contiguous register range, and how
 * many components that range spans.
 */
struct ir3_ra_assignment {
	int8_t  off;        /* offset of instruction dst within range */
	uint8_t num;        /* number of components for the range */
};
78 static void ra_assign(struct ir3_ra_ctx
*ctx
,
79 struct ir3_instruction
*assigner
, int num
);
80 static struct ir3_ra_assignment
ra_calc(struct ir3_instruction
*instr
);
83 * Register Allocation:
86 #define REG(n, wm, f) (struct ir3_register){ \
89 .wrmask = TGSI_WRITEMASK_ ## wm, \
92 /* check that the register exists, is a GPR and is not special (a0/p0) */
93 static struct ir3_register
* reg_check(struct ir3_instruction
*instr
, unsigned n
)
95 if ((n
< instr
->regs_count
) && reg_gpr(instr
->regs
[n
]))
96 return instr
->regs
[n
];
100 static int output_base(struct ir3_ra_ctx
*ctx
)
102 /* ugg, for fragment shader we need to have input at r0.x
103 * (or at least if there is a way to configure it, I can't
104 * see how because the blob driver always uses r0.x (ie.
107 if (ctx
->type
== SHADER_FRAGMENT
) {
108 if (ctx
->half_precision
)
109 return ctx
->frag_face
? 4 : 3;
110 return ctx
->frag_coord
? 8 : 4;
115 /* live means read before written */
116 static void compute_liveregs(struct ir3_ra_ctx
*ctx
,
117 struct ir3_instruction
*instr
, regmask_t
*liveregs
)
119 struct ir3_block
*block
= instr
->block
;
123 regmask_init(liveregs
);
124 regmask_init(&written
);
126 for (instr
= instr
->next
; instr
; instr
= instr
->next
) {
127 struct ir3_register
*r
;
132 /* check first src's read: */
133 for (j
= 1; j
< instr
->regs_count
; j
++) {
134 r
= reg_check(instr
, j
);
136 regmask_set_if_not(liveregs
, r
, &written
);
139 /* then dst written (if assigned already): */
140 if (instr
->flags
& IR3_INSTR_MARK
) {
141 r
= reg_check(instr
, 0);
143 regmask_set(&written
, r
);
147 /* be sure to account for output registers too: */
148 for (i
= 0; i
< block
->noutputs
; i
++) {
149 struct ir3_register reg
= REG(output_base(ctx
) + i
, X
, 0);
150 regmask_set_if_not(liveregs
, ®
, &written
);
154 /* calculate registers that are clobbered before last use of 'assigner'.
155 * This needs to be done backwards, although it could possibly be
156 * combined into compute_liveregs(). (Ie. compute_liveregs() could
157 * reverse the list, then do this part backwards reversing the list
158 * again back to original order.) Otoh, probably I should try to
159 * construct a proper interference graph instead.
161 * XXX this need to follow the same recursion path that is used for
162 * to rename/assign registers (ie. ra_assign_src()).. this is a bit
163 * ugly right now, maybe refactor into node iterator sort of things
164 * that iterates nodes in the correct order?
166 static bool compute_clobbers(struct ir3_ra_ctx
*ctx
,
167 struct ir3_instruction
*instr
, struct ir3_instruction
*assigner
,
171 bool live
= false, was_live
= false;
174 struct ir3_block
*block
= ctx
->block
;
176 /* if at the end, check outputs: */
177 for (i
= 0; i
< block
->noutputs
; i
++)
178 if (block
->outputs
[i
] == assigner
)
183 for (i
= 1; i
< instr
->regs_count
; i
++) {
184 struct ir3_register
*reg
= instr
->regs
[i
];
185 if ((reg
->flags
& IR3_REG_SSA
) && (reg
->instr
== assigner
)) {
186 if (is_meta(instr
)) {
187 switch (instr
->opc
) {
194 was_live
|= compute_clobbers(ctx
, instr
->next
,
206 was_live
|= compute_clobbers(ctx
, instr
->next
, assigner
, liveregs
);
208 if (was_live
&& (instr
->regs_count
> 0) &&
209 (instr
->flags
& IR3_INSTR_MARK
) &&
211 regmask_set(liveregs
, instr
->regs
[0]);
213 return live
|| was_live
;
216 static int find_available(regmask_t
*liveregs
, int size
, bool half
)
219 unsigned f
= half
? IR3_REG_HALF
: 0;
220 for (i
= 0; i
< MAX_REG
- size
; i
++) {
221 if (!regmask_get(liveregs
, ®(i
, X
, f
))) {
222 unsigned start
= i
++;
223 for (; (i
< MAX_REG
) && ((i
- start
) < size
); i
++)
224 if (regmask_get(liveregs
, ®(i
, X
, f
)))
226 if ((i
- start
) >= size
)
234 static int alloc_block(struct ir3_ra_ctx
*ctx
,
235 struct ir3_instruction
*instr
, int size
)
238 /* special case, allocating shader outputs. At this
239 * point, nothing is allocated, just start the shader
240 * outputs at r0.x and let compute_liveregs() take
241 * care of the rest from here:
245 struct ir3_register
*dst
= instr
->regs
[0];
248 compute_liveregs(ctx
, instr
, &liveregs
);
250 // XXX XXX XXX XXX XXX XXX XXX XXX XXX
251 // XXX hack.. maybe ra_calc should give us a list of
252 // instrs to compute_clobbers() on?
253 if (is_meta(instr
) && (instr
->opc
== OPC_META_INPUT
) &&
254 (instr
->regs_count
== 1)) {
255 unsigned i
, base
= instr
->regs
[0]->num
& ~0x3;
256 for (i
= 0; i
< 4; i
++) {
257 struct ir3_instruction
*in
= NULL
;
258 if ((base
+ i
) < ctx
->block
->ninputs
)
259 in
= ctx
->block
->inputs
[base
+ i
];
261 compute_clobbers(ctx
, in
->next
, in
, &liveregs
);
264 // XXX XXX XXX XXX XXX XXX XXX XXX XXX
265 compute_clobbers(ctx
, instr
->next
, instr
, &liveregs
);
267 return find_available(&liveregs
, size
,
268 !!(dst
->flags
& IR3_REG_HALF
));
273 * Constraint Calculation:
276 struct ra_calc_visitor
{
277 struct ir3_visitor base
;
278 struct ir3_ra_assignment a
;
/* downcast generic visitor to calc visitor: */
static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v)
{
	return (struct ra_calc_visitor *)v;
}
286 /* calculate register assignment for the instruction. If the register
287 * written by this instruction is required to be part of a range, to
288 * handle other (input/output/sam/bary.f/etc) contiguous register range
289 * constraints, that is calculated handled here.
291 static void ra_calc_dst(struct ir3_visitor
*v
,
292 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
294 struct ra_calc_visitor
*c
= ra_calc_visitor(v
);
305 ra_calc_dst_shader_input(struct ir3_visitor
*v
,
306 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
308 struct ra_calc_visitor
*c
= ra_calc_visitor(v
);
309 struct ir3_block
*block
= instr
->block
;
310 struct ir3_register
*dst
= instr
->regs
[0];
311 unsigned base
= dst
->num
& ~0x3;
314 assert(!(dst
->flags
& IR3_REG_IA
));
316 /* check what input components we need: */
317 for (i
= 0; i
< 4; i
++) {
318 unsigned idx
= base
+ i
;
319 if ((idx
< block
->ninputs
) && block
->inputs
[idx
])
323 c
->a
.off
= dst
->num
- base
;
327 static void ra_calc_src_fanin(struct ir3_visitor
*v
,
328 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
330 struct ra_calc_visitor
*c
= ra_calc_visitor(v
);
331 unsigned srcn
= ir3_instr_regno(instr
, reg
) - 1;
334 c
->a
.num
= MAX2(c
->a
.num
, instr
->regs_count
- 1);
337 static const struct ir3_visitor_funcs calc_visitor_funcs
= {
338 .instr
= ir3_visit_instr
,
339 .dst_shader_input
= ra_calc_dst_shader_input
,
340 .dst_fanout
= ra_calc_dst
,
341 .dst_fanin
= ra_calc_dst
,
343 .src_fanout
= ir3_visit_reg
,
344 .src_fanin
= ra_calc_src_fanin
,
345 .src
= ir3_visit_reg
,
348 static struct ir3_ra_assignment
ra_calc(struct ir3_instruction
*assigner
)
350 struct ra_calc_visitor v
= {
351 .base
.funcs
= &calc_visitor_funcs
,
354 ir3_visit_instr(&v
.base
, assigner
);
360 * Register Assignment:
363 struct ra_assign_visitor
{
364 struct ir3_visitor base
;
365 struct ir3_ra_ctx
*ctx
;
/* downcast generic visitor to assign visitor: */
static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
{
	return (struct ra_assign_visitor *)v;
}
374 static type_t
half_type(type_t type
)
377 case TYPE_F32
: return TYPE_F16
;
378 case TYPE_U32
: return TYPE_U16
;
379 case TYPE_S32
: return TYPE_S16
;
380 /* instructions may already be fixed up: */
391 /* some instructions need fix-up if dst register is half precision: */
392 static void fixup_half_instr_dst(struct ir3_instruction
*instr
)
394 switch (instr
->category
) {
395 case 1: /* move instructions */
396 instr
->cat1
.dst_type
= half_type(instr
->cat1
.dst_type
);
399 switch (instr
->opc
) {
401 instr
->opc
= OPC_MAD_F16
;
404 instr
->opc
= OPC_SEL_B16
;
407 instr
->opc
= OPC_SEL_S16
;
410 instr
->opc
= OPC_SEL_F16
;
413 instr
->opc
= OPC_SAD_S16
;
415 /* instructions may already be fixed up: */
428 instr
->cat5
.type
= half_type(instr
->cat5
.type
);
432 /* some instructions need fix-up if src register is half precision: */
433 static void fixup_half_instr_src(struct ir3_instruction
*instr
)
435 switch (instr
->category
) {
436 case 1: /* move instructions */
437 instr
->cat1
.src_type
= half_type(instr
->cat1
.src_type
);
442 static void ra_assign_reg(struct ir3_visitor
*v
,
443 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
445 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
447 if (is_flow(instr
) && (instr
->opc
== OPC_KILL
))
450 reg
->flags
&= ~IR3_REG_SSA
;
451 reg
->num
= a
->num
& ~REG_HALF
;
453 assert(reg
->num
>= 0);
455 if (a
->num
& REG_HALF
) {
456 reg
->flags
|= IR3_REG_HALF
;
457 /* if dst reg being assigned, patch up the instr: */
458 if (reg
== instr
->regs
[0])
459 fixup_half_instr_dst(instr
);
461 fixup_half_instr_src(instr
);
465 static void ra_assign_dst_shader_input(struct ir3_visitor
*v
,
466 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
468 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
469 unsigned i
, base
= reg
->num
& ~0x3;
470 int off
= base
- reg
->num
;
472 ra_assign_reg(v
, instr
, reg
);
473 reg
->flags
|= IR3_REG_IA
;
475 /* trigger assignment of all our companion input components: */
476 for (i
= 0; i
< 4; i
++) {
477 struct ir3_instruction
*in
= NULL
;
478 if ((base
+ i
) < instr
->block
->ninputs
)
479 in
= instr
->block
->inputs
[base
+ i
];
480 if (in
&& is_meta(in
) && (in
->opc
== OPC_META_INPUT
))
481 ra_assign(a
->ctx
, in
, a
->num
+ off
+ i
);
485 static void ra_assign_dst_fanout(struct ir3_visitor
*v
,
486 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
488 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
489 struct ir3_register
*src
= instr
->regs
[1];
490 ra_assign_reg(v
, instr
, reg
);
491 if (src
->flags
& IR3_REG_SSA
)
492 ra_assign(a
->ctx
, src
->instr
, a
->num
- instr
->fo
.off
);
495 static void ra_assign_src_fanout(struct ir3_visitor
*v
,
496 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
498 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
499 ra_assign_reg(v
, instr
, reg
);
500 ra_assign(a
->ctx
, instr
, a
->num
+ instr
->fo
.off
);
504 static void ra_assign_src_fanin(struct ir3_visitor
*v
,
505 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
507 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
508 unsigned j
, srcn
= ir3_instr_regno(instr
, reg
) - 1;
509 ra_assign_reg(v
, instr
, reg
);
510 ra_assign(a
->ctx
, instr
, a
->num
- srcn
);
511 for (j
= 1; j
< instr
->regs_count
; j
++) {
512 struct ir3_register
*reg
= instr
->regs
[j
];
513 if (reg
->flags
& IR3_REG_SSA
) /* could be renamed already */
514 ra_assign(a
->ctx
, reg
->instr
, a
->num
- srcn
+ j
- 1);
518 static const struct ir3_visitor_funcs assign_visitor_funcs
= {
519 .instr
= ir3_visit_instr
,
520 .dst_shader_input
= ra_assign_dst_shader_input
,
521 .dst_fanout
= ra_assign_dst_fanout
,
522 .dst_fanin
= ra_assign_reg
,
523 .dst
= ra_assign_reg
,
524 .src_fanout
= ra_assign_src_fanout
,
525 .src_fanin
= ra_assign_src_fanin
,
526 .src
= ra_assign_reg
,
529 static void ra_assign(struct ir3_ra_ctx
*ctx
,
530 struct ir3_instruction
*assigner
, int num
)
532 struct ra_assign_visitor v
= {
533 .base
.funcs
= &assign_visitor_funcs
,
538 /* if we've already visited this instruction, bail now: */
539 if (ir3_instr_check_mark(assigner
)) {
540 debug_assert(assigner
->regs
[0]->num
== (num
& ~REG_HALF
));
541 if (assigner
->regs
[0]->num
!= (num
& ~REG_HALF
)) {
542 /* impossible situation, should have been resolved
543 * at an earlier stage by inserting extra mov's:
550 ir3_visit_instr(&v
.base
, assigner
);
557 static void ir3_instr_ra(struct ir3_ra_ctx
*ctx
,
558 struct ir3_instruction
*instr
)
560 struct ir3_register
*dst
;
563 /* skip over nop's */
564 if (instr
->regs_count
== 0)
567 dst
= instr
->regs
[0];
569 /* if we've already visited this instruction, bail now: */
570 if (instr
->flags
& IR3_INSTR_MARK
)
573 /* allocate register(s): */
574 if (is_addr(instr
)) {
575 num
= instr
->regs
[2]->num
;
576 } else if (reg_gpr(dst
)) {
577 struct ir3_ra_assignment a
;
579 num
= alloc_block(ctx
, instr
, a
.num
) + a
.off
;
580 } else if (dst
->flags
& IR3_REG_ADDR
) {
581 dst
->flags
&= ~IR3_REG_ADDR
;
582 num
= regid(REG_A0
, 0) | REG_HALF
;
584 /* predicate register (p0).. etc */
588 ra_assign(ctx
, instr
, num
);
591 /* flatten into shader: */
592 // XXX this should probably be somewhere else:
593 static void legalize(struct ir3_ra_ctx
*ctx
, struct ir3_block
*block
)
595 struct ir3_instruction
*n
;
596 struct ir3
*shader
= block
->shader
;
597 struct ir3_instruction
*end
=
598 ir3_instr_create(block
, 0, OPC_END
);
599 struct ir3_instruction
*last_input
= NULL
;
600 struct ir3_instruction
*last_rel
= NULL
;
601 regmask_t needs_ss_war
; /* write after read */
605 regmask_init(&needs_ss_war
);
606 regmask_init(&needs_ss
);
607 regmask_init(&needs_sy
);
609 shader
->instrs_count
= 0;
611 for (n
= block
->head
; n
; n
= n
->next
) {
612 struct ir3_register
*reg
;
619 struct ir3_register
*inloc
= n
->regs
[1];
620 assert(inloc
->flags
& IR3_REG_IMMED
);
621 ctx
->max_bary
= MAX2(ctx
->max_bary
, inloc
->iim_val
);
624 for (i
= 1; i
< n
->regs_count
; i
++) {
629 /* TODO: we probably only need (ss) for alu
630 * instr consuming sfu result.. need to make
631 * some tests for both this and (sy)..
633 if (regmask_get(&needs_ss
, reg
)) {
634 n
->flags
|= IR3_INSTR_SS
;
635 regmask_init(&needs_ss
);
638 if (regmask_get(&needs_sy
, reg
)) {
639 n
->flags
|= IR3_INSTR_SY
;
640 regmask_init(&needs_sy
);
644 /* TODO: is it valid to have address reg loaded from a
645 * relative src (ie. mova a0, c<a0.x+4>)? If so, the
646 * last_rel check below should be moved ahead of this:
648 if (reg
->flags
& IR3_REG_RELATIV
)
652 if (n
->regs_count
> 0) {
654 if (regmask_get(&needs_ss_war
, reg
)) {
655 n
->flags
|= IR3_INSTR_SS
;
656 regmask_init(&needs_ss_war
); // ??? I assume?
659 if (last_rel
&& (reg
->num
== regid(REG_A0
, 0))) {
660 last_rel
->flags
|= IR3_INSTR_UL
;
665 /* cat5+ does not have an (ss) bit, if needed we need to
666 * insert a nop to carry the sync flag. Would be kinda
667 * clever if we were aware of this during scheduling, but
668 * this should be a pretty rare case:
670 if ((n
->flags
& IR3_INSTR_SS
) && (n
->category
>= 5)) {
671 struct ir3_instruction
*nop
;
672 nop
= ir3_instr_create(block
, 0, OPC_NOP
);
673 nop
->flags
|= IR3_INSTR_SS
;
674 n
->flags
&= ~IR3_INSTR_SS
;
677 /* need to be able to set (ss) on first instruction: */
678 if ((shader
->instrs_count
== 0) && (n
->category
>= 5))
679 ir3_instr_create(block
, 0, OPC_NOP
);
681 if (is_nop(n
) && shader
->instrs_count
) {
682 struct ir3_instruction
*last
=
683 shader
->instrs
[shader
->instrs_count
-1];
684 if (is_nop(last
) && (last
->repeat
< 5)) {
686 last
->flags
|= n
->flags
;
691 shader
->instrs
[shader
->instrs_count
++] = n
;
694 regmask_set(&needs_ss
, n
->regs
[0]);
697 /* this ends up being the # of samp instructions.. but that
698 * is ok, everything else only cares whether it is zero or
699 * not. We do this here, rather than when we encounter a
700 * SAMP decl, because (especially in binning pass shader)
701 * the samp instruction(s) could get eliminated if the
702 * result is not used.
704 ctx
->has_samp
= true;
705 regmask_set(&needs_sy
, n
->regs
[0]);
708 /* both tex/sfu appear to not always immediately consume
709 * their src register(s):
711 if (is_tex(n
) || is_sfu(n
)) {
712 for (i
= 1; i
< n
->regs_count
; i
++) {
715 regmask_set(&needs_ss_war
, reg
);
724 last_input
->regs
[0]->flags
|= IR3_REG_EI
;
727 last_rel
->flags
|= IR3_INSTR_UL
;
729 shader
->instrs
[shader
->instrs_count
++] = end
;
731 shader
->instrs
[0]->flags
|= IR3_INSTR_SS
| IR3_INSTR_SY
;
734 static int block_ra(struct ir3_ra_ctx
*ctx
, struct ir3_block
*block
)
736 struct ir3_instruction
*n
;
738 if (!block
->parent
) {
740 int base
, off
= output_base(ctx
);
742 base
= alloc_block(ctx
, NULL
, block
->noutputs
+ off
);
744 if (ctx
->half_precision
)
747 for (i
= 0; i
< block
->noutputs
; i
++)
748 if (block
->outputs
[i
] && !is_kill(block
->outputs
[i
]))
749 ra_assign(ctx
, block
->outputs
[i
], base
+ i
+ off
);
751 if (ctx
->type
== SHADER_FRAGMENT
) {
753 if (ctx
->frag_face
) {
754 /* if we have frag_face, it gets hr0.x */
755 ra_assign(ctx
, block
->inputs
[i
], REG_HALF
| 0);
758 for (j
= 0; i
< block
->ninputs
; i
++, j
++)
759 if (block
->inputs
[i
])
760 ra_assign(ctx
, block
->inputs
[i
], (base
& ~REG_HALF
) + j
);
762 for (i
= 0; i
< block
->ninputs
; i
++)
763 if (block
->inputs
[i
])
764 ir3_instr_ra(ctx
, block
->inputs
[i
]);
768 /* then loop over instruction list and assign registers:
772 ir3_instr_ra(ctx
, n
);
778 legalize(ctx
, block
);
783 int ir3_block_ra(struct ir3_block
*block
, enum shader_t type
,
784 bool half_precision
, bool frag_coord
, bool frag_face
,
785 bool *has_samp
, int *max_bary
)
787 struct ir3_ra_ctx ctx
= {
790 .half_precision
= half_precision
,
791 .frag_coord
= frag_coord
,
792 .frag_face
= frag_face
,
797 ir3_clear_mark(block
->shader
);
798 ret
= block_ra(&ctx
, block
);
799 *has_samp
= ctx
.has_samp
;
800 *max_bary
= ctx
.max_bary
;