/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
29 #include "pipe/p_shader_tokens.h"
30 #include "util/u_math.h"
33 #include "ir3_visitor.h"
/*
 * Register Assignment:
 *
 * NOTE: currently only works on a single basic block.. need to think
 * about how multiple basic blocks are going to get scheduled.  But
 * I think I want to re-arrange how blocks work, ie. get rid of the
 * block nesting thing..
 *
 * NOTE: we could do register coalescing (eliminate moves) as part of
 * the RA step.. OTOH I think we need to do scheduling before register
 * assignment.  And if we remove a mov that effects scheduling (unless
 * we leave a placeholder nop, which seems lame), so I'm not really
 * sure how practical this is to do both in a single stage.  But OTOH
 * I'm not really sure a sane way for the CP stage to realize when it
 * cannot remove a mov due to multi-register constraints..
 */
54 struct ir3_block
*block
;
63 /* sorta ugly way to retrofit half-precision support.. rather than
64 * passing extra param around, just OR in a high bit. All the low
65 * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
66 * will continue to work as long as you don't underflow (and that
67 * would go badly anyways).
69 #define REG_HALF 0x8000
71 struct ir3_ra_assignment
{
72 int8_t off
; /* offset of instruction dst within range */
73 uint8_t num
; /* number of components for the range */
76 static void ra_assign(struct ir3_ra_ctx
*ctx
,
77 struct ir3_instruction
*assigner
, int num
);
78 static struct ir3_ra_assignment
ra_calc(struct ir3_instruction
*instr
);
81 * Register Allocation:
84 #define REG(n, wm) (struct ir3_register){ \
85 /*.flags = ((so)->half_precision) ? IR3_REG_HALF : 0,*/ \
87 .wrmask = TGSI_WRITEMASK_ ## wm, \
90 /* check that the register exists, is a GPR and is not special (a0/p0) */
91 static struct ir3_register
* reg_check(struct ir3_instruction
*instr
, unsigned n
)
93 if ((n
< instr
->regs_count
) && reg_gpr(instr
->regs
[n
]))
94 return instr
->regs
[n
];
98 static int output_base(struct ir3_ra_ctx
*ctx
)
100 /* ugg, for fragment shader we need to have input at r0.x
101 * (or at least if there is a way to configure it, I can't
102 * see how because the blob driver always uses r0.x (ie.
105 if (ctx
->type
== SHADER_FRAGMENT
) {
106 if (ctx
->half_precision
)
107 return ctx
->frag_face
? 1 : 0;
108 return ctx
->frag_coord
? 6 : 2;
113 /* live means read before written */
114 static void compute_liveregs(struct ir3_ra_ctx
*ctx
,
115 struct ir3_instruction
*instr
, regmask_t
*liveregs
)
117 struct ir3_block
*block
= instr
->block
;
121 regmask_init(liveregs
);
122 regmask_init(&written
);
124 for (instr
= instr
->next
; instr
; instr
= instr
->next
) {
125 struct ir3_register
*r
;
130 /* check first src's read: */
131 for (j
= 1; j
< instr
->regs_count
; j
++) {
132 r
= reg_check(instr
, j
);
134 regmask_set_if_not(liveregs
, r
, &written
);
137 /* then dst written (if assigned already): */
138 if (instr
->flags
& IR3_INSTR_MARK
) {
139 r
= reg_check(instr
, 0);
141 regmask_set(&written
, r
);
145 /* be sure to account for output registers too: */
146 for (i
= 0; i
< block
->noutputs
; i
++) {
147 struct ir3_register reg
= REG(output_base(ctx
) + i
, X
);
148 regmask_set_if_not(liveregs
, ®
, &written
);
152 /* calculate registers that are clobbered before last use of 'assigner'.
153 * This needs to be done backwards, although it could possibly be
154 * combined into compute_liveregs(). (Ie. compute_liveregs() could
155 * reverse the list, then do this part backwards reversing the list
156 * again back to original order.) Otoh, probably I should try to
157 * construct a proper interference graph instead.
159 * XXX this need to follow the same recursion path that is used for
160 * to rename/assign registers (ie. ra_assign_src()).. this is a bit
161 * ugly right now, maybe refactor into node iterator sort of things
162 * that iterates nodes in the correct order?
164 static bool compute_clobbers(struct ir3_ra_ctx
*ctx
,
165 struct ir3_instruction
*instr
, struct ir3_instruction
*assigner
,
169 bool live
= false, was_live
= false;
172 struct ir3_block
*block
= ctx
->block
;
174 /* if at the end, check outputs: */
175 for (i
= 0; i
< block
->noutputs
; i
++)
176 if (block
->outputs
[i
] == assigner
)
181 for (i
= 1; i
< instr
->regs_count
; i
++) {
182 struct ir3_register
*reg
= instr
->regs
[i
];
183 if ((reg
->flags
& IR3_REG_SSA
) && (reg
->instr
== assigner
)) {
184 if (is_meta(instr
)) {
185 switch (instr
->opc
) {
192 was_live
|= compute_clobbers(ctx
, instr
->next
,
204 was_live
|= compute_clobbers(ctx
, instr
->next
, assigner
, liveregs
);
206 if (was_live
&& (instr
->regs_count
> 0) &&
207 (instr
->flags
& IR3_INSTR_MARK
) &&
209 regmask_set(liveregs
, instr
->regs
[0]);
211 return live
|| was_live
;
214 static int find_available(regmask_t
*liveregs
, int size
)
217 for (i
= 0; i
< MAX_REG
- size
; i
++) {
218 if (!regmask_get(liveregs
, ®(i
, X
))) {
219 unsigned start
= i
++;
220 for (; (i
< MAX_REG
) && ((i
- start
) < size
); i
++)
221 if (regmask_get(liveregs
, ®(i
, X
)))
223 if ((i
- start
) >= size
)
231 static int alloc_block(struct ir3_ra_ctx
*ctx
,
232 struct ir3_instruction
*instr
, int size
)
235 /* special case, allocating shader outputs. At this
236 * point, nothing is allocated, just start the shader
237 * outputs at r0.x and let compute_liveregs() take
238 * care of the rest from here:
243 compute_liveregs(ctx
, instr
, &liveregs
);
245 // XXX XXX XXX XXX XXX XXX XXX XXX XXX
246 // XXX hack.. maybe ra_calc should give us a list of
247 // instrs to compute_clobbers() on?
248 if (is_meta(instr
) && (instr
->opc
== OPC_META_INPUT
) &&
249 (instr
->regs_count
== 1)) {
250 unsigned i
, base
= instr
->regs
[0]->num
& ~0x3;
251 for (i
= 0; i
< 4; i
++) {
252 struct ir3_instruction
*in
= ctx
->block
->inputs
[base
+ i
];
254 compute_clobbers(ctx
, in
->next
, in
, &liveregs
);
257 // XXX XXX XXX XXX XXX XXX XXX XXX XXX
258 compute_clobbers(ctx
, instr
->next
, instr
, &liveregs
);
259 return find_available(&liveregs
, size
);
264 * Constraint Calculation:
267 struct ra_calc_visitor
{
268 struct ir3_visitor base
;
269 struct ir3_ra_assignment a
;
272 static inline struct ra_calc_visitor
*ra_calc_visitor(struct ir3_visitor
*v
)
274 return (struct ra_calc_visitor
*)v
;
277 /* calculate register assignment for the instruction. If the register
278 * written by this instruction is required to be part of a range, to
279 * handle other (input/output/sam/bary.f/etc) contiguous register range
280 * constraints, that is calculated handled here.
282 static void ra_calc_dst(struct ir3_visitor
*v
,
283 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
285 struct ra_calc_visitor
*c
= ra_calc_visitor(v
);
296 ra_calc_dst_shader_input(struct ir3_visitor
*v
,
297 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
299 struct ra_calc_visitor
*c
= ra_calc_visitor(v
);
300 struct ir3_block
*block
= instr
->block
;
301 struct ir3_register
*dst
= instr
->regs
[0];
302 unsigned base
= dst
->num
& ~0x3;
305 assert(!(dst
->flags
& IR3_REG_IA
));
307 /* check what input components we need: */
308 for (i
= 0; i
< 4; i
++) {
309 unsigned idx
= base
+ i
;
310 if ((idx
< block
->ninputs
) && block
->inputs
[idx
])
314 c
->a
.off
= dst
->num
- base
;
318 static void ra_calc_src_fanin(struct ir3_visitor
*v
,
319 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
321 struct ra_calc_visitor
*c
= ra_calc_visitor(v
);
322 unsigned srcn
= ir3_instr_regno(instr
, reg
) - 1;
325 c
->a
.num
= MAX2(c
->a
.num
, instr
->regs_count
- 1);
328 static const struct ir3_visitor_funcs calc_visitor_funcs
= {
329 .instr
= ir3_visit_instr
,
330 .dst_shader_input
= ra_calc_dst_shader_input
,
331 .dst_fanout
= ra_calc_dst
,
332 .dst_fanin
= ra_calc_dst
,
334 .src_fanout
= ir3_visit_reg
,
335 .src_fanin
= ra_calc_src_fanin
,
336 .src
= ir3_visit_reg
,
339 static struct ir3_ra_assignment
ra_calc(struct ir3_instruction
*assigner
)
341 struct ra_calc_visitor v
= {
342 .base
.funcs
= &calc_visitor_funcs
,
345 ir3_visit_instr(&v
.base
, assigner
);
351 * Register Assignment:
354 struct ra_assign_visitor
{
355 struct ir3_visitor base
;
356 struct ir3_ra_ctx
*ctx
;
360 static inline struct ra_assign_visitor
*ra_assign_visitor(struct ir3_visitor
*v
)
362 return (struct ra_assign_visitor
*)v
;
365 static type_t
half_type(type_t type
)
368 case TYPE_F32
: return TYPE_F16
;
369 case TYPE_U32
: return TYPE_U16
;
370 case TYPE_S32
: return TYPE_S16
;
371 /* instructions may already be fixed up: */
382 /* some instructions need fix-up if dst register is half precision: */
383 static void fixup_half_instr_dst(struct ir3_instruction
*instr
)
385 switch (instr
->category
) {
386 case 1: /* move instructions */
387 instr
->cat1
.dst_type
= half_type(instr
->cat1
.dst_type
);
390 switch (instr
->opc
) {
392 instr
->opc
= OPC_MAD_F16
;
395 instr
->opc
= OPC_SEL_B16
;
398 instr
->opc
= OPC_SEL_S16
;
401 instr
->opc
= OPC_SEL_F16
;
404 instr
->opc
= OPC_SAD_S16
;
406 /* instructions may already be fixed up: */
419 instr
->cat5
.type
= half_type(instr
->cat5
.type
);
423 /* some instructions need fix-up if src register is half precision: */
424 static void fixup_half_instr_src(struct ir3_instruction
*instr
)
426 switch (instr
->category
) {
427 case 1: /* move instructions */
428 instr
->cat1
.src_type
= half_type(instr
->cat1
.src_type
);
433 static void ra_assign_reg(struct ir3_visitor
*v
,
434 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
436 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
438 if (is_flow(instr
) && (instr
->opc
== OPC_KILL
))
441 reg
->flags
&= ~IR3_REG_SSA
;
442 reg
->num
= a
->num
& ~REG_HALF
;
444 assert(reg
->num
>= 0);
446 if (a
->num
& REG_HALF
) {
447 reg
->flags
|= IR3_REG_HALF
;
448 /* if dst reg being assigned, patch up the instr: */
449 if (reg
== instr
->regs
[0])
450 fixup_half_instr_dst(instr
);
452 fixup_half_instr_src(instr
);
456 static void ra_assign_dst_shader_input(struct ir3_visitor
*v
,
457 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
459 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
460 unsigned i
, base
= reg
->num
& ~0x3;
461 int off
= base
- reg
->num
;
463 ra_assign_reg(v
, instr
, reg
);
464 reg
->flags
|= IR3_REG_IA
;
466 /* trigger assignment of all our companion input components: */
467 for (i
= 0; i
< 4; i
++) {
468 struct ir3_instruction
*in
= instr
->block
->inputs
[i
+base
];
469 if (in
&& is_meta(in
) && (in
->opc
== OPC_META_INPUT
))
470 ra_assign(a
->ctx
, in
, a
->num
+ off
+ i
);
474 static void ra_assign_dst_fanout(struct ir3_visitor
*v
,
475 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
477 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
478 struct ir3_register
*src
= instr
->regs
[1];
479 ra_assign_reg(v
, instr
, reg
);
480 if (src
->flags
& IR3_REG_SSA
)
481 ra_assign(a
->ctx
, src
->instr
, a
->num
- instr
->fo
.off
);
484 static void ra_assign_src_fanout(struct ir3_visitor
*v
,
485 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
487 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
488 ra_assign_reg(v
, instr
, reg
);
489 ra_assign(a
->ctx
, instr
, a
->num
+ instr
->fo
.off
);
493 static void ra_assign_src_fanin(struct ir3_visitor
*v
,
494 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
496 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
497 unsigned j
, srcn
= ir3_instr_regno(instr
, reg
) - 1;
498 ra_assign_reg(v
, instr
, reg
);
499 ra_assign(a
->ctx
, instr
, a
->num
- srcn
);
500 for (j
= 1; j
< instr
->regs_count
; j
++) {
501 struct ir3_register
*reg
= instr
->regs
[j
];
502 if (reg
->flags
& IR3_REG_SSA
) /* could be renamed already */
503 ra_assign(a
->ctx
, reg
->instr
, a
->num
- srcn
+ j
- 1);
507 static const struct ir3_visitor_funcs assign_visitor_funcs
= {
508 .instr
= ir3_visit_instr
,
509 .dst_shader_input
= ra_assign_dst_shader_input
,
510 .dst_fanout
= ra_assign_dst_fanout
,
511 .dst_fanin
= ra_assign_reg
,
512 .dst
= ra_assign_reg
,
513 .src_fanout
= ra_assign_src_fanout
,
514 .src_fanin
= ra_assign_src_fanin
,
515 .src
= ra_assign_reg
,
518 static void ra_assign(struct ir3_ra_ctx
*ctx
,
519 struct ir3_instruction
*assigner
, int num
)
521 struct ra_assign_visitor v
= {
522 .base
.funcs
= &assign_visitor_funcs
,
527 /* if we've already visited this instruction, bail now: */
528 if (ir3_instr_check_mark(assigner
)) {
529 debug_assert(assigner
->regs
[0]->num
== (num
& ~REG_HALF
));
530 if (assigner
->regs
[0]->num
!= (num
& ~REG_HALF
)) {
531 /* impossible situation, should have been resolved
532 * at an earlier stage by inserting extra mov's:
539 ir3_visit_instr(&v
.base
, assigner
);
546 static void ir3_instr_ra(struct ir3_ra_ctx
*ctx
,
547 struct ir3_instruction
*instr
)
549 struct ir3_ra_assignment a
;
552 /* skip over nop's */
553 if (instr
->regs_count
== 0)
556 /* skip writes to a0, p0, etc */
557 if (!reg_gpr(instr
->regs
[0]))
560 /* if we've already visited this instruction, bail now: */
561 if (instr
->flags
& IR3_INSTR_MARK
)
564 /* allocate register(s): */
566 num
= alloc_block(ctx
, instr
, a
.num
) + a
.off
;
568 ra_assign(ctx
, instr
, num
);
571 /* flatten into shader: */
572 // XXX this should probably be somewhere else:
573 static void legalize(struct ir3_ra_ctx
*ctx
, struct ir3_block
*block
)
575 struct ir3_instruction
*n
;
576 struct ir3_shader
*shader
= block
->shader
;
577 struct ir3_instruction
*end
=
578 ir3_instr_create(block
, 0, OPC_END
);
579 struct ir3_instruction
*last_input
= NULL
;
580 regmask_t needs_ss_war
; /* write after read */
584 regmask_init(&needs_ss_war
);
585 regmask_init(&needs_ss
);
586 regmask_init(&needs_sy
);
588 shader
->instrs_count
= 0;
590 for (n
= block
->head
; n
; n
= n
->next
) {
591 struct ir3_register
*reg
;
597 for (i
= 1; i
< n
->regs_count
; i
++) {
602 /* TODO: we probably only need (ss) for alu
603 * instr consuming sfu result.. need to make
604 * some tests for both this and (sy)..
606 if (regmask_get(&needs_ss
, reg
)) {
607 n
->flags
|= IR3_INSTR_SS
;
608 regmask_init(&needs_ss
);
611 if (regmask_get(&needs_sy
, reg
)) {
612 n
->flags
|= IR3_INSTR_SY
;
613 regmask_init(&needs_sy
);
618 if (n
->regs_count
> 0) {
620 if (regmask_get(&needs_ss_war
, reg
)) {
621 n
->flags
|= IR3_INSTR_SS
;
622 regmask_init(&needs_ss_war
); // ??? I assume?
626 /* cat5+ does not have an (ss) bit, if needed we need to
627 * insert a nop to carry the sync flag. Would be kinda
628 * clever if we were aware of this during scheduling, but
629 * this should be a pretty rare case:
631 if ((n
->flags
& IR3_INSTR_SS
) && (n
->category
>= 5)) {
632 struct ir3_instruction
*nop
;
633 nop
= ir3_instr_create(block
, 0, OPC_NOP
);
634 nop
->flags
|= IR3_INSTR_SS
;
635 n
->flags
&= ~IR3_INSTR_SS
;
638 /* need to be able to set (ss) on first instruction: */
639 if ((shader
->instrs_count
== 0) && (n
->category
>= 5))
640 ir3_instr_create(block
, 0, OPC_NOP
);
642 if (is_nop(n
) && shader
->instrs_count
) {
643 struct ir3_instruction
*last
=
644 shader
->instrs
[shader
->instrs_count
-1];
645 if (is_nop(last
) && (last
->repeat
< 5)) {
647 last
->flags
|= n
->flags
;
652 shader
->instrs
[shader
->instrs_count
++] = n
;
655 regmask_set(&needs_ss
, n
->regs
[0]);
658 regmask_set(&needs_sy
, n
->regs
[0]);
660 /* both tex/sfu appear to not always immediately consume
661 * their src register(s):
663 if (is_tex(n
) || is_sfu(n
)) {
664 for (i
= 1; i
< n
->regs_count
; i
++) {
667 regmask_set(&needs_ss_war
, reg
);
676 last_input
->regs
[0]->flags
|= IR3_REG_EI
;
678 shader
->instrs
[shader
->instrs_count
++] = end
;
680 shader
->instrs
[0]->flags
|= IR3_INSTR_SS
| IR3_INSTR_SY
;
683 static int block_ra(struct ir3_ra_ctx
*ctx
, struct ir3_block
*block
)
685 struct ir3_instruction
*n
;
687 if (!block
->parent
) {
689 int base
, off
= output_base(ctx
);
691 base
= alloc_block(ctx
, NULL
, block
->noutputs
+ off
);
693 if (ctx
->half_precision
)
696 for (i
= 0; i
< block
->noutputs
; i
++)
697 if (block
->outputs
[i
] && !is_kill(block
->outputs
[i
]))
698 ra_assign(ctx
, block
->outputs
[i
], base
+ i
+ off
);
700 if (ctx
->type
== SHADER_FRAGMENT
) {
702 if (ctx
->frag_face
) {
703 /* if we have frag_face, it gets hr0.x */
704 ra_assign(ctx
, block
->inputs
[i
], REG_HALF
| 0);
707 for (j
= 0; i
< block
->ninputs
; i
++, j
++)
708 if (block
->inputs
[i
])
709 ra_assign(ctx
, block
->inputs
[i
], (base
& ~REG_HALF
) + j
);
711 for (i
= 0; i
< block
->ninputs
; i
++)
712 if (block
->inputs
[i
])
713 ir3_instr_ra(ctx
, block
->inputs
[i
]);
717 /* then loop over instruction list and assign registers:
721 ir3_instr_ra(ctx
, n
);
727 legalize(ctx
, block
);
732 int ir3_block_ra(struct ir3_block
*block
, enum shader_t type
,
733 bool half_precision
, bool frag_coord
, bool frag_face
)
735 struct ir3_ra_ctx ctx
= {
738 .half_precision
= half_precision
,
739 .frag_coord
= frag_coord
,
740 .frag_face
= frag_face
,
742 ir3_shader_clear_mark(block
->shader
);
743 return block_ra(&ctx
, block
);