1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
4 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 * Rob Clark <robclark@freedesktop.org>
29 #include "pipe/p_shader_tokens.h"
30 #include "util/u_math.h"
33 #include "ir3_visitor.h"
36 * Register Assignment:
38 * NOTE: currently only works on a single basic block.. need to think
39 * about how multiple basic blocks are going to get scheduled. But
40 * I think I want to re-arrange how blocks work, ie. get rid of the
41 * block nesting thing..
43 * NOTE: we could do register coalescing (eliminate moves) as part of
44 * the RA step.. OTOH I think we need to do scheduling before register
45 * assignment. And if we remove a mov that effects scheduling (unless
46 * we leave a placeholder nop, which seems lame), so I'm not really
47 * sure how practical this is to do both in a single stage. But OTOH
48 * I'm not really sure a sane way for the CP stage to realize when it
49 * cannot remove a mov due to multi-register constraints..
54 struct ir3_block
*block
;
61 /* sorta ugly way to retrofit half-precision support.. rather than
62 * passing extra param around, just OR in a high bit. All the low
63 * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
64 * will continue to work as long as you don't underflow (and that
65 * would go badly anyways).
67 #define REG_HALF 0x8000
/* Result of ra_calc(): describes the contiguous register range that the
 * assigner's dst must be part of (for input/output/sam/bary.f range
 * constraints).  NOTE(review): closing brace was missing from the
 * listing; members match the visible text exactly.
 */
struct ir3_ra_assignment {
	int8_t  off;        /* offset of instruction dst within range */
	uint8_t num;        /* number of components for the range */
};
/* forward declarations (ra_assign/ra_calc are mutually used by the
 * visitor callbacks below):
 */
static void ra_assign(struct ir3_ra_ctx *ctx,
		struct ir3_instruction *assigner, int num);
static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr);
79 * Register Allocation:
/* Build a temporary ir3_register on the stack for regmask queries.
 * NOTE(review): the '.num = (n),' member and closing brace were missing
 * from the listing and are reconstructed — regmask_get()/regmask_set()
 * key on num+wrmask, so .num must be initialized; confirm upstream.
 */
#define REG(n, wm) (struct ir3_register){ \
		/*.flags = ((so)->half_precision) ? IR3_REG_HALF : 0,*/ \
		.num = (n), \
		.wrmask = TGSI_WRITEMASK_ ## wm, \
	}
88 /* check that the register exists, is a GPR and is not special (a0/p0) */
89 static struct ir3_register
* reg_check(struct ir3_instruction
*instr
, unsigned n
)
91 if ((n
< instr
->regs_count
) && reg_gpr(instr
->regs
[n
]))
92 return instr
->regs
[n
];
96 static int output_base(struct ir3_ra_ctx
*ctx
)
98 /* ugg, for fragment shader we need to have input at r0.x
99 * (or at least if there is a way to configure it, I can't
100 * see how because the blob driver always uses r0.x (ie.
103 if ((ctx
->type
== SHADER_FRAGMENT
) && !ctx
->half_precision
)
108 /* live means read before written */
109 static void compute_liveregs(struct ir3_ra_ctx
*ctx
,
110 struct ir3_instruction
*instr
, regmask_t
*liveregs
)
112 struct ir3_block
*block
= instr
->block
;
116 regmask_init(liveregs
);
117 regmask_init(&written
);
119 for (instr
= instr
->next
; instr
; instr
= instr
->next
) {
120 struct ir3_register
*r
;
125 /* check first src's read: */
126 for (j
= 1; j
< instr
->regs_count
; j
++) {
127 r
= reg_check(instr
, j
);
129 regmask_set_if_not(liveregs
, r
, &written
);
132 /* then dst written (if assigned already): */
133 if (instr
->flags
& IR3_INSTR_MARK
) {
134 r
= reg_check(instr
, 0);
136 regmask_set(&written
, r
);
140 /* be sure to account for output registers too: */
141 for (i
= 0; i
< block
->noutputs
; i
++) {
142 struct ir3_register reg
= REG(output_base(ctx
) + i
, X
);
143 regmask_set_if_not(liveregs
, ®
, &written
);
147 /* calculate registers that are clobbered before last use of 'assigner'.
148 * This needs to be done backwards, although it could possibly be
149 * combined into compute_liveregs(). (Ie. compute_liveregs() could
150 * reverse the list, then do this part backwards reversing the list
151 * again back to original order.) Otoh, probably I should try to
152 * construct a proper interference graph instead.
154 * XXX this need to follow the same recursion path that is used for
155 * to rename/assign registers (ie. ra_assign_src()).. this is a bit
156 * ugly right now, maybe refactor into node iterator sort of things
157 * that iterates nodes in the correct order?
159 static bool compute_clobbers(struct ir3_ra_ctx
*ctx
,
160 struct ir3_instruction
*instr
, struct ir3_instruction
*assigner
,
164 bool live
= false, was_live
= false;
167 struct ir3_block
*block
= ctx
->block
;
169 /* if at the end, check outputs: */
170 for (i
= 0; i
< block
->noutputs
; i
++)
171 if (block
->outputs
[i
] == assigner
)
176 for (i
= 1; i
< instr
->regs_count
; i
++) {
177 struct ir3_register
*reg
= instr
->regs
[i
];
178 if ((reg
->flags
& IR3_REG_SSA
) && (reg
->instr
== assigner
)) {
179 if (is_meta(instr
)) {
180 switch (instr
->opc
) {
187 was_live
|= compute_clobbers(ctx
, instr
->next
,
199 was_live
|= compute_clobbers(ctx
, instr
->next
, assigner
, liveregs
);
201 if (was_live
&& (instr
->regs_count
> 0) &&
202 (instr
->flags
& IR3_INSTR_MARK
) &&
204 regmask_set(liveregs
, instr
->regs
[0]);
206 return live
|| was_live
;
209 static int find_available(regmask_t
*liveregs
, int size
)
212 for (i
= 0; i
< MAX_REG
- size
; i
++) {
213 if (!regmask_get(liveregs
, ®(i
, X
))) {
214 unsigned start
= i
++;
215 for (; (i
< MAX_REG
) && ((i
- start
) < size
); i
++)
216 if (regmask_get(liveregs
, ®(i
, X
)))
218 if ((i
- start
) >= size
)
226 static int alloc_block(struct ir3_ra_ctx
*ctx
,
227 struct ir3_instruction
*instr
, int size
)
230 /* special case, allocating shader outputs. At this
231 * point, nothing is allocated, just start the shader
232 * outputs at r0.x and let compute_liveregs() take
233 * care of the rest from here:
238 compute_liveregs(ctx
, instr
, &liveregs
);
240 // XXX XXX XXX XXX XXX XXX XXX XXX XXX
241 // XXX hack.. maybe ra_calc should give us a list of
242 // instrs to compute_clobbers() on?
243 if (is_meta(instr
) && (instr
->opc
== OPC_META_INPUT
) &&
244 (instr
->regs_count
== 1)) {
245 unsigned i
, base
= instr
->regs
[0]->num
& ~0x3;
246 for (i
= 0; i
< 4; i
++) {
247 struct ir3_instruction
*in
= ctx
->block
->inputs
[base
+ i
];
249 compute_clobbers(ctx
, in
->next
, in
, &liveregs
);
252 // XXX XXX XXX XXX XXX XXX XXX XXX XXX
253 compute_clobbers(ctx
, instr
->next
, instr
, &liveregs
);
254 return find_available(&liveregs
, size
);
259 * Constraint Calculation:
262 struct ra_calc_visitor
{
263 struct ir3_visitor base
;
264 struct ir3_ra_assignment a
;
/* downcast from the embedded base visitor (base is the first member): */
static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v)
{
	return (struct ra_calc_visitor *)v;
}
272 /* calculate register assignment for the instruction. If the register
273 * written by this instruction is required to be part of a range, to
274 * handle other (input/output/sam/bary.f/etc) contiguous register range
275 * constraints, that is calculated handled here.
277 static void ra_calc_dst(struct ir3_visitor
*v
,
278 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
/* fetch the accumulating calc-visitor (only statement surviving in this listing): */
280 struct ra_calc_visitor
*c
= ra_calc_visitor(v
);
/* NOTE(review): the remainder of ra_calc_dst's body (original lines
 * 281-289) is missing from this corrupted listing — it computes the
 * off/num constraint written into c->a for plain/fanout/fanin dsts.
 * Recover the missing lines from upstream ir3_ra.c before rebuilding;
 * do not guess them.
 */
291 ra_calc_dst_shader_input(struct ir3_visitor
*v
,
292 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
294 struct ra_calc_visitor
*c
= ra_calc_visitor(v
);
295 struct ir3_block
*block
= instr
->block
;
296 struct ir3_register
*dst
= instr
->regs
[0];
297 unsigned base
= dst
->num
& ~0x3;
300 assert(!(dst
->flags
& IR3_REG_IA
));
302 /* check what input components we need: */
303 for (i
= 0; i
< 4; i
++) {
304 unsigned idx
= base
+ i
;
305 if ((idx
< block
->ninputs
) && block
->inputs
[idx
])
309 c
->a
.off
= dst
->num
- base
;
313 static void ra_calc_src_fanin(struct ir3_visitor
*v
,
314 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
316 struct ra_calc_visitor
*c
= ra_calc_visitor(v
);
317 unsigned srcn
= ir3_instr_regno(instr
, reg
) - 1;
320 c
->a
.num
= MAX2(c
->a
.num
, instr
->regs_count
- 1);
323 static const struct ir3_visitor_funcs calc_visitor_funcs
= {
324 .instr
= ir3_visit_instr
,
325 .dst_shader_input
= ra_calc_dst_shader_input
,
326 .dst_fanout
= ra_calc_dst
,
327 .dst_fanin
= ra_calc_dst
,
329 .src_fanout
= ir3_visit_reg
,
330 .src_fanin
= ra_calc_src_fanin
,
331 .src
= ir3_visit_reg
,
334 static struct ir3_ra_assignment
ra_calc(struct ir3_instruction
*assigner
)
336 struct ra_calc_visitor v
= {
337 .base
.funcs
= &calc_visitor_funcs
,
340 ir3_visit_instr(&v
.base
, assigner
);
346 * Register Assignment:
349 struct ra_assign_visitor
{
350 struct ir3_visitor base
;
351 struct ir3_ra_ctx
*ctx
;
/* downcast from the embedded base visitor (base is the first member): */
static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
{
	return (struct ra_assign_visitor *)v;
}
360 static type_t
half_type(type_t type
)
363 case TYPE_F32
: return TYPE_F16
;
364 case TYPE_U32
: return TYPE_U16
;
365 case TYPE_S32
: return TYPE_S16
;
366 /* instructions may already be fixed up: */
377 /* some instructions need fix-up if dst register is half precision: */
378 static void fixup_half_instr_dst(struct ir3_instruction
*instr
)
380 switch (instr
->category
) {
381 case 1: /* move instructions */
382 instr
->cat1
.dst_type
= half_type(instr
->cat1
.dst_type
);
385 switch (instr
->opc
) {
387 instr
->opc
= OPC_MAD_F16
;
390 instr
->opc
= OPC_SEL_B16
;
393 instr
->opc
= OPC_SEL_S16
;
396 instr
->opc
= OPC_SEL_F16
;
399 instr
->opc
= OPC_SAD_S16
;
401 /* instructions may already be fixed up: */
414 instr
->cat5
.type
= half_type(instr
->cat5
.type
);
418 /* some instructions need fix-up if src register is half precision: */
419 static void fixup_half_instr_src(struct ir3_instruction
*instr
)
421 switch (instr
->category
) {
422 case 1: /* move instructions */
423 instr
->cat1
.src_type
= half_type(instr
->cat1
.src_type
);
428 static void ra_assign_reg(struct ir3_visitor
*v
,
429 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
431 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
432 reg
->flags
&= ~IR3_REG_SSA
;
433 reg
->num
= a
->num
& ~REG_HALF
;
434 if (a
->num
& REG_HALF
) {
435 reg
->flags
|= IR3_REG_HALF
;
436 /* if dst reg being assigned, patch up the instr: */
437 if (reg
== instr
->regs
[0])
438 fixup_half_instr_dst(instr
);
440 fixup_half_instr_src(instr
);
444 static void ra_assign_dst_shader_input(struct ir3_visitor
*v
,
445 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
447 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
448 unsigned i
, base
= reg
->num
& ~0x3;
449 int off
= base
- reg
->num
;
451 ra_assign_reg(v
, instr
, reg
);
452 reg
->flags
|= IR3_REG_IA
;
454 /* trigger assignment of all our companion input components: */
455 for (i
= 0; i
< 4; i
++) {
456 struct ir3_instruction
*in
= instr
->block
->inputs
[i
+base
];
457 if (in
&& is_meta(in
) && (in
->opc
== OPC_META_INPUT
))
458 ra_assign(a
->ctx
, in
, a
->num
+ off
+ i
);
462 static void ra_assign_dst_fanout(struct ir3_visitor
*v
,
463 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
465 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
466 struct ir3_register
*src
= instr
->regs
[1];
467 ra_assign_reg(v
, instr
, reg
);
468 if (src
->flags
& IR3_REG_SSA
)
469 ra_assign(a
->ctx
, src
->instr
, a
->num
- instr
->fo
.off
);
472 static void ra_assign_src_fanout(struct ir3_visitor
*v
,
473 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
475 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
476 ra_assign_reg(v
, instr
, reg
);
477 ra_assign(a
->ctx
, instr
, a
->num
+ instr
->fo
.off
);
481 static void ra_assign_src_fanin(struct ir3_visitor
*v
,
482 struct ir3_instruction
*instr
, struct ir3_register
*reg
)
484 struct ra_assign_visitor
*a
= ra_assign_visitor(v
);
485 unsigned j
, srcn
= ir3_instr_regno(instr
, reg
) - 1;
486 ra_assign_reg(v
, instr
, reg
);
487 ra_assign(a
->ctx
, instr
, a
->num
- srcn
);
488 for (j
= 1; j
< instr
->regs_count
; j
++) {
489 struct ir3_register
*reg
= instr
->regs
[j
];
490 if (reg
->flags
& IR3_REG_SSA
) /* could be renamed already */
491 ra_assign(a
->ctx
, reg
->instr
, a
->num
- srcn
+ j
- 1);
495 static const struct ir3_visitor_funcs assign_visitor_funcs
= {
496 .instr
= ir3_visit_instr
,
497 .dst_shader_input
= ra_assign_dst_shader_input
,
498 .dst_fanout
= ra_assign_dst_fanout
,
499 .dst_fanin
= ra_assign_reg
,
500 .dst
= ra_assign_reg
,
501 .src_fanout
= ra_assign_src_fanout
,
502 .src_fanin
= ra_assign_src_fanin
,
503 .src
= ra_assign_reg
,
506 static void ra_assign(struct ir3_ra_ctx
*ctx
,
507 struct ir3_instruction
*assigner
, int num
)
509 struct ra_assign_visitor v
= {
510 .base
.funcs
= &assign_visitor_funcs
,
515 /* if we've already visited this instruction, bail now: */
516 if (ir3_instr_check_mark(assigner
)) {
517 debug_assert(assigner
->regs
[0]->num
== (num
& ~REG_HALF
));
518 if (assigner
->regs
[0]->num
!= (num
& ~REG_HALF
)) {
519 /* impossible situation, should have been resolved
520 * at an earlier stage by inserting extra mov's:
527 ir3_visit_instr(&v
.base
, assigner
);
534 static void ir3_instr_ra(struct ir3_ra_ctx
*ctx
,
535 struct ir3_instruction
*instr
)
537 struct ir3_ra_assignment a
;
540 /* skip over nop's */
541 if (instr
->regs_count
== 0)
544 /* skip writes to a0, p0, etc */
545 if (!reg_gpr(instr
->regs
[0]))
548 /* if we've already visited this instruction, bail now: */
549 if (instr
->flags
& IR3_INSTR_MARK
)
552 /* allocate register(s): */
554 num
= alloc_block(ctx
, instr
, a
.num
) + a
.off
;
556 ra_assign(ctx
, instr
, num
);
559 /* flatten into shader: */
560 // XXX this should probably be somewhere else:
561 static void legalize(struct ir3_ra_ctx
*ctx
, struct ir3_block
*block
)
563 struct ir3_instruction
*n
;
564 struct ir3_shader
*shader
= block
->shader
;
565 struct ir3_instruction
*end
=
566 ir3_instr_create(block
, 0, OPC_END
);
567 struct ir3_instruction
*last_input
= NULL
;
568 regmask_t needs_ss_war
;
572 regmask_init(&needs_ss_war
);
573 regmask_init(&needs_ss
);
574 regmask_init(&needs_sy
);
576 shader
->instrs_count
= 0;
578 for (n
= block
->head
; n
; n
= n
->next
) {
579 struct ir3_register
*reg
;
585 for (i
= 1; i
< n
->regs_count
; i
++) {
590 /* TODO: we probably only need (ss) for alu
591 * instr consuming sfu result.. need to make
592 * some tests for both this and (sy)..
594 if (regmask_get(&needs_ss
, reg
)) {
595 n
->flags
|= IR3_INSTR_SS
;
596 regmask_init(&needs_ss
);
599 if (regmask_get(&needs_sy
, reg
)) {
600 n
->flags
|= IR3_INSTR_SY
;
601 regmask_init(&needs_sy
);
606 if (n
->regs_count
> 0) {
608 if (regmask_get(&needs_ss_war
, reg
)) {
609 n
->flags
|= IR3_INSTR_SS
;
610 regmask_init(&needs_ss_war
); // ??? I assume?
614 /* cat5+ does not have an (ss) bit, if needed we need to
615 * insert a nop to carry the sync flag. Would be kinda
616 * clever if we were aware of this during scheduling, but
617 * this should be a pretty rare case:
619 if ((n
->flags
& IR3_INSTR_SS
) && (n
->category
>= 5)) {
620 struct ir3_instruction
*nop
;
621 nop
= ir3_instr_create(block
, 0, OPC_NOP
);
622 nop
->flags
|= IR3_INSTR_SS
;
623 n
->flags
&= ~IR3_INSTR_SS
;
626 /* need to be able to set (ss) on first instruction: */
627 if ((shader
->instrs_count
== 0) && (n
->category
>= 5))
628 ir3_instr_create(block
, 0, OPC_NOP
);
630 if (is_nop(n
) && shader
->instrs_count
) {
631 struct ir3_instruction
*last
=
632 shader
->instrs
[shader
->instrs_count
-1];
633 if (is_nop(last
) && (last
->repeat
< 5)) {
635 last
->flags
|= n
->flags
;
640 shader
->instrs
[shader
->instrs_count
++] = n
;
643 regmask_set(&needs_ss
, n
->regs
[0]);
646 regmask_set(&needs_sy
, n
->regs
[0]);
648 /* both tex/sfu appear to not always immediately consume
649 * their src register(s):
651 if (is_tex(n
) || is_sfu(n
)) {
652 for (i
= 1; i
< n
->regs_count
; i
++) {
655 regmask_set(&needs_ss_war
, reg
);
664 last_input
->regs
[0]->flags
|= IR3_REG_EI
;
666 shader
->instrs
[shader
->instrs_count
++] = end
;
668 shader
->instrs
[0]->flags
|= IR3_INSTR_SS
| IR3_INSTR_SY
;
671 static int block_ra(struct ir3_ra_ctx
*ctx
, struct ir3_block
*block
)
673 struct ir3_instruction
*n
;
675 if (!block
->parent
) {
677 int base
, off
= output_base(ctx
);
679 base
= alloc_block(ctx
, NULL
, block
->noutputs
+ off
);
681 if (ctx
->half_precision
)
684 for (i
= 0; i
< block
->noutputs
; i
++)
685 if (block
->outputs
[i
])
686 ra_assign(ctx
, block
->outputs
[i
], base
+ i
+ off
);
688 if (ctx
->type
== SHADER_FRAGMENT
) {
689 for (i
= 0; i
< block
->ninputs
; i
++)
690 if (block
->inputs
[i
])
691 ra_assign(ctx
, block
->inputs
[i
], (base
& ~REG_HALF
) + i
);
693 for (i
= 0; i
< block
->ninputs
; i
++)
694 if (block
->inputs
[i
])
695 ir3_instr_ra(ctx
, block
->inputs
[i
]);
699 /* then loop over instruction list and assign registers:
703 ir3_instr_ra(ctx
, n
);
709 legalize(ctx
, block
);
714 int ir3_block_ra(struct ir3_block
*block
, enum shader_t type
,
717 struct ir3_ra_ctx ctx
= {
720 .half_precision
= half_precision
,
722 ir3_shader_clear_mark(block
->shader
);
723 return block_ra(&ctx
, block
);