2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * Rob Clark <robclark@freedesktop.org>
27 #include "util/u_math.h"
28 #include "util/register_allocate.h"
29 #include "util/ralloc.h"
30 #include "util/bitset.h"
33 #include "ir3_compiler.h"
37 build_q_values(unsigned int **q_values
, unsigned off
,
38 const unsigned *sizes
, unsigned count
)
40 for (unsigned i
= 0; i
< count
; i
++) {
41 q_values
[i
+ off
] = rzalloc_array(q_values
, unsigned, total_class_count
);
43 /* From register_allocate.c:
45 * q(B,C) (indexed by C, B is this register class) in
46 * Runeson/Nyström paper. This is "how many registers of B could
47 * the worst choice register from C conflict with".
49 * If we just let the register allocation algorithm compute these
50 * values, is extremely expensive. However, since all of our
51 * registers are laid out, we can very easily compute them
52 * ourselves. View the register from C as fixed starting at GRF n
53 * somewhere in the middle, and the register from B as sliding back
54 * and forth. Then the first register to conflict from B is the
55 * one starting at n - class_size[B] + 1 and the last register to
56 * conflict will start at n + class_size[B] - 1. Therefore, the
57 * number of conflicts from B is class_size[B] + class_size[C] - 1.
59 * +-+-+-+-+-+-+ +-+-+-+-+-+-+
60 * B | | | | | |n| --> | | | | | | |
61 * +-+-+-+-+-+-+ +-+-+-+-+-+-+
66 * (Idea copied from brw_fs_reg_allocate.cpp)
68 for (unsigned j
= 0; j
< count
; j
++)
69 q_values
[i
+ off
][j
+ off
] = sizes
[i
] + sizes
[j
] - 1;
73 /* One-time setup of RA register-set, which describes all the possible
74 * "virtual" registers and their interferences. Ie. double register
75 * occupies (and conflicts with) two single registers, and so forth.
76 * Since registers do not need to be aligned to their class size, they
77 * can conflict with other registers in the same class too. Ie:
79 * Single (base) | Double
80 * --------------+---------------
87 * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
88 * really just four scalar registers. Don't let that confuse you.)
90 struct ir3_ra_reg_set
*
91 ir3_ra_alloc_reg_set(struct ir3_compiler
*compiler
)
93 struct ir3_ra_reg_set
*set
= rzalloc(compiler
, struct ir3_ra_reg_set
);
94 unsigned ra_reg_count
, reg
, first_half_reg
, first_high_reg
, base
;
95 unsigned int **q_values
;
97 /* calculate # of regs across all classes: */
99 for (unsigned i
= 0; i
< class_count
; i
++)
100 ra_reg_count
+= CLASS_REGS(i
);
101 for (unsigned i
= 0; i
< half_class_count
; i
++)
102 ra_reg_count
+= HALF_CLASS_REGS(i
);
103 for (unsigned i
= 0; i
< high_class_count
; i
++)
104 ra_reg_count
+= HIGH_CLASS_REGS(i
);
106 /* allocate and populate q_values: */
107 q_values
= ralloc_array(set
, unsigned *, total_class_count
);
109 build_q_values(q_values
, 0, class_sizes
, class_count
);
110 build_q_values(q_values
, HALF_OFFSET
, half_class_sizes
, half_class_count
);
111 build_q_values(q_values
, HIGH_OFFSET
, high_class_sizes
, high_class_count
);
113 /* allocate the reg-set.. */
114 set
->regs
= ra_alloc_reg_set(set
, ra_reg_count
, true);
115 set
->ra_reg_to_gpr
= ralloc_array(set
, uint16_t, ra_reg_count
);
116 set
->gpr_to_ra_reg
= ralloc_array(set
, uint16_t *, total_class_count
);
120 for (unsigned i
= 0; i
< class_count
; i
++) {
121 set
->classes
[i
] = ra_alloc_reg_class(set
->regs
);
123 set
->gpr_to_ra_reg
[i
] = ralloc_array(set
, uint16_t, CLASS_REGS(i
));
125 for (unsigned j
= 0; j
< CLASS_REGS(i
); j
++) {
126 ra_class_add_reg(set
->regs
, set
->classes
[i
], reg
);
128 set
->ra_reg_to_gpr
[reg
] = j
;
129 set
->gpr_to_ra_reg
[i
][j
] = reg
;
131 for (unsigned br
= j
; br
< j
+ class_sizes
[i
]; br
++)
132 ra_add_transitive_reg_conflict(set
->regs
, br
, reg
);
138 first_half_reg
= reg
;
141 for (unsigned i
= 0; i
< half_class_count
; i
++) {
142 set
->half_classes
[i
] = ra_alloc_reg_class(set
->regs
);
144 set
->gpr_to_ra_reg
[base
+ i
] =
145 ralloc_array(set
, uint16_t, HALF_CLASS_REGS(i
));
147 for (unsigned j
= 0; j
< HALF_CLASS_REGS(i
); j
++) {
148 ra_class_add_reg(set
->regs
, set
->half_classes
[i
], reg
);
150 set
->ra_reg_to_gpr
[reg
] = j
;
151 set
->gpr_to_ra_reg
[base
+ i
][j
] = reg
;
153 for (unsigned br
= j
; br
< j
+ half_class_sizes
[i
]; br
++)
154 ra_add_transitive_reg_conflict(set
->regs
, br
+ first_half_reg
, reg
);
160 first_high_reg
= reg
;
163 for (unsigned i
= 0; i
< high_class_count
; i
++) {
164 set
->high_classes
[i
] = ra_alloc_reg_class(set
->regs
);
166 set
->gpr_to_ra_reg
[base
+ i
] =
167 ralloc_array(set
, uint16_t, HIGH_CLASS_REGS(i
));
169 for (unsigned j
= 0; j
< HIGH_CLASS_REGS(i
); j
++) {
170 ra_class_add_reg(set
->regs
, set
->high_classes
[i
], reg
);
172 set
->ra_reg_to_gpr
[reg
] = j
;
173 set
->gpr_to_ra_reg
[base
+ i
][j
] = reg
;
175 for (unsigned br
= j
; br
< j
+ high_class_sizes
[i
]; br
++)
176 ra_add_transitive_reg_conflict(set
->regs
, br
+ first_high_reg
, reg
);
182 /* starting a6xx, half precision regs conflict w/ full precision regs: */
183 if (compiler
->gpu_id
>= 600) {
184 /* because of transitivity, we can get away with just setting up
185 * conflicts between the first class of full and half regs:
187 for (unsigned i
= 0; i
< half_class_count
; i
++) {
188 /* NOTE there are fewer half class sizes, but they match the
189 * first N full class sizes.. but assert in case that ever
190 * accidentally changes:
192 debug_assert(class_sizes
[i
] == half_class_sizes
[i
]);
193 for (unsigned j
= 0; j
< CLASS_REGS(i
) / 2; j
++) {
194 unsigned freg
= set
->gpr_to_ra_reg
[i
][j
];
195 unsigned hreg0
= set
->gpr_to_ra_reg
[i
+ HALF_OFFSET
][(j
* 2) + 0];
196 unsigned hreg1
= set
->gpr_to_ra_reg
[i
+ HALF_OFFSET
][(j
* 2) + 1];
198 ra_add_transitive_reg_pair_conflict(set
->regs
, freg
, hreg0
, hreg1
);
202 // TODO also need to update q_values, but for now:
203 ra_set_finalize(set
->regs
, NULL
);
205 ra_set_finalize(set
->regs
, q_values
);
208 ralloc_free(q_values
);
214 ra_size_to_class(unsigned sz
, bool half
, bool high
)
217 for (unsigned i
= 0; i
< high_class_count
; i
++)
218 if (high_class_sizes
[i
] >= sz
)
219 return i
+ HIGH_OFFSET
;
221 for (unsigned i
= 0; i
< half_class_count
; i
++)
222 if (half_class_sizes
[i
] >= sz
)
223 return i
+ HALF_OFFSET
;
225 for (unsigned i
= 0; i
< class_count
; i
++)
226 if (class_sizes
[i
] >= sz
)