lima/gpir: Use registers for values live in multiple blocks
[mesa.git] / src / gallium / drivers / lima / ir / gp / regalloc.c
1 /*
2 * Copyright (c) 2017 Lima Project
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sub license,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the
12 * next paragraph) shall be included in all copies or substantial portions
13 * of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 */
24
25 #include "gpir.h"
26 #include "util/u_dynarray.h"
27
28 /* Per-register information */
29
/* Per-entry interference information for the graph-coloring allocator.
 * Entries with index < comp->cur_reg are physical registers; the remaining
 * entries are SSA nodes (value registers).
 */
struct reg_info {
   /* Interference bitset: one bit per register/node this entry conflicts
    * with. */
   BITSET_WORD *conflicts;
   /* The same interference set as a flat list, for fast iteration. */
   struct util_dynarray conflict_list;

   /* Number of conflicts that must be allocated to physical registers.
    */
   unsigned phys_conflicts;

   /* Number of conflicts with SSA nodes (value registers). */
   unsigned node_conflicts;

   /* Number of conflicts that can be allocated to either. */
   unsigned total_conflicts;

   /* Color chosen during select phase; -1 is used as the "unassigned"
    * sentinel by the coloring loop. */
   int assigned_color;

   /* Whether this entry has already been placed on the worklist/stack. */
   bool visited;
};
47
/* State shared by all passes of the register allocator. */
struct regalloc_ctx {
   /* bitset_words is sized for num_nodes_and_regs = cur_reg + cur_index. */
   unsigned bitset_words, num_nodes_and_regs;
   /* One reg_info per physical register, followed by one per node. */
   struct reg_info *registers;

   /* Reusable scratch liveness array */
   BITSET_WORD *live;

   /* FIFO worklist of trivially-colorable entries (see do_regalloc). */
   unsigned *worklist;
   unsigned worklist_start, worklist_end;

   /* Simplification stack; popped in reverse order during color selection. */
   unsigned *stack;
   unsigned stack_size;

   gpir_compiler *comp;
   /* ralloc context owning all allocator-internal allocations. */
   void *mem_ctx;
};
64
65 /* Liveness analysis */
66
67 static void propagate_liveness_instr(gpir_node *node, BITSET_WORD *live,
68 gpir_compiler *comp)
69 {
70 /* KILL */
71 if (node->type == gpir_node_type_store) {
72 if (node->op == gpir_op_store_reg) {
73 gpir_store_node *store = gpir_node_to_store(node);
74 BITSET_CLEAR(live, store->reg->index);
75 }
76 }
77
78 /* GEN */
79 if (node->type == gpir_node_type_load) {
80 if (node->op == gpir_op_load_reg) {
81 gpir_load_node *load = gpir_node_to_load(node);
82 BITSET_SET(live, load->reg->index);
83 }
84 }
85 }
86
87 static bool propagate_liveness_block(gpir_block *block, struct regalloc_ctx *ctx)
88 {
89 for (unsigned i = 0; i < 2; i++) {
90 if (block->successors[i]) {
91 for (unsigned j = 0; j < ctx->bitset_words; j++)
92 block->live_out[j] |= block->successors[i]->live_in[j];
93 }
94 }
95
96 memcpy(ctx->live, block->live_out, ctx->bitset_words * sizeof(BITSET_WORD));
97
98 list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
99 propagate_liveness_instr(node, ctx->live, block->comp);
100 }
101
102 bool changed = false;
103 for (unsigned i = 0; i < ctx->bitset_words; i++) {
104 changed |= (block->live_in[i] != ctx->live[i]);
105 block->live_in[i] = ctx->live[i];
106 }
107 return changed;
108 }
109
110 static void calc_def_block(gpir_block *block)
111 {
112 list_for_each_entry(gpir_node, node, &block->node_list, list) {
113 if (node->op == gpir_op_store_reg) {
114 gpir_store_node *store = gpir_node_to_store(node);
115 BITSET_SET(block->def_out, store->reg->index);
116 }
117 }
118 }
119
120 static void calc_liveness(struct regalloc_ctx *ctx)
121 {
122 bool changed = true;
123 while (changed) {
124 changed = false;
125 list_for_each_entry_rev(gpir_block, block, &ctx->comp->block_list, list) {
126 changed |= propagate_liveness_block(block, ctx);
127 }
128 }
129
130 list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
131 calc_def_block(block);
132 }
133
134 changed = true;
135 while (changed) {
136 changed = false;
137 list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
138 for (unsigned i = 0; i < 2; i++) {
139 gpir_block *succ = block->successors[i];
140 if (!succ)
141 continue;
142
143 for (unsigned j = 0; j < ctx->bitset_words; j++) {
144 BITSET_WORD new = block->def_out[j] & ~succ->def_out[j];
145 changed |= (new != 0);
146 succ->def_out[j] |= block->def_out[j];
147 }
148 }
149 }
150 }
151 }
152
153 /* Interference calculation */
154
155 static void add_interference(struct regalloc_ctx *ctx, unsigned i, unsigned j)
156 {
157 if (i == j)
158 return;
159
160 struct reg_info *a = &ctx->registers[i];
161 struct reg_info *b = &ctx->registers[j];
162
163 if (BITSET_TEST(a->conflicts, j))
164 return;
165
166 BITSET_SET(a->conflicts, j);
167 BITSET_SET(b->conflicts, i);
168
169 a->total_conflicts++;
170 b->total_conflicts++;
171 if (j < ctx->comp->cur_reg)
172 a->phys_conflicts++;
173 else
174 a->node_conflicts++;
175
176 if (i < ctx->comp->cur_reg)
177 b->phys_conflicts++;
178 else
179 b->node_conflicts++;
180
181 util_dynarray_append(&a->conflict_list, unsigned, j);
182 util_dynarray_append(&b->conflict_list, unsigned, i);
183 }
184
185 /* Make the register or node "i" intefere with all the other live registers
186 * and nodes.
187 */
188 static void add_all_interferences(struct regalloc_ctx *ctx,
189 unsigned i,
190 BITSET_WORD *live_nodes,
191 BITSET_WORD *live_regs)
192 {
193 int live_node;
194 BITSET_WORD tmp;
195 BITSET_FOREACH_SET(live_node, tmp, live_nodes, ctx->comp->cur_index) {
196 add_interference(ctx, i,
197 live_node + ctx->comp->cur_reg);
198 }
199
200 int live_reg;
201 BITSET_FOREACH_SET(live_reg, tmp, ctx->live, ctx->comp->cur_index) {
202 add_interference(ctx, i, live_reg);
203 }
204
205 }
206
207 static void print_liveness(struct regalloc_ctx *ctx,
208 BITSET_WORD *live_reg, BITSET_WORD *live_val)
209 {
210 if (!(lima_debug & LIMA_DEBUG_GP))
211 return;
212
213 int live_idx;
214 BITSET_WORD tmp;
215 BITSET_FOREACH_SET(live_idx, tmp, live_reg, ctx->comp->cur_reg) {
216 printf("reg%d ", live_idx);
217 }
218 BITSET_FOREACH_SET(live_idx, tmp, live_val, ctx->comp->cur_index) {
219 printf("%d ", live_idx);
220 }
221 printf("\n");
222 }
223
224 static void calc_interference(struct regalloc_ctx *ctx)
225 {
226 BITSET_WORD *live_nodes =
227 rzalloc_array(ctx->mem_ctx, BITSET_WORD, ctx->comp->cur_index);
228
229 list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
230 /* Initialize liveness at the end of the block, but exclude values that
231 * definitely aren't defined by the end. This helps out with
232 * partially-defined registers, like:
233 *
234 * if (condition) {
235 * foo = ...;
236 * }
237 * if (condition) {
238 * ... = foo;
239 * }
240 *
241 * If we naively propagated liveness backwards, foo would be live from
242 * the beginning of the program, but if we're not inside a loop then
243 * its value is undefined before the first if and we don't have to
244 * consider it live. Mask out registers like foo here.
245 */
246 for (unsigned i = 0; i < ctx->bitset_words; i++) {
247 ctx->live[i] = block->live_out[i] & block->def_out[i];
248 }
249
250 list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
251 gpir_debug("processing node %d\n", node->index);
252 print_liveness(ctx, ctx->live, live_nodes);
253 if (node->type != gpir_node_type_store &&
254 node->type != gpir_node_type_branch) {
255 add_all_interferences(ctx, node->index + ctx->comp->cur_reg,
256 live_nodes, ctx->live);
257
258 /* KILL */
259 BITSET_CLEAR(live_nodes, node->index);
260 } else if (node->op == gpir_op_store_reg) {
261 gpir_store_node *store = gpir_node_to_store(node);
262 add_all_interferences(ctx, store->reg->index,
263 live_nodes, ctx->live);
264
265 /* KILL */
266 BITSET_CLEAR(ctx->live, store->reg->index);
267 }
268
269 /* GEN */
270 if (node->type == gpir_node_type_store) {
271 gpir_store_node *store = gpir_node_to_store(node);
272 BITSET_SET(live_nodes, store->child->index);
273 } else if (node->type == gpir_node_type_alu) {
274 gpir_alu_node *alu = gpir_node_to_alu(node);
275 for (int i = 0; i < alu->num_child; i++)
276 BITSET_SET(live_nodes, alu->children[i]->index);
277 } else if (node->type == gpir_node_type_branch) {
278 gpir_branch_node *branch = gpir_node_to_branch(node);
279 BITSET_SET(live_nodes, branch->cond->index);
280 } else if (node->op == gpir_op_load_reg) {
281 gpir_load_node *load = gpir_node_to_load(node);
282 BITSET_SET(ctx->live, load->reg->index);
283 }
284 }
285 }
286 }
287
288 /* Register allocation */
289
290 static bool can_simplify(struct regalloc_ctx *ctx, unsigned i)
291 {
292 struct reg_info *info = &ctx->registers[i];
293 if (i < ctx->comp->cur_reg) {
294 /* Physical regs. */
295 return info->phys_conflicts + info->node_conflicts < GPIR_PHYSICAL_REG_NUM;
296 } else {
297 /* Nodes: if we manage to allocate all of its conflicting physical
298 * registers, they will take up at most GPIR_PHYSICAL_REG_NUM colors, so
299 * we can ignore any more than that.
300 */
301 return MIN2(info->phys_conflicts, GPIR_PHYSICAL_REG_NUM) +
302 info->node_conflicts < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM;
303 }
304 }
305
306 static void push_stack(struct regalloc_ctx *ctx, unsigned i)
307 {
308 ctx->stack[ctx->stack_size++] = i;
309 if (i < ctx->comp->cur_reg)
310 gpir_debug("pushing reg%u\n", i);
311 else
312 gpir_debug("pushing %d\n", i - ctx->comp->cur_reg);
313
314 struct reg_info *info = &ctx->registers[i];
315 assert(info->visited);
316
317 util_dynarray_foreach(&info->conflict_list, unsigned, conflict) {
318 struct reg_info *conflict_info = &ctx->registers[*conflict];
319 if (i < ctx->comp->cur_reg) {
320 assert(conflict_info->phys_conflicts > 0);
321 conflict_info->phys_conflicts--;
322 } else {
323 assert(conflict_info->node_conflicts > 0);
324 conflict_info->node_conflicts--;
325 }
326 if (!ctx->registers[*conflict].visited && can_simplify(ctx, *conflict)) {
327 ctx->worklist[ctx->worklist_end++] = *conflict;
328 ctx->registers[*conflict].visited = true;
329 }
330 }
331 }
332
333 static bool do_regalloc(struct regalloc_ctx *ctx)
334 {
335 ctx->worklist_start = 0;
336 ctx->worklist_end = 0;
337 ctx->stack_size = 0;
338
339 /* Step 1: find the initially simplifiable registers */
340 for (int i = 0; i < ctx->comp->cur_reg + ctx->comp->cur_index; i++) {
341 if (can_simplify(ctx, i)) {
342 ctx->worklist[ctx->worklist_end++] = i;
343 ctx->registers[i].visited = true;
344 }
345 }
346
347 while (true) {
348 /* Step 2: push onto the stack whatever we can */
349 while (ctx->worklist_start != ctx->worklist_end) {
350 push_stack(ctx, ctx->worklist[ctx->worklist_start++]);
351 }
352
353 if (ctx->stack_size < ctx->num_nodes_and_regs) {
354 /* If there are still unsimplifiable nodes left, we need to
355 * optimistically push a node onto the stack. Choose the one with
356 * the smallest number of current neighbors, since that's the most
357 * likely to succeed.
358 */
359 unsigned min_conflicts = UINT_MAX;
360 unsigned best_reg = 0;
361 for (unsigned reg = 0; reg < ctx->num_nodes_and_regs; reg++) {
362 struct reg_info *info = &ctx->registers[reg];
363 if (info->visited)
364 continue;
365 if (info->phys_conflicts + info->node_conflicts < min_conflicts) {
366 best_reg = reg;
367 min_conflicts = info->phys_conflicts + info->node_conflicts;
368 }
369 }
370 gpir_debug("optimistic triggered\n");
371 ctx->registers[best_reg].visited = true;
372 push_stack(ctx, best_reg);
373 } else {
374 break;
375 }
376 }
377
378 /* Step 4: pop off the stack and assign colors */
379 for (int i = ctx->num_nodes_and_regs - 1; i >= 0; i--) {
380 unsigned idx = ctx->stack[i];
381 struct reg_info *reg = &ctx->registers[idx];
382
383 unsigned num_available_regs;
384 if (idx < ctx->comp->cur_reg) {
385 num_available_regs = GPIR_PHYSICAL_REG_NUM;
386 } else {
387 num_available_regs = GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM;
388 }
389
390 bool found = false;
391 unsigned start = i % num_available_regs;
392 for (unsigned j = 0; j < num_available_regs; j++) {
393 unsigned candidate = (j + start) % num_available_regs;
394 bool available = true;
395 util_dynarray_foreach(&reg->conflict_list, unsigned, conflict_idx) {
396 struct reg_info *conflict = &ctx->registers[*conflict_idx];
397 if (conflict->assigned_color >= 0 &&
398 conflict->assigned_color == (int) candidate) {
399 available = false;
400 break;
401 }
402 }
403
404 if (available) {
405 reg->assigned_color = candidate;
406 found = true;
407 break;
408 }
409 }
410
411 /* TODO: spilling */
412 if (!found) {
413 gpir_error("Failed to allocate registers\n");
414 return false;
415 }
416 }
417
418 return true;
419 }
420
421 static void assign_regs(struct regalloc_ctx *ctx)
422 {
423 list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
424 list_for_each_entry(gpir_node, node, &block->node_list, list) {
425 if (node->index >= 0) {
426 node->value_reg =
427 ctx->registers[ctx->comp->cur_reg + node->index].assigned_color;
428 }
429
430 if (node->op == gpir_op_load_reg) {
431 gpir_load_node *load = gpir_node_to_load(node);
432 unsigned color = ctx->registers[load->reg->index].assigned_color;
433 load->index = color / 4;
434 load->component = color % 4;
435 }
436
437 if (node->op == gpir_op_store_reg) {
438 gpir_store_node *store = gpir_node_to_store(node);
439 unsigned color = ctx->registers[store->reg->index].assigned_color;
440 store->index = color / 4;
441 store->component = color % 4;
442 node->value_reg = color;
443 }
444 }
445
446 block->live_out_phys = 0;
447
448 int reg_idx;
449 BITSET_WORD tmp;
450 BITSET_FOREACH_SET(reg_idx, tmp, block->live_out, ctx->comp->cur_reg) {
451 if (BITSET_TEST(block->def_out, reg_idx)) {
452 block->live_out_phys |= (1ull << ctx->registers[reg_idx].assigned_color);
453 }
454 }
455 }
456 }
457
458 static void regalloc_print_result(gpir_compiler *comp)
459 {
460 if (!(lima_debug & LIMA_DEBUG_GP))
461 return;
462
463 int index = 0;
464 printf("======== regalloc ========\n");
465 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
466 list_for_each_entry(gpir_node, node, &block->node_list, list) {
467 printf("%03d: %d/%d %s ", index++, node->index, node->value_reg,
468 gpir_op_infos[node->op].name);
469 gpir_node_foreach_pred(node, dep) {
470 gpir_node *pred = dep->pred;
471 printf(" %d/%d", pred->index, pred->value_reg);
472 }
473 if (node->op == gpir_op_load_reg) {
474 gpir_load_node *load = gpir_node_to_load(node);
475 printf(" -/%d", 4 * load->index + load->component);
476 printf(" (%d)", load->reg->index);
477 } else if (node->op == gpir_op_store_reg) {
478 gpir_store_node *store = gpir_node_to_store(node);
479 printf(" (%d)", store->reg->index);
480 }
481 printf("\n");
482 }
483 printf("----------------------------\n");
484 }
485 }
486
487 bool gpir_regalloc_prog(gpir_compiler *comp)
488 {
489 struct regalloc_ctx ctx;
490
491 ctx.mem_ctx = ralloc_context(NULL);
492 ctx.num_nodes_and_regs = comp->cur_reg + comp->cur_index;
493 ctx.bitset_words = BITSET_WORDS(ctx.num_nodes_and_regs);
494 ctx.live = ralloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
495 ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs);
496 ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs);
497 ctx.comp = comp;
498
499 ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, ctx.num_nodes_and_regs);
500 for (unsigned i = 0; i < ctx.num_nodes_and_regs; i++) {
501 ctx.registers[i].conflicts = rzalloc_array(ctx.mem_ctx, BITSET_WORD,
502 ctx.bitset_words);
503 util_dynarray_init(&ctx.registers[i].conflict_list, ctx.mem_ctx);
504 }
505
506 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
507 block->live_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
508 block->live_in = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
509 block->def_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
510 }
511
512 calc_liveness(&ctx);
513 calc_interference(&ctx);
514 if (!do_regalloc(&ctx)) {
515 ralloc_free(ctx.mem_ctx);
516 return false;
517 }
518 assign_regs(&ctx);
519
520 regalloc_print_result(comp);
521 ralloc_free(ctx.mem_ctx);
522 return true;
523 }