nir: Add a global code motion (GCM) pass
[mesa.git] / src / glsl / nir / nir_opt_gcm.c
1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Jason Ekstrand (jason@jlekstrand.net)
25 *
26 */
27
28 #include "nir.h"
29
30 /*
31 * Implements Global Code Motion. A description of GCM can be found in
32  * "Global Code Motion / Global Value Numbering" by Cliff Click.
33 * Unfortunately, the algorithm presented in the paper is broken in a
34 * number of ways. The algorithm used here differs substantially from the
35 * one in the paper but it is, in my opinion, much easier to read and
36  * verify correctness.
37 */
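
/* A rough outline of the pass as implemented in this file, for orientation:
 *
 *  1. gcm_pin_instructions_block() marks instructions that must stay in
 *     their original block (derivatives, non-reorderable intrinsics, jumps,
 *     phis, etc.).
 *
 *  2. gcm_schedule_early_instr() hoists each unpinned instruction to the
 *     earliest block in which all of its sources are available.
 *
 *  3. gcm_schedule_late_instr() computes the LCA of all uses of each value
 *     and then walks up the dominance tree toward the early block,
 *     preferring the block with the smallest loop depth.
 *
 *  4. gcm_place_instr() re-inserts the scheduled instructions into their
 *     chosen blocks in an order that respects their dependencies.
 */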
38
39 struct gcm_block_info {
40 /* Number of loops this block is inside */
41 unsigned loop_depth;
42
43 /* The last instruction inserted into this block. This is used as we
44 * traverse the instructions and insert them back into the program to
45 * put them in the right order.
46 */
47 nir_instr *last_instr;
48 };
49
50 struct gcm_state {
51 nir_function_impl *impl;
52 nir_instr *instr;
53
54    /* Marks all instructions that have been visited by the current pass */
55 BITSET_WORD *visited;
56
57 /* Marks instructions that are "pinned", i.e. cannot be moved from their
58 * basic block by code motion */
59 BITSET_WORD *pinned;
60
61 /* The list of non-pinned instructions. As we do the late scheduling,
62 * we pull non-pinned instructions out of their blocks and place them in
63 * this list. This saves us from having linked-list problems when we go
64 * to put instructions back in their blocks.
65 */
66 struct exec_list instrs;
67
68 struct gcm_block_info *blocks;
69 };
70
71 /* Recursively walks the CFG and builds the block_info structure */
72 static void
73 gcm_build_block_info(struct exec_list *cf_list, struct gcm_state *state,
74 unsigned loop_depth)
75 {
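      /* Blocks in straight-line code end up with loop_depth == 0 while a
       * block nested inside two loops gets loop_depth == 2.  The late
       * scheduler uses these depths to prefer blocks outside of loops.
       */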
76 foreach_list_typed(nir_cf_node, node, node, cf_list) {
77 switch (node->type) {
78 case nir_cf_node_block: {
79 nir_block *block = nir_cf_node_as_block(node);
80 state->blocks[block->index].loop_depth = loop_depth;
81 break;
82 }
83 case nir_cf_node_if: {
84 nir_if *if_stmt = nir_cf_node_as_if(node);
85 gcm_build_block_info(&if_stmt->then_list, state, loop_depth);
86 gcm_build_block_info(&if_stmt->else_list, state, loop_depth);
87 break;
88 }
89 case nir_cf_node_loop: {
90 nir_loop *loop = nir_cf_node_as_loop(node);
91 gcm_build_block_info(&loop->body, state, loop_depth + 1);
92 break;
93 }
94 default:
95 unreachable("Invalid CF node type");
96 }
97 }
98 }
99
100 /* Walks the instruction list and marks immovable instructions as pinned */
101 static bool
102 gcm_pin_instructions_block(nir_block *block, void *void_state)
103 {
104 struct gcm_state *state = void_state;
105
106 nir_foreach_instr_safe(block, instr) {
107 bool pinned;
108 switch (instr->type) {
109 case nir_instr_type_alu:
110 switch (nir_instr_as_alu(instr)->op) {
111 case nir_op_fddx:
112 case nir_op_fddy:
113 case nir_op_fddx_fine:
114 case nir_op_fddy_fine:
115 case nir_op_fddx_coarse:
116 case nir_op_fddy_coarse:
117 /* These can only go in uniform control flow; pin them for now */
118              pinned = true;
                 break;
119
120 default:
121 pinned = false;
122 }
123 break;
124
125 case nir_instr_type_tex:
126 /* We need to pin texture ops that do partial derivatives */
127 pinned = nir_instr_as_tex(instr)->op == nir_texop_txd;
128 break;
129
130 case nir_instr_type_load_const:
131 pinned = false;
132 break;
133
134 case nir_instr_type_intrinsic: {
135 const nir_intrinsic_info *info =
136 &nir_intrinsic_infos[nir_instr_as_intrinsic(instr)->intrinsic];
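            /* Only intrinsics that can be both eliminated and reordered
             * (i.e. side-effect-free ones such as pure input/uniform loads)
             * are safe to move; everything else stays in its original block.
             */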
137 pinned = !(info->flags & NIR_INTRINSIC_CAN_ELIMINATE) ||
138 !(info->flags & NIR_INTRINSIC_CAN_REORDER);
139 break;
140 }
141
142 case nir_instr_type_jump:
143 case nir_instr_type_ssa_undef:
144 case nir_instr_type_phi:
145 pinned = true;
146 break;
147
148 default:
149 unreachable("Invalid instruction type in GCM");
150 }
151
152 if (pinned)
153 BITSET_SET(state->pinned, instr->index);
154 }
155
156 return true;
157 }
158
159 static void
160 gcm_schedule_early_instr(nir_instr *instr, struct gcm_state *state);
161
162 /** Update an instruction's schedule for the given source
163 *
164 * This function is called iteratively as we walk the sources of an
165 * instruction. It ensures that the given source instruction has been
166  * scheduled and then updates this instruction's block if the source
167 * instruction is lower down the tree.
168 */
169 static bool
170 gcm_schedule_early_src(nir_src *src, void *void_state)
171 {
172 struct gcm_state *state = void_state;
173 nir_instr *instr = state->instr;
174
175 assert(src->is_ssa);
176
177 gcm_schedule_early_instr(src->ssa->parent_instr, void_state);
178
179 /* While the index isn't a proper dominance depth, it does have the
180 * property that if A dominates B then A->index <= B->index. Since we
181 * know that this instruction must have been dominated by all of its
182 * sources at some point (even if it's gone through value-numbering),
183 * all of the sources must lie on the same branch of the dominance tree.
184     * Therefore, we can simply compare indices.
185 */
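   /* For example, if this instruction is currently scheduled in block 2 but
    * this source is defined in block 5, the check below moves the
    * instruction down to block 5 so that it is still dominated by the
    * source.
    */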
186 if (instr->block->index < src->ssa->parent_instr->block->index)
187 instr->block = src->ssa->parent_instr->block;
188
189    /* We need to restore state->instr because it may have been changed
190     * by the recursive gcm_schedule_early_instr call above.  We may still
191     * be iterating through sources, and future calls to
192     * gcm_schedule_early_src for the same instruction will still need it.
193 */
194 state->instr = instr;
195
196 return true;
197 }
198
199 /** Schedules an instruction early
200 *
201 * This function performs a recursive depth-first search starting at the
202 * given instruction and proceeding through the sources to schedule
203 * instructions as early as they can possibly go in the dominance tree.
204 * The instructions are "scheduled" by updating their instr->block field.
205 */
206 static void
207 gcm_schedule_early_instr(nir_instr *instr, struct gcm_state *state)
208 {
209 if (BITSET_TEST(state->visited, instr->index))
210 return;
211
212 BITSET_SET(state->visited, instr->index);
213
214 /* Pinned instructions are already scheduled so we don't need to do
215 * anything. Also, bailing here keeps us from ever following the
216 * sources of phi nodes which can be back-edges.
217 */
218 if (BITSET_TEST(state->pinned, instr->index))
219 return;
220
221 /* Start with the instruction at the top. As we iterate over the
222 * sources, it will get moved down as needed.
223 */
224 instr->block = state->impl->start_block;
225 state->instr = instr;
226
227 nir_foreach_src(instr, gcm_schedule_early_src, state);
228 }
229
230 static bool
231 gcm_schedule_early_block(nir_block *block, void *state)
232 {
233 nir_foreach_instr(block, instr)
234 gcm_schedule_early_instr(instr, state);
235
236 return true;
237 }
238
239 static void
240 gcm_schedule_late_instr(nir_instr *instr, struct gcm_state *state);
241
242 /** Schedules the instruction associated with the given SSA def late
243 *
244 * This function works by first walking all of the uses of the given SSA
245 * definition, ensuring that they are scheduled, and then computing the LCA
246 * (least common ancestor) of its uses. It then schedules this instruction
247 * as close to the LCA as possible while trying to stay out of loops.
248 */
249 static bool
250 gcm_schedule_late_def(nir_ssa_def *def, void *void_state)
251 {
252 struct gcm_state *state = void_state;
253
254 nir_block *lca = NULL;
255
256 struct set_entry *entry;
257 set_foreach(def->uses, entry) {
258 nir_instr *use_instr = (nir_instr *)entry->key;
259
260 gcm_schedule_late_instr(use_instr, state);
261
262 /* Phi instructions are a bit special. SSA definitions don't have to
263 * dominate the sources of the phi nodes that use them; instead, they
264 * have to dominate the predecessor block corresponding to the phi
265     * source.  We handle this by looking through the phi's sources, finding
266     * any that use this SSA def, and taking the corresponding predecessor
267     * blocks instead of the one the phi lives in.
268 */
269 if (use_instr->type == nir_instr_type_phi) {
270 nir_phi_instr *phi = nir_instr_as_phi(use_instr);
271
272 nir_foreach_phi_src(phi, phi_src) {
273 if (phi_src->src.ssa == def)
274 lca = nir_dominance_lca(lca, phi_src->pred);
275 }
276 } else {
277 lca = nir_dominance_lca(lca, use_instr->block);
278 }
279 }
280
281 set_foreach(def->if_uses, entry) {
282 nir_if *if_stmt = (nir_if *)entry->key;
283
284 /* For if statements, we consider the block to be the one immediately
285 * preceding the if CF node.
286 */
287 nir_block *pred_block =
288 nir_cf_node_as_block(nir_cf_node_prev(&if_stmt->cf_node));
289
290 lca = nir_dominance_lca(lca, pred_block);
291 }
292
293 /* Some instructions may never be used. We'll just leave them scheduled
294     * early and let dead-code elimination clean them up.
295 */
296 if (lca == NULL)
297 return true;
298
299    /* We now have the LCA of all of the uses.  If our invariants hold,
300 * this is dominated by the block that we chose when scheduling early.
301 * We now walk up the dominance tree and pick the lowest block that is
302 * as far outside loops as we can get.
303 */
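   /* For example, if the uses all sit inside a loop but the sources are
    * available well before it, the walk below can pull the instruction out
    * of the loop body, since a block on the path outside the loop has a
    * smaller loop_depth than the LCA.
    */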
304 nir_block *best = lca;
305 while (lca != def->parent_instr->block) {
306 assert(lca);
307 if (state->blocks[lca->index].loop_depth <
308 state->blocks[best->index].loop_depth)
309 best = lca;
310 lca = lca->imm_dom;
311 }
312 def->parent_instr->block = best;
313
314 return true;
315 }
316
317 /** Schedules an instruction late
318 *
319 * This function performs a depth-first search starting at the given
320 * instruction and proceeding through its uses to schedule instructions as
321 * late as they can reasonably go in the dominance tree. The instructions
322 * are "scheduled" by updating their instr->block field.
323 *
324 * The name of this function is actually a bit of a misnomer as it doesn't
325 * schedule them "as late as possible" as the paper implies. Instead, it
326  * first finds the latest possible place it can schedule the instruction and
327 * then possibly schedules it earlier than that. The actual location is as
328 * far down the tree as we can go while trying to stay out of loops.
329 */
330 static void
331 gcm_schedule_late_instr(nir_instr *instr, struct gcm_state *state)
332 {
333 if (BITSET_TEST(state->visited, instr->index))
334 return;
335
336 BITSET_SET(state->visited, instr->index);
337
338 /* Pinned instructions are already scheduled so we don't need to do
339 * anything. Also, bailing here keeps us from ever following phi nodes
340 * which can be back-edges.
341 */
342 if (BITSET_TEST(state->pinned, instr->index))
343 return;
344
345 nir_foreach_ssa_def(instr, gcm_schedule_late_def, state);
346 }
347
348 static bool
349 gcm_schedule_late_block(nir_block *block, void *void_state)
350 {
351 struct gcm_state *state = void_state;
352
353 nir_foreach_instr_safe(block, instr) {
354 gcm_schedule_late_instr(instr, state);
355
356 if (!BITSET_TEST(state->pinned, instr->index)) {
357 /* If this is an instruction we can move, go ahead and pull it out
358 * of the program and put it on the instrs list. This keeps us
359 * from causing linked list confusion when we're trying to put
360 * everything in its proper place.
361 *
362 * Note that we don't use nir_instr_remove here because that also
363 * cleans up uses and defs and we want to keep that information.
364 */
365 exec_node_remove(&instr->node);
366 exec_list_push_tail(&state->instrs, &instr->node);
367 }
368 }
369
370 return true;
371 }
372
373 static void
374 gcm_place_instr(nir_instr *instr, struct gcm_state *state);
375
376 static bool
377 gcm_place_instr_def(nir_ssa_def *def, void *state)
378 {
379 struct set_entry *entry;
380 set_foreach(def->uses, entry)
381 gcm_place_instr((nir_instr *)entry->key, state);
382
383 return false;
384 }
385
386 /** Places an instruction back into the program
387 *
388 * The earlier passes of GCM simply choose blocks for each instruction and
389 * otherwise leave them alone. This pass actually places the instructions
390 * into their chosen blocks.
391 *
392 * To do so, we use a standard post-order depth-first search linearization
393 * algorithm. We walk over the uses of the given instruction and ensure
394 * that they are placed and then place this instruction. Because we are
395 * working on multiple blocks at a time, we keep track of the last inserted
396 * instruction per-block in the state structure's block_info array. When
397 * we insert an instruction in a block we insert it before the last
398 * instruction inserted in that block rather than the last instruction
399 * inserted globally.
400 */
401 static void
402 gcm_place_instr(nir_instr *instr, struct gcm_state *state)
403 {
404 if (BITSET_TEST(state->visited, instr->index))
405 return;
406
407 BITSET_SET(state->visited, instr->index);
408
409    /* Phi nodes are our one source of back-edges.  Since right now we are
410     * only scheduling within blocks, we don't need to worry about
411     * them because they are always at the top.  Just skip them completely.
412 */
413 if (instr->type == nir_instr_type_phi) {
414 assert(BITSET_TEST(state->pinned, instr->index));
415 return;
416 }
417
418 nir_foreach_ssa_def(instr, gcm_place_instr_def, state);
419
420 if (BITSET_TEST(state->pinned, instr->index)) {
421 /* Pinned instructions have an implicit dependence on the pinned
422 * instructions that come after them in the block. Since the pinned
423 * instructions will naturally "chain" together, we only need to
424 * explicitly visit one of them.
425 */
426 for (nir_instr *after = nir_instr_next(instr);
427 after;
428 after = nir_instr_next(after)) {
429 if (BITSET_TEST(state->pinned, after->index)) {
430 gcm_place_instr(after, state);
431 break;
432 }
433 }
434 }
435
436 struct gcm_block_info *block_info = &state->blocks[instr->block->index];
437 if (!BITSET_TEST(state->pinned, instr->index)) {
438 exec_node_remove(&instr->node);
439
440 if (block_info->last_instr) {
441 exec_node_insert_node_before(&block_info->last_instr->node,
442 &instr->node);
443 } else {
444 /* Schedule it at the end of the block */
445 nir_instr *jump_instr = nir_block_last_instr(instr->block);
446 if (jump_instr && jump_instr->type == nir_instr_type_jump) {
447 exec_node_insert_node_before(&jump_instr->node, &instr->node);
448 } else {
449 exec_list_push_tail(&instr->block->instr_list, &instr->node);
450 }
451 }
452 }
453
454 block_info->last_instr = instr;
455 }
456
457 static void
458 opt_gcm_impl(nir_function_impl *impl)
459 {
460 struct gcm_state state;
461
462 unsigned num_instrs = nir_index_instrs(impl);
463 unsigned instr_words = BITSET_WORDS(num_instrs);
464
465 state.impl = impl;
466 state.instr = NULL;
467 state.visited = rzalloc_array(NULL, BITSET_WORD, instr_words);
468 state.pinned = rzalloc_array(NULL, BITSET_WORD, instr_words);
469 exec_list_make_empty(&state.instrs);
470 state.blocks = rzalloc_array(NULL, struct gcm_block_info, impl->num_blocks);
471
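   /* Block indices back the per-block state array and the index-based
    * dominance check in gcm_schedule_early_src(); dominance information is
    * needed for nir_dominance_lca() and the imm_dom walk when scheduling
    * late.
    */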
472 nir_metadata_require(impl, nir_metadata_block_index |
473 nir_metadata_dominance);
474
475 gcm_build_block_info(&impl->body, &state, 0);
476 nir_foreach_block(impl, gcm_pin_instructions_block, &state);
477
478 nir_foreach_block(impl, gcm_schedule_early_block, &state);
479
480 memset(state.visited, 0, instr_words * sizeof(*state.visited));
481 nir_foreach_block(impl, gcm_schedule_late_block, &state);
482
483 memset(state.visited, 0, instr_words * sizeof(*state.visited));
484 while (!exec_list_is_empty(&state.instrs)) {
485 nir_instr *instr = exec_node_data(nir_instr,
486 state.instrs.tail_pred, node);
487 gcm_place_instr(instr, &state);
488 }
489
490    ralloc_free(state.visited);
       ralloc_free(state.pinned);
491    ralloc_free(state.blocks);
492 }
493
494 void
495 nir_opt_gcm(nir_shader *shader)
496 {
497 nir_foreach_overload(shader, overload) {
498 if (overload->impl)
499 opt_gcm_impl(overload->impl);
500 }
501 }