e39bdbcbffb8fc694dfc28f6309df86f1a33cade
[gcc.git] / gcc / loop-unroll.c
1 /* Loop unrolling and peeling.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
9 version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING. If not, write to the Free
18 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
19 02111-1307, USA. */
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "rtl.h"
26 #include "hard-reg-set.h"
27 #include "basic-block.h"
28 #include "cfgloop.h"
29 #include "cfglayout.h"
30 #include "params.h"
31 #include "output.h"
32 #include "expr.h"
33
34 /* This pass performs loop unrolling and peeling. We only perform these
35 optimalizations on innermost loops (with single exception) because
36 the impact on performance is greatest here, and we want to avoid
37 unnecessary code size growth. The gain is caused by greater sequentiality
38 of code, better code to optimize for futher passes and in some cases
39 by fewer testings of exit conditions. The main problem is code growth,
40 that impacts performance negatively due to effect of caches.
41
42 What we do:
43
44 -- complete peeling of once-rolling loops; this is the above mentioned
45 exception, as this causes loop to be cancelled completely and
46 does not cause code growth
47 -- complete peeling of loops that roll (small) constant times.
48 -- simple peeling of first iterations of loops that do not roll much
49 (according to profile feedback)
50 -- unrolling of loops that roll constant times; this is almost always
51 win, as we get rid of exit condition tests.
52 -- unrolling of loops that roll number of times that we can compute
53 in runtime; we also get rid of exit condition tests here, but there
54 is the extra expense for calculating the number of iterations
55 -- simple unrolling of remaining loops; this is performed only if we
56 are asked to, as the gain is questionable in this case and often
57 it may even slow down the code
58 For more detailed descriptions of each of those, see comments at
59 appropriate function below.
60
61 There is a lot of parameters (defined and described in params.def) that
62 control how much we unroll/peel.
63
64 ??? A great problem is that we don't have a good way how to determine
65 how many times we should unroll the loop; the experiments I have made
66 showed that this choice may affect performance in order of several %.
67 */
68
69 static void decide_unrolling_and_peeling PARAMS ((struct loops *, int));
70 static void peel_loops_completely PARAMS ((struct loops *, int));
71 static void decide_peel_simple PARAMS ((struct loops *, struct loop *, int));
72 static void decide_peel_once_rolling PARAMS ((struct loops *, struct loop *, int));
73 static void decide_peel_completely PARAMS ((struct loops *, struct loop *, int));
74 static void decide_unroll_stupid PARAMS ((struct loops *, struct loop *, int));
75 static void decide_unroll_constant_iterations PARAMS ((struct loops *, struct loop *, int));
76 static void decide_unroll_runtime_iterations PARAMS ((struct loops *, struct loop *, int));
77 static void peel_loop_simple PARAMS ((struct loops *, struct loop *));
78 static void peel_loop_completely PARAMS ((struct loops *, struct loop *));
79 static void unroll_loop_stupid PARAMS ((struct loops *, struct loop *));
80 static void unroll_loop_constant_iterations PARAMS ((struct loops *,
81 struct loop *));
82 static void unroll_loop_runtime_iterations PARAMS ((struct loops *,
83 struct loop *));
84
85 /* Unroll and/or peel (depending on FLAGS) LOOPS. */
86 void
87 unroll_and_peel_loops (loops, flags)
88 struct loops *loops;
89 int flags;
90 {
91 struct loop *loop, *next;
92 int check;
93
94 /* First perform complete loop peeling (it is almost surely a win,
95 and affects parameters for further decision a lot). */
96 peel_loops_completely (loops, flags);
97
98 /* Now decide rest of unrolling and peeling. */
99 decide_unrolling_and_peeling (loops, flags);
100
101 loop = loops->tree_root;
102 while (loop->inner)
103 loop = loop->inner;
104
105 /* Scan the loops, inner ones first. */
106 while (loop != loops->tree_root)
107 {
108 if (loop->next)
109 {
110 next = loop->next;
111 while (next->inner)
112 next = next->inner;
113 }
114 else
115 next = loop->outer;
116
117 check = 1;
118 /* And perform the appropriate transformations. */
119 switch (loop->lpt_decision.decision)
120 {
121 case LPT_PEEL_COMPLETELY:
122 /* Already done. */
123 abort ();
124 case LPT_PEEL_SIMPLE:
125 peel_loop_simple (loops, loop);
126 break;
127 case LPT_UNROLL_CONSTANT:
128 unroll_loop_constant_iterations (loops, loop);
129 break;
130 case LPT_UNROLL_RUNTIME:
131 unroll_loop_runtime_iterations (loops, loop);
132 break;
133 case LPT_UNROLL_STUPID:
134 unroll_loop_stupid (loops, loop);
135 break;
136 case LPT_NONE:
137 check = 0;
138 break;
139 default:
140 abort ();
141 }
142 if (check)
143 {
144 #ifdef ENABLE_CHECKING
145 verify_dominators (loops->cfg.dom);
146 verify_loop_structure (loops);
147 #endif
148 }
149 loop = next;
150 }
151 }
152
153 /* Check whether to peel LOOPS (depending on FLAGS) completely and do so. */
154 static void
155 peel_loops_completely (loops, flags)
156 struct loops *loops;
157 int flags;
158 {
159 struct loop *loop, *next;
160
161 loop = loops->tree_root;
162 while (loop->inner)
163 loop = loop->inner;
164
165 while (loop != loops->tree_root)
166 {
167 if (loop->next)
168 {
169 next = loop->next;
170 while (next->inner)
171 next = next->inner;
172 }
173 else
174 next = loop->outer;
175
176 loop->lpt_decision.decision = LPT_NONE;
177 loop->has_desc = 0;
178
179 if (rtl_dump_file)
180 fprintf (rtl_dump_file, ";; Considering loop %d for complete peeling\n",
181 loop->num);
182
183 loop->ninsns = num_loop_insns (loop);
184
185 decide_peel_once_rolling (loops, loop, flags);
186 if (loop->lpt_decision.decision == LPT_NONE)
187 decide_peel_completely (loops, loop, flags);
188
189 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
190 {
191 peel_loop_completely (loops, loop);
192 #ifdef ENABLE_CHECKING
193 verify_dominators (loops->cfg.dom);
194 verify_loop_structure (loops);
195 #endif
196 }
197 loop = next;
198 }
199 }
200
201 /* Decide whether unroll or peel LOOPS (depending on FLAGS) and how much. */
202 static void
203 decide_unrolling_and_peeling (loops, flags)
204 struct loops *loops;
205 int flags;
206 {
207 struct loop *loop = loops->tree_root, *next;
208
209 while (loop->inner)
210 loop = loop->inner;
211
212 /* Scan the loops, inner ones first. */
213 while (loop != loops->tree_root)
214 {
215 if (loop->next)
216 {
217 next = loop->next;
218 while (next->inner)
219 next = next->inner;
220 }
221 else
222 next = loop->outer;
223
224 loop->lpt_decision.decision = LPT_NONE;
225
226 if (rtl_dump_file)
227 fprintf (rtl_dump_file, ";; Considering loop %d\n", loop->num);
228
229 /* Do not peel cold areas. */
230 if (!maybe_hot_bb_p (loop->header))
231 {
232 if (rtl_dump_file)
233 fprintf (rtl_dump_file, ";; Not considering loop, cold area\n");
234 loop = next;
235 continue;
236 }
237
238 /* Can the loop be manipulated? */
239 if (!can_duplicate_loop_p (loop))
240 {
241 if (rtl_dump_file)
242 fprintf (rtl_dump_file,
243 ";; Not considering loop, cannot duplicate\n");
244 loop = next;
245 continue;
246 }
247
248 /* Skip non-innermost loops. */
249 if (loop->inner)
250 {
251 if (rtl_dump_file)
252 fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n");
253 loop = next;
254 continue;
255 }
256
257 loop->ninsns = num_loop_insns (loop);
258 loop->av_ninsns = average_num_loop_insns (loop);
259
260 /* Try transformations one by one in decreasing order of
261 priority. */
262
263 decide_unroll_constant_iterations (loops, loop, flags);
264 if (loop->lpt_decision.decision == LPT_NONE)
265 decide_unroll_runtime_iterations (loops, loop, flags);
266 if (loop->lpt_decision.decision == LPT_NONE)
267 decide_unroll_stupid (loops, loop, flags);
268 if (loop->lpt_decision.decision == LPT_NONE)
269 decide_peel_simple (loops, loop, flags);
270
271 loop = next;
272 }
273 }
274
275 /* Decide whether the LOOP is once rolling and suitable for complete
276 peeling. */
277 static void
278 decide_peel_once_rolling (loops, loop, flags)
279 struct loops *loops;
280 struct loop *loop;
281 int flags ATTRIBUTE_UNUSED;
282 {
283 if (rtl_dump_file)
284 fprintf (rtl_dump_file, ";; Considering peeling once rolling loop\n");
285
286 /* Is the loop small enough? */
287 if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
288 {
289 if (rtl_dump_file)
290 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
291 return;
292 }
293
294 /* Check for simple loops. */
295 loop->simple = simple_loop_p (loops, loop, &loop->desc);
296 loop->has_desc = 1;
297
298 /* Check number of iterations. */
299 if (!loop->simple || !loop->desc.const_iter || loop->desc.niter !=0)
300 {
301 if (rtl_dump_file)
302 fprintf (rtl_dump_file, ";; Unable to prove that the loop rolls exactly once\n");
303 return;
304 }
305
306 /* Success. */
307 if (rtl_dump_file)
308 fprintf (rtl_dump_file, ";; Decided to peel exactly once rolling loop\n");
309 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
310 }
311
312 /* Decide whether the LOOP is suitable for complete peeling. */
313 static void
314 decide_peel_completely (loops, loop, flags)
315 struct loops *loops;
316 struct loop *loop;
317 int flags ATTRIBUTE_UNUSED;
318 {
319 unsigned npeel;
320
321 if (rtl_dump_file)
322 fprintf (rtl_dump_file, ";; Considering peeling completely\n");
323
324 /* Skip non-innermost loops. */
325 if (loop->inner)
326 {
327 if (rtl_dump_file)
328 fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n");
329 return;
330 }
331
332 /* Do not peel cold areas. */
333 if (!maybe_hot_bb_p (loop->header))
334 {
335 if (rtl_dump_file)
336 fprintf (rtl_dump_file, ";; Not considering loop, cold area\n");
337 return;
338 }
339
340 /* Can the loop be manipulated? */
341 if (!can_duplicate_loop_p (loop))
342 {
343 if (rtl_dump_file)
344 fprintf (rtl_dump_file,
345 ";; Not considering loop, cannot duplicate\n");
346 return;
347 }
348
349 /* npeel = number of iterations to peel. */
350 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
351 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
352 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
353
354 /* Is the loop small enough? */
355 if (!npeel)
356 {
357 if (rtl_dump_file)
358 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
359 return;
360 }
361
362 /* Check for simple loops. */
363 if (!loop->has_desc)
364 loop->simple = simple_loop_p (loops, loop, &loop->desc);
365
366 /* Check number of iterations. */
367 if (!loop->simple || !loop->desc.const_iter)
368 {
369 if (rtl_dump_file)
370 fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n");
371 return;
372 }
373
374 if (loop->desc.niter > npeel - 1)
375 {
376 if (rtl_dump_file)
377 {
378 fprintf (rtl_dump_file, ";; Not peeling loop completely, rolls too much (");
379 fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC,(HOST_WIDEST_INT) loop->desc.niter);
380 fprintf (rtl_dump_file, "iterations > %d [maximum peelings])\n", npeel);
381 }
382 return;
383 }
384
385 /* Success. */
386 if (rtl_dump_file)
387 fprintf (rtl_dump_file, ";; Decided to peel loop completely\n");
388 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
389 }
390
391 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
392 completely. The transformation done:
393
394 for (i = 0; i < 4; i++)
395 body;
396
397 ==>
398
399 i = 0;
400 body; i++;
401 body; i++;
402 body; i++;
403 body; i++;
404 */
405 static void
406 peel_loop_completely (loops, loop)
407 struct loops *loops;
408 struct loop *loop;
409 {
410 sbitmap wont_exit;
411 unsigned HOST_WIDE_INT npeel;
412 unsigned n_remove_edges, i;
413 edge *remove_edges;
414 struct loop_desc *desc = &loop->desc;
415
416 npeel = desc->niter;
417
418 if (npeel)
419 {
420 wont_exit = sbitmap_alloc (npeel + 1);
421 sbitmap_ones (wont_exit);
422 RESET_BIT (wont_exit, 0);
423 if (desc->may_be_zero)
424 RESET_BIT (wont_exit, 1);
425
426 remove_edges = xcalloc (npeel, sizeof (edge));
427 n_remove_edges = 0;
428
429 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
430 loops, npeel,
431 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
432 DLTHE_FLAG_UPDATE_FREQ))
433 abort ();
434
435 free (wont_exit);
436
437 /* Remove the exit edges. */
438 for (i = 0; i < n_remove_edges; i++)
439 remove_path (loops, remove_edges[i]);
440 free (remove_edges);
441 }
442
443 /* Now remove the unreachable part of the last iteration and cancel
444 the loop. */
445 remove_path (loops, desc->in_edge);
446
447 if (rtl_dump_file)
448 fprintf (rtl_dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
449 }
450
451 /* Decide whether to unroll LOOP iterating constant number of times and how much. */
452 static void
453 decide_unroll_constant_iterations (loops, loop, flags)
454 struct loops *loops;
455 struct loop *loop;
456 int flags;
457 {
458 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = -1, n_copies, i;
459
460 if (!(flags & UAP_UNROLL))
461 {
462 /* We were not asked to, just return back silently. */
463 return;
464 }
465
466 if (rtl_dump_file)
467 fprintf (rtl_dump_file, ";; Considering unrolling loop with constant number of iterations\n");
468
469 /* nunroll = total number of copies of the original loop body in
470 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
471 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
472 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
473 if (nunroll > nunroll_by_av)
474 nunroll = nunroll_by_av;
475 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
476 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
477
478 /* Skip big loops. */
479 if (nunroll <= 1)
480 {
481 if (rtl_dump_file)
482 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
483 return;
484 }
485
486 /* Check for simple loops. */
487 if (!loop->has_desc)
488 loop->simple = simple_loop_p (loops, loop, &loop->desc);
489
490 /* Check number of iterations. */
491 if (!loop->simple || !loop->desc.const_iter)
492 {
493 if (rtl_dump_file)
494 fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n");
495 return;
496 }
497
498 /* Check whether the loop rolls enough to consider. */
499 if (loop->desc.niter < 2 * nunroll)
500 {
501 if (rtl_dump_file)
502 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
503 return;
504 }
505
506 /* Success; now compute number of iterations to unroll. We alter
507 nunroll so that as few as possible copies of loop body are
508 neccesary, while still not decreasing the number of unrollings
509 too much (at most by 1). */
510 best_copies = 2 * nunroll + 10;
511
512 i = 2 * nunroll + 2;
513 if ((unsigned) i - 1 >= loop->desc.niter)
514 i = loop->desc.niter - 2;
515
516 for (; i >= nunroll - 1; i--)
517 {
518 unsigned exit_mod = loop->desc.niter % (i + 1);
519
520 if (loop->desc.postincr)
521 n_copies = exit_mod + i + 1;
522 else if (exit_mod != (unsigned) i || loop->desc.may_be_zero)
523 n_copies = exit_mod + i + 2;
524 else
525 n_copies = i + 1;
526
527 if (n_copies < best_copies)
528 {
529 best_copies = n_copies;
530 best_unroll = i;
531 }
532 }
533
534 if (rtl_dump_file)
535 fprintf (rtl_dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
536 best_unroll + 1, best_copies, nunroll);
537
538 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
539 loop->lpt_decision.times = best_unroll;
540 }
541
542 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
543 times. The transformation does this:
544
545 for (i = 0; i < 102; i++)
546 body;
547
548 ==>
549
550 i = 0;
551 body; i++;
552 body; i++;
553 while (i < 102)
554 {
555 body; i++;
556 body; i++;
557 body; i++;
558 body; i++;
559 }
560 */
561 static void
562 unroll_loop_constant_iterations (loops, loop)
563 struct loops *loops;
564 struct loop *loop;
565 {
566 unsigned HOST_WIDE_INT niter;
567 unsigned exit_mod;
568 sbitmap wont_exit;
569 unsigned n_remove_edges, i;
570 edge *remove_edges;
571 unsigned max_unroll = loop->lpt_decision.times;
572 struct loop_desc *desc = &loop->desc;
573
574 niter = desc->niter;
575
576 if (niter <= (unsigned) max_unroll + 1)
577 abort (); /* Should not get here (such loop should be peeled instead). */
578
579 exit_mod = niter % (max_unroll + 1);
580
581 wont_exit = sbitmap_alloc (max_unroll + 1);
582 sbitmap_ones (wont_exit);
583
584 remove_edges = xcalloc (max_unroll + exit_mod + 1, sizeof (edge));
585 n_remove_edges = 0;
586
587 if (desc->postincr)
588 {
589 /* Counter is incremented after the exit test; leave exit test
590 in the first copy, so that the loops that start with test
591 of exit condition have continuous body after unrolling. */
592
593 if (rtl_dump_file)
594 fprintf (rtl_dump_file, ";; Condition on beginning of loop.\n");
595
596 /* Peel exit_mod iterations. */
597 RESET_BIT (wont_exit, 0);
598 if (desc->may_be_zero)
599 RESET_BIT (wont_exit, 1);
600
601 if (exit_mod
602 && !duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
603 loops, exit_mod,
604 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
605 DLTHE_FLAG_UPDATE_FREQ))
606 abort ();
607
608 SET_BIT (wont_exit, 1);
609 }
610 else
611 {
612 /* Leave exit test in last copy, for the same reason as above if
613 the loop tests the condition at the end of loop body. */
614
615 if (rtl_dump_file)
616 fprintf (rtl_dump_file, ";; Condition on end of loop.\n");
617
618 /* We know that niter >= max_unroll + 2; so we do not need to care of
619 case when we would exit before reaching the loop. So just peel
620 exit_mod + 1 iterations.
621 */
622 if (exit_mod != (unsigned) max_unroll || desc->may_be_zero)
623 {
624 RESET_BIT (wont_exit, 0);
625 if (desc->may_be_zero)
626 RESET_BIT (wont_exit, 1);
627
628 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
629 loops, exit_mod + 1,
630 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
631 DLTHE_FLAG_UPDATE_FREQ))
632 abort ();
633
634 SET_BIT (wont_exit, 0);
635 SET_BIT (wont_exit, 1);
636 }
637
638 RESET_BIT (wont_exit, max_unroll);
639 }
640
641 /* Now unroll the loop. */
642 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
643 loops, max_unroll,
644 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
645 DLTHE_FLAG_UPDATE_FREQ))
646 abort ();
647
648 free (wont_exit);
649
650 /* Remove the edges. */
651 for (i = 0; i < n_remove_edges; i++)
652 remove_path (loops, remove_edges[i]);
653 free (remove_edges);
654
655 if (rtl_dump_file)
656 fprintf (rtl_dump_file, ";; Unrolled loop %d times, constant # of iterations %i insns\n",max_unroll, num_loop_insns (loop));
657 }
658
659 /* Decide whether to unroll LOOP iterating runtime computable number of times
660 and how much. */
661 static void
662 decide_unroll_runtime_iterations (loops, loop, flags)
663 struct loops *loops;
664 struct loop *loop;
665 int flags;
666 {
667 unsigned nunroll, nunroll_by_av, i;
668
669 if (!(flags & UAP_UNROLL))
670 {
671 /* We were not asked to, just return back silently. */
672 return;
673 }
674
675 if (rtl_dump_file)
676 fprintf (rtl_dump_file, ";; Considering unrolling loop with runtime computable number of iterations\n");
677
678 /* nunroll = total number of copies of the original loop body in
679 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
680 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
681 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
682 if (nunroll > nunroll_by_av)
683 nunroll = nunroll_by_av;
684 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
685 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
686
687 /* Skip big loops. */
688 if (nunroll <= 1)
689 {
690 if (rtl_dump_file)
691 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
692 return;
693 }
694
695 /* Check for simple loops. */
696 if (!loop->has_desc)
697 loop->simple = simple_loop_p (loops, loop, &loop->desc);
698
699 /* Check simpleness. */
700 if (!loop->simple)
701 {
702 if (rtl_dump_file)
703 fprintf (rtl_dump_file, ";; Unable to prove that the number of iterations can be counted in runtime\n");
704 return;
705 }
706
707 if (loop->desc.const_iter)
708 {
709 if (rtl_dump_file)
710 fprintf (rtl_dump_file, ";; Loop iterates constant times\n");
711 return;
712 }
713
714 /* If we have profile feedback, check whether the loop rolls. */
715 if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
716 {
717 if (rtl_dump_file)
718 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
719 return;
720 }
721
722 /* Success; now force nunroll to be power of 2, as we are unable to
723 cope with overflows in computation of number of iterations. */
724 for (i = 1; 2 * i <= nunroll; i *= 2);
725
726 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
727 loop->lpt_decision.times = i - 1;
728 }
729
730 /* Unroll LOOP for that we are able to count number of iterations in runtime
731 LOOP->LPT_DECISION.TIMES + 1 times. The transformation does this (with some
732 extra care for case n < 0):
733
734 for (i = 0; i < n; i++)
735 body;
736
737 ==>
738
739 i = 0;
740 mod = n % 4;
741
742 switch (mod)
743 {
744 case 3:
745 body; i++;
746 case 2:
747 body; i++;
748 case 1:
749 body; i++;
750 case 0: ;
751 }
752
753 while (i < n)
754 {
755 body; i++;
756 body; i++;
757 body; i++;
758 body; i++;
759 }
760 */
761 static void
762 unroll_loop_runtime_iterations (loops, loop)
763 struct loops *loops;
764 struct loop *loop;
765 {
766 rtx niter, init_code, branch_code, jump, label;
767 unsigned i, j, p;
768 basic_block preheader, *body, *dom_bbs, swtch, ezc_swtch;
769 unsigned n_dom_bbs;
770 sbitmap wont_exit;
771 int may_exit_copy;
772 unsigned n_peel, n_remove_edges;
773 edge *remove_edges, e;
774 bool extra_zero_check, last_may_exit;
775 unsigned max_unroll = loop->lpt_decision.times;
776 struct loop_desc *desc = &loop->desc;
777
778 /* Remember blocks whose dominators will have to be updated. */
779 dom_bbs = xcalloc (n_basic_blocks, sizeof (basic_block));
780 n_dom_bbs = 0;
781
782 body = get_loop_body (loop);
783 for (i = 0; i < loop->num_nodes; i++)
784 {
785 unsigned nldom;
786 basic_block *ldom;
787
788 nldom = get_dominated_by (loops->cfg.dom, body[i], &ldom);
789 for (j = 0; j < nldom; j++)
790 if (!flow_bb_inside_loop_p (loop, ldom[j]))
791 dom_bbs[n_dom_bbs++] = ldom[j];
792
793 free (ldom);
794 }
795 free (body);
796
797 if (desc->postincr)
798 {
799 /* Leave exit in first copy (for explanation why see comment in
800 unroll_loop_constant_iterations). */
801 may_exit_copy = 0;
802 n_peel = max_unroll - 1;
803 extra_zero_check = true;
804 last_may_exit = false;
805 }
806 else
807 {
808 /* Leave exit in last copy (for explanation why see comment in
809 unroll_loop_constant_iterations). */
810 may_exit_copy = max_unroll;
811 n_peel = max_unroll;
812 extra_zero_check = false;
813 last_may_exit = true;
814 }
815
816 /* Get expression for number of iterations. */
817 start_sequence ();
818 niter = count_loop_iterations (desc, NULL, NULL);
819 if (!niter)
820 abort ();
821 niter = force_operand (niter, NULL);
822
823 /* Count modulo by ANDing it with max_unroll; we use the fact that
824 the number of unrollings is a power of two, and thus this is correct
825 even if there is overflow in the computation. */
826 niter = expand_simple_binop (GET_MODE (desc->var), AND,
827 niter,
828 GEN_INT (max_unroll),
829 NULL_RTX, 0, OPTAB_LIB_WIDEN);
830
831 init_code = get_insns ();
832 end_sequence ();
833
834 /* Precondition the loop. */
835 loop_split_edge_with (loop_preheader_edge (loop), init_code, loops);
836
837 remove_edges = xcalloc (max_unroll + n_peel + 1, sizeof (edge));
838 n_remove_edges = 0;
839
840 wont_exit = sbitmap_alloc (max_unroll + 2);
841
842 /* Peel the first copy of loop body (almost always we must leave exit test
843 here; the only exception is when we have extra zero check and the number
844 of iterations is reliable (i.e. comes out of NE condition). Also record
845 the place of (possible) extra zero check. */
846 sbitmap_zero (wont_exit);
847 if (extra_zero_check && desc->cond == NE)
848 SET_BIT (wont_exit, 1);
849 ezc_swtch = loop_preheader_edge (loop)->src;
850 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
851 loops, 1,
852 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
853 DLTHE_FLAG_UPDATE_FREQ))
854 abort ();
855
856 /* Record the place where switch will be built for preconditioning. */
857 swtch = loop_split_edge_with (loop_preheader_edge (loop),
858 NULL_RTX, loops);
859
860 for (i = 0; i < n_peel; i++)
861 {
862 /* Peel the copy. */
863 sbitmap_zero (wont_exit);
864 if (i != n_peel - 1 || !last_may_exit)
865 SET_BIT (wont_exit, 1);
866 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
867 loops, 1,
868 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
869 DLTHE_FLAG_UPDATE_FREQ))
870 abort ();
871
872 if (i != n_peel)
873 {
874 /* Create item for switch. */
875 j = n_peel - i - (extra_zero_check ? 0 : 1);
876 p = REG_BR_PROB_BASE / (i + 2);
877
878 preheader = loop_split_edge_with (loop_preheader_edge (loop),
879 NULL_RTX, loops);
880 label = block_label (preheader);
881 start_sequence ();
882 do_compare_rtx_and_jump (copy_rtx (niter), GEN_INT (j), EQ, 0,
883 GET_MODE (desc->var), NULL_RTX, NULL_RTX,
884 label);
885 jump = get_last_insn ();
886 JUMP_LABEL (jump) = label;
887 REG_NOTES (jump)
888 = gen_rtx_EXPR_LIST (REG_BR_PROB,
889 GEN_INT (p), REG_NOTES (jump));
890
891 LABEL_NUSES (label)++;
892 branch_code = get_insns ();
893 end_sequence ();
894
895 swtch = loop_split_edge_with (swtch->pred, branch_code, loops);
896 set_immediate_dominator (loops->cfg.dom, preheader, swtch);
897 swtch->succ->probability = REG_BR_PROB_BASE - p;
898 e = make_edge (swtch, preheader,
899 swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
900 e->probability = p;
901 }
902 }
903
904 if (extra_zero_check)
905 {
906 /* Add branch for zero iterations. */
907 p = REG_BR_PROB_BASE / (max_unroll + 1);
908 swtch = ezc_swtch;
909 preheader = loop_split_edge_with (loop_preheader_edge (loop),
910 NULL_RTX, loops);
911 label = block_label (preheader);
912 start_sequence ();
913 do_compare_rtx_and_jump (copy_rtx (niter), const0_rtx, EQ, 0,
914 GET_MODE (desc->var), NULL_RTX, NULL_RTX,
915 label);
916 jump = get_last_insn ();
917 JUMP_LABEL (jump) = label;
918 REG_NOTES (jump)
919 = gen_rtx_EXPR_LIST (REG_BR_PROB,
920 GEN_INT (p), REG_NOTES (jump));
921
922 LABEL_NUSES (label)++;
923 branch_code = get_insns ();
924 end_sequence ();
925
926 swtch = loop_split_edge_with (swtch->succ, branch_code, loops);
927 set_immediate_dominator (loops->cfg.dom, preheader, swtch);
928 swtch->succ->probability = REG_BR_PROB_BASE - p;
929 e = make_edge (swtch, preheader,
930 swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
931 e->probability = p;
932 }
933
934 /* Recount dominators for outer blocks. */
935 iterate_fix_dominators (loops->cfg.dom, dom_bbs, n_dom_bbs);
936
937 /* And unroll loop. */
938
939 sbitmap_ones (wont_exit);
940 RESET_BIT (wont_exit, may_exit_copy);
941
942 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
943 loops, max_unroll,
944 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
945 DLTHE_FLAG_UPDATE_FREQ))
946 abort ();
947
948 free (wont_exit);
949
950 /* Remove the edges. */
951 for (i = 0; i < n_remove_edges; i++)
952 remove_path (loops, remove_edges[i]);
953 free (remove_edges);
954
955 if (rtl_dump_file)
956 fprintf (rtl_dump_file,
957 ";; Unrolled loop %d times, counting # of iterations in runtime, %i insns\n",
958 max_unroll, num_loop_insns (loop));
959 }
960
961 /* Decide whether to simply peel LOOP and how much. */
962 static void
963 decide_peel_simple (loops, loop, flags)
964 struct loops *loops;
965 struct loop *loop;
966 int flags;
967 {
968 unsigned npeel;
969
970 if (!(flags & UAP_PEEL))
971 {
972 /* We were not asked to, just return back silently. */
973 return;
974 }
975
976 if (rtl_dump_file)
977 fprintf (rtl_dump_file, ";; Considering simply peeling loop\n");
978
979 /* npeel = number of iterations to peel. */
980 npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
981 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
982 npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
983
984 /* Skip big loops. */
985 if (!npeel)
986 {
987 if (rtl_dump_file)
988 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
989 return;
990 }
991
992 /* Check for simple loops. */
993 if (!loop->has_desc)
994 loop->simple = simple_loop_p (loops, loop, &loop->desc);
995
996 /* Check number of iterations. */
997 if (loop->simple && loop->desc.const_iter)
998 {
999 if (rtl_dump_file)
1000 fprintf (rtl_dump_file, ";; Loop iterates constant times\n");
1001 return;
1002 }
1003
1004 /* Do not simply peel loops with branches inside -- it increases number
1005 of mispredicts. */
1006 if (loop->desc.n_branches > 1)
1007 {
1008 if (rtl_dump_file)
1009 fprintf (rtl_dump_file, ";; Not peeling, contains branches\n");
1010 return;
1011 }
1012
1013 if (loop->header->count)
1014 {
1015 unsigned niter = expected_loop_iterations (loop);
1016 if (niter + 1 > npeel)
1017 {
1018 if (rtl_dump_file)
1019 {
1020 fprintf (rtl_dump_file, ";; Not peeling loop, rolls too much (");
1021 fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC, (HOST_WIDEST_INT) (niter + 1));
1022 fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel);
1023 }
1024 return;
1025 }
1026 npeel = niter + 1;
1027 }
1028 else
1029 {
1030 /* For now we have no good heuristics to decide whether loop peeling
1031 will be effective, so disable it. */
1032 if (rtl_dump_file)
1033 fprintf (rtl_dump_file,
1034 ";; Not peeling loop, no evidence it will be profitable\n");
1035 return;
1036 }
1037
1038 /* Success. */
1039 loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1040 loop->lpt_decision.times = npeel;
1041 }
1042
1043 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1044 while (cond)
1045 body;
1046
1047 ==>
1048
1049 if (!cond) goto end;
1050 body;
1051 if (!cond) goto end;
1052 body;
1053 while (cond)
1054 body;
1055 end: ;
1056 */
1057 static void
1058 peel_loop_simple (loops, loop)
1059 struct loops *loops;
1060 struct loop *loop;
1061 {
1062 sbitmap wont_exit;
1063 unsigned npeel = loop->lpt_decision.times;
1064
1065 wont_exit = sbitmap_alloc (npeel + 1);
1066 sbitmap_zero (wont_exit);
1067
1068 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1069 loops, npeel, wont_exit, NULL, NULL, NULL,
1070 DLTHE_FLAG_UPDATE_FREQ))
1071 abort ();
1072
1073 free (wont_exit);
1074
1075 if (rtl_dump_file)
1076 fprintf (rtl_dump_file, ";; Peeling loop %d times\n", npeel);
1077 }
1078
1079 /* Decide whether to unroll LOOP stupidly and how much. */
1080 static void
1081 decide_unroll_stupid (loops, loop, flags)
1082 struct loops *loops;
1083 struct loop *loop;
1084 int flags;
1085 {
1086 unsigned nunroll, nunroll_by_av, i;
1087
1088 if (!(flags & UAP_UNROLL_ALL))
1089 {
1090 /* We were not asked to, just return back silently. */
1091 return;
1092 }
1093
1094 if (rtl_dump_file)
1095 fprintf (rtl_dump_file, ";; Considering unrolling loop stupidly\n");
1096
1097 /* nunroll = total number of copies of the original loop body in
1098 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
1099 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1100 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1101 if (nunroll > nunroll_by_av)
1102 nunroll = nunroll_by_av;
1103 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1104 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1105
1106 /* Skip big loops. */
1107 if (nunroll <= 1)
1108 {
1109 if (rtl_dump_file)
1110 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
1111 return;
1112 }
1113
1114 /* Check for simple loops. */
1115 if (!loop->has_desc)
1116 loop->simple = simple_loop_p (loops, loop, &loop->desc);
1117
1118 /* Check simpleness. */
1119 if (loop->simple)
1120 {
1121 if (rtl_dump_file)
1122 fprintf (rtl_dump_file, ";; The loop is simple\n");
1123 return;
1124 }
1125
1126 /* Do not unroll loops with branches inside -- it increases number
1127 of mispredicts. */
1128 if (loop->desc.n_branches > 1)
1129 {
1130 if (rtl_dump_file)
1131 fprintf (rtl_dump_file, ";; Not unrolling, contains branches\n");
1132 return;
1133 }
1134
1135 /* If we have profile feedback, check whether the loop rolls. */
1136 if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
1137 {
1138 if (rtl_dump_file)
1139 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
1140 return;
1141 }
1142
1143 /* Success. Now force nunroll to be power of 2, as it seems that this
1144 improves results (partially because of better aligments, partially
1145 because of some dark magic). */
1146 for (i = 1; 2 * i <= nunroll; i *= 2);
1147
1148 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1149 loop->lpt_decision.times = i - 1;
1150 }
1151
1152 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1153 while (cond)
1154 body;
1155
1156 ==>
1157
1158 while (cond)
1159 {
1160 body;
1161 if (!cond) break;
1162 body;
1163 if (!cond) break;
1164 body;
1165 if (!cond) break;
1166 body;
1167 }
1168 */
1169 static void
1170 unroll_loop_stupid (loops, loop)
1171 struct loops *loops;
1172 struct loop *loop;
1173 {
1174 sbitmap wont_exit;
1175 unsigned nunroll = loop->lpt_decision.times;
1176
1177 wont_exit = sbitmap_alloc (nunroll + 1);
1178 sbitmap_zero (wont_exit);
1179
1180 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1181 loops, nunroll, wont_exit, NULL, NULL, NULL,
1182 DLTHE_FLAG_UPDATE_FREQ))
1183 abort ();
1184
1185 free (wont_exit);
1186
1187 if (rtl_dump_file)
1188 fprintf (rtl_dump_file, ";; Unrolled loop %d times, %i insns\n",
1189 nunroll, num_loop_insns (loop));
1190 }