b8c035fbc81c548b79eb92db43c1f62f4a586048
[gcc.git] / gcc / loop-unroll.c
1 /* Loop unrolling and peeling.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
9 version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING. If not, write to the Free
18 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
19 02111-1307, USA. */
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "rtl.h"
26 #include "hard-reg-set.h"
27 #include "basic-block.h"
28 #include "cfgloop.h"
29 #include "cfglayout.h"
30 #include "params.h"
31 #include "output.h"
32 #include "expr.h"
33
34 /* This pass performs loop unrolling and peeling. We only perform these
35 optimizations on innermost loops (with single exception) because
36 the impact on performance is greatest here, and we want to avoid
37 unnecessary code size growth. The gain is caused by greater sequentiality
38 of code, better code to optimize for futher passes and in some cases
39 by fewer testings of exit conditions. The main problem is code growth,
40 that impacts performance negatively due to effect of caches.
41
42 What we do:
43
44 -- complete peeling of once-rolling loops; this is the above mentioned
45 exception, as this causes loop to be cancelled completely and
46 does not cause code growth
47 -- complete peeling of loops that roll (small) constant times.
48 -- simple peeling of first iterations of loops that do not roll much
49 (according to profile feedback)
50 -- unrolling of loops that roll constant times; this is almost always
51 win, as we get rid of exit condition tests.
52 -- unrolling of loops that roll number of times that we can compute
53 in runtime; we also get rid of exit condition tests here, but there
54 is the extra expense for calculating the number of iterations
55 -- simple unrolling of remaining loops; this is performed only if we
56 are asked to, as the gain is questionable in this case and often
57 it may even slow down the code
58 For more detailed descriptions of each of those, see comments at
59 appropriate function below.
60
61 There is a lot of parameters (defined and described in params.def) that
62 control how much we unroll/peel.
63
64 ??? A great problem is that we don't have a good way how to determine
65 how many times we should unroll the loop; the experiments I have made
66 showed that this choice may affect performance in order of several %.
67 */
68
69 static void decide_unrolling_and_peeling PARAMS ((struct loops *, int));
70 static void peel_loops_completely PARAMS ((struct loops *, int));
71 static void decide_peel_simple PARAMS ((struct loops *, struct loop *, int));
72 static void decide_peel_once_rolling PARAMS ((struct loops *, struct loop *, int));
73 static void decide_peel_completely PARAMS ((struct loops *, struct loop *, int));
74 static void decide_unroll_stupid PARAMS ((struct loops *, struct loop *, int));
75 static void decide_unroll_constant_iterations PARAMS ((struct loops *, struct loop *, int));
76 static void decide_unroll_runtime_iterations PARAMS ((struct loops *, struct loop *, int));
77 static void peel_loop_simple PARAMS ((struct loops *, struct loop *));
78 static void peel_loop_completely PARAMS ((struct loops *, struct loop *));
79 static void unroll_loop_stupid PARAMS ((struct loops *, struct loop *));
80 static void unroll_loop_constant_iterations PARAMS ((struct loops *,
81 struct loop *));
82 static void unroll_loop_runtime_iterations PARAMS ((struct loops *,
83 struct loop *));
84
85 /* Unroll and/or peel (depending on FLAGS) LOOPS. */
86 void
87 unroll_and_peel_loops (loops, flags)
88 struct loops *loops;
89 int flags;
90 {
91 struct loop *loop, *next;
92 int check;
93
94 /* First perform complete loop peeling (it is almost surely a win,
95 and affects parameters for further decision a lot). */
96 peel_loops_completely (loops, flags);
97
98 /* Now decide rest of unrolling and peeling. */
99 decide_unrolling_and_peeling (loops, flags);
100
101 loop = loops->tree_root;
102 while (loop->inner)
103 loop = loop->inner;
104
105 /* Scan the loops, inner ones first. */
106 while (loop != loops->tree_root)
107 {
108 if (loop->next)
109 {
110 next = loop->next;
111 while (next->inner)
112 next = next->inner;
113 }
114 else
115 next = loop->outer;
116
117 check = 1;
118 /* And perform the appropriate transformations. */
119 switch (loop->lpt_decision.decision)
120 {
121 case LPT_PEEL_COMPLETELY:
122 /* Already done. */
123 abort ();
124 case LPT_PEEL_SIMPLE:
125 peel_loop_simple (loops, loop);
126 break;
127 case LPT_UNROLL_CONSTANT:
128 unroll_loop_constant_iterations (loops, loop);
129 break;
130 case LPT_UNROLL_RUNTIME:
131 unroll_loop_runtime_iterations (loops, loop);
132 break;
133 case LPT_UNROLL_STUPID:
134 unroll_loop_stupid (loops, loop);
135 break;
136 case LPT_NONE:
137 check = 0;
138 break;
139 default:
140 abort ();
141 }
142 if (check)
143 {
144 #ifdef ENABLE_CHECKING
145 verify_dominators (loops->cfg.dom);
146 verify_loop_structure (loops);
147 #endif
148 }
149 loop = next;
150 }
151 }
152
153 /* Check whether to peel LOOPS (depending on FLAGS) completely and do so. */
154 static void
155 peel_loops_completely (loops, flags)
156 struct loops *loops;
157 int flags;
158 {
159 struct loop *loop, *next;
160
161 loop = loops->tree_root;
162 while (loop->inner)
163 loop = loop->inner;
164
165 while (loop != loops->tree_root)
166 {
167 if (loop->next)
168 {
169 next = loop->next;
170 while (next->inner)
171 next = next->inner;
172 }
173 else
174 next = loop->outer;
175
176 loop->lpt_decision.decision = LPT_NONE;
177 loop->has_desc = 0;
178
179 if (rtl_dump_file)
180 fprintf (rtl_dump_file, ";; Considering loop %d for complete peeling\n",
181 loop->num);
182
183 loop->ninsns = num_loop_insns (loop);
184
185 decide_peel_once_rolling (loops, loop, flags);
186 if (loop->lpt_decision.decision == LPT_NONE)
187 decide_peel_completely (loops, loop, flags);
188
189 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
190 {
191 peel_loop_completely (loops, loop);
192 #ifdef ENABLE_CHECKING
193 verify_dominators (loops->cfg.dom);
194 verify_loop_structure (loops);
195 #endif
196 }
197 loop = next;
198 }
199 }
200
201 /* Decide whether unroll or peel LOOPS (depending on FLAGS) and how much. */
202 static void
203 decide_unrolling_and_peeling (loops, flags)
204 struct loops *loops;
205 int flags;
206 {
207 struct loop *loop = loops->tree_root, *next;
208
209 while (loop->inner)
210 loop = loop->inner;
211
212 /* Scan the loops, inner ones first. */
213 while (loop != loops->tree_root)
214 {
215 if (loop->next)
216 {
217 next = loop->next;
218 while (next->inner)
219 next = next->inner;
220 }
221 else
222 next = loop->outer;
223
224 loop->lpt_decision.decision = LPT_NONE;
225
226 if (rtl_dump_file)
227 fprintf (rtl_dump_file, ";; Considering loop %d\n", loop->num);
228
229 /* Do not peel cold areas. */
230 if (!maybe_hot_bb_p (loop->header))
231 {
232 if (rtl_dump_file)
233 fprintf (rtl_dump_file, ";; Not considering loop, cold area\n");
234 loop = next;
235 continue;
236 }
237
238 /* Can the loop be manipulated? */
239 if (!can_duplicate_loop_p (loop))
240 {
241 if (rtl_dump_file)
242 fprintf (rtl_dump_file,
243 ";; Not considering loop, cannot duplicate\n");
244 loop = next;
245 continue;
246 }
247
248 /* Skip non-innermost loops. */
249 if (loop->inner)
250 {
251 if (rtl_dump_file)
252 fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n");
253 loop = next;
254 continue;
255 }
256
257 loop->ninsns = num_loop_insns (loop);
258 loop->av_ninsns = average_num_loop_insns (loop);
259
260 /* Try transformations one by one in decreasing order of
261 priority. */
262
263 decide_unroll_constant_iterations (loops, loop, flags);
264 if (loop->lpt_decision.decision == LPT_NONE)
265 decide_unroll_runtime_iterations (loops, loop, flags);
266 if (loop->lpt_decision.decision == LPT_NONE)
267 decide_unroll_stupid (loops, loop, flags);
268 if (loop->lpt_decision.decision == LPT_NONE)
269 decide_peel_simple (loops, loop, flags);
270
271 loop = next;
272 }
273 }
274
275 /* Decide whether the LOOP is once rolling and suitable for complete
276 peeling. */
277 static void
278 decide_peel_once_rolling (loops, loop, flags)
279 struct loops *loops;
280 struct loop *loop;
281 int flags ATTRIBUTE_UNUSED;
282 {
283 if (rtl_dump_file)
284 fprintf (rtl_dump_file, ";; Considering peeling once rolling loop\n");
285
286 /* Is the loop small enough? */
287 if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
288 {
289 if (rtl_dump_file)
290 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
291 return;
292 }
293
294 /* Check for simple loops. */
295 loop->simple = simple_loop_p (loops, loop, &loop->desc);
296 loop->has_desc = 1;
297
298 /* Check number of iterations. */
299 if (!loop->simple || !loop->desc.const_iter || loop->desc.niter != 0)
300 {
301 if (rtl_dump_file)
302 fprintf (rtl_dump_file, ";; Unable to prove that the loop rolls exactly once\n");
303 return;
304 }
305
306 /* Success. */
307 if (rtl_dump_file)
308 fprintf (rtl_dump_file, ";; Decided to peel exactly once rolling loop\n");
309 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
310 }
311
312 /* Decide whether the LOOP is suitable for complete peeling. */
313 static void
314 decide_peel_completely (loops, loop, flags)
315 struct loops *loops;
316 struct loop *loop;
317 int flags ATTRIBUTE_UNUSED;
318 {
319 unsigned npeel;
320
321 if (rtl_dump_file)
322 fprintf (rtl_dump_file, ";; Considering peeling completely\n");
323
324 /* Skip non-innermost loops. */
325 if (loop->inner)
326 {
327 if (rtl_dump_file)
328 fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n");
329 return;
330 }
331
332 /* Do not peel cold areas. */
333 if (!maybe_hot_bb_p (loop->header))
334 {
335 if (rtl_dump_file)
336 fprintf (rtl_dump_file, ";; Not considering loop, cold area\n");
337 return;
338 }
339
340 /* Can the loop be manipulated? */
341 if (!can_duplicate_loop_p (loop))
342 {
343 if (rtl_dump_file)
344 fprintf (rtl_dump_file,
345 ";; Not considering loop, cannot duplicate\n");
346 return;
347 }
348
349 /* npeel = number of iterations to peel. */
350 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
351 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
352 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
353
354 /* Is the loop small enough? */
355 if (!npeel)
356 {
357 if (rtl_dump_file)
358 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
359 return;
360 }
361
362 /* Check for simple loops. */
363 if (!loop->has_desc)
364 {
365 loop->simple = simple_loop_p (loops, loop, &loop->desc);
366 loop->has_desc = 1;
367 }
368
369 /* Check number of iterations. */
370 if (!loop->simple || !loop->desc.const_iter)
371 {
372 if (rtl_dump_file)
373 fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n");
374 return;
375 }
376
377 if (loop->desc.niter > npeel - 1)
378 {
379 if (rtl_dump_file)
380 {
381 fprintf (rtl_dump_file, ";; Not peeling loop completely, rolls too much (");
382 fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC,(HOST_WIDEST_INT) loop->desc.niter);
383 fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel);
384 }
385 return;
386 }
387
388 /* Success. */
389 if (rtl_dump_file)
390 fprintf (rtl_dump_file, ";; Decided to peel loop completely\n");
391 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
392 }
393
394 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
395 completely. The transformation done:
396
397 for (i = 0; i < 4; i++)
398 body;
399
400 ==>
401
402 i = 0;
403 body; i++;
404 body; i++;
405 body; i++;
406 body; i++;
407 */
408 static void
409 peel_loop_completely (loops, loop)
410 struct loops *loops;
411 struct loop *loop;
412 {
413 sbitmap wont_exit;
414 unsigned HOST_WIDE_INT npeel;
415 unsigned n_remove_edges, i;
416 edge *remove_edges;
417 struct loop_desc *desc = &loop->desc;
418
419 npeel = desc->niter;
420
421 if (npeel)
422 {
423 wont_exit = sbitmap_alloc (npeel + 1);
424 sbitmap_ones (wont_exit);
425 RESET_BIT (wont_exit, 0);
426 if (desc->may_be_zero)
427 RESET_BIT (wont_exit, 1);
428
429 remove_edges = xcalloc (npeel, sizeof (edge));
430 n_remove_edges = 0;
431
432 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
433 loops, npeel,
434 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
435 DLTHE_FLAG_UPDATE_FREQ))
436 abort ();
437
438 free (wont_exit);
439
440 /* Remove the exit edges. */
441 for (i = 0; i < n_remove_edges; i++)
442 remove_path (loops, remove_edges[i]);
443 free (remove_edges);
444 }
445
446 /* Now remove the unreachable part of the last iteration and cancel
447 the loop. */
448 remove_path (loops, desc->in_edge);
449
450 if (rtl_dump_file)
451 fprintf (rtl_dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
452 }
453
454 /* Decide whether to unroll LOOP iterating constant number of times and how much. */
455 static void
456 decide_unroll_constant_iterations (loops, loop, flags)
457 struct loops *loops;
458 struct loop *loop;
459 int flags;
460 {
461 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = -1, n_copies, i;
462
463 if (!(flags & UAP_UNROLL))
464 {
465 /* We were not asked to, just return back silently. */
466 return;
467 }
468
469 if (rtl_dump_file)
470 fprintf (rtl_dump_file, ";; Considering unrolling loop with constant number of iterations\n");
471
472 /* nunroll = total number of copies of the original loop body in
473 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
474 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
475 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
476 if (nunroll > nunroll_by_av)
477 nunroll = nunroll_by_av;
478 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
479 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
480
481 /* Skip big loops. */
482 if (nunroll <= 1)
483 {
484 if (rtl_dump_file)
485 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
486 return;
487 }
488
489 /* Check for simple loops. */
490 if (!loop->has_desc)
491 {
492 loop->simple = simple_loop_p (loops, loop, &loop->desc);
493 loop->has_desc = 1;
494 }
495
496 /* Check number of iterations. */
497 if (!loop->simple || !loop->desc.const_iter)
498 {
499 if (rtl_dump_file)
500 fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n");
501 return;
502 }
503
504 /* Check whether the loop rolls enough to consider. */
505 if (loop->desc.niter < 2 * nunroll)
506 {
507 if (rtl_dump_file)
508 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
509 return;
510 }
511
512 /* Success; now compute number of iterations to unroll. We alter
513 nunroll so that as few as possible copies of loop body are
514 necessary, while still not decreasing the number of unrollings
515 too much (at most by 1). */
516 best_copies = 2 * nunroll + 10;
517
518 i = 2 * nunroll + 2;
519 if ((unsigned) i - 1 >= loop->desc.niter)
520 i = loop->desc.niter - 2;
521
522 for (; i >= nunroll - 1; i--)
523 {
524 unsigned exit_mod = loop->desc.niter % (i + 1);
525
526 if (loop->desc.postincr)
527 n_copies = exit_mod + i + 1;
528 else if (exit_mod != (unsigned) i || loop->desc.may_be_zero)
529 n_copies = exit_mod + i + 2;
530 else
531 n_copies = i + 1;
532
533 if (n_copies < best_copies)
534 {
535 best_copies = n_copies;
536 best_unroll = i;
537 }
538 }
539
540 if (rtl_dump_file)
541 fprintf (rtl_dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
542 best_unroll + 1, best_copies, nunroll);
543
544 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
545 loop->lpt_decision.times = best_unroll;
546 }
547
548 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
549 times. The transformation does this:
550
551 for (i = 0; i < 102; i++)
552 body;
553
554 ==>
555
556 i = 0;
557 body; i++;
558 body; i++;
559 while (i < 102)
560 {
561 body; i++;
562 body; i++;
563 body; i++;
564 body; i++;
565 }
566 */
567 static void
568 unroll_loop_constant_iterations (loops, loop)
569 struct loops *loops;
570 struct loop *loop;
571 {
572 unsigned HOST_WIDE_INT niter;
573 unsigned exit_mod;
574 sbitmap wont_exit;
575 unsigned n_remove_edges, i;
576 edge *remove_edges;
577 unsigned max_unroll = loop->lpt_decision.times;
578 struct loop_desc *desc = &loop->desc;
579
580 niter = desc->niter;
581
582 if (niter <= (unsigned) max_unroll + 1)
583 abort (); /* Should not get here (such loop should be peeled instead). */
584
585 exit_mod = niter % (max_unroll + 1);
586
587 wont_exit = sbitmap_alloc (max_unroll + 1);
588 sbitmap_ones (wont_exit);
589
590 remove_edges = xcalloc (max_unroll + exit_mod + 1, sizeof (edge));
591 n_remove_edges = 0;
592
593 if (desc->postincr)
594 {
595 /* Counter is incremented after the exit test; leave exit test
596 in the first copy, so that the loops that start with test
597 of exit condition have continuous body after unrolling. */
598
599 if (rtl_dump_file)
600 fprintf (rtl_dump_file, ";; Condition on beginning of loop.\n");
601
602 /* Peel exit_mod iterations. */
603 RESET_BIT (wont_exit, 0);
604 if (desc->may_be_zero)
605 RESET_BIT (wont_exit, 1);
606
607 if (exit_mod
608 && !duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
609 loops, exit_mod,
610 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
611 DLTHE_FLAG_UPDATE_FREQ))
612 abort ();
613
614 SET_BIT (wont_exit, 1);
615 }
616 else
617 {
618 /* Leave exit test in last copy, for the same reason as above if
619 the loop tests the condition at the end of loop body. */
620
621 if (rtl_dump_file)
622 fprintf (rtl_dump_file, ";; Condition on end of loop.\n");
623
624 /* We know that niter >= max_unroll + 2; so we do not need to care of
625 case when we would exit before reaching the loop. So just peel
626 exit_mod + 1 iterations.
627 */
628 if (exit_mod != (unsigned) max_unroll || desc->may_be_zero)
629 {
630 RESET_BIT (wont_exit, 0);
631 if (desc->may_be_zero)
632 RESET_BIT (wont_exit, 1);
633
634 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
635 loops, exit_mod + 1,
636 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
637 DLTHE_FLAG_UPDATE_FREQ))
638 abort ();
639
640 SET_BIT (wont_exit, 0);
641 SET_BIT (wont_exit, 1);
642 }
643
644 RESET_BIT (wont_exit, max_unroll);
645 }
646
647 /* Now unroll the loop. */
648 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
649 loops, max_unroll,
650 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
651 DLTHE_FLAG_UPDATE_FREQ))
652 abort ();
653
654 free (wont_exit);
655
656 /* Remove the edges. */
657 for (i = 0; i < n_remove_edges; i++)
658 remove_path (loops, remove_edges[i]);
659 free (remove_edges);
660
661 if (rtl_dump_file)
662 fprintf (rtl_dump_file, ";; Unrolled loop %d times, constant # of iterations %i insns\n",max_unroll, num_loop_insns (loop));
663 }
664
665 /* Decide whether to unroll LOOP iterating runtime computable number of times
666 and how much. */
667 static void
668 decide_unroll_runtime_iterations (loops, loop, flags)
669 struct loops *loops;
670 struct loop *loop;
671 int flags;
672 {
673 unsigned nunroll, nunroll_by_av, i;
674
675 if (!(flags & UAP_UNROLL))
676 {
677 /* We were not asked to, just return back silently. */
678 return;
679 }
680
681 if (rtl_dump_file)
682 fprintf (rtl_dump_file, ";; Considering unrolling loop with runtime computable number of iterations\n");
683
684 /* nunroll = total number of copies of the original loop body in
685 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
686 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
687 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
688 if (nunroll > nunroll_by_av)
689 nunroll = nunroll_by_av;
690 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
691 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
692
693 /* Skip big loops. */
694 if (nunroll <= 1)
695 {
696 if (rtl_dump_file)
697 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
698 return;
699 }
700
701 /* Check for simple loops. */
702 if (!loop->has_desc)
703 {
704 loop->simple = simple_loop_p (loops, loop, &loop->desc);
705 loop->has_desc = 1;
706 }
707
708 /* Check simpleness. */
709 if (!loop->simple)
710 {
711 if (rtl_dump_file)
712 fprintf (rtl_dump_file, ";; Unable to prove that the number of iterations can be counted in runtime\n");
713 return;
714 }
715
716 if (loop->desc.const_iter)
717 {
718 if (rtl_dump_file)
719 fprintf (rtl_dump_file, ";; Loop iterates constant times\n");
720 return;
721 }
722
723 /* If we have profile feedback, check whether the loop rolls. */
724 if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
725 {
726 if (rtl_dump_file)
727 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
728 return;
729 }
730
731 /* Success; now force nunroll to be power of 2, as we are unable to
732 cope with overflows in computation of number of iterations. */
733 for (i = 1; 2 * i <= nunroll; i *= 2);
734
735 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
736 loop->lpt_decision.times = i - 1;
737 }
738
739 /* Unroll LOOP for that we are able to count number of iterations in runtime
740 LOOP->LPT_DECISION.TIMES + 1 times. The transformation does this (with some
741 extra care for case n < 0):
742
743 for (i = 0; i < n; i++)
744 body;
745
746 ==>
747
748 i = 0;
749 mod = n % 4;
750
751 switch (mod)
752 {
753 case 3:
754 body; i++;
755 case 2:
756 body; i++;
757 case 1:
758 body; i++;
759 case 0: ;
760 }
761
762 while (i < n)
763 {
764 body; i++;
765 body; i++;
766 body; i++;
767 body; i++;
768 }
769 */
770 static void
771 unroll_loop_runtime_iterations (loops, loop)
772 struct loops *loops;
773 struct loop *loop;
774 {
775 rtx niter, init_code, branch_code, jump, label;
776 unsigned i, j, p;
777 basic_block preheader, *body, *dom_bbs, swtch, ezc_swtch;
778 unsigned n_dom_bbs;
779 sbitmap wont_exit;
780 int may_exit_copy;
781 unsigned n_peel, n_remove_edges;
782 edge *remove_edges, e;
783 bool extra_zero_check, last_may_exit;
784 unsigned max_unroll = loop->lpt_decision.times;
785 struct loop_desc *desc = &loop->desc;
786
787 /* Remember blocks whose dominators will have to be updated. */
788 dom_bbs = xcalloc (n_basic_blocks, sizeof (basic_block));
789 n_dom_bbs = 0;
790
791 body = get_loop_body (loop);
792 for (i = 0; i < loop->num_nodes; i++)
793 {
794 unsigned nldom;
795 basic_block *ldom;
796
797 nldom = get_dominated_by (loops->cfg.dom, body[i], &ldom);
798 for (j = 0; j < nldom; j++)
799 if (!flow_bb_inside_loop_p (loop, ldom[j]))
800 dom_bbs[n_dom_bbs++] = ldom[j];
801
802 free (ldom);
803 }
804 free (body);
805
806 if (desc->postincr)
807 {
808 /* Leave exit in first copy (for explanation why see comment in
809 unroll_loop_constant_iterations). */
810 may_exit_copy = 0;
811 n_peel = max_unroll - 1;
812 extra_zero_check = true;
813 last_may_exit = false;
814 }
815 else
816 {
817 /* Leave exit in last copy (for explanation why see comment in
818 unroll_loop_constant_iterations). */
819 may_exit_copy = max_unroll;
820 n_peel = max_unroll;
821 extra_zero_check = false;
822 last_may_exit = true;
823 }
824
825 /* Get expression for number of iterations. */
826 start_sequence ();
827 niter = count_loop_iterations (desc, NULL, NULL);
828 if (!niter)
829 abort ();
830 niter = force_operand (niter, NULL);
831
832 /* Count modulo by ANDing it with max_unroll; we use the fact that
833 the number of unrollings is a power of two, and thus this is correct
834 even if there is overflow in the computation. */
835 niter = expand_simple_binop (GET_MODE (desc->var), AND,
836 niter,
837 GEN_INT (max_unroll),
838 NULL_RTX, 0, OPTAB_LIB_WIDEN);
839
840 init_code = get_insns ();
841 end_sequence ();
842
843 /* Precondition the loop. */
844 loop_split_edge_with (loop_preheader_edge (loop), init_code, loops);
845
846 remove_edges = xcalloc (max_unroll + n_peel + 1, sizeof (edge));
847 n_remove_edges = 0;
848
849 wont_exit = sbitmap_alloc (max_unroll + 2);
850
851 /* Peel the first copy of loop body (almost always we must leave exit test
852 here; the only exception is when we have extra zero check and the number
853 of iterations is reliable (i.e. comes out of NE condition). Also record
854 the place of (possible) extra zero check. */
855 sbitmap_zero (wont_exit);
856 if (extra_zero_check && desc->cond == NE)
857 SET_BIT (wont_exit, 1);
858 ezc_swtch = loop_preheader_edge (loop)->src;
859 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
860 loops, 1,
861 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
862 DLTHE_FLAG_UPDATE_FREQ))
863 abort ();
864
865 /* Record the place where switch will be built for preconditioning. */
866 swtch = loop_split_edge_with (loop_preheader_edge (loop),
867 NULL_RTX, loops);
868
869 for (i = 0; i < n_peel; i++)
870 {
871 /* Peel the copy. */
872 sbitmap_zero (wont_exit);
873 if (i != n_peel - 1 || !last_may_exit)
874 SET_BIT (wont_exit, 1);
875 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
876 loops, 1,
877 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
878 DLTHE_FLAG_UPDATE_FREQ))
879 abort ();
880
881 /* Create item for switch. */
882 j = n_peel - i - (extra_zero_check ? 0 : 1);
883 p = REG_BR_PROB_BASE / (i + 2);
884
885 preheader = loop_split_edge_with (loop_preheader_edge (loop),
886 NULL_RTX, loops);
887 label = block_label (preheader);
888 start_sequence ();
889 do_compare_rtx_and_jump (copy_rtx (niter), GEN_INT (j), EQ, 0,
890 GET_MODE (desc->var), NULL_RTX, NULL_RTX,
891 label);
892 jump = get_last_insn ();
893 JUMP_LABEL (jump) = label;
894 REG_NOTES (jump)
895 = gen_rtx_EXPR_LIST (REG_BR_PROB,
896 GEN_INT (p), REG_NOTES (jump));
897
898 LABEL_NUSES (label)++;
899 branch_code = get_insns ();
900 end_sequence ();
901
902 swtch = loop_split_edge_with (swtch->pred, branch_code, loops);
903 set_immediate_dominator (loops->cfg.dom, preheader, swtch);
904 swtch->succ->probability = REG_BR_PROB_BASE - p;
905 e = make_edge (swtch, preheader,
906 swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
907 e->probability = p;
908 }
909
910 if (extra_zero_check)
911 {
912 /* Add branch for zero iterations. */
913 p = REG_BR_PROB_BASE / (max_unroll + 1);
914 swtch = ezc_swtch;
915 preheader = loop_split_edge_with (loop_preheader_edge (loop),
916 NULL_RTX, loops);
917 label = block_label (preheader);
918 start_sequence ();
919 do_compare_rtx_and_jump (copy_rtx (niter), const0_rtx, EQ, 0,
920 GET_MODE (desc->var), NULL_RTX, NULL_RTX,
921 label);
922 jump = get_last_insn ();
923 JUMP_LABEL (jump) = label;
924 REG_NOTES (jump)
925 = gen_rtx_EXPR_LIST (REG_BR_PROB,
926 GEN_INT (p), REG_NOTES (jump));
927
928 LABEL_NUSES (label)++;
929 branch_code = get_insns ();
930 end_sequence ();
931
932 swtch = loop_split_edge_with (swtch->succ, branch_code, loops);
933 set_immediate_dominator (loops->cfg.dom, preheader, swtch);
934 swtch->succ->probability = REG_BR_PROB_BASE - p;
935 e = make_edge (swtch, preheader,
936 swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
937 e->probability = p;
938 }
939
940 /* Recount dominators for outer blocks. */
941 iterate_fix_dominators (loops->cfg.dom, dom_bbs, n_dom_bbs);
942
943 /* And unroll loop. */
944
945 sbitmap_ones (wont_exit);
946 RESET_BIT (wont_exit, may_exit_copy);
947
948 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
949 loops, max_unroll,
950 wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
951 DLTHE_FLAG_UPDATE_FREQ))
952 abort ();
953
954 free (wont_exit);
955
956 /* Remove the edges. */
957 for (i = 0; i < n_remove_edges; i++)
958 remove_path (loops, remove_edges[i]);
959 free (remove_edges);
960
961 if (rtl_dump_file)
962 fprintf (rtl_dump_file,
963 ";; Unrolled loop %d times, counting # of iterations in runtime, %i insns\n",
964 max_unroll, num_loop_insns (loop));
965 }
966
967 /* Decide whether to simply peel LOOP and how much. */
968 static void
969 decide_peel_simple (loops, loop, flags)
970 struct loops *loops;
971 struct loop *loop;
972 int flags;
973 {
974 unsigned npeel;
975
976 if (!(flags & UAP_PEEL))
977 {
978 /* We were not asked to, just return back silently. */
979 return;
980 }
981
982 if (rtl_dump_file)
983 fprintf (rtl_dump_file, ";; Considering simply peeling loop\n");
984
985 /* npeel = number of iterations to peel. */
986 npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
987 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
988 npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
989
990 /* Skip big loops. */
991 if (!npeel)
992 {
993 if (rtl_dump_file)
994 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
995 return;
996 }
997
998 /* Check for simple loops. */
999 if (!loop->has_desc)
1000 {
1001 loop->simple = simple_loop_p (loops, loop, &loop->desc);
1002 loop->has_desc = 1;
1003 }
1004
1005 /* Check number of iterations. */
1006 if (loop->simple && loop->desc.const_iter)
1007 {
1008 if (rtl_dump_file)
1009 fprintf (rtl_dump_file, ";; Loop iterates constant times\n");
1010 return;
1011 }
1012
1013 /* Do not simply peel loops with branches inside -- it increases number
1014 of mispredicts. */
1015 if (loop->desc.n_branches > 1)
1016 {
1017 if (rtl_dump_file)
1018 fprintf (rtl_dump_file, ";; Not peeling, contains branches\n");
1019 return;
1020 }
1021
1022 if (loop->header->count)
1023 {
1024 unsigned niter = expected_loop_iterations (loop);
1025 if (niter + 1 > npeel)
1026 {
1027 if (rtl_dump_file)
1028 {
1029 fprintf (rtl_dump_file, ";; Not peeling loop, rolls too much (");
1030 fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC, (HOST_WIDEST_INT) (niter + 1));
1031 fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel);
1032 }
1033 return;
1034 }
1035 npeel = niter + 1;
1036 }
1037 else
1038 {
1039 /* For now we have no good heuristics to decide whether loop peeling
1040 will be effective, so disable it. */
1041 if (rtl_dump_file)
1042 fprintf (rtl_dump_file,
1043 ";; Not peeling loop, no evidence it will be profitable\n");
1044 return;
1045 }
1046
1047 /* Success. */
1048 loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1049 loop->lpt_decision.times = npeel;
1050 }
1051
1052 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1053 while (cond)
1054 body;
1055
1056 ==>
1057
1058 if (!cond) goto end;
1059 body;
1060 if (!cond) goto end;
1061 body;
1062 while (cond)
1063 body;
1064 end: ;
1065 */
1066 static void
1067 peel_loop_simple (loops, loop)
1068 struct loops *loops;
1069 struct loop *loop;
1070 {
1071 sbitmap wont_exit;
1072 unsigned npeel = loop->lpt_decision.times;
1073
1074 wont_exit = sbitmap_alloc (npeel + 1);
1075 sbitmap_zero (wont_exit);
1076
1077 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1078 loops, npeel, wont_exit, NULL, NULL, NULL,
1079 DLTHE_FLAG_UPDATE_FREQ))
1080 abort ();
1081
1082 free (wont_exit);
1083
1084 if (rtl_dump_file)
1085 fprintf (rtl_dump_file, ";; Peeling loop %d times\n", npeel);
1086 }
1087
1088 /* Decide whether to unroll LOOP stupidly and how much. */
1089 static void
1090 decide_unroll_stupid (loops, loop, flags)
1091 struct loops *loops;
1092 struct loop *loop;
1093 int flags;
1094 {
1095 unsigned nunroll, nunroll_by_av, i;
1096
1097 if (!(flags & UAP_UNROLL_ALL))
1098 {
1099 /* We were not asked to, just return back silently. */
1100 return;
1101 }
1102
1103 if (rtl_dump_file)
1104 fprintf (rtl_dump_file, ";; Considering unrolling loop stupidly\n");
1105
1106 /* nunroll = total number of copies of the original loop body in
1107 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
1108 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1109 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1110 if (nunroll > nunroll_by_av)
1111 nunroll = nunroll_by_av;
1112 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1113 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1114
1115 /* Skip big loops. */
1116 if (nunroll <= 1)
1117 {
1118 if (rtl_dump_file)
1119 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
1120 return;
1121 }
1122
1123 /* Check for simple loops. */
1124 if (!loop->has_desc)
1125 {
1126 loop->simple = simple_loop_p (loops, loop, &loop->desc);
1127 loop->has_desc = 1;
1128 }
1129
1130 /* Check simpleness. */
1131 if (loop->simple)
1132 {
1133 if (rtl_dump_file)
1134 fprintf (rtl_dump_file, ";; The loop is simple\n");
1135 return;
1136 }
1137
1138 /* Do not unroll loops with branches inside -- it increases number
1139 of mispredicts. */
1140 if (loop->desc.n_branches > 1)
1141 {
1142 if (rtl_dump_file)
1143 fprintf (rtl_dump_file, ";; Not unrolling, contains branches\n");
1144 return;
1145 }
1146
1147 /* If we have profile feedback, check whether the loop rolls. */
1148 if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
1149 {
1150 if (rtl_dump_file)
1151 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
1152 return;
1153 }
1154
1155 /* Success. Now force nunroll to be power of 2, as it seems that this
1156 improves results (partially because of better alignments, partially
1157 because of some dark magic). */
1158 for (i = 1; 2 * i <= nunroll; i *= 2);
1159
1160 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1161 loop->lpt_decision.times = i - 1;
1162 }
1163
1164 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1165 while (cond)
1166 body;
1167
1168 ==>
1169
1170 while (cond)
1171 {
1172 body;
1173 if (!cond) break;
1174 body;
1175 if (!cond) break;
1176 body;
1177 if (!cond) break;
1178 body;
1179 }
1180 */
1181 static void
1182 unroll_loop_stupid (loops, loop)
1183 struct loops *loops;
1184 struct loop *loop;
1185 {
1186 sbitmap wont_exit;
1187 unsigned nunroll = loop->lpt_decision.times;
1188
1189 wont_exit = sbitmap_alloc (nunroll + 1);
1190 sbitmap_zero (wont_exit);
1191
1192 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1193 loops, nunroll, wont_exit, NULL, NULL, NULL,
1194 DLTHE_FLAG_UPDATE_FREQ))
1195 abort ();
1196
1197 free (wont_exit);
1198
1199 if (rtl_dump_file)
1200 fprintf (rtl_dump_file, ";; Unrolled loop %d times, %i insns\n",
1201 nunroll, num_loop_insns (loop));
1202 }