/* Loop unrolling and peeling.
   Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010, 2011
   Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "rtl.h"
#include "hard-reg-set.h"
#include "obstack.h"
#include "basic-block.h"
#include "cfgloop.h"
#include "params.h"
#include "expr.h"
#include "hashtab.h"
#include "recog.h"
#include "target.h"
#include "dumpfile.h"

/* This pass performs loop unrolling and peeling.  We only perform these
   optimizations on innermost loops (with a single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain comes from the greater
   sequentiality of the code, from better code for further optimization
   passes, and in some cases from fewer tests of exit conditions.  The
   main problem is code growth, which negatively impacts performance
   due to cache effects.

   What we do:

   -- complete peeling of once-rolling loops; this is the above-mentioned
      exception, as it cancels the loop completely and does not cause
      code growth
   -- complete peeling of loops that roll a (small) constant number of times
   -- simple peeling of the first iterations of loops that do not roll much
      (according to profile feedback)
   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of exit condition tests
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of exit condition tests here, but there
      is the extra expense of calculating the number of iterations
   -- simple unrolling of the remaining loops; this is performed only if
      we are asked to, as the gain is questionable in this case and often
      it may even slow down the code
   For more detailed descriptions of each of those, see the comments at
   the appropriate functions below.

   There are a lot of parameters (defined and described in params.def) that
   control how much we unroll/peel.

   ??? A great problem is that we don't have a good way to determine how
   many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance by several percent.  */

/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx insn;                 /* The insn in which the induction variable
                               occurs.  */
  rtx base_var;             /* The variable on which the values in further
                               iterations are based.  */
  rtx step;                 /* Step of the induction variable.  */
  struct iv_to_split *next; /* Next entry in walking order.  */
  unsigned n_loc;
  unsigned loc[3];          /* Location where the definition of the induction
                               variable occurs in the insn.  For example if
                               N_LOC is 2, the expression is located at
                               XEXP (XEXP (single_set, loc[0]), loc[1]).  */
};
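
/* As an illustrative example (matching what analyze_iv_to_split_insn below
   records for a simple increment): for an insn whose single_set is
   (set (reg i) (plus (reg i) (const_int 1))), the expression of interest
   is the SET_SRC, i.e. XEXP (single_set, 1), so N_LOC == 1 and
   loc[0] == 1.  */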

/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx insn;                      /* The insn in which the variable expansion
                                    occurs.  */
  rtx reg;                       /* The accumulator which is expanded.  */
  VEC(rtx,heap) *var_expansions; /* The copies of the accumulator which is
                                    expanded.  */
  struct var_to_expand *next;    /* Next entry in walking order.  */
  enum rtx_code op;              /* The type of the accumulation - addition,
                                    subtraction or multiplication.  */
  int expansion_count;           /* Count the number of expansions generated
                                    so far.  */
  int reuse_expansion;           /* The expansion we intend to reuse to expand
                                    the accumulator.  If REUSE_EXPANSION is 0
                                    reuse the original accumulator.  Else use
                                    var_expansions[REUSE_EXPANSION - 1].  */
};
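
/* For example, once two expansions of an accumulator have been generated
   (EXPANSION_COUNT == 2), REUSE_EXPANSION cycles through the values 0, 1
   and 2: value 0 accumulates into the original register, while 1 and 2
   select var_expansions[0] and var_expansions[1] respectively (see
   get_expansion, defined later in this file).  */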

/* Information about optimization applied in
   the unrolled loop.  */

struct opt_info
{
  htab_t insns_to_split;                 /* A hashtable of insns to split.  */
  struct iv_to_split *iv_to_split_head;  /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the
                                            list.  */
  htab_t insns_with_var_to_expand;       /* A hashtable of insns with
                                            accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the
                                                list.  */
  unsigned first_new_block;              /* The first basic block that was
                                            duplicated.  */
  basic_block loop_exit;                 /* The loop exit basic block.  */
  basic_block loop_preheader;            /* The loop preheader basic
                                            block.  */
};

static void decide_unrolling_and_peeling (int);
static void peel_loops_completely (int);
static void decide_peel_simple (struct loop *, int);
static void decide_peel_once_rolling (struct loop *, int);
static void decide_peel_completely (struct loop *, int);
static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void peel_loop_simple (struct loop *);
static void peel_loop_completely (struct loop *);
static void unroll_loop_stupid (struct loop *);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx);
static void expand_var_during_unrolling (struct var_to_expand *, rtx);
static void insert_var_expansion_initialization (struct var_to_expand *,
                                                 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
                                             basic_block);
static rtx get_expansion (struct var_to_expand *);

/* Unroll and/or peel (depending on FLAGS) LOOPS.  */
void
unroll_and_peel_loops (int flags)
{
  struct loop *loop;
  bool check;
  loop_iterator li;

  /* First perform complete loop peeling (it is almost surely a win,
     and affects parameters for further decisions a lot).  */
  peel_loops_completely (flags);

  /* Now decide the rest of the unrolling and peeling.  */
  decide_unrolling_and_peeling (flags);

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      check = true;
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
        {
        case LPT_PEEL_COMPLETELY:
          /* Already done.  */
          gcc_unreachable ();
        case LPT_PEEL_SIMPLE:
          peel_loop_simple (loop);
          break;
        case LPT_UNROLL_CONSTANT:
          unroll_loop_constant_iterations (loop);
          break;
        case LPT_UNROLL_RUNTIME:
          unroll_loop_runtime_iterations (loop);
          break;
        case LPT_UNROLL_STUPID:
          unroll_loop_stupid (loop);
          break;
        case LPT_NONE:
          check = false;
          break;
        default:
          gcc_unreachable ();
        }
      if (check)
        {
#ifdef ENABLE_CHECKING
          verify_loop_structure ();
#endif
        }
    }

  iv_analysis_done ();
}

/* Check whether the exit of LOOP is at the end of the loop body.  */

static bool
loop_exit_at_end_p (struct loop *loop)
{
  struct niter_desc *desc = get_simple_loop_desc (loop);
  rtx insn;

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    {
      if (INSN_P (insn))
        return false;
    }

  return true;
}

/* Depending on FLAGS, check whether to peel loops completely and do so.  */
static void
peel_loops_completely (int flags)
{
  struct loop *loop;
  loop_iterator li;

  /* Scan the loops, the inner ones first.  */
  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;

      if (dump_file)
        fprintf (dump_file,
                 "\n;; *** Considering loop %d for complete peeling ***\n",
                 loop->num);

      loop->ninsns = num_loop_insns (loop);

      decide_peel_once_rolling (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_peel_completely (loop, flags);

      if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
        {
          peel_loop_completely (loop);
#ifdef ENABLE_CHECKING
          verify_loop_structure ();
#endif
        }
    }
}

/* Decide whether to unroll or peel loops (depending on FLAGS) and how
   much.  */
static void
decide_unrolling_and_peeling (int flags)
{
  struct loop *loop;
  loop_iterator li;

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;

      if (dump_file)
        fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
        {
          if (dump_file)
            fprintf (dump_file, ";; Not considering loop, cold area\n");
          continue;
        }

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
        {
          if (dump_file)
            fprintf (dump_file,
                     ";; Not considering loop, cannot duplicate\n");
          continue;
        }

      /* Skip non-innermost loops.  */
      if (loop->inner)
        {
          if (dump_file)
            fprintf (dump_file, ";; Not considering loop, is not innermost\n");
          continue;
        }

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of
         priority.  */

      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_unroll_stupid (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_peel_simple (loop, flags);
    }
}

/* Decide whether the LOOP is once rolling and suitable for complete
   peeling.  */
static void
decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
  struct niter_desc *desc;

  if (dump_file)
    fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");

  /* Is the loop small enough?  */
  if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p
      || desc->assumptions
      || desc->infinite
      || !desc->const_iter
      || (desc->niter != 0
          && max_loop_iterations_int (loop) != 0))
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Unable to prove that the loop rolls exactly once\n");
      return;
    }

  /* Success.  */
  if (dump_file)
    fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
}

/* Decide whether the LOOP is suitable for complete peeling.  */
static void
decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
  unsigned npeel;
  struct niter_desc *desc;

  if (dump_file)
    fprintf (dump_file, "\n;; Considering peeling completely\n");

  /* Skip non-innermost loops.  */
  if (loop->inner)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is not innermost\n");
      return;
    }

  /* Do not peel cold areas.  */
  if (optimize_loop_for_size_p (loop))
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, cold area\n");
      return;
    }

  /* Can the loop be manipulated?  */
  if (!can_duplicate_loop_p (loop))
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Not considering loop, cannot duplicate\n");
      return;
    }

  /* npeel = number of iterations to peel.  */
  npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
    npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);

  /* Is the loop small enough?  */
  if (!npeel)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p
      || desc->assumptions
      || !desc->const_iter
      || desc->infinite)
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  if (desc->niter > npeel - 1)
    {
      if (dump_file)
        {
          fprintf (dump_file,
                   ";; Not peeling loop completely, rolls too much (");
          fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
          fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
        }
      return;
    }

  /* Success.  */
  if (dump_file)
    fprintf (dump_file, ";; Decided to peel loop completely\n");
  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
}

/* Peel all iterations of LOOP, remove exit edges and cancel the loop
   completely.  The transformation done:

   for (i = 0; i < 4; i++)
     body;

   ==>

   i = 0;
   body; i++;
   body; i++;
   body; i++;
   body; i++;
   */
static void
peel_loop_completely (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned HOST_WIDE_INT npeel;
  unsigned i;
  VEC (edge, heap) *remove_edges;
  edge ein;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;

  npeel = desc->niter;

  if (npeel)
    {
      bool ok;

      wont_exit = sbitmap_alloc (npeel + 1);
      sbitmap_ones (wont_exit);
      RESET_BIT (wont_exit, 0);
      if (desc->noloop_assumptions)
        RESET_BIT (wont_exit, 1);

      remove_edges = NULL;

      if (flag_split_ivs_in_unroller)
        opt_info = analyze_insns_in_loop (loop);

      opt_info_start_duplication (opt_info);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                          npeel,
                                          wont_exit, desc->out_edge,
                                          &remove_edges,
                                          DLTHE_FLAG_UPDATE_FREQ
                                          | DLTHE_FLAG_COMPLETTE_PEEL
                                          | (opt_info
                                             ? DLTHE_RECORD_COPY_NUMBER : 0));
      gcc_assert (ok);

      free (wont_exit);

      if (opt_info)
        {
          apply_opt_in_copies (opt_info, npeel, false, true);
          free_opt_info (opt_info);
        }

      /* Remove the exit edges.  */
      FOR_EACH_VEC_ELT (edge, remove_edges, i, ein)
        remove_path (ein);
      VEC_free (edge, heap, remove_edges);
    }

  ein = desc->in_edge;
  free_simple_loop_desc (loop);

  /* Now remove the unreachable part of the last iteration and cancel
     the loop.  */
  remove_path (ein);

  if (dump_file)
    fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
}

/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
             "\n;; Considering unrolling loop with constant "
             "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls enough to consider.  */
  if (desc->niter < 2 * nunroll)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute the number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of the loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
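
  /* For instance, suppose niter == 102 and the candidate below is i == 3
     (an unroll factor of 4).  Then exit_mod == 102 % 4 == 2: with the exit
     test at the beginning of the loop we peel 2 iterations and keep 4
     copies in the body, i.e. n_copies == 2 + 3 + 1 == 6; with the exit
     test at the end we must peel exit_mod + 1 == 3 iterations instead
     (see unroll_loop_constant_iterations), i.e. n_copies == 2 + 3 + 2 == 7.  */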
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i - 1 >= desc->niter)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
        n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
               || desc->noloop_assumptions != NULL_RTX)
        n_copies = exit_mod + i + 2;
      else
        n_copies = i + 1;

      if (n_copies < best_copies)
        {
          best_copies = n_copies;
          best_unroll = i;
        }
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;

  if (dump_file)
    fprintf (dump_file, ";; Decided to unroll the loop %d times (%d copies).\n",
             loop->lpt_decision.times, best_copies);
}

/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES
   times.  The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_constant_iterations (struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  sbitmap wont_exit;
  unsigned i;
  VEC (edge, heap) *remove_edges;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such a loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);
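
  /* Using the example from the comment above (niter == 102,
     max_unroll == 3), exit_mod == 102 % 4 == 2: two peeled iterations
     end up in front of the unrolled body.  */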

  wont_exit = sbitmap_alloc (max_unroll + 1);
  sbitmap_ones (wont_exit);

  remove_edges = NULL;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
         in the first copy, so that loops that start with a test of the
         exit condition have a continuous body after unrolling.  */

      if (dump_file)
        fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      RESET_BIT (wont_exit, 0);
      if (desc->noloop_assumptions)
        RESET_BIT (wont_exit, 1);

      if (exit_mod)
        {
          opt_info_start_duplication (opt_info);
          ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                              exit_mod,
                                              wont_exit, desc->out_edge,
                                              &remove_edges,
                                              DLTHE_FLAG_UPDATE_FREQ
                                              | (opt_info && exit_mod > 1
                                                 ? DLTHE_RECORD_COPY_NUMBER
                                                 : 0));
          gcc_assert (ok);

          if (opt_info && exit_mod > 1)
            apply_opt_in_copies (opt_info, exit_mod, false, false);

          desc->noloop_assumptions = NULL_RTX;
          desc->niter -= exit_mod;
          loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
          if (loop->any_estimate
              && double_int::from_uhwi (exit_mod).ule
                   (loop->nb_iterations_estimate))
            loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
          else
            loop->any_estimate = false;
        }

      SET_BIT (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as above if
         the loop tests the condition at the end of the loop body.  */

      if (dump_file)
        fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care about
         the case when we would exit before reaching the loop.  So just peel
         exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
          || desc->noloop_assumptions)
        {
          RESET_BIT (wont_exit, 0);
          if (desc->noloop_assumptions)
            RESET_BIT (wont_exit, 1);

          opt_info_start_duplication (opt_info);
          ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                              exit_mod + 1,
                                              wont_exit, desc->out_edge,
                                              &remove_edges,
                                              DLTHE_FLAG_UPDATE_FREQ
                                              | (opt_info && exit_mod > 0
                                                 ? DLTHE_RECORD_COPY_NUMBER
                                                 : 0));
          gcc_assert (ok);

          if (opt_info && exit_mod > 0)
            apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

          desc->niter -= exit_mod + 1;
          loop->nb_iterations_upper_bound
            -= double_int::from_uhwi (exit_mod + 1);
          if (loop->any_estimate
              && double_int::from_uhwi (exit_mod + 1).ule
                   (loop->nb_iterations_estimate))
            loop->nb_iterations_estimate
              -= double_int::from_uhwi (exit_mod + 1);
          else
            loop->any_estimate = false;
          desc->noloop_assumptions = NULL_RTX;

          SET_BIT (wont_exit, 0);
          SET_BIT (wont_exit, 1);
        }

      RESET_BIT (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      max_unroll,
                                      wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                         : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
         made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
        {
          desc->out_edge = EDGE_SUCC (exit_block, 0);
          desc->in_edge = EDGE_SUCC (exit_block, 1);
        }
      else
        {
          desc->out_edge = EDGE_SUCC (exit_block, 1);
          desc->in_edge = EDGE_SUCC (exit_block, 0);
        }
    }

  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
                                                                   + 1),
                                            TRUNC_DIV_EXPR);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
                                                                  + 1),
                                           TRUNC_DIV_EXPR);
  desc->niter_expr = GEN_INT (desc->niter);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
    remove_path (e);
  VEC_free (edge, heap, remove_edges);

  if (dump_file)
    fprintf (dump_file,
             ";; Unrolled loop %d times, constant # of iterations %i insns\n",
             max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP iterating a runtime-computable number of
   times and how much.  */
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  double_int iterations;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
             "\n;; Considering unrolling loop with runtime "
             "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Unable to prove that the number of iterations "
                 "can be counted at runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
        fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((estimated_loop_iterations (loop, &iterations)
       || max_loop_iterations (loop, &iterations))
      && iterations.ult (double_int::from_shwi (2 * nunroll)))
    {
      if (dump_file)
        fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as we are unable to
     cope with overflows in the computation of the number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;
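
  /* E.g. nunroll == 7 yields i == 4: the unrolled loop will contain four
     copies of the body, and lpt_decision.times is set to 3 below.  */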

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;

  if (dump_file)
    fprintf (dump_file, ";; Decided to unroll the loop %d times.\n",
             loop->lpt_decision.times);
}

/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it does ever happen,
     the TODO_verify_flow at the end of the RTL loop passes would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

     * If do_compare_rtx_and_jump needs several branches to do comparison
       in a mode that needs comparison by parts, we cannot analyze the
       number of iterations of the loop, and we never get to unrolling it.

     * The code in expand_divmod that was suspected to cause creation of
       branching code seems to be only accessed for signed division.  The
       divisions used by # of iterations analysis are always unsigned.
       Problems might arise on architectures that emit branching code
       for some operations that may appear in the unroller (especially
       for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}

/* Unroll LOOP for which we are able to count the number of iterations at
   runtime LOOP->LPT_DECISION.TIMES times.  The transformation does this
   (with some extra care for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, init_code, branch_code, tmp;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  VEC (basic_block, heap) *dom_bbs;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  VEC (edge, heap) *remove_edges;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember the blocks whose dominators will have to be updated.  */
  dom_bbs = NULL;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      VEC (basic_block, heap) *ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (basic_block, ldom, j, bb)
        if (!flow_bb_inside_loop_p (loop, bb))
          VEC_safe_push (basic_block, heap, dom_bbs, bb);

      VEC_free (basic_block, heap, ldom);
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave the exit in the first copy (for explanation why, see the
         comment in unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave the exit in the last copy (for explanation why, see the
         comment in unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get the expression for the number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count the modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
                               niter,
                               GEN_INT (max_unroll),
                               NULL_RTX, 0, OPTAB_LIB_WIDEN);
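
  /* E.g. for max_unroll == 3 the unroll factor is 4, and niter & 3
     computes niter % 4; because 4 is a power of two this holds even if
     the computation of the number of iterations wrapped around.  */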

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  remove_edges = NULL;

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of the loop body (almost always we must leave the
     exit test here; the only exception is when we have an extra zero check
     and the number of iterations is reliable).  Also record the place of
     the (possible) extra zero check.  */
  sbitmap_zero (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    SET_BIT (wont_exit, 1);
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      1, wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where the switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      sbitmap_zero (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
        SET_BIT (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                          1, wont_exit, desc->out_edge,
                                          &remove_edges,
                                          DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create the item for the switch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
                                          block_label (preheader), p,
                                          NULL_RTX);

      /* We rely on the fact that the compare and jump cannot be optimized out,
         and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add a branch for zero iterations.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
                                          block_label (preheader), p,
                                          NULL_RTX);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll the loop.  */

  sbitmap_ones (wont_exit);
  RESET_BIT (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      max_unroll,
                                      wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                         : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
         made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
        {
          desc->out_edge = EDGE_SUCC (exit_block, 0);
          desc->in_edge = EDGE_SUCC (exit_block, 1);
        }
      else
        {
          desc->out_edge = EDGE_SUCC (exit_block, 1);
          desc->in_edge = EDGE_SUCC (exit_block, 0);
        }
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
    remove_path (e);
  VEC_free (edge, heap, remove_edges);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
                         GEN_INT (max_unroll + 1));
  loop->nb_iterations_upper_bound
    = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
                                                                   + 1),
                                            TRUNC_DIV_EXPR);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
                                                                  + 1),
                                           TRUNC_DIV_EXPR);
  if (exit_at_end)
    {
      desc->niter_expr =
        simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
          && loop->nb_iterations_estimate != double_int_zero)
        --loop->nb_iterations_estimate;
      else
        loop->any_estimate = false;
    }

  if (dump_file)
    fprintf (dump_file,
             ";; Unrolled loop %d times, counting # of iterations "
             "at runtime, %i insns\n",
             max_unroll, num_loop_insns (loop));

  VEC_free (basic_block, heap, dom_bbs);
}

/* Decide whether to simply peel LOOP and how much.  */
static void
decide_peel_simple (struct loop *loop, int flags)
{
  unsigned npeel;
  struct niter_desc *desc;
  double_int iterations;

  if (!(flags & UAP_PEEL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering simply peeling loop\n");

  /* npeel = number of iterations to peel.  */
  npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
    npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);

  /* Skip big loops.  */
  if (!npeel)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (desc->simple_p && !desc->assumptions && desc->const_iter)
    {
      if (dump_file)
        fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Do not simply peel loops with branches inside -- it increases the
     number of mispredicts.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not peeling, contains branches\n");
      return;
    }

  /* If we have a realistic estimate on the number of iterations, use it.  */
  if (estimated_loop_iterations (loop, &iterations))
    {
      if (double_int::from_shwi (npeel).ule (iterations))
        {
          if (dump_file)
            {
              fprintf (dump_file, ";; Not peeling loop, rolls too much (");
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                       (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
              fprintf (dump_file, " iterations > %d [maximum peelings])\n",
                       npeel);
            }
          return;
        }
      npeel = iterations.to_shwi () + 1;
    }
  /* If we have a small enough bound on the iterations, we can still peel
     (completely unroll).  */
  else if (max_loop_iterations (loop, &iterations)
           && iterations.ult (double_int::from_shwi (npeel)))
    npeel = iterations.to_shwi () + 1;
  else
    {
      /* For now we have no good heuristics to decide whether loop peeling
         will be effective, so disable it.  */
      if (dump_file)
        fprintf (dump_file,
                 ";; Not peeling loop, no evidence it will be profitable\n");
      return;
    }
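
  /* For instance, with npeel == 16 from the parameters and a profile
     estimate of 5 iterations, we reach this point with npeel == 6, so
     the peeled copies usually cover the whole execution of the loop.  */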

  /* Success.  */
  loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
  loop->lpt_decision.times = npeel;

  if (dump_file)
    fprintf (dump_file, ";; Decided to simply peel the loop %d times.\n",
             loop->lpt_decision.times);
}

/* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   if (!cond) goto end;
   body;
   if (!cond) goto end;
   body;
   if (!cond) goto end;
   body;
   while (cond)
     body;
   end: ;
   */
static void
peel_loop_simple (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned npeel = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller && npeel > 1)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (npeel + 1);
  sbitmap_zero (wont_exit);

  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      npeel, wont_exit, NULL,
                                      NULL, DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                         : 0));
  gcc_assert (ok);

  free (wont_exit);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, npeel, false, false);
      free_opt_info (opt_info);
    }

  if (desc->simple_p)
    {
      if (desc->const_iter)
        {
          desc->niter -= npeel;
          desc->niter_expr = GEN_INT (desc->niter);
          desc->noloop_assumptions = NULL_RTX;
        }
      else
        {
          /* We cannot just update niter_expr, as its value might be clobbered
             inside the loop.  We could handle this by counting the number
             into a temporary just like we do in runtime unrolling, but it
             does not seem worthwhile.  */
          free_simple_loop_desc (loop);
        }
    }
  if (dump_file)
    fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
}

/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  double_int iterations;

  if (!(flags & UAP_UNROLL_ALL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
        fprintf (dump_file, ";; The loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the
     number of mispredicts.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((estimated_loop_iterations (loop, &iterations)
       || max_loop_iterations (loop, &iterations))
      && iterations.ult (double_int::from_shwi (2 * nunroll)))
    {
      if (dump_file)
        fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;

  if (dump_file)
    fprintf (dump_file, ";; Decided to unroll the loop stupidly %d times.\n",
             loop->lpt_decision.times);
}

/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (nunroll + 1);
  sbitmap_zero (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      nunroll, wont_exit,
                                      NULL, NULL,
                                      DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                         : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (desc->simple_p)
    {
      /* We indeed may get here provided that there are nontrivial assumptions
         for a loop to be really simple.  We could update the counts, but the
         problem is that we are unable to decide which exit will be taken
         (not really true in case the number of iterations is constant,
         but no one will do anything with this information, so we do not
         worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
             nunroll, num_loop_insns (loop));
}

/* A hash function for information about insns to split.  */

static hashval_t
si_info_hash (const void *ivts)
{
  return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
}

/* An equality function for information about insns to split.  */

static int
si_info_eq (const void *ivts1, const void *ivts2)
{
  const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
  const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;

  return i1->insn == i2->insn;
}

/* Return a hash for VES, which is really a "var_to_expand *".  */

static hashval_t
ve_info_hash (const void *ves)
{
  return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
}

/* Return true if IVTS1 and IVTS2 (which are really both of type
   "var_to_expand *") refer to the same instruction.  */

static int
ve_info_eq (const void *ivts1, const void *ivts2)
{
  const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
  const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;

  return i1->insn == i2->insn;
}

/* Returns true if REG is referenced in one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   variable.  */

bool
referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
                                  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
        if (!rtx_referenced_p (reg, insn))
          continue;
        else if (DEBUG_INSN_P (insn))
          ++*debug_uses;
        else if (++count_ref > 1)
          break;
    }
  free (body);
  return (count_ref == 1);
}

/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
        if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
          continue;
        else
          {
            validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
                             gen_rtx_UNKNOWN_VAR_LOC (), 0);
            if (!--debug_uses)
              break;
          }
    }
  free (body);
}

/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of the
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it
   with the relevant information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
        return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
        return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for which looks for an optab for the operation
     in SRC.  If it doesn't exist, we cannot perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
           && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
        return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using (which includes the
         initialization of the expansions with zero and the summation of
         the expansions at the end of the computation) will yield wrong
         results for (x = something - x); thus avoid using it in that
         case.  */
      if (code == MINUS)
        return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
          || rtx_referenced_p (dest, XEXP (src, 1)))
        return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions = VEC_alloc (rtx, heap, 1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  return ves;
}

/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and
   return a pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest))
    return NULL;

  if (!biv_p (insn, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true then iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;
  ivts->n_loc = 1;
  ivts->loc[0] = 1;

  return ivts;
}
1839
1840 /* Determines which of insns in LOOP can be optimized.
1841 Return a OPT_INFO struct with the relevant hash tables filled
1842 with all insns to be optimized. The FIRST_NEW_BLOCK field
1843 is undefined for the return value. */
1844
static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  PTR *slot1;
  PTR *slot2;
  VEC (edge, heap) *edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
                                              si_info_hash, si_info_eq, free);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (VEC_length (edge, edges) == 1)
    {
      exit = VEC_index (edge, edges, 0);
      if (!(exit->flags & EDGE_COMPLEX))
        {
          opt_info->loop_exit = split_edge (exit);
          can_apply = true;
        }
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
                                                        ve_info_hash,
                                                        ve_info_eq, free);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
        continue;

      FOR_BB_INSNS (bb, insn)
        {
          if (!INSN_P (insn))
            continue;

          if (opt_info->insns_to_split)
            ivts = analyze_iv_to_split_insn (insn);

          if (ivts)
            {
              slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
              gcc_assert (*slot1 == NULL);
              *slot1 = ivts;
              *opt_info->iv_to_split_tail = ivts;
              opt_info->iv_to_split_tail = &ivts->next;
              continue;
            }

          if (opt_info->insns_with_var_to_expand)
            ves = analyze_insn_to_expand_var (loop, insn);

          if (ves)
            {
              slot2 = htab_find_slot (opt_info->insns_with_var_to_expand,
                                      ves, INSERT);
              gcc_assert (*slot2 == NULL);
              *slot2 = ves;
              *opt_info->var_to_expand_tail = ves;
              opt_info->var_to_expand_tail = &ves->next;
            }
        }
    }

  VEC_free (edge, heap, edges);
  free (body);
  return opt_info;
}
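
/* Note that the dominator check above restricts the analysis to insns
   that are executed on every iteration.  For instance, in

     for (i = 0; i < n; i++)
       if (c[i])
         sum += a[i];

   the accumulation into SUM sits in a block that does not dominate the
   latch and is therefore not a candidate, while the increment of I is.  */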

/* Called just before loop duplication.  Records the start of the
   duplicated area in OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block;
}

/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
         body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
         number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
        return n_copy - 1;
      else
        return n_copies;
    }
}
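
/* For example, with N_COPIES == 3 (an unroll factor of four): when
   unrolling, the original body is number 0 and the deltas for bodies
   0, 1, 2, 3 are simply 0, 1, 2, 3.  When peeling, the initialization
   lands in copy number 1, so copies 1, 2, 3 get deltas 0, 1, 2 and the
   original body (N_COPY == 0), which runs last, gets delta N_COPIES.  */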

/* Locate in EXPR the expression corresponding to the location recorded
   in IVTS, and return a pointer to the RTX for this location.  */

static rtx *
get_ivts_expr (rtx expr, struct iv_to_split *ivts)
{
  unsigned i;
  rtx *ret = &expr;

  for (i = 0; i < ivts->n_loc; i++)
    ret = &XEXP (*ret, ivts->loc[i]);

  return ret;
}
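
/* E.g. with N_LOC == 1 and LOC[0] == 1 this returns &XEXP (EXPR, 1),
   i.e. the address of the SET_SRC of a single_set; with N_LOC == 2 and
   LOC == {1, 2} it would return &XEXP (XEXP (EXPR, 1), 2).  */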

/* Allocate the basic variable for the induction variable chain described
   in IVTS.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}

/* Insert initialization of the basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx insn)
{
  rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
  rtx seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}
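
/* For instance, for the insn i = i + 4 in the copy with delta 0, this
   emits i0 = i + 4 in front of it; the subsequent split_iv call then
   rewrites the insn itself into i = i0.  */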

/* Replace the use of the induction variable described in IVTS in INSN
   with base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
{
  rtx expr, *loc, seq, incr, var;
  enum machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
                                  ivts->step, gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
                                  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = get_ivts_expr (single_set (insn), ivts);

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in INSN completely
     from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}
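
/* A sketch of the rewrite (assuming a step of 4): in the copy with
   DELTA == 2, the original increment

     i = i + 4;

   becomes

     i = i0 + 8;

   where I0 is the basic variable initialized next to the copy with
   DELTA == 0.  */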


/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = VEC_index (rtx, ve->var_expansions, ve->reuse_expansion - 1);

  if (VEC_length (rtx, ve->var_expansions) == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}
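
/* E.g. when two expansions SUM1 and SUM2 have been recorded for an
   accumulator SUM, successive calls return SUM, SUM1, SUM2, SUM, SUM1,
   ... -- the original register takes part in the round-robin reuse.  */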


/* Given INSN, replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Otherwise reuse one of the existing expansions.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group () && really_new_expansion)
    {
      VEC_safe_push (rtx, heap, ve->var_expansions, new_reg);
      ve->expansion_count++;
    }
}
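
/* The overall effect, sketched at the source level for an unroll factor
   of four with --param max-variable-expansions-in-unroller=2:

     for (i = 0; i < n; i++)
       sum += a[i];

   becomes, conceptually,

     for (i = 0; i < n; i += 4)
       {
         sum += a[i];
         sum1 += a[i + 1];
         sum2 += a[i + 2];
         sum += a[i + 3];    <-- limit reached; SUM is reused
       }
     sum = sum + sum1 + sum2;

   which breaks the dependence chain on SUM between the unrolled copies.  */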

/* Initialize the variable expansions in the loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the expansions
   should take place.  The expansions are initialized with (-0) when the
   operation is plus or minus in order to honor signed zero.  This way we
   prevent cases where the sign of the final result is affected by the
   sign of the expansion.  Here is an example to demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   If SUM is initialized to -zero and SOMETHING is also -zero, the final
   result of SUM should be -zero, so the expansions SUM1 and SUM2 should
   be initialized to -zero as well (otherwise we would get +zero as the
   final result, since -0 + +0 = +0 under round-to-nearest).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
                                     basic_block place)
{
  rtx seq, var, zero_init;
  unsigned i;
  enum machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (VEC_length (rtx, ve->var_expansions) == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
        {
          if (honor_signed_zero_p)
            zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode),
                                            mode);
          else
            zero_init = CONST0_RTX (mode);
          emit_move_insn (var, zero_init);
        }
      break;

    case MULT:
      FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
        {
          zero_init = CONST1_RTX (GET_MODE (var));
          emit_move_insn (var, zero_init);
        }
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}
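
/* E.g. for accumulator expansions SUM1 and SUM2 of a PLUS chain in a
   floating-point mode that honors signed zeros, this emits (roughly)
   sum1 = -0.0; sum2 = -0.0; into PLACE; for a MULT chain the initial
   value is 1 instead.  */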

/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, seq, var, insn;
  unsigned i;

  if (VEC_length (rtx, ve->var_expansions) == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
        sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
        sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  /* Emit the combination code after the basic block note of the exit
     block.  */
  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}

/* Apply loop optimizations in loop copies using the
   data gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body
   of the loop (as should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
                     unsigned n_copies, bool unrolling,
                     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx insn, orig_insn, next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds the position in the copy sequence initialized by
         duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t) bb->aux, n_copies,
                                        unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
        {
          next = NEXT_INSN (insn);
          if (!INSN_P (insn)
              || (DEBUG_INSN_P (insn)
                  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
            continue;

          /* Keep ORIG_INSN in sync with INSN in the copied block.  */
          while (!INSN_P (orig_insn)
                 || (DEBUG_INSN_P (orig_insn)
                     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
                         == LABEL_DECL)))
            orig_insn = NEXT_INSN (orig_insn);

          ivts_templ.insn = orig_insn;
          ve_templ.insn = orig_insn;

          /* Apply splitting iv optimization.  */
          if (opt_info->insns_to_split)
            {
              ivts = (struct iv_to_split *)
                htab_find (opt_info->insns_to_split, &ivts_templ);

              if (ivts)
                {
                  gcc_assert (GET_CODE (PATTERN (insn))
                              == GET_CODE (PATTERN (orig_insn)));

                  if (!delta)
                    insert_base_initialization (ivts, insn);
                  split_iv (ivts, insn, delta);
                }
            }
          /* Apply variable expansion optimization.  */
          if (unrolling && opt_info->insns_with_var_to_expand)
            {
              ves = (struct var_to_expand *)
                htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
              if (ves)
                {
                  gcc_assert (GET_CODE (PATTERN (insn))
                              == GET_CODE (PATTERN (orig_insn)));
                  expand_var_during_unrolling (ves, insn);
                }
            }
          orig_insn = NEXT_INSN (orig_insn);
        }
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find its blocks as the originals
     of the blocks in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
        continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
           orig_insn != NEXT_INSN (BB_END (bb));
           orig_insn = next)
        {
          next = NEXT_INSN (orig_insn);

          if (!INSN_P (orig_insn))
            continue;

          ivts_templ.insn = orig_insn;
          if (opt_info->insns_to_split)
            {
              ivts = (struct iv_to_split *)
                htab_find (opt_info->insns_to_split, &ivts_templ);
              if (ivts)
                {
                  if (!delta)
                    insert_base_initialization (ivts, orig_insn);
                  split_iv (ivts, orig_insn, delta);
                  continue;
                }
            }
        }
    }
}
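
/* Putting the pieces together for an unroll factor of four (three new
   copies): the copies get deltas 1, 2, 3 in the first walk above and
   their IV uses become I0 + 1 * step, I0 + 2 * step, I0 + 3 * step,
   while the original body, rewritten last with delta 0, receives the
   initialization of I0 and uses I0 directly.  */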

/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  if (opt_info->insns_to_split)
    htab_delete (opt_info->insns_to_split);
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        VEC_free (rtx, heap, ves->var_expansions);
      htab_delete (opt_info->insns_with_var_to_expand);
    }
  free (opt_info);
}